[X86] Move VPTESTM matching from the isel table to custom code in X86ISelDAGToDAG.

We had many tablegen patterns for these instructions, and due to the
commutability of the patterns, tablegen expanded them into even more.
Altogether the VPTESTMD patterns accounted for more than 50K of the 610K isel
table. This got worse when we stopped canonicalizing AND to vXi64, since that
required a pattern for every combination of bitcast input type.

This change moves the matching to custom code where it is easier to look through
the bitcasts without being concerned with the specific types.
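
As a sketch (hypothetical IR; the function name is made up), the matcher now
handles patterns like this, where the AND and the compare use different
element types connected by a bitcast:

  define <8 x i1> @cmp(<4 x i64> %a, <4 x i64> %b) {
    %and  = and <4 x i64> %a, %b
    %cast = bitcast <4 x i64> %and to <8 x i32>
    %cmp  = icmp ne <8 x i32> %cast, zeroinitializer
    ret <8 x i1> %cmp
  }

With VLX this should select to a single vptestmd, without needing a separate
tablegen pattern for each bitcast input type.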

The test changes are because we are now stricter with one-use checks, as that
is required to make load folding legal. We now require the AND and any BITCAST
to have only a single use. This prevents forming both a VPTESTM and a VPAND
with the same inputs.
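
A minimal sketch of the restriction (hypothetical IR): here the AND has a
second use, so matching is rejected; folding it into a VPTESTM would still
leave a VPAND behind:

  define <8 x i1> @extra_use(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %p) {
    %and = and <8 x i32> %a, %b
    store <8 x i32> %and, <8 x i32>* %p   ; second use keeps the AND alive
    %cmp = icmp ne <8 x i32> %and, zeroinitializer
    ret <8 x i1> %cmp
  }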

We now support broadcast loads for 128/256-bit patterns without VLX. We'll
widen to 512 bits and still fold the broadcast, since the amount of memory read
doesn't change.
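
For example (hypothetical IR), a 256-bit test against a splatted scalar load
on a target without VLX:

  define <8 x i1> @bcast(<8 x i32> %a, i32* %p) {
    %s     = load i32, i32* %p
    %v     = insertelement <8 x i32> undef, i32 %s, i32 0
    %splat = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> zeroinitializer
    %and   = and <8 x i32> %a, %splat
    %cmp   = icmp ne <8 x i32> %and, zeroinitializer
    ret <8 x i1> %cmp
  }

This should now widen to a 512-bit vptestmd with a {1to16} broadcast memory
operand, which still reads only 4 bytes.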

A few tests got slightly longer because we now prefer load + VPTESTNM over
XOR + VPCMPEQ for (seteq (load), allzeros). Previously we were able to share
the XOR among multiple VPCMPEQ instructions.
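
Roughly (hypothetical IR; the asm comments mirror the test diffs below):

  ; before: vpxor %xmm0, %xmm0, %xmm0        ; zero vector was shareable
  ;         vpcmpeqd (%rdi), %ymm0, %k1
  ; after:  vmovdqa (%rdi), %ymm0            ; load can't fold because both
  ;         vptestnmd %ymm0, %ymm0, %k1      ; VPTESTNM operands are the same
  define <8 x i1> @load_eq_zero(<8 x i32>* %p) {
    %ld  = load <8 x i32>, <8 x i32>* %p
    %cmp = icmp eq <8 x i32> %ld, zeroinitializer
    ret <8 x i1> %cmp
  }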

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@358359 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index cce4d9b..8266930 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -483,6 +483,7 @@
     bool shrinkAndImmediate(SDNode *N);
     bool isMaskZeroExtended(SDNode *N) const;
     bool tryShiftAmountMod(SDNode *N);
+    bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
 
     MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                 const SDLoc &dl, MVT VT, SDNode *Node);
@@ -3441,6 +3442,347 @@
   return true;
 }
 
+static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
+                              bool FoldedBCast, bool Masked) {
+  if (Masked) {
+    if (FoldedLoad) {
+      switch (TestVT.SimpleTy) {
+      default: llvm_unreachable("Unexpected VT!");
+      case MVT::v16i8:
+        return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk;
+      case MVT::v8i16:
+        return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk;
+      case MVT::v4i32:
+        return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk;
+      case MVT::v2i64:
+        return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk;
+      case MVT::v32i8:
+        return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk;
+      case MVT::v16i16:
+        return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk;
+      case MVT::v8i32:
+        return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk;
+      case MVT::v4i64:
+        return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk;
+      case MVT::v64i8:
+        return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk;
+      case MVT::v32i16:
+        return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk;
+      case MVT::v16i32:
+        return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk;
+      case MVT::v8i64:
+        return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk;
+      }
+    }
+
+    if (FoldedBCast) {
+      switch (TestVT.SimpleTy) {
+      default: llvm_unreachable("Unexpected VT!");
+      case MVT::v4i32:
+        return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk;
+      case MVT::v2i64:
+        return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk;
+      case MVT::v8i32:
+        return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk;
+      case MVT::v4i64:
+        return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk;
+      case MVT::v16i32:
+        return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk;
+      case MVT::v8i64:
+        return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk;
+      }
+    }
+
+    switch (TestVT.SimpleTy) {
+    default: llvm_unreachable("Unexpected VT!");
+    case MVT::v16i8:
+      return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk;
+    case MVT::v8i16:
+      return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk;
+    case MVT::v4i32:
+      return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk;
+    case MVT::v2i64:
+      return IsTestN ? X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk;
+    case MVT::v32i8:
+      return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk;
+    case MVT::v16i16:
+      return IsTestN ? X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk;
+    case MVT::v8i32:
+      return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk;
+    case MVT::v4i64:
+      return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk;
+    case MVT::v64i8:
+      return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk;
+    case MVT::v32i16:
+      return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk;
+    case MVT::v16i32:
+      return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk;
+    case MVT::v8i64:
+      return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk;
+    }
+  }
+
+  if (FoldedLoad) {
+    switch (TestVT.SimpleTy) {
+    default: llvm_unreachable("Unexpected VT!");
+    case MVT::v16i8:
+      return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm;
+    case MVT::v8i16:
+      return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm;
+    case MVT::v4i32:
+      return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm;
+    case MVT::v2i64:
+      return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm;
+    case MVT::v32i8:
+      return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm;
+    case MVT::v16i16:
+      return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm;
+    case MVT::v8i32:
+      return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm;
+    case MVT::v4i64:
+      return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm;
+    case MVT::v64i8:
+      return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm;
+    case MVT::v32i16:
+      return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm;
+    case MVT::v16i32:
+      return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm;
+    case MVT::v8i64:
+      return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm;
+    }
+  }
+
+  if (FoldedBCast) {
+    switch (TestVT.SimpleTy) {
+    default: llvm_unreachable("Unexpected VT!");
+    case MVT::v4i32:
+      return IsTestN ? X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb;
+    case MVT::v2i64:
+      return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb;
+    case MVT::v8i32:
+      return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb;
+    case MVT::v4i64:
+      return IsTestN ? X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb;
+    case MVT::v16i32:
+      return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb;
+    case MVT::v8i64:
+      return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb;
+    }
+  }
+
+  switch (TestVT.SimpleTy) {
+  default: llvm_unreachable("Unexpected VT!");
+  case MVT::v16i8:
+    return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr;
+  case MVT::v8i16:
+    return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr;
+  case MVT::v4i32:
+    return IsTestN ? X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr;
+  case MVT::v2i64:
+    return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr;
+  case MVT::v32i8:
+    return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr;
+  case MVT::v16i16:
+    return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr;
+  case MVT::v8i32:
+    return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr;
+  case MVT::v4i64:
+    return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr;
+  case MVT::v64i8:
+    return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr;
+  case MVT::v32i16:
+    return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr;
+  case MVT::v16i32:
+    return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr;
+  case MVT::v8i64:
+    return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr;
+  }
+}
+
+// Try to create a VPTESTM instruction. If InMask is not null, it will be used
+// to form a masked operation.
+bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
+                                 SDValue InMask) {
+  assert(Subtarget->hasAVX512() && "Expected AVX512!");
+  assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
+         "Unexpected VT!");
+
+  // Look for equal and not equal compares.
+  ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
+  if (CC != ISD::SETEQ && CC != ISD::SETNE)
+    return false;
+
+  // See if we're comparing against zero. This should have been canonicalized
+  // to the RHS during lowering.
+  if (!ISD::isBuildVectorAllZeros(Setcc.getOperand(1).getNode()))
+    return false;
+
+  SDValue N0 = Setcc.getOperand(0);
+
+  MVT CmpVT = N0.getSimpleValueType();
+  MVT CmpSVT = CmpVT.getVectorElementType();
+
+  // Start with both operands the same. We'll try to refine this.
+  SDValue Src0 = N0;
+  SDValue Src1 = N0;
+
+  {
+    // Look through single use bitcasts.
+    SDValue N0Temp = N0;
+    if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
+      N0Temp = N0.getOperand(0);
+
+    // Look for single use AND.
+    if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
+      Src0 = N0Temp.getOperand(0);
+      Src1 = N0Temp.getOperand(1);
+    }
+  }
+
+  // Without VLX we need to widen the operation to 512 bits.
+  bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
+
+  // We can only fold loads if the sources are unique.
+  bool CanFoldLoads = Src0 != Src1;
+
+  // Try to fold loads unless we need to widen.
+  bool FoldedLoad = false;
+  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load;
+  if (!Widen && CanFoldLoads) {
+    Load = Src1;
+    FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3,
+                             Tmp4);
+    if (!FoldedLoad) {
+      // AND is commutative.
+      Load = Src0;
+      FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2,
+                               Tmp3, Tmp4);
+      if (FoldedLoad)
+        std::swap(Src0, Src1);
+    }
+  }
+
+  auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
+    // Look through single use bitcasts.
+    if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse())
+      Src = Src.getOperand(0);
+
+    if (Src.getOpcode() == X86ISD::VBROADCAST && Src.hasOneUse()) {
+      Parent = Src.getNode();
+      Src = Src.getOperand(0);
+      if (Src.getSimpleValueType() == CmpSVT)
+        return Src;
+    }
+
+    return SDValue();
+  };
+
+  // If we didn't fold a load, try to match a broadcast. There is no widening
+  // limitation for this, but only 32 and 64 bit element types are supported.
+  bool FoldedBCast = false;
+  if (!FoldedLoad && CanFoldLoads &&
+      (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) {
+    SDNode *ParentNode;
+    if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
+      FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
+                                Tmp1, Tmp2, Tmp3, Tmp4);
+    }
+
+    // Try the other operand.
+    if (!FoldedBCast) {
+      if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
+        FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
+                                  Tmp1, Tmp2, Tmp3, Tmp4);
+        if (FoldedBCast)
+          std::swap(Src0, Src1);
+      }
+    }
+  }
+
+  auto getMaskRC = [](MVT MaskVT) {
+    switch (MaskVT.SimpleTy) {
+    default: llvm_unreachable("Unexpected VT!");
+    case MVT::v2i1:  return X86::VK2RegClassID;
+    case MVT::v4i1:  return X86::VK4RegClassID;
+    case MVT::v8i1:  return X86::VK8RegClassID;
+    case MVT::v16i1: return X86::VK16RegClassID;
+    case MVT::v32i1: return X86::VK32RegClassID;
+    case MVT::v64i1: return X86::VK64RegClassID;
+    }
+  };
+
+  bool IsMasked = InMask.getNode() != nullptr;
+
+  SDLoc dl(Root);
+
+  MVT ResVT = Setcc.getSimpleValueType();
+  MVT MaskVT = ResVT;
+  if (Widen) {
+    // Widen the inputs using insert_subreg or copy_to_regclass.
+    unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
+    unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
+    unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
+    CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
+    MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
+                                                     CmpVT), 0);
+    Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
+
+    assert(!FoldedLoad && "Shouldn't have folded the load");
+    if (!FoldedBCast)
+      Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
+
+    if (IsMasked) {
+      // Widen the mask.
+      unsigned RegClass = getMaskRC(MaskVT);
+      SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
+      InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                              dl, MaskVT, InMask, RC), 0);
+    }
+  }
+
+  bool IsTestN = CC == ISD::SETEQ;
+  unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
+                               IsMasked);
+
+  MachineSDNode *CNode;
+  if (FoldedLoad || FoldedBCast) {
+    SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
+
+    if (IsMasked) {
+      SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
+                        Load.getOperand(0) };
+      CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+    } else {
+      SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
+                        Load.getOperand(0) };
+      CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+    }
+
+    // Update the chain.
+    ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
+    // Record the mem-refs
+    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
+  } else {
+    if (IsMasked)
+      CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
+    else
+      CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
+  }
+
+  // If we widened, we need to shrink the mask VT.
+  if (Widen) {
+    unsigned RegClass = getMaskRC(ResVT);
+    SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
+    CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                   dl, ResVT, SDValue(CNode, 0), RC);
+  }
+
+  ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
+  CurDAG->RemoveDeadNode(Root);
+  return true;
+}
+
 void X86DAGToDAGISel::Select(SDNode *Node) {
   MVT NVT = Node->getSimpleValueType(0);
   unsigned Opcode = Node->getOpcode();
@@ -3570,6 +3912,18 @@
     break;
 
   case ISD::AND:
+    if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
+      // Try to form a masked VPTESTM. Operands can be in either order.
+      SDValue N0 = Node->getOperand(0);
+      SDValue N1 = Node->getOperand(1);
+      if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
+          tryVPTESTM(Node, N0, N1))
+        return;
+      if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
+          tryVPTESTM(Node, N1, N0))
+        return;
+    }
+
     if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
       ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
       CurDAG->RemoveDeadNode(Node);
@@ -4207,6 +4561,13 @@
     return;
   }
 
+  case ISD::SETCC: {
+    if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
+      return;
+
+    break;
+  }
+
   case ISD::STORE:
     if (foldLoadStoreIntoMemOperand(Node))
       return;
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 4403f98..f86d989 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -5813,309 +5813,93 @@
 // AVX-512  VPTESTM instructions
 //===----------------------------------------------------------------------===//
 
-multiclass avx512_vptest<bits<8> opc, string OpcodeStr, PatFrag OpNode,
-                         PatFrag OpNode_su,
+multiclass avx512_vptest<bits<8> opc, string OpcodeStr,
                          X86FoldableSchedWrite sched, X86VectorVTInfo _,
                          string Name> {
-  let ExeDomain = _.ExeDomain in {
+  // NOTE: Patterns are omitted in favor of manual selection in X86ISelDAGToDAG.
+  // There are just too many permutations due to commutability and bitcasts.
+  let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
   let isCommutable = 1 in
   defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst),
                    (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
                       "$src2, $src1", "$src1, $src2",
-                   (OpNode (and _.RC:$src1, _.RC:$src2), _.ImmAllZerosV),
-                   (OpNode_su (and _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)>,
+                   (null_frag), (null_frag)>,
                    EVEX_4V, Sched<[sched]>;
+  let mayLoad = 1 in
   defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
                    (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
                        "$src2, $src1", "$src1, $src2",
-                   (OpNode (and _.RC:$src1, (_.LdFrag addr:$src2)),
-                           _.ImmAllZerosV),
-                   (OpNode_su (and _.RC:$src1, (_.LdFrag addr:$src2)),
-                           _.ImmAllZerosV)>,
+                   (null_frag), (null_frag)>,
                    EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
-
-  // Patterns for compare with 0 that just use the same source twice.
-  def : Pat<(_.KVT (OpNode _.RC:$src, _.ImmAllZerosV)),
-            (_.KVT (!cast<Instruction>(Name # _.ZSuffix # "rr")
-                                      _.RC:$src, _.RC:$src))>;
-
-  def : Pat<(_.KVT (and _.KRC:$mask, (OpNode_su _.RC:$src, _.ImmAllZerosV))),
-            (_.KVT (!cast<Instruction>(Name # _.ZSuffix # "rrk")
-                                      _.KRC:$mask, _.RC:$src, _.RC:$src))>;
 }
 
-multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, PatFrag OpNode,
-                            PatFrag OpNode_su, X86FoldableSchedWrite sched,
-                            X86VectorVTInfo _> {
-  let ExeDomain = _.ExeDomain in
+multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr,
+                            X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in
   defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
                     (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
                     "${src2}"##_.BroadcastStr##", $src1",
                     "$src1, ${src2}"##_.BroadcastStr,
-                    (OpNode (and _.RC:$src1,
-                                       (X86VBroadcast
-                                        (_.ScalarLdFrag addr:$src2))),
-                            _.ImmAllZerosV),
-                    (OpNode_su (and _.RC:$src1,
-                                          (X86VBroadcast
-                                           (_.ScalarLdFrag addr:$src2))),
-                               _.ImmAllZerosV)>,
+                    (null_frag), (null_frag)>,
                     EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
                     Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
-// Use 512bit version to implement 128/256 bit in case NoVLX.
-multiclass avx512_vptest_lowering<PatFrag OpNode, PatFrag OpNode_su,
-                                  X86VectorVTInfo ExtendInfo, X86VectorVTInfo _,
-                                  string Name> {
-  def : Pat<(_.KVT (OpNode (and _.RC:$src1, _.RC:$src2),
-                           _.ImmAllZerosV)),
-            (_.KVT (COPY_TO_REGCLASS
-                     (!cast<Instruction>(Name # "Zrr")
-                       (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                                      _.RC:$src1, _.SubRegIdx),
-                       (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                                      _.RC:$src2, _.SubRegIdx)),
-                   _.KRC))>;
-
-  def : Pat<(_.KVT (and _.KRC:$mask,
-                        (OpNode_su (and _.RC:$src1, _.RC:$src2),
-                                   _.ImmAllZerosV))),
-            (COPY_TO_REGCLASS
-             (!cast<Instruction>(Name # "Zrrk")
-              (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
-              (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                             _.RC:$src1, _.SubRegIdx),
-              (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                             _.RC:$src2, _.SubRegIdx)),
-             _.KRC)>;
-
-  def : Pat<(_.KVT (OpNode _.RC:$src, _.ImmAllZerosV)),
-            (_.KVT (COPY_TO_REGCLASS
-                     (!cast<Instruction>(Name # "Zrr")
-                       (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                                      _.RC:$src, _.SubRegIdx),
-                       (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                                      _.RC:$src, _.SubRegIdx)),
-                   _.KRC))>;
-
-  def : Pat<(_.KVT (and _.KRC:$mask, (OpNode_su _.RC:$src, _.ImmAllZerosV))),
-            (COPY_TO_REGCLASS
-             (!cast<Instruction>(Name # "Zrrk")
-              (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
-              (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                             _.RC:$src, _.SubRegIdx),
-              (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                             _.RC:$src, _.SubRegIdx)),
-             _.KRC)>;
-}
-
-multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, PatFrag OpNode,
-                                  PatFrag OpNode_su, X86SchedWriteWidths sched,
+multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr,
+                                  X86SchedWriteWidths sched,
                                   AVX512VLVectorVTInfo _> {
   let Predicates  = [HasAVX512] in
-  defm Z : avx512_vptest<opc, OpcodeStr, OpNode, OpNode_su, sched.ZMM, _.info512, NAME>,
-           avx512_vptest_mb<opc, OpcodeStr, OpNode, OpNode_su, sched.ZMM, _.info512>, EVEX_V512;
+  defm Z : avx512_vptest<opc, OpcodeStr, sched.ZMM, _.info512, NAME>,
+           avx512_vptest_mb<opc, OpcodeStr, sched.ZMM, _.info512>, EVEX_V512;
 
   let Predicates = [HasAVX512, HasVLX] in {
-  defm Z256 : avx512_vptest<opc, OpcodeStr, OpNode, OpNode_su, sched.YMM, _.info256, NAME>,
-              avx512_vptest_mb<opc, OpcodeStr, OpNode, OpNode_su, sched.YMM, _.info256>, EVEX_V256;
-  defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, OpNode_su, sched.XMM, _.info128, NAME>,
-              avx512_vptest_mb<opc, OpcodeStr, OpNode, OpNode_su, sched.XMM, _.info128>, EVEX_V128;
-  }
-  let Predicates = [HasAVX512, NoVLX] in {
-  defm Z256_Alt : avx512_vptest_lowering< OpNode, OpNode_su, _.info512, _.info256, NAME>;
-  defm Z128_Alt : avx512_vptest_lowering< OpNode, OpNode_su, _.info512, _.info128, NAME>;
+  defm Z256 : avx512_vptest<opc, OpcodeStr, sched.YMM, _.info256, NAME>,
+              avx512_vptest_mb<opc, OpcodeStr, sched.YMM, _.info256>, EVEX_V256;
+  defm Z128 : avx512_vptest<opc, OpcodeStr, sched.XMM, _.info128, NAME>,
+              avx512_vptest_mb<opc, OpcodeStr, sched.XMM, _.info128>, EVEX_V128;
   }
 }
 
-multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, PatFrag OpNode,
-                            PatFrag OpNode_su, X86SchedWriteWidths sched> {
-  defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode, OpNode_su, sched,
+multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr,
+                            X86SchedWriteWidths sched> {
+  defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", sched,
                                  avx512vl_i32_info>;
-  defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode, OpNode_su, sched,
+  defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", sched,
                                  avx512vl_i64_info>, VEX_W;
 }
 
 multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr,
-                            PatFrag OpNode, PatFrag OpNode_su,
                             X86SchedWriteWidths sched> {
   let Predicates = [HasBWI] in {
-  defm WZ:    avx512_vptest<opc, OpcodeStr#"w", OpNode, OpNode_su, sched.ZMM,
+  defm WZ:    avx512_vptest<opc, OpcodeStr#"w", sched.ZMM,
                             v32i16_info, NAME#"W">, EVEX_V512, VEX_W;
-  defm BZ:    avx512_vptest<opc, OpcodeStr#"b", OpNode, OpNode_su, sched.ZMM,
+  defm BZ:    avx512_vptest<opc, OpcodeStr#"b", sched.ZMM,
                             v64i8_info, NAME#"B">, EVEX_V512;
   }
   let Predicates = [HasVLX, HasBWI] in {
 
-  defm WZ256: avx512_vptest<opc, OpcodeStr#"w", OpNode, OpNode_su, sched.YMM,
+  defm WZ256: avx512_vptest<opc, OpcodeStr#"w", sched.YMM,
                             v16i16x_info, NAME#"W">, EVEX_V256, VEX_W;
-  defm WZ128: avx512_vptest<opc, OpcodeStr#"w", OpNode, OpNode_su, sched.XMM,
+  defm WZ128: avx512_vptest<opc, OpcodeStr#"w", sched.XMM,
                             v8i16x_info, NAME#"W">, EVEX_V128, VEX_W;
-  defm BZ256: avx512_vptest<opc, OpcodeStr#"b", OpNode, OpNode_su, sched.YMM,
+  defm BZ256: avx512_vptest<opc, OpcodeStr#"b", sched.YMM,
                             v32i8x_info, NAME#"B">, EVEX_V256;
-  defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, OpNode_su, sched.XMM,
+  defm BZ128: avx512_vptest<opc, OpcodeStr#"b", sched.XMM,
                             v16i8x_info, NAME#"B">, EVEX_V128;
   }
-
-  let Predicates = [HasBWI, NoVLX] in {
-  defm BZ256_Alt : avx512_vptest_lowering<OpNode, OpNode_su, v64i8_info, v32i8x_info, NAME#"B">;
-  defm BZ128_Alt : avx512_vptest_lowering<OpNode, OpNode_su, v64i8_info, v16i8x_info, NAME#"B">;
-  defm WZ256_Alt : avx512_vptest_lowering<OpNode, OpNode_su, v32i16_info, v16i16x_info, NAME#"W">;
-  defm WZ128_Alt : avx512_vptest_lowering<OpNode, OpNode_su, v32i16_info, v8i16x_info, NAME#"W">;
-  }
 }
 
-// These patterns are used to match vptestm/vptestnm. We don't treat pcmpeqm
-// as commutable here because we already canonicalized all zeros vectors to the
-// RHS during lowering.
-def X86pcmpeqm : PatFrag<(ops node:$src1, node:$src2),
-                         (setcc node:$src1, node:$src2, SETEQ)>;
-def X86pcmpnem : PatFrag<(ops node:$src1, node:$src2),
-                         (setcc node:$src1, node:$src2, SETNE)>;
-
-def X86pcmpeqm_su : PatFrag<(ops node:$src1, node:$src2),
-                            (X86pcmpeqm node:$src1, node:$src2), [{
-  return N->hasOneUse();
-}]>;
-def X86pcmpnem_su : PatFrag<(ops node:$src1, node:$src2),
-                            (X86pcmpnem node:$src1, node:$src2), [{
-  return N->hasOneUse();
-}]>;
-
 multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr,
-                                   PatFrag OpNode, PatFrag OpNode_su,
                                    X86SchedWriteWidths sched> :
-  avx512_vptest_wb<opc_wb, OpcodeStr, OpNode, OpNode_su, sched>,
-  avx512_vptest_dq<opc_dq, OpcodeStr, OpNode, OpNode_su, sched>;
+  avx512_vptest_wb<opc_wb, OpcodeStr, sched>,
+  avx512_vptest_dq<opc_dq, OpcodeStr, sched>;
 
-defm VPTESTM   : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86pcmpnem,
-                                         X86pcmpnem_su, SchedWriteVecLogic>, T8PD;
-defm VPTESTNM  : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86pcmpeqm,
-                                         X86pcmpeqm_su, SchedWriteVecLogic>, T8XS;
-
-
-multiclass avx512_vptest_lowering_pats<string InstrStr, PatFrag OpNode,
-                                       PatFrag OpNode_su, X86VectorVTInfo _,
-                                       X86VectorVTInfo AndInfo> {
-  def : Pat<(_.KVT (OpNode (bitconvert
-                            (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
-                           _.ImmAllZerosV)),
-            (!cast<Instruction>(InstrStr # "rr") _.RC:$src1, _.RC:$src2)>;
-
-  def : Pat<(_.KVT (and _.KRC:$mask,
-                    (OpNode_su (bitconvert
-                                (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
-                               _.ImmAllZerosV))),
-            (!cast<Instruction>(InstrStr # "rrk") _.KRC:$mask, _.RC:$src1,
-                                                  _.RC:$src2)>;
-
-  def : Pat<(_.KVT (OpNode (bitconvert
-                            (AndInfo.VT (and _.RC:$src1,
-                                             (AndInfo.LdFrag addr:$src2)))),
-                           _.ImmAllZerosV)),
-            (!cast<Instruction>(InstrStr # "rm") _.RC:$src1, addr:$src2)>;
-
-  def : Pat<(_.KVT (and _.KRC:$mask,
-                    (OpNode_su (bitconvert
-                                (AndInfo.VT (and _.RC:$src1,
-                                                 (AndInfo.LdFrag addr:$src2)))),
-                            _   .ImmAllZerosV))),
-            (!cast<Instruction>(InstrStr # "rmk") _.KRC:$mask, _.RC:$src1,
-                                                  addr:$src2)>;
-}
-
-// Patterns to use 512-bit instructions when 128/256 are not available.
-multiclass avx512_vptest_lowering_wide_pats<string InstrStr, PatFrag OpNode,
-                                            PatFrag OpNode_su,
-                                            X86VectorVTInfo _,
-                                            X86VectorVTInfo AndInfo,
-                                            X86VectorVTInfo ExtendInfo> {
-  def : Pat<(_.KVT (OpNode (bitconvert
-                            (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
-                           _.ImmAllZerosV)),
-            (_.KVT (COPY_TO_REGCLASS
-                     (!cast<Instruction>(InstrStr#"rr")
-                       (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                                      _.RC:$src1, _.SubRegIdx),
-                       (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                                      _.RC:$src2, _.SubRegIdx)),
-                   _.KRC))>;
-
-  def : Pat<(_.KVT (and _.KRC:$mask,
-                    (OpNode_su (bitconvert
-                                (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
-                               _.ImmAllZerosV))),
-            (COPY_TO_REGCLASS
-             (!cast<Instruction>(InstrStr#"rrk")
-              (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
-              (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                             _.RC:$src1, _.SubRegIdx),
-              (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
-                             _.RC:$src2, _.SubRegIdx)),
-             _.KRC)>;
-}
-
-multiclass avx512_vptest_lowering_sizes<string InstrStr, PatFrag OpNode,
-                                        PatFrag OpNode_su, Predicate prd,
-                                        AVX512VLVectorVTInfo CmpInfo,
-                                        AVX512VLVectorVTInfo AndInfo> {
-let Predicates = [prd, HasVLX] in {
-  defm : avx512_vptest_lowering_pats<InstrStr#"Z128", OpNode, OpNode_su,
-                                     CmpInfo.info128, AndInfo.info128>;
-  defm : avx512_vptest_lowering_pats<InstrStr#"Z256", OpNode, OpNode_su,
-                                     CmpInfo.info256, AndInfo.info256>;
-}
-let Predicates = [prd] in {
-  defm : avx512_vptest_lowering_pats<InstrStr#"Z", OpNode, OpNode_su,
-                                     CmpInfo.info512, AndInfo.info512>;
-}
-
-let Predicates = [prd, NoVLX] in {
-  defm : avx512_vptest_lowering_wide_pats<InstrStr#"Z", OpNode, OpNode_su,
-                                          CmpInfo.info128, AndInfo.info128,
-                                          CmpInfo.info512>;
-  defm : avx512_vptest_lowering_wide_pats<InstrStr#"Z", OpNode, OpNode_su,
-                                          CmpInfo.info256, AndInfo.info256,
-                                          CmpInfo.info512>;
-}
-}
-
-multiclass avx512_vptest_lowering_types<string InstrStr, PatFrag OpNode,
-                                        PatFrag OpNode_su> {
-  defm : avx512_vptest_lowering_sizes<InstrStr # "B", OpNode, OpNode_su, HasBWI,
-                                      avx512vl_i8_info, avx512vl_i16_info>;
-  defm : avx512_vptest_lowering_sizes<InstrStr # "B", OpNode, OpNode_su, HasBWI,
-                                      avx512vl_i8_info, avx512vl_i32_info>;
-  defm : avx512_vptest_lowering_sizes<InstrStr # "B", OpNode, OpNode_su, HasBWI,
-                                      avx512vl_i8_info, avx512vl_i64_info>;
-
-  defm : avx512_vptest_lowering_sizes<InstrStr # "W", OpNode, OpNode_su, HasBWI,
-                                      avx512vl_i16_info, avx512vl_i8_info>;
-  defm : avx512_vptest_lowering_sizes<InstrStr # "W", OpNode, OpNode_su, HasBWI,
-                                      avx512vl_i16_info, avx512vl_i32_info>;
-  defm : avx512_vptest_lowering_sizes<InstrStr # "W", OpNode, OpNode_su, HasBWI,
-                                      avx512vl_i16_info, avx512vl_i64_info>;
-
-  defm : avx512_vptest_lowering_sizes<InstrStr # "D", OpNode, OpNode_su, HasAVX512,
-                                      avx512vl_i32_info, avx512vl_i8_info>;
-  defm : avx512_vptest_lowering_sizes<InstrStr # "D", OpNode, OpNode_su, HasAVX512,
-                                      avx512vl_i32_info, avx512vl_i16_info>;
-  defm : avx512_vptest_lowering_sizes<InstrStr # "D", OpNode, OpNode_su, HasAVX512,
-                                      avx512vl_i32_info, avx512vl_i64_info>;
-
-  defm : avx512_vptest_lowering_sizes<InstrStr # "Q", OpNode, OpNode_su, HasAVX512,
-                                      avx512vl_i64_info, avx512vl_i8_info>;
-  defm : avx512_vptest_lowering_sizes<InstrStr # "Q", OpNode, OpNode_su, HasAVX512,
-                                      avx512vl_i64_info, avx512vl_i16_info>;
-  defm : avx512_vptest_lowering_sizes<InstrStr # "Q", OpNode, OpNode_su, HasAVX512,
-                                      avx512vl_i64_info, avx512vl_i32_info>;
-}
-
-defm : avx512_vptest_lowering_types<"VPTESTM", X86pcmpnem, X86pcmpnem_su>;
-defm : avx512_vptest_lowering_types<"VPTESTNM", X86pcmpeqm, X86pcmpeqm_su>;
+defm VPTESTM   : avx512_vptest_all_forms<0x26, 0x27, "vptestm",
+                                         SchedWriteVecLogic>, T8PD;
+defm VPTESTNM  : avx512_vptest_all_forms<0x26, 0x27, "vptestnm",
+                                         SchedWriteVecLogic>, T8XS;
 
 //===----------------------------------------------------------------------===//
 // AVX-512  Shift instructions
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index 5b9b400..2c7d63d 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -853,8 +853,8 @@
 ; KNL-NEXT:    vpmovzxwq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x34,0xd2]
 ; KNL-NEXT:    ## zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
 ; KNL-NEXT:    vpsllq $63, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x73,0xf2,0x3f]
-; KNL-NEXT:    vptestmq %zmm2, %zmm2, %k1 ## encoding: [0x62,0xf2,0xed,0x48,0x27,0xca]
-; KNL-NEXT:    vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xc2,0x0f,0x01]
+; KNL-NEXT:    vcmpltpd (%rdi){1to8}, %zmm0, %k1 ## encoding: [0x62,0xf1,0xfd,0x58,0xc2,0x0f,0x01]
+; KNL-NEXT:    vptestmq %zmm2, %zmm2, %k1 {%k1} ## encoding: [0x62,0xf2,0xed,0x49,0x27,0xca]
 ; KNL-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x65,0xc0]
 ; KNL-NEXT:    retq ## encoding: [0xc3]
 ;
diff --git a/test/CodeGen/X86/kshift.ll b/test/CodeGen/X86/kshift.ll
index aaa5ff7..a12fd46 100644
--- a/test/CodeGen/X86/kshift.ll
+++ b/test/CodeGen/X86/kshift.ll
@@ -10,8 +10,8 @@
 ; KNL-NEXT:    movb $-2, %al
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT:    vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT:    vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    # kill: def $al killed $al killed $eax
 ; KNL-NEXT:    vzeroupper
@@ -75,15 +75,15 @@
 ; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
 ; KNL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; KNL-NEXT:    valignd {{.*#+}} zmm0 = zmm0[15],zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; KNL-NEXT:    kshiftlw $1, %k2, %k2
-; KNL-NEXT:    vpcmpeqw %ymm4, %ymm3, %ymm0
-; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    kshiftlw $1, %k2, %k1
+; KNL-NEXT:    vpcmpeqw %ymm4, %ymm3, %ymm1
+; KNL-NEXT:    vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k2
 ; KNL-NEXT:    vpcmpeqw %ymm4, %ymm2, %ymm1
 ; KNL-NEXT:    vpmovsxwd %ymm1, %zmm1
-; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k1}
 ; KNL-NEXT:    kmovw %k0, %ecx
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k2}
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    shll $16, %eax
 ; KNL-NEXT:    orl %ecx, %eax
@@ -112,38 +112,38 @@
 ; KNL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
 ; KNL-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm5
-; KNL-NEXT:    vptestmd %zmm5, %zmm5, %k3
+; KNL-NEXT:    vptestmd %zmm5, %zmm5, %k1
 ; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k2
 ; KNL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm0
 ; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k3
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k4
 ; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
-; KNL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; KNL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z}
 ; KNL-NEXT:    valignd {{.*#+}} zmm1 = zmm0[15],zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; KNL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; KNL-NEXT:    valignd {{.*#+}} zmm0 = zmm1[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k2
-; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
-; KNL-NEXT:    valignd {{.*#+}} zmm0 = zmm0[15],zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k4
-; KNL-NEXT:    kshiftlw $1, %k3, %k3
-; KNL-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm0
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z}
+; KNL-NEXT:    valignd {{.*#+}} zmm0 = zmm5[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; KNL-NEXT:    vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z}
+; KNL-NEXT:    valignd {{.*#+}} zmm5 = zmm6[15],zmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; KNL-NEXT:    kshiftlw $1, %k1, %k3
+; KNL-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm3
+; KNL-NEXT:    vextracti128 $1, %ymm3, %xmm6
+; KNL-NEXT:    vpmovsxbd %xmm6, %zmm6
+; KNL-NEXT:    vptestmd %zmm6, %zmm6, %k1
+; KNL-NEXT:    vpmovsxbd %xmm3, %zmm3
+; KNL-NEXT:    vptestmd %zmm3, %zmm3, %k2
 ; KNL-NEXT:    vpcmpeqb %ymm4, %ymm2, %ymm2
 ; KNL-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; KNL-NEXT:    vpmovsxbd %xmm3, %zmm3
+; KNL-NEXT:    vptestmd %zmm3, %zmm3, %k4
 ; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
 ; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0 {%k3}
 ; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vptestmd %zmm3, %zmm3, %k0 {%k4}
+; KNL-NEXT:    vptestmd %zmm5, %zmm5, %k0 {%k4}
 ; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    shll $16, %ecx
 ; KNL-NEXT:    orl %eax, %ecx
@@ -182,8 +182,8 @@
 ; KNL-NEXT:    movb $-128, %al
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT:    vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT:    vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    # kill: def $al killed $al killed $eax
 ; KNL-NEXT:    vzeroupper
@@ -308,8 +308,8 @@
 ; KNL-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,2,3,4,5,6,7,15]
 ; KNL-NEXT:    vpermi2q %zmm0, %zmm2, %zmm3
-; KNL-NEXT:    vptestmq %zmm3, %zmm3, %k1
-; KNL-NEXT:    vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT:    vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT:    vptestmq %zmm3, %zmm3, %k0 {%k1}
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    # kill: def $al killed $al killed $eax
 ; KNL-NEXT:    vzeroupper
@@ -373,10 +373,10 @@
 ; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
 ; KNL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; KNL-NEXT:    valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0]
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k2
 ; KNL-NEXT:    kshiftrw $1, %k1, %k1
-; KNL-NEXT:    vpcmpeqw %ymm4, %ymm2, %ymm0
-; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vpcmpeqw %ymm4, %ymm2, %ymm1
+; KNL-NEXT:    vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k2
 ; KNL-NEXT:    vpcmpeqw %ymm4, %ymm3, %ymm1
 ; KNL-NEXT:    vpmovsxwd %ymm1, %zmm1
 ; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k1}
@@ -411,44 +411,44 @@
 ; KNL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm1
 ; KNL-NEXT:    vextracti128 $1, %ymm1, %xmm5
 ; KNL-NEXT:    vpmovsxbd %xmm5, %zmm5
-; KNL-NEXT:    vptestmd %zmm5, %zmm5, %k3
+; KNL-NEXT:    vptestmd %zmm5, %zmm5, %k1
 ; KNL-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm5
 ; KNL-NEXT:    vptestmd %zmm5, %zmm5, %k2
 ; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k3
 ; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k4
 ; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
-; KNL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; KNL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z}
 ; KNL-NEXT:    valignd {{.*#+}} zmm5 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0]
-; KNL-NEXT:    vptestmd %zmm5, %zmm5, %k1
-; KNL-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z}
-; KNL-NEXT:    valignd {{.*#+}} zmm0 = zmm5[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k2
-; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
-; KNL-NEXT:    valignd {{.*#+}} zmm0 = zmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k4
-; KNL-NEXT:    kshiftrw $1, %k3, %k3
-; KNL-NEXT:    vpcmpeqb %ymm4, %ymm2, %ymm0
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
+; KNL-NEXT:    valignd {{.*#+}} zmm0 = zmm6[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
+; KNL-NEXT:    vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z}
+; KNL-NEXT:    valignd {{.*#+}} zmm1 = zmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm6[0]
+; KNL-NEXT:    kshiftrw $1, %k1, %k3
+; KNL-NEXT:    vpcmpeqb %ymm4, %ymm2, %ymm2
+; KNL-NEXT:    vextracti128 $1, %ymm2, %xmm6
+; KNL-NEXT:    vpmovsxbd %xmm6, %zmm6
+; KNL-NEXT:    vptestmd %zmm6, %zmm6, %k1
+; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k2
 ; KNL-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm2
 ; KNL-NEXT:    vpmovsxbd %xmm2, %zmm3
+; KNL-NEXT:    vptestmd %zmm3, %zmm3, %k4
 ; KNL-NEXT:    vextracti128 $1, %ymm2, %xmm2
 ; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
 ; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0 {%k3}
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    shll $16, %eax
-; KNL-NEXT:    vptestmd %zmm3, %zmm3, %k0 {%k4}
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k4}
 ; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    orl %eax, %ecx
 ; KNL-NEXT:    shlq $32, %rcx
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k2}
 ; KNL-NEXT:    kmovw %k0, %edx
-; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT:    vptestmd %zmm5, %zmm5, %k0 {%k1}
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    shll $16, %eax
 ; KNL-NEXT:    orl %edx, %eax
@@ -480,8 +480,8 @@
 ; KNL-NEXT:    movb $-2, %al
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT:    vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT:    vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    # kill: def $al killed $al killed $eax
 ; KNL-NEXT:    vzeroupper
@@ -605,8 +605,8 @@
 ; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <8,u,1,2,3,u,5,6>
 ; KNL-NEXT:    vpermi2q %zmm0, %zmm2, %zmm3
 ; KNL-NEXT:    vpsllq $63, %zmm3, %zmm0
-; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT:    vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT:    vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    # kill: def $al killed $al killed $eax
 ; KNL-NEXT:    vzeroupper
@@ -636,8 +636,8 @@
 ; KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT:    valignq {{.*#+}} zmm0 = zmm0[7,0,1,2,3,4,5,6]
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
-; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT:    vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT:    vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    # kill: def $al killed $al killed $eax
 ; KNL-NEXT:    vzeroupper
@@ -669,8 +669,8 @@
 ; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <1,u,3,u,5,6,7,15>
 ; KNL-NEXT:    vpermi2q %zmm0, %zmm2, %zmm3
 ; KNL-NEXT:    vpsllq $63, %zmm3, %zmm0
-; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT:    vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT:    vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    # kill: def $al killed $al killed $eax
 ; KNL-NEXT:    vzeroupper
@@ -700,8 +700,8 @@
 ; KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,4,5,6,7,0,1]
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
-; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT:    vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT:    vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    # kill: def $al killed $al killed $eax
 ; KNL-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/movmsk-cmp.ll b/test/CodeGen/X86/movmsk-cmp.ll
index 01d5252..0686280 100644
--- a/test/CodeGen/X86/movmsk-cmp.ll
+++ b/test/CodeGen/X86/movmsk-cmp.ll
@@ -2087,8 +2087,7 @@
 ; KNL-LABEL: allones_v4i32_and1:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    andb $15, %al
 ; KNL-NEXT:    cmpb $15, %al
@@ -2131,8 +2130,7 @@
 ; KNL-LABEL: allzeros_v4i32_and1:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    testb $15, %al
 ; KNL-NEXT:    sete %al
@@ -2192,8 +2190,7 @@
 ; KNL-LABEL: allones_v8i32_and1:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
-; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    cmpb $-1, %al
 ; KNL-NEXT:    sete %al
@@ -2253,8 +2250,7 @@
 ; KNL-LABEL: allzeros_v8i32_and1:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
-; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    testb %al, %al
 ; KNL-NEXT:    sete %al
@@ -2573,8 +2569,7 @@
 ; KNL-LABEL: allones_v4i64_and1:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1]
-; KNL-NEXT:    vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT:    vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    andb $15, %al
 ; KNL-NEXT:    cmpb $15, %al
@@ -2640,8 +2635,7 @@
 ; KNL-LABEL: allzeros_v4i64_and1:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1]
-; KNL-NEXT:    vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT:    vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    testb $15, %al
 ; KNL-NEXT:    sete %al
@@ -3686,8 +3680,7 @@
 ; KNL-LABEL: allones_v4i32_and4:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4]
-; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    andb $15, %al
 ; KNL-NEXT:    cmpb $15, %al
@@ -3730,8 +3723,7 @@
 ; KNL-LABEL: allzeros_v4i32_and4:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4]
-; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    testb $15, %al
 ; KNL-NEXT:    sete %al
@@ -3791,8 +3783,7 @@
 ; KNL-LABEL: allones_v8i32_and4:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4]
-; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    cmpb $-1, %al
 ; KNL-NEXT:    sete %al
@@ -3852,8 +3843,7 @@
 ; KNL-LABEL: allzeros_v8i32_and4:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4]
-; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    testb %al, %al
 ; KNL-NEXT:    sete %al
@@ -4172,8 +4162,7 @@
 ; KNL-LABEL: allones_v4i64_and4:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4]
-; KNL-NEXT:    vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT:    vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    andb $15, %al
 ; KNL-NEXT:    cmpb $15, %al
@@ -4239,8 +4228,7 @@
 ; KNL-LABEL: allzeros_v4i64_and4:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4]
-; KNL-NEXT:    vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT:    vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    testb $15, %al
 ; KNL-NEXT:    sete %al
diff --git a/test/CodeGen/X86/prefer-avx256-mask-extend.ll b/test/CodeGen/X86/prefer-avx256-mask-extend.ll
index b4d452f..2ee68e3 100644
--- a/test/CodeGen/X86/prefer-avx256-mask-extend.ll
+++ b/test/CodeGen/X86/prefer-avx256-mask-extend.ll
@@ -7,8 +7,8 @@
 define <8 x i16> @testv8i1_sext_v8i16(<8 x i32>* %p) {
 ; AVX256-LABEL: testv8i1_sext_v8i16:
 ; AVX256:       # %bb.0:
-; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
+; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; AVX256-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX256-NEXT:    vpmovdw %ymm0, %xmm0
@@ -17,8 +17,8 @@
 ;
 ; AVX512VL-LABEL: testv8i1_sext_v8i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
@@ -42,9 +42,10 @@
 define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
 ; AVX256-LABEL: testv16i1_sext_v16i8:
 ; AVX256:       # %bb.0:
-; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256-NEXT:    vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
+; AVX256-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k2
 ; AVX256-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k2} {z}
 ; AVX256-NEXT:    vpmovdw %ymm1, %xmm1
@@ -56,9 +57,10 @@
 ;
 ; AVX512VL-LABEL: testv16i1_sext_v16i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpcmpeqd (%rdi), %ymm0, %k0
-; AVX512VL-NEXT:    vpcmpeqd (%rsi), %ymm0, %k1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; AVX512VL-NEXT:    kunpckbw %k0, %k1, %k1
 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
@@ -88,9 +90,10 @@
 define <16 x i16> @testv16i1_sext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
 ; AVX256-LABEL: testv16i1_sext_v16i16:
 ; AVX256:       # %bb.0:
-; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256-NEXT:    vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
+; AVX256-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k2
 ; AVX256-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
 ; AVX256-NEXT:    vpmovdw %ymm1, %xmm1
@@ -101,9 +104,10 @@
 ;
 ; AVX512VL-LABEL: testv16i1_sext_v16i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpcmpeqd (%rdi), %ymm0, %k0
-; AVX512VL-NEXT:    vpcmpeqd (%rsi), %ymm0, %k1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; AVX512VL-NEXT:    kunpckbw %k0, %k1, %k1
 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
@@ -131,8 +135,8 @@
 define <8 x i16> @testv8i1_zext_v8i16(<8 x i32>* %p) {
 ; AVX256-LABEL: testv8i1_zext_v8i16:
 ; AVX256:       # %bb.0:
-; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
+; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; AVX256-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX256-NEXT:    vpmovdw %ymm0, %xmm0
@@ -142,8 +146,8 @@
 ;
 ; AVX512VL-LABEL: testv8i1_zext_v8i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
@@ -168,9 +172,10 @@
 define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
 ; AVX256-LABEL: testv16i1_zext_v16i8:
 ; AVX256:       # %bb.0:
-; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256-NEXT:    vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
+; AVX256-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k2
 ; AVX256-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k2} {z}
 ; AVX256-NEXT:    vpmovdw %ymm1, %xmm1
@@ -184,9 +189,10 @@
 ;
 ; AVX512VL-LABEL: testv16i1_zext_v16i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpcmpeqd (%rdi), %ymm0, %k0
-; AVX512VL-NEXT:    vpcmpeqd (%rsi), %ymm0, %k1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; AVX512VL-NEXT:    kunpckbw %k0, %k1, %k1
 ; AVX512VL-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
 ; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
@@ -216,9 +222,10 @@
 define <16 x i16> @testv16i1_zext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
 ; AVX256-LABEL: testv16i1_zext_v16i16:
 ; AVX256:       # %bb.0:
-; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256-NEXT:    vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
+; AVX256-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k2
 ; AVX256-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
 ; AVX256-NEXT:    vpmovdw %ymm1, %xmm1
@@ -230,9 +237,10 @@
 ;
 ; AVX512VL-LABEL: testv16i1_zext_v16i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpcmpeqd (%rdi), %ymm0, %k0
-; AVX512VL-NEXT:    vpcmpeqd (%rsi), %ymm0, %k1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; AVX512VL-NEXT:    kunpckbw %k0, %k1, %k1
 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
diff --git a/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
index 1983b7a..7556498 100644
--- a/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
+++ b/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
@@ -11,9 +11,10 @@
 define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8 x i32>* %b) {
 ; AVX256VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
 ; AVX256VL:       # %bb.0:
-; AVX256VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX256VL-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256VL-NEXT:    vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX256VL-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX256VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
+; AVX256VL-NEXT:    vptestnmd %ymm1, %ymm1, %k2
 ; AVX256VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k2} {z}
 ; AVX256VL-NEXT:    vpmovdw %ymm1, %xmm1
@@ -42,9 +43,10 @@
 ;
 ; AVX512VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
-; AVX512VL-NEXT:    vpcmpeqd (%rsi), %ymm0, %k2
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
+; AVX512VL-NEXT:    vptestnmd %ymm1, %ymm1, %k2
 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
 ; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
@@ -57,9 +59,10 @@
 ;
 ; AVX256VLBW-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
 ; AVX256VLBW:       # %bb.0:
-; AVX256VLBW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX256VLBW-NEXT:    vpcmpeqd (%rdi), %ymm0, %k0
-; AVX256VLBW-NEXT:    vpcmpeqd (%rsi), %ymm0, %k1
+; AVX256VLBW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX256VLBW-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX256VLBW-NEXT:    vptestnmd %ymm0, %ymm0, %k0
+; AVX256VLBW-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; AVX256VLBW-NEXT:    vpmovm2w %k1, %ymm0
 ; AVX256VLBW-NEXT:    vpmovm2w %k0, %ymm1
 ; AVX256VLBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
@@ -71,9 +74,10 @@
 ;
 ; AVX512VLBW-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512VLBW-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
-; AVX512VLBW-NEXT:    vpcmpeqd (%rsi), %ymm0, %k2
+; AVX512VLBW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VLBW-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX512VLBW-NEXT:    vptestnmd %ymm0, %ymm0, %k1
+; AVX512VLBW-NEXT:    vptestnmd %ymm1, %ymm1, %k2
 ; AVX512VLBW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
 ; AVX512VLBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll
index 9bcb4fd..1cf20bf 100644
--- a/test/CodeGen/X86/setcc-lowering.ll
+++ b/test/CodeGen/X86/setcc-lowering.ll
@@ -24,8 +24,7 @@
 ; KNL-32-LABEL: pr25080:
 ; KNL-32:       # %bb.0: # %entry
 ; KNL-32-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [8388607,8388607,8388607,8388607,8388607,8388607,8388607,8388607]
-; KNL-32-NEXT:    vptestnmd %zmm1, %zmm0, %k0
+; KNL-32-NEXT:    vptestnmd {{\.LCPI.*}}{1to16}, %zmm0, %k0
 ; KNL-32-NEXT:    movb $15, %al
 ; KNL-32-NEXT:    kmovw %eax, %k1
 ; KNL-32-NEXT:    korw %k1, %k0, %k1
diff --git a/test/CodeGen/X86/vector-fshl-128.ll b/test/CodeGen/X86/vector-fshl-128.ll
index 4a51e33..e7e1876 100644
--- a/test/CodeGen/X86/vector-fshl-128.ll
+++ b/test/CodeGen/X86/vector-fshl-128.ll
@@ -108,16 +108,14 @@
 ;
 ; AVX512F-LABEL: var_funnnel_v2i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT:    vpsllvq %xmm4, %xmm0, %xmm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpsllvq %xmm2, %xmm0, %xmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512F-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512F-NEXT:    vzeroupper
@@ -125,30 +123,27 @@
 ;
 ; AVX512VL-LABEL: var_funnnel_v2i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT:    vpsllvq %xmm4, %xmm0, %xmm5
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpsllvq %xmm2, %xmm0, %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512VL-NEXT:    vptestnmq %xmm3, %xmm2, %k1
+; AVX512VL-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT:    vptestnmq %xmm2, %xmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1}
 ; AVX512VL-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: var_funnnel_v2i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT:    vpsllvq %xmm4, %xmm0, %xmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT:    vpsllvq %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512BW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -156,16 +151,14 @@
 ;
 ; AVX512VBMI2-LABEL: var_funnnel_v2i64:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT:    vpsllvq %xmm4, %xmm0, %xmm5
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VBMI2-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT:    vpsllvq %xmm2, %xmm0, %xmm3
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VBMI2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
-; AVX512VBMI2-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512VBMI2-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
@@ -173,14 +166,13 @@
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v2i64:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT:    vpsllvq %xmm4, %xmm0, %xmm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpsllvq %xmm2, %xmm0, %xmm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512VLBW-NEXT:    vptestnmq %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT:    vptestnmq %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VLBW-NEXT:    retq
@@ -358,16 +350,15 @@
 ;
 ; AVX512F-LABEL: var_funnnel_v4i32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT:    vpsllvd %xmm4, %xmm0, %xmm5
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpsllvd %xmm2, %xmm0, %xmm3
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512F-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512F-NEXT:    vzeroupper
@@ -375,30 +366,28 @@
 ;
 ; AVX512VL-LABEL: var_funnnel_v4i32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT:    vpsllvd %xmm4, %xmm0, %xmm5
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpsllvd %xmm2, %xmm0, %xmm3
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512VL-NEXT:    vptestnmd %xmm3, %xmm2, %k1
+; AVX512VL-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1}
 ; AVX512VL-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: var_funnnel_v4i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT:    vpsllvd %xmm4, %xmm0, %xmm5
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpsllvd %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512BW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -406,16 +395,15 @@
 ;
 ; AVX512VBMI2-LABEL: var_funnnel_v4i32:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT:    vpsllvd %xmm4, %xmm0, %xmm5
-; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VBMI2-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT:    vpsllvd %xmm2, %xmm0, %xmm3
+; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VBMI2-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
-; AVX512VBMI2-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512VBMI2-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
@@ -423,14 +411,13 @@
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v4i32:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT:    vpsllvd %xmm4, %xmm0, %xmm5
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpsllvd %xmm2, %xmm0, %xmm3
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512VLBW-NEXT:    vptestnmd %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VLBW-NEXT:    retq
@@ -719,17 +706,15 @@
 ;
 ; AVX512BW-LABEL: var_funnnel_v8i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -737,17 +722,15 @@
 ;
 ; AVX512VBMI2-LABEL: var_funnnel_v8i16:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT:    vpsllvw %zmm4, %zmm0, %zmm5
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT:    vpsllvw %zmm2, %zmm0, %zmm3
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512VBMI2-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
@@ -755,14 +738,13 @@
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v8i16:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT:    vpsllvw %xmm4, %xmm0, %xmm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpsllvw %xmm2, %xmm0, %xmm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpsrlvw %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512VLBW-NEXT:    vptestnmw %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu16 %xmm0, %xmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VLBW-NEXT:    retq
@@ -1040,21 +1022,19 @@
 ;
 ; AVX512BW-LABEL: var_funnnel_v16i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT:    vpand %xmm4, %xmm2, %xmm5
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT:    vpsllvw %zmm6, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm3, %zmm3
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpsubb %xmm5, %xmm6, %xmm5
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT:    vpsrlvw %zmm5, %zmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpor %ymm1, %ymm3, %ymm1
 ; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT:    vptestnmb %zmm4, %zmm2, %k1
+; AVX512BW-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -1062,21 +1042,19 @@
 ;
 ; AVX512VBMI2-LABEL: var_funnnel_v16i8:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT:    vpand %xmm4, %xmm2, %xmm5
-; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT:    vpsllvw %zmm6, %zmm3, %zmm3
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VBMI2-NEXT:    vpsllvw %zmm4, %zmm3, %zmm3
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT:    vpsubb %xmm5, %xmm6, %xmm5
-; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT:    vpsrlvw %zmm5, %zmm1, %zmm1
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VBMI2-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
 ; AVX512VBMI2-NEXT:    vpor %ymm1, %ymm3, %ymm1
 ; AVX512VBMI2-NEXT:    vpmovwb %zmm1, %ymm1
-; AVX512VBMI2-NEXT:    vptestnmb %zmm4, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
@@ -1084,19 +1062,18 @@
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v16i8:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLBW-NEXT:    vpsllvw %ymm5, %ymm6, %ymm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLBW-NEXT:    vpsllvw %ymm4, %ymm3, %ymm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512VLBW-NEXT:    vpsrlvw %ymm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm3, %ymm1
 ; AVX512VLBW-NEXT:    vpmovwb %ymm1, %xmm1
-; AVX512VLBW-NEXT:    vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vptestnmb %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VLBW-NEXT:    vzeroupper
@@ -1104,19 +1081,18 @@
 ;
 ; AVX512VLVBMI2-LABEL: var_funnnel_v16i8:
 ; AVX512VLVBMI2:       # %bb.0:
-; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLVBMI2-NEXT:    vpsllvw %ymm5, %ymm6, %ymm5
-; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLVBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLVBMI2-NEXT:    vpsllvw %ymm4, %ymm3, %ymm3
+; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512VLVBMI2-NEXT:    vpsrlvw %ymm4, %ymm1, %ymm1
-; AVX512VLVBMI2-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512VLVBMI2-NEXT:    vpor %ymm1, %ymm3, %ymm1
 ; AVX512VLVBMI2-NEXT:    vpmovwb %ymm1, %xmm1
-; AVX512VLVBMI2-NEXT:    vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLVBMI2-NEXT:    vptestnmb %xmm2, %xmm2, %k1
 ; AVX512VLVBMI2-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
 ; AVX512VLVBMI2-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VLVBMI2-NEXT:    vzeroupper
@@ -1276,14 +1252,13 @@
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512F-NEXT:    vpbroadcastq %xmm2, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT:    vpsllq %xmm4, %xmm0, %xmm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512F-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512F-NEXT:    vzeroupper
@@ -1292,14 +1267,13 @@
 ; AVX512VL-LABEL: splatvar_funnnel_v2i64:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpbroadcastq %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT:    vpsllq %xmm4, %xmm0, %xmm5
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512VL-NEXT:    vptestnmq %xmm3, %xmm2, %k1
+; AVX512VL-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT:    vptestnmq %xmm2, %xmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1}
 ; AVX512VL-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VL-NEXT:    retq
@@ -1308,14 +1282,13 @@
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-NEXT:    vpbroadcastq %xmm2, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT:    vpsllq %xmm4, %xmm0, %xmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512BW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -1325,14 +1298,13 @@
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VBMI2-NEXT:    vpbroadcastq %xmm2, %xmm2
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT:    vpsllq %xmm4, %xmm0, %xmm5
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VBMI2-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VBMI2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
-; AVX512VBMI2-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512VBMI2-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
@@ -1341,14 +1313,13 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %xmm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT:    vpsllq %xmm4, %xmm0, %xmm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512VLBW-NEXT:    vptestnmq %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT:    vptestnmq %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VLBW-NEXT:    retq
@@ -1497,15 +1468,15 @@
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512F-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT:    vpslld %xmm5, %xmm0, %xmm5
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT:    vpslld %xmm3, %xmm0, %xmm3
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512F-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512F-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512F-NEXT:    vzeroupper
@@ -1514,16 +1485,15 @@
 ; AVX512VL-LABEL: splatvar_funnnel_v4i32:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT:    vpslld %xmm5, %xmm0, %xmm5
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT:    vpslld %xmm3, %xmm0, %xmm3
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VL-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512VL-NEXT:    vptestnmd %xmm3, %xmm2, %k1
+; AVX512VL-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1}
 ; AVX512VL-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VL-NEXT:    retq
@@ -1533,15 +1503,15 @@
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT:    vpslld %xmm5, %xmm0, %xmm5
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT:    vpslld %xmm3, %xmm0, %xmm3
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512BW-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512BW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -1552,15 +1522,15 @@
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VBMI2-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VBMI2-NEXT:    vpslld %xmm5, %xmm0, %xmm5
-; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VBMI2-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VBMI2-NEXT:    vpslld %xmm3, %xmm0, %xmm3
+; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VBMI2-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VBMI2-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
-; AVX512VBMI2-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512VBMI2-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
@@ -1569,16 +1539,15 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT:    vpslld %xmm5, %xmm0, %xmm5
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT:    vpslld %xmm3, %xmm0, %xmm3
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VLBW-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512VLBW-NEXT:    vptestnmd %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VLBW-NEXT:    retq
@@ -1759,16 +1728,15 @@
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT:    vpsllw %xmm5, %xmm0, %xmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -1778,16 +1746,15 @@
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VBMI2-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VBMI2-NEXT:    vpsllw %xmm5, %xmm0, %xmm5
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VBMI2-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VBMI2-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
-; AVX512VBMI2-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512VBMI2-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
@@ -1796,16 +1763,15 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT:    vpsllw %xmm5, %xmm0, %xmm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT:    vpor %xmm1, %xmm5, %xmm1
-; AVX512VLBW-NEXT:    vptestnmw %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu16 %xmm0, %xmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VLBW-NEXT:    retq
@@ -2031,18 +1997,17 @@
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT:    vpand %xmm4, %xmm2, %xmm5
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT:    vpsllvw %zmm6, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm3, %zmm3
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpsubb %xmm5, %xmm6, %xmm5
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT:    vpsrlvw %zmm5, %zmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpor %ymm1, %ymm3, %ymm1
 ; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT:    vptestnmb %zmm4, %zmm2, %k1
+; AVX512BW-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -2053,18 +2018,17 @@
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VBMI2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT:    vpand %xmm4, %xmm2, %xmm5
-; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT:    vpsllvw %zmm6, %zmm3, %zmm3
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VBMI2-NEXT:    vpsllvw %zmm4, %zmm3, %zmm3
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT:    vpsubb %xmm5, %xmm6, %xmm5
-; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT:    vpsrlvw %zmm5, %zmm1, %zmm1
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VBMI2-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
 ; AVX512VBMI2-NEXT:    vpor %ymm1, %ymm3, %ymm1
 ; AVX512VBMI2-NEXT:    vpmovwb %zmm1, %ymm1
-; AVX512VBMI2-NEXT:    vptestnmb %zmm4, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
@@ -2073,19 +2037,18 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %xmm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLBW-NEXT:    vpsllvw %ymm5, %ymm6, %ymm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLBW-NEXT:    vpsllvw %ymm4, %ymm3, %ymm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512VLBW-NEXT:    vpsrlvw %ymm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm3, %ymm1
 ; AVX512VLBW-NEXT:    vpmovwb %ymm1, %xmm1
-; AVX512VLBW-NEXT:    vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vptestnmb %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VLBW-NEXT:    vzeroupper
@@ -2094,19 +2057,18 @@
 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLVBMI2-NEXT:    vpsllvw %ymm5, %ymm6, %ymm5
-; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLVBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLVBMI2-NEXT:    vpsllvw %ymm4, %ymm3, %ymm3
+; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512VLVBMI2-NEXT:    vpsrlvw %ymm4, %ymm1, %ymm1
-; AVX512VLVBMI2-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512VLVBMI2-NEXT:    vpor %ymm1, %ymm3, %ymm1
 ; AVX512VLVBMI2-NEXT:    vpmovwb %ymm1, %xmm1
-; AVX512VLVBMI2-NEXT:    vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLVBMI2-NEXT:    vptestnmb %xmm2, %xmm2, %k1
 ; AVX512VLVBMI2-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
 ; AVX512VLVBMI2-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VLVBMI2-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vector-fshl-256.ll b/test/CodeGen/X86/vector-fshl-256.ll
index 0133d9a..bf89b15 100644
--- a/test/CodeGen/X86/vector-fshl-256.ll
+++ b/test/CodeGen/X86/vector-fshl-256.ll
@@ -71,76 +71,71 @@
 ;
 ; AVX512F-LABEL: var_funnnel_v4i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT:    vpsllvq %ymm4, %ymm0, %ymm5
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512F-NEXT:    vpsubq %ymm4, %ymm6, %ymm4
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllvq %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512F-NEXT:    vpsubq %ymm2, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpsrlvq %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: var_funnnel_v4i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT:    vpsllvq %ymm4, %ymm0, %ymm5
-; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512VL-NEXT:    vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllvq %ymm2, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512VL-NEXT:    vpsubq %ymm2, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpsrlvq %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512VL-NEXT:    vptestnmq %ymm3, %ymm2, %k1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; AVX512VL-NEXT:    vmovdqa64 %ymm0, %ymm1 {%k1}
 ; AVX512VL-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: var_funnnel_v4i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT:    vpsllvq %ymm4, %ymm0, %ymm5
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512BW-NEXT:    vpsubq %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpsllvq %ymm2, %ymm0, %ymm3
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512BW-NEXT:    vpsubq %ymm2, %ymm4, %ymm4
 ; AVX512BW-NEXT:    vpsrlvq %ymm4, %ymm1, %ymm1
-; AVX512BW-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512BW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: var_funnnel_v4i64:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512VBMI2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT:    vpsllvq %ymm4, %ymm0, %ymm5
-; AVX512VBMI2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512VBMI2-NEXT:    vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpsllvq %ymm2, %ymm0, %ymm3
+; AVX512VBMI2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512VBMI2-NEXT:    vpsubq %ymm2, %ymm4, %ymm4
 ; AVX512VBMI2-NEXT:    vpsrlvq %ymm4, %ymm1, %ymm1
-; AVX512VBMI2-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512VBMI2-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v4i64:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT:    vpsllvq %ymm4, %ymm0, %ymm5
-; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512VLBW-NEXT:    vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpsllvq %ymm2, %ymm0, %ymm3
+; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512VLBW-NEXT:    vpsubq %ymm2, %ymm4, %ymm4
 ; AVX512VLBW-NEXT:    vpsrlvq %ymm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512VLBW-NEXT:    vptestnmq %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa64 %ymm0, %ymm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VLBW-NEXT:    retq
@@ -257,76 +252,71 @@
 ;
 ; AVX512F-LABEL: var_funnnel_v8i32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT:    vpsllvd %ymm4, %ymm0, %ymm5
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512F-NEXT:    vpsubd %ymm4, %ymm6, %ymm4
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllvd %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512F-NEXT:    vpsubd %ymm2, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: var_funnnel_v8i32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT:    vpsllvd %ymm4, %ymm0, %ymm5
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512VL-NEXT:    vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllvd %ymm2, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512VL-NEXT:    vpsubd %ymm2, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512VL-NEXT:    vptestnmd %ymm3, %ymm2, %k1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1}
 ; AVX512VL-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: var_funnnel_v8i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT:    vpsllvd %ymm4, %ymm0, %ymm5
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT:    vpsubd %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpsllvd %ymm2, %ymm0, %ymm3
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT:    vpsubd %ymm2, %ymm4, %ymm4
 ; AVX512BW-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512BW-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512BW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: var_funnnel_v8i32:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT:    vpsllvd %ymm4, %ymm0, %ymm5
-; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512VBMI2-NEXT:    vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpsllvd %ymm2, %ymm0, %ymm3
+; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512VBMI2-NEXT:    vpsubd %ymm2, %ymm4, %ymm4
 ; AVX512VBMI2-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512VBMI2-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512VBMI2-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v8i32:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT:    vpsllvd %ymm4, %ymm0, %ymm5
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512VLBW-NEXT:    vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpsllvd %ymm2, %ymm0, %ymm3
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512VLBW-NEXT:    vpsubd %ymm2, %ymm4, %ymm4
 ; AVX512VLBW-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512VLBW-NEXT:    vptestnmd %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VLBW-NEXT:    retq
@@ -514,48 +504,43 @@
 ;
 ; AVX512BW-LABEL: var_funnnel_v16i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT:    vpsubw %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %ymm2, %ymm4, %ymm4
 ; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: var_funnnel_v16i16:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; AVX512VBMI2-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT:    vpsllvw %zmm4, %zmm0, %zmm5
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT:    vpsubw %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpsllvw %zmm2, %zmm0, %zmm3
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT:    vpsubw %ymm2, %ymm4, %ymm4
 ; AVX512VBMI2-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512VBMI2-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v16i16:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT:    vpsllvw %ymm4, %ymm0, %ymm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT:    vpsubw %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpsllvw %ymm2, %ymm0, %ymm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %ymm2, %ymm4, %ymm4
 ; AVX512VLBW-NEXT:    vpsrlvw %ymm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512VLBW-NEXT:    vptestnmw %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu16 %ymm0, %ymm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VLBW-NEXT:    retq
@@ -792,80 +777,74 @@
 ;
 ; AVX512BW-LABEL: var_funnnel_v32i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT:    vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm5, %zmm1
+; AVX512BW-NEXT:    vporq %zmm1, %zmm3, %zmm1
 ; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: var_funnnel_v32i8:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VBMI2-NEXT:    vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VBMI2-NEXT:    vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512VBMI2-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT:    vporq %zmm1, %zmm5, %zmm1
+; AVX512VBMI2-NEXT:    vporq %zmm1, %zmm3, %zmm1
 ; AVX512VBMI2-NEXT:    vpmovwb %zmm1, %ymm1
-; AVX512VBMI2-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v32i8:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLBW-NEXT:    vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLBW-NEXT:    vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512VLBW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT:    vporq %zmm1, %zmm5, %zmm1
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm3, %zmm1
 ; AVX512VLBW-NEXT:    vpmovwb %zmm1, %ymm1
-; AVX512VLBW-NEXT:    vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vptestnmb %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: var_funnnel_v32i8:
 ; AVX512VLVBMI2:       # %bb.0:
-; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLVBMI2-NEXT:    vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLVBMI2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLVBMI2-NEXT:    vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512VLVBMI2-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VLVBMI2-NEXT:    vporq %zmm1, %zmm5, %zmm1
+; AVX512VLVBMI2-NEXT:    vporq %zmm1, %zmm3, %zmm1
 ; AVX512VLVBMI2-NEXT:    vpmovwb %zmm1, %ymm1
-; AVX512VLVBMI2-NEXT:    vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLVBMI2-NEXT:    vptestnmb %ymm2, %ymm2, %k1
 ; AVX512VLVBMI2-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1}
 ; AVX512VLVBMI2-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VLVBMI2-NEXT:    retq
@@ -970,13 +949,13 @@
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512F-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT:    vpsllq %xmm4, %ymm0, %ymm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllq %xmm2, %ymm0, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512F-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
@@ -984,14 +963,13 @@
 ; AVX512VL-LABEL: splatvar_funnnel_v4i64:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT:    vpsllq %xmm4, %ymm0, %ymm5
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllq %xmm2, %ymm0, %ymm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512VL-NEXT:    vptestnmq %ymm3, %ymm2, %k1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; AVX512VL-NEXT:    vmovdqa64 %ymm0, %ymm1 {%k1}
 ; AVX512VL-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VL-NEXT:    retq
@@ -1001,13 +979,13 @@
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512BW-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT:    vpsllq %xmm4, %ymm0, %ymm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpsllq %xmm2, %ymm0, %ymm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512BW-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512BW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512BW-NEXT:    retq
@@ -1017,13 +995,13 @@
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512VBMI2-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512VBMI2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT:    vpsllq %xmm4, %ymm0, %ymm5
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VBMI2-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpsllq %xmm2, %ymm0, %ymm3
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VBMI2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512VBMI2-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512VBMI2-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VBMI2-NEXT:    retq
@@ -1031,14 +1009,13 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT:    vpsllq %xmm4, %ymm0, %ymm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpsllq %xmm2, %ymm0, %ymm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512VLBW-NEXT:    vptestnmq %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa64 %ymm0, %ymm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VLBW-NEXT:    retq
@@ -1144,15 +1121,15 @@
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512F-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT:    vpslld %xmm5, %ymm0, %ymm5
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT:    vpslld %xmm3, %ymm0, %ymm3
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512F-NEXT:    vpsrld %xmm4, %ymm1, %ymm1
-; AVX512F-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
@@ -1160,16 +1137,15 @@
 ; AVX512VL-LABEL: splatvar_funnnel_v8i32:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpbroadcastd %xmm2, %ymm2
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT:    vpslld %xmm5, %ymm0, %ymm5
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT:    vpslld %xmm3, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VL-NEXT:    vpsrld %xmm4, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512VL-NEXT:    vptestnmd %ymm3, %ymm2, %k1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1}
 ; AVX512VL-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VL-NEXT:    retq
@@ -1179,15 +1155,15 @@
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512BW-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT:    vpslld %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT:    vpslld %xmm3, %ymm0, %ymm3
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512BW-NEXT:    vpsrld %xmm4, %ymm1, %ymm1
-; AVX512BW-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512BW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512BW-NEXT:    retq
@@ -1197,15 +1173,15 @@
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512VBMI2-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VBMI2-NEXT:    vpslld %xmm5, %ymm0, %ymm5
-; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VBMI2-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VBMI2-NEXT:    vpslld %xmm3, %ymm0, %ymm3
+; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VBMI2-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VBMI2-NEXT:    vpsrld %xmm4, %ymm1, %ymm1
-; AVX512VBMI2-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512VBMI2-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VBMI2-NEXT:    retq
@@ -1213,16 +1189,15 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i32:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %ymm2
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT:    vpslld %xmm5, %ymm0, %ymm5
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT:    vpslld %xmm3, %ymm0, %ymm3
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VLBW-NEXT:    vpsrld %xmm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512VLBW-NEXT:    vptestnmd %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VLBW-NEXT:    retq
@@ -1366,16 +1341,15 @@
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512BW-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT:    vpsllw %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512BW-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512BW-NEXT:    retq
@@ -1384,16 +1358,15 @@
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512VBMI2-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VBMI2-NEXT:    vpsllw %xmm5, %ymm0, %ymm5
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VBMI2-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VBMI2-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512VBMI2-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512VBMI2-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VBMI2-NEXT:    retq
@@ -1401,16 +1374,15 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT:    vpsllw %xmm5, %ymm0, %ymm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512VLBW-NEXT:    vptestnmw %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu16 %ymm0, %ymm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VLBW-NEXT:    retq
@@ -1589,19 +1561,18 @@
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512BW-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT:    vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm5, %zmm1
+; AVX512BW-NEXT:    vporq %zmm1, %zmm3, %zmm1
 ; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512BW-NEXT:    retq
@@ -1610,19 +1581,18 @@
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512VBMI2-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VBMI2-NEXT:    vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VBMI2-NEXT:    vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512VBMI2-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT:    vporq %zmm1, %zmm5, %zmm1
+; AVX512VBMI2-NEXT:    vporq %zmm1, %zmm3, %zmm1
 ; AVX512VBMI2-NEXT:    vpmovwb %zmm1, %ymm1
-; AVX512VBMI2-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VBMI2-NEXT:    retq
@@ -1630,19 +1600,18 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLBW-NEXT:    vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLBW-NEXT:    vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512VLBW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT:    vporq %zmm1, %zmm5, %zmm1
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm3, %zmm1
 ; AVX512VLBW-NEXT:    vpmovwb %zmm1, %ymm1
-; AVX512VLBW-NEXT:    vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vptestnmb %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VLBW-NEXT:    retq
@@ -1650,19 +1619,18 @@
 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLVBMI2-NEXT:    vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLVBMI2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLVBMI2-NEXT:    vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512VLVBMI2-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VLVBMI2-NEXT:    vporq %zmm1, %zmm5, %zmm1
+; AVX512VLVBMI2-NEXT:    vporq %zmm1, %zmm3, %zmm1
 ; AVX512VLVBMI2-NEXT:    vpmovwb %zmm1, %ymm1
-; AVX512VLVBMI2-NEXT:    vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLVBMI2-NEXT:    vptestnmb %ymm2, %ymm2, %k1
 ; AVX512VLVBMI2-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1}
 ; AVX512VLVBMI2-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512VLVBMI2-NEXT:    retq
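
Note on the shape of the checks above: the shift amount is now masked in
place (a vpand with a constant-pool operand, or an embedded broadcast such as
{1to8}/{1to16} on the 512-bit forms), and vptestnm then tests that masked
amount against itself rather than testing the original amount against the
mask constant -- (amt & 63) & (amt & 63) == 0 iff amt & 63 == 0, so the two
are equivalent, but one fewer vector register stays live and the register
numbers in the checks drop accordingly. As a rough sketch, the IR driving
these checks is a plain funnel-shift intrinsic call; this is an illustrative
stand-in (the name sketch_fshl_v8i64 is invented, and the real test bodies
are not part of this hunk), assuming the tests exercise llvm.fshl as the file
names suggest:

  define <8 x i64> @sketch_fshl_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
    ; fshl computes (x << (amt & 63)) | (y >> (64 - (amt & 63))) per lane and
    ; yields x outright when (amt & 63) == 0 -- the case the vptestnm compare
    ; plus masked vmovdqa64 in the checks implement.
    %r = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt)
    ret <8 x i64> %r
  }
  declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
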
diff --git a/test/CodeGen/X86/vector-fshl-512.ll b/test/CodeGen/X86/vector-fshl-512.ll
index cb29f33..ca7a26a 100644
--- a/test/CodeGen/X86/vector-fshl-512.ll
+++ b/test/CodeGen/X86/vector-fshl-512.ll
@@ -18,42 +18,39 @@
 define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
 ; AVX512F-LABEL: var_funnnel_v8i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512F-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT:    vpsllvq %zmm4, %zmm0, %zmm5
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512F-NEXT:    vpsubq %zmm4, %zmm6, %zmm4
+; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT:    vpsllvq %zmm2, %zmm0, %zmm3
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512F-NEXT:    vpsubq %zmm2, %zmm4, %zmm4
 ; AVX512F-NEXT:    vpsrlvq %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT:    vporq %zmm1, %zmm5, %zmm1
-; AVX512F-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: var_funnnel_v8i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VL-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT:    vpsllvq %zmm4, %zmm0, %zmm5
-; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512VL-NEXT:    vpsubq %zmm4, %zmm6, %zmm4
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpsllvq %zmm2, %zmm0, %zmm3
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512VL-NEXT:    vpsubq %zmm2, %zmm4, %zmm4
 ; AVX512VL-NEXT:    vpsrlvq %zmm4, %zmm1, %zmm1
-; AVX512VL-NEXT:    vporq %zmm1, %zmm5, %zmm1
-; AVX512VL-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512VL-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512VL-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: var_funnnel_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT:    vpsllvq %zmm4, %zmm0, %zmm5
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512BW-NEXT:    vpsubq %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsllvq %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512BW-NEXT:    vpsubq %zmm2, %zmm4, %zmm4
 ; AVX512BW-NEXT:    vpsrlvq %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
@@ -65,14 +62,13 @@
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v8i64:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT:    vpsllvq %zmm4, %zmm0, %zmm5
-; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512VLBW-NEXT:    vpsubq %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpsllvq %zmm2, %zmm0, %zmm3
+; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512VLBW-NEXT:    vpsubq %zmm2, %zmm4, %zmm4
 ; AVX512VLBW-NEXT:    vpsrlvq %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT:    vporq %zmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512VLBW-NEXT:    retq
@@ -88,42 +84,39 @@
 define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
 ; AVX512F-LABEL: var_funnnel_v16i32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512F-NEXT:    vpandd %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT:    vpsllvd %zmm4, %zmm0, %zmm5
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512F-NEXT:    vpsubd %zmm4, %zmm6, %zmm4
+; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512F-NEXT:    vpsllvd %zmm2, %zmm0, %zmm3
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512F-NEXT:    vpsubd %zmm2, %zmm4, %zmm4
 ; AVX512F-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT:    vpord %zmm1, %zmm5, %zmm1
-; AVX512F-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: var_funnnel_v16i32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT:    vpandd %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT:    vpsllvd %zmm4, %zmm0, %zmm5
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512VL-NEXT:    vpsubd %zmm4, %zmm6, %zmm4
+; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpsllvd %zmm2, %zmm0, %zmm3
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512VL-NEXT:    vpsubd %zmm2, %zmm4, %zmm4
 ; AVX512VL-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm1
-; AVX512VL-NEXT:    vpord %zmm1, %zmm5, %zmm1
-; AVX512VL-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512VL-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512VL-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: var_funnnel_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT:    vpandd %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT:    vpsllvd %zmm4, %zmm0, %zmm5
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT:    vpsubd %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsllvd %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT:    vpsubd %zmm2, %zmm4, %zmm4
 ; AVX512BW-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpord %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
@@ -135,14 +128,13 @@
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v16i32:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT:    vpandd %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT:    vpsllvd %zmm4, %zmm0, %zmm5
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512VLBW-NEXT:    vpsubd %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpsllvd %zmm2, %zmm0, %zmm3
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512VLBW-NEXT:    vpsubd %zmm2, %zmm4, %zmm4
 ; AVX512VLBW-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT:    vpord %zmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512VLBW-NEXT:    retq
@@ -220,14 +212,13 @@
 ;
 ; AVX512BW-LABEL: var_funnnel_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT:    vpsubw %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %zmm2, %zmm4, %zmm4
 ; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
@@ -239,14 +230,13 @@
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v32i16:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm5
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT:    vpsubw %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm3
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %zmm2, %zmm4, %zmm4
 ; AVX512VLBW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT:    vporq %zmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512VLBW-NEXT:    retq
@@ -398,160 +388,156 @@
 ;
 ; AVX512BW-LABEL: var_funnnel_v64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpsubb %zmm4, %zmm5, %zmm5
-; AVX512BW-NEXT:    vpsllw $5, %zmm5, %zmm5
-; AVX512BW-NEXT:    vpaddb %zmm5, %zmm5, %zmm6
-; AVX512BW-NEXT:    vpmovb2m %zmm6, %k1
-; AVX512BW-NEXT:    vpmovb2m %zmm5, %k2
-; AVX512BW-NEXT:    vpsrlw $4, %zmm1, %zmm5
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %zmm2, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpsllw $5, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpaddb %zmm3, %zmm3, %zmm4
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vpmovb2m %zmm3, %k2
+; AVX512BW-NEXT:    vpsrlw $4, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm1 {%k2}
+; AVX512BW-NEXT:    vpsrlw $2, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512BW-NEXT:    vpsrlw $1, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT:    vpsllw $5, %zmm2, %zmm4
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
+; AVX512BW-NEXT:    vpsllw $2, %zmm3, %zmm5
 ; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512BW-NEXT:    vmovdqu8 %zmm5, %zmm1 {%k2}
-; AVX512BW-NEXT:    vpsrlw $2, %zmm1, %zmm5
-; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512BW-NEXT:    vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512BW-NEXT:    vpsrlw $1, %zmm1, %zmm5
-; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512BW-NEXT:    vpaddb %zmm6, %zmm6, %zmm6
-; AVX512BW-NEXT:    vpmovb2m %zmm6, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512BW-NEXT:    vpsllw $5, %zmm4, %zmm4
-; AVX512BW-NEXT:    vpaddb %zmm4, %zmm4, %zmm5
-; AVX512BW-NEXT:    vpmovb2m %zmm5, %k1
-; AVX512BW-NEXT:    vpmovb2m %zmm4, %k2
-; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm4
-; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
-; AVX512BW-NEXT:    vpblendmb %zmm4, %zmm0, %zmm4 {%k2}
-; AVX512BW-NEXT:    vpsllw $2, %zmm4, %zmm6
-; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm6, %zmm6
-; AVX512BW-NEXT:    vmovdqu8 %zmm6, %zmm4 {%k1}
-; AVX512BW-NEXT:    vpaddb %zmm5, %zmm5, %zmm5
-; AVX512BW-NEXT:    vpmovb2m %zmm5, %k1
-; AVX512BW-NEXT:    vpaddb %zmm4, %zmm4, %zmm4 {%k1}
-; AVX512BW-NEXT:    vporq %zmm1, %zmm4, %zmm1
-; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512BW-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512BW-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: var_funnnel_v64i8:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT:    vpsubb %zmm4, %zmm5, %zmm5
-; AVX512VBMI2-NEXT:    vpsllw $5, %zmm5, %zmm5
-; AVX512VBMI2-NEXT:    vpaddb %zmm5, %zmm5, %zmm6
-; AVX512VBMI2-NEXT:    vpmovb2m %zmm6, %k1
-; AVX512VBMI2-NEXT:    vpmovb2m %zmm5, %k2
-; AVX512VBMI2-NEXT:    vpsrlw $4, %zmm1, %zmm5
+; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT:    vpsubb %zmm2, %zmm3, %zmm3
+; AVX512VBMI2-NEXT:    vpsllw $5, %zmm3, %zmm3
+; AVX512VBMI2-NEXT:    vpaddb %zmm3, %zmm3, %zmm4
+; AVX512VBMI2-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT:    vpmovb2m %zmm3, %k2
+; AVX512VBMI2-NEXT:    vpsrlw $4, %zmm1, %zmm3
+; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT:    vmovdqu8 %zmm3, %zmm1 {%k2}
+; AVX512VBMI2-NEXT:    vpsrlw $2, %zmm1, %zmm3
+; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT:    vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512VBMI2-NEXT:    vpsrlw $1, %zmm1, %zmm3
+; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VBMI2-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT:    vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512VBMI2-NEXT:    vpsllw $4, %zmm0, %zmm3
+; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT:    vpsllw $5, %zmm2, %zmm4
+; AVX512VBMI2-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT:    vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
+; AVX512VBMI2-NEXT:    vpsllw $2, %zmm3, %zmm5
 ; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VBMI2-NEXT:    vmovdqu8 %zmm5, %zmm1 {%k2}
-; AVX512VBMI2-NEXT:    vpsrlw $2, %zmm1, %zmm5
-; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VBMI2-NEXT:    vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512VBMI2-NEXT:    vpsrlw $1, %zmm1, %zmm5
-; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VBMI2-NEXT:    vpaddb %zmm6, %zmm6, %zmm6
-; AVX512VBMI2-NEXT:    vpmovb2m %zmm6, %k1
-; AVX512VBMI2-NEXT:    vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512VBMI2-NEXT:    vpsllw $5, %zmm4, %zmm4
-; AVX512VBMI2-NEXT:    vpaddb %zmm4, %zmm4, %zmm5
-; AVX512VBMI2-NEXT:    vpmovb2m %zmm5, %k1
-; AVX512VBMI2-NEXT:    vpmovb2m %zmm4, %k2
-; AVX512VBMI2-NEXT:    vpsllw $4, %zmm0, %zmm4
-; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
-; AVX512VBMI2-NEXT:    vpblendmb %zmm4, %zmm0, %zmm4 {%k2}
-; AVX512VBMI2-NEXT:    vpsllw $2, %zmm4, %zmm6
-; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm6, %zmm6
-; AVX512VBMI2-NEXT:    vmovdqu8 %zmm6, %zmm4 {%k1}
-; AVX512VBMI2-NEXT:    vpaddb %zmm5, %zmm5, %zmm5
-; AVX512VBMI2-NEXT:    vpmovb2m %zmm5, %k1
-; AVX512VBMI2-NEXT:    vpaddb %zmm4, %zmm4, %zmm4 {%k1}
-; AVX512VBMI2-NEXT:    vporq %zmm1, %zmm4, %zmm1
-; AVX512VBMI2-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VBMI2-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT:    vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VBMI2-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VBMI2-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT:    vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VBMI2-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VBMI2-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v64i8:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT:    vpsubb %zmm4, %zmm5, %zmm5
-; AVX512VLBW-NEXT:    vpsllw $5, %zmm5, %zmm5
-; AVX512VLBW-NEXT:    vpaddb %zmm5, %zmm5, %zmm6
-; AVX512VLBW-NEXT:    vpmovb2m %zmm6, %k1
-; AVX512VLBW-NEXT:    vpmovb2m %zmm5, %k2
-; AVX512VLBW-NEXT:    vpsrlw $4, %zmm1, %zmm5
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %zmm2, %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vpsllw $5, %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vpaddb %zmm3, %zmm3, %zmm4
+; AVX512VLBW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT:    vpmovb2m %zmm3, %k2
+; AVX512VLBW-NEXT:    vpsrlw $4, %zmm1, %zmm3
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm3, %zmm1 {%k2}
+; AVX512VLBW-NEXT:    vpsrlw $2, %zmm1, %zmm3
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512VLBW-NEXT:    vpsrlw $1, %zmm1, %zmm3
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLBW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm3
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vpsllw $5, %zmm2, %zmm4
+; AVX512VLBW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT:    vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
+; AVX512VLBW-NEXT:    vpsllw $2, %zmm3, %zmm5
 ; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLBW-NEXT:    vmovdqu8 %zmm5, %zmm1 {%k2}
-; AVX512VLBW-NEXT:    vpsrlw $2, %zmm1, %zmm5
-; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLBW-NEXT:    vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512VLBW-NEXT:    vpsrlw $1, %zmm1, %zmm5
-; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLBW-NEXT:    vpaddb %zmm6, %zmm6, %zmm6
-; AVX512VLBW-NEXT:    vpmovb2m %zmm6, %k1
-; AVX512VLBW-NEXT:    vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512VLBW-NEXT:    vpsllw $5, %zmm4, %zmm4
-; AVX512VLBW-NEXT:    vpaddb %zmm4, %zmm4, %zmm5
-; AVX512VLBW-NEXT:    vpmovb2m %zmm5, %k1
-; AVX512VLBW-NEXT:    vpmovb2m %zmm4, %k2
-; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm4
-; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
-; AVX512VLBW-NEXT:    vpblendmb %zmm4, %zmm0, %zmm4 {%k2}
-; AVX512VLBW-NEXT:    vpsllw $2, %zmm4, %zmm6
-; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm6, %zmm6
-; AVX512VLBW-NEXT:    vmovdqu8 %zmm6, %zmm4 {%k1}
-; AVX512VLBW-NEXT:    vpaddb %zmm5, %zmm5, %zmm5
-; AVX512VLBW-NEXT:    vpmovb2m %zmm5, %k1
-; AVX512VLBW-NEXT:    vpaddb %zmm4, %zmm4, %zmm4 {%k1}
-; AVX512VLBW-NEXT:    vporq %zmm1, %zmm4, %zmm1
-; AVX512VLBW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLBW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLBW-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLBW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT:    vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
 ; AVX512VLVBMI2:       # %bb.0:
-; AVX512VLVBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLVBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT:    vpsubb %zmm4, %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT:    vpsllw $5, %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT:    vpaddb %zmm5, %zmm5, %zmm6
-; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm6, %k1
-; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm5, %k2
-; AVX512VLVBMI2-NEXT:    vpsrlw $4, %zmm1, %zmm5
+; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT:    vpsubb %zmm2, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT:    vpsllw $5, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT:    vpaddb %zmm3, %zmm3, %zmm4
+; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm3, %k2
+; AVX512VLVBMI2-NEXT:    vpsrlw $4, %zmm1, %zmm3
+; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT:    vmovdqu8 %zmm3, %zmm1 {%k2}
+; AVX512VLVBMI2-NEXT:    vpsrlw $2, %zmm1, %zmm3
+; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT:    vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512VLVBMI2-NEXT:    vpsrlw $1, %zmm1, %zmm3
+; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT:    vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512VLVBMI2-NEXT:    vpsllw $4, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT:    vpsllw $5, %zmm2, %zmm4
+; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT:    vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT:    vpsllw $2, %zmm3, %zmm5
 ; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT:    vmovdqu8 %zmm5, %zmm1 {%k2}
-; AVX512VLVBMI2-NEXT:    vpsrlw $2, %zmm1, %zmm5
-; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT:    vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512VLVBMI2-NEXT:    vpsrlw $1, %zmm1, %zmm5
-; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT:    vpaddb %zmm6, %zmm6, %zmm6
-; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm6, %k1
-; AVX512VLVBMI2-NEXT:    vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512VLVBMI2-NEXT:    vpsllw $5, %zmm4, %zmm4
-; AVX512VLVBMI2-NEXT:    vpaddb %zmm4, %zmm4, %zmm5
-; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm5, %k1
-; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm4, %k2
-; AVX512VLVBMI2-NEXT:    vpsllw $4, %zmm0, %zmm4
-; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
-; AVX512VLVBMI2-NEXT:    vpblendmb %zmm4, %zmm0, %zmm4 {%k2}
-; AVX512VLVBMI2-NEXT:    vpsllw $2, %zmm4, %zmm6
-; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm6, %zmm6
-; AVX512VLVBMI2-NEXT:    vmovdqu8 %zmm6, %zmm4 {%k1}
-; AVX512VLVBMI2-NEXT:    vpaddb %zmm5, %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm5, %k1
-; AVX512VLVBMI2-NEXT:    vpaddb %zmm4, %zmm4, %zmm4 {%k1}
-; AVX512VLVBMI2-NEXT:    vporq %zmm1, %zmm4, %zmm1
-; AVX512VLVBMI2-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLVBMI2-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT:    vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT:    vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VLVBMI2-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VLVBMI2-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512VLVBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512VLVBMI2-NEXT:    retq
@@ -567,14 +553,13 @@
 ; AVX512F-LABEL: splatvar_funnnel_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastq %xmm2, %zmm2
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512F-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT:    vpsllq %xmm4, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT:    vpsllq %xmm2, %zmm0, %zmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpsrlq %xmm4, %zmm1, %zmm1
-; AVX512F-NEXT:    vporq %zmm1, %zmm5, %zmm1
-; AVX512F-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
@@ -582,14 +567,13 @@
 ; AVX512VL-LABEL: splatvar_funnnel_v8i64:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpbroadcastq %xmm2, %zmm2
-; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VL-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT:    vpsllq %xmm4, %zmm0, %zmm5
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpsllq %xmm2, %zmm0, %zmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpsrlq %xmm4, %zmm1, %zmm1
-; AVX512VL-NEXT:    vporq %zmm1, %zmm5, %zmm1
-; AVX512VL-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512VL-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512VL-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512VL-NEXT:    retq
@@ -597,14 +581,13 @@
 ; AVX512BW-LABEL: splatvar_funnnel_v8i64:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpbroadcastq %xmm2, %zmm2
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT:    vpsllq %xmm4, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsllq %xmm2, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpsrlq %xmm4, %zmm1, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
@@ -618,14 +601,13 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i64:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %zmm2
-; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT:    vpsllq %xmm4, %zmm0, %zmm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpsllq %xmm2, %zmm0, %zmm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpsrlq %xmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT:    vporq %zmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512VLBW-NEXT:    retq
@@ -644,16 +626,15 @@
 ; AVX512F-LABEL: splatvar_funnnel_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastd %xmm2, %zmm2
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512F-NEXT:    vpandd %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT:    vpslld %xmm5, %zmm0, %zmm5
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT:    vpslld %xmm3, %zmm0, %zmm3
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512F-NEXT:    vpsrld %xmm4, %zmm1, %zmm1
-; AVX512F-NEXT:    vpord %zmm1, %zmm5, %zmm1
-; AVX512F-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
@@ -661,16 +642,15 @@
 ; AVX512VL-LABEL: splatvar_funnnel_v16i32:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpbroadcastd %xmm2, %zmm2
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT:    vpandd %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT:    vpslld %xmm5, %zmm0, %zmm5
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT:    vpslld %xmm3, %zmm0, %zmm3
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VL-NEXT:    vpsrld %xmm4, %zmm1, %zmm1
-; AVX512VL-NEXT:    vpord %zmm1, %zmm5, %zmm1
-; AVX512VL-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512VL-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512VL-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512VL-NEXT:    retq
@@ -678,16 +658,15 @@
 ; AVX512BW-LABEL: splatvar_funnnel_v16i32:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpbroadcastd %xmm2, %zmm2
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT:    vpandd %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT:    vpslld %xmm5, %zmm0, %zmm5
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT:    vpslld %xmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512BW-NEXT:    vpsrld %xmm4, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpord %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
@@ -701,16 +680,15 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i32:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %zmm2
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT:    vpandd %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT:    vpslld %xmm5, %zmm0, %zmm5
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT:    vpslld %xmm3, %zmm0, %zmm3
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VLBW-NEXT:    vpsrld %xmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT:    vpord %zmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512VLBW-NEXT:    retq
@@ -769,16 +747,15 @@
 ; AVX512BW-LABEL: splatvar_funnnel_v32i16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpbroadcastw %xmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT:    vpsllw %xmm5, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsllw %xmm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
@@ -792,16 +769,15 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %zmm2
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT:    vpsllw %xmm5, %zmm0, %zmm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsllw %xmm3, %zmm0, %zmm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT:    vporq %zmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512VLBW-NEXT:    retq
@@ -880,24 +856,23 @@
 ; AVX512BW-LABEL: splatvar_funnnel_v64i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpbroadcastb %xmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT:    vpsllw %xmm5, %zmm0, %zmm6
-; AVX512BW-NEXT:    vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512BW-NEXT:    vpsllw %xmm5, %zmm7, %zmm5
-; AVX512BW-NEXT:    vpbroadcastb %xmm5, %zmm5
-; AVX512BW-NEXT:    vpandq %zmm5, %zmm6, %zmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpsllw %xmm3, %zmm0, %zmm4
+; AVX512BW-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512BW-NEXT:    vpsllw %xmm3, %zmm5, %zmm3
+; AVX512BW-NEXT:    vpbroadcastb %xmm3, %zmm3
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpsrlw %xmm4, %zmm7, %zmm4
+; AVX512BW-NEXT:    vpsrlw %xmm4, %zmm5, %zmm4
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm4, %zmm4
 ; AVX512BW-NEXT:    vpbroadcastb %xmm4, %zmm4
 ; AVX512BW-NEXT:    vpandq %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
@@ -905,24 +880,23 @@
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpbroadcastb %xmm2, %zmm2
-; AVX512VBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT:    vpsllw %xmm5, %zmm0, %zmm6
-; AVX512VBMI2-NEXT:    vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512VBMI2-NEXT:    vpsllw %xmm5, %zmm7, %zmm5
-; AVX512VBMI2-NEXT:    vpbroadcastb %xmm5, %zmm5
-; AVX512VBMI2-NEXT:    vpandq %zmm5, %zmm6, %zmm5
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT:    vpsllw %xmm3, %zmm0, %zmm4
+; AVX512VBMI2-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VBMI2-NEXT:    vpsllw %xmm3, %zmm5, %zmm3
+; AVX512VBMI2-NEXT:    vpbroadcastb %xmm3, %zmm3
+; AVX512VBMI2-NEXT:    vpandq %zmm3, %zmm4, %zmm3
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VBMI2-NEXT:    vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT:    vpsrlw %xmm4, %zmm7, %zmm4
+; AVX512VBMI2-NEXT:    vpsrlw %xmm4, %zmm5, %zmm4
 ; AVX512VBMI2-NEXT:    vpsrlw $8, %zmm4, %zmm4
 ; AVX512VBMI2-NEXT:    vpbroadcastb %xmm4, %zmm4
 ; AVX512VBMI2-NEXT:    vpandq %zmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT:    vporq %zmm1, %zmm5, %zmm1
-; AVX512VBMI2-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VBMI2-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512VBMI2-NEXT:    retq
@@ -930,24 +904,23 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %zmm2
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT:    vpsllw %xmm5, %zmm0, %zmm6
-; AVX512VLBW-NEXT:    vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512VLBW-NEXT:    vpsllw %xmm5, %zmm7, %zmm5
-; AVX512VLBW-NEXT:    vpbroadcastb %xmm5, %zmm5
-; AVX512VLBW-NEXT:    vpandq %zmm5, %zmm6, %zmm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT:    vpsllw %xmm3, %zmm0, %zmm4
+; AVX512VLBW-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VLBW-NEXT:    vpsllw %xmm3, %zmm5, %zmm3
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm3, %zmm3
+; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm4, %zmm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT:    vpsrlw %xmm4, %zmm7, %zmm4
+; AVX512VLBW-NEXT:    vpsrlw %xmm4, %zmm5, %zmm4
 ; AVX512VLBW-NEXT:    vpsrlw $8, %zmm4, %zmm4
 ; AVX512VLBW-NEXT:    vpbroadcastb %xmm4, %zmm4
 ; AVX512VLBW-NEXT:    vpandq %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT:    vporq %zmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512VLBW-NEXT:    retq
@@ -955,24 +928,23 @@
 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm2, %zmm2
-; AVX512VLVBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLVBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT:    vpsllw %xmm5, %zmm0, %zmm6
-; AVX512VLVBMI2-NEXT:    vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512VLVBMI2-NEXT:    vpsllw %xmm5, %zmm7, %zmm5
-; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm5, %zmm5
-; AVX512VLVBMI2-NEXT:    vpandq %zmm5, %zmm6, %zmm5
-; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT:    vpsllw %xmm3, %zmm0, %zmm4
+; AVX512VLVBMI2-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VLVBMI2-NEXT:    vpsllw %xmm3, %zmm5, %zmm3
+; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm3, %zmm3
+; AVX512VLVBMI2-NEXT:    vpandq %zmm3, %zmm4, %zmm3
+; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
 ; AVX512VLVBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLVBMI2-NEXT:    vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VLVBMI2-NEXT:    vpsrlw %xmm4, %zmm7, %zmm4
+; AVX512VLVBMI2-NEXT:    vpsrlw %xmm4, %zmm5, %zmm4
 ; AVX512VLVBMI2-NEXT:    vpsrlw $8, %zmm4, %zmm4
 ; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm4, %zmm4
 ; AVX512VLVBMI2-NEXT:    vpandq %zmm4, %zmm1, %zmm1
-; AVX512VLVBMI2-NEXT:    vporq %zmm1, %zmm5, %zmm1
-; AVX512VLVBMI2-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLVBMI2-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VLVBMI2-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VLVBMI2-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512VLVBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512VLVBMI2-NEXT:    retq
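
In the 128-bit file that follows, the VL targets (AVX512VL, AVX512VLBW) can
run vptestnm directly on xmm registers, while the non-VL targets still widen
to zmm for the compare and the masked move, hence the surviving
"# kill: def $xmm1 killed $xmm1 def $zmm1" annotations and the trailing
vzeroupper. The corresponding annotation for xmm2 disappears, apparently
because the value being tested is now the vpand result rather than the raw
incoming argument. A hedged stand-in for the IR under test follows
(sketch_fshr_v2i64 is an invented name, assuming the file exercises
llvm.fshr):

  define <2 x i64> @sketch_fshr_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
    ; fshr yields the y operand when (amt & 63) == 0, which the
    ; vptestnm + masked vmovdqa64 pair in the checks implement.
    %r = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
    ret <2 x i64> %r
  }
  declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
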
diff --git a/test/CodeGen/X86/vector-fshr-128.ll b/test/CodeGen/X86/vector-fshr-128.ll
index 1f70fc9..7c31f9c 100644
--- a/test/CodeGen/X86/vector-fshr-128.ll
+++ b/test/CodeGen/X86/vector-fshr-128.ll
@@ -110,16 +110,14 @@
 ;
 ; AVX512F-LABEL: var_funnnel_v2i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpsrlvq %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpsllvq %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512F-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512F-NEXT:    vzeroupper
@@ -127,29 +125,26 @@
 ;
 ; AVX512VL-LABEL: var_funnnel_v2i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm5
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpsrlvq %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpsllvq %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512VL-NEXT:    vptestnmq %xmm3, %xmm2, %k1
+; AVX512VL-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vptestnmq %xmm2, %xmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1}
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: var_funnnel_v2i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT:    vpsrlvq %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpsllvq %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512BW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -157,16 +152,14 @@
 ;
 ; AVX512VBMI2-LABEL: var_funnnel_v2i64:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm5
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VBMI2-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT:    vpsrlvq %xmm2, %xmm1, %xmm3
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VBMI2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpsllvq %xmm4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512VBMI2-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VBMI2-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
@@ -174,14 +167,13 @@
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v2i64:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpsrlvq %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpsllvq %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512VLBW-NEXT:    vptestnmq %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vptestnmq %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -363,16 +355,15 @@
 ;
 ; AVX512F-LABEL: var_funnnel_v4i32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm5
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpsrlvd %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpsllvd %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512F-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512F-NEXT:    vzeroupper
@@ -380,29 +371,27 @@
 ;
 ; AVX512VL-LABEL: var_funnnel_v4i32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm5
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpsrlvd %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpsllvd %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512VL-NEXT:    vptestnmd %xmm3, %xmm2, %k1
+; AVX512VL-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: var_funnnel_v4i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm5
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpsrlvd %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpsllvd %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512BW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -410,16 +399,15 @@
 ;
 ; AVX512VBMI2-LABEL: var_funnnel_v4i32:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm5
-; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VBMI2-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT:    vpsrlvd %xmm2, %xmm1, %xmm3
+; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VBMI2-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpsllvd %xmm4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512VBMI2-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VBMI2-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
@@ -427,14 +415,13 @@
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v4i32:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm5
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpsrlvd %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpsllvd %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512VLBW-NEXT:    vptestnmd %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -727,17 +714,15 @@
 ;
 ; AVX512BW-LABEL: var_funnnel_v8i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -745,17 +730,15 @@
 ;
 ; AVX512VBMI2-LABEL: var_funnnel_v8i16:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm5
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT:    vpsrlvw %zmm2, %zmm1, %zmm3
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512VBMI2-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VBMI2-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
@@ -763,14 +746,13 @@
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v8i16:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT:    vpsrlvw %xmm4, %xmm1, %xmm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpsrlvw %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpsllvw %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512VLBW-NEXT:    vptestnmw %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -1056,21 +1038,19 @@
 ;
 ; AVX512BW-LABEL: var_funnnel_v16i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT:    vpand %xmm4, %xmm2, %xmm5
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT:    vpsrlvw %zmm6, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm3, %zmm3
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpsubb %xmm5, %xmm6, %xmm5
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT:    vpsllvw %zmm5, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpor %ymm3, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vptestnmb %zmm4, %zmm2, %k1
+; AVX512BW-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -1078,21 +1058,19 @@
 ;
 ; AVX512VBMI2-LABEL: var_funnnel_v16i8:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT:    vpand %xmm4, %xmm2, %xmm5
-; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT:    vpsrlvw %zmm6, %zmm3, %zmm3
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VBMI2-NEXT:    vpsrlvw %zmm4, %zmm3, %zmm3
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT:    vpsubb %xmm5, %xmm6, %xmm5
-; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT:    vpsllvw %zmm5, %zmm0, %zmm0
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VBMI2-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
 ; AVX512VBMI2-NEXT:    vpor %ymm3, %ymm0, %ymm0
 ; AVX512VBMI2-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512VBMI2-NEXT:    vptestnmb %zmm4, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
@@ -1100,38 +1078,36 @@
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v16i8:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLBW-NEXT:    vpsrlvw %ymm5, %ymm6, %ymm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLBW-NEXT:    vpsrlvw %ymm4, %ymm3, %ymm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512VLBW-NEXT:    vpsllvw %ymm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm3, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
-; AVX512VLBW-NEXT:    vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vptestnmb %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
 ; AVX512VLBW-NEXT:    vzeroupper
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: var_funnnel_v16i8:
 ; AVX512VLVBMI2:       # %bb.0:
-; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLVBMI2-NEXT:    vpsrlvw %ymm5, %ymm6, %ymm5
-; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLVBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLVBMI2-NEXT:    vpsrlvw %ymm4, %ymm3, %ymm3
+; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512VLVBMI2-NEXT:    vpsllvw %ymm4, %ymm0, %ymm0
-; AVX512VLVBMI2-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT:    vpor %ymm3, %ymm0, %ymm0
 ; AVX512VLVBMI2-NEXT:    vpmovwb %ymm0, %xmm0
-; AVX512VLVBMI2-NEXT:    vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLVBMI2-NEXT:    vptestnmb %xmm2, %xmm2, %k1
 ; AVX512VLVBMI2-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
 ; AVX512VLVBMI2-NEXT:    vzeroupper
 ; AVX512VLVBMI2-NEXT:    retq
@@ -1294,14 +1270,13 @@
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT:    vpbroadcastq %xmm2, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT:    vpsrlq %xmm4, %xmm1, %xmm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpsrlq %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512F-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512F-NEXT:    vzeroupper
@@ -1310,14 +1285,13 @@
 ; AVX512VL-LABEL: splatvar_funnnel_v2i64:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpbroadcastq %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT:    vpsrlq %xmm4, %xmm1, %xmm5
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpsrlq %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512VL-NEXT:    vptestnmq %xmm3, %xmm2, %k1
+; AVX512VL-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vptestnmq %xmm2, %xmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1}
 ; AVX512VL-NEXT:    retq
 ;
@@ -1325,14 +1299,13 @@
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512BW-NEXT:    vpbroadcastq %xmm2, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT:    vpsrlq %xmm4, %xmm1, %xmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT:    vpsrlq %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512BW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -1342,14 +1315,13 @@
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512VBMI2-NEXT:    vpbroadcastq %xmm2, %xmm2
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT:    vpsrlq %xmm4, %xmm1, %xmm5
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VBMI2-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT:    vpsrlq %xmm2, %xmm1, %xmm3
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VBMI2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512VBMI2-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VBMI2-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
@@ -1358,14 +1330,13 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %xmm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT:    vpsrlq %xmm4, %xmm1, %xmm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpsrlq %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512VLBW-NEXT:    vptestnmq %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vptestnmq %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -1516,15 +1487,15 @@
 ; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT:    vpsrld %xmm5, %xmm1, %xmm5
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT:    vpsrld %xmm3, %xmm1, %xmm3
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512F-NEXT:    vpslld %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512F-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512F-NEXT:    vzeroupper
@@ -1533,16 +1504,15 @@
 ; AVX512VL-LABEL: splatvar_funnnel_v4i32:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT:    vpsrld %xmm5, %xmm1, %xmm5
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT:    vpsrld %xmm3, %xmm1, %xmm3
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VL-NEXT:    vpslld %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512VL-NEXT:    vptestnmd %xmm3, %xmm2, %k1
+; AVX512VL-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
 ; AVX512VL-NEXT:    retq
 ;
@@ -1551,15 +1521,15 @@
 ; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512BW-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT:    vpsrld %xmm5, %xmm1, %xmm5
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT:    vpsrld %xmm3, %xmm1, %xmm3
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512BW-NEXT:    vpslld %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512BW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -1570,15 +1540,15 @@
 ; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512VBMI2-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VBMI2-NEXT:    vpsrld %xmm5, %xmm1, %xmm5
-; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VBMI2-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VBMI2-NEXT:    vpsrld %xmm3, %xmm1, %xmm3
+; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VBMI2-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VBMI2-NEXT:    vpslld %xmm4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512VBMI2-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VBMI2-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
@@ -1587,16 +1557,15 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT:    vpsrld %xmm5, %xmm1, %xmm5
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT:    vpsrld %xmm3, %xmm1, %xmm3
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VLBW-NEXT:    vpslld %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512VLBW-NEXT:    vptestnmd %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -1779,16 +1748,15 @@
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512BW-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT:    vpsrlw %xmm5, %xmm1, %xmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -1798,16 +1766,15 @@
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512VBMI2-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VBMI2-NEXT:    vpsrlw %xmm5, %xmm1, %xmm5
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VBMI2-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VBMI2-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512VBMI2-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VBMI2-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
@@ -1816,16 +1783,15 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT:    vpsrlw %xmm5, %xmm1, %xmm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX512VLBW-NEXT:    vptestnmw %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -2053,18 +2019,17 @@
 ; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512BW-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT:    vpand %xmm4, %xmm2, %xmm5
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT:    vpsrlvw %zmm6, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm3, %zmm3
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpsubb %xmm5, %xmm6, %xmm5
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT:    vpsllvw %zmm5, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpor %ymm3, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vptestnmb %zmm4, %zmm2, %k1
+; AVX512BW-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -2075,18 +2040,17 @@
 ; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512VBMI2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT:    vpand %xmm4, %xmm2, %xmm5
-; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT:    vpsrlvw %zmm6, %zmm3, %zmm3
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VBMI2-NEXT:    vpsrlvw %zmm4, %zmm3, %zmm3
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT:    vpsubb %xmm5, %xmm6, %xmm5
-; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT:    vpsllvw %zmm5, %zmm0, %zmm0
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VBMI2-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
 ; AVX512VBMI2-NEXT:    vpor %ymm3, %ymm0, %ymm0
 ; AVX512VBMI2-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512VBMI2-NEXT:    vptestnmb %zmm4, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
@@ -2095,19 +2059,18 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %xmm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLBW-NEXT:    vpsrlvw %ymm5, %ymm6, %ymm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLBW-NEXT:    vpsrlvw %ymm4, %ymm3, %ymm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512VLBW-NEXT:    vpsllvw %ymm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpor %ymm3, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
-; AVX512VLBW-NEXT:    vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT:    vptestnmb %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
 ; AVX512VLBW-NEXT:    vzeroupper
 ; AVX512VLBW-NEXT:    retq
@@ -2115,19 +2078,18 @@
 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLVBMI2-NEXT:    vpsrlvw %ymm5, %ymm6, %ymm5
-; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLVBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLVBMI2-NEXT:    vpsrlvw %ymm4, %ymm3, %ymm3
+; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512VLVBMI2-NEXT:    vpsllvw %ymm4, %ymm0, %ymm0
-; AVX512VLVBMI2-NEXT:    vpor %ymm5, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT:    vpor %ymm3, %ymm0, %ymm0
 ; AVX512VLVBMI2-NEXT:    vpmovwb %ymm0, %xmm0
-; AVX512VLVBMI2-NEXT:    vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLVBMI2-NEXT:    vptestnmb %xmm2, %xmm2, %k1
 ; AVX512VLVBMI2-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
 ; AVX512VLVBMI2-NEXT:    vzeroupper
 ; AVX512VLVBMI2-NEXT:    retq
diff --git a/test/CodeGen/X86/vector-fshr-256.ll b/test/CodeGen/X86/vector-fshr-256.ll
index 64283a6..1e55383 100644
--- a/test/CodeGen/X86/vector-fshr-256.ll
+++ b/test/CodeGen/X86/vector-fshr-256.ll
@@ -71,75 +71,70 @@
 ;
 ; AVX512F-LABEL: var_funnnel_v4i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT:    vpsrlvq %ymm4, %ymm1, %ymm5
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512F-NEXT:    vpsubq %ymm4, %ymm6, %ymm4
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlvq %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512F-NEXT:    vpsubq %ymm2, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpsllvq %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: var_funnnel_v4i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT:    vpsrlvq %ymm4, %ymm1, %ymm5
-; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512VL-NEXT:    vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlvq %ymm2, %ymm1, %ymm3
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512VL-NEXT:    vpsubq %ymm2, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpsllvq %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT:    vptestnmq %ymm3, %ymm2, %k1
+; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; AVX512VL-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1}
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: var_funnnel_v4i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT:    vpsrlvq %ymm4, %ymm1, %ymm5
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512BW-NEXT:    vpsubq %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpsrlvq %ymm2, %ymm1, %ymm3
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512BW-NEXT:    vpsubq %ymm2, %ymm4, %ymm4
 ; AVX512BW-NEXT:    vpsllvq %ymm4, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512BW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: var_funnnel_v4i64:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; AVX512VBMI2-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512VBMI2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT:    vpsrlvq %ymm4, %ymm1, %ymm5
-; AVX512VBMI2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512VBMI2-NEXT:    vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpsrlvq %ymm2, %ymm1, %ymm3
+; AVX512VBMI2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512VBMI2-NEXT:    vpsubq %ymm2, %ymm4, %ymm4
 ; AVX512VBMI2-NEXT:    vpsllvq %ymm4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512VBMI2-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v4i64:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT:    vpsrlvq %ymm4, %ymm1, %ymm5
-; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512VLBW-NEXT:    vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpsrlvq %ymm2, %ymm1, %ymm3
+; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512VLBW-NEXT:    vpsubq %ymm2, %ymm4, %ymm4
 ; AVX512VLBW-NEXT:    vpsllvq %ymm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512VLBW-NEXT:    vptestnmq %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -258,75 +253,70 @@
 ;
 ; AVX512F-LABEL: var_funnnel_v8i32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm5
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512F-NEXT:    vpsubd %ymm4, %ymm6, %ymm4
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlvd %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512F-NEXT:    vpsubd %ymm2, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: var_funnnel_v8i32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm5
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512VL-NEXT:    vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlvd %ymm2, %ymm1, %ymm3
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512VL-NEXT:    vpsubd %ymm2, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT:    vptestnmd %ymm3, %ymm2, %k1
+; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %ymm1, %ymm0 {%k1}
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: var_funnnel_v8i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm5
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT:    vpsubd %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpsrlvd %ymm2, %ymm1, %ymm3
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT:    vpsubd %ymm2, %ymm4, %ymm4
 ; AVX512BW-NEXT:    vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512BW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: var_funnnel_v8i32:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; AVX512VBMI2-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm5
-; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512VBMI2-NEXT:    vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpsrlvd %ymm2, %ymm1, %ymm3
+; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512VBMI2-NEXT:    vpsubd %ymm2, %ymm4, %ymm4
 ; AVX512VBMI2-NEXT:    vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512VBMI2-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v8i32:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT:    vpsrlvd %ymm4, %ymm1, %ymm5
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512VLBW-NEXT:    vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpsrlvd %ymm2, %ymm1, %ymm3
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512VLBW-NEXT:    vpsubd %ymm2, %ymm4, %ymm4
 ; AVX512VLBW-NEXT:    vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512VLBW-NEXT:    vptestnmd %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa32 %ymm1, %ymm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -516,48 +506,43 @@
 ;
 ; AVX512BW-LABEL: var_funnnel_v16i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT:    vpsubw %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %ymm2, %ymm4, %ymm4
 ; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: var_funnnel_v16i16:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; AVX512VBMI2-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm5
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT:    vpsubw %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpsrlvw %zmm2, %zmm1, %zmm3
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT:    vpsubw %ymm2, %ymm4, %ymm4
 ; AVX512VBMI2-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512VBMI2-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v16i16:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT:    vpsrlvw %ymm4, %ymm1, %ymm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT:    vpsubw %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpsrlvw %ymm2, %ymm1, %ymm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %ymm2, %ymm4, %ymm4
 ; AVX512VLBW-NEXT:    vpsllvw %ymm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512VLBW-NEXT:    vptestnmw %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -796,79 +781,73 @@
 ;
 ; AVX512BW-LABEL: var_funnnel_v32i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT:    vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm3, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: var_funnnel_v32i8:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; AVX512VBMI2-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VBMI2-NEXT:    vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VBMI2-NEXT:    vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512VBMI2-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512VBMI2-NEXT:    vporq %zmm3, %zmm0, %zmm0
 ; AVX512VBMI2-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512VBMI2-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v32i8:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLBW-NEXT:    vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLBW-NEXT:    vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512VLBW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm3, %zmm0, %zmm0
 ; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512VLBW-NEXT:    vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vptestnmb %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: var_funnnel_v32i8:
 ; AVX512VLVBMI2:       # %bb.0:
-; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLVBMI2-NEXT:    vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLVBMI2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLVBMI2-NEXT:    vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512VLVBMI2-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT:    vporq %zmm3, %zmm0, %zmm0
 ; AVX512VLVBMI2-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512VLVBMI2-NEXT:    vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLVBMI2-NEXT:    vptestnmb %ymm2, %ymm2, %k1
 ; AVX512VLVBMI2-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
 ; AVX512VLVBMI2-NEXT:    retq
 ;
@@ -974,13 +953,13 @@
 ; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512F-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT:    vpsrlq %xmm4, %ymm1, %ymm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlq %xmm2, %ymm1, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpsllq %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
@@ -988,14 +967,13 @@
 ; AVX512VL-LABEL: splatvar_funnnel_v4i64:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT:    vpsrlq %xmm4, %ymm1, %ymm5
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlq %xmm2, %ymm1, %ymm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpsllq %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT:    vptestnmq %ymm3, %ymm2, %k1
+; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; AVX512VL-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1}
 ; AVX512VL-NEXT:    retq
 ;
@@ -1004,13 +982,13 @@
 ; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512BW-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT:    vpsrlq %xmm4, %ymm1, %ymm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpsrlq %xmm2, %ymm1, %ymm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpsllq %xmm4, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512BW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512BW-NEXT:    retq
@@ -1020,13 +998,13 @@
 ; AVX512VBMI2-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512VBMI2-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512VBMI2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT:    vpsrlq %xmm4, %ymm1, %ymm5
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VBMI2-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpsrlq %xmm2, %ymm1, %ymm3
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VBMI2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpsllq %xmm4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512VBMI2-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    retq
@@ -1034,14 +1012,13 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT:    vpsrlq %xmm4, %ymm1, %ymm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpsrlq %xmm2, %ymm1, %ymm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpsllq %xmm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512VLBW-NEXT:    vptestnmq %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -1147,15 +1124,15 @@
 ; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512F-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT:    vpsrld %xmm5, %ymm1, %ymm5
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT:    vpsrld %xmm3, %ymm1, %ymm3
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512F-NEXT:    vpslld %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
@@ -1163,16 +1140,15 @@
 ; AVX512VL-LABEL: splatvar_funnnel_v8i32:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpbroadcastd %xmm2, %ymm2
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT:    vpsrld %xmm5, %ymm1, %ymm5
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT:    vpsrld %xmm3, %ymm1, %ymm3
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VL-NEXT:    vpslld %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT:    vptestnmd %ymm3, %ymm2, %k1
+; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %ymm1, %ymm0 {%k1}
 ; AVX512VL-NEXT:    retq
 ;
@@ -1181,15 +1157,15 @@
 ; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512BW-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT:    vpsrld %xmm5, %ymm1, %ymm5
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT:    vpsrld %xmm3, %ymm1, %ymm3
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512BW-NEXT:    vpslld %xmm4, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512BW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512BW-NEXT:    retq
@@ -1199,15 +1175,15 @@
 ; AVX512VBMI2-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512VBMI2-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VBMI2-NEXT:    vpsrld %xmm5, %ymm1, %ymm5
-; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VBMI2-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VBMI2-NEXT:    vpsrld %xmm3, %ymm1, %ymm3
+; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VBMI2-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VBMI2-NEXT:    vpslld %xmm4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512VBMI2-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    retq
@@ -1215,16 +1191,15 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i32:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %ymm2
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT:    vpsrld %xmm5, %ymm1, %ymm5
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT:    vpsrld %xmm3, %ymm1, %ymm3
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VLBW-NEXT:    vpslld %xmm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512VLBW-NEXT:    vptestnmd %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa32 %ymm1, %ymm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -1368,16 +1343,15 @@
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512BW-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT:    vpsrlw %xmm5, %ymm1, %ymm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsrlw %xmm3, %ymm1, %ymm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512BW-NEXT:    retq
@@ -1386,16 +1360,15 @@
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512VBMI2-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VBMI2-NEXT:    vpsrlw %xmm5, %ymm1, %ymm5
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VBMI2-NEXT:    vpsrlw %xmm3, %ymm1, %ymm3
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VBMI2-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512VBMI2-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    retq
@@ -1403,16 +1376,15 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT:    vpsrlw %xmm5, %ymm1, %ymm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsrlw %xmm3, %ymm1, %ymm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT:    vpor %ymm5, %ymm0, %ymm0
-; AVX512VLBW-NEXT:    vptestnmw %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -1590,19 +1562,18 @@
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512BW-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT:    vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512BW-NEXT:    vporq %zmm3, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512BW-NEXT:    retq
@@ -1611,19 +1582,18 @@
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; AVX512VBMI2-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VBMI2-NEXT:    vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VBMI2-NEXT:    vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512VBMI2-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512VBMI2-NEXT:    vporq %zmm3, %zmm0, %zmm0
 ; AVX512VBMI2-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512VBMI2-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    retq
@@ -1631,38 +1601,36 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLBW-NEXT:    vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLBW-NEXT:    vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512VLBW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vporq %zmm3, %zmm0, %zmm0
 ; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512VLBW-NEXT:    vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT:    vptestnmb %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLVBMI2-NEXT:    vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT:    vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLVBMI2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLVBMI2-NEXT:    vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512VLVBMI2-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT:    vporq %zmm3, %zmm0, %zmm0
 ; AVX512VLVBMI2-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512VLVBMI2-NEXT:    vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLVBMI2-NEXT:    vptestnmb %ymm2, %ymm2, %k1
 ; AVX512VLVBMI2-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
 ; AVX512VLVBMI2-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/vector-fshr-512.ll b/test/CodeGen/X86/vector-fshr-512.ll
index 063fd03..dd469fb 100644
--- a/test/CodeGen/X86/vector-fshr-512.ll
+++ b/test/CodeGen/X86/vector-fshr-512.ll
@@ -18,40 +18,37 @@
 define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
 ; AVX512F-LABEL: var_funnnel_v8i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512F-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT:    vpsrlvq %zmm4, %zmm1, %zmm5
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512F-NEXT:    vpsubq %zmm4, %zmm6, %zmm4
+; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT:    vpsrlvq %zmm2, %zmm1, %zmm3
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512F-NEXT:    vpsubq %zmm2, %zmm4, %zmm4
 ; AVX512F-NEXT:    vpsllvq %zmm4, %zmm0, %zmm0
-; AVX512F-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; AVX512F-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: var_funnnel_v8i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VL-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT:    vpsrlvq %zmm4, %zmm1, %zmm5
-; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512VL-NEXT:    vpsubq %zmm4, %zmm6, %zmm4
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpsrlvq %zmm2, %zmm1, %zmm3
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512VL-NEXT:    vpsubq %zmm2, %zmm4, %zmm4
 ; AVX512VL-NEXT:    vpsllvq %zmm4, %zmm0, %zmm0
-; AVX512VL-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; AVX512VL-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512VL-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: var_funnnel_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT:    vpsrlvq %zmm4, %zmm1, %zmm5
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512BW-NEXT:    vpsubq %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlvq %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512BW-NEXT:    vpsubq %zmm2, %zmm4, %zmm4
 ; AVX512BW-NEXT:    vpsllvq %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
 ;
@@ -63,14 +60,13 @@
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v8i64:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT:    vpsrlvq %zmm4, %zmm1, %zmm5
-; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512VLBW-NEXT:    vpsubq %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpsrlvq %zmm2, %zmm1, %zmm3
+; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512VLBW-NEXT:    vpsubq %zmm2, %zmm4, %zmm4
 ; AVX512VLBW-NEXT:    vpsllvq %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -86,40 +82,37 @@
 define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
 ; AVX512F-LABEL: var_funnnel_v16i32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512F-NEXT:    vpandd %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm5
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512F-NEXT:    vpsubd %zmm4, %zmm6, %zmm4
+; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512F-NEXT:    vpsrlvd %zmm2, %zmm1, %zmm3
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512F-NEXT:    vpsubd %zmm2, %zmm4, %zmm4
 ; AVX512F-NEXT:    vpsllvd %zmm4, %zmm0, %zmm0
-; AVX512F-NEXT:    vpord %zmm5, %zmm0, %zmm0
-; AVX512F-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: var_funnnel_v16i32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT:    vpandd %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm5
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512VL-NEXT:    vpsubd %zmm4, %zmm6, %zmm4
+; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpsrlvd %zmm2, %zmm1, %zmm3
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512VL-NEXT:    vpsubd %zmm2, %zmm4, %zmm4
 ; AVX512VL-NEXT:    vpsllvd %zmm4, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpord %zmm5, %zmm0, %zmm0
-; AVX512VL-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512VL-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: var_funnnel_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT:    vpandd %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm5
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT:    vpsubd %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlvd %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT:    vpsubd %zmm2, %zmm4, %zmm4
 ; AVX512BW-NEXT:    vpsllvd %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpord %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
 ;
@@ -131,14 +124,13 @@
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v16i32:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT:    vpandd %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm5
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512VLBW-NEXT:    vpsubd %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpsrlvd %zmm2, %zmm1, %zmm3
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512VLBW-NEXT:    vpsubd %zmm2, %zmm4, %zmm4
 ; AVX512VLBW-NEXT:    vpsllvd %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vpord %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -216,14 +208,13 @@
 ;
 ; AVX512BW-LABEL: var_funnnel_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT:    vpsubw %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %zmm2, %zmm4, %zmm4
 ; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
 ;
@@ -235,14 +226,13 @@
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v32i16:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm5
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT:    vpsubw %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpsrlvw %zmm2, %zmm1, %zmm3
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %zmm2, %zmm4, %zmm4
 ; AVX512VLBW-NEXT:    vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -394,28 +384,27 @@
 ;
 ; AVX512BW-LABEL: var_funnnel_v64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT:    vpsllw $5, %zmm4, %zmm5
-; AVX512BW-NEXT:    vpaddb %zmm5, %zmm5, %zmm6
-; AVX512BW-NEXT:    vpmovb2m %zmm6, %k1
-; AVX512BW-NEXT:    vpmovb2m %zmm5, %k2
-; AVX512BW-NEXT:    vpsrlw $4, %zmm1, %zmm5
+; AVX512BW-NEXT:    vpsrlw $4, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsllw $5, %zmm2, %zmm4
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512BW-NEXT:    vpsrlw $2, %zmm3, %zmm5
 ; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512BW-NEXT:    vpblendmb %zmm5, %zmm1, %zmm5 {%k2}
-; AVX512BW-NEXT:    vpsrlw $2, %zmm5, %zmm7
-; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512BW-NEXT:    vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512BW-NEXT:    vpsrlw $1, %zmm5, %zmm7
-; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512BW-NEXT:    vpaddb %zmm6, %zmm6, %zmm6
-; AVX512BW-NEXT:    vpmovb2m %zmm6, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpsubb %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512BW-NEXT:    vpsrlw $1, %zmm3, %zmm5
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512BW-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %zmm2, %zmm4, %zmm4
 ; AVX512BW-NEXT:    vpsllw $5, %zmm4, %zmm4
-; AVX512BW-NEXT:    vpaddb %zmm4, %zmm4, %zmm6
-; AVX512BW-NEXT:    vpmovb2m %zmm6, %k1
+; AVX512BW-NEXT:    vpaddb %zmm4, %zmm4, %zmm5
+; AVX512BW-NEXT:    vpmovb2m %zmm5, %k1
 ; AVX512BW-NEXT:    vpmovb2m %zmm4, %k2
 ; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm4
 ; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
@@ -423,38 +412,37 @@
 ; AVX512BW-NEXT:    vpsllw $2, %zmm0, %zmm4
 ; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
 ; AVX512BW-NEXT:    vmovdqu8 %zmm4, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpaddb %zmm6, %zmm6, %zmm4
+; AVX512BW-NEXT:    vpaddb %zmm5, %zmm5, %zmm4
 ; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512BW-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: var_funnnel_v64i8:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VBMI2-NEXT:    vpsllw $5, %zmm4, %zmm5
-; AVX512VBMI2-NEXT:    vpaddb %zmm5, %zmm5, %zmm6
-; AVX512VBMI2-NEXT:    vpmovb2m %zmm6, %k1
-; AVX512VBMI2-NEXT:    vpmovb2m %zmm5, %k2
-; AVX512VBMI2-NEXT:    vpsrlw $4, %zmm1, %zmm5
+; AVX512VBMI2-NEXT:    vpsrlw $4, %zmm1, %zmm3
+; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT:    vpsllw $5, %zmm2, %zmm4
+; AVX512VBMI2-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT:    vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512VBMI2-NEXT:    vpsrlw $2, %zmm3, %zmm5
 ; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VBMI2-NEXT:    vpblendmb %zmm5, %zmm1, %zmm5 {%k2}
-; AVX512VBMI2-NEXT:    vpsrlw $2, %zmm5, %zmm7
-; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512VBMI2-NEXT:    vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512VBMI2-NEXT:    vpsrlw $1, %zmm5, %zmm7
-; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512VBMI2-NEXT:    vpaddb %zmm6, %zmm6, %zmm6
-; AVX512VBMI2-NEXT:    vpmovb2m %zmm6, %k1
-; AVX512VBMI2-NEXT:    vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512VBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT:    vpsubb %zmm4, %zmm6, %zmm4
+; AVX512VBMI2-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VBMI2-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT:    vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VBMI2-NEXT:    vpsrlw $1, %zmm3, %zmm5
+; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VBMI2-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VBMI2-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT:    vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT:    vpsubb %zmm2, %zmm4, %zmm4
 ; AVX512VBMI2-NEXT:    vpsllw $5, %zmm4, %zmm4
-; AVX512VBMI2-NEXT:    vpaddb %zmm4, %zmm4, %zmm6
-; AVX512VBMI2-NEXT:    vpmovb2m %zmm6, %k1
+; AVX512VBMI2-NEXT:    vpaddb %zmm4, %zmm4, %zmm5
+; AVX512VBMI2-NEXT:    vpmovb2m %zmm5, %k1
 ; AVX512VBMI2-NEXT:    vpmovb2m %zmm4, %k2
 ; AVX512VBMI2-NEXT:    vpsllw $4, %zmm0, %zmm4
 ; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
@@ -462,38 +450,37 @@
 ; AVX512VBMI2-NEXT:    vpsllw $2, %zmm0, %zmm4
 ; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
 ; AVX512VBMI2-NEXT:    vmovdqu8 %zmm4, %zmm0 {%k1}
-; AVX512VBMI2-NEXT:    vpaddb %zmm6, %zmm6, %zmm4
+; AVX512VBMI2-NEXT:    vpaddb %zmm5, %zmm5, %zmm4
 ; AVX512VBMI2-NEXT:    vpmovb2m %zmm4, %k1
 ; AVX512VBMI2-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512VBMI2-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; AVX512VBMI2-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VBMI2-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v64i8:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT:    vpsllw $5, %zmm4, %zmm5
-; AVX512VLBW-NEXT:    vpaddb %zmm5, %zmm5, %zmm6
-; AVX512VLBW-NEXT:    vpmovb2m %zmm6, %k1
-; AVX512VLBW-NEXT:    vpmovb2m %zmm5, %k2
-; AVX512VLBW-NEXT:    vpsrlw $4, %zmm1, %zmm5
+; AVX512VLBW-NEXT:    vpsrlw $4, %zmm1, %zmm3
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpsllw $5, %zmm2, %zmm4
+; AVX512VLBW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT:    vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512VLBW-NEXT:    vpsrlw $2, %zmm3, %zmm5
 ; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLBW-NEXT:    vpblendmb %zmm5, %zmm1, %zmm5 {%k2}
-; AVX512VLBW-NEXT:    vpsrlw $2, %zmm5, %zmm7
-; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512VLBW-NEXT:    vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512VLBW-NEXT:    vpsrlw $1, %zmm5, %zmm7
-; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512VLBW-NEXT:    vpaddb %zmm6, %zmm6, %zmm6
-; AVX512VLBW-NEXT:    vpmovb2m %zmm6, %k1
-; AVX512VLBW-NEXT:    vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT:    vpsubb %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLBW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLBW-NEXT:    vpsrlw $1, %zmm3, %zmm5
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VLBW-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLBW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT:    vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %zmm2, %zmm4, %zmm4
 ; AVX512VLBW-NEXT:    vpsllw $5, %zmm4, %zmm4
-; AVX512VLBW-NEXT:    vpaddb %zmm4, %zmm4, %zmm6
-; AVX512VLBW-NEXT:    vpmovb2m %zmm6, %k1
+; AVX512VLBW-NEXT:    vpaddb %zmm4, %zmm4, %zmm5
+; AVX512VLBW-NEXT:    vpmovb2m %zmm5, %k1
 ; AVX512VLBW-NEXT:    vpmovb2m %zmm4, %k2
 ; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm4
 ; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
@@ -501,38 +488,37 @@
 ; AVX512VLBW-NEXT:    vpsllw $2, %zmm0, %zmm4
 ; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
 ; AVX512VLBW-NEXT:    vmovdqu8 %zmm4, %zmm0 {%k1}
-; AVX512VLBW-NEXT:    vpaddb %zmm6, %zmm6, %zmm4
+; AVX512VLBW-NEXT:    vpaddb %zmm5, %zmm5, %zmm4
 ; AVX512VLBW-NEXT:    vpmovb2m %zmm4, %k1
 ; AVX512VLBW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512VLBW-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
 ; AVX512VLVBMI2:       # %bb.0:
-; AVX512VLVBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLVBMI2-NEXT:    vpsllw $5, %zmm4, %zmm5
-; AVX512VLVBMI2-NEXT:    vpaddb %zmm5, %zmm5, %zmm6
-; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm6, %k1
-; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm5, %k2
-; AVX512VLVBMI2-NEXT:    vpsrlw $4, %zmm1, %zmm5
+; AVX512VLVBMI2-NEXT:    vpsrlw $4, %zmm1, %zmm3
+; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT:    vpsllw $5, %zmm2, %zmm4
+; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT:    vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT:    vpsrlw $2, %zmm3, %zmm5
 ; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT:    vpblendmb %zmm5, %zmm1, %zmm5 {%k2}
-; AVX512VLVBMI2-NEXT:    vpsrlw $2, %zmm5, %zmm7
-; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512VLVBMI2-NEXT:    vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512VLVBMI2-NEXT:    vpsrlw $1, %zmm5, %zmm7
-; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512VLVBMI2-NEXT:    vpaddb %zmm6, %zmm6, %zmm6
-; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm6, %k1
-; AVX512VLVBMI2-NEXT:    vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512VLVBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT:    vpsubb %zmm4, %zmm6, %zmm4
+; AVX512VLVBMI2-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT:    vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT:    vpsrlw $1, %zmm3, %zmm5
+; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VLVBMI2-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT:    vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT:    vpsubb %zmm2, %zmm4, %zmm4
 ; AVX512VLVBMI2-NEXT:    vpsllw $5, %zmm4, %zmm4
-; AVX512VLVBMI2-NEXT:    vpaddb %zmm4, %zmm4, %zmm6
-; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm6, %k1
+; AVX512VLVBMI2-NEXT:    vpaddb %zmm4, %zmm4, %zmm5
+; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm5, %k1
 ; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm4, %k2
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %zmm0, %zmm4
 ; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
@@ -540,11 +526,11 @@
 ; AVX512VLVBMI2-NEXT:    vpsllw $2, %zmm0, %zmm4
 ; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
 ; AVX512VLVBMI2-NEXT:    vmovdqu8 %zmm4, %zmm0 {%k1}
-; AVX512VLVBMI2-NEXT:    vpaddb %zmm6, %zmm6, %zmm4
+; AVX512VLVBMI2-NEXT:    vpaddb %zmm5, %zmm5, %zmm4
 ; AVX512VLVBMI2-NEXT:    vpmovb2m %zmm4, %k1
 ; AVX512VLVBMI2-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512VLVBMI2-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLVBMI2-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VLVBMI2-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512VLVBMI2-NEXT:    retq
   %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
@@ -559,42 +545,39 @@
 ; AVX512F-LABEL: splatvar_funnnel_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastq %xmm2, %zmm2
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512F-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT:    vpsrlq %xmm4, %zmm1, %zmm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT:    vpsrlq %xmm2, %zmm1, %zmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpsllq %xmm4, %zmm0, %zmm0
-; AVX512F-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; AVX512F-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v8i64:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpbroadcastq %xmm2, %zmm2
-; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VL-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT:    vpsrlq %xmm4, %zmm1, %zmm5
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpsrlq %xmm2, %zmm1, %zmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpsllq %xmm4, %zmm0, %zmm0
-; AVX512VL-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; AVX512VL-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512VL-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v8i64:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpbroadcastq %xmm2, %zmm2
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT:    vpsrlq %xmm4, %zmm1, %zmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlq %xmm2, %zmm1, %zmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpsllq %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
 ;
@@ -608,14 +591,13 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i64:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %zmm2
-; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT:    vpsrlq %xmm4, %zmm1, %zmm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT:    vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpsrlq %xmm2, %zmm1, %zmm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpsllq %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vptestnmq %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -634,48 +616,45 @@
 ; AVX512F-LABEL: splatvar_funnnel_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastd %xmm2, %zmm2
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512F-NEXT:    vpandd %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT:    vpsrld %xmm5, %zmm1, %zmm5
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT:    vpsrld %xmm3, %zmm1, %zmm3
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512F-NEXT:    vpslld %xmm4, %zmm0, %zmm0
-; AVX512F-NEXT:    vpord %zmm5, %zmm0, %zmm0
-; AVX512F-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v16i32:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpbroadcastd %xmm2, %zmm2
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT:    vpandd %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT:    vpsrld %xmm5, %zmm1, %zmm5
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT:    vpsrld %xmm3, %zmm1, %zmm3
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VL-NEXT:    vpslld %xmm4, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpord %zmm5, %zmm0, %zmm0
-; AVX512VL-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512VL-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v16i32:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpbroadcastd %xmm2, %zmm2
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT:    vpandd %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT:    vpsrld %xmm5, %zmm1, %zmm5
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT:    vpsrld %xmm3, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512BW-NEXT:    vpslld %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpord %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
 ;
@@ -689,16 +668,15 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i32:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %zmm2
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT:    vpandd %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT:    vpsrld %xmm5, %zmm1, %zmm5
-; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT:    vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT:    vpsrld %xmm3, %zmm1, %zmm3
+; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VLBW-NEXT:    vpslld %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vpord %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vptestnmd %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -757,16 +735,15 @@
 ; AVX512BW-LABEL: splatvar_funnnel_v32i16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpbroadcastw %xmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT:    vpsrlw %xmm5, %zmm1, %zmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT:    vpsrlw %xmm3, %zmm1, %zmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpsllw %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
 ;
@@ -780,16 +757,15 @@
 ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %zmm2
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT:    vpsrlw %xmm5, %zmm1, %zmm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT:    vpsrlw %xmm3, %zmm1, %zmm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vptestnmw %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -868,96 +844,92 @@
 ; AVX512BW-LABEL: splatvar_funnnel_v64i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpbroadcastb %xmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT:    vpsrlw %xmm5, %zmm1, %zmm6
-; AVX512BW-NEXT:    vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512BW-NEXT:    vpsrlw %xmm5, %zmm7, %zmm5
-; AVX512BW-NEXT:    vpsrlw $8, %zmm5, %zmm5
-; AVX512BW-NEXT:    vpbroadcastb %xmm5, %zmm5
-; AVX512BW-NEXT:    vpandq %zmm5, %zmm6, %zmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpsrlw %xmm3, %zmm1, %zmm4
+; AVX512BW-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512BW-NEXT:    vpsrlw %xmm3, %zmm5, %zmm3
+; AVX512BW-NEXT:    vpsrlw $8, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpbroadcastb %xmm3, %zmm3
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vpsllw %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpsllw %xmm4, %zmm7, %zmm4
+; AVX512BW-NEXT:    vpsllw %xmm4, %zmm5, %zmm4
 ; AVX512BW-NEXT:    vpbroadcastb %xmm4, %zmm4
 ; AVX512BW-NEXT:    vpandq %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpbroadcastb %xmm2, %zmm2
-; AVX512VBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT:    vpsrlw %xmm5, %zmm1, %zmm6
-; AVX512VBMI2-NEXT:    vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512VBMI2-NEXT:    vpsrlw %xmm5, %zmm7, %zmm5
-; AVX512VBMI2-NEXT:    vpsrlw $8, %zmm5, %zmm5
-; AVX512VBMI2-NEXT:    vpbroadcastb %xmm5, %zmm5
-; AVX512VBMI2-NEXT:    vpandq %zmm5, %zmm6, %zmm5
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT:    vpsrlw %xmm3, %zmm1, %zmm4
+; AVX512VBMI2-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VBMI2-NEXT:    vpsrlw %xmm3, %zmm5, %zmm3
+; AVX512VBMI2-NEXT:    vpsrlw $8, %zmm3, %zmm3
+; AVX512VBMI2-NEXT:    vpbroadcastb %xmm3, %zmm3
+; AVX512VBMI2-NEXT:    vpandq %zmm3, %zmm4, %zmm3
+; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VBMI2-NEXT:    vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT:    vpsllw %xmm4, %zmm7, %zmm4
+; AVX512VBMI2-NEXT:    vpsllw %xmm4, %zmm5, %zmm4
 ; AVX512VBMI2-NEXT:    vpbroadcastb %xmm4, %zmm4
 ; AVX512VBMI2-NEXT:    vpandq %zmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; AVX512VBMI2-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VBMI2-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %zmm2
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT:    vpsrlw %xmm5, %zmm1, %zmm6
-; AVX512VLBW-NEXT:    vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512VLBW-NEXT:    vpsrlw %xmm5, %zmm7, %zmm5
-; AVX512VLBW-NEXT:    vpsrlw $8, %zmm5, %zmm5
-; AVX512VLBW-NEXT:    vpbroadcastb %xmm5, %zmm5
-; AVX512VLBW-NEXT:    vpandq %zmm5, %zmm6, %zmm5
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT:    vpsrlw %xmm3, %zmm1, %zmm4
+; AVX512VLBW-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VLBW-NEXT:    vpsrlw %xmm3, %zmm5, %zmm3
+; AVX512VLBW-NEXT:    vpsrlw $8, %zmm3, %zmm3
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm3, %zmm3
+; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm4, %zmm3
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vpsllw %xmm4, %zmm7, %zmm4
+; AVX512VLBW-NEXT:    vpsllw %xmm4, %zmm5, %zmm4
 ; AVX512VLBW-NEXT:    vpbroadcastb %xmm4, %zmm4
 ; AVX512VLBW-NEXT:    vpandq %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm2, %zmm2
-; AVX512VLVBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT:    vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLVBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT:    vpsrlw %xmm5, %zmm1, %zmm6
-; AVX512VLVBMI2-NEXT:    vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512VLVBMI2-NEXT:    vpsrlw %xmm5, %zmm7, %zmm5
-; AVX512VLVBMI2-NEXT:    vpsrlw $8, %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm5, %zmm5
-; AVX512VLVBMI2-NEXT:    vpandq %zmm5, %zmm6, %zmm5
-; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT:    vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT:    vpsrlw %xmm3, %zmm1, %zmm4
+; AVX512VLVBMI2-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VLVBMI2-NEXT:    vpsrlw %xmm3, %zmm5, %zmm3
+; AVX512VLVBMI2-NEXT:    vpsrlw $8, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm3, %zmm3
+; AVX512VLVBMI2-NEXT:    vpandq %zmm3, %zmm4, %zmm3
+; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
 ; AVX512VLVBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLVBMI2-NEXT:    vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT:    vpsllw %xmm4, %zmm7, %zmm4
+; AVX512VLVBMI2-NEXT:    vpsllw %xmm4, %zmm5, %zmm4
 ; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm4, %zmm4
 ; AVX512VLVBMI2-NEXT:    vpandq %zmm4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT:    vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLVBMI2-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VLVBMI2-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512VLVBMI2-NEXT:    retq
   %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
diff --git a/test/CodeGen/X86/vector-lzcnt-512.ll b/test/CodeGen/X86/vector-lzcnt-512.ll
index f709713..d5b7b71 100644
--- a/test/CodeGen/X86/vector-lzcnt-512.ll
+++ b/test/CodeGen/X86/vector-lzcnt-512.ll
@@ -359,16 +359,15 @@
 ;
 ; AVX512BW-LABEL: testv32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vptestnmb %zmm4, %zmm3, %k0
-; AVX512BW-NEXT:    vpmovm2b %k0, %zmm5
-; AVX512BW-NEXT:    vpandq %zmm5, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpandq %zmm4, %zmm3, %zmm3
-; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpaddb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpshufb %zmm1, %zmm2, %zmm3
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm2, %zmm2
+; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm1, %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpaddb %zmm3, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vptestnmb %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
@@ -442,16 +441,15 @@
 ;
 ; AVX512BW-LABEL: testv32i16u:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vptestnmb %zmm4, %zmm3, %k0
-; AVX512BW-NEXT:    vpmovm2b %k0, %zmm5
-; AVX512BW-NEXT:    vpandq %zmm5, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpandq %zmm4, %zmm3, %zmm3
-; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpaddb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpshufb %zmm1, %zmm2, %zmm3
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm2, %zmm2
+; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm1, %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpaddb %zmm3, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vptestnmb %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
@@ -549,16 +547,15 @@
 ;
 ; AVX512BW-LABEL: testv64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm0, %k0
-; AVX512BW-NEXT:    vpmovm2b %k0, %zmm4
-; AVX512BW-NEXT:    vpandq %zmm4, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpshufb %zmm1, %zmm2, %zmm3
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm1, %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: testv64i8:
@@ -640,16 +637,15 @@
 ;
 ; AVX512BW-LABEL: testv64i8u:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm0, %k0
-; AVX512BW-NEXT:    vpmovm2b %k0, %zmm4
-; AVX512BW-NEXT:    vpandq %zmm4, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpshufb %zmm1, %zmm2, %zmm3
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm1, %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: testv64i8u: