[ScalarizeMaskedMemIntrin] Add support for scalarizing expandload and compressstore intrinsics.

This adds support for scalarizing these intrinsics, as well as the X86TargetTransformInfo support to avoid scalarizing them in the cases X86 can handle.

I've omitted handling the special cases for constant masks in this first pass, though CodeGenPrepare can constant-fold the branch conditions and remove some of the control flow anyway.

Fixes PR40994 and covers most of PR39666. We might want to implement the constant-mask handling to close that out.
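
For reference, the scalarized form is a per-element chain of conditional
blocks in which the pointer advances only past elements whose mask bit was
set. A sketch of the shape for a <2 x i64> expand load (value names are
illustrative; the new ScalarizeMaskedMemIntrin tests show the exact output):

  %m0 = extractelement <2 x i1> %mask, i64 0
  br i1 %m0, label %cond.load, label %else
cond.load:
  %e0 = load i64, i64* %p, align 1
  %v0 = insertelement <2 x i64> %passthru, i64 %e0, i64 0
  %p1 = getelementptr inbounds i64, i64* %p, i32 1
  br label %else
else:
  %res.phi.else = phi <2 x i64> [ %v0, %cond.load ], [ %passthru, %0 ]
  %ptr.phi.else = phi i64* [ %p1, %cond.load ], [ %p, %0 ]
  ; ...and likewise for element 1, with the final result phi returned.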

Differential Revision: https://reviews.llvm.org/D59180

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@356687 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index 26e86c2..75f0798 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -499,17 +499,21 @@
   /// modes that operate across loop iterations.
   bool shouldFavorBackedgeIndex(const Loop *L) const;
 
-  /// Return true if the target supports masked load/store
-  /// AVX2 and AVX-512 targets allow masks for consecutive load and store
+  /// Return true if the target supports masked store.
   bool isLegalMaskedStore(Type *DataType) const;
+  /// Return true if the target supports masked load.
   bool isLegalMaskedLoad(Type *DataType) const;
 
-  /// Return true if the target supports masked gather/scatter
-  /// AVX-512 fully supports gather and scatter for vectors with 32 and 64
-  /// bits scalar type.
+  /// Return true if the target supports masked scatter.
   bool isLegalMaskedScatter(Type *DataType) const;
+  /// Return true if the target supports masked gather.
   bool isLegalMaskedGather(Type *DataType) const;
 
+  /// Return true if the target supports masked compress store.
+  bool isLegalMaskedCompressStore(Type *DataType) const;
+  /// Return true if the target supports masked expand load.
+  bool isLegalMaskedExpandLoad(Type *DataType) const;
+
   /// Return true if the target has a unified operation to calculate division
   /// and remainder. If so, the additional implicit multiplication and
   /// subtraction required to calculate a remainder from division are free. This
@@ -1085,6 +1089,8 @@
   virtual bool isLegalMaskedLoad(Type *DataType) = 0;
   virtual bool isLegalMaskedScatter(Type *DataType) = 0;
   virtual bool isLegalMaskedGather(Type *DataType) = 0;
+  virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
+  virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0;
   virtual bool hasDivRemOp(Type *DataType, bool IsSigned) = 0;
   virtual bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) = 0;
   virtual bool prefersVectorizedAddressing() = 0;
@@ -1336,6 +1342,12 @@
   bool isLegalMaskedGather(Type *DataType) override {
     return Impl.isLegalMaskedGather(DataType);
   }
+  bool isLegalMaskedCompressStore(Type *DataType) override {
+    return Impl.isLegalMaskedCompressStore(DataType);
+  }
+  bool isLegalMaskedExpandLoad(Type *DataType) override {
+    return Impl.isLegalMaskedExpandLoad(DataType);
+  }
   bool hasDivRemOp(Type *DataType, bool IsSigned) override {
     return Impl.hasDivRemOp(DataType, IsSigned);
   }
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h
index 8e66726..6b209ed 100644
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -270,6 +270,10 @@
 
   bool isLegalMaskedGather(Type *DataType) { return false; }
 
+  bool isLegalMaskedCompressStore(Type *DataType) { return false; }
+
+  bool isLegalMaskedExpandLoad(Type *DataType) { return false; }
+
   bool hasDivRemOp(Type *DataType, bool IsSigned) { return false; }
 
   bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) { return false; }
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 7cd7951..404159d 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -185,6 +185,14 @@
   return TTIImpl->isLegalMaskedScatter(DataType);
 }
 
+bool TargetTransformInfo::isLegalMaskedCompressStore(Type *DataType) const {
+  return TTIImpl->isLegalMaskedCompressStore(DataType);
+}
+
+bool TargetTransformInfo::isLegalMaskedExpandLoad(Type *DataType) const {
+  return TTIImpl->isLegalMaskedExpandLoad(DataType);
+}
+
 bool TargetTransformInfo::hasDivRemOp(Type *DataType, bool IsSigned) const {
   return TTIImpl->hasDivRemOp(DataType, IsSigned);
 }
diff --git a/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
index 3a27035..e2ee9f2 100644
--- a/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
+++ b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
@@ -534,6 +534,154 @@
   ModifiedDT = true;
 }
 
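+// Translate a masked expandload intrinsic, like
+//   <16 x i32> @llvm.masked.expandload(i32* %ptr, <16 x i1> %mask,
+//                                      <16 x i32> %passthru)
+// into a chain of basic blocks that load from a consecutively advancing
+// pointer only for the elements whose mask bit is set.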
+static void scalarizeMaskedExpandLoad(CallInst *CI, bool &ModifiedDT) {
+  Value *Ptr = CI->getArgOperand(0);
+  Value *Mask = CI->getArgOperand(1);
+  Value *PassThru = CI->getArgOperand(2);
+
+  VectorType *VecType = cast<VectorType>(CI->getType());
+
+  Type *EltTy = VecType->getElementType();
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+
+  Builder.SetInsertPoint(InsertPt);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  unsigned VectorWidth = VecType->getNumElements();
+
+  // The result vector
+  Value *VResult = PassThru;
+
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    //  br i1 %mask_1, label %cond.load, label %else
+    //
+
+    Value *Predicate = Builder.CreateExtractElement(Mask, Idx);
+
+    // Create "cond" block
+    //
+    //  %Elt = load i32, i32* %Ptr
+    //  %VResult = insertelement <16 x i32> %VResult, i32 %Elt, i32 Idx
+    //  %NewPtr = getelementptr i32, i32* %Ptr, i32 1
+    //
+    BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(),
+                                                     "cond.load");
+    Builder.SetInsertPoint(InsertPt);
+
+    LoadInst *Load = Builder.CreateAlignedLoad(EltTy, Ptr, 1);
+    Value *NewVResult = Builder.CreateInsertElement(VResult, Load, Idx);
+
+    // Move the pointer if there are more blocks to come.
+    Value *NewPtr;
+    if ((Idx + 1) != VectorWidth)
+      NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, 1);
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock =
+        CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
+    OldBr->eraseFromParent();
+    BasicBlock *PrevIfBlock = IfBlock;
+    IfBlock = NewIfBlock;
+
+    // Create the phi to join the new and previous value.
+    PHINode *ResultPhi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+    ResultPhi->addIncoming(NewVResult, CondBlock);
+    ResultPhi->addIncoming(VResult, PrevIfBlock);
+    VResult = ResultPhi;
+
+    // Add a PHI for the pointer if this isn't the last iteration.
+    if ((Idx + 1) != VectorWidth) {
+      PHINode *PtrPhi = Builder.CreatePHI(Ptr->getType(), 2, "ptr.phi.else");
+      PtrPhi->addIncoming(NewPtr, CondBlock);
+      PtrPhi->addIncoming(Ptr, PrevIfBlock);
+      Ptr = PtrPhi;
+    }
+  }
+
+  CI->replaceAllUsesWith(VResult);
+  CI->eraseFromParent();
+
+  ModifiedDT = true;
+}
+
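+// Translate a masked compressstore intrinsic, like
+//   void @llvm.masked.compressstore(<16 x i32> %src, i32* %ptr,
+//                                   <16 x i1> %mask)
+// into a chain of basic blocks that store only the selected elements of
+// %src, advancing the pointer past each element that is stored.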
+static void scalarizeMaskedCompressStore(CallInst *CI, bool &ModifiedDT) {
+  Value *Src = CI->getArgOperand(0);
+  Value *Ptr = CI->getArgOperand(1);
+  Value *Mask = CI->getArgOperand(2);
+
+  VectorType *VecType = cast<VectorType>(Src->getType());
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+
+  Builder.SetInsertPoint(InsertPt);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  Type *EltTy = VecType->getVectorElementType();
+
+  unsigned VectorWidth = VecType->getNumElements();
+
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    //  br i1 %mask_1, label %cond.store, label %else
+    //
+    Value *Predicate = Builder.CreateExtractElement(Mask, Idx);
+
+    // Create "cond" block
+    //
+    //  %OneElt = extractelement <16 x i32> %Src, i32 Idx
+    //  store i32 %OneElt, i32* %Ptr
+    //  %NewPtr = getelementptr i32, i32* %Ptr, i32 1
+    //
+    BasicBlock *CondBlock =
+        IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *OneElt = Builder.CreateExtractElement(Src, Idx);
+    Builder.CreateAlignedStore(OneElt, Ptr, 1);
+
+    // Move the pointer if there are more blocks to come.
+    Value *NewPtr;
+    if ((Idx + 1) != VectorWidth)
+      NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, 1);
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock =
+        CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
+    OldBr->eraseFromParent();
+    BasicBlock *PrevIfBlock = IfBlock;
+    IfBlock = NewIfBlock;
+
+    // Add a PHI for the pointer if this isn't the last iteration.
+    if ((Idx + 1) != VectorWidth) {
+      PHINode *PtrPhi = Builder.CreatePHI(Ptr->getType(), 2, "ptr.phi.else");
+      PtrPhi->addIncoming(NewPtr, CondBlock);
+      PtrPhi->addIncoming(Ptr, PrevIfBlock);
+      Ptr = PtrPhi;
+    }
+  }
+  CI->eraseFromParent();
+
+  ModifiedDT = true;
+}
+
 bool ScalarizeMaskedMemIntrin::runOnFunction(Function &F) {
   bool EverMadeChange = false;
 
@@ -600,6 +748,16 @@
         return false;
       scalarizeMaskedScatter(CI, ModifiedDT);
       return true;
+    case Intrinsic::masked_expandload:
+      if (TTI->isLegalMaskedExpandLoad(CI->getType()))
+        return false;
+      scalarizeMaskedExpandLoad(CI, ModifiedDT);
+      return true;
+    case Intrinsic::masked_compressstore:
+      if (TTI->isLegalMaskedCompressStore(CI->getArgOperand(0)->getType()))
+        return false;
+      scalarizeMaskedCompressStore(CI, ModifiedDT);
+      return true;
     }
   }
 
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 4529c2c..16128ce 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3014,6 +3014,34 @@
   return isLegalMaskedLoad(DataType);
 }
 
+bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
+  if (!isa<VectorType>(DataTy))
+    return false;
+
+  if (!ST->hasAVX512())
+    return false;
+
+  // The backend can't handle a single element vector.
+  if (DataTy->getVectorNumElements() == 1)
+    return false;
+
+  Type *ScalarTy = DataTy->getVectorElementType();
+
+  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
+    return true;
+
+  if (!ScalarTy->isIntegerTy())
+    return false;
+
+  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
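+  // AVX512F provides compress/expand for 32- and 64-bit elements
+  // (VPCOMPRESSD/Q and VPEXPANDD/Q); AVX512VBMI2 adds the byte and word
+  // forms.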
+  return IntWidth == 32 || IntWidth == 64 ||
+         ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
+}
+
+bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
+  return isLegalMaskedExpandLoad(DataTy);
+}
+
 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
   // Some CPUs have better gather performance than others.
   // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index 8bfcd07..5dce3a7 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -184,6 +184,8 @@
   bool isLegalMaskedStore(Type *DataType);
   bool isLegalMaskedGather(Type *DataType);
   bool isLegalMaskedScatter(Type *DataType);
+  bool isLegalMaskedExpandLoad(Type *DataType);
+  bool isLegalMaskedCompressStore(Type *DataType);
   bool hasDivRemOp(Type *DataType, bool IsSigned);
   bool isFCmpOrdCheaperThanFCmpZero(Type *Ty);
   bool areInlineCompatible(const Function *Caller,
diff --git a/test/CodeGen/X86/pr39666.ll b/test/CodeGen/X86/pr39666.ll
new file mode 100644
index 0000000..949e8ab
--- /dev/null
+++ b/test/CodeGen/X86/pr39666.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s
+
+define <2 x i64> @test5(i64* %base, <2 x i64> %src0) {
+; CHECK-LABEL: test5:
+; CHECK:       # %bb.0: # %else
+; CHECK-NEXT:    vpinsrq $1, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <2 x i64> @llvm.masked.expandload.v2i64(i64* %base, <2 x i1> <i1 false, i1 true>, <2 x i64> %src0)
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)
+
+define void @test11(i64* %base, <2 x i64> %V, <2 x i1> %mask) {
+; CHECK-LABEL: test11:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpextrb $0, %xmm1, %eax
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    je .LBB1_2
+; CHECK-NEXT:  # %bb.1: # %cond.store
+; CHECK-NEXT:    vmovq %xmm0, (%rdi)
+; CHECK-NEXT:    addq $8, %rdi
+; CHECK-NEXT:  .LBB1_2: # %else
+; CHECK-NEXT:    vpextrb $8, %xmm1, %eax
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    je .LBB1_4
+; CHECK-NEXT:  # %bb.3: # %cond.store1
+; CHECK-NEXT:    vpextrq $1, %xmm0, (%rdi)
+; CHECK-NEXT:  .LBB1_4: # %else2
+; CHECK-NEXT:    retq
+  call void @llvm.masked.compressstore.v2i64(<2 x i64> %V, i64* %base, <2 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.compressstore.v2i64(<2 x i64>, i64*, <2 x i1>)
diff --git a/test/CodeGen/X86/pr40994.ll b/test/CodeGen/X86/pr40994.ll
new file mode 100644
index 0000000..66d5594
--- /dev/null
+++ b/test/CodeGen/X86/pr40994.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.2 | FileCheck %s
+
+define <8 x i8> @foo(<16 x i8> %a) {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0: # %cond.store
+; CHECK-NEXT:    pextrb $0, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    pextrb $2, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    pextrb $4, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    pextrb $6, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    pextrb $8, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    pextrb $10, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    pextrb $12, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    pextrb $14, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; CHECK-NEXT:    retq
+  %v = alloca i8, i32 8, align 16
+  call void @llvm.masked.compressstore.v16i8(<16 x i8> %a, i8* %v, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>)
+  %ptr = bitcast i8* %v to <8 x i8>*
+  %out = load <8 x i8>, <8 x i8>* %ptr
+  ret <8 x i8> %out
+}
+declare void @llvm.masked.compressstore.v16i8(<16 x i8>, i8*, <16 x i1>)
diff --git a/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-compressstore.ll b/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-compressstore.ll
new file mode 100644
index 0000000..cc53a2c
--- /dev/null
+++ b/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-compressstore.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S %s -scalarize-masked-mem-intrin -mtriple=x86_64-linux-gnu | FileCheck %s
+
+define void @scalarize_v2i64(i64* %p, <2 x i1> %mask, <2 x i64> %data) {
+; CHECK-LABEL: @scalarize_v2i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i1> [[MASK:%.*]], i64 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[COND_STORE:%.*]], label [[ELSE:%.*]]
+; CHECK:       cond.store:
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
+; CHECK-NEXT:    store i64 [[TMP2]], i64* [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT:    br label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP3]], [[COND_STORE]] ], [ [[P]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[MASK]], i64 1
+; CHECK-NEXT:    br i1 [[TMP4]], label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
+; CHECK:       cond.store1:
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
+; CHECK-NEXT:    store i64 [[TMP5]], i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT:    br label [[ELSE2]]
+; CHECK:       else2:
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.compressstore.v2i64.p0v2i64(<2 x i64> %data, i64* %p, <2 x i1> %mask)
+  ret void
+}
+
+define void @scalarize_v2i64_ones_mask(i64* %p, <2 x i64> %data) {
+; CHECK-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-NEXT:    br i1 true, label [[COND_STORE:%.*]], label [[ELSE:%.*]]
+; CHECK:       cond.store:
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
+; CHECK-NEXT:    store i64 [[TMP1]], i64* [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT:    br label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP2]], [[COND_STORE]] ], [ [[P]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    br i1 true, label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
+; CHECK:       cond.store1:
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
+; CHECK-NEXT:    store i64 [[TMP3]], i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT:    br label [[ELSE2]]
+; CHECK:       else2:
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.compressstore.v2i64.p0v2i64(<2 x i64> %data, i64* %p, <2 x i1> <i1 true, i1 true>)
+  ret void
+}
+
+define void @scalarize_v2i64_zero_mask(i64* %p, <2 x i64> %data) {
+; CHECK-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-NEXT:    br i1 false, label [[COND_STORE:%.*]], label [[ELSE:%.*]]
+; CHECK:       cond.store:
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
+; CHECK-NEXT:    store i64 [[TMP1]], i64* [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT:    br label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP2]], [[COND_STORE]] ], [ [[P]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    br i1 false, label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
+; CHECK:       cond.store1:
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
+; CHECK-NEXT:    store i64 [[TMP3]], i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT:    br label [[ELSE2]]
+; CHECK:       else2:
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.compressstore.v2i64.p0v2i64(<2 x i64> %data, i64* %p, <2 x i1> <i1 false, i1 false>)
+  ret void
+}
+
+define void @scalarize_v2i64_const_mask(i64* %p, <2 x i64> %data) {
+; CHECK-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-NEXT:    br i1 false, label [[COND_STORE:%.*]], label [[ELSE:%.*]]
+; CHECK:       cond.store:
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
+; CHECK-NEXT:    store i64 [[TMP1]], i64* [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT:    br label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP2]], [[COND_STORE]] ], [ [[P]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    br i1 true, label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
+; CHECK:       cond.store1:
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
+; CHECK-NEXT:    store i64 [[TMP3]], i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT:    br label [[ELSE2]]
+; CHECK:       else2:
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.compressstore.v2i64.p0v2i64(<2 x i64> %data, i64* %p, <2 x i1> <i1 false, i1 true>)
+  ret void
+}
+
+declare void @llvm.masked.compressstore.v2i64.p0v2i64(<2 x i64>, i64*, <2 x i1>)
diff --git a/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-expandload.ll b/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-expandload.ll
new file mode 100644
index 0000000..3b8c7aa
--- /dev/null
+++ b/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-expandload.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S %s -scalarize-masked-mem-intrin -mtriple=x86_64-linux-gnu | FileCheck %s
+
+define <2 x i64> @scalarize_v2i64(i64* %p, <2 x i1> %mask, <2 x i64> %passthru) {
+; CHECK-LABEL: @scalarize_v2i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i1> [[MASK:%.*]], i64 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK:       cond.load:
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT:    br label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP3]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP4]], [[COND_LOAD]] ], [ [[P]], [[TMP0]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[MASK]], i64 1
+; CHECK-NEXT:    br i1 [[TMP5]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
+; CHECK:       cond.load1:
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP6]], i64 1
+; CHECK-NEXT:    br label [[ELSE2]]
+; CHECK:       else2:
+; CHECK-NEXT:    [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP7]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-NEXT:    ret <2 x i64> [[RES_PHI_ELSE3]]
+;
+  %ret = call <2 x i64> @llvm.masked.expandload.v2i64.p0v2i64(i64* %p, <2 x i1> %mask, <2 x i64> %passthru)
+  ret <2 x i64> %ret
+}
+
+define <2 x i64> @scalarize_v2i64_ones_mask(i64* %p, <2 x i64> %passthru) {
+; CHECK-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-NEXT:    br i1 true, label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK:       cond.load:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT:    br label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP2]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP3]], [[COND_LOAD]] ], [ [[P]], [[TMP0]] ]
+; CHECK-NEXT:    br i1 true, label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
+; CHECK:       cond.load1:
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP4]], i64 1
+; CHECK-NEXT:    br label [[ELSE2]]
+; CHECK:       else2:
+; CHECK-NEXT:    [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-NEXT:    ret <2 x i64> [[RES_PHI_ELSE3]]
+;
+  %ret = call <2 x i64> @llvm.masked.expandload.v2i64.p0v2i64(i64* %p, <2 x i1> <i1 true, i1 true>, <2 x i64> %passthru)
+  ret <2 x i64> %ret
+}
+
+define <2 x i64> @scalarize_v2i64_zero_mask(i64* %p, <2 x i64> %passthru) {
+; CHECK-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-NEXT:    br i1 false, label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK:       cond.load:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT:    br label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP2]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP3]], [[COND_LOAD]] ], [ [[P]], [[TMP0]] ]
+; CHECK-NEXT:    br i1 false, label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
+; CHECK:       cond.load1:
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP4]], i64 1
+; CHECK-NEXT:    br label [[ELSE2]]
+; CHECK:       else2:
+; CHECK-NEXT:    [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-NEXT:    ret <2 x i64> [[RES_PHI_ELSE3]]
+;
+  %ret = call <2 x i64> @llvm.masked.expandload.v2i64.p0v2i64(i64* %p, <2 x i1> <i1 false, i1 false>, <2 x i64> %passthru)
+  ret <2 x i64> %ret
+}
+
+define <2 x i64> @scalarize_v2i64_const_mask(i64* %p, <2 x i64> %passthru) {
+; CHECK-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-NEXT:    br i1 false, label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK:       cond.load:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT:    br label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP2]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP3]], [[COND_LOAD]] ], [ [[P]], [[TMP0]] ]
+; CHECK-NEXT:    br i1 true, label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
+; CHECK:       cond.load1:
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP4]], i64 1
+; CHECK-NEXT:    br label [[ELSE2]]
+; CHECK:       else2:
+; CHECK-NEXT:    [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-NEXT:    ret <2 x i64> [[RES_PHI_ELSE3]]
+;
+  %ret = call <2 x i64> @llvm.masked.expandload.v2i64.p0v2i64(i64* %p, <2 x i1> <i1 false, i1 true>, <2 x i64> %passthru)
+  ret <2 x i64> %ret
+}
+
+declare <2 x i64> @llvm.masked.expandload.v2i64.p0v2i64(i64*, <2 x i1>, <2 x i64>)