[IA][RISCV] Support gap mask for loads that are de-interleaved through intrinsics (#197062)

In the context of (de)interleaved loads and stores, a gap mask is a mask
that effectively skips an entire component / field. Starting from
#151612, the InterleavedAccessPass gained support for recognizing masks
of this kind and passing them to the TLI hook. RISC-V originally only
supported gap masks on fixed vectors; this patch adds support for
recognizing gap masks on loads that are de-interleaved through the
`llvm.vector.deinterleaveN` intrinsics, with both scalable vectors and
fixed vectors.
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 3187631..82c47cc 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3342,8 +3342,11 @@
   /// \p Load is the accompanying load instruction.  Can be either a plain load
   /// instruction or a vp.load intrinsic.
   /// \p DI represents the deinterleaveN intrinsic.
+  /// \p GapMask is a mask with zeros for components / fields that may not be
+  /// accessed.
   virtual bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
-                                                IntrinsicInst *DI) const {
+                                                IntrinsicInst *DI,
+                                                const APInt &GapMask) const {
     return false;
   }
 
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index bdfbeea..5498ce1 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -698,6 +698,7 @@
   assert(Factor && "unexpected deinterleave intrinsic");
 
   Value *Mask = nullptr;
+  auto GapMask = APInt::getAllOnes(Factor);
   if (LI) {
     if (!LI->isSimple())
       return false;
@@ -711,24 +712,20 @@
       return false;
 
     // Check mask operand. Handle both all-true/false and interleaved mask.
-    APInt GapMask(Factor, 0);
     std::tie(Mask, GapMask) =
         getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI));
     if (!Mask)
       return false;
-    // We haven't supported gap mask if it's deinterleaving using intrinsics.
-    // Yet it is possible that we already changed the IR, hence returning true
-    // here.
-    if (GapMask.popcount() != Factor)
-      return true;
 
     LLVM_DEBUG(dbgs() << "IA: Found a vp.load or masked.load with deinterleave"
                       << " intrinsic " << *DI << " and factor = "
                       << Factor << "\n");
+    LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor
+                      << " and actual factor " << GapMask.popcount() << "\n");
   }
 
   // Try and match this with target specific intrinsics.
-  if (!TLI->lowerDeinterleaveIntrinsicToLoad(LoadedVal, Mask, DI))
+  if (!TLI->lowerDeinterleaveIntrinsicToLoad(LoadedVal, Mask, DI, GapMask))
     return false;
 
   DeadInsts.insert(DI);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 60ce666..0034c03 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18906,8 +18906,10 @@
 }
 
 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
-    Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
+    Instruction *Load, Value *Mask, IntrinsicInst *DI,
+    const APInt &GapMask) const {
   const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
+  assert(GapMask.getBitWidth() == Factor);
   if (Factor != 2 && Factor != 3 && Factor != 4) {
     LLVM_DEBUG(dbgs() << "Matching ld2, ld3 and ld4 patterns failed\n");
     return false;
@@ -18917,6 +18919,10 @@
     return false;
   assert(!Mask && "Unexpected mask on a load\n");
 
+  // Gap mask is currently not supported.
+  if (!GapMask.isAllOnes())
+    return false;
+
   VectorType *VTy = getDeinterleavedVectorType(DI);
 
   const DataLayout &DL = LI->getModule()->getDataLayout();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 2e66b5a..25cab4d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -232,7 +232,8 @@
                              const APInt &GapMask) const override;
 
   bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
-                                        IntrinsicInst *DI) const override;
+                                        IntrinsicInst *DI,
+                                        const APInt &GapMask) const override;
 
   bool lowerInterleaveIntrinsicToStore(
       Instruction *Store, Value *Mask,
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 7ef15bc..558df27 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -438,7 +438,8 @@
                              const APInt &GapMask) const override;
 
   bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
-                                        IntrinsicInst *DI) const override;
+                                        IntrinsicInst *DI,
+                                        const APInt &GapMask) const override;
 
   bool lowerInterleaveIntrinsicToStore(
       Instruction *Store, Value *Mask,
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 6e2626d..7026541 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -69,6 +69,12 @@
     Intrinsic::riscv_sseg6_load_mask, Intrinsic::riscv_sseg7_load_mask,
     Intrinsic::riscv_sseg8_load_mask};
 
+static const Intrinsic::ID ScalableVlssegIntrIds[] = {
+    Intrinsic::riscv_vlsseg2_mask, Intrinsic::riscv_vlsseg3_mask,
+    Intrinsic::riscv_vlsseg4_mask, Intrinsic::riscv_vlsseg5_mask,
+    Intrinsic::riscv_vlsseg6_mask, Intrinsic::riscv_vlsseg7_mask,
+    Intrinsic::riscv_vlsseg8_mask};
+
 static const Intrinsic::ID ScalableVlsegIntrIds[] = {
     Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask,
     Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask,
@@ -349,15 +355,25 @@
 }
 
 bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
-    Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
+    Instruction *Load, Value *Mask, IntrinsicInst *DI,
+    const APInt &GapMask) const {
   const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
+  assert(GapMask.getBitWidth() == Factor);
   if (Factor > 8)
     return false;
 
+  // We only support cases where the skipped fields are the trailing ones.
+  if (!GapMask.isMask())
+    return false;
   IRBuilder<> Builder(Load);
 
   VectorType *ResVTy = getDeinterleavedVectorType(DI);
 
+  unsigned MaskFactor = GapMask.getActiveBits();
+  // For MaskFactor of 1, we still want to lower it with segmented load
+  // (of the original Factor), because the sole field extraction will eventually
+  // turn it into a strided load.
+  bool UseStridedSeg = MaskFactor < Factor && MaskFactor > 1;
   const DataLayout &DL = Load->getDataLayout();
   auto *XLenTy = Builder.getIntNTy(Subtarget.getXLen());
 
@@ -371,22 +387,53 @@
   if (!isLegalInterleavedAccessType(ResVTy, Factor, Alignment, AS, DL))
     return false;
 
+  unsigned ElementSizeInBytes = DL.getTypeStoreSize(ResVTy->getElementType());
   Value *Return;
   if (isa<FixedVectorType>(ResVTy)) {
-    Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
-                                     {ResVTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
+    Value *SegLoad;
+    if (UseStridedSeg) {
+      // Lower to strided segmented load.
+      Value *Stride = ConstantInt::get(XLenTy, Factor * ElementSizeInBytes);
+      SegLoad = Builder.CreateIntrinsic(FixedVlssegIntrIds[MaskFactor - 2],
+                                        {ResVTy, PtrTy, XLenTy, XLenTy},
+                                        {Ptr, Stride, Mask, VL});
+    } else {
+      SegLoad =
+          Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
+                                  {ResVTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
+    }
+
+    if (MaskFactor != Factor) {
+      // Replace masked-off factors with poisons.
+      SmallVector<Type *, 8> AggrTypes{Factor, ResVTy};
+      Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes));
+      for (unsigned I = 0; I < MaskFactor; ++I) {
+        Value *SubVec = Builder.CreateExtractValue(SegLoad, I);
+        Return = Builder.CreateInsertValue(Return, SubVec, I);
+      }
+    } else {
+      Return = SegLoad;
+    }
   } else {
     unsigned SEW = DL.getTypeSizeInBits(ResVTy->getElementType());
     unsigned NumElts = ResVTy->getElementCount().getKnownMinValue();
     Type *VecTupTy = TargetExtType::get(
         Load->getContext(), "riscv.vector.tuple",
         ScalableVectorType::get(Builder.getInt8Ty(), NumElts * SEW / 8),
-        Factor);
-    Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration(
-        Load->getModule(), ScalableVlsegIntrIds[Factor - 2],
-        {VecTupTy, PtrTy, Mask->getType(), VL->getType()});
+        UseStridedSeg ? MaskFactor : Factor);
+    Function *SegLoadFunc;
+    if (UseStridedSeg) {
+      // Lower to strided segmented load.
+      SegLoadFunc = Intrinsic::getOrInsertDeclaration(
+          Load->getModule(), ScalableVlssegIntrIds[MaskFactor - 2],
+          {VecTupTy, PtrTy, XLenTy, Mask->getType()});
+    } else {
+      SegLoadFunc = Intrinsic::getOrInsertDeclaration(
+          Load->getModule(), ScalableVlsegIntrIds[Factor - 2],
+          {VecTupTy, PtrTy, Mask->getType(), VL->getType()});
+    }
 
-    Value *Operands[] = {
+    SmallVector<Value *, 8> Operands = {
         PoisonValue::get(VecTupTy),
         Ptr,
         Mask,
@@ -394,12 +441,16 @@
         ConstantInt::get(XLenTy,
                          RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC),
         ConstantInt::get(XLenTy, Log2_64(SEW))};
+    if (UseStridedSeg) {
+      Value *Stride = ConstantInt::get(XLenTy, Factor * ElementSizeInBytes);
+      Operands.insert(std::next(Operands.begin(), 2), Stride);
+    }
 
-    CallInst *Vlseg = Builder.CreateCall(VlsegNFunc, Operands);
+    CallInst *Vlseg = Builder.CreateCall(SegLoadFunc, Operands);
 
-    SmallVector<Type *, 2> AggrTypes{Factor, ResVTy};
+    SmallVector<Type *, 8> AggrTypes{Factor, ResVTy};
     Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes));
-    for (unsigned i = 0; i < Factor; ++i) {
+    for (unsigned i = 0; i < MaskFactor; ++i) {
       Value *VecExtract = Builder.CreateIntrinsic(
           Intrinsic::riscv_tuple_extract, {ResVTy, VecTupTy},
           {Vlseg, Builder.getInt32(i)});
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 82419c2..c2981c6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -431,6 +431,45 @@
   ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3
 }
 
+; mask = 1010, skip the last two fields
+define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @gap_mask_vpload_factor4_intrinsics(ptr %ptr) {
+; CHECK-LABEL: gap_mask_vpload_factor4_intrinsics:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 5
+; CHECK-NEXT:    li a1, 16
+; CHECK-NEXT:    vlsseg2e32.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+  %wide.masked.load = call <16 x i32> @llvm.vp.load(ptr %ptr, <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, i32 16)
+  %d = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.vector.deinterleave4(<16 x i32> %wide.masked.load)
+  %t0 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %d, 0
+  %t1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %d, 1
+
+  %res0 = insertvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } poison, <4 x i32> %t0, 0
+  %res1 = insertvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %res0, <4 x i32> %t1, 1
+  ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %res1
+}
+
+; mask = 1010, skip the last three fields. We should not apply the gap-mask optimization here but
+; we can extract only the first field so that it can be turned into a strided load.
+define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @gap_mask_single_field_vpload_factor4_intrinsics(ptr %ptr) {
+; CHECK-LABEL: gap_mask_single_field_vpload_factor4_intrinsics:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 5
+; CHECK-NEXT:    li a1, 16
+; CHECK-NEXT:    vlse32.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    ret
+  %wide.masked.load = call <16 x i32> @llvm.vp.load(ptr %ptr, <16 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, i32 16)
+  %d = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.vector.deinterleave4(<16 x i32> %wide.masked.load)
+  %t0 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %d, 0
+  %t1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %d, 1
+
+  %res0 = insertvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } poison, <4 x i32> %t0, 0
+  %res1 = insertvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %res0, <4 x i32> %t1, 1
+  ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %res1
+}
+
 define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor5(ptr %ptr) {
 ; CHECK-LABEL: vpload_factor5:
 ; CHECK:       # %bb.0:
@@ -615,8 +654,8 @@
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v28, (a1) # vscale x 32-byte Folded Spill
-; RV32-NEXT:    lui a1, %hi(.LCPI27_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI27_0)
+; RV32-NEXT:    lui a1, %hi(.LCPI29_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI29_0)
 ; RV32-NEXT:    lui a6, 49164
 ; RV32-NEXT:    lui t1, 3
 ; RV32-NEXT:    lui t0, 196656
@@ -777,8 +816,8 @@
 ; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vrgatherei16.vv v24, v8, v2
-; RV32-NEXT:    lui a1, %hi(.LCPI27_1)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI27_1)
+; RV32-NEXT:    lui a1, %hi(.LCPI29_1)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI29_1)
 ; RV32-NEXT:    lui a3, 3073
 ; RV32-NEXT:    addi a3, a3, -1024
 ; RV32-NEXT:    vmv.s.x v0, a3
@@ -803,8 +842,8 @@
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    lui a1, %hi(.LCPI27_3)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI27_3)
+; RV32-NEXT:    lui a1, %hi(.LCPI29_3)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI29_3)
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle16.v v28, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
@@ -814,8 +853,8 @@
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vrgatherei16.vv v16, v8, v30
-; RV32-NEXT:    lui a1, %hi(.LCPI27_2)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI27_2)
+; RV32-NEXT:    lui a1, %hi(.LCPI29_2)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI29_2)
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV32-NEXT:    vle16.v v20, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
@@ -863,16 +902,16 @@
 ; RV32-NEXT:    vrgatherei16.vv v24, v12, v20
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v24, v8
-; RV32-NEXT:    lui a1, %hi(.LCPI27_4)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI27_4)
-; RV32-NEXT:    lui a2, %hi(.LCPI27_5)
-; RV32-NEXT:    addi a2, a2, %lo(.LCPI27_5)
+; RV32-NEXT:    lui a1, %hi(.LCPI29_4)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI29_4)
+; RV32-NEXT:    lui a2, %hi(.LCPI29_5)
+; RV32-NEXT:    addi a2, a2, %lo(.LCPI29_5)
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV32-NEXT:    vle16.v v28, (a2)
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV32-NEXT:    vle16.v v1, (a1)
-; RV32-NEXT:    lui a1, %hi(.LCPI27_7)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI27_7)
+; RV32-NEXT:    lui a1, %hi(.LCPI29_7)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI29_7)
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle16.v v2, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
@@ -895,14 +934,14 @@
 ; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vrgatherei16.vv v16, v8, v2
-; RV32-NEXT:    lui a1, %hi(.LCPI27_6)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI27_6)
-; RV32-NEXT:    lui a2, %hi(.LCPI27_8)
-; RV32-NEXT:    addi a2, a2, %lo(.LCPI27_8)
+; RV32-NEXT:    lui a1, %hi(.LCPI29_6)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI29_6)
+; RV32-NEXT:    lui a2, %hi(.LCPI29_8)
+; RV32-NEXT:    addi a2, a2, %lo(.LCPI29_8)
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV32-NEXT:    vle16.v v8, (a1)
-; RV32-NEXT:    lui a1, %hi(.LCPI27_9)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI27_9)
+; RV32-NEXT:    lui a1, %hi(.LCPI29_9)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI29_9)
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV32-NEXT:    vle16.v v10, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
@@ -1210,8 +1249,8 @@
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vs4r.v v12, (a2) # vscale x 32-byte Folded Spill
-; RV64-NEXT:    lui a2, %hi(.LCPI27_0)
-; RV64-NEXT:    addi a2, a2, %lo(.LCPI27_0)
+; RV64-NEXT:    lui a2, %hi(.LCPI29_0)
+; RV64-NEXT:    addi a2, a2, %lo(.LCPI29_0)
 ; RV64-NEXT:    li a3, 1040
 ; RV64-NEXT:    vmv.s.x v0, a3
 ; RV64-NEXT:    addi a1, a1, -2016
@@ -1252,8 +1291,8 @@
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    lui a1, %hi(.LCPI27_1)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI27_1)
+; RV64-NEXT:    lui a1, %hi(.LCPI29_1)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI29_1)
 ; RV64-NEXT:    vle16.v v24, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 37
@@ -1268,8 +1307,8 @@
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v0, (a1) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    lui a1, %hi(.LCPI27_2)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI27_2)
+; RV64-NEXT:    lui a1, %hi(.LCPI29_2)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI29_2)
 ; RV64-NEXT:    vle16.v v12, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 21
@@ -1335,12 +1374,12 @@
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
-; RV64-NEXT:    lui a1, %hi(.LCPI27_3)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI27_3)
+; RV64-NEXT:    lui a1, %hi(.LCPI29_3)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI29_3)
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV64-NEXT:    vle16.v v8, (a1)
-; RV64-NEXT:    lui a1, %hi(.LCPI27_4)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI27_4)
+; RV64-NEXT:    lui a1, %hi(.LCPI29_4)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI29_4)
 ; RV64-NEXT:    vle16.v v10, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 77
@@ -1388,8 +1427,8 @@
 ; RV64-NEXT:    vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vrgatherei16.vv v0, v8, v16
-; RV64-NEXT:    lui a1, %hi(.LCPI27_5)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI27_5)
+; RV64-NEXT:    lui a1, %hi(.LCPI29_5)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI29_5)
 ; RV64-NEXT:    vle16.v v12, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 61
@@ -2037,8 +2076,8 @@
 ; RV32-NEXT:    vle32.v v12, (a0), v0.t
 ; RV32-NEXT:    li a0, 36
 ; RV32-NEXT:    vmv.s.x v20, a1
-; RV32-NEXT:    lui a1, %hi(.LCPI65_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI65_0)
+; RV32-NEXT:    lui a1, %hi(.LCPI67_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI67_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v21, (a1)
 ; RV32-NEXT:    vcompress.vm v8, v12, v11
@@ -2113,8 +2152,8 @@
 ; RV32-NEXT:    vmv.s.x v10, a0
 ; RV32-NEXT:    li a0, 146
 ; RV32-NEXT:    vmv.s.x v11, a0
-; RV32-NEXT:    lui a0, %hi(.LCPI66_0)
-; RV32-NEXT:    addi a0, a0, %lo(.LCPI66_0)
+; RV32-NEXT:    lui a0, %hi(.LCPI68_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI68_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v20, (a0)
 ; RV32-NEXT:    li a0, 36
@@ -2380,8 +2419,8 @@
 ; RV32-NEXT:    vle32.v v12, (a0), v0.t
 ; RV32-NEXT:    li a0, 36
 ; RV32-NEXT:    vmv.s.x v20, a1
-; RV32-NEXT:    lui a1, %hi(.LCPI78_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI78_0)
+; RV32-NEXT:    lui a1, %hi(.LCPI80_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI80_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v21, (a1)
 ; RV32-NEXT:    vcompress.vm v8, v12, v11
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
index 36d1aee..97fc2e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
@@ -494,6 +494,85 @@
   ret <vscale x 2 x i32> %t3
 }
 
+; mask = all ones, skip the last 2 fields
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @gap_mask_load_factor4(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: gap_mask_load_factor4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a2, 16
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vlsseg2e32.v v8, (a0), a2
+; CHECK-NEXT:    ret
+  %rvl = mul nuw i32 %evl, 4
+  %gap.mask = call <vscale x 8 x i1> @llvm.vector.interleave4(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i1> splat (i1 false), <vscale x 2 x i1> splat (i1 false))
+  %combined.mask = and <vscale x 8 x i1> %gap.mask, splat (i1 true)
+
+  %wide.masked.load = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr %ptr, <vscale x 8 x i1> %combined.mask, i32 %rvl)
+  %d = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave4.nxv8i32(<vscale x 8 x i32> %wide.masked.load)
+  %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 0
+  %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 1
+  %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 2
+  %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 3
+
+  %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> %t0, 0
+  %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+  %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+  %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+  ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
+}
+
+; mask = %m, skip the last field
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @gap_mask_load_factor4_mask(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: gap_mask_load_factor4_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a2, 16
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vlsseg3e32.v v8, (a0), a2, v0.t
+; CHECK-NEXT:    ret
+  %rvl = mul nuw i32 %evl, 4
+  %actual.mask = call <vscale x 8 x i1> @llvm.vector.interleave4(<vscale x 2 x i1> %m, <vscale x 2 x i1> %m, <vscale x 2 x i1> %m, <vscale x 2 x i1> %m)
+  %gap.mask = call <vscale x 8 x i1> @llvm.vector.interleave4(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i1> splat (i1 false))
+  %combined.mask = and <vscale x 8 x i1> %gap.mask, %actual.mask
+
+  %wide.masked.load = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr %ptr, <vscale x 8 x i1> %combined.mask, i32 %rvl)
+  %d = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave4.nxv8i32(<vscale x 8 x i32> %wide.masked.load)
+  %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 0
+  %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 1
+  %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 2
+  %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 3
+
+  %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> %t0, 0
+  %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+  %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+  %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+  ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
+}
+
+; mask = %m, skip the last 3 fields. We should not apply the gap-mask optimization here but we can extract only the first field so that it can be turned into a strided load.
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @gap_mask_single_field_load_factor4_mask(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: gap_mask_single_field_load_factor4_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a2, 16
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vlse32.v v8, (a0), a2, v0.t
+; CHECK-NEXT:    ret
+  %rvl = mul nuw i32 %evl, 4
+  %actual.mask = call <vscale x 8 x i1> @llvm.vector.interleave4(<vscale x 2 x i1> %m, <vscale x 2 x i1> %m, <vscale x 2 x i1> %m, <vscale x 2 x i1> %m)
+  %gap.mask = call <vscale x 8 x i1> @llvm.vector.interleave4(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i1> splat (i1 false), <vscale x 2 x i1> splat (i1 false), <vscale x 2 x i1> splat (i1 false))
+  %combined.mask = and <vscale x 8 x i1> %gap.mask, %actual.mask
+
+  %wide.masked.load = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr %ptr, <vscale x 8 x i1> %combined.mask, i32 %rvl)
+  %d = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave4.nxv8i32(<vscale x 8 x i32> %wide.masked.load)
+  %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 0
+  %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 1
+  %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 2
+  %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 3
+
+  %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> %t0, 0
+  %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+  %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+  %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+  ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
+}
 
 ; Negative tests
 
@@ -615,4 +694,3 @@
   %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
   ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
 }
-