[IA][RISCV] Support gap mask for loads that are de-interleaved through intrinsics (#197062)
In the context of (de)interleaved loads and stores, a gap mask is a mask
that effectively skips an entire component / field. Starting from
#151612, the InterleavedAccessPass gained support for recognizing masks of
this kind and passing them to the TLI hook. RISC-V originally only supported
gap masks on fixed vectors; this patch adds support for recognizing gap
masks on loads that are de-interleaved through the
`llvm.vector.deinterleaveN` intrinsics, with both scalable vectors and
fixed vectors.
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 3187631..82c47cc 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3342,8 +3342,11 @@
/// \p Load is the accompanying load instruction. Can be either a plain load
/// instruction or a vp.load intrinsic.
/// \p DI represents the deinterleaveN intrinsic.
+ /// \p GapMask is a mask with zeros for components / fields that may not be
+ /// accessed.
virtual bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
- IntrinsicInst *DI) const {
+ IntrinsicInst *DI,
+ const APInt &GapMask) const {
return false;
}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index bdfbeea..5498ce1 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -698,6 +698,7 @@
assert(Factor && "unexpected deinterleave intrinsic");
Value *Mask = nullptr;
+ auto GapMask = APInt::getAllOnes(Factor);
if (LI) {
if (!LI->isSimple())
return false;
@@ -711,24 +712,20 @@
return false;
// Check mask operand. Handle both all-true/false and interleaved mask.
- APInt GapMask(Factor, 0);
std::tie(Mask, GapMask) =
getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI));
if (!Mask)
return false;
- // We haven't supported gap mask if it's deinterleaving using intrinsics.
- // Yet it is possible that we already changed the IR, hence returning true
- // here.
- if (GapMask.popcount() != Factor)
- return true;
LLVM_DEBUG(dbgs() << "IA: Found a vp.load or masked.load with deinterleave"
<< " intrinsic " << *DI << " and factor = "
<< Factor << "\n");
+ LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor
+ << " and actual factor " << GapMask.popcount() << "\n");
}
// Try and match this with target specific intrinsics.
- if (!TLI->lowerDeinterleaveIntrinsicToLoad(LoadedVal, Mask, DI))
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(LoadedVal, Mask, DI, GapMask))
return false;
DeadInsts.insert(DI);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 60ce666..0034c03 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18906,8 +18906,10 @@
}
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
+ Instruction *Load, Value *Mask, IntrinsicInst *DI,
+ const APInt &GapMask) const {
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
+ assert(GapMask.getBitWidth() == Factor);
if (Factor != 2 && Factor != 3 && Factor != 4) {
LLVM_DEBUG(dbgs() << "Matching ld2, ld3 and ld4 patterns failed\n");
return false;
@@ -18917,6 +18919,10 @@
return false;
assert(!Mask && "Unexpected mask on a load\n");
+ // Gap mask is currently not supported.
+ if (!GapMask.isAllOnes())
+ return false;
+
VectorType *VTy = getDeinterleavedVectorType(DI);
const DataLayout &DL = LI->getModule()->getDataLayout();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 2e66b5a..25cab4d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -232,7 +232,8 @@
const APInt &GapMask) const override;
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
- IntrinsicInst *DI) const override;
+ IntrinsicInst *DI,
+ const APInt &GapMask) const override;
bool lowerInterleaveIntrinsicToStore(
Instruction *Store, Value *Mask,
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 7ef15bc..558df27 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -438,7 +438,8 @@
const APInt &GapMask) const override;
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
- IntrinsicInst *DI) const override;
+ IntrinsicInst *DI,
+ const APInt &GapMask) const override;
bool lowerInterleaveIntrinsicToStore(
Instruction *Store, Value *Mask,
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 6e2626d..7026541 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -69,6 +69,12 @@
Intrinsic::riscv_sseg6_load_mask, Intrinsic::riscv_sseg7_load_mask,
Intrinsic::riscv_sseg8_load_mask};
+static const Intrinsic::ID ScalableVlssegIntrIds[] = {
+ Intrinsic::riscv_vlsseg2_mask, Intrinsic::riscv_vlsseg3_mask,
+ Intrinsic::riscv_vlsseg4_mask, Intrinsic::riscv_vlsseg5_mask,
+ Intrinsic::riscv_vlsseg6_mask, Intrinsic::riscv_vlsseg7_mask,
+ Intrinsic::riscv_vlsseg8_mask};
+
static const Intrinsic::ID ScalableVlsegIntrIds[] = {
Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask,
Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask,
@@ -349,15 +355,25 @@
}
bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
- Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
+ Instruction *Load, Value *Mask, IntrinsicInst *DI,
+ const APInt &GapMask) const {
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
+ assert(GapMask.getBitWidth() == Factor);
if (Factor > 8)
return false;
+ // We only support cases where the skipped fields are the trailing ones.
+ if (!GapMask.isMask())
+ return false;
IRBuilder<> Builder(Load);
VectorType *ResVTy = getDeinterleavedVectorType(DI);
+ unsigned MaskFactor = GapMask.getActiveBits();
+ // For MaskFactor of 1, we still want to lower it with segmented load
+ // (of the original Factor), because the sole field extraction will eventually
+ // turn it into a strided load.
+ bool UseStridedSeg = MaskFactor < Factor && MaskFactor > 1;
const DataLayout &DL = Load->getDataLayout();
auto *XLenTy = Builder.getIntNTy(Subtarget.getXLen());
@@ -371,22 +387,53 @@
if (!isLegalInterleavedAccessType(ResVTy, Factor, Alignment, AS, DL))
return false;
+ unsigned ElementSizeInBytes = DL.getTypeStoreSize(ResVTy->getElementType());
Value *Return;
if (isa<FixedVectorType>(ResVTy)) {
- Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
- {ResVTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
+ Value *SegLoad;
+ if (UseStridedSeg) {
+ // Lower to strided segmented load.
+ Value *Stride = ConstantInt::get(XLenTy, Factor * ElementSizeInBytes);
+ SegLoad = Builder.CreateIntrinsic(FixedVlssegIntrIds[MaskFactor - 2],
+ {ResVTy, PtrTy, XLenTy, XLenTy},
+ {Ptr, Stride, Mask, VL});
+ } else {
+ SegLoad =
+ Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
+ {ResVTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
+ }
+
+ if (MaskFactor != Factor) {
+ // Replace masked-off fields with poison values.
+ SmallVector<Type *, 8> AggrTypes{Factor, ResVTy};
+ Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes));
+ for (unsigned I = 0; I < MaskFactor; ++I) {
+ Value *SubVec = Builder.CreateExtractValue(SegLoad, I);
+ Return = Builder.CreateInsertValue(Return, SubVec, I);
+ }
+ } else {
+ Return = SegLoad;
+ }
} else {
unsigned SEW = DL.getTypeSizeInBits(ResVTy->getElementType());
unsigned NumElts = ResVTy->getElementCount().getKnownMinValue();
Type *VecTupTy = TargetExtType::get(
Load->getContext(), "riscv.vector.tuple",
ScalableVectorType::get(Builder.getInt8Ty(), NumElts * SEW / 8),
- Factor);
- Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration(
- Load->getModule(), ScalableVlsegIntrIds[Factor - 2],
- {VecTupTy, PtrTy, Mask->getType(), VL->getType()});
+ UseStridedSeg ? MaskFactor : Factor);
+ Function *SegLoadFunc;
+ if (UseStridedSeg) {
+ // Lower to strided segmented load.
+ SegLoadFunc = Intrinsic::getOrInsertDeclaration(
+ Load->getModule(), ScalableVlssegIntrIds[MaskFactor - 2],
+ {VecTupTy, PtrTy, XLenTy, Mask->getType()});
+ } else {
+ SegLoadFunc = Intrinsic::getOrInsertDeclaration(
+ Load->getModule(), ScalableVlsegIntrIds[Factor - 2],
+ {VecTupTy, PtrTy, Mask->getType(), VL->getType()});
+ }
- Value *Operands[] = {
+ SmallVector<Value *, 8> Operands = {
PoisonValue::get(VecTupTy),
Ptr,
Mask,
@@ -394,12 +441,16 @@
ConstantInt::get(XLenTy,
RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC),
ConstantInt::get(XLenTy, Log2_64(SEW))};
+ if (UseStridedSeg) {
+ Value *Stride = ConstantInt::get(XLenTy, Factor * ElementSizeInBytes);
+ Operands.insert(std::next(Operands.begin(), 2), Stride);
+ }
- CallInst *Vlseg = Builder.CreateCall(VlsegNFunc, Operands);
+ CallInst *Vlseg = Builder.CreateCall(SegLoadFunc, Operands);
- SmallVector<Type *, 2> AggrTypes{Factor, ResVTy};
+ SmallVector<Type *, 8> AggrTypes{Factor, ResVTy};
Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes));
- for (unsigned i = 0; i < Factor; ++i) {
+ for (unsigned i = 0; i < MaskFactor; ++i) {
Value *VecExtract = Builder.CreateIntrinsic(
Intrinsic::riscv_tuple_extract, {ResVTy, VecTupTy},
{Vlseg, Builder.getInt32(i)});
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 82419c2..c2981c6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -431,6 +431,45 @@
ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3
}
+; mask = 1010, skip the last two fields
+define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @gap_mask_vpload_factor4_intrinsics(ptr %ptr) {
+; CHECK-LABEL: gap_mask_vpload_factor4_intrinsics:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v0, 5
+; CHECK-NEXT: li a1, 16
+; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1, v0.t
+; CHECK-NEXT: ret
+ %wide.masked.load = call <16 x i32> @llvm.vp.load(ptr %ptr, <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, i32 16)
+ %d = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.vector.deinterleave4(<16 x i32> %wide.masked.load)
+ %t0 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %d, 0
+ %t1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %d, 1
+
+ %res0 = insertvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } poison, <4 x i32> %t0, 0
+ %res1 = insertvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %res0, <4 x i32> %t1, 1
+ ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %res1
+}
+
+; mask = 1010, skip the last three fields. We should not apply the gap-mask optimization here but
+; we can extract only the first field so that it can be turned into a strided load.
+define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @gap_mask_single_field_vpload_factor4_intrinsics(ptr %ptr) {
+; CHECK-LABEL: gap_mask_single_field_vpload_factor4_intrinsics:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v0, 5
+; CHECK-NEXT: li a1, 16
+; CHECK-NEXT: vlse32.v v8, (a0), a1, v0.t
+; CHECK-NEXT: ret
+ %wide.masked.load = call <16 x i32> @llvm.vp.load(ptr %ptr, <16 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, i32 16)
+ %d = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.vector.deinterleave4(<16 x i32> %wide.masked.load)
+ %t0 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %d, 0
+ %t1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %d, 1
+
+ %res0 = insertvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } poison, <4 x i32> %t0, 0
+ %res1 = insertvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %res0, <4 x i32> %t1, 1
+ ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %res1
+}
+
define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor5(ptr %ptr) {
; CHECK-LABEL: vpload_factor5:
; CHECK: # %bb.0:
@@ -615,8 +654,8 @@
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs4r.v v28, (a1) # vscale x 32-byte Folded Spill
-; RV32-NEXT: lui a1, %hi(.LCPI27_0)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI27_0)
+; RV32-NEXT: lui a1, %hi(.LCPI29_0)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI29_0)
; RV32-NEXT: lui a6, 49164
; RV32-NEXT: lui t1, 3
; RV32-NEXT: lui t0, 196656
@@ -777,8 +816,8 @@
; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT: vrgatherei16.vv v24, v8, v2
-; RV32-NEXT: lui a1, %hi(.LCPI27_1)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI27_1)
+; RV32-NEXT: lui a1, %hi(.LCPI29_1)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI29_1)
; RV32-NEXT: lui a3, 3073
; RV32-NEXT: addi a3, a3, -1024
; RV32-NEXT: vmv.s.x v0, a3
@@ -803,8 +842,8 @@
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT: lui a1, %hi(.LCPI27_3)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI27_3)
+; RV32-NEXT: lui a1, %hi(.LCPI29_3)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI29_3)
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle16.v v28, (a1)
; RV32-NEXT: csrr a1, vlenb
@@ -814,8 +853,8 @@
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; RV32-NEXT: vrgatherei16.vv v16, v8, v30
-; RV32-NEXT: lui a1, %hi(.LCPI27_2)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI27_2)
+; RV32-NEXT: lui a1, %hi(.LCPI29_2)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI29_2)
; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; RV32-NEXT: vle16.v v20, (a1)
; RV32-NEXT: csrr a1, vlenb
@@ -863,16 +902,16 @@
; RV32-NEXT: vrgatherei16.vv v24, v12, v20
; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
; RV32-NEXT: vmv.v.v v24, v8
-; RV32-NEXT: lui a1, %hi(.LCPI27_4)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI27_4)
-; RV32-NEXT: lui a2, %hi(.LCPI27_5)
-; RV32-NEXT: addi a2, a2, %lo(.LCPI27_5)
+; RV32-NEXT: lui a1, %hi(.LCPI29_4)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI29_4)
+; RV32-NEXT: lui a2, %hi(.LCPI29_5)
+; RV32-NEXT: addi a2, a2, %lo(.LCPI29_5)
; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; RV32-NEXT: vle16.v v28, (a2)
; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; RV32-NEXT: vle16.v v1, (a1)
-; RV32-NEXT: lui a1, %hi(.LCPI27_7)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI27_7)
+; RV32-NEXT: lui a1, %hi(.LCPI29_7)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI29_7)
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle16.v v2, (a1)
; RV32-NEXT: csrr a1, vlenb
@@ -895,14 +934,14 @@
; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vrgatherei16.vv v16, v8, v2
-; RV32-NEXT: lui a1, %hi(.LCPI27_6)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI27_6)
-; RV32-NEXT: lui a2, %hi(.LCPI27_8)
-; RV32-NEXT: addi a2, a2, %lo(.LCPI27_8)
+; RV32-NEXT: lui a1, %hi(.LCPI29_6)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI29_6)
+; RV32-NEXT: lui a2, %hi(.LCPI29_8)
+; RV32-NEXT: addi a2, a2, %lo(.LCPI29_8)
; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; RV32-NEXT: vle16.v v8, (a1)
-; RV32-NEXT: lui a1, %hi(.LCPI27_9)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI27_9)
+; RV32-NEXT: lui a1, %hi(.LCPI29_9)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI29_9)
; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; RV32-NEXT: vle16.v v10, (a1)
; RV32-NEXT: csrr a1, vlenb
@@ -1210,8 +1249,8 @@
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
; RV64-NEXT: vs4r.v v12, (a2) # vscale x 32-byte Folded Spill
-; RV64-NEXT: lui a2, %hi(.LCPI27_0)
-; RV64-NEXT: addi a2, a2, %lo(.LCPI27_0)
+; RV64-NEXT: lui a2, %hi(.LCPI29_0)
+; RV64-NEXT: addi a2, a2, %lo(.LCPI29_0)
; RV64-NEXT: li a3, 1040
; RV64-NEXT: vmv.s.x v0, a3
; RV64-NEXT: addi a1, a1, -2016
@@ -1252,8 +1291,8 @@
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; RV64-NEXT: lui a1, %hi(.LCPI27_1)
-; RV64-NEXT: addi a1, a1, %lo(.LCPI27_1)
+; RV64-NEXT: lui a1, %hi(.LCPI29_1)
+; RV64-NEXT: addi a1, a1, %lo(.LCPI29_1)
; RV64-NEXT: vle16.v v24, (a1)
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a2, 37
@@ -1268,8 +1307,8 @@
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill
-; RV64-NEXT: lui a1, %hi(.LCPI27_2)
-; RV64-NEXT: addi a1, a1, %lo(.LCPI27_2)
+; RV64-NEXT: lui a1, %hi(.LCPI29_2)
+; RV64-NEXT: addi a1, a1, %lo(.LCPI29_2)
; RV64-NEXT: vle16.v v12, (a1)
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a2, 21
@@ -1335,12 +1374,12 @@
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
-; RV64-NEXT: lui a1, %hi(.LCPI27_3)
-; RV64-NEXT: addi a1, a1, %lo(.LCPI27_3)
+; RV64-NEXT: lui a1, %hi(.LCPI29_3)
+; RV64-NEXT: addi a1, a1, %lo(.LCPI29_3)
; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; RV64-NEXT: vle16.v v8, (a1)
-; RV64-NEXT: lui a1, %hi(.LCPI27_4)
-; RV64-NEXT: addi a1, a1, %lo(.LCPI27_4)
+; RV64-NEXT: lui a1, %hi(.LCPI29_4)
+; RV64-NEXT: addi a1, a1, %lo(.LCPI29_4)
; RV64-NEXT: vle16.v v10, (a1)
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a2, 77
@@ -1388,8 +1427,8 @@
; RV64-NEXT: vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vrgatherei16.vv v0, v8, v16
-; RV64-NEXT: lui a1, %hi(.LCPI27_5)
-; RV64-NEXT: addi a1, a1, %lo(.LCPI27_5)
+; RV64-NEXT: lui a1, %hi(.LCPI29_5)
+; RV64-NEXT: addi a1, a1, %lo(.LCPI29_5)
; RV64-NEXT: vle16.v v12, (a1)
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a2, 61
@@ -2037,8 +2076,8 @@
; RV32-NEXT: vle32.v v12, (a0), v0.t
; RV32-NEXT: li a0, 36
; RV32-NEXT: vmv.s.x v20, a1
-; RV32-NEXT: lui a1, %hi(.LCPI65_0)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI65_0)
+; RV32-NEXT: lui a1, %hi(.LCPI67_0)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI67_0)
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT: vle16.v v21, (a1)
; RV32-NEXT: vcompress.vm v8, v12, v11
@@ -2113,8 +2152,8 @@
; RV32-NEXT: vmv.s.x v10, a0
; RV32-NEXT: li a0, 146
; RV32-NEXT: vmv.s.x v11, a0
-; RV32-NEXT: lui a0, %hi(.LCPI66_0)
-; RV32-NEXT: addi a0, a0, %lo(.LCPI66_0)
+; RV32-NEXT: lui a0, %hi(.LCPI68_0)
+; RV32-NEXT: addi a0, a0, %lo(.LCPI68_0)
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT: vle16.v v20, (a0)
; RV32-NEXT: li a0, 36
@@ -2380,8 +2419,8 @@
; RV32-NEXT: vle32.v v12, (a0), v0.t
; RV32-NEXT: li a0, 36
; RV32-NEXT: vmv.s.x v20, a1
-; RV32-NEXT: lui a1, %hi(.LCPI78_0)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI78_0)
+; RV32-NEXT: lui a1, %hi(.LCPI80_0)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI80_0)
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT: vle16.v v21, (a1)
; RV32-NEXT: vcompress.vm v8, v12, v11
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
index 36d1aee..97fc2e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
@@ -494,6 +494,85 @@
ret <vscale x 2 x i32> %t3
}
+; mask = all ones, skip the last 2 fields
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @gap_mask_load_factor4(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: gap_mask_load_factor4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 16
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vlsseg2e32.v v8, (a0), a2
+; CHECK-NEXT: ret
+ %rvl = mul nuw i32 %evl, 4
+ %gap.mask = call <vscale x 8 x i1> @llvm.vector.interleave4(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i1> splat (i1 false), <vscale x 2 x i1> splat (i1 false))
+ %combined.mask = and <vscale x 8 x i1> %gap.mask, splat (i1 true)
+
+ %wide.masked.load = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr %ptr, <vscale x 8 x i1> %combined.mask, i32 %rvl)
+ %d = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave4.nxv8i32(<vscale x 8 x i32> %wide.masked.load)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 0
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 1
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 2
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 3
+
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
+}
+
+; mask = %m, skip the last field
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @gap_mask_load_factor4_mask(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: gap_mask_load_factor4_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 16
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vlsseg3e32.v v8, (a0), a2, v0.t
+; CHECK-NEXT: ret
+ %rvl = mul nuw i32 %evl, 4
+ %actual.mask = call <vscale x 8 x i1> @llvm.vector.interleave4(<vscale x 2 x i1> %m, <vscale x 2 x i1> %m, <vscale x 2 x i1> %m, <vscale x 2 x i1> %m)
+ %gap.mask = call <vscale x 8 x i1> @llvm.vector.interleave4(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i1> splat (i1 false))
+ %combined.mask = and <vscale x 8 x i1> %gap.mask, %actual.mask
+
+ %wide.masked.load = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr %ptr, <vscale x 8 x i1> %combined.mask, i32 %rvl)
+ %d = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave4.nxv8i32(<vscale x 8 x i32> %wide.masked.load)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 0
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 1
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 2
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 3
+
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
+}
+
+; mask = %m, skip the last 3 fields. We should not apply the gap-mask optimization here but we can extract only the first field so that it can be turned into a strided load.
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @gap_mask_single_field_load_factor4_mask(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: gap_mask_single_field_load_factor4_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 16
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vlse32.v v8, (a0), a2, v0.t
+; CHECK-NEXT: ret
+ %rvl = mul nuw i32 %evl, 4
+ %actual.mask = call <vscale x 8 x i1> @llvm.vector.interleave4(<vscale x 2 x i1> %m, <vscale x 2 x i1> %m, <vscale x 2 x i1> %m, <vscale x 2 x i1> %m)
+ %gap.mask = call <vscale x 8 x i1> @llvm.vector.interleave4(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i1> splat (i1 false), <vscale x 2 x i1> splat (i1 false), <vscale x 2 x i1> splat (i1 false))
+ %combined.mask = and <vscale x 8 x i1> %gap.mask, %actual.mask
+
+ %wide.masked.load = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr %ptr, <vscale x 8 x i1> %combined.mask, i32 %rvl)
+ %d = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave4.nxv8i32(<vscale x 8 x i32> %wide.masked.load)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 0
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 1
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 2
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %d, 3
+
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
+}
; Negative tests
@@ -615,4 +694,3 @@
%res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
}
-