[AArch64][CodeGen] Lower (de)interleave2 intrinsics to ld2/st2
The InterleavedAccess pass currently matches (de)interleaving
shufflevector instructions with loads or stores, and calls into
target lowering to generate ldN or stN instructions.
Since we can't use shufflevector for scalable vectors (other than a
splat with zeroinitializer), we have the interleave2 and
deinterleave2 intrinsics instead. This patch extends InterleavedAccess
to recognize those intrinsics and, where possible, replace them with
ld2/st2.
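For example (a minimal sketch mirroring the new scalable-vector tests,
with %ptrue standing in for the all-active predicate splat the pass
actually emits), a deinterleave2 fed by a simple load

  %load = load <vscale x 8 x i32>, ptr %ptr
  %deinterleave = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %load)

is rewritten into a single structured load

  %ldN = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> %ptrue, ptr %ptr)

and an interleave2 feeding a store is handled symmetrically via
llvm.aarch64.sve.st2 (or the NEON ld2/st2 intrinsics for fixed-width
vectors).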
Reviewed By: paulwalker-arm
Differential Revision: https://reviews.llvm.org/D146218
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 0aa4fa2..6daf623 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2972,6 +2972,28 @@
return false;
}
+  /// Lower a deinterleave intrinsic to a target-specific load intrinsic.
+  /// Return true on success. Currently only supports
+  /// llvm.experimental.vector.deinterleave2.
+  ///
+  /// \p DI is the deinterleave intrinsic.
+  /// \p LI is the accompanying load instruction.
+ virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ LoadInst *LI) const {
+ return false;
+ }
+
+  /// Lower an interleave intrinsic to a target-specific store intrinsic.
+  /// Return true on success. Currently only supports
+  /// llvm.experimental.vector.interleave2.
+  ///
+  /// \p II is the interleave intrinsic.
+  /// \p SI is the accompanying store instruction.
+ virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ StoreInst *SI) const {
+ return false;
+ }
+
/// Return true if an fpext operation is free (for instance, because
/// single-precision floating-point numbers are implicitly extended to
/// double-precision).
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index e4f581f..6b38485 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -58,6 +58,7 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
@@ -113,6 +114,16 @@
bool lowerInterleavedStore(StoreInst *SI,
SmallVector<Instruction *, 32> &DeadInsts);
+  /// Transform a load and a deinterleave intrinsic into target-specific
+  /// instructions.
+ bool lowerDeinterleaveIntrinsic(IntrinsicInst *II,
+ SmallVector<Instruction *, 32> &DeadInsts);
+
+  /// Transform an interleave intrinsic and a store into target-specific
+  /// instructions.
+ bool lowerInterleaveIntrinsic(IntrinsicInst *II,
+ SmallVector<Instruction *, 32> &DeadInsts);
+
/// Returns true if the uses of an interleaved load by the
/// extractelement instructions in \p Extracts can be replaced by uses of the
/// shufflevector instructions in \p Shuffles instead. If so, the necessary
@@ -446,6 +457,47 @@
return true;
}
+bool InterleavedAccess::lowerDeinterleaveIntrinsic(
+ IntrinsicInst *DI, SmallVector<Instruction *, 32> &DeadInsts) {
+ LoadInst *LI = dyn_cast<LoadInst>(DI->getOperand(0));
+
+ if (!LI || !LI->hasOneUse() || !LI->isSimple())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
+
+  // Try to match this with target-specific intrinsics.
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI))
+ return false;
+
+ // We now have a target-specific load, so delete the old one.
+ DeadInsts.push_back(DI);
+ DeadInsts.push_back(LI);
+ return true;
+}
+
+bool InterleavedAccess::lowerInterleaveIntrinsic(
+ IntrinsicInst *II, SmallVector<Instruction *, 32> &DeadInsts) {
+ if (!II->hasOneUse())
+ return false;
+
+ StoreInst *SI = dyn_cast<StoreInst>(*(II->users().begin()));
+
+ if (!SI || !SI->isSimple())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
+
+  // Try to match this with target-specific intrinsics.
+ if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
+ return false;
+
+ // We now have a target-specific store, so delete the old one.
+ DeadInsts.push_back(SI);
+ DeadInsts.push_back(II);
+ return true;
+}
+
bool InterleavedAccess::runOnFunction(Function &F) {
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
if (!TPC || !LowerInterleavedAccesses)
@@ -468,6 +520,15 @@
if (auto *SI = dyn_cast<StoreInst>(&I))
Changed |= lowerInterleavedStore(SI, DeadInsts);
+
+ if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+ // At present, we only have intrinsics to represent (de)interleaving
+ // with a factor of 2.
+ if (II->getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2)
+ Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
+ if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+ Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
+ }
}
for (auto *I : DeadInsts)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8331104..f1a649b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14754,12 +14754,18 @@
bool AArch64TargetLowering::isLegalInterleavedAccessType(
VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
-
unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
auto EC = VecTy->getElementCount();
unsigned MinElts = EC.getKnownMinValue();
UseScalable = false;
+
+ if (!VecTy->isScalableTy() && !Subtarget->hasNEON())
+ return false;
+
+ if (VecTy->isScalableTy() && !Subtarget->hasSVEorSME())
+ return false;
+
// Ensure that the predicate for this number of elements is available.
if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(MinElts))
return false;
@@ -14772,8 +14778,10 @@
if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
return false;
- if (EC.isScalable())
- return MinElts * ElSize == 128;
+ if (EC.isScalable()) {
+ UseScalable = true;
+ return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
+ }
unsigned VecSize = DL.getTypeSizeInBits(VecTy);
if (Subtarget->forceStreamingCompatibleSVE() ||
@@ -14818,6 +14826,38 @@
llvm_unreachable("Cannot handle input vector type");
}
+static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
+ bool Scalable, Type *LDVTy,
+ Type *PtrTy) {
+ assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
+ static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
+ Intrinsic::aarch64_sve_ld3_sret,
+ Intrinsic::aarch64_sve_ld4_sret};
+ static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
+ Intrinsic::aarch64_neon_ld3,
+ Intrinsic::aarch64_neon_ld4};
+ if (Scalable)
+ return Intrinsic::getDeclaration(M, SVELoads[Factor - 2], {LDVTy});
+
+ return Intrinsic::getDeclaration(M, NEONLoads[Factor - 2], {LDVTy, PtrTy});
+}
+
+static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
+ bool Scalable, Type *STVTy,
+ Type *PtrTy) {
+ assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
+ static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
+ Intrinsic::aarch64_sve_st3,
+ Intrinsic::aarch64_sve_st4};
+ static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
+ Intrinsic::aarch64_neon_st3,
+ Intrinsic::aarch64_neon_st4};
+ if (Scalable)
+ return Intrinsic::getDeclaration(M, SVEStores[Factor - 2], {STVTy});
+
+ return Intrinsic::getDeclaration(M, NEONStores[Factor - 2], {STVTy, PtrTy});
+}
+
/// Lower an interleaved load into a ldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
@@ -14883,26 +14923,12 @@
LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
}
- Type *PtrTy =
- UseScalable
- ? LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())
- : LDVTy->getPointerTo(LI->getPointerAddressSpace());
+ Type *PtrTy = LI->getPointerOperandType();
Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
LDVTy->getElementCount());
- static const Intrinsic::ID SVELoadIntrs[3] = {
- Intrinsic::aarch64_sve_ld2_sret, Intrinsic::aarch64_sve_ld3_sret,
- Intrinsic::aarch64_sve_ld4_sret};
- static const Intrinsic::ID NEONLoadIntrs[3] = {Intrinsic::aarch64_neon_ld2,
- Intrinsic::aarch64_neon_ld3,
- Intrinsic::aarch64_neon_ld4};
- Function *LdNFunc;
- if (UseScalable)
- LdNFunc = Intrinsic::getDeclaration(LI->getModule(),
- SVELoadIntrs[Factor - 2], {LDVTy});
- else
- LdNFunc = Intrinsic::getDeclaration(
- LI->getModule(), NEONLoadIntrs[Factor - 2], {LDVTy, PtrTy});
+ Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
+ UseScalable, LDVTy, PtrTy);
// Holds sub-vectors extracted from the load intrinsic return values. The
// sub-vectors are associated with the shufflevector instructions they will
@@ -15080,26 +15106,12 @@
if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 && Mask[0] != 0)
return false;
- Type *PtrTy =
- UseScalable
- ? STVTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())
- : STVTy->getPointerTo(SI->getPointerAddressSpace());
+ Type *PtrTy = SI->getPointerOperandType();
Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
STVTy->getElementCount());
- static const Intrinsic::ID SVEStoreIntrs[3] = {Intrinsic::aarch64_sve_st2,
- Intrinsic::aarch64_sve_st3,
- Intrinsic::aarch64_sve_st4};
- static const Intrinsic::ID NEONStoreIntrs[3] = {Intrinsic::aarch64_neon_st2,
- Intrinsic::aarch64_neon_st3,
- Intrinsic::aarch64_neon_st4};
- Function *StNFunc;
- if (UseScalable)
- StNFunc = Intrinsic::getDeclaration(SI->getModule(),
- SVEStoreIntrs[Factor - 2], {STVTy});
- else
- StNFunc = Intrinsic::getDeclaration(
- SI->getModule(), NEONStoreIntrs[Factor - 2], {STVTy, PtrTy});
+ Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
+ UseScalable, STVTy, PtrTy);
Value *PTrue = nullptr;
if (UseScalable) {
@@ -15169,6 +15181,144 @@
return true;
}
+bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
+ IntrinsicInst *DI, LoadInst *LI) const {
+ // Only deinterleave2 supported at present.
+ if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
+ return false;
+
+ // Only a factor of 2 supported at present.
+ const unsigned Factor = 2;
+
+ VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
+ const DataLayout &DL = DI->getModule()->getDataLayout();
+ bool UseScalable;
+ if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
+ return false;
+
+ // TODO: Add support for using SVE instructions with fixed types later, using
+ // the code from lowerInterleavedLoad to obtain the correct container type.
+ if (UseScalable && !VTy->isScalableTy())
+ return false;
+
+ unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
+
+ VectorType *LdTy =
+ VectorType::get(VTy->getElementType(),
+ VTy->getElementCount().divideCoefficientBy(NumLoads));
+
+ Type *PtrTy = LI->getPointerOperandType();
+ Function *LdNFunc = getStructuredLoadFunction(DI->getModule(), Factor,
+ UseScalable, LdTy, PtrTy);
+
+ IRBuilder<> Builder(LI);
+
+ Value *Pred = nullptr;
+ if (UseScalable)
+ Pred =
+ Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
+
+ Value *BaseAddr = LI->getPointerOperand();
+ Value *Result;
+ if (NumLoads > 1) {
+ Value *Left = PoisonValue::get(VTy);
+ Value *Right = PoisonValue::get(VTy);
+
+ for (unsigned I = 0; I < NumLoads; ++I) {
+ Value *Offset = Builder.getInt64(I * Factor);
+
+ Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
+ Value *LdN = nullptr;
+ if (UseScalable)
+ LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
+ else
+ LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
+
+ Value *Idx =
+ Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
+ Left = Builder.CreateInsertVector(
+ VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
+ Right = Builder.CreateInsertVector(
+ VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
+ }
+
+ Result = PoisonValue::get(DI->getType());
+ Result = Builder.CreateInsertValue(Result, Left, 0);
+ Result = Builder.CreateInsertValue(Result, Right, 1);
+ } else {
+ if (UseScalable)
+ Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
+ else
+ Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
+ }
+
+ DI->replaceAllUsesWith(Result);
+ return true;
+}
+
+bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
+ IntrinsicInst *II, StoreInst *SI) const {
+ // Only interleave2 supported at present.
+ if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
+ return false;
+
+ // Only a factor of 2 supported at present.
+ const unsigned Factor = 2;
+
+ VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
+ const DataLayout &DL = II->getModule()->getDataLayout();
+ bool UseScalable;
+ if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
+ return false;
+
+ // TODO: Add support for using SVE instructions with fixed types later, using
+ // the code from lowerInterleavedStore to obtain the correct container type.
+ if (UseScalable && !VTy->isScalableTy())
+ return false;
+
+ unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
+
+ VectorType *StTy =
+ VectorType::get(VTy->getElementType(),
+ VTy->getElementCount().divideCoefficientBy(NumStores));
+
+ Type *PtrTy = SI->getPointerOperandType();
+ Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
+ UseScalable, StTy, PtrTy);
+
+ IRBuilder<> Builder(SI);
+
+ Value *BaseAddr = SI->getPointerOperand();
+ Value *Pred = nullptr;
+
+ if (UseScalable)
+ Pred =
+ Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
+
+ Value *L = II->getOperand(0);
+ Value *R = II->getOperand(1);
+
+ for (unsigned I = 0; I < NumStores; ++I) {
+ Value *Address = BaseAddr;
+ if (NumStores > 1) {
+ Value *Offset = Builder.getInt64(I * Factor);
+ Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
+
+ Value *Idx =
+ Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
+ L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
+ R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
+ }
+
+ if (UseScalable)
+ Builder.CreateCall(StNFunc, {L, R, Pred, Address});
+ else
+ Builder.CreateCall(StNFunc, {L, R, Address});
+ }
+
+ return true;
+}
+
EVT AArch64TargetLowering::getOptimalMemOpType(
const MemOp &Op, const AttributeList &FuncAttributes) const {
bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 3ce29a7..aca45f1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -651,6 +651,12 @@
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
+ bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ LoadInst *LI) const override;
+
+ bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ StoreInst *SI) const override;
+
bool isLegalAddImmediate(int64_t) const override;
bool isLegalICmpImmediate(int64_t) const override;
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
new file mode 100644
index 0000000..ab70d62
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
@@ -0,0 +1,323 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -interleaved-access -S | FileCheck %s --check-prefix=NEON
+; RUN: opt < %s -interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -force-streaming-compatible-sve -S | FileCheck %s --check-prefix=SVE-FIXED
+
+target triple = "aarch64-linux-gnu"
+
+define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2(ptr %ptr) {
+; NEON-LABEL: define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2
+; NEON-SAME: (ptr [[PTR:%.*]]) {
+; NEON-NEXT: [[LDN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[PTR]])
+; NEON-NEXT: ret { <16 x i8>, <16 x i8> } [[LDN]]
+;
+; SVE-FIXED-LABEL: define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2
+; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <32 x i8>, ptr [[PTR]], align 1
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> [[LOAD]])
+; SVE-FIXED-NEXT: ret { <16 x i8>, <16 x i8> } [[DEINTERLEAVE]]
+;
+ %load = load <32 x i8>, ptr %ptr, align 1
+ %deinterleave = tail call { <16 x i8>, <16 x i8> } @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %load)
+ ret { <16 x i8>, <16 x i8> } %deinterleave
+}
+
+define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2(ptr %ptr) {
+; NEON-LABEL: define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2
+; NEON-SAME: (ptr [[PTR:%.*]]) {
+; NEON-NEXT: [[LDN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[PTR]])
+; NEON-NEXT: ret { <8 x i16>, <8 x i16> } [[LDN]]
+;
+; SVE-FIXED-LABEL: define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2
+; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <16 x i16>, ptr [[PTR]], align 2
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <8 x i16>, <8 x i16> } @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> [[LOAD]])
+; SVE-FIXED-NEXT: ret { <8 x i16>, <8 x i16> } [[DEINTERLEAVE]]
+;
+ %load = load <16 x i16>, ptr %ptr, align 2
+ %deinterleave = tail call { <8 x i16>, <8 x i16> } @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %load)
+ ret { <8 x i16>, <8 x i16> } %deinterleave
+}
+
+define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2(ptr %ptr) {
+; NEON-LABEL: define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2
+; NEON-SAME: (ptr [[PTR:%.*]]) {
+; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[PTR]])
+; NEON-NEXT: ret { <4 x i32>, <4 x i32> } [[LDN]]
+;
+; SVE-FIXED-LABEL: define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2
+; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <8 x i32>, ptr [[PTR]], align 4
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> [[LOAD]])
+; SVE-FIXED-NEXT: ret { <4 x i32>, <4 x i32> } [[DEINTERLEAVE]]
+;
+ %load = load <8 x i32>, ptr %ptr, align 4
+ %deinterleave = tail call { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %load)
+ ret { <4 x i32>, <4 x i32> } %deinterleave
+}
+
+define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2(ptr %ptr) {
+; NEON-LABEL: define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2
+; NEON-SAME: (ptr [[PTR:%.*]]) {
+; NEON-NEXT: [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]])
+; NEON-NEXT: ret { <2 x i64>, <2 x i64> } [[LDN]]
+;
+; SVE-FIXED-LABEL: define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2
+; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <4 x i64>, ptr [[PTR]], align 8
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x i64>, <2 x i64> } @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> [[LOAD]])
+; SVE-FIXED-NEXT: ret { <2 x i64>, <2 x i64> } [[DEINTERLEAVE]]
+;
+ %load = load <4 x i64>, ptr %ptr, align 8
+ %deinterleave = tail call { <2 x i64>, <2 x i64> } @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %load)
+ ret { <2 x i64>, <2 x i64> } %deinterleave
+}
+
+define { <4 x float>, <4 x float> } @deinterleave_float_factor2(ptr %ptr) {
+; NEON-LABEL: define { <4 x float>, <4 x float> } @deinterleave_float_factor2
+; NEON-SAME: (ptr [[PTR:%.*]]) {
+; NEON-NEXT: [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr [[PTR]])
+; NEON-NEXT: ret { <4 x float>, <4 x float> } [[LDN]]
+;
+; SVE-FIXED-LABEL: define { <4 x float>, <4 x float> } @deinterleave_float_factor2
+; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <8 x float>, ptr [[PTR]], align 4
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x float>, <4 x float> } @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> [[LOAD]])
+; SVE-FIXED-NEXT: ret { <4 x float>, <4 x float> } [[DEINTERLEAVE]]
+;
+ %load = load <8 x float>, ptr %ptr, align 4
+ %deinterleave = tail call { <4 x float>, <4 x float> } @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %load)
+ ret { <4 x float>, <4 x float> } %deinterleave
+}
+
+define { <2 x double>, <2 x double> } @deinterleave_double_factor2(ptr %ptr) {
+; NEON-LABEL: define { <2 x double>, <2 x double> } @deinterleave_double_factor2
+; NEON-SAME: (ptr [[PTR:%.*]]) {
+; NEON-NEXT: [[LDN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0(ptr [[PTR]])
+; NEON-NEXT: ret { <2 x double>, <2 x double> } [[LDN]]
+;
+; SVE-FIXED-LABEL: define { <2 x double>, <2 x double> } @deinterleave_double_factor2
+; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <4 x double>, ptr [[PTR]], align 8
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x double>, <2 x double> } @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> [[LOAD]])
+; SVE-FIXED-NEXT: ret { <2 x double>, <2 x double> } [[DEINTERLEAVE]]
+;
+ %load = load <4 x double>, ptr %ptr, align 8
+ %deinterleave = tail call { <2 x double>, <2 x double> } @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %load)
+ ret { <2 x double>, <2 x double> } %deinterleave
+}
+
+define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2(ptr %ptr) {
+; NEON-LABEL: define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2
+; NEON-SAME: (ptr [[PTR:%.*]]) {
+; NEON-NEXT: [[LDN:%.*]] = call { <2 x ptr>, <2 x ptr> } @llvm.aarch64.neon.ld2.v2p0.p0(ptr [[PTR]])
+; NEON-NEXT: ret { <2 x ptr>, <2 x ptr> } [[LDN]]
+;
+; SVE-FIXED-LABEL: define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2
+; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <4 x ptr>, ptr [[PTR]], align 8
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x ptr>, <2 x ptr> } @llvm.experimental.vector.deinterleave2.v4p0(<4 x ptr> [[LOAD]])
+; SVE-FIXED-NEXT: ret { <2 x ptr>, <2 x ptr> } [[DEINTERLEAVE]]
+;
+ %load = load <4 x ptr>, ptr %ptr, align 8
+ %deinterleave = tail call { <2 x ptr>, <2 x ptr> } @llvm.experimental.vector.deinterleave2.v4p0(<4 x ptr> %load)
+ ret { <2 x ptr>, <2 x ptr> } %deinterleave
+}
+
+define void @interleave_i8_factor2(ptr %ptr, <16 x i8> %l, <16 x i8> %r) {
+; NEON-LABEL: define void @interleave_i8_factor2
+; NEON-SAME: (ptr [[PTR:%.*]], <16 x i8> [[L:%.*]], <16 x i8> [[R:%.*]]) {
+; NEON-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[L]], <16 x i8> [[R]], ptr [[PTR]])
+; NEON-NEXT: ret void
+;
+; SVE-FIXED-LABEL: define void @interleave_i8_factor2
+; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <16 x i8> [[L:%.*]], <16 x i8> [[R:%.*]]) #[[ATTR0]] {
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8> [[L]], <16 x i8> [[R]])
+; SVE-FIXED-NEXT: store <32 x i8> [[INTERLEAVE]], ptr [[PTR]], align 1
+; SVE-FIXED-NEXT: ret void
+;
+ %interleave = tail call <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8> %l, <16 x i8> %r)
+ store <32 x i8> %interleave, ptr %ptr, align 1
+ ret void
+}
+
+define void @interleave_i16_factor2(ptr %ptr, <8 x i16> %l, <8 x i16> %r) {
+; NEON-LABEL: define void @interleave_i16_factor2
+; NEON-SAME: (ptr [[PTR:%.*]], <8 x i16> [[L:%.*]], <8 x i16> [[R:%.*]]) {
+; NEON-NEXT: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[L]], <8 x i16> [[R]], ptr [[PTR]])
+; NEON-NEXT: ret void
+;
+; SVE-FIXED-LABEL: define void @interleave_i16_factor2
+; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <8 x i16> [[L:%.*]], <8 x i16> [[R:%.*]]) #[[ATTR0]] {
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> [[L]], <8 x i16> [[R]])
+; SVE-FIXED-NEXT: store <16 x i16> [[INTERLEAVE]], ptr [[PTR]], align 2
+; SVE-FIXED-NEXT: ret void
+;
+ %interleave = tail call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %l, <8 x i16> %r)
+ store <16 x i16> %interleave, ptr %ptr, align 2
+ ret void
+}
+
+define void @interleave_i32_factor2(ptr %ptr, <4 x i32> %l, <4 x i32> %r) {
+; NEON-LABEL: define void @interleave_i32_factor2
+; NEON-SAME: (ptr [[PTR:%.*]], <4 x i32> [[L:%.*]], <4 x i32> [[R:%.*]]) {
+; NEON-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[L]], <4 x i32> [[R]], ptr [[PTR]])
+; NEON-NEXT: ret void
+;
+; SVE-FIXED-LABEL: define void @interleave_i32_factor2
+; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <4 x i32> [[L:%.*]], <4 x i32> [[R:%.*]]) #[[ATTR0]] {
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> [[L]], <4 x i32> [[R]])
+; SVE-FIXED-NEXT: store <8 x i32> [[INTERLEAVE]], ptr [[PTR]], align 4
+; SVE-FIXED-NEXT: ret void
+;
+ %interleave = tail call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %l, <4 x i32> %r)
+ store <8 x i32> %interleave, ptr %ptr, align 4
+ ret void
+}
+
+define void @interleave_i64_factor2(ptr %ptr, <2 x i64> %l, <2 x i64> %r) {
+; NEON-LABEL: define void @interleave_i64_factor2
+; NEON-SAME: (ptr [[PTR:%.*]], <2 x i64> [[L:%.*]], <2 x i64> [[R:%.*]]) {
+; NEON-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[L]], <2 x i64> [[R]], ptr [[PTR]])
+; NEON-NEXT: ret void
+;
+; SVE-FIXED-LABEL: define void @interleave_i64_factor2
+; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <2 x i64> [[L:%.*]], <2 x i64> [[R:%.*]]) #[[ATTR0]] {
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> [[L]], <2 x i64> [[R]])
+; SVE-FIXED-NEXT: store <4 x i64> [[INTERLEAVE]], ptr [[PTR]], align 8
+; SVE-FIXED-NEXT: ret void
+;
+ %interleave = tail call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %l, <2 x i64> %r)
+ store <4 x i64> %interleave, ptr %ptr, align 8
+ ret void
+}
+
+define void @interleave_float_factor2(ptr %ptr, <4 x float> %l, <4 x float> %r) {
+; NEON-LABEL: define void @interleave_float_factor2
+; NEON-SAME: (ptr [[PTR:%.*]], <4 x float> [[L:%.*]], <4 x float> [[R:%.*]]) {
+; NEON-NEXT: call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> [[L]], <4 x float> [[R]], ptr [[PTR]])
+; NEON-NEXT: ret void
+;
+; SVE-FIXED-LABEL: define void @interleave_float_factor2
+; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <4 x float> [[L:%.*]], <4 x float> [[R:%.*]]) #[[ATTR0]] {
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> [[L]], <4 x float> [[R]])
+; SVE-FIXED-NEXT: store <8 x float> [[INTERLEAVE]], ptr [[PTR]], align 4
+; SVE-FIXED-NEXT: ret void
+;
+ %interleave = tail call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %l, <4 x float> %r)
+ store <8 x float> %interleave, ptr %ptr, align 4
+ ret void
+}
+
+define void @interleave_double_factor2(ptr %ptr, <2 x double> %l, <2 x double> %r) {
+; NEON-LABEL: define void @interleave_double_factor2
+; NEON-SAME: (ptr [[PTR:%.*]], <2 x double> [[L:%.*]], <2 x double> [[R:%.*]]) {
+; NEON-NEXT: call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x double> [[L]], <2 x double> [[R]], ptr [[PTR]])
+; NEON-NEXT: ret void
+;
+; SVE-FIXED-LABEL: define void @interleave_double_factor2
+; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <2 x double> [[L:%.*]], <2 x double> [[R:%.*]]) #[[ATTR0]] {
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> [[L]], <2 x double> [[R]])
+; SVE-FIXED-NEXT: store <4 x double> [[INTERLEAVE]], ptr [[PTR]], align 4
+; SVE-FIXED-NEXT: ret void
+;
+ %interleave = tail call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %l, <2 x double> %r)
+ store <4 x double> %interleave, ptr %ptr, align 4
+ ret void
+}
+
+define void @interleave_ptr_factor2(ptr %ptr, <2 x ptr> %l, <2 x ptr> %r) {
+; NEON-LABEL: define void @interleave_ptr_factor2
+; NEON-SAME: (ptr [[PTR:%.*]], <2 x ptr> [[L:%.*]], <2 x ptr> [[R:%.*]]) {
+; NEON-NEXT: call void @llvm.aarch64.neon.st2.v2p0.p0(<2 x ptr> [[L]], <2 x ptr> [[R]], ptr [[PTR]])
+; NEON-NEXT: ret void
+;
+; SVE-FIXED-LABEL: define void @interleave_ptr_factor2
+; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <2 x ptr> [[L:%.*]], <2 x ptr> [[R:%.*]]) #[[ATTR0]] {
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <4 x ptr> @llvm.experimental.vector.interleave2.v4p0(<2 x ptr> [[L]], <2 x ptr> [[R]])
+; SVE-FIXED-NEXT: store <4 x ptr> [[INTERLEAVE]], ptr [[PTR]], align 4
+; SVE-FIXED-NEXT: ret void
+;
+ %interleave = tail call <4 x ptr> @llvm.experimental.vector.interleave2.v4p0(<2 x ptr> %l, <2 x ptr> %r)
+ store <4 x ptr> %interleave, ptr %ptr, align 4
+ ret void
+}
+
+define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2(ptr %ptr) #0 {
+; NEON-LABEL: define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2
+; NEON-SAME: (ptr [[PTR:%.*]]) {
+; NEON-NEXT: [[TMP1:%.*]] = getelementptr <8 x i16>, ptr [[PTR]], i64 0
+; NEON-NEXT: [[LDN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[TMP1]])
+; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 0
+; NEON-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> poison, <8 x i16> [[TMP2]], i64 0)
+; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 1
+; NEON-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> poison, <8 x i16> [[TMP4]], i64 0)
+; NEON-NEXT: [[TMP6:%.*]] = getelementptr <8 x i16>, ptr [[PTR]], i64 2
+; NEON-NEXT: [[LDN1:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[TMP6]])
+; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN1]], 0
+; NEON-NEXT: [[TMP8:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP3]], <8 x i16> [[TMP7]], i64 8)
+; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN1]], 1
+; NEON-NEXT: [[TMP10:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP5]], <8 x i16> [[TMP9]], i64 8)
+; NEON-NEXT: [[TMP11:%.*]] = insertvalue { <16 x i16>, <16 x i16> } poison, <16 x i16> [[TMP8]], 0
+; NEON-NEXT: [[TMP12:%.*]] = insertvalue { <16 x i16>, <16 x i16> } [[TMP11]], <16 x i16> [[TMP10]], 1
+; NEON-NEXT: ret { <16 x i16>, <16 x i16> } [[TMP12]]
+;
+; SVE-FIXED-LABEL: define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2
+; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <32 x i16>, ptr [[PTR]], align 2
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <16 x i16>, <16 x i16> } @llvm.experimental.vector.deinterleave2.v32i16(<32 x i16> [[LOAD]])
+; SVE-FIXED-NEXT: ret { <16 x i16>, <16 x i16> } [[DEINTERLEAVE]]
+;
+ %load = load <32 x i16>, ptr %ptr, align 2
+ %deinterleave = tail call { <16 x i16>, <16 x i16> } @llvm.experimental.vector.deinterleave2.v32i16(<32 x i16> %load)
+ ret { <16 x i16>, <16 x i16> } %deinterleave
+}
+
+define void @interleave_wide_ptr_factor2(ptr %ptr, <8 x ptr> %l, <8 x ptr> %r) {
+; NEON-LABEL: define void @interleave_wide_ptr_factor2
+; NEON-SAME: (ptr [[PTR:%.*]], <8 x ptr> [[L:%.*]], <8 x ptr> [[R:%.*]]) {
+; NEON-NEXT: [[TMP1:%.*]] = getelementptr <2 x ptr>, ptr [[PTR]], i64 0
+; NEON-NEXT: [[TMP2:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[L]], i64 0)
+; NEON-NEXT: [[TMP3:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[R]], i64 0)
+; NEON-NEXT: call void @llvm.aarch64.neon.st2.v2p0.p0(<2 x ptr> [[TMP2]], <2 x ptr> [[TMP3]], ptr [[TMP1]])
+; NEON-NEXT: [[TMP4:%.*]] = getelementptr <2 x ptr>, ptr [[PTR]], i64 2
+; NEON-NEXT: [[TMP5:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[L]], i64 2)
+; NEON-NEXT: [[TMP6:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[R]], i64 2)
+; NEON-NEXT: call void @llvm.aarch64.neon.st2.v2p0.p0(<2 x ptr> [[TMP5]], <2 x ptr> [[TMP6]], ptr [[TMP4]])
+; NEON-NEXT: [[TMP7:%.*]] = getelementptr <2 x ptr>, ptr [[PTR]], i64 4
+; NEON-NEXT: [[TMP8:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[L]], i64 4)
+; NEON-NEXT: [[TMP9:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[R]], i64 4)
+; NEON-NEXT: call void @llvm.aarch64.neon.st2.v2p0.p0(<2 x ptr> [[TMP8]], <2 x ptr> [[TMP9]], ptr [[TMP7]])
+; NEON-NEXT: [[TMP10:%.*]] = getelementptr <2 x ptr>, ptr [[PTR]], i64 6
+; NEON-NEXT: [[TMP11:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[L]], i64 6)
+; NEON-NEXT: [[TMP12:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[R]], i64 6)
+; NEON-NEXT: call void @llvm.aarch64.neon.st2.v2p0.p0(<2 x ptr> [[TMP11]], <2 x ptr> [[TMP12]], ptr [[TMP10]])
+; NEON-NEXT: ret void
+;
+; SVE-FIXED-LABEL: define void @interleave_wide_ptr_factor2
+; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <8 x ptr> [[L:%.*]], <8 x ptr> [[R:%.*]]) #[[ATTR0]] {
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <16 x ptr> @llvm.experimental.vector.interleave2.v16p0(<8 x ptr> [[L]], <8 x ptr> [[R]])
+; SVE-FIXED-NEXT: store <16 x ptr> [[INTERLEAVE]], ptr [[PTR]], align 4
+; SVE-FIXED-NEXT: ret void
+;
+ %interleave = tail call <16 x ptr> @llvm.experimental.vector.interleave2.v16p0(<8 x ptr> %l, <8 x ptr> %r)
+ store <16 x ptr> %interleave, ptr %ptr, align 4
+ ret void
+}
+
+declare { <16 x i8>, <16 x i8> } @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>)
+declare { <8 x i16>, <8 x i16> } @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>)
+declare { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>)
+declare { <2 x i64>, <2 x i64> } @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>)
+declare { <4 x float>, <4 x float> } @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>)
+declare { <2 x double>, <2 x double> } @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>)
+declare { <2 x ptr>, <2 x ptr> } @llvm.experimental.vector.deinterleave2.v4p0(<4 x ptr>)
+declare { <16 x i16>, <16 x i16> } @llvm.experimental.vector.deinterleave2.v32i16(<32 x i16>)
+
+declare <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8>, <16 x i8>)
+declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
+declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
+declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>)
+declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>)
+declare <4 x ptr> @llvm.experimental.vector.interleave2.v4p0(<2 x ptr>, <2 x ptr>)
+declare <16 x ptr> @llvm.experimental.vector.interleave2.v16p0(<8 x ptr>, <8 x ptr>)
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
new file mode 100644
index 0000000..c04464b
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
@@ -0,0 +1,263 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -interleaved-access -S | FileCheck %s
+
+target triple = "aarch64-linux-gnu"
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8> } @deinterleave_nxi8_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define { <vscale x 16 x i8>, <vscale x 16 x i8> } @deinterleave_nxi8_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]]
+;
+ %load = load <vscale x 32 x i8>, ptr %ptr, align 1
+ %deinterleave = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %load)
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleave
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16> } @deinterleave_nxi16_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define { <vscale x 8 x i16>, <vscale x 8 x i16> } @deinterleave_nxi16_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]]
+;
+ %load = load <vscale x 16 x i16>, ptr %ptr, align 2
+ %deinterleave = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %load)
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleave
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave_nx8xi32_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define { <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave_nx8xi32_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]]
+;
+ %load = load <vscale x 8 x i32>, ptr %ptr, align 4
+ %deinterleave = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %load)
+ ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64> } @deinterleave_nxi64_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define { <vscale x 2 x i64>, <vscale x 2 x i64> } @deinterleave_nxi64_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]]
+;
+ %load = load <vscale x 4 x i64>, ptr %ptr, align 8
+ %deinterleave = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %load)
+ ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %deinterleave
+}
+
+define { <vscale x 4 x float>, <vscale x 4 x float> } @deinterleave_nxfloat_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define { <vscale x 4 x float>, <vscale x 4 x float> } @deinterleave_nxfloat_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2.sret.nxv4f32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]]
+;
+ %load = load <vscale x 8 x float>, ptr %ptr, align 4
+ %deinterleave = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %load)
+ ret { <vscale x 4 x float>, <vscale x 4 x float> } %deinterleave
+}
+
+define { <vscale x 2 x double>, <vscale x 2 x double> } @deinterleave_nxdouble_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define { <vscale x 2 x double>, <vscale x 2 x double> } @deinterleave_nxdouble_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]]
+;
+ %load = load <vscale x 4 x double>, ptr %ptr, align 8
+ %deinterleave = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %load)
+ ret { <vscale x 2 x double>, <vscale x 2 x double> } %deinterleave
+}
+
+define { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @deinterleave_nxptr_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @deinterleave_nxptr_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.aarch64.sve.ld2.sret.nxv2p0(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT: ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]]
+;
+ %load = load <vscale x 4 x ptr>, ptr %ptr, align 8
+ %deinterleave = tail call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.experimental.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr> %load)
+ ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %deinterleave
+}
+
+define void @interleave_nxi8_factor2(ptr %ptr, <vscale x 16 x i8> %l, <vscale x 16 x i8> %r) #0 {
+; CHECK-LABEL: define void @interleave_nxi8_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 16 x i8> [[L:%.*]], <vscale x 16 x i8> [[R:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv16i8(<vscale x 16 x i8> [[L]], <vscale x 16 x i8> [[R]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
+ %interleave = tail call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %l, <vscale x 16 x i8> %r)
+ store <vscale x 32 x i8> %interleave, ptr %ptr, align 1
+ ret void
+}
+
+define void @interleave_nxi16_factor2(ptr %ptr, <vscale x 8 x i16> %l, <vscale x 8 x i16> %r) #0 {
+; CHECK-LABEL: define void @interleave_nxi16_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 8 x i16> [[L:%.*]], <vscale x 8 x i16> [[R:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16> [[L]], <vscale x 8 x i16> [[R]], <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
+ %interleave = tail call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %l, <vscale x 8 x i16> %r)
+ store <vscale x 16 x i16> %interleave, ptr %ptr, align 2
+ ret void
+}
+
+define void @interleave_nxi32_factor2(ptr %ptr, <vscale x 4 x i32> %l, <vscale x 4 x i32> %r) #0 {
+; CHECK-LABEL: define void @interleave_nxi32_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 4 x i32> [[L:%.*]], <vscale x 4 x i32> [[R:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> [[L]], <vscale x 4 x i32> [[R]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
+ %interleave = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %l, <vscale x 4 x i32> %r)
+ store <vscale x 8 x i32> %interleave, ptr %ptr, align 4
+ ret void
+}
+
+define void @interleave_nxi64_factor2(ptr %ptr, <vscale x 2 x i64> %l, <vscale x 2 x i64> %r) #0 {
+; CHECK-LABEL: define void @interleave_nxi64_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 2 x i64> [[L:%.*]], <vscale x 2 x i64> [[R:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[L]], <vscale x 2 x i64> [[R]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
+ %interleave = tail call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %l, <vscale x 2 x i64> %r)
+ store <vscale x 4 x i64> %interleave, ptr %ptr, align 8
+ ret void
+}
+
+define void @interleave_nxfloat_factor2(ptr %ptr, <vscale x 4 x float> %l, <vscale x 4 x float> %r) #0 {
+; CHECK-LABEL: define void @interleave_nxfloat_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 4 x float> [[L:%.*]], <vscale x 4 x float> [[R:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv4f32(<vscale x 4 x float> [[L]], <vscale x 4 x float> [[R]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
+ %interleave = tail call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %l, <vscale x 4 x float> %r)
+ store <vscale x 8 x float> %interleave, ptr %ptr, align 4
+ ret void
+}
+
+define void @interleave_nxdouble_factor2(ptr %ptr, <vscale x 2 x double> %l, <vscale x 2 x double> %r) #0 {
+; CHECK-LABEL: define void @interleave_nxdouble_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 2 x double> [[L:%.*]], <vscale x 2 x double> [[R:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double> [[L]], <vscale x 2 x double> [[R]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
+ %interleave = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %l, <vscale x 2 x double> %r)
+ store <vscale x 4 x double> %interleave, ptr %ptr, align 4
+ ret void
+}
+
+define void @interleave_nxptr_factor2(ptr %ptr, <vscale x 2 x ptr> %l, <vscale x 2 x ptr> %r) #0 {
+; CHECK-LABEL: define void @interleave_nxptr_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 2 x ptr> [[L:%.*]], <vscale x 2 x ptr> [[R:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2p0(<vscale x 2 x ptr> [[L]], <vscale x 2 x ptr> [[R]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
+ %interleave = tail call <vscale x 4 x ptr> @llvm.experimental.vector.interleave2.nxv4p0(<vscale x 2 x ptr> %l, <vscale x 2 x ptr> %r)
+ store <vscale x 4 x ptr> %interleave, ptr %ptr, align 4
+ ret void
+}
+
+;;; Check that we 'legalize' operations that are wider than the target supports.
+
+define { <vscale x 16 x i32>, <vscale x 16 x i32> } @deinterleave_wide_nxi32_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define { <vscale x 16 x i32>, <vscale x 16 x i32> } @deinterleave_wide_nxi32_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[PTR]], i64 2
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP6]])
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP7]], i64 4)
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP9]], i64 4)
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[PTR]], i64 4
+; CHECK-NEXT: [[LDN2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP11]])
+; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN2]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP12]], i64 8)
+; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN2]], 1
+; CHECK-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP14]], i64 8)
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[PTR]], i64 6
+; CHECK-NEXT: [[LDN3:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP16]])
+; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN3]], 0
+; CHECK-NEXT: [[TMP18:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP13]], <vscale x 4 x i32> [[TMP17]], i64 12)
+; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN3]], 1
+; CHECK-NEXT: [[TMP20:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP15]], <vscale x 4 x i32> [[TMP19]], i64 12)
+; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } poison, <vscale x 16 x i32> [[TMP18]], 0
+; CHECK-NEXT: [[TMP22:%.*]] = insertvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } [[TMP21]], <vscale x 16 x i32> [[TMP20]], 1
+; CHECK-NEXT: ret { <vscale x 16 x i32>, <vscale x 16 x i32> } [[TMP22]]
+;
+ %load = load <vscale x 32 x i32>, ptr %ptr, align 4
+ %deinterleave = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.experimental.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %load)
+ ret { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave
+}
+
+define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_wide_nxdouble_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_wide_nxdouble_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 2
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP6]])
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP7]], i64 2)
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP5]], <vscale x 2 x double> [[TMP9]], i64 2)
+; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } poison, <vscale x 4 x double> [[TMP8]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP11]], <vscale x 4 x double> [[TMP10]], 1
+; CHECK-NEXT: ret { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP12]]
+;
+ %load = load <vscale x 8 x double>, ptr %ptr, align 8
+ %deinterleave = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %load)
+ ret { <vscale x 4 x double>, <vscale x 4 x double> } %deinterleave
+}
+
+define void @interleave_wide_nxdouble_factor2(ptr %ptr, <vscale x 4 x double> %l, <vscale x 4 x double> %r) #0 {
+; CHECK-LABEL: define void @interleave_wide_nxdouble_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 4 x double> [[L:%.*]], <vscale x 4 x double> [[R:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[L]], i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[R]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 2
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[L]], i64 2)
+; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[R]], i64 2)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP4]])
+; CHECK-NEXT: ret void
+;
+ %interleave = tail call <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double> %l, <vscale x 4 x double> %r)
+ store <vscale x 8 x double> %interleave, ptr %ptr, align 4
+ ret void
+}
+
+declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.experimental.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr>)
+
+; Larger deinterleaves to test 'legalization'
+declare { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.experimental.vector.deinterleave2.nxv32i32(<vscale x 32 x i32>)
+declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
+
+declare <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x ptr> @llvm.experimental.vector.interleave2.nxv4p0(<vscale x 2 x ptr>, <vscale x 2 x ptr>)
+
+; Larger interleaves to test 'legalization'
+declare <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
+
+attributes #0 = { vscale_range(1,16) "target-features"="+sve" }