[VPlan] Move VPInterleaveRecipe::execute to VPlanRecipes.cpp (NFC)

Move ::execute and ::print to VPlanRecipes.cpp, in line with other recipe definitions. The helpers interleaveVectors and createBitOrPointerCast move with it, and the recipe now emits IR through State.Builder instead of delegating to InnerLoopVectorizer::vectorizeInterleaveGroup.
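At a glance, the shape of the change is the following (a condensed before/after abridged from the hunks below, not the verbatim patch):

    // Before (LoopVectorize.cpp): the recipe bounced through the
    // InnerLoopVectorizer, which emitted IR via its own Builder member.
    void VPInterleaveRecipe::execute(VPTransformState &State) {
      assert(!State.Instance && "Interleave group being replicated.");
      State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
                                          getStoredValues(), getMask(),
                                          NeedsMaskForGaps);
    }

    // After (VPlanRecipes.cpp): the recipe generates the code itself,
    // using the builder carried in the transform state, e.g.
    //   AddrPart = State.Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);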
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 86e4149..6d28b8f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -441,37 +441,6 @@
   return std::nullopt;
 }
 
-/// Return a vector containing interleaved elements from multiple
-/// smaller input vectors.
-static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
-                                const Twine &Name) {
-  unsigned Factor = Vals.size();
-  assert(Factor > 1 && "Tried to interleave invalid number of vectors");
-
-  VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
-#ifndef NDEBUG
-  for (Value *Val : Vals)
-    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
-#endif
-
-  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
-  // must use intrinsics to interleave.
-  if (VecTy->isScalableTy()) {
-    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
-    return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
-                                   Vals,
-                                   /*FMFSource=*/nullptr, Name);
-  }
-
-  // Fixed length. Start by concatenating all vectors into a wide vector.
-  Value *WideVec = concatenateVectors(Builder, Vals);
-
-  // Interleave the elements into the wide vector.
-  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
-  return Builder.CreateShuffleVector(
-      WideVec, createInterleaveMask(NumElts, Factor), Name);
-}
-
 namespace {
 // Forward declare GeneratedRTChecks.
 class GeneratedRTChecks;
@@ -553,16 +522,6 @@
                             const VPIteration &Instance,
                             VPTransformState &State);
 
-  /// Try to vectorize interleaved access group \p Group with the base address
-  /// given in \p Addr, optionally masking the vector operations if \p
-  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
-  /// values in the vectorized loop.
-  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
-                                ArrayRef<VPValue *> VPDefs,
-                                VPTransformState &State, VPValue *Addr,
-                                ArrayRef<VPValue *> StoredValues,
-                                VPValue *BlockInMask, bool NeedsMaskForGaps);
-
   /// Fix the non-induction PHIs in \p Plan.
   void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
 
@@ -611,11 +570,6 @@
   /// Returns (and creates if needed) the trip count of the widened loop.
   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
 
-  /// Returns a bitcasted value to the requested vector type.
-  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
-  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
-                                const DataLayout &DL);
-
   /// Emit a bypass check to see if the vector trip count is zero, including if
   /// it overflows.
   void emitIterationCountCheck(BasicBlock *Bypass);
@@ -2393,275 +2347,6 @@
   return TTI.enableMaskedInterleavedAccessVectorization();
 }
 
-// Try to vectorize the interleave group that \p Instr belongs to.
-//
-// E.g. Translate following interleaved load group (factor = 3):
-//   for (i = 0; i < N; i+=3) {
-//     R = Pic[i];             // Member of index 0
-//     G = Pic[i+1];           // Member of index 1
-//     B = Pic[i+2];           // Member of index 2
-//     ... // do something to R, G, B
-//   }
-// To:
-//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
-//   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>  ; R elements
-//   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
-//   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
-//
-// Or translate following interleaved store group (factor = 3):
-//   for (i = 0; i < N; i+=3) {
-//     ... do something to R, G, B
-//     Pic[i]   = R;           // Member of index 0
-//     Pic[i+1] = G;           // Member of index 1
-//     Pic[i+2] = B;           // Member of index 2
-//   }
-// To:
-//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
-//   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
-//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
-//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
-//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
-void InnerLoopVectorizer::vectorizeInterleaveGroup(
-    const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
-    VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
-    VPValue *BlockInMask, bool NeedsMaskForGaps) {
-  Instruction *Instr = Group->getInsertPos();
-  const DataLayout &DL = Instr->getDataLayout();
-
-  // Prepare for the vector type of the interleaved load/store.
-  Type *ScalarTy = getLoadStoreType(Instr);
-  unsigned InterleaveFactor = Group->getFactor();
-  auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
-
-  // Prepare for the new pointers.
-  SmallVector<Value *, 2> AddrParts;
-  unsigned Index = Group->getIndex(Instr);
-
-  // TODO: extend the masked interleaved-group support to reversed access.
-  assert((!BlockInMask || !Group->isReverse()) &&
-         "Reversed masked interleave-group not supported.");
-
-  Value *Idx;
-  // If the group is reverse, adjust the index to refer to the last vector lane
-  // instead of the first. We adjust the index from the first vector lane,
-  // rather than directly getting the pointer for lane VF - 1, because the
-  // pointer operand of the interleaved access is supposed to be uniform. For
-  // uniform instructions, we're only required to generate a value for the
-  // first vector lane in each unroll iteration.
-  if (Group->isReverse()) {
-    Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
-    Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
-    Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor()));
-    Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index));
-    Idx = Builder.CreateNeg(Idx);
-  } else
-    Idx = Builder.getInt32(-Index);
-
-  for (unsigned Part = 0; Part < State.UF; Part++) {
-    Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
-    if (auto *I = dyn_cast<Instruction>(AddrPart))
-      State.setDebugLocFrom(I->getDebugLoc());
-
-    // Notice current instruction could be any index. Need to adjust the address
-    // to the member of index 0.
-    //
-    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
-    //       b = A[i];       // Member of index 0
-    // Current pointer is pointed to A[i+1], adjust it to A[i].
-    //
-    // E.g.  A[i+1] = a;     // Member of index 1
-    //       A[i]   = b;     // Member of index 0
-    //       A[i+2] = c;     // Member of index 2 (Current instruction)
-    // Current pointer is pointed to A[i+2], adjust it to A[i].
-
-    bool InBounds = false;
-    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
-      InBounds = gep->isInBounds();
-    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
-    AddrParts.push_back(AddrPart);
-  }
-
-  State.setDebugLocFrom(Instr->getDebugLoc());
-  Value *PoisonVec = PoisonValue::get(VecTy);
-
-  auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
-                             unsigned Part, Value *MaskForGaps) -> Value * {
-    if (State.VF.isScalable()) {
-      assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
-      assert(InterleaveFactor == 2 &&
-             "Unsupported deinterleave factor for scalable vectors");
-      auto *BlockInMaskPart = State.get(BlockInMask, Part);
-      SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
-      auto *MaskTy = VectorType::get(Builder.getInt1Ty(),
-                                     State.VF.getKnownMinValue() * 2, true);
-      return Builder.CreateIntrinsic(MaskTy, Intrinsic::vector_interleave2, Ops,
-                                     /*FMFSource=*/nullptr, "interleaved.mask");
-    }
-
-    if (!BlockInMask)
-      return MaskForGaps;
-
-    Value *BlockInMaskPart = State.get(BlockInMask, Part);
-    Value *ShuffledMask = Builder.CreateShuffleVector(
-        BlockInMaskPart,
-        createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
-        "interleaved.mask");
-    return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
-                                             MaskForGaps)
-                       : ShuffledMask;
-  };
-
-  // Vectorize the interleaved load group.
-  if (isa<LoadInst>(Instr)) {
-    Value *MaskForGaps = nullptr;
-    if (NeedsMaskForGaps) {
-      MaskForGaps =
-          createBitMaskForGaps(Builder, State.VF.getKnownMinValue(), *Group);
-      assert(MaskForGaps && "Mask for Gaps is required but it is null");
-    }
-
-    // For each unroll part, create a wide load for the group.
-    SmallVector<Value *, 2> NewLoads;
-    for (unsigned Part = 0; Part < State.UF; Part++) {
-      Instruction *NewLoad;
-      if (BlockInMask || MaskForGaps) {
-        assert(useMaskedInterleavedAccesses(*TTI) &&
-               "masked interleaved groups are not allowed.");
-        Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
-        NewLoad =
-            Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
-                                     GroupMask, PoisonVec, "wide.masked.vec");
-      }
-      else
-        NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
-                                            Group->getAlign(), "wide.vec");
-      Group->addMetadata(NewLoad);
-      NewLoads.push_back(NewLoad);
-    }
-
-    if (VecTy->isScalableTy()) {
-      assert(InterleaveFactor == 2 &&
-             "Unsupported deinterleave factor for scalable vectors");
-
-      for (unsigned Part = 0; Part < State.UF; ++Part) {
-        // Scalable vectors cannot use arbitrary shufflevectors (only splats),
-        // so must use intrinsics to deinterleave.
-        Value *DI = Builder.CreateIntrinsic(
-            Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part],
-            /*FMFSource=*/nullptr, "strided.vec");
-        unsigned J = 0;
-        for (unsigned I = 0; I < InterleaveFactor; ++I) {
-          Instruction *Member = Group->getMember(I);
-
-          if (!Member)
-            continue;
-
-          Value *StridedVec = Builder.CreateExtractValue(DI, I);
-          // If this member has different type, cast the result type.
-          if (Member->getType() != ScalarTy) {
-            VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
-            StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
-          }
-
-          if (Group->isReverse())
-            StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
-
-          State.set(VPDefs[J], StridedVec, Part);
-          ++J;
-        }
-      }
-
-      return;
-    }
-
-    // For each member in the group, shuffle out the appropriate data from the
-    // wide loads.
-    unsigned J = 0;
-    for (unsigned I = 0; I < InterleaveFactor; ++I) {
-      Instruction *Member = Group->getMember(I);
-
-      // Skip the gaps in the group.
-      if (!Member)
-        continue;
-
-      auto StrideMask =
-          createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
-      for (unsigned Part = 0; Part < State.UF; Part++) {
-        Value *StridedVec = Builder.CreateShuffleVector(
-            NewLoads[Part], StrideMask, "strided.vec");
-
-        // If this member has different type, cast the result type.
-        if (Member->getType() != ScalarTy) {
-          assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
-          VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
-          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
-        }
-
-        if (Group->isReverse())
-          StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
-
-        State.set(VPDefs[J], StridedVec, Part);
-      }
-      ++J;
-    }
-    return;
-  }
-
-  // The sub vector type for current instruction.
-  auto *SubVT = VectorType::get(ScalarTy, State.VF);
-
-  // Vectorize the interleaved store group.
-  Value *MaskForGaps =
-      createBitMaskForGaps(Builder, State.VF.getKnownMinValue(), *Group);
-  assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
-         "masked interleaved groups are not allowed.");
-  assert((!MaskForGaps || !State.VF.isScalable()) &&
-         "masking gaps for scalable vectors is not yet supported.");
-  for (unsigned Part = 0; Part < State.UF; Part++) {
-    // Collect the stored vector from each member.
-    SmallVector<Value *, 4> StoredVecs;
-    unsigned StoredIdx = 0;
-    for (unsigned i = 0; i < InterleaveFactor; i++) {
-      assert((Group->getMember(i) || MaskForGaps) &&
-             "Fail to get a member from an interleaved store group");
-      Instruction *Member = Group->getMember(i);
-
-      // Skip the gaps in the group.
-      if (!Member) {
-        Value *Undef = PoisonValue::get(SubVT);
-        StoredVecs.push_back(Undef);
-        continue;
-      }
-
-      Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
-      ++StoredIdx;
-
-      if (Group->isReverse())
-        StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
-
-      // If this member has different type, cast it to a unified type.
-
-      if (StoredVec->getType() != SubVT)
-        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
-
-      StoredVecs.push_back(StoredVec);
-    }
-
-    // Interleave all the smaller vectors into one wider vector.
-    Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
-    Instruction *NewStoreInstr;
-    if (BlockInMask || MaskForGaps) {
-      Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
-      NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
-                                                Group->getAlign(), GroupMask);
-    } else
-      NewStoreInstr =
-          Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
-
-    Group->addMetadata(NewStoreInstr);
-  }
-}
-
 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
                                                VPReplicateRecipe *RepRecipe,
                                                const VPIteration &Instance,
@@ -2769,36 +2454,6 @@
   return VectorTripCount;
 }
 
-Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
-                                                   const DataLayout &DL) {
-  // Verify that V is a vector type with same number of elements as DstVTy.
-  auto VF = DstVTy->getElementCount();
-  auto *SrcVecTy = cast<VectorType>(V->getType());
-  assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
-  Type *SrcElemTy = SrcVecTy->getElementType();
-  Type *DstElemTy = DstVTy->getElementType();
-  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
-         "Vector elements must have same size");
-
-  // Do a direct cast if element types are castable.
-  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
-    return Builder.CreateBitOrPointerCast(V, DstVTy);
-  }
-  // V cannot be directly casted to desired vector type.
-  // May happen when V is a floating point vector but DstVTy is a vector of
-  // pointers or vice-versa. Handle this using a two-step bitcast using an
-  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
-  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
-         "Only one type should be a pointer type");
-  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
-         "Only one type should be a floating point type");
-  Type *IntTy =
-      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
-  auto *VecIntTy = VectorType::get(IntTy, VF);
-  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
-  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
-}
-
 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
   Value *Count = getTripCount();
   // Reuse existing vector loop preheader for TC checks.
@@ -9396,37 +9051,6 @@
   VPlanTransforms::clearReductionWrapFlags(*Plan);
 }
 
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
-                               VPSlotTracker &SlotTracker) const {
-  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
-  IG->getInsertPos()->printAsOperand(O, false);
-  O << ", ";
-  getAddr()->printAsOperand(O, SlotTracker);
-  VPValue *Mask = getMask();
-  if (Mask) {
-    O << ", ";
-    Mask->printAsOperand(O, SlotTracker);
-  }
-
-  unsigned OpIdx = 0;
-  for (unsigned i = 0; i < IG->getFactor(); ++i) {
-    if (!IG->getMember(i))
-      continue;
-    if (getNumStoreOperands() > 0) {
-      O << "\n" << Indent << "  store ";
-      getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
-      O << " to index " << i;
-    } else {
-      O << "\n" << Indent << "  ";
-      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
-      O << " = load from index " << i;
-    }
-    ++OpIdx;
-  }
-}
-#endif
-
 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
          "Not a pointer induction according to InductionDescriptor!");
@@ -9510,13 +9134,6 @@
   State.set(this, DerivedIV, VPIteration(0, 0));
 }
 
-void VPInterleaveRecipe::execute(VPTransformState &State) {
-  assert(!State.Instance && "Interleave group being replicated.");
-  State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
-                                      getStoredValues(), getMask(),
-                                      NeedsMaskForGaps);
-}
-
 void VPReplicateRecipe::execute(VPTransformState &State) {
   Instruction *UI = getUnderlyingInstr();
   if (State.Instance) { // Generate a single instance.
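Note how createBitOrPointerCast changes shape in the move: as a member of InnerLoopVectorizer it used the vectorizer's Builder implicitly, but as a file-local helper in VPlanRecipes.cpp the builder must be passed in. Condensed signatures, abridged from the hunks above and below:

    // Removed member function: relied on the implicit this->Builder.
    Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V,
                                                       VectorType *DstVTy,
                                                       const DataLayout &DL);

    // New static helper in VPlanRecipes.cpp: the builder is an explicit
    // parameter (interleaveVectors already took one).
    static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
                                         VectorType *DstVTy,
                                         const DataLayout &DL);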
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 4b1ac79..1b787d0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2037,6 +2037,373 @@
 }
 #endif
 
+static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
+                                     VectorType *DstVTy, const DataLayout &DL) {
+  // Verify that V is a vector type with same number of elements as DstVTy.
+  auto VF = DstVTy->getElementCount();
+  auto *SrcVecTy = cast<VectorType>(V->getType());
+  assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
+  Type *SrcElemTy = SrcVecTy->getElementType();
+  Type *DstElemTy = DstVTy->getElementType();
+  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
+         "Vector elements must have same size");
+
+  // Do a direct cast if element types are castable.
+  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
+    return Builder.CreateBitOrPointerCast(V, DstVTy);
+  }
+  // V cannot be directly casted to desired vector type.
+  // May happen when V is a floating point vector but DstVTy is a vector of
+  // pointers or vice-versa. Handle this using a two-step bitcast using an
+  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
+  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
+         "Only one type should be a pointer type");
+  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
+         "Only one type should be a floating point type");
+  Type *IntTy =
+      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
+  auto *VecIntTy = VectorType::get(IntTy, VF);
+  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
+  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
+}
+
+/// Return a vector containing interleaved elements from multiple
+/// smaller input vectors.
+static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
+                                const Twine &Name) {
+  unsigned Factor = Vals.size();
+  assert(Factor > 1 && "Tried to interleave invalid number of vectors");
+
+  VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
+#ifndef NDEBUG
+  for (Value *Val : Vals)
+    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
+#endif
+
+  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
+  // must use intrinsics to interleave.
+  if (VecTy->isScalableTy()) {
+    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
+    return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
+                                   Vals,
+                                   /*FMFSource=*/nullptr, Name);
+  }
+
+  // Fixed length. Start by concatenating all vectors into a wide vector.
+  Value *WideVec = concatenateVectors(Builder, Vals);
+
+  // Interleave the elements into the wide vector.
+  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
+  return Builder.CreateShuffleVector(
+      WideVec, createInterleaveMask(NumElts, Factor), Name);
+}
+
+// Try to vectorize the interleave group that \p Instr belongs to.
+//
+// E.g. Translate following interleaved load group (factor = 3):
+//   for (i = 0; i < N; i+=3) {
+//     R = Pic[i];             // Member of index 0
+//     G = Pic[i+1];           // Member of index 1
+//     B = Pic[i+2];           // Member of index 2
+//     ... // do something to R, G, B
+//   }
+// To:
+//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
+//   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>  ; R elements
+//   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
+//   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
+//
+// Or translate following interleaved store group (factor = 3):
+//   for (i = 0; i < N; i+=3) {
+//     ... do something to R, G, B
+//     Pic[i]   = R;           // Member of index 0
+//     Pic[i+1] = G;           // Member of index 1
+//     Pic[i+2] = B;           // Member of index 2
+//   }
+// To:
+//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
+//   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
+//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
+//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
+//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
+void VPInterleaveRecipe::execute(VPTransformState &State) {
+  assert(!State.Instance && "Interleave group being replicated.");
+  const InterleaveGroup<Instruction> *Group = IG;
+  Instruction *Instr = Group->getInsertPos();
+
+  // Prepare for the vector type of the interleaved load/store.
+  Type *ScalarTy = getLoadStoreType(Instr);
+  unsigned InterleaveFactor = Group->getFactor();
+  auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
+
+  // Prepare for the new pointers.
+  SmallVector<Value *, 2> AddrParts;
+  unsigned Index = Group->getIndex(Instr);
+
+  // TODO: extend the masked interleaved-group support to reversed access.
+  VPValue *BlockInMask = getMask();
+  assert((!BlockInMask || !Group->isReverse()) &&
+         "Reversed masked interleave-group not supported.");
+
+  Value *Idx;
+  // If the group is reverse, adjust the index to refer to the last vector lane
+  // instead of the first. We adjust the index from the first vector lane,
+  // rather than directly getting the pointer for lane VF - 1, because the
+  // pointer operand of the interleaved access is supposed to be uniform. For
+  // uniform instructions, we're only required to generate a value for the
+  // first vector lane in each unroll iteration.
+  if (Group->isReverse()) {
+    Value *RuntimeVF =
+        getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
+    Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
+    Idx = State.Builder.CreateMul(Idx,
+                                  State.Builder.getInt32(Group->getFactor()));
+    Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index));
+    Idx = State.Builder.CreateNeg(Idx);
+  } else
+    Idx = State.Builder.getInt32(-Index);
+
+  VPValue *Addr = getAddr();
+  for (unsigned Part = 0; Part < State.UF; Part++) {
+    Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
+    if (auto *I = dyn_cast<Instruction>(AddrPart))
+      State.setDebugLocFrom(I->getDebugLoc());
+
+    // Notice current instruction could be any index. Need to adjust the address
+    // to the member of index 0.
+    //
+    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
+    //       b = A[i];       // Member of index 0
+    // Current pointer is pointed to A[i+1], adjust it to A[i].
+    //
+    // E.g.  A[i+1] = a;     // Member of index 1
+    //       A[i]   = b;     // Member of index 0
+    //       A[i+2] = c;     // Member of index 2 (Current instruction)
+    // Current pointer is pointed to A[i+2], adjust it to A[i].
+
+    bool InBounds = false;
+    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
+      InBounds = gep->isInBounds();
+    AddrPart = State.Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
+    AddrParts.push_back(AddrPart);
+  }
+
+  State.setDebugLocFrom(Instr->getDebugLoc());
+  Value *PoisonVec = PoisonValue::get(VecTy);
+
+  auto CreateGroupMask = [&BlockInMask, &State, &InterleaveFactor](
+                             unsigned Part, Value *MaskForGaps) -> Value * {
+    if (State.VF.isScalable()) {
+      assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
+      assert(InterleaveFactor == 2 &&
+             "Unsupported deinterleave factor for scalable vectors");
+      auto *BlockInMaskPart = State.get(BlockInMask, Part);
+      SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
+      auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
+                                     State.VF.getKnownMinValue() * 2, true);
+      return State.Builder.CreateIntrinsic(
+          MaskTy, Intrinsic::vector_interleave2, Ops,
+          /*FMFSource=*/nullptr, "interleaved.mask");
+    }
+
+    if (!BlockInMask)
+      return MaskForGaps;
+
+    Value *BlockInMaskPart = State.get(BlockInMask, Part);
+    Value *ShuffledMask = State.Builder.CreateShuffleVector(
+        BlockInMaskPart,
+        createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
+        "interleaved.mask");
+    return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
+                                                   ShuffledMask, MaskForGaps)
+                       : ShuffledMask;
+  };
+
+  const DataLayout &DL = Instr->getDataLayout();
+  // Vectorize the interleaved load group.
+  if (isa<LoadInst>(Instr)) {
+    Value *MaskForGaps = nullptr;
+    if (NeedsMaskForGaps) {
+      MaskForGaps = createBitMaskForGaps(State.Builder,
+                                         State.VF.getKnownMinValue(), *Group);
+      assert(MaskForGaps && "Mask for Gaps is required but it is null");
+    }
+
+    // For each unroll part, create a wide load for the group.
+    SmallVector<Value *, 2> NewLoads;
+    for (unsigned Part = 0; Part < State.UF; Part++) {
+      Instruction *NewLoad;
+      if (BlockInMask || MaskForGaps) {
+        Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
+        NewLoad = State.Builder.CreateMaskedLoad(VecTy, AddrParts[Part],
+                                                 Group->getAlign(), GroupMask,
+                                                 PoisonVec, "wide.masked.vec");
+      } else
+        NewLoad = State.Builder.CreateAlignedLoad(
+            VecTy, AddrParts[Part], Group->getAlign(), "wide.vec");
+      Group->addMetadata(NewLoad);
+      NewLoads.push_back(NewLoad);
+    }
+
+    ArrayRef<VPValue *> VPDefs = definedValues();
+    const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
+    if (VecTy->isScalableTy()) {
+      assert(InterleaveFactor == 2 &&
+             "Unsupported deinterleave factor for scalable vectors");
+
+      for (unsigned Part = 0; Part < State.UF; ++Part) {
+        // Scalable vectors cannot use arbitrary shufflevectors (only splats),
+        // so must use intrinsics to deinterleave.
+        Value *DI = State.Builder.CreateIntrinsic(
+            Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part],
+            /*FMFSource=*/nullptr, "strided.vec");
+        unsigned J = 0;
+        for (unsigned I = 0; I < InterleaveFactor; ++I) {
+          Instruction *Member = Group->getMember(I);
+
+          if (!Member)
+            continue;
+
+          Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
+          // If this member has different type, cast the result type.
+          if (Member->getType() != ScalarTy) {
+            VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
+            StridedVec =
+                createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
+          }
+
+          if (Group->isReverse())
+            StridedVec =
+                State.Builder.CreateVectorReverse(StridedVec, "reverse");
+
+          State.set(VPDefs[J], StridedVec, Part);
+          ++J;
+        }
+      }
+
+      return;
+    }
+
+    // For each member in the group, shuffle out the appropriate data from the
+    // wide loads.
+    unsigned J = 0;
+    for (unsigned I = 0; I < InterleaveFactor; ++I) {
+      Instruction *Member = Group->getMember(I);
+
+      // Skip the gaps in the group.
+      if (!Member)
+        continue;
+
+      auto StrideMask =
+          createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
+      for (unsigned Part = 0; Part < State.UF; Part++) {
+        Value *StridedVec = State.Builder.CreateShuffleVector(
+            NewLoads[Part], StrideMask, "strided.vec");
+
+        // If this member has different type, cast the result type.
+        if (Member->getType() != ScalarTy) {
+          assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
+          VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
+          StridedVec =
+              createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
+        }
+
+        if (Group->isReverse())
+          StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
+
+        State.set(VPDefs[J], StridedVec, Part);
+      }
+      ++J;
+    }
+    return;
+  }
+
+  // The sub vector type for current instruction.
+  auto *SubVT = VectorType::get(ScalarTy, State.VF);
+
+  // Vectorize the interleaved store group.
+  Value *MaskForGaps =
+      createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
+  assert((!MaskForGaps || !State.VF.isScalable()) &&
+         "masking gaps for scalable vectors is not yet supported.");
+  ArrayRef<VPValue *> StoredValues = getStoredValues();
+  for (unsigned Part = 0; Part < State.UF; Part++) {
+    // Collect the stored vector from each member.
+    SmallVector<Value *, 4> StoredVecs;
+    unsigned StoredIdx = 0;
+    for (unsigned i = 0; i < InterleaveFactor; i++) {
+      assert((Group->getMember(i) || MaskForGaps) &&
+             "Fail to get a member from an interleaved store group");
+      Instruction *Member = Group->getMember(i);
+
+      // Skip the gaps in the group.
+      if (!Member) {
+        Value *Undef = PoisonValue::get(SubVT);
+        StoredVecs.push_back(Undef);
+        continue;
+      }
+
+      Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
+      ++StoredIdx;
+
+      if (Group->isReverse())
+        StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
+
+      // If this member has different type, cast it to a unified type.
+
+      if (StoredVec->getType() != SubVT)
+        StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
+
+      StoredVecs.push_back(StoredVec);
+    }
+
+    // Interleave all the smaller vectors into one wider vector.
+    Value *IVec =
+        interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
+    Instruction *NewStoreInstr;
+    if (BlockInMask || MaskForGaps) {
+      Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
+      NewStoreInstr = State.Builder.CreateMaskedStore(
+          IVec, AddrParts[Part], Group->getAlign(), GroupMask);
+    } else
+      NewStoreInstr = State.Builder.CreateAlignedStore(IVec, AddrParts[Part],
+                                                       Group->getAlign());
+
+    Group->addMetadata(NewStoreInstr);
+  }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
+                               VPSlotTracker &SlotTracker) const {
+  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
+  IG->getInsertPos()->printAsOperand(O, false);
+  O << ", ";
+  getAddr()->printAsOperand(O, SlotTracker);
+  VPValue *Mask = getMask();
+  if (Mask) {
+    O << ", ";
+    Mask->printAsOperand(O, SlotTracker);
+  }
+
+  unsigned OpIdx = 0;
+  for (unsigned i = 0; i < IG->getFactor(); ++i) {
+    if (!IG->getMember(i))
+      continue;
+    if (getNumStoreOperands() > 0) {
+      O << "\n" << Indent << "  store ";
+      getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
+      O << " to index " << i;
+    } else {
+      O << "\n" << Indent << "  ";
+      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
+      O << " = load from index " << i;
+    }
+    ++OpIdx;
+  }
+}
+#endif
+
 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
   Value *Start = getStartValue()->getLiveInIRValue();
   PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
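Since ::print moves verbatim, VPlan debug output is unchanged. For reference, a factor-2 load group renders along these lines (value names illustrative, in the style of the existing vplan-printing tests):

    INTERLEAVE-GROUP with factor 2 at %l0, ir<%gep>
      ir<%l0> = load from index 0
      ir<%l1> = load from index 1

A store group instead prints one "store ... to index i" line per member, with the group's mask appended after the address when present.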