| //===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| /// |
| /// \file |
| /// This file implements explicit unrolling for VPlans. |
| /// |
| //===----------------------------------------------------------------------===// |
| |
| #include "VPRecipeBuilder.h" |
| #include "VPlan.h" |
| #include "VPlanAnalysis.h" |
| #include "VPlanCFG.h" |
| #include "VPlanHelpers.h" |
| #include "VPlanPatternMatch.h" |
| #include "VPlanTransforms.h" |
| #include "VPlanUtils.h" |
| #include "llvm/ADT/PostOrderIterator.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/ADT/ScopeExit.h" |
| #include "llvm/Analysis/IVDescriptors.h" |
| #include "llvm/IR/Constants.h" |
| #include "llvm/IR/Intrinsics.h" |
| |
| using namespace llvm; |
| using namespace llvm::VPlanPatternMatch; |
| |
| namespace { |
| |
| /// Helper to hold state needed for unrolling. It holds the Plan to unroll by |
| /// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate |
| /// the unrolling transformation, where the original VPValues are retained for |
| /// part zero. |
| class UnrollState { |
| /// Plan to unroll. |
| VPlan &Plan; |
| /// Unroll factor to unroll by. |
| const unsigned UF; |
| /// Analysis for types. |
| VPTypeAnalysis TypeInfo; |
| |
| /// Unrolling may create recipes that should not be unrolled themselves. |
| /// Those are tracked in ToSkip. |
| SmallPtrSet<VPRecipeBase *, 8> ToSkip; |
| |
| // Associate with each VPValue of part 0 its unrolled instances of parts 1, |
| // ..., UF-1. |
| DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts; |
| |
| /// Unroll replicate region \p VPR by cloning the region UF - 1 times. |
| void unrollReplicateRegionByUF(VPRegionBlock *VPR); |
| |
| /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform across |
| /// all parts. |
| void unrollRecipeByUF(VPRecipeBase &R); |
| |
| /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled |
| /// depends on the concrete header phi. Inserts newly created recipes at \p |
| /// InsertPtForPhi. |
| void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R, |
| VPBasicBlock::iterator InsertPtForPhi); |
| |
| /// Unroll a widen induction recipe \p IV. This introduces recipes to compute |
| /// the induction steps for each part. |
| void unrollWidenInductionByUF(VPWidenInductionRecipe *IV, |
| VPBasicBlock::iterator InsertPtForPhi); |
| |
| VPValue *getConstantInt(unsigned Part) { |
| Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType(); |
| return Plan.getConstantInt(CanIVIntTy, Part); |
| } |
| |
| public: |
| UnrollState(VPlan &Plan, unsigned UF) : Plan(Plan), UF(UF), TypeInfo(Plan) {} |
| |
| void unrollBlock(VPBlockBase *VPB); |
| |
| VPValue *getValueForPart(VPValue *V, unsigned Part) { |
| if (Part == 0 || isa<VPIRValue, VPSymbolicValue, VPRegionValue>(V)) |
| return V; |
| assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) && |
| "accessed value does not exist"); |
| return VPV2Parts[V][Part - 1]; |
| } |
| |
| /// Given a single original recipe \p OrigR (of part zero), and its copy \p |
| /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its |
| /// corresponding VPValue defined by \p CopyR. |
| void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR, |
| unsigned Part) { |
| for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) { |
| const auto &[V, _] = VPV2Parts.try_emplace(VPV); |
| assert(V->second.size() == Part - 1 && "earlier parts not set"); |
| V->second.push_back(CopyR->getVPValue(Idx)); |
| } |
| } |
| |
| /// Given a uniform recipe \p R, add it for all parts. |
| void addUniformForAllParts(VPSingleDefRecipe *R) { |
| const auto &[V, Inserted] = VPV2Parts.try_emplace(R); |
| assert(Inserted && "uniform value already added"); |
| for (unsigned Part = 0; Part != UF; ++Part) |
| V->second.push_back(R); |
| } |
| |
| bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); } |
| |
| /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part |
| /// \p P. |
| void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) { |
| auto *Op = R->getOperand(OpIdx); |
| R->setOperand(OpIdx, getValueForPart(Op, Part)); |
| } |
| |
| /// Update \p R's operands with their corresponding VPValues for part \p P. |
| void remapOperands(VPRecipeBase *R, unsigned Part) { |
| for (const auto &[OpIdx, Op] : enumerate(R->operands())) |
| R->setOperand(OpIdx, getValueForPart(Op, Part)); |
| } |
| }; |
| } // namespace |
| |
| static void addStartIndexForScalarSteps(VPScalarIVStepsRecipe *Steps, |
| unsigned Part, VPlan &Plan, |
| VPTypeAnalysis &TypeInfo) { |
| if (Part == 0) |
| return; |
| |
| VPBuilder Builder(Steps); |
| Type *BaseIVTy = TypeInfo.inferScalarType(Steps->getOperand(0)); |
| Type *IntStepTy = |
| IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); |
| VPValue *StartIndex = Steps->getVFValue(); |
| if (Part > 1) { |
| StartIndex = Builder.createOverflowingOp( |
| Instruction::Mul, |
| {StartIndex, |
| Plan.getConstantInt(TypeInfo.inferScalarType(StartIndex), Part)}); |
| } |
| StartIndex = Builder.createScalarSExtOrTrunc( |
| StartIndex, IntStepTy, TypeInfo.inferScalarType(StartIndex), |
| Steps->getDebugLoc()); |
| |
| if (BaseIVTy->isFloatingPointTy()) |
| StartIndex = Builder.createScalarCast(Instruction::SIToFP, StartIndex, |
| BaseIVTy, Steps->getDebugLoc()); |
| |
| Steps->setStartIndex(StartIndex); |
| } |
| |
| void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) { |
| VPBlockBase *InsertPt = VPR->getSingleSuccessor(); |
| for (unsigned Part = 1; Part != UF; ++Part) { |
| auto *Copy = VPR->clone(); |
| VPBlockUtils::insertBlockBefore(Copy, InsertPt); |
| |
| auto PartI = vp_depth_first_shallow(Copy->getEntry()); |
| auto Part0 = vp_depth_first_shallow(VPR->getEntry()); |
| for (const auto &[PartIVPBB, Part0VPBB] : |
| zip(VPBlockUtils::blocksOnly<VPBasicBlock>(PartI), |
| VPBlockUtils::blocksOnly<VPBasicBlock>(Part0))) { |
| for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) { |
| remapOperands(&PartIR, Part); |
| if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) |
| addStartIndexForScalarSteps(Steps, Part, Plan, TypeInfo); |
| |
| addRecipeForPart(&Part0R, &PartIR, Part); |
| } |
| } |
| } |
| } |
| |
| void UnrollState::unrollWidenInductionByUF( |
| VPWidenInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) { |
| VPBasicBlock *PH = cast<VPBasicBlock>( |
| IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor()); |
| Type *IVTy = TypeInfo.inferScalarType(IV); |
| auto &ID = IV->getInductionDescriptor(); |
| FastMathFlags FMF; |
| VPIRFlags::WrapFlagsTy WrapFlags(false, false); |
| if (auto *IntOrFPInd = dyn_cast<VPWidenIntOrFpInductionRecipe>(IV)) { |
| if (IntOrFPInd->hasFastMathFlags()) |
| FMF = IntOrFPInd->getFastMathFlags(); |
| if (IntOrFPInd->hasNoWrapFlags()) |
| WrapFlags = IntOrFPInd->getNoWrapFlags(); |
| } |
| |
| VPValue *ScalarStep = IV->getStepValue(); |
| VPBuilder Builder(PH); |
| Type *VectorStepTy = |
| IVTy->isPointerTy() ? TypeInfo.inferScalarType(ScalarStep) : IVTy; |
| VPInstruction *VectorStep = Builder.createNaryOp( |
| VPInstruction::WideIVStep, {&Plan.getVF(), ScalarStep}, VectorStepTy, FMF, |
| IV->getDebugLoc()); |
| |
| ToSkip.insert(VectorStep); |
| |
| // Now create recipes to compute the induction steps for part 1 .. UF. Part 0 |
| // remains the header phi. Parts > 0 are computed by adding Step to the |
| // previous part. The header phi recipe will get 2 new operands: the step |
| // value for a single part and the last part, used to compute the backedge |
| // value during VPWidenInductionRecipe::execute. |
| // %Part.0 = VPWidenInductionRecipe %Start, %ScalarStep, %VectorStep, %Part.3 |
| // %Part.1 = %Part.0 + %VectorStep |
| // %Part.2 = %Part.1 + %VectorStep |
| // %Part.3 = %Part.2 + %VectorStep |
| // |
| // The newly added recipes are added to ToSkip to avoid interleaving them |
| // again. |
| VPValue *Prev = IV; |
| Builder.setInsertPoint(IV->getParent(), InsertPtForPhi); |
| unsigned AddOpc; |
| VPIRFlags AddFlags; |
| if (IVTy->isPointerTy()) { |
| AddOpc = VPInstruction::WidePtrAdd; |
| AddFlags = GEPNoWrapFlags::none(); |
| } else if (IVTy->isFloatingPointTy()) { |
| AddOpc = ID.getInductionOpcode(); |
| AddFlags = FMF; |
| } else { |
| AddOpc = Instruction::Add; |
| AddFlags = WrapFlags; |
| if (cast<VPWidenIntOrFpInductionRecipe>(IV)->isCanonical()) |
| AddFlags = VPIRFlags::WrapFlagsTy(/*NUW=*/true, /*NSW=*/false); |
| } |
| for (unsigned Part = 1; Part != UF; ++Part) { |
| std::string Name = |
| Part > 1 ? "step.add." + std::to_string(Part) : "step.add"; |
| |
| VPInstruction *Add = |
| Builder.createNaryOp(AddOpc, |
| { |
| Prev, |
| VectorStep, |
| }, |
| AddFlags, IV->getDebugLoc(), Name); |
| ToSkip.insert(Add); |
| addRecipeForPart(IV, Add, Part); |
| Prev = Add; |
| } |
| IV->addOperand(VectorStep); |
| IV->addOperand(Prev); |
| } |
| |
| void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R, |
| VPBasicBlock::iterator InsertPtForPhi) { |
| // First-order recurrences pass a single vector or scalar through their header |
| // phis, irrespective of interleaving. |
| if (isa<VPFirstOrderRecurrencePHIRecipe>(R)) |
| return; |
| |
| // Generate step vectors for each unrolled part. |
| if (auto *IV = dyn_cast<VPWidenInductionRecipe>(R)) { |
| unrollWidenInductionByUF(IV, InsertPtForPhi); |
| return; |
| } |
| |
| auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(R); |
| if (RdxPhi && RdxPhi->isOrdered()) |
| return; |
| |
| auto InsertPt = std::next(R->getIterator()); |
| for (unsigned Part = 1; Part != UF; ++Part) { |
| VPRecipeBase *Copy = R->clone(); |
| Copy->insertBefore(*R->getParent(), InsertPt); |
| addRecipeForPart(R, Copy, Part); |
| if (RdxPhi) { |
| // If the start value is a ReductionStartVector, use the identity value |
| // (second operand) for unrolled parts. If the scaling factor is > 1, |
| // create a new ReductionStartVector with the scale factor and both |
| // operands set to the identity value. |
| if (auto *VPI = dyn_cast<VPInstruction>(RdxPhi->getStartValue())) { |
| assert(VPI->getOpcode() == VPInstruction::ReductionStartVector && |
| "unexpected start VPInstruction"); |
| if (Part != 1) |
| continue; |
| VPValue *StartV; |
| if (match(VPI->getOperand(2), m_One())) { |
| StartV = VPI->getOperand(1); |
| } else { |
| auto *C = VPI->clone(); |
| C->setOperand(0, C->getOperand(1)); |
| C->insertAfter(VPI); |
| StartV = C; |
| } |
| for (unsigned Part = 1; Part != UF; ++Part) |
| VPV2Parts[VPI][Part - 1] = StartV; |
| } |
| } else { |
| assert(isa<VPActiveLaneMaskPHIRecipe>(R) && |
| "unexpected header phi recipe not needing unrolled part"); |
| } |
| } |
| } |
| |
| /// Handle non-header-phi recipes. |
| void UnrollState::unrollRecipeByUF(VPRecipeBase &R) { |
| if (match(&R, m_CombineOr(m_BranchOnCond(), m_BranchOnCount()))) |
| return; |
| |
| if (auto *VPI = dyn_cast<VPInstruction>(&R)) { |
| if (vputils::onlyFirstPartUsed(VPI)) { |
| addUniformForAllParts(VPI); |
| return; |
| } |
| } |
| if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) { |
| if (isa<StoreInst>(RepR->getUnderlyingValue()) && |
| RepR->getOperand(1)->isDefinedOutsideLoopRegions()) { |
| // Stores to an invariant address only need to store the last part. |
| remapOperands(&R, UF - 1); |
| return; |
| } |
| if (match(RepR, |
| m_Intrinsic<Intrinsic::experimental_noalias_scope_decl>())) { |
| addUniformForAllParts(RepR); |
| return; |
| } |
| } |
| |
| // Unroll non-uniform recipes. |
| auto InsertPt = std::next(R.getIterator()); |
| VPBasicBlock &VPBB = *R.getParent(); |
| for (unsigned Part = 1; Part != UF; ++Part) { |
| VPRecipeBase *Copy = R.clone(); |
| Copy->insertBefore(VPBB, InsertPt); |
| addRecipeForPart(&R, Copy, Part); |
| |
| // Phi operands are updated once all other recipes have been unrolled. |
| if (isa<VPWidenPHIRecipe>(Copy)) |
| continue; |
| |
| VPValue *Op; |
| if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>( |
| m_VPValue(), m_VPValue(Op)))) { |
| Copy->setOperand(0, getValueForPart(Op, Part - 1)); |
| Copy->setOperand(1, getValueForPart(Op, Part)); |
| continue; |
| } |
| if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(&R)) { |
| VPBuilder Builder(VPR); |
| const DataLayout &DL = Plan.getDataLayout(); |
| Type *IndexTy = DL.getIndexType(TypeInfo.inferScalarType(VPR)); |
| Type *VFTy = TypeInfo.inferScalarType(&Plan.getVF()); |
| VPValue *VF = Builder.createScalarZExtOrTrunc( |
| &Plan.getVF(), IndexTy, VFTy, DebugLoc::getUnknown()); |
| // VFxUF does not wrap, so VF * Part also cannot wrap. |
| VPValue *VFxPart = Builder.createOverflowingOp( |
| Instruction::Mul, {VF, Plan.getConstantInt(IndexTy, Part)}, |
| {true, true}); |
| Copy->setOperand(0, VPR->getOperand(0)); |
| Copy->addOperand(VFxPart); |
| continue; |
| } |
| if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) { |
| auto *Phi = dyn_cast<VPReductionPHIRecipe>(R.getOperand(0)); |
| if (Phi && Phi->isOrdered()) { |
| auto &Parts = VPV2Parts[Phi]; |
| if (Part == 1) { |
| Parts.clear(); |
| Parts.push_back(Red); |
| } |
| Parts.push_back(Copy->getVPSingleValue()); |
| Phi->setOperand(1, Copy->getVPSingleValue()); |
| } |
| } |
| if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(Copy)) { |
| // Materialize PartN offset for VectorEndPointer. |
| VEPR->setOperand(0, R.getOperand(0)); |
| VEPR->setOperand(1, R.getOperand(1)); |
| VEPR->materializeOffset(Part); |
| continue; |
| } |
| |
| remapOperands(Copy, Part); |
| |
| if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Copy)) |
| addStartIndexForScalarSteps(ScalarIVSteps, Part, Plan, TypeInfo); |
| |
| // Add operand indicating the part to generate code for, to recipes still |
| // requiring it. |
| if (isa<VPWidenCanonicalIVRecipe>(Copy)) |
| Copy->addOperand(getConstantInt(Part)); |
| |
| if (match(Copy, |
| m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>())) { |
| VPBuilder Builder(Copy); |
| VPValue *ScaledByPart = Builder.createOverflowingOp( |
| Instruction::Mul, {Copy->getOperand(1), getConstantInt(Part)}); |
| Copy->setOperand(1, ScaledByPart); |
| } |
| } |
| if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) { |
| // Materialize Part0 offset for VectorEndPointer. |
| VEPR->materializeOffset(); |
| } |
| } |
| |
| void UnrollState::unrollBlock(VPBlockBase *VPB) { |
| auto *VPR = dyn_cast<VPRegionBlock>(VPB); |
| if (VPR) { |
| if (VPR->isReplicator()) |
| return unrollReplicateRegionByUF(VPR); |
| |
| // Traverse blocks in region in RPO to ensure defs are visited before uses |
| // across blocks. |
| ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> |
| RPOT(VPR->getEntry()); |
| for (VPBlockBase *VPB : RPOT) |
| unrollBlock(VPB); |
| return; |
| } |
| |
| // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes. |
| auto *VPBB = cast<VPBasicBlock>(VPB); |
| auto InsertPtForPhi = VPBB->getFirstNonPhi(); |
| for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { |
| if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R)) |
| continue; |
| |
| // Add all VPValues for all parts to AnyOf, FirstActiveLaneMask and |
| // ComputeReductionResult which combine all parts to compute the final |
| // value. |
| VPValue *Op1; |
| if (match(&R, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(Op1))) || |
| match(&R, m_FirstActiveLane(m_VPValue(Op1))) || |
| match(&R, m_LastActiveLane(m_VPValue(Op1))) || |
| match(&R, m_ComputeReductionResult(m_VPValue(Op1)))) { |
| addUniformForAllParts(cast<VPInstruction>(&R)); |
| for (unsigned Part = 1; Part != UF; ++Part) |
| R.addOperand(getValueForPart(Op1, Part)); |
| continue; |
| } |
| VPValue *Op0; |
| if (match(&R, m_ExtractLane(m_VPValue(Op0), m_VPValue(Op1)))) { |
| addUniformForAllParts(cast<VPInstruction>(&R)); |
| for (unsigned Part = 1; Part != UF; ++Part) |
| R.addOperand(getValueForPart(Op1, Part)); |
| continue; |
| } |
| |
| VPValue *Op2; |
| if (match(&R, m_ExtractLastActive(m_VPValue(), m_VPValue(Op1), |
| m_VPValue(Op2)))) { |
| addUniformForAllParts(cast<VPInstruction>(&R)); |
| for (unsigned Part = 1; Part != UF; ++Part) { |
| R.addOperand(getValueForPart(Op1, Part)); |
| R.addOperand(getValueForPart(Op2, Part)); |
| } |
| continue; |
| } |
| |
| if (Plan.hasScalarVFOnly()) { |
| if (match(&R, m_ExtractLastPart(m_VPValue(Op0))) || |
| match(&R, m_ExtractPenultimateElement(m_VPValue(Op0)))) { |
| auto *I = cast<VPInstruction>(&R); |
| bool IsPenultimatePart = |
| I->getOpcode() == VPInstruction::ExtractPenultimateElement; |
| unsigned PartIdx = IsPenultimatePart ? UF - 2 : UF - 1; |
| // For scalar VF, directly use the scalar part value. |
| I->replaceAllUsesWith(getValueForPart(Op0, PartIdx)); |
| continue; |
| } |
| } |
| // For vector VF, the penultimate element is always extracted from the last part. |
| if (match(&R, m_ExtractLastLaneOfLastPart(m_VPValue(Op0))) || |
| match(&R, m_ExtractPenultimateElement(m_VPValue(Op0)))) { |
| addUniformForAllParts(cast<VPSingleDefRecipe>(&R)); |
| R.setOperand(0, getValueForPart(Op0, UF - 1)); |
| continue; |
| } |
| |
| auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R); |
| if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) { |
| addUniformForAllParts(SingleDef); |
| continue; |
| } |
| |
| if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) { |
| unrollHeaderPHIByUF(H, InsertPtForPhi); |
| continue; |
| } |
| |
| unrollRecipeByUF(R); |
| } |
| } |
| |
| void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) { |
| assert(UF > 0 && "Unroll factor must be positive"); |
| Plan.setUF(UF); |
| llvm::scope_exit Cleanup([&Plan, UF]() { |
| auto Iter = vp_depth_first_deep(Plan.getEntry()); |
| // Remove recipes that are redundant after unrolling. |
| for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { |
| for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { |
| auto *VPI = dyn_cast<VPInstruction>(&R); |
| if (VPI && |
| VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart && |
| VPI->getOperand(1) == &Plan.getVF()) { |
| VPI->replaceAllUsesWith(VPI->getOperand(0)); |
| VPI->eraseFromParent(); |
| } |
| } |
| } |
| |
| Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount()); |
| Plan.getUF().replaceAllUsesWith(Plan.getConstantInt(TCTy, UF)); |
| }); |
| if (UF == 1) { |
| return; |
| } |
| |
| UnrollState Unroller(Plan, UF); |
| |
| // Iterate over all blocks in the plan starting from Entry, and unroll |
| // recipes inside them. This includes the vector preheader and middle blocks, |
| // which may set up or post-process per-part values. |
| ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT( |
| Plan.getEntry()); |
| for (VPBlockBase *VPB : RPOT) |
| Unroller.unrollBlock(VPB); |
| |
| unsigned Part = 1; |
| // Remap operands of cloned header phis to update backedge values. The header |
| // phis cloned during unrolling are just after the header phi for part 0. |
| // Reset Part to 1 when reaching the first (part 0) recipe of a block. |
| for (VPRecipeBase &H : |
| Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { |
| // The second operand of Fixed Order Recurrence phi's, feeding the spliced |
| // value across the backedge, needs to remap to the last part of the spliced |
| // value. |
| if (isa<VPFirstOrderRecurrencePHIRecipe>(&H)) { |
| Unroller.remapOperand(&H, 1, UF - 1); |
| continue; |
| } |
| if (Unroller.contains(H.getVPSingleValue())) { |
| Part = 1; |
| continue; |
| } |
| Unroller.remapOperands(&H, Part); |
| Part++; |
| } |
| |
| VPlanTransforms::removeDeadRecipes(Plan); |
| } |
| |
| /// Add a lane offset to the start index of \p Steps. |
| static void addLaneToStartIndex(VPScalarIVStepsRecipe *Steps, unsigned Lane, |
| VPlan &Plan, VPRecipeBase *InsertPt) { |
| assert(Lane > 0 && "Zero lane adds no offset to start index"); |
| VPTypeAnalysis TypeInfo(Plan); |
| Type *BaseIVTy = TypeInfo.inferScalarType(Steps->getOperand(0)); |
| |
| VPValue *OldStartIndex = Steps->getStartIndex(); |
| VPValue *LaneOffset; |
| unsigned AddOpcode; |
| // TODO: Retrieve the flags from Steps unconditionally. |
| VPIRFlags Flags; |
| if (BaseIVTy->isFloatingPointTy()) { |
| int SignedLane = static_cast<int>(Lane); |
| if (!OldStartIndex && Steps->getInductionOpcode() == Instruction::FSub) |
| SignedLane = -SignedLane; |
| LaneOffset = Plan.getOrAddLiveIn(ConstantFP::get(BaseIVTy, SignedLane)); |
| AddOpcode = Steps->getInductionOpcode(); |
| Flags = VPIRFlags(FastMathFlags()); |
| } else { |
| unsigned BaseIVBits = BaseIVTy->getScalarSizeInBits(); |
| LaneOffset = Plan.getConstantInt( |
| APInt(BaseIVBits, Lane, /*isSigned*/ false, /*implicitTrunc*/ true)); |
| AddOpcode = Instruction::Add; |
| Flags = VPIRFlags(VPIRFlags::WrapFlagsTy(false, false)); |
| } |
| |
| VPValue *NewStartIndex = LaneOffset; |
| if (OldStartIndex) { |
| VPBuilder Builder(InsertPt); |
| NewStartIndex = |
| Builder.createNaryOp(AddOpcode, {OldStartIndex, LaneOffset}, Flags); |
| } |
| Steps->setStartIndex(NewStartIndex); |
| } |
| |
| /// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe, |
| /// VPInstruction or VPScalarIVStepsRecipe) for lane \p Lane. Use \p |
| /// Def2LaneDefs to look up scalar definitions for operands of \DefR. |
| static VPValue * |
| cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, |
| VPSingleDefRecipe *DefR, VPLane Lane, |
| const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) { |
| assert((isa<VPInstruction, VPReplicateRecipe, VPScalarIVStepsRecipe>(DefR)) && |
| "DefR must be a VPReplicateRecipe, VPInstruction or " |
| "VPScalarIVStepsRecipe"); |
| VPValue *Op; |
| if (match(DefR, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)))) { |
| auto LaneDefs = Def2LaneDefs.find(Op); |
| if (LaneDefs != Def2LaneDefs.end()) |
| return LaneDefs->second[Lane.getKnownLane()]; |
| |
| VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane()); |
| return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}); |
| } |
| |
| // Collect the operands at Lane, creating extracts as needed. |
| SmallVector<VPValue *> NewOps; |
| for (VPValue *Op : DefR->operands()) { |
| // If Op is a definition that has been unrolled, directly use the clone for |
| // the corresponding lane. |
| auto LaneDefs = Def2LaneDefs.find(Op); |
| if (LaneDefs != Def2LaneDefs.end()) { |
| NewOps.push_back(LaneDefs->second[Lane.getKnownLane()]); |
| continue; |
| } |
| if (Lane.getKind() == VPLane::Kind::ScalableLast) { |
| // Look through mandatory Unpack. |
| [[maybe_unused]] bool Matched = |
| match(Op, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op))); |
| assert(Matched && "original op must have been Unpack"); |
| auto *ExtractPart = |
| Builder.createNaryOp(VPInstruction::ExtractLastPart, {Op}); |
| NewOps.push_back( |
| Builder.createNaryOp(VPInstruction::ExtractLastLane, {ExtractPart})); |
| continue; |
| } |
| if (vputils::isSingleScalar(Op)) { |
| NewOps.push_back(Op); |
| continue; |
| } |
| |
| // Look through buildvector to avoid unnecessary extracts. |
| if (match(Op, m_BuildVector())) { |
| NewOps.push_back( |
| cast<VPInstruction>(Op)->getOperand(Lane.getKnownLane())); |
| continue; |
| } |
| VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane()); |
| VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}); |
| NewOps.push_back(Ext); |
| } |
| |
| VPSingleDefRecipe *New; |
| if (auto *RepR = dyn_cast<VPReplicateRecipe>(DefR)) { |
| // TODO: have cloning of replicate recipes also provide the desired result |
| // coupled with setting its operands to NewOps (deriving IsSingleScalar and |
| // Mask from the operands?) |
| New = new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps, |
| /*IsSingleScalar=*/true, /*Mask=*/nullptr, |
| *RepR, *RepR, RepR->getDebugLoc()); |
| } else { |
| New = DefR->clone(); |
| for (const auto &[Idx, Op] : enumerate(NewOps)) { |
| New->setOperand(Idx, Op); |
| } |
| if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(New)) { |
| // Skip lane 0: an absent start index is implicitly zero. |
| unsigned KnownLane = Lane.getKnownLane(); |
| if (KnownLane != 0) |
| addLaneToStartIndex(Steps, KnownLane, Plan, DefR); |
| } |
| } |
| New->insertBefore(DefR); |
| return New; |
| } |
| |
| /// Convert recipes in region blocks to operate on a single lane 0. |
| /// VPReplicateRecipes are converted to single-scalar ones, branch-on-mask is |
| /// converted into BranchOnCond, PredInstPhi recipes are replaced by scalar phi |
| /// recipes with an additional poison operand, and extracts are created as |
| /// needed. |
| static void convertRecipesInRegionBlocksToSingleScalar(VPlan &Plan, Type *IdxTy, |
| VPBlockBase *Entry, |
| ElementCount VF) { |
| VPValue *Idx0 = Plan.getZero(IdxTy); |
| VPTypeAnalysis TypeInfo(Plan); |
| for (VPBlockBase *VPB : vp_depth_first_shallow(Entry)) { |
| for (VPRecipeBase &OldR : make_early_inc_range(cast<VPBasicBlock>(*VPB))) { |
| assert( |
| !isa<VPWidenPHIRecipe>(&OldR) && |
| !match(&OldR, |
| m_CombineOr( |
| m_InsertElement(m_VPValue(), m_VPValue(), m_VPValue()), |
| m_ExtractElement(m_VPValue(), m_VPValue()))) && |
| "must not contain wide phis, inserts or extracts before conversion"); |
| |
| VPBuilder Builder(&OldR); |
| DebugLoc OldDL = OldR.getDebugLoc(); |
| // For scalar VF, operands are already scalar; no extraction needed. |
| if (!VF.isScalar()) { |
| for (const auto &[I, Op] : enumerate(OldR.operands())) { |
| // Skip operands that don't need extraction: values defined in the |
| // same block (already scalar), or values that are already single |
| // scalars. |
| // TODO: Support isSingleScalar for VPScalarIVStepsRecipe. |
| auto *DefR = Op->getDefiningRecipe(); |
| if ((isa_and_present<VPScalarIVStepsRecipe>(DefR) && |
| DefR->getParent() == VPB) || |
| vputils::isSingleScalar(Op)) |
| continue; |
| |
| // Extract lane zero from values defined outside the region. |
| VPValue *Extract = Builder.createNaryOp(Instruction::ExtractElement, |
| {Op, Idx0}, OldDL); |
| OldR.setOperand(I, Extract); |
| } |
| } |
| |
| if (auto *RepR = dyn_cast<VPReplicateRecipe>(&OldR)) { |
| auto *NewR = new VPReplicateRecipe( |
| RepR->getUnderlyingInstr(), RepR->operands(), |
| /* IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR, *RepR, OldDL); |
| NewR->insertBefore(RepR); |
| RepR->replaceAllUsesWith(NewR); |
| RepR->eraseFromParent(); |
| } else if (auto *BranchOnMask = dyn_cast<VPBranchOnMaskRecipe>(&OldR)) { |
| Builder.createNaryOp(VPInstruction::BranchOnCond, |
| {BranchOnMask->getOperand(0)}, OldDL); |
| BranchOnMask->eraseFromParent(); |
| } else if (auto *PredPhi = dyn_cast<VPPredInstPHIRecipe>(&OldR)) { |
| VPValue *PredOp = PredPhi->getOperand(0); |
| Type *PredTy = TypeInfo.inferScalarType(PredOp); |
| VPValue *Poison = Plan.getOrAddLiveIn(PoisonValue::get(PredTy)); |
| VPPhi *NewPhi = Builder.createScalarPhi({Poison, PredOp}, OldDL); |
| PredPhi->replaceAllUsesWith(NewPhi); |
| PredPhi->eraseFromParent(); |
| } else { |
| // TODO: Support isSingleScalar for VPScalarIVStepsRecipe. |
| assert((isa<VPScalarIVStepsRecipe>(OldR) || |
| (isa<VPInstruction>(OldR) && |
| vputils::isSingleScalar(OldR.getVPSingleValue()))) && |
| "unexpected unhandled recipe"); |
| } |
| } |
| } |
| } |
| |
| /// Update recipes in the cloned blocks rooted at \p NewEntry to match \p Lane, |
| /// using the original blocks rooted at \p OldEntry as reference. |
| static void processLaneForReplicateRegion(VPlan &Plan, Type *IdxTy, |
| unsigned Lane, VPBasicBlock *OldEntry, |
| VPBasicBlock *NewEntry) { |
| DenseMap<VPValue *, VPValue *> Old2NewVPValues; |
| VPValue *IdxLane = Plan.getConstantInt(IdxTy, Lane); |
| for (const auto &[OldBB, NewBB] : |
| zip_equal(vp_depth_first_shallow(OldEntry), |
| vp_depth_first_shallow(NewEntry))) { |
| for (auto &&[OldR, NewR] : |
| zip_equal(*cast<VPBasicBlock>(OldBB), *cast<VPBasicBlock>(NewBB))) { |
| for (const auto &[OldV, NewV] : |
| zip_equal(OldR.definedValues(), NewR.definedValues())) |
| Old2NewVPValues[OldV] = NewV; |
| |
| // Remap operands to use lane-specific values. |
| for (const auto &[I, OldOp] : enumerate(NewR.operands())) { |
| // Use cloned value if operand was defined in the region. |
| if (auto *NewOp = Old2NewVPValues.lookup(OldOp)) |
| NewR.setOperand(I, NewOp); |
| } |
| |
| if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(&NewR)) { |
| addLaneToStartIndex(Steps, Lane, Plan, Steps); |
| } else if (match(&NewR, m_ExtractElement(m_VPValue(), m_VPValue()))) { |
| assert(match(NewR.getOperand(1), m_ZeroInt()) && |
| "extract indices must be zero"); |
| NewR.setOperand(1, IdxLane); |
| } else if (auto *NewPhi = dyn_cast<VPPhi>(&NewR)) { |
| auto *OldPhi = cast<VPPhi>(&OldR); |
| assert(vputils::onlyFirstLaneUsed(OldPhi) && |
| "VPPhis expected to have only first lane used"); |
| auto *BVUser = dyn_cast_or_null<VPInstruction>(OldPhi->getSingleUser()); |
| if (BVUser && match(BVUser, m_CombineOr(m_BuildVector(), |
| m_BuildStructVector()))) { |
| assert(BVUser->getOperand(0) == OldPhi && |
| "Unexpected first operand of build vector user"); |
| BVUser->setOperand(Lane, NewPhi); |
| } |
| } |
| } |
| } |
| } |
| |
| /// Dissolve a single replicate region by replicating its blocks for each lane |
| /// of \p VF. The region is disconnected, its blocks are reparented, cloned for |
| /// each lane, and reconnected in sequence. |
| static void dissolveReplicateRegion(VPRegionBlock *Region, ElementCount VF, |
| VPlan &Plan, Type *IdxTy) { |
| auto *FirstLaneEntry = cast<VPBasicBlock>(Region->getEntry()); |
| auto *FirstLaneExiting = cast<VPBasicBlock>(Region->getExiting()); |
| |
| // Disconnect and dissolve the region. |
| VPBlockBase *Predecessor = Region->getSinglePredecessor(); |
| assert(Predecessor && "Replicate region must have a single predecessor"); |
| auto *Successor = cast<VPBasicBlock>(Region->getSingleSuccessor()); |
| VPBlockUtils::disconnectBlocks(Predecessor, Region); |
| VPBlockUtils::disconnectBlocks(Region, Successor); |
| |
| VPRegionBlock *ParentRegion = Region->getParent(); |
| for (VPBlockBase *VPB : vp_depth_first_shallow(FirstLaneEntry)) |
| VPB->setParent(ParentRegion); |
| |
| // Process the original blocks for lane 0: converting their recipes to |
| // single-scalar. |
| convertRecipesInRegionBlocksToSingleScalar(Plan, IdxTy, FirstLaneEntry, VF); |
| |
| // For scalar VF, just wire the blocks and return; no cloning or packing |
| // needed. |
| if (VF.isScalar()) { |
| VPBlockUtils::connectBlocks(Predecessor, FirstLaneEntry); |
| VPBlockUtils::connectBlocks(FirstLaneExiting, Successor); |
| return; |
| } |
| |
| // Create a BuildVector or BuildStructVector in successor block for every |
| // VPPhi in (first lane's) exiting block having vector uses. All their |
| // operands are initialized to poison and will be replaced when processing |
| // each clone, except for the operand of the first lane which set here. |
| // BuildVectors are recorded to be replaced later by chains of insert-element |
| // and widen phi's. |
| unsigned NumLanes = VF.getFixedValue(); |
| VPTypeAnalysis TypeInfo(Plan); |
| SmallVector<VPInstruction *> BuildVectors; |
| for (auto &R : FirstLaneExiting->phis()) { |
| auto *Phi = cast<VPPhi>(&R); |
| if (vputils::onlyFirstLaneUsed(Phi)) |
| continue; |
| |
| Type *ScalarTy = TypeInfo.inferScalarType(Phi); |
| bool IsStruct = isa<StructType>(ScalarTy); |
| VPValue *Poison = Plan.getOrAddLiveIn(PoisonValue::get(ScalarTy)); |
| SmallVector<VPValue *> BVOps(NumLanes, Poison); |
| auto *BV = new VPInstruction(IsStruct ? VPInstruction::BuildStructVector |
| : VPInstruction::BuildVector, |
| BVOps); |
| if (!IsStruct) |
| BuildVectors.push_back(BV); |
| Phi->replaceAllUsesWith(BV); |
| BV->setOperand(0, Phi); |
| BV->insertBefore(*Successor, Successor->getFirstNonPhi()); |
| } |
| |
| // Clone converted blocks for remaining lanes and process each in reverse |
| // order, connecting each lane's Exiting block to the subsequent lane's entry. |
| VPBlockBase *NextLaneEntry = Successor; |
| for (int Lane = NumLanes - 1; Lane > 0; --Lane) { |
| const auto &[CurrentLaneEntry, CurrentLaneExiting] = |
| VPBlockUtils::cloneFrom(FirstLaneEntry); |
| for (VPBlockBase *VPB : vp_depth_first_shallow(CurrentLaneEntry)) |
| VPB->setParent(ParentRegion); |
| processLaneForReplicateRegion(Plan, IdxTy, Lane, |
| cast<VPBasicBlock>(FirstLaneEntry), |
| cast<VPBasicBlock>(CurrentLaneEntry)); |
| VPBlockUtils::connectBlocks(CurrentLaneExiting, NextLaneEntry); |
| NextLaneEntry = CurrentLaneEntry; |
| } |
| |
| // Connect Predecessor to FirstLaneEntry, and FirstLaneRegionExit to |
| // NextLaneEntry which is the second lane region entry. The latter is |
| // done last so that earlier clonings from FirstLaneEntry stop at |
| // FirstLaneExiting. |
| VPBlockUtils::connectBlocks(Predecessor, FirstLaneEntry); |
| VPBlockUtils::connectBlocks(FirstLaneExiting, NextLaneEntry); |
| |
| // Fold BuildVector fed by scalar phis into VPWidenPHIRecipes with |
| // InsertElement per lane. |
| // TODO: check if this folding should be dropped. |
| for (VPInstruction *BV : BuildVectors) { |
| assert(BV->getNumOperands() == NumLanes && |
| "BuildVector must have one operand per lane"); |
| for (const auto &[Idx, Op] : enumerate(BV->operands())) { |
| auto *ScalarPhi = cast<VPPhi>(Op); |
| auto DL = ScalarPhi->getDebugLoc(); |
| auto *PredOp = cast<VPSingleDefRecipe>(ScalarPhi->getOperand(1)); |
| VPValue *Poison = ScalarPhi->getOperand(0); |
| VPValue *PrevVal = Idx == 0 ? Poison : BV->getOperand(Idx - 1); |
| auto Builder = VPBuilder::getToInsertAfter(PredOp->getDefiningRecipe()); |
| auto *Insert = Builder.createNaryOp( |
| Instruction::InsertElement, |
| {PrevVal, PredOp, Plan.getConstantInt(64, Idx)}, DL); |
| Builder.setInsertPoint(ScalarPhi); |
| auto *NewPhi = Builder.createWidenPhi({PrevVal, Insert}, DL); |
| ScalarPhi->replaceAllUsesWith(NewPhi); |
| ScalarPhi->eraseFromParent(); |
| } |
| BV->replaceAllUsesWith(BV->getOperand(NumLanes - 1)); |
| BV->eraseFromParent(); |
| } |
| } |
| |
| /// Collect and dissolve all replicate regions in the vector loop, replicating |
| /// their blocks and recipes for each lane of \p VF. |
| static void replicateReplicateRegionsByVF(VPlan &Plan, ElementCount VF, |
| Type *IdxTy) { |
| // Collect all replicate regions before modifying the CFG. |
| SmallVector<VPRegionBlock *> ReplicateRegions; |
| for (VPRegionBlock *Region : VPBlockUtils::blocksOnly<VPRegionBlock>( |
| vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { |
| if (Region->isReplicator()) |
| ReplicateRegions.push_back(Region); |
| } |
| |
| assert((ReplicateRegions.empty() || !VF.isScalable()) && |
| "cannot replicate across scalable VFs"); |
| |
| // Dissolve replicate regions by replicating their blocks for each lane. |
| // Traversing regions in reverse ensures that the successor of every region |
| // being processed is a basic-block, rather than another region. |
| for (VPRegionBlock *Region : reverse(ReplicateRegions)) |
| dissolveReplicateRegion(Region, VF, Plan, IdxTy); |
| |
| VPlanTransforms::mergeBlocksIntoPredecessors(Plan); |
| } |
| |
| void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { |
| Type *IdxTy = IntegerType::get( |
| Plan.getScalarHeader()->getIRBasicBlock()->getContext(), 32); |
| |
| if (Plan.hasScalarVFOnly()) { |
| // When Plan is only unrolled by UF, replicating by VF amounts to dissolving |
| // replicate regions. |
| replicateReplicateRegionsByVF(Plan, VF, IdxTy); |
| return; |
| } |
| |
| // Visit all VPBBs outside the loop region and directly inside the top-level |
| // loop region. |
| auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>( |
| vp_depth_first_shallow(Plan.getEntry())); |
| auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>( |
| vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry())); |
| auto VPBBsToUnroll = |
| concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion); |
| // A mapping of current VPValue definitions to collections of new VPValues |
| // defined per lane. Serves to hook-up potential users of current VPValue |
| // definition that are replicated-per-VF later. |
| DenseMap<VPValue *, SmallVector<VPValue *>> Def2LaneDefs; |
| // The removal of current recipes being replaced by new ones needs to be |
| // delayed after Def2LaneDefs is no longer in use. |
| SmallVector<VPRecipeBase *> ToRemove; |
| for (VPBasicBlock *VPBB : VPBBsToUnroll) { |
| for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { |
| if (!isa<VPInstruction, VPReplicateRecipe, VPScalarIVStepsRecipe>(&R) || |
| (isa<VPReplicateRecipe>(&R) && |
| cast<VPReplicateRecipe>(&R)->isSingleScalar()) || |
| (isa<VPInstruction>(&R) && |
| !cast<VPInstruction>(&R)->doesGeneratePerAllLanes() && |
| cast<VPInstruction>(&R)->getOpcode() != VPInstruction::Unpack)) |
| continue; |
| |
| auto *DefR = cast<VPSingleDefRecipe>(&R); |
| VPBuilder Builder(DefR); |
| if (DefR->getNumUsers() == 0) { |
| // Create single-scalar version of DefR for all lanes. |
| for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) |
| cloneForLane(Plan, Builder, IdxTy, DefR, VPLane(I), Def2LaneDefs); |
| DefR->eraseFromParent(); |
| continue; |
| } |
| /// Create single-scalar version of DefR for all lanes. |
| SmallVector<VPValue *> LaneDefs; |
| for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) |
| LaneDefs.push_back( |
| cloneForLane(Plan, Builder, IdxTy, DefR, VPLane(I), Def2LaneDefs)); |
| |
| Def2LaneDefs[DefR] = LaneDefs; |
| /// Users that only demand the first lane can use the definition for lane |
| /// 0. |
| DefR->replaceUsesWithIf(LaneDefs[0], [DefR](VPUser &U, unsigned) { |
| return U.usesFirstLaneOnly(DefR); |
| }); |
| |
| // Update each build vector user that currently has DefR as its only |
| // operand, to have all LaneDefs as its operands. |
| for (VPUser *U : to_vector(DefR->users())) { |
| auto *VPI = dyn_cast<VPInstruction>(U); |
| if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector && |
| VPI->getOpcode() != VPInstruction::BuildStructVector)) |
| continue; |
| assert(VPI->getNumOperands() == 1 && |
| "Build(Struct)Vector must have a single operand before " |
| "replicating by VF"); |
| VPI->setOperand(0, LaneDefs[0]); |
| for (VPValue *LaneDef : drop_begin(LaneDefs)) |
| VPI->addOperand(LaneDef); |
| } |
| ToRemove.push_back(DefR); |
| } |
| } |
| for (auto *R : reverse(ToRemove)) |
| R->eraseFromParent(); |
| |
| replicateReplicateRegionsByVF(Plan, VF, IdxTy); |
| } |