| //===---- EVLIndVarSimplify.cpp - Optimize vectorized loops w/ EVL IV------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This pass optimizes a vectorized loop with canonical IV to using EVL-based |
| // IV if it was tail-folded by predicated EVL. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h" |
| #include "llvm/ADT/Statistic.h" |
| #include "llvm/Analysis/IVDescriptors.h" |
| #include "llvm/Analysis/LoopInfo.h" |
| #include "llvm/Analysis/LoopPass.h" |
| #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| #include "llvm/Analysis/ScalarEvolution.h" |
| #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
| #include "llvm/Analysis/ValueTracking.h" |
| #include "llvm/IR/IRBuilder.h" |
| #include "llvm/IR/PatternMatch.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/MathExtras.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include "llvm/Transforms/Scalar/LoopPassManager.h" |
| #include "llvm/Transforms/Utils/Local.h" |
| |
| #define DEBUG_TYPE "evl-iv-simplify" |
| |
| using namespace llvm; |
| |
| STATISTIC(NumEliminatedCanonicalIV, "Number of canonical IVs we eliminated"); |
| |
| static cl::opt<bool> EnableEVLIndVarSimplify( |
| "enable-evl-indvar-simplify", |
| cl::desc("Enable EVL-based induction variable simplify Pass"), cl::Hidden, |
| cl::init(true)); |
| |
| namespace { |
| struct EVLIndVarSimplifyImpl { |
| ScalarEvolution &SE; |
| OptimizationRemarkEmitter *ORE = nullptr; |
| |
| EVLIndVarSimplifyImpl(LoopStandardAnalysisResults &LAR, |
| OptimizationRemarkEmitter *ORE) |
| : SE(LAR.SE), ORE(ORE) {} |
| |
| /// Returns true if modify the loop. |
| bool run(Loop &L); |
| }; |
| } // anonymous namespace |
| |
| /// Returns the constant part of vectorization factor from the induction |
| /// variable's step value SCEV expression. |
| static uint32_t getVFFromIndVar(const SCEV *Step, const Function &F) { |
| if (!Step) |
| return 0U; |
| |
| // Looking for loops with IV step value in the form of `(<constant VF> x |
| // vscale)`. |
| if (const auto *Mul = dyn_cast<SCEVMulExpr>(Step)) { |
| if (Mul->getNumOperands() == 2) { |
| const SCEV *LHS = Mul->getOperand(0); |
| const SCEV *RHS = Mul->getOperand(1); |
| if (const auto *Const = dyn_cast<SCEVConstant>(LHS); |
| Const && isa<SCEVVScale>(RHS)) { |
| uint64_t V = Const->getAPInt().getLimitedValue(); |
| if (llvm::isUInt<32>(V)) |
| return V; |
| } |
| } |
| } |
| |
| // If not, see if the vscale_range of the parent function is a fixed value, |
| // which makes the step value to be replaced by a constant. |
| if (F.hasFnAttribute(Attribute::VScaleRange)) |
| if (const auto *ConstStep = dyn_cast<SCEVConstant>(Step)) { |
| APInt V = ConstStep->getAPInt().abs(); |
| ConstantRange CR = llvm::getVScaleRange(&F, 64); |
| if (const APInt *Fixed = CR.getSingleElement()) { |
| V = V.zextOrTrunc(Fixed->getBitWidth()); |
| uint64_t VF = V.udiv(*Fixed).getLimitedValue(); |
| if (VF && llvm::isUInt<32>(VF) && |
| // Make sure step is divisible by vscale. |
| V.urem(*Fixed).isZero()) |
| return VF; |
| } |
| } |
| |
| return 0U; |
| } |
| |
| bool EVLIndVarSimplifyImpl::run(Loop &L) { |
| if (!EnableEVLIndVarSimplify) |
| return false; |
| |
| if (!getBooleanLoopAttribute(&L, "llvm.loop.isvectorized")) |
| return false; |
| const MDOperand *EVLMD = |
| findStringMetadataForLoop(&L, "llvm.loop.isvectorized.tailfoldingstyle") |
| .value_or(nullptr); |
| if (!EVLMD || !EVLMD->equalsStr("evl")) |
| return false; |
| |
| BasicBlock *LatchBlock = L.getLoopLatch(); |
| ICmpInst *OrigLatchCmp = L.getLatchCmpInst(); |
| if (!LatchBlock || !OrigLatchCmp) |
| return false; |
| |
| InductionDescriptor IVD; |
| PHINode *IndVar = L.getInductionVariable(SE); |
| if (!IndVar || !L.getInductionDescriptor(SE, IVD)) { |
| const char *Reason = (IndVar ? "induction descriptor is not available" |
| : "cannot recognize induction variable"); |
| LLVM_DEBUG(dbgs() << "Cannot retrieve IV from loop " << L.getName() |
| << " because" << Reason << "\n"); |
| if (ORE) { |
| ORE->emit([&]() { |
| return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedIndVar", |
| L.getStartLoc(), L.getHeader()) |
| << "Cannot retrieve IV because " << ore::NV("Reason", Reason); |
| }); |
| } |
| return false; |
| } |
| |
| BasicBlock *InitBlock, *BackEdgeBlock; |
| if (!L.getIncomingAndBackEdge(InitBlock, BackEdgeBlock)) { |
| LLVM_DEBUG(dbgs() << "Expect unique incoming and backedge in " |
| << L.getName() << "\n"); |
| if (ORE) { |
| ORE->emit([&]() { |
| return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedLoopStructure", |
| L.getStartLoc(), L.getHeader()) |
| << "Does not have a unique incoming and backedge"; |
| }); |
| } |
| return false; |
| } |
| |
| // Retrieve the loop bounds. |
| std::optional<Loop::LoopBounds> Bounds = L.getBounds(SE); |
| if (!Bounds) { |
| LLVM_DEBUG(dbgs() << "Could not obtain the bounds for loop " << L.getName() |
| << "\n"); |
| if (ORE) { |
| ORE->emit([&]() { |
| return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedLoopStructure", |
| L.getStartLoc(), L.getHeader()) |
| << "Could not obtain the loop bounds"; |
| }); |
| } |
| return false; |
| } |
| Value *CanonicalIVInit = &Bounds->getInitialIVValue(); |
| Value *CanonicalIVFinal = &Bounds->getFinalIVValue(); |
| |
| const SCEV *StepV = IVD.getStep(); |
| uint32_t VF = getVFFromIndVar(StepV, *L.getHeader()->getParent()); |
| if (!VF) { |
| LLVM_DEBUG(dbgs() << "Could not infer VF from IndVar step '" << *StepV |
| << "'\n"); |
| if (ORE) { |
| ORE->emit([&]() { |
| return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedIndVar", |
| L.getStartLoc(), L.getHeader()) |
| << "Could not infer VF from IndVar step " |
| << ore::NV("Step", StepV); |
| }); |
| } |
| return false; |
| } |
| LLVM_DEBUG(dbgs() << "Using VF=" << VF << " for loop " << L.getName() |
| << "\n"); |
| |
| // Try to find the EVL-based induction variable. |
| using namespace PatternMatch; |
| BasicBlock *BB = IndVar->getParent(); |
| |
| Value *EVLIndVar = nullptr; |
| Value *RemTC = nullptr; |
| Value *TC = nullptr; |
| auto IntrinsicMatch = m_Intrinsic<Intrinsic::experimental_get_vector_length>( |
| m_Value(RemTC), m_SpecificInt(VF), |
| /*Scalable=*/m_SpecificInt(1)); |
| for (PHINode &PN : BB->phis()) { |
| if (&PN == IndVar) |
| continue; |
| |
| // Check 1: it has to contain both incoming (init) & backedge blocks |
| // from IndVar. |
| if (PN.getBasicBlockIndex(InitBlock) < 0 || |
| PN.getBasicBlockIndex(BackEdgeBlock) < 0) |
| continue; |
| // Check 2: EVL index is always increasing, thus its inital value has to be |
| // equal to either the initial IV value (when the canonical IV is also |
| // increasing) or the last IV value (when canonical IV is decreasing). |
| Value *Init = PN.getIncomingValueForBlock(InitBlock); |
| using Direction = Loop::LoopBounds::Direction; |
| switch (Bounds->getDirection()) { |
| case Direction::Increasing: |
| if (Init != CanonicalIVInit) |
| continue; |
| break; |
| case Direction::Decreasing: |
| if (Init != CanonicalIVFinal) |
| continue; |
| break; |
| case Direction::Unknown: |
| // To be more permissive and see if either the initial or final IV value |
| // matches PN's init value. |
| if (Init != CanonicalIVInit && Init != CanonicalIVFinal) |
| continue; |
| break; |
| } |
| Value *RecValue = PN.getIncomingValueForBlock(BackEdgeBlock); |
| assert(RecValue && "expect recurrent IndVar value"); |
| |
| LLVM_DEBUG(dbgs() << "Found candidate PN of EVL-based IndVar: " << PN |
| << "\n"); |
| |
| // Check 3: Pattern match to find the EVL-based index and total trip count |
| // (TC). |
| if (match(RecValue, |
| m_c_Add(m_ZExtOrSelf(IntrinsicMatch), m_Specific(&PN))) && |
| match(RemTC, m_Sub(m_Value(TC), m_Specific(&PN)))) { |
| EVLIndVar = RecValue; |
| break; |
| } |
| } |
| |
| if (!EVLIndVar || !TC) |
| return false; |
| |
| LLVM_DEBUG(dbgs() << "Using " << *EVLIndVar << " for EVL-based IndVar\n"); |
| if (ORE) { |
| ORE->emit([&]() { |
| DebugLoc DL; |
| BasicBlock *Region = nullptr; |
| if (auto *I = dyn_cast<Instruction>(EVLIndVar)) { |
| DL = I->getDebugLoc(); |
| Region = I->getParent(); |
| } else { |
| DL = L.getStartLoc(); |
| Region = L.getHeader(); |
| } |
| return OptimizationRemark(DEBUG_TYPE, "UseEVLIndVar", DL, Region) |
| << "Using " << ore::NV("EVLIndVar", EVLIndVar) |
| << " for EVL-based IndVar"; |
| }); |
| } |
| |
| // Create an EVL-based comparison and replace the branch to use it as |
| // predicate. |
| |
| // Loop::getLatchCmpInst check at the beginning of this function has ensured |
| // that latch block ends in a conditional branch. |
| auto *LatchBranch = cast<BranchInst>(LatchBlock->getTerminator()); |
| assert(LatchBranch->isConditional() && |
| "expect the loop latch to be ended with a conditional branch"); |
| ICmpInst::Predicate Pred; |
| if (LatchBranch->getSuccessor(0) == L.getHeader()) |
| Pred = ICmpInst::ICMP_NE; |
| else |
| Pred = ICmpInst::ICMP_EQ; |
| |
| IRBuilder<> Builder(OrigLatchCmp); |
| auto *NewLatchCmp = Builder.CreateICmp(Pred, EVLIndVar, TC); |
| OrigLatchCmp->replaceAllUsesWith(NewLatchCmp); |
| |
| // llvm::RecursivelyDeleteDeadPHINode only deletes cycles whose values are |
| // not used outside the cycles. However, in this case the now-RAUW-ed |
| // OrigLatchCmp will be considered a use outside the cycle while in reality |
| // it's practically dead. Thus we need to remove it before calling |
| // RecursivelyDeleteDeadPHINode. |
| (void)RecursivelyDeleteTriviallyDeadInstructions(OrigLatchCmp); |
| if (llvm::RecursivelyDeleteDeadPHINode(IndVar)) |
| LLVM_DEBUG(dbgs() << "Removed original IndVar\n"); |
| |
| ++NumEliminatedCanonicalIV; |
| |
| return true; |
| } |
| |
| PreservedAnalyses EVLIndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &LAM, |
| LoopStandardAnalysisResults &AR, |
| LPMUpdater &U) { |
| Function &F = *L.getHeader()->getParent(); |
| auto &FAMProxy = LAM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR); |
| OptimizationRemarkEmitter *ORE = |
| FAMProxy.getCachedResult<OptimizationRemarkEmitterAnalysis>(F); |
| |
| if (EVLIndVarSimplifyImpl(AR, ORE).run(L)) |
| return PreservedAnalyses::allInSet<CFGAnalyses>(); |
| return PreservedAnalyses::all(); |
| } |