llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp - llvm-project - Git at Google

 //===- LoopVectorizationPlanner.cpp - VF selection and planning -----------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
 /// \file
 /// This file implements VFSelectionContext methods for loop vectorization
 /// VF selection, independent of cost-modeling decisions.
 ///
 //===----------------------------------------------------------------------===//

 #include "LoopVectorizationPlanner.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"

 using namespace llvm;
 using namespace LoopVectorizationUtils;

 #define DEBUG_TYPE "loop-vectorize"

 extern cl::opt<bool> VPlanBuildOuterloopStressTest;

 static cl::opt<bool> MaximizeBandwidth(
     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
     cl::desc("Maximize bandwidth when selecting vectorization factor which "
              "will be determined by the smallest type in loop."));

 static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
     "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
     cl::Hidden,
     cl::desc("Try wider VFs if they enable the use of vector variants"));

 static cl::opt<bool> ConsiderRegPressure(
     "vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden,
     cl::desc("Discard VFs if their register pressure is too high."));

 static cl::opt<bool> ForceTargetSupportsScalableVectors(
     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
     cl::desc(
         "Pretend that scalable vectors are supported, even if the target does "
         "not support them. This flag should only be used for testing."));

 cl::opt<bool> llvm::PreferInLoopReductions(
     "prefer-inloop-reductions", cl::init(false), cl::Hidden,
     cl::desc("Prefer in-loop vector reductions, "
              "overriding the targets preference."));

 /// Note: This currently only applies to `llvm.masked.load` and
 /// `llvm.masked.store`. TODO: Extend this to cover other operations as needed.
 static cl::opt<bool> ForceTargetSupportsMaskedMemoryOps(
     "force-target-supports-masked-memory-ops", cl::init(false), cl::Hidden,
     cl::desc("Assume the target supports masked memory operations (used for "
              "testing)."));

 static cl::opt<bool> ForceTargetSupportsGatherScatterOps(
     "force-target-supports-gather-scatter-ops", cl::init(false), cl::Hidden,
     cl::desc("Assume the target supports gather/scatter operations (used for "
              "testing)."));

 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
 /// is passed, the message relates to that particular instruction.
 #ifndef NDEBUG
 static void debugVectorizationMessage(const StringRef Prefix,
                                       const StringRef DebugMsg,
                                       Instruction *I) {
   dbgs() << "LV: " << Prefix << DebugMsg;
   if (I != nullptr)
     dbgs() << " " << *I;
   else
     dbgs() << '.';
   dbgs() << '\n';
 }
 #endif

 /// Create an analysis remark that explains why vectorization failed
 /// \p RemarkName is the identifier for the remark.  If \p I is passed it is an
 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
 /// the location of the remark. If \p DL is passed, use it as debug location for
 /// the remark. \return the remark object that can be streamed to.
 static OptimizationRemarkAnalysis createLVAnalysis(StringRef RemarkName,
                                                    const Loop *TheLoop,
                                                    Instruction *I,
                                                    DebugLoc DL = {}) {
   BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
   // If debug location is attached to the instruction, use it. Otherwise if DL
   // was not provided, use the loop's.
   if (I && I->getDebugLoc())
     DL = I->getDebugLoc();
   else if (!DL)
     DL = TheLoop->getStartLoc();

   return OptimizationRemarkAnalysis(DEBUG_TYPE, RemarkName, DL, CodeRegion);
 }

 void LoopVectorizationUtils::reportVectorizationFailure(
     const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag,
     OptimizationRemarkEmitter *ORE, const Loop *TheLoop, Instruction *I) {
   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
   ORE->emit(createLVAnalysis(ORETag, TheLoop, I)
             << "loop not vectorized: " << OREMsg);
 }

 void LoopVectorizationUtils::reportVectorizationInfo(
     const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE,
     const Loop *TheLoop, Instruction *I, DebugLoc DL) {
   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
   ORE->emit(createLVAnalysis(ORETag, TheLoop, I, DL) << Msg);
 }

 void LoopVectorizationUtils::reportVectorization(OptimizationRemarkEmitter *ORE,
                                                  Loop *TheLoop,
                                                  ElementCount VFWidth,
                                                  unsigned IC) {
   LLVM_DEBUG(debugVectorizationMessage(
       "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
       nullptr));
   StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
   ORE->emit([&]() {
     return OptimizationRemark(DEBUG_TYPE, "Vectorized", TheLoop->getStartLoc(),
                               TheLoop->getHeader())
            << "vectorized " << LoopType << "loop (vectorization width: "
            << ore::NV("VectorizationFactor", VFWidth)
            << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
   });
 }

 bool VFSelectionContext::isLegalMaskedLoadOrStore(Instruction *I,
                                                   ElementCount VF) const {
   assert(isa<LoadInst>(I) || isa<StoreInst>(I));
   auto *Ty = getLoadStoreType(I);
   const unsigned AS = getLoadStoreAddressSpace(I);
   const Align Alignment = getLoadStoreAlignment(I);

   return ForceTargetSupportsMaskedMemoryOps ||
          (isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment, AS)
                            : TTI.isLegalMaskedStore(Ty, Alignment, AS));
 }

 bool VFSelectionContext::isLegalGatherOrScatter(Value *V,
                                                 ElementCount VF) const {
   bool LI = isa<LoadInst>(V);
   bool SI = isa<StoreInst>(V);
   if (!LI && !SI)
     return false;
   auto *Ty = getLoadStoreType(V);
   Align Align = getLoadStoreAlignment(V);
   if (VF.isVector())
     Ty = VectorType::get(Ty, VF);
   return ForceTargetSupportsGatherScatterOps ||
          (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
          (SI && TTI.isLegalMaskedScatter(Ty, Align));
 }

 bool VFSelectionContext::supportsScalableVectors() const {
   return TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors;
 }

 bool VFSelectionContext::useMaxBandwidth(bool IsScalable) const {
   TargetTransformInfo::RegisterKind RegKind =
       IsScalable ? TargetTransformInfo::RGK_ScalableVector
                  : TargetTransformInfo::RGK_FixedWidthVector;
   return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
                                (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
                                 (UseWiderVFIfCallVariantsPresent &&
                                  Legal->hasVectorCallVariants())));
 }

 bool VFSelectionContext::shouldConsiderRegPressureForVF(ElementCount VF) const {
   if (ConsiderRegPressure.getNumOccurrences())
     return ConsiderRegPressure;

   // TODO: We should eventually consider register pressure for all targets. The
   // TTI hook is temporary whilst target-specific issues are being fixed.
   if (TTI.shouldConsiderVectorizationRegPressure())
     return true;

   if (!useMaxBandwidth(VF.isScalable()))
     return false;
   // Only calculate register pressure for VFs enabled by MaxBandwidth.
   return ElementCount::isKnownGT(
       VF, VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
                           : MaxPermissibleVFWithoutMaxBW.FixedVF);
 }

 ElementCount VFSelectionContext::clampVFByMaxTripCount(
     ElementCount VF, unsigned MaxTripCount, unsigned UserIC,
     bool FoldTailByMasking, bool RequiresScalarEpilogue) const {
   unsigned EstimatedVF = VF.getKnownMinValue();
   if (VF.isScalable() && F.hasFnAttribute(Attribute::VScaleRange)) {
     auto Attr = F.getFnAttribute(Attribute::VScaleRange);
     auto Min = Attr.getVScaleRangeMin();
     EstimatedVF *= Min;
   }

   // When a scalar epilogue is required, at least one iteration of the scalar
   // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
   // max VF that results in a dead vector loop.
   if (MaxTripCount > 0 && RequiresScalarEpilogue)
     MaxTripCount -= 1;

   // When the user specifies an interleave count, we need to ensure that
   // VF * UserIC <= MaxTripCount to avoid a dead vector loop.
   unsigned IC = UserIC > 0 ? UserIC : 1;
   unsigned EstimatedVFTimesIC = EstimatedVF * IC;

   if (MaxTripCount && MaxTripCount <= EstimatedVFTimesIC &&
       (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
     // If upper bound loop trip count (TC) is known at compile time there is no
     // point in choosing VF greater than TC / IC (as done in the loop below).
     // Select maximum power of two which doesn't exceed TC / IC. If VF is
     // scalable, we only fall back on a fixed VF when the TC is less than or
     // equal to the known number of lanes.
     auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount / IC);
     if (ClampedUpperTripCount == 0)
       ClampedUpperTripCount = 1;
     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
                          "exceeding the constant trip count"
                       << (UserIC > 0 ? " divided by UserIC" : "") << ": "
                       << ClampedUpperTripCount << "\n");
     return ElementCount::get(ClampedUpperTripCount,
                              FoldTailByMasking ? VF.isScalable() : false);
   }
   return VF;
 }

 ElementCount VFSelectionContext::getMaximizedVFForTarget(
     unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
     ElementCount MaxSafeVF, unsigned UserIC, bool FoldTailByMasking,
     bool RequiresScalarEpilogue) {
   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
   const TypeSize WidestRegister = TTI.getRegisterBitWidth(
       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                            : TargetTransformInfo::RGK_FixedWidthVector);

   // Convenience function to return the minimum of two ElementCounts.
   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
     assert((LHS.isScalable() == RHS.isScalable()) &&
            "Scalable flags must match");
     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
   };

   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
   // Note that both WidestRegister and WidestType may not be a powers of 2.
   auto MaxVectorElementCount = ElementCount::get(
       llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
       ComputeScalableMaxVF);
   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                     << (MaxVectorElementCount * WidestType) << " bits.\n");

   if (!MaxVectorElementCount) {
     LLVM_DEBUG(dbgs() << "LV: The target has no "
                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
                       << " vector registers.\n");
     return ElementCount::getFixed(1);
   }

   ElementCount MaxVF =
       clampVFByMaxTripCount(MaxVectorElementCount, MaxTripCount, UserIC,
                             FoldTailByMasking, RequiresScalarEpilogue);
   // If the MaxVF was already clamped, there's no point in trying to pick a
   // larger one.
   if (MaxVF != MaxVectorElementCount)
     return MaxVF;

   if (MaxVF.isScalable())
     MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
   else
     MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF;

   if (useMaxBandwidth(ComputeScalableMaxVF)) {
     auto MaxVectorElementCountMaxBW = ElementCount::get(
         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
         ComputeScalableMaxVF);
     MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);

     if (ElementCount MinVF =
             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                           << ") with target's minimum: " << MinVF << '\n');
         MaxVF = MinVF;
       }
     }

     MaxVF = clampVFByMaxTripCount(MaxVF, MaxTripCount, UserIC,
                                   FoldTailByMasking, RequiresScalarEpilogue);
   }
   return MaxVF;
 }

 std::optional<unsigned> llvm::getMaxVScale(const Function &F,
                                            const TargetTransformInfo &TTI) {
   if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
     return MaxVScale;

   if (F.hasFnAttribute(Attribute::VScaleRange))
     return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();

   return std::nullopt;
 }

 bool VFSelectionContext::isScalableVectorizationAllowed() {
   if (IsScalableVectorizationAllowed)
     return *IsScalableVectorizationAllowed;

   IsScalableVectorizationAllowed = false;
   if (!supportsScalableVectors())
     return false;

   if (Hints->isScalableVectorizationDisabled()) {
     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
                             "ScalableVectorizationDisabled", ORE, TheLoop);
     return false;
   }

   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");

   auto MaxScalableVF = ElementCount::getScalable(
       std::numeric_limits<ElementCount::ScalarTy>::max());

   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
   // FIXME: While for scalable vectors this is currently sufficient, this should
   // be replaced by a more detailed mechanism that filters out specific VFs,
   // instead of invalidating vectorization for a whole set of VFs based on the
   // MaxVF.

   // Disable scalable vectorization if the loop contains unsupported reductions.
   if (!all_of(Legal->getReductionVars(), [&](const auto &Reduction) -> bool {
         return TTI.isLegalToVectorizeReduction(Reduction.second, MaxScalableVF);
       })) {
     reportVectorizationInfo(
         "Scalable vectorization not supported for the reduction "
         "operations found in this loop.",
         "ScalableVFUnfeasible", ORE, TheLoop);
     return false;
   }

   // Disable scalable vectorization if the loop contains any instructions
   // with element types not supported for scalable vectors.
   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
         return !Ty->isVoidTy() && !TTI.isElementTypeLegalForScalableVector(Ty);
       })) {
     reportVectorizationInfo("Scalable vectorization is not supported "
                             "for all element types found in this loop.",
                             "ScalableVFUnfeasible", ORE, TheLoop);
     return false;
   }

   if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(F, TTI)) {
     reportVectorizationInfo("The target does not provide maximum vscale value "
                             "for safe distance analysis.",
                             "ScalableVFUnfeasible", ORE, TheLoop);
     return false;
   }

   IsScalableVectorizationAllowed = true;
   return true;
 }

 ElementCount
 VFSelectionContext::getMaxLegalScalableVF(unsigned MaxSafeElements) {
   if (!isScalableVectorizationAllowed())
     return ElementCount::getScalable(0);

   auto MaxScalableVF = ElementCount::getScalable(
       std::numeric_limits<ElementCount::ScalarTy>::max());
   if (Legal->isSafeForAnyVectorWidth())
     return MaxScalableVF;

   std::optional<unsigned> MaxVScale = getMaxVScale(F, TTI);
   // Limit MaxScalableVF by the maximum safe dependence distance.
   MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);

   if (!MaxScalableVF)
     reportVectorizationInfo(
         "Max legal vector width too small, scalable vectorization "
         "unfeasible.",
         "ScalableVFUnfeasible", ORE, TheLoop);

   return MaxScalableVF;
 }

 FixedScalableVFPair VFSelectionContext::computeFeasibleMaxVF(
     unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC,
     bool FoldTailByMasking, bool RequiresScalarEpilogue) {
   auto [SmallestType, WidestType] = getSmallestAndWidestTypes();

   // Get the maximum safe dependence distance in bits computed by LAA.
   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
   // the memory accesses that is most restrictive (involved in the smallest
   // dependence distance).
   unsigned MaxSafeElementsPowerOf2 =
       llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
   if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
     unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
     MaxSafeElementsPowerOf2 =
         std::min(MaxSafeElementsPowerOf2, SLDist / WidestType);
   }

   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElementsPowerOf2);
   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElementsPowerOf2);

   if (!Legal->isSafeForAnyVectorWidth())
     MaxSafeElements = MaxSafeElementsPowerOf2;

   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                     << ".\n");
   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
                     << ".\n");

   // First analyze the UserVF, fall back if the UserVF should be ignored.
   if (UserVF) {
     auto MaxSafeUserVF =
         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;

     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
       // If `VF=vscale x N` is safe, then so is `VF=N`
       if (UserVF.isScalable())
         return FixedScalableVFPair(
             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);

       return UserVF;
     }

     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));

     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
     // is better to ignore the hint and let the compiler choose a suitable VF.
     if (!UserVF.isScalable()) {
       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                         << " is unsafe, clamping to max safe VF="
                         << MaxSafeFixedVF << ".\n");
       ORE->emit([&]() {
         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                           TheLoop->getStartLoc(),
                                           TheLoop->getHeader())
                << "User-specified vectorization factor "
                << ore::NV("UserVectorizationFactor", UserVF)
                << " is unsafe, clamping to maximum safe vectorization factor "
                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
       });
       return MaxSafeFixedVF;
     }

     if (!supportsScalableVectors()) {
       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                         << " is ignored because scalable vectors are not "
                            "available.\n");
       ORE->emit([&]() {
         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                           TheLoop->getStartLoc(),
                                           TheLoop->getHeader())
                << "User-specified vectorization factor "
                << ore::NV("UserVectorizationFactor", UserVF)
                << " is ignored because the target does not support scalable "
                   "vectors. The compiler will pick a more suitable value.";
       });
     } else {
       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                         << " is unsafe. Ignoring scalable UserVF.\n");
       ORE->emit([&]() {
         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                           TheLoop->getStartLoc(),
                                           TheLoop->getHeader())
                << "User-specified vectorization factor "
                << ore::NV("UserVectorizationFactor", UserVF)
                << " is unsafe. Ignoring the hint to let the compiler pick a "
                   "more suitable value.";
       });
     }
   }

   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                     << " / " << WidestType << " bits.\n");

   FixedScalableVFPair Result(ElementCount::getFixed(1),
                              ElementCount::getScalable(0));
   if (auto MaxVF = getMaximizedVFForTarget(
           MaxTripCount, SmallestType, WidestType, MaxSafeFixedVF, UserIC,
           FoldTailByMasking, RequiresScalarEpilogue))
     Result.FixedVF = MaxVF;

   if (auto MaxVF = getMaximizedVFForTarget(
           MaxTripCount, SmallestType, WidestType, MaxSafeScalableVF, UserIC,
           FoldTailByMasking, RequiresScalarEpilogue))
     if (MaxVF.isScalable()) {
       Result.ScalableVF = MaxVF;
       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                         << "\n");
     }

   return Result;
 }

 std::pair<unsigned, unsigned>
 VFSelectionContext::getSmallestAndWidestTypes() const {
   unsigned MinWidth = -1U;
   unsigned MaxWidth = 8;
   const DataLayout &DL = F.getDataLayout();
   // For in-loop reductions, no element types are added to ElementTypesInLoop
   // if there are no loads/stores in the loop. In this case, check through the
   // reduction variables to determine the maximum width.
   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
     for (const auto &[_, RdxDesc] : Legal->getReductionVars()) {
       // When finding the min width used by the recurrence we need to account
       // for casts on the input operands of the recurrence.
       MinWidth = std::min(
           MinWidth,
           std::min(RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
                    RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
       MaxWidth = std::max(MaxWidth,
                           RdxDesc.getRecurrenceType()->getScalarSizeInBits());
     }
   } else {
     for (Type *T : ElementTypesInLoop) {
       MinWidth = std::min<unsigned>(
           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
       MaxWidth = std::max<unsigned>(
           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
     }
   }
   return {MinWidth, MaxWidth};
 }

 void VFSelectionContext::collectElementTypesForWidening(
     const SmallPtrSetImpl<const Value *> *ValuesToIgnore) {
   ElementTypesInLoop.clear();
   // For each block.
   for (BasicBlock *BB : TheLoop->blocks()) {
     // For each instruction in the loop.
     for (Instruction &I : *BB) {
       Type *T = I.getType();

       // Skip ignored values.
       if (ValuesToIgnore && ValuesToIgnore->contains(&I))
         continue;

       // Only examine Loads, Stores and PHINodes.
       if (!isa<LoadInst, StoreInst, PHINode>(I))
         continue;

       // Examine PHI nodes that are reduction variables. Update the type to
       // account for the recurrence type.
       if (auto *PN = dyn_cast<PHINode>(&I)) {
         if (!Legal->isReductionVariable(PN))
           continue;
         const RecurrenceDescriptor &RdxDesc =
             Legal->getRecurrenceDescriptor(PN);
         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
             TTI.preferInLoopReduction(RdxDesc.getRecurrenceKind(),
                                       RdxDesc.getRecurrenceType()))
           continue;
         T = RdxDesc.getRecurrenceType();
       }

       // Examine the stored values.
       if (auto *ST = dyn_cast<StoreInst>(&I))
         T = ST->getValueOperand()->getType();

       assert(T->isSized() &&
              "Expected the load/store/recurrence type to be sized");

       ElementTypesInLoop.insert(T);
     }
   }
 }

 void VFSelectionContext::initializeVScaleForTuning() {
   if (!supportsScalableVectors())
     return;

   if (F.hasFnAttribute(Attribute::VScaleRange)) {
     auto Attr = F.getFnAttribute(Attribute::VScaleRange);
     auto Min = Attr.getVScaleRangeMin();
     auto Max = Attr.getVScaleRangeMax();
     if (Max && Min == Max) {
       VScaleForTuning = Max;
       return;
     }
   }

   VScaleForTuning = TTI.getVScaleForTuning();
 }

 bool VFSelectionContext::useOrderedReductions(
     const RecurrenceDescriptor &RdxDesc) const {
   return !Hints->allowReordering() && RdxDesc.isOrdered();
 }

 bool VFSelectionContext::runtimeChecksRequired() {
   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");

   Loop *L = const_cast<Loop *>(TheLoop);
   if (Legal->getRuntimePointerChecking()->Need) {
     reportVectorizationFailure(
         "Runtime ptr check is required with -Os/-Oz",
         "runtime pointer checks needed. Enable vectorization of this "
         "loop with '#pragma clang loop vectorize(enable)' when "
         "compiling with -Os/-Oz",
         "CantVersionLoopWithOptForSize", ORE, L);
     return true;
   }

   if (!PSE.getPredicate().isAlwaysTrue()) {
     reportVectorizationFailure(
         "Runtime SCEV check is required with -Os/-Oz",
         "runtime SCEV checks needed. Enable vectorization of this "
         "loop with '#pragma clang loop vectorize(enable)' when "
         "compiling with -Os/-Oz",
         "CantVersionLoopWithOptForSize", ORE, L);
     return true;
   }

   // FIXME: Avoid specializing for stride==1 instead of bailing out.
   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
     reportVectorizationFailure(
         "Runtime stride check for small trip count",
         "runtime stride == 1 checks needed. Enable vectorization of "
         "this loop without such check by compiling with -Os/-Oz",
         "CantVersionLoopWithOptForSize", ORE, L);
     return true;
   }

   return false;
 }

 void VFSelectionContext::computeMinimalBitwidths() {
   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
 }

 void VFSelectionContext::collectInLoopReductions() {
   // Avoid duplicating work finding in-loop reductions.
   if (!InLoopReductions.empty())
     return;

   for (const auto &Reduction : Legal->getReductionVars()) {
     PHINode *Phi = Reduction.first;
     const RecurrenceDescriptor &RdxDesc = Reduction.second;

     // Multi-use reductions (e.g., used in FindLastIV patterns) are handled
     // separately and should not be considered for in-loop reductions.
     if (RdxDesc.hasUsesOutsideReductionChain())
       continue;

     // We don't collect reductions that are type promoted (yet).
     if (RdxDesc.getRecurrenceType() != Phi->getType())
       continue;

     // In-loop AnyOf and FindIV reductions are not yet supported.
     RecurKind Kind = RdxDesc.getRecurrenceKind();
     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) ||
         RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) ||
         RecurrenceDescriptor::isFindLastRecurrenceKind(Kind))
       continue;

     // If the target would prefer this reduction to happen "in-loop", then we
     // want to record it as such.
     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
         !TTI.preferInLoopReduction(Kind, Phi->getType()))
       continue;

     // Check that we can correctly put the reductions into the loop, by
     // finding the chain of operations that leads from the phi to the loop
     // exit value.
     SmallVector<Instruction *, 4> ReductionOperations =
         RdxDesc.getReductionOpChain(Phi, const_cast<Loop *>(TheLoop));
     bool InLoop = !ReductionOperations.empty();

     if (InLoop) {
       InLoopReductions.insert(Phi);
       // Add the elements to InLoopReductionImmediateChains for cost modelling.
       Instruction *LastChain = Phi;
       for (auto *I : ReductionOperations) {
         InLoopReductionImmediateChains[I] = LastChain;
         LastChain = I;
       }
     }
     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                       << " reduction for phi: " << *Phi << "\n");
   }
 }

 bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
                                                 const VectorizationFactor &B,
                                                 const unsigned MaxTripCount,
                                                 bool HasTail,
                                                 bool IsEpilogue) const {
   InstructionCost CostA = A.Cost;
   InstructionCost CostB = B.Cost;

   // When there is a hint to always prefer scalable vectors, honour that hint.
   if (Hints.isScalableVectorizationAlwaysPreferred())
     if (A.Width.isScalable() && CostA.isValid() && !B.Width.isScalable() &&
         !B.Width.isScalar())
       return true;

   // Improve estimate for the vector width if it is scalable.
   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
   if (std::optional<unsigned> VScale = Config.getVScaleForTuning()) {
     if (A.Width.isScalable())
       EstimatedWidthA *= *VScale;
     if (B.Width.isScalable())
       EstimatedWidthB *= *VScale;
   }

   // When optimizing for size choose whichever is smallest, which will be the
   // one with the smallest cost for the whole loop. On a tie pick the larger
   // vector width, on the assumption that throughput will be greater.
   if (Config.CostKind == TTI::TCK_CodeSize)
     return CostA < CostB ||
            (CostA == CostB && EstimatedWidthA > EstimatedWidthB);

   // Assume vscale may be larger than 1 (or the value being tuned for),
   // so that scalable vectorization is slightly favorable over fixed-width
   // vectorization.
   bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) &&
                         A.Width.isScalable() && !B.Width.isScalable();

   auto CmpFn = [PreferScalable](const InstructionCost &LHS,
                                 const InstructionCost &RHS) {
     return PreferScalable ? LHS <= RHS : LHS < RHS;
   };

   // To avoid the need for FP division:
   //      (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
   // <=>  (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
   bool LowerCostWithoutTC =
       CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
   if (!MaxTripCount)
     return LowerCostWithoutTC;

   auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
                                               InstructionCost VectorCost,
                                               InstructionCost ScalarCost) {
     // If the trip count is a known (possibly small) constant, the trip count
     // will be rounded up to an integer number of iterations under
     // FoldTailByMasking. The total cost in that case will be
     // VecCost*ceil(TripCount/VF). When not folding the tail, the total
     // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
     // some extra overheads, but for the purpose of comparing the costs of
     // different VFs we can use this to compare the total loop-body cost
     // expected after vectorization.
     if (HasTail)
       return VectorCost * (MaxTripCount / VF) +
              ScalarCost * (MaxTripCount % VF);
     return VectorCost * divideCeil(MaxTripCount, VF);
   };

   auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
   auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
   bool LowerCostWithTC = CmpFn(RTCostA, RTCostB);
   LLVM_DEBUG(if (LowerCostWithTC != LowerCostWithoutTC) {
     dbgs() << "LV: VF " << (LowerCostWithTC ? A.Width : B.Width)
            << " has lower cost than VF "
            << (LowerCostWithTC ? B.Width : A.Width)
            << " when taking the cost of the remaining scalar loop iterations "
               "into consideration for a maximum trip count of "
            << MaxTripCount << ".\n";
   });
   return LowerCostWithTC;
 }

 bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
                                                 const VectorizationFactor &B,
                                                 bool HasTail,
                                                 bool IsEpilogue) const {
   const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
   return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
                                                     IsEpilogue);
 }

 // TODO: we could return a pair of values that specify the max VF and
 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
 // doesn't have a cost model that can choose which plan to execute if
 // more than one is generated.
 FixedScalableVFPair
 VFSelectionContext::computeVPlanOuterloopVF(ElementCount UserVF) {
   if (UserVF.isScalable() && !supportsScalableVectors()) {
     reportVectorizationFailure(
         "Scalable vectorization requested but not supported by the target",
         "the scalable user-specified vectorization width for outer-loop "
         "vectorization cannot be used because the target does not support "
         "scalable vectors.",
         "ScalableVFUnfeasible", ORE, TheLoop);
     return FixedScalableVFPair::getNone();
   }

   ElementCount VF = UserVF;
   if (VF.isZero()) {
     auto [_, WidestType] = getSmallestAndWidestTypes();

     auto RegKind = TTI.enableScalableVectorization()
                        ? TargetTransformInfo::RGK_ScalableVector
                        : TargetTransformInfo::RGK_FixedWidthVector;

     TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
     unsigned N = RegSize.getKnownMinValue() / WidestType;
     VF = ElementCount::get(N, RegSize.isScalable());
     LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

     // Make sure we have a VF > 1 for stress testing.
     if (VPlanBuildOuterloopStressTest && VF.isScalar()) {
       LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                         << "overriding computed VF.\n");
       VF = ElementCount::getFixed(4);
     }
   }
   assert(isPowerOf2_32(VF.getKnownMinValue()) &&
          "VF needs to be a power of two");
   if (VF.isScalar())
     return FixedScalableVFPair::getNone();
   LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                     << "VF " << VF << " to build VPlans.\n");
   return FixedScalableVFPair(VF);
 }