| //===- LoopVectorizationPlanner.cpp - VF selection and planning -----------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| /// |
| /// \file |
| /// This file implements VFSelectionContext methods for loop vectorization |
| /// VF selection, independent of cost-modeling decisions. |
| /// |
| //===----------------------------------------------------------------------===// |
| |
| #include "LoopVectorizationPlanner.h" |
| #include "llvm/Analysis/LoopInfo.h" |
| #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| #include "llvm/Analysis/ScalarEvolution.h" |
| #include "llvm/IR/DiagnosticInfo.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/MathExtras.h" |
| #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" |
| #include "llvm/Transforms/Vectorize/LoopVectorize.h" |
| |
| using namespace llvm; |
| using namespace LoopVectorizationUtils; |
| |
| #define DEBUG_TYPE "loop-vectorize" |
| |
| extern cl::opt<bool> VPlanBuildOuterloopStressTest; |
| |
| static cl::opt<bool> MaximizeBandwidth( |
| "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, |
| cl::desc("Maximize bandwidth when selecting vectorization factor which " |
| "will be determined by the smallest type in loop.")); |
| |
| static cl::opt<bool> UseWiderVFIfCallVariantsPresent( |
| "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), |
| cl::Hidden, |
| cl::desc("Try wider VFs if they enable the use of vector variants")); |
| |
| static cl::opt<bool> ConsiderRegPressure( |
| "vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden, |
| cl::desc("Discard VFs if their register pressure is too high.")); |
| |
| static cl::opt<bool> ForceTargetSupportsScalableVectors( |
| "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, |
| cl::desc( |
| "Pretend that scalable vectors are supported, even if the target does " |
| "not support them. This flag should only be used for testing.")); |
| |
| cl::opt<bool> llvm::PreferInLoopReductions( |
| "prefer-inloop-reductions", cl::init(false), cl::Hidden, |
| cl::desc("Prefer in-loop vector reductions, " |
| "overriding the targets preference.")); |
| |
| /// Note: This currently only applies to `llvm.masked.load` and |
| /// `llvm.masked.store`. TODO: Extend this to cover other operations as needed. |
| static cl::opt<bool> ForceTargetSupportsMaskedMemoryOps( |
| "force-target-supports-masked-memory-ops", cl::init(false), cl::Hidden, |
| cl::desc("Assume the target supports masked memory operations (used for " |
| "testing).")); |
| |
| static cl::opt<bool> ForceTargetSupportsGatherScatterOps( |
| "force-target-supports-gather-scatter-ops", cl::init(false), cl::Hidden, |
| cl::desc("Assume the target supports gather/scatter operations (used for " |
| "testing).")); |
| |
| /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I |
| /// is passed, the message relates to that particular instruction. |
| #ifndef NDEBUG |
| static void debugVectorizationMessage(const StringRef Prefix, |
| const StringRef DebugMsg, |
| Instruction *I) { |
| dbgs() << "LV: " << Prefix << DebugMsg; |
| if (I != nullptr) |
| dbgs() << " " << *I; |
| else |
| dbgs() << '.'; |
| dbgs() << '\n'; |
| } |
| #endif |
| |
| /// Create an analysis remark that explains why vectorization failed |
| /// \p RemarkName is the identifier for the remark. If \p I is passed it is an |
| /// instruction that prevents vectorization. Otherwise \p TheLoop is used for |
| /// the location of the remark. If \p DL is passed, use it as debug location for |
| /// the remark. \return the remark object that can be streamed to. |
| static OptimizationRemarkAnalysis createLVAnalysis(StringRef RemarkName, |
| const Loop *TheLoop, |
| Instruction *I, |
| DebugLoc DL = {}) { |
| BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader(); |
| // If debug location is attached to the instruction, use it. Otherwise if DL |
| // was not provided, use the loop's. |
| if (I && I->getDebugLoc()) |
| DL = I->getDebugLoc(); |
| else if (!DL) |
| DL = TheLoop->getStartLoc(); |
| |
| return OptimizationRemarkAnalysis(DEBUG_TYPE, RemarkName, DL, CodeRegion); |
| } |
| |
| void LoopVectorizationUtils::reportVectorizationFailure( |
| const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, |
| OptimizationRemarkEmitter *ORE, const Loop *TheLoop, Instruction *I) { |
| LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); |
| ORE->emit(createLVAnalysis(ORETag, TheLoop, I) |
| << "loop not vectorized: " << OREMsg); |
| } |
| |
| void LoopVectorizationUtils::reportVectorizationInfo( |
| const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, |
| const Loop *TheLoop, Instruction *I, DebugLoc DL) { |
| LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); |
| ORE->emit(createLVAnalysis(ORETag, TheLoop, I, DL) << Msg); |
| } |
| |
| void LoopVectorizationUtils::reportVectorization(OptimizationRemarkEmitter *ORE, |
| Loop *TheLoop, |
| ElementCount VFWidth, |
| unsigned IC) { |
| LLVM_DEBUG(debugVectorizationMessage( |
| "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop", |
| nullptr)); |
| StringRef LoopType = TheLoop->isInnermost() ? "" : "outer "; |
| ORE->emit([&]() { |
| return OptimizationRemark(DEBUG_TYPE, "Vectorized", TheLoop->getStartLoc(), |
| TheLoop->getHeader()) |
| << "vectorized " << LoopType << "loop (vectorization width: " |
| << ore::NV("VectorizationFactor", VFWidth) |
| << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")"; |
| }); |
| } |
| |
| bool VFSelectionContext::isLegalMaskedLoadOrStore(Instruction *I, |
| ElementCount VF) const { |
| assert(isa<LoadInst>(I) || isa<StoreInst>(I)); |
| auto *Ty = getLoadStoreType(I); |
| const unsigned AS = getLoadStoreAddressSpace(I); |
| const Align Alignment = getLoadStoreAlignment(I); |
| |
| return ForceTargetSupportsMaskedMemoryOps || |
| (isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment, AS) |
| : TTI.isLegalMaskedStore(Ty, Alignment, AS)); |
| } |
| |
| bool VFSelectionContext::isLegalGatherOrScatter(Value *V, |
| ElementCount VF) const { |
| bool LI = isa<LoadInst>(V); |
| bool SI = isa<StoreInst>(V); |
| if (!LI && !SI) |
| return false; |
| auto *Ty = getLoadStoreType(V); |
| Align Align = getLoadStoreAlignment(V); |
| if (VF.isVector()) |
| Ty = VectorType::get(Ty, VF); |
| return ForceTargetSupportsGatherScatterOps || |
| (LI && TTI.isLegalMaskedGather(Ty, Align)) || |
| (SI && TTI.isLegalMaskedScatter(Ty, Align)); |
| } |
| |
| bool VFSelectionContext::supportsScalableVectors() const { |
| return TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors; |
| } |
| |
| bool VFSelectionContext::useMaxBandwidth(bool IsScalable) const { |
| TargetTransformInfo::RegisterKind RegKind = |
| IsScalable ? TargetTransformInfo::RGK_ScalableVector |
| : TargetTransformInfo::RGK_FixedWidthVector; |
| return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && |
| (TTI.shouldMaximizeVectorBandwidth(RegKind) || |
| (UseWiderVFIfCallVariantsPresent && |
| Legal->hasVectorCallVariants()))); |
| } |
| |
| bool VFSelectionContext::shouldConsiderRegPressureForVF(ElementCount VF) const { |
| if (ConsiderRegPressure.getNumOccurrences()) |
| return ConsiderRegPressure; |
| |
| // TODO: We should eventually consider register pressure for all targets. The |
| // TTI hook is temporary whilst target-specific issues are being fixed. |
| if (TTI.shouldConsiderVectorizationRegPressure()) |
| return true; |
| |
| if (!useMaxBandwidth(VF.isScalable())) |
| return false; |
| // Only calculate register pressure for VFs enabled by MaxBandwidth. |
| return ElementCount::isKnownGT( |
| VF, VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF |
| : MaxPermissibleVFWithoutMaxBW.FixedVF); |
| } |
| |
| ElementCount VFSelectionContext::clampVFByMaxTripCount( |
| ElementCount VF, unsigned MaxTripCount, unsigned UserIC, |
| bool FoldTailByMasking, bool RequiresScalarEpilogue) const { |
| unsigned EstimatedVF = VF.getKnownMinValue(); |
| if (VF.isScalable() && F.hasFnAttribute(Attribute::VScaleRange)) { |
| auto Attr = F.getFnAttribute(Attribute::VScaleRange); |
| auto Min = Attr.getVScaleRangeMin(); |
| EstimatedVF *= Min; |
| } |
| |
| // When a scalar epilogue is required, at least one iteration of the scalar |
| // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a |
| // max VF that results in a dead vector loop. |
| if (MaxTripCount > 0 && RequiresScalarEpilogue) |
| MaxTripCount -= 1; |
| |
| // When the user specifies an interleave count, we need to ensure that |
| // VF * UserIC <= MaxTripCount to avoid a dead vector loop. |
| unsigned IC = UserIC > 0 ? UserIC : 1; |
| unsigned EstimatedVFTimesIC = EstimatedVF * IC; |
| |
| if (MaxTripCount && MaxTripCount <= EstimatedVFTimesIC && |
| (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) { |
| // If upper bound loop trip count (TC) is known at compile time there is no |
| // point in choosing VF greater than TC / IC (as done in the loop below). |
| // Select maximum power of two which doesn't exceed TC / IC. If VF is |
| // scalable, we only fall back on a fixed VF when the TC is less than or |
| // equal to the known number of lanes. |
| auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount / IC); |
| if (ClampedUpperTripCount == 0) |
| ClampedUpperTripCount = 1; |
| LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " |
| "exceeding the constant trip count" |
| << (UserIC > 0 ? " divided by UserIC" : "") << ": " |
| << ClampedUpperTripCount << "\n"); |
| return ElementCount::get(ClampedUpperTripCount, |
| FoldTailByMasking ? VF.isScalable() : false); |
| } |
| return VF; |
| } |
| |
| ElementCount VFSelectionContext::getMaximizedVFForTarget( |
| unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType, |
| ElementCount MaxSafeVF, unsigned UserIC, bool FoldTailByMasking, |
| bool RequiresScalarEpilogue) { |
| bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); |
| const TypeSize WidestRegister = TTI.getRegisterBitWidth( |
| ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector |
| : TargetTransformInfo::RGK_FixedWidthVector); |
| |
| // Convenience function to return the minimum of two ElementCounts. |
| auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { |
| assert((LHS.isScalable() == RHS.isScalable()) && |
| "Scalable flags must match"); |
| return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; |
| }; |
| |
| // Ensure MaxVF is a power of 2; the dependence distance bound may not be. |
| // Note that both WidestRegister and WidestType may not be a powers of 2. |
| auto MaxVectorElementCount = ElementCount::get( |
| llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType), |
| ComputeScalableMaxVF); |
| MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); |
| LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " |
| << (MaxVectorElementCount * WidestType) << " bits.\n"); |
| |
| if (!MaxVectorElementCount) { |
| LLVM_DEBUG(dbgs() << "LV: The target has no " |
| << (ComputeScalableMaxVF ? "scalable" : "fixed") |
| << " vector registers.\n"); |
| return ElementCount::getFixed(1); |
| } |
| |
| ElementCount MaxVF = |
| clampVFByMaxTripCount(MaxVectorElementCount, MaxTripCount, UserIC, |
| FoldTailByMasking, RequiresScalarEpilogue); |
| // If the MaxVF was already clamped, there's no point in trying to pick a |
| // larger one. |
| if (MaxVF != MaxVectorElementCount) |
| return MaxVF; |
| |
| if (MaxVF.isScalable()) |
| MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF; |
| else |
| MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF; |
| |
| if (useMaxBandwidth(ComputeScalableMaxVF)) { |
| auto MaxVectorElementCountMaxBW = ElementCount::get( |
| llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType), |
| ComputeScalableMaxVF); |
| MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); |
| |
| if (ElementCount MinVF = |
| TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { |
| if (ElementCount::isKnownLT(MaxVF, MinVF)) { |
| LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF |
| << ") with target's minimum: " << MinVF << '\n'); |
| MaxVF = MinVF; |
| } |
| } |
| |
| MaxVF = clampVFByMaxTripCount(MaxVF, MaxTripCount, UserIC, |
| FoldTailByMasking, RequiresScalarEpilogue); |
| } |
| return MaxVF; |
| } |
| |
| std::optional<unsigned> llvm::getMaxVScale(const Function &F, |
| const TargetTransformInfo &TTI) { |
| if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) |
| return MaxVScale; |
| |
| if (F.hasFnAttribute(Attribute::VScaleRange)) |
| return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); |
| |
| return std::nullopt; |
| } |
| |
| bool VFSelectionContext::isScalableVectorizationAllowed() { |
| if (IsScalableVectorizationAllowed) |
| return *IsScalableVectorizationAllowed; |
| |
| IsScalableVectorizationAllowed = false; |
| if (!supportsScalableVectors()) |
| return false; |
| |
| if (Hints->isScalableVectorizationDisabled()) { |
| reportVectorizationInfo("Scalable vectorization is explicitly disabled", |
| "ScalableVectorizationDisabled", ORE, TheLoop); |
| return false; |
| } |
| |
| LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); |
| |
| auto MaxScalableVF = ElementCount::getScalable( |
| std::numeric_limits<ElementCount::ScalarTy>::max()); |
| |
| // Test that the loop-vectorizer can legalize all operations for this MaxVF. |
| // FIXME: While for scalable vectors this is currently sufficient, this should |
| // be replaced by a more detailed mechanism that filters out specific VFs, |
| // instead of invalidating vectorization for a whole set of VFs based on the |
| // MaxVF. |
| |
| // Disable scalable vectorization if the loop contains unsupported reductions. |
| if (!all_of(Legal->getReductionVars(), [&](const auto &Reduction) -> bool { |
| return TTI.isLegalToVectorizeReduction(Reduction.second, MaxScalableVF); |
| })) { |
| reportVectorizationInfo( |
| "Scalable vectorization not supported for the reduction " |
| "operations found in this loop.", |
| "ScalableVFUnfeasible", ORE, TheLoop); |
| return false; |
| } |
| |
| // Disable scalable vectorization if the loop contains any instructions |
| // with element types not supported for scalable vectors. |
| if (any_of(ElementTypesInLoop, [&](Type *Ty) { |
| return !Ty->isVoidTy() && !TTI.isElementTypeLegalForScalableVector(Ty); |
| })) { |
| reportVectorizationInfo("Scalable vectorization is not supported " |
| "for all element types found in this loop.", |
| "ScalableVFUnfeasible", ORE, TheLoop); |
| return false; |
| } |
| |
| if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(F, TTI)) { |
| reportVectorizationInfo("The target does not provide maximum vscale value " |
| "for safe distance analysis.", |
| "ScalableVFUnfeasible", ORE, TheLoop); |
| return false; |
| } |
| |
| IsScalableVectorizationAllowed = true; |
| return true; |
| } |
| |
| ElementCount |
| VFSelectionContext::getMaxLegalScalableVF(unsigned MaxSafeElements) { |
| if (!isScalableVectorizationAllowed()) |
| return ElementCount::getScalable(0); |
| |
| auto MaxScalableVF = ElementCount::getScalable( |
| std::numeric_limits<ElementCount::ScalarTy>::max()); |
| if (Legal->isSafeForAnyVectorWidth()) |
| return MaxScalableVF; |
| |
| std::optional<unsigned> MaxVScale = getMaxVScale(F, TTI); |
| // Limit MaxScalableVF by the maximum safe dependence distance. |
| MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale); |
| |
| if (!MaxScalableVF) |
| reportVectorizationInfo( |
| "Max legal vector width too small, scalable vectorization " |
| "unfeasible.", |
| "ScalableVFUnfeasible", ORE, TheLoop); |
| |
| return MaxScalableVF; |
| } |
| |
| FixedScalableVFPair VFSelectionContext::computeFeasibleMaxVF( |
| unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC, |
| bool FoldTailByMasking, bool RequiresScalarEpilogue) { |
| auto [SmallestType, WidestType] = getSmallestAndWidestTypes(); |
| |
| // Get the maximum safe dependence distance in bits computed by LAA. |
| // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from |
| // the memory accesses that is most restrictive (involved in the smallest |
| // dependence distance). |
| unsigned MaxSafeElementsPowerOf2 = |
| llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); |
| if (!Legal->isSafeForAnyStoreLoadForwardDistances()) { |
| unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits(); |
| MaxSafeElementsPowerOf2 = |
| std::min(MaxSafeElementsPowerOf2, SLDist / WidestType); |
| } |
| |
| auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElementsPowerOf2); |
| auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElementsPowerOf2); |
| |
| if (!Legal->isSafeForAnyVectorWidth()) |
| MaxSafeElements = MaxSafeElementsPowerOf2; |
| |
| LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF |
| << ".\n"); |
| LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF |
| << ".\n"); |
| |
| // First analyze the UserVF, fall back if the UserVF should be ignored. |
| if (UserVF) { |
| auto MaxSafeUserVF = |
| UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; |
| |
| if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { |
| // If `VF=vscale x N` is safe, then so is `VF=N` |
| if (UserVF.isScalable()) |
| return FixedScalableVFPair( |
| ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); |
| |
| return UserVF; |
| } |
| |
| assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); |
| |
| // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it |
| // is better to ignore the hint and let the compiler choose a suitable VF. |
| if (!UserVF.isScalable()) { |
| LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF |
| << " is unsafe, clamping to max safe VF=" |
| << MaxSafeFixedVF << ".\n"); |
| ORE->emit([&]() { |
| return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", |
| TheLoop->getStartLoc(), |
| TheLoop->getHeader()) |
| << "User-specified vectorization factor " |
| << ore::NV("UserVectorizationFactor", UserVF) |
| << " is unsafe, clamping to maximum safe vectorization factor " |
| << ore::NV("VectorizationFactor", MaxSafeFixedVF); |
| }); |
| return MaxSafeFixedVF; |
| } |
| |
| if (!supportsScalableVectors()) { |
| LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF |
| << " is ignored because scalable vectors are not " |
| "available.\n"); |
| ORE->emit([&]() { |
| return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", |
| TheLoop->getStartLoc(), |
| TheLoop->getHeader()) |
| << "User-specified vectorization factor " |
| << ore::NV("UserVectorizationFactor", UserVF) |
| << " is ignored because the target does not support scalable " |
| "vectors. The compiler will pick a more suitable value."; |
| }); |
| } else { |
| LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF |
| << " is unsafe. Ignoring scalable UserVF.\n"); |
| ORE->emit([&]() { |
| return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", |
| TheLoop->getStartLoc(), |
| TheLoop->getHeader()) |
| << "User-specified vectorization factor " |
| << ore::NV("UserVectorizationFactor", UserVF) |
| << " is unsafe. Ignoring the hint to let the compiler pick a " |
| "more suitable value."; |
| }); |
| } |
| } |
| |
| LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType |
| << " / " << WidestType << " bits.\n"); |
| |
| FixedScalableVFPair Result(ElementCount::getFixed(1), |
| ElementCount::getScalable(0)); |
| if (auto MaxVF = getMaximizedVFForTarget( |
| MaxTripCount, SmallestType, WidestType, MaxSafeFixedVF, UserIC, |
| FoldTailByMasking, RequiresScalarEpilogue)) |
| Result.FixedVF = MaxVF; |
| |
| if (auto MaxVF = getMaximizedVFForTarget( |
| MaxTripCount, SmallestType, WidestType, MaxSafeScalableVF, UserIC, |
| FoldTailByMasking, RequiresScalarEpilogue)) |
| if (MaxVF.isScalable()) { |
| Result.ScalableVF = MaxVF; |
| LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF |
| << "\n"); |
| } |
| |
| return Result; |
| } |
| |
| std::pair<unsigned, unsigned> |
| VFSelectionContext::getSmallestAndWidestTypes() const { |
| unsigned MinWidth = -1U; |
| unsigned MaxWidth = 8; |
| const DataLayout &DL = F.getDataLayout(); |
| // For in-loop reductions, no element types are added to ElementTypesInLoop |
| // if there are no loads/stores in the loop. In this case, check through the |
| // reduction variables to determine the maximum width. |
| if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { |
| for (const auto &[_, RdxDesc] : Legal->getReductionVars()) { |
| // When finding the min width used by the recurrence we need to account |
| // for casts on the input operands of the recurrence. |
| MinWidth = std::min( |
| MinWidth, |
| std::min(RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), |
| RdxDesc.getRecurrenceType()->getScalarSizeInBits())); |
| MaxWidth = std::max(MaxWidth, |
| RdxDesc.getRecurrenceType()->getScalarSizeInBits()); |
| } |
| } else { |
| for (Type *T : ElementTypesInLoop) { |
| MinWidth = std::min<unsigned>( |
| MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); |
| MaxWidth = std::max<unsigned>( |
| MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); |
| } |
| } |
| return {MinWidth, MaxWidth}; |
| } |
| |
| void VFSelectionContext::collectElementTypesForWidening( |
| const SmallPtrSetImpl<const Value *> *ValuesToIgnore) { |
| ElementTypesInLoop.clear(); |
| // For each block. |
| for (BasicBlock *BB : TheLoop->blocks()) { |
| // For each instruction in the loop. |
| for (Instruction &I : *BB) { |
| Type *T = I.getType(); |
| |
| // Skip ignored values. |
| if (ValuesToIgnore && ValuesToIgnore->contains(&I)) |
| continue; |
| |
| // Only examine Loads, Stores and PHINodes. |
| if (!isa<LoadInst, StoreInst, PHINode>(I)) |
| continue; |
| |
| // Examine PHI nodes that are reduction variables. Update the type to |
| // account for the recurrence type. |
| if (auto *PN = dyn_cast<PHINode>(&I)) { |
| if (!Legal->isReductionVariable(PN)) |
| continue; |
| const RecurrenceDescriptor &RdxDesc = |
| Legal->getRecurrenceDescriptor(PN); |
| if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || |
| TTI.preferInLoopReduction(RdxDesc.getRecurrenceKind(), |
| RdxDesc.getRecurrenceType())) |
| continue; |
| T = RdxDesc.getRecurrenceType(); |
| } |
| |
| // Examine the stored values. |
| if (auto *ST = dyn_cast<StoreInst>(&I)) |
| T = ST->getValueOperand()->getType(); |
| |
| assert(T->isSized() && |
| "Expected the load/store/recurrence type to be sized"); |
| |
| ElementTypesInLoop.insert(T); |
| } |
| } |
| } |
| |
| void VFSelectionContext::initializeVScaleForTuning() { |
| if (!supportsScalableVectors()) |
| return; |
| |
| if (F.hasFnAttribute(Attribute::VScaleRange)) { |
| auto Attr = F.getFnAttribute(Attribute::VScaleRange); |
| auto Min = Attr.getVScaleRangeMin(); |
| auto Max = Attr.getVScaleRangeMax(); |
| if (Max && Min == Max) { |
| VScaleForTuning = Max; |
| return; |
| } |
| } |
| |
| VScaleForTuning = TTI.getVScaleForTuning(); |
| } |
| |
| bool VFSelectionContext::useOrderedReductions( |
| const RecurrenceDescriptor &RdxDesc) const { |
| return !Hints->allowReordering() && RdxDesc.isOrdered(); |
| } |
| |
| bool VFSelectionContext::runtimeChecksRequired() { |
| LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); |
| |
| Loop *L = const_cast<Loop *>(TheLoop); |
| if (Legal->getRuntimePointerChecking()->Need) { |
| reportVectorizationFailure( |
| "Runtime ptr check is required with -Os/-Oz", |
| "runtime pointer checks needed. Enable vectorization of this " |
| "loop with '#pragma clang loop vectorize(enable)' when " |
| "compiling with -Os/-Oz", |
| "CantVersionLoopWithOptForSize", ORE, L); |
| return true; |
| } |
| |
| if (!PSE.getPredicate().isAlwaysTrue()) { |
| reportVectorizationFailure( |
| "Runtime SCEV check is required with -Os/-Oz", |
| "runtime SCEV checks needed. Enable vectorization of this " |
| "loop with '#pragma clang loop vectorize(enable)' when " |
| "compiling with -Os/-Oz", |
| "CantVersionLoopWithOptForSize", ORE, L); |
| return true; |
| } |
| |
| // FIXME: Avoid specializing for stride==1 instead of bailing out. |
| if (!Legal->getLAI()->getSymbolicStrides().empty()) { |
| reportVectorizationFailure( |
| "Runtime stride check for small trip count", |
| "runtime stride == 1 checks needed. Enable vectorization of " |
| "this loop without such check by compiling with -Os/-Oz", |
| "CantVersionLoopWithOptForSize", ORE, L); |
| return true; |
| } |
| |
| return false; |
| } |
| |
| void VFSelectionContext::computeMinimalBitwidths() { |
| MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); |
| } |
| |
| void VFSelectionContext::collectInLoopReductions() { |
| // Avoid duplicating work finding in-loop reductions. |
| if (!InLoopReductions.empty()) |
| return; |
| |
| for (const auto &Reduction : Legal->getReductionVars()) { |
| PHINode *Phi = Reduction.first; |
| const RecurrenceDescriptor &RdxDesc = Reduction.second; |
| |
| // Multi-use reductions (e.g., used in FindLastIV patterns) are handled |
| // separately and should not be considered for in-loop reductions. |
| if (RdxDesc.hasUsesOutsideReductionChain()) |
| continue; |
| |
| // We don't collect reductions that are type promoted (yet). |
| if (RdxDesc.getRecurrenceType() != Phi->getType()) |
| continue; |
| |
| // In-loop AnyOf and FindIV reductions are not yet supported. |
| RecurKind Kind = RdxDesc.getRecurrenceKind(); |
| if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) || |
| RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) || |
| RecurrenceDescriptor::isFindLastRecurrenceKind(Kind)) |
| continue; |
| |
| // If the target would prefer this reduction to happen "in-loop", then we |
| // want to record it as such. |
| if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && |
| !TTI.preferInLoopReduction(Kind, Phi->getType())) |
| continue; |
| |
| // Check that we can correctly put the reductions into the loop, by |
| // finding the chain of operations that leads from the phi to the loop |
| // exit value. |
| SmallVector<Instruction *, 4> ReductionOperations = |
| RdxDesc.getReductionOpChain(Phi, const_cast<Loop *>(TheLoop)); |
| bool InLoop = !ReductionOperations.empty(); |
| |
| if (InLoop) { |
| InLoopReductions.insert(Phi); |
| // Add the elements to InLoopReductionImmediateChains for cost modelling. |
| Instruction *LastChain = Phi; |
| for (auto *I : ReductionOperations) { |
| InLoopReductionImmediateChains[I] = LastChain; |
| LastChain = I; |
| } |
| } |
| LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") |
| << " reduction for phi: " << *Phi << "\n"); |
| } |
| } |
| |
| bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A, |
| const VectorizationFactor &B, |
| const unsigned MaxTripCount, |
| bool HasTail, |
| bool IsEpilogue) const { |
| InstructionCost CostA = A.Cost; |
| InstructionCost CostB = B.Cost; |
| |
| // When there is a hint to always prefer scalable vectors, honour that hint. |
| if (Hints.isScalableVectorizationAlwaysPreferred()) |
| if (A.Width.isScalable() && CostA.isValid() && !B.Width.isScalable() && |
| !B.Width.isScalar()) |
| return true; |
| |
| // Improve estimate for the vector width if it is scalable. |
| unsigned EstimatedWidthA = A.Width.getKnownMinValue(); |
| unsigned EstimatedWidthB = B.Width.getKnownMinValue(); |
| if (std::optional<unsigned> VScale = Config.getVScaleForTuning()) { |
| if (A.Width.isScalable()) |
| EstimatedWidthA *= *VScale; |
| if (B.Width.isScalable()) |
| EstimatedWidthB *= *VScale; |
| } |
| |
| // When optimizing for size choose whichever is smallest, which will be the |
| // one with the smallest cost for the whole loop. On a tie pick the larger |
| // vector width, on the assumption that throughput will be greater. |
| if (Config.CostKind == TTI::TCK_CodeSize) |
| return CostA < CostB || |
| (CostA == CostB && EstimatedWidthA > EstimatedWidthB); |
| |
| // Assume vscale may be larger than 1 (or the value being tuned for), |
| // so that scalable vectorization is slightly favorable over fixed-width |
| // vectorization. |
| bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) && |
| A.Width.isScalable() && !B.Width.isScalable(); |
| |
| auto CmpFn = [PreferScalable](const InstructionCost &LHS, |
| const InstructionCost &RHS) { |
| return PreferScalable ? LHS <= RHS : LHS < RHS; |
| }; |
| |
| // To avoid the need for FP division: |
| // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB) |
| // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA) |
| bool LowerCostWithoutTC = |
| CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA); |
| if (!MaxTripCount) |
| return LowerCostWithoutTC; |
| |
| auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF, |
| InstructionCost VectorCost, |
| InstructionCost ScalarCost) { |
| // If the trip count is a known (possibly small) constant, the trip count |
| // will be rounded up to an integer number of iterations under |
| // FoldTailByMasking. The total cost in that case will be |
| // VecCost*ceil(TripCount/VF). When not folding the tail, the total |
| // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be |
| // some extra overheads, but for the purpose of comparing the costs of |
| // different VFs we can use this to compare the total loop-body cost |
| // expected after vectorization. |
| if (HasTail) |
| return VectorCost * (MaxTripCount / VF) + |
| ScalarCost * (MaxTripCount % VF); |
| return VectorCost * divideCeil(MaxTripCount, VF); |
| }; |
| |
| auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost); |
| auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost); |
| bool LowerCostWithTC = CmpFn(RTCostA, RTCostB); |
| LLVM_DEBUG(if (LowerCostWithTC != LowerCostWithoutTC) { |
| dbgs() << "LV: VF " << (LowerCostWithTC ? A.Width : B.Width) |
| << " has lower cost than VF " |
| << (LowerCostWithTC ? B.Width : A.Width) |
| << " when taking the cost of the remaining scalar loop iterations " |
| "into consideration for a maximum trip count of " |
| << MaxTripCount << ".\n"; |
| }); |
| return LowerCostWithTC; |
| } |
| |
| bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A, |
| const VectorizationFactor &B, |
| bool HasTail, |
| bool IsEpilogue) const { |
| const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount(); |
| return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail, |
| IsEpilogue); |
| } |
| |
| // TODO: we could return a pair of values that specify the max VF and |
| // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of |
| // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment |
| // doesn't have a cost model that can choose which plan to execute if |
| // more than one is generated. |
| FixedScalableVFPair |
| VFSelectionContext::computeVPlanOuterloopVF(ElementCount UserVF) { |
| if (UserVF.isScalable() && !supportsScalableVectors()) { |
| reportVectorizationFailure( |
| "Scalable vectorization requested but not supported by the target", |
| "the scalable user-specified vectorization width for outer-loop " |
| "vectorization cannot be used because the target does not support " |
| "scalable vectors.", |
| "ScalableVFUnfeasible", ORE, TheLoop); |
| return FixedScalableVFPair::getNone(); |
| } |
| |
| ElementCount VF = UserVF; |
| if (VF.isZero()) { |
| auto [_, WidestType] = getSmallestAndWidestTypes(); |
| |
| auto RegKind = TTI.enableScalableVectorization() |
| ? TargetTransformInfo::RGK_ScalableVector |
| : TargetTransformInfo::RGK_FixedWidthVector; |
| |
| TypeSize RegSize = TTI.getRegisterBitWidth(RegKind); |
| unsigned N = RegSize.getKnownMinValue() / WidestType; |
| VF = ElementCount::get(N, RegSize.isScalable()); |
| LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); |
| |
| // Make sure we have a VF > 1 for stress testing. |
| if (VPlanBuildOuterloopStressTest && VF.isScalar()) { |
| LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " |
| << "overriding computed VF.\n"); |
| VF = ElementCount::getFixed(4); |
| } |
| } |
| assert(isPowerOf2_32(VF.getKnownMinValue()) && |
| "VF needs to be a power of two"); |
| if (VF.isScalar()) |
| return FixedScalableVFPair::getNone(); |
| LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") |
| << "VF " << VF << " to build VPlans.\n"); |
| return FixedScalableVFPair(VF); |
| } |