| //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops |
| // and generates target-independent LLVM-IR. |
| // The vectorizer uses the TargetTransformInfo analysis to estimate the costs |
| // of instructions in order to estimate the profitability of vectorization. |
| // |
| // The loop vectorizer combines consecutive loop iterations into a single |
| // 'wide' iteration. After this transformation the index is incremented |
| // by the SIMD vector width, and not by one. |
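| // |
| // For example (conceptually, with a vector width of 4), a scalar loop such as |
| // |
| //   for (i = 0; i < n; ++i) |
| //     A[i] = B[i] + C[i]; |
| // |
| // becomes a loop whose body adds four consecutive elements of B and C at a |
| // time and whose index is incremented by 4, with any leftover iterations |
| // handled by a scalar remainder (epilogue) loop. |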
| // |
| // This pass has four parts: |
| // 1. The main loop pass that drives the different parts. |
| // 2. LoopVectorizationLegality - A unit that checks for the legality |
| // of the vectorization. |
| // 3. InnerLoopVectorizer - A unit that performs the actual |
| // widening of instructions. |
| // 4. LoopVectorizationCostModel - A unit that checks for the profitability |
| // of vectorization. It decides on the optimal vector width, which |
| // can be one, if vectorization is not profitable. |
| // |
| // There is a development effort going on to migrate the loop vectorizer to the |
| // VPlan infrastructure and to introduce outer loop vectorization support (see |
| // docs/Proposal/VectorizationPlan.rst and |
| // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this |
| // purpose, we temporarily introduced the VPlan-native vectorization path: an |
| // alternative vectorization path that is natively implemented on top of the |
| // VPlan infrastructure. See EnableVPlanNativePath for enabling. |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // The reduction-variable vectorization is based on the paper: |
| // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. |
| // |
| // Variable uniformity checks are inspired by: |
| // Karrenberg, R. and Hack, S. Whole Function Vectorization. |
| // |
| // The interleaved access vectorization is based on the paper: |
| // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved |
| // Data for SIMD |
| // |
| // Other ideas/concepts are from: |
| // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. |
| // |
| // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of |
| // Vectorizing Compilers. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "llvm/Transforms/Vectorize/LoopVectorize.h" |
| #include "LoopVectorizationPlanner.h" |
| #include "VPRecipeBuilder.h" |
| #include "VPlan.h" |
| #include "VPlanHCFGBuilder.h" |
| #include "VPlanPredicator.h" |
| #include "VPlanTransforms.h" |
| #include "llvm/ADT/APInt.h" |
| #include "llvm/ADT/ArrayRef.h" |
| #include "llvm/ADT/DenseMap.h" |
| #include "llvm/ADT/DenseMapInfo.h" |
| #include "llvm/ADT/Hashing.h" |
| #include "llvm/ADT/MapVector.h" |
| #include "llvm/ADT/None.h" |
| #include "llvm/ADT/Optional.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/ADT/SmallPtrSet.h" |
| #include "llvm/ADT/SmallSet.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/ADT/Statistic.h" |
| #include "llvm/ADT/StringRef.h" |
| #include "llvm/ADT/Twine.h" |
| #include "llvm/ADT/iterator_range.h" |
| #include "llvm/Analysis/AssumptionCache.h" |
| #include "llvm/Analysis/BasicAliasAnalysis.h" |
| #include "llvm/Analysis/BlockFrequencyInfo.h" |
| #include "llvm/Analysis/CFG.h" |
| #include "llvm/Analysis/CodeMetrics.h" |
| #include "llvm/Analysis/DemandedBits.h" |
| #include "llvm/Analysis/GlobalsModRef.h" |
| #include "llvm/Analysis/LoopAccessAnalysis.h" |
| #include "llvm/Analysis/LoopAnalysisManager.h" |
| #include "llvm/Analysis/LoopInfo.h" |
| #include "llvm/Analysis/LoopIterator.h" |
| #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| #include "llvm/Analysis/ProfileSummaryInfo.h" |
| #include "llvm/Analysis/ScalarEvolution.h" |
| #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
| #include "llvm/Analysis/TargetLibraryInfo.h" |
| #include "llvm/Analysis/TargetTransformInfo.h" |
| #include "llvm/Analysis/VectorUtils.h" |
| #include "llvm/IR/Attributes.h" |
| #include "llvm/IR/BasicBlock.h" |
| #include "llvm/IR/CFG.h" |
| #include "llvm/IR/Constant.h" |
| #include "llvm/IR/Constants.h" |
| #include "llvm/IR/DataLayout.h" |
| #include "llvm/IR/DebugInfoMetadata.h" |
| #include "llvm/IR/DebugLoc.h" |
| #include "llvm/IR/DerivedTypes.h" |
| #include "llvm/IR/DiagnosticInfo.h" |
| #include "llvm/IR/Dominators.h" |
| #include "llvm/IR/Function.h" |
| #include "llvm/IR/IRBuilder.h" |
| #include "llvm/IR/InstrTypes.h" |
| #include "llvm/IR/Instruction.h" |
| #include "llvm/IR/Instructions.h" |
| #include "llvm/IR/IntrinsicInst.h" |
| #include "llvm/IR/Intrinsics.h" |
| #include "llvm/IR/LLVMContext.h" |
| #include "llvm/IR/Metadata.h" |
| #include "llvm/IR/Module.h" |
| #include "llvm/IR/Operator.h" |
| #include "llvm/IR/PatternMatch.h" |
| #include "llvm/IR/Type.h" |
| #include "llvm/IR/Use.h" |
| #include "llvm/IR/User.h" |
| #include "llvm/IR/Value.h" |
| #include "llvm/IR/ValueHandle.h" |
| #include "llvm/IR/Verifier.h" |
| #include "llvm/InitializePasses.h" |
| #include "llvm/Pass.h" |
| #include "llvm/Support/Casting.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Compiler.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/ErrorHandling.h" |
| #include "llvm/Support/InstructionCost.h" |
| #include "llvm/Support/MathExtras.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
| #include "llvm/Transforms/Utils/InjectTLIMappings.h" |
| #include "llvm/Transforms/Utils/LoopSimplify.h" |
| #include "llvm/Transforms/Utils/LoopUtils.h" |
| #include "llvm/Transforms/Utils/LoopVersioning.h" |
| #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" |
| #include "llvm/Transforms/Utils/SizeOpts.h" |
| #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" |
| #include <algorithm> |
| #include <cassert> |
| #include <cstdint> |
| #include <cstdlib> |
| #include <functional> |
| #include <iterator> |
| #include <limits> |
| #include <memory> |
| #include <string> |
| #include <tuple> |
| #include <utility> |
| |
| using namespace llvm; |
| |
| #define LV_NAME "loop-vectorize" |
| #define DEBUG_TYPE LV_NAME |
| |
| #ifndef NDEBUG |
| const char VerboseDebug[] = DEBUG_TYPE "-verbose"; |
| #endif |
| |
| /// @{ |
| /// Metadata attribute names |
| const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; |
| const char LLVMLoopVectorizeFollowupVectorized[] = |
| "llvm.loop.vectorize.followup_vectorized"; |
| const char LLVMLoopVectorizeFollowupEpilogue[] = |
| "llvm.loop.vectorize.followup_epilogue"; |
| /// @} |
| |
| STATISTIC(LoopsVectorized, "Number of loops vectorized"); |
| STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); |
| STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); |
| |
| static cl::opt<bool> EnableEpilogueVectorization( |
| "enable-epilogue-vectorization", cl::init(true), cl::Hidden, |
| cl::desc("Enable vectorization of epilogue loops.")); |
| |
| static cl::opt<unsigned> EpilogueVectorizationForceVF( |
| "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, |
| cl::desc("When epilogue vectorization is enabled, and a value greater than " |
| "1 is specified, forces the given VF for all applicable epilogue " |
| "loops.")); |
| |
| static cl::opt<unsigned> EpilogueVectorizationMinVF( |
| "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, |
| cl::desc("Only loops with vectorization factor equal to or larger than " |
| "the specified value are considered for epilogue vectorization.")); |
| |
| /// Loops with a known constant trip count below this number are vectorized only |
| /// if no scalar iteration overheads are incurred. |
| static cl::opt<unsigned> TinyTripCountVectorThreshold( |
| "vectorizer-min-trip-count", cl::init(16), cl::Hidden, |
| cl::desc("Loops with a constant trip count that is smaller than this " |
| "value are vectorized only if no scalar iteration overheads " |
| "are incurred.")); |
| |
| static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( |
| "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, |
| cl::desc("The maximum allowed number of runtime memory checks with a " |
| "vectorize(enable) pragma.")); |
| |
| // Option prefer-predicate-over-epilogue indicates that an epilogue is |
| // undesired and that predication is preferred; the enum below lists the |
| // available choices. I.e., the vectorizer will try to fold the tail loop |
| // (epilogue) into the vector body and predicate the instructions |
| // accordingly. If tail-folding fails, there are different fallback |
| // strategies depending on these values: |
| namespace PreferPredicateTy { |
| enum Option { |
| ScalarEpilogue = 0, |
| PredicateElseScalarEpilogue, |
| PredicateOrDontVectorize |
| }; |
| } // namespace PreferPredicateTy |
| |
| static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( |
| "prefer-predicate-over-epilogue", |
| cl::init(PreferPredicateTy::ScalarEpilogue), |
| cl::Hidden, |
| cl::desc("Tail-folding and predication preferences over creating a scalar " |
| "epilogue loop."), |
| cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, |
| "scalar-epilogue", |
| "Don't tail-predicate loops, create scalar epilogue"), |
| clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, |
| "predicate-else-scalar-epilogue", |
| "prefer tail-folding, create scalar epilogue if tail " |
| "folding fails."), |
| clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, |
| "predicate-dont-vectorize", |
| "prefers tail-folding, don't attempt vectorization if " |
| "tail-folding fails."))); |
| |
| static cl::opt<bool> MaximizeBandwidth( |
| "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, |
| cl::desc("Maximize bandwidth when selecting vectorization factor which " |
| "will be determined by the smallest type in loop.")); |
| |
| static cl::opt<bool> EnableInterleavedMemAccesses( |
| "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, |
| cl::desc("Enable vectorization on interleaved memory accesses in a loop")); |
| |
| /// An interleave-group may need masking if it resides in a block that needs |
| /// predication, or in order to mask away gaps. |
| static cl::opt<bool> EnableMaskedInterleavedMemAccesses( |
| "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, |
| cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); |
| |
| static cl::opt<unsigned> TinyTripCountInterleaveThreshold( |
| "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, |
| cl::desc("We don't interleave loops with a estimated constant trip count " |
| "below this number")); |
| |
| static cl::opt<unsigned> ForceTargetNumScalarRegs( |
| "force-target-num-scalar-regs", cl::init(0), cl::Hidden, |
| cl::desc("A flag that overrides the target's number of scalar registers.")); |
| |
| static cl::opt<unsigned> ForceTargetNumVectorRegs( |
| "force-target-num-vector-regs", cl::init(0), cl::Hidden, |
| cl::desc("A flag that overrides the target's number of vector registers.")); |
| |
| static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( |
| "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, |
| cl::desc("A flag that overrides the target's max interleave factor for " |
| "scalar loops.")); |
| |
| static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( |
| "force-target-max-vector-interleave", cl::init(0), cl::Hidden, |
| cl::desc("A flag that overrides the target's max interleave factor for " |
| "vectorized loops.")); |
| |
| static cl::opt<unsigned> ForceTargetInstructionCost( |
| "force-target-instruction-cost", cl::init(0), cl::Hidden, |
| cl::desc("A flag that overrides the target's expected cost for " |
| "an instruction to a single constant value. Mostly " |
| "useful for getting consistent testing.")); |
| |
| static cl::opt<bool> ForceTargetSupportsScalableVectors( |
| "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, |
| cl::desc( |
| "Pretend that scalable vectors are supported, even if the target does " |
| "not support them. This flag should only be used for testing.")); |
| |
| static cl::opt<unsigned> SmallLoopCost( |
| "small-loop-cost", cl::init(20), cl::Hidden, |
| cl::desc( |
| "The cost of a loop that is considered 'small' by the interleaver.")); |
| |
| static cl::opt<bool> LoopVectorizeWithBlockFrequency( |
| "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, |
| cl::desc("Enable the use of the block frequency analysis to access PGO " |
| "heuristics minimizing code growth in cold regions and being more " |
| "aggressive in hot regions.")); |
| |
| // Runtime interleave loops for load/store throughput. |
| static cl::opt<bool> EnableLoadStoreRuntimeInterleave( |
| "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, |
| cl::desc( |
| "Enable runtime interleaving until load/store ports are saturated")); |
| |
| /// Interleave small loops with scalar reductions. |
| static cl::opt<bool> InterleaveSmallLoopScalarReduction( |
| "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, |
| cl::desc("Enable interleaving for loops with small iteration counts that " |
| "contain scalar reductions to expose ILP.")); |
| |
| /// The number of stores in a loop that are allowed to need predication. |
| static cl::opt<unsigned> NumberOfStoresToPredicate( |
| "vectorize-num-stores-pred", cl::init(1), cl::Hidden, |
| cl::desc("Max number of stores to be predicated behind an if.")); |
| |
| static cl::opt<bool> EnableIndVarRegisterHeur( |
| "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, |
| cl::desc("Count the induction variable only once when interleaving")); |
| |
| static cl::opt<bool> EnableCondStoresVectorization( |
| "enable-cond-stores-vec", cl::init(true), cl::Hidden, |
| cl::desc("Enable if predication of stores during vectorization.")); |
| |
| static cl::opt<unsigned> MaxNestedScalarReductionIC( |
| "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, |
| cl::desc("The maximum interleave count to use when interleaving a scalar " |
| "reduction in a nested loop.")); |
| |
| static cl::opt<bool> |
| PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), |
| cl::Hidden, |
| cl::desc("Prefer in-loop vector reductions, " |
| "overriding the targets preference.")); |
| |
| static cl::opt<bool> ForceOrderedReductions( |
| "force-ordered-reductions", cl::init(false), cl::Hidden, |
| cl::desc("Enable the vectorisation of loops with in-order (strict) " |
| "FP reductions")); |
| |
| static cl::opt<bool> PreferPredicatedReductionSelect( |
| "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, |
| cl::desc( |
| "Prefer predicating a reduction operation over an after loop select.")); |
| |
| cl::opt<bool> EnableVPlanNativePath( |
| "enable-vplan-native-path", cl::init(false), cl::Hidden, |
| cl::desc("Enable VPlan-native vectorization path with " |
| "support for outer loop vectorization.")); |
| |
| // FIXME: Remove this switch once we have divergence analysis. Currently we |
| // assume divergent non-backedge branches when this switch is true. |
| cl::opt<bool> EnableVPlanPredication( |
| "enable-vplan-predication", cl::init(false), cl::Hidden, |
| cl::desc("Enable VPlan-native vectorization path predicator with " |
| "support for outer loop vectorization.")); |
| |
| // This flag enables the stress testing of the VPlan H-CFG construction in the |
| // VPlan-native vectorization path. It must be used in conjunction with |
| // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the |
| // verification of the H-CFGs built. |
| static cl::opt<bool> VPlanBuildStressTest( |
| "vplan-build-stress-test", cl::init(false), cl::Hidden, |
| cl::desc( |
| "Build VPlan for every supported loop nest in the function and bail " |
| "out right after the build (stress test the VPlan H-CFG construction " |
| "in the VPlan-native vectorization path).")); |
| |
| cl::opt<bool> llvm::EnableLoopInterleaving( |
| "interleave-loops", cl::init(true), cl::Hidden, |
| cl::desc("Enable loop interleaving in Loop vectorization passes")); |
| cl::opt<bool> llvm::EnableLoopVectorization( |
| "vectorize-loops", cl::init(true), cl::Hidden, |
| cl::desc("Run the Loop vectorization passes")); |
| |
| cl::opt<bool> PrintVPlansInDotFormat( |
| "vplan-print-in-dot-format", cl::init(false), cl::Hidden, |
| cl::desc("Use dot format instead of plain text when dumping VPlans")); |
| |
| /// A helper function that returns true if the given type is irregular. The |
| /// type is irregular if its allocated size doesn't equal the store size of an |
| /// element of the corresponding vector type. |
| static bool hasIrregularType(Type *Ty, const DataLayout &DL) { |
| // Determine if an array of N elements of type Ty is "bitcast compatible" |
| // with a <N x Ty> vector. |
| // This is only true if there is no padding between the array elements. |
| return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); |
| } |
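| // For example, with typical data layouts i1 has a type size of 1 bit but an |
| // alloc size of 8 bits, so it is irregular and an array of i1 cannot be |
| // bitcast to <N x i1>; i32, whose alloc and type sizes are both 32 bits, is |
| // regular. |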
| |
| /// A helper function that returns the reciprocal of the block probability of |
| /// predicated blocks. If we return X, we are assuming the predicated block |
| /// will execute once for every X iterations of the loop header. |
| /// |
| /// TODO: We should use actual block probability here, if available. Currently, |
| /// we always assume predicated blocks have a 50% chance of executing. |
| static unsigned getReciprocalPredBlockProb() { return 2; } |
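| // For example, with the current value of 2 the cost model divides the cost it |
| // computes for an instruction in a predicated block by 2: an instruction |
| // costing 4 contributes 4 / 2 = 2 to the loop cost, reflecting the assumed |
| // 50% execution probability. |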
| |
| /// A helper function that returns an integer or floating-point constant with |
| /// value C. |
| static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { |
| return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) |
| : ConstantFP::get(Ty, C); |
| } |
| |
| /// Returns "best known" trip count for the specified loop \p L as defined by |
| /// the following procedure: |
| /// 1) Returns exact trip count if it is known. |
| /// 2) Returns expected trip count according to profile data if any. |
| /// 3) Returns upper bound estimate if it is known. |
| /// 4) Returns None if all of the above failed. |
| static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { |
| // Check if exact trip count is known. |
| if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) |
| return ExpectedTC; |
| |
| // Check if there is an expected trip count available from profile data. |
| if (LoopVectorizeWithBlockFrequency) |
| if (auto EstimatedTC = getLoopEstimatedTripCount(L)) |
| return EstimatedTC; |
| |
| // Check if upper bound estimate is known. |
| if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) |
| return ExpectedTC; |
| |
| return None; |
| } |
| |
| // Forward declare GeneratedRTChecks. |
| class GeneratedRTChecks; |
| |
| namespace llvm { |
| |
| /// InnerLoopVectorizer vectorizes loops which contain only one basic |
| /// block to a specified vectorization factor (VF). |
| /// This class performs the widening of scalars into vectors, or multiple |
| /// scalars. This class also implements the following features: |
| /// * It inserts an epilogue loop for handling loops that don't have iteration |
| /// counts that are known to be a multiple of the vectorization factor. |
| /// * It handles the code generation for reduction variables. |
| /// * Scalarization (implementation using scalars) of un-vectorizable |
| /// instructions. |
| /// InnerLoopVectorizer does not perform any vectorization-legality |
| /// checks, and relies on the caller to check for the different legality |
| /// aspects. The InnerLoopVectorizer relies on the |
| /// LoopVectorizationLegality class to provide information about the induction |
| /// and reduction variables that were found. |
| class InnerLoopVectorizer { |
| public: |
| InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, |
| LoopInfo *LI, DominatorTree *DT, |
| const TargetLibraryInfo *TLI, |
| const TargetTransformInfo *TTI, AssumptionCache *AC, |
| OptimizationRemarkEmitter *ORE, ElementCount VecWidth, |
| unsigned UnrollFactor, LoopVectorizationLegality *LVL, |
| LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, |
| ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) |
| : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), |
| AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), |
| Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), |
| PSI(PSI), RTChecks(RTChecks) { |
| // Query this against the original loop and save it here because the profile |
| // of the original loop header may change as the transformation happens. |
| OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( |
| OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); |
| } |
| |
| virtual ~InnerLoopVectorizer() = default; |
| |
| /// Create a new empty loop that will contain vectorized instructions later |
| /// on, while the old loop will be used as the scalar remainder. Control flow |
| /// is generated around the vectorized (and scalar epilogue) loops consisting |
| /// of various checks and bypasses. Return the pre-header block of the new |
| /// loop. |
| /// In the case of epilogue vectorization, this function is overridden to |
| /// handle the more complex control flow around the loops. |
| virtual BasicBlock *createVectorizedLoopSkeleton(); |
| |
| /// Widen a single call instruction within the innermost loop. |
| void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, |
| VPTransformState &State); |
| |
| /// Fix the vectorized code, taking care of header phis, live-outs, and more. |
| void fixVectorizedLoop(VPTransformState &State); |
| |
| // Return true if any runtime check is added. |
| bool areSafetyChecksAdded() { return AddedSafetyChecks; } |
| |
| /// A type for vectorized values in the new loop. Each value from the |
| /// original loop, when vectorized, is represented by UF vector values in the |
| /// new unrolled loop, where UF is the unroll factor. |
| using VectorParts = SmallVector<Value *, 2>; |
| |
| /// Vectorize a single first-order recurrence or pointer induction PHINode in |
| /// a block. This method handles the induction variable canonicalization. It |
| /// supports both VF = 1 for unrolled loops and arbitrary length vectors. |
| void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR, |
| VPTransformState &State); |
| |
| /// A helper function to scalarize a single Instruction in the innermost loop. |
| /// Generates a sequence of scalar instances for each lane between \p MinLane |
| /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, |
| /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p |
| /// Instr's operands. |
| void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe, |
| const VPIteration &Instance, bool IfPredicateInstr, |
| VPTransformState &State); |
| |
| /// Widen an integer or floating-point induction variable \p IV. If \p Trunc |
| /// is provided, the integer induction variable will first be truncated to |
| /// the corresponding type. |
| void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc, |
| VPValue *Def, VPValue *CastDef, |
| VPTransformState &State); |
| |
| /// Construct the vector value of a scalarized value \p V one lane at a time. |
| void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, |
| VPTransformState &State); |
| |
| /// Try to vectorize interleaved access group \p Group with the base address |
| /// given in \p Addr, optionally masking the vector operations if \p |
| /// BlockInMask is non-null. Use \p State to translate given VPValues to IR |
| /// values in the vectorized loop. |
| void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, |
| ArrayRef<VPValue *> VPDefs, |
| VPTransformState &State, VPValue *Addr, |
| ArrayRef<VPValue *> StoredValues, |
| VPValue *BlockInMask = nullptr); |
| |
| /// Vectorize Load and Store instructions with the base address given in \p |
| /// Addr, optionally masking the vector operations if \p BlockInMask is |
| /// non-null. Use \p State to translate given VPValues to IR values in the |
| /// vectorized loop. |
| void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, |
| VPValue *Def, VPValue *Addr, |
| VPValue *StoredValue, VPValue *BlockInMask, |
| bool ConsecutiveStride, bool Reverse); |
| |
| /// Set the debug location in the builder \p CustomBuilder using the debug |
| /// location in \p V. If \p CustomBuilder is None then the class member's |
| /// Builder is used. |
| void setDebugLocFromInst(const Value *V, |
| Optional<IRBuilder<> *> CustomBuilder = None); |
| |
| /// Fix the non-induction PHIs in the OrigPHIsToFix vector. |
| void fixNonInductionPHIs(VPTransformState &State); |
| |
| /// Returns true if the reordering of FP operations is not allowed, but we are |
| /// able to vectorize with strict in-order reductions for the given RdxDesc. |
| bool useOrderedReductions(RecurrenceDescriptor &RdxDesc); |
| |
| /// Create a broadcast instruction. This method generates a broadcast |
| /// instruction (shuffle) for loop invariant values and for the induction |
| /// value. If this is the induction variable then we extend it to N, N+1, ... |
| /// This is needed because each iteration in the loop corresponds to a SIMD |
| /// element. |
| virtual Value *getBroadcastInstrs(Value *V); |
| |
| /// Add metadata from one instruction to another. |
| /// |
| /// This includes both the original MDs from \p From and additional ones (\see |
| /// addNewMetadata). Use this for *newly created* instructions in the vector |
| /// loop. |
| void addMetadata(Instruction *To, Instruction *From); |
| |
| /// Similar to the previous function but it adds the metadata to a |
| /// vector of instructions. |
| void addMetadata(ArrayRef<Value *> To, Instruction *From); |
| |
| protected: |
| friend class LoopVectorizationPlanner; |
| |
| /// A small list of PHINodes. |
| using PhiVector = SmallVector<PHINode *, 4>; |
| |
| /// A type for scalarized values in the new loop. Each value from the |
| /// original loop, when scalarized, is represented by UF x VF scalar values |
| /// in the new unrolled loop, where UF is the unroll factor and VF is the |
| /// vectorization factor. |
| using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; |
| |
| /// Set up the values of the IVs correctly when exiting the vector loop. |
| void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, |
| Value *CountRoundDown, Value *EndValue, |
| BasicBlock *MiddleBlock); |
| |
| /// Create a new induction variable inside L. |
| PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, |
| Value *Step, Instruction *DL); |
| |
| /// Handle all cross-iteration phis in the header. |
| void fixCrossIterationPHIs(VPTransformState &State); |
| |
| /// Create the exit value of first order recurrences in the middle block and |
| /// update their users. |
| void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State); |
| |
| /// Create code for the loop exit value of the reduction. |
| void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); |
| |
| /// Clear NSW/NUW flags from reduction instructions if necessary. |
| void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, |
| VPTransformState &State); |
| |
| /// Fixup the LCSSA phi nodes in the unique exit block. This simply |
| /// means we need to add the appropriate incoming value from the middle |
| /// block as exiting edges from the scalar epilogue loop (if present) are |
| /// already in place, and we exit the vector loop exclusively to the middle |
| /// block. |
| void fixLCSSAPHIs(VPTransformState &State); |
| |
| /// Iteratively sink the scalarized operands of a predicated instruction into |
| /// the block that was created for it. |
| void sinkScalarOperands(Instruction *PredInst); |
| |
| /// Shrinks vector element sizes to the smallest bitwidth they can be legally |
| /// represented as. |
| void truncateToMinimalBitwidths(VPTransformState &State); |
| |
| /// This function adds |
| /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) |
| /// to each vector element of Val. The sequence starts at StartIdx. |
| /// \p Opcode is relevant for FP induction variables. |
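| /// For example, with StartIdx = 0, Step = 1 and a vector width of 4, the |
| /// sequence <0, 1, 2, 3> is added lane-wise to the elements of Val, yielding |
| /// the per-lane values of a unit-stride induction (when Val is a broadcast of |
| /// the scalar induction value). |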
| virtual Value * |
| getStepVector(Value *Val, Value *StartIdx, Value *Step, |
| Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd); |
| |
| /// Compute scalar induction steps. \p ScalarIV is the scalar induction |
| /// variable on which to base the steps, \p Step is the size of the step, and |
| /// \p EntryVal is the value from the original loop that maps to the steps. |
| /// Note that \p EntryVal doesn't have to be an induction variable - it |
| /// can also be a truncate instruction. |
| void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, |
| const InductionDescriptor &ID, VPValue *Def, |
| VPValue *CastDef, VPTransformState &State); |
| |
| /// Create a vector induction phi node based on an existing scalar one. \p |
| /// EntryVal is the value from the original loop that maps to the vector phi |
| /// node, and \p Step is the loop-invariant step. If \p EntryVal is a |
| /// truncate instruction, instead of widening the original IV, we widen a |
| /// version of the IV truncated to \p EntryVal's type. |
| void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, |
| Value *Step, Value *Start, |
| Instruction *EntryVal, VPValue *Def, |
| VPValue *CastDef, |
| VPTransformState &State); |
| |
| /// Returns true if an instruction \p I should be scalarized instead of |
| /// vectorized for the chosen vectorization factor. |
| bool shouldScalarizeInstruction(Instruction *I) const; |
| |
| /// Returns true if we should generate a scalar version of \p IV. |
| bool needsScalarInduction(Instruction *IV) const; |
| |
| /// If there is a cast involved in the induction variable \p ID, which should |
| /// be ignored in the vectorized loop body, this function records the |
| /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the |
| /// cast. We had already proved that the casted Phi is equal to the uncasted |
| /// Phi in the vectorized loop (under a runtime guard), and therefore |
| /// there is no need to vectorize the cast - the same value can be used in the |
| /// vector loop for both the Phi and the cast. |
| /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified. |
| /// Otherwise, \p VectorLoopValue is a widened/vectorized value. |
| /// |
| /// \p EntryVal is the value from the original loop that maps to the vector |
| /// phi node and is used to distinguish what is the IV currently being |
| /// processed - original one (if \p EntryVal is a phi corresponding to the |
| /// original IV) or the "newly-created" one based on the proof mentioned above |
| /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the |
| /// latter case \p EntryVal is a TruncInst and we must not record anything for |
| /// that IV, but it's error-prone to expect callers of this routine to care |
| /// about that, hence this explicit parameter. |
| void recordVectorLoopValueForInductionCast( |
| const InductionDescriptor &ID, const Instruction *EntryVal, |
| Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State, |
| unsigned Part, unsigned Lane = UINT_MAX); |
| |
| /// Generate a shuffle sequence that will reverse the vector Vec. |
| virtual Value *reverseVector(Value *Vec); |
| |
| /// Returns (and creates if needed) the original loop trip count. |
| Value *getOrCreateTripCount(Loop *NewLoop); |
| |
| /// Returns (and creates if needed) the trip count of the widened loop. |
| Value *getOrCreateVectorTripCount(Loop *NewLoop); |
| |
| /// Returns a bitcasted value to the requested vector type. |
| /// Also handles bitcasts of vector<float> <-> vector<pointer> types. |
| Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, |
| const DataLayout &DL); |
| |
| /// Emit a bypass check to see if the vector trip count is zero, including if |
| /// it overflows. |
| void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); |
| |
| /// Emit a bypass check to see if all of the SCEV assumptions we've |
| /// had to make are correct. Returns the block containing the checks or |
| /// nullptr if no checks have been added. |
| BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass); |
| |
| /// Emit bypass checks to check any memory assumptions we may have made. |
| /// Returns the block containing the checks or nullptr if no checks have been |
| /// added. |
| BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); |
| |
| /// Compute the transformed value of Index at offset StartValue using step |
| /// StepValue. |
| /// For integer induction, returns StartValue + Index * StepValue. |
| /// For pointer induction, returns StartValue[Index * StepValue]. |
| /// FIXME: The newly created binary instructions should contain nsw/nuw |
| /// flags, which can be found from the original scalar operations. |
| Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, |
| const DataLayout &DL, |
| const InductionDescriptor &ID) const; |
| |
| /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, |
| /// vector loop preheader, middle block and scalar preheader. Also |
| /// allocate a loop object for the new vector loop and return it. |
| Loop *createVectorLoopSkeleton(StringRef Prefix); |
| |
| /// Create new phi nodes for the induction variables to resume iteration count |
| /// in the scalar epilogue, from where the vectorized loop left off (given by |
| /// \p VectorTripCount). |
| /// In cases where the loop skeleton is more complicated (eg. epilogue |
| /// vectorization) and the resume values can come from an additional bypass |
| /// block, the \p AdditionalBypass pair provides information about the bypass |
| /// block and the end value on the edge from bypass to this loop. |
| void createInductionResumeValues( |
| Loop *L, Value *VectorTripCount, |
| std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); |
| |
| /// Complete the loop skeleton by adding debug MDs, creating appropriate |
| /// conditional branches in the middle block, preparing the builder and |
| /// running the verifier. Take in the vector loop \p L as argument, and return |
| /// the preheader of the completed vector loop. |
| BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); |
| |
| /// Add additional metadata to \p To that was not present on \p Orig. |
| /// |
| /// Currently this is used to add the noalias annotations based on the |
| /// inserted memchecks. Use this for instructions that are *cloned* into the |
| /// vector loop. |
| void addNewMetadata(Instruction *To, const Instruction *Orig); |
| |
| /// Collect poison-generating recipes that may generate a poison value that is |
| /// used after vectorization, even when their operands are not poison. Those |
| /// recipes meet the following conditions: |
| /// * Contribute to the address computation of a recipe generating a widen |
| /// memory load/store (VPWidenMemoryInstructionRecipe or |
| /// VPInterleaveRecipe). |
| /// * Such a widen memory load/store has at least one underlying Instruction |
| /// that is in a basic block that needs predication and after vectorization |
| /// the generated instruction won't be predicated. |
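| /// For example, an "inbounds" GEP feeding the address of a load that executes |
| /// only under a condition may produce poison for iterations where the |
| /// condition is false; if that load is widened into an unconditional |
| /// consecutive vector load, the poison could now be used, so the flag has to |
| /// be dropped (a sketch of the motivating case, not an exhaustive list). |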
| void collectPoisonGeneratingRecipes(VPTransformState &State); |
| |
| /// Allow subclasses to override and print debug traces before/after vplan |
| /// execution, when trace information is requested. |
| virtual void printDebugTracesAtStart() {} |
| virtual void printDebugTracesAtEnd() {} |
| |
| /// The original loop. |
| Loop *OrigLoop; |
| |
| /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies |
| /// dynamic knowledge to simplify SCEV expressions and converts them to a |
| /// more usable form. |
| PredicatedScalarEvolution &PSE; |
| |
| /// Loop Info. |
| LoopInfo *LI; |
| |
| /// Dominator Tree. |
| DominatorTree *DT; |
| |
| /// Alias Analysis. |
| AAResults *AA; |
| |
| /// Target Library Info. |
| const TargetLibraryInfo *TLI; |
| |
| /// Target Transform Info. |
| const TargetTransformInfo *TTI; |
| |
| /// Assumption Cache. |
| AssumptionCache *AC; |
| |
| /// Interface to emit optimization remarks. |
| OptimizationRemarkEmitter *ORE; |
| |
| /// LoopVersioning. It's only set up (non-null) if memchecks were |
| /// used. |
| /// |
| /// This is currently only used to add no-alias metadata based on the |
| /// memchecks. The actual versioning is performed manually. |
| std::unique_ptr<LoopVersioning> LVer; |
| |
| /// The vectorization SIMD factor to use. Each vector will have this many |
| /// vector elements. |
| ElementCount VF; |
| |
| /// The vectorization unroll factor to use. Each scalar is vectorized to this |
| /// many different vector instructions. |
| unsigned UF; |
| |
| /// The builder that we use |
| IRBuilder<> Builder; |
| |
| // --- Vectorization state --- |
| |
| /// The vector-loop preheader. |
| BasicBlock *LoopVectorPreHeader; |
| |
| /// The scalar-loop preheader. |
| BasicBlock *LoopScalarPreHeader; |
| |
| /// Middle Block between the vector and the scalar. |
| BasicBlock *LoopMiddleBlock; |
| |
| /// The unique ExitBlock of the scalar loop if one exists. Note that |
| /// there can be multiple exiting edges reaching this block. |
| BasicBlock *LoopExitBlock; |
| |
| /// The vector loop body. |
| BasicBlock *LoopVectorBody; |
| |
| /// The scalar loop body. |
| BasicBlock *LoopScalarBody; |
| |
| /// A list of all bypass blocks. The first block is the entry of the loop. |
| SmallVector<BasicBlock *, 4> LoopBypassBlocks; |
| |
| /// The new Induction variable which was added to the new block. |
| PHINode *Induction = nullptr; |
| |
| /// The induction variable of the old basic block. |
| PHINode *OldInduction = nullptr; |
| |
| /// Store instructions that were predicated. |
| SmallVector<Instruction *, 4> PredicatedInstructions; |
| |
| /// Trip count of the original loop. |
| Value *TripCount = nullptr; |
| |
| /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) |
| Value *VectorTripCount = nullptr; |
| |
| /// The legality analysis. |
| LoopVectorizationLegality *Legal; |
| |
| /// The profitability analysis. |
| LoopVectorizationCostModel *Cost; |
| |
| // Record whether runtime checks are added. |
| bool AddedSafetyChecks = false; |
| |
| // Holds the end values for each induction variable. We save the end values |
| // so we can later fix-up the external users of the induction variables. |
| DenseMap<PHINode *, Value *> IVEndValues; |
| |
| // Vector of original scalar PHIs whose corresponding widened PHIs need to be |
| // fixed up at the end of vector code generation. |
| SmallVector<PHINode *, 8> OrigPHIsToFix; |
| |
| /// BFI and PSI are used to check for profile guided size optimizations. |
| BlockFrequencyInfo *BFI; |
| ProfileSummaryInfo *PSI; |
| |
| // Whether this loop should be optimized for size based on profile-guided |
| // size optimizations. |
| bool OptForSizeBasedOnProfile; |
| |
| /// Structure to hold information about generated runtime checks, responsible |
| /// for cleaning up the checks if vectorization turns out to be unprofitable. |
| GeneratedRTChecks &RTChecks; |
| }; |
| |
| class InnerLoopUnroller : public InnerLoopVectorizer { |
| public: |
| InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, |
| LoopInfo *LI, DominatorTree *DT, |
| const TargetLibraryInfo *TLI, |
| const TargetTransformInfo *TTI, AssumptionCache *AC, |
| OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, |
| LoopVectorizationLegality *LVL, |
| LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, |
| ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) |
| : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, |
| ElementCount::getFixed(1), UnrollFactor, LVL, CM, |
| BFI, PSI, Check) {} |
| |
| private: |
| Value *getBroadcastInstrs(Value *V) override; |
| Value *getStepVector( |
| Value *Val, Value *StartIdx, Value *Step, |
| Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override; |
| Value *reverseVector(Value *Vec) override; |
| }; |
| |
| /// Encapsulate information regarding vectorization of a loop and its epilogue. |
| /// This information is meant to be updated and used across two stages of |
| /// epilogue vectorization. |
| struct EpilogueLoopVectorizationInfo { |
| ElementCount MainLoopVF = ElementCount::getFixed(0); |
| unsigned MainLoopUF = 0; |
| ElementCount EpilogueVF = ElementCount::getFixed(0); |
| unsigned EpilogueUF = 0; |
| BasicBlock *MainLoopIterationCountCheck = nullptr; |
| BasicBlock *EpilogueIterationCountCheck = nullptr; |
| BasicBlock *SCEVSafetyCheck = nullptr; |
| BasicBlock *MemSafetyCheck = nullptr; |
| Value *TripCount = nullptr; |
| Value *VectorTripCount = nullptr; |
| |
| EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, |
| ElementCount EVF, unsigned EUF) |
| : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) { |
| assert(EUF == 1 && |
| "A high UF for the epilogue loop is likely not beneficial."); |
| } |
| }; |
| |
| /// An extension of the inner loop vectorizer that creates a skeleton for a |
| /// vectorized loop that has its epilogue (residual) also vectorized. |
| /// The idea is to run the vplan on a given loop twice, first to set up the |
| /// skeleton and vectorize the main loop, and second to complete the skeleton |
| /// from the first step and vectorize the epilogue. This is achieved by |
| /// deriving two concrete strategy classes from this base class and invoking |
| /// them in succession from the loop vectorizer planner. |
| class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { |
| public: |
| InnerLoopAndEpilogueVectorizer( |
| Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, |
| DominatorTree *DT, const TargetLibraryInfo *TLI, |
| const TargetTransformInfo *TTI, AssumptionCache *AC, |
| OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, |
| LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, |
| BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, |
| GeneratedRTChecks &Checks) |
| : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, |
| EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, |
| Checks), |
| EPI(EPI) {} |
| |
| // Override this function to handle the more complex control flow around the |
| // three loops. |
| BasicBlock *createVectorizedLoopSkeleton() final override { |
| return createEpilogueVectorizedLoopSkeleton(); |
| } |
| |
| /// The interface for creating a vectorized skeleton using one of two |
| /// different strategies, each corresponding to one execution of the vplan |
| /// as described above. |
| virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; |
| |
| /// Holds and updates state information required to vectorize the main loop |
| /// and its epilogue in two separate passes. This setup helps us avoid |
| /// regenerating and recomputing runtime safety checks. It also helps us to |
| /// shorten the iteration-count-check path length for the cases where the |
| /// iteration count of the loop is so small that the main vector loop is |
| /// completely skipped. |
| EpilogueLoopVectorizationInfo &EPI; |
| }; |
| |
| /// A specialized derived class of inner loop vectorizer that performs |
| /// vectorization of *main* loops in the process of vectorizing loops and their |
| /// epilogues. |
| class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { |
| public: |
| EpilogueVectorizerMainLoop( |
| Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, |
| DominatorTree *DT, const TargetLibraryInfo *TLI, |
| const TargetTransformInfo *TTI, AssumptionCache *AC, |
| OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, |
| LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, |
| BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, |
| GeneratedRTChecks &Check) |
| : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, |
| EPI, LVL, CM, BFI, PSI, Check) {} |
| /// Implements the interface for creating a vectorized skeleton using the |
| /// *main loop* strategy (i.e. the first pass of vplan execution). |
| BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; |
| |
| protected: |
| /// Emits an iteration count bypass check once for the main loop (when \p |
| /// ForEpilogue is false) and once for the epilogue loop (when \p |
| /// ForEpilogue is true). |
| BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, |
| bool ForEpilogue); |
| void printDebugTracesAtStart() override; |
| void printDebugTracesAtEnd() override; |
| }; |
| |
| // A specialized derived class of inner loop vectorizer that performs |
| // vectorization of *epilogue* loops in the process of vectorizing loops and |
| // their epilogues. |
| class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { |
| public: |
| EpilogueVectorizerEpilogueLoop( |
| Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, |
| DominatorTree *DT, const TargetLibraryInfo *TLI, |
| const TargetTransformInfo *TTI, AssumptionCache *AC, |
| OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, |
| LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, |
| BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, |
| GeneratedRTChecks &Checks) |
| : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, |
| EPI, LVL, CM, BFI, PSI, Checks) {} |
| /// Implements the interface for creating a vectorized skeleton using the |
| /// *epilogue loop* strategy (i.e. the second pass of vplan execution). |
| BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; |
| |
| protected: |
| /// Emits an iteration count bypass check after the main vector loop has |
| /// finished to see if there are any iterations left to execute by either |
| /// the vector epilogue or the scalar epilogue. |
| BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, |
| BasicBlock *Bypass, |
| BasicBlock *Insert); |
| void printDebugTracesAtStart() override; |
| void printDebugTracesAtEnd() override; |
| }; |
| } // end namespace llvm |
| |
| /// Look for a meaningful debug location on the instruction or its |
| /// operands. |
| static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { |
| if (!I) |
| return I; |
| |
| DebugLoc Empty; |
| if (I->getDebugLoc() != Empty) |
| return I; |
| |
| for (Use &Op : I->operands()) { |
| if (Instruction *OpInst = dyn_cast<Instruction>(Op)) |
| if (OpInst->getDebugLoc() != Empty) |
| return OpInst; |
| } |
| |
| return I; |
| } |
| |
| void InnerLoopVectorizer::setDebugLocFromInst( |
| const Value *V, Optional<IRBuilder<> *> CustomBuilder) { |
| IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder; |
| if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) { |
| const DILocation *DIL = Inst->getDebugLoc(); |
| |
| // When an FSDiscriminator is enabled, we don't need to add the multiply |
| // factors to the discriminators. |
| if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && |
| !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) { |
| // FIXME: For scalable vectors, assume vscale=1. |
| auto NewDIL = |
| DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); |
| if (NewDIL) |
| B->SetCurrentDebugLocation(NewDIL.getValue()); |
| else |
| LLVM_DEBUG(dbgs() |
| << "Failed to create new discriminator: " |
| << DIL->getFilename() << " Line: " << DIL->getLine()); |
| } else |
| B->SetCurrentDebugLocation(DIL); |
| } else |
| B->SetCurrentDebugLocation(DebugLoc()); |
| } |
| |
| /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I |
| /// is passed, the message relates to that particular instruction. |
| #ifndef NDEBUG |
| static void debugVectorizationMessage(const StringRef Prefix, |
| const StringRef DebugMsg, |
| Instruction *I) { |
| dbgs() << "LV: " << Prefix << DebugMsg; |
| if (I != nullptr) |
| dbgs() << " " << *I; |
| else |
| dbgs() << '.'; |
| dbgs() << '\n'; |
| } |
| #endif |
| |
| /// Create an analysis remark that explains why vectorization failed |
| /// |
| /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p |
| /// RemarkName is the identifier for the remark. If \p I is passed it is an |
| /// instruction that prevents vectorization. Otherwise \p TheLoop is used for |
| /// the location of the remark. \return the remark object that can be |
| /// streamed to. |
| static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, |
| StringRef RemarkName, Loop *TheLoop, Instruction *I) { |
| Value *CodeRegion = TheLoop->getHeader(); |
| DebugLoc DL = TheLoop->getStartLoc(); |
| |
| if (I) { |
| CodeRegion = I->getParent(); |
| // If there is no debug location attached to the instruction, fall back to |
| // using the loop's. |
| if (I->getDebugLoc()) |
| DL = I->getDebugLoc(); |
| } |
| |
| return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); |
| } |
| |
| /// Return a value for Step multiplied by VF. |
| static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, |
| int64_t Step) { |
| assert(Ty->isIntegerTy() && "Expected an integer step"); |
| Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue()); |
| return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; |
| } |
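| // For example, with Step = 2 and a fixed VF of 4 this returns the constant 8, |
| // while for a scalable VF of <vscale x 4> it returns 8 * vscale, materialized |
| // with a vscale intrinsic (assuming an integer Ty, as asserted above). |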
| |
| namespace llvm { |
| |
| /// Return the runtime value for VF. |
| Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { |
| Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); |
| return VF.isScalable() ? B.CreateVScale(EC) : EC; |
| } |
| |
| static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) { |
| assert(FTy->isFloatingPointTy() && "Expected floating point type!"); |
| Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); |
| Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); |
| return B.CreateUIToFP(RuntimeVF, FTy); |
| } |
| |
| void reportVectorizationFailure(const StringRef DebugMsg, |
| const StringRef OREMsg, const StringRef ORETag, |
| OptimizationRemarkEmitter *ORE, Loop *TheLoop, |
| Instruction *I) { |
| LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); |
| LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); |
| ORE->emit( |
| createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) |
| << "loop not vectorized: " << OREMsg); |
| } |
| |
| void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, |
| OptimizationRemarkEmitter *ORE, Loop *TheLoop, |
| Instruction *I) { |
| LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); |
| LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); |
| ORE->emit( |
| createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) |
| << Msg); |
| } |
| |
| } // end namespace llvm |
| |
| #ifndef NDEBUG |
| /// \return string containing a file name and a line # for the given loop. |
| static std::string getDebugLocString(const Loop *L) { |
| std::string Result; |
| if (L) { |
| raw_string_ostream OS(Result); |
| if (const DebugLoc LoopDbgLoc = L->getStartLoc()) |
| LoopDbgLoc.print(OS); |
| else |
| // Just print the module name. |
| OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); |
| OS.flush(); |
| } |
| return Result; |
| } |
| #endif |
| |
| void InnerLoopVectorizer::addNewMetadata(Instruction *To, |
| const Instruction *Orig) { |
| // If the loop was versioned with memchecks, add the corresponding no-alias |
| // metadata. |
| if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) |
| LVer->annotateInstWithNoAlias(To, Orig); |
| } |
| |
| void InnerLoopVectorizer::collectPoisonGeneratingRecipes( |
| VPTransformState &State) { |
| |
| // Collect recipes in the backward slice of `Root` that may generate a poison |
| // value that is used after vectorization. |
| SmallPtrSet<VPRecipeBase *, 16> Visited; |
| auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { |
| SmallVector<VPRecipeBase *, 16> Worklist; |
| Worklist.push_back(Root); |
| |
| // Traverse the backward slice of Root through its use-def chain. |
| while (!Worklist.empty()) { |
| VPRecipeBase *CurRec = Worklist.back(); |
| Worklist.pop_back(); |
| |
| if (!Visited.insert(CurRec).second) |
| continue; |
| |
| // Prune search if we find another recipe generating a widen memory |
| // instruction. Widen memory instructions involved in address computation |
| // will lead to gather/scatter instructions, which don't need to be |
| // handled. |
| if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || |
| isa<VPInterleaveRecipe>(CurRec)) |
| continue; |
| |
| // This recipe contributes to the address computation of a widen |
| // load/store. Collect recipe if its underlying instruction has |
| // poison-generating flags. |
| Instruction *Instr = CurRec->getUnderlyingInstr(); |
| if (Instr && Instr->hasPoisonGeneratingFlags()) |
| State.MayGeneratePoisonRecipes.insert(CurRec); |
| |
| // Add new definitions to the worklist. |
| for (VPValue *operand : CurRec->operands()) |
| if (VPDef *OpDef = operand->getDef()) |
| Worklist.push_back(cast<VPRecipeBase>(OpDef)); |
| } |
| }); |
| |
| // Traverse all the recipes in the VPlan and collect the poison-generating |
| // recipes in the backward slice starting at the address of a |
| // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe. |
| auto Iter = depth_first( |
| VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry())); |
| for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { |
| for (VPRecipeBase &Recipe : *VPBB) { |
| if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) { |
| Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr(); |
| VPDef *AddrDef = WidenRec->getAddr()->getDef(); |
| if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr && |
| Legal->blockNeedsPredication(UnderlyingInstr->getParent())) |
| collectPoisonGeneratingInstrsInBackwardSlice( |
| cast<VPRecipeBase>(AddrDef)); |
| } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) { |
| VPDef *AddrDef = InterleaveRec->getAddr()->getDef(); |
| if (AddrDef) { |
| // Check if any member of the interleave group needs predication. |
| const InterleaveGroup<Instruction> *InterGroup = |
| InterleaveRec->getInterleaveGroup(); |
| bool NeedPredication = false; |
| for (int I = 0, NumMembers = InterGroup->getNumMembers(); |
| I < NumMembers; ++I) { |
| Instruction *Member = InterGroup->getMember(I); |
| if (Member) |
| NeedPredication |= |
| Legal->blockNeedsPredication(Member->getParent()); |
| } |
| |
| if (NeedPredication) |
| collectPoisonGeneratingInstrsInBackwardSlice( |
| cast<VPRecipeBase>(AddrDef)); |
| } |
| } |
| } |
| } |
| } |
| |
| void InnerLoopVectorizer::addMetadata(Instruction *To, |
| Instruction *From) { |
| propagateMetadata(To, From); |
| addNewMetadata(To, From); |
| } |
| |
| void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, |
| Instruction *From) { |
| for (Value *V : To) { |
| if (Instruction *I = dyn_cast<Instruction>(V)) |
| addMetadata(I, From); |
| } |
| } |
| |
| namespace llvm { |
| |
| // Loop vectorization cost-model hints about how the scalar epilogue loop |
| // should be lowered. |
| enum ScalarEpilogueLowering { |
| |
| // The default: allowing scalar epilogues. |
| CM_ScalarEpilogueAllowed, |
| |
| // Vectorization with OptForSize: don't allow epilogues. |
| CM_ScalarEpilogueNotAllowedOptSize, |
| |
| // A special case of vectorization with OptForSize: loops with a very small |
| // trip count are considered for vectorization under OptForSize, thereby |
| // making sure the cost of their loop body is dominant, free of runtime |
| // guards and scalar iteration overheads. |
| CM_ScalarEpilogueNotAllowedLowTripLoop, |
| |
| // Loop hint predicate indicating an epilogue is undesired. |
| CM_ScalarEpilogueNotNeededUsePredicate, |
| |
| // Directive indicating we must either tail fold or not vectorize |
| CM_ScalarEpilogueNotAllowedUsePredicate |
| }; |
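| |
| // For example, for a loop with a trip count of 1003 and VF = 4, |
| // CM_ScalarEpilogueAllowed keeps a scalar epilogue that executes the 3 |
| // remaining iterations after the vector loop, whereas the predicate-based |
| // variants fold those tail iterations into the masked vector loop so that no |
| // scalar epilogue is emitted. |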
| |
| /// ElementCountComparator creates a total ordering for ElementCount |
| /// for the purposes of using it in a set structure. |
| struct ElementCountComparator { |
| bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { |
| return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < |
| std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); |
| } |
| }; |
| using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>; |
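| |
| // Under this comparator, fixed-width factors order before scalable ones, and |
| // factors of the same kind are ordered by their known minimum element count, |
| // e.g. 4 < 8 < vscale x 2 < vscale x 4. |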
| |
| /// LoopVectorizationCostModel - estimates the expected speedups due to |
| /// vectorization. |
| /// In many cases vectorization is not profitable. This can happen for a |
| /// number of reasons. In this class we mainly attempt to predict the |
| /// expected speedup/slowdowns due to the supported instruction set. We use the |
| /// TargetTransformInfo to query the different backends for the cost of |
| /// different operations. |
| class LoopVectorizationCostModel { |
| public: |
| LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, |
| PredicatedScalarEvolution &PSE, LoopInfo *LI, |
| LoopVectorizationLegality *Legal, |
| const TargetTransformInfo &TTI, |
| const TargetLibraryInfo *TLI, DemandedBits *DB, |
| AssumptionCache *AC, |
| OptimizationRemarkEmitter *ORE, const Function *F, |
| const LoopVectorizeHints *Hints, |
| InterleavedAccessInfo &IAI) |
| : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), |
| TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), |
| Hints(Hints), InterleaveInfo(IAI) {} |
| |
| /// \return An upper bound for the vectorization factors (both fixed and |
| /// scalable). If the factors are 0, vectorization and interleaving should be |
| /// avoided up front. |
| FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); |
| |
| /// \return True if runtime checks are required for vectorization, and false |
| /// otherwise. |
| bool runtimeChecksRequired(); |
| |
| /// \return The most profitable vectorization factor and the cost of that VF. |
| /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO |
| /// then this vectorization factor will be selected if vectorization is |
| /// possible. |
| VectorizationFactor |
| selectVectorizationFactor(const ElementCountSet &CandidateVFs); |
| |
| VectorizationFactor |
| selectEpilogueVectorizationFactor(const ElementCount MaxVF, |
| const LoopVectorizationPlanner &LVP); |
| |
| /// Setup cost-based decisions for user vectorization factor. |
| /// \return true if the UserVF is a feasible VF to be chosen. |
| bool selectUserVectorizationFactor(ElementCount UserVF) { |
| collectUniformsAndScalars(UserVF); |
| collectInstsToScalarize(UserVF); |
| return expectedCost(UserVF).first.isValid(); |
| } |
| |
| /// \return The size (in bits) of the smallest and widest types in the code |
| /// that needs to be vectorized. We ignore values that remain scalar such as |
| /// 64 bit loop indices. |
| std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); |
| |
| /// \return The desired interleave count. |
| /// If interleave count has been specified by metadata it will be returned. |
| /// Otherwise, the interleave count is computed and returned. VF and LoopCost |
| /// are the selected vectorization factor and the cost of the selected VF. |
| unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); |
| |
| /// Memory access instruction may be vectorized in more than one way. |
| /// Form of instruction after vectorization depends on cost. |
| /// This function takes cost-based decisions for Load/Store instructions |
| /// and collects them in a map. This decisions map is used for building |
| /// the lists of loop-uniform and loop-scalar instructions. |
| /// The calculated cost is saved with widening decision in order to |
| /// avoid redundant calculations. |
| void setCostBasedWideningDecision(ElementCount VF); |
| |
| /// A struct that represents some properties of the register usage |
| /// of a loop. |
| struct RegisterUsage { |
| /// Holds the number of loop invariant values that are used in the loop. |
| /// The key is ClassID of target-provided register class. |
| SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; |
| /// Holds the maximum number of concurrent live intervals in the loop. |
| /// The key is ClassID of target-provided register class. |
| SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; |
| }; |
| |
| /// \return Returns information about the register usages of the loop for the |
| /// given vectorization factors. |
| SmallVector<RegisterUsage, 8> |
| calculateRegisterUsage(ArrayRef<ElementCount> VFs); |
| |
| /// Collect values we want to ignore in the cost model. |
| void collectValuesToIgnore(); |
| |
| /// Collect all element types in the loop for which widening is needed. |
| void collectElementTypesForWidening(); |
| |
|   /// Split reductions into those that happen in the loop, and those that happen |
|   /// outside. In-loop reductions are collected into InLoopReductionChains. |
| void collectInLoopReductions(); |
| |
| /// Returns true if we should use strict in-order reductions for the given |
| /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, |
| /// the IsOrdered flag of RdxDesc is set and we do not allow reordering |
| /// of FP operations. |
| bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { |
| return !Hints->allowReordering() && RdxDesc.isOrdered(); |
| } |
| |
| /// \returns The smallest bitwidth each instruction can be represented with. |
| /// The vector equivalents of these instructions should be truncated to this |
| /// type. |
| const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { |
| return MinBWs; |
| } |
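| |
|   // For example, if an i32 addition in the loop is only ever used through a |
|   // truncate to i8 (e.g. it feeds an i8 store), MinBWs can record 8 for it |
|   // and the vectorized add may be performed on <VF x i8> instead of |
|   // <VF x i32>. |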
| |
| /// \returns True if it is more profitable to scalarize instruction \p I for |
| /// vectorization factor \p VF. |
| bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { |
| assert(VF.isVector() && |
| "Profitable to scalarize relevant only for VF > 1."); |
| |
| // Cost model is not run in the VPlan-native path - return conservative |
| // result until this changes. |
| if (EnableVPlanNativePath) |
| return false; |
| |
| auto Scalars = InstsToScalarize.find(VF); |
| assert(Scalars != InstsToScalarize.end() && |
| "VF not yet analyzed for scalarization profitability"); |
| return Scalars->second.find(I) != Scalars->second.end(); |
| } |
| |
| /// Returns true if \p I is known to be uniform after vectorization. |
| bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { |
| if (VF.isScalar()) |
| return true; |
| |
| // Cost model is not run in the VPlan-native path - return conservative |
| // result until this changes. |
| if (EnableVPlanNativePath) |
| return false; |
| |
| auto UniformsPerVF = Uniforms.find(VF); |
| assert(UniformsPerVF != Uniforms.end() && |
| "VF not yet analyzed for uniformity"); |
| return UniformsPerVF->second.count(I); |
| } |
| |
| /// Returns true if \p I is known to be scalar after vectorization. |
| bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { |
| if (VF.isScalar()) |
| return true; |
| |
| // Cost model is not run in the VPlan-native path - return conservative |
| // result until this changes. |
| if (EnableVPlanNativePath) |
| return false; |
| |
| auto ScalarsPerVF = Scalars.find(VF); |
| assert(ScalarsPerVF != Scalars.end() && |
| "Scalar values are not calculated for VF"); |
| return ScalarsPerVF->second.count(I); |
| } |
| |
| /// \returns True if instruction \p I can be truncated to a smaller bitwidth |
| /// for vectorization factor \p VF. |
| bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { |
| return VF.isVector() && MinBWs.find(I) != MinBWs.end() && |
| !isProfitableToScalarize(I, VF) && |
| !isScalarAfterVectorization(I, VF); |
| } |
| |
| /// Decision that was taken during cost calculation for memory instruction. |
| enum InstWidening { |
| CM_Unknown, |
| CM_Widen, // For consecutive accesses with stride +1. |
| CM_Widen_Reverse, // For consecutive accesses with stride -1. |
| CM_Interleave, |
| CM_GatherScatter, |
| CM_Scalarize |
| }; |
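| |
|   // As a rough illustration (hypothetical accesses, actual decisions depend |
|   // on target costs): a load of A[i] maps to CM_Widen, a load of A[N - i] to |
|   // CM_Widen_Reverse, the pair A[2*i] / A[2*i+1] to CM_Interleave, an |
|   // indirect access A[B[i]] to CM_GatherScatter, and accesses the target |
|   // cannot handle as vector memory operations to CM_Scalarize. |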
| |
| /// Save vectorization decision \p W and \p Cost taken by the cost model for |
| /// instruction \p I and vector width \p VF. |
| void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, |
| InstructionCost Cost) { |
| assert(VF.isVector() && "Expected VF >=2"); |
| WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); |
| } |
| |
| /// Save vectorization decision \p W and \p Cost taken by the cost model for |
| /// interleaving group \p Grp and vector width \p VF. |
| void setWideningDecision(const InterleaveGroup<Instruction> *Grp, |
| ElementCount VF, InstWidening W, |
| InstructionCost Cost) { |
| assert(VF.isVector() && "Expected VF >=2"); |
|     // Broadcast this decision to all instructions inside the group. |
|     // But the cost will be assigned to one instruction only. |
| for (unsigned i = 0; i < Grp->getFactor(); ++i) { |
| if (auto *I = Grp->getMember(i)) { |
| if (Grp->getInsertPos() == I) |
| WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); |
| else |
| WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); |
| } |
| } |
| } |
| |
| /// Return the cost model decision for the given instruction \p I and vector |
| /// width \p VF. Return CM_Unknown if this instruction did not pass |
| /// through the cost modeling. |
| InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { |
| assert(VF.isVector() && "Expected VF to be a vector VF"); |
| // Cost model is not run in the VPlan-native path - return conservative |
| // result until this changes. |
| if (EnableVPlanNativePath) |
| return CM_GatherScatter; |
| |
| std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); |
| auto Itr = WideningDecisions.find(InstOnVF); |
| if (Itr == WideningDecisions.end()) |
| return CM_Unknown; |
| return Itr->second.first; |
| } |
| |
| /// Return the vectorization cost for the given instruction \p I and vector |
| /// width \p VF. |
| InstructionCost getWideningCost(Instruction *I, ElementCount VF) { |
| assert(VF.isVector() && "Expected VF >=2"); |
| std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); |
| assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && |
| "The cost is not calculated"); |
| return WideningDecisions[InstOnVF].second; |
| } |
| |
| /// Return True if instruction \p I is an optimizable truncate whose operand |
| /// is an induction variable. Such a truncate will be removed by adding a new |
| /// induction variable with the destination type. |
| bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { |
| // If the instruction is not a truncate, return false. |
| auto *Trunc = dyn_cast<TruncInst>(I); |
| if (!Trunc) |
| return false; |
| |
| // Get the source and destination types of the truncate. |
| Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); |
| Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); |
| |
| // If the truncate is free for the given types, return false. Replacing a |
| // free truncate with an induction variable would add an induction variable |
| // update instruction to each iteration of the loop. We exclude from this |
| // check the primary induction variable since it will need an update |
| // instruction regardless. |
| Value *Op = Trunc->getOperand(0); |
| if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) |
| return false; |
| |
| // If the truncated value is not an induction variable, return false. |
| return Legal->isInductionPhi(Op); |
| } |
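| |
|   // For example, given an induction "%iv = phi i64 ..." and a user |
|   // "%t = trunc i64 %iv to i32" that is not free on the target, the truncate |
|   // can be removed by introducing a new i32 induction variable that produces |
|   // the truncated values directly. |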
| |
| /// Collects the instructions to scalarize for each predicated instruction in |
| /// the loop. |
| void collectInstsToScalarize(ElementCount VF); |
| |
| /// Collect Uniform and Scalar values for the given \p VF. |
| /// The sets depend on CM decision for Load/Store instructions |
| /// that may be vectorized as interleave, gather-scatter or scalarized. |
| void collectUniformsAndScalars(ElementCount VF) { |
| // Do the analysis once. |
| if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) |
| return; |
| setCostBasedWideningDecision(VF); |
| collectLoopUniforms(VF); |
| collectLoopScalars(VF); |
| } |
| |
| /// Returns true if the target machine supports masked store operation |
| /// for the given \p DataType and kind of access to \p Ptr. |
| bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { |
| return Legal->isConsecutivePtr(DataType, Ptr) && |
| TTI.isLegalMaskedStore(DataType, Alignment); |
| } |
| |
| /// Returns true if the target machine supports masked load operation |
| /// for the given \p DataType and kind of access to \p Ptr. |
| bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { |
| return Legal->isConsecutivePtr(DataType, Ptr) && |
| TTI.isLegalMaskedLoad(DataType, Alignment); |
| } |
| |
| /// Returns true if the target machine can represent \p V as a masked gather |
| /// or scatter operation. |
| bool isLegalGatherOrScatter(Value *V) { |
| bool LI = isa<LoadInst>(V); |
| bool SI = isa<StoreInst>(V); |
| if (!LI && !SI) |
| return false; |
| auto *Ty = getLoadStoreType(V); |
| Align Align = getLoadStoreAlignment(V); |
| return (LI && TTI.isLegalMaskedGather(Ty, Align)) || |
| (SI && TTI.isLegalMaskedScatter(Ty, Align)); |
| } |
| |
| /// Returns true if the target machine supports all of the reduction |
| /// variables found for the given VF. |
| bool canVectorizeReductions(ElementCount VF) const { |
| return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { |
| const RecurrenceDescriptor &RdxDesc = Reduction.second; |
| return TTI.isLegalToVectorizeReduction(RdxDesc, VF); |
| })); |
| } |
| |
| /// Returns true if \p I is an instruction that will be scalarized with |
| /// predication. Such instructions include conditional stores and |
| /// instructions that may divide by zero. |
| /// If a non-zero VF has been calculated, we check if I will be scalarized |
|   /// with predication for that VF. |
| bool isScalarWithPredication(Instruction *I) const; |
| |
| // Returns true if \p I is an instruction that will be predicated either |
| // through scalar predication or masked load/store or masked gather/scatter. |
| // Superset of instructions that return true for isScalarWithPredication. |
| bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) { |
| // When we know the load is uniform and the original scalar loop was not |
| // predicated we don't need to mark it as a predicated instruction. Any |
| // vectorised blocks created when tail-folding are something artificial we |
| // have introduced and we know there is always at least one active lane. |
| // That's why we call Legal->blockNeedsPredication here because it doesn't |
| // query tail-folding. |
| if (IsKnownUniform && isa<LoadInst>(I) && |
| !Legal->blockNeedsPredication(I->getParent())) |
| return false; |
| if (!blockNeedsPredicationForAnyReason(I->getParent())) |
| return false; |
| // Loads and stores that need some form of masked operation are predicated |
| // instructions. |
| if (isa<LoadInst>(I) || isa<StoreInst>(I)) |
| return Legal->isMaskRequired(I); |
| return isScalarWithPredication(I); |
| } |
| |
| /// Returns true if \p I is a memory instruction with consecutive memory |
| /// access that can be widened. |
| bool |
| memoryInstructionCanBeWidened(Instruction *I, |
| ElementCount VF = ElementCount::getFixed(1)); |
| |
| /// Returns true if \p I is a memory instruction in an interleaved-group |
| /// of memory accesses that can be vectorized with wide vector loads/stores |
| /// and shuffles. |
| bool |
| interleavedAccessCanBeWidened(Instruction *I, |
| ElementCount VF = ElementCount::getFixed(1)); |
| |
| /// Check if \p Instr belongs to any interleaved access group. |
| bool isAccessInterleaved(Instruction *Instr) { |
| return InterleaveInfo.isInterleaved(Instr); |
| } |
| |
| /// Get the interleaved access group that \p Instr belongs to. |
| const InterleaveGroup<Instruction> * |
| getInterleavedAccessGroup(Instruction *Instr) { |
| return InterleaveInfo.getInterleaveGroup(Instr); |
| } |
| |
| /// Returns true if we're required to use a scalar epilogue for at least |
| /// the final iteration of the original loop. |
| bool requiresScalarEpilogue(ElementCount VF) const { |
| if (!isScalarEpilogueAllowed()) |
| return false; |
| // If we might exit from anywhere but the latch, must run the exiting |
| // iteration in scalar form. |
| if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) |
| return true; |
| return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); |
| } |
| |
|   /// Returns true if a scalar epilogue is allowed, i.e. it was not disabled |
|   /// due to optsize or a loop hint annotation. |
| bool isScalarEpilogueAllowed() const { |
| return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; |
| } |
| |
| /// Returns true if all loop blocks should be masked to fold tail loop. |
| bool foldTailByMasking() const { return FoldTailByMasking; } |
| |
| /// Returns true if the instructions in this block requires predication |
| /// for any reason, e.g. because tail folding now requires a predicate |
| /// or because the block in the original loop was predicated. |
| bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { |
| return foldTailByMasking() || Legal->blockNeedsPredication(BB); |
| } |
| |
| /// A SmallMapVector to store the InLoop reduction op chains, mapping phi |
| /// nodes to the chain of instructions representing the reductions. Uses a |
| /// MapVector to ensure deterministic iteration order. |
| using ReductionChainMap = |
| SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; |
| |
| /// Return the chain of instructions representing an inloop reduction. |
| const ReductionChainMap &getInLoopReductionChains() const { |
| return InLoopReductionChains; |
| } |
| |
| /// Returns true if the Phi is part of an inloop reduction. |
| bool isInLoopReduction(PHINode *Phi) const { |
| return InLoopReductionChains.count(Phi); |
| } |
| |
| /// Estimate cost of an intrinsic call instruction CI if it were vectorized |
| /// with factor VF. Return the cost of the instruction, including |
| /// scalarization overhead if it's needed. |
| InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; |
| |
| /// Estimate cost of a call instruction CI if it were vectorized with factor |
| /// VF. Return the cost of the instruction, including scalarization overhead |
| /// if it's needed. The flag NeedToScalarize shows if the call needs to be |
|   /// scalarized, i.e. either a vector version isn't available or it is too |
|   /// expensive. |
| InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, |
| bool &NeedToScalarize) const; |
| |
| /// Returns true if the per-lane cost of VectorizationFactor A is lower than |
| /// that of B. |
| bool isMoreProfitable(const VectorizationFactor &A, |
| const VectorizationFactor &B) const; |
| |
| /// Invalidates decisions already taken by the cost model. |
| void invalidateCostModelingDecisions() { |
| WideningDecisions.clear(); |
| Uniforms.clear(); |
| Scalars.clear(); |
| } |
| |
| private: |
| unsigned NumPredStores = 0; |
| |
| /// \return An upper bound for the vectorization factors for both |
| /// fixed and scalable vectorization, where the minimum-known number of |
| /// elements is a power-of-2 larger than zero. If scalable vectorization is |
| /// disabled or unsupported, then the scalable part will be equal to |
| /// ElementCount::getScalable(0). |
| FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, |
| ElementCount UserVF); |
| |
| /// \return the maximized element count based on the targets vector |
| /// registers and the loop trip-count, but limited to a maximum safe VF. |
| /// This is a helper function of computeFeasibleMaxVF. |
| /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure |
| /// issue that occurred on one of the buildbots which cannot be reproduced |
|   /// without having access to the proprietary compiler (see comments on |
| /// D98509). The issue is currently under investigation and this workaround |
| /// will be removed as soon as possible. |
| ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, |
| unsigned SmallestType, |
| unsigned WidestType, |
| const ElementCount &MaxSafeVF); |
| |
| /// \return the maximum legal scalable VF, based on the safe max number |
| /// of elements. |
| ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); |
| |
| /// The vectorization cost is a combination of the cost itself and a boolean |
| /// indicating whether any of the contributing operations will actually |
| /// operate on vector values after type legalization in the backend. If this |
| /// latter value is false, then all operations will be scalarized (i.e. no |
| /// vectorization has actually taken place). |
| using VectorizationCostTy = std::pair<InstructionCost, bool>; |
| |
| /// Returns the expected execution cost. The unit of the cost does |
| /// not matter because we use the 'cost' units to compare different |
| /// vector widths. The cost that is returned is *not* normalized by |
| /// the factor width. If \p Invalid is not nullptr, this function |
| /// will add a pair(Instruction*, ElementCount) to \p Invalid for |
| /// each instruction that has an Invalid cost for the given VF. |
| using InstructionVFPair = std::pair<Instruction *, ElementCount>; |
| VectorizationCostTy |
| expectedCost(ElementCount VF, |
| SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); |
| |
| /// Returns the execution time cost of an instruction for a given vector |
| /// width. Vector width of one means scalar. |
| VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); |
| |
| /// The cost-computation logic from getInstructionCost which provides |
| /// the vector type as an output parameter. |
| InstructionCost getInstructionCost(Instruction *I, ElementCount VF, |
| Type *&VectorTy); |
| |
| /// Return the cost of instructions in an inloop reduction pattern, if I is |
| /// part of that pattern. |
| Optional<InstructionCost> |
| getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, |
| TTI::TargetCostKind CostKind); |
| |
| /// Calculate vectorization cost of memory instruction \p I. |
| InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); |
| |
| /// The cost computation for scalarized memory instruction. |
| InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); |
| |
| /// The cost computation for interleaving group of memory instructions. |
| InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); |
| |
| /// The cost computation for Gather/Scatter instruction. |
| InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); |
| |
| /// The cost computation for widening instruction \p I with consecutive |
| /// memory access. |
| InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); |
| |
| /// The cost calculation for Load/Store instruction \p I with uniform pointer - |
| /// Load: scalar load + broadcast. |
| /// Store: scalar store + (loop invariant value stored? 0 : extract of last |
| /// element) |
| InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); |
| |
| /// Estimate the overhead of scalarizing an instruction. This is a |
| /// convenience wrapper for the type-based getScalarizationOverhead API. |
| InstructionCost getScalarizationOverhead(Instruction *I, |
| ElementCount VF) const; |
| |
|   /// Returns whether the instruction is a load or store and will be emitted |
| /// as a vector operation. |
| bool isConsecutiveLoadOrStore(Instruction *I); |
| |
| /// Returns true if an artificially high cost for emulated masked memrefs |
| /// should be used. |
| bool useEmulatedMaskMemRefHack(Instruction *I); |
| |
| /// Map of scalar integer values to the smallest bitwidth they can be legally |
| /// represented as. The vector equivalents of these values should be truncated |
| /// to this type. |
| MapVector<Instruction *, uint64_t> MinBWs; |
| |
| /// A type representing the costs for instructions if they were to be |
| /// scalarized rather than vectorized. The entries are Instruction-Cost |
| /// pairs. |
| using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; |
| |
|   /// A set containing all BasicBlocks that are known to be present after |
|   /// vectorization as predicated blocks. |
| SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; |
| |
| /// Records whether it is allowed to have the original scalar loop execute at |
| /// least once. This may be needed as a fallback loop in case runtime |
| /// aliasing/dependence checks fail, or to handle the tail/remainder |
|   /// iterations when the trip count is unknown or is not a multiple of the VF, |
| /// or as a peel-loop to handle gaps in interleave-groups. |
| /// Under optsize and when the trip count is very small we don't allow any |
| /// iterations to execute in the scalar loop. |
| ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; |
| |
| /// All blocks of loop are to be masked to fold tail of scalar iterations. |
| bool FoldTailByMasking = false; |
| |
| /// A map holding scalar costs for different vectorization factors. The |
| /// presence of a cost for an instruction in the mapping indicates that the |
| /// instruction will be scalarized when vectorizing with the associated |
| /// vectorization factor. The entries are VF-ScalarCostTy pairs. |
| DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; |
| |
| /// Holds the instructions known to be uniform after vectorization. |
| /// The data is collected per VF. |
| DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; |
| |
| /// Holds the instructions known to be scalar after vectorization. |
| /// The data is collected per VF. |
| DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; |
| |
| /// Holds the instructions (address computations) that are forced to be |
| /// scalarized. |
| DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; |
| |
| /// PHINodes of the reductions that should be expanded in-loop along with |
| /// their associated chains of reduction operations, in program order from top |
| /// (PHI) to bottom |
| ReductionChainMap InLoopReductionChains; |
| |
| /// A Map of inloop reduction operations and their immediate chain operand. |
| /// FIXME: This can be removed once reductions can be costed correctly in |
| /// vplan. This was added to allow quick lookup to the inloop operations, |
| /// without having to loop through InLoopReductionChains. |
| DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; |
| |
| /// Returns the expected difference in cost from scalarizing the expression |
| /// feeding a predicated instruction \p PredInst. The instructions to |
| /// scalarize and their scalar costs are collected in \p ScalarCosts. A |
| /// non-negative return value implies the expression will be scalarized. |
| /// Currently, only single-use chains are considered for scalarization. |
| int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, |
| ElementCount VF); |
| |
| /// Collect the instructions that are uniform after vectorization. An |
| /// instruction is uniform if we represent it with a single scalar value in |
| /// the vectorized loop corresponding to each vector iteration. Examples of |
| /// uniform instructions include pointer operands of consecutive or |
| /// interleaved memory accesses. Note that although uniformity implies an |
| /// instruction will be scalar, the reverse is not true. In general, a |
| /// scalarized instruction will be represented by VF scalar values in the |
| /// vectorized loop, each corresponding to an iteration of the original |
| /// scalar loop. |
| void collectLoopUniforms(ElementCount VF); |
| |
| /// Collect the instructions that are scalar after vectorization. An |
| /// instruction is scalar if it is known to be uniform or will be scalarized |
| /// during vectorization. collectLoopScalars should only add non-uniform nodes |
| /// to the list if they are used by a load/store instruction that is marked as |
| /// CM_Scalarize. Non-uniform scalarized instructions will be represented by |
| /// VF values in the vectorized loop, each corresponding to an iteration of |
| /// the original scalar loop. |
| void collectLoopScalars(ElementCount VF); |
| |
| /// Keeps cost model vectorization decision and cost for instructions. |
| /// Right now it is used for memory instructions only. |
| using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, |
| std::pair<InstWidening, InstructionCost>>; |
| |
| DecisionList WideningDecisions; |
| |
| /// Returns true if \p V is expected to be vectorized and it needs to be |
| /// extracted. |
| bool needsExtract(Value *V, ElementCount VF) const { |
| Instruction *I = dyn_cast<Instruction>(V); |
| if (VF.isScalar() || !I || !TheLoop->contains(I) || |
| TheLoop->isLoopInvariant(I)) |
| return false; |
| |
| // Assume we can vectorize V (and hence we need extraction) if the |
| // scalars are not computed yet. This can happen, because it is called |
| // via getScalarizationOverhead from setCostBasedWideningDecision, before |
| // the scalars are collected. That should be a safe assumption in most |
| // cases, because we check if the operands have vectorizable types |
| // beforehand in LoopVectorizationLegality. |
| return Scalars.find(VF) == Scalars.end() || |
| !isScalarAfterVectorization(I, VF); |
| }; |
| |
| /// Returns a range containing only operands needing to be extracted. |
| SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, |
| ElementCount VF) const { |
| return SmallVector<Value *, 4>(make_filter_range( |
| Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); |
| } |
| |
| /// Determines if we have the infrastructure to vectorize loop \p L and its |
| /// epilogue, assuming the main loop is vectorized by \p VF. |
| bool isCandidateForEpilogueVectorization(const Loop &L, |
| const ElementCount VF) const; |
| |
| /// Returns true if epilogue vectorization is considered profitable, and |
| /// false otherwise. |
| /// \p VF is the vectorization factor chosen for the original loop. |
| bool isEpilogueVectorizationProfitable(const ElementCount VF) const; |
| |
| public: |
| /// The loop that we evaluate. |
| Loop *TheLoop; |
| |
| /// Predicated scalar evolution analysis. |
| PredicatedScalarEvolution &PSE; |
| |
| /// Loop Info analysis. |
| LoopInfo *LI; |
| |
| /// Vectorization legality. |
| LoopVectorizationLegality *Legal; |
| |
| /// Vector target information. |
| const TargetTransformInfo &TTI; |
| |
| /// Target Library Info. |
| const TargetLibraryInfo *TLI; |
| |
| /// Demanded bits analysis. |
| DemandedBits *DB; |
| |
| /// Assumption cache. |
| AssumptionCache *AC; |
| |
| /// Interface to emit optimization remarks. |
| OptimizationRemarkEmitter *ORE; |
| |
| const Function *TheFunction; |
| |
| /// Loop Vectorize Hint. |
| const LoopVectorizeHints *Hints; |
| |
| /// The interleave access information contains groups of interleaved accesses |
| /// with the same stride and close to each other. |
| InterleavedAccessInfo &InterleaveInfo; |
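| |
|   // For example, in a loop that reads A[2*i] and A[2*i+1] on every iteration, |
|   // the two loads have the same stride and are adjacent in memory, so |
|   // InterleaveInfo groups them into one interleave group of factor 2 that can |
|   // be vectorized with a single wide load plus shuffles. |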
| |
| /// Values to ignore in the cost model. |
| SmallPtrSet<const Value *, 16> ValuesToIgnore; |
| |
| /// Values to ignore in the cost model when VF > 1. |
| SmallPtrSet<const Value *, 16> VecValuesToIgnore; |
| |
| /// All element types found in the loop. |
| SmallPtrSet<Type *, 16> ElementTypesInLoop; |
| |
| /// Profitable vector factors. |
| SmallVector<VectorizationFactor, 8> ProfitableVFs; |
| }; |
| } // end namespace llvm |
| |
| /// Helper struct to manage generating runtime checks for vectorization. |
| /// |
| /// The runtime checks are created up-front in temporary blocks, un-linked from |
| /// the existing IR, to allow a better cost estimate. After deciding to |
| /// vectorize, the checks are moved back. If deciding not to vectorize, the |
| /// temporary blocks are completely removed. |
| class GeneratedRTChecks { |
| /// Basic block which contains the generated SCEV checks, if any. |
| BasicBlock *SCEVCheckBlock = nullptr; |
| |
| /// The value representing the result of the generated SCEV checks. If it is |
| /// nullptr, either no SCEV checks have been generated or they have been used. |
| Value *SCEVCheckCond = nullptr; |
| |
| /// Basic block which contains the generated memory runtime checks, if any. |
| BasicBlock *MemCheckBlock = nullptr; |
| |
| /// The value representing the result of the generated memory runtime checks. |
| /// If it is nullptr, either no memory runtime checks have been generated or |
| /// they have been used. |
| Value *MemRuntimeCheckCond = nullptr; |
| |
| DominatorTree *DT; |
| LoopInfo *LI; |
| |
| SCEVExpander SCEVExp; |
| SCEVExpander MemCheckExp; |
| |
| public: |
| GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, |
| const DataLayout &DL) |
| : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), |
| MemCheckExp(SE, DL, "scev.check") {} |
| |
| /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can |
| /// accurately estimate the cost of the runtime checks. The blocks are |
|   /// un-linked from the IR and are added back during vector code generation. If |
| /// there is no vector code generation, the check blocks are removed |
| /// completely. |
| void Create(Loop *L, const LoopAccessInfo &LAI, |
| const SCEVUnionPredicate &UnionPred) { |
| |
| BasicBlock *LoopHeader = L->getHeader(); |
| BasicBlock *Preheader = L->getLoopPreheader(); |
| |
| // Use SplitBlock to create blocks for SCEV & memory runtime checks to |
| // ensure the blocks are properly added to LoopInfo & DominatorTree. Those |
| // may be used by SCEVExpander. The blocks will be un-linked from their |
| // predecessors and removed from LI & DT at the end of the function. |
| if (!UnionPred.isAlwaysTrue()) { |
| SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, |
| nullptr, "vector.scevcheck"); |
| |
| SCEVCheckCond = SCEVExp.expandCodeForPredicate( |
| &UnionPred, SCEVCheckBlock->getTerminator()); |
| } |
| |
| const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); |
| if (RtPtrChecking.Need) { |
| auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; |
| MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, |
| "vector.memcheck"); |
| |
| MemRuntimeCheckCond = |
| addRuntimeChecks(MemCheckBlock->getTerminator(), L, |
| RtPtrChecking.getChecks(), MemCheckExp); |
| assert(MemRuntimeCheckCond && |
| "no RT checks generated although RtPtrChecking " |
| "claimed checks are required"); |
| } |
| |
| if (!MemCheckBlock && !SCEVCheckBlock) |
| return; |
| |
|     // Unhook the temporary blocks with the checks and update the various |
|     // places accordingly. |
| if (SCEVCheckBlock) |
| SCEVCheckBlock->replaceAllUsesWith(Preheader); |
| if (MemCheckBlock) |
| MemCheckBlock->replaceAllUsesWith(Preheader); |
| |
| if (SCEVCheckBlock) { |
| SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); |
| new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); |
| Preheader->getTerminator()->eraseFromParent(); |
| } |
| if (MemCheckBlock) { |
| MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); |
| new UnreachableInst(Preheader->getContext(), MemCheckBlock); |
| Preheader->getTerminator()->eraseFromParent(); |
| } |
| |
| DT->changeImmediateDominator(LoopHeader, Preheader); |
| if (MemCheckBlock) { |
| DT->eraseNode(MemCheckBlock); |
| LI->removeBlock(MemCheckBlock); |
| } |
| if (SCEVCheckBlock) { |
| DT->eraseNode(SCEVCheckBlock); |
| LI->removeBlock(SCEVCheckBlock); |
| } |
| } |
| |
| /// Remove the created SCEV & memory runtime check blocks & instructions, if |
| /// unused. |
| ~GeneratedRTChecks() { |
| SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); |
| SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); |
| if (!SCEVCheckCond) |
| SCEVCleaner.markResultUsed(); |
| |
| if (!MemRuntimeCheckCond) |
| MemCheckCleaner.markResultUsed(); |
| |
| if (MemRuntimeCheckCond) { |
| auto &SE = *MemCheckExp.getSE(); |
| // Memory runtime check generation creates compares that use expanded |
| // values. Remove them before running the SCEVExpanderCleaners. |
| for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { |
| if (MemCheckExp.isInsertedInstruction(&I)) |
| continue; |
| SE.forgetValue(&I); |
| I.eraseFromParent(); |
| } |
| } |
| MemCheckCleaner.cleanup(); |
| SCEVCleaner.cleanup(); |
| |
| if (SCEVCheckCond) |
| SCEVCheckBlock->eraseFromParent(); |
| if (MemRuntimeCheckCond) |
| MemCheckBlock->eraseFromParent(); |
| } |
| |
| /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and |
| /// adjusts the branches to branch to the vector preheader or \p Bypass, |
| /// depending on the generated condition. |
| BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, |
| BasicBlock *LoopVectorPreHeader, |
| BasicBlock *LoopExitBlock) { |
| if (!SCEVCheckCond) |
| return nullptr; |
| if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) |
| if (C->isZero()) |
| return nullptr; |
| |
| auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); |
| |
| BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); |
|     // If the vector loop preheader is part of an outer loop, add the SCEV |
|     // check block to that loop as well to keep LoopInfo consistent. |
| if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) |
| PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); |
| |
| SCEVCheckBlock->getTerminator()->eraseFromParent(); |
| SCEVCheckBlock->moveBefore(LoopVectorPreHeader); |
| Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, |
| SCEVCheckBlock); |
| |
| DT->addNewBlock(SCEVCheckBlock, Pred); |
| DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); |
| |
| ReplaceInstWithInst( |
| SCEVCheckBlock->getTerminator(), |
| BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); |
| // Mark the check as used, to prevent it from being removed during cleanup. |
| SCEVCheckCond = nullptr; |
| return SCEVCheckBlock; |
| } |
| |
| /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts |
| /// the branches to branch to the vector preheader or \p Bypass, depending on |
| /// the generated condition. |
| BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, |
| BasicBlock *LoopVectorPreHeader) { |
| // Check if we generated code that checks in runtime if arrays overlap. |
| if (!MemRuntimeCheckCond) |
| return nullptr; |
| |
| auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); |
| Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, |
| MemCheckBlock); |
| |
| DT->addNewBlock(MemCheckBlock, Pred); |
| DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); |
| MemCheckBlock->moveBefore(LoopVectorPreHeader); |
| |
| if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) |
| PL->addBasicBlockToLoop(MemCheckBlock, *LI); |
| |
| ReplaceInstWithInst( |
| MemCheckBlock->getTerminator(), |
| BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); |
| MemCheckBlock->getTerminator()->setDebugLoc( |
| Pred->getTerminator()->getDebugLoc()); |
| |
| // Mark the check as used, to prevent it from being removed during cleanup. |
| MemRuntimeCheckCond = nullptr; |
| return MemCheckBlock; |
| } |
| }; |
| |
| // Return true if \p OuterLp is an outer loop annotated with hints for explicit |
| // vectorization. The loop needs to be annotated with #pragma omp simd |
| // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the |
| // vector length information is not provided, vectorization is not considered |
| // explicit. Interleave hints are not allowed either. These limitations will be |
| // relaxed in the future. |
| // Please note that we are currently forced to abuse the pragma 'clang loop |
| // vectorize' semantics. This pragma provides *auto-vectorization hints* |
| // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' |
| // provides *explicit vectorization hints* (LV can bypass legal checks and |
| // assume that vectorization is legal). However, both hints are implemented |
| // using the same metadata (llvm.loop.vectorize, processed by |
| // LoopVectorizeHints). This will be fixed in the future when the native IR |
| // representation for pragma 'omp simd' is introduced. |
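| // For example, the outer loop below would be considered for explicit |
| // vectorization when annotated with either "#pragma omp simd simdlen(4)" or |
| // "#pragma clang loop vectorize(enable) vectorize_width(4)": |
| // |
| //   for (int i = 0; i < N; ++i)   // annotated outer loop |
| //     for (int j = 0; j < M; ++j) |
| //       Sum[i] += A[i][j]; |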
| static bool isExplicitVecOuterLoop(Loop *OuterLp, |
| OptimizationRemarkEmitter *ORE) { |
| assert(!OuterLp->isInnermost() && "This is not an outer loop"); |
| LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); |
| |
| // Only outer loops with an explicit vectorization hint are supported. |
| // Unannotated outer loops are ignored. |
| if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) |
| return false; |
| |
| Function *Fn = OuterLp->getHeader()->getParent(); |
| if (!Hints.allowVectorization(Fn, OuterLp, |
| true /*VectorizeOnlyWhenForced*/)) { |
| LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); |
| return false; |
| } |
| |
| if (Hints.getInterleave() > 1) { |
| // TODO: Interleave support is future work. |
| LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " |
| "outer loops.\n"); |
| Hints.emitRemarkWithHints(); |
| return false; |
| } |
| |
| return true; |
| } |
| |
| static void collectSupportedLoops(Loop &L, LoopInfo *LI, |
| OptimizationRemarkEmitter *ORE, |
| SmallVectorImpl<Loop *> &V) { |
| // Collect inner loops and outer loops without irreducible control flow. For |
| // now, only collect outer loops that have explicit vectorization hints. If we |
| // are stress testing the VPlan H-CFG construction, we collect the outermost |
| // loop of every loop nest. |
| if (L.isInnermost() || VPlanBuildStressTest || |
| (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { |
| LoopBlocksRPO RPOT(&L); |
| RPOT.perform(LI); |
| if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { |
| V.push_back(&L); |
| // TODO: Collect inner loops inside marked outer loops in case |
| // vectorization fails for the outer loop. Do not invoke |
| // 'containsIrreducibleCFG' again for inner loops when the outer loop is |
| // already known to be reducible. We can use an inherited attribute for |
| // that. |
| return; |
| } |
| } |
| for (Loop *InnerL : L) |
| collectSupportedLoops(*InnerL, LI, ORE, V); |
| } |
| |
| namespace { |
| |
| /// The LoopVectorize Pass. |
| struct LoopVectorize : public FunctionPass { |
| /// Pass identification, replacement for typeid |
| static char ID; |
| |
| LoopVectorizePass Impl; |
| |
| explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, |
| bool VectorizeOnlyWhenForced = false) |
| : FunctionPass(ID), |
| Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { |
| initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); |
| } |
| |
| bool runOnFunction(Function &F) override { |
| if (skipFunction(F)) |
| return false; |
| |
| auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); |
| auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); |
| auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); |
| auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
| auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); |
| auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); |
| auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; |
| auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); |
| auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); |
| auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); |
| auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); |
| auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); |
| auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); |
| |
| std::function<const LoopAccessInfo &(Loop &)> GetLAA = |
| [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; |
| |
| return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, |
| GetLAA, *ORE, PSI).MadeAnyChange; |
| } |
| |
| void getAnalysisUsage(AnalysisUsage &AU) const override { |
| AU.addRequired<AssumptionCacheTracker>(); |
| AU.addRequired<BlockFrequencyInfoWrapperPass>(); |
| AU.addRequired<DominatorTreeWrapperPass>(); |
| AU.addRequired<LoopInfoWrapperPass>(); |
| AU.addRequired<ScalarEvolutionWrapperPass>(); |
| AU.addRequired<TargetTransformInfoWrapperPass>(); |
| AU.addRequired<AAResultsWrapperPass>(); |
| AU.addRequired<LoopAccessLegacyAnalysis>(); |
| AU.addRequired<DemandedBitsWrapperPass>(); |
| AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); |
| AU.addRequired<InjectTLIMappingsLegacy>(); |
| |
| // We currently do not preserve loopinfo/dominator analyses with outer loop |
| // vectorization. Until this is addressed, mark these analyses as preserved |
| // only for non-VPlan-native path. |
| // TODO: Preserve Loop and Dominator analyses for VPlan-native path. |
| if (!EnableVPlanNativePath) { |
| AU.addPreserved<LoopInfoWrapperPass>(); |
| AU.addPreserved<DominatorTreeWrapperPass>(); |
| } |
| |
| AU.addPreserved<BasicAAWrapperPass>(); |
| AU.addPreserved<GlobalsAAWrapperPass>(); |
| AU.addRequired<ProfileSummaryInfoWrapperPass>(); |
| } |
| }; |
| |
| } // end anonymous namespace |
| |
| //===----------------------------------------------------------------------===// |
| // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and |
| // LoopVectorizationCostModel and LoopVectorizationPlanner. |
| //===----------------------------------------------------------------------===// |
| |
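| // For a scalar %x and VF = 4, the splat created below is roughly: |
| //   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 |
| //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, |
| //                                    <4 x i32> poison, <4 x i32> zeroinitializer |
| // (exact value names and the poison/undef placeholder depend on the IRBuilder). |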
| Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { |
|   // We need to place the broadcast of invariant variables outside the loop, |
|   // but only if it's proven safe to do so. Otherwise, the broadcast will be |
|   // placed inside the vector loop body. |
| Instruction *Instr = dyn_cast<Instruction>(V); |
| bool SafeToHoist = OrigLoop->isLoopInvariant(V) && |
| (!Instr || |
| DT->dominates(Instr->getParent(), LoopVectorPreHeader)); |
| // Place the code for broadcasting invariant variables in the new preheader. |
| IRBuilder<>::InsertPointGuard Guard(Builder); |
| if (SafeToHoist) |
| Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); |
| |
| // Broadcast the scalar into all locations in the vector. |
| Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); |
| |
| return Shuf; |
| } |
| |
| void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( |
| const InductionDescriptor &II, Value *Step, Value *Start, |
| Instruction *EntryVal, VPValue *Def, VPValue *CastDef, |
| VPTransformState &State) { |
| assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && |
| "Expected either an induction phi-node or a truncate of it!"); |
| |
| // Construct the initial value of the vector IV in the vector loop preheader |
| auto CurrIP = Builder.saveIP(); |
| Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); |
| if (isa<TruncInst>(EntryVal)) { |
| assert(Start->getType()->isIntegerTy() && |
| "Truncation requires an integer type"); |
| auto *TruncType = cast<IntegerType>(EntryVal->getType()); |
| Step = Builder.CreateTrunc(Step, TruncType); |
| Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); |
| } |
| |
| Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); |
| Value *SplatStart = Builder.CreateVectorSplat(VF, Start); |
| Value *SteppedStart = |
| getStepVector(SplatStart, Zero, Step, II.getInductionOpcode()); |
| |
| // We create vector phi nodes for both integer and floating-point induction |
| // variables. Here, we determine the kind of arithmetic we will perform. |
| Instruction::BinaryOps AddOp; |
| Instruction::BinaryOps MulOp; |
| if (Step->getType()->isIntegerTy()) { |
| AddOp = Instruction::Add; |
| MulOp = Instruction::Mul; |
| } else { |
| AddOp = II.getInductionOpcode(); |
| MulOp = Instruction::FMul; |
| } |
| |
| // Multiply the vectorization factor by the step using integer or |
| // floating-point arithmetic as appropriate. |
| Type *StepType = Step->getType(); |
| Value *RuntimeVF; |
| if (Step->getType()->isFloatingPointTy()) |
| RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, VF); |
| else |
| RuntimeVF = getRuntimeVF(Builder, StepType, VF); |
| Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); |
| |
| // Create a vector splat to use in the induction update. |
| // |
| // FIXME: If the step is non-constant, we create the vector splat with |
| // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't |
| // handle a constant vector splat. |
| Value *SplatVF = isa<Constant>(Mul) |
| ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) |
| : Builder.CreateVectorSplat(VF, Mul); |
| Builder.restoreIP(CurrIP); |
| |
| // We may need to add the step a number of times, depending on the unroll |
| // factor. The last of those goes into the PHI. |
| PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", |
| &*LoopVectorBody->getFirstInsertionPt()); |
| VecInd->setDebugLoc(EntryVal->getDebugLoc()); |
| Instruction *LastInduction = VecInd; |
| for (unsigned Part = 0; Part < UF; ++Part) { |
| State.set(Def, LastInduction, Part); |
| |
| if (isa<TruncInst>(EntryVal)) |
| addMetadata(LastInduction, EntryVal); |
| recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, |
| State, Part); |
| |
| LastInduction = cast<Instruction>( |
| Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); |
| LastInduction->setDebugLoc(EntryVal->getDebugLoc()); |
| } |
| |
| // Move the last step to the end of the latch block. This ensures consistent |
| // placement of all induction updates. |
| auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); |
| auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); |
| auto *ICmp = cast<Instruction>(Br->getCondition()); |
| LastInduction->moveBefore(ICmp); |
| LastInduction->setName("vec.ind.next"); |
| |
| VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); |
| VecInd->addIncoming(LastInduction, LoopVectorLatch); |
| } |
| |
| bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { |
| return Cost->isScalarAfterVectorization(I, VF) || |
| Cost->isProfitableToScalarize(I, VF); |
| } |
| |
| bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { |
| if (shouldScalarizeInstruction(IV)) |
| return true; |
| auto isScalarInst = [&](User *U) -> bool { |
| auto *I = cast<Instruction>(U); |
| return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); |
| }; |
| return llvm::any_of(IV->users(), isScalarInst); |
| } |
| |
| void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( |
| const InductionDescriptor &ID, const Instruction *EntryVal, |
| Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, |
| unsigned Part, unsigned Lane) { |
| assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && |
| "Expected either an induction phi-node or a truncate of it!"); |
| |
|   // This induction variable is not the phi from the original loop but the |
|   // newly-created IV, based on the proof that the casted phi is equal to the |
|   // uncasted phi in the vectorized loop (possibly under a runtime guard). It |
|   // re-uses the same InductionDescriptor that the original IV uses, but we |
|   // don't have to do any recording in this case - that is done when the |
|   // original IV is processed. |
| if (isa<TruncInst>(EntryVal)) |
| return; |
| |
| if (!CastDef) { |
| assert(ID.getCastInsts().empty() && |
| "there are casts for ID, but no CastDef"); |
| return; |
| } |
| assert(!ID.getCastInsts().empty() && |
| "there is a CastDef, but no casts for ID"); |
| // Only the first Cast instruction in the Casts vector is of interest. |
|   // The rest of the Casts (if they exist) have no uses outside the |
| // induction update chain itself. |
| if (Lane < UINT_MAX) |
| State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); |
| else |
| State.set(CastDef, VectorLoopVal, Part); |
| } |
| |
| void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, |
| TruncInst *Trunc, VPValue *Def, |
| VPValue *CastDef, |
| VPTransformState &State) { |
| assert((IV->getType()->isIntegerTy() || IV != OldInduction) && |
| "Primary induction variable must have an integer type"); |
| |
| auto II = Legal->getInductionVars().find(IV); |
| assert(II != Legal->getInductionVars().end() && "IV is not an induction"); |
| |
| auto ID = II->second; |
| assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); |
| |
| // The value from the original loop to which we are mapping the new induction |
| // variable. |
| Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; |
| |
| auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); |
| |
| // Generate code for the induction step. Note that induction steps are |
| // required to be loop-invariant |
| auto CreateStepValue = [&](const SCEV *Step) -> Value * { |
| assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && |
| "Induction step should be loop invariant"); |
| if (PSE.getSE()->isSCEVable(IV->getType())) { |
| SCEVExpander Exp(*PSE.getSE(), DL, "induction"); |
| return Exp.expandCodeFor(Step, Step->getType(), |
| LoopVectorPreHeader->getTerminator()); |
| } |
| return cast<SCEVUnknown>(Step)->getValue(); |
| }; |
| |
| // The scalar value to broadcast. This is derived from the canonical |
| // induction variable. If a truncation type is given, truncate the canonical |
| // induction variable and step. Otherwise, derive these values from the |
| // induction descriptor. |
| auto CreateScalarIV = [&](Value *&Step) -> Value * { |
| Value *ScalarIV = Induction; |
| if (IV != OldInduction) { |
| ScalarIV = IV->getType()->isIntegerTy() |
| ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) |
| : Builder.CreateCast(Instruction::SIToFP, Induction, |
| IV->getType()); |
| ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); |
| ScalarIV->setName("offset.idx"); |
| } |
| if (Trunc) { |
| auto *TruncType = cast<IntegerType>(Trunc->getType()); |
| assert(Step->getType()->isIntegerTy() && |
| "Truncation requires an integer step"); |
| ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); |
| Step = Builder.CreateTrunc(Step, TruncType); |
| } |
| return ScalarIV; |
| }; |
| |
| // Create the vector values from the scalar IV, in the absence of creating a |
| // vector IV. |
| auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { |
| Value *Broadcasted = getBroadcastInstrs(ScalarIV); |
| for (unsigned Part = 0; Part < UF; ++Part) { |
| assert(!VF.isScalable() && "scalable vectors not yet supported."); |
| Value *StartIdx; |
| if (Step->getType()->isFloatingPointTy()) |
| StartIdx = getRuntimeVFAsFloat(Builder, Step->getType(), VF * Part); |
| else |
| StartIdx = getRuntimeVF(Builder, Step->getType(), VF * Part); |
| |
| Value *EntryPart = |
| getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode()); |
| State.set(Def, EntryPart, Part); |
| if (Trunc) |
| addMetadata(EntryPart, Trunc); |
| recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, |
| State, Part); |
| } |
| }; |
| |
| // Fast-math-flags propagate from the original induction instruction. |
| IRBuilder<>::FastMathFlagGuard FMFG(Builder); |
| if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) |
| Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); |
| |
| // Now do the actual transformations, and start with creating the step value. |
| Value *Step = CreateStepValue(ID.getStep()); |
| if (VF.isZero() || VF.isScalar()) { |
| Value *ScalarIV = CreateScalarIV(Step); |
| CreateSplatIV(ScalarIV, Step); |
| return; |
| } |
| |
| // Determine if we want a scalar version of the induction variable. This is |
| // true if the induction variable itself is not widened, or if it has at |
| // least one user in the loop that is not widened. |
| auto NeedsScalarIV = needsScalarInduction(EntryVal); |
| if (!NeedsScalarIV) { |
| createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, |
| State); |
| return; |
| } |
| |
| // Try to create a new independent vector induction variable. If we can't |
| // create the phi node, we will splat the scalar induction variable in each |
| // loop iteration. |
| if (!shouldScalarizeInstruction(EntryVal)) { |
| createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, |
| State); |
| Value *ScalarIV = CreateScalarIV(Step); |
| // Create scalar steps that can be used by instructions we will later |
| // scalarize. Note that the addition of the scalar steps will not increase |
| // the number of instructions in the loop in the common case prior to |
| // InstCombine. We will be trading one vector extract for each scalar step. |
| buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); |
| return; |
| } |
| |
| // All IV users are scalar instructions, so only emit a scalar IV, not a |
| // vectorized IV. The exception is tail-folding: there the splat IV feeds the |
| // predicate used by the masked loads/stores. |
| Value *ScalarIV = CreateScalarIV(Step); |
| if (!Cost->isScalarEpilogueAllowed()) |
| CreateSplatIV(ScalarIV, Step); |
| buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); |
| } |
| |
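| // For integer inductions this returns Val + (stepvector + splat(StartIdx)) * |
| // splat(Step); for FP inductions the step vector is converted with uitofp and |
| // the induction's FAdd/FSub and FMul are used instead. Illustrative sketch |
| // (not actual output): for a 4-element Val that is a splat of %x, with |
| // StartIdx = 4 and Step = 2, the result is |
| //   %x + <4, 5, 6, 7> * 2 = <%x + 8, %x + 10, %x + 12, %x + 14> |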
| Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, |
| Value *Step, |
| Instruction::BinaryOps BinOp) { |
| // Create and check the types. |
| auto *ValVTy = cast<VectorType>(Val->getType()); |
| ElementCount VLen = ValVTy->getElementCount(); |
| |
| Type *STy = Val->getType()->getScalarType(); |
| assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && |
| "Induction Step must be an integer or FP"); |
| assert(Step->getType() == STy && "Step has wrong type"); |
| |
| // Create a vector of consecutive numbers from zero to VF - 1. |
| VectorType *InitVecValVTy = ValVTy; |
| Type *InitVecValSTy = STy; |
| if (STy->isFloatingPointTy()) { |
| InitVecValSTy = |
| IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); |
| InitVecValVTy = VectorType::get(InitVecValSTy, VLen); |
| } |
| Value *InitVec = Builder.CreateStepVector(InitVecValVTy); |
| |
| // Splat the StartIdx |
| Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); |
| |
| if (STy->isIntegerTy()) { |
| InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); |
| Step = Builder.CreateVectorSplat(VLen, Step); |
| assert(Step->getType() == Val->getType() && "Invalid step vec"); |
| // FIXME: The newly created binary instructions should contain nsw/nuw flags, |
| // which can be found from the original scalar operations. |
| Step = Builder.CreateMul(InitVec, Step); |
| return Builder.CreateAdd(Val, Step, "induction"); |
| } |
| |
| // Floating point induction. |
| assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && |
| "Binary Opcode should be specified for FP induction"); |
| InitVec = Builder.CreateUIToFP(InitVec, ValVTy); |
| InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); |
| |
| Step = Builder.CreateVectorSplat(VLen, Step); |
| Value *MulOp = Builder.CreateFMul(InitVec, Step); |
| return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); |
| } |
| |
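| // For each unroll part and lane this emits ScalarIV + (Part * VF + Lane) * |
| // Step (the induction's FAdd/FSub and FMul for FP inductions). Illustrative |
| // sketch, not actual output: with VF = 4, UF = 2 and Step = 1, a non-uniform |
| // EntryVal gets the scalar values %iv + 0 .. %iv + 3 for part 0 and |
| // %iv + 4 .. %iv + 7 for part 1; a uniform EntryVal only needs lane 0 of |
| // each part. |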
| void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, |
| Instruction *EntryVal, |
| const InductionDescriptor &ID, |
| VPValue *Def, VPValue *CastDef, |
| VPTransformState &State) { |
| // We shouldn't have to build scalar steps if we aren't vectorizing. |
| assert(VF.isVector() && "VF should be greater than one"); |
| // Get the scalar type of the IV and ensure it and the step have the same type. |
| Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); |
| assert(ScalarIVTy == Step->getType() && |
| "Val and Step should have the same type"); |
| |
| // We build scalar steps for both integer and floating-point induction |
| // variables. Here, we determine the kind of arithmetic we will perform. |
| Instruction::BinaryOps AddOp; |
| Instruction::BinaryOps MulOp; |
| if (ScalarIVTy->isIntegerTy()) { |
| AddOp = Instruction::Add; |
| MulOp = Instruction::Mul; |
| } else { |
| AddOp = ID.getInductionOpcode(); |
| MulOp = Instruction::FMul; |
| } |
| |
| // Determine the number of scalars we need to generate for each unroll |
| // iteration. If EntryVal is uniform, we only need to generate the first |
| // lane. Otherwise, we generate all VF values. |
| bool IsUniform = |
| Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); |
| unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); |
| // Compute the scalar steps and save the results in State. |
| Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), |
| ScalarIVTy->getScalarSizeInBits()); |
| Type *VecIVTy = nullptr; |
| Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; |
| if (!IsUniform && VF.isScalable()) { |
| VecIVTy = VectorType::get(ScalarIVTy, VF); |
| UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); |
| SplatStep = Builder.CreateVectorSplat(VF, Step); |
| SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); |
| } |
| |
| for (unsigned Part = 0; Part < UF; ++Part) { |
| Value *StartIdx0 = createStepForVF(Builder, IntStepTy, VF, Part); |
| |
| if (!IsUniform && VF.isScalable()) { |
| auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); |
| auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); |
| if (ScalarIVTy->isFloatingPointTy()) |
| InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); |
| auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); |
| auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); |
| State.set(Def, Add, Part); |
| recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, |
| Part); |
| // It's also useful to record the per-lane values for the known minimum |
| // number of elements, so we do that below as well. This improves code |
| // quality when, for example, extracting the first element. |
| } |
| |
| if (ScalarIVTy->isFloatingPointTy()) |
| StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); |
| |
| for (unsigned Lane = 0; Lane < Lanes; ++Lane) { |
| Value *StartIdx = Builder.CreateBinOp( |
| AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); |
| // The step returned by `createStepForVF` is a runtime-evaluated value |
| // when VF is scalable. Otherwise, it should be folded into a Constant. |
| assert((VF.isScalable() || isa<Constant>(StartIdx)) && |
| "Expected StartIdx to be folded to a constant when VF is not " |
| "scalable"); |
| auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); |
| auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); |
| State.set(Def, Add, VPIteration(Part, Lane)); |
| recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, |
| Part, Lane); |
| } |
| } |
| } |
| |
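| // E.g. (illustrative, names made up): packing the scalar value computed for |
| // Instance = {Part = 0, Lane = 2} into a <4 x i32> part emits something like |
| //   %packed = insertelement <4 x i32> %vec.part0, i32 %scalar, i64 2 |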
| void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, |
| const VPIteration &Instance, |
| VPTransformState &State) { |
| Value *ScalarInst = State.get(Def, Instance); |
| Value *VectorValue = State.get(Def, Instance.Part); |
| VectorValue = Builder.CreateInsertElement( |
| VectorValue, ScalarInst, |
| Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); |
| State.set(Def, VectorValue, Instance.Part); |
| } |
| |
| Value *InnerLoopVectorizer::reverseVector(Value *Vec) { |
| assert(Vec->getType()->isVectorTy() && "Invalid type"); |
| return Builder.CreateVectorReverse(Vec, "reverse"); |
| } |
| |
| // Return whether we allow using masked interleave-groups (for dealing with |
| // strided loads/stores that reside in predicated blocks, or for dealing |
| // with gaps). |
| static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { |
| // If an override option has been passed in for interleaved accesses, use it. |
| if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) |
| return EnableMaskedInterleavedMemAccesses; |
| |
| return TTI.enableMaskedInterleavedAccessVectorization(); |
| } |
| |
| // Try to vectorize the interleave group that \p Instr belongs to. |
| // |
| // E.g. Translate following interleaved load group (factor = 3): |
| // for (i = 0; i < N; i+=3) { |
| // R = Pic[i]; // Member of index 0 |
| // G = Pic[i+1]; // Member of index 1 |
| // B = Pic[i+2]; // Member of index 2 |
| // ... // do something to R, G, B |
| // } |
| // To: |
| // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B |
| // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements |
| // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements |
| // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements |
| // |
| // Or translate following interleaved store group (factor = 3): |
| // for (i = 0; i < N; i+=3) { |
| // ... do something to R, G, B |
| // Pic[i] = R; // Member of index 0 |
| // Pic[i+1] = G; // Member of index 1 |
| // Pic[i+2] = B; // Member of index 2 |
| // } |
| // To: |
| // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> |
| // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> |
| // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, |
| // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements |
| // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B |
| void InnerLoopVectorizer::vectorizeInterleaveGroup( |
| const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, |
| VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, |
| VPValue *BlockInMask) { |
| Instruction *Instr = Group->getInsertPos(); |
| const DataLayout &DL = Instr->getModule()->getDataLayout(); |
| |
| // Compute the vector type for the interleaved load/store. |
| Type *ScalarTy = getLoadStoreType(Instr); |
| unsigned InterleaveFactor = Group->getFactor(); |
| assert(!VF.isScalable() && "scalable vectors not yet supported."); |
| auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); |
| |
| // Prepare the new pointers. |
| SmallVector<Value *, 2> AddrParts; |
| unsigned Index = Group->getIndex(Instr); |
| |
| // TODO: extend the masked interleaved-group support to reversed access. |
| assert((!BlockInMask || !Group->isReverse()) && |
| "Reversed masked interleave-group not supported."); |
| |
| // If the group is reverse, adjust the index to refer to the last vector lane |
| // instead of the first. We adjust the index from the first vector lane, |
| // rather than directly getting the pointer for lane VF - 1, because the |
| // pointer operand of the interleaved access is supposed to be uniform. For |
| // uniform instructions, we're only required to generate a value for the |
| // first vector lane in each unroll iteration. |
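| // For example, with VF = 4 and an interleave factor of 3, a member at index |
| // 1 is adjusted to 1 + (4 - 1) * 3 = 10, i.e. that member's slot in the |
| // last vector lane. |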
| if (Group->isReverse()) |
| Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); |
| |
| for (unsigned Part = 0; Part < UF; Part++) { |
| |