| //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops |
| // and generates target-independent LLVM-IR. |
| // The vectorizer uses the TargetTransformInfo analysis to estimate the costs |
| // of instructions in order to estimate the profitability of vectorization. |
| // |
| // The loop vectorizer combines consecutive loop iterations into a single |
| // 'wide' iteration. After this transformation the index is incremented |
| // by the SIMD vector width, and not by one. |
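| // |
| // For example, an illustrative sketch in C-like pseudocode (not drawn from |
| // any particular test case), with a vectorization factor of 4: |
| // |
| //   for (i = 0; i < n; i++)          // original scalar loop |
| //     a[i] = b[i] + c[i]; |
| // |
| // becomes, conceptually: |
| // |
| //   for (i = 0; i + 3 < n; i += 4)   // 'wide' vector iterations |
| //     a[i:i+3] = b[i:i+3] + c[i:i+3]; |
| //   for (; i < n; i++)               // scalar remainder (epilogue) loop |
| //     a[i] = b[i] + c[i]; |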
| // |
| // This pass has four parts: |
| // 1. The main loop pass that drives the different parts. |
| // 2. LoopVectorizationLegality - A unit that checks for the legality |
| // of the vectorization. |
| // 3. InnerLoopVectorizer - A unit that performs the actual |
| // widening of instructions. |
| // 4. LoopVectorizationCostModel - A unit that checks for the profitability |
| // of vectorization. It decides on the optimal vector width, which |
| // can be one, if vectorization is not profitable. |
| // |
| // There is a development effort going on to migrate the loop vectorizer to the |
| // VPlan infrastructure and to introduce outer loop vectorization support (see |
| // docs/Proposal/VectorizationPlan.rst and |
| // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this |
| // purpose, we temporarily introduced the VPlan-native vectorization path: an |
| // alternative vectorization path that is natively implemented on top of the |
| // VPlan infrastructure. See EnableVPlanNativePath for enabling. |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // The reduction-variable vectorization is based on the paper: |
| // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. |
| // |
| // Variable uniformity checks are inspired by: |
| // Karrenberg, R. and Hack, S. Whole Function Vectorization. |
| // |
| // The interleaved access vectorization is based on the paper: |
| // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved |
| // Data for SIMD |
| // |
| // Other ideas/concepts are from: |
| // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. |
| // |
| // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of |
| // Vectorizing Compilers. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "llvm/Transforms/Vectorize/LoopVectorize.h" |
| #include "LoopVectorizationPlanner.h" |
| #include "VPRecipeBuilder.h" |
| #include "VPlan.h" |
| #include "VPlanHCFGBuilder.h" |
| #include "VPlanPredicator.h" |
| #include "VPlanTransforms.h" |
| #include "llvm/ADT/APInt.h" |
| #include "llvm/ADT/ArrayRef.h" |
| #include "llvm/ADT/DenseMap.h" |
| #include "llvm/ADT/DenseMapInfo.h" |
| #include "llvm/ADT/Hashing.h" |
| #include "llvm/ADT/MapVector.h" |
| #include "llvm/ADT/None.h" |
| #include "llvm/ADT/Optional.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/ADT/SmallPtrSet.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/ADT/Statistic.h" |
| #include "llvm/ADT/StringRef.h" |
| #include "llvm/ADT/Twine.h" |
| #include "llvm/ADT/iterator_range.h" |
| #include "llvm/Analysis/AssumptionCache.h" |
| #include "llvm/Analysis/BasicAliasAnalysis.h" |
| #include "llvm/Analysis/BlockFrequencyInfo.h" |
| #include "llvm/Analysis/CFG.h" |
| #include "llvm/Analysis/CodeMetrics.h" |
| #include "llvm/Analysis/DemandedBits.h" |
| #include "llvm/Analysis/GlobalsModRef.h" |
| #include "llvm/Analysis/LoopAccessAnalysis.h" |
| #include "llvm/Analysis/LoopAnalysisManager.h" |
| #include "llvm/Analysis/LoopInfo.h" |
| #include "llvm/Analysis/LoopIterator.h" |
| #include "llvm/Analysis/MemorySSA.h" |
| #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| #include "llvm/Analysis/ProfileSummaryInfo.h" |
| #include "llvm/Analysis/ScalarEvolution.h" |
| #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
| #include "llvm/Analysis/TargetLibraryInfo.h" |
| #include "llvm/Analysis/TargetTransformInfo.h" |
| #include "llvm/Analysis/VectorUtils.h" |
| #include "llvm/IR/Attributes.h" |
| #include "llvm/IR/BasicBlock.h" |
| #include "llvm/IR/CFG.h" |
| #include "llvm/IR/Constant.h" |
| #include "llvm/IR/Constants.h" |
| #include "llvm/IR/DataLayout.h" |
| #include "llvm/IR/DebugInfoMetadata.h" |
| #include "llvm/IR/DebugLoc.h" |
| #include "llvm/IR/DerivedTypes.h" |
| #include "llvm/IR/DiagnosticInfo.h" |
| #include "llvm/IR/Dominators.h" |
| #include "llvm/IR/Function.h" |
| #include "llvm/IR/IRBuilder.h" |
| #include "llvm/IR/InstrTypes.h" |
| #include "llvm/IR/Instruction.h" |
| #include "llvm/IR/Instructions.h" |
| #include "llvm/IR/IntrinsicInst.h" |
| #include "llvm/IR/Intrinsics.h" |
| #include "llvm/IR/LLVMContext.h" |
| #include "llvm/IR/Metadata.h" |
| #include "llvm/IR/Module.h" |
| #include "llvm/IR/Operator.h" |
| #include "llvm/IR/PatternMatch.h" |
| #include "llvm/IR/Type.h" |
| #include "llvm/IR/Use.h" |
| #include "llvm/IR/User.h" |
| #include "llvm/IR/Value.h" |
| #include "llvm/IR/ValueHandle.h" |
| #include "llvm/IR/Verifier.h" |
| #include "llvm/InitializePasses.h" |
| #include "llvm/Pass.h" |
| #include "llvm/Support/Casting.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Compiler.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/ErrorHandling.h" |
| #include "llvm/Support/InstructionCost.h" |
| #include "llvm/Support/MathExtras.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
| #include "llvm/Transforms/Utils/InjectTLIMappings.h" |
| #include "llvm/Transforms/Utils/LoopSimplify.h" |
| #include "llvm/Transforms/Utils/LoopUtils.h" |
| #include "llvm/Transforms/Utils/LoopVersioning.h" |
| #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" |
| #include "llvm/Transforms/Utils/SizeOpts.h" |
| #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" |
| #include <algorithm> |
| #include <cassert> |
| #include <cstdint> |
| #include <cstdlib> |
| #include <functional> |
| #include <iterator> |
| #include <limits> |
| #include <memory> |
| #include <string> |
| #include <tuple> |
| #include <utility> |
| |
| using namespace llvm; |
| |
| #define LV_NAME "loop-vectorize" |
| #define DEBUG_TYPE LV_NAME |
| |
| #ifndef NDEBUG |
| const char VerboseDebug[] = DEBUG_TYPE "-verbose"; |
| #endif |
| |
| /// @{ |
| /// Metadata attribute names |
| const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; |
| const char LLVMLoopVectorizeFollowupVectorized[] = |
| "llvm.loop.vectorize.followup_vectorized"; |
| const char LLVMLoopVectorizeFollowupEpilogue[] = |
| "llvm.loop.vectorize.followup_epilogue"; |
| /// @} |
| |
| STATISTIC(LoopsVectorized, "Number of loops vectorized"); |
| STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); |
| STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); |
| |
| static cl::opt<bool> EnableEpilogueVectorization( |
| "enable-epilogue-vectorization", cl::init(true), cl::Hidden, |
| cl::desc("Enable vectorization of epilogue loops.")); |
| |
| static cl::opt<unsigned> EpilogueVectorizationForceVF( |
| "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, |
| cl::desc("When epilogue vectorization is enabled, and a value greater than " |
| "1 is specified, forces the given VF for all applicable epilogue " |
| "loops.")); |
| |
| static cl::opt<unsigned> EpilogueVectorizationMinVF( |
| "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, |
| cl::desc("Only loops with vectorization factor equal to or larger than " |
| "the specified value are considered for epilogue vectorization.")); |
| |
| /// Loops with a known constant trip count below this number are vectorized only |
| /// if no scalar iteration overheads are incurred. |
| static cl::opt<unsigned> TinyTripCountVectorThreshold( |
| "vectorizer-min-trip-count", cl::init(16), cl::Hidden, |
| cl::desc("Loops with a constant trip count that is smaller than this " |
| "value are vectorized only if no scalar iteration overheads " |
| "are incurred.")); |
| |
| static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( |
| "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, |
| cl::desc("The maximum allowed number of runtime memory checks with a " |
| "vectorize(enable) pragma.")); |
| |
| // Option prefer-predicate-over-epilogue indicates that an epilogue is |
| // undesired and that predication is preferred instead: the vectorizer will |
| // try to fold the tail loop (epilogue) into the vector body and predicate |
| // the instructions accordingly. If tail-folding fails, there are different |
| // fallback strategies depending on these values: |
| namespace PreferPredicateTy { |
| enum Option { |
| ScalarEpilogue = 0, |
| PredicateElseScalarEpilogue, |
| PredicateOrDontVectorize |
| }; |
| } // namespace PreferPredicateTy |
| |
| static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( |
| "prefer-predicate-over-epilogue", |
| cl::init(PreferPredicateTy::ScalarEpilogue), |
| cl::Hidden, |
| cl::desc("Tail-folding and predication preferences over creating a scalar " |
| "epilogue loop."), |
| cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, |
| "scalar-epilogue", |
| "Don't tail-predicate loops, create scalar epilogue"), |
| clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, |
| "predicate-else-scalar-epilogue", |
| "Prefer tail-folding, create scalar epilogue if " |
| "tail-folding fails."), |
| clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, |
| "predicate-dont-vectorize", |
| "Prefer tail-folding, don't attempt vectorization if " |
| "tail-folding fails."))); |
| |
| static cl::opt<bool> MaximizeBandwidth( |
| "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, |
| cl::desc("Maximize bandwidth when selecting vectorization factor which " |
| "will be determined by the smallest type in loop.")); |
| |
| static cl::opt<bool> EnableInterleavedMemAccesses( |
| "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, |
| cl::desc("Enable vectorization on interleaved memory accesses in a loop")); |
| |
| /// An interleave-group may need masking if it resides in a block that needs |
| /// predication, or in order to mask away gaps. |
| static cl::opt<bool> EnableMaskedInterleavedMemAccesses( |
| "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, |
| cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); |
| |
| static cl::opt<unsigned> TinyTripCountInterleaveThreshold( |
| "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, |
| cl::desc("We don't interleave loops with a estimated constant trip count " |
| "below this number")); |
| |
| static cl::opt<unsigned> ForceTargetNumScalarRegs( |
| "force-target-num-scalar-regs", cl::init(0), cl::Hidden, |
| cl::desc("A flag that overrides the target's number of scalar registers.")); |
| |
| static cl::opt<unsigned> ForceTargetNumVectorRegs( |
| "force-target-num-vector-regs", cl::init(0), cl::Hidden, |
| cl::desc("A flag that overrides the target's number of vector registers.")); |
| |
| static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( |
| "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, |
| cl::desc("A flag that overrides the target's max interleave factor for " |
| "scalar loops.")); |
| |
| static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( |
| "force-target-max-vector-interleave", cl::init(0), cl::Hidden, |
| cl::desc("A flag that overrides the target's max interleave factor for " |
| "vectorized loops.")); |
| |
| static cl::opt<unsigned> ForceTargetInstructionCost( |
| "force-target-instruction-cost", cl::init(0), cl::Hidden, |
| cl::desc("A flag that overrides the target's expected cost for " |
| "an instruction to a single constant value. Mostly " |
| "useful for getting consistent testing.")); |
| |
| static cl::opt<bool> ForceTargetSupportsScalableVectors( |
| "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, |
| cl::desc( |
| "Pretend that scalable vectors are supported, even if the target does " |
| "not support them. This flag should only be used for testing.")); |
| |
| static cl::opt<unsigned> SmallLoopCost( |
| "small-loop-cost", cl::init(20), cl::Hidden, |
| cl::desc( |
| "The cost of a loop that is considered 'small' by the interleaver.")); |
| |
| static cl::opt<bool> LoopVectorizeWithBlockFrequency( |
| "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, |
| cl::desc("Enable the use of the block frequency analysis to access PGO " |
| "heuristics minimizing code growth in cold regions and being more " |
| "aggressive in hot regions.")); |
| |
| // Runtime interleave loops for load/store throughput. |
| static cl::opt<bool> EnableLoadStoreRuntimeInterleave( |
| "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, |
| cl::desc( |
| "Enable runtime interleaving until load/store ports are saturated")); |
| |
| /// Interleave small loops with scalar reductions. |
| static cl::opt<bool> InterleaveSmallLoopScalarReduction( |
| "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, |
| cl::desc("Enable interleaving for loops with small iteration counts that " |
| "contain scalar reductions to expose ILP.")); |
| |
| /// The number of stores in a loop that are allowed to need predication. |
| static cl::opt<unsigned> NumberOfStoresToPredicate( |
| "vectorize-num-stores-pred", cl::init(1), cl::Hidden, |
| cl::desc("Max number of stores to be predicated behind an if.")); |
| |
| static cl::opt<bool> EnableIndVarRegisterHeur( |
| "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, |
| cl::desc("Count the induction variable only once when interleaving")); |
| |
| static cl::opt<bool> EnableCondStoresVectorization( |
| "enable-cond-stores-vec", cl::init(true), cl::Hidden, |
| cl::desc("Enable if predication of stores during vectorization.")); |
| |
| static cl::opt<unsigned> MaxNestedScalarReductionIC( |
| "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, |
| cl::desc("The maximum interleave count to use when interleaving a scalar " |
| "reduction in a nested loop.")); |
| |
| static cl::opt<bool> |
| PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), |
| cl::Hidden, |
| cl::desc("Prefer in-loop vector reductions, " |
| "overriding the targets preference.")); |
| |
| cl::opt<bool> EnableStrictReductions( |
| "enable-strict-reductions", cl::init(false), cl::Hidden, |
| cl::desc("Enable the vectorisation of loops with in-order (strict) " |
| "FP reductions")); |
| |
| static cl::opt<bool> PreferPredicatedReductionSelect( |
| "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, |
| cl::desc( |
| "Prefer predicating a reduction operation over an after loop select.")); |
| |
| cl::opt<bool> EnableVPlanNativePath( |
| "enable-vplan-native-path", cl::init(false), cl::Hidden, |
| cl::desc("Enable VPlan-native vectorization path with " |
| "support for outer loop vectorization.")); |
| |
| // FIXME: Remove this switch once we have divergence analysis. Currently we |
| // assume divergent non-backedge branches when this switch is true. |
| cl::opt<bool> EnableVPlanPredication( |
| "enable-vplan-predication", cl::init(false), cl::Hidden, |
| cl::desc("Enable VPlan-native vectorization path predicator with " |
| "support for outer loop vectorization.")); |
| |
| // This flag enables the stress testing of the VPlan H-CFG construction in the |
| // VPlan-native vectorization path. It must be used in conjunction with |
| // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the |
| // verification of the H-CFGs built. |
| static cl::opt<bool> VPlanBuildStressTest( |
| "vplan-build-stress-test", cl::init(false), cl::Hidden, |
| cl::desc( |
| "Build VPlan for every supported loop nest in the function and bail " |
| "out right after the build (stress test the VPlan H-CFG construction " |
| "in the VPlan-native vectorization path).")); |
| |
| cl::opt<bool> llvm::EnableLoopInterleaving( |
| "interleave-loops", cl::init(true), cl::Hidden, |
| cl::desc("Enable loop interleaving in Loop vectorization passes")); |
| cl::opt<bool> llvm::EnableLoopVectorization( |
| "vectorize-loops", cl::init(true), cl::Hidden, |
| cl::desc("Run the Loop vectorization passes")); |
| |
| cl::opt<bool> PrintVPlansInDotFormat( |
| "vplan-print-in-dot-format", cl::init(false), cl::Hidden, |
| cl::desc("Use dot format instead of plain text when dumping VPlans")); |
| |
| /// A helper function that returns the type of loaded or stored value. |
| static Type *getMemInstValueType(Value *I) { |
| assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && |
| "Expected Load or Store instruction"); |
| if (auto *LI = dyn_cast<LoadInst>(I)) |
| return LI->getType(); |
| return cast<StoreInst>(I)->getValueOperand()->getType(); |
| } |
| |
| /// A helper function that returns true if the given type is irregular. The |
| /// type is irregular if its allocated size doesn't equal the store size of an |
| /// element of the corresponding vector type. |
| static bool hasIrregularType(Type *Ty, const DataLayout &DL) { |
| // Determine if an array of N elements of type Ty is "bitcast compatible" |
| // with a <N x Ty> vector. |
| // This is only true if there is no padding between the array elements. |
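|   // For example (illustrative): under a typical data layout, i1 has a type |
|   // size of 1 bit but an alloc size of 8 bits, so it is irregular, whereas |
|   // i32 has both equal to 32 bits and is not. |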
| return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); |
| } |
| |
| /// A helper function that returns the reciprocal of the block probability of |
| /// predicated blocks. If we return X, we are assuming the predicated block |
| /// will execute once for every X iterations of the loop header. |
| /// |
| /// TODO: We should use actual block probability here, if available. Currently, |
| /// we always assume predicated blocks have a 50% chance of executing. |
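| /// For example, with the current value of 2, an instruction costing 8 inside |
| /// a predicated block contributes 8 / 2 = 4 to the expected per-iteration |
| /// cost (an illustrative calculation, not tied to any particular target). |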
| static unsigned getReciprocalPredBlockProb() { return 2; } |
| |
| /// A helper function that returns an integer or floating-point constant with |
| /// value C. |
| static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { |
| return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) |
| : ConstantFP::get(Ty, C); |
| } |
| |
| /// Returns "best known" trip count for the specified loop \p L as defined by |
| /// the following procedure: |
| /// 1) Returns exact trip count if it is known. |
| /// 2) Returns expected trip count according to profile data if any. |
| /// 3) Returns upper bound estimate if it is known. |
| /// 4) Returns None if all of the above failed. |
| static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { |
| // Check if exact trip count is known. |
| if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) |
| return ExpectedTC; |
| |
| // Check if there is an expected trip count available from profile data. |
| if (LoopVectorizeWithBlockFrequency) |
| if (auto EstimatedTC = getLoopEstimatedTripCount(L)) |
| return EstimatedTC; |
| |
| // Check if upper bound estimate is known. |
| if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) |
| return ExpectedTC; |
| |
| return None; |
| } |
| |
| // Forward declare GeneratedRTChecks. |
| class GeneratedRTChecks; |
| |
| namespace llvm { |
| |
| /// InnerLoopVectorizer vectorizes loops which contain only one basic |
| /// block to a specified vectorization factor (VF). |
| /// This class performs the widening of scalars into vectors, or multiple |
| /// scalars. This class also implements the following features: |
| /// * It inserts an epilogue loop for handling loops that don't have iteration |
| /// counts that are known to be a multiple of the vectorization factor. |
| /// * It handles the code generation for reduction variables. |
| /// * Scalarization (implementation using scalars) of un-vectorizable |
| /// instructions. |
| /// InnerLoopVectorizer does not perform any vectorization-legality |
| /// checks, and relies on the caller to check for the different legality |
| /// aspects. The InnerLoopVectorizer relies on the LoopVectorizationLegality |
| /// class to provide information about the induction and reduction variables |
| /// that were found. |
| class InnerLoopVectorizer { |
| public: |
| InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, |
| LoopInfo *LI, DominatorTree *DT, |
| const TargetLibraryInfo *TLI, |
| const TargetTransformInfo *TTI, AssumptionCache *AC, |
| OptimizationRemarkEmitter *ORE, ElementCount VecWidth, |
| unsigned UnrollFactor, LoopVectorizationLegality *LVL, |
| LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, |
| ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) |
| : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), |
| AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), |
| Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), |
| PSI(PSI), RTChecks(RTChecks) { |
| // Query this against the original loop and save it here because the profile |
| // of the original loop header may change as the transformation happens. |
| OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( |
| OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); |
| } |
| |
| virtual ~InnerLoopVectorizer() = default; |
| |
| /// Create a new empty loop that will contain vectorized instructions later |
| /// on, while the old loop will be used as the scalar remainder. Control flow |
| /// is generated around the vectorized (and scalar epilogue) loops consisting |
| /// of various checks and bypasses. Return the pre-header block of the new |
| /// loop. |
| /// In the case of epilogue vectorization, this function is overridden to |
| /// handle the more complex control flow around the loops. |
| virtual BasicBlock *createVectorizedLoopSkeleton(); |
| |
| /// Widen a single instruction within the innermost loop. |
| void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands, |
| VPTransformState &State); |
| |
| /// Widen a single call instruction within the innermost loop. |
| void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, |
| VPTransformState &State); |
| |
| /// Widen a single select instruction within the innermost loop. |
| void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, |
| bool InvariantCond, VPTransformState &State); |
| |
| /// Fix the vectorized code, taking care of header phis, live-outs, and more. |
| void fixVectorizedLoop(VPTransformState &State); |
| |
| // Return true if any runtime check is added. |
| bool areSafetyChecksAdded() { return AddedSafetyChecks; } |
| |
| /// A type for vectorized values in the new loop. Each value from the |
| /// original loop, when vectorized, is represented by UF vector values in the |
| /// new unrolled loop, where UF is the unroll factor. |
| using VectorParts = SmallVector<Value *, 2>; |
| |
| /// Vectorize a single GetElementPtrInst based on information gathered and |
| /// decisions taken during planning. |
| void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, |
| unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, |
| SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); |
| |
| /// Vectorize a single PHINode in a block. This method handles the induction |
| /// variable canonicalization. It supports both VF = 1 for unrolled loops and |
| /// arbitrary length vectors. |
| void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc, |
| VPWidenPHIRecipe *PhiR, VPTransformState &State); |
| |
| /// A helper function to scalarize a single Instruction in the innermost loop. |
| /// Generates a sequence of scalar instances for each lane between \p MinLane |
| /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, |
| /// inclusive. Uses the VPValue operands from \p Operands instead of \p |
| /// Instr's operands. |
| void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands, |
| const VPIteration &Instance, bool IfPredicateInstr, |
| VPTransformState &State); |
| |
| /// Widen an integer or floating-point induction variable \p IV. If \p Trunc |
| /// is provided, the integer induction variable will first be truncated to |
| /// the corresponding type. |
| void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc, |
| VPValue *Def, VPValue *CastDef, |
| VPTransformState &State); |
| |
| /// Construct the vector value of a scalarized value \p V one lane at a time. |
| void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, |
| VPTransformState &State); |
| |
| /// Try to vectorize interleaved access group \p Group with the base address |
| /// given in \p Addr, optionally masking the vector operations if \p |
| /// BlockInMask is non-null. Use \p State to translate given VPValues to IR |
| /// values in the vectorized loop. |
| void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, |
| ArrayRef<VPValue *> VPDefs, |
| VPTransformState &State, VPValue *Addr, |
| ArrayRef<VPValue *> StoredValues, |
| VPValue *BlockInMask = nullptr); |
| |
| /// Vectorize Load and Store instructions with the base address given in \p |
| /// Addr, optionally masking the vector operations if \p BlockInMask is |
| /// non-null. Use \p State to translate given VPValues to IR values in the |
| /// vectorized loop. |
| void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, |
| VPValue *Def, VPValue *Addr, |
| VPValue *StoredValue, VPValue *BlockInMask); |
| |
| /// Set the debug location in the builder using the debug location in |
| /// the instruction. |
| void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); |
| |
| /// Fix the non-induction PHIs in the OrigPHIsToFix vector. |
| void fixNonInductionPHIs(VPTransformState &State); |
| |
| /// Create a broadcast instruction. This method generates a broadcast |
| /// instruction (shuffle) for loop invariant values and for the induction |
| /// value. If this is the induction variable then we extend it to N, N+1, ... |
| /// this is needed because each iteration in the loop corresponds to a SIMD |
| /// element. |
| virtual Value *getBroadcastInstrs(Value *V); |
| |
| protected: |
| friend class LoopVectorizationPlanner; |
| |
| /// A small list of PHINodes. |
| using PhiVector = SmallVector<PHINode *, 4>; |
| |
| /// A type for scalarized values in the new loop. Each value from the |
| /// original loop, when scalarized, is represented by UF x VF scalar values |
| /// in the new unrolled loop, where UF is the unroll factor and VF is the |
| /// vectorization factor. |
| using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; |
| |
| /// Set up the values of the IVs correctly when exiting the vector loop. |
| void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, |
| Value *CountRoundDown, Value *EndValue, |
| BasicBlock *MiddleBlock); |
| |
| /// Create a new induction variable inside L. |
| PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, |
| Value *Step, Instruction *DL); |
| |
| /// Handle all cross-iteration phis in the header. |
| void fixCrossIterationPHIs(VPTransformState &State); |
| |
| /// Fix a first-order recurrence. This is the second phase of vectorizing |
| /// this phi node. |
| void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State); |
| |
| /// Fix a reduction cross-iteration phi. This is the second phase of |
| /// vectorizing this phi node. |
| void fixReduction(PHINode *Phi, VPTransformState &State); |
| |
| /// Clear NSW/NUW flags from reduction instructions if necessary. |
| void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, |
| VPTransformState &State); |
| |
| /// Fixup the LCSSA phi nodes in the unique exit block. This simply |
| /// means we need to add the appropriate incoming value from the middle |
| /// block as exiting edges from the scalar epilogue loop (if present) are |
| /// already in place, and we exit the vector loop exclusively to the middle |
| /// block. |
| void fixLCSSAPHIs(VPTransformState &State); |
| |
| /// Iteratively sink the scalarized operands of a predicated instruction into |
| /// the block that was created for it. |
| void sinkScalarOperands(Instruction *PredInst); |
| |
| /// Shrinks vector element sizes to the smallest bitwidth they can be legally |
| /// represented as. |
| void truncateToMinimalBitwidths(VPTransformState &State); |
| |
| /// This function adds |
| /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) |
| /// to each vector element of Val. The sequence starts at StartIdx. |
| /// \p Opcode is relevant for FP induction variable. |
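| /// For example (illustrative): with StartIdx = 0, Step = 2 and a fixed VF |
| /// of 4, the vector <0, 2, 4, 6> is added to Val. |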
| virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, |
| Instruction::BinaryOps Opcode = |
| Instruction::BinaryOpsEnd); |
| |
| /// Compute scalar induction steps. \p ScalarIV is the scalar induction |
| /// variable on which to base the steps, \p Step is the size of the step, and |
| /// \p EntryVal is the value from the original loop that maps to the steps. |
| /// Note that \p EntryVal doesn't have to be an induction variable - it |
| /// can also be a truncate instruction. |
| void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, |
| const InductionDescriptor &ID, VPValue *Def, |
| VPValue *CastDef, VPTransformState &State); |
| |
| /// Create a vector induction phi node based on an existing scalar one. \p |
| /// EntryVal is the value from the original loop that maps to the vector phi |
| /// node, and \p Step is the loop-invariant step. If \p EntryVal is a |
| /// truncate instruction, instead of widening the original IV, we widen a |
| /// version of the IV truncated to \p EntryVal's type. |
| void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, |
| Value *Step, Value *Start, |
| Instruction *EntryVal, VPValue *Def, |
| VPValue *CastDef, |
| VPTransformState &State); |
| |
| /// Returns true if an instruction \p I should be scalarized instead of |
| /// vectorized for the chosen vectorization factor. |
| bool shouldScalarizeInstruction(Instruction *I) const; |
| |
| /// Returns true if we should generate a scalar version of \p IV. |
| bool needsScalarInduction(Instruction *IV) const; |
| |
| /// If there is a cast involved in the induction variable \p ID, which should |
| /// be ignored in the vectorized loop body, this function records the |
| /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the |
| /// cast. We had already proved that the casted Phi is equal to the uncasted |
| /// Phi in the vectorized loop (under a runtime guard), and therefore |
| /// there is no need to vectorize the cast - the same value can be used in the |
| /// vector loop for both the Phi and the cast. |
| /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified. |
| /// Otherwise, \p VectorLoopValue is a widened/vectorized value. |
| /// |
| /// \p EntryVal is the value from the original loop that maps to the vector |
| /// phi node and is used to distinguish what is the IV currently being |
| /// processed - original one (if \p EntryVal is a phi corresponding to the |
| /// original IV) or the "newly-created" one based on the proof mentioned above |
| /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the |
| /// latter case \p EntryVal is a TruncInst and we must not record anything for |
| /// that IV, but it's error-prone to expect callers of this routine to care |
| /// about that, hence this explicit parameter. |
| void recordVectorLoopValueForInductionCast( |
| const InductionDescriptor &ID, const Instruction *EntryVal, |
| Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State, |
| unsigned Part, unsigned Lane = UINT_MAX); |
| |
| /// Generate a shuffle sequence that will reverse the vector Vec. |
| virtual Value *reverseVector(Value *Vec); |
| |
| /// Returns (and creates if needed) the original loop trip count. |
| Value *getOrCreateTripCount(Loop *NewLoop); |
| |
| /// Returns (and creates if needed) the trip count of the widened loop. |
| Value *getOrCreateVectorTripCount(Loop *NewLoop); |
| |
| /// Returns a bitcasted value to the requested vector type. |
| /// Also handles bitcasts of vector<float> <-> vector<pointer> types. |
| Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, |
| const DataLayout &DL); |
| |
| /// Emit a bypass check to see if the vector trip count is zero, including if |
| /// it overflows. |
| void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); |
| |
| /// Emit a bypass check to see if all of the SCEV assumptions we've |
| /// had to make are correct. Returns the block containing the checks or |
| /// nullptr if no checks have been added. |
| BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass); |
| |
| /// Emit bypass checks to check any memory assumptions we may have made. |
| /// Returns the block containing the checks or nullptr if no checks have been |
| /// added. |
| BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); |
| |
| /// Compute the transformed value of Index at offset StartValue using step |
| /// StepValue. |
| /// For integer induction, returns StartValue + Index * StepValue. |
| /// For pointer induction, returns StartValue[Index * StepValue]. |
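| /// For example (illustrative): an integer induction with StartValue = 10 |
| /// and StepValue = 3 maps Index = 4 to 10 + 4 * 3 = 22. |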
| /// FIXME: The newly created binary instructions should contain nsw/nuw |
| /// flags, which can be found from the original scalar operations. |
| Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, |
| const DataLayout &DL, |
| const InductionDescriptor &ID) const; |
| |
| /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, |
| /// vector loop preheader, middle block and scalar preheader. Also |
| /// allocate a loop object for the new vector loop and return it. |
| Loop *createVectorLoopSkeleton(StringRef Prefix); |
| |
| /// Create new phi nodes for the induction variables so that iteration |
| /// counting resumes in the scalar epilogue from where the vectorized loop |
| /// left off (given by \p VectorTripCount). |
| /// In cases where the loop skeleton is more complicated (eg. epilogue |
| /// vectorization) and the resume values can come from an additional bypass |
| /// block, the \p AdditionalBypass pair provides information about the bypass |
| /// block and the end value on the edge from bypass to this loop. |
| void createInductionResumeValues( |
| Loop *L, Value *VectorTripCount, |
| std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); |
| |
| /// Complete the loop skeleton by adding debug MDs, creating appropriate |
| /// conditional branches in the middle block, preparing the builder and |
| /// running the verifier. Take in the vector loop \p L as argument, and return |
| /// the preheader of the completed vector loop. |
| BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); |
| |
| /// Add additional metadata to \p To that was not present on \p Orig. |
| /// |
| /// Currently this is used to add the noalias annotations based on the |
| /// inserted memchecks. Use this for instructions that are *cloned* into the |
| /// vector loop. |
| void addNewMetadata(Instruction *To, const Instruction *Orig); |
| |
| /// Add metadata from one instruction to another. |
| /// |
| /// This includes both the original MDs from \p From and additional ones (\see |
| /// addNewMetadata). Use this for *newly created* instructions in the vector |
| /// loop. |
| void addMetadata(Instruction *To, Instruction *From); |
| |
| /// Similar to the previous function but it adds the metadata to a |
| /// vector of instructions. |
| void addMetadata(ArrayRef<Value *> To, Instruction *From); |
| |
| /// Allow subclasses to override and print debug traces before/after vplan |
| /// execution, when trace information is requested. |
| virtual void printDebugTracesAtStart() {} |
| virtual void printDebugTracesAtEnd() {} |
| |
| /// The original loop. |
| Loop *OrigLoop; |
| |
| /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies |
| /// dynamic knowledge to simplify SCEV expressions and converts them to a |
| /// more usable form. |
| PredicatedScalarEvolution &PSE; |
| |
| /// Loop Info. |
| LoopInfo *LI; |
| |
| /// Dominator Tree. |
| DominatorTree *DT; |
| |
| /// Alias Analysis. |
| AAResults *AA; |
| |
| /// Target Library Info. |
| const TargetLibraryInfo *TLI; |
| |
| /// Target Transform Info. |
| const TargetTransformInfo *TTI; |
| |
| /// Assumption Cache. |
| AssumptionCache *AC; |
| |
| /// Interface to emit optimization remarks. |
| OptimizationRemarkEmitter *ORE; |
| |
| /// LoopVersioning. It's only set up (non-null) if memchecks were |
| /// used. |
| /// |
| /// This is currently only used to add no-alias metadata based on the |
| /// memchecks. The actual versioning is performed manually. |
| std::unique_ptr<LoopVersioning> LVer; |
| |
| /// The vectorization SIMD factor to use. Each vector will have this many |
| /// vector elements. |
| ElementCount VF; |
| |
| /// The vectorization unroll factor to use. Each scalar is vectorized to this |
| /// many different vector instructions. |
| unsigned UF; |
| |
| /// The builder that we use |
| IRBuilder<> Builder; |
| |
| // --- Vectorization state --- |
| |
| /// The vector-loop preheader. |
| BasicBlock *LoopVectorPreHeader; |
| |
| /// The scalar-loop preheader. |
| BasicBlock *LoopScalarPreHeader; |
| |
| /// Middle Block between the vector and the scalar. |
| BasicBlock *LoopMiddleBlock; |
| |
| /// The (unique) ExitBlock of the scalar loop. Note that |
| /// there can be multiple exiting edges reaching this block. |
| BasicBlock *LoopExitBlock; |
| |
| /// The vector loop body. |
| BasicBlock *LoopVectorBody; |
| |
| /// The scalar loop body. |
| BasicBlock *LoopScalarBody; |
| |
| /// A list of all bypass blocks. The first block is the entry of the loop. |
| SmallVector<BasicBlock *, 4> LoopBypassBlocks; |
| |
| /// The new Induction variable which was added to the new block. |
| PHINode *Induction = nullptr; |
| |
| /// The induction variable of the old basic block. |
| PHINode *OldInduction = nullptr; |
| |
| /// Store instructions that were predicated. |
| SmallVector<Instruction *, 4> PredicatedInstructions; |
| |
| /// Trip count of the original loop. |
| Value *TripCount = nullptr; |
| |
| /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) |
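| /// (e.g., illustratively, TripCount = 1003 with VF = 4 and UF = 2 gives |
| /// 1003 - (1003 % 8) = 1000). |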
| Value *VectorTripCount = nullptr; |
| |
| /// The legality analysis. |
| LoopVectorizationLegality *Legal; |
| |
| /// The profitability analysis. |
| LoopVectorizationCostModel *Cost; |
| |
| // Record whether runtime checks are added. |
| bool AddedSafetyChecks = false; |
| |
| // Holds the end values for each induction variable. We save the end values |
| // so we can later fix-up the external users of the induction variables. |
| DenseMap<PHINode *, Value *> IVEndValues; |
| |
| // Vector of original scalar PHIs whose corresponding widened PHIs need to be |
| // fixed up at the end of vector code generation. |
| SmallVector<PHINode *, 8> OrigPHIsToFix; |
| |
| /// BFI and PSI are used to check for profile guided size optimizations. |
| BlockFrequencyInfo *BFI; |
| ProfileSummaryInfo *PSI; |
| |
| // Whether this loop should be optimized for size based on profile-guided |
| // size optimizations. |
| bool OptForSizeBasedOnProfile; |
| |
| /// Structure to hold information about generated runtime checks, responsible |
| /// for cleaning up the checks if vectorization turns out to be unprofitable. |
| GeneratedRTChecks &RTChecks; |
| }; |
| |
| class InnerLoopUnroller : public InnerLoopVectorizer { |
| public: |
| InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, |
| LoopInfo *LI, DominatorTree *DT, |
| const TargetLibraryInfo *TLI, |
| const TargetTransformInfo *TTI, AssumptionCache *AC, |
| OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, |
| LoopVectorizationLegality *LVL, |
| LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, |
| ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) |
| : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, |
| ElementCount::getFixed(1), UnrollFactor, LVL, CM, |
| BFI, PSI, Check) {} |
| |
| private: |
| Value *getBroadcastInstrs(Value *V) override; |
| Value *getStepVector(Value *Val, int StartIdx, Value *Step, |
| Instruction::BinaryOps Opcode = |
| Instruction::BinaryOpsEnd) override; |
| Value *reverseVector(Value *Vec) override; |
| }; |
| |
| /// Encapsulate information regarding vectorization of a loop and its epilogue. |
| /// This information is meant to be updated and used across two stages of |
| /// epilogue vectorization. |
| struct EpilogueLoopVectorizationInfo { |
| ElementCount MainLoopVF = ElementCount::getFixed(0); |
| unsigned MainLoopUF = 0; |
| ElementCount EpilogueVF = ElementCount::getFixed(0); |
| unsigned EpilogueUF = 0; |
| BasicBlock *MainLoopIterationCountCheck = nullptr; |
| BasicBlock *EpilogueIterationCountCheck = nullptr; |
| BasicBlock *SCEVSafetyCheck = nullptr; |
| BasicBlock *MemSafetyCheck = nullptr; |
| Value *TripCount = nullptr; |
| Value *VectorTripCount = nullptr; |
| |
| EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF, |
| unsigned EUF) |
| : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF), |
| EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) { |
| assert(EUF == 1 && |
| "A high UF for the epilogue loop is likely not beneficial."); |
| } |
| }; |
| |
| /// An extension of the inner loop vectorizer that creates a skeleton for a |
| /// vectorized loop that has its epilogue (residual) also vectorized. |
| /// The idea is to run the vplan on a given loop twice, first to set up the |
| /// skeleton and vectorize the main loop, and second to complete the skeleton |
| /// from the first step and vectorize the epilogue. This is achieved by |
| /// deriving two concrete strategy classes from this base class and invoking |
| /// them in succession from the loop vectorizer planner. |
| class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { |
| public: |
| InnerLoopAndEpilogueVectorizer( |
| Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, |
| DominatorTree *DT, const TargetLibraryInfo *TLI, |
| const TargetTransformInfo *TTI, AssumptionCache *AC, |
| OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, |
| LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, |
| BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, |
| GeneratedRTChecks &Checks) |
| : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, |
| EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, |
| Checks), |
| EPI(EPI) {} |
| |
| // Override this function to handle the more complex control flow around the |
| // three loops. |
| BasicBlock *createVectorizedLoopSkeleton() final override { |
| return createEpilogueVectorizedLoopSkeleton(); |
| } |
| |
| /// The interface for creating a vectorized skeleton using one of two |
| /// different strategies, each corresponding to one execution of the vplan |
| /// as described above. |
| virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; |
| |
| /// Holds and updates state information required to vectorize the main loop |
| /// and its epilogue in two separate passes. This setup helps us avoid |
| /// regenerating and recomputing runtime safety checks. It also helps us to |
| /// shorten the iteration-count-check path length for the cases where the |
| /// iteration count of the loop is so small that the main vector loop is |
| /// completely skipped. |
| EpilogueLoopVectorizationInfo &EPI; |
| }; |
| |
| /// A specialized derived class of inner loop vectorizer that performs |
| /// vectorization of *main* loops in the process of vectorizing loops and their |
| /// epilogues. |
| class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { |
| public: |
| EpilogueVectorizerMainLoop( |
| Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, |
| DominatorTree *DT, const TargetLibraryInfo *TLI, |
| const TargetTransformInfo *TTI, AssumptionCache *AC, |
| OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, |
| LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, |
| BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, |
| GeneratedRTChecks &Check) |
| : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, |
| EPI, LVL, CM, BFI, PSI, Check) {} |
| /// Implements the interface for creating a vectorized skeleton using the |
| /// *main loop* strategy (i.e., the first pass of vplan execution). |
| BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; |
| |
| protected: |
| /// Emits an iteration count bypass check once for the main loop (when \p |
| /// ForEpilogue is false) and once for the epilogue loop (when \p |
| /// ForEpilogue is true). |
| BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, |
| bool ForEpilogue); |
| void printDebugTracesAtStart() override; |
| void printDebugTracesAtEnd() override; |
| }; |
| |
| /// A specialized derived class of inner loop vectorizer that performs |
| /// vectorization of *epilogue* loops in the process of vectorizing loops and |
| /// their epilogues. |
| class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { |
| public: |
| EpilogueVectorizerEpilogueLoop( |
| Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, |
| DominatorTree *DT, const TargetLibraryInfo *TLI, |
| const TargetTransformInfo *TTI, AssumptionCache *AC, |
| OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, |
| LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, |
| BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, |
| GeneratedRTChecks &Checks) |
| : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, |
| EPI, LVL, CM, BFI, PSI, Checks) {} |
| /// Implements the interface for creating a vectorized skeleton using the |
| /// *epilogue loop* strategy (i.e., the second pass of vplan execution). |
| BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; |
| |
| protected: |
| /// Emits an iteration count bypass check after the main vector loop has |
| /// finished to see if there are any iterations left to execute by either |
| /// the vector epilogue or the scalar epilogue. |
| BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, |
| BasicBlock *Bypass, |
| BasicBlock *Insert); |
| void printDebugTracesAtStart() override; |
| void printDebugTracesAtEnd() override; |
| }; |
| } // end namespace llvm |
| |
| /// Look for a meaningful debug location on the instruction or its |
| /// operands. |
| static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { |
| if (!I) |
| return I; |
| |
| DebugLoc Empty; |
| if (I->getDebugLoc() != Empty) |
| return I; |
| |
| for (Use &Op : I->operands()) { |
| if (Instruction *OpInst = dyn_cast<Instruction>(Op)) |
| if (OpInst->getDebugLoc() != Empty) |
| return OpInst; |
| } |
| |
| return I; |
| } |
| |
| void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { |
| if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) { |
| const DILocation *DIL = Inst->getDebugLoc(); |
| if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && |
| !isa<DbgInfoIntrinsic>(Inst)) { |
| assert(!VF.isScalable() && "scalable vectors not yet supported."); |
| auto NewDIL = |
| DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); |
| if (NewDIL) |
| B.SetCurrentDebugLocation(NewDIL.getValue()); |
| else |
| LLVM_DEBUG(dbgs() |
| << "Failed to create new discriminator: " |
| << DIL->getFilename() << " Line: " << DIL->getLine()); |
| } else |
| B.SetCurrentDebugLocation(DIL); |
| } else |
| B.SetCurrentDebugLocation(DebugLoc()); |
| } |
| |
| /// Write a record \p DebugMsg about vectorization failure to the debug |
| /// output stream. If \p I is passed, it is an instruction that prevents |
| /// vectorization. |
| #ifndef NDEBUG |
| static void debugVectorizationFailure(const StringRef DebugMsg, |
| Instruction *I) { |
| dbgs() << "LV: Not vectorizing: " << DebugMsg; |
| if (I != nullptr) |
| dbgs() << " " << *I; |
| else |
| dbgs() << '.'; |
| dbgs() << '\n'; |
| } |
| #endif |
| |
| /// Create an analysis remark that explains why vectorization failed |
| /// |
| /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p |
| /// RemarkName is the identifier for the remark. If \p I is passed it is an |
| /// instruction that prevents vectorization. Otherwise \p TheLoop is used for |
| /// the location of the remark. \return the remark object that can be |
| /// streamed to. |
| static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, |
| StringRef RemarkName, Loop *TheLoop, Instruction *I) { |
| Value *CodeRegion = TheLoop->getHeader(); |
| DebugLoc DL = TheLoop->getStartLoc(); |
| |
| if (I) { |
| CodeRegion = I->getParent(); |
| // If there is no debug location attached to the instruction, fall back to |
| // using the loop's. |
| if (I->getDebugLoc()) |
| DL = I->getDebugLoc(); |
| } |
| |
| OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); |
| R << "loop not vectorized: "; |
| return R; |
| } |
| |
| /// Return a value for Step multiplied by VF. |
| static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) { |
| assert(isa<ConstantInt>(Step) && "Expected an integer step"); |
| Constant *StepVal = ConstantInt::get( |
| Step->getType(), |
| cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue()); |
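|   // E.g. (illustrative): for Step = 2 and a scalable VF of <vscale x 4>, |
|   // StepVal is the constant 8 and the value returned below is 8 * vscale; |
|   // for a fixed VF of 4 it is simply the constant 8. |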
| return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; |
| } |
| |
| namespace llvm { |
| |
| /// Return the runtime value for VF. |
| Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { |
| Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); |
| return VF.isScalable() ? B.CreateVScale(EC) : EC; |
| } |
| |
| void reportVectorizationFailure(const StringRef DebugMsg, |
| const StringRef OREMsg, const StringRef ORETag, |
| OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { |
| LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); |
| LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); |
| ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), |
| ORETag, TheLoop, I) << OREMsg); |
| } |
| |
| } // end namespace llvm |
| |
| #ifndef NDEBUG |
| /// \return string containing a file name and a line # for the given loop. |
| static std::string getDebugLocString(const Loop *L) { |
| std::string Result; |
| if (L) { |
| raw_string_ostream OS(Result); |
| if (const DebugLoc LoopDbgLoc = L->getStartLoc()) |
| LoopDbgLoc.print(OS); |
| else |
| // Just print the module name. |
| OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); |
| OS.flush(); |
| } |
| return Result; |
| } |
| #endif |
| |
| void InnerLoopVectorizer::addNewMetadata(Instruction *To, |
| const Instruction *Orig) { |
| // If the loop was versioned with memchecks, add the corresponding no-alias |
| // metadata. |
| if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) |
| LVer->annotateInstWithNoAlias(To, Orig); |
| } |
| |
| void InnerLoopVectorizer::addMetadata(Instruction *To, |
| Instruction *From) { |
| propagateMetadata(To, From); |
| addNewMetadata(To, From); |
| } |
| |
| void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, |
| Instruction *From) { |
| for (Value *V : To) { |
| if (Instruction *I = dyn_cast<Instruction>(V)) |
| addMetadata(I, From); |
| } |
| } |
| |
| namespace llvm { |
| |
| // Loop vectorization cost-model hints how the scalar epilogue loop should be |
| // lowered. |
| enum ScalarEpilogueLowering { |
| |
| // The default: allowing scalar epilogues. |
| CM_ScalarEpilogueAllowed, |
| |
| // Vectorization with OptForSize: don't allow epilogues. |
| CM_ScalarEpilogueNotAllowedOptSize, |
| |
| // A special case of vectorization with OptForSize: loops with a very small |
| // trip count are considered for vectorization under OptForSize, thereby |
| // making sure the cost of their loop body is dominant, free of runtime |
| // guards and scalar iteration overheads. |
| CM_ScalarEpilogueNotAllowedLowTripLoop, |
| |
| // Loop hint predicate indicating an epilogue is undesired. |
| CM_ScalarEpilogueNotNeededUsePredicate, |
| |
| // Directive indicating we must either tail fold or not vectorize |
| CM_ScalarEpilogueNotAllowedUsePredicate |
| }; |
| |
| /// LoopVectorizationCostModel - estimates the expected speedups due to |
| /// vectorization. |
| /// In many cases vectorization is not profitable. This can happen for a |
| /// number of reasons. In this class we mainly attempt to predict the |
| /// expected speedup/slowdowns due to the supported instruction set. We use the |
| /// TargetTransformInfo to query the different backends for the cost of |
| /// different operations. |
| class LoopVectorizationCostModel { |
| public: |
| LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, |
| PredicatedScalarEvolution &PSE, LoopInfo *LI, |
| LoopVectorizationLegality *Legal, |
| const TargetTransformInfo &TTI, |
| const TargetLibraryInfo *TLI, DemandedBits *DB, |
| AssumptionCache *AC, |
| OptimizationRemarkEmitter *ORE, const Function *F, |
| const LoopVectorizeHints *Hints, |
| InterleavedAccessInfo &IAI) |
| : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), |
| TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), |
| Hints(Hints), InterleaveInfo(IAI) {} |
| |
| /// \return An upper bound for the vectorization factor, or None if |
| /// vectorization and interleaving should be avoided up front. |
| Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC); |
| |
| /// \return True if runtime checks are required for vectorization, and false |
| /// otherwise. |
| bool runtimeChecksRequired(); |
| |
| /// \return The most profitable vectorization factor and the cost of that VF. |
| /// This method checks every power of two up to MaxVF. If UserVF is not ZERO |
| /// then this vectorization factor will be selected if vectorization is |
| /// possible. |
| VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); |
| VectorizationFactor |
| selectEpilogueVectorizationFactor(const ElementCount MaxVF, |
| const LoopVectorizationPlanner &LVP); |
| |
| /// Setup cost-based decisions for user vectorization factor. |
| void selectUserVectorizationFactor(ElementCount UserVF) { |
| collectUniformsAndScalars(UserVF); |
| collectInstsToScalarize(UserVF); |
| } |
| |
| /// \return The size (in bits) of the smallest and widest types in the code |
| /// that needs to be vectorized. We ignore values that remain scalar such as |
| /// 64 bit loop indices. |
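| /// For instance (illustrative), a loop that loads i8 values and stores i32 |
| /// values, with an i64 index that remains scalar, would report {8, 32}. |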
| std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); |
| |
| /// \return The desired interleave count. |
| /// If interleave count has been specified by metadata it will be returned. |
| /// Otherwise, the interleave count is computed and returned. VF and LoopCost |
| /// are the selected vectorization factor and the cost of the selected VF. |
| unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); |
| |
| /// A memory access instruction may be vectorized in more than one way; the |
| /// form of the instruction after vectorization depends on cost. |
| /// This function takes cost-based decisions for Load/Store instructions |
| /// and collects them in a map. This decision map is used for building |
| /// the lists of loop-uniform and loop-scalar instructions. |
| /// The calculated cost is saved with widening decision in order to |
| /// avoid redundant calculations. |
| void setCostBasedWideningDecision(ElementCount VF); |
| |
| /// A struct that represents some properties of the register usage |
| /// of a loop. |
| struct RegisterUsage { |
| /// Holds the number of loop invariant values that are used in the loop. |
| /// The key is ClassID of target-provided register class. |
| SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; |
| /// Holds the maximum number of concurrent live intervals in the loop. |
| /// The key is ClassID of target-provided register class. |
| SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; |
| }; |
| |
| /// \return Returns information about the register usages of the loop for the |
| /// given vectorization factors. |
| SmallVector<RegisterUsage, 8> |
| calculateRegisterUsage(ArrayRef<ElementCount> VFs); |
| |
| /// Collect values we want to ignore in the cost model. |
| void collectValuesToIgnore(); |
| |
| /// Split reductions into those that happen in the loop, and those that happen |
| /// outside. In-loop reductions are collected into InLoopReductionChains. |
| void collectInLoopReductions(); |
| |
| /// \returns The smallest bitwidth each instruction can be represented with. |
| /// The vector equivalents of these instructions should be truncated to this |
| /// type. |
| const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { |
| return MinBWs; |
| } |
| |
| /// \returns True if it is more profitable to scalarize instruction \p I for |
| /// vectorization factor \p VF. |
| bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { |
| assert(VF.isVector() && |
| "Profitable to scalarize relevant only for VF > 1."); |
| |
| // Cost model is not run in the VPlan-native path - return conservative |
| // result until this changes. |
| if (EnableVPlanNativePath) |
| return false; |
| |
| auto Scalars = InstsToScalarize.find(VF); |
| assert(Scalars != InstsToScalarize.end() && |
| "VF not yet analyzed for scalarization profitability"); |
| return Scalars->second.find(I) != Scalars->second.end(); |
| } |
| |
| /// Returns true if \p I is known to be uniform after vectorization. |
| bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { |
| if (VF.isScalar()) |
| return true; |
| |
| // Cost model is not run in the VPlan-native path - return conservative |
| // result until this changes. |
| if (EnableVPlanNativePath) |
| return false; |
| |
| auto UniformsPerVF = Uniforms.find(VF); |
| assert(UniformsPerVF != Uniforms.end() && |
| "VF not yet analyzed for uniformity"); |
| return UniformsPerVF->second.count(I); |
| } |
| |
| /// Returns true if \p I is known to be scalar after vectorization. |
| bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { |
| if (VF.isScalar()) |
| return true; |
| |
| // Cost model is not run in the VPlan-native path - return conservative |
| // result until this changes. |
| if (EnableVPlanNativePath) |
| return false; |
| |
| auto ScalarsPerVF = Scalars.find(VF); |
| assert(ScalarsPerVF != Scalars.end() && |
| "Scalar values are not calculated for VF"); |
| return ScalarsPerVF->second.count(I); |
| } |
| |
| /// \returns True if instruction \p I can be truncated to a smaller bitwidth |
| /// for vectorization factor \p VF. |
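| /// For example, an i32 value whose minimal bitwidth is 8 may be operated on |
| /// as a <VF x i8> vector instead of <VF x i32>, provided it is neither |
| /// scalarized nor profitable to scalarize. |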
| bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { |
| return VF.isVector() && MinBWs.find(I) != MinBWs.end() && |
| !isProfitableToScalarize(I, VF) && |
| !isScalarAfterVectorization(I, VF); |
| } |
| |
| /// Decision that was taken during cost calculation for a memory instruction. |
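| /// For example, a unit-stride load is typically CM_Widen, a load with stride |
| /// -1 is CM_Widen_Reverse, a member of an interleave group is typically |
| /// CM_Interleave, and a non-consecutive access may become CM_GatherScatter |
| /// or CM_Scalarize depending on cost. |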
| enum InstWidening { |
| CM_Unknown, |
| CM_Widen, // For consecutive accesses with stride +1. |
| CM_Widen_Reverse, // For consecutive accesses with stride -1. |
| CM_Interleave, |
| CM_GatherScatter, |
| CM_Scalarize |
| }; |
| |
| /// Save vectorization decision \p W and \p Cost taken by the cost model for |
| /// instruction \p I and vector width \p VF. |
| void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, |
| InstructionCost Cost) { |
| assert(VF.isVector() && "Expected VF >=2"); |
| WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); |
| } |
| |
| /// Save vectorization decision \p W and \p Cost taken by the cost model for |
| /// interleaving group \p Grp and vector width \p VF. |
| void setWideningDecision(const InterleaveGroup<Instruction> *Grp, |
| ElementCount VF, InstWidening W, |
| InstructionCost Cost) { |
| assert(VF.isVector() && "Expected VF >=2"); |
| /// Broadcast this decision to all instructions inside the group. |
| /// But the cost will be assigned to one instruction only. |
| for (unsigned i = 0; i < Grp->getFactor(); ++i) { |
| if (auto *I = Grp->getMember(i)) { |
| if (Grp->getInsertPos() == I) |
| WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); |
| else |
| WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); |
| } |
| } |
| } |
| |
| /// Return the cost model decision for the given instruction \p I and vector |
| /// width \p VF. Return CM_Unknown if this instruction did not pass |
| /// through the cost modeling. |
| InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { |
| assert(VF.isVector() && "Expected VF to be a vector VF"); |
| // Cost model is not run in the VPlan-native path - return conservative |
| // result until this changes. |
| if (EnableVPlanNativePath) |
| return CM_GatherScatter; |
| |
| std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); |
| auto Itr = WideningDecisions.find(InstOnVF); |
| if (Itr == WideningDecisions.end()) |
| return CM_Unknown; |
| return Itr->second.first; |
| } |
| |
| /// Return the vectorization cost for the given instruction \p I and vector |
| /// width \p VF. |
| InstructionCost getWideningCost(Instruction *I, ElementCount VF) { |
| assert(VF.isVector() && "Expected VF >=2"); |
| std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); |
| assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && |
| "The cost is not calculated"); |
| return WideningDecisions[InstOnVF].second; |
| } |
| |
| /// Return True if instruction \p I is an optimizable truncate whose operand |
| /// is an induction variable. Such a truncate will be removed by adding a new |
| /// induction variable with the destination type. |
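| /// For example, for a 64-bit induction variable %iv, a truncate such as |
| /// %t = trunc i64 %iv to i32 can often be replaced by a separate i32 |
| /// induction variable rather than widening the truncate itself. |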
| bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { |
| // If the instruction is not a truncate, return false. |
| auto *Trunc = dyn_cast<TruncInst>(I); |
| if (!Trunc) |
| return false; |
| |
| // Get the source and destination types of the truncate. |
| Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); |
| Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); |
| |
| // If the truncate is free for the given types, return false. Replacing a |
| // free truncate with an induction variable would add an induction variable |
| // update instruction to each iteration of the loop. We exclude from this |
| // check the primary induction variable since it will need an update |
| // instruction regardless. |
| Value *Op = Trunc->getOperand(0); |
| if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) |
| return false; |
| |
| // If the truncated value is not an induction variable, return false. |
| return Legal->isInductionPhi(Op); |
| } |
| |
| /// Collects the instructions to scalarize for each predicated instruction in |
| /// the loop. |
| void collectInstsToScalarize(ElementCount VF); |
| |
| /// Collect Uniform and Scalar values for the given \p VF. |
| /// The sets depend on CM decision for Load/Store instructions |
| /// that may be vectorized as interleave, gather-scatter or scalarized. |
| void collectUniformsAndScalars(ElementCount VF) { |
| // Do the analysis once. |
| if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) |
| return; |
| setCostBasedWideningDecision(VF); |
| collectLoopUniforms(VF); |
| collectLoopScalars(VF); |
| } |
| |
| /// Returns true if the target machine supports masked store operation |
| /// for the given \p DataType and kind of access to \p Ptr. |
| bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { |
| return Legal->isConsecutivePtr(Ptr) && |
| TTI.isLegalMaskedStore(DataType, Alignment); |
| } |
| |
| /// Returns true if the target machine supports masked load operation |
| /// for the given \p DataType and kind of access to \p Ptr. |
| bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { |
| return Legal->isConsecutivePtr(Ptr) && |
| TTI.isLegalMaskedLoad(DataType, Alignment); |
| } |
| |
| /// Returns true if the target machine supports masked scatter operation |
| /// for the given \p DataType. |
| bool isLegalMaskedScatter(Type *DataType, Align Alignment) const { |
| return TTI.isLegalMaskedScatter(DataType, Alignment); |
| } |
| |
| /// Returns true if the target machine supports masked gather operation |
| /// for the given \p DataType. |
| bool isLegalMaskedGather(Type *DataType, Align Alignment) const { |
| return TTI.isLegalMaskedGather(DataType, Alignment); |
| } |
| |
| /// Returns true if the target machine can represent \p V as a masked gather |
| /// or scatter operation. |
| bool isLegalGatherOrScatter(Value *V) { |
| bool LI = isa<LoadInst>(V); |
| bool SI = isa<StoreInst>(V); |
| if (!LI && !SI) |
| return false; |
| auto *Ty = getMemInstValueType(V); |
| Align Align = getLoadStoreAlignment(V); |
| return (LI && isLegalMaskedGather(Ty, Align)) || |
| (SI && isLegalMaskedScatter(Ty, Align)); |
| } |
| |
| /// Returns true if the target machine supports all of the reduction |
| /// variables found for the given VF. |
| bool canVectorizeReductions(ElementCount VF) { |
| return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { |
| RecurrenceDescriptor RdxDesc = Reduction.second; |
| return TTI.isLegalToVectorizeReduction(RdxDesc, VF); |
| })); |
| } |
| |
| /// Returns true if \p I is an instruction that will be scalarized with |
| /// predication. Such instructions include conditional stores and |
| /// instructions that may divide by zero. |
| /// If a non-zero VF has been calculated, we check if I will be scalarized |
| /// with predication for that VF. |
| bool |
| isScalarWithPredication(Instruction *I, |
| ElementCount VF = ElementCount::getFixed(1)) const; |
| |
| // Returns true if \p I is an instruction that will be predicated either |
| // through scalar predication or masked load/store or masked gather/scatter. |
| // Superset of instructions that return true for isScalarWithPredication. |
| bool isPredicatedInst(Instruction *I) { |
| if (!blockNeedsPredication(I->getParent())) |
| return false; |
| // Loads and stores that need some form of masked operation are predicated |
| // instructions. |
| if (isa<LoadInst>(I) || isa<StoreInst>(I)) |
| return Legal->isMaskRequired(I); |
| return isScalarWithPredication(I); |
| } |
| |
| /// Returns true if \p I is a memory instruction with consecutive memory |
| /// access that can be widened. |
| bool |
| memoryInstructionCanBeWidened(Instruction *I, |
| ElementCount VF = ElementCount::getFixed(1)); |
| |
| /// Returns true if \p I is a memory instruction in an interleaved-group |
| /// of memory accesses that can be vectorized with wide vector loads/stores |
| /// and shuffles. |
| bool |
| interleavedAccessCanBeWidened(Instruction *I, |
| ElementCount VF = ElementCount::getFixed(1)); |
| |
| /// Check if \p Instr belongs to any interleaved access group. |
| bool isAccessInterleaved(Instruction *Instr) { |
| return InterleaveInfo.isInterleaved(Instr); |
| } |
| |
| /// Get the interleaved access group that \p Instr belongs to. |
| const InterleaveGroup<Instruction> * |
| getInterleavedAccessGroup(Instruction *Instr) { |
| return InterleaveInfo.getInterleaveGroup(Instr); |
| } |
| |
| /// Returns true if we're required to use a scalar epilogue for at least |
| /// the final iteration of the original loop. |
| bool requiresScalarEpilogue() const { |
| if (!isScalarEpilogueAllowed()) |
| return false; |
| // If we might exit from anywhere but the latch, must run the exiting |
| // iteration in scalar form. |
| if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) |
| return true; |
| return InterleaveInfo.requiresScalarEpilogue(); |
| } |
| |
| /// Returns true if a scalar epilogue is not allowed due to optsize or a |
| /// loop hint annotation. |
| bool isScalarEpilogueAllowed() const { |
| return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; |
| } |
| |
| /// Returns true if all loop blocks should be masked to fold tail loop. |
| bool foldTailByMasking() const { return FoldTailByMasking; } |
| |
| bool blockNeedsPredication(BasicBlock *BB) const { |
| return foldTailByMasking() || Legal->blockNeedsPredication(BB); |
| } |
| |
| /// A SmallMapVector to store the InLoop reduction op chains, mapping phi |
| /// nodes to the chain of instructions representing the reductions. Uses a |
| /// MapVector to ensure deterministic iteration order. |
| using ReductionChainMap = |
| SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; |
| |
| /// Return the chain of instructions representing an inloop reduction. |
| const ReductionChainMap &getInLoopReductionChains() const { |
| return InLoopReductionChains; |
| } |
| |
| /// Returns true if the Phi is part of an inloop reduction. |
| bool isInLoopReduction(PHINode *Phi) const { |
| return InLoopReductionChains.count(Phi); |
| } |
| |
| /// Estimate cost of an intrinsic call instruction CI if it were vectorized |
| /// with factor VF. Return the cost of the instruction, including |
| /// scalarization overhead if it's needed. |
| InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; |
| |
| /// Estimate the cost of a call instruction CI if it were vectorized with |
| /// factor VF. Return the cost of the instruction, including scalarization |
| /// overhead if it's needed. The flag NeedToScalarize shows if the call needs |
| /// to be scalarized, i.e. either the vector version isn't available, or it |
| /// is too expensive. |
| InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, |
| bool &NeedToScalarize) const; |
| |
| /// Invalidates decisions already taken by the cost model. |
| void invalidateCostModelingDecisions() { |
| WideningDecisions.clear(); |
| Uniforms.clear(); |
| Scalars.clear(); |
| } |
| |
| private: |
| unsigned NumPredStores = 0; |
| |
| /// \return An upper bound for the vectorization factor, a power-of-2 larger |
| /// than zero. One is returned if vectorization should best be avoided due |
| /// to cost. |
| ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, |
| ElementCount UserVF); |
| |
| /// The vectorization cost is a combination of the cost itself and a boolean |
| /// indicating whether any of the contributing operations will actually |
| /// operate on vector values after type legalization in the backend. If this |
| /// latter value is false, then all operations will be scalarized (i.e. no |
| /// vectorization has actually taken place). |
| using VectorizationCostTy = std::pair<InstructionCost, bool>; |
| |
| /// Returns the expected execution cost. The unit of the cost does |
| /// not matter because we use the 'cost' units to compare different |
| /// vector widths. The cost that is returned is *not* normalized by |
| /// the factor width. |
| VectorizationCostTy expectedCost(ElementCount VF); |
| |
| /// Returns the execution time cost of an instruction for a given vector |
| /// width. Vector width of one means scalar. |
| VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); |
| |
| /// The cost-computation logic from getInstructionCost which provides |
| /// the vector type as an output parameter. |
| InstructionCost getInstructionCost(Instruction *I, ElementCount VF, |
| Type *&VectorTy); |
| |
| /// Return the cost of instructions in an inloop reduction pattern, if I is |
| /// part of that pattern. |
| InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF, |
| Type *VectorTy, |
| TTI::TargetCostKind CostKind); |
| |
| /// Calculate vectorization cost of memory instruction \p I. |
| InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); |
| |
| /// The cost computation for scalarized memory instruction. |
| InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); |
| |
| /// The cost computation for interleaving group of memory instructions. |
| InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); |
| |
| /// The cost computation for Gather/Scatter instruction. |
| InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); |
| |
| /// The cost computation for widening instruction \p I with consecutive |
| /// memory access. |
| InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); |
| |
| /// The cost calculation for Load/Store instruction \p I with uniform pointer - |
| /// Load: scalar load + broadcast. |
| /// Store: scalar store + (loop invariant value stored? 0 : extract of last |
| /// element) |
| InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); |
| |
| /// Estimate the overhead of scalarizing an instruction. This is a |
| /// convenience wrapper for the type-based getScalarizationOverhead API. |
| InstructionCost getScalarizationOverhead(Instruction *I, |
| ElementCount VF) const; |
| |
| /// Returns whether the instruction is a load or store and will be emitted |
| /// as a vector operation. |
| bool isConsecutiveLoadOrStore(Instruction *I); |
| |
| /// Returns true if an artificially high cost for emulated masked memrefs |
| /// should be used. |
| bool useEmulatedMaskMemRefHack(Instruction *I); |
| |
| /// Map of scalar integer values to the smallest bitwidth they can be legally |
| /// represented as. The vector equivalents of these values should be truncated |
| /// to this type. |
| MapVector<Instruction *, uint64_t> MinBWs; |
| |
| /// A type representing the costs for instructions if they were to be |
| /// scalarized rather than vectorized. The entries are Instruction-Cost |
| /// pairs. |
| using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; |
| |
| /// A set containing all BasicBlocks that are known to be present after |
| /// vectorization as predicated blocks. |
| SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; |
| |
| /// Records whether it is allowed to have the original scalar loop execute at |
| /// least once. This may be needed as a fallback loop in case runtime |
| /// aliasing/dependence checks fail, or to handle the tail/remainder |
| /// iterations when the trip count is unknown or doesn't divide by the VF, |
| /// or as a peel-loop to handle gaps in interleave-groups. |
| /// Under optsize and when the trip count is very small we don't allow any |
| /// iterations to execute in the scalar loop. |
| ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; |
| |
| /// All blocks of the loop are to be masked to fold the tail of scalar iterations. |
| bool FoldTailByMasking = false; |
| |
| /// A map holding scalar costs for different vectorization factors. The |
| /// presence of a cost for an instruction in the mapping indicates that the |
| /// instruction will be scalarized when vectorizing with the associated |
| /// vectorization factor. The entries are VF-ScalarCostTy pairs. |
| DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; |
| |
| /// Holds the instructions known to be uniform after vectorization. |
| /// The data is collected per VF. |
| DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; |
| |
| /// Holds the instructions known to be scalar after vectorization. |
| /// The data is collected per VF. |
| DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; |
| |
| /// Holds the instructions (address computations) that are forced to be |
| /// scalarized. |
| DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; |
| |
| /// PHINodes of the reductions that should be expanded in-loop along with |
| /// their associated chains of reduction operations, in program order from top |
| /// (PHI) to bottom. |
| ReductionChainMap InLoopReductionChains; |
| |
| /// A Map of inloop reduction operations and their immediate chain operand. |
| /// FIXME: This can be removed once reductions can be costed correctly in |
| /// vplan. This was added to allow quick lookup to the inloop operations, |
| /// without having to loop through InLoopReductionChains. |
| DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; |
| |
| /// Returns the expected difference in cost from scalarizing the expression |
| /// feeding a predicated instruction \p PredInst. The instructions to |
| /// scalarize and their scalar costs are collected in \p ScalarCosts. A |
| /// non-negative return value implies the expression will be scalarized. |
| /// Currently, only single-use chains are considered for scalarization. |
| int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, |
| ElementCount VF); |
| |
| /// Collect the instructions that are uniform after vectorization. An |
| /// instruction is uniform if we represent it with a single scalar value in |
| /// the vectorized loop corresponding to each vector iteration. Examples of |
| /// uniform instructions include pointer operands of consecutive or |
| /// interleaved memory accesses. Note that although uniformity implies an |
| /// instruction will be scalar, the reverse is not true. In general, a |
| /// scalarized instruction will be represented by VF scalar values in the |
| /// vectorized loop, each corresponding to an iteration of the original |
| /// scalar loop. |
| void collectLoopUniforms(ElementCount VF); |
| |
| /// Collect the instructions that are scalar after vectorization. An |
| /// instruction is scalar if it is known to be uniform or will be scalarized |
| /// during vectorization. Non-uniform scalarized instructions will be |
| /// represented by VF values in the vectorized loop, each corresponding to an |
| /// iteration of the original scalar loop. |
| void collectLoopScalars(ElementCount VF); |
| |
| /// Keeps cost model vectorization decision and cost for instructions. |
| /// Right now it is used for memory instructions only. |
| using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, |
| std::pair<InstWidening, InstructionCost>>; |
| |
| DecisionList WideningDecisions; |
| |
| /// Returns true if \p V is expected to be vectorized and it needs to be |
| /// extracted. |
| bool needsExtract(Value *V, ElementCount VF) const { |
| Instruction *I = dyn_cast<Instruction>(V); |
| if (VF.isScalar() || !I || !TheLoop->contains(I) || |
| TheLoop->isLoopInvariant(I)) |
| return false; |
| |
| // Assume we can vectorize V (and hence we need extraction) if the |
| // scalars are not computed yet. This can happen, because it is called |
| // via getScalarizationOverhead from setCostBasedWideningDecision, before |
| // the scalars are collected. That should be a safe assumption in most |
| // cases, because we check if the operands have vectorizable types |
| // beforehand in LoopVectorizationLegality. |
| return Scalars.find(VF) == Scalars.end() || |
| !isScalarAfterVectorization(I, VF); |
| }; |
| |
| /// Returns a range containing only operands needing to be extracted. |
| SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, |
| ElementCount VF) const { |
| return SmallVector<Value *, 4>(make_filter_range( |
| Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); |
| } |
| |
| /// Determines if we have the infrastructure to vectorize loop \p L and its |
| /// epilogue, assuming the main loop is vectorized by \p VF. |
| bool isCandidateForEpilogueVectorization(const Loop &L, |
| const ElementCount VF) const; |
| |
| /// Returns true if epilogue vectorization is considered profitable, and |
| /// false otherwise. |
| /// \p VF is the vectorization factor chosen for the original loop. |
| bool isEpilogueVectorizationProfitable(const ElementCount VF) const; |
| |
| public: |
| /// The loop that we evaluate. |
| Loop *TheLoop; |
| |
| /// Predicated scalar evolution analysis. |
| PredicatedScalarEvolution &PSE; |
| |
| /// Loop Info analysis. |
| LoopInfo *LI; |
| |
| /// Vectorization legality. |
| LoopVectorizationLegality *Legal; |
| |
| /// Vector target information. |
| const TargetTransformInfo &TTI; |
| |
| /// Target Library Info. |
| const TargetLibraryInfo *TLI; |
| |
| /// Demanded bits analysis. |
| DemandedBits *DB; |
| |
| /// Assumption cache. |
| AssumptionCache *AC; |
| |
| /// Interface to emit optimization remarks. |
| OptimizationRemarkEmitter *ORE; |
| |
| const Function *TheFunction; |
| |
| /// Loop Vectorize Hint. |
| const LoopVectorizeHints *Hints; |
| |
| /// The interleave access information contains groups of interleaved accesses |
| /// with the same stride and close to each other. |
| InterleavedAccessInfo &InterleaveInfo; |
| |
| /// Values to ignore in the cost model. |
| SmallPtrSet<const Value *, 16> ValuesToIgnore; |
| |
| /// Values to ignore in the cost model when VF > 1. |
| SmallPtrSet<const Value *, 16> VecValuesToIgnore; |
| |
| /// Profitable vector factors. |
| SmallVector<VectorizationFactor, 8> ProfitableVFs; |
| }; |
| } // end namespace llvm |
| |
| /// Helper struct to manage generating runtime checks for vectorization. |
| /// |
| /// The runtime checks are created up-front in temporary blocks to allow better |
| /// estimation of their cost, and are un-linked from the existing IR. After |
| /// deciding to vectorize, the checks are moved back. If we decide not to |
| /// vectorize, the temporary blocks are completely removed. |
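| /// Typical usage: Create() builds the check blocks up front so their cost can |
| /// be estimated; if vectorization proceeds, emitSCEVChecks() and |
| /// emitMemRuntimeChecks() splice them back into the CFG, otherwise the |
| /// destructor deletes the unused blocks. |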
| class GeneratedRTChecks { |
| /// Basic block which contains the generated SCEV checks, if any. |
| BasicBlock *SCEVCheckBlock = nullptr; |
| |
| /// The value representing the result of the generated SCEV checks. If it is |
| /// nullptr, either no SCEV checks have been generated or they have been used. |
| Value *SCEVCheckCond = nullptr; |
| |
| /// Basic block which contains the generated memory runtime checks, if any. |
| BasicBlock *MemCheckBlock = nullptr; |
| |
| /// The value representing the result of the generated memory runtime checks. |
| /// If it is nullptr, either no memory runtime checks have been generated or |
| /// they have been used. |
| Instruction *MemRuntimeCheckCond = nullptr; |
| |
| DominatorTree *DT; |
| LoopInfo *LI; |
| |
| SCEVExpander SCEVExp; |
| SCEVExpander MemCheckExp; |
| |
| public: |
| GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, |
| const DataLayout &DL) |
| : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), |
| MemCheckExp(SE, DL, "scev.check") {} |
| |
| /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can |
| /// accurately estimate the cost of the runtime checks. The blocks are |
| /// un-linked from the IR and are added back during vector code generation. If |
| /// there is no vector code generation, the check blocks are removed |
| /// completely. |
| void Create(Loop *L, const LoopAccessInfo &LAI, |
| const SCEVUnionPredicate &UnionPred) { |
| |
| BasicBlock *LoopHeader = L->getHeader(); |
| BasicBlock *Preheader = L->getLoopPreheader(); |
| |
| // Use SplitBlock to create blocks for SCEV & memory runtime checks to |
| // ensure the blocks are properly added to LoopInfo & DominatorTree. Those |
| // may be used by SCEVExpander. The blocks will be un-linked from their |
| // predecessors and removed from LI & DT at the end of the function. |
| if (!UnionPred.isAlwaysTrue()) { |
| SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, |
| nullptr, "vector.scevcheck"); |
| |
| SCEVCheckCond = SCEVExp.expandCodeForPredicate( |
| &UnionPred, SCEVCheckBlock->getTerminator()); |
| } |
| |
| const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); |
| if (RtPtrChecking.Need) { |
| auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; |
| MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, |
| "vector.memcheck"); |
| |
| std::tie(std::ignore, MemRuntimeCheckCond) = |
| addRuntimeChecks(MemCheckBlock->getTerminator(), L, |
| RtPtrChecking.getChecks(), MemCheckExp); |
| assert(MemRuntimeCheckCond && |
| "no RT checks generated although RtPtrChecking " |
| "claimed checks are required"); |
| } |
| |
| if (!MemCheckBlock && !SCEVCheckBlock) |
| return; |
| |
| // Unhook the temporary block with the checks, update various places |
| // accordingly. |
| if (SCEVCheckBlock) |
| SCEVCheckBlock->replaceAllUsesWith(Preheader); |
| if (MemCheckBlock) |
| MemCheckBlock->replaceAllUsesWith(Preheader); |
| |
| if (SCEVCheckBlock) { |
| SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); |
| new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); |
| Preheader->getTerminator()->eraseFromParent(); |
| } |
| if (MemCheckBlock) { |
| MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); |
| new UnreachableInst(Preheader->getContext(), MemCheckBlock); |
| Preheader->getTerminator()->eraseFromParent(); |
| } |
| |
| DT->changeImmediateDominator(LoopHeader, Preheader); |
| if (MemCheckBlock) { |
| DT->eraseNode(MemCheckBlock); |
| LI->removeBlock(MemCheckBlock); |
| } |
| if (SCEVCheckBlock) { |
| DT->eraseNode(SCEVCheckBlock); |
| LI->removeBlock(SCEVCheckBlock); |
| } |
| } |
| |
| /// Remove the created SCEV & memory runtime check blocks & instructions, if |
| /// unused. |
| ~GeneratedRTChecks() { |
| SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); |
| SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); |
| if (!SCEVCheckCond) |
| SCEVCleaner.markResultUsed(); |
| |
| if (!MemRuntimeCheckCond) |
| MemCheckCleaner.markResultUsed(); |
| |
| if (MemRuntimeCheckCond) { |
| auto &SE = *MemCheckExp.getSE(); |
| // Memory runtime check generation creates compares that use expanded |
| // values. Remove them before running the SCEVExpanderCleaners. |
| for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { |
| if (MemCheckExp.isInsertedInstruction(&I)) |
| continue; |
| SE.forgetValue(&I); |
| SE.eraseValueFromMap(&I); |
| I.eraseFromParent(); |
| } |
| } |
| MemCheckCleaner.cleanup(); |
| SCEVCleaner.cleanup(); |
| |
| if (SCEVCheckCond) |
| SCEVCheckBlock->eraseFromParent(); |
| if (MemRuntimeCheckCond) |
| MemCheckBlock->eraseFromParent(); |
| } |
| |
| /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and |
| /// adjusts the branches to branch to the vector preheader or \p Bypass, |
| /// depending on the generated condition. |
| BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, |
| BasicBlock *LoopVectorPreHeader, |
| BasicBlock *LoopExitBlock) { |
| if (!SCEVCheckCond) |
| return nullptr; |
| if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) |
| if (C->isZero()) |
| return nullptr; |
| |
| auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); |
| |
| BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); |
| // Create new preheader for vector loop. |
| if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) |
| PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); |
| |
| SCEVCheckBlock->getTerminator()->eraseFromParent(); |
| SCEVCheckBlock->moveBefore(LoopVectorPreHeader); |
| Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, |
| SCEVCheckBlock); |
| |
| DT->addNewBlock(SCEVCheckBlock, Pred); |
| DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); |
| |
| ReplaceInstWithInst( |
| SCEVCheckBlock->getTerminator(), |
| BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); |
| // Mark the check as used, to prevent it from being removed during cleanup. |
| SCEVCheckCond = nullptr; |
| return SCEVCheckBlock; |
| } |
| |
| /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts |
| /// the branches to branch to the vector preheader or \p Bypass, depending on |
| /// the generated condition. |
| BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, |
| BasicBlock *LoopVectorPreHeader) { |
| // Check if we generated code that checks at runtime whether arrays overlap. |
| if (!MemRuntimeCheckCond) |
| return nullptr; |
| |
| auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); |
| Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, |
| MemCheckBlock); |
| |
| DT->addNewBlock(MemCheckBlock, Pred); |
| DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); |
| MemCheckBlock->moveBefore(LoopVectorPreHeader); |
| |
| if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) |
| PL->addBasicBlockToLoop(MemCheckBlock, *LI); |
| |
| ReplaceInstWithInst( |
| MemCheckBlock->getTerminator(), |
| BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); |
| MemCheckBlock->getTerminator()->setDebugLoc( |
| Pred->getTerminator()->getDebugLoc()); |
| |
| // Mark the check as used, to prevent it from being removed during cleanup. |
| MemRuntimeCheckCond = nullptr; |
| return MemCheckBlock; |
| } |
| }; |
| |
| // Return true if \p OuterLp is an outer loop annotated with hints for explicit |
| // vectorization. The loop needs to be annotated with #pragma omp simd |
| // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the |
| // vector length information is not provided, vectorization is not considered |
| // explicit. Interleave hints are not allowed either. These limitations will be |
| // relaxed in the future. |
| // Please note that we are currently forced to abuse the pragma 'clang |
| // vectorize' semantics. This pragma provides *auto-vectorization hints* |
| // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' |
| // provides *explicit vectorization hints* (LV can bypass legal checks and |
| // assume that vectorization is legal). However, both hints are implemented |
| // using the same metadata (llvm.loop.vectorize, processed by |
| // LoopVectorizeHints). This will be fixed in the future when the native IR |
| // representation for pragma 'omp simd' is introduced. |
| static bool isExplicitVecOuterLoop(Loop *OuterLp, |
| OptimizationRemarkEmitter *ORE) { |
| assert(!OuterLp->isInnermost() && "This is not an outer loop"); |
| LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); |
| |
| // Only outer loops with an explicit vectorization hint are supported. |
| // Unannotated outer loops are ignored. |
| if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) |
| return false; |
| |
| Function *Fn = OuterLp->getHeader()->getParent(); |
| if (!Hints.allowVectorization(Fn, OuterLp, |
| true /*VectorizeOnlyWhenForced*/)) { |
| LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); |
| return false; |
| } |
| |
| if (Hints.getInterleave() > 1) { |
| // TODO: Interleave support is future work. |
| LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " |
| "outer loops.\n"); |
| Hints.emitRemarkWithHints(); |
| return false; |
| } |
| |
| return true; |
| } |
| |
| static void collectSupportedLoops(Loop &L, LoopInfo *LI, |
| OptimizationRemarkEmitter *ORE, |
| SmallVectorImpl<Loop *> &V) { |
| // Collect inner loops and outer loops without irreducible control flow. For |
| // now, only collect outer loops that have explicit vectorization hints. If we |
| // are stress testing the VPlan H-CFG construction, we collect the outermost |
| // loop of every loop nest. |
| if (L.isInnermost() || VPlanBuildStressTest || |
| (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { |
| LoopBlocksRPO RPOT(&L); |
| RPOT.perform(LI); |
| if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { |
| V.push_back(&L); |
| // TODO: Collect inner loops inside marked outer loops in case |
| // vectorization fails for the outer loop. Do not invoke |
| // 'containsIrreducibleCFG' again for inner loops when the outer loop is |
| // already known to be reducible. We can use an inherited attribute for |
| // that. |
| return; |
| } |
| } |
| for (Loop *InnerL : L) |
| collectSupportedLoops(*InnerL, LI, ORE, V); |
| } |
| |
| namespace { |
| |
| /// The LoopVectorize Pass. |
| struct LoopVectorize : public FunctionPass { |
| /// Pass identification, replacement for typeid |
| static char ID; |
| |
| LoopVectorizePass Impl; |
| |
| explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, |
| bool VectorizeOnlyWhenForced = false) |
| : FunctionPass(ID), |
| Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { |
| initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); |
| } |
| |
| bool runOnFunction(Function &F) override { |
| if (skipFunction(F)) |
| return false; |
| |
| auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); |
| auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); |
| auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); |
| auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
| auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); |
| auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); |
| auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; |
| auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); |
| auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); |
| auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); |
| auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); |
| auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); |
| auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); |
| |
| std::function<const LoopAccessInfo &(Loop &)> GetLAA = |
| [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; |
| |
| return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, |
| GetLAA, *ORE, PSI).MadeAnyChange; |
| } |
| |
| void getAnalysisUsage(AnalysisUsage &AU) const override { |
| AU.addRequired<AssumptionCacheTracker>(); |
| AU.addRequired<BlockFrequencyInfoWrapperPass>(); |
| AU.addRequired<DominatorTreeWrapperPass>(); |
| AU.addRequired<LoopInfoWrapperPass>(); |
| AU.addRequired<ScalarEvolutionWrapperPass>(); |
| AU.addRequired<TargetTransformInfoWrapperPass>(); |
| AU.addRequired<AAResultsWrapperPass>(); |
| AU.addRequired<LoopAccessLegacyAnalysis>(); |
| AU.addRequired<DemandedBitsWrapperPass>(); |
| AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); |
| AU.addRequired<InjectTLIMappingsLegacy>(); |
| |
| // We currently do not preserve loopinfo/dominator analyses with outer loop |
| // vectorization. Until this is addressed, mark these analyses as preserved |
| // only for non-VPlan-native path. |
| // TODO: Preserve Loop and Dominator analyses for VPlan-native path. |
| if (!EnableVPlanNativePath) { |
| AU.addPreserved<LoopInfoWrapperPass>(); |
| AU.addPreserved<DominatorTreeWrapperPass>(); |
| } |
| |
| AU.addPreserved<BasicAAWrapperPass>(); |
| AU.addPreserved<GlobalsAAWrapperPass>(); |
| AU.addRequired<ProfileSummaryInfoWrapperPass>(); |
| } |
| }; |
| |
| } // end anonymous namespace |
| |
| //===----------------------------------------------------------------------===// |
| // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and |
| // LoopVectorizationCostModel and LoopVectorizationPlanner. |
| //===----------------------------------------------------------------------===// |
| |
| Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { |
| // We need to place the broadcast of invariant variables outside the loop, |
| // but only if it's proven safe to do so. Otherwise, the broadcast will be |
| // emitted inside the vector loop body. |
| Instruction *Instr = dyn_cast<Instruction>(V); |
| bool SafeToHoist = OrigLoop->isLoopInvariant(V) && |
| (!Instr || |
| DT->dominates(Instr->getParent(), LoopVectorPreHeader)); |
| // Place the code for broadcasting invariant variables in the new preheader. |
| IRBuilder<>::InsertPointGuard Guard(Builder); |
| if (SafeToHoist) |
| Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); |
| |
| // Broadcast the scalar into all locations in the vector. |
| Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); |
| |
| return Shuf; |
| } |
| |
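| // Create a vector phi for an integer or FP induction. For example, with an |
| // integer IV of step 1, VF = 4 and UF = 1, this builds a phi starting at |
| // <%start, %start+1, %start+2, %start+3> that is incremented by <4, 4, 4, 4> |
| // (i.e. the runtime VF multiplied by the step) in each vector iteration. |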
| void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( |
| const InductionDescriptor &II, Value *Step, Value *Start, |
| Instruction *EntryVal, VPValue *Def, VPValue *CastDef, |
| VPTransformState &State) { |
| assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && |
| "Expected either an induction phi-node or a truncate of it!"); |
| |
| // Construct the initial value of the vector IV in the vector loop preheader |
| auto CurrIP = Builder.saveIP(); |
| Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); |
| if (isa<TruncInst>(EntryVal)) { |
| assert(Start->getType()->isIntegerTy() && |
| "Truncation requires an integer type"); |
| auto *TruncType = cast<IntegerType>(EntryVal->getType()); |
| Step = Builder.CreateTrunc(Step, TruncType); |
| Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); |
| } |
| Value *SplatStart = Builder.CreateVectorSplat(VF, Start); |
| Value *SteppedStart = |
| getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); |
| |
| // We create vector phi nodes for both integer and floating-point induction |
| // variables. Here, we determine the kind of arithmetic we will perform. |
| Instruction::BinaryOps AddOp; |
| Instruction::BinaryOps MulOp; |
| if (Step->getType()->isIntegerTy()) { |
| AddOp = Instruction::Add; |
| MulOp = Instruction::Mul; |
| } else { |
| AddOp = II.getInductionOpcode(); |
| MulOp = Instruction::FMul; |
| } |
| |
| // Multiply the vectorization factor by the step using integer or |
| // floating-point arithmetic as appropriate. |
| Type *StepType = Step->getType(); |
| if (Step->getType()->isFloatingPointTy()) |
| StepType = IntegerType::get(StepType->getContext(), |
| StepType->getScalarSizeInBits()); |
| Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF); |
| if (Step->getType()->isFloatingPointTy()) |
| RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType()); |
| Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); |
| |
| // Create a vector splat to use in the induction update. |
| // |
| // FIXME: If the step is non-constant, we create the vector splat with |
| // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't |
| // handle a constant vector splat. |
| Value *SplatVF = isa<Constant>(Mul) |
| ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) |
| : Builder.CreateVectorSplat(VF, Mul); |
| Builder.restoreIP(CurrIP); |
| |
| // We may need to add the step a number of times, depending on the unroll |
| // factor. The last of those goes into the PHI. |
| PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", |
| &*LoopVectorBody->getFirstInsertionPt()); |
| VecInd->setDebugLoc(EntryVal->getDebugLoc()); |
| Instruction *LastInduction = VecInd; |
| for (unsigned Part = 0; Part < UF; ++Part) { |
| State.set(Def, LastInduction, Part); |
| |
| if (isa<TruncInst>(EntryVal)) |
| addMetadata(LastInduction, EntryVal); |
| recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, |
| State, Part); |
| |
| LastInduction = cast<Instruction>( |
| Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); |
| LastInduction->setDebugLoc(EntryVal->getDebugLoc()); |
| } |
| |
| // Move the last step to the end of the latch block. This ensures consistent |
| // placement of all induction updates. |
| auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); |
| auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); |
| auto *ICmp = cast<Instruction>(Br->getCondition()); |
| LastInduction->moveBefore(ICmp); |
| LastInduction->setName("vec.ind.next"); |
| |
| VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); |
| VecInd->addIncoming(LastInduction, LoopVectorLatch); |
| } |
| |
| bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { |
| return Cost->isScalarAfterVectorization(I, VF) || |
| Cost->isProfitableToScalarize(I, VF); |
| } |
| |
| bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { |
| if (shouldScalarizeInstruction(IV)) |
| return true; |
| auto isScalarInst = [&](User *U) -> bool { |
| auto *I = cast<Instruction>(U); |
| return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); |
| }; |
| return llvm::any_of(IV->users(), isScalarInst); |
| } |
| |
| void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( |
| const InductionDescriptor &ID, const Instruction *EntryVal, |
| Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, |
| unsigned Part, unsigned Lane) { |
| assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && |
| "Expected either an induction phi-node or a truncate of it!"); |
| |
| // This induction variable is not the phi from the original loop but the |
| // newly-created IV, based on the proof that the casted Phi is equal to the |
| // uncasted Phi in the vectorized loop (possibly under a runtime guard). It |
| // re-uses the same InductionDescriptor that the original IV uses, but we |
| // don't have to do any recording in this case - that is done when the |
| // original IV is processed. |
| if (isa<TruncInst>(EntryVal)) |
| return; |
| |
| const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); |
| if (Casts.empty()) |
| return; |
| // Only the first Cast instruction in the Casts vector is of interest. |
| // The rest of the Casts (if any exist) have no uses outside the |
| // induction update chain itself. |
| if (Lane < UINT_MAX) |
| State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); |
| else |
| State.set(CastDef, VectorLoopVal, Part); |
| } |
| |
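| // Widen an integer or floating-point induction variable. Depending on how the |
| // IV and its users will be vectorized, this creates a vector IV phi, per-lane |
| // scalar steps, a splat of the scalar IV, or a combination of these. |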
| void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, |
| TruncInst *Trunc, VPValue *Def, |
| VPValue *CastDef, |
| VPTransformState &State) { |
| assert((IV->getType()->isIntegerTy() || IV != OldInduction) && |
| "Primary induction variable must have an integer type"); |
| |
| auto II = Legal->getInductionVars().find(IV); |
| assert(II != Legal->getInductionVars().end() && "IV is not an induction"); |
| |
| auto ID = II->second; |
| assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); |
| |
| // The value from the original loop to which we are mapping the new induction |
| // variable. |
| Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; |
| |
| auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); |
| |
| // Generate code for the induction step. Note that induction steps are |
| // required to be loop-invariant. |
| auto CreateStepValue = [&](const SCEV *Step) -> Value * { |
| assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && |
| "Induction step should be loop invariant"); |
| if (PSE.getSE()->isSCEVable(IV->getType())) { |
| SCEVExpander Exp(*PSE.getSE(), DL, "induction"); |
| return Exp.expandCodeFor(Step, Step->getType(), |
| LoopVectorPreHeader->getTerminator()); |
| } |
| return cast<SCEVUnknown>(Step)->getValue(); |
| }; |
| |
| // The scalar value to broadcast. This is derived from the canonical |
| // induction variable. If a truncation type is given, truncate the canonical |
| // induction variable and step. Otherwise, derive these values from the |
| // induction descriptor. |
| auto CreateScalarIV = [&](Value *&Step) -> Value * { |
| Value *ScalarIV = Induction; |
| if (IV != OldInduction) { |
| ScalarIV = IV->getType()->isIntegerTy() |
| ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) |
| : Builder.CreateCast(Instruction::SIToFP, Induction, |
| IV->getType()); |
| ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); |
| ScalarIV->setName("offset.idx"); |
| } |
| if (Trunc) { |
| auto *TruncType = cast<IntegerType>(Trunc->getType()); |
| assert(Step->getType()->isIntegerTy() && |
| "Truncation requires an integer step"); |
| ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); |
| Step = Builder.CreateTrunc(Step, TruncType); |
| } |
| return ScalarIV; |
| }; |
| |
| // Create the vector values from the scalar IV when we are not creating a |
| // vector IV. |
| auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { |
| Value *Broadcasted = getBroadcastInstrs(ScalarIV); |
| for (unsigned Part = 0; Part < UF; ++Part) { |
| assert(!VF.isScalable() && "scalable vectors not yet supported."); |
| Value *EntryPart = |
| getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, |
| ID.getInductionOpcode()); |
| State.set(Def, EntryPart, Part); |
| if (Trunc) |
| addMetadata(EntryPart, Trunc); |
| recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, |
| State, Part); |
| } |
| }; |
| |
| // Fast-math-flags propagate from the original induction instruction. |
| IRBuilder<>::FastMathFlagGuard FMFG(Builder); |
| if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) |
| Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); |
| |
| // Now do the actual transformations, and start with creating the step value. |
| Value *Step = CreateStepValue(ID.getStep()); |
| if (VF.isZero() || VF.isScalar()) { |
| Value *ScalarIV = CreateScalarIV(Step); |
| CreateSplatIV(ScalarIV, Step); |
| return; |
| } |
| |
| // Determine if we want a scalar version of the induction variable. This is |
| // true if the induction variable itself is not widened, or if it has at |
| // least one user in the loop that is not widened. |
| auto NeedsScalarIV = needsScalarInduction(EntryVal); |
| if (!NeedsScalarIV) { |
| createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, |
| State); |
| return; |
| } |
| |
| // Try to create a new independent vector induction variable. If we can't |
| // create the phi node, we will splat the scalar induction variable in each |
| // loop iteration. |
| if (!shouldScalarizeInstruction(EntryVal)) { |
| createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, |
| State); |
| Value *ScalarIV = CreateScalarIV(Step); |
| // Create scalar steps that can be used by instructions we will later |
| // scalarize. Note that the addition of the scalar steps will not increase |
| // the number of instructions in the loop in the common case prior to |
| // InstCombine. We will be trading one vector extract for each scalar step. |
| buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); |
| return; |
| } |
| |
| // All IV users are scalar instructions, so only emit a scalar IV, not a |
| // vectorized IV, except when we tail-fold: then the splat IV feeds the |
| // predicate used by the masked loads/stores. |
| Value *ScalarIV = CreateScalarIV(Step); |
| if (!Cost->isScalarEpilogueAllowed()) |
| CreateSplatIV(ScalarIV, Step); |
| buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); |
| } |
| |
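| // Return a vector whose lane L holds Val[L] + (StartIdx + L) * Step (using |
| // BinOp instead of the add for FP inductions). For example, for |
| // Val = <%v, %v, %v, %v>, StartIdx = 0 and Step = 1 this produces |
| // <%v, %v + 1, %v + 2, %v + 3>. |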
| Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, |
| Instruction::BinaryOps BinOp) { |
| // Create and check the types. |
| auto *ValVTy = cast<VectorType>(Val->getType()); |
| ElementCount VLen = ValVTy->getElementCount(); |
| |
| Type *STy = Val->getType()->getScalarType(); |
| assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && |
| "Induction Step must be an integer or FP"); |
| assert(Step->getType() == STy && "Step has wrong type"); |
| |
| SmallVector<Constant *, 8> Indices; |
| |
| // Create a vector of consecutive numbers from zero to VF. |
| VectorType *InitVecValVTy = ValVTy; |
| Type *InitVecValSTy = STy; |
| if (STy->isFloatingPointTy()) { |
| InitVecValSTy = |
| IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); |
| InitVecValVTy = VectorType::get(InitVecValSTy, VLen); |
| } |
| Value *InitVec = Builder.CreateStepVector(InitVecValVTy); |
| |
| // Add on StartIdx |
| Value *StartIdxSplat = Builder.CreateVectorSplat( |
| VLen, ConstantInt::get(InitVecValSTy, StartIdx)); |
| InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); |
| |
| if (STy->isIntegerTy()) { |
| Step = Builder.CreateVectorSplat(VLen, Step); |
| assert(Step->getType() == Val->getType() && "Invalid step vec"); |
| // FIXME: The newly created binary instructions should contain nsw/nuw flags, |
| // which can be found from the original scalar operations. |
| Step = Builder.CreateMul(InitVec, Step); |
| return Builder.CreateAdd(Val, Step, "induction"); |
| } |
| |
| // Floating point induction. |
| assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && |
| "Binary Opcode should be specified for FP induction"); |
| InitVec = Builder.CreateUIToFP(InitVec, ValVTy); |
| Step = Builder.CreateVectorSplat(VLen, Step); |
| Value *MulOp = Builder.CreateFMul(InitVec, Step); |
| return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); |
| } |
| |
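| // Compute, for each unroll part and lane, the scalar induction value |
| // ScalarIV + (Part * VF + Lane) * Step (using the appropriate integer or FP |
| // operations) and record it in State. If EntryVal is uniform, only the value |
| // for lane 0 of each part is generated. |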
| void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, |
| Instruction *EntryVal, |
| const InductionDescriptor &ID, |
| VPValue *Def, VPValue *CastDef, |
| VPTransformState &State) { |
| // We shouldn't have to build scalar steps if we aren't vectorizing. |
| assert(VF.isVector() && "VF should be greater than one"); |
| // Get the value type and ensure it and the step have the same integer type. |
| Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); |
| assert(ScalarIVTy == Step->getType() && |
| "Val and Step should have the same type"); |
| |
| // We build scalar steps for both integer and floating-point induction |
| // variables. Here, we determine the kind of arithmetic we will perform. |
| Instruction::BinaryOps AddOp; |
| Instruction::BinaryOps MulOp; |
| if (ScalarIVTy->isIntegerTy()) { |
| AddOp = Instruction::Add; |
| MulOp = Instruction::Mul; |
| } else { |
| AddOp = ID.getInductionOpcode(); |
| MulOp = Instruction::FMul; |
| } |
| |
| // Determine the number of scalars we need to generate for each unroll |
| // iteration. If EntryVal is uniform, we only need to generate the first |
| // lane. Otherwise, we generate all VF values. |
| bool IsUniform = |
| Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); |
| unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); |
| // Compute the scalar steps and save the results in State. |
| Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), |
| ScalarIVTy->getScalarSizeInBits()); |
| Type *VecIVTy = nullptr; |
| Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; |
| if (!IsUniform && VF.isScalable()) { |
| VecIVTy = VectorType::get(ScalarIVTy, VF); |
| UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); |
| SplatStep = Builder.CreateVectorSplat(VF, Step); |
| SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); |
| } |
| |
| for (unsigned Part = 0; Part < UF; ++Part) { |
| Value *StartIdx0 = |
| createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); |
| |
| if (!IsUniform && VF.isScalable()) { |
| auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); |
| auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); |
| if (ScalarIVTy->isFloatingPointTy()) |
| InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); |
| auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); |
| auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); |
| State.set(Def, Add, Part); |
| recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, |
| Part); |
| // It's useful to record the lane values too for the known minimum number |
| // of elements so we do those below. This improves the code quality when |
| // trying to extract the first element, for example. |
| } |
| |
| if (ScalarIVTy->isFloatingPointTy()) |
| StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); |
| |
| for (unsigned Lane = 0; Lane < Lanes; ++Lane) { |
| Value *StartIdx = Builder.CreateBinOp( |
| AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); |
| // The step returned by `createStepForVF` is a runtime-evaluated value |
| // when VF is scalable. Otherwise, it should be folded into a Constant. |
| assert((VF.isScalable() || isa<Constant>(StartIdx)) && |
| "Expected StartIdx to be folded to a constant when VF is not " |
| "scalable"); |
| auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); |
| auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); |
| State.set(Def, Add, VPIteration(Part, Lane)); |
| recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, |
| Part, Lane); |
| } |
| } |
| } |
| |
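| // Insert the scalar value computed for the given instance (Part, Lane) into |
| // the corresponding per-part vector value, i.e. set lane Instance.Lane of the |
| // Part'th vector to the scalar result. |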
| void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, |
| const VPIteration &Instance, |
| VPTransformState &State) { |
| Value *ScalarInst = State.get(Def, Instance); |
| Value *VectorValue = State.get(Def, Instance.Part); |
| VectorValue = Builder.CreateInsertElement( |
| VectorValue, ScalarInst, |
| Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); |
| State.set(Def, VectorValue, Instance.Part); |
| } |
| |
| Value *InnerLoopVectorizer::reverseVector(Value *Vec) { |
| assert(Vec->getType()->isVectorTy() && "Invalid type"); |
| return Builder.CreateVectorReverse(Vec, "reverse"); |
| } |
| |
| // Return whether we allow using masked interleave-groups (for dealing with |
| // strided loads/stores that reside in predicated blocks, or for dealing |
| // with gaps). |
| static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { |
| // If an override option has been passed in for interleaved accesses, use it. |
| if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) |
| return EnableMaskedInterleavedMemAccesses; |
| |
| return TTI.enableMaskedInterleavedAccessVectorization(); |
| } |
| |
| // Try to vectorize the interleave group that \p Instr belongs to. |
| // |
| // E.g. Translate following interleaved load group (factor = 3): |
| // for (i = 0; i < N; i+=3) { |
| // R = Pic[i]; // Member of index 0 |
| // G = Pic[i+1]; // Member of index 1 |
| // B = Pic[i+2]; // Member of index 2 |
| // ... // do something to R, G, B |
| // } |
| // To: |
| // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B |
| // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements |
| // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements |
| // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements |
| // |
| // Or translate the following interleaved store group (factor = 3): |
| // for (i = 0; i < N; i+=3) { |
| // ... do something to R, G, B |
| // Pic[i] = R; // Member of index 0 |
| // Pic[i+1] = G; // Member of index 1 |
| // Pic[i+2] = B; // Member of index 2 |
| // } |
| // To: |
| // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> |
| // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> |
| // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, |
| // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements |
| // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B |
| void InnerLoopVectorizer::vectorizeInterleaveGroup( |
| const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, |
| VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, |
| VPValue *BlockInMask) { |
| Instruction *Instr = Group->getInsertPos(); |
| const DataLayout &DL = Instr->getModule()->getDataLayout(); |
| |
| // Prepare the vector type of the interleaved load/store. |
| Type *ScalarTy = getMemInstValueType(Instr); |
| unsigned InterleaveFactor = Group->getFactor(); |
| assert(!VF.isScalable() && "scalable vectors not yet supported."); |
| auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); |
| |
| // Prepare the new pointers. |
| SmallVector<Value *, 2> AddrParts; |
| unsigned Index = Group->getIndex(Instr); |
| |
| // TODO: extend the masked interleaved-group support to reversed access. |
| assert((!BlockInMask || !Group->isReverse()) && |
| "Reversed masked interleave-group not supported."); |
| |
| // If the group is reverse, adjust the index to refer to the last vector lane |
| // instead of the first. We adjust the index from the first vector lane, |
| // rather than directly getting the pointer for lane VF - 1, because the |
| // pointer operand of the interleaved access is supposed to be uniform. For |
| // uniform instructions, we're only required to generate a value for the |
| // first vector lane in each unroll iteration. |
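| // For example (illustrative, not derived from the IR): with factor 3, VF 4 |
| // and an insert position at member index 1, Index becomes 1 + (4 - 1) * 3 = |
| // 10, and the GEP below steps the pointer back by 10 elements. |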
| if (Group->isReverse()) |
| Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); |
| |
| for (unsigned Part = 0; Part < UF; Part++) { |
| Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); |
| setDebugLocFromInst(Builder, AddrPart); |
| |
| // Note that the current instruction could be at any member index, so the |
| // address must be adjusted to that of the member at index 0. |
| // |
| // E.g. a = A[i+1]; // Member of index 1 (Current instruction) |
| // b = A[i]; // Member of index 0 |
| // The current pointer points to A[i+1]; adjust it to point to A[i]. |
| // |
| // E.g. A[i+1] = a; // Member of index 1 |
| // A[i] = b; // Member of index 0 |
| // A[i+2] = c; // Member of index 2 (Current instruction) |
| // The current pointer points to A[i+2]; adjust it to point to A[i]. |
| |
| bool InBounds = false; |
| if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) |
| InBounds = gep->isInBounds(); |
| AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); |
| cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); |
| |
| // Cast to the vector pointer type. |
| unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); |
| Type *PtrTy = VecTy->getPointerTo(AddressSpace); |
| AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); |
| } |
| |
| setDebugLocFromInst(Builder, Instr); |
| Value *PoisonVec = PoisonValue::get(VecTy); |
| |
| Value *MaskForGaps = nullptr; |
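| // If the group has a gap at the end but a scalar epilogue is not allowed, |
| // build a mask that disables the lanes of the missing members. For example |
| // (illustrative): a group of factor 3 with the last member missing and |
| // VF = 4 yields <1,1,0, 1,1,0, 1,1,0, 1,1,0>. |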
| if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { |
| MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); |
| assert(MaskForGaps && "Mask for Gaps is required but it is null"); |
| } |
| |
| // Vectorize the interleaved load group. |
| if (isa<LoadInst>(Instr)) { |
| // For each unroll part, create a wide load for the group. |
| SmallVector<Value *, 2> NewLoads; |
| for (unsigned Part = 0; Part < UF; Part++) { |
| Instruction *NewLoad; |
| if (BlockInMask || MaskForGaps) { |
| assert(useMaskedInterleavedAccesses(*TTI) && |
| "masked interleaved groups are not allowed."); |
| Value *GroupMask = MaskForGaps; |
| if (BlockInMask) { |
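| // Replicate each bit of the block mask across the members of the group. |
| // For example (illustrative): with factor 3 and VF = 4, <m0,m1,m2,m3> |
| // becomes <m0,m0,m0, m1,m1,m1, m2,m2,m2, m3,m3,m3>. |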
| Value *BlockInMaskPart = State.get(BlockInMask, Part); |
| Value *ShuffledMask = Builder.CreateShuffleVector( |
| BlockInMaskPart, |
| createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), |
| "interleaved.mask"); |
| GroupMask = MaskForGaps |
| ? Builder.CreateBinOp(Instruction::And, ShuffledMask, |
| MaskForGaps) |
| : ShuffledMask; |
| } |
| NewLoad = |
| Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), |
| GroupMask, PoisonVec, "wide.masked.vec"); |
| } else |
| NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], |
| Group->getAlign(), "wide.vec"); |
| Group->addMetadata(NewLoad); |
| NewLoads.push_back(NewLoad); |
| } |
| |
| // For each member in the group, shuffle out the appropriate data from the |
| // wide loads. |
| unsigned J = 0; |
| for (unsigned I = 0; I < InterleaveFactor; ++I) { |
| Instruction *Member = Group->getMember(I); |
| |
| // Skip the gaps in the group. |
| if (!Member) |
| continue; |
| |
| auto StrideMask = |
| createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); |
| for (unsigned Part = 0; Part < UF; Part++) { |
| Value *StridedVec = Builder.CreateShuffleVector( |
| NewLoads[Part], StrideMask, "strided.vec"); |
| |
| // If this member has a different type, cast the result to the member's type. |
| if (Member->getType() != ScalarTy) { |
| assert(!VF.isScalable() && "VF is assumed to be non scalable."); |
| VectorType *OtherVTy = VectorType::get(Member->getType(), VF); |
| StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); |
| } |
| |
| if (Group->isReverse()) |
| StridedVec = reverseVector(StridedVec); |
| |
| State.set(VPDefs[J], StridedVec, Part); |
| } |
| ++J; |
| } |
| return; |
| } |
| |
| // The sub-vector type for the current instruction. |
| auto *SubVT = VectorType::get(ScalarTy, VF); |
| |
| // Vectorize the interleaved store group. |
| for (unsigned Part = 0; Part < UF; Part++) { |
| // Collect the stored vector from each member. |
| SmallVector<Value *, 4> StoredVecs; |
| for (unsigned i = 0; i < InterleaveFactor; i++) { |
| // An interleaved store group doesn't allow gaps, so each index has a member. |
| assert(Group->getMember(i) && |
| "Failed to get a member from an interleaved store group"); |
| |
| Value *StoredVec = State.get(StoredValues[i], Part); |
| |
| if (Group->isReverse()) |
| StoredVec = reverseVector(StoredVec); |
| |
| // If this member has a different type, cast it to the unified sub-vector |
| // type. |
| if (StoredVec->getType() != SubVT) |
| StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); |
| |
| StoredVecs.push_back(StoredVec); |
| } |
| |
| // Concatenate all vectors into a wide vector. |
| Value *WideVec = concatenateVectors(Builder, StoredVecs); |
| |
| // Interleave the elements in the wide vector. |
| Value *IVec = Builder.CreateShuffleVector( |
| WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), |
| "interleaved.vec"); |
| |
| Instruction *NewStoreInstr; |
| if (BlockInMask) { |
| Value *BlockInMaskPart = State.get(BlockInMask, Part); |
| Value *ShuffledMask = Builder.CreateShuffleVector( |
| BlockInMaskPart, |
| createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), |
| "interleaved.mask"); |
| NewStoreInstr = Builder.CreateMaskedStore( |
| IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); |
| } else |
| NewStoreInstr = |
| Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); |
| |
| Group->addMetadata(NewStoreInstr); |
| } |
| } |
|