| //===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This pass merges loads/stores to/from sequential memory addresses into vector |
| // loads/stores. Although there's nothing GPU-specific in here, this pass is |
| // motivated by the microarchitectural quirks of nVidia and AMD GPUs. |
| // |
| // (For simplicity below we talk about loads only, but everything also applies |
| // to stores.) |
| // |
| // This pass is intended to be run late in the pipeline, after other |
| // vectorization opportunities have been exploited. So the assumption here is |
| // that immediately following our new vector load we'll need to extract out the |
| // individual elements of the load, so we can operate on them individually. |
| // |
| // On CPUs this transformation is usually not beneficial, because extracting the |
| // elements of a vector register is expensive on most architectures. It's |
| // usually better just to load each element individually into its own scalar |
| // register. |
| // |
| // However, nVidia and AMD GPUs don't have proper vector registers. Instead, a |
| // "vector load" loads directly into a series of scalar registers. In effect, |
| // extracting the elements of the vector is free. It's therefore always |
| // beneficial to vectorize a sequence of loads on these architectures. |
| // |
| // Vectorizing (perhaps a better name might be "coalescing") loads can have |
| // large performance impacts on GPU kernels, and opportunities for vectorizing |
| // are common in GPU code. This pass tries very hard to find such |
| // opportunities; its runtime is quadratic in the number of loads in a BB. |
| // |
| // Some CPU architectures, such as ARM, have instructions that load into |
| // multiple scalar registers, similar to a GPU vectorized load. In theory ARM |
| // could use this pass (with some modifications), but currently it implements |
| // its own pass to do something similar to what we do here. |
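| //
| // As a simplified illustration (hypothetical IR, not taken from a test
| // case), four consecutive scalar loads such as
| //
| //   %p1 = getelementptr float, float* %p, i64 1
| //   %p2 = getelementptr float, float* %p, i64 2
| //   %p3 = getelementptr float, float* %p, i64 3
| //   %l0 = load float, float* %p
| //   %l1 = load float, float* %p1
| //   %l2 = load float, float* %p2
| //   %l3 = load float, float* %p3
| //
| // become, alignment and target legality permitting, a single <4 x float>
| // load followed by four extractelement instructions. |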
| |
| #include "llvm/ADT/APInt.h" |
| #include "llvm/ADT/ArrayRef.h" |
| #include "llvm/ADT/MapVector.h" |
| #include "llvm/ADT/PostOrderIterator.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/ADT/SmallPtrSet.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/ADT/Statistic.h" |
| #include "llvm/ADT/iterator_range.h" |
| #include "llvm/Analysis/AliasAnalysis.h" |
| #include "llvm/Analysis/MemoryLocation.h" |
| #include "llvm/Analysis/OrderedBasicBlock.h" |
| #include "llvm/Analysis/ScalarEvolution.h" |
| #include "llvm/Analysis/TargetTransformInfo.h" |
| #include "llvm/Transforms/Utils/Local.h" |
| #include "llvm/Analysis/ValueTracking.h" |
| #include "llvm/Analysis/VectorUtils.h" |
| #include "llvm/IR/Attributes.h" |
| #include "llvm/IR/BasicBlock.h" |
| #include "llvm/IR/Constants.h" |
| #include "llvm/IR/DataLayout.h" |
| #include "llvm/IR/DerivedTypes.h" |
| #include "llvm/IR/Dominators.h" |
| #include "llvm/IR/Function.h" |
| #include "llvm/IR/IRBuilder.h" |
| #include "llvm/IR/InstrTypes.h" |
| #include "llvm/IR/Instruction.h" |
| #include "llvm/IR/Instructions.h" |
| #include "llvm/IR/IntrinsicInst.h" |
| #include "llvm/IR/Module.h" |
| #include "llvm/IR/Type.h" |
| #include "llvm/IR/User.h" |
| #include "llvm/IR/Value.h" |
| #include "llvm/Pass.h" |
| #include "llvm/Support/Casting.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/KnownBits.h" |
| #include "llvm/Support/MathExtras.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include "llvm/Transforms/Vectorize.h" |
| #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" |
| #include <algorithm> |
| #include <cassert> |
| #include <cstdlib> |
| #include <tuple> |
| #include <utility> |
| |
| using namespace llvm; |
| |
| #define DEBUG_TYPE "load-store-vectorizer" |
| |
| STATISTIC(NumVectorInstructions, "Number of vector accesses generated"); |
| STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized"); |
| |
| // FIXME: Assuming stack alignment of 4 is always good enough |
| static const unsigned StackAdjustedAlignment = 4; |
| |
| namespace { |
| |
| /// ChainID is an arbitrary token that is allowed to differ only for accesses |
| /// that are guaranteed to be considered non-consecutive by |
| /// Vectorizer::isConsecutiveAccess. It is used to group instructions and so |
| /// reduce the number of instructions the main search operates on at a time; |
| /// since that search is O(n^2) in the number of instructions, the grouping |
| /// exists purely to keep compile time down. The underlying type of ChainID |
| /// should not be relied upon. |
| using ChainID = const Value *; |
| using InstrList = SmallVector<Instruction *, 8>; |
| using InstrListMap = MapVector<ChainID, InstrList>; |
| |
| class Vectorizer { |
| Function &F; |
| AliasAnalysis &AA; |
| DominatorTree &DT; |
| ScalarEvolution &SE; |
| TargetTransformInfo &TTI; |
| const DataLayout &DL; |
| IRBuilder<> Builder; |
| |
| public: |
| Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT, |
| ScalarEvolution &SE, TargetTransformInfo &TTI) |
| : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI), |
| DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {} |
| |
| bool run(); |
| |
| private: |
| unsigned getPointerAddressSpace(Value *I); |
| |
| unsigned getAlignment(LoadInst *LI) const { |
| unsigned Align = LI->getAlignment(); |
| if (Align != 0) |
| return Align; |
| |
| return DL.getABITypeAlignment(LI->getType()); |
| } |
| |
| unsigned getAlignment(StoreInst *SI) const { |
| unsigned Align = SI->getAlignment(); |
| if (Align != 0) |
| return Align; |
| |
| return DL.getABITypeAlignment(SI->getValueOperand()->getType()); |
| } |
| |
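| /// Maximum recursion depth for areConsecutivePointers when it looks through |
| /// selects via lookThroughSelects. |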
| static const unsigned MaxDepth = 3; |
| |
| bool isConsecutiveAccess(Value *A, Value *B); |
| bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta, |
| unsigned Depth = 0) const; |
| bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta, |
| unsigned Depth) const; |
| bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta, |
| unsigned Depth) const; |
| |
| /// After vectorization, reorder the instructions that I depends on |
| /// (the instructions defining its operands), to ensure they dominate I. |
| void reorder(Instruction *I); |
| |
| /// Returns the first and the last instructions in Chain. |
| std::pair<BasicBlock::iterator, BasicBlock::iterator> |
| getBoundaryInstrs(ArrayRef<Instruction *> Chain); |
| |
| /// Erases the original instructions after vectorizing. |
| void eraseInstructions(ArrayRef<Instruction *> Chain); |
| |
| /// "Legalize" the vector type that would be produced by combining \p |
| /// ElementSizeBits elements in \p Chain. Break into two pieces such that the |
| /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is |
| /// expected to have more than 4 elements. |
| std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>> |
| splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits); |
| |
| /// Finds the largest prefix of Chain that's vectorizable, checking for |
| /// intervening instructions which may affect the memory accessed by the |
| /// instructions within Chain. |
| /// |
| /// The elements of \p Chain must be all loads or all stores and must be in |
| /// address order. |
| ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain); |
| |
| /// Collects load and store instructions to vectorize. |
| std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB); |
| |
| /// Processes the collected instructions in \p Map. The values of \p Map |
| /// must be all loads or all stores. |
| bool vectorizeChains(InstrListMap &Map); |
| |
| /// Finds loads and stores to consecutive memory addresses and vectorizes |
| /// them. |
| bool vectorizeInstructions(ArrayRef<Instruction *> Instrs); |
| |
| /// Vectorizes the load instructions in Chain. |
| bool |
| vectorizeLoadChain(ArrayRef<Instruction *> Chain, |
| SmallPtrSet<Instruction *, 16> *InstructionsProcessed); |
| |
| /// Vectorizes the store instructions in Chain. |
| bool |
| vectorizeStoreChain(ArrayRef<Instruction *> Chain, |
| SmallPtrSet<Instruction *, 16> *InstructionsProcessed); |
| |
| /// Checks whether a load/store access of \p SzInBytes bytes with alignment |
| /// \p Alignment in address space \p AddressSpace is misaligned for the |
| /// target. |
| bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace, |
| unsigned Alignment); |
| }; |
| |
| class LoadStoreVectorizerLegacyPass : public FunctionPass { |
| public: |
| static char ID; |
| |
| LoadStoreVectorizerLegacyPass() : FunctionPass(ID) { |
| initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry()); |
| } |
| |
| bool runOnFunction(Function &F) override; |
| |
| StringRef getPassName() const override { |
| return "GPU Load and Store Vectorizer"; |
| } |
| |
| void getAnalysisUsage(AnalysisUsage &AU) const override { |
| AU.addRequired<AAResultsWrapperPass>(); |
| AU.addRequired<ScalarEvolutionWrapperPass>(); |
| AU.addRequired<DominatorTreeWrapperPass>(); |
| AU.addRequired<TargetTransformInfoWrapperPass>(); |
| AU.setPreservesCFG(); |
| } |
| }; |
| |
| } // end anonymous namespace |
| |
| char LoadStoreVectorizerLegacyPass::ID = 0; |
| |
| INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE, |
| "Vectorize load and store instructions", false, false) |
| INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) |
| INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) |
| INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) |
| INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) |
| INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) |
| INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE, |
| "Vectorize load and store instructions", false, false) |
| |
| Pass *llvm::createLoadStoreVectorizerPass() { |
| return new LoadStoreVectorizerLegacyPass(); |
| } |
| |
| bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) { |
| // Don't vectorize when the attribute NoImplicitFloat is used. |
| if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat)) |
| return false; |
| |
| AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); |
| DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
| ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); |
| TargetTransformInfo &TTI = |
| getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); |
| |
| Vectorizer V(F, AA, DT, SE, TTI); |
| return V.run(); |
| } |
| |
| PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { |
| // Don't vectorize when the attribute NoImplicitFloat is used. |
| if (F.hasFnAttribute(Attribute::NoImplicitFloat)) |
| return PreservedAnalyses::all(); |
| |
| AliasAnalysis &AA = AM.getResult<AAManager>(F); |
| DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); |
| ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F); |
| TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); |
| |
| Vectorizer V(F, AA, DT, SE, TTI); |
| bool Changed = V.run(); |
| PreservedAnalyses PA; |
| PA.preserveSet<CFGAnalyses>(); |
| return Changed ? PA : PreservedAnalyses::all(); |
| } |
| |
| // The real propagateMetadata expects a SmallVector<Value*>, but we deal in |
| // vectors of Instructions. |
| static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) { |
| SmallVector<Value *, 8> VL(IL.begin(), IL.end()); |
| propagateMetadata(I, VL); |
| } |
| |
| // Vectorizer Implementation |
| bool Vectorizer::run() { |
| bool Changed = false; |
| |
| // Scan the blocks in the function in post order. |
| for (BasicBlock *BB : post_order(&F)) { |
| InstrListMap LoadRefs, StoreRefs; |
| std::tie(LoadRefs, StoreRefs) = collectInstructions(BB); |
| Changed |= vectorizeChains(LoadRefs); |
| Changed |= vectorizeChains(StoreRefs); |
| } |
| |
| return Changed; |
| } |
| |
| unsigned Vectorizer::getPointerAddressSpace(Value *I) { |
| if (LoadInst *L = dyn_cast<LoadInst>(I)) |
| return L->getPointerAddressSpace(); |
| if (StoreInst *S = dyn_cast<StoreInst>(I)) |
| return S->getPointerAddressSpace(); |
| return -1; |
| } |
| |
| // FIXME: Merge with llvm::isConsecutiveAccess |
| bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { |
| Value *PtrA = getLoadStorePointerOperand(A); |
| Value *PtrB = getLoadStorePointerOperand(B); |
| unsigned ASA = getPointerAddressSpace(A); |
| unsigned ASB = getPointerAddressSpace(B); |
| |
| // Check that the address spaces match and that the pointers are valid. |
| if (!PtrA || !PtrB || (ASA != ASB)) |
| return false; |
| |
| // Make sure that PtrA and PtrB are distinct pointers and that their pointee |
| // types have matching store sizes. |
| Type *PtrATy = PtrA->getType()->getPointerElementType(); |
| Type *PtrBTy = PtrB->getType()->getPointerElementType(); |
| if (PtrA == PtrB || |
| PtrATy->isVectorTy() != PtrBTy->isVectorTy() || |
| DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) || |
| DL.getTypeStoreSize(PtrATy->getScalarType()) != |
| DL.getTypeStoreSize(PtrBTy->getScalarType())) |
| return false; |
| |
| unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); |
| APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy)); |
| |
| return areConsecutivePointers(PtrA, PtrB, Size); |
| } |
| |
| bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, |
| APInt PtrDelta, unsigned Depth) const { |
| unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType()); |
| APInt OffsetA(PtrBitWidth, 0); |
| APInt OffsetB(PtrBitWidth, 0); |
| PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); |
| PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); |
| |
| unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType()); |
| |
| if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType())) |
| return false; |
| |
| // If we had to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets |
| // should have properly handled any possible overflow, and the accumulated |
| // offsets should fit into the smallest data type used in the cast/GEP chain. |
| assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth && |
| OffsetB.getMinSignedBits() <= NewPtrBitWidth); |
| |
| OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth); |
| OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth); |
| PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth); |
| |
| APInt OffsetDelta = OffsetB - OffsetA; |
| |
| // Check if they are based on the same pointer. That makes the offsets |
| // sufficient. |
| if (PtrA == PtrB) |
| return OffsetDelta == PtrDelta; |
| |
| // Compute the base-pointer delta needed for the final delta to equal the |
| // requested pointer delta. |
| APInt BaseDelta = PtrDelta - OffsetDelta; |
| |
| // Compute the distance with SCEV between the base pointers. |
| const SCEV *PtrSCEVA = SE.getSCEV(PtrA); |
| const SCEV *PtrSCEVB = SE.getSCEV(PtrB); |
| const SCEV *C = SE.getConstant(BaseDelta); |
| const SCEV *X = SE.getAddExpr(PtrSCEVA, C); |
| if (X == PtrSCEVB) |
| return true; |
| |
| // The above check will not catch cases where one of the pointers is |
| // factorized but the other one is not, such as (C + (S * (A + B))) vs |
| // (AS + BS). Take the minus SCEV instead; that allows the expressions to be |
| // re-combined and the difference to be simplified. |
| const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA); |
| if (C == Dist) |
| return true; |
| |
| // Sometimes even this doesn't work, because SCEV can't always see through |
| // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking |
| // things the hard way. |
| return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth); |
| } |
| |
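| // Handles the case where PtrA and PtrB are GEPs that are identical except |
| // for their last index, where the last indices are zext/sext of narrower |
| // values that differ by a known constant. Schematically (hypothetical IR): |
| // |
| //   %i1   = add nsw i32 %i, 1 |
| //   %e0   = sext i32 %i to i64 |
| //   %e1   = sext i32 %i1 to i64 |
| //   %PtrA = getelementptr inbounds float, float* %base, i64 %e0 |
| //   %PtrB = getelementptr inbounds float, float* %base, i64 %e1 |
| // |
| // SCEV often cannot prove such pointers consecutive, so we match the pattern |
| // directly, taking care that the narrower index arithmetic cannot overflow. |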
| bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, |
| APInt PtrDelta, |
| unsigned Depth) const { |
| auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA); |
| auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB); |
| if (!GEPA || !GEPB) |
| return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth); |
| |
| // Look through GEPs after checking they're the same except for the last |
| // index. |
| if (GEPA->getNumOperands() != GEPB->getNumOperands() || |
| GEPA->getPointerOperand() != GEPB->getPointerOperand()) |
| return false; |
| gep_type_iterator GTIA = gep_type_begin(GEPA); |
| gep_type_iterator GTIB = gep_type_begin(GEPB); |
| for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) { |
| if (GTIA.getOperand() != GTIB.getOperand()) |
| return false; |
| ++GTIA; |
| ++GTIB; |
| } |
| |
| Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand()); |
| Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand()); |
| if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() || |
| OpA->getType() != OpB->getType()) |
| return false; |
| |
| if (PtrDelta.isNegative()) { |
| if (PtrDelta.isMinSignedValue()) |
| return false; |
| PtrDelta.negate(); |
| std::swap(OpA, OpB); |
| } |
| uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType()); |
| if (PtrDelta.urem(Stride) != 0) |
| return false; |
| unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits(); |
| APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth); |
| |
| // Only look through a ZExt/SExt. |
| if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA)) |
| return false; |
| |
| bool Signed = isa<SExtInst>(OpA); |
| |
| // At this point ValA could be a function argument, i.e. not an instruction. |
| Value *ValA = OpA->getOperand(0); |
| OpB = dyn_cast<Instruction>(OpB->getOperand(0)); |
| if (!OpB || ValA->getType() != OpB->getType()) |
| return false; |
| |
| // Now we need to prove that adding IdxDiff to ValA won't overflow. |
| bool Safe = false; |
| // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to |
| // ValA, we're okay. |
| if (OpB->getOpcode() == Instruction::Add && |
| isa<ConstantInt>(OpB->getOperand(1)) && |
| IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) { |
| if (Signed) |
| Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap(); |
| else |
| Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap(); |
| } |
| |
| unsigned BitWidth = ValA->getType()->getScalarSizeInBits(); |
| |
| // Second attempt: |
| // If every bit of IdxDiff that is set, and every higher-order bit other than |
| // the sign bit, is known to be zero in ValA, then we can add IdxDiff to ValA |
| // without any kind of overflow. |
| if (!Safe) { |
| OpA = dyn_cast<Instruction>(ValA); |
| if (!OpA) |
| return false; |
| KnownBits Known(BitWidth); |
| computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT); |
| APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth()); |
| if (Signed) |
| BitsAllowedToBeSet.clearBit(BitWidth - 1); |
| if (BitsAllowedToBeSet.ult(IdxDiff)) |
| return false; |
| } |
| |
| const SCEV *OffsetSCEVA = SE.getSCEV(ValA); |
| const SCEV *OffsetSCEVB = SE.getSCEV(OpB); |
| const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth)); |
| const SCEV *X = SE.getAddExpr(OffsetSCEVA, C); |
| return X == OffsetSCEVB; |
| } |
| |
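| // Handles pointers produced by selects on the same condition: PtrA and PtrB |
| // are PtrDelta apart if their true values are PtrDelta apart and their false |
| // values are PtrDelta apart. |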
| bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB, |
| const APInt &PtrDelta, |
| unsigned Depth) const { |
| if (Depth++ == MaxDepth) |
| return false; |
| |
| if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) { |
| if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) { |
| return SelectA->getCondition() == SelectB->getCondition() && |
| areConsecutivePointers(SelectA->getTrueValue(), |
| SelectB->getTrueValue(), PtrDelta, Depth) && |
| areConsecutivePointers(SelectA->getFalseValue(), |
| SelectB->getFalseValue(), PtrDelta, Depth); |
| } |
| } |
| return false; |
| } |
| |
| void Vectorizer::reorder(Instruction *I) { |
| OrderedBasicBlock OBB(I->getParent()); |
| SmallPtrSet<Instruction *, 16> InstructionsToMove; |
| SmallVector<Instruction *, 16> Worklist; |
| |
| Worklist.push_back(I); |
| while (!Worklist.empty()) { |
| Instruction *IW = Worklist.pop_back_val(); |
| int NumOperands = IW->getNumOperands(); |
| for (int i = 0; i < NumOperands; i++) { |
| Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i)); |
| if (!IM || IM->getOpcode() == Instruction::PHI) |
| continue; |
| |
| // If IM is in another BB, no need to move it, because this pass only |
| // vectorizes instructions within one BB. |
| if (IM->getParent() != I->getParent()) |
| continue; |
| |
| if (!OBB.dominates(IM, I)) { |
| InstructionsToMove.insert(IM); |
| Worklist.push_back(IM); |
| } |
| } |
| } |
| |
| // All instructions to move should follow I. Start from I, not from begin(). |
| for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E; |
| ++BBI) { |
| if (!InstructionsToMove.count(&*BBI)) |
| continue; |
| Instruction *IM = &*BBI; |
| --BBI; |
| IM->removeFromParent(); |
| IM->insertBefore(I); |
| } |
| } |
| |
| std::pair<BasicBlock::iterator, BasicBlock::iterator> |
| Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) { |
| Instruction *C0 = Chain[0]; |
| BasicBlock::iterator FirstInstr = C0->getIterator(); |
| BasicBlock::iterator LastInstr = C0->getIterator(); |
| |
| BasicBlock *BB = C0->getParent(); |
| unsigned NumFound = 0; |
| for (Instruction &I : *BB) { |
| if (!is_contained(Chain, &I)) |
| continue; |
| |
| ++NumFound; |
| if (NumFound == 1) { |
| FirstInstr = I.getIterator(); |
| } |
| if (NumFound == Chain.size()) { |
| LastInstr = I.getIterator(); |
| break; |
| } |
| } |
| |
| // Range is [first, last). |
| return std::make_pair(FirstInstr, ++LastInstr); |
| } |
| |
| void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) { |
| SmallVector<Instruction *, 16> Instrs; |
| for (Instruction *I : Chain) { |
| Value *PtrOperand = getLoadStorePointerOperand(I); |
| assert(PtrOperand && "Instruction must have a pointer operand."); |
| Instrs.push_back(I); |
| if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand)) |
| Instrs.push_back(GEP); |
| } |
| |
| // Erase instructions. |
| for (Instruction *I : Instrs) |
| if (I->use_empty()) |
| I->eraseFromParent(); |
| } |
| |
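| // For example, a chain of seven 16-bit elements (14 bytes in total) is split |
| // into a six-element, 12-byte prefix and a one-element, 2-byte suffix. |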
| std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>> |
| Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain, |
| unsigned ElementSizeBits) { |
| unsigned ElementSizeBytes = ElementSizeBits / 8; |
| unsigned SizeBytes = ElementSizeBytes * Chain.size(); |
| unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes; |
| if (NumLeft == Chain.size()) { |
| if ((NumLeft & 1) == 0) |
| NumLeft /= 2; // Split even in half |
| else |
| --NumLeft; // Split off last element |
| } else if (NumLeft == 0) |
| NumLeft = 1; |
| return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft)); |
| } |
| |
| ArrayRef<Instruction *> |
| Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { |
| // These are in BB order, unlike Chain, which is in address order. |
| SmallVector<Instruction *, 16> MemoryInstrs; |
| SmallVector<Instruction *, 16> ChainInstrs; |
| |
| bool IsLoadChain = isa<LoadInst>(Chain[0]); |
| LLVM_DEBUG({ |
| for (Instruction *I : Chain) { |
| if (IsLoadChain) |
| assert(isa<LoadInst>(I) && |
| "All elements of Chain must be loads, or all must be stores."); |
| else |
| assert(isa<StoreInst>(I) && |
| "All elements of Chain must be loads, or all must be stores."); |
| } |
| }); |
| |
| for (Instruction &I : make_range(getBoundaryInstrs(Chain))) { |
| if (isa<LoadInst>(I) || isa<StoreInst>(I)) { |
| if (!is_contained(Chain, &I)) |
| MemoryInstrs.push_back(&I); |
| else |
| ChainInstrs.push_back(&I); |
| } else if (isa<IntrinsicInst>(&I) && |
| cast<IntrinsicInst>(&I)->getIntrinsicID() == |
| Intrinsic::sideeffect) { |
| // Ignore llvm.sideeffect calls. |
| } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) { |
| LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I |
| << '\n'); |
| break; |
| } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) { |
| LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I |
| << '\n'); |
| break; |
| } |
| } |
| |
| OrderedBasicBlock OBB(Chain[0]->getParent()); |
| |
| // Loop until we find an instruction in ChainInstrs that we can't vectorize. |
| unsigned ChainInstrIdx = 0; |
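| // The first memory instruction found (in BB order) to alias an element of |
| // the chain, if any. |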
| Instruction *BarrierMemoryInstr = nullptr; |
| |
| for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) { |
| Instruction *ChainInstr = ChainInstrs[ChainInstrIdx]; |
| |
| // If a barrier memory instruction was found, chain instructions that follow |
| // will not be added to the valid prefix. |
| if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr)) |
| break; |
| |
| // Check (in BB order) if any instruction prevents ChainInstr from being |
| // vectorized. Find and store the first such "conflicting" instruction. |
| for (Instruction *MemInstr : MemoryInstrs) { |
| // If a barrier memory instruction was found, do not check past it. |
| if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr)) |
| break; |
| |
| auto *MemLoad = dyn_cast<LoadInst>(MemInstr); |
| auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr); |
| if (MemLoad && ChainLoad) |
| continue; |
| |
| // We can ignore the alias if we have a load/store pair and the load is |
| // known to be invariant: the load cannot be clobbered by the store. |
| auto IsInvariantLoad = [](const LoadInst *LI) -> bool { |
| return LI->hasMetadata(LLVMContext::MD_invariant_load); |
| }; |
| |
| // We can ignore the alias as long as the load comes before the store, |
| // because that means we won't be moving the load past the store to |
| // vectorize it (the vectorized load is inserted at the location of the |
| // first load in the chain). |
| if (isa<StoreInst>(MemInstr) && ChainLoad && |
| (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr))) |
| continue; |
| |
| // Same case, but in reverse. |
| if (MemLoad && isa<StoreInst>(ChainInstr) && |
| (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr))) |
| continue; |
| |
| if (!AA.isNoAlias(MemoryLocation::get(MemInstr), |
| MemoryLocation::get(ChainInstr))) { |
| LLVM_DEBUG({ |
| dbgs() << "LSV: Found alias:\n" |
| " Aliasing instruction and pointer:\n" |
| << " " << *MemInstr << '\n' |
| << " " << *getLoadStorePointerOperand(MemInstr) << '\n' |
| << " Aliased instruction and pointer:\n" |
| << " " << *ChainInstr << '\n' |
| << " " << *getLoadStorePointerOperand(ChainInstr) << '\n'; |
| }); |
| // Save this aliasing memory instruction as a barrier, but allow other |
| // instructions that precede the barrier to be vectorized with this one. |
| BarrierMemoryInstr = MemInstr; |
| break; |
| } |
| } |
| // Continue the search only for store chains, since vectorizing stores that |
| // precede an aliasing load is valid. Conversely, vectorizing loads is valid |
| // up to an aliasing store, but should not pull loads from further down in |
| // the basic block. |
| if (IsLoadChain && BarrierMemoryInstr) { |
| // The BarrierMemoryInstr is a store that precedes ChainInstr. |
| assert(OBB.dominates(BarrierMemoryInstr, ChainInstr)); |
| break; |
| } |
| } |
| |
| // Find the largest prefix of Chain whose elements are all in |
| // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of |
| // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB |
| // order.) |
| SmallPtrSet<Instruction *, 8> VectorizableChainInstrs( |
| ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx); |
| unsigned ChainIdx = 0; |
| for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) { |
| if (!VectorizableChainInstrs.count(Chain[ChainIdx])) |
| break; |
| } |
| return Chain.slice(0, ChainIdx); |
| } |
| |
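| /// Returns the token used to group possibly-consecutive accesses: the |
| /// underlying object of \p Ptr, or, when that object is a select, the |
| /// select's condition. |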
| static ChainID getChainID(const Value *Ptr, const DataLayout &DL) { |
| const Value *ObjPtr = GetUnderlyingObject(Ptr, DL); |
| if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) { |
| // The selects themselves are distinct instructions even if they share the |
| // same condition and evaluate to consecutive pointers for the true and |
| // false values of the condition. Therefore using the selects themselves for |
| // grouping instructions would put consecutive accesses into different lists; |
| // they would never even be checked for being consecutive, and would not be |
| // vectorized. |
| return Sel->getCondition(); |
| } |
| return ObjPtr; |
| } |
| |
| std::pair<InstrListMap, InstrListMap> |
| Vectorizer::collectInstructions(BasicBlock *BB) { |
| InstrListMap LoadRefs; |
| InstrListMap StoreRefs; |
| |
| for (Instruction &I : *BB) { |
| if (!I.mayReadOrWriteMemory()) |
| continue; |
| |
| if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { |
| if (!LI->isSimple()) |
| continue; |
| |
| // Skip if it's not legal. |
| if (!TTI.isLegalToVectorizeLoad(LI)) |
| continue; |
| |
| Type *Ty = LI->getType(); |
| if (!VectorType::isValidElementType(Ty->getScalarType())) |
| continue; |
| |
| // Skip weird non-byte sizes. They probably aren't worth the effort of |
| // handling correctly. |
| unsigned TySize = DL.getTypeSizeInBits(Ty); |
| if ((TySize % 8) != 0) |
| continue; |
| |
| // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain |
| // functions currently use an integer type for the vectorized load/store and |
| // do not support casting between that integer type and a vector of pointers |
| // (e.g. i64 to <2 x i16*>). |
| if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) |
| continue; |
| |
| Value *Ptr = LI->getPointerOperand(); |
| unsigned AS = Ptr->getType()->getPointerAddressSpace(); |
| unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); |
| |
| unsigned VF = VecRegSize / TySize; |
| VectorType *VecTy = dyn_cast<VectorType>(Ty); |
| |
| // No point in looking at these if they're too big to vectorize. |
| if (TySize > VecRegSize / 2 || |
| (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) |
| continue; |
| |
| // Make sure all the users of a vector are constant-index extracts. |
| if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) { |
| const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U); |
| return EEI && isa<ConstantInt>(EEI->getOperand(1)); |
| })) |
| continue; |
| |
| // Save the load locations. |
| const ChainID ID = getChainID(Ptr, DL); |
| LoadRefs[ID].push_back(LI); |
| } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { |
| if (!SI->isSimple()) |
| continue; |
| |
| // Skip if it's not legal. |
| if (!TTI.isLegalToVectorizeStore(SI)) |
| continue; |
| |
| Type *Ty = SI->getValueOperand()->getType(); |
| if (!VectorType::isValidElementType(Ty->getScalarType())) |
| continue; |
| |
| // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain |
| // functions currently use an integer type for the vectorized load/store and |
| // do not support casting between that integer type and a vector of pointers |
| // (e.g. i64 to <2 x i16*>). |
| if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) |
| continue; |
| |
| // Skip weird non-byte sizes. They probably aren't worth the effort of |
| // handling correctly. |
| unsigned TySize = DL.getTypeSizeInBits(Ty); |
| if ((TySize % 8) != 0) |
| continue; |
| |
| Value *Ptr = SI->getPointerOperand(); |
| unsigned AS = Ptr->getType()->getPointerAddressSpace(); |
| unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); |
| |
| unsigned VF = VecRegSize / TySize; |
| VectorType *VecTy = dyn_cast<VectorType>(Ty); |
| |
| // No point in looking at these if they're too big to vectorize. |
| if (TySize > VecRegSize / 2 || |
| (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) |
| continue; |
| |
| if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) { |
| const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U); |
| return EEI && isa<ConstantInt>(EEI->getOperand(1)); |
| })) |
| continue; |
| |
| // Save store location. |
| const ChainID ID = getChainID(Ptr, DL); |
| StoreRefs[ID].push_back(SI); |
| } |
| } |
| |
| return {LoadRefs, StoreRefs}; |
| } |
| |
| bool Vectorizer::vectorizeChains(InstrListMap &Map) { |
| bool Changed = false; |
| |
| for (const std::pair<ChainID, InstrList> &Chain : Map) { |
| unsigned Size = Chain.second.size(); |
| if (Size < 2) |
| continue; |
| |
| LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n"); |
| |
| // Process the instructions in chunks of 64. |
| for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) { |
| unsigned Len = std::min<unsigned>(CE - CI, 64); |
| ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len); |
| Changed |= vectorizeInstructions(Chunk); |
| } |
| } |
| |
| return Changed; |
| } |
| |
| bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) { |
| LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() |
| << " instructions.\n"); |
| SmallVector<int, 16> Heads, Tails; |
| int ConsecutiveChain[64]; |
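| // ConsecutiveChain[i] holds the index of the access that immediately follows |
| // Instrs[i] in memory (or -1 if none is found); Heads and Tails record the |
| // two endpoints of each such pair. |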
| |
| // Do a quadratic search on all of the given loads/stores and find all of the |
| // pairs of loads/stores that follow each other. |
| for (int i = 0, e = Instrs.size(); i < e; ++i) { |
| ConsecutiveChain[i] = -1; |
| for (int j = e - 1; j >= 0; --j) { |
| if (i == j) |
| continue; |
| |
| if (isConsecutiveAccess(Instrs[i], Instrs[j])) { |
| if (ConsecutiveChain[i] != -1) { |
| int CurDistance = std::abs(ConsecutiveChain[i] - i); |
| int NewDistance = std::abs(ConsecutiveChain[i] - j); |
| if (j < i || NewDistance > CurDistance) |
| continue; // Should not insert. |
| } |
| |
| Tails.push_back(j); |
| Heads.push_back(i); |
| ConsecutiveChain[i] = j; |
| } |
| } |
| } |
| |
| bool Changed = false; |
| SmallPtrSet<Instruction *, 16> InstructionsProcessed; |
| |
| for (int Head : Heads) { |
| if (InstructionsProcessed.count(Instrs[Head])) |
| continue; |
| bool LongerChainExists = false; |
| for (unsigned TIt = 0; TIt < Tails.size(); TIt++) |
| if (Head == Tails[TIt] && |
| !InstructionsProcessed.count(Instrs[Heads[TIt]])) { |
| LongerChainExists = true; |
| break; |
| } |
| if (LongerChainExists) |
| continue; |
| |
| // We found an instr that starts a chain. Now follow the chain and try to |
| // vectorize it. |
| SmallVector<Instruction *, 16> Operands; |
| int I = Head; |
| while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) { |
| if (InstructionsProcessed.count(Instrs[I])) |
| break; |
| |
| Operands.push_back(Instrs[I]); |
| I = ConsecutiveChain[I]; |
| } |
| |
| bool Vectorized = false; |
| if (isa<LoadInst>(*Operands.begin())) |
| Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed); |
| else |
| Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed); |
| |
| Changed |= Vectorized; |
| } |
| |
| return Changed; |
| } |
| |
| bool Vectorizer::vectorizeStoreChain( |
| ArrayRef<Instruction *> Chain, |
| SmallPtrSet<Instruction *, 16> *InstructionsProcessed) { |
| StoreInst *S0 = cast<StoreInst>(Chain[0]); |
| |
| // If the vector has an int element, default to int for the whole store. |
| Type *StoreTy = nullptr; |
| for (Instruction *I : Chain) { |
| StoreTy = cast<StoreInst>(I)->getValueOperand()->getType(); |
| if (StoreTy->isIntOrIntVectorTy()) |
| break; |
| |
| if (StoreTy->isPtrOrPtrVectorTy()) { |
| StoreTy = Type::getIntNTy(F.getParent()->getContext(), |
| DL.getTypeSizeInBits(StoreTy)); |
| break; |
| } |
| } |
| assert(StoreTy && "Failed to find store type"); |
| |
| unsigned Sz = DL.getTypeSizeInBits(StoreTy); |
| unsigned AS = S0->getPointerAddressSpace(); |
| unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); |
| unsigned VF = VecRegSize / Sz; |
| unsigned ChainSize = Chain.size(); |
| unsigned Alignment = getAlignment(S0); |
| |
| if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { |
| InstructionsProcessed->insert(Chain.begin(), Chain.end()); |
| return false; |
| } |
| |
| ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain); |
| if (NewChain.empty()) { |
| // No vectorization possible. |
| InstructionsProcessed->insert(Chain.begin(), Chain.end()); |
| return false; |
| } |
| if (NewChain.size() == 1) { |
| // Failed after the first instruction. Discard it and try the smaller chain. |
| InstructionsProcessed->insert(NewChain.front()); |
| return false; |
| } |
| |
| // Update Chain to the valid vectorizable subchain. |
| Chain = NewChain; |
| ChainSize = Chain.size(); |
| |
| // Check if it's legal to vectorize this chain. If not, split the chain and |
| // try again. |
| unsigned EltSzInBytes = Sz / 8; |
| unsigned SzInBytes = EltSzInBytes * ChainSize; |
| |
| VectorType *VecTy; |
| VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy); |
| if (VecStoreTy) |
| VecTy = VectorType::get(StoreTy->getScalarType(), |
| Chain.size() * VecStoreTy->getNumElements()); |
| else |
| VecTy = VectorType::get(StoreTy, Chain.size()); |
| |
| // If it's more than the max vector size or the target has a better |
| // vector factor, break it into two pieces. |
| unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy); |
| if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) { |
| LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor." |
| " Creating two separate arrays.\n"); |
| return vectorizeStoreChain(Chain.slice(0, TargetVF), |
| InstructionsProcessed) | |
| vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed); |
| } |
| |
| LLVM_DEBUG({ |
| dbgs() << "LSV: Stores to vectorize:\n"; |
| for (Instruction *I : Chain) |
| dbgs() << " " << *I << "\n"; |
| }); |
| |
| // We won't try again to vectorize the elements of the chain, regardless of |
| // whether we succeed below. |
| InstructionsProcessed->insert(Chain.begin(), Chain.end()); |
| |
| // If the store is going to be misaligned, don't vectorize it. |
| if (accessIsMisaligned(SzInBytes, AS, Alignment)) { |
| if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) { |
| auto Chains = splitOddVectorElts(Chain, Sz); |
| return vectorizeStoreChain(Chains.first, InstructionsProcessed) | |
| vectorizeStoreChain(Chains.second, InstructionsProcessed); |
| } |
| |
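| // We are storing to the stack (alloca address space); instead of splitting, |
| // try to prove or enforce a larger alignment on the underlying object so |
| // the vectorized store may still be emitted. |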
| unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(), |
| StackAdjustedAlignment, |
| DL, S0, nullptr, &DT); |
| if (NewAlign != 0) |
| Alignment = NewAlign; |
| } |
| |
| if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) { |
| auto Chains = splitOddVectorElts(Chain, Sz); |
| return vectorizeStoreChain(Chains.first, InstructionsProcessed) | |
| vectorizeStoreChain(Chains.second, InstructionsProcessed); |
| } |
| |
| BasicBlock::iterator First, Last; |
| std::tie(First, Last) = getBoundaryInstrs(Chain); |
| Builder.SetInsertPoint(&*Last); |
| |
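| // Build the value to store: insert each scalar operand (or each element of |
| // each vector operand) into Vec at its position in the combined vector. |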
| Value *Vec = UndefValue::get(VecTy); |
| |
| if (VecStoreTy) { |
| unsigned VecWidth = VecStoreTy->getNumElements(); |
| for (unsigned I = 0, E = Chain.size(); I != E; ++I) { |
| StoreInst *Store = cast<StoreInst>(Chain[I]); |
| for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) { |
| unsigned NewIdx = J + I * VecWidth; |
| Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(), |
| Builder.getInt32(J)); |
| if (Extract->getType() != StoreTy->getScalarType()) |
| Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType()); |
| |
| Value *Insert = |
| Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx)); |
| Vec = Insert; |
| } |
| } |
| } else { |
| for (unsigned I = 0, E = Chain.size(); I != E; ++I) { |
| StoreInst *Store = cast<StoreInst>(Chain[I]); |
| Value *Extract = Store->getValueOperand(); |
| if (Extract->getType() != StoreTy->getScalarType()) |
| Extract = |
| Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType()); |
| |
| Value *Insert = |
| Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I)); |
| Vec = Insert; |
| } |
| } |
| |
| StoreInst *SI = Builder.CreateAlignedStore( |
| Vec, |
| Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)), |
| Alignment); |
| propagateMetadata(SI, Chain); |
| |
| eraseInstructions(Chain); |
| ++NumVectorInstructions; |
| NumScalarsVectorized += Chain.size(); |
| return true; |
| } |
| |
| bool Vectorizer::vectorizeLoadChain( |
| ArrayRef<Instruction *> Chain, |
| SmallPtrSet<Instruction *, 16> *InstructionsProcessed) { |
| LoadInst *L0 = cast<LoadInst>(Chain[0]); |
| |
| // If the vector has an int element, default to int for the whole load. |
| Type *LoadTy = nullptr; |
| for (const auto &V : Chain) { |
| LoadTy = cast<LoadInst>(V)->getType(); |
| if (LoadTy->isIntOrIntVectorTy()) |
| break; |
| |
| if (LoadTy->isPtrOrPtrVectorTy()) { |
| LoadTy = Type::getIntNTy(F.getParent()->getContext(), |
| DL.getTypeSizeInBits(LoadTy)); |
| break; |
| } |
| } |
| assert(LoadTy && "Can't determine LoadInst type from chain"); |
| |
| unsigned Sz = DL.getTypeSizeInBits(LoadTy); |
| unsigned AS = L0->getPointerAddressSpace(); |
| unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); |
| unsigned VF = VecRegSize / Sz; |
| unsigned ChainSize = Chain.size(); |
| unsigned Alignment = getAlignment(L0); |
| |
| if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { |
| InstructionsProcessed->insert(Chain.begin(), Chain.end()); |
| return false; |
| } |
| |
| ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain); |
| if (NewChain.empty()) { |
| // No vectorization possible. |
| InstructionsProcessed->insert(Chain.begin(), Chain.end()); |
| return false; |
| } |
| if (NewChain.size() == 1) { |
| // Failed after the first instruction. Discard it and try the smaller chain. |
| InstructionsProcessed->insert(NewChain.front()); |
| return false; |
| } |
| |
| // Update Chain to the valid vectorizable subchain. |
| Chain = NewChain; |
| ChainSize = Chain.size(); |
| |
| // Check if it's legal to vectorize this chain. If not, split the chain and |
| // try again. |
| unsigned EltSzInBytes = Sz / 8; |
| unsigned SzInBytes = EltSzInBytes * ChainSize; |
| VectorType *VecTy; |
| VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy); |
| if (VecLoadTy) |
| VecTy = VectorType::get(LoadTy->getScalarType(), |
| Chain.size() * VecLoadTy->getNumElements()); |
| else |
| VecTy = VectorType::get(LoadTy, Chain.size()); |
| |
| // If it's more than the max vector size or the target has a better |
| // vector factor, break it into two pieces. |
| unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy); |
| if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) { |
| LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor." |
| " Creating two separate arrays.\n"); |
| return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) | |
| vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed); |
| } |
| |
| // We won't try again to vectorize the elements of the chain, regardless of |
| // whether we succeed below. |
| InstructionsProcessed->insert(Chain.begin(), Chain.end()); |
| |
| // If the load is going to be misaligned, don't vectorize it. |
| if (accessIsMisaligned(SzInBytes, AS, Alignment)) { |
| if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) { |
| auto Chains = splitOddVectorElts(Chain, Sz); |
| return vectorizeLoadChain(Chains.first, InstructionsProcessed) | |
| vectorizeLoadChain(Chains.second, InstructionsProcessed); |
| } |
| |
| Alignment = getOrEnforceKnownAlignment( |
| L0->getPointerOperand(), StackAdjustedAlignment, DL, L0, nullptr, &DT); |
| } |
| |
| if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) { |
| auto Chains = splitOddVectorElts(Chain, Sz); |
| return vectorizeLoadChain(Chains.first, InstructionsProcessed) | |
| vectorizeLoadChain(Chains.second, InstructionsProcessed); |
| } |
| |
| LLVM_DEBUG({ |
| dbgs() << "LSV: Loads to vectorize:\n"; |
| for (Instruction *I : Chain) |
| dbgs() << " " << *I << "\n"; |
| }); |
| |
| // getVectorizablePrefix already computed getBoundaryInstrs. The value of |
| // Last may have changed since then, but the value of First won't have. If it |
| // matters, we could compute getBoundaryInstrs only once and reuse it here. |
| BasicBlock::iterator First, Last; |
| std::tie(First, Last) = getBoundaryInstrs(Chain); |
| Builder.SetInsertPoint(&*First); |
| |
| Value *Bitcast = |
| Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS)); |
| LoadInst *LI = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment); |
| propagateMetadata(LI, Chain); |
| |
| if (VecLoadTy) { |
| SmallVector<Instruction *, 16> InstrsToErase; |
| |
| unsigned VecWidth = VecLoadTy->getNumElements(); |
| for (unsigned I = 0, E = Chain.size(); I != E; ++I) { |
| for (auto Use : Chain[I]->users()) { |
| // All users of vector loads are ExtractElement instructions with |
| // constant indices, otherwise we would have bailed before now. |
| Instruction *UI = cast<Instruction>(Use); |
| unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue(); |
| unsigned NewIdx = Idx + I * VecWidth; |
| Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx), |
| UI->getName()); |
| if (V->getType() != UI->getType()) |
| V = Builder.CreateBitCast(V, UI->getType()); |
| |
| // Replace the old instruction. |
| UI->replaceAllUsesWith(V); |
| InstrsToErase.push_back(UI); |
| } |
| } |
| |
| // Bitcast might not be an Instruction if the pointer we load from is a |
| // constant. In that case, there is nothing to reorder. |
| if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast)) |
| reorder(BitcastInst); |
| |
| for (auto I : InstrsToErase) |
| I->eraseFromParent(); |
| } else { |
| for (unsigned I = 0, E = Chain.size(); I != E; ++I) { |
| Value *CV = Chain[I]; |
| Value *V = |
| Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName()); |
| if (V->getType() != CV->getType()) { |
| V = Builder.CreateBitOrPointerCast(V, CV->getType()); |
| } |
| |
| // Replace the old instruction. |
| CV->replaceAllUsesWith(V); |
| } |
| |
| if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast)) |
| reorder(BitcastInst); |
| } |
| |
| eraseInstructions(Chain); |
| |
| ++NumVectorInstructions; |
| NumScalarsVectorized += Chain.size(); |
| return true; |
| } |
| |
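| // An access is considered aligned enough if its alignment is a multiple of |
| // its size; otherwise defer to the target, which must report misaligned |
| // accesses of this size as both allowed and fast. |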
| bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace, |
| unsigned Alignment) { |
| if (Alignment % SzInBytes == 0) |
| return false; |
| |
| bool Fast = false; |
| bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(), |
| SzInBytes * 8, AddressSpace, |
| Alignment, &Fast); |
| LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows |
| << " and fast? " << Fast << "\n";); |
| return !Allows || !Fast; |
| } |