blob: 2b5d4d81dd13b4ee45dab7ddd13780a079f4c022 [file] [log] [blame]
//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <memory>
#include <set>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;
#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
static cl::opt<int>
SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
cl::desc("Only vectorize if you gain more than this "
"number "));
static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
cl::desc("Attempt to vectorize horizontal reductions"));
static cl::opt<bool> ShouldStartVectorizeHorAtStore(
"slp-vectorize-hor-store", cl::init(false), cl::Hidden,
cl::desc(
"Attempt to vectorize horizontal reductions feeding into a store"));
static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
/// Limits the size of scheduling regions in a block.
/// It avoid long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
cl::desc("Limit the size of the SLP scheduling region per block"));
static cl::opt<int> MinVectorRegSizeOption(
"slp-min-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
static cl::opt<unsigned> RecursionMaxDepth(
"slp-recursion-max-depth", cl::init(12), cl::Hidden,
cl::desc("Limit the recursion depth when building a vectorizable tree"));
static cl::opt<unsigned> MinTreeSize(
"slp-min-tree-size", cl::init(3), cl::Hidden,
cl::desc("Only vectorize small trees if they are fully vectorizable"));
static cl::opt<bool>
ViewSLPTree("view-slp-tree", cl::Hidden,
cl::desc("Display the SLP trees with Graphviz"));
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;
// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;
/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;
/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
!Ty->isPPC_FP128Ty();
}
/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
if (!I0)
return false;
BasicBlock *BB = I0->getParent();
for (int i = 1, e = VL.size(); i < e; i++) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
if (!I)
return false;
if (BB != I->getParent())
return false;
}
return true;
}
/// \returns True if all of the values in \p VL are constants.
static bool allConstant(ArrayRef<Value *> VL) {
for (Value *i : VL)
if (!isa<Constant>(i))
return false;
return true;
}
/// \returns True if all of the values in \p VL are identical.
static bool isSplat(ArrayRef<Value *> VL) {
for (unsigned i = 1, e = VL.size(); i < e; ++i)
if (VL[i] != VL[0])
return false;
return true;
}
/// \returns True if \p I is commutative, handles CmpInst as well as Instruction.
static bool isCommutative(Instruction *I) {
if (auto *IC = dyn_cast<CmpInst>(I))
return IC->isCommutative();
return I->isCommutative();
}
/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
/// i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
/// We convert this initially to something like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0
/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
/// %5 = mul <4 x i8> %4, %4
/// %6 = extractelement <4 x i8> %5, i32 0
/// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0
/// %7 = extractelement <4 x i8> %5, i32 1
/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
/// %8 = extractelement <4 x i8> %5, i32 2
/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
/// %9 = extractelement <4 x i8> %5, i32 3
/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
/// ret <4 x i8> %ins4
/// InstCombiner transforms this into a shuffle and vector mul
/// TODO: Can we split off and reuse the shuffle mask detection from
/// TargetTransformInfo::getInstructionThroughput?
static Optional<TargetTransformInfo::ShuffleKind>
isShuffle(ArrayRef<Value *> VL) {
auto *EI0 = cast<ExtractElementInst>(VL[0]);
unsigned Size = EI0->getVectorOperandType()->getVectorNumElements();
Value *Vec1 = nullptr;
Value *Vec2 = nullptr;
enum ShuffleMode { Unknown, Select, Permute };
ShuffleMode CommonShuffleMode = Unknown;
for (unsigned I = 0, E = VL.size(); I < E; ++I) {
auto *EI = cast<ExtractElementInst>(VL[I]);
auto *Vec = EI->getVectorOperand();
// All vector operands must have the same number of vector elements.
if (Vec->getType()->getVectorNumElements() != Size)
return None;
auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
if (!Idx)
return None;
// Undefined behavior if Idx is negative or >= Size.
if (Idx->getValue().uge(Size))
continue;
unsigned IntIdx = Idx->getValue().getZExtValue();
// We can extractelement from undef vector.
if (isa<UndefValue>(Vec))
continue;
// For correct shuffling we have to have at most 2 different vector operands
// in all extractelement instructions.
if (!Vec1 || Vec1 == Vec)
Vec1 = Vec;
else if (!Vec2 || Vec2 == Vec)
Vec2 = Vec;
else
return None;
if (CommonShuffleMode == Permute)
continue;
// If the extract index is not the same as the operation number, it is a
// permutation.
if (IntIdx != I) {
CommonShuffleMode = Permute;
continue;
}
CommonShuffleMode = Select;
}
// If we're not crossing lanes in different vectors, consider it as blending.
if (CommonShuffleMode == Select && Vec2)
return TargetTransformInfo::SK_Select;
// If Vec2 was never used, we have a permutation of a single vector, otherwise
// we have permutation of 2 vectors.
return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
: TargetTransformInfo::SK_PermuteSingleSrc;
}
namespace {
/// Main data required for vectorization of instructions.
struct InstructionsState {
/// The very first instruction in the list with the main opcode.
Value *OpValue = nullptr;
/// The main/alternate instruction.
Instruction *MainOp = nullptr;
Instruction *AltOp = nullptr;
/// The main/alternate opcodes for the list of instructions.
unsigned getOpcode() const {
return MainOp ? MainOp->getOpcode() : 0;
}
unsigned getAltOpcode() const {
return AltOp ? AltOp->getOpcode() : 0;
}
/// Some of the instructions in the list have alternate opcodes.
bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
bool isOpcodeOrAlt(Instruction *I) const {
unsigned CheckedOpcode = I->getOpcode();
return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
}
InstructionsState() = delete;
InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
: OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
};
} // end anonymous namespace
/// Chooses the correct key for scheduling data. If \p Op has the same (or
/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
/// OpValue.
static Value *isOneOf(const InstructionsState &S, Value *Op) {
auto *I = dyn_cast<Instruction>(Op);
if (I && S.isOpcodeOrAlt(I))
return Op;
return S.OpValue;
}
/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState, the Opcode that we suppose the whole list
/// could be vectorized even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
unsigned BaseIndex = 0) {
// Make sure these are all Instructions.
if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
unsigned AltOpcode = Opcode;
unsigned AltIndex = BaseIndex;
// Check for one alternate opcode from another BinaryOperator.
// TODO - generalize to support all operators (types, calls etc.).
for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
if (Opcode == AltOpcode) {
AltOpcode = InstOpcode;
AltIndex = Cnt;
continue;
}
} else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
if (Ty0 == Ty1) {
if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
if (Opcode == AltOpcode) {
AltOpcode = InstOpcode;
AltIndex = Cnt;
continue;
}
}
} else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
}
return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
cast<Instruction>(VL[AltIndex]));
}
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
Type *Ty = VL[0]->getType();
for (int i = 1, e = VL.size(); i < e; i++)
if (VL[i]->getType() != Ty)
return false;
return true;
}
/// \returns True if Extract{Value,Element} instruction extracts element Idx.
static Optional<unsigned> getExtractIndex(Instruction *E) {
unsigned Opcode = E->getOpcode();
assert((Opcode == Instruction::ExtractElement ||
Opcode == Instruction::ExtractValue) &&
"Expected extractelement or extractvalue instruction.");
if (Opcode == Instruction::ExtractElement) {
auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
if (!CI)
return None;
return CI->getZExtValue();
}
ExtractValueInst *EI = cast<ExtractValueInst>(E);
if (EI->getNumIndices() != 1)
return None;
return *EI->idx_begin();
}
/// \returns True if in-tree use also needs extract. This refers to
/// possible scalar operand in vectorized instruction.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
TargetLibraryInfo *TLI) {
unsigned Opcode = UserInst->getOpcode();
switch (Opcode) {
case Instruction::Load: {
LoadInst *LI = cast<LoadInst>(UserInst);
return (LI->getPointerOperand() == Scalar);
}
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(UserInst);
return (SI->getPointerOperand() == Scalar);
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(UserInst);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
if (hasVectorInstrinsicScalarOpd(ID, i))
return (CI->getArgOperand(i) == Scalar);
}
LLVM_FALLTHROUGH;
}
default:
return false;
}
}
/// \returns the AA location that is being access by the instruction.
static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
if (StoreInst *SI = dyn_cast<StoreInst>(I))
return MemoryLocation::get(SI);
if (LoadInst *LI = dyn_cast<LoadInst>(I))
return MemoryLocation::get(LI);
return MemoryLocation();
}
/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
if (LoadInst *LI = dyn_cast<LoadInst>(I))
return LI->isSimple();
if (StoreInst *SI = dyn_cast<StoreInst>(I))
return SI->isSimple();
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
return !MI->isVolatile();
return true;
}
namespace llvm {
namespace slpvectorizer {
/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
using ValueList = SmallVector<Value *, 8>;
using InstrList = SmallVector<Instruction *, 16>;
using ValueSet = SmallPtrSet<Value *, 16>;
using StoreList = SmallVector<StoreInst *, 8>;
using ExtraValueToDebugLocsMap =
MapVector<Value *, SmallVector<Instruction *, 2>>;
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
const DataLayout *DL, OptimizationRemarkEmitter *ORE)
: F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
// Use the vector register size specified by the target unless overridden
// by a command-line option.
// TODO: It would be better to limit the vectorization factor based on
// data type rather than just register size. For example, x86 AVX has
// 256-bit registers, but it does not support integer operations
// at that width (that requires AVX2).
if (MaxVectorRegSizeOption.getNumOccurrences())
MaxVecRegSize = MaxVectorRegSizeOption;
else
MaxVecRegSize = TTI->getRegisterBitWidth(true);
if (MinVectorRegSizeOption.getNumOccurrences())
MinVecRegSize = MinVectorRegSizeOption;
else
MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
}
/// Vectorize the tree that starts with the elements in \p VL.
/// Returns the vectorized root.
Value *vectorizeTree();
/// Vectorize the tree but with the list of externally used values \p
/// ExternallyUsedValues. Values in this MapVector can be replaced but the
/// generated extractvalue instructions.
Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.
int getSpillCost();
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
int getTreeCost();
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
void buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst = None);
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
/// into account (anf updating it, if required) list of externally used
/// values stored in \p ExternallyUsedValues.
void buildTree(ArrayRef<Value *> Roots,
ExtraValueToDebugLocsMap &ExternallyUsedValues,
ArrayRef<Value *> UserIgnoreLst = None);
/// Clear the internal data structures that are created by 'buildTree'.
void deleteTree() {
VectorizableTree.clear();
ScalarToTreeEntry.clear();
MustGather.clear();
ExternalUses.clear();
NumOpsWantToKeepOrder.clear();
NumOpsWantToKeepOriginalOrder = 0;
for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();
BS->clear();
}
MinBWs.clear();
}
unsigned getTreeSize() const { return VectorizableTree.size(); }
/// Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
/// \returns The best order of instructions for vectorization.
Optional<ArrayRef<unsigned>> bestOrder() const {
auto I = std::max_element(
NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
[](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
return D1.second < D2.second;
});
if (I == NumOpsWantToKeepOrder.end() ||
I->getSecond() <= NumOpsWantToKeepOriginalOrder)
return None;
return makeArrayRef(I->getFirst());
}
/// \return The vector element size in bits to use when vectorizing the
/// expression tree ending at \p V. If V is a store, the size is the width of
/// the stored value. Otherwise, the size is the width of the largest loaded
/// value reaching V. This method is used by the vectorizer to calculate
/// vectorization factors.
unsigned getVectorElementSize(Value *V) const;
/// Compute the minimum type sizes required to represent the entries in a
/// vectorizable tree.
void computeMinimumValueSizes();
// \returns maximum vector register size as set by TTI or overridden by cl::opt.
unsigned getMaxVecRegSize() const {
return MaxVecRegSize;
}
// \returns minimum vector register size as set by cl::opt.
unsigned getMinVecRegSize() const {
return MinVecRegSize;
}
/// Check if ArrayType or StructType is isomorphic to some VectorType.
///
/// \returns number of elements in vector if isomorphism exists, 0 otherwise.
unsigned canMapToVector(Type *T, const DataLayout &DL) const;
/// \returns True if the VectorizableTree is both tiny and not fully
/// vectorizable. We do not vectorize such trees.
bool isTreeTinyAndNotFullyVectorizable() const;
OptimizationRemarkEmitter *getORE() { return ORE; }
/// This structure holds any data we need about the edges being traversed
/// during buildTree_rec(). We keep track of:
/// (i) the user TreeEntry index, and
/// (ii) the index of the edge.
struct EdgeInfo {
EdgeInfo() = default;
/// The index of the user TreeEntry in VectorizableTree.
int Idx = -1;
/// The operand index of the use.
unsigned EdgeIdx = UINT_MAX;
#ifndef NDEBUG
friend inline raw_ostream &operator<<(raw_ostream &OS,
const BoUpSLP::EdgeInfo &EI) {
EI.dump(OS);
return OS;
}
/// Debug print.
void dump(raw_ostream &OS) const {
OS << "{User:" << Idx << " EdgeIdx:" << EdgeIdx << "}";
}
LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
};
private:
struct TreeEntry;
/// Checks if all users of \p I are the part of the vectorization tree.
bool areAllUsersVectorized(Instruction *I) const;
/// \returns the cost of the vectorizable entry.
int getEntryCost(TreeEntry *E);
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, EdgeInfo EI);
/// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
/// be vectorized to use the original vector (or aggregate "bitcast" to a
/// vector) and sets \p CurrentOrder to the identity permutation; otherwise
/// returns false, setting \p CurrentOrder to either an empty vector or a
/// non-identity permutation that allows to reuse extract instructions.
bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
SmallVectorImpl<unsigned> &CurrentOrder) const;
/// Vectorize a single entry in the tree.
Value *vectorizeTree(TreeEntry *E);
/// Vectorize a single entry in the tree, starting in \p VL.
Value *vectorizeTree(ArrayRef<Value *> VL);
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices) const;
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
int getGatherCost(ArrayRef<Value *> VL) const;
/// Set the Builder insert point to one after the last instruction in
/// the bundle
void setInsertPointAfterBundle(ArrayRef<Value *> VL,
const InstructionsState &S);
/// \returns a vector from a collection of scalars in \p VL.
Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
/// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even the tree height is tiny.
bool isFullyVectorizableTinyTree() const;
/// \reorder commutative operands to get better probability of
/// generating vectorized code.
void reorderInputsAccordingToOpcode(const InstructionsState &S,
ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) const;
struct TreeEntry {
TreeEntry(std::vector<TreeEntry> &Container) : Container(Container) {}
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
if (VL.size() == Scalars.size())
return std::equal(VL.begin(), VL.end(), Scalars.begin());
return VL.size() == ReuseShuffleIndices.size() &&
std::equal(
VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
[this](Value *V, unsigned Idx) { return V == Scalars[Idx]; });
}
/// A vector of scalars.
ValueList Scalars;
/// The Scalars are vectorized into this value. It is initialized to Null.
Value *VectorizedValue = nullptr;
/// Do we need to gather this sequence ?
bool NeedToGather = false;
/// Does this sequence require some shuffling?
SmallVector<unsigned, 4> ReuseShuffleIndices;
/// Does this entry require reordering?
ArrayRef<unsigned> ReorderIndices;
/// Points back to the VectorizableTree.
///
/// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
/// to be a pointer and needs to be able to initialize the child iterator.
/// Thus we need a reference back to the container to translate the indices
/// to entries.
std::vector<TreeEntry> &Container;
/// The TreeEntry index containing the user of this entry. We can actually
/// have multiple users so the data structure is not truly a tree.
SmallVector<EdgeInfo, 1> UserTreeIndices;
private:
/// The operands of each instruction in each lane Operands[op_index][lane].
/// Note: This helps avoid the replication of the code that performs the
/// reordering of operands during buildTree_rec() and vectorizeTree().
SmallVector<ValueList, 2> Operands;
public:
/// Set this bundle's \p OpIdx'th operand to \p OpVL.
void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL,
ArrayRef<unsigned> ReuseShuffleIndices) {
if (Operands.size() < OpIdx + 1)
Operands.resize(OpIdx + 1);
assert(Operands[OpIdx].size() == 0 && "Already resized?");
Operands[OpIdx].resize(Scalars.size());
for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane)
Operands[OpIdx][Lane] = (!ReuseShuffleIndices.empty())
? OpVL[ReuseShuffleIndices[Lane]]
: OpVL[Lane];
}
/// If there is a user TreeEntry, then set its operand.
void trySetUserTEOperand(const EdgeInfo &UserTreeIdx,
ArrayRef<Value *> OpVL,
ArrayRef<unsigned> ReuseShuffleIndices) {
if (UserTreeIdx.Idx >= 0) {
auto &VectorizableTree = Container;
VectorizableTree[UserTreeIdx.Idx].setOperand(UserTreeIdx.EdgeIdx, OpVL,
ReuseShuffleIndices);
}
}
/// \returns the \p OpIdx operand of this TreeEntry.
ValueList &getOperand(unsigned OpIdx) {
assert(OpIdx < Operands.size() && "Off bounds");
return Operands[OpIdx];
}
/// \return the single \p OpIdx operand.
Value *getSingleOperand(unsigned OpIdx) const {
assert(OpIdx < Operands.size() && "Off bounds");
assert(!Operands[OpIdx].empty() && "No operand availabe");
return Operands[OpIdx][0];
}
#ifndef NDEBUG
/// Debug printer.
LLVM_DUMP_METHOD void dump() const {
for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
dbgs() << "Operand " << OpI << ":\n";
for (const Value *V : Operands[OpI])
dbgs().indent(2) << *V << "\n";
}
dbgs() << "Scalars: \n";
for (Value *V : Scalars)
dbgs().indent(2) << *V << "\n";
dbgs() << "NeedToGather: " << NeedToGather << "\n";
dbgs() << "VectorizedValue: ";
if (VectorizedValue)
dbgs() << *VectorizedValue;
else
dbgs() << "NULL";
dbgs() << "\n";
dbgs() << "ReuseShuffleIndices: ";
if (ReuseShuffleIndices.empty())
dbgs() << "Emtpy";
else
for (unsigned Idx : ReuseShuffleIndices)
dbgs() << Idx << ", ";
dbgs() << "\n";
dbgs() << "ReorderIndices: ";
for (unsigned Idx : ReorderIndices)
dbgs() << Idx << ", ";
dbgs() << "\n";
dbgs() << "UserTreeIndices: ";
for (const auto &EInfo : UserTreeIndices)
dbgs() << EInfo << ", ";
dbgs() << "\n";
}
#endif
};
/// Create a new VectorizableTree entry.
void newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
EdgeInfo &UserTreeIdx,
ArrayRef<unsigned> ReuseShuffleIndices = None,
ArrayRef<unsigned> ReorderIndices = None) {
VectorizableTree.emplace_back(VectorizableTree);
int idx = VectorizableTree.size() - 1;
TreeEntry *Last = &VectorizableTree[idx];
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->NeedToGather = !Vectorized;
Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
ReuseShuffleIndices.end());
Last->ReorderIndices = ReorderIndices;
if (Vectorized) {
for (int i = 0, e = VL.size(); i != e; ++i) {
assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
ScalarToTreeEntry[VL[i]] = idx;
}
} else {
MustGather.insert(VL.begin(), VL.end());
}
if (UserTreeIdx.Idx >= 0)
Last->UserTreeIndices.push_back(UserTreeIdx);
Last->trySetUserTEOperand(UserTreeIdx, VL, ReuseShuffleIndices);
UserTreeIdx.Idx = idx;
}
/// -- Vectorization State --
/// Holds all of the tree entries.
std::vector<TreeEntry> VectorizableTree;
#ifndef NDEBUG
/// Debug printer.
LLVM_DUMP_METHOD void dumpVectorizableTree() const {
for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
dbgs() << Id << ".\n";
VectorizableTree[Id].dump();
dbgs() << "\n";
}
}
#endif
TreeEntry *getTreeEntry(Value *V) {
auto I = ScalarToTreeEntry.find(V);
if (I != ScalarToTreeEntry.end())
return &VectorizableTree[I->second];
return nullptr;
}
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value*, int> ScalarToTreeEntry;
/// A list of scalars that we found that we need to keep as scalars.
ValueSet MustGather;
/// This POD struct describes one external user in the vectorized tree.
struct ExternalUser {
ExternalUser(Value *S, llvm::User *U, int L)
: Scalar(S), User(U), Lane(L) {}
// Which scalar in our function.
Value *Scalar;
// Which user that uses the scalar.
llvm::User *User;
// Which lane does the scalar belong to.
int Lane;
};
using UserList = SmallVector<ExternalUser, 16>;
/// Checks if two instructions may access the same memory.
///
/// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
/// is invariant in the calling loop.
bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
Instruction *Inst2) {
// First check if the result is already in the cache.
AliasCacheKey key = std::make_pair(Inst1, Inst2);
Optional<bool> &result = AliasCache[key];
if (result.hasValue()) {
return result.getValue();
}
MemoryLocation Loc2 = getLocation(Inst2, AA);
bool aliased = true;
if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
// Do the alias check.
aliased = AA->alias(Loc1, Loc2);
}
// Store the result in the cache.
result = aliased;
return aliased;
}
using AliasCacheKey = std::pair<Instruction *, Instruction *>;
/// Cache for alias results.
/// TODO: consider moving this to the AliasAnalysis itself.
DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
/// Removes an instruction from its block and eventually deletes it.
/// It's like Instruction::eraseFromParent() except that the actual deletion
/// is delayed until BoUpSLP is destructed.
/// This is required to ensure that there are no incorrect collisions in the
/// AliasCache, which can happen if a new instruction is allocated at the
/// same address as a previously deleted instruction.
void eraseInstruction(Instruction *I) {
I->removeFromParent();
I->dropAllReferences();
DeletedInstructions.emplace_back(I);
}
/// Temporary store for deleted instructions. Instructions will be deleted
/// eventually when the BoUpSLP is destructed.
SmallVector<unique_value, 8> DeletedInstructions;
/// A list of values that need to extracted out of the tree.
/// This list holds pairs of (Internal Scalar : External User). External User
/// can be nullptr, it means that this Internal Scalar will be used later,
/// after vectorization.
UserList ExternalUses;
/// Values used only by @llvm.assume calls.
SmallPtrSet<const Value *, 32> EphValues;
/// Holds all of the instructions that we gathered.
SetVector<Instruction *> GatherSeq;
/// A list of blocks that we are going to CSE.
SetVector<BasicBlock *> CSEBlocks;
/// Contains all scheduling relevant data for an instruction.
/// A ScheduleData either represents a single instruction or a member of an
/// instruction bundle (= a group of instructions which is combined into a
/// vector instruction).
struct ScheduleData {
// The initial value for the dependency counters. It means that the
// dependencies are not calculated yet.
enum { InvalidDeps = -1 };
ScheduleData() = default;
void init(int BlockSchedulingRegionID, Value *OpVal) {
FirstInBundle = this;
NextInBundle = nullptr;
NextLoadStore = nullptr;
IsScheduled = false;
SchedulingRegionID = BlockSchedulingRegionID;
UnscheduledDepsInBundle = UnscheduledDeps;
clearDependencies();
OpValue = OpVal;
}
/// Returns true if the dependency information has been calculated.
bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
/// Returns true for single instructions and for bundle representatives
/// (= the head of a bundle).
bool isSchedulingEntity() const { return FirstInBundle == this; }
/// Returns true if it represents an instruction bundle and not only a
/// single instruction.
bool isPartOfBundle() const {
return NextInBundle != nullptr || FirstInBundle != this;
}
/// Returns true if it is ready for scheduling, i.e. it has no more
/// unscheduled depending instructions/bundles.
bool isReady() const {
assert(isSchedulingEntity() &&
"can't consider non-scheduling entity for ready list");
return UnscheduledDepsInBundle == 0 && !IsScheduled;
}
/// Modifies the number of unscheduled dependencies, also updating it for
/// the whole bundle.
int incrementUnscheduledDeps(int Incr) {
UnscheduledDeps += Incr;
return FirstInBundle->UnscheduledDepsInBundle += Incr;
}
/// Sets the number of unscheduled dependencies to the number of
/// dependencies.
void resetUnscheduledDeps() {
incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
}
/// Clears all dependency information.
void clearDependencies() {
Dependencies = InvalidDeps;
resetUnscheduledDeps();
MemoryDependencies.clear();
}
void dump(raw_ostream &os) const {
if (!isSchedulingEntity()) {
os << "/ " << *Inst;
} else if (NextInBundle) {
os << '[' << *Inst;
ScheduleData *SD = NextInBundle;
while (SD) {
os << ';' << *SD->Inst;
SD = SD->NextInBundle;
}
os << ']';
} else {
os << *Inst;
}
}
Instruction *Inst = nullptr;
/// Points to the head in an instruction bundle (and always to this for
/// single instructions).
ScheduleData *FirstInBundle = nullptr;
/// Single linked list of all instructions in a bundle. Null if it is a
/// single instruction.
ScheduleData *NextInBundle = nullptr;
/// Single linked list of all memory instructions (e.g. load, store, call)
/// in the block - until the end of the scheduling region.
ScheduleData *NextLoadStore = nullptr;
/// The dependent memory instructions.
/// This list is derived on demand in calculateDependencies().
SmallVector<ScheduleData *, 4> MemoryDependencies;
/// This ScheduleData is in the current scheduling region if this matches
/// the current SchedulingRegionID of BlockScheduling.
int SchedulingRegionID = 0;
/// Used for getting a "good" final ordering of instructions.
int SchedulingPriority = 0;
/// The number of dependencies. Constitutes of the number of users of the
/// instruction plus the number of dependent memory instructions (if any).
/// This value is calculated on demand.
/// If InvalidDeps, the number of dependencies is not calculated yet.
int Dependencies = InvalidDeps;
/// The number of dependencies minus the number of dependencies of scheduled
/// instructions. As soon as this is zero, the instruction/bundle gets ready
/// for scheduling.
/// Note that this is negative as long as Dependencies is not calculated.
int UnscheduledDeps = InvalidDeps;
/// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
/// single instructions.
int UnscheduledDepsInBundle = InvalidDeps;
/// True if this instruction is scheduled (or considered as scheduled in the
/// dry-run).
bool IsScheduled = false;
/// Opcode of the current instruction in the schedule data.
Value *OpValue = nullptr;
};
#ifndef NDEBUG
friend inline raw_ostream &operator<<(raw_ostream &os,
const BoUpSLP::ScheduleData &SD) {
SD.dump(os);
return os;
}
#endif
friend struct GraphTraits<BoUpSLP *>;
friend struct DOTGraphTraits<BoUpSLP *>;
/// Contains all scheduling data for a basic block.
struct BlockScheduling {
BlockScheduling(BasicBlock *BB)
: BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
void clear() {
ReadyInsts.clear();
ScheduleStart = nullptr;
ScheduleEnd = nullptr;
FirstLoadStoreInRegion = nullptr;
LastLoadStoreInRegion = nullptr;
// Reduce the maximum schedule region size by the size of the
// previous scheduling run.
ScheduleRegionSizeLimit -= ScheduleRegionSize;
if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
ScheduleRegionSizeLimit = MinScheduleRegionSize;
ScheduleRegionSize = 0;
// Make a new scheduling region, i.e. all existing ScheduleData is not
// in the new region yet.
++SchedulingRegionID;
}
ScheduleData *getScheduleData(Value *V) {
ScheduleData *SD = ScheduleDataMap[V];
if (SD && SD->SchedulingRegionID == SchedulingRegionID)
return SD;
return nullptr;
}
ScheduleData *getScheduleData(Value *V, Value *Key) {
if (V == Key)
return getScheduleData(V);
auto I = ExtraScheduleDataMap.find(V);
if (I != ExtraScheduleDataMap.end()) {
ScheduleData *SD = I->second[Key];
if (SD && SD->SchedulingRegionID == SchedulingRegionID)
return SD;
}
return nullptr;
}
bool isInSchedulingRegion(ScheduleData *SD) {
return SD->SchedulingRegionID == SchedulingRegionID;
}
/// Marks an instruction as scheduled and puts all dependent ready
/// instructions into the ready-list.
template <typename ReadyListType>
void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
SD->IsScheduled = true;
LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
ScheduleData *BundleMember = SD;
while (BundleMember) {
if (BundleMember->Inst != BundleMember->OpValue) {
BundleMember = BundleMember->NextInBundle;
continue;
}
// Handle the def-use chain dependencies.
for (Use &U : BundleMember->Inst->operands()) {
auto *I = dyn_cast<Instruction>(U.get());
if (!I)
continue;
doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
if (OpDef && OpDef->hasValidDependencies() &&
OpDef->incrementUnscheduledDeps(-1) == 0) {
// There are no more unscheduled dependencies after
// decrementing, so we can put the dependent instruction
// into the ready list.
ScheduleData *DepBundle = OpDef->FirstInBundle;
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
ReadyList.insert(DepBundle);
LLVM_DEBUG(dbgs()
<< "SLP: gets ready (def): " << *DepBundle << "\n");
}
});
}
// Handle the memory dependencies.
for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
// There are no more unscheduled dependencies after decrementing,
// so we can put the dependent instruction into the ready list.
ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
ReadyList.insert(DepBundle);
LLVM_DEBUG(dbgs()
<< "SLP: gets ready (mem): " << *DepBundle << "\n");
}
}
BundleMember = BundleMember->NextInBundle;
}
}
void doForAllOpcodes(Value *V,
function_ref<void(ScheduleData *SD)> Action) {
if (ScheduleData *SD = getScheduleData(V))
Action(SD);
auto I = ExtraScheduleDataMap.find(V);
if (I != ExtraScheduleDataMap.end())
for (auto &P : I->second)
if (P.second->SchedulingRegionID == SchedulingRegionID)
Action(P.second);
}
/// Put all instructions into the ReadyList which are ready for scheduling.
template <typename ReadyListType>
void initialFillReadyList(ReadyListType &ReadyList) {
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
doForAllOpcodes(I, [&](ScheduleData *SD) {
if (SD->isSchedulingEntity() && SD->isReady()) {
ReadyList.insert(SD);
LLVM_DEBUG(dbgs()
<< "SLP: initially in ready list: " << *I << "\n");
}
});
}
}
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
/// actually moved at this stage.
bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
const InstructionsState &S);
/// Un-bundles a group of instructions.
void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
/// Allocates schedule data chunk.
ScheduleData *allocateScheduleDataChunks();
/// Extends the scheduling region so that V is inside the region.
/// \returns true if the region size is within the limit.
bool extendSchedulingRegion(Value *V, const InstructionsState &S);
/// Initialize the ScheduleData structures for new instructions in the
/// scheduling region.
void initScheduleData(Instruction *FromI, Instruction *ToI,
ScheduleData *PrevLoadStore,
ScheduleData *NextLoadStore);
/// Updates the dependency information of a bundle and of all instructions/
/// bundles which depend on the original bundle.
void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
BoUpSLP *SLP);
/// Sets all instruction in the scheduling region to un-scheduled.
void resetSchedule();
BasicBlock *BB;
/// Simple memory allocation for ScheduleData.
std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
/// The size of a ScheduleData array in ScheduleDataChunks.
int ChunkSize;
/// The allocator position in the current chunk, which is the last entry
/// of ScheduleDataChunks.
int ChunkPos;
/// Attaches ScheduleData to Instruction.
/// Note that the mapping survives during all vectorization iterations, i.e.
/// ScheduleData structures are recycled.
DenseMap<Value *, ScheduleData *> ScheduleDataMap;
/// Attaches ScheduleData to Instruction with the leading key.
DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
ExtraScheduleDataMap;
struct ReadyList : SmallVector<ScheduleData *, 8> {
void insert(ScheduleData *SD) { push_back(SD); }
};
/// The ready-list for scheduling (only used for the dry-run).
ReadyList ReadyInsts;
/// The first instruction of the scheduling region.
Instruction *ScheduleStart = nullptr;
/// The first instruction _after_ the scheduling region.
Instruction *ScheduleEnd = nullptr;
/// The first memory accessing instruction in the scheduling region
/// (can be null).
ScheduleData *FirstLoadStoreInRegion = nullptr;
/// The last memory accessing instruction in the scheduling region
/// (can be null).
ScheduleData *LastLoadStoreInRegion = nullptr;
/// The current size of the scheduling region.
int ScheduleRegionSize = 0;
/// The maximum size allowed for the scheduling region.
int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
/// The ID of the scheduling region. For a new vectorization iteration this
/// is incremented which "removes" all ScheduleData from the region.
// Make sure that the initial SchedulingRegionID is greater than the
// initial SchedulingRegionID in ScheduleData (which is 0).
int SchedulingRegionID = 1;
};
/// Attaches the BlockScheduling structures to basic blocks.
MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
/// Performs the "real" scheduling. Done before vectorization is actually
/// performed in a basic block.
void scheduleBlock(BlockScheduling *BS);
/// List of users to ignore during scheduling and that don't need extracting.
ArrayRef<Value *> UserIgnoreList;
using OrdersType = SmallVector<unsigned, 4>;
/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
/// sorted SmallVectors of unsigned.
struct OrdersTypeDenseMapInfo {
static OrdersType getEmptyKey() {
OrdersType V;
V.push_back(~1U);
return V;
}
static OrdersType getTombstoneKey() {
OrdersType V;
V.push_back(~2U);
return V;
}
static unsigned getHashValue(const OrdersType &V) {
return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
}
static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
return LHS == RHS;
}
};
/// Contains orders of operations along with the number of bundles that have
/// operations in this order. It stores only those orders that require
/// reordering, if reordering is not required it is counted using \a
/// NumOpsWantToKeepOriginalOrder.
DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
/// Number of bundles that do not require reordering.
unsigned NumOpsWantToKeepOriginalOrder = 0;
// Analysis and block reference.
Function *F;
ScalarEvolution *SE;
TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;
AliasAnalysis *AA;
LoopInfo *LI;
DominatorTree *DT;
AssumptionCache *AC;
DemandedBits *DB;
const DataLayout *DL;
OptimizationRemarkEmitter *ORE;
unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
unsigned MinVecRegSize; // Set by cl::opt (default: 128).
/// Instruction builder to construct the vectorized tree.
IRBuilder<> Builder;
/// A map of scalar integer values to the smallest bit width with which they
/// can legally be represented. The values map to (width, signed) pairs,
/// where "width" indicates the minimum bit width and "signed" is True if the
/// value must be signed-extended, rather than zero-extended, back to its
/// original width.
MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
};
} // end namespace slpvectorizer
template <> struct GraphTraits<BoUpSLP *> {
using TreeEntry = BoUpSLP::TreeEntry;
/// NodeRef has to be a pointer per the GraphWriter.
using NodeRef = TreeEntry *;
/// Add the VectorizableTree to the index iterator to be able to return
/// TreeEntry pointers.
struct ChildIteratorType
: public iterator_adaptor_base<
ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
std::vector<TreeEntry> &VectorizableTree;
ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
std::vector<TreeEntry> &VT)
: ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
NodeRef operator*() { return &VectorizableTree[I->Idx]; }
};
static NodeRef getEntryNode(BoUpSLP &R) { return &R.VectorizableTree[0]; }
static ChildIteratorType child_begin(NodeRef N) {
return {N->UserTreeIndices.begin(), N->Container};
}
static ChildIteratorType child_end(NodeRef N) {
return {N->UserTreeIndices.end(), N->Container};
}
/// For the node iterator we just need to turn the TreeEntry iterator into a
/// TreeEntry* iterator so that it dereferences to NodeRef.
using nodes_iterator = pointer_iterator<std::vector<TreeEntry>::iterator>;
static nodes_iterator nodes_begin(BoUpSLP *R) {
return nodes_iterator(R->VectorizableTree.begin());
}
static nodes_iterator nodes_end(BoUpSLP *R) {
return nodes_iterator(R->VectorizableTree.end());
}
static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
};
template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
using TreeEntry = BoUpSLP::TreeEntry;
DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
std::string Str;
raw_string_ostream OS(Str);
if (isSplat(Entry->Scalars)) {
OS << "<splat> " << *Entry->Scalars[0];
return Str;
}
for (auto V : Entry->Scalars) {
OS << *V;
if (std::any_of(
R->ExternalUses.begin(), R->ExternalUses.end(),
[&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
OS << " <extract>";
OS << "\n";
}
return Str;
}
static std::string getNodeAttributes(const TreeEntry *Entry,
const BoUpSLP *) {
if (Entry->NeedToGather)
return "color=red";
return "";
}
};
} // end namespace llvm
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst) {
ExtraValueToDebugLocsMap ExternallyUsedValues;
buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
}
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ExtraValueToDebugLocsMap &ExternallyUsedValues,
ArrayRef<Value *> UserIgnoreLst) {
deleteTree();
UserIgnoreList = UserIgnoreLst;
if (!allSameType(Roots))
return;
buildTree_rec(Roots, 0, EdgeInfo());
// Collect the values that we need to extract from the tree.
for (TreeEntry &EIdx : VectorizableTree) {
TreeEntry *Entry = &EIdx;
// No need to handle users of gathered values.
if (Entry->NeedToGather)
continue;
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
int FoundLane = Lane;
if (!Entry->ReuseShuffleIndices.empty()) {
FoundLane =
std::distance(Entry->ReuseShuffleIndices.begin(),
llvm::find(Entry->ReuseShuffleIndices, FoundLane));
}
// Check if the scalar is externally used as an extra arg.
auto ExtI = ExternallyUsedValues.find(Scalar);
if (ExtI != ExternallyUsedValues.end()) {
LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
<< Lane << " from " << *Scalar << ".\n");
ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
}
for (User *U : Scalar->users()) {
LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
Instruction *UserInst = dyn_cast<Instruction>(U);
if (!UserInst)
continue;
// Skip in-tree scalars that become vectors
if (TreeEntry *UseEntry = getTreeEntry(U)) {
Value *UseScalar = UseEntry->Scalars[0];
// Some in-tree scalars will remain as scalar in vectorized
// instructions. If that is the case, the one in Lane 0 will
// be used.
if (UseScalar != U ||
!InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
<< ".\n");
assert(!UseEntry->NeedToGather && "Bad state");
continue;
}
}
// Ignore users in the user ignore list.
if (is_contained(UserIgnoreList, UserInst))
continue;
LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
<< Lane << " from " << *Scalar << ".\n");
ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
}
}
}
}
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
EdgeInfo UserTreeIdx) {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
InstructionsState S = getSameOpcode(VL);
if (Depth == RecursionMaxDepth) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// Don't handle vectors.
if (S.OpValue->getType()->isVectorTy()) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
if (SI->getValueOperand()->getType()->isVectorTy()) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// If all of the operands are identical or constant we have a simple solution.
if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// We now know that this is a vector of instructions of the same type from
// the same block.
// Don't vectorize ephemeral values.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (EphValues.count(VL[i])) {
LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
<< ") is ephemeral.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
// Check if this is a duplicate of another entry.
if (TreeEntry *E = getTreeEntry(S.OpValue)) {
LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
if (!E->isSame(VL)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// Record the reuse of the tree node. FIXME, currently this is only used to
// properly draw the graph rather than for the actual vectorization.
E->UserTreeIndices.push_back(UserTreeIdx);
LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
<< ".\n");
E->trySetUserTEOperand(UserTreeIdx, VL, None);
return;
}
// Check that none of the instructions in the bundle are already in the tree.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
auto *I = dyn_cast<Instruction>(VL[i]);
if (!I)
continue;
if (getTreeEntry(I)) {
LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
<< ") is already in tree.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
// If any of the scalars is marked as a value that needs to stay scalar, then
// we need to gather the scalars.
// The reduction nodes (stored in UserIgnoreList) also should stay scalar.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
// Check that all of the users of the scalars that we want to vectorize are
// schedulable.
auto *VL0 = cast<Instruction>(S.OpValue);
BasicBlock *BB = VL0->getParent();
if (!DT->isReachableFromEntry(BB)) {
// Don't go into unreachable blocks. They may contain instructions with
// dependency cycles which confuse the final scheduling.
LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
// Check that every instruction appears once in this bundle.
SmallVector<unsigned, 4> ReuseShuffleIndicies;
SmallVector<Value *, 4> UniqueValues;
DenseMap<Value *, unsigned> UniquePositions;
for (Value *V : VL) {
auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
ReuseShuffleIndicies.emplace_back(Res.first->second);
if (Res.second)
UniqueValues.emplace_back(V);
}
if (UniqueValues.size() == VL.size()) {
ReuseShuffleIndicies.clear();
} else {
LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) {
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
newTreeEntry(VL, false, UserTreeIdx);
return;
}
VL = UniqueValues;
}
auto &BSRef = BlocksSchedules[BB];
if (!BSRef)
BSRef = llvm::make_unique<BlockScheduling>(BB);
BlockScheduling &BS = *BSRef.get();
if (!BS.tryScheduleBundle(VL, this, S)) {
LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
assert((!BS.getScheduleData(VL0) ||
!BS.getScheduleData(VL0)->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure");
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
unsigned ShuffleOrOp = S.isAltShuffle() ?
(unsigned) Instruction::ShuffleVector : S.getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI: {
PHINode *PH = dyn_cast<PHINode>(VL0);
// Check for terminator values (e.g. invoke).
for (unsigned j = 0; j < VL.size(); ++j)
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
Instruction *Term = dyn_cast<Instruction>(
cast<PHINode>(VL[j])->getIncomingValueForBlock(
PH->getIncomingBlock(i)));
if (Term && Term->isTerminator()) {
LLVM_DEBUG(dbgs()
<< "SLP: Need to swizzle PHINodes (terminator use).\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
PH->getIncomingBlock(i)));
UserTreeIdx.EdgeIdx = i;
buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
OrdersType CurrentOrder;
bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
if (Reuse) {
LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
++NumOpsWantToKeepOriginalOrder;
newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
ReuseShuffleIndicies);
// This is a special case, as it does not gather, but at the same time
// we are not extending buildTree_rec() towards the operands.
ValueList Op0;
Op0.assign(VL.size(), VL0->getOperand(0));
VectorizableTree.back().setOperand(0, Op0, ReuseShuffleIndicies);
return;
}
if (!CurrentOrder.empty()) {
LLVM_DEBUG({
dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
"with order";
for (unsigned Idx : CurrentOrder)
dbgs() << " " << Idx;
dbgs() << "\n";
});
// Insert new order with initial value 0, if it does not exist,
// otherwise return the iterator to the existing one.
auto StoredCurrentOrderAndNum =
NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
++StoredCurrentOrderAndNum->getSecond();
newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies,
StoredCurrentOrderAndNum->getFirst());
// This is a special case, as it does not gather, but at the same time
// we are not extending buildTree_rec() towards the operands.
ValueList Op0;
Op0.assign(VL.size(), VL0->getOperand(0));
VectorizableTree.back().setOperand(0, Op0, ReuseShuffleIndicies);
return;
}
LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies);
BS.cancelScheduling(VL, VL0);
return;
}
case Instruction::Load: {
// Check that a vectorized load would load the same memory as a scalar
// load. For example, we don't want to vectorize loads that are smaller
// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
// treats loading/storing it as an i8 struct. If we vectorize loads/stores
// from such a struct, we read/write packed bits disagreeing with the
// unvectorized version.
Type *ScalarTy = VL0->getType();
if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
return;
}
// Make sure all loads in the bundle are simple - we can't vectorize
// atomic or volatile loads.
SmallVector<Value *, 4> PointerOps(VL.size());
auto POIter = PointerOps.begin();
for (Value *V : VL) {
auto *L = cast<LoadInst>(V);
if (!L->isSimple()) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
*POIter = L->getPointerOperand();
++POIter;
}
OrdersType CurrentOrder;
// Check the order of pointer operands.
if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
Value *Ptr0;
Value *PtrN;
if (CurrentOrder.empty()) {
Ptr0 = PointerOps.front();
PtrN = PointerOps.back();
} else {
Ptr0 = PointerOps[CurrentOrder.front()];
PtrN = PointerOps[CurrentOrder.back()];
}
const SCEV *Scev0 = SE->getSCEV(Ptr0);
const SCEV *ScevN = SE->getSCEV(PtrN);
const auto *Diff =
dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
uint64_t Size = DL->getTypeAllocSize(ScalarTy);
// Check that the sorted loads are consecutive.
if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) {
if (CurrentOrder.empty()) {
// Original loads are consecutive and does not require reordering.
++NumOpsWantToKeepOriginalOrder;
newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
} else {
// Need to reorder.
auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
++I->getSecond();
newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
ReuseShuffleIndicies, I->getFirst());
LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
}
return;
}
}
LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
for (unsigned i = 0; i < VL.size(); ++i) {
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs()
<< "SLP: Gathering casts with different src types.\n");
return;
}
}
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
UserTreeIdx.EdgeIdx = i;
buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
case Instruction::ICmp:
case Instruction::FCmp: {
// Check that all of the compares have the same predicate.
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
Type *ComparedTy = VL0->getOperand(0)->getType();
for (unsigned i = 1, e = VL.size(); i < e; ++i) {
CmpInst *Cmp = cast<CmpInst>(VL[i]);
if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
Cmp->getOperand(0)->getType() != ComparedTy) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs()
<< "SLP: Gathering cmp with different predicate.\n");
return;
}
}
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
ValueList Left, Right;
if (cast<CmpInst>(VL0)->isCommutative()) {
// Commutative predicate - collect + sort operands of the instructions
// so that each side is more likely to have the same opcode.
assert(P0 == SwapP0 && "Commutative Predicate mismatch");
reorderInputsAccordingToOpcode(S, VL, Left, Right);
} else {
// Collect operands - commute if it uses the swapped predicate.
for (Value *V : VL) {
auto *Cmp = cast<CmpInst>(V);
Value *LHS = Cmp->getOperand(0);
Value *RHS = Cmp->getOperand(1);
if (Cmp->getPredicate() != P0)
std::swap(LHS, RHS);
Left.push_back(LHS);
Right.push_back(RHS);
}
}
UserTreeIdx.EdgeIdx = 0;
buildTree_rec(Left, Depth + 1, UserTreeIdx);
UserTreeIdx.EdgeIdx = 1;
buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
}
case Instruction::Select:
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
// Sort operands of the instructions so that each side is more likely to
// have the same opcode.
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(S, VL, Left, Right);
UserTreeIdx.EdgeIdx = 0;
buildTree_rec(Left, Depth + 1, UserTreeIdx);
UserTreeIdx.EdgeIdx = 1;
buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
}
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
UserTreeIdx.EdgeIdx = i;
buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
case Instruction::GetElementPtr: {
// We don't combine GEPs with complicated (nested) indexing.
for (unsigned j = 0; j < VL.size(); ++j) {
if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
// We can't combine several GEPs into one vector if they operate on
// different types.
Type *Ty0 = VL0->getOperand(0)->getType();
for (unsigned j = 0; j < VL.size(); ++j) {
Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
if (Ty0 != CurTy) {
LLVM_DEBUG(dbgs()
<< "SLP: not-vectorizable GEP (different types).\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
// We don't combine GEPs with non-constant indexes.
for (unsigned j = 0; j < VL.size(); ++j) {
auto Op = cast<Instruction>(VL[j])->getOperand(1);
if (!isa<ConstantInt>(Op)) {
LLVM_DEBUG(dbgs()
<< "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
return;
}
}
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
UserTreeIdx.EdgeIdx = i;
buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
case Instruction::Store: {
// Check if the stores are consecutive or of we need to swizzle them.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
ValueList Operands;
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(0));
UserTreeIdx.EdgeIdx = 0;
buildTree_rec(Operands, Depth + 1, UserTreeIdx);
return;
}
case Instruction::Call: {
// Check if the calls are all to the same vectorizable intrinsic.
CallInst *CI = cast<CallInst>(VL0);
// Check if this is an Intrinsic call or something that can be
// represented by an intrinsic call
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (!isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
Function *Int = CI->getCalledFunction();
unsigned NumArgs = CI->getNumArgOperands();
SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
for (unsigned j = 0; j != NumArgs; ++j)
if (hasVectorInstrinsicScalarOpd(ID, j))
ScalarArgs[j] = CI->getArgOperand(j);
for (unsigned i = 1, e = VL.size(); i != e; ++i) {
CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
if (!CI2 || CI2->getCalledFunction() != Int ||
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
<< "\n");
return;
}
// Some intrinsics have scalar arguments and should be same in order for
// them to be vectorized.
for (unsigned j = 0; j != NumArgs; ++j) {
if (hasVectorInstrinsicScalarOpd(ID, j)) {
Value *A1J = CI2->getArgOperand(j);
if (ScalarArgs[j] != A1J) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
<< " argument " << ScalarArgs[j] << "!=" << A1J
<< "\n");
return;
}
}
}
// Verify that the bundle operands are identical between the two calls.
if (CI->hasOperandBundles() &&
!std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
CI->op_begin() + CI->getBundleOperandsEndIndex(),
CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
<< *CI << "!=" << *VL[i] << '\n');
return;
}
}
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL) {
CallInst *CI2 = dyn_cast<CallInst>(j);
Operands.push_back(CI2->getArgOperand(i));
}
UserTreeIdx.EdgeIdx = i;
buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
case Instruction::ShuffleVector:
// If this is not an alternate sequence of opcode like add-sub
// then do not vectorize this instruction.
if (!S.isAltShuffle()) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return;
}
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
if (isa<BinaryOperator>(VL0)) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(S, VL, Left, Right);
UserTreeIdx.EdgeIdx = 0;
buildTree_rec(Left, Depth + 1, UserTreeIdx);
UserTreeIdx.EdgeIdx = 1;
buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
}
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
UserTreeIdx.EdgeIdx = i;
buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
default:
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
return;
}
}
unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
unsigned N;
Type *EltTy;
auto *ST = dyn_cast<StructType>(T);
if (ST) {
N = ST->getNumElements();
EltTy = *ST->element_begin();
} else {
N = cast<ArrayType>(T)->getNumElements();
EltTy = cast<ArrayType>(T)->getElementType();
}
if (!isValidElementType(EltTy))
return 0;
uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
return 0;
if (ST) {
// Check that struct is homogeneous.
for (const auto *Ty : ST->elements())
if (Ty != EltTy)
return 0;
}
return N;
}
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
SmallVectorImpl<unsigned> &CurrentOrder) const {
Instruction *E0 = cast<Instruction>(OpValue);
assert(E0->getOpcode() == Instruction::ExtractElement ||
E0->getOpcode() == Instruction::ExtractValue);
assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");
// Check if all of the extracts come from the same vector and from the
// correct offset.
Value *Vec = E0->getOperand(0);
CurrentOrder.clear();
// We have to extract from a vector/aggregate with the same number of elements.
unsigned NElts;
if (E0->getOpcode() == Instruction::ExtractValue) {
const DataLayout &DL = E0->getModule()->getDataLayout();
NElts = canMapToVector(Vec->getType(), DL);
if (!NElts)
return false;
// Check if load can be rewritten as load of vector.
LoadInst *LI = dyn_cast<LoadInst>(Vec);
if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
return false;
} else {
NElts = Vec->getType()->getVectorNumElements();
}
if (NElts != VL.size())
return false;
// Check that all of the indices extract from the correct offset.
bool ShouldKeepOrder = true;
unsigned E = VL.size();
// Assign to all items the initial value E + 1 so we can check if the extract
// instruction index was used already.
// Also, later we can check that all the indices are used and we have a
// consecutive access in the extract instructions, by checking that no
// element of CurrentOrder still has value E + 1.
CurrentOrder.assign(E, E + 1);
unsigned I = 0;
for (; I < E; ++I) {
auto *Inst = cast<Instruction>(VL[I]);
if (Inst->getOperand(0) != Vec)
break;
Optional<unsigned> Idx = getExtractIndex(Inst);
if (!Idx)
break;
const unsigned ExtIdx = *Idx;
if (ExtIdx != I) {
if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
break;
ShouldKeepOrder = false;
CurrentOrder[ExtIdx] = I;
} else {
if (CurrentOrder[I] != E + 1)
break;
CurrentOrder[I] = I;
}
}
if (I < E) {
CurrentOrder.clear();
return false;
}
return ShouldKeepOrder;
}
bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
return I->hasOneUse() ||
std::all_of(I->user_begin(), I->user_end(), [this](User *U) {
return ScalarToTreeEntry.count(U) > 0;
});
}
int BoUpSLP::getEntryCost(TreeEntry *E) {
ArrayRef<Value*> VL = E->Scalars;
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
ScalarTy = CI->getOperand(0)->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
// If we have computed a smaller type for the expression, update VecTy so
// that the costs will be accurate.
if (MinBWs.count(VL[0]))
VecTy = VectorType::get(
IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
int ReuseShuffleCost = 0;
if (NeedToShuffleReuses) {
ReuseShuffleCost =
TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
}
if (E->NeedToGather) {
if (allConstant(VL))
return 0;
if (isSplat(VL)) {
return ReuseShuffleCost +
TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
}
if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement &&
allSameType(VL) && allSameBlock(VL)) {
Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
if (ShuffleKind.hasValue()) {
int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
for (auto *V : VL) {
// If all users of instruction are going to be vectorized and this
// instruction itself is not going to be vectorized, consider this
// instruction as dead and remove its cost from the final cost of the
// vectorized tree.
if (areAllUsersVectorized(cast<Instruction>(V)) &&
!ScalarToTreeEntry.count(V)) {
auto *IO = cast<ConstantInt>(
cast<ExtractElementInst>(V)->getIndexOperand());
Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
IO->getZExtValue());
}
}
return ReuseShuffleCost + Cost;
}
}
return ReuseShuffleCost + getGatherCost(VL);
}
InstructionsState S = getSameOpcode(VL);
assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
Instruction *VL0 = cast<Instruction>(S.OpValue);
unsigned ShuffleOrOp = S.isAltShuffle() ?
(unsigned) Instruction::ShuffleVector : S.getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI:
return 0;
case Instruction::ExtractValue:
case Instruction::ExtractElement:
if (NeedToShuffleReuses) {
unsigned Idx = 0;
for (unsigned I : E->ReuseShuffleIndices) {
if (ShuffleOrOp == Instruction::ExtractElement) {
auto *IO = cast<ConstantInt>(
cast<ExtractElementInst>(VL[I])->getIndexOperand());
Idx = IO->getZExtValue();
ReuseShuffleCost -= TTI->getVectorInstrCost(
Instruction::ExtractElement, VecTy, Idx);
} else {
ReuseShuffleCost -= TTI->getVectorInstrCost(
Instruction::ExtractElement, VecTy, Idx);
++Idx;
}
}
Idx = ReuseShuffleNumbers;
for (Value *V : VL) {
if (ShuffleOrOp == Instruction::ExtractElement) {
auto *IO = cast<ConstantInt>(
cast<ExtractElementInst>(V)->getIndexOperand());
Idx = IO->getZExtValue();
} else {
--Idx;
}
ReuseShuffleCost +=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
}
}
if (!E->NeedToGather) {
int DeadCost = ReuseShuffleCost;
if (!E->ReorderIndices.empty()) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
DeadCost += TTI->getShuffleCost(
TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
}
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
Instruction *E = cast<Instruction>(VL[i]);
// If all users are going to be vectorized, instruction can be
// considered as dead.
// The same, if have only one user, it will be vectorized for sure.
if (areAllUsersVectorized(E)) {
// Take credit for instruction that will become dead.
if (E->hasOneUse()) {
Instruction *Ext = E->user_back();
if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
all_of(Ext->users(),
[](User *U) { return isa<GetElementPtrInst>(U); })) {
// Use getExtractWithExtendCost() to calculate the cost of
// extractelement/ext pair.
DeadCost -= TTI->getExtractWithExtendCost(
Ext->getOpcode(), Ext->getType(), VecTy, i);
// Add back the cost of s|zext which is subtracted separately.
DeadCost += TTI->getCastInstrCost(
Ext->getOpcode(), Ext->getType(), E->getType(), Ext);
continue;
}
}
DeadCost -=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
}
}
return DeadCost;
}
return ReuseShuffleCost + getGatherCost(VL);
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
int ScalarEltCost =
TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
// Calculate the cost of this instruction.
int ScalarCost = VL.size() * ScalarEltCost;
VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
int VecCost = 0;
// Check if the values are candidates to demote.
if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
VecCost = ReuseShuffleCost +
TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0);
}
return VecCost - ScalarCost;
}
case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select: {
// Calculate the cost of this instruction.
int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy,
Builder.getInt1Ty(), VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0);
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
// Certain instructions can be cheaper to vectorize if they have a
// constant second vector operand.
TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_UniformConstantValue;
TargetTransformInfo::OperandValueProperties Op1VP =
TargetTransformInfo::OP_None;
TargetTransformInfo::OperandValueProperties Op2VP =
TargetTransformInfo::OP_PowerOf2;
// If all operands are exactly the same ConstantInt then set the
// operand kind to OK_UniformConstantValue.
// If instead not all operands are constants, then set the operand kind
// to OK_AnyValue. If all operands are constants but not the same,
// then set the operand kind to OK_NonUniformConstantValue.
ConstantInt *CInt0 = nullptr;
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
const Instruction *I = cast<Instruction>(VL[i]);
ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(1));
if (!CInt) {
Op2VK = TargetTransformInfo::OK_AnyValue;
Op2VP = TargetTransformInfo::OP_None;
break;
}
if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
!CInt->getValue().isPowerOf2())
Op2VP = TargetTransformInfo::OP_None;
if (i == 0) {
CInt0 = CInt;
continue;
}