// blob: ea7c465bd4da180c0e956ad56ea4cfaad4be05a9 [file] [log] [blame]
//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
// The pass is inspired by the work described in the paper:
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <memory>
#include <optional>
#include <set>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
// Make the llvm, llvm::PatternMatch and slpvectorizer namespaces visible to
// the rest of this file.
using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;
// Name strings identifying this pass.
#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
// Statistic counter described as the number of vector instructions generated.
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
/// Hidden "vectorize-slp" option, enabled by default. Unlike the neighbouring
/// options it is not declared static.
cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
cl::desc("Run the SLP vectorization passes"));
/// Hidden "slp-threshold" option, default 0. Per its description,
/// vectorization is only done when the gain exceeds this number.
static cl::opt<int>
SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
cl::desc("Only vectorize if you gain more than this "
"number "));
/// Hidden "slp-vectorize-hor" option, enabled by default: attempt to vectorize
/// horizontal reductions.
static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
cl::desc("Attempt to vectorize horizontal reductions"));
/// Hidden "slp-vectorize-hor-store" option, disabled by default: attempt to
/// vectorize horizontal reductions that feed into a store.
static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));
// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
// even if we match a reduction but do not vectorize in the end.
static cl::opt<bool> AllowHorRdxIdenityOptimization(
"slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
cl::desc("Allow optimization of original scalar identity operations on "
"matched horizontal reductions."));
/// Hidden "slp-max-reg-size" option, default 128 (bits, per the description).
static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
/// Hidden "slp-max-vf" option, default 0, which the description defines as
/// "unlimited".
static cl::opt<unsigned>
MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
cl::desc("Limit the size of the SLP scheduling region per block"));
/// Hidden "slp-min-reg-size" option, default 128.
/// NOTE(review): the description string is identical to the one used for
/// "slp-max-reg-size"; confirm that this is intended.
static cl::opt<int> MinVectorRegSizeOption(
"slp-min-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
/// Hidden "slp-recursion-max-depth" option, default 12: limit on the recursion
/// depth when building a vectorizable tree.
static cl::opt<unsigned> RecursionMaxDepth(
"slp-recursion-max-depth", cl::init(12), cl::Hidden,
cl::desc("Limit the recursion depth when building a vectorizable tree"));
/// Hidden "slp-min-tree-size" option, default 3. Per its description, small
/// trees are only vectorized if they are fully vectorizable.
static cl::opt<unsigned> MinTreeSize(
"slp-min-tree-size", cl::init(3), cl::Hidden,
cl::desc("Only vectorize small trees if they are fully vectorizable"));
// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
"slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
cl::desc("The maximum look-ahead depth for operand reordering scores"));
// The maximum depth that the look-ahead score heuristic will explore
// when it is probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation time overhead, but unlike
// the similar limit for operand ordering this one is used less frequently,
// hence the impact of a higher value is less noticeable.
static cl::opt<int> RootLookAheadMaxDepth(
"slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
cl::desc("The maximum look-ahead depth for searching best rooting option"));
/// Hidden "view-slp-tree" option with no explicit initial value: display the
/// SLP trees with Graphviz.
static cl::opt<bool>
ViewSLPTree("view-slp-tree", cl::Hidden,
cl::desc("Display the SLP trees with Graphviz"));
// Upper bound on the number of alias checks. The value was picked so that it
// has no negative effect on the llvm benchmarks.
static constexpr unsigned AliasedCheckLimit = 10;

// A second bound for the alias checks: the maximum distance between load/store
// instructions for which alias checks are still done. This matters for very
// large basic blocks.
static constexpr unsigned MaxMemDepDistance = 160;

/// Once ScheduleRegionSizeBudget is exhausted, scheduling regions up to this
/// small size are still allowed to be handled.
static constexpr int MinScheduleRegionSize = 16;
/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
///
/// \param Ty the candidate element type.
/// \returns true if \p Ty is a valid vector element type and is neither
/// x86_fp80 nor ppc_f128.
static bool isValidElementType(Type *Ty) {
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}
/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}
/// Checks if \p V is one of vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type or
/// extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  // Non-instructions that got this far (undef values) and extractvalue
  // instructions need no index check.
  if (!I || isa<ExtractValueInst>(I))
    return true;
  // insertelement/extractelement only qualify on fixed-width vectors.
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  // The index is operand 1 of extractelement and operand 2 of insertelement.
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}
#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debug output.
///
/// \param VL the bundle; its first element is dereferenced, so it must not be
/// empty.
/// \returns a string of the form "n=<size> [<first value>, ..]".
static std::string shortBundleName(ArrayRef<Value *> VL) {
  std::string Result;
  raw_string_ostream OS(Result);
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  // Make sure everything written through OS has reached Result before it is
  // returned.
  OS.flush();
  return Result;
}
#endif
/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
///
/// A list made up entirely of values accepted by isVectorLikeInstWithConstOps
/// is reported as "same block" without comparing parents, provided the first
/// value is an instruction. \p VL must not be empty.
static bool allSameBlock(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;
  if (all_of(VL, isVectorLikeInstWithConstOps))
    return true;

  BasicBlock *BB = I0->getParent();
  for (int I = 1, E = VL.size(); I < E; I++) {
    auto *II = dyn_cast<Instruction>(VL[I]);
    // Anything that is not an instruction has no parent block to compare.
    if (!II)
      return false;
    if (BB != II->getParent())
      return false;
  }
  return true;
}
/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions).
static bool allConstant(ArrayRef<Value *> VL) {
  // Constant expressions and globals can't be vectorized like normal integer/FP
  // constants.
  return all_of(VL, isConstant);
}
/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
///
/// At least one value that is not an UndefValue must be present: an empty or
/// all-undef list yields false.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    // Undef lanes are compatible with any splat value.
    if (isa<UndefValue>(V))
      continue;
    // Remember the first real value; every later one must match it.
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}
/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
/// Every other kind of instruction is reported as non-commutative.
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative();
  // TODO: This should check for generic Instruction::isCommutative(), but
  //       we need to confirm that the caller code correctly handles Intrinsics
  //       for example (does not have 2 operands).
  return false;
}
/// \returns inserting index of InsertElement or InsertValue instruction,
/// using Offset as base offset for index.
///
/// For insertelement the result is Offset * NumElements + ConstantIndex. For
/// insertvalue the index list is flattened level by level: the running index
/// is multiplied by the element count of each struct/array level and the
/// level's index is added.
///
/// \returns std::nullopt for a non-fixed vector type, a non-constant or
/// out-of-range insertelement index, or an insertvalue that indexes through
/// something other than a struct or array.
static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
                                              unsigned Offset = 0) {
  int Index = Offset;
  if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }

  // Anything that is not an insertelement must be an insertvalue.
  const auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}
namespace {
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask.
enum class UseMask {
  FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
            ///< check for the mask elements for the first argument (mask
            ///< indices are in range [0:VF)).
  SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
             ///< for the mask elements for the second argument (mask indices
             ///< are in range [VF:2*VF))
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as ones as being used
               ///< in future. Non-undef elements are considered as unused since
               ///< they're already marked as used in the mask.
};
} // namespace
/// Prepares a use bitset for the given mask either for the first argument or
/// for the second.
///
/// The result has \p VF bits, all initially set. Bits are cleared as follows:
///  - UseMask::UndefsAsMask: the position of every poison mask element;
///  - UseMask::FirstArg: every mask value in [0, VF);
///  - UseMask::SecondArg: every mask value in [VF, 2*VF), rebased by -VF.
///
/// \param VF number of elements in one source vector.
/// \param Mask the shuffle mask to analyze.
/// \param MaskArg which of the interpretations above to apply.
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
                                   UseMask MaskArg) {
  SmallBitVector UseMask(VF, true);
  for (auto [Idx, Value] : enumerate(Mask)) {
    if (Value == PoisonMaskElem) {
      // Poison elements only matter in the UndefsAsMask mode, and they never
      // refer to a source lane.
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
  }
  return UseMask;
}
/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
/// With \p IsPoisonOnly set, only PoisonValue (rather than any UndefValue) is
/// matched.
/// \returns a bit vector with one bit per \p UseMask entry, or a single bit
/// when \p UseMask is empty.
// NOTE(review): this body looks truncated. Several `if` conditions have no
// visible consequent, closing braces are missing and the loop at the end has no
// visible body statement. The comments below describe only what is visible.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
const SmallBitVector &UseMask = {}) {
// Start with every bit set.
SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
// V itself is undef/poison: return the all-set result.
if (isa<T>(V))
return Res;
// Anything that is not a fixed vector yields an all-clear result.
auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
if (!VecTy)
return Res.reset();
auto *C = dyn_cast<Constant>(V);
if (!C) {
if (!UseMask.empty()) {
// Walk up a chain of insertelement instructions through operand 0.
const Value *Base = V;
while (auto *II = dyn_cast<InsertElementInst>(Base)) {
Base = II->getOperand(0);
if (isa<T>(II->getOperand(1)))
std::optional<unsigned> Idx = getInsertIndex(II);
if (!Idx)
if (*Idx < UseMask.size() && !UseMask.test(*Idx))
// TODO: Add analysis for shuffles here too.
if (V == Base) {
} else {
// Recurse on the base of the chain with an all-clear mask of the same size and
// intersect the outcome into Res.
SmallBitVector SubMask(UseMask.size(), false);
Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
} else {
return Res;
// Constant vector: inspect the aggregate elements one by one.
for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
if (Constant *Elem = C->getAggregateElement(I))
if (!isa<T>(Elem) &&
(UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
return Res;
/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
/// i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
/// We convert this initially to something like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0
/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
/// %5 = mul <4 x i8> %4, %4
/// %6 = extractelement <4 x i8> %5, i32 0
/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0
/// %7 = extractelement <4 x i8> %5, i32 1
/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
/// %8 = extractelement <4 x i8> %5, i32 2
/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
/// %9 = extractelement <4 x i8> %5, i32 3
/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
/// ret <4 x i8> %ins4
/// InstCombiner transforms this into a shuffle and vector mul
/// Mask will return the Shuffle Mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// ShuffleVectorInst/getShuffleCost?
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
// NOTE(review): this body looks truncated. `Size` has no visible initializer,
// several `if` conditions have no visible consequent and closing braces are
// missing. The comments below describe only the statements that are visible.
// Find the first extractelement in VL; without one there is nothing to match.
const auto *It =
find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
if (It == VL.end())
return std::nullopt;
auto *EI0 = cast<ExtractElementInst>(*It);
// Scalable vector operands are rejected.
if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
return std::nullopt;
unsigned Size =
// At most two distinct source vectors are tracked.
Value *Vec1 = nullptr;
Value *Vec2 = nullptr;
enum ShuffleMode { Unknown, Select, Permute };
ShuffleMode CommonShuffleMode = Unknown;
// Start from an all-poison mask with one entry per value in VL.
Mask.assign(VL.size(), PoisonMaskElem);
for (unsigned I = 0, E = VL.size(); I < E; ++I) {
// Undef can be represented as an undef element in a vector.
if (isa<UndefValue>(VL[I]))
// cast<> asserts that every remaining value is an extractelement.
auto *EI = cast<ExtractElementInst>(VL[I]);
if (isa<ScalableVectorType>(EI->getVectorOperandType()))
return std::nullopt;
auto *Vec = EI->getVectorOperand();
// We can extractelement from undef or poison vector.
if (isUndefVector(Vec).all())
// All vector operands must have the same number of vector elements.
if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
return std::nullopt;
if (isa<UndefValue>(EI->getIndexOperand()))
// Only constant extract indices can be turned into mask elements.
auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
if (!Idx)
return std::nullopt;
// Undefined behavior if Idx is negative or >= Size.
if (Idx->getValue().uge(Size))
unsigned IntIdx = Idx->getValue().getZExtValue();
Mask[I] = IntIdx;
// For correct shuffling we have to have at most 2 different vector operands
// in all extractelement instructions.
if (!Vec1 || Vec1 == Vec) {
Vec1 = Vec;
} else if (!Vec2 || Vec2 == Vec) {
Vec2 = Vec;
// Lanes taken from the second source vector are offset by Size in the mask.
Mask[I] += Size;
} else {
return std::nullopt;
if (CommonShuffleMode == Permute)
// If the extract index is not the same as the operation number, it is a
// permutation.
if (IntIdx != I) {
CommonShuffleMode = Permute;
CommonShuffleMode = Select;
// If we're not crossing lanes in different vectors, consider it as blending.
if (CommonShuffleMode == Select && Vec2)
return TargetTransformInfo::SK_Select;
// If Vec2 was never used, we have a permutation of a single vector, otherwise
// we have permutation of 2 vectors.
return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
: TargetTransformInfo::SK_PermuteSingleSrc;
/// \returns the index extracted by the Extract{Value,Element} instruction
/// \p E.
///
/// For extractelement this is the zero-extended constant index operand; for
/// extractvalue it is the single index of the instruction.
/// \returns std::nullopt if the extractelement index is not a ConstantInt or
/// the extractvalue has more than one index.
static std::optional<unsigned> getExtractIndex(Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
}
/// Tries to find extractelement instructions with constant indices from fixed
/// vector type and gather such instructions into a bunch, which highly likely
/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
/// successful, the matched scalars are replaced by poison values in \p VL for
/// future analysis.
static std::optional<TTI::ShuffleKind>
tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
SmallVectorImpl<int> &Mask) {
// NOTE(review): this body looks truncated. Many `if` conditions have no visible
// consequent, the code that fills VectorOpToIdx, UndefVectorExtracts and
// VFToVector is not visible, and closing braces are missing. The comments below
// describe only the statements that are visible.
// Scan list of gathered scalars for extractelements that can be represented
// as shuffles.
// Maps a value to a list of positions in VL (the lists are used as indices into
// VL further down). Presumably the keys are the extracts' vector operands;
// confirm against the complete source.
MapVector<Value *, SmallVector<int>> VectorOpToIdx;
SmallVector<int> UndefVectorExtracts;
for (int I = 0, E = VL.size(); I < E; ++I) {
auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
if (!EI) {
if (isa<UndefValue>(VL[I]))
auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
std::optional<unsigned> Idx = getExtractIndex(EI);
// Undefined index.
if (!Idx) {
SmallBitVector ExtractMask(VecTy->getNumElements(), true);
if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
// Sort the vector operands by the maximum number of uses in extractelements.
MapVector<unsigned, SmallVector<Value *>> VFToVector;
for (const auto &Data : VectorOpToIdx)
for (auto &Data : VFToVector) {
stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
return VectorOpToIdx.find(V1)->second.size() >
// Find the best pair of the vectors with the same number of elements or a
// single vector.
const int UndefSz = UndefVectorExtracts.size();
unsigned SingleMax = 0;
Value *SingleVec = nullptr;
unsigned PairMax = 0;
std::pair<Value *, Value *> PairVec(nullptr, nullptr);
for (auto &Data : VFToVector) {
// The first entry of each group is the single-vector candidate; its score is
// its index count plus UndefSz.
Value *V1 = Data.second.front();
if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
SingleMax = VectorOpToIdx[V1].size() + UndefSz;
SingleVec = V1;
// The first two entries of a group form the pair candidate.
Value *V2 = nullptr;
if (Data.second.size() > 1)
V2 = *std::next(Data.second.begin());
if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
UndefSz) {
PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
PairVec = std::make_pair(V1, V2);
if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
return std::nullopt;
// Check if better to perform a shuffle of 2 vectors or just of a single
// vector.
// Copy of VL taken before any entries are swapped out.
SmallVector<Value *> SavedVL(VL.begin(), VL.end());
// Same length as VL, initially all poison of the first value's type.
SmallVector<Value *> GatheredExtracts(
VL.size(), PoisonValue::get(VL.front()->getType()));
// Swap the chosen entries of VL with the corresponding poison placeholders.
if (SingleMax >= PairMax && SingleMax) {
for (int Idx : VectorOpToIdx[SingleVec])
std::swap(GatheredExtracts[Idx], VL[Idx]);
} else {
for (Value *V : {PairVec.first, PairVec.second})
for (int Idx : VectorOpToIdx[V])
std::swap(GatheredExtracts[Idx], VL[Idx]);
// Add extracts from undefs too.
for (int Idx : UndefVectorExtracts)
std::swap(GatheredExtracts[Idx], VL[Idx]);
// Check that gather of extractelements can be represented as just a
// shuffle of a single/two vectors the scalars are extracted from.
std::optional<TTI::ShuffleKind> Res =
isFixedVectorShuffle(GatheredExtracts, Mask);
if (!Res) {
// TODO: try to check other subsets if possible.
// Restore the original VL if attempt was not successful.
return std::nullopt;
// Restore unused scalars from mask, if some of the extractelements were not
// selected for shuffle.
for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
!isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
is_contained(UndefVectorExtracts, I))
if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]))
std::swap(VL[I], GatheredExtracts[I]);
return Res;
namespace {
/// Main data required for vectorization of instructions.
struct InstructionsState {
/// The very first instruction in the list with the main opcode.
Value *OpValue = nullptr;
/// The main/alternate instruction.
Instruction *MainOp = nullptr;
Instruction *AltOp = nullptr;
/// The main/alternate opcodes for the list of instructions.
unsigned getOpcode() const {
return MainOp ? MainOp->getOpcode() : 0;
unsigned getAltOpcode() const {
return AltOp ? AltOp->getOpcode() : 0;
/// Some of the instructions in the list have alternate opcodes.
bool isAltShuffle() const { return AltOp != MainOp; }
bool isOpcodeOrAlt(Instruction *I) const {
unsigned CheckedOpcode = I->getOpcode();
return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
InstructionsState() = delete;
InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
: OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
} // end anonymous namespace
/// Chooses the correct key for scheduling data. If \p Op has the same (or
/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
/// OpValue.
///
/// \param S state supplying the main/alternate opcodes and the fallback
/// OpValue.
/// \param Op candidate key; values that are not instructions always fall back
/// to S.OpValue.
static Value *isOneOf(const InstructionsState &S, Value *Op) {
  auto *I = dyn_cast<Instruction>(Op);
  if (I && S.isOpcodeOrAlt(I))
    return Op;
  return S.OpValue;
}
/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
///
/// Example of unsupported opcode is SDIV that can potentially cause UB if the
/// "shuffled out" lane would result in division by zero.
static bool isValidForAlternation(unsigned Opcode) {
  if (Instruction::isIntDivRem(Opcode))
    return false;

  return true;
}
/// Forward declaration: areCompatibleCmpOps below calls getSameOpcode before
/// its definition. The default argument for \p BaseIndex is supplied here.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
const TargetLibraryInfo &TLI,
unsigned BaseIndex = 0);
/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
/// compatible instructions or constants, or just some other regular values.
///
/// The operand pairs (\p BaseOp0, \p Op0) and (\p BaseOp1, \p Op1) are
/// accepted when any one of the following holds: either pair consists of two
/// constants; none of the four operands is an instruction; either pair is the
/// same value; or getSameOpcode reports a non-zero opcode for either pair.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
                                Value *Op1, const TargetLibraryInfo &TLI) {
  return (isConstant(BaseOp0) && isConstant(Op0)) ||
         (isConstant(BaseOp1) && isConstant(Op1)) ||
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
         getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
}
/// \returns true if a compare instruction \p CI has similar "look" and
/// same predicate as \p BaseCI, "as is" or with its operands and predicate
/// swapped, false otherwise.
///
/// Both compares must have first operands of the same type (asserted).
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {
  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");
  CmpInst::Predicate BasePred = BaseCI->getPredicate();
  CmpInst::Predicate Pred = CI->getPredicate();
  CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);

  Value *BaseOp0 = BaseCI->getOperand(0);
  Value *BaseOp1 = BaseCI->getOperand(1);
  Value *Op0 = CI->getOperand(0);
  Value *Op1 = CI->getOperand(1);

  // Either the predicates match with the operands in order, or the swapped
  // predicate matches with the operands of CI exchanged.
  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}
/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState, the Opcode that we suppose the whole list
/// could be vectorized even if its structure is diverse.
/// Failure is reported as a state whose main and alternate instructions are
/// null and whose OpValue is VL[BaseIndex].
// NOTE(review): this body looks truncated. Many `if` conditions have no visible
// consequent, some expressions are cut off (for example the initializers of
// BasePred and SwappedCurrentPred and the final return), and closing braces are
// missing. The comments below describe only the statements that are visible.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
const TargetLibraryInfo &TLI,
unsigned BaseIndex) {
// Make sure these are all Instructions.
if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
// Classify the base instruction once; the loop compares every value against it.
bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
CmpInst::Predicate BasePred =
IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
// The alternate opcode/index start out equal to the main ones.
unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
unsigned AltOpcode = Opcode;
unsigned AltIndex = BaseIndex;
// Check for one alternate opcode from another BinaryOperator.
// TODO - generalize to support all operators (types, calls etc.).
auto *IBase = cast<Instruction>(VL[BaseIndex]);
Intrinsic::ID BaseID = 0;
SmallVector<VFInfo> BaseMappings;
// A base call must either map to a trivially vectorizable intrinsic or have
// vector-function mappings; otherwise analysis fails.
if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
auto *I = cast<Instruction>(VL[Cnt]);
unsigned InstOpcode = I->getOpcode();
// Binary operators: a second opcode may be recorded as the alternate if both
// opcodes pass isValidForAlternation.
if (IsBinOp && isa<BinaryOperator>(I)) {
if (InstOpcode == Opcode || InstOpcode == AltOpcode)
if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
isValidForAlternation(Opcode)) {
AltOpcode = InstOpcode;
AltIndex = Cnt;
// Casts: only considered when the source operand types match.
} else if (IsCastOp && isa<CastInst>(I)) {
Value *Op0 = IBase->getOperand(0);
Type *Ty0 = Op0->getType();
Value *Op1 = I->getOperand(0);
Type *Ty1 = Op1->getType();
if (Ty0 == Ty1) {
if (InstOpcode == Opcode || InstOpcode == AltOpcode)
if (Opcode == AltOpcode) {
assert(isValidForAlternation(Opcode) &&
isValidForAlternation(InstOpcode) &&
"Cast isn't safe for alternation, logic needs to be updated!");
AltOpcode = InstOpcode;
AltIndex = Cnt;
// Compares: only considered when the first operand types match.
} else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
Type *Ty0 = BaseInst->getOperand(0)->getType();
Type *Ty1 = Inst->getOperand(0)->getType();
if (Ty0 == Ty1) {
assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
// Check for compatible operands. If the corresponding operands are not
// compatible - need to perform alternate vectorization.
CmpInst::Predicate CurrentPred = Inst->getPredicate();
CmpInst::Predicate SwappedCurrentPred =
if (E == 2 &&
(BasePred == CurrentPred || BasePred == SwappedCurrentPred))
if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
auto *AltInst = cast<CmpInst>(VL[AltIndex]);
if (AltIndex != BaseIndex) {
if (isCmpSameOrSwapped(AltInst, Inst, TLI))
} else if (BasePred != CurrentPred) {
isValidForAlternation(InstOpcode) &&
"CmpInst isn't safe for alternation, logic needs to be updated!");
AltIndex = Cnt;
CmpInst::Predicate AltPred = AltInst->getPredicate();
if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
AltPred == CurrentPred || AltPred == SwappedCurrentPred)
// Same (or alternate) opcode for other instruction kinds: apply the
// per-kind compatibility checks visible below.
} else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
// GEPs must have exactly two operands and the same base pointer type as IBase.
if (Gep->getNumOperands() != 2 ||
Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
} else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
if (!isVectorLikeInstWithConstOps(EI))
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
} else if (auto *LI = dyn_cast<LoadInst>(I)) {
// Both this load and the base load must be simple.
auto *BaseLI = cast<LoadInst>(IBase);
if (!LI->isSimple() || !BaseLI->isSimple())
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
} else if (auto *Call = dyn_cast<CallInst>(I)) {
// Calls must target the same function as the base call and resolve to the same
// vector intrinsic ID.
auto *CallBase = cast<CallInst>(IBase);
if (Call->getCalledFunction() != CallBase->getCalledFunction())
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
if (Call->hasOperandBundles() &&
!std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
Call->op_begin() + Call->getBundleOperandsEndIndex(),
CallBase->op_begin() +
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
if (ID != BaseID)
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
// Without an intrinsic ID, compare against the base call's vector-function
// mappings.
if (!ID) {
SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
if (Mappings.size() != BaseMappings.size() ||
Mappings.front().ISA != BaseMappings.front().ISA ||
Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
Mappings.front().VectorName != BaseMappings.front().VectorName ||
Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
Mappings.front().Shape.Parameters !=
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
///
/// \p VL must not be empty: the type of its first element is the reference.
static bool allSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL.front()->getType();
  return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
}
/// \returns True if in-tree use also needs extract. This refers to
/// possible scalar operand in vectorized instruction.
///
/// \param Scalar the scalar value being used.
/// \param UserInst the instruction using it. Loads and stores need the
/// extract when \p Scalar is their pointer operand; calls need it when
/// \p Scalar is passed at an argument position that
/// isVectorIntrinsicWithScalarOpAtArg reports as scalar for the call's vector
/// intrinsic ID. Every other opcode yields false.
/// \param TLI used to look up the vector intrinsic ID of a call.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                    TargetLibraryInfo *TLI) {
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    return any_of(enumerate(CI->args()), [&](auto &&Arg) {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
             Arg.value().get() == Scalar;
    });
  }
  default:
    return false;
  }
}
/// \returns the AA location that is being accessed by the instruction.
///
/// Only loads and stores are handled; any other instruction yields a
/// default-constructed MemoryLocation.
static MemoryLocation getLocation(Instruction *I) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}
/// \returns True if the instruction is not a volatile or atomic load/store.
/// Loads and stores report their own isSimple() result, memory intrinsics are
/// simple unless they are volatile, and everything else counts as simple.
static bool isSimple(Instruction *I) {
  switch (I->getOpcode()) {
  case Instruction::Load:
    return cast<LoadInst>(I)->isSimple();
  case Instruction::Store:
    return cast<StoreInst>(I)->isSimple();
  default:
    break;
  }
  if (auto *MemIntr = dyn_cast<MemIntrinsic>(I))
    return !MemIntr->isVolatile();
  return true;
}
/// Shuffles \p Mask in accordance with the given \p SubMask.
/// When \p Mask is empty, \p SubMask is appended to it as-is. Otherwise a
/// temporary mask of SubMask.size() elements, pre-filled with PoisonMaskElem,
/// is composed from entries of the form Mask[SubMask[I]].
/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
/// one but two input vectors.
/// NOTE(review): this copy of the function looks truncated (no statement
/// follows `if (SubMask.empty())` and NewMask is never written back to
/// \p Mask) - confirm against upstream before relying on the details.
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
bool ExtendingManyInputs = false) {
if (SubMask.empty())
(!ExtendingManyInputs || SubMask.size() > Mask.size() ||
// Check if input scalars were extended to match the size of other node.
(SubMask.size() == Mask.size() &&
std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
[](int Idx) { return Idx == PoisonMaskElem; }))) &&
"SubMask with many inputs support must be larger than the mask.");
// Nothing to compose with yet: the sub-mask becomes the mask.
if (Mask.empty()) {
Mask.append(SubMask.begin(), SubMask.end());
// Start from an all-poison mask that has the size of SubMask.
SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
// Bound used below to test SubMask entries and the Mask entries they select;
// it is the size of the smaller of the two masks.
int TermValue = std::min(Mask.size(), SubMask.size());
for (int I = 0, E = SubMask.size(); I < E; ++I) {
// Tests for a poison SubMask entry, or (single-input mode only) an entry
// that reaches TermValue either directly or through Mask.
if (SubMask[I] == PoisonMaskElem ||
(!ExtendingManyInputs &&
(SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
NewMask[I] = Mask[SubMask[I]];
/// Order may have elements assigned special value (size) which is out of
/// bounds. Such indices only appear on places which correspond to undef values
/// (see canReuseExtract for details) and used in order to avoid undef values
/// have effect on operands ordering.
/// The first loop below simply finds all unused indices and then the next loop
/// nest assigns these indices for undef values positions.
/// As an example below Order has two undef positions and they have assigned
/// values 3 and 7 respectively:
/// before: 6 9 5 4 9 2 1 0
/// after: 6 3 5 4 7 2 1 0
/// NOTE(review): the statements that update UnusedIndices/MaskedIndices inside
/// the first loop are not present in this copy - confirm against upstream.
static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) {
const unsigned Sz = Order.size();
// Every index starts out as unused; no position starts out as masked.
SmallBitVector UnusedIndices(Sz, /*t=*/true);
SmallBitVector MaskedIndices(Sz);
for (unsigned I = 0; I < Sz; ++I) {
// In-bounds entries are real indices; anything >= Sz is the special value.
if (Order[I] < Sz)
if (MaskedIndices.none())
assert(UnusedIndices.count() == MaskedIndices.count() &&
"Non-synced masked/available indices.");
// Walk both bit vectors in lock step, handing the next unused index to the
// next masked position.
int Idx = UnusedIndices.find_first();
int MIdx = MaskedIndices.find_first();
while (MIdx >= 0) {
assert(Idx >= 0 && "Indices must be synced.");
Order[MIdx] = Idx;
Idx = UnusedIndices.find_next(Idx);
MIdx = MaskedIndices.find_next(MIdx);
namespace llvm {
/// Fills \p Mask with the inverse of the permutation \p Indices, i.e. for every
/// position I it stores Mask[Indices[I]] = I.
/// \p Mask is resized to Indices.size(); elements added by the resize are
/// PoisonMaskElem. Every value in \p Indices is used as an index into \p Mask
/// and therefore has to be smaller than Indices.size().
static void inversePermutation(ArrayRef<unsigned> Indices,
SmallVectorImpl<int> &Mask) {
const unsigned E = Indices.size();
Mask.resize(E, PoisonMaskElem);
for (unsigned I = 0; I < E; ++I)
Mask[Indices[I]] = I;
/// Reorders the list of scalars in accordance with the given \p Mask.
/// The element found at position I is written to position Mask[I]; positions
/// whose mask entry is PoisonMaskElem are skipped. \p Mask must not be empty.
/// NOTE(review): the second constructor argument of Prev is missing in this
/// copy, so how Prev is populated cannot be told from here - confirm against
/// upstream.
static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
ArrayRef<int> Mask) {
assert(!Mask.empty() && "Expected non-empty mask.");
SmallVector<Value *> Prev(Scalars.size(),
for (unsigned I = 0, E = Prev.size(); I < E; ++I)
if (Mask[I] != PoisonMaskElem)
Scalars[Mask[I]] = Prev[I];
/// Checks whether \p V needs no scheduling as far as its operands are
/// concerned.
/// \returns true if \p V is not an instruction, or if it is an instruction
/// for which mayHaveNonDefUseDependency() is false and none of whose operands
/// is a non-phi instruction from the same basic block.
static bool areAllOperandsNonInsts(Value *V) {
  auto *Inst = dyn_cast<Instruction>(V);
  if (!Inst)
    return true;
  if (mayHaveNonDefUseDependency(*Inst))
    return false;
  // An operand only matters when it is a non-phi instruction of Inst's block.
  return none_of(Inst->operands(), [Inst](Value *Op) {
    auto *OpInst = dyn_cast<Instruction>(Op);
    return OpInst && !isa<PHINode>(OpInst) &&
           OpInst->getParent() == Inst->getParent();
  });
}
/// Checks whether \p V needs no scheduling as far as its users are concerned.
/// \returns true if \p V is not an instruction, or if it is an instruction
/// that does not read or write memory, has fewer than 8 uses, and whose
/// instruction users are all phi nodes or live in a different basic block.
static bool isUsedOutsideBlock(Value *V) {
  auto *Inst = dyn_cast<Instruction>(V);
  if (!Inst)
    return true;
  // Limits the number of uses to save compile time.
  constexpr int UsesLimit = 8;
  if (Inst->mayReadOrWriteMemory() || Inst->hasNUsesOrMore(UsesLimit))
    return false;
  // A user only matters when it is a non-phi instruction of Inst's block.
  return none_of(Inst->users(), [Inst](User *U) {
    auto *UserInst = dyn_cast<Instruction>(U);
    return UserInst && UserInst->getParent() == Inst->getParent() &&
           !isa<PHINode>(UserInst);
  });
}
/// Checks if the specified value does not require scheduling.
/// \returns true only when both areAllOperandsNonInsts() and
/// isUsedOutsideBlock() hold for \p V, i.e. neither its operands nor its users
/// force it to be scheduled in the current basic block.
static bool doesNotNeedToBeScheduled(Value *V) {
  if (!areAllOperandsNonInsts(V))
    return false;
  return isUsedOutsideBlock(V);
}
/// Checks if the specified array of values does not require scheduling.
/// \returns true if \p VL is non-empty and either every value satisfies
/// isUsedOutsideBlock() or every value satisfies areAllOperandsNonInsts();
/// an empty list yields false.
static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
  if (VL.empty())
    return false;
  if (all_of(VL, isUsedOutsideBlock))
    return true;
  return all_of(VL, areAllOperandsNonInsts);
}
namespace slpvectorizer {
/// Bottom Up SLP Vectorizer.
class BoUpSLP {
struct TreeEntry;
struct ScheduleData;
class ShuffleCostEstimator;
class ShuffleInstructionBuilder;
using ValueList = SmallVector<Value *, 8>;
using InstrList = SmallVector<Instruction *, 16>;
using ValueSet = SmallPtrSet<Value *, 16>;
using StoreList = SmallVector<StoreInst *, 8>;
using ExtraValueToDebugLocsMap =
MapVector<Value *, SmallVector<Instruction *, 2>>;
using OrdersType = SmallVector<unsigned, 4>;
/// Creates the vectorizer for \p Func and stores the analyses it works with.
/// \p Aa and \p Se are dereferenced here (for BatchAA and for the builder's
/// context) and therefore must not be null.
/// The constructor also collects the ephemeral values of the function into
/// EphValues and picks the maximum and minimum vector register sizes.
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
const DataLayout *DL, OptimizationRemarkEmitter *ORE)
: BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
// Use the vector register size specified by the target unless overridden
// by a command-line option.
// TODO: It would be better to limit the vectorization factor based on
// data type rather than just register size. For example, x86 AVX has
// 256-bit registers, but it does not support integer operations
// at that width (that requires AVX2).
// NOTE(review): the `else` branches and the right-hand side of the second
// MaxVecRegSize assignment are missing in this copy - confirm against
// upstream.
if (MaxVectorRegSizeOption.getNumOccurrences())
MaxVecRegSize = MaxVectorRegSizeOption;
MaxVecRegSize =
// Same scheme for the minimum size: command-line value if given, otherwise
// the target's minimum vector register bit width.
if (MinVectorRegSizeOption.getNumOccurrences())
MinVecRegSize = MinVectorRegSizeOption;
MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
/// Vectorize the tree that starts with the elements in \p VL.
/// Returns the vectorized root.
Value *vectorizeTree();
/// Vectorize the tree but with the list of externally used values \p
/// ExternallyUsedValues. Values in this MapVector can be replaced by the
/// generated extractelement instructions.
/// \param ReplacedExternals contains the list of replaced external values
/// {scalar, replace} after emitting extractelement for external uses.
Value *
vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
Instruction *ReductionRoot = nullptr);
/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.
InstructionCost getSpillCost() const;
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
void buildTree(ArrayRef<Value *> Roots,
const SmallDenseSet<Value *> &UserIgnoreLst);
/// Construct a vectorizable tree that starts at \p Roots.
void buildTree(ArrayRef<Value *> Roots);
/// Returns whether the root node has in-tree uses.
/// An empty VectorizableTree always yields false, because the emptiness check
/// is the first operand of the conjunction.
/// NOTE(review): the second operand of the `&&` is missing in this copy -
/// confirm against upstream.
bool doesRootHaveInTreeUses() const {
return !VectorizableTree.empty() &&
/// \returns the scalars stored in the root node, i.e. in the first entry of
/// VectorizableTree. The tree must not be empty.
ArrayRef<Value *> getRootNodeScalars() const {
  assert(!VectorizableTree.empty() && "No graph to get the first node from");
  const auto &RootEntry = VectorizableTree.front();
  return RootEntry->Scalars;
}
/// Builds external uses of the vectorized scalars, i.e. the list of
/// vectorized scalars to be extracted, their lanes and their scalar users. \p
/// ExternallyUsedValues contains additional list of external uses to handle
/// vectorization of reductions.
buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
/// Clear the internal data structures that are created by 'buildTree'.
/// The visible part walks every entry of BlocksSchedules and resets
/// UserIgnoreList to nullptr.
/// NOTE(review): most statements of this function (including whatever is done
/// with BS) are missing in this copy - confirm against upstream.
void deleteTree() {
for (auto &Iter : BlocksSchedules) {
// Borrow the raw BlockScheduling pointer held by the map entry.
BlockScheduling *BS = Iter.second.get();
UserIgnoreList = nullptr;
/// \returns the number of entries currently stored in VectorizableTree.
unsigned getTreeSize() const {
  const unsigned NumEntries = VectorizableTree.size();
  return NumEntries;
}
/// Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
/// Checks if the specified gather tree entry \p TE can be represented as a
/// shuffled vector entry + (possibly) permutation with other gathers. It
/// implements the checks only for possibly ordered scalars (Loads,
/// ExtractElement, ExtractValue), which can be part of the graph.
std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
/// Sort loads into increasing pointers offsets to allow greater clustering.
std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
/// Gets reordering data for the given tree entry. If the entry is vectorized
/// - just return ReorderIndices, otherwise check if the scalars can be
/// reordered and return the most optimal order.
/// \return std::nullopt if ordering is not important, empty order, if
/// identity order is important, or the actual order.
/// \param TopToBottom If true, include the order of vectorized stores and
/// insertelement nodes, otherwise skip them.
std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
bool TopToBottom);
/// Reorders the current graph to the most profitable order starting from the
/// root node to the leaf nodes. The best order is chosen only from the nodes
/// of the same size (vectorization factor). Smaller nodes are considered
/// parts of subgraph with smaller VF and they are reordered independently. We
/// can make it because we still need to extend smaller nodes to the wider VF
/// and we can merge reordering shuffles with the widening shuffles.
void reorderTopToBottom();
/// Reorders the current graph to the most profitable order starting from
/// leaves to the root. It allows to rotate small subgraphs and reduce the
/// number of reshuffles if the leaf nodes use the same order. In this case we
/// can merge the orders and just shuffle user node instead of shuffling its
/// operands. Plus, even the leaf nodes have different orders, it allows to
/// sink reordering in the graph closer to the root node and merge it later
/// during analysis.
void reorderBottomToTop(bool IgnoreReorder = false);
/// \return The vector element size in bits to use when vectorizing the
/// expression tree ending at \p V. If V is a store, the size is the width of
/// the stored value. Otherwise, the size is the width of the largest loaded
/// value reaching V. This method is used by the vectorizer to calculate
/// vectorization factors.
unsigned getVectorElementSize(Value *V);
/// Compute the minimum type sizes required to represent the entries in a
/// vectorizable tree.
void computeMinimumValueSizes();
/// \returns maximum vector register size as set by TTI or overridden by
/// cl::opt. This is the value of the MaxVecRegSize member.
unsigned getMaxVecRegSize() const {
return MaxVecRegSize;
/// \returns minimum vector register size as set by TTI or overridden by
/// cl::opt. This is the value of the MinVecRegSize member.
unsigned getMinVecRegSize() const {
return MinVecRegSize;
/// \returns the minimum vectorization factor for elements of size \p Sz: the
/// minimum vector register size divided by \p Sz, but never less than 2.
/// \p Sz is used as a divisor and must not be zero.
unsigned getMinVF(unsigned Sz) const {
  const unsigned ElemsPerMinReg = getMinVecRegSize() / Sz;
  return ElemsPerMinReg < 2U ? 2U : ElemsPerMinReg;
}
/// \returns the maximum vectorization factor for elements of width
/// \p ElemWidth and the operation \p Opcode. The value of MaxVFOption is used
/// when that option occurs on the command line, otherwise the answer of
/// TTI->getMaximumVF(). A result of zero is reported as UINT_MAX.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  unsigned MaxVF = 0;
  if (MaxVFOption.getNumOccurrences())
    MaxVF = MaxVFOption;
  else
    MaxVF = TTI->getMaximumVF(ElemWidth, Opcode);
  if (MaxVF == 0)
    return UINT_MAX;
  return MaxVF;
}
/// Check if homogeneous aggregate is isomorphic to some VectorType.
/// Accepts homogeneous multidimensional aggregate of scalars/vectors like
/// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
/// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
/// \returns number of elements in vector if isomorphism exists, 0 otherwise.
unsigned canMapToVector(Type *T, const DataLayout &DL) const;
/// \returns True if the VectorizableTree is both tiny and not fully
/// vectorizable. We do not vectorize such trees.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
/// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
/// can be load combined in the backend. Load combining may not be allowed in
/// the IR optimizer, so we do not want to alter the pattern. For example,
/// partially transforming a scalar bswap() pattern into vector code is
/// effectively impossible for the backend to undo.
/// TODO: If load combining is allowed in the IR optimizer, this analysis
/// may not be necessary.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
/// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
/// can be load combined in the backend. Load combining may not be allowed in
/// the IR optimizer, so we do not want to alter the pattern. For example,
/// partially transforming a scalar bswap() pattern into vector code is
/// effectively impossible for the backend to undo.
/// TODO: If load combining is allowed in the IR optimizer, this analysis
/// may not be necessary.
bool isLoadCombineCandidate() const;
/// \returns the optimization remark emitter stored in the ORE member.
OptimizationRemarkEmitter *getORE() { return ORE; }
/// This structure holds any data we need about the edges being traversed
/// during buildTree_rec(). We keep track of:
/// (i) the user TreeEntry, and
/// (ii) the operand index of the use (the index of the edge).
struct EdgeInfo {
/// Creates an edge with no user (nullptr) and EdgeIdx set to UINT_MAX.
EdgeInfo() = default;
/// Creates the edge for operand \p EdgeIdx of \p UserTE.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
: UserTE(UserTE), EdgeIdx(EdgeIdx) {}
/// The user TreeEntry.
TreeEntry *UserTE = nullptr;
/// The operand index of the use.
unsigned EdgeIdx = UINT_MAX;
#ifndef NDEBUG
/// Stream operator, only declared when NDEBUG is not defined.
/// NOTE(review): the body appears truncated in this copy; only the return of
/// \p OS is visible.
friend inline raw_ostream &operator<<(raw_ostream &OS,
const BoUpSLP::EdgeInfo &EI) {
return OS;
/// Debug print: writes "{User:<index or null> EdgeIdx:<idx>}" to \p OS, where
/// the index is UserTE->Idx when a user is set.
void dump(raw_ostream &OS) const {
OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
<< " EdgeIdx:" << EdgeIdx << "}";
/// Debug print to dbgs().
LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
/// Two edges are equal when both the user entry and the edge index match.
bool operator == (const EdgeInfo &Other) const {
return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
/// A helper class used for scoring candidates for two consecutive lanes.
/// It only keeps references to the analyses and to the vectorizer it is
/// constructed with, plus the two integer limits below.
class LookAheadHeuristics {
/// Library info, passed to getSameOpcode() when comparing instructions.
const TargetLibraryInfo &TLI;
/// Data layout, passed to getPointersDiff() when comparing loads.
const DataLayout &DL;
/// Scalar evolution, passed to getPointersDiff() when comparing loads.
ScalarEvolution &SE;
/// The vectorizer instance; queried for tree entries and for its TTI.
const BoUpSLP &R;
int NumLanes; // Total number of lanes (aka vectorization factor).
int MaxLevel; // The maximum recursion depth for accumulating score.
/// Stores the given references and limits; performs no other work.
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
int MaxLevel)
: TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
MaxLevel(MaxLevel) {}
// The hard-coded scores listed here are not very important, though it shall
// be higher for better matches to improve the resulting cost. When
// computing the scores of matching one sub-tree with another, we are
// basically counting the number of values that are matching. So even if all
// scores are set to 1, we would still get a decent matching result.
// However, sometimes we have to break ties. For example we may have to
// choose between matching loads vs matching opcodes. This is what these
// scores are helping us with: they provide the order of preference. Also,
// this is important if the scalar is externally used or used in another
// tree entry node in the different lane.
/// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreConsecutiveLoads = 4;
/// The same load multiple times. This should have a better score than
/// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it
/// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
/// a vector load and 1.0 for a broadcast.
static const int ScoreSplatLoads = 3;
/// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreReversedLoads = 3;
/// A load candidate for masked gather.
static const int ScoreMaskedGatherCandidate = 1;
/// ExtractElementInst from same vector and consecutive indexes.
static const int ScoreConsecutiveExtracts = 4;
/// ExtractElementInst from same vector and reversed indices.
static const int ScoreReversedExtracts = 3;
/// Constants.
static const int ScoreConstants = 2;
/// Instructions with the same opcode.
static const int ScoreSameOpcode = 2;
/// Instructions with alt opcodes (e.g, add + sub).
static const int ScoreAltOpcodes = 1;
/// Identical instructions (a.k.a. splat or broadcast).
static const int ScoreSplat = 1;
/// Matching with an undef is preferable to failing.
static const int ScoreUndef = 1;
/// Score for failing to find a decent match.
static const int ScoreFail = 0;
/// Score if all users are vectorized.
static const int ScoreAllUserVectorized = 1;
/// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
/// \p U1 and \p U2 are the users of \p V1 and \p V2.
/// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
/// MainAltOps.
/// The result is one of the Score* constants of this class. The cases are
/// tried in this order: identical values, pairs of loads, pairs of constants,
/// an extractelement with constant index as \p V1, generic instruction pairs,
/// and finally an undef \p V2; anything else is ScoreFail.
/// NOTE(review): a few condition operands are missing in this copy (for
/// example after the isValidElementType() and isSimple() checks) - confirm
/// against upstream.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
ArrayRef<Value *> MainAltOps) const {
if (!isValidElementType(V1->getType()) ||
return LookAheadHeuristics::ScoreFail;
// Case 1: the very same value in both lanes (splat).
if (V1 == V2) {
if (isa<LoadInst>(V1)) {
// Returns true if the users of V1 and V2 won't need to be extracted.
auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
// Bail out if we have too many uses to save compilation time.
static constexpr unsigned Limit = 8;
if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit))
return false;
// A user is fine if it is one of the two known users or already has a
// tree entry in the vectorizer.
auto AllUsersVectorized = [U1, U2, this](Value *V) {
return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
return AllUsersVectorized(V1) && AllUsersVectorized(V2);
// A broadcast of a load can be cheaper on some targets.
if (R.TTI->isLegalBroadcastLoad(V1->getType(),
ElementCount::getFixed(NumLanes)) &&
((int)V1->getNumUses() == NumLanes ||
AllUsersAreInternal(V1, V2)))
return LookAheadHeuristics::ScoreSplatLoads;
return LookAheadHeuristics::ScoreSplat;
// Case 2: two loads; the score depends on the distance between pointers.
auto *LI1 = dyn_cast<LoadInst>(V1);
auto *LI2 = dyn_cast<LoadInst>(V2);
if (LI1 && LI2) {
if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
return LookAheadHeuristics::ScoreFail;
std::optional<int> Dist = getPointersDiff(
LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
// Unknown or zero distance: at best a masked gather candidate.
if (!Dist || *Dist == 0) {
if (getUnderlyingObject(LI1->getPointerOperand()) ==
getUnderlyingObject(LI2->getPointerOperand()) &&
FixedVectorType::get(LI1->getType(), NumLanes),
return LookAheadHeuristics::ScoreMaskedGatherCandidate;
return LookAheadHeuristics::ScoreFail;
// The distance is too large - still may be profitable to use masked
// loads/gathers.
if (std::abs(*Dist) > NumLanes / 2)
return LookAheadHeuristics::ScoreMaskedGatherCandidate;
// This still will detect consecutive loads, but we might have "holes"
// in some cases. It is ok for non-power-2 vectorization and may produce
// better results. It should not affect current vectorization.
return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
: LookAheadHeuristics::ScoreReversedLoads;
// Case 3: two constants.
auto *C1 = dyn_cast<Constant>(V1);
auto *C2 = dyn_cast<Constant>(V2);
if (C1 && C2)
return LookAheadHeuristics::ScoreConstants;
// Case 4: V1 is an extractelement with a constant index.
// Extracts from consecutive indexes of the same vector better score as
// the extracts could be optimized away.
Value *EV1;
ConstantInt *Ex1Idx;
if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
// Undefs are always profitable for extractelements.
// Compiler can easily combine poison and extractelement <non-poison> or
// undef and extractelement <poison>. But combining undef +
// extractelement <non-poison-but-may-produce-poison> requires some
// extra operations.
if (isa<UndefValue>(V2))
return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
? LookAheadHeuristics::ScoreConsecutiveExtracts
: LookAheadHeuristics::ScoreSameOpcode;
Value *EV2 = nullptr;
ConstantInt *Ex2Idx = nullptr;
if (match(V2,
m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
m_Undef())))) {
// Undefs are always profitable for extractelements.
// Ex2Idx stays null when the index of V2 matched m_Undef().
if (!Ex2Idx)
return LookAheadHeuristics::ScoreConsecutiveExtracts;
if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
return LookAheadHeuristics::ScoreConsecutiveExtracts;
if (EV2 == EV1) {
int Idx1 = Ex1Idx->getZExtValue();
int Idx2 = Ex2Idx->getZExtValue();
int Dist = Idx2 - Idx1;
// Same index: both lanes extract the very same element.
if (std::abs(Dist) == 0)
return LookAheadHeuristics::ScoreSplat;
// The distance is too large - still may be profitable to use
// shuffles.
if (std::abs(Dist) > NumLanes / 2)
return LookAheadHeuristics::ScoreSameOpcode;
return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
: LookAheadHeuristics::ScoreReversedExtracts;
return LookAheadHeuristics::ScoreAltOpcodes;
return LookAheadHeuristics::ScoreFail;
// Case 5: two arbitrary instructions from the same block; compare opcodes
// through getSameOpcode() on a copy of MainAltOps.
auto *I1 = dyn_cast<Instruction>(V1);
auto *I2 = dyn_cast<Instruction>(V2);
if (I1 && I2) {
if (I1->getParent() != I2->getParent())
return LookAheadHeuristics::ScoreFail;
SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
InstructionsState S = getSameOpcode(Ops, TLI);
// Note: Only consider instructions with <= 2 operands to avoid
// complexity explosion.
if (S.getOpcode() &&
(S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
!S.isAltShuffle()) &&
all_of(Ops, [&S](Value *V) {
return cast<Instruction>(V)->getNumOperands() ==
return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
: LookAheadHeuristics::ScoreSameOpcode;
// Case 6: matching with an undef is preferable to failing.
if (isa<UndefValue>(V2))
return LookAheadHeuristics::ScoreUndef;
return LookAheadHeuristics::ScoreFail;
/// Go through the operands of \p LHS and \p RHS recursively until
/// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
/// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
/// of \p U1 and \p U2), except at the beginning of the recursion where
/// these are set to nullptr.
/// For example:
/// \verbatim
/// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
/// \ / \ / \ / \ /
/// + + + +
/// G1 G2 G3 G4
/// \endverbatim
/// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
/// each level recursively, accumulating the score. It starts from matching
/// the additions at level 0, then moves on to the loads (level 1). The
/// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
/// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
/// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
/// Please note that the order of the operands does not matter, as we
/// evaluate the score of all profitable combinations of operands. In
/// other words the score of G1 and G4 is the same as G1 and G2. This
/// heuristic is based on ideas described in:
/// Look-ahead SLP: Auto-vectorization in the presence of commutative
/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
/// Luís F. W. Góes
/// \param CurrLevel the current recursion depth, compared against MaxLevel.
/// \param MainAltOps forwarded to getShallowScore() for this level only; the
/// recursive calls pass std::nullopt.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
Instruction *U2, int CurrLevel,
ArrayRef<Value *> MainAltOps) const {
// Get the shallow score of LHS and RHS.
int ShallowScoreAtThisLevel =
getShallowScore(LHS, RHS, U1, U2, MainAltOps);
// If reached MaxLevel,
// or if LHS and RHS are not instructions,
// or if they are SPLAT,
// or if they are not consecutive,
// or if profitable to vectorize loads or extractelements, early return
// the current cost.
// NOTE(review): the last operand of the condition below is missing in this
// copy - confirm against upstream.
auto *I1 = dyn_cast<Instruction>(LHS);
auto *I2 = dyn_cast<Instruction>(RHS);
if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
(((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
(I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
(isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
return ShallowScoreAtThisLevel;
assert(I1 && I2 && "Should have early exited.");
// Contains the I2 operand indexes that got matched with I1 operands.
SmallSet<unsigned, 4> Op2Used;
// Recursion towards the operands of I1 and I2. We are trying all possible
// operand pairs, and keeping track of the best score.
for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
OpIdx1 != NumOperands1; ++OpIdx1) {
// Try to pair the operand OpIdx1 of I1 with the best operand of I2.
int MaxTmpScore = 0;
unsigned MaxOpIdx2 = 0;
bool FoundBest = false;
// If I2 is commutative try all combinations.
// Otherwise only the operand at the same index (if I2 has one) is tried.
unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
unsigned ToIdx = isCommutative(I2)
? I2->getNumOperands()
: std::min(I2->getNumOperands(), OpIdx1 + 1);
assert(FromIdx <= ToIdx && "Bad index");
for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
// Skip operands of I2 that are already paired with an operand of I1.
if (Op2Used.count(OpIdx2))
// Recursively calculate the cost at each level
int TmpScore =
getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
I1, I2, CurrLevel + 1, std::nullopt);
// Look for the best score.
if (TmpScore > LookAheadHeuristics::ScoreFail &&
TmpScore > MaxTmpScore) {
MaxTmpScore = TmpScore;
MaxOpIdx2 = OpIdx2;
FoundBest = true;
if (FoundBest) {
// Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
ShallowScoreAtThisLevel += MaxTmpScore;
return ShallowScoreAtThisLevel;
/// A helper data structure to hold the operands of a vector of instructions.
/// This supports a fixed vector length for all operand vectors.
class VLOperands {
/// For each operand we need (i) the value, and (ii) the opcode that it
/// would be attached to if the expression was in a left-linearized form.
/// This is required to avoid illegal operand reordering.
/// For example:
/// \verbatim
/// 0 Op1
/// |/
/// Op1 Op2 Linearized + Op2
/// \ / ----------> |/
/// - -
/// Op1 - Op2 (0 + Op1) - Op2
/// \endverbatim
/// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
/// Another way to think of this is to track all the operations across the
/// path from the operand all the way to the root of the tree and to
/// calculate the operation that corresponds to this path. For example, the
/// path from Op2 to the root crosses the RHS of the '-', therefore the
/// corresponding operation is a '-' (which matches the one in the
/// linearized tree, as shown above).
/// For lack of a better term, we refer to this operation as Accumulated
/// Path Operation (APO).
/// Per-operand record: the operand value, its APO flag and a 'used' marker
/// for the reordering function.
struct OperandData {
/// Creates an empty record: null value, APO and IsUsed both false.
OperandData() = default;
/// Creates a record from the given value and flags.
OperandData(Value *V, bool APO, bool IsUsed)
: V(V), APO(APO), IsUsed(IsUsed) {}
/// The operand value.
Value *V = nullptr;
/// TreeEntries only allow a single opcode, or an alternate sequence of
/// them (e.g, +, -). Therefore, we can safely use a boolean value for the
/// APO. It is set to 'true' if 'V' is attached to an inverse operation
/// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
/// (e.g., Add/Mul)
bool APO = false;
/// Helper data for the reordering function.
bool IsUsed = false;
/// During operand reordering, we are trying to select the operand at lane
/// that matches best with the operand at the neighboring lane. Our
/// selection is based on the type of value we are looking for. For example,
/// if the neighboring lane has a load, we need to look for a load that is
/// accessing a consecutive address. These strategies are summarized in the
/// 'ReorderingMode' enumerator.
enum class ReorderingMode {
Load, ///< Matching loads to consecutive memory addresses
Opcode, ///< Matching instructions based on opcode (same or alternate)
Constant, ///< Matching constants
Splat, ///< Matching the same instruction multiple times (broadcast)
Failed, ///< We failed to create a vectorizable group
using OperandDataVec = SmallVector<OperandData, 2>;
/// A vector of operand vectors.
SmallVector<OperandDataVec, 4> OpsVec;
const TargetLibraryInfo &TLI;
const DataLayout &DL;
ScalarEvolution &SE;
const BoUpSLP &R;
/// \returns a mutable reference to the operand data stored for operand
/// \p OpIdx in lane \p Lane.
OperandData &getData(unsigned OpIdx, unsigned Lane) {
  OperandDataVec &LanesOfOperand = OpsVec[OpIdx];
  return LanesOfOperand[Lane];
}
/// \returns a read-only reference to the operand data stored for operand
/// \p OpIdx in lane \p Lane. Const version.
const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
  const OperandDataVec &LanesOfOperand = OpsVec[OpIdx];
  return LanesOfOperand[Lane];
}
/// Clears the used flag for all entries.
void clearUsed() {
for (unsigned OpIdx = 0, NumOperands = getNumOperands();
OpIdx != NumOperands; ++OpIdx)
for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
OpsVec[OpIdx][Lane].IsUsed = false;
/// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
/// \param Lane lane of the operands under analysis.
/// \param OpIdx operand index in \p Lane lane we're looking the best
/// candidate for.
/// \param Idx operand index of the current candidate value.
/// \returns The additional score due to possible broadcasting of the
/// elements in the lane. It is more profitable to have power-of-2 unique
/// elements in the lane, it will be vectorized with higher probability
/// after removing duplicates. Currently the SLP vectorizer supports only
/// vectorization of the power-of-2 number of unique scalars.
/// The result is 0 when the candidate is not an instruction, when it already
/// sits at \p OpIdx, when another lane holds a non-instruction at \p OpIdx,
/// or when choosing the candidate does not change the number of uniques.
/// NOTE(review): the statements that skip \p Lane and that fill Uniques are
/// missing in this copy - confirm against upstream.
int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
Value *IdxLaneV = getData(Idx, Lane).V;
if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
return 0;
SmallPtrSet<Value *, 4> Uniques;
for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
if (Ln == Lane)
Value *OpIdxLnV = getData(OpIdx, Ln).V;
if (!isa<Instruction>(OpIdxLnV))
return 0;
int UniquesCount = Uniques.size();
// Number of unique values if the candidate is placed at OpIdx ...
int UniquesCntWithIdxLaneV =
Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
// ... versus the number if the current operand stays at OpIdx.
Value *OpIdxLaneV = getData(OpIdx, Lane).V;
int UniquesCntWithOpIdxLaneV =
Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
return 0;
// Difference of the paddings needed to reach the next power of two: positive
// when the candidate needs less padding than the current operand.
return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
UniquesCntWithOpIdxLaneV) -
(PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
/// \param Lane lane of the operands under analysis.
/// \param OpIdx operand index in \p Lane lane we're looking the best
/// candidate for.
/// \param Idx operand index of the current candidate value.
/// \returns The additional score for the scalar which users are all
/// vectorized: LookAheadHeuristics::ScoreAllUserVectorized or 0.
/// NOTE(review): the second operand of the first condition is missing in
/// this copy - confirm against upstream.
int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
Value *IdxLaneV = getData(Idx, Lane).V;
Value *OpIdxLaneV = getData(OpIdx, Lane).V;
// Do not care about number of uses for vector-like instructions
// (extractelement/extractvalue with constant indices), they are extracts
// themselves and already externally used. Vectorization of such
// instructions does not add extra extractelement instruction, just may
// remove it.
if (isVectorLikeInstWithConstOps(IdxLaneV) &&
return LookAheadHeuristics::ScoreAllUserVectorized;
// Only instruction pairs can earn the bonus.
auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
return 0;
return R.areAllUsersVectorized(IdxLaneI)
? LookAheadHeuristics::ScoreAllUserVectorized
: 0;
/// Score scaling factor for fully compatible instructions but with
/// different number of external uses. Allows better selection of the
/// instructions with less external uses.
static const int ScoreScaleFactor = 10;
/// \Returns the look-ahead score, which tells us how much the sub-trees
/// rooted at \p LHS and \p RHS match, the more they match the higher the
/// score. This helps break ties in an informed way when we cannot decide on
/// the order of the operands by just considering the immediate
/// predecessors.
int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
int Lane, unsigned OpIdx, unsigned Idx,
bool &IsUsed) {
// NOTE(review): the constructor call below is cut off after
// `getNumLanes(),` in this copy of the file -- the remaining argument(s) and
// the closing parenthesis are not visible. Restore from upstream.
LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
// Keep track of the instruction stack as we recurse into the operands
// during the look-ahead score exploration.
int Score =
LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
/*CurrLevel=*/1, MainAltOps);
// A zero score skips all of the adjustments below.
if (Score) {
int SplatScore = getSplatScore(Lane, OpIdx, Idx);
// If adding the splat score would drive the total to zero or below, clamp
// to the smallest non-zero score instead.
if (Score <= -SplatScore) {
// Set the minimum score for splat-like sequence to avoid setting
// failed state.
Score = 1;
} else {
Score += SplatScore;
// Scale score to see the difference between different operands
// and similar operands but all vectorized/not all vectorized
// uses. It does not affect actual selection of the best
// compatible operand in general, just allows to select the
// operand with all vectorized uses.
Score *= ScoreScaleFactor;
Score += getExternalUseScore(Lane, OpIdx, Idx);
// NOTE(review): closing braces are not visible here, so it cannot be told
// from this copy which branch the assignment below belongs to -- confirm.
IsUsed = true;
return Score;
/// Best defined scores per lanes between the passes. Used to choose the
/// best operand (with the highest score) between the passes.
/// The key - {Operand Index, Lane}.
/// The value - the best score between the passes for the lane and the
/// operand.
// NOTE(review): the declarator name and terminating `;` are missing after
// this type in this copy of the file. getBestOperand() reads and writes a
// map named BestScoresPerLanes keyed by (OpIdx, Lane); presumably this is
// its declaration -- confirm against upstream.
SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
// Search all operands in Ops[*][Lane] for the one that matches best
// Ops[OpIdx][LastLane] and return its operand index.
// If no good match can be found, return std::nullopt.
// NOTE(review): the return-type line is not visible in this copy of the
// file. The function returns either BestOp.Idx (a std::optional<unsigned>)
// or std::nullopt.
getBestOperand(unsigned OpIdx, int Lane, int LastLane,
ArrayRef<ReorderingMode> ReorderingModes,
ArrayRef<Value *> MainAltOps) {
unsigned NumOperands = getNumOperands();
// The operand of the previous lane at OpIdx.
Value *OpLastLane = getData(OpIdx, LastLane).V;
// Our strategy mode for OpIdx.
ReorderingMode RMode = ReorderingModes[OpIdx];
if (RMode == ReorderingMode::Failed)
return std::nullopt;
// The linearized opcode of the operand at OpIdx, Lane.
bool OpIdxAPO = getData(OpIdx, Lane).APO;
// The best operand index and its score.
// Sometimes we have more than one option (e.g., Opcode and Undefs), so we
// are using the score to differentiate between the two.
struct BestOpData {
std::optional<unsigned> Idx;
unsigned Score = 0;
} BestOp;
// Seed the running best score from the per-(OpIdx, Lane) map, inserting 0
// when no entry exists yet.
// NOTE(review): the expression below is cut off (no member access on the
// try_emplace result and no `;`) in this copy of the file.
BestOp.Score =
BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
// Track if the operand must be marked as used. If the operand is set to
// Score 1 explicitly (because of non power-of-2 unique scalars, we may
// want to reestimate the operands again on the following iterations).
bool IsUsed =
RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
// Iterate through all unused operands and look for the best.
for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
// Get the operand at Idx and Lane.
OperandData &OpData = getData(Idx, Lane);
Value *Op = OpData.V;
bool OpAPO = OpData.APO;
// Skip already selected operands.
// NOTE(review): the body of this `if` (presumably `continue;`) is not
// visible in this copy of the file.
if (OpData.IsUsed)
// Skip if we are trying to move the operand to a position with a
// different opcode in the linearized tree form. This would break the
// semantics.
// NOTE(review): the body of this `if` (presumably `continue;`) is not
// visible in this copy of the file.
if (OpAPO != OpIdxAPO)
// Look for an operand that matches the current mode.
// NOTE(review): no `break;` statements or closing braces are visible for
// the cases below -- confirm the fall-through structure against upstream.
switch (RMode) {
case ReorderingMode::Load:
case ReorderingMode::Constant:
case ReorderingMode::Opcode: {
// Pass the two values to the look-ahead in lane order: the value from the
// lower-numbered lane goes on the left.
bool LeftToRight = Lane > LastLane;
Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
Value *OpRight = (LeftToRight) ? Op : OpLastLane;
int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
OpIdx, Idx, IsUsed);
// Keep the candidate only if it strictly beats the best score so far, and
// record the new best in the per-(OpIdx, Lane) map.
if (Score > static_cast<int>(BestOp.Score)) {
BestOp.Idx = Idx;
BestOp.Score = Score;
BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
case ReorderingMode::Splat:
// In splat mode a candidate matches only if it is the very same value as
// the one chosen for the previous lane.
if (Op == OpLastLane)
BestOp.Idx = Idx;
case ReorderingMode::Failed:
llvm_unreachable("Not expected Failed reordering mode.");
// Mark the chosen operand with the computed IsUsed flag and return its
// index.
if (BestOp.Idx) {
getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
return BestOp.Idx;
// If we could not find a good match return std::nullopt.
return std::nullopt;
/// Helper for reorderOperandVecs.
/// \returns the lane that we should start reordering from. This is the one
/// which has the least number of operands that can freely move about or
/// less profitable because it already has the most optimal set of operands.
unsigned getBestLaneToStartReordering() const {
// Min / SameOpNumber hold the best (smallest) NumOfAPOs seen so far and the
// NumOpsWithSameOpcodeParent value that goes with it.
unsigned Min = UINT_MAX;
unsigned SameOpNumber = 0;
// std::pair<unsigned, unsigned> is used to implement a simple voting
// algorithm and choose the lane with the least number of operands that
// can freely move about or less profitable because it already has the
// most optimal set of operands. The first unsigned is a counter for
// voting, the second unsigned is the counter of lanes with instructions
// with same/alternate opcodes and same parent basic block.
MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
// Try to be closer to the original results, if we have multiple lanes
// with same cost. If 2 lanes have the same cost, use the one with the
// lowest index.
// Lanes are visited from the highest index down to 0.
for (int I = getNumLanes(); I > 0; --I) {
unsigned Lane = I - 1;
// NOTE(review): the initializer of NumFreeOpsHash is not visible in this
// copy of the file; presumably getMaxNumOperandsThatCanBeReordered(Lane)
// -- confirm against upstream.
OperandsOrderData NumFreeOpsHash =
// Compare the number of operands that can move and choose the one with
// the least number.
if (NumFreeOpsHash.NumOfAPOs < Min) {
Min = NumFreeOpsHash.NumOfAPOs;
SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
} else if (NumFreeOpsHash.NumOfAPOs == Min &&
NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
// Select the most optimal lane in terms of number of operands that
// should be moved around.
SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
} else if (NumFreeOpsHash.NumOfAPOs == Min &&
NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
// Tie on both criteria: start a new vote entry for a hash not seen before.
// NOTE(review): no `else` branch is visible for an already-known hash;
// presumably it increments the vote counter -- confirm against upstream.
auto It = HashMap.find(NumFreeOpsHash.Hash);
if (It == HashMap.end())
HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
// Select the lane with the minimum counter.
unsigned BestLane = 0;
unsigned CntMin = UINT_MAX;
// Walk the vote entries in reverse insertion order, keeping the lane of the
// entry with the strictly smallest counter.
for (const auto &Data : reverse(HashMap)) {
if (Data.second.first < CntMin) {
CntMin = Data.second.first;
BestLane = Data.second.second;
return BestLane;
/// Data structure that helps to reorder operands.
// A default-constructed value (NumOfAPOs == UINT_MAX, the other fields 0) is
// what getMaxNumOperandsThatCanBeReordered() returns via `return {}` when
// every operand of the lane is undef.
struct OperandsOrderData {
/// The best number of operands with the same APOs, which can be
/// reordered.
unsigned NumOfAPOs = UINT_MAX;
/// Number of operands with the same/alternate instruction opcode and
/// parent.
unsigned NumOpsWithSameOpcodeParent = 0;
/// Hash for the actual operands ordering.
/// Used to count operands, actually their position id and opcode
/// value. It is used in the voting mechanism to find the lane with the
/// least number of operands that can freely move about or less profitable
/// because it already has the most optimal set of operands. Can be
/// replaced with SmallVector<unsigned> instead but hash code is faster
/// and requires less memory.
unsigned Hash = 0;
/// \returns the maximum number of operands that are allowed to be reordered
/// for \p Lane and the number of compatible instructions(with the same
/// parent/opcode). This is used as a heuristic for selecting the first lane
/// to start operand reordering.
OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
unsigned CntTrue = 0;
unsigned NumOperands = getNumOperands();
// Operands with the same APO can be reordered. We therefore need to count
// how many of them we have for each APO, like this: Cnt[APO] = x.
// Since we only have two APOs, namely true and false, we can avoid using
// a map. Instead we can simply count the number of operands that
// correspond to one of them (in this case the 'true' APO), and calculate
// the other by subtracting it from the total number of operands.
// Operands with the same instruction opcode and parent are more
// profitable since we don't need to move them in many cases, with a high
// probability such lane already can be vectorized effectively.
bool AllUndefs = true;
unsigned NumOpsWithSameOpcodeParent = 0;
Instruction *OpcodeI = nullptr;
BasicBlock *Parent = nullptr;
unsigned Hash = 0;
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
const OperandData &OpData = getData(OpIdx, Lane);
// NOTE(review): the body of this `if` is not visible in this copy of the
// file; presumably it increments CntTrue -- confirm against upstream.
if (OpData.APO)
// Use Boyer-Moore majority voting for finding the majority opcode and
// the number of times it occurs.
if (auto *I = dyn_cast<Instruction>(OpData.V)) {
// I does not agree with the current majority candidate (no candidate yet,
// no common opcode, or a different parent block).
if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
I->getParent() != Parent) {
// With the counter at zero, I becomes the new majority candidate.
if (NumOpsWithSameOpcodeParent == 0) {
NumOpsWithSameOpcodeParent = 1;
OpcodeI = I;
Parent = I->getParent();
// NOTE(review): both `else` branches below are empty in this copy of the
// file; the counter updates that majority voting needs are not visible.
} else {
} else {
// Fold the operand position and the value's kind ID into the lane hash.
Hash = hash_combine(
Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
// A lane made only of undefs yields a default-constructed result.
if (AllUndefs)
return {};
OperandsOrderData Data;
// The larger of the two APO groups (true vs. false).
Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
Data.Hash = Hash;
return Data;
/// Go through the instructions in VL and append their operands.
void appendOperandsOfVL(ArrayRef<Value *> VL) {
assert(!VL.empty() && "Bad VL");
assert((empty() || VL.size() == getNumLanes()) &&
"Expected same number of lanes");
assert(isa<Instruction>(VL[0]) && "Expected instruction");
// The operand count is taken from the first instruction; there is one lane
// per element of VL.
unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
unsigned NumLanes = VL.size();
// NOTE(review): no sizing of OpsVec (outer or per-operand) is visible in
// this copy of the file before it is indexed below -- confirm against
// upstream that the resize calls are present.
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
// Our tree has just 3 nodes: the root and two operands.
// It is therefore trivial to get the APO. We only need to check the
// opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
// RHS operand. The LHS operand of both add and sub is never attached
// to an inverse operation in the linearized form, therefore its APO
// is false. The RHS is true only if VL[Lane] is an inverse operation.
// Since operand reordering is performed on groups of commutative
// operations or alternating sequences (e.g., +, -), we can safely
// tell the inverse operations by checking commutativity.
bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
bool APO = (OpIdx == 0) ? false : IsInverseOperation;
// Store {value, APO, IsUsed = false} for this operand/lane slot.
OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
APO, false};
/// \returns the number of operands.
unsigned getNumOperands() const {
  // OpsVec is indexed by operand first, so its outer size is the operand
  // count.
  return OpsVec.size();
}
/// \returns the number of lanes.
unsigned getNumLanes() const {
  // The lane count is read from the first operand's row.
  return OpsVec.front().size();
}
/// \returns the operand value at \p OpIdx and \p Lane.
Value *getValue(unsigned OpIdx, unsigned Lane) const {
  // Look up the slot, then hand back only its value.
  const OperandData &Slot = getData(OpIdx, Lane);
  return Slot.V;
}
/// \returns true if the data structure is empty.
bool empty() const {
  // Empty means no operand rows have been stored.
  return OpsVec.empty();
}
/// Clears the data.
void clear() {
  // Drop every stored operand row.
  OpsVec.clear();
}
/// \returns true if there are enough operands identical to \p Op to fill
/// the whole vector.
/// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
// Only operands with the same APO as the slot at (OpIdx, Lane) are
// considered.
bool OpAPO = getData(OpIdx, Lane).APO;
for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
// NOTE(review): the body of this `if` (presumably `continue;`, to skip the
// lane Op came from) is not visible in this copy of the file.
if (Ln == Lane)
// This is set to true if we found a candidate for broadcast at Lane.
bool FoundCandidate = false;
for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
OperandData &Data = getData(OpI, Ln);
// NOTE(review): the body of this `if` (presumably `continue;`) is not
// visible in this copy of the file.
if (Data.APO != OpAPO || Data.IsUsed)
// A matching operand is marked as used here.
if (Data.V == Op) {
FoundCandidate = true;
Data.IsUsed = true;
// One lane without a match is enough to rule out a broadcast.
if (!FoundCandidate)
return false;
return true;
/// Initialize with all the operands of the instruction vector \p RootVL.
VLOperands(ArrayRef<Value *> RootVL, const TargetLibraryInfo &TLI,
const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R)
: TLI(TLI), DL(DL), SE(SE), R(R) {
// Append all the operands of RootVL.
// NOTE(review): the statement the comment above describes is not visible in
// this copy of the file; presumably a call to appendOperandsOfVL(RootVL) --
// confirm against upstream.
/// \returns a value vector with the operands across all lanes for the
/// operand at \p OpIdx.
ValueList getVL(unsigned OpIdx) const {
  // Row of per-lane operand data for the requested operand index.
  const OperandDataVec &Row = OpsVec[OpIdx];
  ValueList OpVL(Row.size());
  assert(Row.size() == getNumLanes() &&
         "Expected same num of lanes across all operands");
  // Copy just the value out of every lane's slot.
  const unsigned Lanes = getNumLanes();
  unsigned Lane = 0;
  while (Lane != Lanes) {
    OpVL[Lane] = Row[Lane].V;
    ++Lane;
  }
  return OpVL;
}
// Performs operand reordering for 2 or more operands.
// The original operands are in OrigOps[OpIdx][Lane].
// The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
// NOTE(review): this copy of the file has lost several statements inside
// this function (bodies of single-line `if`s, `else` keywords, a few calls).
// Each visible gap is flagged inline below; restore from upstream.
void reorder() {
unsigned NumOperands = getNumOperands();
unsigned NumLanes = getNumLanes();
// Each operand has its own mode. We are using this mode to help us select
// the instructions for each lane, so that they match best with the ones
// we have selected so far.
SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
// This is a greedy single-pass algorithm. We are going over each lane
// once and deciding on the best order right away with no back-tracking.
// However, in order to increase its effectiveness, we start with the lane
// that has operands that can move the least. For example, given the
// following lanes:
// Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
// Lane 1 : A[1] = C[1] - B[1] // Visited 1st
// Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
// Lane 3 : A[3] = C[3] - B[3] // Visited 4th
// we will start at Lane 1, since the operands of the subtraction cannot
// be reordered. Then we will visit the rest of the lanes in a circular
// fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
// Find the first lane that we will start our search from.
unsigned FirstLane = getBestLaneToStartReordering();
// Initialize the modes.
// The mode of each operand index is derived from the kind of value found in
// FirstLane.
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
Value *OpLane0 = getValue(OpIdx, FirstLane);
// Keep track if we have instructions with all the same opcode on one
// side.
if (isa<LoadInst>(OpLane0))
ReorderingModes[OpIdx] = ReorderingMode::Load;
else if (isa<Instruction>(OpLane0)) {
// Check if OpLane0 should be broadcast.
// NOTE(review): as written, the Opcode assignment below always overwrites
// the Splat one; an `else` between them appears to be missing.
if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
ReorderingModes[OpIdx] = ReorderingMode::Splat;
ReorderingModes[OpIdx] = ReorderingMode::Opcode;
else if (isa<Constant>(OpLane0))
ReorderingModes[OpIdx] = ReorderingMode::Constant;
else if (isa<Argument>(OpLane0))
// Our best hope is a Splat. It may save some cost in some cases.
ReorderingModes[OpIdx] = ReorderingMode::Splat;
// NOTE: This should be unreachable.
// NOTE(review): the `else` that should introduce the Failed assignment
// below is not visible in this copy of the file.
ReorderingModes[OpIdx] = ReorderingMode::Failed;
// Check that we don't have same operands. No need to reorder if operands
// are just perfect diamond or shuffled diamond match. Do not do it only
// for possible broadcasts or non-power of 2 number of scalars (just for
// now).
auto &&SkipReordering = [this]() {
SmallPtrSet<Value *, 4> UniqueValues;
ArrayRef<OperandData> Op0 = OpsVec.front();
// NOTE(review): the body of the loop over Op0 is not visible in this copy
// of the file; presumably it inserts each Data.V into UniqueValues.
for (const OperandData &Data : Op0)
// Every other operand row must contain only values already seen; otherwise
// reordering is not skipped.
for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
if (any_of(Op, [&UniqueValues](const OperandData &Data) {
return !UniqueValues.contains(Data.V);
return false;
// TODO: Check if we can remove a check for non-power-2 number of
// scalars after full support of non-power-2 vectorization.
return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
// If the initial strategy fails for any of the operand indexes, then we
// perform reordering again in a second pass. This helps avoid assigning
// high priority to the failed strategy, and should improve reordering for
// the non-failed operand indexes.
for (int Pass = 0; Pass != 2; ++Pass) {
// Check if no need to reorder operands since they are perfect or
// shuffled diamond match.
// Need to do it to avoid extra external use cost counting for
// shuffled matches, which may cause regressions.
// NOTE(review): the body of this `if` (presumably `break;`) is not visible
// in this copy of the file.
if (SkipReordering())
// Skip the second pass if the first pass did not fail.
bool StrategyFailed = false;
// Mark all operand data as free to use.
// NOTE(review): the call that the comment above describes is not visible
// in this copy of the file.
// We keep the original operand order for the FirstLane, so reorder the
// rest of the lanes. We are visiting the nodes in a circular fashion,
// using FirstLane as the center point and increasing the radius
// distance.
// MainAltOps[I] starts with the FirstLane value of operand I.
SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
for (unsigned I = 0; I < NumOperands; ++I)
MainAltOps[I].push_back(getData(I, FirstLane).V);
for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
// Visit the lane on the right and then the lane on the left.
for (int Direction : {+1, -1}) {
int Lane = FirstLane + Direction * Distance;
// NOTE(review): the body of this bounds check (presumably `continue;`) is
// not visible in this copy of the file.
if (Lane < 0 || Lane >= (int)NumLanes)
// LastLane is the neighbouring lane one step closer to FirstLane.
int LastLane = Lane - Direction;
assert(LastLane >= 0 && LastLane < (int)NumLanes &&
"Out of bounds");
// Look for a good match for each operand.
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
// Search for the operand that matches SortedOps[OpIdx][Lane-1].
std::optional<unsigned> BestIdx = getBestOperand(
OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
// By not selecting a value, we allow the operands that follow to
// select a better matching value. We will get a non-null value in
// the next run of getBestOperand().
if (BestIdx) {
// Swap the current operand with the one returned by
// getBestOperand().
swap(OpIdx, *BestIdx, Lane);
} else {
// We failed to find a best operand, set mode to 'Failed'.
ReorderingModes[OpIdx] = ReorderingMode::Failed;
// Enable the second pass.
StrategyFailed = true;
// Try to get the alternate opcode and follow it during analysis.
if (MainAltOps[OpIdx].size() != 2) {
OperandData &AltOp = getData(OpIdx, Lane);
InstructionsState OpS =
getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
// NOTE(review): the body of this `if` is not visible in this copy of the
// file; presumably it appends AltOp.V to MainAltOps[OpIdx].
if (OpS.getOpcode() && OpS.isAltShuffle())
// Skip second pass if the strategy did not fail.
// NOTE(review): the body of this `if` (presumably `break;`) is not visible
// in this copy of the file.
if (!StrategyFailed)
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \returns the printable name of \p RMode.
LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
  // Each known mode returns straight from its case; falling out of the
  // switch means RMode holds a value this function does not handle.
  switch (RMode) {
  case ReorderingMode::Load:     return "Load";
  case ReorderingMode::Opcode:   return "Opcode";
  case ReorderingMode::Constant: return "Constant";
  case ReorderingMode::Splat:    return "Splat";
  case ReorderingMode::Failed:   return "Failed";
  }
  llvm_unreachable("Unimplemented Reordering Type");
}
/// Writes the name of \p RMode to \p OS and returns \p OS.
LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
                                               raw_ostream &OS) {
  OS << getModeStr(RMode);
  return OS;
}
/// Debug print.
LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
  // Send the mode name to the debug stream.
  raw_ostream &DebugOS = dbgs();
  printMode(RMode, DebugOS);
}
/// Stream insertion for a reordering mode; forwards to printMode().
friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
  raw_ostream &Result = printMode(RMode, OS);
  return Result;
}
// Prints every operand row as "Operand <n>" followed by one indented
// "{<value>, APO:<flag>}" line per lane, and returns OS.
LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
const unsigned Indent = 2;
unsigned Cnt = 0;
for (const OperandDataVec &OpDataVec : OpsVec) {
OS << "Operand " << Cnt++ << "\n";
for (const OperandData &OpData : OpDataVec) {
OS.indent(Indent) << "{";
// NOTE(review): as written, "null" is printed after every value; an `else`
// before the second statement appears to be missing in this copy.
if (Value *V = OpData.V)
OS << *V;
OS << "null";
OS << ", APO:" << OpData.APO << "}\n";
OS << "\n";
return OS;
/// Debug print.
LLVM_DUMP_METHOD void dump() const {
  // Print to the debug stream; the returned stream is not needed.
  raw_ostream &DebugOS = dbgs();
  print(DebugOS);
}
/// Evaluate each pair in \p Candidates and return index into \p Candidates
/// for a pair which have highest score deemed to have best chance to form
/// root of profitable tree to vectorize. Return std::nullopt if no candidate
/// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit
/// of the cost, considered to be good enough score.
// NOTE(review): the return-type line is not visible in this copy of the
// file; the function returns the local `Index`, a std::optional<int>.
findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
int Limit = LookAheadHeuristics::ScoreFail) {
// NOTE(review): the constructor call below is cut off after
// `/*NumLanes=*/2,` in this copy of the file; the remaining argument(s) and
// closing parenthesis are not visible.
LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
// A candidate must score strictly above Limit to be selected at all.
int BestScore = Limit;
std::optional<int> Index;
for (int I : seq<int>(0, Candidates.size())) {
// NOTE(review): only Candidates[I].first is passed before the U1/U2
// arguments here; the second value of the pair appears to be missing from
// the argument list in this copy of the file.
int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
/*U1=*/nullptr, /*U2=*/nullptr,
/*Level=*/1, std::nullopt);
// Strict comparison: on equal scores the earlier candidate is kept.
if (Score > BestScore) {
BestScore = Score;
Index = I;
return Index;
/// Checks if the instruction is marked for deletion.
bool isDeleted(Instruction *I) const {
  // An instruction counts as deleted once it is in DeletedInstructions.
  return DeletedInstructions.count(I) != 0;
}
/// Removes an instruction from its block and eventually deletes it.
/// It's like Instruction::eraseFromParent() except that the actual deletion
/// is delayed until BoUpSLP is destructed.
// NOTE(review): the body of this function is not visible in this copy of
// the file. isDeleted() queries a DeletedInstructions set; presumably this
// function inserts I into it -- confirm against upstream.
void eraseInstruction(Instruction *I) {
/// Checks if the instruction was already analyzed for being possible
/// reduction root.
bool isAnalyzedReductionRoot(Instruction *I) const {
  // Membership test on the set of already-analyzed reduction roots.
  return AnalyzedReductionsRoots.count(I) != 0;
}
/// Register given instruction as already analyzed for being possible
/// reduction root.
// NOTE(review): the body of this function is not visible in this copy of
// the file. isAnalyzedReductionRoot() queries AnalyzedReductionsRoots;
// presumably this function inserts I into it -- confirm against upstream.
void analyzedReductionRoot(Instruction *I) {
/// Checks if the provided list of reduced values was checked already for
/// vectorization.
bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
  // Lists are tracked by the hash of their contents, not by the values
  // themselves.
  return AnalyzedReductionVals.contains(
      hash_value(VL));
}
/// Adds the list of reduced values to list of already checked values for the
/// vectorization.
// NOTE(review): the body of this function is not visible in this copy of
// the file. areAnalyzedReductionVals() looks up hash_value(VL) in
// AnalyzedReductionVals; presumably this function inserts that hash --
// confirm against upstream.
void analyzedReductionVals(ArrayRef<Value *> VL) {
/// Clear the list of the analyzed reduction root instructions.
// NOTE(review): the body of this function is not visible in this copy of
// the file, so which containers it clears cannot be confirmed from here.
void clearReductionData() {
/// Checks if the given value is gathered in one of the nodes.
bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
/// Check if the value is vectorized in the tree.
bool isVectorized(Value *V) const {
  // A value is vectorized when a tree entry exists for it.
  return getTreeEntry(V) != nullptr;
}
/// Check if the operands on the edges \p Edges of the \p UserTE allow
/// reordering (i.e. the operands can be reordered because they have only one
/// user and are reorderable).
/// \param ReorderableGathers List of all gather nodes that require reordering
/// (e.g., gather of extractelements or partially vectorizable loads).
/// \param GatherOps List of gather operand nodes for \p UserTE that require
/// reordering, subset of \p NonVectorized.
canReorderOperands(TreeEntry *UserTE,
SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
ArrayRef<TreeEntry *> ReorderableGathers,
SmallVectorImpl<TreeEntry *> &GatherOps);
/// Checks if the given \p TE is a gather node with clustered reused scalars
/// and reorders it per given \p Mask.
void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
/// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
/// if any. If it is not vectorized (gather node), returns nullptr.
TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
  ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);

  // Finds the tree entry for V that is used through edge (UserTE, OpIdx),
  // or null if there is none. The primary entry of V is tried first, then
  // any additional entries recorded for V in MultiNodeScalars.
  auto FindEntryForEdge = [&](Value *V) -> TreeEntry * {
    TreeEntry *Primary = getTreeEntry(V);
    if (Primary &&
        is_contained(Primary->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
      return Primary;
    auto MultiIt = MultiNodeScalars.find(V);
    if (MultiIt == MultiNodeScalars.end())
      return nullptr;
    for (TreeEntry *Candidate : MultiIt->second)
      if (is_contained(Candidate->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
        return Candidate;
    return nullptr;
  };

  // The first scalar with a matching entry decides the result.
  for (Value *V : VL) {
    TreeEntry *TE = FindEntryForEdge(V);
    if (!TE)
      continue;
    assert(TE->isSame(VL) && "Expected same scalars.");
    return TE;
  }
  return nullptr;
}
/// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
/// if any. If it is not vectorized (gather node), returns nullptr.
const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                      unsigned OpIdx) const {
  // Forward to the non-const overload; constness is re-applied on the
  // returned pointer.
  auto *MutableThis = const_cast<BoUpSLP *>(this);
  auto *MutableUser = const_cast<TreeEntry *>(UserTE);
  return MutableThis->getVectorizedOperand(MutableUser, OpIdx);
}
/// Checks if all users of \p I are the part of the vectorization tree.
bool areAllUsersVectorized(
Instruction *I,
const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
/// Return information about the vector formed for the specified index
/// of a vector of (the same) instruction.
TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
/// \returns the cost of the vectorizable entry.
InstructionCost getEntryCost(const TreeEntry *E,
ArrayRef<Value *> VectorizedVals,
SmallPtrSetImpl<Value *> &CheckedExtracts);
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
const EdgeInfo &EI);
/// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
/// be vectorized to use the original vector (or aggregate "bitcast" to a
/// vector) and sets \p CurrentOrder to the identity permutation; otherwise
/// returns false, setting \p CurrentOrder to either an empty vector or a
/// non-identity permutation that allows to reuse extract instructions.
/// \param ResizeAllowed indicates whether it is allowed to handle subvector
/// extract order.
bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
SmallVectorImpl<unsigned> &CurrentOrder,
bool ResizeAllowed = false) const;
/// Vectorize a single entry in the tree.
Value *vectorizeTree(TreeEntry *E);
/// Vectorize a single entry in the tree, the \p NodeIdx-th operand of the
/// entry \p E.
Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
/// Create a new vector from a list of scalar values. Produces a sequence
/// which exploits values reused across lanes, and arranges the inserts
/// for ease of later optimization.
template <typename BVTy, typename ResTy, typename... Args>
ResTy processBuildVector(const TreeEntry *E, Args &...Params);
/// Create a new vector from a list of scalar values. Produces a sequence
/// which exploits values reused across lanes, and arranges the inserts
/// for ease of later optimization.
Value *createBuildVector(const TreeEntry *E);
/// Returns the instruction in the bundle, which can be used as a base point
/// for scheduling. Usually it is the last instruction in the bundle, except
/// for the case when all operands are external (in this case, it is the first
/// instruction in the list).
Instruction &getLastInstructionInBundle(const TreeEntry *E);
/// Checks if the gathered \p VL can be represented as shuffle(s) of previous
/// tree entries.
/// \param TE Tree entry checked for permutation.
/// \param VL List of scalars (a subset of the TE scalar), checked for
/// permutations.
/// \returns ShuffleKind, if gathered values can be represented as shuffles of
/// previous tree entries. \p Mask is filled with the shuffle mask.
isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
SmallVectorImpl<int> &Mask,
SmallVectorImpl<const TreeEntry *> &Entries);
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
/// \param ForPoisonSrc true if initial vector is poison, false otherwise.
InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const;
/// Set the Builder insert point to one after the last instruction in
/// the bundle
void setInsertPointAfterBundle(const TreeEntry *E);
/// \returns a vector from a collection of scalars in \p VL. if \p Root is not
/// specified, the starting vector value is poison.
Value *gather(ArrayRef<Value *> VL, Value *Root);
/// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even if the tree height is tiny.
bool isFullyVectorizableTinyTree(bool ForReduction) const;
/// Reorder commutative or alt operands to get better probability of
/// generating vectorized code.
static void reorderInputsAccordingToOpcode(
ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right, const TargetLibraryInfo &TLI,
const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R);
/// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
/// users of \p TE and collects the stores. It returns the map from the store
/// pointers to the collected stores.
DenseMap<Value *, SmallVector<StoreInst *>>
collectUserStores(const BoUpSLP::TreeEntry *TE) const;
/// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
/// stores in \p StoresVec can form a vector instruction. If so it returns
/// true and populates \p ReorderIndices with the shuffle indices of the
/// stores when compared to the sorted vector.
bool canFormVector(ArrayRef<StoreInst *> StoresVec,
OrdersType &ReorderIndices) const;
/// Iterates through the users of \p TE, looking for scalar stores that can be
/// potentially vectorized in a future SLP-tree. If found, it keeps track of
/// their order and builds an order index vector for each store bundle. It
/// returns all these order vectors found.
/// We run this after the tree has formed, otherwise we may come across user
/// instructions that are not yet in the tree.
SmallVector<OrdersType, 1>
findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
struct TreeEntry {
using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
TreeEntry(VecTreeTy &Container)
    : Container(Container) {
  // Nothing else to do: only the Container member is initialized here.
}
/// \returns Common mask for reorder indices and reused scalars.
SmallVector<int> getCommonMask() const {
  // Start from the inverse of the reorder permutation, then fold the reuse
  // shuffle indices on top of it.
  SmallVector<int> CommonMask;
  inversePermutation(ReorderIndices, CommonMask);
  ::addMask(CommonMask, ReuseShuffleIndices);
  return CommonMask;
}
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
// Compares VL against Scalars as seen through Mask. When Mask and VL differ
// in size but VL and Scalars have the same size, the two lists are compared
// element by element. Otherwise VL and Mask must have the same size and
// each VL[i] must either be an undef paired with a poison mask element, or
// equal Scalars[Mask[i]] for a non-poison mask element.
auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
if (Mask.size() != VL.size() && VL.size() == Scalars.size())
return std::equal(VL.begin(), VL.end(), Scalars.begin());
return VL.size() == Mask.size() &&
std::equal(VL.begin(), VL.end(), Mask.begin(),
[Scalars](Value *V, int Idx) {
return (isa<UndefValue>(V) &&
Idx == PoisonMaskElem) ||
(Idx != PoisonMaskElem && V == Scalars[Idx]);
// With a reorder in effect, compare through the inverse permutation, and
// additionally through the reuse indices when VL has that size.
if (!ReorderIndices.empty()) {
// TODO: implement matching if the nodes are just reordered, still can
// treat the vector as the same if the list of scalars matches VL
// directly, without reordering.
SmallVector<int> Mask;
inversePermutation(ReorderIndices, Mask);
if (VL.size() == Scalars.size())
return IsSame(Scalars, Mask);
if (VL.size() == ReuseShuffleIndices.size()) {
::addMask(Mask, ReuseShuffleIndices);
return IsSame(Scalars, Mask);
return false;
// No reorder: compare through the reuse indices alone.
return IsSame(Scalars, ReuseShuffleIndices);
bool isOperandGatherNode(const EdgeInfo &UserEI) const {
return State == TreeEntry::NeedToGather &&
UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
UserTreeIndices.front().UserTE == UserEI.UserTE;
/// \returns true if current entry has same operands as \p TE.
bool hasEqualOperands(const TreeEntry &TE) const {
if (TE.getNumOperands() != getNumOperands())
return false;
// Used has one bit per operand of this entry.
SmallBitVector Used(getNumOperands());
// For every operand I of TE, look for an operand K of this entry that
// compares equal to it.
for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
unsigned PrevCount = Used.count();
for (unsigned K = 0; K < E; ++K) {
// NOTE(review): the body of this `if` (presumably `continue;`) is not
// visible in this copy of the file.
if (Used.test(K))
// NOTE(review): nothing visible in this block sets a bit in Used, so the
// count check below would always fail as written; the statements that mark
// K as used and leave the inner loop appear to be missing.
if (getOperand(K) == TE.getOperand(I)) {
// Check if we actually found the matching operand.
if (PrevCount == Used.count())
return false;
return true;
/// \return Final vectorization factor for the node. Defined by the total