blob: 0ccef5aecd28b7a966c498c094aa2241630b11f4 [file] [log] [blame]
//===- BottomUpVec.cpp - A bottom-up vectorizer pass ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/SandboxIR/Function.h"
#include "llvm/SandboxIR/Instruction.h"
#include "llvm/SandboxIR/Module.h"
#include "llvm/SandboxIR/Region.h"
#include "llvm/SandboxIR/Utils.h"
#include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.h"
#include "llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h"
#include "llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h"
namespace llvm {
static cl::opt<unsigned>
OverrideVecRegBits("sbvec-vec-reg-bits", cl::init(0), cl::Hidden,
cl::desc("Override the vector register size in bits, "
"which is otherwise found by querying TTI."));
static cl::opt<bool>
AllowNonPow2("sbvec-allow-non-pow2", cl::init(false), cl::Hidden,
cl::desc("Allow non-power-of-2 vectorization."));
#ifndef NDEBUG
static cl::opt<bool>
AlwaysVerify("sbvec-always-verify", cl::init(false), cl::Hidden,
cl::desc("Helps find bugs by verifying the IR whenever we "
"emit new instructions (*very* expensive)."));
#endif // NDEBUG
namespace sandboxir {
BottomUpVec::BottomUpVec(StringRef Pipeline)
: FunctionPass("bottom-up-vec"),
RPM("rpm", Pipeline, SandboxVectorizerPassBuilder::createRegionPass) {}
static SmallVector<Value *, 4> getOperand(ArrayRef<Value *> Bndl,
unsigned OpIdx) {
SmallVector<Value *, 4> Operands;
for (Value *BndlV : Bndl) {
auto *BndlI = cast<Instruction>(BndlV);
Operands.push_back(BndlI->getOperand(OpIdx));
}
return Operands;
}
/// \Returns the BB iterator after the lowest instruction in \p Vals, or the top
/// of BB if no instruction found in \p Vals.
static BasicBlock::iterator getInsertPointAfterInstrs(ArrayRef<Value *> Vals,
BasicBlock *BB) {
auto *BotI = VecUtils::getLastPHIOrSelf(VecUtils::getLowest(Vals, BB));
if (BotI == nullptr)
// We are using BB->begin() (or after PHIs) as the fallback insert point.
return BB->empty()
? BB->begin()
: std::next(
VecUtils::getLastPHIOrSelf(&*BB->begin())->getIterator());
return std::next(BotI->getIterator());
}
Value *BottomUpVec::createVectorInstr(ArrayRef<Value *> Bndl,
ArrayRef<Value *> Operands) {
auto CreateVectorInstr = [](ArrayRef<Value *> Bndl,
ArrayRef<Value *> Operands) -> Value * {
assert(all_of(Bndl, [](auto *V) { return isa<Instruction>(V); }) &&
"Expect Instructions!");
auto &Ctx = Bndl[0]->getContext();
Type *ScalarTy = VecUtils::getElementType(Utils::getExpectedType(Bndl[0]));
auto *VecTy = VecUtils::getWideType(ScalarTy, VecUtils::getNumLanes(Bndl));
BasicBlock::iterator WhereIt = getInsertPointAfterInstrs(
Bndl, cast<Instruction>(Bndl[0])->getParent());
auto Opcode = cast<Instruction>(Bndl[0])->getOpcode();
switch (Opcode) {
case Instruction::Opcode::ZExt:
case Instruction::Opcode::SExt:
case Instruction::Opcode::FPToUI:
case Instruction::Opcode::FPToSI:
case Instruction::Opcode::FPExt:
case Instruction::Opcode::PtrToInt:
case Instruction::Opcode::IntToPtr:
case Instruction::Opcode::SIToFP:
case Instruction::Opcode::UIToFP:
case Instruction::Opcode::Trunc:
case Instruction::Opcode::FPTrunc:
case Instruction::Opcode::BitCast: {
assert(Operands.size() == 1u && "Casts are unary!");
return CastInst::create(VecTy, Opcode, Operands[0], WhereIt, Ctx,
"VCast");
}
case Instruction::Opcode::FCmp:
case Instruction::Opcode::ICmp: {
auto Pred = cast<CmpInst>(Bndl[0])->getPredicate();
assert(all_of(drop_begin(Bndl),
[Pred](auto *SBV) {
return cast<CmpInst>(SBV)->getPredicate() == Pred;
}) &&
"Expected same predicate across bundle.");
return CmpInst::create(Pred, Operands[0], Operands[1], WhereIt, Ctx,
"VCmp");
}
case Instruction::Opcode::Select: {
return SelectInst::create(Operands[0], Operands[1], Operands[2], WhereIt,
Ctx, "Vec");
}
case Instruction::Opcode::FNeg: {
auto *UOp0 = cast<UnaryOperator>(Bndl[0]);
auto OpC = UOp0->getOpcode();
return UnaryOperator::createWithCopiedFlags(OpC, Operands[0], UOp0,
WhereIt, Ctx, "Vec");
}
case Instruction::Opcode::Add:
case Instruction::Opcode::FAdd:
case Instruction::Opcode::Sub:
case Instruction::Opcode::FSub:
case Instruction::Opcode::Mul:
case Instruction::Opcode::FMul:
case Instruction::Opcode::UDiv:
case Instruction::Opcode::SDiv:
case Instruction::Opcode::FDiv:
case Instruction::Opcode::URem:
case Instruction::Opcode::SRem:
case Instruction::Opcode::FRem:
case Instruction::Opcode::Shl:
case Instruction::Opcode::LShr:
case Instruction::Opcode::AShr:
case Instruction::Opcode::And:
case Instruction::Opcode::Or:
case Instruction::Opcode::Xor: {
auto *BinOp0 = cast<BinaryOperator>(Bndl[0]);
auto *LHS = Operands[0];
auto *RHS = Operands[1];
return BinaryOperator::createWithCopiedFlags(
BinOp0->getOpcode(), LHS, RHS, BinOp0, WhereIt, Ctx, "Vec");
}
case Instruction::Opcode::Load: {
auto *Ld0 = cast<LoadInst>(Bndl[0]);
Value *Ptr = Ld0->getPointerOperand();
return LoadInst::create(VecTy, Ptr, Ld0->getAlign(), WhereIt, Ctx,
"VecL");
}
case Instruction::Opcode::Store: {
auto Align = cast<StoreInst>(Bndl[0])->getAlign();
Value *Val = Operands[0];
Value *Ptr = Operands[1];
return StoreInst::create(Val, Ptr, Align, WhereIt, Ctx);
}
case Instruction::Opcode::Br:
case Instruction::Opcode::Ret:
case Instruction::Opcode::PHI:
case Instruction::Opcode::AddrSpaceCast:
case Instruction::Opcode::Call:
case Instruction::Opcode::GetElementPtr:
llvm_unreachable("Unimplemented");
break;
default:
llvm_unreachable("Unimplemented");
break;
}
llvm_unreachable("Missing switch case!");
// TODO: Propagate debug info.
};
auto *VecI = CreateVectorInstr(Bndl, Operands);
if (VecI != nullptr) {
Change = true;
IMaps->registerVector(Bndl, VecI);
}
return VecI;
}
void BottomUpVec::tryEraseDeadInstrs() {
DenseMap<BasicBlock *, SmallVector<Instruction *>> SortedDeadInstrCandidates;
// The dead instrs could span BBs, so we need to collect and sort them per BB.
for (auto *DeadI : DeadInstrCandidates)
SortedDeadInstrCandidates[DeadI->getParent()].push_back(DeadI);
for (auto &Pair : SortedDeadInstrCandidates)
sort(Pair.second,
[](Instruction *I1, Instruction *I2) { return I1->comesBefore(I2); });
for (const auto &Pair : SortedDeadInstrCandidates) {
for (Instruction *I : reverse(Pair.second)) {
if (I->hasNUses(0))
// Erase the dead instructions bottom-to-top.
I->eraseFromParent();
}
}
DeadInstrCandidates.clear();
}
Value *BottomUpVec::createShuffle(Value *VecOp, const ShuffleMask &Mask,
BasicBlock *UserBB) {
BasicBlock::iterator WhereIt = getInsertPointAfterInstrs({VecOp}, UserBB);
return ShuffleVectorInst::create(VecOp, VecOp, Mask, WhereIt,
VecOp->getContext(), "VShuf");
}
Value *BottomUpVec::createPack(ArrayRef<Value *> ToPack, BasicBlock *UserBB) {
BasicBlock::iterator WhereIt = getInsertPointAfterInstrs(ToPack, UserBB);
Type *ScalarTy = VecUtils::getCommonScalarType(ToPack);
unsigned Lanes = VecUtils::getNumLanes(ToPack);
Type *VecTy = VecUtils::getWideType(ScalarTy, Lanes);
// Create a series of pack instructions.
Value *LastInsert = PoisonValue::get(VecTy);
Context &Ctx = ToPack[0]->getContext();
unsigned InsertIdx = 0;
for (Value *Elm : ToPack) {
// An element can be either scalar or vector. We need to generate different
// IR for each case.
if (Elm->getType()->isVectorTy()) {
unsigned NumElms =
cast<FixedVectorType>(Elm->getType())->getNumElements();
for (auto ExtrLane : seq<int>(0, NumElms)) {
// We generate extract-insert pairs, for each lane in `Elm`.
Constant *ExtrLaneC =
ConstantInt::getSigned(Type::getInt32Ty(Ctx), ExtrLane);
// This may return a Constant if Elm is a Constant.
auto *ExtrI =
ExtractElementInst::create(Elm, ExtrLaneC, WhereIt, Ctx, "VPack");
if (!isa<Constant>(ExtrI))
WhereIt = std::next(cast<Instruction>(ExtrI)->getIterator());
Constant *InsertLaneC =
ConstantInt::getSigned(Type::getInt32Ty(Ctx), InsertIdx++);
// This may also return a Constant if ExtrI is a Constant.
auto *InsertI = InsertElementInst::create(
LastInsert, ExtrI, InsertLaneC, WhereIt, Ctx, "VPack");
if (!isa<Constant>(InsertI)) {
LastInsert = InsertI;
WhereIt = std::next(cast<Instruction>(LastInsert)->getIterator());
}
}
} else {
Constant *InsertLaneC =
ConstantInt::getSigned(Type::getInt32Ty(Ctx), InsertIdx++);
// This may be folded into a Constant if LastInsert is a Constant. In
// that case we only collect the last constant.
LastInsert = InsertElementInst::create(LastInsert, Elm, InsertLaneC,
WhereIt, Ctx, "Pack");
if (auto *NewI = dyn_cast<Instruction>(LastInsert))
WhereIt = std::next(NewI->getIterator());
}
}
return LastInsert;
}
void BottomUpVec::collectPotentiallyDeadInstrs(ArrayRef<Value *> Bndl) {
for (Value *V : Bndl)
DeadInstrCandidates.insert(cast<Instruction>(V));
// Also collect the GEPs of vectorized loads and stores.
auto Opcode = cast<Instruction>(Bndl[0])->getOpcode();
switch (Opcode) {
case Instruction::Opcode::Load: {
for (Value *V : drop_begin(Bndl))
if (auto *Ptr =
dyn_cast<Instruction>(cast<LoadInst>(V)->getPointerOperand()))
DeadInstrCandidates.insert(Ptr);
break;
}
case Instruction::Opcode::Store: {
for (Value *V : drop_begin(Bndl))
if (auto *Ptr =
dyn_cast<Instruction>(cast<StoreInst>(V)->getPointerOperand()))
DeadInstrCandidates.insert(Ptr);
break;
}
default:
break;
}
}
Value *BottomUpVec::vectorizeRec(ArrayRef<Value *> Bndl,
ArrayRef<Value *> UserBndl, unsigned Depth) {
Value *NewVec = nullptr;
auto *UserBB = !UserBndl.empty()
? cast<Instruction>(UserBndl.front())->getParent()
: cast<Instruction>(Bndl[0])->getParent();
const auto &LegalityRes = Legality->canVectorize(Bndl);
switch (LegalityRes.getSubclassID()) {
case LegalityResultID::Widen: {
auto *I = cast<Instruction>(Bndl[0]);
SmallVector<Value *, 2> VecOperands;
switch (I->getOpcode()) {
case Instruction::Opcode::Load:
// Don't recurse towards the pointer operand.
VecOperands.push_back(cast<LoadInst>(I)->getPointerOperand());
break;
case Instruction::Opcode::Store: {
// Don't recurse towards the pointer operand.
auto *VecOp = vectorizeRec(getOperand(Bndl, 0), Bndl, Depth + 1);
VecOperands.push_back(VecOp);
VecOperands.push_back(cast<StoreInst>(I)->getPointerOperand());
break;
}
default:
// Visit all operands.
for (auto OpIdx : seq<unsigned>(I->getNumOperands())) {
auto *VecOp = vectorizeRec(getOperand(Bndl, OpIdx), Bndl, Depth + 1);
VecOperands.push_back(VecOp);
}
break;
}
NewVec = createVectorInstr(Bndl, VecOperands);
// Collect any potentially dead scalar instructions, including the original
// scalars and pointer operands of loads/stores.
if (NewVec != nullptr)
collectPotentiallyDeadInstrs(Bndl);
break;
}
case LegalityResultID::DiamondReuse: {
NewVec = cast<DiamondReuse>(LegalityRes).getVector();
break;
}
case LegalityResultID::DiamondReuseWithShuffle: {
auto *VecOp = cast<DiamondReuseWithShuffle>(LegalityRes).getVector();
const ShuffleMask &Mask =
cast<DiamondReuseWithShuffle>(LegalityRes).getMask();
NewVec = createShuffle(VecOp, Mask, UserBB);
assert(NewVec->getType() == VecOp->getType() &&
"Expected same type! Bad mask ?");
break;
}
case LegalityResultID::DiamondReuseMultiInput: {
const auto &Descr =
cast<DiamondReuseMultiInput>(LegalityRes).getCollectDescr();
Type *ResTy = VecUtils::getWideType(Bndl[0]->getType(), Bndl.size());
// TODO: Try to get WhereIt without creating a vector.
SmallVector<Value *, 4> DescrInstrs;
for (const auto &ElmDescr : Descr.getDescrs()) {
if (auto *I = dyn_cast<Instruction>(ElmDescr.getValue()))
DescrInstrs.push_back(I);
}
BasicBlock::iterator WhereIt =
getInsertPointAfterInstrs(DescrInstrs, UserBB);
Value *LastV = PoisonValue::get(ResTy);
unsigned Lane = 0;
for (const auto &ElmDescr : Descr.getDescrs()) {
Value *VecOp = ElmDescr.getValue();
Context &Ctx = VecOp->getContext();
Value *ValueToInsert;
if (ElmDescr.needsExtract()) {
ConstantInt *IdxC =
ConstantInt::get(Type::getInt32Ty(Ctx), ElmDescr.getExtractIdx());
ValueToInsert = ExtractElementInst::create(VecOp, IdxC, WhereIt,
VecOp->getContext(), "VExt");
} else {
ValueToInsert = VecOp;
}
auto NumLanesToInsert = VecUtils::getNumLanes(ValueToInsert);
if (NumLanesToInsert == 1) {
// If we are inserting a scalar element then we need a single insert.
// %VIns = insert %DstVec, %SrcScalar, Lane
ConstantInt *LaneC = ConstantInt::get(Type::getInt32Ty(Ctx), Lane);
LastV = InsertElementInst::create(LastV, ValueToInsert, LaneC, WhereIt,
Ctx, "VIns");
} else {
// If we are inserting a vector element then we need to extract and
// insert each vector element one by one with a chain of extracts and
// inserts, for example:
// %VExt0 = extract %SrcVec, 0
// %VIns0 = insert %DstVec, %Vect0, Lane + 0
// %VExt1 = extract %SrcVec, 1
// %VIns1 = insert %VIns0, %Vect0, Lane + 1
for (unsigned LnCnt = 0; LnCnt != NumLanesToInsert; ++LnCnt) {
auto *ExtrIdxC = ConstantInt::get(Type::getInt32Ty(Ctx), LnCnt);
auto *ExtrI = ExtractElementInst::create(ValueToInsert, ExtrIdxC,
WhereIt, Ctx, "VExt");
unsigned InsLane = Lane + LnCnt;
auto *InsLaneC = ConstantInt::get(Type::getInt32Ty(Ctx), InsLane);
LastV = InsertElementInst::create(LastV, ExtrI, InsLaneC, WhereIt,
Ctx, "VIns");
}
}
Lane += NumLanesToInsert;
}
NewVec = LastV;
break;
}
case LegalityResultID::Pack: {
// If we can't vectorize the seeds then just return.
if (Depth == 0)
return nullptr;
NewVec = createPack(Bndl, UserBB);
break;
}
}
#ifndef NDEBUG
if (AlwaysVerify) {
// This helps find broken IR by constantly verifying the function. Note that
// this is very expensive and should only be used for debugging.
Instruction *I0 = isa<Instruction>(Bndl[0])
? cast<Instruction>(Bndl[0])
: cast<Instruction>(UserBndl[0]);
assert(!Utils::verifyFunction(I0->getParent()->getParent(), dbgs()) &&
"Broken function!");
}
#endif
return NewVec;
}
bool BottomUpVec::tryVectorize(ArrayRef<Value *> Bndl) {
DeadInstrCandidates.clear();
Legality->clear();
vectorizeRec(Bndl, {}, /*Depth=*/0);
tryEraseDeadInstrs();
return Change;
}
bool BottomUpVec::runOnFunction(Function &F, const Analyses &A) {
IMaps = std::make_unique<InstrMaps>(F.getContext());
Legality = std::make_unique<LegalityAnalysis>(
A.getAA(), A.getScalarEvolution(), F.getParent()->getDataLayout(),
F.getContext(), *IMaps);
Change = false;
const auto &DL = F.getParent()->getDataLayout();
unsigned VecRegBits =
OverrideVecRegBits != 0
? OverrideVecRegBits
: A.getTTI()
.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
.getFixedValue();
// TODO: Start from innermost BBs first
for (auto &BB : F) {
SeedCollector SC(&BB, A.getScalarEvolution());
for (SeedBundle &Seeds : SC.getStoreSeeds()) {
unsigned ElmBits =
Utils::getNumBits(VecUtils::getElementType(Utils::getExpectedType(
Seeds[Seeds.getFirstUnusedElementIdx()])),
DL);
auto DivideBy2 = [](unsigned Num) {
auto Floor = VecUtils::getFloorPowerOf2(Num);
if (Floor == Num)
return Floor / 2;
return Floor;
};
// Try to create the largest vector supported by the target. If it fails
// reduce the vector size by half.
for (unsigned SliceElms = std::min(VecRegBits / ElmBits,
Seeds.getNumUnusedBits() / ElmBits);
SliceElms >= 2u; SliceElms = DivideBy2(SliceElms)) {
if (Seeds.allUsed())
break;
// Keep trying offsets after FirstUnusedElementIdx, until we vectorize
// the slice. This could be quite expensive, so we enforce a limit.
for (unsigned Offset = Seeds.getFirstUnusedElementIdx(),
OE = Seeds.size();
Offset + 1 < OE; Offset += 1) {
// Seeds are getting used as we vectorize, so skip them.
if (Seeds.isUsed(Offset))
continue;
if (Seeds.allUsed())
break;
auto SeedSlice =
Seeds.getSlice(Offset, SliceElms * ElmBits, !AllowNonPow2);
if (SeedSlice.empty())
continue;
assert(SeedSlice.size() >= 2 && "Should have been rejected!");
// TODO: Refactor to remove the unnecessary copy to SeedSliceVals.
SmallVector<Value *> SeedSliceVals(SeedSlice.begin(),
SeedSlice.end());
// Create an empty region. Instructions get added to the region
// automatically by the callbacks.
auto &Ctx = F.getContext();
Region Rgn(Ctx, A.getTTI());
// Save the state of the IR before we make any changes. The
// transaction gets accepted/reverted by the tr-accept-or-revert pass.
Ctx.save();
// Try to vectorize starting from the seed slice. The returned value
// is true if we found vectorizable code and generated some vector
// code for it. It does not mean that the code is profitable.
bool VecSuccess = tryVectorize(SeedSliceVals);
if (VecSuccess)
// WARNING: All passes should return false, except those that
// accept/revert the state.
Change |= RPM.runOnRegion(Rgn, A);
}
}
}
}
return Change;
}
} // namespace sandboxir
} // namespace llvm