blob: 6f6dea08950a7dd1ba425e3824b9bc36646f447c [file] [log] [blame]
//===- bolt/Passes/FrameOptimizer.cpp -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the FrameOptimizerPass class.
//
//===----------------------------------------------------------------------===//
#include "bolt/Passes/FrameOptimizer.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Passes/BinaryFunctionCallGraph.h"
#include "bolt/Passes/DataflowInfoManager.h"
#include "bolt/Passes/ShrinkWrapping.h"
#include "bolt/Passes/StackAvailableExpressions.h"
#include "bolt/Passes/StackReachingUses.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/Support/Timer.h"
#include <deque>
#include <unordered_map>
#define DEBUG_TYPE "fop"
using namespace llvm;
namespace opts {
extern cl::opt<unsigned> Verbosity;
extern cl::opt<bool> TimeOpts;
extern cl::OptionCategory BoltOptCategory;
using namespace bolt;
cl::opt<FrameOptimizationType>
FrameOptimization("frame-opt",
cl::init(FOP_NONE),
cl::desc("optimize stack frame accesses"),
cl::values(
clEnumValN(FOP_NONE, "none", "do not perform frame optimization"),
clEnumValN(FOP_HOT, "hot", "perform FOP on hot functions"),
clEnumValN(FOP_ALL, "all", "perform FOP on all functions")),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
cl::opt<bool> RemoveStores(
"frame-opt-rm-stores", cl::init(FOP_NONE),
cl::desc("apply additional analysis to remove stores (experimental)"),
cl::cat(BoltOptCategory));
} // namespace opts
namespace llvm {
namespace bolt {
void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis &RA,
const FrameAnalysis &FA,
BinaryFunction &BF) {
StackAvailableExpressions SAE(RA, FA, BF);
SAE.run();
LLVM_DEBUG(dbgs() << "Performing unnecessary loads removal\n");
std::deque<std::pair<BinaryBasicBlock *, MCInst *>> ToErase;
bool Changed = false;
const auto ExprEnd = SAE.expr_end();
MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
for (BinaryBasicBlock &BB : BF) {
LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
const MCInst *Prev = nullptr;
for (MCInst &Inst : BB) {
LLVM_DEBUG({
dbgs() << "\t\tNow at ";
Inst.dump();
for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB);
I != ExprEnd; ++I) {
dbgs() << "\t\t\tReached by: ";
(*I)->dump();
}
});
// if Inst is a load from stack and the current available expressions show
// this value is available in a register or immediate, replace this load
// with move from register or from immediate.
ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst);
if (!FIEX) {
Prev = &Inst;
continue;
}
// FIXME: Change to remove IsSimple == 0. We're being conservative here,
// but once replaceMemOperandWithReg is ready, we should feed it with all
// sorts of complex instructions.
if (FIEX->IsLoad == false || FIEX->IsSimple == false ||
FIEX->StackOffset >= 0) {
Prev = &Inst;
continue;
}
for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB);
I != ExprEnd; ++I) {
const MCInst *AvailableInst = *I;
ErrorOr<const FrameIndexEntry &> FIEY = FA.getFIEFor(*AvailableInst);
if (!FIEY)
continue;
assert(FIEY->IsStore && FIEY->IsSimple);
if (FIEX->StackOffset != FIEY->StackOffset || FIEX->Size != FIEY->Size)
continue;
// TODO: Change push/pops to stack adjustment instruction
if (MIB->isPop(Inst))
continue;
++NumRedundantLoads;
FreqRedundantLoads += BB.getKnownExecutionCount();
Changed = true;
LLVM_DEBUG(dbgs() << "Redundant load instruction: ");
LLVM_DEBUG(Inst.dump());
LLVM_DEBUG(dbgs() << "Related store instruction: ");
LLVM_DEBUG(AvailableInst->dump());
LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
// Replace load
if (FIEY->IsStoreFromReg) {
if (!MIB->replaceMemOperandWithReg(Inst, FIEY->RegOrImm)) {
LLVM_DEBUG(dbgs() << "FAILED to change operand to a reg\n");
break;
}
FreqLoadsChangedToReg += BB.getKnownExecutionCount();
MIB->removeAnnotation(Inst, "FrameAccessEntry");
LLVM_DEBUG(dbgs() << "Changed operand to a reg\n");
if (MIB->isRedundantMove(Inst)) {
++NumLoadsDeleted;
FreqLoadsDeleted += BB.getKnownExecutionCount();
LLVM_DEBUG(dbgs() << "Created a redundant move\n");
// Delete it!
ToErase.push_front(std::make_pair(&BB, &Inst));
}
} else {
char Buf[8] = {0, 0, 0, 0, 0, 0, 0, 0};
support::ulittle64_t::ref(Buf + 0) = FIEY->RegOrImm;
LLVM_DEBUG(dbgs() << "Changing operand to an imm... ");
if (!MIB->replaceMemOperandWithImm(Inst, StringRef(Buf, 8), 0)) {
LLVM_DEBUG(dbgs() << "FAILED\n");
} else {
FreqLoadsChangedToImm += BB.getKnownExecutionCount();
MIB->removeAnnotation(Inst, "FrameAccessEntry");
LLVM_DEBUG(dbgs() << "Ok\n");
}
}
LLVM_DEBUG(dbgs() << "Changed to: ");
LLVM_DEBUG(Inst.dump());
break;
}
Prev = &Inst;
}
}
if (Changed)
LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
// TODO: Implement an interface of eraseInstruction that works out the
// complete list of elements to remove.
for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase)
I.first->eraseInstruction(I.first->findInstruction(I.second));
}
void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA,
BinaryFunction &BF) {
StackReachingUses SRU(FA, BF);
SRU.run();
LLVM_DEBUG(dbgs() << "Performing unused stores removal\n");
std::vector<std::pair<BinaryBasicBlock *, MCInst *>> ToErase;
bool Changed = false;
for (BinaryBasicBlock &BB : BF) {
LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
const MCInst *Prev = nullptr;
for (MCInst &Inst : llvm::reverse(BB)) {
LLVM_DEBUG({
dbgs() << "\t\tNow at ";
Inst.dump();
for (auto I = Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB);
I != SRU.expr_end(); ++I) {
dbgs() << "\t\t\tReached by: ";
(*I)->dump();
}
});
ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst);
if (!FIEX) {
Prev = &Inst;
continue;
}
if (FIEX->IsLoad || !FIEX->IsSimple || FIEX->StackOffset >= 0) {
Prev = &Inst;
continue;
}
if (SRU.isStoreUsed(*FIEX,
Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB))) {
Prev = &Inst;
continue;
}
// TODO: Change push/pops to stack adjustment instruction
if (BF.getBinaryContext().MIB->isPush(Inst))
continue;
++NumRedundantStores;
FreqRedundantStores += BB.getKnownExecutionCount();
Changed = true;
LLVM_DEBUG(dbgs() << "Unused store instruction: ");
LLVM_DEBUG(Inst.dump());
LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
LLVM_DEBUG(dbgs() << "FIE offset = " << FIEX->StackOffset
<< " size = " << (int)FIEX->Size << "\n");
// Delete it!
ToErase.emplace_back(&BB, &Inst);
Prev = &Inst;
}
}
for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase)
I.first->eraseInstruction(I.first->findInstruction(I.second));
if (Changed)
LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
}
void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) {
if (opts::FrameOptimization == FOP_NONE)
return;
std::unique_ptr<BinaryFunctionCallGraph> CG;
std::unique_ptr<FrameAnalysis> FA;
std::unique_ptr<RegAnalysis> RA;
{
NamedRegionTimer T1("callgraph", "create call graph", "FOP",
"FOP breakdown", opts::TimeOpts);
CG = std::make_unique<BinaryFunctionCallGraph>(buildCallGraph(BC));
}
{
NamedRegionTimer T1("frameanalysis", "frame analysis", "FOP",
"FOP breakdown", opts::TimeOpts);
FA = std::make_unique<FrameAnalysis>(BC, *CG);
}
{
NamedRegionTimer T1("reganalysis", "reg analysis", "FOP", "FOP breakdown",
opts::TimeOpts);
RA = std::make_unique<RegAnalysis>(BC, &BC.getBinaryFunctions(), CG.get());
}
// Perform caller-saved register optimizations, then callee-saved register
// optimizations (shrink wrapping)
for (auto &I : BC.getBinaryFunctions()) {
if (!FA->hasFrameInfo(I.second))
continue;
// Restrict pass execution if user asked to only run on hot functions
if (opts::FrameOptimization == FOP_HOT) {
if (I.second.getKnownExecutionCount() < BC.getHotThreshold())
continue;
LLVM_DEBUG(
dbgs() << "Considering " << I.second.getPrintName()
<< " for frame optimizations because its execution count ( "
<< I.second.getKnownExecutionCount()
<< " ) exceeds our hotness threshold ( "
<< BC.getHotThreshold() << " )\n");
}
{
NamedRegionTimer T1("removeloads", "remove loads", "FOP", "FOP breakdown",
opts::TimeOpts);
if (!FA->hasStackArithmetic(I.second))
removeUnnecessaryLoads(*RA, *FA, I.second);
}
if (opts::RemoveStores) {
NamedRegionTimer T1("removestores", "remove stores", "FOP",
"FOP breakdown", opts::TimeOpts);
if (!FA->hasStackArithmetic(I.second))
removeUnusedStores(*FA, I.second);
}
// Don't even start shrink wrapping if no profiling info is available
if (I.second.getKnownExecutionCount() == 0)
continue;
}
{
NamedRegionTimer T1("shrinkwrapping", "shrink wrapping", "FOP",
"FOP breakdown", opts::TimeOpts);
performShrinkWrapping(*RA, *FA, BC);
}
outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads
<< " redundant load(s) and " << NumRedundantStores
<< " unused store(s)\n";
outs() << "BOLT-INFO: Frequency of redundant loads is " << FreqRedundantLoads
<< " and frequency of unused stores is " << FreqRedundantStores
<< "\n";
outs() << "BOLT-INFO: Frequency of loads changed to use a register is "
<< FreqLoadsChangedToReg
<< " and frequency of loads changed to use an immediate is "
<< FreqLoadsChangedToImm << "\n";
outs() << "BOLT-INFO: FOP deleted " << NumLoadsDeleted
<< " load(s) (dyn count: " << FreqLoadsDeleted << ") and "
<< NumRedundantStores << " store(s)\n";
FA->printStats();
ShrinkWrapping::printStats();
}
void FrameOptimizerPass::performShrinkWrapping(const RegAnalysis &RA,
const FrameAnalysis &FA,
BinaryContext &BC) {
// Initialize necessary annotations to allow safe parallel accesses to
// annotation index in MIB
BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getSaveTagName());
BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getRestoreTagName());
BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getTodoTagName());
BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getSlotTagName());
BC.MIB->getOrCreateAnnotationIndex(
StackLayoutModifier::getOffsetCFIRegTagName());
BC.MIB->getOrCreateAnnotationIndex("ReachingDefs");
BC.MIB->getOrCreateAnnotationIndex("ReachingUses");
BC.MIB->getOrCreateAnnotationIndex("LivenessAnalysis");
BC.MIB->getOrCreateAnnotationIndex("StackReachingUses");
BC.MIB->getOrCreateAnnotationIndex("PostDominatorAnalysis");
BC.MIB->getOrCreateAnnotationIndex("DominatorAnalysis");
BC.MIB->getOrCreateAnnotationIndex("StackPointerTracking");
BC.MIB->getOrCreateAnnotationIndex("StackPointerTrackingForInternalCalls");
BC.MIB->getOrCreateAnnotationIndex("StackAvailableExpressions");
BC.MIB->getOrCreateAnnotationIndex("StackAllocationAnalysis");
BC.MIB->getOrCreateAnnotationIndex("ShrinkWrap-Todo");
BC.MIB->getOrCreateAnnotationIndex("PredictiveStackPointerTracking");
BC.MIB->getOrCreateAnnotationIndex("ReachingInsnsBackward");
BC.MIB->getOrCreateAnnotationIndex("ReachingInsns");
BC.MIB->getOrCreateAnnotationIndex("AccessesDeletedPos");
BC.MIB->getOrCreateAnnotationIndex("DeleteMe");
std::vector<std::pair<uint64_t, const BinaryFunction *>> Top10Funcs;
auto LogFunc = [&](BinaryFunction &BF) {
auto Lower = llvm::lower_bound(
Top10Funcs, BF.getKnownExecutionCount(),
[](const std::pair<uint64_t, const BinaryFunction *> &Elmt,
uint64_t Value) { return Elmt.first > Value; });
if (Lower == Top10Funcs.end() && Top10Funcs.size() >= 10)
return;
Top10Funcs.insert(Lower,
std::make_pair<>(BF.getKnownExecutionCount(), &BF));
if (Top10Funcs.size() > 10)
Top10Funcs.resize(10);
};
(void)LogFunc;
ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
if (BF.getFunctionScore() == 0)
return true;
return false;
};
const bool HotOnly = opts::FrameOptimization == FOP_HOT;
ParallelUtilities::WorkFuncWithAllocTy WorkFunction =
[&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) {
DataflowInfoManager Info(BF, &RA, &FA, AllocatorId);
ShrinkWrapping SW(FA, BF, Info, AllocatorId);
if (SW.perform(HotOnly)) {
std::lock_guard<std::mutex> Lock(FuncsChangedMutex);
FuncsChanged.insert(&BF);
LLVM_DEBUG(LogFunc(BF));
}
};
ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFunction,
SkipPredicate, "shrink-wrapping");
if (!Top10Funcs.empty()) {
outs() << "BOLT-INFO: top 10 functions changed by shrink wrapping:\n";
for (const auto &Elmt : Top10Funcs)
outs() << Elmt.first << " : " << Elmt.second->getPrintName() << "\n";
}
}
} // namespace bolt
} // namespace llvm