blob: 9768f4533f782e3f1831fb7002fd765bd8eb9cfd [file] [log] [blame]
//===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "Uops.h"
#include "Assembler.h"
#include "BenchmarkRunner.h"
#include "MCInstrDescView.h"
#include "Target.h"
// FIXME: Load constants into registers (e.g. with fld1) to not break
// instructions like x87.
// Ideally we would like the only limitation on executing uops to be the issue
// ports. Maximizing port pressure increases the likelihood that the load is
// distributed evenly across possible ports.
// To achieve that, one approach is to generate instructions that do not have
// data dependencies between them.
//
// For some instructions, this is trivial:
// mov rax, qword ptr [rsi]
// mov rax, qword ptr [rsi]
// mov rax, qword ptr [rsi]
// mov rax, qword ptr [rsi]
// For the above snippet, haswell just renames rax four times and executes the
// four instructions two at a time on P23 and P0126.
//
// For some instructions, we just need to make sure that the source is
// different from the destination. For example, IDIV8r reads from GPR and
// writes to AX. We just need to ensure that the Var is assigned a
// register which is different from AX:
// idiv bx
// idiv bx
// idiv bx
// idiv bx
// The above snippet will be able to fully saturate the ports, while the same
// with ax would issue one uop every `latency(IDIV8r)` cycles.
//
// Some instructions make this harder because they both read and write from
// the same register:
// inc rax
// inc rax
// inc rax
// inc rax
// This has a data dependency from each instruction to the next, limit the
// number of instructions that can be issued in parallel.
// It turns out that this is not a big issue on recent Intel CPUs because they
// have heuristics to balance port pressure. In the snippet above, subsequent
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
// might end up executing them all on P0 (just because they can), or try
// avoiding P5 because it's usually under high pressure from vector
// instructions.
// This issue is even more important for high-latency instructions because
// they increase the idle time of the CPU, e.g. :
// imul rax, rbx
// imul rax, rbx
// imul rax, rbx
// imul rax, rbx
//
// To avoid that, we do the renaming statically by generating as many
// independent exclusive assignments as possible (until all possible registers
// are exhausted) e.g.:
// imul rax, rbx
// imul rcx, rbx
// imul rdx, rbx
// imul r8, rbx
//
// Some instruction even make the above static renaming impossible because
// they implicitly read and write from the same operand, e.g. ADC16rr reads
// and writes from EFLAGS.
// In that case we just use a greedy register assignment and hope for the
// best.
namespace llvm {
namespace exegesis {
static llvm::SmallVector<const Variable *, 8>
getVariablesWithTiedOperands(const Instruction &Instr) {
llvm::SmallVector<const Variable *, 8> Result;
for (const auto &Var : Instr.Variables)
if (Var.hasTiedOperands())
Result.push_back(&Var);
return Result;
}
static void remove(llvm::BitVector &a, const llvm::BitVector &b) {
assert(a.size() == b.size());
for (auto I : b.set_bits())
a.reset(I);
}
UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;
UopsSnippetGenerator::~UopsSnippetGenerator() = default;
void UopsSnippetGenerator::instantiateMemoryOperands(
const unsigned ScratchSpacePointerInReg,
std::vector<InstructionTemplate> &Instructions) const {
if (ScratchSpacePointerInReg == 0)
return; // no memory operands.
const auto &ET = State.getExegesisTarget();
const unsigned MemStep = ET.getMaxMemoryAccessSize();
const size_t OriginalInstructionsSize = Instructions.size();
size_t I = 0;
for (InstructionTemplate &IT : Instructions) {
ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
++I;
}
while (Instructions.size() < kMinNumDifferentAddresses) {
InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
++I;
Instructions.push_back(std::move(IT));
}
assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
"not enough scratch space");
}
llvm::Expected<std::vector<CodeTemplate>>
UopsSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
CodeTemplate CT;
const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr;
if (Instr.hasMemoryOperands()) {
const auto &ET = State.getExegesisTarget();
CT.ScratchSpacePointerInReg =
ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple());
if (CT.ScratchSpacePointerInReg == 0)
return llvm::make_error<BenchmarkFailure>(
"Infeasible : target does not support memory instructions");
ScratchSpaceAliasedRegs =
&State.getRATC().getRegister(CT.ScratchSpacePointerInReg).aliasedBits();
// If the instruction implicitly writes to ScratchSpacePointerInReg , abort.
// FIXME: We could make a copy of the scratch register.
for (const auto &Op : Instr.Operands) {
if (Op.isDef() && Op.isImplicitReg() &&
ScratchSpaceAliasedRegs->test(Op.getImplicitReg()))
return llvm::make_error<BenchmarkFailure>(
"Infeasible : memory instruction uses scratch memory register");
}
}
const AliasingConfigurations SelfAliasing(Instr, Instr);
InstructionTemplate IT(Instr);
if (SelfAliasing.empty()) {
CT.Info = "instruction is parallel, repeating a random one.";
CT.Instructions.push_back(std::move(IT));
instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
return getSingleton(std::move(CT));
}
if (SelfAliasing.hasImplicitAliasing()) {
CT.Info = "instruction is serial, repeating a random one.";
CT.Instructions.push_back(std::move(IT));
instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
return getSingleton(std::move(CT));
}
const auto TiedVariables = getVariablesWithTiedOperands(Instr);
if (!TiedVariables.empty()) {
if (TiedVariables.size() > 1)
return llvm::make_error<llvm::StringError>(
"Infeasible : don't know how to handle several tied variables",
llvm::inconvertibleErrorCode());
const Variable *Var = TiedVariables.front();
assert(Var);
const Operand &Op = Instr.getPrimaryOperand(*Var);
assert(Op.isReg());
CT.Info = "instruction has tied variables using static renaming.";
for (const llvm::MCPhysReg Reg :
Op.getRegisterAliasing().sourceBits().set_bits()) {
if (ScratchSpaceAliasedRegs && ScratchSpaceAliasedRegs->test(Reg))
continue; // Do not use the scratch memory address register.
InstructionTemplate TmpIT = IT;
TmpIT.getValueFor(*Var) = llvm::MCOperand::createReg(Reg);
CT.Instructions.push_back(std::move(TmpIT));
}
instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
return getSingleton(std::move(CT));
}
const auto &ReservedRegisters = State.getRATC().reservedRegisters();
// No tied variables, we pick random values for defs.
llvm::BitVector Defs(State.getRegInfo().getNumRegs());
for (const auto &Op : Instr.Operands) {
if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) {
auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
remove(PossibleRegisters, ReservedRegisters);
// Do not use the scratch memory address register.
if (ScratchSpaceAliasedRegs)
remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
assert(PossibleRegisters.any() && "No register left to choose from");
const auto RandomReg = randomBit(PossibleRegisters);
Defs.set(RandomReg);
IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
}
}
// And pick random use values that are not reserved and don't alias with defs.
const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
for (const auto &Op : Instr.Operands) {
if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) {
auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
remove(PossibleRegisters, ReservedRegisters);
// Do not use the scratch memory address register.
if (ScratchSpaceAliasedRegs)
remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
remove(PossibleRegisters, DefAliases);
assert(PossibleRegisters.any() && "No register left to choose from");
const auto RandomReg = randomBit(PossibleRegisters);
IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
}
}
CT.Info =
"instruction has no tied variables picking Uses different from defs";
CT.Instructions.push_back(std::move(IT));
instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
return getSingleton(std::move(CT));
}
llvm::Expected<std::vector<BenchmarkMeasure>>
UopsBenchmarkRunner::runMeasurements(const FunctionExecutor &Executor) const {
std::vector<BenchmarkMeasure> Result;
const PfmCountersInfo &PCI = State.getPfmCounters();
// Uops per port.
for (const auto *IssueCounter = PCI.IssueCounters,
*IssueCounterEnd = PCI.IssueCounters + PCI.NumIssueCounters;
IssueCounter != IssueCounterEnd; ++IssueCounter) {
if (!IssueCounter->Counter)
continue;
auto ExpectedCounterValue = Executor.runAndMeasure(IssueCounter->Counter);
if (!ExpectedCounterValue)
return ExpectedCounterValue.takeError();
Result.push_back(BenchmarkMeasure::Create(IssueCounter->ProcResName,
*ExpectedCounterValue));
}
// NumMicroOps.
if (const char *const UopsCounter = PCI.UopsCounter) {
auto ExpectedCounterValue = Executor.runAndMeasure(UopsCounter);
if (!ExpectedCounterValue)
return ExpectedCounterValue.takeError();
Result.push_back(
BenchmarkMeasure::Create("NumMicroOps", *ExpectedCounterValue));
}
return std::move(Result);
}
constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses;
} // namespace exegesis
} // namespace llvm