tools/llvm-exegesis/lib/Uops.cpp - llvm - Git at Google

 //===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//

 #include "Uops.h"

 #include "Assembler.h"
 #include "BenchmarkRunner.h"
 #include "MCInstrDescView.h"
 #include "Target.h"

 // FIXME: Load constants into registers (e.g. with fld1) to not break
 // instructions like x87.

 // Ideally we would like the only limitation on executing uops to be the issue
 // ports. Maximizing port pressure increases the likelihood that the load is
 // distributed evenly across possible ports.

 // To achieve that, one approach is to generate instructions that do not have
 // data dependencies between them.
 //
 // For some instructions, this is trivial:
 //    mov rax, qword ptr [rsi]
 //    mov rax, qword ptr [rsi]
 //    mov rax, qword ptr [rsi]
 //    mov rax, qword ptr [rsi]
 // For the above snippet, haswell just renames rax four times and executes the
 // four instructions two at a time on P23 and P0126.
 //
 // For some instructions, we just need to make sure that the source is
 // different from the destination. For example, IDIV8r reads from GPR and
 // writes to AX. We just need to ensure that the Var is assigned a
 // register which is different from AX:
 //    idiv bx
 //    idiv bx
 //    idiv bx
 //    idiv bx
 // The above snippet will be able to fully saturate the ports, while the same
 // with ax would issue one uop every `latency(IDIV8r)` cycles.
 //
 // Some instructions make this harder because they both read and write from
 // the same register:
 //    inc rax
 //    inc rax
 //    inc rax
 //    inc rax
 // This has a data dependency from each instruction to the next, limit the
 // number of instructions that can be issued in parallel.
 // It turns out that this is not a big issue on recent Intel CPUs because they
 // have heuristics to balance port pressure. In the snippet above, subsequent
 // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
 // might end up executing them all on P0 (just because they can), or try
 // avoiding P5 because it's usually under high pressure from vector
 // instructions.
 // This issue is even more important for high-latency instructions because
 // they increase the idle time of the CPU, e.g. :
 //    imul rax, rbx
 //    imul rax, rbx
 //    imul rax, rbx
 //    imul rax, rbx
 //
 // To avoid that, we do the renaming statically by generating as many
 // independent exclusive assignments as possible (until all possible registers
 // are exhausted) e.g.:
 //    imul rax, rbx
 //    imul rcx, rbx
 //    imul rdx, rbx
 //    imul r8,  rbx
 //
 // Some instruction even make the above static renaming impossible because
 // they implicitly read and write from the same operand, e.g. ADC16rr reads
 // and writes from EFLAGS.
 // In that case we just use a greedy register assignment and hope for the
 // best.

 namespace llvm {
 namespace exegesis {

 static llvm::SmallVector<const Variable *, 8>
 getVariablesWithTiedOperands(const Instruction &Instr) {
   llvm::SmallVector<const Variable *, 8> Result;
   for (const auto &Var : Instr.Variables)
     if (Var.hasTiedOperands())
       Result.push_back(&Var);
   return Result;
 }

 static void remove(llvm::BitVector &a, const llvm::BitVector &b) {
   assert(a.size() == b.size());
   for (auto I : b.set_bits())
     a.reset(I);
 }

 UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;

 UopsSnippetGenerator::~UopsSnippetGenerator() = default;

 void UopsSnippetGenerator::instantiateMemoryOperands(
     const unsigned ScratchSpacePointerInReg,
     std::vector<InstructionTemplate> &Instructions) const {
   if (ScratchSpacePointerInReg == 0)
     return; // no memory operands.
   const auto &ET = State.getExegesisTarget();
   const unsigned MemStep = ET.getMaxMemoryAccessSize();
   const size_t OriginalInstructionsSize = Instructions.size();
   size_t I = 0;
   for (InstructionTemplate &IT : Instructions) {
     ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
     ++I;
   }

   while (Instructions.size() < kMinNumDifferentAddresses) {
     InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
     ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
     ++I;
     Instructions.push_back(std::move(IT));
   }
   assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
          "not enough scratch space");
 }

 llvm::Expected<std::vector<CodeTemplate>>
 UopsSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
   CodeTemplate CT;
   const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr;
   if (Instr.hasMemoryOperands()) {
     const auto &ET = State.getExegesisTarget();
     CT.ScratchSpacePointerInReg =
         ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple());
     if (CT.ScratchSpacePointerInReg == 0)
       return llvm::make_error<BenchmarkFailure>(
           "Infeasible : target does not support memory instructions");
     ScratchSpaceAliasedRegs =
         &State.getRATC().getRegister(CT.ScratchSpacePointerInReg).aliasedBits();
     // If the instruction implicitly writes to ScratchSpacePointerInReg , abort.
     // FIXME: We could make a copy of the scratch register.
     for (const auto &Op : Instr.Operands) {
       if (Op.isDef() && Op.isImplicitReg() &&
           ScratchSpaceAliasedRegs->test(Op.getImplicitReg()))
         return llvm::make_error<BenchmarkFailure>(
             "Infeasible : memory instruction uses scratch memory register");
     }
   }

   const AliasingConfigurations SelfAliasing(Instr, Instr);
   InstructionTemplate IT(Instr);
   if (SelfAliasing.empty()) {
     CT.Info = "instruction is parallel, repeating a random one.";
     CT.Instructions.push_back(std::move(IT));
     instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
     return getSingleton(std::move(CT));
   }
   if (SelfAliasing.hasImplicitAliasing()) {
     CT.Info = "instruction is serial, repeating a random one.";
     CT.Instructions.push_back(std::move(IT));
     instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
     return getSingleton(std::move(CT));
   }
   const auto TiedVariables = getVariablesWithTiedOperands(Instr);
   if (!TiedVariables.empty()) {
     if (TiedVariables.size() > 1)
       return llvm::make_error<llvm::StringError>(
           "Infeasible : don't know how to handle several tied variables",
           llvm::inconvertibleErrorCode());
     const Variable *Var = TiedVariables.front();
     assert(Var);
     const Operand &Op = Instr.getPrimaryOperand(*Var);
     assert(Op.isReg());
     CT.Info = "instruction has tied variables using static renaming.";
     for (const llvm::MCPhysReg Reg :
          Op.getRegisterAliasing().sourceBits().set_bits()) {
       if (ScratchSpaceAliasedRegs && ScratchSpaceAliasedRegs->test(Reg))
         continue; // Do not use the scratch memory address register.
       InstructionTemplate TmpIT = IT;
       TmpIT.getValueFor(*Var) = llvm::MCOperand::createReg(Reg);
       CT.Instructions.push_back(std::move(TmpIT));
     }
     instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
     return getSingleton(std::move(CT));
   }
   const auto &ReservedRegisters = State.getRATC().reservedRegisters();
   // No tied variables, we pick random values for defs.
   llvm::BitVector Defs(State.getRegInfo().getNumRegs());
   for (const auto &Op : Instr.Operands) {
     if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) {
       auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
       remove(PossibleRegisters, ReservedRegisters);
       // Do not use the scratch memory address register.
       if (ScratchSpaceAliasedRegs)
         remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
       assert(PossibleRegisters.any() && "No register left to choose from");
       const auto RandomReg = randomBit(PossibleRegisters);
       Defs.set(RandomReg);
       IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
     }
   }
   // And pick random use values that are not reserved and don't alias with defs.
   const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
   for (const auto &Op : Instr.Operands) {
     if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) {
       auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
       remove(PossibleRegisters, ReservedRegisters);
       // Do not use the scratch memory address register.
       if (ScratchSpaceAliasedRegs)
         remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
       remove(PossibleRegisters, DefAliases);
       assert(PossibleRegisters.any() && "No register left to choose from");
       const auto RandomReg = randomBit(PossibleRegisters);
       IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
     }
   }
   CT.Info =
       "instruction has no tied variables picking Uses different from defs";
   CT.Instructions.push_back(std::move(IT));
   instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
   return getSingleton(std::move(CT));
 }

 llvm::Expected<std::vector<BenchmarkMeasure>>
 UopsBenchmarkRunner::runMeasurements(const FunctionExecutor &Executor) const {
   std::vector<BenchmarkMeasure> Result;
   const PfmCountersInfo &PCI = State.getPfmCounters();
   // Uops per port.
   for (const auto *IssueCounter = PCI.IssueCounters,
                   *IssueCounterEnd = PCI.IssueCounters + PCI.NumIssueCounters;
        IssueCounter != IssueCounterEnd; ++IssueCounter) {
     if (!IssueCounter->Counter)
       continue;
     auto ExpectedCounterValue = Executor.runAndMeasure(IssueCounter->Counter);
     if (!ExpectedCounterValue)
       return ExpectedCounterValue.takeError();
     Result.push_back(BenchmarkMeasure::Create(IssueCounter->ProcResName,
                                               *ExpectedCounterValue));
   }
   // NumMicroOps.
   if (const char *const UopsCounter = PCI.UopsCounter) {
     auto ExpectedCounterValue = Executor.runAndMeasure(UopsCounter);
     if (!ExpectedCounterValue)
       return ExpectedCounterValue.takeError();
     Result.push_back(
         BenchmarkMeasure::Create("NumMicroOps", *ExpectedCounterValue));
   }
   return std::move(Result);
 }

 constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses;

 } // namespace exegesis
 } // namespace llvm
	//===-- Uops.cpp ------------------------------------------------- C++ --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#include "Uops.h"

	#include "Assembler.h"
	#include "BenchmarkRunner.h"
	#include "MCInstrDescView.h"
	#include "Target.h"

	// FIXME: Load constants into registers (e.g. with fld1) to not break
	// instructions like x87.

	// Ideally we would like the only limitation on executing uops to be the issue
	// ports. Maximizing port pressure increases the likelihood that the load is
	// distributed evenly across possible ports.

	// To achieve that, one approach is to generate instructions that do not have
	// data dependencies between them.
	//
	// For some instructions, this is trivial:
	// mov rax, qword ptr [rsi]
	// mov rax, qword ptr [rsi]
	// mov rax, qword ptr [rsi]
	// mov rax, qword ptr [rsi]
	// For the above snippet, haswell just renames rax four times and executes the
	// four instructions two at a time on P23 and P0126.
	//
	// For some instructions, we just need to make sure that the source is
	// different from the destination. For example, IDIV8r reads from GPR and
	// writes to AX. We just need to ensure that the Var is assigned a
	// register which is different from AX:
	// idiv bx
	// idiv bx
	// idiv bx
	// idiv bx
	// The above snippet will be able to fully saturate the ports, while the same
	// with ax would issue one uop every `latency(IDIV8r)` cycles.
	//
	// Some instructions make this harder because they both read and write from
	// the same register:
	// inc rax
	// inc rax
	// inc rax
	// inc rax
	// This has a data dependency from each instruction to the next, limit the
	// number of instructions that can be issued in parallel.
	// It turns out that this is not a big issue on recent Intel CPUs because they
	// have heuristics to balance port pressure. In the snippet above, subsequent
	// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
	// might end up executing them all on P0 (just because they can), or try
	// avoiding P5 because it's usually under high pressure from vector
	// instructions.
	// This issue is even more important for high-latency instructions because
	// they increase the idle time of the CPU, e.g. :
	// imul rax, rbx
	// imul rax, rbx
	// imul rax, rbx
	// imul rax, rbx
	//
	// To avoid that, we do the renaming statically by generating as many
	// independent exclusive assignments as possible (until all possible registers
	// are exhausted) e.g.:
	// imul rax, rbx
	// imul rcx, rbx
	// imul rdx, rbx
	// imul r8, rbx
	//
	// Some instruction even make the above static renaming impossible because
	// they implicitly read and write from the same operand, e.g. ADC16rr reads
	// and writes from EFLAGS.
	// In that case we just use a greedy register assignment and hope for the
	// best.

	namespace llvm {
	namespace exegesis {

	static llvm::SmallVector<const Variable *, 8>
	getVariablesWithTiedOperands(const Instruction &Instr) {
	llvm::SmallVector<const Variable *, 8> Result;
	for (const auto &Var : Instr.Variables)
	if (Var.hasTiedOperands())
	Result.push_back(&Var);
	return Result;
	}

	static void remove(llvm::BitVector &a, const llvm::BitVector &b) {
	assert(a.size() == b.size());
	for (auto I : b.set_bits())
	a.reset(I);
	}

	UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;

	UopsSnippetGenerator::~UopsSnippetGenerator() = default;

	void UopsSnippetGenerator::instantiateMemoryOperands(
	const unsigned ScratchSpacePointerInReg,
	std::vector<InstructionTemplate> &Instructions) const {
	if (ScratchSpacePointerInReg == 0)
	return; // no memory operands.
	const auto &ET = State.getExegesisTarget();
	const unsigned MemStep = ET.getMaxMemoryAccessSize();
	const size_t OriginalInstructionsSize = Instructions.size();
	size_t I = 0;
	for (InstructionTemplate &IT : Instructions) {
	ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
	++I;
	}

	while (Instructions.size() < kMinNumDifferentAddresses) {
	InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
	ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
	++I;
	Instructions.push_back(std::move(IT));
	}
	assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
	"not enough scratch space");
	}

	llvm::Expected<std::vector<CodeTemplate>>
	UopsSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
	CodeTemplate CT;
	const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr;
	if (Instr.hasMemoryOperands()) {
	const auto &ET = State.getExegesisTarget();
	CT.ScratchSpacePointerInReg =
	ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple());
	if (CT.ScratchSpacePointerInReg == 0)
	return llvm::make_error<BenchmarkFailure>(
	"Infeasible : target does not support memory instructions");
	ScratchSpaceAliasedRegs =
	&State.getRATC().getRegister(CT.ScratchSpacePointerInReg).aliasedBits();
	// If the instruction implicitly writes to ScratchSpacePointerInReg , abort.
	// FIXME: We could make a copy of the scratch register.
	for (const auto &Op : Instr.Operands) {
	if (Op.isDef() && Op.isImplicitReg() &&
	ScratchSpaceAliasedRegs->test(Op.getImplicitReg()))
	return llvm::make_error<BenchmarkFailure>(
	"Infeasible : memory instruction uses scratch memory register");
	}
	}

	const AliasingConfigurations SelfAliasing(Instr, Instr);
	InstructionTemplate IT(Instr);
	if (SelfAliasing.empty()) {
	CT.Info = "instruction is parallel, repeating a random one.";
	CT.Instructions.push_back(std::move(IT));
	instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
	return getSingleton(std::move(CT));
	}
	if (SelfAliasing.hasImplicitAliasing()) {
	CT.Info = "instruction is serial, repeating a random one.";
	CT.Instructions.push_back(std::move(IT));
	instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
	return getSingleton(std::move(CT));
	}
	const auto TiedVariables = getVariablesWithTiedOperands(Instr);
	if (!TiedVariables.empty()) {
	if (TiedVariables.size() > 1)
	return llvm::make_error<llvm::StringError>(
	"Infeasible : don't know how to handle several tied variables",
	llvm::inconvertibleErrorCode());
	const Variable *Var = TiedVariables.front();
	assert(Var);
	const Operand &Op = Instr.getPrimaryOperand(*Var);
	assert(Op.isReg());
	CT.Info = "instruction has tied variables using static renaming.";
	for (const llvm::MCPhysReg Reg :
	Op.getRegisterAliasing().sourceBits().set_bits()) {
	if (ScratchSpaceAliasedRegs && ScratchSpaceAliasedRegs->test(Reg))
	continue; // Do not use the scratch memory address register.
	InstructionTemplate TmpIT = IT;
	TmpIT.getValueFor(*Var) = llvm::MCOperand::createReg(Reg);
	CT.Instructions.push_back(std::move(TmpIT));
	}
	instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
	return getSingleton(std::move(CT));
	}
	const auto &ReservedRegisters = State.getRATC().reservedRegisters();
	// No tied variables, we pick random values for defs.
	llvm::BitVector Defs(State.getRegInfo().getNumRegs());
	for (const auto &Op : Instr.Operands) {
	if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) {
	auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
	remove(PossibleRegisters, ReservedRegisters);
	// Do not use the scratch memory address register.
	if (ScratchSpaceAliasedRegs)
	remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
	assert(PossibleRegisters.any() && "No register left to choose from");
	const auto RandomReg = randomBit(PossibleRegisters);
	Defs.set(RandomReg);
	IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
	}
	}
	// And pick random use values that are not reserved and don't alias with defs.
	const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
	for (const auto &Op : Instr.Operands) {
	if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) {
	auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
	remove(PossibleRegisters, ReservedRegisters);
	// Do not use the scratch memory address register.
	if (ScratchSpaceAliasedRegs)
	remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
	remove(PossibleRegisters, DefAliases);
	assert(PossibleRegisters.any() && "No register left to choose from");
	const auto RandomReg = randomBit(PossibleRegisters);
	IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
	}
	}
	CT.Info =
	"instruction has no tied variables picking Uses different from defs";
	CT.Instructions.push_back(std::move(IT));
	instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
	return getSingleton(std::move(CT));
	}

	llvm::Expected<std::vector<BenchmarkMeasure>>
	UopsBenchmarkRunner::runMeasurements(const FunctionExecutor &Executor) const {
	std::vector<BenchmarkMeasure> Result;
	const PfmCountersInfo &PCI = State.getPfmCounters();
	// Uops per port.
	for (const auto *IssueCounter = PCI.IssueCounters,
	*IssueCounterEnd = PCI.IssueCounters + PCI.NumIssueCounters;
	IssueCounter != IssueCounterEnd; ++IssueCounter) {
	if (!IssueCounter->Counter)
	continue;
	auto ExpectedCounterValue = Executor.runAndMeasure(IssueCounter->Counter);
	if (!ExpectedCounterValue)
	return ExpectedCounterValue.takeError();
	Result.push_back(BenchmarkMeasure::Create(IssueCounter->ProcResName,
	*ExpectedCounterValue));
	}
	// NumMicroOps.
	if (const char *const UopsCounter = PCI.UopsCounter) {
	auto ExpectedCounterValue = Executor.runAndMeasure(UopsCounter);
	if (!ExpectedCounterValue)
	return ExpectedCounterValue.takeError();
	Result.push_back(
	BenchmarkMeasure::Create("NumMicroOps", *ExpectedCounterValue));
	}
	return std::move(Result);
	}

	constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses;

	} // namespace exegesis
	} // namespace llvm