| //===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "ParallelSnippetGenerator.h" |
| |
| #include "BenchmarkRunner.h" |
| #include "MCInstrDescView.h" |
| #include "Target.h" |
| |
| // FIXME: Load constants into registers (e.g. with fld1) to not break |
| // instructions like x87. |
| |
| // Ideally we would like the only limitation on executing instructions to be the |
| // availability of the CPU resources (e.g. execution ports) needed to execute |
| // them, instead of the availability of their data dependencies. |
| |
| // To achieve that, one approach is to generate instructions that do not have |
| // data dependencies between them. |
| // |
| // For some instructions, this is trivial: |
| // mov rax, qword ptr [rsi] |
| // mov rax, qword ptr [rsi] |
| // mov rax, qword ptr [rsi] |
| // mov rax, qword ptr [rsi] |
| // For the above snippet, haswell just renames rax four times and executes the |
| // four instructions two at a time on P23 and P0126. |
| // |
| // For some instructions, we just need to make sure that the source is |
| // different from the destination. For example, IDIV8r reads from GPR and |
| // writes to AX. We just need to ensure that the Var is assigned a |
| // register which is different from AX: |
| // idiv bx |
| // idiv bx |
| // idiv bx |
| // idiv bx |
| // The above snippet will be able to fully saturate the ports, while the same |
| // with ax would issue one uop every `latency(IDIV8r)` cycles. |
| // |
| // Some instructions make this harder because they both read and write from |
| // the same register: |
| // inc rax |
| // inc rax |
| // inc rax |
| // inc rax |
| // This has a data dependency from each instruction to the next, limit the |
| // number of instructions that can be issued in parallel. |
| // It turns out that this is not a big issue on recent Intel CPUs because they |
| // have heuristics to balance port pressure. In the snippet above, subsequent |
| // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs |
| // might end up executing them all on P0 (just because they can), or try |
| // avoiding P5 because it's usually under high pressure from vector |
| // instructions. |
| // This issue is even more important for high-latency instructions because |
| // they increase the idle time of the CPU, e.g. : |
| // imul rax, rbx |
| // imul rax, rbx |
| // imul rax, rbx |
| // imul rax, rbx |
| // |
| // To avoid that, we do the renaming statically by generating as many |
| // independent exclusive assignments as possible (until all possible registers |
| // are exhausted) e.g.: |
| // imul rax, rbx |
| // imul rcx, rbx |
| // imul rdx, rbx |
| // imul r8, rbx |
| // |
| // Some instruction even make the above static renaming impossible because |
| // they implicitly read and write from the same operand, e.g. ADC16rr reads |
| // and writes from EFLAGS. |
| // In that case we just use a greedy register assignment and hope for the |
| // best. |
| |
| namespace llvm { |
| namespace exegesis { |
| |
| static bool hasVariablesWithTiedOperands(const Instruction &Instr) { |
| SmallVector<const Variable *, 8> Result; |
| for (const auto &Var : Instr.Variables) |
| if (Var.hasTiedOperands()) |
| return true; |
| return false; |
| } |
| |
| ParallelSnippetGenerator::~ParallelSnippetGenerator() = default; |
| |
| void ParallelSnippetGenerator::instantiateMemoryOperands( |
| const unsigned ScratchSpacePointerInReg, |
| std::vector<InstructionTemplate> &Instructions) const { |
| if (ScratchSpacePointerInReg == 0) |
| return; // no memory operands. |
| const auto &ET = State.getExegesisTarget(); |
| const unsigned MemStep = ET.getMaxMemoryAccessSize(); |
| const size_t OriginalInstructionsSize = Instructions.size(); |
| size_t I = 0; |
| for (InstructionTemplate &IT : Instructions) { |
| ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep); |
| ++I; |
| } |
| |
| while (Instructions.size() < kMinNumDifferentAddresses) { |
| InstructionTemplate IT = Instructions[I % OriginalInstructionsSize]; |
| ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep); |
| ++I; |
| Instructions.push_back(std::move(IT)); |
| } |
| assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize && |
| "not enough scratch space"); |
| } |
| |
| enum class RegRandomizationStrategy : uint8_t { |
| PickRandomRegs, |
| SingleStaticRegPerOperand, |
| SingleStaticReg, |
| |
| FIRST = PickRandomRegs, |
| LAST = SingleStaticReg, |
| }; |
| |
| } // namespace exegesis |
| |
| template <> struct enum_iteration_traits<exegesis::RegRandomizationStrategy> { |
| static constexpr bool is_iterable = true; |
| }; |
| |
| namespace exegesis { |
| |
| const char *getDescription(RegRandomizationStrategy S) { |
| switch (S) { |
| case RegRandomizationStrategy::PickRandomRegs: |
| return "randomizing registers"; |
| case RegRandomizationStrategy::SingleStaticRegPerOperand: |
| return "one unique register for each position"; |
| case RegRandomizationStrategy::SingleStaticReg: |
| return "reusing the same register for all positions"; |
| } |
| llvm_unreachable("Unknown UseRegRandomizationStrategy enum"); |
| } |
| |
| static std::variant<std::nullopt_t, MCOperand, Register> |
| generateSingleRegisterForInstrAvoidingDefUseOverlap( |
| const LLVMState &State, const BitVector &ForbiddenRegisters, |
| const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases, |
| const BitVector &Uses, const BitVector &Defs, const InstructionTemplate &IT, |
| const Operand &Op, const ArrayRef<InstructionTemplate> Instructions, |
| RegRandomizationStrategy S) { |
| const Instruction &Instr = IT.getInstr(); |
| assert(Op.isReg() && Op.isExplicit() && !Op.isMemory() && |
| !IT.getValueFor(Op).isValid()); |
| assert((!Op.isUse() || !Op.isTied()) && |
| "Not expecting to see a tied use reg"); |
| |
| if (Op.isUse()) { |
| switch (S) { |
| case RegRandomizationStrategy::PickRandomRegs: |
| break; |
| case RegRandomizationStrategy::SingleStaticReg: |
| case RegRandomizationStrategy::SingleStaticRegPerOperand: { |
| if (!Instructions.empty()) |
| return Instructions.front().getValueFor(Op); |
| if (S != RegRandomizationStrategy::SingleStaticReg) |
| break; |
| BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits(); |
| const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses); |
| if (std::optional<int> CommonBit = |
| getFirstCommonBit(PossibleRegisters, UseAliases)) |
| return *CommonBit; |
| break; |
| } |
| } |
| } |
| |
| BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits(); |
| remove(PossibleRegisters, ForbiddenRegisters); |
| |
| if (Op.isDef()) { |
| remove(PossibleRegisters, ImplicitUseAliases); |
| const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses); |
| remove(PossibleRegisters, UseAliases); |
| } |
| |
| if (Op.isUse()) { |
| remove(PossibleRegisters, ImplicitDefAliases); |
| // NOTE: in general, using same reg for multiple Use's is fine. |
| if (S == RegRandomizationStrategy::SingleStaticRegPerOperand) { |
| const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses); |
| remove(PossibleRegisters, UseAliases); |
| } |
| } |
| |
| bool IsDefWithTiedUse = |
| Instr.Variables[Op.getVariableIndex()].hasTiedOperands(); |
| if (Op.isUse() || IsDefWithTiedUse) { |
| // Now, important bit: if we have used some register for def, |
| // then we can not use that same register for *any* use, |
| // be it either an untied use, or an use tied to a def. |
| // But def-ing same regs is fine, as long as there are no uses! |
| const BitVector DefsAliases = getAliasedBits(State.getRegInfo(), Defs); |
| remove(PossibleRegisters, DefsAliases); |
| } |
| |
| if (!PossibleRegisters.any()) |
| return std::nullopt; |
| |
| return randomBit(PossibleRegisters); |
| } |
| |
| static std::optional<InstructionTemplate> |
| generateSingleSnippetForInstrAvoidingDefUseOverlap( |
| const LLVMState &State, const BitVector &ForbiddenRegisters, |
| const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases, |
| BitVector &Uses, BitVector &Defs, InstructionTemplate IT, |
| const ArrayRef<InstructionTemplate> Instructions, |
| RegRandomizationStrategy S) { |
| const Instruction &Instr = IT.getInstr(); |
| for (const Operand &Op : Instr.Operands) { |
| if (!Op.isReg() || !Op.isExplicit() || Op.isMemory() || |
| IT.getValueFor(Op).isValid()) |
| continue; |
| assert((!Op.isUse() || !Op.isTied()) && "Will not get tied uses."); |
| |
| std::variant<std::nullopt_t, MCOperand, Register> R = |
| generateSingleRegisterForInstrAvoidingDefUseOverlap( |
| State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases, |
| Uses, Defs, IT, Op, Instructions, S); |
| |
| if (std::holds_alternative<std::nullopt_t>(R)) |
| return {}; |
| |
| MCOperand MCOp; |
| if (std::holds_alternative<MCOperand>(R)) |
| MCOp = std::get<MCOperand>(R); |
| else { |
| Register RandomReg = std::get<Register>(R); |
| if (Op.isDef()) |
| Defs.set(RandomReg); |
| if (Op.isUse()) |
| Uses.set(RandomReg); |
| MCOp = MCOperand::createReg(RandomReg); |
| } |
| IT.getValueFor(Op) = MCOp; |
| } |
| return IT; |
| } |
| |
| static std::vector<InstructionTemplate> |
| generateSnippetForInstrAvoidingDefUseOverlap( |
| const LLVMState &State, const InstructionTemplate &IT, |
| RegRandomizationStrategy S, const BitVector &ForbiddenRegisters) { |
| // We don't want to accidentally serialize the instruction, |
| // so we must be sure that we don't pick a def that is an implicit use, |
| // or a use that is an implicit def, so record implicit regs now. |
| BitVector ImplicitUses(State.getRegInfo().getNumRegs()); |
| BitVector ImplicitDefs(State.getRegInfo().getNumRegs()); |
| for (const auto &Op : IT.getInstr().Operands) { |
| if (Op.isReg() && Op.isImplicit() && !Op.isMemory()) { |
| assert(Op.isImplicitReg() && "Not an implicit register operand?"); |
| if (Op.isUse()) |
| ImplicitUses.set(Op.getImplicitReg()); |
| else { |
| assert(Op.isDef() && "Not a use and not a def?"); |
| ImplicitDefs.set(Op.getImplicitReg()); |
| } |
| } |
| } |
| const BitVector ImplicitUseAliases = |
| getAliasedBits(State.getRegInfo(), ImplicitUses); |
| const BitVector ImplicitDefAliases = |
| getAliasedBits(State.getRegInfo(), ImplicitDefs); |
| |
| BitVector Defs(State.getRegInfo().getNumRegs()); |
| BitVector Uses(State.getRegInfo().getNumRegs()); |
| std::vector<InstructionTemplate> Instructions; |
| |
| while (true) { |
| std::optional<InstructionTemplate> TmpIT = |
| generateSingleSnippetForInstrAvoidingDefUseOverlap( |
| State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases, |
| Uses, Defs, IT, Instructions, S); |
| if (!TmpIT) |
| return Instructions; |
| Instructions.push_back(std::move(*TmpIT)); |
| if (!hasVariablesWithTiedOperands(IT.getInstr())) |
| return Instructions; |
| assert(Instructions.size() <= 128 && "Stuck in endless loop?"); |
| } |
| } |
| |
| Expected<std::vector<CodeTemplate>> |
| ParallelSnippetGenerator::generateCodeTemplates( |
| InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const { |
| const Instruction &Instr = Variant.getInstr(); |
| CodeTemplate CT; |
| CT.ScratchSpacePointerInReg = |
| Instr.hasMemoryOperands() |
| ? State.getExegesisTarget().getScratchMemoryRegister( |
| State.getTargetMachine().getTargetTriple()) |
| : 0; |
| const AliasingConfigurations SelfAliasing(Instr, Instr, ForbiddenRegisters); |
| if (SelfAliasing.empty()) { |
| CT.Info = "instruction is parallel, repeating a random one."; |
| CT.Instructions.push_back(std::move(Variant)); |
| instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); |
| return getSingleton(std::move(CT)); |
| } |
| if (SelfAliasing.hasImplicitAliasing()) { |
| CT.Info = "instruction is serial, repeating a random one."; |
| CT.Instructions.push_back(std::move(Variant)); |
| instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); |
| return getSingleton(std::move(CT)); |
| } |
| std::vector<CodeTemplate> Result; |
| bool HasTiedOperands = hasVariablesWithTiedOperands(Instr); |
| // If there are no tied operands, then we don't want to "saturate backedge", |
| // and the template we will produce will have only a single instruction. |
| unsigned NumUntiedUseRegs = count_if(Instr.Operands, [](const Operand &Op) { |
| return Op.isReg() && Op.isExplicit() && !Op.isMemory() && Op.isUse() && |
| !Op.isTied(); |
| }); |
| SmallVector<RegRandomizationStrategy, 3> Strategies; |
| if (HasTiedOperands || NumUntiedUseRegs >= 3) |
| Strategies.push_back(RegRandomizationStrategy::PickRandomRegs); |
| if (NumUntiedUseRegs >= 2) |
| Strategies.push_back(RegRandomizationStrategy::SingleStaticRegPerOperand); |
| Strategies.push_back(RegRandomizationStrategy::SingleStaticReg); |
| for (RegRandomizationStrategy S : Strategies) { |
| CodeTemplate CurrCT = CT.clone(); |
| CurrCT.Info = |
| Twine("instruction has ") |
| .concat(HasTiedOperands ? "" : "no ") |
| .concat("tied variables, avoiding " |
| "Read-After-Write issue, picking random def and use " |
| "registers not aliasing each other, for uses, ") |
| .concat(getDescription(S)) |
| .str(); |
| CurrCT.Instructions = generateSnippetForInstrAvoidingDefUseOverlap( |
| State, Variant, S, ForbiddenRegisters); |
| if (CurrCT.Instructions.empty()) |
| return make_error<StringError>( |
| Twine("Failed to produce any snippet via: ").concat(CurrCT.Info), |
| inconvertibleErrorCode()); |
| instantiateMemoryOperands(CurrCT.ScratchSpacePointerInReg, |
| CurrCT.Instructions); |
| Result.push_back(std::move(CurrCT)); |
| } |
| return Result; |
| } |
| |
| constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses; |
| |
| } // namespace exegesis |
| } // namespace llvm |