tools/llvm-exegesis/lib/ParallelSnippetGenerator.cpp - llvm-project/llvm - Git at Google

 //===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

 #include "ParallelSnippetGenerator.h"

 #include "BenchmarkRunner.h"
 #include "MCInstrDescView.h"
 #include "Target.h"

 // FIXME: Load constants into registers (e.g. with fld1) to not break
 // instructions like x87.

 // Ideally we would like the only limitation on executing instructions to be the
 // availability of the CPU resources (e.g. execution ports) needed to execute
 // them, instead of the availability of their data dependencies.

 // To achieve that, one approach is to generate instructions that do not have
 // data dependencies between them.
 //
 // For some instructions, this is trivial:
 //    mov rax, qword ptr [rsi]
 //    mov rax, qword ptr [rsi]
 //    mov rax, qword ptr [rsi]
 //    mov rax, qword ptr [rsi]
 // For the above snippet, haswell just renames rax four times and executes the
 // four instructions two at a time on P23 and P0126.
 //
 // For some instructions, we just need to make sure that the source is
 // different from the destination. For example, IDIV8r reads from GPR and
 // writes to AX. We just need to ensure that the Var is assigned a
 // register which is different from AX:
 //    idiv bx
 //    idiv bx
 //    idiv bx
 //    idiv bx
 // The above snippet will be able to fully saturate the ports, while the same
 // with ax would issue one uop every `latency(IDIV8r)` cycles.
 //
 // Some instructions make this harder because they both read and write from
 // the same register:
 //    inc rax
 //    inc rax
 //    inc rax
 //    inc rax
 // This has a data dependency from each instruction to the next, limit the
 // number of instructions that can be issued in parallel.
 // It turns out that this is not a big issue on recent Intel CPUs because they
 // have heuristics to balance port pressure. In the snippet above, subsequent
 // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
 // might end up executing them all on P0 (just because they can), or try
 // avoiding P5 because it's usually under high pressure from vector
 // instructions.
 // This issue is even more important for high-latency instructions because
 // they increase the idle time of the CPU, e.g. :
 //    imul rax, rbx
 //    imul rax, rbx
 //    imul rax, rbx
 //    imul rax, rbx
 //
 // To avoid that, we do the renaming statically by generating as many
 // independent exclusive assignments as possible (until all possible registers
 // are exhausted) e.g.:
 //    imul rax, rbx
 //    imul rcx, rbx
 //    imul rdx, rbx
 //    imul r8,  rbx
 //
 // Some instruction even make the above static renaming impossible because
 // they implicitly read and write from the same operand, e.g. ADC16rr reads
 // and writes from EFLAGS.
 // In that case we just use a greedy register assignment and hope for the
 // best.

 namespace llvm {
 namespace exegesis {

 static bool hasVariablesWithTiedOperands(const Instruction &Instr) {
   SmallVector<const Variable *, 8> Result;
   for (const auto &Var : Instr.Variables)
     if (Var.hasTiedOperands())
       return true;
   return false;
 }

 ParallelSnippetGenerator::~ParallelSnippetGenerator() = default;

 void ParallelSnippetGenerator::instantiateMemoryOperands(
     const unsigned ScratchSpacePointerInReg,
     std::vector<InstructionTemplate> &Instructions) const {
   if (ScratchSpacePointerInReg == 0)
     return; // no memory operands.
   const auto &ET = State.getExegesisTarget();
   const unsigned MemStep = ET.getMaxMemoryAccessSize();
   const size_t OriginalInstructionsSize = Instructions.size();
   size_t I = 0;
   for (InstructionTemplate &IT : Instructions) {
     ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
     ++I;
   }

   while (Instructions.size() < kMinNumDifferentAddresses) {
     InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
     ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
     ++I;
     Instructions.push_back(std::move(IT));
   }
   assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
          "not enough scratch space");
 }

 enum class RegRandomizationStrategy : uint8_t {
   PickRandomRegs,
   SingleStaticRegPerOperand,
   SingleStaticReg,

   FIRST = PickRandomRegs,
   LAST = SingleStaticReg,
 };

 } // namespace exegesis

 template <> struct enum_iteration_traits<exegesis::RegRandomizationStrategy> {
   static constexpr bool is_iterable = true;
 };

 namespace exegesis {

 const char *getDescription(RegRandomizationStrategy S) {
   switch (S) {
   case RegRandomizationStrategy::PickRandomRegs:
     return "randomizing registers";
   case RegRandomizationStrategy::SingleStaticRegPerOperand:
     return "one unique register for each position";
   case RegRandomizationStrategy::SingleStaticReg:
     return "reusing the same register for all positions";
   }
   llvm_unreachable("Unknown UseRegRandomizationStrategy enum");
 }

 static std::variant<std::nullopt_t, MCOperand, Register>
 generateSingleRegisterForInstrAvoidingDefUseOverlap(
     const LLVMState &State, const BitVector &ForbiddenRegisters,
     const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
     const BitVector &Uses, const BitVector &Defs, const InstructionTemplate &IT,
     const Operand &Op, const ArrayRef<InstructionTemplate> Instructions,
     RegRandomizationStrategy S) {
   const Instruction &Instr = IT.getInstr();
   assert(Op.isReg() && Op.isExplicit() && !Op.isMemory() &&
          !IT.getValueFor(Op).isValid());
   assert((!Op.isUse() || !Op.isTied()) &&
          "Not expecting to see a tied use reg");

   if (Op.isUse()) {
     switch (S) {
     case RegRandomizationStrategy::PickRandomRegs:
       break;
     case RegRandomizationStrategy::SingleStaticReg:
     case RegRandomizationStrategy::SingleStaticRegPerOperand: {
       if (!Instructions.empty())
         return Instructions.front().getValueFor(Op);
       if (S != RegRandomizationStrategy::SingleStaticReg)
         break;
       BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
       const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
       if (std::optional<int> CommonBit =
               getFirstCommonBit(PossibleRegisters, UseAliases))
         return *CommonBit;
       break;
     }
     }
   }

   BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
   remove(PossibleRegisters, ForbiddenRegisters);

   if (Op.isDef()) {
     remove(PossibleRegisters, ImplicitUseAliases);
     const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
     remove(PossibleRegisters, UseAliases);
   }

   if (Op.isUse()) {
     remove(PossibleRegisters, ImplicitDefAliases);
     // NOTE: in general, using same reg for multiple Use's is fine.
     if (S == RegRandomizationStrategy::SingleStaticRegPerOperand) {
       const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
       remove(PossibleRegisters, UseAliases);
     }
   }

   bool IsDefWithTiedUse =
       Instr.Variables[Op.getVariableIndex()].hasTiedOperands();
   if (Op.isUse() || IsDefWithTiedUse) {
     // Now, important bit: if we have used some register for def,
     // then we can not use that same register for *any* use,
     // be it either an untied use, or an use tied to a def.
     // But def-ing same regs is fine, as long as there are no uses!
     const BitVector DefsAliases = getAliasedBits(State.getRegInfo(), Defs);
     remove(PossibleRegisters, DefsAliases);
   }

   if (!PossibleRegisters.any())
     return std::nullopt;

   return randomBit(PossibleRegisters);
 }

 static std::optional<InstructionTemplate>
 generateSingleSnippetForInstrAvoidingDefUseOverlap(
     const LLVMState &State, const BitVector &ForbiddenRegisters,
     const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
     BitVector &Uses, BitVector &Defs, InstructionTemplate IT,
     const ArrayRef<InstructionTemplate> Instructions,
     RegRandomizationStrategy S) {
   const Instruction &Instr = IT.getInstr();
   for (const Operand &Op : Instr.Operands) {
     if (!Op.isReg() || !Op.isExplicit() || Op.isMemory() ||
         IT.getValueFor(Op).isValid())
       continue;
     assert((!Op.isUse() || !Op.isTied()) && "Will not get tied uses.");

     std::variant<std::nullopt_t, MCOperand, Register> R =
         generateSingleRegisterForInstrAvoidingDefUseOverlap(
             State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
             Uses, Defs, IT, Op, Instructions, S);

     if (std::holds_alternative<std::nullopt_t>(R))
       return {};

     MCOperand MCOp;
     if (std::holds_alternative<MCOperand>(R))
       MCOp = std::get<MCOperand>(R);
     else {
       Register RandomReg = std::get<Register>(R);
       if (Op.isDef())
         Defs.set(RandomReg);
       if (Op.isUse())
         Uses.set(RandomReg);
       MCOp = MCOperand::createReg(RandomReg);
     }
     IT.getValueFor(Op) = MCOp;
   }
   return IT;
 }

 static std::vector<InstructionTemplate>
 generateSnippetForInstrAvoidingDefUseOverlap(
     const LLVMState &State, const InstructionTemplate &IT,
     RegRandomizationStrategy S, const BitVector &ForbiddenRegisters) {
   // We don't want to accidentally serialize the instruction,
   // so we must be sure that we don't pick a def that is an implicit use,
   // or a use that is an implicit def, so record implicit regs now.
   BitVector ImplicitUses(State.getRegInfo().getNumRegs());
   BitVector ImplicitDefs(State.getRegInfo().getNumRegs());
   for (const auto &Op : IT.getInstr().Operands) {
     if (Op.isReg() && Op.isImplicit() && !Op.isMemory()) {
       assert(Op.isImplicitReg() && "Not an implicit register operand?");
       if (Op.isUse())
         ImplicitUses.set(Op.getImplicitReg());
       else {
         assert(Op.isDef() && "Not a use and not a def?");
         ImplicitDefs.set(Op.getImplicitReg());
       }
     }
   }
   const BitVector ImplicitUseAliases =
       getAliasedBits(State.getRegInfo(), ImplicitUses);
   const BitVector ImplicitDefAliases =
       getAliasedBits(State.getRegInfo(), ImplicitDefs);

   BitVector Defs(State.getRegInfo().getNumRegs());
   BitVector Uses(State.getRegInfo().getNumRegs());
   std::vector<InstructionTemplate> Instructions;

   while (true) {
     std::optional<InstructionTemplate> TmpIT =
         generateSingleSnippetForInstrAvoidingDefUseOverlap(
             State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
             Uses, Defs, IT, Instructions, S);
     if (!TmpIT)
       return Instructions;
     Instructions.push_back(std::move(*TmpIT));
     if (!hasVariablesWithTiedOperands(IT.getInstr()))
       return Instructions;
     assert(Instructions.size() <= 128 && "Stuck in endless loop?");
   }
 }

 Expected<std::vector<CodeTemplate>>
 ParallelSnippetGenerator::generateCodeTemplates(
     InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
   const Instruction &Instr = Variant.getInstr();
   CodeTemplate CT;
   CT.ScratchSpacePointerInReg =
       Instr.hasMemoryOperands()
           ? State.getExegesisTarget().getScratchMemoryRegister(
                 State.getTargetMachine().getTargetTriple())
           : 0;
   const AliasingConfigurations SelfAliasing(Instr, Instr, ForbiddenRegisters);
   if (SelfAliasing.empty()) {
     CT.Info = "instruction is parallel, repeating a random one.";
     CT.Instructions.push_back(std::move(Variant));
     instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
     return getSingleton(std::move(CT));
   }
   if (SelfAliasing.hasImplicitAliasing()) {
     CT.Info = "instruction is serial, repeating a random one.";
     CT.Instructions.push_back(std::move(Variant));
     instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
     return getSingleton(std::move(CT));
   }
   std::vector<CodeTemplate> Result;
   bool HasTiedOperands = hasVariablesWithTiedOperands(Instr);
   // If there are no tied operands, then we don't want to "saturate backedge",
   // and the template we will produce will have only a single instruction.
   unsigned NumUntiedUseRegs = count_if(Instr.Operands, [](const Operand &Op) {
     return Op.isReg() && Op.isExplicit() && !Op.isMemory() && Op.isUse() &&
            !Op.isTied();
   });
   SmallVector<RegRandomizationStrategy, 3> Strategies;
   if (HasTiedOperands || NumUntiedUseRegs >= 3)
     Strategies.push_back(RegRandomizationStrategy::PickRandomRegs);
   if (NumUntiedUseRegs >= 2)
     Strategies.push_back(RegRandomizationStrategy::SingleStaticRegPerOperand);
   Strategies.push_back(RegRandomizationStrategy::SingleStaticReg);
   for (RegRandomizationStrategy S : Strategies) {
     CodeTemplate CurrCT = CT.clone();
     CurrCT.Info =
         Twine("instruction has ")
             .concat(HasTiedOperands ? "" : "no ")
             .concat("tied variables, avoiding "
                     "Read-After-Write issue, picking random def and use "
                     "registers not aliasing each other, for uses, ")
             .concat(getDescription(S))
             .str();
     CurrCT.Instructions = generateSnippetForInstrAvoidingDefUseOverlap(
         State, Variant, S, ForbiddenRegisters);
     if (CurrCT.Instructions.empty())
       return make_error<StringError>(
           Twine("Failed to produce any snippet via: ").concat(CurrCT.Info),
           inconvertibleErrorCode());
     instantiateMemoryOperands(CurrCT.ScratchSpacePointerInReg,
                               CurrCT.Instructions);
     Result.push_back(std::move(CurrCT));
   }
   return Result;
 }

 constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses;

 } // namespace exegesis
 } // namespace llvm
	//===-- ParallelSnippetGenerator.cpp ----------------------------- C++ --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#include "ParallelSnippetGenerator.h"

	#include "BenchmarkRunner.h"
	#include "MCInstrDescView.h"
	#include "Target.h"

	// FIXME: Load constants into registers (e.g. with fld1) to not break
	// instructions like x87.

	// Ideally we would like the only limitation on executing instructions to be the
	// availability of the CPU resources (e.g. execution ports) needed to execute
	// them, instead of the availability of their data dependencies.

	// To achieve that, one approach is to generate instructions that do not have
	// data dependencies between them.
	//
	// For some instructions, this is trivial:
	// mov rax, qword ptr [rsi]
	// mov rax, qword ptr [rsi]
	// mov rax, qword ptr [rsi]
	// mov rax, qword ptr [rsi]
	// For the above snippet, haswell just renames rax four times and executes the
	// four instructions two at a time on P23 and P0126.
	//
	// For some instructions, we just need to make sure that the source is
	// different from the destination. For example, IDIV8r reads from GPR and
	// writes to AX. We just need to ensure that the Var is assigned a
	// register which is different from AX:
	// idiv bx
	// idiv bx
	// idiv bx
	// idiv bx
	// The above snippet will be able to fully saturate the ports, while the same
	// with ax would issue one uop every `latency(IDIV8r)` cycles.
	//
	// Some instructions make this harder because they both read and write from
	// the same register:
	// inc rax
	// inc rax
	// inc rax
	// inc rax
	// This has a data dependency from each instruction to the next, limit the
	// number of instructions that can be issued in parallel.
	// It turns out that this is not a big issue on recent Intel CPUs because they
	// have heuristics to balance port pressure. In the snippet above, subsequent
	// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
	// might end up executing them all on P0 (just because they can), or try
	// avoiding P5 because it's usually under high pressure from vector
	// instructions.
	// This issue is even more important for high-latency instructions because
	// they increase the idle time of the CPU, e.g. :
	// imul rax, rbx
	// imul rax, rbx
	// imul rax, rbx
	// imul rax, rbx
	//
	// To avoid that, we do the renaming statically by generating as many
	// independent exclusive assignments as possible (until all possible registers
	// are exhausted) e.g.:
	// imul rax, rbx
	// imul rcx, rbx
	// imul rdx, rbx
	// imul r8, rbx
	//
	// Some instruction even make the above static renaming impossible because
	// they implicitly read and write from the same operand, e.g. ADC16rr reads
	// and writes from EFLAGS.
	// In that case we just use a greedy register assignment and hope for the
	// best.

	namespace llvm {
	namespace exegesis {

	static bool hasVariablesWithTiedOperands(const Instruction &Instr) {
	SmallVector<const Variable *, 8> Result;
	for (const auto &Var : Instr.Variables)
	if (Var.hasTiedOperands())
	return true;
	return false;
	}

	ParallelSnippetGenerator::~ParallelSnippetGenerator() = default;

	void ParallelSnippetGenerator::instantiateMemoryOperands(
	const unsigned ScratchSpacePointerInReg,
	std::vector<InstructionTemplate> &Instructions) const {
	if (ScratchSpacePointerInReg == 0)
	return; // no memory operands.
	const auto &ET = State.getExegesisTarget();
	const unsigned MemStep = ET.getMaxMemoryAccessSize();
	const size_t OriginalInstructionsSize = Instructions.size();
	size_t I = 0;
	for (InstructionTemplate &IT : Instructions) {
	ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
	++I;
	}

	while (Instructions.size() < kMinNumDifferentAddresses) {
	InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
	ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
	++I;
	Instructions.push_back(std::move(IT));
	}
	assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
	"not enough scratch space");
	}

	enum class RegRandomizationStrategy : uint8_t {
	PickRandomRegs,
	SingleStaticRegPerOperand,
	SingleStaticReg,

	FIRST = PickRandomRegs,
	LAST = SingleStaticReg,
	};

	} // namespace exegesis

	template <> struct enum_iteration_traits<exegesis::RegRandomizationStrategy> {
	static constexpr bool is_iterable = true;
	};

	namespace exegesis {

	const char *getDescription(RegRandomizationStrategy S) {
	switch (S) {
	case RegRandomizationStrategy::PickRandomRegs:
	return "randomizing registers";
	case RegRandomizationStrategy::SingleStaticRegPerOperand:
	return "one unique register for each position";
	case RegRandomizationStrategy::SingleStaticReg:
	return "reusing the same register for all positions";
	}
	llvm_unreachable("Unknown UseRegRandomizationStrategy enum");
	}

	static std::variant<std::nullopt_t, MCOperand, Register>
	generateSingleRegisterForInstrAvoidingDefUseOverlap(
	const LLVMState &State, const BitVector &ForbiddenRegisters,
	const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
	const BitVector &Uses, const BitVector &Defs, const InstructionTemplate &IT,
	const Operand &Op, const ArrayRef<InstructionTemplate> Instructions,
	RegRandomizationStrategy S) {
	const Instruction &Instr = IT.getInstr();
	assert(Op.isReg() && Op.isExplicit() && !Op.isMemory() &&
	!IT.getValueFor(Op).isValid());
	assert((!Op.isUse() \|\| !Op.isTied()) &&
	"Not expecting to see a tied use reg");

	if (Op.isUse()) {
	switch (S) {
	case RegRandomizationStrategy::PickRandomRegs:
	break;
	case RegRandomizationStrategy::SingleStaticReg:
	case RegRandomizationStrategy::SingleStaticRegPerOperand: {
	if (!Instructions.empty())
	return Instructions.front().getValueFor(Op);
	if (S != RegRandomizationStrategy::SingleStaticReg)
	break;
	BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
	const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
	if (std::optional<int> CommonBit =
	getFirstCommonBit(PossibleRegisters, UseAliases))
	return *CommonBit;
	break;
	}
	}
	}

	BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
	remove(PossibleRegisters, ForbiddenRegisters);

	if (Op.isDef()) {
	remove(PossibleRegisters, ImplicitUseAliases);
	const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
	remove(PossibleRegisters, UseAliases);
	}

	if (Op.isUse()) {
	remove(PossibleRegisters, ImplicitDefAliases);
	// NOTE: in general, using same reg for multiple Use's is fine.
	if (S == RegRandomizationStrategy::SingleStaticRegPerOperand) {
	const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
	remove(PossibleRegisters, UseAliases);
	}
	}

	bool IsDefWithTiedUse =
	Instr.Variables[Op.getVariableIndex()].hasTiedOperands();
	if (Op.isUse() \|\| IsDefWithTiedUse) {
	// Now, important bit: if we have used some register for def,
	// then we can not use that same register for any use,
	// be it either an untied use, or an use tied to a def.
	// But def-ing same regs is fine, as long as there are no uses!
	const BitVector DefsAliases = getAliasedBits(State.getRegInfo(), Defs);
	remove(PossibleRegisters, DefsAliases);
	}

	if (!PossibleRegisters.any())
	return std::nullopt;

	return randomBit(PossibleRegisters);
	}

	static std::optional<InstructionTemplate>
	generateSingleSnippetForInstrAvoidingDefUseOverlap(
	const LLVMState &State, const BitVector &ForbiddenRegisters,
	const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
	BitVector &Uses, BitVector &Defs, InstructionTemplate IT,
	const ArrayRef<InstructionTemplate> Instructions,
	RegRandomizationStrategy S) {
	const Instruction &Instr = IT.getInstr();
	for (const Operand &Op : Instr.Operands) {
	if (!Op.isReg() \|\| !Op.isExplicit() \|\| Op.isMemory() \|\|
	IT.getValueFor(Op).isValid())
	continue;
	assert((!Op.isUse() \|\| !Op.isTied()) && "Will not get tied uses.");

	std::variant<std::nullopt_t, MCOperand, Register> R =
	generateSingleRegisterForInstrAvoidingDefUseOverlap(
	State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
	Uses, Defs, IT, Op, Instructions, S);

	if (std::holds_alternative<std::nullopt_t>(R))
	return {};

	MCOperand MCOp;
	if (std::holds_alternative<MCOperand>(R))
	MCOp = std::get<MCOperand>(R);
	else {
	Register RandomReg = std::get<Register>(R);
	if (Op.isDef())
	Defs.set(RandomReg);
	if (Op.isUse())
	Uses.set(RandomReg);
	MCOp = MCOperand::createReg(RandomReg);
	}
	IT.getValueFor(Op) = MCOp;
	}
	return IT;
	}

	static std::vector<InstructionTemplate>
	generateSnippetForInstrAvoidingDefUseOverlap(
	const LLVMState &State, const InstructionTemplate &IT,
	RegRandomizationStrategy S, const BitVector &ForbiddenRegisters) {
	// We don't want to accidentally serialize the instruction,
	// so we must be sure that we don't pick a def that is an implicit use,
	// or a use that is an implicit def, so record implicit regs now.
	BitVector ImplicitUses(State.getRegInfo().getNumRegs());
	BitVector ImplicitDefs(State.getRegInfo().getNumRegs());
	for (const auto &Op : IT.getInstr().Operands) {
	if (Op.isReg() && Op.isImplicit() && !Op.isMemory()) {
	assert(Op.isImplicitReg() && "Not an implicit register operand?");
	if (Op.isUse())
	ImplicitUses.set(Op.getImplicitReg());
	else {
	assert(Op.isDef() && "Not a use and not a def?");
	ImplicitDefs.set(Op.getImplicitReg());
	}
	}
	}
	const BitVector ImplicitUseAliases =
	getAliasedBits(State.getRegInfo(), ImplicitUses);
	const BitVector ImplicitDefAliases =
	getAliasedBits(State.getRegInfo(), ImplicitDefs);

	BitVector Defs(State.getRegInfo().getNumRegs());
	BitVector Uses(State.getRegInfo().getNumRegs());
	std::vector<InstructionTemplate> Instructions;

	while (true) {
	std::optional<InstructionTemplate> TmpIT =
	generateSingleSnippetForInstrAvoidingDefUseOverlap(
	State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
	Uses, Defs, IT, Instructions, S);
	if (!TmpIT)
	return Instructions;
	Instructions.push_back(std::move(*TmpIT));
	if (!hasVariablesWithTiedOperands(IT.getInstr()))
	return Instructions;
	assert(Instructions.size() <= 128 && "Stuck in endless loop?");
	}
	}

	Expected<std::vector<CodeTemplate>>
	ParallelSnippetGenerator::generateCodeTemplates(
	InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
	const Instruction &Instr = Variant.getInstr();
	CodeTemplate CT;
	CT.ScratchSpacePointerInReg =
	Instr.hasMemoryOperands()
	? State.getExegesisTarget().getScratchMemoryRegister(
	State.getTargetMachine().getTargetTriple())
	: 0;
	const AliasingConfigurations SelfAliasing(Instr, Instr, ForbiddenRegisters);
	if (SelfAliasing.empty()) {
	CT.Info = "instruction is parallel, repeating a random one.";
	CT.Instructions.push_back(std::move(Variant));
	instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
	return getSingleton(std::move(CT));
	}
	if (SelfAliasing.hasImplicitAliasing()) {
	CT.Info = "instruction is serial, repeating a random one.";
	CT.Instructions.push_back(std::move(Variant));
	instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
	return getSingleton(std::move(CT));
	}
	std::vector<CodeTemplate> Result;
	bool HasTiedOperands = hasVariablesWithTiedOperands(Instr);
	// If there are no tied operands, then we don't want to "saturate backedge",
	// and the template we will produce will have only a single instruction.
	unsigned NumUntiedUseRegs = count_if(Instr.Operands, [](const Operand &Op) {
	return Op.isReg() && Op.isExplicit() && !Op.isMemory() && Op.isUse() &&
	!Op.isTied();
	});
	SmallVector<RegRandomizationStrategy, 3> Strategies;
	if (HasTiedOperands \|\| NumUntiedUseRegs >= 3)
	Strategies.push_back(RegRandomizationStrategy::PickRandomRegs);
	if (NumUntiedUseRegs >= 2)
	Strategies.push_back(RegRandomizationStrategy::SingleStaticRegPerOperand);
	Strategies.push_back(RegRandomizationStrategy::SingleStaticReg);
	for (RegRandomizationStrategy S : Strategies) {
	CodeTemplate CurrCT = CT.clone();
	CurrCT.Info =
	Twine("instruction has ")
	.concat(HasTiedOperands ? "" : "no ")
	.concat("tied variables, avoiding "
	"Read-After-Write issue, picking random def and use "
	"registers not aliasing each other, for uses, ")
	.concat(getDescription(S))
	.str();
	CurrCT.Instructions = generateSnippetForInstrAvoidingDefUseOverlap(
	State, Variant, S, ForbiddenRegisters);
	if (CurrCT.Instructions.empty())
	return make_error<StringError>(
	Twine("Failed to produce any snippet via: ").concat(CurrCT.Info),
	inconvertibleErrorCode());
	instantiateMemoryOperands(CurrCT.ScratchSpacePointerInReg,
	CurrCT.Instructions);
	Result.push_back(std::move(CurrCT));
	}
	return Result;
	}

	constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses;

	} // namespace exegesis
	} // namespace llvm