llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp - llvm-project - Git at Google

 //===------------------ AMDGPUCustomBehaviour.cpp ---------------*-C++ -* -===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 /// \file
 ///
 /// This file implements methods from the AMDGPUCustomBehaviour class.
 ///
 //===----------------------------------------------------------------------===//

 #include "AMDGPUCustomBehaviour.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIInstrInfo.h"
 #include "TargetInfo/AMDGPUTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/WithColor.h"

 namespace llvm {
 namespace mca {

 /// Hook run on an mca::Instruction after it has been lowered from an MCInst.
 ///
 /// For every flavour of s_waitcnt listed below, the operands of \p MCI are
 /// attached to \p Inst through processWaitCnt(). Any other opcode leaves
 /// \p Inst exactly as it was.
 void AMDGPUInstrPostProcess::postProcessInstruction(
     std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
   switch (MCI.getOpcode()) {
   default:
     // Not part of the s_waitcnt family: nothing to add.
     break;
   case AMDGPU::S_WAITCNT:
   case AMDGPU::S_WAITCNT_EXPCNT:
   case AMDGPU::S_WAITCNT_LGKMCNT:
   case AMDGPU::S_WAITCNT_VMCNT:
   case AMDGPU::S_WAITCNT_VSCNT:
   case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
   case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
   case AMDGPU::S_WAITCNT_VMCNT_gfx10:
   case AMDGPU::S_WAITCNT_VSCNT_gfx10:
   case AMDGPU::S_WAITCNT_gfx10:
   case AMDGPU::S_WAITCNT_gfx6_gfx7:
   case AMDGPU::S_WAITCNT_vi:
     processWaitCnt(Inst, MCI);
     break;
   }
 }

 // s_waitcnt instructions encode important information as immediate operands
 // which are lost during the MCInst -> mca::Instruction lowering.
 void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
                                             const MCInst &MCI) {
   for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
     MCAOperand Op;
     const MCOperand &MCOp = MCI.getOperand(Idx);
     if (MCOp.isReg()) {
       Op = MCAOperand::createReg(MCOp.getReg());
     } else if (MCOp.isImm()) {
       Op = MCAOperand::createImm(MCOp.getImm());
     }
     Op.setIndex(Idx);
     Inst->addOperand(Op);
   }
 }

 /// Constructs the AMDGPU hazard model.
 ///
 /// \p STI, \p SrcMgr and \p MCII are forwarded unchanged to the
 /// CustomBehaviour base class. The per-instruction wait-counter table is then
 /// filled in by generateWaitCntInfo().
 AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                                              const mca::SourceMgr &SrcMgr,
                                              const MCInstrInfo &MCII)
     : CustomBehaviour(STI, SrcMgr, MCII) {
   // Runs in the constructor body, i.e. after the base class has been
   // initialised with the arguments above.
   generateWaitCntInfo();
 }

 /// Reports a target-specific hazard for the instruction \p IR.
 ///
 /// Only the s_waitcnt family is modelled: for those opcodes the result of
 /// handleWaitCnt() is returned. Every other opcode yields 0.
 ///
 /// \param IssuedInst instructions forwarded to handleWaitCnt() as the set of
 ///        currently executing instructions.
 /// \param IR the instruction being checked.
 unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
                                                   const InstRef &IR) {
   // llvm-mca is generally run on fully compiled assembly, so pseudo
   // instructions are not expected here. There are plans to make mca usable
   // from within backend passes, though, which is why the pseudo variants of
   // s_waitcnt are listed as well.
   switch (IR.getInstruction()->getOpcode()) {
   // Pseudo variants.
   case AMDGPU::S_WAITCNT:
   case AMDGPU::S_WAITCNT_EXPCNT:
   case AMDGPU::S_WAITCNT_LGKMCNT:
   case AMDGPU::S_WAITCNT_VMCNT:
   case AMDGPU::S_WAITCNT_VSCNT:
   // Real encodings.
   case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
   case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
   case AMDGPU::S_WAITCNT_VMCNT_gfx10:
   case AMDGPU::S_WAITCNT_VSCNT_gfx10:
   case AMDGPU::S_WAITCNT_gfx10:
   case AMDGPU::S_WAITCNT_gfx6_gfx7:
   case AMDGPU::S_WAITCNT_vi:
     // s_endpgm also behaves as if it were preceded by an implicit
     // "s_waitcnt 0". It is unclear whether modelling that would be
     // appropriate given how llvm-mca replays the same code over and over,
     // so it is deliberately left out.
     return handleWaitCnt(IssuedInst, IR);
   default:
     break;
   }

   // Anything that is not a wait instruction never stalls here.
   return 0;
 }

 /// Works out whether the wait instruction \p IR still has to stall.
 ///
 /// The thresholds encoded in \p IR are obtained from computeWaitCnt() and
 /// compared with the number of instructions in \p IssuedInst that are flagged
 /// for each counter in InstrWaitCntInfo.
 ///
 /// All s_waitcnt instructions are handled except s_waitcnt_depctr, which is
 /// not modelled.
 ///
 /// \returns 0 if no counter exceeds its threshold. Otherwise the smallest
 /// number of cycles left among the instructions feeding a counter that does
 /// exceed its threshold.
 unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
                                               const InstRef &IR) {
   // Sentinel for "no instruction seen yet" / "no need to wait".
   const unsigned NoWait = ~0U;

   // Thresholds start at their maximum values; computeWaitCnt() overwrites the
   // ones the instruction actually encodes.
   unsigned Vmcnt = 63;
   unsigned Expcnt = 7;
   unsigned Lgkmcnt = 31;
   unsigned Vscnt = 63;
   computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);

   // Per counter: how many in-flight instructions feed it, and how soon the
   // first of them finishes.
   unsigned PendingVm = 0, SoonestVm = NoWait;
   unsigned PendingExp = 0, SoonestExp = NoWait;
   unsigned PendingLgkm = 0, SoonestLgkm = NoWait;
   unsigned PendingVs = 0, SoonestVs = NoWait;

   const auto Record = [](bool FeedsCounter, unsigned Remaining,
                          unsigned &Pending, unsigned &Soonest) {
     if (!FeedsCounter)
       return;
     ++Pending;
     if (Remaining < Soonest)
       Soonest = Remaining;
   };

   // Look at each currently executing instruction to find out whether this
   // wait instruction still needs to wait.
   for (const InstRef &Issued : IssuedInst) {
     const unsigned SourceIdx = Issued.getSourceIndex() % SrcMgr.size();
     const WaitCntInfo &Counters = InstrWaitCntInfo[SourceIdx];
     const int CyclesLeft = Issued.getInstruction()->getCyclesLeft();
     assert(CyclesLeft != UNKNOWN_CYCLES &&
            "We should know how many cycles are left for this instruction");
     const unsigned Remaining = static_cast<unsigned>(CyclesLeft);

     Record(Counters.VmCnt, Remaining, PendingVm, SoonestVm);
     Record(Counters.ExpCnt, Remaining, PendingExp, SoonestExp);
     Record(Counters.LgkmCnt, Remaining, PendingLgkm, SoonestLgkm);
     Record(Counters.VsCnt, Remaining, PendingVs, SoonestVs);
   }

   unsigned CyclesToWait = NoWait;
   const auto Consider = [&CyclesToWait](unsigned Pending, unsigned Threshold,
                                         unsigned Soonest) {
     if (Pending > Threshold && Soonest < CyclesToWait)
       CyclesToWait = Soonest;
   };
   Consider(PendingVm, Vmcnt, SoonestVm);
   Consider(PendingExp, Expcnt, SoonestExp);
   Consider(PendingLgkm, Lgkmcnt, SoonestLgkm);
   Consider(PendingVs, Vscnt, SoonestVs);

   // The result may underestimate the real stall, which is harmless: the
   // return value only says how many cycles pass until this function runs
   // again. As long as the wait is never overestimated, the instruction still
   // ends up stalling for the correct number of cycles.
   return CyclesToWait == NoWait ? 0 : CyclesToWait;
 }

 void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
                                            unsigned &Expcnt, unsigned &Lgkmcnt,
                                            unsigned &Vscnt) {
   AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
   const Instruction &Inst = *IR.getInstruction();
   unsigned Opcode = Inst.getOpcode();

   switch (Opcode) {
   case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
   case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
   case AMDGPU::S_WAITCNT_VMCNT_gfx10:
   case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
     // Should probably be checking for nullptr
     // here, but I'm not sure how I should handle the case
     // where we see a nullptr.
     const MCAOperand *OpReg = Inst.getOperand(0);
     const MCAOperand *OpImm = Inst.getOperand(1);
     assert(OpReg && OpReg->isReg() && "First operand should be a register.");
     assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
     if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
       // Instruction is using a real register.
       // Since we can't know what value this register will have,
       // we can't compute what the value of this wait should be.
       WithColor::warning() << "The register component of "
                            << MCII.getName(Opcode) << " will be completely "
                            << "ignored. So the wait may not be accurate.\n";
     }
     switch (Opcode) {
     // Redundant switch so I don't have to repeat the code above
     // for each case. There are more clever ways to avoid this
     // extra switch and anyone can feel free to implement one of them.
     case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
       Expcnt = OpImm->getImm();
       break;
     case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
       Lgkmcnt = OpImm->getImm();
       break;
     case AMDGPU::S_WAITCNT_VMCNT_gfx10:
       Vmcnt = OpImm->getImm();
       break;
     case AMDGPU::S_WAITCNT_VSCNT_gfx10:
       Vscnt = OpImm->getImm();
       break;
     }
     return;
   }
   case AMDGPU::S_WAITCNT_gfx10:
   case AMDGPU::S_WAITCNT_gfx6_gfx7:
   case AMDGPU::S_WAITCNT_vi:
     unsigned WaitCnt = Inst.getOperand(0)->getImm();
     AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
     return;
   }
 }

 void AMDGPUCustomBehaviour::generateWaitCntInfo() {
   // The core logic from this function is taken from
   // SIInsertWaitcnts::updateEventWaitcntAfter() In that pass, the instructions
   // that are being looked at are in the MachineInstr format, whereas we have
   // access to the MCInst format. The side effects of this are that we can't use
   // the mayAccessVMEMThroughFlat(Inst) or mayAccessLDSThroughFlat(Inst)
   // functions. Therefore, we conservatively assume that these functions will
   // return true. This may cause a few instructions to be incorrectly tagged
   // with an extra CNT. However, these are instructions that do interact with at
   // least one CNT so giving them an extra CNT shouldn't cause issues in most
   // scenarios.
   AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
   InstrWaitCntInfo.resize(SrcMgr.size());

   int Index = 0;
   for (auto I = SrcMgr.begin(), E = SrcMgr.end(); I != E; ++I, ++Index) {
     const std::unique_ptr<Instruction> &Inst = *I;
     unsigned Opcode = Inst->getOpcode();
     const MCInstrDesc &MCID = MCII.get(Opcode);
     if ((MCID.TSFlags & SIInstrFlags::DS) &&
         (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
       InstrWaitCntInfo[Index].LgkmCnt = true;
       if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
         InstrWaitCntInfo[Index].ExpCnt = true;
     } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
       // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
       // and mayAccessLDSThroughFlat(Inst) would both return true for this
       // instruction. We have to do this because those functions use
       // information about the memory operands that we don't have access to.
       InstrWaitCntInfo[Index].LgkmCnt = true;
       if (!STI.hasFeature(AMDGPU::FeatureVscnt))
         InstrWaitCntInfo[Index].VmCnt = true;
       else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
         InstrWaitCntInfo[Index].VmCnt = true;
       else
         InstrWaitCntInfo[Index].VsCnt = true;
     } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
       if (!STI.hasFeature(AMDGPU::FeatureVscnt))
         InstrWaitCntInfo[Index].VmCnt = true;
       else if ((MCID.mayLoad() &&
                 !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
                ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
                 !MCID.mayStore()))
         InstrWaitCntInfo[Index].VmCnt = true;
       else if (MCID.mayStore())
         InstrWaitCntInfo[Index].VsCnt = true;

       // (IV.Major < 7) is meant to represent
       // GCNTarget.vmemWriteNeedsExpWaitcnt()
       // which is defined as
       // { return getGeneration() < SEA_ISLANDS; }
       if (IV.Major < 7 &&
           (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
         InstrWaitCntInfo[Index].ExpCnt = true;
     } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
       InstrWaitCntInfo[Index].LgkmCnt = true;
     } else if (MCID.TSFlags & SIInstrFlags::EXP) {
       InstrWaitCntInfo[Index].ExpCnt = true;
     } else {
       switch (Opcode) {
       case AMDGPU::S_SENDMSG:
       case AMDGPU::S_SENDMSGHALT:
       case AMDGPU::S_MEMTIME:
       case AMDGPU::S_MEMREALTIME:
         InstrWaitCntInfo[Index].LgkmCnt = true;
         break;
       }
     }
   }
 }

 /// Returns true if \p MCID has any of the MUBUF, MTBUF or MIMG bits set in its
 /// TSFlags. Taken from SIInstrInfo::isVMEM().
 bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
   const auto VMEMMask =
       SIInstrFlags::MUBUF | SIInstrFlags::MTBUF | SIInstrFlags::MIMG;
   return (MCID.TSFlags & VMEMMask) != 0;
 }

 // taken from SIInstrInfo::hasModifiersSet()
 bool AMDGPUCustomBehaviour::hasModifiersSet(
     const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
   int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
   if (Idx == -1)
     return false;

   const MCAOperand *Op = Inst->getOperand(Idx);
   if (Op == nullptr || !Op->isImm() || !Op->getImm())
     return false;

   return true;
 }

 /// Returns true if \p Opcode is one of the DS opcodes listed below
 /// (DS_ORDERED_COUNT and the DS_GWS_* group). Taken from
 /// SIInstrInfo::isAlwaysGDS().
 bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
   switch (Opcode) {
   case AMDGPU::DS_ORDERED_COUNT:
   case AMDGPU::DS_GWS_INIT:
   case AMDGPU::DS_GWS_SEMA_V:
   case AMDGPU::DS_GWS_SEMA_BR:
   case AMDGPU::DS_GWS_SEMA_P:
   case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
   case AMDGPU::DS_GWS_BARRIER:
     return true;
   default:
     return false;
   }
 }

 } // namespace mca
 } // namespace llvm

 using namespace llvm;
 using namespace mca;

 /// Factory for AMDGPUCustomBehaviour: forwards its arguments to the
 /// constructor and returns the heap-allocated object as a raw
 /// CustomBehaviour pointer.
 static CustomBehaviour *
 createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                             const mca::SourceMgr &SrcMgr,
                             const MCInstrInfo &MCII) {
   CustomBehaviour *Behaviour = new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
   return Behaviour;
 }

 /// Factory for AMDGPUInstrPostProcess: forwards its arguments to the
 /// constructor and returns the heap-allocated object as a raw
 /// InstrPostProcess pointer.
 static InstrPostProcess *
 createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
                              const MCInstrInfo &MCII) {
   InstrPostProcess *PostProcess = new AMDGPUInstrPostProcess(STI, MCII);
   return PostProcess;
 }

 /// Extern function to initialize the targets for the AMDGPU backend.
 ///
 /// Registers the AMDGPU CustomBehaviour and InstrPostProcess factories with
 /// the TargetRegistry, first for the AMDGPU target and then for the GCN
 /// target.
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
   const auto RegisterMCAHooks = [](Target &T) {
     TargetRegistry::RegisterCustomBehaviour(T, createAMDGPUCustomBehaviour);
     TargetRegistry::RegisterInstrPostProcess(T, createAMDGPUInstrPostProcess);
   };

   RegisterMCAHooks(getTheAMDGPUTarget());
   RegisterMCAHooks(getTheGCNTarget());
 }
	//===------------------ AMDGPUCustomBehaviour.cpp ---------------*- C++ -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	/// \file
	///
	/// This file implements methods from the AMDGPUCustomBehaviour class.
	///
	//===----------------------------------------------------------------------===//

	#include "AMDGPUCustomBehaviour.h"
	#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
	#include "SIInstrInfo.h"
	#include "TargetInfo/AMDGPUTargetInfo.h"
	#include "llvm/MC/TargetRegistry.h"
	#include "llvm/Support/WithColor.h"

	namespace llvm {
	namespace mca {

	/// Hook run on an mca::Instruction after it has been lowered from an MCInst.
	///
	/// For every flavour of s_waitcnt listed below, the operands of \p MCI are
	/// attached to \p Inst through processWaitCnt(). Any other opcode leaves
	/// \p Inst exactly as it was.
	void AMDGPUInstrPostProcess::postProcessInstruction(
	    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
	  switch (MCI.getOpcode()) {
	  default:
	    // Not part of the s_waitcnt family: nothing to add.
	    break;
	  case AMDGPU::S_WAITCNT:
	  case AMDGPU::S_WAITCNT_EXPCNT:
	  case AMDGPU::S_WAITCNT_LGKMCNT:
	  case AMDGPU::S_WAITCNT_VMCNT:
	  case AMDGPU::S_WAITCNT_VSCNT:
	  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
	  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
	  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
	  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
	  case AMDGPU::S_WAITCNT_gfx10:
	  case AMDGPU::S_WAITCNT_gfx6_gfx7:
	  case AMDGPU::S_WAITCNT_vi:
	    processWaitCnt(Inst, MCI);
	    break;
	  }
	}

	// s_waitcnt instructions encode important information as immediate operands
	// which are lost during the MCInst -> mca::Instruction lowering.
	void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
	const MCInst &MCI) {
	for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
	MCAOperand Op;
	const MCOperand &MCOp = MCI.getOperand(Idx);
	if (MCOp.isReg()) {
	Op = MCAOperand::createReg(MCOp.getReg());
	} else if (MCOp.isImm()) {
	Op = MCAOperand::createImm(MCOp.getImm());
	}
	Op.setIndex(Idx);
	Inst->addOperand(Op);
	}
	}

	/// Constructs the AMDGPU hazard model.
	///
	/// \p STI, \p SrcMgr and \p MCII are forwarded unchanged to the
	/// CustomBehaviour base class. The per-instruction wait-counter table is then
	/// filled in by generateWaitCntInfo().
	AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
	                                             const mca::SourceMgr &SrcMgr,
	                                             const MCInstrInfo &MCII)
	    : CustomBehaviour(STI, SrcMgr, MCII) {
	  // Runs in the constructor body, i.e. after the base class has been
	  // initialised with the arguments above.
	  generateWaitCntInfo();
	}

	/// Reports a target-specific hazard for the instruction \p IR.
	///
	/// Only the s_waitcnt family is modelled: for those opcodes the result of
	/// handleWaitCnt() is returned. Every other opcode yields 0.
	///
	/// \param IssuedInst instructions forwarded to handleWaitCnt() as the set of
	///        currently executing instructions.
	/// \param IR the instruction being checked.
	unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
	                                                  const InstRef &IR) {
	  // llvm-mca is generally run on fully compiled assembly, so pseudo
	  // instructions are not expected here. There are plans to make mca usable
	  // from within backend passes, though, which is why the pseudo variants of
	  // s_waitcnt are listed as well.
	  switch (IR.getInstruction()->getOpcode()) {
	  // Pseudo variants.
	  case AMDGPU::S_WAITCNT:
	  case AMDGPU::S_WAITCNT_EXPCNT:
	  case AMDGPU::S_WAITCNT_LGKMCNT:
	  case AMDGPU::S_WAITCNT_VMCNT:
	  case AMDGPU::S_WAITCNT_VSCNT:
	  // Real encodings.
	  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
	  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
	  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
	  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
	  case AMDGPU::S_WAITCNT_gfx10:
	  case AMDGPU::S_WAITCNT_gfx6_gfx7:
	  case AMDGPU::S_WAITCNT_vi:
	    // s_endpgm also behaves as if it were preceded by an implicit
	    // "s_waitcnt 0". It is unclear whether modelling that would be
	    // appropriate given how llvm-mca replays the same code over and over,
	    // so it is deliberately left out.
	    return handleWaitCnt(IssuedInst, IR);
	  default:
	    break;
	  }

	  // Anything that is not a wait instruction never stalls here.
	  return 0;
	}

	/// Works out whether the wait instruction \p IR still has to stall.
	///
	/// The thresholds encoded in \p IR are obtained from computeWaitCnt() and
	/// compared with the number of instructions in \p IssuedInst that are flagged
	/// for each counter in InstrWaitCntInfo.
	///
	/// All s_waitcnt instructions are handled except s_waitcnt_depctr, which is
	/// not modelled.
	///
	/// \returns 0 if no counter exceeds its threshold. Otherwise the smallest
	/// number of cycles left among the instructions feeding a counter that does
	/// exceed its threshold.
	unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
	                                              const InstRef &IR) {
	  // Sentinel for "no instruction seen yet" / "no need to wait".
	  const unsigned NoWait = ~0U;

	  // Thresholds start at their maximum values; computeWaitCnt() overwrites the
	  // ones the instruction actually encodes.
	  unsigned Vmcnt = 63;
	  unsigned Expcnt = 7;
	  unsigned Lgkmcnt = 31;
	  unsigned Vscnt = 63;
	  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);

	  // Per counter: how many in-flight instructions feed it, and how soon the
	  // first of them finishes.
	  unsigned PendingVm = 0, SoonestVm = NoWait;
	  unsigned PendingExp = 0, SoonestExp = NoWait;
	  unsigned PendingLgkm = 0, SoonestLgkm = NoWait;
	  unsigned PendingVs = 0, SoonestVs = NoWait;

	  const auto Record = [](bool FeedsCounter, unsigned Remaining,
	                         unsigned &Pending, unsigned &Soonest) {
	    if (!FeedsCounter)
	      return;
	    ++Pending;
	    if (Remaining < Soonest)
	      Soonest = Remaining;
	  };

	  // Look at each currently executing instruction to find out whether this
	  // wait instruction still needs to wait.
	  for (const InstRef &Issued : IssuedInst) {
	    const unsigned SourceIdx = Issued.getSourceIndex() % SrcMgr.size();
	    const WaitCntInfo &Counters = InstrWaitCntInfo[SourceIdx];
	    const int CyclesLeft = Issued.getInstruction()->getCyclesLeft();
	    assert(CyclesLeft != UNKNOWN_CYCLES &&
	           "We should know how many cycles are left for this instruction");
	    const unsigned Remaining = static_cast<unsigned>(CyclesLeft);

	    Record(Counters.VmCnt, Remaining, PendingVm, SoonestVm);
	    Record(Counters.ExpCnt, Remaining, PendingExp, SoonestExp);
	    Record(Counters.LgkmCnt, Remaining, PendingLgkm, SoonestLgkm);
	    Record(Counters.VsCnt, Remaining, PendingVs, SoonestVs);
	  }

	  unsigned CyclesToWait = NoWait;
	  const auto Consider = [&CyclesToWait](unsigned Pending, unsigned Threshold,
	                                        unsigned Soonest) {
	    if (Pending > Threshold && Soonest < CyclesToWait)
	      CyclesToWait = Soonest;
	  };
	  Consider(PendingVm, Vmcnt, SoonestVm);
	  Consider(PendingExp, Expcnt, SoonestExp);
	  Consider(PendingLgkm, Lgkmcnt, SoonestLgkm);
	  Consider(PendingVs, Vscnt, SoonestVs);

	  // The result may underestimate the real stall, which is harmless: the
	  // return value only says how many cycles pass until this function runs
	  // again. As long as the wait is never overestimated, the instruction still
	  // ends up stalling for the correct number of cycles.
	  return CyclesToWait == NoWait ? 0 : CyclesToWait;
	}

	void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
	unsigned &Expcnt, unsigned &Lgkmcnt,
	unsigned &Vscnt) {
	AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
	const Instruction &Inst = *IR.getInstruction();
	unsigned Opcode = Inst.getOpcode();

	switch (Opcode) {
	case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
	case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
	case AMDGPU::S_WAITCNT_VMCNT_gfx10:
	case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
	// Should probably be checking for nullptr
	// here, but I'm not sure how I should handle the case
	// where we see a nullptr.
	const MCAOperand *OpReg = Inst.getOperand(0);
	const MCAOperand *OpImm = Inst.getOperand(1);
	assert(OpReg && OpReg->isReg() && "First operand should be a register.");
	assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
	if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
	// Instruction is using a real register.
	// Since we can't know what value this register will have,
	// we can't compute what the value of this wait should be.
	WithColor::warning() << "The register component of "
	<< MCII.getName(Opcode) << " will be completely "
	<< "ignored. So the wait may not be accurate.\n";
	}
	switch (Opcode) {
	// Redundant switch so I don't have to repeat the code above
	// for each case. There are more clever ways to avoid this
	// extra switch and anyone can feel free to implement one of them.
	case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
	Expcnt = OpImm->getImm();
	break;
	case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
	Lgkmcnt = OpImm->getImm();
	break;
	case AMDGPU::S_WAITCNT_VMCNT_gfx10:
	Vmcnt = OpImm->getImm();
	break;
	case AMDGPU::S_WAITCNT_VSCNT_gfx10:
	Vscnt = OpImm->getImm();
	break;
	}
	return;
	}
	case AMDGPU::S_WAITCNT_gfx10:
	case AMDGPU::S_WAITCNT_gfx6_gfx7:
	case AMDGPU::S_WAITCNT_vi:
	unsigned WaitCnt = Inst.getOperand(0)->getImm();
	AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
	return;
	}
	}

	void AMDGPUCustomBehaviour::generateWaitCntInfo() {
	// The core logic from this function is taken from
	// SIInsertWaitcnts::updateEventWaitcntAfter() In that pass, the instructions
	// that are being looked at are in the MachineInstr format, whereas we have
	// access to the MCInst format. The side effects of this are that we can't use
	// the mayAccessVMEMThroughFlat(Inst) or mayAccessLDSThroughFlat(Inst)
	// functions. Therefore, we conservatively assume that these functions will
	// return true. This may cause a few instructions to be incorrectly tagged
	// with an extra CNT. However, these are instructions that do interact with at
	// least one CNT so giving them an extra CNT shouldn't cause issues in most
	// scenarios.
	AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
	InstrWaitCntInfo.resize(SrcMgr.size());

	int Index = 0;
	for (auto I = SrcMgr.begin(), E = SrcMgr.end(); I != E; ++I, ++Index) {
	const std::unique_ptr<Instruction> &Inst = *I;
	unsigned Opcode = Inst->getOpcode();
	const MCInstrDesc &MCID = MCII.get(Opcode);
	if ((MCID.TSFlags & SIInstrFlags::DS) &&
	(MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
	InstrWaitCntInfo[Index].LgkmCnt = true;
	if (isAlwaysGDS(Opcode) \|\| hasModifiersSet(Inst, AMDGPU::OpName::gds))
	InstrWaitCntInfo[Index].ExpCnt = true;
	} else if (MCID.TSFlags & SIInstrFlags::FLAT) {
	// We conservatively assume that mayAccessVMEMThroughFlat(Inst)
	// and mayAccessLDSThroughFlat(Inst) would both return true for this
	// instruction. We have to do this because those functions use
	// information about the memory operands that we don't have access to.
	InstrWaitCntInfo[Index].LgkmCnt = true;
	if (!STI.hasFeature(AMDGPU::FeatureVscnt))
	InstrWaitCntInfo[Index].VmCnt = true;
	else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
	InstrWaitCntInfo[Index].VmCnt = true;
	else
	InstrWaitCntInfo[Index].VsCnt = true;
	} else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
	if (!STI.hasFeature(AMDGPU::FeatureVscnt))
	InstrWaitCntInfo[Index].VmCnt = true;
	else if ((MCID.mayLoad() &&
	!(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) \|\|
	((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
	!MCID.mayStore()))
	InstrWaitCntInfo[Index].VmCnt = true;
	else if (MCID.mayStore())
	InstrWaitCntInfo[Index].VsCnt = true;

	// (IV.Major < 7) is meant to represent
	// GCNTarget.vmemWriteNeedsExpWaitcnt()
	// which is defined as
	// { return getGeneration() < SEA_ISLANDS; }
	if (IV.Major < 7 &&
	(MCID.mayStore() \|\| (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
	InstrWaitCntInfo[Index].ExpCnt = true;
	} else if (MCID.TSFlags & SIInstrFlags::SMRD) {
	InstrWaitCntInfo[Index].LgkmCnt = true;
	} else if (MCID.TSFlags & SIInstrFlags::EXP) {
	InstrWaitCntInfo[Index].ExpCnt = true;
	} else {
	switch (Opcode) {
	case AMDGPU::S_SENDMSG:
	case AMDGPU::S_SENDMSGHALT:
	case AMDGPU::S_MEMTIME:
	case AMDGPU::S_MEMREALTIME:
	InstrWaitCntInfo[Index].LgkmCnt = true;
	break;
	}
	}
	}
	}

	/// Returns true if \p MCID has any of the MUBUF, MTBUF or MIMG bits set in its
	/// TSFlags. Taken from SIInstrInfo::isVMEM().
	bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
	  const auto VMEMMask =
	      SIInstrFlags::MUBUF | SIInstrFlags::MTBUF | SIInstrFlags::MIMG;
	  return (MCID.TSFlags & VMEMMask) != 0;
	}

	// taken from SIInstrInfo::hasModifiersSet()
	bool AMDGPUCustomBehaviour::hasModifiersSet(
	const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
	int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
	if (Idx == -1)
	return false;

	const MCAOperand *Op = Inst->getOperand(Idx);
	if (Op == nullptr \|\| !Op->isImm() \|\| !Op->getImm())
	return false;

	return true;
	}

	/// Returns true if \p Opcode is one of the DS opcodes listed below
	/// (DS_ORDERED_COUNT and the DS_GWS_* group). Taken from
	/// SIInstrInfo::isAlwaysGDS().
	bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
	  switch (Opcode) {
	  case AMDGPU::DS_ORDERED_COUNT:
	  case AMDGPU::DS_GWS_INIT:
	  case AMDGPU::DS_GWS_SEMA_V:
	  case AMDGPU::DS_GWS_SEMA_BR:
	  case AMDGPU::DS_GWS_SEMA_P:
	  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
	  case AMDGPU::DS_GWS_BARRIER:
	    return true;
	  default:
	    return false;
	  }
	}

	} // namespace mca
	} // namespace llvm

	using namespace llvm;
	using namespace mca;

	/// Factory for AMDGPUCustomBehaviour: forwards its arguments to the
	/// constructor and returns the heap-allocated object as a raw
	/// CustomBehaviour pointer.
	static CustomBehaviour *
	createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
	                            const mca::SourceMgr &SrcMgr,
	                            const MCInstrInfo &MCII) {
	  CustomBehaviour *Behaviour = new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
	  return Behaviour;
	}

	/// Factory for AMDGPUInstrPostProcess: forwards its arguments to the
	/// constructor and returns the heap-allocated object as a raw
	/// InstrPostProcess pointer.
	static InstrPostProcess *
	createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
	                             const MCInstrInfo &MCII) {
	  InstrPostProcess *PostProcess = new AMDGPUInstrPostProcess(STI, MCII);
	  return PostProcess;
	}

	/// Extern function to initialize the targets for the AMDGPU backend.
	///
	/// Registers the AMDGPU CustomBehaviour and InstrPostProcess factories with
	/// the TargetRegistry, first for the AMDGPU target and then for the GCN
	/// target.
	extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
	  const auto RegisterMCAHooks = [](Target &T) {
	    TargetRegistry::RegisterCustomBehaviour(T, createAMDGPUCustomBehaviour);
	    TargetRegistry::RegisterInstrPostProcess(T, createAMDGPUInstrPostProcess);
	  };

	  RegisterMCAHooks(getTheAMDGPUTarget());
	  RegisterMCAHooks(getTheGCNTarget());
	}