//===- AMDGPUWaitSGPRHazards.cpp - Insert waits for SGPR read hazards -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert s_wait_alu instructions to mitigate SGPR read hazards on GFX12.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUWaitSGPRHazards.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SetVector.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-wait-sgpr-hazards"
static cl::opt<bool> GlobalEnableSGPRHazardWaits(
"amdgpu-sgpr-hazard-wait", cl::init(true), cl::Hidden,
cl::desc("Enable required s_wait_alu on SGPR hazards"));
static cl::opt<bool> GlobalCullSGPRHazardsOnFunctionBoundary(
"amdgpu-sgpr-hazard-boundary-cull", cl::init(false), cl::Hidden,
cl::desc("Cull hazards on function boundaries"));
static cl::opt<bool>
GlobalCullSGPRHazardsAtMemWait("amdgpu-sgpr-hazard-mem-wait-cull",
cl::init(false), cl::Hidden,
cl::desc("Cull hazards on memory waits"));
static cl::opt<unsigned> GlobalCullSGPRHazardsMemWaitThreshold(
"amdgpu-sgpr-hazard-mem-wait-cull-threshold", cl::init(8), cl::Hidden,
cl::desc("Number of tracked SGPRs before initiating hazard cull on memory "
"wait"));
namespace {
class AMDGPUWaitSGPRHazards {
public:
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
const MachineRegisterInfo *MRI;
unsigned DsNopCount;
bool EnableSGPRHazardWaits;
bool CullSGPRHazardsOnFunctionBoundary;
bool CullSGPRHazardsAtMemWait;
unsigned CullSGPRHazardsMemWaitThreshold;
AMDGPUWaitSGPRHazards() {}
// Return the numeric ID 0-127 for a given SGPR, or std::nullopt for registers
// that are not tracked (M0, EXEC, null, or indices above 127).
static std::optional<unsigned> sgprNumber(Register Reg,
const SIRegisterInfo &TRI) {
switch (Reg) {
case AMDGPU::M0:
case AMDGPU::EXEC:
case AMDGPU::EXEC_LO:
case AMDGPU::EXEC_HI:
case AMDGPU::SGPR_NULL:
case AMDGPU::SGPR_NULL64:
return {};
default:
break;
}
unsigned RegN = TRI.getHWRegIndex(Reg);
if (RegN > 127)
return {};
return RegN;
}
static inline bool isVCC(Register Reg) {
return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
}
// Adjust global offsets for instructions bundled with S_GETPC_B64 after
// insertion of a new instruction.
static void updateGetPCBundle(MachineInstr *NewMI) {
if (!NewMI->isBundled())
return;
// Find start of bundle.
auto I = NewMI->getIterator();
while (I->isBundledWithPred())
I--;
if (I->isBundle())
I++;
// Bail if this is not an S_GETPC bundle.
if (I->getOpcode() != AMDGPU::S_GETPC_B64)
return;
// Update offsets of any references in the bundle.
const unsigned NewBytes = 4;
assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
"Unexpected instruction insertion in bundle");
auto NextMI = std::next(NewMI->getIterator());
auto End = NewMI->getParent()->end();
while (NextMI != End && NextMI->isBundledWithPred()) {
for (auto &Operand : NextMI->operands()) {
if (Operand.isGlobal())
Operand.setOffset(Operand.getOffset() + NewBytes);
}
NextMI++;
}
}
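// Hazard state propagated through the function by a forward dataflow pass:
// which SGPR pairs have ever been read by a VALU, and which SGPRs currently
// hold writes whose hazard has not yet been resolved.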
struct HazardState {
static constexpr unsigned None = 0;
static constexpr unsigned SALU = (1 << 0);
static constexpr unsigned VALU = (1 << 1);
std::bitset<64> Tracked; // SGPR banks ever read by VALU
std::bitset<128> SALUHazards; // SGPRs with uncommitted values from SALU
std::bitset<128> VALUHazards; // SGPRs with uncommitted values from VALU
unsigned VCCHazard = None; // Source of current VCC writes
bool ActiveFlat = false; // Has unwaited flat instructions
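// Join the state from a predecessor block (bitwise OR of every field) and
// report whether anything changed; this drives the fixed-point iteration.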
bool merge(const HazardState &RHS) {
HazardState Orig(*this);
*this |= RHS;
return (*this != Orig);
}
bool operator==(const HazardState &RHS) const {
return Tracked == RHS.Tracked && SALUHazards == RHS.SALUHazards &&
VALUHazards == RHS.VALUHazards && VCCHazard == RHS.VCCHazard &&
ActiveFlat == RHS.ActiveFlat;
}
bool operator!=(const HazardState &RHS) const { return !(*this == RHS); }
void operator|=(const HazardState &RHS) {
Tracked |= RHS.Tracked;
SALUHazards |= RHS.SALUHazards;
VALUHazards |= RHS.VALUHazards;
VCCHazard |= RHS.VCCHazard;
ActiveFlat |= RHS.ActiveFlat;
}
};
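// Hazard state at block entry (In) and block exit (Out), as computed by the
// fixed-point analysis in run().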
struct BlockHazardState {
HazardState In;
HazardState Out;
};
DenseMap<const MachineBasicBlock *, BlockHazardState> BlockState;
static constexpr unsigned WAVE32_NOPS = 4;
static constexpr unsigned WAVE64_NOPS = 8;
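// Insert DsNopCount consecutive DS_NOPs before MI; a full run of DS_NOPs
// allows the tracked SGPR set to be cleared again (see the DS_NOP handling
// in runOnMachineBasicBlock).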
void insertHazardCull(MachineBasicBlock &MBB,
MachineBasicBlock::instr_iterator &MI) {
assert(!MI->isBundled());
unsigned Count = DsNopCount;
while (Count--)
BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::DS_NOP));
}
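// Combine two S_WAITCNT_DEPCTR masks field by field. Lower field values
// encode stricter waits, so taking the minimum of each field produces a mask
// at least as strong as both inputs.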
unsigned mergeMasks(unsigned Mask1, unsigned Mask2) {
unsigned Mask = 0xffff;
Mask = AMDGPU::DepCtr::encodeFieldSaSdst(
Mask, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(Mask1),
AMDGPU::DepCtr::decodeFieldSaSdst(Mask2)));
Mask = AMDGPU::DepCtr::encodeFieldVaVcc(
Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(Mask1),
AMDGPU::DepCtr::decodeFieldVaVcc(Mask2)));
Mask = AMDGPU::DepCtr::encodeFieldVmVsrc(
Mask, std::min(AMDGPU::DepCtr::decodeFieldVmVsrc(Mask1),
AMDGPU::DepCtr::decodeFieldVmVsrc(Mask2)));
Mask = AMDGPU::DepCtr::encodeFieldVaSdst(
Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(Mask1),
AMDGPU::DepCtr::decodeFieldVaSdst(Mask2)));
Mask = AMDGPU::DepCtr::encodeFieldVaVdst(
Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVdst(Mask1),
AMDGPU::DepCtr::decodeFieldVaVdst(Mask2)));
Mask = AMDGPU::DepCtr::encodeFieldHoldCnt(
Mask, std::min(AMDGPU::DepCtr::decodeFieldHoldCnt(Mask1),
AMDGPU::DepCtr::decodeFieldHoldCnt(Mask2)));
Mask = AMDGPU::DepCtr::encodeFieldVaSsrc(
Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSsrc(Mask1),
AMDGPU::DepCtr::decodeFieldVaSsrc(Mask2)));
return Mask;
}
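// If the previous non-debug instruction is already an S_WAITCNT_DEPCTR, fold
// Mask into its immediate instead of emitting a second wait; returns true if
// the masks were merged.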
bool mergeConsecutiveWaitAlus(MachineBasicBlock::instr_iterator &MI,
unsigned Mask) {
auto MBB = MI->getParent();
if (MI == MBB->instr_begin())
return false;
auto It = prev_nodbg(MI, MBB->instr_begin());
if (It->getOpcode() != AMDGPU::S_WAITCNT_DEPCTR)
return false;
It->getOperand(0).setImm(mergeMasks(Mask, It->getOperand(0).getImm()));
return true;
}
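// Scan one block, updating the hazard state instruction by instruction.
// With Emit == false this is the analysis phase and the return value is
// whether the block's outgoing state changed; with Emit == true the required
// s_wait_alu / DS_NOP instructions are inserted and the return value is
// whether anything was emitted.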
bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
enum { WA_VALU = 0x1, WA_SALU = 0x2, WA_VCC = 0x4 };
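// Wait bits accumulated per instruction; each corresponds to a DEPCTR field
// that must be forced to zero (va_sdst, sa_sdst, va_vcc respectively).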
HazardState State = BlockState[&MBB].In;
SmallSet<Register, 8> SeenRegs;
bool Emitted = false;
unsigned DsNops = 0;
for (MachineBasicBlock::instr_iterator MI = MBB.instr_begin(),
E = MBB.instr_end();
MI != E; ++MI) {
if (MI->isMetaInstruction())
continue;
// Clear tracked SGPRs if sufficient DS_NOPs occur
if (MI->getOpcode() == AMDGPU::DS_NOP) {
if (++DsNops >= DsNopCount)
State.Tracked.reset();
continue;
}
DsNops = 0;
// Snoop FLAT instructions to avoid adding culls before scratch/LDS loads;
// a cull could be disproportionately expensive relative to the load time.
if (SIInstrInfo::isFLAT(*MI) && !SIInstrInfo::isFLATGlobal(*MI))
State.ActiveFlat = true;
// SMEM or VMEM clears hazards
if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSMRD(*MI)) {
State.VCCHazard = HazardState::None;
State.SALUHazards.reset();
State.VALUHazards.reset();
continue;
}
// Existing S_WAITALU can clear hazards
if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
unsigned Mask = MI->getOperand(0).getImm();
if (AMDGPU::DepCtr::decodeFieldVaVcc(Mask) == 0)
State.VCCHazard &= ~HazardState::VALU;
if (AMDGPU::DepCtr::decodeFieldSaSdst(Mask) == 0) {
State.SALUHazards.reset();
State.VCCHazard &= ~HazardState::SALU;
}
if (AMDGPU::DepCtr::decodeFieldVaSdst(Mask) == 0)
State.VALUHazards.reset();
continue;
}
// Snoop counter waits to insert culls
if (CullSGPRHazardsAtMemWait &&
(MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT ||
MI->getOpcode() == AMDGPU::S_WAIT_SAMPLECNT ||
MI->getOpcode() == AMDGPU::S_WAIT_BVHCNT) &&
(MI->getOperand(0).isImm() && MI->getOperand(0).getImm() == 0) &&
(State.Tracked.count() >= CullSGPRHazardsMemWaitThreshold)) {
if (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT && State.ActiveFlat) {
State.ActiveFlat = false;
} else {
State.Tracked.reset();
if (Emit)
insertHazardCull(MBB, MI);
continue;
}
}
// Process only VALUs and SALUs
bool IsVALU = SIInstrInfo::isVALU(*MI);
bool IsSALU = SIInstrInfo::isSALU(*MI);
if (!IsVALU && !IsSALU)
continue;
unsigned Wait = 0;
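// Examine a single register operand: for uses, accumulate the required wait
// bits in Wait (and let SALU reads discharge VALU hazards); for defs, record
// new hazards on the written SGPRs.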
auto processOperand = [&](const MachineOperand &Op, bool IsUse) {
if (!Op.isReg())
return;
Register Reg = Op.getReg();
assert(!Op.getSubReg());
if (!TRI->isSGPRReg(*MRI, Reg))
return;
// Only visit each register once
if (!SeenRegs.insert(Reg).second)
return;
auto RegNumber = sgprNumber(Reg, *TRI);
if (!RegNumber)
return;
// Track SGPRs by pair -- numeric ID of a 64b SGPR pair,
// e.g. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc.
unsigned RegN = *RegNumber;
unsigned PairN = (RegN >> 1) & 0x3f;
// A read or write of an untracked register is safe, but any new VALU reads
// must be recorded.
if (!State.Tracked[PairN]) {
if (IsVALU && IsUse)
State.Tracked.set(PairN);
return;
}
uint8_t SGPRCount =
AMDGPU::getRegBitWidth(*TRI->getRegClassForReg(*MRI, Reg)) / 32;
if (IsUse) {
// SALU reading SGPR clears VALU hazards
if (IsSALU) {
if (isVCC(Reg)) {
if (State.VCCHazard & HazardState::VALU)
State.VCCHazard = HazardState::None;
} else {
State.VALUHazards.reset();
}
}
// Compute required waits
for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
Wait |= State.SALUHazards[RegN + RegIdx] ? WA_SALU : 0;
Wait |= IsVALU && State.VALUHazards[RegN + RegIdx] ? WA_VALU : 0;
}
if (isVCC(Reg) && State.VCCHazard) {
// Note: it's possible for both SALU and VALU to exist if VCC
// was updated differently by merged predecessors.
if (State.VCCHazard & HazardState::SALU)
Wait |= WA_SALU;
if (State.VCCHazard & HazardState::VALU)
Wait |= WA_VCC;
}
} else {
// Update hazards
if (isVCC(Reg)) {
State.VCCHazard = IsSALU ? HazardState::SALU : HazardState::VALU;
} else {
for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
if (IsSALU)
State.SALUHazards.set(RegN + RegIdx);
else
State.VALUHazards.set(RegN + RegIdx);
}
}
}
};
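// Calls, returns, and indirect branches transfer control away from the
// current hazard chain; S_ENDPGM(_SAVED) is excluded since the wave
// terminates instead of branching.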
const bool IsSetPC =
(MI->isCall() || MI->isReturn() || MI->isIndirectBranch()) &&
MI->getOpcode() != AMDGPU::S_ENDPGM &&
MI->getOpcode() != AMDGPU::S_ENDPGM_SAVED;
// Only consider implicit VCC operands that are specified by the instruction
// descriptor; other implicit operands are skipped when scanning uses/defs.
const bool HasImplicitVCC =
llvm::any_of(MI->getDesc().implicit_uses(),
[](MCPhysReg Reg) { return isVCC(Reg); }) ||
llvm::any_of(MI->getDesc().implicit_defs(),
[](MCPhysReg Reg) { return isVCC(Reg); });
if (IsSetPC) {
// All SGPR writes before a call/return must be flushed as the
// callee/caller will not see the hazard chain.
if (State.VCCHazard & HazardState::VALU)
Wait |= WA_VCC;
if (State.SALUHazards.any() || (State.VCCHazard & HazardState::SALU))
Wait |= WA_SALU;
if (State.VALUHazards.any())
Wait |= WA_VALU;
if (CullSGPRHazardsOnFunctionBoundary && State.Tracked.any()) {
State.Tracked.reset();
if (Emit)
insertHazardCull(MBB, MI);
}
} else {
// Process uses to determine required wait.
SeenRegs.clear();
for (const MachineOperand &Op : MI->all_uses()) {
if (Op.isImplicit() &&
(!HasImplicitVCC || !Op.isReg() || !isVCC(Op.getReg())))
continue;
processOperand(Op, true);
}
}
// Apply wait
if (Wait) {
unsigned Mask = 0xffff;
if (Wait & WA_VCC) {
State.VCCHazard &= ~HazardState::VALU;
Mask = AMDGPU::DepCtr::encodeFieldVaVcc(Mask, 0);
}
if (Wait & WA_SALU) {
State.SALUHazards.reset();
State.VCCHazard &= ~HazardState::SALU;
Mask = AMDGPU::DepCtr::encodeFieldSaSdst(Mask, 0);
}
if (Wait & WA_VALU) {
State.VALUHazards.reset();
Mask = AMDGPU::DepCtr::encodeFieldVaSdst(Mask, 0);
}
if (Emit) {
if (!mergeConsecutiveWaitAlus(MI, Mask)) {
auto NewMI = BuildMI(MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
.addImm(Mask);
updateGetPCBundle(NewMI);
}
Emitted = true;
}
}
// On return from a call the SGPR state is unknown, so treat all SGPRs as
// potential hazards.
if (MI->isCall() && !CullSGPRHazardsOnFunctionBoundary)
State.Tracked.set();
// Update hazards based on defs.
SeenRegs.clear();
for (const MachineOperand &Op : MI->all_defs()) {
if (Op.isImplicit() &&
(!HasImplicitVCC || !Op.isReg() || !isVCC(Op.getReg())))
continue;
processOperand(Op, false);
}
}
BlockHazardState &BS = BlockState[&MBB];
bool Changed = State != BS.Out;
if (Emit) {
assert(!Changed && "Hazard state should not change on emit pass");
return Emitted;
}
if (Changed)
BS.Out = State;
return Changed;
}
bool run(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (!ST.hasVALUReadSGPRHazard())
return false;
// Parse settings
EnableSGPRHazardWaits = GlobalEnableSGPRHazardWaits;
CullSGPRHazardsOnFunctionBoundary = GlobalCullSGPRHazardsOnFunctionBoundary;
CullSGPRHazardsAtMemWait = GlobalCullSGPRHazardsAtMemWait;
CullSGPRHazardsMemWaitThreshold = GlobalCullSGPRHazardsMemWaitThreshold;
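// Explicit command-line flags take precedence; otherwise per-function
// attributes may override the defaults.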
if (!GlobalEnableSGPRHazardWaits.getNumOccurrences())
EnableSGPRHazardWaits = MF.getFunction().getFnAttributeAsParsedInteger(
"amdgpu-sgpr-hazard-wait", EnableSGPRHazardWaits);
if (!GlobalCullSGPRHazardsOnFunctionBoundary.getNumOccurrences())
CullSGPRHazardsOnFunctionBoundary =
MF.getFunction().hasFnAttribute("amdgpu-sgpr-hazard-boundary-cull");
if (!GlobalCullSGPRHazardsAtMemWait.getNumOccurrences())
CullSGPRHazardsAtMemWait =
MF.getFunction().hasFnAttribute("amdgpu-sgpr-hazard-mem-wait-cull");
if (!GlobalCullSGPRHazardsMemWaitThreshold.getNumOccurrences())
CullSGPRHazardsMemWaitThreshold =
MF.getFunction().getFnAttributeAsParsedInteger(
"amdgpu-sgpr-hazard-mem-wait-cull-threshold",
CullSGPRHazardsMemWaitThreshold);
// Bail if disabled
if (!EnableSGPRHazardWaits)
return false;
TII = ST.getInstrInfo();
TRI = ST.getRegisterInfo();
MRI = &MF.getRegInfo();
DsNopCount = ST.isWave64() ? WAVE64_NOPS : WAVE32_NOPS;
auto CallingConv = MF.getFunction().getCallingConv();
if (!AMDGPU::isEntryFunctionCC(CallingConv) &&
!CullSGPRHazardsOnFunctionBoundary) {
// Callee must consider all SGPRs as tracked.
LLVM_DEBUG(dbgs() << "Is called function, track all SGPRs.\n");
MachineBasicBlock &EntryBlock = MF.front();
BlockState[&EntryBlock].In.Tracked.set();
}
// Calculate the hazard state for each basic block.
// Iterate until a fixed point is reached.
// A fixed point is guaranteed because the merge function only ever grows
// the hazard set, and every backedge causes a merge.
//
// Note: we have to take care of the entry block as this technically
// has an edge from outside the function. Failure to treat this as
// a merge could prevent fixed point being reached.
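// Seed every block; inserting in reverse layout order means pop_back_val
// initially visits blocks in layout order, starting from the entry block.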
SetVector<MachineBasicBlock *> Worklist;
for (auto &MBB : reverse(MF))
Worklist.insert(&MBB);
while (!Worklist.empty()) {
auto &MBB = *Worklist.pop_back_val();
bool Changed = runOnMachineBasicBlock(MBB, false);
if (Changed) {
// Note: take a copy of the state here since inserting successors into the
// map below may reallocate it and invalidate references.
HazardState NewState = BlockState[&MBB].Out;
// Propagate to all successor blocks
for (auto Succ : MBB.successors()) {
// We only need to merge hazards at CFG merge points.
auto &SuccState = BlockState[Succ];
if (Succ->getSinglePredecessor() && !Succ->isEntryBlock()) {
if (SuccState.In != NewState) {
SuccState.In = NewState;
Worklist.insert(Succ);
}
} else if (SuccState.In.merge(NewState)) {
Worklist.insert(Succ);
}
}
}
}
LLVM_DEBUG(dbgs() << "Emit s_wait_alu instructions\n");
// Final pass over the function to emit the wait instructions.
bool Changed = false;
for (auto &MBB : MF)
Changed |= runOnMachineBasicBlock(MBB, true);
BlockState.clear();
return Changed;
}
};
class AMDGPUWaitSGPRHazardsLegacy : public MachineFunctionPass {
public:
static char ID;
AMDGPUWaitSGPRHazardsLegacy() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override {
return AMDGPUWaitSGPRHazards().run(MF);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
} // namespace
char AMDGPUWaitSGPRHazardsLegacy::ID = 0;
char &llvm::AMDGPUWaitSGPRHazardsLegacyID = AMDGPUWaitSGPRHazardsLegacy::ID;
INITIALIZE_PASS(AMDGPUWaitSGPRHazardsLegacy, DEBUG_TYPE,
"AMDGPU Insert waits for SGPR read hazards", false, false)
PreservedAnalyses
AMDGPUWaitSGPRHazardsPass::run(MachineFunction &MF,
MachineFunctionAnalysisManager &MFAM) {
if (AMDGPUWaitSGPRHazards().run(MF))
return PreservedAnalyses::none();
return PreservedAnalyses::all();
}