//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill in the register usage, flat
/// usage, etc. fields of the hardware program resource registers.
///
//===----------------------------------------------------------------------===//
#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// In code object v4 and older, we need to report some amount of stack usage to
// the runtime ahead of time when we don't know the true stack size. Assume a
// smaller number if this is only due to dynamic / non-entry-block allocas.
static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
"amdgpu-assume-external-call-stack-size",
cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
cl::init(16384));

static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
"amdgpu-assume-dynamic-stack-object-size",
cl::desc("Assumed extra stack use if there are any "
"variable sized objects (in bytes)"),
cl::Hidden, cl::init(4096));
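
// Illustrative override of both knobs on the llc command line (a sketch; the
// exact invocation depends on your target setup):
//   llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 \
//       -amdgpu-assume-external-call-stack-size=8192 \
//       -amdgpu-assume-dynamic-stack-object-size=1024 in.ll -o out.s
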
INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
"Function register usage analysis", true, true)
static const Function *getCalleeFunction(const MachineOperand &Op) {
if (Op.isImm()) {
assert(Op.getImm() == 0);
return nullptr;
}
return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
}
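
/// Returns true if \p Reg has any use that is not an implicit operand of a
/// FLAT instruction, i.e. a use for which flat_scratch actually needs to be
/// initialized.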
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
const SIInstrInfo &TII, unsigned Reg) {
for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
return true;
}
return false;
}

bool AMDGPUResourceUsageAnalysis::runOnMachineFunction(MachineFunction &MF) {
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
if (!TPC)
return false;
const TargetMachine &TM = TPC->getTM<TargetMachine>();
const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();

  // By default, for code object v5 and later (or when targeting AMDPAL), track
  // only the minimum scratch size: unless overridden on the command line, the
  // assumed stack sizes for dynamic objects and external calls default to 0.
uint32_t AssumedStackSizeForDynamicSizeObjects =
clAssumedStackSizeForDynamicSizeObjects;
uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
AMDGPU::AMDHSA_COV5 ||
STI.getTargetTriple().getOS() == Triple::AMDPAL) {
if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
AssumedStackSizeForDynamicSizeObjects = 0;
if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
AssumedStackSizeForExternalCall = 0;
}
ResourceInfo = analyzeResourceUsage(MF, AssumedStackSizeForDynamicSizeObjects,
AssumedStackSizeForExternalCall);
return false;
}
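
// How a consumer pass typically retrieves the results (a sketch; assumes the
// getResourceInfo() accessor declared in AMDGPUResourceUsageAnalysis.h):
//
//   const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
//       getAnalysis<AMDGPUResourceUsageAnalysis>().getResourceInfo();
//   int32_t NumVGPR = Info.NumVGPR;
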
AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects,
uint32_t AssumedStackSizeForExternalCall) const {
SIFunctionResourceInfo Info;
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
const MachineRegisterInfo &MRI = MF.getRegInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
MRI.isLiveIn(MFI->getPreloadedReg(
AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
// Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
// instructions aren't used to access the scratch buffer. Inline assembly may
// need it though.
//
// If we only have implicit uses of flat_scr on flat instructions, it is not
// really needed.
if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
(!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
Info.UsesFlatScratch = false;
}
Info.PrivateSegmentSize = FrameInfo.getStackSize();
  // Assume a big number if there are any unknown-sized objects.
Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
if (Info.HasDynamicallySizedStack)
Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
if (MFI->isStackRealigned())
Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
Info.UsesVCC =
MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
// If there are no calls, MachineRegisterInfo can tell us the used register
// count easily.
// A tail call isn't considered a call for MachineFrameInfo's purposes.
if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
if (ST.hasMAIInsts())
Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
return Info;
}
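
  // With calls present, MachineRegisterInfo alone cannot give an accurate
  // count; conservatively walk every operand of every instruction and track
  // the highest register index seen in each register file.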
int32_t MaxVGPR = -1;
int32_t MaxAGPR = -1;
int32_t MaxSGPR = -1;
Info.CalleeSegmentSize = 0;
for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
// TODO: Check regmasks? Do they occur anywhere except calls?
for (const MachineOperand &MO : MI.operands()) {
unsigned Width = 0;
bool IsSGPR = false;
bool IsAGPR = false;
if (!MO.isReg())
continue;
Register Reg = MO.getReg();
switch (Reg) {
case AMDGPU::EXEC:
case AMDGPU::EXEC_LO:
case AMDGPU::EXEC_HI:
case AMDGPU::SCC:
case AMDGPU::M0:
case AMDGPU::M0_LO16:
case AMDGPU::M0_HI16:
case AMDGPU::SRC_SHARED_BASE_LO:
case AMDGPU::SRC_SHARED_BASE:
case AMDGPU::SRC_SHARED_LIMIT_LO:
case AMDGPU::SRC_SHARED_LIMIT:
case AMDGPU::SRC_PRIVATE_BASE_LO:
case AMDGPU::SRC_PRIVATE_BASE:
case AMDGPU::SRC_PRIVATE_LIMIT_LO:
case AMDGPU::SRC_PRIVATE_LIMIT:
case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
case AMDGPU::SGPR_NULL:
case AMDGPU::SGPR_NULL64:
case AMDGPU::MODE:
continue;
case AMDGPU::NoRegister:
assert(MI.isDebugInstr() &&
"Instruction uses invalid noreg register");
continue;
case AMDGPU::VCC:
case AMDGPU::VCC_LO:
case AMDGPU::VCC_HI:
case AMDGPU::VCC_LO_LO16:
case AMDGPU::VCC_LO_HI16:
case AMDGPU::VCC_HI_LO16:
case AMDGPU::VCC_HI_HI16:
Info.UsesVCC = true;
continue;
case AMDGPU::FLAT_SCR:
case AMDGPU::FLAT_SCR_LO:
case AMDGPU::FLAT_SCR_HI:
continue;
case AMDGPU::XNACK_MASK:
case AMDGPU::XNACK_MASK_LO:
case AMDGPU::XNACK_MASK_HI:
llvm_unreachable("xnack_mask registers should not be used");
case AMDGPU::LDS_DIRECT:
llvm_unreachable("lds_direct register should not be used");
case AMDGPU::TBA:
case AMDGPU::TBA_LO:
case AMDGPU::TBA_HI:
case AMDGPU::TMA:
case AMDGPU::TMA_LO:
case AMDGPU::TMA_HI:
llvm_unreachable("trap handler registers should not be used");
case AMDGPU::SRC_VCCZ:
llvm_unreachable("src_vccz register should not be used");
case AMDGPU::SRC_EXECZ:
llvm_unreachable("src_execz register should not be used");
case AMDGPU::SRC_SCC:
llvm_unreachable("src_scc register should not be used");
default:
break;
}
if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
IsSGPR = true;
Width = 1;
} else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
AMDGPU::VGPR_16RegClass.contains(Reg)) {
IsSGPR = false;
Width = 1;
} else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 1;
} else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
IsSGPR = true;
Width = 2;
} else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
IsSGPR = false;
Width = 2;
} else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 2;
} else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
IsSGPR = false;
Width = 3;
} else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
IsSGPR = true;
Width = 3;
} else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 3;
} else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
IsSGPR = true;
Width = 4;
} else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
IsSGPR = false;
Width = 4;
} else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 4;
} else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
IsSGPR = false;
Width = 5;
} else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
IsSGPR = true;
Width = 5;
} else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 5;
} else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
IsSGPR = false;
Width = 6;
} else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
IsSGPR = true;
Width = 6;
} else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 6;
} else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
IsSGPR = false;
Width = 7;
} else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
IsSGPR = true;
Width = 7;
} else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 7;
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
IsSGPR = true;
Width = 8;
} else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
IsSGPR = false;
Width = 8;
} else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 8;
} else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
IsSGPR = false;
Width = 9;
} else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
IsSGPR = true;
Width = 9;
} else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 9;
} else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
IsSGPR = false;
Width = 10;
} else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
IsSGPR = true;
Width = 10;
} else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 10;
} else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
IsSGPR = false;
Width = 11;
} else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
IsSGPR = true;
Width = 11;
} else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 11;
} else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
IsSGPR = false;
Width = 12;
} else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
IsSGPR = true;
Width = 12;
} else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 12;
} else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
IsSGPR = true;
Width = 16;
} else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
IsSGPR = false;
Width = 16;
} else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 16;
} else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
IsSGPR = true;
Width = 32;
} else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
IsSGPR = false;
Width = 32;
} else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 32;
} else {
// We only expect TTMP registers or registers that do not belong to
// any RC.
assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
AMDGPU::TTMP_64RegClass.contains(Reg) ||
AMDGPU::TTMP_128RegClass.contains(Reg) ||
AMDGPU::TTMP_256RegClass.contains(Reg) ||
AMDGPU::TTMP_512RegClass.contains(Reg) ||
!TRI.getPhysRegBaseClass(Reg)) &&
"Unknown register class");
}
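
        // The operand occupies Width consecutive 32-bit registers starting at
        // Reg's hardware index; record the highest index it touches in the
        // corresponding register file.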
unsigned HWReg = TRI.getHWRegIndex(Reg);
int MaxUsed = HWReg + Width - 1;
if (IsSGPR) {
MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
} else if (IsAGPR) {
MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
} else {
MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
}
}
if (MI.isCall()) {
// Pseudo used just to encode the underlying global. Is there a better
// way to track this?
const MachineOperand *CalleeOp =
TII->getNamedOperand(MI, AMDGPU::OpName::callee);
const Function *Callee = getCalleeFunction(*CalleeOp);
// Avoid crashing on undefined behavior with an illegal call to a
// kernel. If a callsite's calling convention doesn't match the
// function's, it's undefined behavior. If the callsite calling
// convention does match, that would have errored earlier.
if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
report_fatal_error("invalid call to entry function");
auto isSameFunction = [](const MachineFunction &MF, const Function *F) {
return F == &MF.getFunction();
};
if (Callee && !isSameFunction(MF, Callee))
Info.Callees.push_back(Callee);
bool IsIndirect = !Callee || Callee->isDeclaration();
// FIXME: Call site could have norecurse on it
if (!Callee || !Callee->doesNotRecurse()) {
Info.HasRecursion = true;
// TODO: If we happen to know there is no stack usage in the
// callgraph, we don't need to assume an infinitely growing stack.
if (!MI.isReturn()) {
// We don't need to assume an unknown stack size for tail calls.
// FIXME: This only benefits in the case where the kernel does not
// directly call the tail called function. If a kernel directly
// calls a tail recursive function, we'll assume maximum stack size
// based on the regular call instruction.
Info.CalleeSegmentSize = std::max(
Info.CalleeSegmentSize,
static_cast<uint64_t>(AssumedStackSizeForExternalCall));
}
}
if (IsIndirect) {
Info.CalleeSegmentSize =
std::max(Info.CalleeSegmentSize,
static_cast<uint64_t>(AssumedStackSizeForExternalCall));
// Register usage of indirect calls gets handled later
Info.UsesVCC = true;
Info.UsesFlatScratch = ST.hasFlatAddressSpace();
Info.HasDynamicallySizedStack = true;
Info.HasIndirectCall = true;
}
}
}
}
Info.NumExplicitSGPR = MaxSGPR + 1;
Info.NumVGPR = MaxVGPR + 1;
Info.NumAGPR = MaxAGPR + 1;
return Info;
}