//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions that use intrinsics
/// which will impact calling convention lowering.
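///
/// For example (illustrative only): a function that calls
/// @llvm.amdgcn.workitem.id.y is annotated with "amdgpu-work-item-id-y", and
/// its callers inherit that attribute so calling convention lowering knows
/// the corresponding implicit input is required.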
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
const TargetMachine *TM = nullptr;
SmallVector<CallGraphNode*, 8> NodeList;
bool addFeatureAttributes(Function &F);
bool processUniformWorkGroupAttribute();
bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);
public:
static char ID;
AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}
bool doInitialization(CallGraph &CG) override;
bool runOnSCC(CallGraphSCC &SCC) override;
StringRef getPassName() const override {
return "AMDGPU Annotate Kernel Features";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
CallGraphSCCPass::getAnalysisUsage(AU);
}
static bool visitConstantExpr(const ConstantExpr *CE);
static bool visitConstantExprsRecursively(
const Constant *EntryC,
SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
};
} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)

// The queue ptr is only needed when casting to flat, not from it.
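// For example, casting a LOCAL (addrspace(3)) or PRIVATE (addrspace(5))
// pointer to FLAT needs the aperture base, which is read from the queue ptr
// when aperture registers are unavailable; the reverse cast does not.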
static bool castRequiresQueuePtr(unsigned SrcAS) {
return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
if (CE->getOpcode() == Instruction::AddrSpaceCast) {
unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
return castRequiresQueuePtr(SrcAS);
}
return false;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
const Constant *EntryC,
SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {
if (!ConstantExprVisited.insert(EntryC).second)
return false;
SmallVector<const Constant *, 16> Stack;
Stack.push_back(EntryC);
while (!Stack.empty()) {
const Constant *C = Stack.pop_back_val();
// Check this constant expression.
if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
if (visitConstantExpr(CE))
return true;
}
// Visit all sub-expressions.
for (const Use &U : C->operands()) {
const auto *OpC = dyn_cast<Constant>(U);
if (!OpC)
continue;
if (!ConstantExprVisited.insert(OpC).second)
continue;
Stack.push_back(OpC);
}
}
return false;
}

// We do not need to note the x workitem or workgroup id because they are always
// initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
bool &NonKernelOnly,
bool &IsQueuePtr) {
switch (ID) {
case Intrinsic::amdgcn_workitem_id_x:
NonKernelOnly = true;
return "amdgpu-work-item-id-x";
case Intrinsic::amdgcn_workgroup_id_x:
NonKernelOnly = true;
return "amdgpu-work-group-id-x";
case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::r600_read_tidig_y:
return "amdgpu-work-item-id-y";
case Intrinsic::amdgcn_workitem_id_z:
case Intrinsic::r600_read_tidig_z:
return "amdgpu-work-item-id-z";
case Intrinsic::amdgcn_workgroup_id_y:
case Intrinsic::r600_read_tgid_y:
return "amdgpu-work-group-id-y";
case Intrinsic::amdgcn_workgroup_id_z:
case Intrinsic::r600_read_tgid_z:
return "amdgpu-work-group-id-z";
case Intrinsic::amdgcn_dispatch_ptr:
return "amdgpu-dispatch-ptr";
case Intrinsic::amdgcn_dispatch_id:
return "amdgpu-dispatch-id";
case Intrinsic::amdgcn_kernarg_segment_ptr:
return "amdgpu-kernarg-segment-ptr";
case Intrinsic::amdgcn_implicitarg_ptr:
return "amdgpu-implicitarg-ptr";
case Intrinsic::amdgcn_queue_ptr:
case Intrinsic::trap:
case Intrinsic::debugtrap:
IsQueuePtr = true;
return "amdgpu-queue-ptr";
default:
return "";
}
}
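
// If the callee carries the attribute Name, add it to the parent as well.
// Returns true if the callee had the attribute.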
static bool handleAttr(Function &Parent, const Function &Callee,
StringRef Name) {
if (Callee.hasFnAttribute(Name)) {
Parent.addFnAttr(Name);
return true;
}
return false;
}

static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
bool &NeedQueuePtr) {
  // Note: the X ids are unnecessarily propagated to kernels here, since they
  // are always initialized for kernels anyway.
static const StringRef AttrNames[] = {
{ "amdgpu-work-item-id-x" },
{ "amdgpu-work-item-id-y" },
{ "amdgpu-work-item-id-z" },
{ "amdgpu-work-group-id-x" },
{ "amdgpu-work-group-id-y" },
{ "amdgpu-work-group-id-z" },
{ "amdgpu-dispatch-ptr" },
{ "amdgpu-dispatch-id" },
{ "amdgpu-kernarg-segment-ptr" },
{ "amdgpu-implicitarg-ptr" }
};
if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
NeedQueuePtr = true;
for (StringRef AttrName : AttrNames)
handleAttr(Parent, Callee, AttrName);
}

bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
bool Changed = false;
for (auto *Node : reverse(NodeList)) {
Function *Caller = Node->getFunction();
for (auto I : *Node) {
Function *Callee = std::get<1>(I)->getFunction();
if (Callee)
        Changed |= propagateUniformWorkGroupAttribute(*Caller, *Callee);
}
}
return Changed;
}
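
// Propagate the "uniform-work-group-size" attribute from Caller to Callee.
// A callee without an exact definition, or a caller that lacks the attribute,
// is conservatively marked "false"; a caller marked "true" propagates "true"
// to a callee that does not yet carry the attribute, and a caller marked
// "false" forces "false" on the callee.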
bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
Function &Caller, Function &Callee) {
// Check for externally defined function
if (!Callee.hasExactDefinition()) {
Callee.addFnAttr("uniform-work-group-size", "false");
if (!Caller.hasFnAttribute("uniform-work-group-size"))
Caller.addFnAttr("uniform-work-group-size", "false");
return true;
}
// Check if the Caller has the attribute
if (Caller.hasFnAttribute("uniform-work-group-size")) {
// Check if the value of the attribute is true
if (Caller.getFnAttribute("uniform-work-group-size")
.getValueAsString().equals("true")) {
// Propagate the attribute to the Callee, if it does not have it
if (!Callee.hasFnAttribute("uniform-work-group-size")) {
Callee.addFnAttr("uniform-work-group-size", "true");
return true;
}
} else {
Callee.addFnAttr("uniform-work-group-size", "false");
return true;
}
} else {
// If the attribute is absent, set it as false
Caller.addFnAttr("uniform-work-group-size", "false");
Callee.addFnAttr("uniform-work-group-size", "false");
return true;
}
return false;
}
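
// Scan the function for intrinsic calls, calls to other functions, and
// addrspacecasts (including those inside constant expressions) that imply
// implicit argument requirements, and add the corresponding attributes.
// Entry functions that make calls are also marked as needing flat scratch
// initialization when the target has a flat address space.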
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
bool HasFlat = ST.hasFlatAddressSpace();
bool HasApertureRegs = ST.hasApertureRegs();
SmallPtrSet<const Constant *, 8> ConstantExprVisited;
bool Changed = false;
bool NeedQueuePtr = false;
bool HaveCall = false;
bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
CallSite CS(&I);
if (CS) {
Function *Callee = CS.getCalledFunction();
// TODO: Do something with indirect calls.
if (!Callee) {
if (!CS.isInlineAsm())
HaveCall = true;
continue;
}
Intrinsic::ID IID = Callee->getIntrinsicID();
if (IID == Intrinsic::not_intrinsic) {
HaveCall = true;
copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
Changed = true;
} else {
bool NonKernelOnly = false;
StringRef AttrName = intrinsicToAttrName(IID,
NonKernelOnly, NeedQueuePtr);
if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
F.addFnAttr(AttrName);
Changed = true;
}
}
}
if (NeedQueuePtr || HasApertureRegs)
continue;
if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
if (castRequiresQueuePtr(ASC)) {
NeedQueuePtr = true;
continue;
}
}
for (const Use &U : I.operands()) {
const auto *OpC = dyn_cast<Constant>(U);
if (!OpC)
continue;
if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
NeedQueuePtr = true;
break;
}
}
}
}
if (NeedQueuePtr) {
F.addFnAttr("amdgpu-queue-ptr");
Changed = true;
}
// TODO: We could refine this to captured pointers that could possibly be
// accessed by flat instructions. For now this is mostly a poor way of
// estimating whether there are calls before argument lowering.
if (HasFlat && !IsFunc && HaveCall) {
F.addFnAttr("amdgpu-flat-scratch");
Changed = true;
}
return Changed;
}
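
// SCCs are visited bottom-up (callees before callers); queue the nodes for
// uniform-work-group-size propagation and add the per-function feature
// attributes as each SCC is processed.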
bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
bool Changed = false;
for (CallGraphNode *I : SCC) {
    // Build up a list of nodes that still have references; once a node with
    // no remaining references is reached, propagate the
    // uniform-work-group-size attribute through the collected list, then
    // start a new list.
    if (I->getNumReferences())
      NodeList.push_back(I);
    else {
      Changed |= processUniformWorkGroupAttribute();
      NodeList.clear();
    }
Function *F = I->getFunction();
// Add feature attributes
if (!F || F->isDeclaration())
continue;
Changed |= addFeatureAttributes(*F);
}
return Changed;
}

bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
if (!TPC)
report_fatal_error("TargetMachine is required");
TM = &TPC->getTM<TargetMachine>();
return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
return new AMDGPUAnnotateKernelFeatures();
}