// lib/Target/R600/AMDGPUTargetTransformInfo.cpp - llvm - Git at Google

 //===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // \file
 // This file implements a TargetTransformInfo analysis pass specific to the
 // AMDGPU target machine. It uses the target's detailed information to provide
 // more precise answers to certain TTI queries, while letting the target
 // independent and default TTI implementations handle the rest.
 //
 //===----------------------------------------------------------------------===//

 #include "AMDGPU.h"
 #include "AMDGPUTargetMachine.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/CostTable.h"
 #include "llvm/Target/TargetLowering.h"
 using namespace llvm;

 #define DEBUG_TYPE "AMDGPUtti"

 // Declare the pass initialization routine locally as target-specific passes
 // don't have a target-wide initialization entry point, and so we rely on the
 // pass constructor initialization.
 //
 // The AMDGPUTTI constructor that takes a target machine calls this routine
 // with the global pass registry, so the declaration must be visible before
 // the class body.
 namespace llvm {
 void initializeAMDGPUTTIPass(PassRegistry &);
 }

 namespace {

 class AMDGPUTTI final : public ImmutablePass, public TargetTransformInfo {
   const AMDGPUTargetMachine *TM;
   const AMDGPUSubtarget *ST;
   const AMDGPUTargetLowering *TLI;

   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
   /// are set if the result needs to be inserted and/or extracted from vectors.
   unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;

 public:
   AMDGPUTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
     llvm_unreachable("This pass cannot be directly constructed");
   }

   AMDGPUTTI(const AMDGPUTargetMachine *TM)
       : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
         TLI(TM->getSubtargetImpl()->getTargetLowering()) {
     initializeAMDGPUTTIPass(*PassRegistry::getPassRegistry());
   }

   void initializePass() override { pushTTIStack(this); }

   void getAnalysisUsage(AnalysisUsage &AU) const override {
     TargetTransformInfo::getAnalysisUsage(AU);
   }

   /// Pass identification.
   static char ID;

   /// Provide necessary pointer adjustments for the two base classes.
   void *getAdjustedAnalysisPointer(const void *ID) override {
     if (ID == &TargetTransformInfo::ID)
       return (TargetTransformInfo *)this;
     return this;
   }

   bool hasBranchDivergence() const override;

   void getUnrollingPreferences(const Function *F, Loop *L,
                                UnrollingPreferences &UP) const override;

   PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const override;

   unsigned getNumberOfRegisters(bool Vector) const override;
   unsigned getRegisterBitWidth(bool Vector) const override;
   unsigned getMaxInterleaveFactor() const override;
 };

 } // end anonymous namespace

 // Register AMDGPUTTI under the "AMDGPUtti" argument name as a member of the
 // TargetTransformInfo analysis group.
 // NOTE(review): the trailing true/true/false flags are presumably the
 // cfg-only, is-analysis and is-default switches of INITIALIZE_AG_PASS --
 // confirm against the macro definition.
 INITIALIZE_AG_PASS(AMDGPUTTI, TargetTransformInfo, "AMDGPUtti",
                    "AMDGPU Target Transform Info", true, true, false)
 // Definition of the static pass identifier declared inside the class; it is
 // handed to the ImmutablePass base by both constructors.
 char AMDGPUTTI::ID = 0;

 /// Factory for the AMDGPU TTI pass.
 ///
 /// Allocates a new AMDGPUTTI bound to \p TM and hands it back through its
 /// ImmutablePass base; the caller receives the raw pointer.
 ImmutablePass *
 llvm::createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM) {
   ImmutablePass *Pass = new AMDGPUTTI(TM);
   return Pass;
 }

 /// Always answers true to the branch-divergence query.
 bool AMDGPUTTI::hasBranchDivergence() const {
   return true;
 }

 /// Fill \p UP with the unrolling preferences for loop \p L.
 ///
 /// Every loop gets a threshold of 300, an unbounded unroll count and partial
 /// unrolling enabled. If the loop contains a GEP in the private address
 /// space whose underlying object is an alloca, the threshold is raised to
 /// 800 instead. The Function argument is not used.
 void AMDGPUTTI::getUnrollingPreferences(const Function *, Loop *L,
                                         UnrollingPreferences &UP) const {
   UP.Threshold = 300; // Twice the default.
   UP.MaxCount = UINT_MAX;
   UP.Partial = true;

   // TODO: Do we want runtime unrolling?

   for (const BasicBlock *BB : L->getBlocks()) {
     for (const Instruction &I : *BB) {
       const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
       if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
         continue;

       const Value *Ptr = GEP->getPointerOperand();
       const AllocaInst *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr));
       if (!Alloca)
         continue;

       // We want to do whatever we can to limit the number of alloca
       // instructions that make it through to the code generator.  allocas
       // require us to use indirect addressing, which is slow and prone to
       // compiler bugs.  If this loop does an address calculation on an
       // alloca ptr, then we want to use a higher than normal loop unroll
       // threshold. This will give SROA a better chance to eliminate these
       // allocas.
       //
       // Don't use the maximum allowed value here as it will make some
       // programs way too big.
       UP.Threshold = 800;

       // One qualifying GEP is enough; further scanning could only store the
       // same value again.
       return;
     }
   }
 }

 /// Report popcount support for integers that are \p TyWidth bits wide.
 ///
 /// \p TyWidth must be a power of two (asserted). Returns PSK_FastHardware
 /// when the subtarget's hasBCNT() accepts the width, PSK_Software otherwise.
 AMDGPUTTI::PopcntSupportKind
 AMDGPUTTI::getPopcntSupport(unsigned TyWidth) const {
   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
   if (ST->hasBCNT(TyWidth))
     return PSK_FastHardware;
   return PSK_Software;
 }

 /// Number of registers reported to the optimizer.
 ///
 /// Vector queries always get 0. Scalar queries get 256 on Southern Islands
 /// and newer generations, and 4 * 128 on older ones.
 unsigned AMDGPUTTI::getNumberOfRegisters(bool Vec) const {
   if (Vec)
     return 0;

   const bool IsSIOrNewer =
       ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS;

   // SI and later: number of VGPRs.
   // Earlier: XXX - 4 channels. Should these count as vector instead?
   return IsSIOrNewer ? 256 : 4 * 128;
 }

 /// Register width in bits; the same value is reported regardless of the
 /// (ignored) vector flag.
 unsigned AMDGPUTTI::getRegisterBitWidth(bool) const {
   const unsigned WidthInBits = 32;
   return WidthInBits;
 }

 /// Upper bound on the interleave factor reported by this target.
 unsigned AMDGPUTTI::getMaxInterleaveFactor() const {
   // Semi-arbitrary large amount.
   const unsigned Factor = 64;
   return Factor;
 }
	//===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// \file
	// This file implements a TargetTransformInfo analysis pass specific to the
	// AMDGPU target machine. It uses the target's detailed information to provide
	// more precise answers to certain TTI queries, while letting the target
	// independent and default TTI implementations handle the rest.
	//
	//===----------------------------------------------------------------------===//

	#include "AMDGPU.h"
	#include "AMDGPUTargetMachine.h"
	#include "llvm/Analysis/LoopInfo.h"
	#include "llvm/Analysis/TargetTransformInfo.h"
	#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Target/CostTable.h"
	#include "llvm/Target/TargetLowering.h"
	using namespace llvm;

	#define DEBUG_TYPE "AMDGPUtti"

	// Declare the pass initialization routine locally as target-specific passes
	// don't have a target-wide initialization entry point, and so we rely on the
	// pass constructor initialization.
	//
	// The AMDGPUTTI constructor that takes a target machine calls this routine
	// with the global pass registry, so the declaration must be visible before
	// the class body.
	namespace llvm {
	void initializeAMDGPUTTIPass(PassRegistry &);
	}

	namespace {

	class AMDGPUTTI final : public ImmutablePass, public TargetTransformInfo {
	const AMDGPUTargetMachine *TM;
	const AMDGPUSubtarget *ST;
	const AMDGPUTargetLowering *TLI;

	/// Estimate the overhead of scalarizing an instruction. Insert and Extract
	/// are set if the result needs to be inserted and/or extracted from vectors.
	unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;

	public:
	AMDGPUTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
	llvm_unreachable("This pass cannot be directly constructed");
	}

	AMDGPUTTI(const AMDGPUTargetMachine *TM)
	: ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
	TLI(TM->getSubtargetImpl()->getTargetLowering()) {
	initializeAMDGPUTTIPass(*PassRegistry::getPassRegistry());
	}

	void initializePass() override { pushTTIStack(this); }

	void getAnalysisUsage(AnalysisUsage &AU) const override {
	TargetTransformInfo::getAnalysisUsage(AU);
	}

	/// Pass identification.
	static char ID;

	/// Provide necessary pointer adjustments for the two base classes.
	void getAdjustedAnalysisPointer(const void ID) override {
	if (ID == &TargetTransformInfo::ID)
	return (TargetTransformInfo *)this;
	return this;
	}

	bool hasBranchDivergence() const override;

	void getUnrollingPreferences(const Function F, Loop L,
	UnrollingPreferences &UP) const override;

	PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const override;

	unsigned getNumberOfRegisters(bool Vector) const override;
	unsigned getRegisterBitWidth(bool Vector) const override;
	unsigned getMaxInterleaveFactor() const override;
	};

	} // end anonymous namespace

	// Register AMDGPUTTI under the "AMDGPUtti" argument name as a member of the
	// TargetTransformInfo analysis group.
	// NOTE(review): the trailing true/true/false flags are presumably the
	// cfg-only, is-analysis and is-default switches of INITIALIZE_AG_PASS --
	// confirm against the macro definition.
	INITIALIZE_AG_PASS(AMDGPUTTI, TargetTransformInfo, "AMDGPUtti",
	"AMDGPU Target Transform Info", true, true, false)
	// Definition of the static pass identifier declared inside the class; it is
	// handed to the ImmutablePass base by both constructors.
	char AMDGPUTTI::ID = 0;

	/// Factory for the AMDGPU TTI pass.
	///
	/// Allocates a new AMDGPUTTI bound to \p TM and hands it back through its
	/// ImmutablePass base; the caller receives the raw pointer.
	ImmutablePass *
	llvm::createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM) {
	  ImmutablePass *Pass = new AMDGPUTTI(TM);
	  return Pass;
	}

	/// Always answers true to the branch-divergence query.
	bool AMDGPUTTI::hasBranchDivergence() const {
	  return true;
	}

	void AMDGPUTTI::getUnrollingPreferences(const Function , Loop L,
	UnrollingPreferences &UP) const {
	UP.Threshold = 300; // Twice the default.
	UP.MaxCount = UINT_MAX;
	UP.Partial = true;

	// TODO: Do we want runtime unrolling?

	for (const BasicBlock *BB : L->getBlocks()) {
	for (const Instruction &I : *BB) {
	const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
	if (!GEP \|\| GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
	continue;

	const Value *Ptr = GEP->getPointerOperand();
	const AllocaInst *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr));
	if (Alloca) {
	// We want to do whatever we can to limit the number of alloca
	// instructions that make it through to the code generator. allocas
	// require us to use indirect addressing, which is slow and prone to
	// compiler bugs. If this loop does an address calculation on an
	// alloca ptr, then we want to use a higher than normal loop unroll
	// threshold. This will give SROA a better chance to eliminate these
	// allocas.
	//
	// Don't use the maximum allowed value here as it will make some
	// programs way too big.
	UP.Threshold = 800;
	}
	}
	}
	}

	/// Report popcount support for integers that are \p TyWidth bits wide.
	///
	/// \p TyWidth must be a power of two (asserted). Returns PSK_FastHardware
	/// when the subtarget's hasBCNT() accepts the width, PSK_Software otherwise.
	AMDGPUTTI::PopcntSupportKind
	AMDGPUTTI::getPopcntSupport(unsigned TyWidth) const {
	  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
	  if (ST->hasBCNT(TyWidth))
	    return PSK_FastHardware;
	  return PSK_Software;
	}

	/// Number of registers reported to the optimizer.
	///
	/// Vector queries always get 0. Scalar queries get 256 on Southern Islands
	/// and newer generations, and 4 * 128 on older ones.
	unsigned AMDGPUTTI::getNumberOfRegisters(bool Vec) const {
	  if (Vec)
	    return 0;

	  const bool IsSIOrNewer =
	      ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS;

	  // SI and later: number of VGPRs.
	  // Earlier: XXX - 4 channels. Should these count as vector instead?
	  return IsSIOrNewer ? 256 : 4 * 128;
	}

	/// Register width in bits; the same value is reported regardless of the
	/// (ignored) vector flag.
	unsigned AMDGPUTTI::getRegisterBitWidth(bool) const {
	  const unsigned WidthInBits = 32;
	  return WidthInBits;
	}

	/// Upper bound on the interleave factor reported by this target.
	unsigned AMDGPUTTI::getMaxInterleaveFactor() const {
	  // Semi-arbitrary large amount.
	  const unsigned Factor = 64;
	  return Factor;
	}