//===-- GCNPreRAOptimizations.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass combines split register tuple initialization into a single pseudo:
///
///   undef %0.sub1:sreg_64 = S_MOV_B32 1
///   %0.sub0:sreg_64 = S_MOV_B32 2
/// =>
///   %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 0x200000001
///
/// This is to allow rematerialization of a value instead of spilling. It is
/// supposed to run after the register coalescer, so that coalescing is not
/// disturbed, and before actual register allocation, so that the combined
/// value can be rematerialized.
///
/// Right now the pass only handles 64-bit SGPRs with immediate initializers,
/// although the same approach is possible for other register classes and
/// instructions if necessary.
///
/// This pass also adds register allocation hints to COPY instructions.
/// The hints will be post-processed by SIRegisterInfo::getRegAllocationHints.
/// When using True16, we often see a COPY moving a 16-bit value between a
/// VGPR_32 and a VGPR_16. If the VGPR_16 corresponding to the lo16 bits of
/// the VGPR_32 is used, the COPY can be completely eliminated.
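///
/// On subtargets that cannot copy directly between AGPRs, the pass likewise
/// forwards the VGPR source of a defining V_ACCVGPR_WRITE through AGPR
/// copies so that no intermediate temporary register is needed.
///
/// An illustrative True16 case (hypothetical MIR):
///
///   %1:vgpr_16 = COPY %0:vgpr_32
///
/// Such a COPY disappears entirely once %1 is allocated to the lo16 half of
/// the register holding %0.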
///
//===----------------------------------------------------------------------===//
#include "GCNPreRAOptimizations.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
using namespace llvm;
#define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
namespace {

class GCNPreRAOptimizationsImpl {
private:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;

  bool processReg(Register Reg);

public:
  GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {}
  bool run(MachineFunction &MF);
};

class GCNPreRAOptimizationsLegacy : public MachineFunctionPass {
public:
  static char ID;

  GCNPreRAOptimizationsLegacy() : MachineFunctionPass(ID) {
    initializeGCNPreRAOptimizationsLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AMDGPU Pre-RA optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervalsWrapperPass>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.
INITIALIZE_PASS_BEGIN(GCNPreRAOptimizationsLegacy, DEBUG_TYPE,
                      "AMDGPU Pre-RA optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_END(GCNPreRAOptimizationsLegacy, DEBUG_TYPE,
                    "AMDGPU Pre-RA optimizations", false, false)

char GCNPreRAOptimizationsLegacy::ID = 0;

char &llvm::GCNPreRAOptimizationsID = GCNPreRAOptimizationsLegacy::ID;

FunctionPass *llvm::createGCNPreRAOptimizationsLegacyPass() {
  return new GCNPreRAOptimizationsLegacy();
}
bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
  MachineInstr *Def0 = nullptr;
  MachineInstr *Def1 = nullptr;
  uint64_t Init = 0;
  bool Changed = false;
  SmallSet<Register, 32> ModifiedRegs;
  bool IsAGPRDst = TRI->isAGPRClass(MRI->getRegClass(Reg));
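  // Scan every def of Reg. Any opcode we cannot reason about makes the whole
  // register ineligible, so bail out of the function in that case.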
  for (MachineInstr &I : MRI->def_instructions(Reg)) {
    switch (I.getOpcode()) {
    default:
      return false;
    case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
      break;
    case AMDGPU::COPY: {
      // Some subtargets cannot do an AGPR to AGPR copy directly, and need an
      // intermediate temporary VGPR register. Try to find the defining
      // accvgpr_write to avoid the temporary register.

      if (!IsAGPRDst)
        return false;

      Register SrcReg = I.getOperand(1).getReg();

      if (!SrcReg.isVirtual())
        break;

      // Check if the source of the copy is another AGPR.
      bool IsAGPRSrc = TRI->isAGPRClass(MRI->getRegClass(SrcReg));
      if (!IsAGPRSrc)
        break;

      // def_instructions() does not look at subregs, so it may return an
      // instruction that defines the same vreg but a different subreg; we
      // have to check the subreg manually.
      Register SrcSubReg = I.getOperand(1).getSubReg();
      for (auto &Def : MRI->def_instructions(SrcReg)) {
        if (SrcSubReg != Def.getOperand(0).getSubReg())
          continue;

        if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
          MachineOperand DefSrcMO = Def.getOperand(1);

          // Immediates are not an issue and can be propagated in the
          // postrapseudos pass. Only handle cases where the defining
          // accvgpr_write source is a vreg.
          if (DefSrcMO.isReg() && DefSrcMO.getReg().isVirtual()) {
            // Propagate the source reg of the accvgpr_write to this COPY.
            I.getOperand(1).setReg(DefSrcMO.getReg());
            I.getOperand(1).setSubReg(DefSrcMO.getSubReg());

            // Reg uses were changed; collect the unique set of registers so
            // their live intervals can be updated at the end.
            ModifiedRegs.insert(DefSrcMO.getReg());
            ModifiedRegs.insert(SrcReg);

            Changed = true;
          }

          // Found the defining accvgpr_write, stop looking any further.
          break;
        }
      }
      break;
    }
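    // Accumulate the 64-bit immediate from the two 32-bit subregister moves.
    // Anything other than a plain "Reg.subN = S_MOV_B32 <imm>" disqualifies
    // the register.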
    case AMDGPU::S_MOV_B32:
      if (I.getOperand(0).getReg() != Reg || !I.getOperand(1).isImm() ||
          I.getNumOperands() != 2)
        return false;

      switch (I.getOperand(0).getSubReg()) {
      default:
        return false;
      case AMDGPU::sub0:
        if (Def0)
          return false;
        Def0 = &I;
        Init |= Lo_32(I.getOperand(1).getImm());
        break;
      case AMDGPU::sub1:
        if (Def1)
          return false;
        Def1 = &I;
        Init |= static_cast<uint64_t>(I.getOperand(1).getImm()) << 32;
        break;
      }
      break;
    }
  }
  // For an AGPR reg, check if the live intervals need to be updated.
  if (IsAGPRDst) {
    if (Changed) {
      for (Register RegToUpdate : ModifiedRegs) {
        LIS->removeInterval(RegToUpdate);
        LIS->createAndComputeVirtRegInterval(RegToUpdate);
      }
    }

    return Changed;
  }

  // For an SGPR reg, check if we can combine the instructions.
  if (!Def0 || !Def1 || Def0->getParent() != Def1->getParent())
    return Changed;

  LLVM_DEBUG(dbgs() << "Combining:\n  " << *Def0 << "  " << *Def1
                    << "    =>\n");
  if (SlotIndex::isEarlierInstr(LIS->getInstructionIndex(*Def1),
                                LIS->getInstructionIndex(*Def0)))
    std::swap(Def0, Def1);
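  // Fuse the two moves into a single S_MOV_B64_IMM_PSEUDO and recompute the
  // live interval of Reg, which now has a single def.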
  LIS->RemoveMachineInstrFromMaps(*Def0);
  LIS->RemoveMachineInstrFromMaps(*Def1);
  auto NewI = BuildMI(*Def0->getParent(), *Def0, Def0->getDebugLoc(),
                      TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), Reg)
                  .addImm(Init);

  Def0->eraseFromParent();
  Def1->eraseFromParent();
  LIS->InsertMachineInstrInMaps(*NewI);
  LIS->removeInterval(Reg);
  LIS->createAndComputeVirtRegInterval(Reg);

  LLVM_DEBUG(dbgs() << "    " << *NewI);

  return true;
}
bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;
  LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
  return GCNPreRAOptimizationsImpl(LIS).run(MF);
}
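
// New pass manager wrapper. Returning PreservedAnalyses::all() mirrors the
// setPreservesAll() declaration of the legacy pass above.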
PreservedAnalyses
GCNPreRAOptimizationsPass::run(MachineFunction &MF,
                               MachineFunctionAnalysisManager &MFAM) {
  LiveIntervals *LIS = &MFAM.getResult<LiveIntervalsAnalysis>(MF);
  GCNPreRAOptimizationsImpl(LIS).run(MF);
  return PreservedAnalyses::all();
}
bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();

  bool Changed = false;

  for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
    Register Reg = Register::index2VirtReg(I);
    if (!LIS->hasInterval(Reg))
      continue;
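    // Only consider 64-bit SGPR tuples, plus AGPRs on subtargets that lack a
    // direct AGPR-to-AGPR copy (i.e. without the gfx90a instructions).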
    const TargetRegisterClass *RC = MRI->getRegClass(Reg);
    if ((RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC)) &&
        (ST.hasGFX90AInsts() || !TRI->isAGPRClass(RC)))
      continue;

    Changed |= processReg(Reg);
  }
  if (!ST.useRealTrue16Insts())
    return Changed;

  // Add RA hints to improve True16 COPY elimination.
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::COPY)
        continue;
      Register Dst = MI.getOperand(0).getReg();
      Register Src = MI.getOperand(1).getReg();
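      // A copy between a virtual VGPR_16 and a physical VGPR_32: hint the
      // virtual register directly at the lo16 half of the physical register.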
      if (Dst.isVirtual() &&
          MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
          Src.isPhysical() &&
          TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
        MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
      if (Src.isVirtual() &&
          MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
          Dst.isPhysical() &&
          TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
        MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));

      if (!Dst.isVirtual() || !Src.isVirtual())
        continue;
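      // Both operands are virtual: record paired size hints for
      // SIRegisterInfo::getRegAllocationHints to resolve once one side is
      // assigned, steering the 16-bit vreg to the lo16 half of the 32-bit one.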
      if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
          MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
        MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
        MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
      }
      if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
          MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
        MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
    }
  }

  return Changed;
}