| //===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| /// \file |
| /// \brief Try to reassign registers on GFX10+ to reduce register bank |
| /// conflicts. |
| /// |
| /// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in |
| /// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to |
| /// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1, |
| /// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc. |
| /// |
| /// The shader can read one dword from each of these banks once per cycle. |
| /// If an instruction has to read more register operands from the same bank |
| /// an additional cycle is needed. HW attempts to pre-load registers through |
| /// input operand gathering, but a stall cycle may occur if that fails. For |
| /// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands, |
| /// potentially incuring 2 stall cycles. |
| /// |
| /// The pass tries to reassign registers to reduce bank conflicts. |
| /// |
| /// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so |
| /// that 4 has to be subtracted from an SGPR bank number to get the real value. |
| /// This also corresponds to bit numbers in bank masks used in the pass. |
| /// |
| //===----------------------------------------------------------------------===// |
| |
| #include "AMDGPU.h" |
| #include "GCNSubtarget.h" |
| #include "SIMachineFunctionInfo.h" |
| #include "llvm/ADT/SmallSet.h" |
| #include "llvm/ADT/Statistic.h" |
| #include "llvm/CodeGen/LiveIntervals.h" |
| #include "llvm/CodeGen/LiveRegMatrix.h" |
| #include "llvm/CodeGen/MachineFunctionPass.h" |
| #include "llvm/CodeGen/MachineLoopInfo.h" |
| #include "llvm/InitializePasses.h" |
| |
| using namespace llvm; |
| using namespace AMDGPU; |
| |
| static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign", |
| cl::desc("Verify stall cycles in the regbanks reassign pass"), |
| cl::value_desc("0|1|2"), |
| cl::init(0), cl::Hidden); |
| |
| // Threshold to keep compile time reasonable. |
| static cl::opt<unsigned> VRegThresh("amdgpu-regbanks-reassign-threshold", |
| cl::desc("Max number of vregs to run the regbanks reassign pass"), |
| cl::init(100000), cl::Hidden); |
| |
| #define DEBUG_TYPE "amdgpu-regbanks-reassign" |
| |
| #define NUM_VGPR_BANKS 4 |
| #define NUM_SGPR_BANKS 8 |
| #define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS) |
| #define SGPR_BANK_OFFSET NUM_VGPR_BANKS |
| #define VGPR_BANK_MASK 0xf |
| #define SGPR_BANK_MASK 0xff0 |
| #define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET) |
| |
| STATISTIC(NumStallsDetected, |
| "Number of operand read stalls detected"); |
| STATISTIC(NumStallsRecovered, |
| "Number of operand read stalls recovered"); |
| |
| namespace { |
| |
| class GCNRegBankReassign : public MachineFunctionPass { |
| |
| class OperandMask { |
| public: |
| OperandMask(unsigned r, unsigned s, unsigned m) |
| : Reg(r), SubReg(s), Mask(m) {} |
| Register Reg; |
| unsigned SubReg; |
| unsigned Mask; |
| }; |
| |
| class Candidate { |
| public: |
| Candidate(MachineInstr *mi, Register reg, unsigned subreg, |
| unsigned freebanks) |
| : MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks) {} |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void dump(const GCNRegBankReassign *P) const { |
| MI->dump(); |
| dbgs() << P->printReg(Reg) << " to banks "; |
| dumpFreeBanks(FreeBanks); |
| dbgs() << '\n'; |
| } |
| #endif |
| |
| MachineInstr *MI; |
| Register Reg; |
| unsigned SubReg; |
| unsigned FreeBanks; |
| }; |
| |
| class CandidateList : public std::map<unsigned, std::list<Candidate>> { |
| public: |
| void push(unsigned Weight, const Candidate&& C) { |
| operator[](Weight).push_front(C); |
| } |
| |
| Candidate &back() { |
| return rbegin()->second.back(); |
| } |
| |
| void pop_back() { |
| rbegin()->second.pop_back(); |
| if (rbegin()->second.empty()) |
| erase(rbegin()->first); |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| void dump(const GCNRegBankReassign *P) const { |
| dbgs() << "\nCandidates:\n\n"; |
| for (auto &B : *this) { |
| dbgs() << " Weight " << B.first << ":\n"; |
| for (auto &C : B.second) |
| C.dump(P); |
| } |
| dbgs() << "\n\n"; |
| } |
| #endif |
| }; |
| |
| public: |
| static char ID; |
| |
| public: |
| GCNRegBankReassign(RegBankReassignMode Mode = RM_BOTH) |
| : MachineFunctionPass(ID), Mode(Mode) { |
| initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry()); |
| } |
| |
| bool runOnMachineFunction(MachineFunction &MF) override; |
| |
| StringRef getPassName() const override { return "GCN RegBank Reassign"; } |
| |
| void getAnalysisUsage(AnalysisUsage &AU) const override { |
| AU.addRequired<MachineLoopInfo>(); |
| AU.addRequired<LiveIntervals>(); |
| AU.addRequired<VirtRegMap>(); |
| AU.addRequired<LiveRegMatrix>(); |
| AU.setPreservesAll(); |
| MachineFunctionPass::getAnalysisUsage(AU); |
| } |
| |
| private: |
| const GCNSubtarget *ST; |
| |
| const MachineRegisterInfo *MRI; |
| |
| const SIRegisterInfo *TRI; |
| |
| MachineLoopInfo *MLI; |
| |
| VirtRegMap *VRM; |
| |
| LiveRegMatrix *LRM; |
| |
| LiveIntervals *LIS; |
| |
| RegBankReassignMode Mode; |
| |
| unsigned MaxNumVGPRs; |
| |
| unsigned MaxNumSGPRs; |
| |
| BitVector RegsUsed; |
| |
| SmallVector<OperandMask, 8> OperandMasks; |
| |
| CandidateList Candidates; |
| |
| const MCPhysReg *CSRegs; |
| |
| // Returns bank for a phys reg. |
| unsigned getPhysRegBank(Register Reg, unsigned SubReg) const; |
| |
| // Return a bit set for each register bank used. 4 banks for VGPRs and |
| // 8 banks for SGPRs. |
| // Registers already processed and recorded in RegsUsed are excluded. |
| // If Bank is not -1 assume Reg:SubReg to belong to that Bank. |
| uint32_t getRegBankMask(Register Reg, unsigned SubReg, int Bank); |
| |
| // Analyze one instruction returning the number of stalls and a mask of the |
| // banks used by all operands. |
| // If Reg and Bank are provided, assume all uses of Reg will be replaced with |
| // a register chosen from Bank. |
| std::pair<unsigned, unsigned> analyzeInst(const MachineInstr &MI, |
| Register Reg = Register(), |
| unsigned SubReg = 0, int Bank = -1); |
| |
| // Return true if register is regular VGPR or SGPR or their tuples. |
| // Returns false for special registers like m0, vcc etc. |
| bool isReassignable(Register Reg) const; |
| |
| // Check if registers' defs are old and may be pre-loaded. |
| // Returns 0 if both registers are old enough, 1 or 2 if one or both |
| // registers will not likely be pre-loaded. |
| unsigned getOperandGatherWeight(const MachineInstr& MI, |
| Register Reg1, |
| Register Reg2, |
| unsigned StallCycles) const; |
| |
| |
| // Find all bank bits in UsedBanks where Mask can be relocated to. |
| unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const; |
| |
| // Find all bank bits in UsedBanks where Mask can be relocated to. |
| // Bank is relative to the register and not its subregister component. |
| // Returns 0 is a register is not reassignable. |
| unsigned getFreeBanks(Register Reg, unsigned SubReg, unsigned Mask, |
| unsigned UsedBanks) const; |
| |
| // Add cadidate instruction to the work list. |
| void collectCandidates(MachineInstr& MI, unsigned UsedBanks, |
| unsigned StallCycles); |
| |
| // Collect cadidate instructions across function. Returns a number stall |
| // cycles detected. Only counts stalls if Collect is false. |
| unsigned collectCandidates(MachineFunction &MF, bool Collect = true); |
| |
| // Remove all candidates that read specified register. |
| void removeCandidates(Register Reg); |
| |
| // Compute stalls within the uses of SrcReg replaced by a register from |
| // Bank. If Bank is -1 does not perform substitution. If Collect is set |
| // candidates are collected and added to work list. |
| unsigned computeStallCycles(Register SrcReg, |
| Register Reg = Register(), |
| unsigned SubReg = 0, int Bank = -1, |
| bool Collect = false); |
| |
| // Search for a register in Bank unused within LI. |
| // Returns phys reg or NoRegister. |
| MCRegister scavengeReg(LiveInterval &LI, unsigned Bank, |
| unsigned SubReg) const; |
| |
| // Try to reassign candidate. Returns number or stall cycles saved. |
| unsigned tryReassign(Candidate &C); |
| |
| bool verifyCycles(MachineFunction &MF, |
| unsigned OriginalCycles, unsigned CyclesSaved); |
| |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| public: |
| Printable printReg(Register Reg, unsigned SubReg = 0) const { |
| return Printable([Reg, SubReg, this](raw_ostream &OS) { |
| if (Reg.isPhysical()) { |
| OS << llvm::printReg(Reg, TRI); |
| return; |
| } |
| if (!VRM->isAssignedReg(Reg)) |
| OS << "<unassigned> " << llvm::printReg(Reg, TRI); |
| else |
| OS << llvm::printReg(Reg, TRI) << '(' |
| << llvm::printReg(VRM->getPhys(Reg), TRI) << ')'; |
| if (SubReg) |
| OS << ':' << TRI->getSubRegIndexName(SubReg); |
| }); |
| } |
| |
| static Printable printBank(unsigned Bank) { |
| return Printable([Bank](raw_ostream &OS) { |
| OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank); |
| }); |
| } |
| |
| static void dumpFreeBanks(unsigned FreeBanks) { |
| for (unsigned L = 0; L < NUM_BANKS; ++L) |
| if (FreeBanks & (1 << L)) |
| dbgs() << printBank(L) << ' '; |
| } |
| #endif |
| }; |
| |
| } // End anonymous namespace. |
| |
| INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", |
| false, false) |
| INITIALIZE_PASS_DEPENDENCY(LiveIntervals) |
| INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) |
| INITIALIZE_PASS_DEPENDENCY(VirtRegMap) |
| INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) |
| INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", |
| false, false) |
| |
| |
| char GCNRegBankReassign::ID = 0; |
| |
| char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID; |
| |
| unsigned GCNRegBankReassign::getPhysRegBank(Register Reg, |
| unsigned SubReg) const { |
| assert(Reg.isPhysical()); |
| |
| const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); |
| unsigned Size = TRI->getRegSizeInBits(*RC); |
| if (Size == 16) |
| Reg = TRI->get32BitRegister(Reg); |
| else if (Size > 32) { |
| if (SubReg) { |
| const TargetRegisterClass *SubRC = TRI->getSubRegClass(RC, SubReg); |
| Reg = TRI->getSubReg(Reg, SubReg); |
| if (TRI->getRegSizeInBits(*SubRC) > 32) |
| Reg = TRI->getSubReg(Reg, AMDGPU::sub0); |
| } else { |
| Reg = TRI->getSubReg(Reg, AMDGPU::sub0); |
| } |
| } |
| |
| if (TRI->hasVGPRs(RC)) { |
| unsigned RegNo = Reg - AMDGPU::VGPR0; |
| return RegNo % NUM_VGPR_BANKS; |
| } |
| |
| unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2; |
| return RegNo % NUM_SGPR_BANKS + SGPR_BANK_OFFSET; |
| } |
| |
| uint32_t GCNRegBankReassign::getRegBankMask(Register Reg, unsigned SubReg, |
| int Bank) { |
| if (Reg.isVirtual()) { |
| if (!VRM->isAssignedReg(Reg)) |
| return 0; |
| |
| Reg = VRM->getPhys(Reg); |
| if (!Reg) |
| return 0; |
| if (SubReg) |
| Reg = TRI->getSubReg(Reg, SubReg); |
| } |
| |
| const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); |
| unsigned Size = TRI->getRegSizeInBits(*RC); |
| |
| if (Size == 16) { |
| Reg = TRI->get32BitRegister(Reg); |
| Size = 1; |
| } else { |
| Size /= 32; |
| if (Size > 1) |
| Reg = TRI->getSubReg(Reg, AMDGPU::sub0); |
| } |
| |
| if (TRI->hasVGPRs(RC)) { |
| // VGPRs have 4 banks assigned in a round-robin fashion. |
| unsigned RegNo = Reg - AMDGPU::VGPR0; |
| uint32_t Mask = maskTrailingOnes<uint32_t>(Size); |
| unsigned Used = 0; |
| // Bitmask lacks an extract method |
| for (unsigned I = 0; I < Size; ++I) |
| if (RegsUsed.test(RegNo + I)) |
| Used |= 1 << I; |
| RegsUsed.set(RegNo, RegNo + Size); |
| Mask &= ~Used; |
| Mask <<= (Bank == -1) ? RegNo % NUM_VGPR_BANKS : uint32_t(Bank); |
| return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; |
| } |
| |
| // SGPRs have 8 banks holding 2 consequitive registers each. |
| unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2; |
| unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs(); |
| if (RegNo + StartBit >= RegsUsed.size()) |
| return 0; |
| |
| if (Size > 1) |
| Size /= 2; |
| unsigned Mask = (1 << Size) - 1; |
| unsigned Used = 0; |
| for (unsigned I = 0; I < Size; ++I) |
| if (RegsUsed.test(StartBit + RegNo + I)) |
| Used |= 1 << I; |
| RegsUsed.set(StartBit + RegNo, StartBit + RegNo + Size); |
| Mask &= ~Used; |
| Mask <<= (Bank == -1) ? RegNo % NUM_SGPR_BANKS |
| : unsigned(Bank - SGPR_BANK_OFFSET); |
| Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; |
| // Reserve 4 bank ids for VGPRs. |
| return Mask << SGPR_BANK_OFFSET; |
| } |
| |
| std::pair<unsigned, unsigned> |
| GCNRegBankReassign::analyzeInst(const MachineInstr &MI, Register Reg, |
| unsigned SubReg, int Bank) { |
| unsigned StallCycles = 0; |
| unsigned UsedBanks = 0; |
| |
| if (MI.isMetaInstruction()) |
| return std::make_pair(StallCycles, UsedBanks); |
| |
| if (!(Mode & RM_SGPR) && |
| MI.getDesc().TSFlags & (SIInstrFlags::SMRD | SIInstrFlags::SALU)) |
| return std::make_pair(StallCycles, UsedBanks); |
| |
| RegsUsed.reset(); |
| OperandMasks.clear(); |
| for (const auto& Op : MI.explicit_uses()) { |
| // Undef can be assigned to any register, so two vregs can be assigned |
| // the same phys reg within the same instruction. |
| if (!Op.isReg() || Op.isUndef()) |
| continue; |
| |
| const Register R = Op.getReg(); |
| const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, R); |
| |
| // Do not compute stalls for AGPRs |
| if (TRI->hasAGPRs(RC)) |
| continue; |
| if ((Mode != RM_BOTH) && !(Mode & (TRI->hasVGPRs(RC) ? RM_VGPR : RM_SGPR))) |
| continue; |
| |
| // Do not compute stalls if sub-register covers all banks |
| if (Op.getSubReg()) { |
| LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()); |
| if (TRI->hasVGPRs(RC)) { |
| if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS) |
| continue; |
| } else { |
| if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS) |
| continue; |
| } |
| } |
| |
| unsigned ShiftedBank = Bank; |
| |
| if (Bank != -1 && R == Reg && (Op.getSubReg() || SubReg)) { |
| unsigned RegOffset = |
| TRI->getChannelFromSubReg(SubReg ? SubReg : (unsigned)AMDGPU::sub0); |
| unsigned Offset = TRI->getChannelFromSubReg( |
| Op.getSubReg() ? Op.getSubReg() : (unsigned)AMDGPU::sub0); |
| if (Bank < NUM_VGPR_BANKS) { |
| unsigned Shift = ((NUM_VGPR_BANKS + Offset) - RegOffset); |
| ShiftedBank = (Bank + Shift) % NUM_VGPR_BANKS; |
| } else if (Bank >= SGPR_BANK_OFFSET) { |
| unsigned Shift = (NUM_SGPR_BANKS + (Offset >> 1)) - (RegOffset >> 1); |
| ShiftedBank = SGPR_BANK_OFFSET + |
| (Bank - SGPR_BANK_OFFSET + Shift) % NUM_SGPR_BANKS; |
| } |
| } |
| |
| uint32_t Mask = getRegBankMask(R, Op.getSubReg(), |
| (Reg == R) ? ShiftedBank : -1); |
| StallCycles += countPopulation(UsedBanks & Mask); |
| UsedBanks |= Mask; |
| OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask)); |
| } |
| |
| return std::make_pair(StallCycles, UsedBanks); |
| } |
| |
| unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI, |
| Register Reg1, |
| Register Reg2, |
| unsigned StallCycles) const |
| { |
| unsigned Defs = 0; |
| MachineBasicBlock::const_instr_iterator Def(MI.getIterator()); |
| MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin()); |
| for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) { |
| if (MI.isDebugInstr()) |
| continue; |
| --Def; |
| if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF) |
| continue; |
| if (Def->modifiesRegister(Reg1, TRI)) |
| Defs |= 1; |
| if (Def->modifiesRegister(Reg2, TRI)) |
| Defs |= 2; |
| } |
| return countPopulation(Defs); |
| } |
| |
| bool GCNRegBankReassign::isReassignable(Register Reg) const { |
| if (Reg.isPhysical() || !VRM->isAssignedReg(Reg)) |
| return false; |
| |
| // InlineSpiller does not call LRM::assign() after an LI split leaving it |
| // in an inconsistent state, so we cannot call LRM::unassign(). |
| // See llvm bug #48911. |
| // Skip reassign if a register has originated from such split. |
| // FIXME: Remove the workaround when bug #48911 is fixed. |
| if (VRM->getPreSplitReg(Reg)) |
| return false; |
| |
| const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); |
| |
| Register PhysReg = VRM->getPhys(Reg); |
| |
| if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg) |
| return false; |
| |
| for (auto U : MRI->use_nodbg_operands(Reg)) { |
| if (U.isImplicit()) |
| return false; |
| const MachineInstr *UseInst = U.getParent(); |
| if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg) |
| return false; |
| } |
| |
| const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg); |
| unsigned Size = TRI->getRegSizeInBits(*RC); |
| |
| // TODO: Support 16 bit registers. Those needs to be moved with their |
| // parent VGPR_32 and potentially a sibling 16 bit sub-register. |
| if (Size < 32) |
| return false; |
| |
| if (TRI->hasVGPRs(RC)) |
| return true; |
| |
| if (Size == 16) |
| return AMDGPU::SGPR_LO16RegClass.contains(PhysReg); |
| |
| if (Size > 32) |
| PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0); |
| |
| return AMDGPU::SGPR_32RegClass.contains(PhysReg); |
| } |
| |
| unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask, |
| unsigned UsedBanks) const { |
| unsigned Size = countPopulation(Mask); |
| unsigned FreeBanks = 0; |
| unsigned Bank = findFirstSet(Mask); |
| |
| UsedBanks &= ~Mask; |
| |
| // Find free VGPR banks |
| if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) { |
| for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) { |
| if (Bank == I) |
| continue; |
| unsigned NewMask = ((1 << Size) - 1) << I; |
| NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; |
| if (!(UsedBanks & NewMask)) |
| FreeBanks |= 1 << I; |
| } |
| return FreeBanks; |
| } |
| |
| // Find free SGPR banks |
| // SGPR tuples must be aligned, so step is size in banks it |
| // crosses. |
| Bank -= SGPR_BANK_OFFSET; |
| for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) { |
| if (Bank == I) |
| continue; |
| unsigned NewMask = ((1 << Size) - 1) << I; |
| NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; |
| if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET))) |
| FreeBanks |= (1 << SGPR_BANK_OFFSET) << I; |
| } |
| |
| return FreeBanks; |
| } |
| |
| unsigned GCNRegBankReassign::getFreeBanks(Register Reg, |
| unsigned SubReg, |
| unsigned Mask, |
| unsigned UsedBanks) const { |
| if (!isReassignable(Reg)) |
| return 0; |
| |
| unsigned FreeBanks = getFreeBanks(Mask, UsedBanks); |
| |
| unsigned Offset = TRI->getChannelFromSubReg(SubReg); |
| if (Offset && (Mask & VGPR_BANK_MASK)) { |
| unsigned Shift = Offset; |
| if (Shift >= NUM_VGPR_BANKS) |
| return 0; |
| unsigned VB = FreeBanks & VGPR_BANK_MASK; |
| FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) & |
| VGPR_BANK_MASK; |
| } else if (Offset > 1 && (Mask & SGPR_BANK_MASK)) { |
| unsigned Shift = Offset >> 1; |
| if (Shift >= NUM_SGPR_BANKS) |
| return 0; |
| unsigned SB = FreeBanks >> SGPR_BANK_OFFSET; |
| FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) & |
| SGPR_BANK_SHIFTED_MASK; |
| FreeBanks <<= SGPR_BANK_OFFSET; |
| } |
| |
| LLVM_DEBUG(if (FreeBanks) { |
| dbgs() << "Potential reassignments of " << printReg(Reg, SubReg) |
| << " to banks: "; dumpFreeBanks(FreeBanks); |
| dbgs() << '\n'; }); |
| |
| return FreeBanks; |
| } |
| |
| void GCNRegBankReassign::collectCandidates(MachineInstr& MI, |
| unsigned UsedBanks, |
| unsigned StallCycles) { |
| LLVM_DEBUG(MI.dump()); |
| |
| if (!StallCycles) |
| return; |
| |
| LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n'); |
| |
| for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) { |
| for (unsigned J = I + 1; J != E; ++J) { |
| if (!(OperandMasks[I].Mask & OperandMasks[J].Mask)) |
| continue; |
| |
| Register Reg1 = OperandMasks[I].Reg; |
| Register Reg2 = OperandMasks[J].Reg; |
| unsigned SubReg1 = OperandMasks[I].SubReg; |
| unsigned SubReg2 = OperandMasks[J].SubReg; |
| unsigned Mask1 = OperandMasks[I].Mask; |
| unsigned Mask2 = OperandMasks[J].Mask; |
| unsigned Size1 = countPopulation(Mask1); |
| unsigned Size2 = countPopulation(Mask2); |
| |
| LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) << |
| " and " << printReg(Reg2, SubReg2) << '\n'); |
| |
| unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles); |
| Weight += MLI->getLoopDepth(MI.getParent()) * 10; |
| |
| LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n'); |
| |
| unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks); |
| unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks); |
| if (FreeBanks1) |
| Candidates.push(Weight + ((Size2 > Size1) ? 1 : 0), |
| Candidate(&MI, Reg1, SubReg1, FreeBanks1)); |
| if (FreeBanks2) |
| Candidates.push(Weight + ((Size1 > Size2) ? 1 : 0), |
| Candidate(&MI, Reg2, SubReg2, FreeBanks2)); |
| } |
| } |
| } |
| |
| unsigned GCNRegBankReassign::computeStallCycles(Register SrcReg, Register Reg, |
| unsigned SubReg, int Bank, |
| bool Collect) { |
| unsigned TotalStallCycles = 0; |
| SmallSet<const MachineInstr *, 16> Visited; |
| |
| for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) { |
| if (MI.isBundle()) |
| continue; |
| if (!Visited.insert(&MI).second) |
| continue; |
| unsigned StallCycles; |
| unsigned UsedBanks; |
| std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, SubReg, Bank); |
| TotalStallCycles += StallCycles; |
| if (Collect) |
| collectCandidates(MI, UsedBanks, StallCycles); |
| } |
| |
| return TotalStallCycles; |
| } |
| |
| MCRegister GCNRegBankReassign::scavengeReg(LiveInterval &LI, unsigned Bank, |
| unsigned SubReg) const { |
| const TargetRegisterClass *RC = MRI->getRegClass(LI.reg()); |
| unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs |
| : MaxNumSGPRs; |
| unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0 |
| : AMDGPU::SGPR0); |
| |
| for (MCRegister Reg : RC->getRegisters()) { |
| // Check occupancy limit. |
| if (TRI->isSubRegisterEq(Reg, MaxReg)) |
| break; |
| |
| if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg, SubReg) != Bank) |
| continue; |
| |
| for (unsigned I = 0; CSRegs[I]; ++I) |
| if (TRI->isSubRegisterEq(Reg, CSRegs[I]) && |
| !LRM->isPhysRegUsed(CSRegs[I])) |
| return MCRegister::from(AMDGPU::NoRegister); |
| |
| LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n'); |
| |
| if (!LRM->checkInterference(LI, Reg)) |
| return Reg; |
| } |
| |
| return MCRegister::from(AMDGPU::NoRegister); |
| } |
| |
| unsigned GCNRegBankReassign::tryReassign(Candidate &C) { |
| if (!LIS->hasInterval(C.Reg)) |
| return 0; |
| |
| LiveInterval &LI = LIS->getInterval(C.Reg); |
| LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump(); |
| LI.dump()); |
| |
| // For each candidate bank walk all instructions in the range of live |
| // interval and check if replacing the register with one belonging to |
| // the candidate bank reduces conflicts. |
| |
| unsigned OrigStalls = computeStallCycles(C.Reg); |
| LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n'); |
| if (!OrigStalls) |
| return 0; |
| |
| struct BankStall { |
| BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {}; |
| bool operator<(const BankStall &RHS) const { |
| if (Stalls == RHS.Stalls) |
| return Bank < RHS.Bank; |
| return Stalls > RHS.Stalls; |
| } |
| unsigned Bank; |
| unsigned Stalls; |
| }; |
| SmallVector<BankStall, 8> BankStalls; |
| |
| for (int Bank = 0; Bank < NUM_BANKS; ++Bank) { |
| if (C.FreeBanks & (1 << Bank)) { |
| LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n'); |
| unsigned Stalls = computeStallCycles(C.Reg, C.Reg, C.SubReg, Bank); |
| if (Stalls < OrigStalls) { |
| LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> " |
| << Stalls << '\n'); |
| BankStalls.push_back(BankStall((unsigned)Bank, Stalls)); |
| } |
| } |
| } |
| llvm::sort(BankStalls); |
| |
| MCRegister OrigReg = VRM->getPhys(C.Reg); |
| LRM->unassign(LI); |
| while (!BankStalls.empty()) { |
| BankStall BS = BankStalls.pop_back_val(); |
| MCRegister Reg = scavengeReg(LI, BS.Bank, C.SubReg); |
| if (Reg == AMDGPU::NoRegister) { |
| LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank) |
| << '\n'); |
| continue; |
| } |
| LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg) |
| << (LRM->isPhysRegUsed(Reg) ? "" : " (new)") |
| << " in bank " << printBank(BS.Bank) << '\n'); |
| |
| LRM->assign(LI, Reg); |
| |
| LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n'); |
| |
| return OrigStalls - BS.Stalls; |
| } |
| LRM->assign(LI, OrigReg); |
| |
| return 0; |
| } |
| |
| unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF, |
| bool Collect) { |
| unsigned TotalStallCycles = 0; |
| |
| for (MachineBasicBlock &MBB : MF) { |
| |
| LLVM_DEBUG(if (Collect) { |
| if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber(); |
| else dbgs() << MBB.getName(); dbgs() << ":\n"; |
| }); |
| |
| for (MachineInstr &MI : MBB.instrs()) { |
| if (MI.isBundle()) |
| continue; // we analyze the instructions inside the bundle individually |
| |
| unsigned StallCycles; |
| unsigned UsedBanks; |
| std::tie(StallCycles, UsedBanks) = analyzeInst(MI); |
| |
| if (Collect) |
| collectCandidates(MI, UsedBanks, StallCycles); |
| |
| TotalStallCycles += StallCycles; |
| } |
| |
| LLVM_DEBUG(if (Collect) { dbgs() << '\n'; }); |
| } |
| |
| return TotalStallCycles; |
| } |
| |
| void GCNRegBankReassign::removeCandidates(Register Reg) { |
| typename CandidateList::iterator Next; |
| for (auto I = Candidates.begin(), E = Candidates.end(); I != E; I = Next) { |
| Next = std::next(I); |
| I->second.remove_if([Reg, this](const Candidate& C) { |
| return C.MI->readsRegister(Reg, TRI); |
| }); |
| if (I->second.empty()) |
| Candidates.erase(I); |
| } |
| } |
| |
| bool GCNRegBankReassign::verifyCycles(MachineFunction &MF, |
| unsigned OriginalCycles, |
| unsigned CyclesSaved) { |
| unsigned StallCycles = collectCandidates(MF, false); |
| LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles |
| << " stall cycles left\n"); |
| return StallCycles + CyclesSaved == OriginalCycles; |
| } |
| |
| bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) { |
| ST = &MF.getSubtarget<GCNSubtarget>(); |
| if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction())) |
| return false; |
| |
| MRI = &MF.getRegInfo(); |
| |
| LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " |
| << MF.getName() << '\n' |
| << ((Mode & RM_VGPR) ? "VGPR " : "") |
| << ((Mode & RM_SGPR) ? "SGPR " : "") << "mode\n" |
| << "NumVirtRegs = " << MRI->getNumVirtRegs() << "\n\n"); |
| |
| if (MRI->getNumVirtRegs() > VRegThresh) { |
| LLVM_DEBUG(dbgs() << "NumVirtRegs > " << VRegThresh |
| << " threshold, skipping function.\n\n"); |
| return false; |
| } |
| |
| TRI = ST->getRegisterInfo(); |
| MLI = &getAnalysis<MachineLoopInfo>(); |
| VRM = &getAnalysis<VirtRegMap>(); |
| LRM = &getAnalysis<LiveRegMatrix>(); |
| LIS = &getAnalysis<LiveIntervals>(); |
| |
| const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
| unsigned Occupancy = MFI->getOccupancy(); |
| MaxNumVGPRs = ST->getMaxNumVGPRs(MF); |
| MaxNumSGPRs = ST->getMaxNumSGPRs(MF); |
| MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs); |
| MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs); |
| |
| CSRegs = MRI->getCalleeSavedRegs(); |
| unsigned NumRegBanks = AMDGPU::VGPR_32RegClass.getNumRegs() + |
| // Not a tight bound |
| AMDGPU::SReg_32RegClass.getNumRegs() / 2 + 1; |
| RegsUsed.resize(NumRegBanks); |
| |
| unsigned StallCycles = collectCandidates(MF); |
| NumStallsDetected += StallCycles; |
| |
| LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in " |
| "function " << MF.getName() << '\n'); |
| |
| LLVM_DEBUG(Candidates.dump(this)); |
| |
| unsigned CyclesSaved = 0; |
| while (!Candidates.empty()) { |
| Candidate C = Candidates.back(); |
| unsigned LocalCyclesSaved = tryReassign(C); |
| CyclesSaved += LocalCyclesSaved; |
| |
| if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) |
| report_fatal_error("RegBank reassign stall cycles verification failed."); |
| |
| Candidates.pop_back(); |
| if (LocalCyclesSaved) { |
| removeCandidates(C.Reg); |
| computeStallCycles(C.Reg, AMDGPU::NoRegister, 0, -1, true); |
| |
| LLVM_DEBUG(Candidates.dump(this)); |
| } |
| } |
| NumStallsRecovered += CyclesSaved; |
| |
| LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved |
| << " cycles saved in function " << MF.getName() << '\n'); |
| |
| Candidates.clear(); |
| |
| if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) |
| report_fatal_error("RegBank reassign stall cycles verification failed."); |
| |
| RegsUsed.clear(); |
| |
| return CyclesSaved > 0; |
| } |
| |
| MachineFunctionPass * |
| llvm::createGCNRegBankReassignPass(RegBankReassignMode Mode) { |
| return new GCNRegBankReassign(Mode); |
| } |