| //===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| /// \file |
| /// This contains a MachineSchedStrategy implementation for maximizing wave |
| /// occupancy on GCN hardware. |
| /// |
| /// This pass will apply multiple scheduling stages to the same function. |
| /// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual |
| /// entry point for the scheduling of those regions is |
| /// GCNScheduleDAGMILive::runSchedStages. |
| /// |
| /// Generally, the reason for having multiple scheduling stages is to account |
| /// for the kernel-wide effect of register usage on occupancy. Usually, only a |
| /// few scheduling regions will have register pressure high enough to limit |
| /// occupancy for the kernel, so constraints can be relaxed to improve ILP in |
| /// other regions. |
| /// |
| //===----------------------------------------------------------------------===// |
| |
| #include "GCNSchedStrategy.h" |
| #include "AMDGPUIGroupLP.h" |
| #include "GCNRegPressure.h" |
| #include "SIMachineFunctionInfo.h" |
| #include "Utils/AMDGPUBaseInfo.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/CodeGen/RegisterClassInfo.h" |
| #include "llvm/MC/LaneBitmask.h" |
| #include "llvm/Support/ErrorHandling.h" |
| |
| #define DEBUG_TYPE "machine-scheduler" |
| |
| using namespace llvm; |
| |
| static cl::opt<bool> DisableUnclusterHighRP( |
| "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden, |
| cl::desc("Disable unclustered high register pressure " |
| "reduction scheduling stage."), |
| cl::init(false)); |
| |
| static cl::opt<bool> DisableClusteredLowOccupancy( |
| "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden, |
| cl::desc("Disable clustered low occupancy " |
| "rescheduling for ILP scheduling stage."), |
| cl::init(false)); |
| |
| static cl::opt<unsigned> ScheduleMetricBias( |
| "amdgpu-schedule-metric-bias", cl::Hidden, |
| cl::desc( |
| "Sets the bias that weights occupancy against latency. Set it to " |
| "100 to optimize for occupancy only."), |
| cl::init(10)); |
| |
| static cl::opt<bool> |
| RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden, |
| cl::desc("Relax occupancy targets for kernels which are memory " |
| "bound (amdgpu-membound-threshold), or " |
| "Wave Limited (amdgpu-limit-wave-threshold)."), |
| cl::init(false)); |
| |
| static cl::opt<bool> GCNTrackers( |
| "amdgpu-use-amdgpu-trackers", cl::Hidden, |
| cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), |
| cl::init(false)); |
| |
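| // Fixed-point scaling factor used when computing the schedule metric and the |
| // reschedule profit in UnclusteredHighRPStage::shouldRevertScheduling(). |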
| const unsigned ScheduleMetrics::ScaleFactor = 100; |
| |
| GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) |
| : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), |
| DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) { |
| } |
| |
| void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { |
| GenericScheduler::initialize(DAG); |
| |
| MF = &DAG->MF; |
| |
| const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
| |
| SGPRExcessLimit = |
| Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass); |
| VGPRExcessLimit = |
| Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass); |
| |
| SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); |
| // Set the initial TargetOccupancy to the maximum occupancy that we can |
| // achieve for this function. This effectively sets a lower bound on the |
| // 'Critical' register limits in the scheduler. |
| // Allow for lower occupancy targets if the kernel is wave limited or memory |
| // bound and the relaxed occupancy feature is enabled. |
| TargetOccupancy = |
| RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy(); |
| SGPRCriticalLimit = |
| std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit); |
| |
| if (!KnownExcessRP) { |
| VGPRCriticalLimit = |
| std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit); |
| } else { |
| // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result, except |
| // that it returns a reasonably small number for targets with lots of VGPRs, |
| // such as GFX10 and GFX11. |
| LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative " |
| "VGPRCriticalLimit calculation method.\n"); |
| |
| unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST); |
| unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST); |
| unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule); |
| VGPRBudget = std::max(VGPRBudget, Granule); |
| VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit); |
| } |
| |
| // Subtract error margin and bias from register limits and avoid overflow. |
| SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit); |
| VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit); |
| SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit); |
| VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit); |
| |
| LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit |
| << ", VGPRExcessLimit = " << VGPRExcessLimit |
| << ", SGPRCriticalLimit = " << SGPRCriticalLimit |
| << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n"); |
| } |
| |
| /// Checks whether \p SU can use the cached DAG pressure diffs to compute the |
| /// current register pressure. |
| /// |
| /// This works for the common case, but it has a few exceptions that have been |
| /// observed through trial and error: |
| /// - Explicit physical register operands |
| /// - Subregister definitions |
| /// |
| /// In both of those cases, PressureDiff doesn't represent the actual pressure, |
| /// and querying LiveIntervals through the RegPressureTracker is needed to get |
| /// an accurate value. |
| /// |
| /// We should eventually only use PressureDiff for maximum performance, but this |
| /// already allows 80% of SUs to take the fast path without changing scheduling |
| /// at all. Further changes would either change scheduling, or require a lot |
| /// more logic to recover an accurate pressure estimate from the PressureDiffs. |
| static bool canUsePressureDiffs(const SUnit &SU) { |
| if (!SU.isInstr()) |
| return false; |
| |
| // Cannot use pressure diffs for subregister defs or with physregs; it is |
| // imprecise in both cases. |
| for (const auto &Op : SU.getInstr()->operands()) { |
| if (!Op.isReg() || Op.isImplicit()) |
| continue; |
| if (Op.getReg().isPhysical() || |
| (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister)) |
| return false; |
| } |
| return true; |
| } |
| |
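| /// Query the register pressure that would result from scheduling \p SU at the |
| /// current boundary, using either the generic RegPressureTracker or, when |
| /// "amdgpu-use-amdgpu-trackers" is enabled, the AMDGPU-specific RP trackers. |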
| static void getRegisterPressures( |
| bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU, |
| std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure, |
| GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker, |
| ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) { |
| // getDownwardPressure() and getUpwardPressure() make temporary changes to |
| // the tracker, so we need to pass them a non-const reference. |
| RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker); |
| if (!GCNTrackers) { |
| AtTop |
| ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure) |
| : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); |
| |
| return; |
| } |
| |
| // GCNTrackers |
| Pressure.resize(4, 0); |
| MachineInstr *MI = SU->getInstr(); |
| GCNRegPressure NewPressure; |
| if (AtTop) { |
| GCNDownwardRPTracker TempDownwardTracker(DownwardTracker); |
| NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI); |
| } else { |
| GCNUpwardRPTracker TempUpwardTracker(UpwardTracker); |
| TempUpwardTracker.recede(*MI); |
| NewPressure = TempUpwardTracker.getPressure(); |
| } |
| Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum(); |
| Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = |
| NewPressure.getArchVGPRNum(); |
| Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum(); |
| } |
| |
| void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, |
| bool AtTop, |
| const RegPressureTracker &RPTracker, |
| const SIRegisterInfo *SRI, |
| unsigned SGPRPressure, |
| unsigned VGPRPressure, bool IsBottomUp) { |
| Cand.SU = SU; |
| Cand.AtTop = AtTop; |
| |
| if (!DAG->isTrackingPressure()) |
| return; |
| |
| Pressure.clear(); |
| MaxPressure.clear(); |
| |
| // We try to use the cached PressureDiffs in the ScheduleDAG whenever |
| // possible over querying the RegPressureTracker. |
| // |
| // RegPressureTracker will make a lot of LIS queries, which are very |
| // expensive; it is considered a slow function in this context. |
| // |
| // PressureDiffs are precomputed and cached, and getPressureDiff is just a |
| // trivial lookup into an array. It is pretty much free. |
| // |
| // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of |
| // PressureDiffs. |
| if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) { |
| getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure, |
| DownwardTracker, UpwardTracker, DAG, SRI); |
| } else { |
| // Reserve 4 slots. |
| Pressure.resize(4, 0); |
| Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure; |
| Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure; |
| |
| for (const auto &Diff : DAG->getPressureDiff(SU)) { |
| if (!Diff.isValid()) |
| continue; |
| // PressureDiffs are always computed bottom-up, so if we're working top-down |
| // we need to invert the sign of each increment. |
| Pressure[Diff.getPSet()] += |
| (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc()); |
| } |
| |
| #ifdef EXPENSIVE_CHECKS |
| std::vector<unsigned> CheckPressure, CheckMaxPressure; |
| getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure, |
| DownwardTracker, UpwardTracker, DAG, SRI); |
| if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] != |
| CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] || |
| Pressure[AMDGPU::RegisterPressureSets::VGPR_32] != |
| CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) { |
| errs() << "Register Pressure is inaccurate when calculated through " |
| "PressureDiff\n" |
| << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32] |
| << ", expected " |
| << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n" |
| << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32] |
| << ", expected " |
| << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n"; |
| report_fatal_error("inaccurate register pressure calculation"); |
| } |
| #endif |
| } |
| |
| unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; |
| unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; |
| |
| // If two instructions increase the pressure of different register sets |
| // by the same amount, the generic scheduler will prefer to schedule the |
| // instruction that increases the set with the fewest registers, which in |
| // our case would be SGPRs. This is rarely what we want, so when we report |
| // excess/critical register pressure, we do it either only for VGPRs or |
| // only for SGPRs. |
| |
| // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs. |
| const unsigned MaxVGPRPressureInc = 16; |
| bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit; |
| bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit; |
| |
| // FIXME: We have to enter REG-EXCESS before we reach the actual threshold |
| // to increase the likelihood we don't go over the limits. We should improve |
| // the analysis to look through dependencies to find the path with the least |
| // register pressure. |
| |
| // We only need to update the RPDelta for instructions that increase register |
| // pressure. Instructions that decrease or keep reg pressure the same will be |
| // marked as RegExcess in tryCandidate() when they are compared with |
| // instructions that increase the register pressure. |
| if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) { |
| HasHighPressure = true; |
| Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); |
| Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit); |
| } |
| |
| if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) { |
| HasHighPressure = true; |
| Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32); |
| Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit); |
| } |
| |
| // Register pressure is considered 'CRITICAL' if it is approaching a value |
| // that would reduce the wave occupancy for the execution unit. When |
| // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure has |
| // the same cost, so we don't need to prefer one over the other. |
| |
| int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit; |
| int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit; |
| |
| if (SGPRDelta >= 0 || VGPRDelta >= 0) { |
| HasHighPressure = true; |
| if (SGPRDelta > VGPRDelta) { |
| Cand.RPDelta.CriticalMax = |
| PressureChange(AMDGPU::RegisterPressureSets::SReg_32); |
| Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta); |
| } else { |
| Cand.RPDelta.CriticalMax = |
| PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); |
| Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta); |
| } |
| } |
| } |
| |
| // This function is mostly cut and pasted from |
| // GenericScheduler::pickNodeFromQueue() |
| void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, |
| const CandPolicy &ZonePolicy, |
| const RegPressureTracker &RPTracker, |
| SchedCandidate &Cand, |
| bool IsBottomUp) { |
| const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); |
| ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos(); |
| unsigned SGPRPressure = 0; |
| unsigned VGPRPressure = 0; |
| if (DAG->isTrackingPressure()) { |
| if (!GCNTrackers) { |
| SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; |
| VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; |
| } else { |
| GCNRPTracker *T = IsBottomUp |
| ? static_cast<GCNRPTracker *>(&UpwardTracker) |
| : static_cast<GCNRPTracker *>(&DownwardTracker); |
| SGPRPressure = T->getPressure().getSGPRNum(); |
| VGPRPressure = T->getPressure().getArchVGPRNum(); |
| } |
| } |
| ReadyQueue &Q = Zone.Available; |
| for (SUnit *SU : Q) { |
| |
| SchedCandidate TryCand(ZonePolicy); |
| initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure, |
| VGPRPressure, IsBottomUp); |
| // Pass SchedBoundary only when comparing nodes from the same boundary. |
| SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; |
| tryCandidate(Cand, TryCand, ZoneArg); |
| if (TryCand.Reason != NoCand) { |
| // Initialize resource delta if needed in case future heuristics query it. |
| if (TryCand.ResDelta == SchedResourceDelta()) |
| TryCand.initResourceDelta(Zone.DAG, SchedModel); |
| Cand.setBest(TryCand); |
| LLVM_DEBUG(traceCandidate(Cand)); |
| } |
| } |
| } |
| |
| // This function is mostly cut and pasted from |
| // GenericScheduler::pickNodeBidirectional() |
| SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { |
| // Schedule as far as possible in the direction of no choice. This is most |
| // efficient, but also provides the best heuristics for CriticalPSets. |
| if (SUnit *SU = Bot.pickOnlyChoice()) { |
| IsTopNode = false; |
| return SU; |
| } |
| if (SUnit *SU = Top.pickOnlyChoice()) { |
| IsTopNode = true; |
| return SU; |
| } |
| // Set the bottom-up policy based on the state of the current bottom zone and |
| // the instructions outside the zone, including the top zone. |
| CandPolicy BotPolicy; |
| setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top); |
| // Set the top-down policy based on the state of the current top zone and |
| // the instructions outside the zone, including the bottom zone. |
| CandPolicy TopPolicy; |
| setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot); |
| |
| // See if BotCand is still valid (because we previously scheduled from Top). |
| LLVM_DEBUG(dbgs() << "Picking from Bot:\n"); |
| if (!BotCand.isValid() || BotCand.SU->isScheduled || |
| BotCand.Policy != BotPolicy) { |
| BotCand.reset(CandPolicy()); |
| pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand, |
| /*IsBottomUp=*/true); |
| assert(BotCand.Reason != NoCand && "failed to find the first candidate"); |
| } else { |
| LLVM_DEBUG(traceCandidate(BotCand)); |
| #ifndef NDEBUG |
| if (VerifyScheduling) { |
| SchedCandidate TCand; |
| TCand.reset(CandPolicy()); |
| pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand, |
| /*IsBottomUp=*/true); |
| assert(TCand.SU == BotCand.SU && |
| "Last pick result should correspond to re-picking right now"); |
| } |
| #endif |
| } |
| |
| // Check if the top Q has a better candidate. |
| LLVM_DEBUG(dbgs() << "Picking from Top:\n"); |
| if (!TopCand.isValid() || TopCand.SU->isScheduled || |
| TopCand.Policy != TopPolicy) { |
| TopCand.reset(CandPolicy()); |
| pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand, |
| /*IsBottomUp=*/false); |
| assert(TopCand.Reason != NoCand && "failed to find the first candidate"); |
| } else { |
| LLVM_DEBUG(traceCandidate(TopCand)); |
| #ifndef NDEBUG |
| if (VerifyScheduling) { |
| SchedCandidate TCand; |
| TCand.reset(CandPolicy()); |
| pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand, |
| /*IsBottomUp=*/false); |
| assert(TCand.SU == TopCand.SU && |
| "Last pick result should correspond to re-picking right now"); |
| } |
| #endif |
| } |
| |
| // Pick best from BotCand and TopCand. |
| LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand); |
| dbgs() << "Bot Cand: "; traceCandidate(BotCand);); |
| SchedCandidate Cand = BotCand; |
| TopCand.Reason = NoCand; |
| tryCandidate(Cand, TopCand, nullptr); |
| if (TopCand.Reason != NoCand) { |
| Cand.setBest(TopCand); |
| } |
| LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand);); |
| |
| IsTopNode = Cand.AtTop; |
| return Cand.SU; |
| } |
| |
| // This function is mostly cut and pasted from |
| // GenericScheduler::pickNode() |
| SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { |
| if (DAG->top() == DAG->bottom()) { |
| assert(Top.Available.empty() && Top.Pending.empty() && |
| Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); |
| return nullptr; |
| } |
| SUnit *SU; |
| do { |
| if (RegionPolicy.OnlyTopDown) { |
| SU = Top.pickOnlyChoice(); |
| if (!SU) { |
| CandPolicy NoPolicy; |
| TopCand.reset(NoPolicy); |
| pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand, |
| /*IsBottomUp=*/false); |
| assert(TopCand.Reason != NoCand && "failed to find a candidate"); |
| SU = TopCand.SU; |
| } |
| IsTopNode = true; |
| } else if (RegionPolicy.OnlyBottomUp) { |
| SU = Bot.pickOnlyChoice(); |
| if (!SU) { |
| CandPolicy NoPolicy; |
| BotCand.reset(NoPolicy); |
| pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand, |
| /*IsBottomUp=*/true); |
| assert(BotCand.Reason != NoCand && "failed to find a candidate"); |
| SU = BotCand.SU; |
| } |
| IsTopNode = false; |
| } else { |
| SU = pickNodeBidirectional(IsTopNode); |
| } |
| } while (SU->isScheduled); |
| |
| if (SU->isTopReady()) |
| Top.removeReady(SU); |
| if (SU->isBottomReady()) |
| Bot.removeReady(SU); |
| |
| LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " |
| << *SU->getInstr()); |
| return SU; |
| } |
| |
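| // Keep the AMDGPU-specific RP trackers (when enabled) in sync with the |
| // instruction that was just scheduled, then defer to the generic scheduler. |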
| void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { |
| if (GCNTrackers) { |
| MachineInstr *MI = SU->getInstr(); |
| IsTopNode ? (void)DownwardTracker.advance(MI, false) |
| : UpwardTracker.recede(*MI); |
| } |
| |
| return GenericScheduler::schedNode(SU, IsTopNode); |
| } |
| |
| GCNSchedStageID GCNSchedStrategy::getCurrentStage() { |
| assert(CurrentStage && CurrentStage != SchedStages.end()); |
| return *CurrentStage; |
| } |
| |
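| // Advance to the first stage on the initial call and to the next stage on |
| // subsequent calls. Returns false once all stages have been exhausted. |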
| bool GCNSchedStrategy::advanceStage() { |
| assert(CurrentStage != SchedStages.end()); |
| if (!CurrentStage) |
| CurrentStage = SchedStages.begin(); |
| else |
| CurrentStage++; |
| |
| return CurrentStage != SchedStages.end(); |
| } |
| |
| bool GCNSchedStrategy::hasNextStage() const { |
| assert(CurrentStage); |
| return std::next(CurrentStage) != SchedStages.end(); |
| } |
| |
| GCNSchedStageID GCNSchedStrategy::getNextStage() const { |
| assert(CurrentStage && std::next(CurrentStage) != SchedStages.end()); |
| return *std::next(CurrentStage); |
| } |
| |
| GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( |
| const MachineSchedContext *C, bool IsLegacyScheduler) |
| : GCNSchedStrategy(C) { |
| SchedStages.push_back(GCNSchedStageID::OccInitialSchedule); |
| SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule); |
| SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule); |
| SchedStages.push_back(GCNSchedStageID::PreRARematerialize); |
| GCNTrackers = GCNTrackers & !IsLegacyScheduler; |
| } |
| |
| GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C) |
| : GCNSchedStrategy(C) { |
| SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule); |
| } |
| |
| bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand, |
| SchedCandidate &TryCand, |
| SchedBoundary *Zone) const { |
| // Initialize the candidate if needed. |
| if (!Cand.isValid()) { |
| TryCand.Reason = NodeOrder; |
| return true; |
| } |
| |
| // Avoid spilling by exceeding the register limit. |
| if (DAG->isTrackingPressure() && |
| tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand, |
| RegExcess, TRI, DAG->MF)) |
| return TryCand.Reason != NoCand; |
| |
| // Bias PhysReg defs and copies towards their uses and definitions, |
| // respectively. |
| if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), |
| biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) |
| return TryCand.Reason != NoCand; |
| |
| bool SameBoundary = Zone != nullptr; |
| if (SameBoundary) { |
| // Prioritize instructions that read unbuffered resources by stall cycles. |
| if (tryLess(Zone->getLatencyStallCycles(TryCand.SU), |
| Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) |
| return TryCand.Reason != NoCand; |
| |
| // Avoid critical resource consumption and balance the schedule. |
| TryCand.initResourceDelta(DAG, SchedModel); |
| if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, |
| TryCand, Cand, ResourceReduce)) |
| return TryCand.Reason != NoCand; |
| if (tryGreater(TryCand.ResDelta.DemandedResources, |
| Cand.ResDelta.DemandedResources, TryCand, Cand, |
| ResourceDemand)) |
| return TryCand.Reason != NoCand; |
| |
| // Unconditionally try to reduce latency. |
| if (tryLatency(TryCand, Cand, *Zone)) |
| return TryCand.Reason != NoCand; |
| |
| // Weak edges are for clustering and other constraints. |
| if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop), |
| getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak)) |
| return TryCand.Reason != NoCand; |
| } |
| |
| // Keep clustered nodes together to encourage downstream peephole |
| // optimizations which may reduce resource requirements. |
| // |
| // This is a best effort to set things up for a post-RA pass. Optimizations |
| // like generating loads of multiple registers should ideally be done within |
| // the scheduler pass by combining the loads during DAG postprocessing. |
| const SUnit *CandNextClusterSU = |
| Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); |
| const SUnit *TryCandNextClusterSU = |
| TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); |
| if (tryGreater(TryCand.SU == TryCandNextClusterSU, |
| Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster)) |
| return TryCand.Reason != NoCand; |
| |
| // Avoid increasing the max critical pressure in the scheduled region. |
| if (DAG->isTrackingPressure() && |
| tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, |
| TryCand, Cand, RegCritical, TRI, DAG->MF)) |
| return TryCand.Reason != NoCand; |
| |
| // Avoid increasing the max pressure of the entire region. |
| if (DAG->isTrackingPressure() && |
| tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand, |
| Cand, RegMax, TRI, DAG->MF)) |
| return TryCand.Reason != NoCand; |
| |
| if (SameBoundary) { |
| // Fall through to original instruction order. |
| if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) || |
| (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { |
| TryCand.Reason = NodeOrder; |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy( |
| const MachineSchedContext *C) |
| : GCNSchedStrategy(C) { |
| SchedStages.push_back(GCNSchedStageID::MemoryClauseInitialSchedule); |
| } |
| |
| /// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory |
| /// instructions as much as possible. This is achieved by: |
| /// 1. Prioritizing clustered operations before the stall latency heuristic. |
| /// 2. Prioritizing long-latency loads before the stall latency heuristic. |
| /// |
| /// \param Cand provides the policy and current best candidate. |
| /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. |
| /// \param Zone describes the scheduled zone that we are extending, or nullptr |
| /// if Cand is from a different zone than TryCand. |
| /// \return \c true if TryCand is better than Cand (Reason is NOT NoCand) |
| bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand, |
| SchedCandidate &TryCand, |
| SchedBoundary *Zone) const { |
| // Initialize the candidate if needed. |
| if (!Cand.isValid()) { |
| TryCand.Reason = NodeOrder; |
| return true; |
| } |
| |
| // Bias PhysReg defs and copies towards their uses and definitions, |
| // respectively. |
| if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), |
| biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) |
| return TryCand.Reason != NoCand; |
| |
| if (DAG->isTrackingPressure()) { |
| // Avoid exceeding the target's limit. |
| if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand, |
| RegExcess, TRI, DAG->MF)) |
| return TryCand.Reason != NoCand; |
| |
| // Avoid increasing the max critical pressure in the scheduled region. |
| if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, |
| TryCand, Cand, RegCritical, TRI, DAG->MF)) |
| return TryCand.Reason != NoCand; |
| } |
| |
| // MaxMemoryClause-specific: We prioritize clustered instructions as we would |
| // get more benefit from clausing these memory instructions. |
| const SUnit *CandNextClusterSU = |
| Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); |
| const SUnit *TryCandNextClusterSU = |
| TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); |
| if (tryGreater(TryCand.SU == TryCandNextClusterSU, |
| Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster)) |
| return TryCand.Reason != NoCand; |
| |
| // We only compare a subset of features when comparing nodes between the |
| // Top and Bottom boundary. Some properties are simply incomparable; in many |
| // other instances we should only override the other boundary if something |
| // is a clearly good pick on one boundary. Skip heuristics that are more |
| // "tie-breaking" in nature. |
| bool SameBoundary = Zone != nullptr; |
| if (SameBoundary) { |
| // For loops that are acyclic path limited, aggressively schedule for |
| // latency. Within a single cycle, whenever CurrMOps > 0, allow normal |
| // heuristics to take precedence. |
| if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() && |
| tryLatency(TryCand, Cand, *Zone)) |
| return TryCand.Reason != NoCand; |
| |
| // MaxMemoryClause-specific: Prioritize long latency memory load |
| // instructions in top-bottom order to hide more latency. The mayLoad check |
| // is used to exclude store-like instructions, which we do not want to |
| // schedule too early. |
| bool TryMayLoad = |
| TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad(); |
| bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad(); |
| |
| if (TryMayLoad || CandMayLoad) { |
| bool TryLongLatency = |
| TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad; |
| bool CandLongLatency = |
| 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad; |
| |
| if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency, |
| Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand, |
| Cand, Stall)) |
| return TryCand.Reason != NoCand; |
| } |
| // Prioritize instructions that read unbuffered resources by stall cycles. |
| if (tryLess(Zone->getLatencyStallCycles(TryCand.SU), |
| Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) |
| return TryCand.Reason != NoCand; |
| } |
| |
| if (SameBoundary) { |
| // Weak edges are for clustering and other constraints. |
| if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop), |
| getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak)) |
| return TryCand.Reason != NoCand; |
| } |
| |
| // Avoid increasing the max pressure of the entire region. |
| if (DAG->isTrackingPressure() && |
| tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand, |
| Cand, RegMax, TRI, DAG->MF)) |
| return TryCand.Reason != NoCand; |
| |
| if (SameBoundary) { |
| // Avoid critical resource consumption and balance the schedule. |
| TryCand.initResourceDelta(DAG, SchedModel); |
| if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, |
| TryCand, Cand, ResourceReduce)) |
| return TryCand.Reason != NoCand; |
| if (tryGreater(TryCand.ResDelta.DemandedResources, |
| Cand.ResDelta.DemandedResources, TryCand, Cand, |
| ResourceDemand)) |
| return TryCand.Reason != NoCand; |
| |
| // Avoid serializing long latency dependence chains. |
| // For acyclic path limited loops, latency was already checked above. |
| if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency && |
| !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone)) |
| return TryCand.Reason != NoCand; |
| |
| // Fall through to original instruction order. |
| if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) { |
| assert(TryCand.SU->NodeNum != Cand.SU->NodeNum); |
| TryCand.Reason = NodeOrder; |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| |
| GCNScheduleDAGMILive::GCNScheduleDAGMILive( |
| MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S) |
| : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()), |
| MFI(*MF.getInfo<SIMachineFunctionInfo>()), |
| StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy), |
| RegionLiveOuts(this, /*IsLiveOut=*/true) { |
| |
| // We want regions with a single MI to be scheduled so that we can reason |
| // about them correctly during scheduling stages that move MIs between regions |
| // (e.g., rematerialization). |
| ScheduleSingleMIRegions = true; |
| LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); |
| if (RelaxedOcc) { |
| MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy); |
| if (MinOccupancy != StartingOccupancy) |
| LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy |
| << ".\n"); |
| } |
| } |
| |
| std::unique_ptr<GCNSchedStage> |
| GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) { |
| switch (SchedStageID) { |
| case GCNSchedStageID::OccInitialSchedule: |
| return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this); |
| case GCNSchedStageID::UnclusteredHighRPReschedule: |
| return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this); |
| case GCNSchedStageID::ClusteredLowOccupancyReschedule: |
| return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this); |
| case GCNSchedStageID::PreRARematerialize: |
| return std::make_unique<PreRARematStage>(SchedStageID, *this); |
| case GCNSchedStageID::ILPInitialSchedule: |
| return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this); |
| case GCNSchedStageID::MemoryClauseInitialSchedule: |
| return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID, |
| *this); |
| } |
| |
| llvm_unreachable("Unknown SchedStageID."); |
| } |
| |
| void GCNScheduleDAGMILive::schedule() { |
| // Collect all scheduling regions. The actual scheduling is performed in |
| // GCNScheduleDAGMILive::finalizeSchedule. |
| Regions.push_back(std::pair(RegionBegin, RegionEnd)); |
| } |
| |
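| // Recompute the region's maximum register pressure by replaying its |
| // instructions with a downward RP tracker seeded with the cached live-ins. |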
| GCNRegPressure |
| GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { |
| GCNDownwardRPTracker RPTracker(*LIS); |
| RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]); |
| return RPTracker.moveMaxPressure(); |
| } |
| |
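| // Return the instruction used to anchor end-of-region liveness queries, |
| // stepping back from the block end and over debug instructions if necessary. |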
| static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin, |
| MachineBasicBlock::iterator RegionEnd) { |
| auto REnd = RegionEnd == RegionBegin->getParent()->end() |
| ? std::prev(RegionEnd) |
| : RegionEnd; |
| return &*skipDebugInstructionsBackward(REnd, RegionBegin); |
| } |
| |
| void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, |
| const MachineBasicBlock *MBB) { |
| GCNDownwardRPTracker RPTracker(*LIS); |
| |
| // If the block has only one successor then the live-ins of that successor |
| // are live-outs of the current block. We can reuse the calculated live set |
| // if the successor will be sent to scheduling past the current block. |
| |
| // However, due to a bug in LiveInterval analysis it may happen that two |
| // predecessors of the same successor block have different lane bitmasks for |
| // a live-out register. Work around that by sticking to a one-to-one |
| // relationship, i.e. one predecessor with one successor block. |
| const MachineBasicBlock *OnlySucc = nullptr; |
| if (MBB->succ_size() == 1) { |
| auto *Candidate = *MBB->succ_begin(); |
| if (!Candidate->empty() && Candidate->pred_size() == 1) { |
| SlotIndexes *Ind = LIS->getSlotIndexes(); |
| if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate)) |
| OnlySucc = Candidate; |
| } |
| } |
| |
| // The scheduler sends regions from the end of the block upwards. |
| size_t CurRegion = RegionIdx; |
| for (size_t E = Regions.size(); CurRegion != E; ++CurRegion) |
| if (Regions[CurRegion].first->getParent() != MBB) |
| break; |
| --CurRegion; |
| |
| auto I = MBB->begin(); |
| auto LiveInIt = MBBLiveIns.find(MBB); |
| auto &Rgn = Regions[CurRegion]; |
| auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second); |
| if (LiveInIt != MBBLiveIns.end()) { |
| auto LiveIn = std::move(LiveInIt->second); |
| RPTracker.reset(*MBB->begin(), &LiveIn); |
| MBBLiveIns.erase(LiveInIt); |
| } else { |
| I = Rgn.first; |
| auto LRS = BBLiveInMap.lookup(NonDbgMI); |
| #ifdef EXPENSIVE_CHECKS |
| assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS)); |
| #endif |
| RPTracker.reset(*I, &LRS); |
| } |
| |
| for (;;) { |
| I = RPTracker.getNext(); |
| |
| if (Regions[CurRegion].first == I || NonDbgMI == I) { |
| LiveIns[CurRegion] = RPTracker.getLiveRegs(); |
| RPTracker.clearMaxPressure(); |
| } |
| |
| if (Regions[CurRegion].second == I) { |
| Pressure[CurRegion] = RPTracker.moveMaxPressure(); |
| if (CurRegion-- == RegionIdx) |
| break; |
| auto &Rgn = Regions[CurRegion]; |
| NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second); |
| } |
| RPTracker.advanceToNext(); |
| RPTracker.advanceBeforeNext(); |
| } |
| |
| if (OnlySucc) { |
| if (I != MBB->end()) { |
| RPTracker.advanceToNext(); |
| RPTracker.advance(MBB->end()); |
| } |
| RPTracker.advanceBeforeNext(); |
| MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs(); |
| } |
| } |
| |
| DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> |
| GCNScheduleDAGMILive::getRegionLiveInMap() const { |
| assert(!Regions.empty()); |
| std::vector<MachineInstr *> RegionFirstMIs; |
| RegionFirstMIs.reserve(Regions.size()); |
| auto I = Regions.rbegin(), E = Regions.rend(); |
| do { |
| const MachineBasicBlock *MBB = I->first->getParent(); |
| auto *MI = &*skipDebugInstructionsForward(I->first, I->second); |
| RegionFirstMIs.push_back(MI); |
| do { |
| ++I; |
| } while (I != E && I->first->getParent() == MBB); |
| } while (I != E); |
| return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS); |
| } |
| |
| DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> |
| GCNScheduleDAGMILive::getRegionLiveOutMap() const { |
| assert(!Regions.empty()); |
| std::vector<MachineInstr *> RegionLastMIs; |
| RegionLastMIs.reserve(Regions.size()); |
| for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) |
| RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd)); |
| |
| return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS); |
| } |
| |
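| // Recompute the per-region live-in or live-out sets (depending on IsLiveOut) |
| // and remember which instruction keys each region index in that map. |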
| void RegionPressureMap::buildLiveRegMap() { |
| IdxToInstruction.clear(); |
| |
| RegionLiveRegMap = |
| IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap(); |
| for (unsigned I = 0; I < DAG->Regions.size(); I++) { |
| MachineInstr *RegionKey = |
| IsLiveOut |
| ? getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second) |
| : &*DAG->Regions[I].first; |
| IdxToInstruction[I] = RegionKey; |
| } |
| } |
| |
| void GCNScheduleDAGMILive::finalizeSchedule() { |
| // Start actual scheduling here. This function is called by the base |
| // MachineScheduler after all regions have been recorded by |
| // GCNScheduleDAGMILive::schedule(). |
| LiveIns.resize(Regions.size()); |
| Pressure.resize(Regions.size()); |
| RescheduleRegions.resize(Regions.size()); |
| RegionsWithHighRP.resize(Regions.size()); |
| RegionsWithExcessRP.resize(Regions.size()); |
| RegionsWithMinOcc.resize(Regions.size()); |
| RegionsWithIGLPInstrs.resize(Regions.size()); |
| RescheduleRegions.set(); |
| RegionsWithHighRP.reset(); |
| RegionsWithExcessRP.reset(); |
| RegionsWithMinOcc.reset(); |
| RegionsWithIGLPInstrs.reset(); |
| |
| runSchedStages(); |
| } |
| |
| void GCNScheduleDAGMILive::runSchedStages() { |
| LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); |
| |
| if (!Regions.empty()) { |
| BBLiveInMap = getRegionLiveInMap(); |
| if (GCNTrackers) |
| RegionLiveOuts.buildLiveRegMap(); |
| } |
| |
| GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl); |
| while (S.advanceStage()) { |
| auto Stage = createSchedStage(S.getCurrentStage()); |
| if (!Stage->initGCNSchedStage()) |
| continue; |
| |
| for (auto Region : Regions) { |
| RegionBegin = Region.first; |
| RegionEnd = Region.second; |
| // Setup for scheduling the region and check whether it should be skipped. |
| if (!Stage->initGCNRegion()) { |
| Stage->advanceRegion(); |
| exitRegion(); |
| continue; |
| } |
| |
| if (GCNTrackers) { |
| GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker(); |
| GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker(); |
| GCNRPTracker::LiveRegSet *RegionLiveIns = |
| &LiveIns[Stage->getRegionIdx()]; |
| |
| reinterpret_cast<GCNRPTracker *>(DownwardTracker) |
| ->reset(MRI, *RegionLiveIns); |
| reinterpret_cast<GCNRPTracker *>(UpwardTracker) |
| ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx( |
| Stage->getRegionIdx())); |
| } |
| |
| ScheduleDAGMILive::schedule(); |
| Stage->finalizeGCNRegion(); |
| } |
| |
| Stage->finalizeGCNSchedStage(); |
| } |
| } |
| |
| #ifndef NDEBUG |
| raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) { |
| switch (StageID) { |
| case GCNSchedStageID::OccInitialSchedule: |
| OS << "Max Occupancy Initial Schedule"; |
| break; |
| case GCNSchedStageID::UnclusteredHighRPReschedule: |
| OS << "Unclustered High Register Pressure Reschedule"; |
| break; |
| case GCNSchedStageID::ClusteredLowOccupancyReschedule: |
| OS << "Clustered Low Occupancy Reschedule"; |
| break; |
| case GCNSchedStageID::PreRARematerialize: |
| OS << "Pre-RA Rematerialize"; |
| break; |
| case GCNSchedStageID::ILPInitialSchedule: |
| OS << "Max ILP Initial Schedule"; |
| break; |
| case GCNSchedStageID::MemoryClauseInitialSchedule: |
| OS << "Max memory clause Initial Schedule"; |
| break; |
| } |
| |
| return OS; |
| } |
| #endif |
| |
| GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) |
| : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF), |
| MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {} |
| |
| bool GCNSchedStage::initGCNSchedStage() { |
| if (!DAG.LIS) |
| return false; |
| |
| LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n"); |
| return true; |
| } |
| |
| bool UnclusteredHighRPStage::initGCNSchedStage() { |
| if (DisableUnclusterHighRP) |
| return false; |
| |
| if (!GCNSchedStage::initGCNSchedStage()) |
| return false; |
| |
| if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none()) |
| return false; |
| |
| SavedMutations.swap(DAG.Mutations); |
| DAG.addMutation( |
| createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry)); |
| |
| InitialOccupancy = DAG.MinOccupancy; |
| // Aggressively try to reduce register pressure in the unclustered high RP |
| // stage. Temporarily increase the occupancy target in the region. |
| S.SGPRLimitBias = S.HighRPSGPRBias; |
| S.VGPRLimitBias = S.HighRPVGPRBias; |
| if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) |
| MFI.increaseOccupancy(MF, ++DAG.MinOccupancy); |
| |
| LLVM_DEBUG( |
| dbgs() |
| << "Retrying function scheduling without clustering. " |
| "Aggressivly try to reduce register pressure to achieve occupancy " |
| << DAG.MinOccupancy << ".\n"); |
| |
| return true; |
| } |
| |
| bool ClusteredLowOccStage::initGCNSchedStage() { |
| if (DisableClusteredLowOccupancy) |
| return false; |
| |
| if (!GCNSchedStage::initGCNSchedStage()) |
| return false; |
| |
| // Don't bother trying to improve ILP in lower RP regions if occupancy has not |
| // been dropped. All regions will have already been scheduled with the ideal |
| // occupancy targets. |
| if (DAG.StartingOccupancy <= DAG.MinOccupancy) |
| return false; |
| |
| LLVM_DEBUG( |
| dbgs() << "Retrying function scheduling with lowest recorded occupancy " |
| << DAG.MinOccupancy << ".\n"); |
| return true; |
| } |
| |
| /// Allows easy filtering of this stage's debug output. |
| #define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;) |
| |
| bool PreRARematStage::initGCNSchedStage() { |
| // FIXME: This pass will invalidate the cached BBLiveInMap and MBBLiveIns for |
| // regions in between the defs and the region we sank the def to. This will |
| // need to be fixed if there is another pass after this one. |
| assert(!S.hasNextStage()); |
| |
| if (!GCNSchedStage::initGCNSchedStage() || DAG.RegionsWithMinOcc.none() || |
| DAG.Regions.size() == 1) |
| return false; |
| |
| // Before performing any IR modification, record the parent region of each |
| // MI and the parent MBB of each region. |
| const unsigned NumRegions = DAG.Regions.size(); |
| RegionBB.reserve(NumRegions); |
| for (unsigned I = 0; I < NumRegions; ++I) { |
| RegionBoundaries Region = DAG.Regions[I]; |
| for (auto MI = Region.first; MI != Region.second; ++MI) |
| MIRegion.insert({&*MI, I}); |
| RegionBB.push_back(Region.first->getParent()); |
| } |
| |
| if (!canIncreaseOccupancyOrReduceSpill()) |
| return false; |
| |
| // Rematerialize identified instructions and update scheduler's state. |
| rematerialize(); |
| if (GCNTrackers) |
| DAG.RegionLiveOuts.buildLiveRegMap(); |
| REMAT_DEBUG( |
| dbgs() << "Retrying function scheduling with new min. occupancy of " |
| << AchievedOcc << " from rematerializing (original was " |
| << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n"); |
| if (AchievedOcc > DAG.MinOccupancy) { |
| DAG.MinOccupancy = AchievedOcc; |
| SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); |
| MFI.increaseOccupancy(MF, DAG.MinOccupancy); |
| } |
| return true; |
| } |
| |
| void GCNSchedStage::finalizeGCNSchedStage() { |
| DAG.finishBlock(); |
| LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n"); |
| } |
| |
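| // Restore the saved DAG mutations, clear the RP limit biases, and, if this |
| // stage raised the minimum occupancy, re-tag the regions that now sit at the |
| // new minimum. |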
| void UnclusteredHighRPStage::finalizeGCNSchedStage() { |
| SavedMutations.swap(DAG.Mutations); |
| S.SGPRLimitBias = S.VGPRLimitBias = 0; |
| if (DAG.MinOccupancy > InitialOccupancy) { |
| for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX) |
| DAG.RegionsWithMinOcc[IDX] = |
| DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy; |
| |
| LLVM_DEBUG(dbgs() << StageID |
| << " stage successfully increased occupancy to " |
| << DAG.MinOccupancy << '\n'); |
| } |
| |
| GCNSchedStage::finalizeGCNSchedStage(); |
| } |
| |
| bool GCNSchedStage::initGCNRegion() { |
| // Check whether this new region is also a new block. |
| if (DAG.RegionBegin->getParent() != CurrentMBB) |
| setupNewBlock(); |
| |
| unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end()); |
| DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs); |
| |
| // Skip empty scheduling regions (0 or 1 schedulable instructions). |
| if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end())) |
| return false; |
| |
| LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n"); |
| LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB) |
| << " " << CurrentMBB->getName() |
| << "\n From: " << *DAG.begin() << " To: "; |
| if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd; |
| else dbgs() << "End"; |
| dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); |
| |
| // Save original instruction order before scheduling for possible revert. |
| Unsched.clear(); |
| Unsched.reserve(DAG.NumRegionInstrs); |
| if (StageID == GCNSchedStageID::OccInitialSchedule || |
| StageID == GCNSchedStageID::ILPInitialSchedule) { |
| const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII); |
| for (auto &I : DAG) { |
| Unsched.push_back(&I); |
| if (SII->isIGLPMutationOnly(I.getOpcode())) |
| DAG.RegionsWithIGLPInstrs[RegionIdx] = true; |
| } |
| } else { |
| for (auto &I : DAG) |
| Unsched.push_back(&I); |
| } |
| |
| PressureBefore = DAG.Pressure[RegionIdx]; |
| |
| LLVM_DEBUG( |
| dbgs() << "Pressure before scheduling:\nRegion live-ins:" |
| << print(DAG.LiveIns[RegionIdx], DAG.MRI) |
| << "Region live-in pressure: " |
| << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx])) |
| << "Region register pressure: " << print(PressureBefore)); |
| |
| S.HasHighPressure = false; |
| S.KnownExcessRP = isRegionWithExcessRP(); |
| |
| if (DAG.RegionsWithIGLPInstrs[RegionIdx] && |
| StageID != GCNSchedStageID::UnclusteredHighRPReschedule) { |
| SavedMutations.clear(); |
| SavedMutations.swap(DAG.Mutations); |
| bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule || |
| StageID == GCNSchedStageID::ILPInitialSchedule; |
| DAG.addMutation(createIGroupLPDAGMutation( |
| IsInitialStage ? AMDGPU::SchedulingPhase::Initial |
| : AMDGPU::SchedulingPhase::PreRAReentry)); |
| } |
| |
| return true; |
| } |
| |
| bool UnclusteredHighRPStage::initGCNRegion() { |
| // Only reschedule regions with the minimum occupancy or regions that may have |
| // spilling (excess register pressure). |
| if ((!DAG.RegionsWithMinOcc[RegionIdx] || |
| DAG.MinOccupancy <= InitialOccupancy) && |
| !DAG.RegionsWithExcessRP[RegionIdx]) |
| return false; |
| |
| return GCNSchedStage::initGCNRegion(); |
| } |
| |
| bool ClusteredLowOccStage::initGCNRegion() { |
| // We may need to reschedule this region if it wasn't rescheduled in the last |
| // stage, or if we found it was testing critical register pressure limits in |
| // the unclustered reschedule stage. The latter is because we may not have |
| // been able to raise the min occupancy in the previous stage, so the region |
| // may be overly constrained even if it was already rescheduled. |
| if (!DAG.RegionsWithHighRP[RegionIdx]) |
| return false; |
| |
| return GCNSchedStage::initGCNRegion(); |
| } |
| |
| bool PreRARematStage::initGCNRegion() { |
| if (!DAG.RescheduleRegions[RegionIdx]) |
| return false; |
| |
| return GCNSchedStage::initGCNRegion(); |
| } |
| |
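| // Switch liveness tracking to the region's parent block and, during the |
| // initial scheduling stages, compute the real pressure for its regions. |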
| void GCNSchedStage::setupNewBlock() { |
| if (CurrentMBB) |
| DAG.finishBlock(); |
| |
| CurrentMBB = DAG.RegionBegin->getParent(); |
| DAG.startBlock(CurrentMBB); |
| // Get the real RP for the region if it hasn't been calculated before. After |
| // the initial schedule stage, the real RP will be collected after scheduling. |
| if (StageID == GCNSchedStageID::OccInitialSchedule || |
| StageID == GCNSchedStageID::ILPInitialSchedule || |
| StageID == GCNSchedStageID::MemoryClauseInitialSchedule) |
| DAG.computeBlockPressure(RegionIdx, CurrentMBB); |
| } |
| |
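| // Record the possibly updated region boundaries, check the scheduling result |
| // (reverting it if necessary), restore any swapped DAG mutations, and advance |
| // to the next region. |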
| void GCNSchedStage::finalizeGCNRegion() { |
| DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); |
| DAG.RescheduleRegions[RegionIdx] = false; |
| if (S.HasHighPressure) |
| DAG.RegionsWithHighRP[RegionIdx] = true; |
| |
| // Revert scheduling if we have dropped occupancy or there is some other |
| // reason that the original schedule is better. |
| checkScheduling(); |
| |
| if (DAG.RegionsWithIGLPInstrs[RegionIdx] && |
| StageID != GCNSchedStageID::UnclusteredHighRPReschedule) |
| SavedMutations.swap(DAG.Mutations); |
| |
| DAG.exitRegion(); |
| RegionIdx++; |
| } |
| |
| void GCNSchedStage::checkScheduling() { |
| // Check the results of scheduling. |
| PressureAfter = DAG.getRealRegPressure(RegionIdx); |
| |
| LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter)); |
| LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n"); |
| |
| if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && |
| PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) { |
| DAG.Pressure[RegionIdx] = PressureAfter; |
| DAG.RegionsWithMinOcc[RegionIdx] = |
| PressureAfter.getOccupancy(ST) == DAG.MinOccupancy; |
| |
| // Early out if we have achieved the occupancy target. |
| LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n"); |
| return; |
| } |
| |
| unsigned TargetOccupancy = std::min( |
| S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second); |
| unsigned WavesAfter = |
| std::min(TargetOccupancy, PressureAfter.getOccupancy(ST)); |
| unsigned WavesBefore = |
| std::min(TargetOccupancy, PressureBefore.getOccupancy(ST)); |
| LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore |
| << ", after " << WavesAfter << ".\n"); |
| |
| // We may not be able to keep the current target occupancy because of the |
| // just-scheduled region. We might still be able to revert scheduling if the |
| // occupancy before was higher, or if the current schedule has register |
| // pressure higher than the excess limits, which could lead to more spilling. |
| unsigned NewOccupancy = std::max(WavesAfter, WavesBefore); |
| |
| // Allow memory bound functions to drop to 4 waves if not limited by an |
| // attribute. |
| if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy && |
| WavesAfter >= MFI.getMinAllowedOccupancy()) { |
| LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to " |
| << MFI.getMinAllowedOccupancy() << " waves\n"); |
| NewOccupancy = WavesAfter; |
| } |
| |
| if (NewOccupancy < DAG.MinOccupancy) { |
| DAG.MinOccupancy = NewOccupancy; |
| MFI.limitOccupancy(DAG.MinOccupancy); |
| DAG.RegionsWithMinOcc.reset(); |
| LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " |
| << DAG.MinOccupancy << ".\n"); |
| } |
| // The maximum number of arch VGPRs on a non-unified register file, or the |
| // maximum VGPR + AGPR count in the unified register file case. |
| unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); |
| // The maximum number of arch VGPRs for both unified and non-unified register |
| // files. |
| unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs()); |
| unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); |
| |
| if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs || |
| PressureAfter.getVGPRNum(false) > MaxArchVGPRs || |
| PressureAfter.getAGPRNum() > MaxArchVGPRs || |
| PressureAfter.getSGPRNum() > MaxSGPRs) { |
| DAG.RescheduleRegions[RegionIdx] = true; |
| DAG.RegionsWithHighRP[RegionIdx] = true; |
| DAG.RegionsWithExcessRP[RegionIdx] = true; |
| } |
| |
| // Revert if this region's schedule would cause a drop in occupancy or |
| // spilling. |
| if (shouldRevertScheduling(WavesAfter)) { |
| revertScheduling(); |
| } else { |
| DAG.Pressure[RegionIdx] = PressureAfter; |
| DAG.RegionsWithMinOcc[RegionIdx] = |
| PressureAfter.getOccupancy(ST) == DAG.MinOccupancy; |
| } |
| } |
| |
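| /// Compute the earliest cycle at which \p SU can issue, based on the ready |
| /// cycles already recorded for its register-dependence predecessors and their |
| /// latencies, and record the result in \p ReadyCycles. |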
| unsigned |
| GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle, |
| DenseMap<unsigned, unsigned> &ReadyCycles, |
| const TargetSchedModel &SM) { |
| unsigned ReadyCycle = CurrCycle; |
| for (auto &D : SU.Preds) { |
| if (D.isAssignedRegDep()) { |
| MachineInstr *DefMI = D.getSUnit()->getInstr(); |
| unsigned Latency = SM.computeInstrLatency(DefMI); |
| unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum]; |
| ReadyCycle = std::max(ReadyCycle, DefReady + Latency); |
| } |
| } |
| ReadyCycles[SU.NodeNum] = ReadyCycle; |
| return ReadyCycle; |
| } |
| |
| #ifndef NDEBUG |
| struct EarlierIssuingCycle { |
| bool operator()(std::pair<MachineInstr *, unsigned> A, |
| std::pair<MachineInstr *, unsigned> B) const { |
| return A.second < B.second; |
| } |
| }; |
| |
| static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>, |
| EarlierIssuingCycle> &ReadyCycles) { |
| if (ReadyCycles.empty()) |
| return; |
| unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber(); |
| dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum |
| << " ##################\n# Cycle #\t\t\tInstruction " |
| " " |
| " \n"; |
| unsigned IPrev = 1; |
| for (auto &I : ReadyCycles) { |
| if (I.second > IPrev + 1) |
| dbgs() << "****************************** BUBBLE OF " << I.second - IPrev |
| << " CYCLES DETECTED ******************************\n\n"; |
| dbgs() << "[ " << I.second << " ] : " << *I.first << "\n"; |
| IPrev = I.second; |
| } |
| } |
| #endif |
| |
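| /// Walk \p InputSchedule in order, accumulating the stall ("bubble") cycles |
| /// between each instruction's ready cycle and the current issue cycle, and |
| /// return the schedule length together with the accumulated bubble count. |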
| ScheduleMetrics |
| GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) { |
| #ifndef NDEBUG |
| std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> |
| ReadyCyclesSorted; |
| #endif |
| const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); |
| unsigned SumBubbles = 0; |
| DenseMap<unsigned, unsigned> ReadyCycles; |
| unsigned CurrCycle = 0; |
| for (auto &SU : InputSchedule) { |
| unsigned ReadyCycle = |
| computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM); |
| SumBubbles += ReadyCycle - CurrCycle; |
| #ifndef NDEBUG |
| ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle)); |
| #endif |
| CurrCycle = ++ReadyCycle; |
| } |
| #ifndef NDEBUG |
| LLVM_DEBUG( |
| printScheduleModel(ReadyCyclesSorted); |
| dbgs() << "\n\t" |
| << "Metric: " |
| << (SumBubbles |
| ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle |
| : 1) |
| << "\n\n"); |
| #endif |
| |
| return ScheduleMetrics(CurrCycle, SumBubbles); |
| } |
| |
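| /// Overload of the above that walks \p DAG's current instruction order |
| /// instead of an explicit SUnit list. |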
| ScheduleMetrics |
| GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) { |
| #ifndef NDEBUG |
| std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> |
| ReadyCyclesSorted; |
| #endif |
| const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); |
| unsigned SumBubbles = 0; |
| DenseMap<unsigned, unsigned> ReadyCycles; |
| unsigned CurrCycle = 0; |
| for (auto &MI : DAG) { |
| SUnit *SU = DAG.getSUnit(&MI); |
| if (!SU) |
| continue; |
| unsigned ReadyCycle = |
| computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM); |
| SumBubbles += ReadyCycle - CurrCycle; |
| #ifndef NDEBUG |
| ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle)); |
| #endif |
| CurrCycle = ++ReadyCycle; |
| } |
| #ifndef NDEBUG |
| LLVM_DEBUG( |
| printScheduleModel(ReadyCyclesSorted); |
| dbgs() << "\n\t" |
| << "Metric: " |
| << (SumBubbles |
| ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle |
| : 1) |
| << "\n\n"); |
| #endif |
| |
| return ScheduleMetrics(CurrCycle, SumBubbles); |
| } |
| |
| bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { |
| if (WavesAfter < DAG.MinOccupancy) |
| return true; |
| |
| // For dynamic VGPR mode, we don't want to waste any VGPR blocks. |
| if (ST.isDynamicVGPREnabled()) { |
| unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( |
| &ST, PressureBefore.getVGPRNum(false)); |
| unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( |
| &ST, PressureAfter.getVGPRNum(false)); |
| if (BlocksAfter > BlocksBefore) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { |
| if (PressureAfter == PressureBefore) |
| return false; |
| |
| if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) |
| return true; |
| |
| if (mayCauseSpilling(WavesAfter)) |
| return true; |
| |
| return false; |
| } |
| |
| bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { |
| // If RP is not reduced in the unclustered reschedule stage, revert to the |
| // old schedule. |
| if ((WavesAfter <= PressureBefore.getOccupancy(ST) && |
| mayCauseSpilling(WavesAfter)) || |
| GCNSchedStage::shouldRevertScheduling(WavesAfter)) { |
| LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n"); |
| return true; |
| } |
| |
  // Do not attempt to relax the schedule even more if we are already spilling.
| if (isRegionWithExcessRP()) |
| return false; |
| |
| LLVM_DEBUG( |
| dbgs() |
| << "\n\t *** In shouldRevertScheduling ***\n" |
| << " *********** BEFORE UnclusteredHighRPStage ***********\n"); |
| ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits); |
| LLVM_DEBUG( |
| dbgs() |
| << "\n *********** AFTER UnclusteredHighRPStage ***********\n"); |
| ScheduleMetrics MAfter = getScheduleMetrics(DAG); |
| unsigned OldMetric = MBefore.getMetric(); |
| unsigned NewMetric = MAfter.getMetric(); |
| unsigned WavesBefore = |
| std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST)); |
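  // Profit combines the relative occupancy change (WavesAfter / WavesBefore)
  // with the relative latency-metric change ((OldMetric + bias) / NewMetric),
  // scaled by ScaleFactor. Keep the new schedule only if the combined ratio
  // at least breaks even (Profit >= ScaleFactor).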
| unsigned Profit = |
| ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore * |
| ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) / |
| NewMetric) / |
| ScheduleMetrics::ScaleFactor; |
| LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after " |
| << MAfter << "Profit: " << Profit << "\n"); |
| return Profit < ScheduleMetrics::ScaleFactor; |
| } |
| |
| bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { |
| if (PressureAfter == PressureBefore) |
| return false; |
| |
| if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) |
| return true; |
| |
| if (mayCauseSpilling(WavesAfter)) |
| return true; |
| |
| return false; |
| } |
| |
| bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { |
| return GCNSchedStage::shouldRevertScheduling(WavesAfter) || |
| mayCauseSpilling(WavesAfter) || |
| (IncreaseOccupancy && WavesAfter < TargetOcc); |
| } |
| |
| bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { |
| if (mayCauseSpilling(WavesAfter)) |
| return true; |
| |
| return false; |
| } |
| |
| bool MemoryClauseInitialScheduleStage::shouldRevertScheduling( |
| unsigned WavesAfter) { |
| return mayCauseSpilling(WavesAfter); |
| } |
| |
| bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { |
| if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() && |
| !PressureAfter.less(MF, PressureBefore)) { |
| LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n"); |
| return true; |
| } |
| |
| return false; |
| } |
| |
| void GCNSchedStage::revertScheduling() { |
| DAG.RegionsWithMinOcc[RegionIdx] = |
| PressureBefore.getOccupancy(ST) == DAG.MinOccupancy; |
| LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); |
| DAG.RescheduleRegions[RegionIdx] = |
| S.hasNextStage() && |
| S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule; |
| DAG.RegionEnd = DAG.RegionBegin; |
| int SkippedDebugInstr = 0; |
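  // Move each unscheduled instruction back to its pre-scheduling position,
  // keeping LiveIntervals and lane masks up to date. Debug instructions are
  // skipped here and handled after the loop.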
| for (MachineInstr *MI : Unsched) { |
| if (MI->isDebugInstr()) { |
| ++SkippedDebugInstr; |
| continue; |
| } |
| |
| if (MI->getIterator() != DAG.RegionEnd) { |
| DAG.BB->splice(DAG.RegionEnd, DAG.BB, MI); |
| if (!MI->isDebugInstr()) |
| DAG.LIS->handleMove(*MI, true); |
| } |
| |
| // Reset read-undef flags and update them later. |
| for (auto &Op : MI->all_defs()) |
| Op.setIsUndef(false); |
| RegisterOperands RegOpers; |
| RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false); |
| if (!MI->isDebugInstr()) { |
| if (DAG.ShouldTrackLaneMasks) { |
| // Adjust liveness and add missing dead+read-undef flags. |
| SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot(); |
| RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI); |
| } else { |
| // Adjust for missing dead-def flags. |
| RegOpers.detectDeadDefs(*MI, *DAG.LIS); |
| } |
| } |
| DAG.RegionEnd = MI->getIterator(); |
| ++DAG.RegionEnd; |
| LLVM_DEBUG(dbgs() << "Scheduling " << *MI); |
| } |
| |
  // After reverting the schedule, debug instrs will now be at the end of the
  // block and RegionEnd will point to the first debug instr. Increment
  // RegionEnd past debug instrs to the actual end of the scheduling region.
| while (SkippedDebugInstr-- > 0) |
| ++DAG.RegionEnd; |
| |
| // If Unsched.front() instruction is a debug instruction, this will actually |
| // shrink the region since we moved all debug instructions to the end of the |
| // block. Find the first instruction that is not a debug instruction. |
| DAG.RegionBegin = Unsched.front()->getIterator(); |
| if (DAG.RegionBegin->isDebugInstr()) { |
| for (MachineInstr *MI : Unsched) { |
| if (MI->isDebugInstr()) |
| continue; |
| DAG.RegionBegin = MI->getIterator(); |
| break; |
| } |
| } |
| |
| // Then move the debug instructions back into their correct place and set |
| // RegionBegin and RegionEnd if needed. |
| DAG.placeDebugValues(); |
| |
| DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); |
| } |
| |
| bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat, |
| SlotIndex OriginalIdx, |
| SlotIndex RematIdx) const { |
| |
| LiveIntervals *LIS = DAG.LIS; |
| MachineRegisterInfo &MRI = DAG.MRI; |
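  // Compare liveness at the early-clobber slots of the original instruction
  // and of the rematerialization point.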
| OriginalIdx = OriginalIdx.getRegSlot(true); |
| RematIdx = std::max(RematIdx, RematIdx.getRegSlot(true)); |
| for (const MachineOperand &MO : InstToRemat->operands()) { |
| if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) |
| continue; |
| |
| if (!MO.getReg().isVirtual()) { |
| // Do not attempt to reason about PhysRegs |
      // TODO: better analysis of PhysReg liveness
| if (!DAG.MRI.isConstantPhysReg(MO.getReg()) && |
| !DAG.TII->isIgnorableUse(MO)) |
| return false; |
| |
| // Constant PhysRegs and IgnorableUses are okay |
| continue; |
| } |
| |
| LiveInterval &LI = LIS->getInterval(MO.getReg()); |
| const VNInfo *OVNI = LI.getVNInfoAt(OriginalIdx); |
| assert(OVNI); |
| |
| // Don't allow rematerialization immediately after the original def. |
| // It would be incorrect if InstToRemat redefines the register. |
| // See PR14098. |
| if (SlotIndex::isSameInstr(OriginalIdx, RematIdx)) |
| return false; |
| |
| if (OVNI != LI.getVNInfoAt(RematIdx)) |
| return false; |
| |
| // Check that subrange is live at RematIdx. |
| if (LI.hasSubRanges()) { |
| const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); |
| unsigned SubReg = MO.getSubReg(); |
| LaneBitmask LM = SubReg ? TRI->getSubRegIndexLaneMask(SubReg) |
| : MRI.getMaxLaneMaskForVReg(MO.getReg()); |
| for (LiveInterval::SubRange &SR : LI.subranges()) { |
| if ((SR.LaneMask & LM).none()) |
| continue; |
| if (!SR.liveAt(RematIdx)) |
| return false; |
| |
| // Early exit if all used lanes are checked. No need to continue. |
| LM &= ~SR.LaneMask; |
| if (LM.none()) |
| break; |
| } |
| } |
| } |
| return true; |
| } |
| |
| namespace { |
| /// Models excess register pressure in a region and tracks our progress as we |
| /// identify rematerialization opportunities. |
| struct ExcessRP { |
| /// Number of excess ArchVGPRs. |
| unsigned ArchVGPRs = 0; |
| /// Number of excess AGPRs. |
| unsigned AGPRs = 0; |
| /// For unified register files, number of excess VGPRs. |
| unsigned VGPRs = 0; |
| /// For unified register files with AGPR usage, number of excess ArchVGPRs to |
| /// save before we are able to save a whole allocation granule. |
| unsigned ArchVGPRsToAlignment = 0; |
| /// Whether the region uses AGPRs. |
| bool HasAGPRs = false; |
| /// Whether the subtarget has a unified RF. |
| bool UnifiedRF; |
| |
| /// Constructs the excess RP model; determines the excess pressure w.r.t. a |
| /// maximum number of allowed VGPRs. |
| ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP, unsigned MaxVGPRs); |
| |
| /// Accounts for \p NumRegs saved ArchVGPRs in the model. If \p |
| /// UseArchVGPRForAGPRSpill is true, saved ArchVGPRs are used to save excess |
| /// AGPRs once excess ArchVGPR pressure has been eliminated. Returns whether |
| /// saving these ArchVGPRs helped reduce excess pressure. |
| bool saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill); |
| |
  /// Accounts for \p NumRegs saved AGPRs in the model. Returns whether saving
  /// these AGPRs helped reduce excess pressure.
| bool saveAGPRs(unsigned NumRegs); |
| |
| /// Returns whether there is any excess register pressure. |
| operator bool() const { return ArchVGPRs != 0 || AGPRs != 0 || VGPRs != 0; } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| friend raw_ostream &operator<<(raw_ostream &OS, const ExcessRP &Excess) { |
| OS << Excess.ArchVGPRs << " ArchVGPRs, " << Excess.AGPRs << " AGPRs, and " |
       << Excess.VGPRs << " VGPRs (next ArchVGPR alignment in "
| << Excess.ArchVGPRsToAlignment << " registers)\n"; |
| return OS; |
| } |
| #endif |
| |
| private: |
| static inline bool saveRegs(unsigned &LeftToSave, unsigned &NumRegs) { |
| unsigned NumSaved = std::min(LeftToSave, NumRegs); |
| NumRegs -= NumSaved; |
| LeftToSave -= NumSaved; |
| return NumSaved; |
| } |
| }; |
| } // namespace |
| |
| ExcessRP::ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP, |
| unsigned MaxVGPRs) |
| : UnifiedRF(ST.hasGFX90AInsts()) { |
| unsigned NumArchVGPRs = RP.getArchVGPRNum(); |
| unsigned NumAGPRs = RP.getAGPRNum(); |
  HasAGPRs = NumAGPRs != 0;
| |
| if (!UnifiedRF) { |
| // Non-unified RF. Account for excess pressure for ArchVGPRs and AGPRs |
| // independently. |
| if (NumArchVGPRs > MaxVGPRs) |
| ArchVGPRs = NumArchVGPRs - MaxVGPRs; |
| if (NumAGPRs > MaxVGPRs) |
| AGPRs = NumAGPRs - MaxVGPRs; |
| return; |
| } |
| |
| // Independently of whether overall VGPR pressure is under the limit, we still |
| // have to check whether ArchVGPR pressure or AGPR pressure alone exceeds the |
| // number of addressable registers in each category. |
| const unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs(); |
| if (NumArchVGPRs > MaxArchVGPRs) { |
| ArchVGPRs = NumArchVGPRs - MaxArchVGPRs; |
| NumArchVGPRs = MaxArchVGPRs; |
| } |
| if (NumAGPRs > MaxArchVGPRs) { |
| AGPRs = NumAGPRs - MaxArchVGPRs; |
| NumAGPRs = MaxArchVGPRs; |
| } |
| |
| // Check overall VGPR usage against the limit; any excess above addressable |
| // register limits has already been accounted for. |
| const unsigned Granule = AMDGPU::IsaInfo::getArchVGPRAllocGranule(); |
| unsigned NumVGPRs = GCNRegPressure::getUnifiedVGPRNum(NumArchVGPRs, NumAGPRs); |
| if (NumVGPRs > MaxVGPRs) { |
| VGPRs = NumVGPRs - MaxVGPRs; |
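    // ArchVGPRsToAlignment is the number of ArchVGPRs that must be saved
    // before the ArchVGPR allocation shrinks to the previous granule boundary
    // and actually frees unified VGPRs.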
| ArchVGPRsToAlignment = NumArchVGPRs - alignDown(NumArchVGPRs, Granule); |
| if (!ArchVGPRsToAlignment) |
| ArchVGPRsToAlignment = Granule; |
| } |
| } |
| |
| bool ExcessRP::saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill) { |
| bool Progress = saveRegs(ArchVGPRs, NumRegs); |
| if (!NumRegs) |
| return Progress; |
| |
| if (!UnifiedRF) { |
| if (UseArchVGPRForAGPRSpill) |
| Progress |= saveRegs(AGPRs, NumRegs); |
| } else if (HasAGPRs && (VGPRs || (UseArchVGPRForAGPRSpill && AGPRs))) { |
| // There is progress as long as there are VGPRs left to save, even if the |
| // save induced by this particular call does not cross an ArchVGPR alignment |
| // barrier. |
| Progress = true; |
| |
| // ArchVGPRs can only be allocated as a multiple of a granule in unified RF. |
| unsigned NumSavedRegs = 0; |
| |
| // Count the number of whole ArchVGPR allocation granules we can save. |
| const unsigned Granule = AMDGPU::IsaInfo::getArchVGPRAllocGranule(); |
| if (unsigned NumGranules = NumRegs / Granule; NumGranules) { |
| NumSavedRegs = NumGranules * Granule; |
| NumRegs -= NumSavedRegs; |
| } |
| |
| // We may be able to save one more whole ArchVGPR allocation granule. |
| if (NumRegs >= ArchVGPRsToAlignment) { |
| NumSavedRegs += Granule; |
| ArchVGPRsToAlignment = Granule - (NumRegs - ArchVGPRsToAlignment); |
| } else { |
| ArchVGPRsToAlignment -= NumRegs; |
| } |
| |
| // Prioritize saving generic VGPRs, then AGPRs if we allow AGPR-to-ArchVGPR |
| // spilling and have some free ArchVGPR slots. |
| saveRegs(VGPRs, NumSavedRegs); |
| if (UseArchVGPRForAGPRSpill) |
| saveRegs(AGPRs, NumSavedRegs); |
| } else { |
    // No AGPR usage in the region, i.e., no allocation granule to worry about.
| Progress |= saveRegs(VGPRs, NumRegs); |
| } |
| |
| return Progress; |
| } |
| |
| bool ExcessRP::saveAGPRs(unsigned NumRegs) { |
| return saveRegs(AGPRs, NumRegs) || saveRegs(VGPRs, NumRegs); |
| } |
| |
| bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { |
| const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI); |
| |
| REMAT_DEBUG({ |
| dbgs() << "Collecting rematerializable instructions in "; |
| MF.getFunction().printAsOperand(dbgs(), false); |
| dbgs() << '\n'; |
| }); |
| |
| // Maps optimizable regions (i.e., regions at minimum and VGPR-limited |
| // occupancy, or regions with VGPR spilling) to a model of their excess RP. |
| DenseMap<unsigned, ExcessRP> OptRegions; |
| const Function &F = MF.getFunction(); |
| |
| std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F); |
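  // The "NoSpill" limits are the register budgets beyond which the function
  // spills; the "IncOcc" limits are what a region must fit in to reach
  // DAG.MinOccupancy + 1 waves.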
| const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F); |
| const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F); |
| const unsigned MaxSGPRsIncOcc = |
| ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false); |
| const unsigned MaxVGPRsIncOcc = ST.getMaxNumVGPRs(DAG.MinOccupancy + 1); |
| IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy; |
| |
| auto ClearOptRegionsIf = [&](bool Cond) -> bool { |
| if (Cond) { |
| // We won't try to increase occupancy. |
| IncreaseOccupancy = false; |
| OptRegions.clear(); |
| } |
| return Cond; |
| }; |
| |
| // Collect optimizable regions. If there is spilling in any region we will |
| // just try to reduce ArchVGPR spilling. Otherwise we will try to increase |
| // occupancy by one in the whole function. |
| for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { |
| GCNRegPressure &RP = DAG.Pressure[I]; |
| |
    // Check whether SGPR pressure prevents us from eliminating spilling.
| unsigned NumSGPRs = RP.getSGPRNum(); |
| if (NumSGPRs > MaxSGPRsNoSpill) |
| ClearOptRegionsIf(IncreaseOccupancy); |
| |
| ExcessRP Excess(ST, RP, MaxVGPRsNoSpill); |
| if (Excess) { |
| ClearOptRegionsIf(IncreaseOccupancy); |
| } else if (IncreaseOccupancy) { |
| // Check whether SGPR pressure prevents us from increasing occupancy. |
| if (ClearOptRegionsIf(NumSGPRs > MaxSGPRsIncOcc)) { |
| if (DAG.MinOccupancy >= WavesPerEU.first) |
| return false; |
| continue; |
| } |
| if ((Excess = ExcessRP(ST, RP, MaxVGPRsIncOcc))) { |
| // We can only rematerialize ArchVGPRs at this point. |
| unsigned NumArchVGPRsToRemat = Excess.ArchVGPRs + Excess.VGPRs; |
| bool NotEnoughArchVGPRs = NumArchVGPRsToRemat > RP.getArchVGPRNum(); |
| if (ClearOptRegionsIf(Excess.AGPRs || NotEnoughArchVGPRs)) { |
| if (DAG.MinOccupancy >= WavesPerEU.first) |
| return false; |
| continue; |
| } |
| } |
| } |
| if (Excess) |
| OptRegions.insert({I, Excess}); |
| } |
| if (OptRegions.empty()) |
| return false; |
| |
| #ifndef NDEBUG |
| if (IncreaseOccupancy) |
| REMAT_DEBUG(dbgs() << "Occupancy minimal in regions:\n"); |
| else |
| REMAT_DEBUG(dbgs() << "Spilling in regions:\n"); |
| for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { |
| if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) |
| REMAT_DEBUG(dbgs() << " " << I << ": " << OptIt->getSecond() << '\n'); |
| } |
| #endif |
| |
| // When we are reducing spilling, the target is the minimum target number of |
| // waves/EU determined by the subtarget. |
| TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 : WavesPerEU.first; |
| |
| // Accounts for a reduction in RP in an optimizable region. Returns whether we |
| // estimate that we have identified enough rematerialization opportunities to |
| // achieve our goal, and sets Progress to true when this particular reduction |
| // in pressure was helpful toward that goal. |
| auto ReduceRPInRegion = [&](auto OptIt, LaneBitmask Mask, |
| bool &Progress) -> bool { |
| ExcessRP &Excess = OptIt->getSecond(); |
| // We allow saved ArchVGPRs to be considered as free spill slots for AGPRs |
| // only when we are just trying to eliminate spilling to memory. At this |
| // point we err on the conservative side and do not increase |
| // register-to-register spilling for the sake of increasing occupancy. |
| Progress |= |
| Excess.saveArchVGPRs(SIRegisterInfo::getNumCoveredRegs(Mask), |
| /*UseArchVGPRForAGPRSpill=*/!IncreaseOccupancy); |
| if (!Excess) |
| OptRegions.erase(OptIt->getFirst()); |
| return OptRegions.empty(); |
| }; |
| |
  // We need up-to-date live-out info to query live-out register masks in
| // regions containing rematerializable instructions. |
| DAG.RegionLiveOuts.buildLiveRegMap(); |
| |
| // Cache set of registers that are going to be rematerialized. |
| DenseSet<unsigned> RematRegs; |
| |
| // Identify rematerializable instructions in the function. |
| for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { |
| auto Region = DAG.Regions[I]; |
| for (auto MI = Region.first; MI != Region.second; ++MI) { |
| // The instruction must be trivially rematerializable. |
| MachineInstr &DefMI = *MI; |
| if (!isTriviallyReMaterializable(DefMI)) |
| continue; |
| |
| // We only support rematerializing virtual VGPRs with one definition. |
| Register Reg = DefMI.getOperand(0).getReg(); |
| if (!Reg.isVirtual() || !SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) || |
| !DAG.MRI.hasOneDef(Reg)) |
| continue; |
| |
| // We only care to rematerialize the instruction if it has a single |
| // non-debug user in a different region. The using MI may not belong to a |
| // region if it is a lone region terminator. |
| MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg); |
| if (!UseMI) |
| continue; |
| auto UseRegion = MIRegion.find(UseMI); |
| if (UseRegion != MIRegion.end() && UseRegion->second == I) |
| continue; |
| |
| // Do not rematerialize an instruction if it uses or is used by an |
| // instruction that we have designated for rematerialization. |
| // FIXME: Allow for rematerialization chains: this requires 1. updating |
| // remat points to account for uses that are rematerialized, and 2. either |
| // rematerializing the candidates in careful ordering, or deferring the |
| // MBB RP walk until the entire chain has been rematerialized. |
| if (Rematerializations.contains(UseMI) || |
| llvm::any_of(DefMI.operands(), [&RematRegs](MachineOperand &MO) { |
| return MO.isReg() && RematRegs.contains(MO.getReg()); |
| })) |
| continue; |
| |
      // Do not rematerialize an instruction if it uses registers that aren't
| // available at its use. This ensures that we are not extending any live |
| // range while rematerializing. |
| SlotIndex DefIdx = DAG.LIS->getInstructionIndex(DefMI); |
| SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true); |
| if (!allUsesAvailableAt(&DefMI, DefIdx, UseIdx)) |
| continue; |
| |
| REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI); |
| RematInstruction &Remat = |
| Rematerializations.try_emplace(&DefMI, UseMI).first->second; |
| |
| bool RematUseful = false; |
| if (auto It = OptRegions.find(I); It != OptRegions.end()) { |
| // Optimistically consider that moving the instruction out of its |
| // defining region will reduce RP in the latter; this assumes that |
| // maximum RP in the region is reached somewhere between the defining |
| // instruction and the end of the region. |
| REMAT_DEBUG(dbgs() << " Defining region is optimizable\n"); |
| LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg]; |
| if (ReduceRPInRegion(It, Mask, RematUseful)) |
| return true; |
| } |
| |
| for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) { |
| // We are only collecting regions in which the register is a live-in |
| // (and may be live-through). |
| auto It = DAG.LiveIns[LIRegion].find(Reg); |
| if (It == DAG.LiveIns[LIRegion].end() || It->second.none()) |
| continue; |
| Remat.LiveInRegions.insert(LIRegion); |
| |
| // Account for the reduction in RP due to the rematerialization in an |
| // optimizable region in which the defined register is a live-in. This |
        // is exact for live-through regions but optimistic in the using
        // region, where RP is actually reduced only if maximum RP is reached
        // somewhere between the beginning of the region and the
        // rematerializable instruction's use.
| if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) { |
| REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n'); |
| if (ReduceRPInRegion(It, DAG.LiveIns[LIRegion][Reg], RematUseful)) |
| return true; |
| } |
| } |
| |
| // If the instruction is not a live-in or live-out in any optimizable |
| // region then there is no point in rematerializing it. |
| if (!RematUseful) { |
| Rematerializations.pop_back(); |
| REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n"); |
| } else { |
| RematRegs.insert(Reg); |
| } |
| } |
| } |
| |
| if (IncreaseOccupancy) { |
| // We were trying to increase occupancy but failed, abort the stage. |
| REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n"); |
| Rematerializations.clear(); |
| return false; |
| } |
| REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n"); |
| return !Rematerializations.empty(); |
| } |
| |
| void PreRARematStage::rematerialize() { |
| const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo(); |
| |
  // Collect regions whose RP changes in an unpredictable way; we will have to
  // fully recompute their RP after all rematerializations.
| DenseSet<unsigned> RecomputeRP; |
| |
| // Rematerialize all instructions. |
| for (auto &[DefMI, Remat] : Rematerializations) { |
| MachineBasicBlock::iterator InsertPos(Remat.UseMI); |
| Register Reg = DefMI->getOperand(0).getReg(); |
| unsigned SubReg = DefMI->getOperand(0).getSubReg(); |
| unsigned DefRegion = MIRegion.at(DefMI); |
| |
| // Rematerialize DefMI to its use block. |
| TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, SubReg, *DefMI, |
| *DAG.TRI); |
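    // reMaterialize inserts the rematerialized instruction right before
    // InsertPos; record it and register it with LiveIntervals.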
| Remat.RematMI = &*std::prev(InsertPos); |
| Remat.RematMI->getOperand(0).setSubReg(SubReg); |
| DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI); |
| |
    // Update region boundaries in regions we sank from (remove the defining
    // MI) and to (insert the MI rematerialized in the use block). Only then
    // can we erase the original MI.
| DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr); |
| auto UseRegion = MIRegion.find(Remat.UseMI); |
| if (UseRegion != MIRegion.end()) { |
| DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos, |
| Remat.RematMI); |
| } |
| DefMI->eraseFromParent(); |
| DAG.LIS->RemoveMachineInstrFromMaps(*DefMI); |
| |
| // Collect all regions impacted by the rematerialization and update their |
| // live-in/RP information. |
| for (unsigned I : Remat.LiveInRegions) { |
| ImpactedRegions.insert({I, DAG.Pressure[I]}); |
| GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I]; |
| |
| #ifdef EXPENSIVE_CHECKS |
      // All uses are known to be available / live at the remat point. Thus,
      // the uses should already be live-in to the region.
| for (MachineOperand &MO : DefMI->operands()) { |
| if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) |
| continue; |
| |
| Register UseReg = MO.getReg(); |
| if (!UseReg.isVirtual()) |
| continue; |
| |
| LiveInterval &LI = DAG.LIS->getInterval(UseReg); |
| LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg()); |
| if (LI.hasSubRanges() && MO.getSubReg()) |
| LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg()); |
| |
| LaneBitmask LiveInMask = RegionLiveIns.at(UseReg); |
| LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM); |
| // If this register has lanes not covered by the LiveIns, be sure they |
| // do not map to any subrange. ref: |
| // machine-scheduler-sink-trivial-remats.mir::omitted_subrange |
| if (UncoveredLanes.any()) { |
| assert(LI.hasSubRanges()); |
| for (LiveInterval::SubRange &SR : LI.subranges()) |
| assert((SR.LaneMask & UncoveredLanes).none()); |
| } |
| } |
| #endif |
| |
| // The register is no longer a live-in in all regions but the one that |
| // contains the single use. In live-through regions, maximum register |
| // pressure decreases predictably so we can directly update it. In the |
| // using region, maximum RP may or may not decrease, so we will mark it |
| // for re-computation after all materializations have taken place. |
| LaneBitmask PrevMask = RegionLiveIns[Reg]; |
| RegionLiveIns.erase(Reg); |
| RegMasks.insert({{I, Remat.RematMI->getOperand(0).getReg()}, PrevMask}); |
| if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent()) |
| DAG.Pressure[I].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI); |
| else |
| RecomputeRP.insert(I); |
| } |
| // RP in the region from which the instruction was rematerialized may or may |
| // not decrease. |
| ImpactedRegions.insert({DefRegion, DAG.Pressure[DefRegion]}); |
| RecomputeRP.insert(DefRegion); |
| |
| // Recompute live interval to reflect the register's rematerialization. |
| Register RematReg = Remat.RematMI->getOperand(0).getReg(); |
| DAG.LIS->removeInterval(RematReg); |
| DAG.LIS->createAndComputeVirtRegInterval(RematReg); |
| } |
| |
| // All regions impacted by at least one rematerialization must be rescheduled. |
| // Maximum pressure must also be recomputed for all regions where it changed |
| // non-predictably and checked against the target occupancy. |
| AchievedOcc = TargetOcc; |
| for (auto &[I, OriginalRP] : ImpactedRegions) { |
| bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second; |
| DAG.RescheduleRegions[I] = !IsEmptyRegion; |
| if (!RecomputeRP.contains(I)) |
| continue; |
| |
| GCNRegPressure RP; |
| if (IsEmptyRegion) { |
| RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]); |
| } else { |
| GCNDownwardRPTracker RPT(*DAG.LIS); |
| auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first, |
| DAG.Regions[I].second); |
| if (NonDbgMI == DAG.Regions[I].second) { |
| // Region is non-empty but contains only debug instructions. |
| RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]); |
| } else { |
| RPT.reset(*NonDbgMI, &DAG.LiveIns[I]); |
| RPT.advance(DAG.Regions[I].second); |
| RP = RPT.moveMaxPressure(); |
| } |
| } |
| DAG.Pressure[I] = RP; |
| AchievedOcc = std::min(AchievedOcc, RP.getOccupancy(ST)); |
| } |
| REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n"); |
| } |
| |
| // Copied from MachineLICM |
| bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) { |
| if (!DAG.TII->isTriviallyReMaterializable(MI)) |
| return false; |
| |
| for (const MachineOperand &MO : MI.all_uses()) { |
    // We can't remat physreg uses, unless the register is constant or the use
    // is ignorable (e.g. the implicit exec use on VALU instructions).
| if (MO.getReg().isPhysical()) { |
| if (DAG.MRI.isConstantPhysReg(MO.getReg()) || DAG.TII->isIgnorableUse(MO)) |
| continue; |
| return false; |
| } |
| } |
| |
| return true; |
| } |
| |
| void PreRARematStage::finalizeGCNSchedStage() { |
  // We consider that reducing spilling is always beneficial, so we never roll
  // back rematerializations in such cases. It's also possible that
  // rescheduling lowers occupancy below the one achieved just through remats,
  // in which case we do not want to roll back either (the rescheduling was
  // already reverted in PreRARematStage::shouldRevertScheduling in such
  // cases).
| unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy); |
| if (!IncreaseOccupancy || MaxOcc >= TargetOcc) |
| return; |
| |
| REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n"); |
| const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo(); |
| |
| // Rollback the rematerializations. |
| for (const auto &[DefMI, Remat] : Rematerializations) { |
| MachineInstr &RematMI = *Remat.RematMI; |
| unsigned DefRegion = MIRegion.at(DefMI); |
| MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second); |
| MachineBasicBlock *MBB = RegionBB[DefRegion]; |
| Register Reg = RematMI.getOperand(0).getReg(); |
| unsigned SubReg = RematMI.getOperand(0).getSubReg(); |
| |
    // Re-rematerialize MI at the end of its original region. Note that it may
    // not end up at exactly its original position within the region, but that
    // should not matter much.
| TII->reMaterialize(*MBB, InsertPos, Reg, SubReg, RematMI, *DAG.TRI); |
| MachineInstr *NewMI = &*std::prev(InsertPos); |
| NewMI->getOperand(0).setSubReg(SubReg); |
| DAG.LIS->InsertMachineInstrInMaps(*NewMI); |
| |
| auto UseRegion = MIRegion.find(Remat.UseMI); |
| if (UseRegion != MIRegion.end()) { |
| DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], RematMI, |
| nullptr); |
| } |
| DAG.updateRegionBoundaries(DAG.Regions[DefRegion], InsertPos, NewMI); |
| |
| // Erase rematerialized MI. |
| RematMI.eraseFromParent(); |
| DAG.LIS->RemoveMachineInstrFromMaps(RematMI); |
| |
    // Recompute the live interval for the re-rematerialized register.
| DAG.LIS->removeInterval(Reg); |
| DAG.LIS->createAndComputeVirtRegInterval(Reg); |
| |
| // Re-add the register as a live-in in all regions it used to be one in. |
| for (unsigned LIRegion : Remat.LiveInRegions) |
| DAG.LiveIns[LIRegion].insert({Reg, RegMasks.at({LIRegion, Reg})}); |
| } |
| |
| // Reset RP in all impacted regions. |
| for (auto &[I, OriginalRP] : ImpactedRegions) |
| DAG.Pressure[I] = OriginalRP; |
| |
| GCNSchedStage::finalizeGCNSchedStage(); |
| } |
| |
| void GCNScheduleDAGMILive::updateRegionBoundaries( |
| RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI, |
| MachineInstr *NewMI) { |
  assert((!NewMI || NewMI != RegionBounds.second) &&
         "cannot insert at region end");
| |
| if (RegionBounds.first == RegionBounds.second) { |
| assert(NewMI && "cannot remove from an empty region"); |
| RegionBounds.first = NewMI; |
| return; |
| } |
| |
  // We only care about modifications at the beginning of a non-empty region
  // since the upper region boundary is exclusive.
| if (MI != RegionBounds.first) |
| return; |
| if (!NewMI) |
| RegionBounds.first = std::next(MI); // Removal |
| else |
| RegionBounds.first = NewMI; // Insertion |
| } |
| |
| static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { |
| const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII); |
| return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) { |
| return SII->isIGLPMutationOnly(MI->getOpcode()); |
| }); |
| } |
| |
| GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive( |
| MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S, |
| bool RemoveKillFlags) |
| : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {} |
| |
| void GCNPostScheduleDAGMILive::schedule() { |
| HasIGLPInstrs = hasIGLPInstrs(this); |
| if (HasIGLPInstrs) { |
| SavedMutations.clear(); |
| SavedMutations.swap(Mutations); |
| addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA)); |
| } |
| |
| ScheduleDAGMI::schedule(); |
| } |
| |
| void GCNPostScheduleDAGMILive::finalizeSchedule() { |
| if (HasIGLPInstrs) |
| SavedMutations.swap(Mutations); |
| |
| ScheduleDAGMI::finalizeSchedule(); |
| } |