llvm/lib/Target/AArch64/AArch64SRLTDefineSuperRegs.cpp - llvm-project - Git at Google

 //===- AArch64SRLTDefineSuperRegs.cpp -------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // When SubRegister Liveness Tracking (SRLT) is enabled, this pass adds
 // extra implicit-def's to instructions that define the low N bits of
 // a GPR/FPR register to also define the top bits, because all AArch64
 // instructions that write the low bits of a GPR/FPR also implicitly zero
 // the top bits.  For example, 'mov w0, w1' writes zeroes to the top 32-bits of
 // x0, so this pass adds a `implicit-def $x0` after register allocation.
 //
 // These semantics are originally represented in the MIR using `SUBREG_TO_REG`
 // which expresses that the top bits have been defined by the preceding
 // instructions, but during register coalescing this information is lost and in
 // contrast to when SRLT is disabled, when rewriting virtual -> physical
 // registers the implicit-defs are not added to the instruction.
 //
 // There have been several attempts to fix this in the coalescer [1], but each
 // iteration has exposed new bugs and the patch had to be reverted.
 // Additionally, the concept of adding 'implicit-def' of a virtual register is
 // particularly fragile and many places don't expect it (for example in
 // `X86::commuteInstructionImpl` the  code only looks at specific operands and
 // does not consider implicit-defs. Similar in `SplitEditor::addDeadDef` where
 // it traverses operand 'defs' rather than 'all_defs').
 //
 // We want a temporary solution that doesn't impact other targets and is simpler
 // and less intrusive than the patch proposed for the register coalescer [1], so
 // that we can enable SRLT for AArch64.
 //
 // The approach here is to just add the 'implicit-def' manually after rewriting
 // virtual regs -> physical regs. This still means that during the register
 // allocation process the dependences are not accurately represented in the MIR
 // and LiveIntervals, but there are several reasons why we believe this isn't a
 // problem in practice:
 // (A) The register allocator only spills entire virtual registers.
 //     This is additionally guarded by code in
 //     AArch64InstrInfo::storeRegToStackSlot/loadRegFromStackSlot
 //     where it checks if a register matches the expected register class.
 // (B) Rematerialization only happens when the instruction writes the full
 //     register.
 // (C) The high bits of the AArch64 register cannot be written independently.
 // (D) Instructions that write only part of a register always take that same
 //     register as a tied input operand, to indicate it's a merging operation.
 //
 // (A) means that for two virtual registers of regclass GPR32 and GPR64, if the
 // GPR32 register is coalesced into the GPR64 vreg then the full GPR64 would
 // be spilled/filled even if only the low 32-bits would be required for the
 // given liverange. (B) means that the top bits of a GPR64 would never be
 // overwritten by rematerialising a GPR32 sub-register for a given liverange.
 // (C-D) means that we can assume that the MIR as input to the register
 // allocator correctly expresses the instruction behaviour and dependences
 // between values, so unless the register allocator would violate (A) or (B),
 // the MIR is otherwise sound.
 //
 // Alternative approaches have also been considered, such as:
 // (1) Changing the AArch64 instruction definitions to write all bits and
 //     extract the low N bits for the result.
 // (2) Disabling coalescing of SUBREG_TO_REG and using regalloc hints to tell
 //     the register allocator to favour the same register for the input/output.
 // (3) Adding a new coalescer guard node with a tied-operand constraint, such
 //     that when the SUBREG_TO_REG is removed, something still represents that
 //     the top bits are defined. The node would get removed before rewriting
 //     virtregs.
 // (4) Using an explicit INSERT_SUBREG into a zero value and try to optimize
 //     away the INSERT_SUBREG (this is a more explicit variant of (2) and (3))
 // (5) Adding a new MachineOperand flag that represents the top bits would be
 //     defined, but are not read nor undef.
 //
 // (1) would be the best approach but would be a significant effort as it
 // requires rewriting most/all instruction definitions and fixing MIR passes
 // that rely on the current definitions, whereas (2-4) result in sub-optimal
 // code that can't really be avoided because the explicit nodes would stop
 // rematerialization. (5) might be a way to mitigate the
 // fragility of implicit-def's of virtual registers if we want to pursue
 // landing [1], but then we'd rather choose approach (1) to avoid using
 // SUBREG_TO_REG entirely.
 //
 // [1] https://github.com/llvm/llvm-project/pull/168353
 //===----------------------------------------------------------------------===//

 #include "AArch64InstrInfo.h"
 #include "AArch64MachineFunctionInfo.h"
 #include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/Support/Debug.h"

 using namespace llvm;

 #define DEBUG_TYPE "aarch64-srlt-define-superregs"
 #define PASS_NAME "AArch64 SRLT Define Super-Regs Pass"

 namespace {

 // Machine function pass that, when subregister liveness tracking is enabled,
 // appends implicit-defs of the widest suitable super-register to instructions
 // that explicitly define a narrower register.
 struct AArch64SRLTDefineSuperRegs : MachineFunctionPass {
   // ID is handed to the MachineFunctionPass base constructor.
   inline static char ID = 0;

   AArch64SRLTDefineSuperRegs() : MachineFunctionPass(ID) {}

   StringRef getPassName() const override { return PASS_NAME; }

   // Reports the CFG, MachineLoopInfo and MachineDominators as preserved before
   // deferring to the base class.
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addPreservedID(MachineLoopInfoID);
     AU.addPreservedID(MachineDominatorsID);
     MachineFunctionPass::getAnalysisUsage(AU);
   }

   // Pass entry point; the return value says whether MF was modified.
   bool runOnMachineFunction(MachineFunction &MF) override;

   // Returns the widest suitable super-register of R, or NoRegister if there
   // is none. The two bit vectors are sets of register units indexed by unit
   // number.
   Register getWidestSuperReg(Register R, const BitVector &RequiredBaseRegUnits,
                              const BitVector &QHiRegUnits);

 private:
   // Per-function state, assigned at the start of runOnMachineFunction.
   MachineFunction *MF = nullptr;
   const AArch64Subtarget *Subtarget = nullptr;
   const AArch64RegisterInfo *TRI = nullptr;
 };

 } // end anonymous namespace

 INITIALIZE_PASS(AArch64SRLTDefineSuperRegs, DEBUG_TYPE, PASS_NAME, false, false)

 // Looks up the widest super-register of R that is considered suitable, and
 // returns NoRegister when none of R's super-registers qualifies. For example:
 //  W0    -> X0
 //  B1    -> Q1 (without SVE)
 //        -> Z1 (with SVE)
 //  W1_W2 -> X1_X2
 //  D0_D1 -> Q0_Q1 (without SVE)
 //        -> Z0_Z1 (with SVE)
 //
 // RequiredBaseRegUnits is the set of reg units that every non-artificial unit
 // of a candidate must belong to. QHiRegUnits is the set of reg units of the
 // Q*_HI registers, which are only acceptable when SVE is available.
 Register AArch64SRLTDefineSuperRegs::getWidestSuperReg(
     Register R, const BitVector &RequiredBaseRegUnits,
     const BitVector &QHiRegUnits) {
   assert(R.isPhysical() &&
          "Expected to be run straight after virtregrewriter!");

   // Reg units covered by R itself.
   BitVector OwnUnits(TRI->getNumRegUnits());
   for (MCRegUnit Unit : TRI->regunits(R))
     OwnUnits.set(static_cast<unsigned>(Unit));

   const bool HasSVE = Subtarget->isSVEorStreamingSVEAvailable();

   auto AcceptsCandidate = [&](Register Candidate) {
     for (MCRegUnit Unit : TRI->regunits(Candidate)) {
       const unsigned Idx = static_cast<unsigned>(Unit);

       // Without SVE, z1 must not be chosen as super-reg of d1. Q*_HI units
       // only occur in SVE registers, which consist of the Q* register for
       // the low 128 bits and the (artificial) Q*_HI register for the top
       // (vscale-1) * 128 bits.
       if (!HasSVE && QHiRegUnits.test(Idx))
         return false;

       // Artificial units never disqualify a candidate.
       if (TRI->isArtificialRegUnit(Unit))
         continue;

       // A real unit that R does not cover (or that is not a required base
       // unit) implies that it is a unit of a different register, so the
       // candidate is likely a register tuple.
       if (!(OwnUnits.test(Idx) && RequiredBaseRegUnits.test(Idx)))
         return false;
     }
     return true;
   };

   Register Widest = AArch64::NoRegister;
   for (Register Candidate : TRI->superregs(R)) {
     if (!AcceptsCandidate(Candidate))
       continue;
     // Take the first suitable candidate, then only replace it with one that
     // is a super-register of the current choice.
     const bool IsWider = Widest == AArch64::NoRegister ||
                          TRI->isSuperRegister(Widest, Candidate);
     if (IsWider)
       Widest = Candidate;
   }

   return Widest;
 }

 // Walks every instruction of MF and, for each explicit register def, appends
 // an implicit-def of the widest suitable super-register (e.g. a def of w0
 // also gets an implicit-def of x0). Does nothing unless subregister liveness
 // tracking is enabled. Returns true if any instruction was modified.
 bool AArch64SRLTDefineSuperRegs::runOnMachineFunction(MachineFunction &MF) {
   this->MF = &MF;
   Subtarget = &MF.getSubtarget<AArch64Subtarget>();
   TRI = Subtarget->getRegisterInfo();

   const MachineRegisterInfo &RegInfo = MF.getRegInfo();
   if (!RegInfo.subRegLivenessEnabled())
     return false;

   assert(!RegInfo.isSSA() &&
          "Expected to be run after breaking down SSA form!");

   // Marks all reg units of Reg in Set.
   auto MarkUnits = [this](BitVector &Set, Register Reg) {
     for (MCRegUnit Unit : TRI->regunits(Reg))
       Set.set(static_cast<unsigned>(Unit));
   };

   const unsigned NumUnits = TRI->getNumRegUnits();

   // Reg units of X0-X28, Z0-Z31, FP, LR and SP.
   constexpr unsigned FixedRegs[] = {AArch64::FP, AArch64::LR, AArch64::SP};
   BitVector RequiredBaseRegUnits(NumUnits);
   for (unsigned Reg : seq_inclusive<unsigned>(AArch64::X0, AArch64::X28))
     MarkUnits(RequiredBaseRegUnits, Reg);
   for (unsigned Reg : seq_inclusive<unsigned>(AArch64::Z0, AArch64::Z31))
     MarkUnits(RequiredBaseRegUnits, Reg);
   for (unsigned Reg : FixedRegs)
     MarkUnits(RequiredBaseRegUnits, Reg);

   // Reg units of Q0_HI-Q31_HI.
   BitVector QHiRegUnits(NumUnits);
   for (unsigned Reg : seq_inclusive<unsigned>(AArch64::Q0_HI, AArch64::Q31_HI))
     MarkUnits(QHiRegUnits, Reg);

   bool Changed = false;
   for (MachineBasicBlock &MBB : MF) {
     for (MachineInstr &MI : MBB) {
       // PATCHPOINT may have a 'def' that's not a register, avoid this.
       if (MI.getOpcode() == TargetOpcode::PATCHPOINT)
         continue;

       // Collect the widest super-register of every explicit def; the set
       // removes duplicates.
       SmallSet<Register, 8> SuperRegs;
       for (const MachineOperand &DefOp : MI.defs()) {
         Register Widest = getWidestSuperReg(
             DefOp.getReg(), RequiredBaseRegUnits, QHiRegUnits);
         if (Widest != AArch64::NoRegister)
           SuperRegs.insert(Widest);
       }

       if (SuperRegs.empty())
         continue;

       LLVM_DEBUG(dbgs() << "Adding implicit-defs to: " << MI);
       for (Register SuperReg : SuperRegs) {
         LLVM_DEBUG(dbgs() << "  " << printReg(SuperReg, TRI) << "\n");

         // The new operand is renamable/dead if any explicit def overlapping
         // the super-register is renamable/dead.
         bool IsRenamable = false;
         bool IsDead = false;
         for (const MachineOperand &MO : MI.defs()) {
           if (!TRI->regsOverlap(MO.getReg(), SuperReg))
             continue;
           IsRenamable |= MO.isRenamable();
           IsDead |= MO.isDead();
         }

         MachineOperand ImplicitDef = MachineOperand::CreateReg(
             SuperReg, /*isDef=*/true, /*isImp=*/true, /*isKill=*/false,
             /*isDead=*/IsDead, /*isUndef=*/false, /*isEarlyClobber=*/false,
             /*SubReg=*/0, /*isDebug=*/false, /*isInternalRead=*/false,
             /*isRenamable=*/IsRenamable);
         MI.addOperand(ImplicitDef);
       }
       Changed = true;
     }
   }

   return Changed;
 }

 // Factory for the pass: allocates a new AArch64SRLTDefineSuperRegs instance
 // and returns it through the generic FunctionPass interface.
 // NOTE(review): ownership presumably transfers to the caller (typically the
 // pass manager); not visible from this file.
 FunctionPass *llvm::createAArch64SRLTDefineSuperRegsPass() {
   return new AArch64SRLTDefineSuperRegs();
 }
	//===- AArch64SRLTDefineSuperRegs.cpp -------------------------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// When SubRegister Liveness Tracking (SRLT) is enabled, this pass adds
	// extra implicit-def's to instructions that define the low N bits of
	// a GPR/FPR register to also define the top bits, because all AArch64
	// instructions that write the low bits of a GPR/FPR also implicitly zero
	// the top bits. For example, 'mov w0, w1' writes zeroes to the top 32-bits of
	// x0, so this pass adds a `implicit-def $x0` after register allocation.
	//
	// These semantics are originally represented in the MIR using `SUBREG_TO_REG`
	// which expresses that the top bits have been defined by the preceding
	// instructions, but during register coalescing this information is lost and in
	// contrast to when SRLT is disabled, when rewriting virtual -> physical
	// registers the implicit-defs are not added to the instruction.
	//
	// There have been several attempts to fix this in the coalescer [1], but each
	// iteration has exposed new bugs and the patch had to be reverted.
	// Additionally, the concept of adding 'implicit-def' of a virtual register is
	// particularly fragile and many places don't expect it (for example in
	// `X86::commuteInstructionImpl` the code only looks at specific operands and
	// does not consider implicit-defs. Similar in `SplitEditor::addDeadDef` where
	// it traverses operand 'defs' rather than 'all_defs').
	//
	// We want a temporary solution that doesn't impact other targets and is simpler
	// and less intrusive than the patch proposed for the register coalescer [1], so
	// that we can enable SRLT for AArch64.
	//
	// The approach here is to just add the 'implicit-def' manually after rewriting
	// virtual regs -> physical regs. This still means that during the register
	// allocation process the dependences are not accurately represented in the MIR
	// and LiveIntervals, but there are several reasons why we believe this isn't a
	// problem in practice:
	// (A) The register allocator only spills entire virtual registers.
	// This is additionally guarded by code in
	// AArch64InstrInfo::storeRegToStackSlot/loadRegFromStackSlot
	// where it checks if a register matches the expected register class.
	// (B) Rematerialization only happens when the instruction writes the full
	// register.
	// (C) The high bits of the AArch64 register cannot be written independently.
	// (D) Instructions that write only part of a register always take that same
	// register as a tied input operand, to indicate it's a merging operation.
	//
	// (A) means that for two virtual registers of regclass GPR32 and GPR64, if the
	// GPR32 register is coalesced into the GPR64 vreg then the full GPR64 would
	// be spilled/filled even if only the low 32-bits would be required for the
	// given liverange. (B) means that the top bits of a GPR64 would never be
	// overwritten by rematerialising a GPR32 sub-register for a given liverange.
	// (C-D) means that we can assume that the MIR as input to the register
	// allocator correctly expresses the instruction behaviour and dependences
	// between values, so unless the register allocator would violate (A) or (B),
	// the MIR is otherwise sound.
	//
	// Alternative approaches have also been considered, such as:
	// (1) Changing the AArch64 instruction definitions to write all bits and
	// extract the low N bits for the result.
	// (2) Disabling coalescing of SUBREG_TO_REG and using regalloc hints to tell
	// the register allocator to favour the same register for the input/output.
	// (3) Adding a new coalescer guard node with a tied-operand constraint, such
	// that when the SUBREG_TO_REG is removed, something still represents that
	// the top bits are defined. The node would get removed before rewriting
	// virtregs.
	// (4) Using an explicit INSERT_SUBREG into a zero value and try to optimize
	// away the INSERT_SUBREG (this is a more explicit variant of (2) and (3))
	// (5) Adding a new MachineOperand flag that represents the top bits would be
	// defined, but are not read nor undef.
	//
	// (1) would be the best approach but would be a significant effort as it
	// requires rewriting most/all instruction definitions and fixing MIR passes
	// that rely on the current definitions, whereas (2-4) result in sub-optimal
	// code that can't really be avoided because the explicit nodes would stop
	// rematerialization. (5) might be a way to mitigate the
	// fragility of implicit-def's of virtual registers if we want to pursue
	// landing [1], but then we'd rather choose approach (1) to avoid using
	// SUBREG_TO_REG entirely.
	//
	// [1] https://github.com/llvm/llvm-project/pull/168353
	//===----------------------------------------------------------------------===//

	#include "AArch64InstrInfo.h"
	#include "AArch64MachineFunctionInfo.h"
	#include "AArch64Subtarget.h"
	#include "MCTargetDesc/AArch64AddressingModes.h"
	#include "llvm/ADT/BitVector.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineFunctionPass.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/Support/Debug.h"

	using namespace llvm;

	#define DEBUG_TYPE "aarch64-srlt-define-superregs"
	#define PASS_NAME "AArch64 SRLT Define Super-Regs Pass"

	namespace {

	// Machine function pass that, when subregister liveness tracking is enabled,
	// appends implicit-defs of the widest suitable super-register to instructions
	// that explicitly define a narrower register.
	struct AArch64SRLTDefineSuperRegs : MachineFunctionPass {
	  // ID is handed to the MachineFunctionPass base constructor.
	  inline static char ID = 0;

	  AArch64SRLTDefineSuperRegs() : MachineFunctionPass(ID) {}

	  StringRef getPassName() const override { return PASS_NAME; }

	  // Reports the CFG, MachineLoopInfo and MachineDominators as preserved
	  // before deferring to the base class.
	  void getAnalysisUsage(AnalysisUsage &AU) const override {
	    AU.setPreservesCFG();
	    AU.addPreservedID(MachineLoopInfoID);
	    AU.addPreservedID(MachineDominatorsID);
	    MachineFunctionPass::getAnalysisUsage(AU);
	  }

	  // Pass entry point; the return value says whether MF was modified.
	  bool runOnMachineFunction(MachineFunction &MF) override;

	  // Returns the widest suitable super-register of R, or NoRegister if there
	  // is none. The two bit vectors are sets of register units indexed by unit
	  // number.
	  Register getWidestSuperReg(Register R, const BitVector &RequiredBaseRegUnits,
	                             const BitVector &QHiRegUnits);

	private:
	  // Per-function state, assigned at the start of runOnMachineFunction.
	  MachineFunction *MF = nullptr;
	  const AArch64Subtarget *Subtarget = nullptr;
	  const AArch64RegisterInfo *TRI = nullptr;
	};

	} // end anonymous namespace

	INITIALIZE_PASS(AArch64SRLTDefineSuperRegs, DEBUG_TYPE, PASS_NAME, false, false)

	// Returns the widest super-reg for a given reg, or NoRegister if no suitable
	// wider super-reg has been found. For example:
	//  W0    -> X0
	//  B1    -> Q1 (without SVE)
	//        -> Z1 (with SVE)
	//  W1_W2 -> X1_X2
	//  D0_D1 -> Q0_Q1 (without SVE)
	//        -> Z0_Z1 (with SVE)
	//
	// RequiredBaseRegUnits is the set of reg units that every non-artificial unit
	// of a candidate super-reg must belong to. QHiRegUnits is the set of reg
	// units of the Q*_HI registers, which are rejected when SVE is unavailable.
	Register AArch64SRLTDefineSuperRegs::getWidestSuperReg(
	    Register R, const BitVector &RequiredBaseRegUnits,
	    const BitVector &QHiRegUnits) {
	  assert(R.isPhysical() &&
	         "Expected to be run straight after virtregrewriter!");

	  // Reg units covered by R itself.
	  BitVector Units(TRI->getNumRegUnits());
	  for (MCRegUnit U : TRI->regunits(R))
	    Units.set((unsigned)U);

	  auto IsSuitableSuperReg = [&](Register SR) {
	    for (MCRegUnit U : TRI->regunits(SR)) {
	      // Avoid choosing z1 as super-reg of d1 if SVE is not available.
	      // Q*_HI registers are only set for SVE registers, as those consist
	      // of the Q* register for the low 128 bits and the Q*_HI (artificial)
	      // register for the top (vscale-1) * 128 bits.
	      if (QHiRegUnits.test((unsigned)U) &&
	          !Subtarget->isSVEorStreamingSVEAvailable())
	        return false;
	      // We consider a super-reg as unsuitable if any of its reg units is not
	      // artificial and not shared, as that would imply that U is a unit for a
	      // different register, which means the candidate super-reg is likely
	      // a register tuple.
	      if (!TRI->isArtificialRegUnit(U) &&
	          (!Units.test((unsigned)U) || !RequiredBaseRegUnits.test((unsigned)U)))
	        return false;
	    }
	    return true;
	  };

	  // Keep the first suitable candidate, then only replace it with a candidate
	  // that is a super-register of the current choice.
	  Register LargestSuperReg = AArch64::NoRegister;
	  for (Register SR : TRI->superregs(R))
	    if (IsSuitableSuperReg(SR) && (LargestSuperReg == AArch64::NoRegister ||
	                                   TRI->isSuperRegister(LargestSuperReg, SR)))
	      LargestSuperReg = SR;

	  return LargestSuperReg;
	}

	// Walks every instruction of MF and, for each explicit register def, appends
	// an implicit-def of the widest suitable super-register (e.g. a def of w0
	// also gets an implicit-def of x0). Does nothing unless subregister liveness
	// tracking is enabled. Returns true if any instruction was modified.
	bool AArch64SRLTDefineSuperRegs::runOnMachineFunction(MachineFunction &MF) {
	  this->MF = &MF;
	  Subtarget = &MF.getSubtarget<AArch64Subtarget>();
	  TRI = Subtarget->getRegisterInfo();
	  const MachineRegisterInfo *MRI = &MF.getRegInfo();

	  if (!MRI->subRegLivenessEnabled())
	    return false;

	  assert(!MRI->isSSA() && "Expected to be run after breaking down SSA form!");

	  auto XRegs = seq_inclusive<unsigned>(AArch64::X0, AArch64::X28);
	  auto ZRegs = seq_inclusive<unsigned>(AArch64::Z0, AArch64::Z31);
	  constexpr unsigned FixedRegs[] = {AArch64::FP, AArch64::LR, AArch64::SP};

	  // Reg units of X0-X28, Z0-Z31, FP, LR and SP.
	  BitVector RequiredBaseRegUnits(TRI->getNumRegUnits());
	  for (Register R : concat<unsigned>(XRegs, ZRegs, FixedRegs))
	    for (MCRegUnit U : TRI->regunits(R))
	      RequiredBaseRegUnits.set((unsigned)U);

	  // Reg units of Q0_HI-Q31_HI.
	  BitVector QHiRegUnits(TRI->getNumRegUnits());
	  for (Register R : seq_inclusive<unsigned>(AArch64::Q0_HI, AArch64::Q31_HI))
	    for (MCRegUnit U : TRI->regunits(R))
	      QHiRegUnits.set((unsigned)U);

	  bool Changed = false;
	  for (MachineBasicBlock &MBB : MF) {
	    for (MachineInstr &MI : MBB) {
	      // PATCHPOINT may have a 'def' that's not a register, avoid this.
	      if (MI.getOpcode() == TargetOpcode::PATCHPOINT)
	        continue;
	      // For each partial register write, also add an implicit-def for top bits
	      // of the register (e.g. for w0 add a def of x0).
	      SmallSet<Register, 8> SuperRegs;
	      for (const MachineOperand &DefOp : MI.defs())
	        if (Register R = getWidestSuperReg(DefOp.getReg(), RequiredBaseRegUnits,
	                                           QHiRegUnits);
	            R != AArch64::NoRegister)
	          SuperRegs.insert(R);

	      if (!SuperRegs.size())
	        continue;

	      LLVM_DEBUG(dbgs() << "Adding implicit-defs to: " << MI);
	      for (Register R : SuperRegs) {
	        LLVM_DEBUG(dbgs() << " " << printReg(R, TRI) << "\n");
	        // The new operand is renamable/dead if any explicit def overlapping
	        // the super-register is renamable/dead.
	        bool IsRenamable = any_of(MI.defs(), [&](const MachineOperand &MO) {
	          return MO.isRenamable() && TRI->regsOverlap(MO.getReg(), R);
	        });
	        bool IsDead = any_of(MI.defs(), [&](const MachineOperand &MO) {
	          return MO.isDead() && TRI->regsOverlap(MO.getReg(), R);
	        });
	        MachineOperand DefOp = MachineOperand::CreateReg(
	            R, /*isDef=*/true, /*isImp=*/true, /*isKill=*/false,
	            /*isDead=*/IsDead, /*isUndef=*/false, /*isEarlyClobber=*/false,
	            /*SubReg=*/0, /*isDebug=*/false, /*isInternalRead=*/false,
	            /*isRenamable=*/IsRenamable);
	        MI.addOperand(DefOp);
	      }
	      Changed = true;
	    }
	  }

	  return Changed;
	}

	// Factory for the pass: allocates a new AArch64SRLTDefineSuperRegs instance
	// and returns it through the generic FunctionPass interface.
	// NOTE(review): ownership presumably transfers to the caller (typically the
	// pass manager); not visible from this file.
	FunctionPass *llvm::createAArch64SRLTDefineSuperRegsPass() {
	return new AArch64SRLTDefineSuperRegs();
	}