llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp - llvm-project.git - Git at Google

 //===- AMDGPULowerVGPREncoding.cpp - lower VGPRs above v255 ---------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 /// \file
 /// Lower VGPRs above first 256 on gfx1250.
 ///
 /// The pass scans used VGPRs and inserts S_SET_VGPR_MSB instructions to switch
 /// VGPR addressing mode. The mode change is effective until the next change.
 /// This instruction provides high bits of a VGPR address for four of the
 /// operands: vdst, src0, src1, and src2, or other 4 operands depending on the
 /// instruction encoding. If bits are set they are added as MSB to the
 /// corresponding operand VGPR number.
 ///
 /// There is no need to replace actual register operands because encoding of the
 /// high and low VGPRs is the same. I.e. v0 has the encoding 0x100, so does
 /// v256. v1 has the encoding 0x101 and v257 has the same encoding. So high
 /// VGPRs will survive until actual encoding and will result in a same actual
 /// bit encoding.
 ///
 /// As a result the pass only inserts S_SET_VGPR_MSB to provide an actual offset
 /// to a VGPR address of the subsequent instructions. The InstPrinter will take
 /// care of printing a low VGPR instead of a high one. In principle it would
 /// be viable to print actual high VGPR numbers, but that would disagree
 /// with the disassembler output and create a situation where asm text is not
 /// deterministic.
 ///
 /// This pass creates a convention where non-fall through basic blocks shall
 /// start with all 4 MSBs zero. Otherwise a disassembly would not be readable.
 /// An optimization here is possible but deemed not desirable because of
 /// readability concerns.
 ///
 /// Consequently, the ABI is set to expect all 4 MSBs to be zero on entry.
 /// The pass must run very late in the pipeline to make sure no changes to VGPR
 /// operands will be made after it.
 //
 //===----------------------------------------------------------------------===//

 #include "AMDGPULowerVGPREncoding.h"
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIDefines.h"
 #include "SIInstrInfo.h"
 #include "llvm/ADT/PackedVector.h"
 #include "llvm/ADT/bit.h"
 #include "llvm/Support/MathExtras.h"

 using namespace llvm;

 #define DEBUG_TYPE "amdgpu-lower-vgpr-encoding"

 namespace {

 /// Worker shared by the legacy and new pass manager entry points. It walks
 /// every instruction of a function, tracks the VGPR MSB mode currently in
 /// effect, and inserts or updates mode-setting instructions where operands
 /// require different MSBs.
 class AMDGPULowerVGPREncoding {
   /// Number of operand slots per instruction that carry MSBs; this is the
   /// length of the operand-name tables handed to computeMode().
   static constexpr unsigned OpNum = 4;
   /// Width in bits of a single MSB field.
   static constexpr unsigned BitsPerField = 2;
   /// Number of MSB fields packed into one mode value.
   static constexpr unsigned NumFields = 4;
   /// All-ones value of a single field.
   static constexpr unsigned FieldMask = (1 << BitsPerField) - 1;
   /// Total width in bits of a packed mode value.
   static constexpr unsigned ModeWidth = NumFields * BitsPerField;
   /// All-ones value of a whole packed mode.
   static constexpr unsigned ModeMask = (1 << ModeWidth) - 1;
   /// Packed storage: NumFields fields of BitsPerField bits in one bitset.
   using ModeType = PackedVector<unsigned, BitsPerField,
                                 std::bitset<BitsPerField * NumFields>>;

   /// Bit position of the VGPR MSB field inside the S_SETREG_IMM32_B32
   /// immediate, derived from the lowest set bit of DST_VGPR_MSB.
   static constexpr unsigned VGPRMSBShift =
       llvm::countr_zero_constexpr<unsigned>(AMDGPU::Hwreg::DST_VGPR_MSB);

   /// A packed mode (or a mask over mode bits) that converts implicitly to
   /// its raw integer value.
   class ModeTy : public ModeType {
   public:
     // bitset constructor will set all bits to zero
     ModeTy() : ModeType(0) {}

     /// Raw packed bits as an integer, usable directly as an immediate.
     operator int64_t() const { return raw_bits().to_ulong(); }

     /// \returns a value with every bit of every field set.
     static ModeTy fullMask() {
       ModeTy M;
       M.raw_bits().flip();
       return M;
     }
   };

 public:
   /// Process \p MF. \returns true if any instruction was inserted or changed.
   bool run(MachineFunction &MF);

 private:
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;

   // Current basic block.
   MachineBasicBlock *MBB;

   /// Most recent s_set_* instruction that may still be updated in place, or
   /// null if there is none in the current block.
   MachineInstr *MostRecentModeSet;

   /// Current mode bits.
   ModeTy CurrentMode;

   /// Current mask of mode bits that instructions since MostRecentModeSet care
   /// about.
   ModeTy CurrentMask;

   /// Number of current hard clause instructions.
   unsigned ClauseLen;

   /// Number of hard clause instructions remaining.
   unsigned ClauseRemaining;

   /// Clause group breaks, taken from bits [11:8] of the S_CLAUSE immediate.
   unsigned ClauseBreaks;

   /// The most recently seen S_CLAUSE instruction. Only meaningful while
   /// ClauseRemaining is non-zero.
   MachineInstr *Clause;

   /// Insert mode change before \p I. \returns true if mode was changed.
   bool setMode(ModeTy NewMode, ModeTy Mask,
                MachineBasicBlock::instr_iterator I);

   /// Reset mode to default (all fields zero, all fields significant).
   void resetMode(MachineBasicBlock::instr_iterator I) {
     setMode(ModeTy(), ModeTy::fullMask(), I);
   }

   /// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt.
   std::optional<unsigned> getMSBs(const MachineOperand &MO) const;

   /// Handle single \p MI. \returns true if changed.
   bool runOnMachineInstr(MachineInstr &MI);

   /// Compute the mode and mode mask for a single \p MI given \p Ops operands
   /// bit mapping. Optionally takes second array \p Ops2 for VOPD.
   /// If provided and an operand from \p Ops is not a VGPR, then \p Ops2
   /// is checked.
   void computeMode(ModeTy &NewMode, ModeTy &Mask, MachineInstr &MI,
                    const AMDGPU::OpName Ops[OpNum],
                    const AMDGPU::OpName *Ops2 = nullptr);

   /// Check if an instruction \p I is within a clause and returns a suitable
   /// iterator to insert mode change. It may also modify the S_CLAUSE
   /// instruction to extend it or drop the clause if it cannot be adjusted.
   MachineBasicBlock::instr_iterator
   handleClause(MachineBasicBlock::instr_iterator I);

   /// Check if an instruction \p I is immediately after another program state
   /// instruction which it cannot coissue with. If so, insert before that
   /// instruction to encourage more coissuing.
   MachineBasicBlock::instr_iterator
   handleCoissue(MachineBasicBlock::instr_iterator I);

   /// Handle S_SETREG_IMM32_B32 targeting MODE register. On certain hardware,
   /// this instruction clobbers VGPR MSB bits[12:19], so we need to restore
   /// the current mode. \returns true if the instruction was modified or a
   /// new one was inserted.
   bool handleSetregMode(MachineInstr &MI);

   /// Update bits[12:19] of the imm operand in S_SETREG_IMM32_B32 to contain
   /// the VGPR MSB mode value. \returns true if the immediate was changed.
   bool updateSetregModeImm(MachineInstr &MI, int64_t ModeValue);
 };

 /// Make the fields of \p NewMode selected by \p Mask effective for the
 /// instruction at \p I.
 ///
 /// Three outcomes are possible:
 ///  - the selected fields already match CurrentMode: only CurrentMask grows;
 ///  - the differing fields are not yet relied upon since MostRecentModeSet:
 ///    that instruction's immediate is rewritten in place;
 ///  - otherwise a new S_SET_VGPR_MSB is inserted.
 ///
 /// \returns true if an instruction was inserted or rewritten.
 bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask,
                                       MachineBasicBlock::instr_iterator I) {
   // NewMode must not carry bits outside of the fields it claims to set.
   assert((NewMode.raw_bits() & ~Mask.raw_bits()).none());

   const auto Diff = NewMode.raw_bits() ^ CurrentMode.raw_bits();

   // Every requested field already has the right value; just remember that
   // these fields are now in use.
   if (!(Diff & Mask.raw_bits()).any()) {
     CurrentMask |= Mask;
     return false;
   }

   const bool CanPiggyback =
       MostRecentModeSet && (Diff & CurrentMask.raw_bits()).none();

   if (!CanPiggyback) {
     // Record previous mode into high 8 bits of the immediate.
     const int64_t PrevModeBits = static_cast<int64_t>(CurrentMode)
                                  << ModeWidth;

     I = handleClause(I);
     I = handleCoissue(I);
     MostRecentModeSet =
         BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
             .addImm(static_cast<int64_t>(NewMode) | PrevModeBits);

     CurrentMode = NewMode;
     CurrentMask = Mask;
     return true;
   }

   // Nothing executed since MostRecentModeSet depends on the fields that
   // differ, so fold the request into that instruction.
   CurrentMode |= NewMode;
   CurrentMask |= Mask;

   // MostRecentModeSet can be either S_SET_VGPR_MSB or S_SETREG_IMM32_B32
   // (with Size <= 12).
   if (MostRecentModeSet->getOpcode() != AMDGPU::S_SET_VGPR_MSB) {
     assert(MostRecentModeSet->getOpcode() == AMDGPU::S_SETREG_IMM32_B32 &&
            "unexpected MostRecentModeSet opcode");
     updateSetregModeImm(*MostRecentModeSet, CurrentMode);
     return true;
   }

   // Keep the "previous mode" byte the existing instruction already records.
   MachineOperand &ImmOp = MostRecentModeSet->getOperand(0);
   const int64_t KeptPrevBits = ImmOp.getImm() & (ModeMask << ModeWidth);
   ImmOp.setImm(static_cast<int64_t>(CurrentMode) | KeptPrevBits);
   return true;
 }

 std::optional<unsigned>
 AMDGPULowerVGPREncoding::getMSBs(const MachineOperand &MO) const {
   if (!MO.isReg())
     return std::nullopt;

   MCRegister Reg = MO.getReg();
   const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
   if (!RC || !TRI->isVGPRClass(RC))
     return std::nullopt;

   unsigned Idx = TRI->getHWRegIndex(Reg);
   return Idx >> 8;
 }

 /// Fill \p NewMode and \p Mask for \p MI.
 ///
 /// For each of the OpNum slots the named operand from \p Ops is looked up; if
 /// it is not a VGPR and \p Ops2 is given, the operand named by \p Ops2 for
 /// the same slot is tried instead. A slot that resolves to a VGPR gets its
 /// MSBs written to \p NewMode and its field fully set in \p Mask; all other
 /// slots stay zero in both.
 void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, ModeTy &Mask,
                                           MachineInstr &MI,
                                           const AMDGPU::OpName Ops[OpNum],
                                           const AMDGPU::OpName *Ops2) {
   NewMode = {};
   Mask = {};

   // True for VOP2, and for VOP3 instructions that have a 32-bit VALU
   // encoding. Evaluated lazily, only for tied src2 uses.
   const auto TiedSrc2FollowsVDst = [&]() {
     return SIInstrInfo::isVOP2(MI) ||
            (SIInstrInfo::isVOP3(MI) &&
             TII->hasVALU32BitEncoding(MI.getOpcode()));
   };

   for (unsigned Slot = 0; Slot != OpNum; ++Slot) {
     MachineOperand *Op = TII->getNamedOperand(MI, Ops[Slot]);

     std::optional<unsigned> MSBits;
     if (Op)
       MSBits = getMSBs(*Op);

 #if !defined(NDEBUG)
     // When both tables name a VGPR for this slot, they share one mode field
     // and therefore must agree on the MSBs.
     if (MSBits && Ops2) {
       if (const MachineOperand *Other =
               TII->getNamedOperand(MI, Ops2[Slot])) {
         const std::optional<unsigned> OtherMSBits = getMSBs(*Other);
         if (OtherMSBits && MSBits != OtherMSBits)
           llvm_unreachable("Invalid VOPD pair was created");
       }
     }
 #endif

     // Fall back to the second table if the first gave no VGPR.
     if (!MSBits && Ops2) {
       Op = TII->getNamedOperand(MI, Ops2[Slot]);
       if (Op)
         MSBits = getMSBs(*Op);
     }

     if (!MSBits)
       continue;

     // Skip tied src2 uses for the encodings selected above: they are handled
     // along with the def and only the vdst field affects them. Other tied
     // uses are real uses even though they must match vdst.
     const bool IsTiedSrc2Use = Ops[Slot] == AMDGPU::OpName::src2 &&
                                !Op->isDef() && Op->isTied();
     if (IsTiedSrc2Use && TiedSrc2FollowsVDst())
       continue;

     NewMode[Slot] = *MSBits;
     Mask[Slot] = FieldMask;
   }
 }

 /// Compute the mode \p MI needs and make it effective in front of \p MI.
 /// Instructions without VGPR lowering operand tables are left alone.
 /// \returns true if a mode-setting instruction was inserted or rewritten.
 bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
   const auto [PrimaryOps, SecondaryOps] =
       AMDGPU::getVGPRLoweringOperandTables(MI.getDesc());

   if (!PrimaryOps) {
     // Without a table the instruction is expected to have no VGPR uses,
     // unless it is a meta or pseudo instruction.
     assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo());
     return false;
   }

   ModeTy RequiredMode, RequiredMask;
   computeMode(RequiredMode, RequiredMask, MI, PrimaryOps, SecondaryOps);
   return setMode(RequiredMode, RequiredMask, MI.getIterator());
 }

 /// Adjust the insertion point \p I of a mode change with respect to the hard
 /// clause currently being walked, if any.
 ///
 /// \returns the iterator before which the mode change should be inserted.
 /// May rewrite the S_CLAUSE length or erase the S_CLAUSE altogether.
 MachineBasicBlock::instr_iterator
 AMDGPULowerVGPREncoding::handleClause(MachineBasicBlock::instr_iterator I) {
   // Not inside a clause: nothing to adjust.
   if (ClauseRemaining == 0)
     return I;

   // A clause cannot start with a special instruction, so when no clause
   // instruction has been consumed yet, place the mode change in front of the
   // node preceding S_CLAUSE, which is expected to be the bundle header.
   if (ClauseRemaining == ClauseLen) {
     MachineBasicBlock::instr_iterator BeforeClause =
         Clause->getPrevNode()->getIterator();
     assert(BeforeClause->isBundle());
     return BeforeClause;
   }

   // If a clause defines breaks, each group cannot start with a mode change;
   // just drop the clause.
   if (ClauseBreaks != 0) {
     Clause->eraseFromBundle();
     ClauseRemaining = 0;
     return I;
   }

   // Otherwise grow the clause by one instruction if it still fits; if it does
   // not, the clause simply ends up shorter. The immediate stores the length
   // minus one, hence the encoded value is written before the increment.
   // Note that SIMM16[5:0] must be 1-62, not 0 or 63.
   if (ClauseLen < 63)
     Clause->getOperand(0).setImm(ClauseLen | (ClauseBreaks << 8));
   ++ClauseLen;

   return I;
 }

 /// Choose where to insert a mode change that has to take effect before \p I.
 ///
 /// If \p I is immediately preceded by a run of program state instructions
 /// (barriers, wait counts, or other program state SALU instructions except
 /// S_SET_VGPR_MSB itself), the returned iterator points at the first
 /// instruction of that run, so the mode change is placed in front of the
 /// whole run. Otherwise \p I is returned unchanged. The end iterator is
 /// returned as is.
 ///
 /// The walk never steps onto an instruction that is not part of the run:
 /// such an instruction may still depend on the previous mode and must keep
 /// executing before the mode change.
 MachineBasicBlock::instr_iterator
 AMDGPULowerVGPREncoding::handleCoissue(MachineBasicBlock::instr_iterator I) {
   if (I.isEnd())
     return I;

   auto isProgramStateSALU = [this](const MachineInstr &MI) {
     const unsigned Opc = MI.getOpcode();
     return TII->isBarrier(Opc) || TII->isWaitcnt(Opc) ||
            (SIInstrInfo::isProgramStateSALU(MI) &&
             Opc != AMDGPU::S_SET_VGPR_MSB);
   };

   // Move the insertion point backwards for as long as the instruction right
   // before it belongs to the run.
   const MachineBasicBlock::instr_iterator Begin =
       I->getParent()->instr_begin();
   while (I != Begin && isProgramStateSALU(*std::prev(I)))
     --I;

   return I;
 }

 /// Translate a packed mode from the S_SET_VGPR_MSB field order into the MODE
 /// register field order.
 ///   S_SET_VGPR_MSB: (src0[0-1], src1[2-3], src2[4-5], dst[6-7])
 ///   MODE register:  (dst[0-1], src0[2-3], src1[4-5], src2[6-7])
 /// Moving every field up by one position, with dst wrapping around to the
 /// bottom, is a left rotation by 2 bits of the 8-bit value.
 static int64_t convertModeToSetregFormat(int64_t Mode) {
   assert(isUInt<8>(Mode) && "Mode expected to be 8-bit");
   const auto Packed = static_cast<uint8_t>(Mode);
   const uint8_t Rotated = llvm::rotl<uint8_t>(Packed, /*R=*/2);
   return Rotated;
 }

 /// Overwrite the VGPR MSB field of the `imm` operand of the
 /// S_SETREG_IMM32_B32 instruction \p MI with \p ModeValue, which is given in
 /// S_SET_VGPR_MSB format and converted to MODE register format here. Bits
 /// outside of VGPR_MSB_MASK are preserved.
 /// \returns true if the immediate actually changed.
 bool AMDGPULowerVGPREncoding::updateSetregModeImm(MachineInstr &MI,
                                                   int64_t ModeValue) {
   assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32);

   // Field value already shifted into its position within the immediate.
   const int64_t FieldBits = convertModeToSetregFormat(ModeValue)
                             << VGPRMSBShift;

   MachineOperand *ImmOp = TII->getNamedOperand(MI, AMDGPU::OpName::imm);
   const int64_t Before = ImmOp->getImm();
   const int64_t After = (Before & ~AMDGPU::Hwreg::VGPR_MSB_MASK) | FieldBits;

   ImmOp->setImm(After);
   return After != Before;
 }

 /// Keep the VGPR MSB mode intact across the S_SETREG_IMM32_B32 instruction
 /// \p MI. Instructions that do not target the MODE register are ignored.
 ///
 /// \returns true if \p MI was modified or an S_SET_VGPR_MSB was inserted
 /// after it.
 bool AMDGPULowerVGPREncoding::handleSetregMode(MachineInstr &MI) {
   using namespace AMDGPU::Hwreg;

   assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 &&
          "only S_SETREG_IMM32_B32 needs to be handled");

   MachineOperand *SIMM16Op = TII->getNamedOperand(MI, AMDGPU::OpName::simm16);
   assert(SIMM16Op && "SIMM16Op must be present");

   auto [HwRegId, Offset, Size] = HwregEncoding::decode(SIMM16Op->getImm());
   (void)Offset;
   if (HwRegId != ID_MODE)
     return false;

   // Current mode, in S_SET_VGPR_MSB format.
   const int64_t MSBMode = static_cast<int64_t>(CurrentMode);

   // Case 1: Size <= 12. The instruction only uses imm32[0:Size-1], so
   // imm32[12:19] is free and can carry the correct VGPR MSBs. From now on
   // this instruction acts as MostRecentModeSet, so later mode changes can
   // piggyback on it.
   if (Size <= VGPRMSBShift) {
     MostRecentModeSet = &MI;
     return updateSetregModeImm(MI, MSBMode);
   }

   // Case 2: Size > 12. imm32[12:19] is meaningful to the instruction and
   // must not be rewritten. It is in MODE register format, so the current
   // mode is converted before the two are compared.
   MachineOperand *ImmOp = TII->getNamedOperand(MI, AMDGPU::OpName::imm);
   assert(ImmOp && "ImmOp must be present");
   const int64_t WrittenMSBs =
       (ImmOp->getImm() & VGPR_MSB_MASK) >> VGPRMSBShift;
   const bool ClobbersMode = WrittenMSBs != convertModeToSetregFormat(MSBMode);

   if (!ClobbersMode) {
     // The value written matches the current mode. This instruction cannot be
     // updated by piggybacking, so forget MostRecentModeSet; a later mode
     // change will insert a fresh s_set_vgpr_msb after this instruction.
     MostRecentModeSet = nullptr;
     return false;
   }

   // The instruction writes different MSBs: restore the current mode right
   // after it.
   MachineBasicBlock::iterator InsertPt = std::next(MI.getIterator());
   MostRecentModeSet = BuildMI(*MBB, InsertPt, MI.getDebugLoc(),
                               TII->get(AMDGPU::S_SET_VGPR_MSB))
                           .addImm(MSBMode);
   return true;
 }

 /// Run the lowering over \p MF. Does nothing on subtargets without 1024
 /// addressable VGPRs.
 ///
 /// Within each block the mode is tracked instruction by instruction. It is
 /// reset to zero before terminators and calls (except program end), before
 /// inline asm that uses VGPRs, and at the end of every block.
 ///
 /// \returns true if a regular instruction or an S_SETREG_IMM32_B32 caused an
 /// instruction to be inserted or modified.
 bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   if (!ST.has1024AddressableVGPRs())
     return false;

   TII = ST.getInstrInfo();
   TRI = ST.getRegisterInfo();

   ClauseLen = 0;
   ClauseRemaining = 0;
   CurrentMode.reset();
   CurrentMask.reset();

   bool Changed = false;
   for (MachineBasicBlock &Block : MF) {
     MBB = &Block;
     // Mode-setting instructions are never updated across block boundaries.
     MostRecentModeSet = nullptr;

     // Early-inc range: new instructions may be inserted around MI.
     for (MachineInstr &MI : llvm::make_early_inc_range(Block.instrs())) {
       if (MI.isMetaInstruction())
         continue;

       const unsigned Opc = MI.getOpcode();

       if (MI.isTerminator() || MI.isCall()) {
         const bool EndsProgram =
             Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED;
         if (EndsProgram)
           // No instruction is needed at program end; only forget the
           // tracked mode.
           CurrentMode.reset();
         else
           resetMode(MI.getIterator());
         continue;
       }

       if (MI.isInlineAsm()) {
         if (TII->hasVGPRUses(MI))
           resetMode(MI.getIterator());
         continue;
       }

       if (Opc == AMDGPU::S_CLAUSE) {
         assert(!ClauseRemaining && "Nested clauses are not supported");
         // The immediate holds the group breaks in bits [11:8] and the clause
         // length minus one in bits [5:0].
         const unsigned Simm16 = MI.getOperand(0).getImm();
         ClauseBreaks = (Simm16 >> 8) & 15;
         ClauseLen = (Simm16 & 63) + 1;
         ClauseRemaining = ClauseLen;
         Clause = &MI;
         continue;
       }

       if (Opc == AMDGPU::S_SETREG_IMM32_B32 && ST.hasSetregVGPRMSBFixup()) {
         Changed |= handleSetregMode(MI);
         continue;
       }

       Changed |= runOnMachineInstr(MI);

       if (ClauseRemaining)
         --ClauseRemaining;
     }

     // Reset the mode if we are falling through.
     resetMode(Block.instr_end());
   }

   return Changed;
 }

 class AMDGPULowerVGPREncodingLegacy : public MachineFunctionPass {
 public:
   static char ID;

   AMDGPULowerVGPREncodingLegacy() : MachineFunctionPass(ID) {}

   bool runOnMachineFunction(MachineFunction &MF) override {
     return AMDGPULowerVGPREncoding().run(MF);
   }

   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };

 } // namespace

 // Storage for the legacy pass identifier.
 char AMDGPULowerVGPREncodingLegacy::ID = 0;

 // Externally visible reference to the legacy pass identifier.
 char &llvm::AMDGPULowerVGPREncodingLegacyID = AMDGPULowerVGPREncodingLegacy::ID;

 // Register the legacy pass under the DEBUG_TYPE name.
 INITIALIZE_PASS(AMDGPULowerVGPREncodingLegacy, DEBUG_TYPE,
                 "AMDGPU Lower VGPR Encoding", false, false)

 /// New pass manager entry point. Reports all analyses as preserved when the
 /// function was not changed; otherwise reports the default machine function
 /// pass set plus the CFG analyses.
 PreservedAnalyses
 AMDGPULowerVGPREncodingPass::run(MachineFunction &MF,
                                  MachineFunctionAnalysisManager &MFAM) {
   const bool Changed = AMDGPULowerVGPREncoding().run(MF);
   if (Changed)
     return getMachineFunctionPassPreservedAnalyses()
         .preserveSet<CFGAnalyses>();

   return PreservedAnalyses::all();
 }
	//===- AMDGPULowerVGPREncoding.cpp - lower VGPRs above v255 ---------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	/// \file
	/// Lower VGPRs above first 256 on gfx1250.
	///
	/// The pass scans used VGPRs and inserts S_SET_VGPR_MSB instructions to switch
	/// VGPR addressing mode. The mode change is effective until the next change.
	/// This instruction provides high bits of a VGPR address for four of the
	/// operands: vdst, src0, src1, and src2, or other 4 operands depending on the
	/// instruction encoding. If bits are set they are added as MSB to the
	/// corresponding operand VGPR number.
	///
	/// There is no need to replace actual register operands because encoding of the
	/// high and low VGPRs is the same. I.e. v0 has the encoding 0x100, so does
	/// v256. v1 has the encoding 0x101 and v257 has the same encoding. So high
	/// VGPRs will survive until actual encoding and will result in a same actual
	/// bit encoding.
	///
	/// As a result the pass only inserts S_SET_VGPR_MSB to provide an actual offset
	/// to a VGPR address of the subsequent instructions. The InstPrinter will take
	/// care of printing a low VGPR instead of a high one. In principle it would
	/// be viable to print actual high VGPR numbers, but that would disagree
	/// with the disassembler output and create a situation where asm text is not
	/// deterministic.
	///
	/// This pass creates a convention where non-fall through basic blocks shall
	/// start with all 4 MSBs zero. Otherwise a disassembly would not be readable.
	/// An optimization here is possible but deemed not desirable because of
	/// readability concerns.
	///
	/// Consequently, the ABI is set to expect all 4 MSBs to be zero on entry.
	/// The pass must run very late in the pipeline to make sure no changes to VGPR
	/// operands will be made after it.
	//
	//===----------------------------------------------------------------------===//

	#include "AMDGPULowerVGPREncoding.h"
	#include "AMDGPU.h"
	#include "GCNSubtarget.h"
	#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
	#include "SIDefines.h"
	#include "SIInstrInfo.h"
	#include "llvm/ADT/PackedVector.h"
	#include "llvm/ADT/bit.h"
	#include "llvm/Support/MathExtras.h"

	using namespace llvm;

	#define DEBUG_TYPE "amdgpu-lower-vgpr-encoding"

	namespace {

	/// Worker shared by the legacy and new pass manager entry points. It walks
	/// every instruction of a function, tracks the VGPR MSB mode currently in
	/// effect, and inserts or updates mode-setting instructions where operands
	/// require different MSBs.
	class AMDGPULowerVGPREncoding {
	/// Number of operand slots per instruction that carry MSBs; this is the
	/// length of the operand-name tables handed to computeMode().
	static constexpr unsigned OpNum = 4;
	/// Width in bits of a single MSB field.
	static constexpr unsigned BitsPerField = 2;
	/// Number of MSB fields packed into one mode value.
	static constexpr unsigned NumFields = 4;
	/// All-ones value of a single field.
	static constexpr unsigned FieldMask = (1 << BitsPerField) - 1;
	/// Total width in bits of a packed mode value.
	static constexpr unsigned ModeWidth = NumFields * BitsPerField;
	/// All-ones value of a whole packed mode.
	static constexpr unsigned ModeMask = (1 << ModeWidth) - 1;
	/// Packed storage: NumFields fields of BitsPerField bits in one bitset.
	using ModeType = PackedVector<unsigned, BitsPerField,
	std::bitset<BitsPerField * NumFields>>;

	/// Bit position of the VGPR MSB field inside the S_SETREG_IMM32_B32
	/// immediate, derived from the lowest set bit of DST_VGPR_MSB.
	static constexpr unsigned VGPRMSBShift =
	llvm::countr_zero_constexpr<unsigned>(AMDGPU::Hwreg::DST_VGPR_MSB);

	/// A packed mode (or a mask over mode bits) that converts implicitly to
	/// its raw integer value.
	class ModeTy : public ModeType {
	public:
	// bitset constructor will set all bits to zero
	ModeTy() : ModeType(0) {}

	/// Raw packed bits as an integer, usable directly as an immediate.
	operator int64_t() const { return raw_bits().to_ulong(); }

	/// \returns a value with every bit of every field set.
	static ModeTy fullMask() {
	ModeTy M;
	M.raw_bits().flip();
	return M;
	}
	};

	public:
	/// Process \p MF. \returns true if any instruction was inserted or changed.
	bool run(MachineFunction &MF);

	private:
	const SIInstrInfo *TII;
	const SIRegisterInfo *TRI;

	// Current basic block.
	MachineBasicBlock *MBB;

	/// Most recent s_set_* instruction that may still be updated in place, or
	/// null if there is none in the current block.
	MachineInstr *MostRecentModeSet;

	/// Current mode bits.
	ModeTy CurrentMode;

	/// Current mask of mode bits that instructions since MostRecentModeSet care
	/// about.
	ModeTy CurrentMask;

	/// Number of current hard clause instructions.
	unsigned ClauseLen;

	/// Number of hard clause instructions remaining.
	unsigned ClauseRemaining;

	/// Clause group breaks, taken from bits [11:8] of the S_CLAUSE immediate.
	unsigned ClauseBreaks;

	/// The most recently seen S_CLAUSE instruction. Only meaningful while
	/// ClauseRemaining is non-zero.
	MachineInstr *Clause;

	/// Insert mode change before \p I. \returns true if mode was changed.
	bool setMode(ModeTy NewMode, ModeTy Mask,
	MachineBasicBlock::instr_iterator I);

	/// Reset mode to default (all fields zero, all fields significant).
	void resetMode(MachineBasicBlock::instr_iterator I) {
	setMode(ModeTy(), ModeTy::fullMask(), I);
	}

	/// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt.
	std::optional<unsigned> getMSBs(const MachineOperand &MO) const;

	/// Handle single \p MI. \returns true if changed.
	bool runOnMachineInstr(MachineInstr &MI);

	/// Compute the mode and mode mask for a single \p MI given \p Ops operands
	/// bit mapping. Optionally takes second array \p Ops2 for VOPD.
	/// If provided and an operand from \p Ops is not a VGPR, then \p Ops2
	/// is checked.
	void computeMode(ModeTy &NewMode, ModeTy &Mask, MachineInstr &MI,
	const AMDGPU::OpName Ops[OpNum],
	const AMDGPU::OpName *Ops2 = nullptr);

	/// Check if an instruction \p I is within a clause and returns a suitable
	/// iterator to insert mode change. It may also modify the S_CLAUSE
	/// instruction to extend it or drop the clause if it cannot be adjusted.
	MachineBasicBlock::instr_iterator
	handleClause(MachineBasicBlock::instr_iterator I);

	/// Check if an instruction \p I is immediately after another program state
	/// instruction which it cannot coissue with. If so, insert before that
	/// instruction to encourage more coissuing.
	MachineBasicBlock::instr_iterator
	handleCoissue(MachineBasicBlock::instr_iterator I);

	/// Handle S_SETREG_IMM32_B32 targeting MODE register. On certain hardware,
	/// this instruction clobbers VGPR MSB bits[12:19], so we need to restore
	/// the current mode. \returns true if the instruction was modified or a
	/// new one was inserted.
	bool handleSetregMode(MachineInstr &MI);

	/// Update bits[12:19] of the imm operand in S_SETREG_IMM32_B32 to contain
	/// the VGPR MSB mode value. \returns true if the immediate was changed.
	bool updateSetregModeImm(MachineInstr &MI, int64_t ModeValue);
	};

	/// Make the fields of \p NewMode selected by \p Mask effective for the
	/// instruction at \p I.
	///
	/// Three outcomes are possible:
	///  - the selected fields already match CurrentMode: only CurrentMask grows;
	///  - the differing fields are not yet relied upon since MostRecentModeSet:
	///    that instruction's immediate is rewritten in place;
	///  - otherwise a new S_SET_VGPR_MSB is inserted.
	///
	/// \returns true if an instruction was inserted or rewritten.
	bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask,
	                                      MachineBasicBlock::instr_iterator I) {
	  // NewMode must not carry bits outside of the fields it claims to set.
	  assert((NewMode.raw_bits() & ~Mask.raw_bits()).none());

	  auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits();

	  // Every requested field already has the right value; just remember that
	  // these fields are now in use.
	  if ((Delta & Mask.raw_bits()).none()) {
	    CurrentMask |= Mask;
	    return false;
	  }

	  if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) {
	    // Nothing executed since MostRecentModeSet depends on the fields that
	    // differ, so fold the request into that instruction.
	    CurrentMode |= NewMode;
	    CurrentMask |= Mask;

	    // Update MostRecentModeSet with the new mode. It can be either
	    // S_SET_VGPR_MSB or S_SETREG_IMM32_B32 (with Size <= 12).
	    if (MostRecentModeSet->getOpcode() == AMDGPU::S_SET_VGPR_MSB) {
	      MachineOperand &Op = MostRecentModeSet->getOperand(0);
	      // Carry old mode bits from the existing instruction.
	      int64_t OldModeBits = Op.getImm() & (ModeMask << ModeWidth);
	      Op.setImm(CurrentMode | OldModeBits);
	    } else {
	      assert(MostRecentModeSet->getOpcode() == AMDGPU::S_SETREG_IMM32_B32 &&
	             "unexpected MostRecentModeSet opcode");
	      updateSetregModeImm(*MostRecentModeSet, CurrentMode);
	    }

	    return true;
	  }

	  // Record previous mode into high 8 bits of the immediate.
	  int64_t OldModeBits = CurrentMode << ModeWidth;

	  I = handleClause(I);
	  I = handleCoissue(I);
	  MostRecentModeSet = BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
	                          .addImm(NewMode | OldModeBits);

	  CurrentMode = NewMode;
	  CurrentMask = Mask;
	  return true;
	}

	/// \returns the hardware register index of \p MO shifted right by 8 (the bits
	/// above the low 8 index bits) when \p MO is a register whose physical base
	/// class is a VGPR class; std::nullopt for any other operand.
	std::optional<unsigned>
	AMDGPULowerVGPREncoding::getMSBs(const MachineOperand &MO) const {
	  if (!MO.isReg())
	    return std::nullopt;

	  MCRegister Reg = MO.getReg();
	  const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
	  if (!RC || !TRI->isVGPRClass(RC))
	    return std::nullopt;

	  unsigned Idx = TRI->getHWRegIndex(Reg);
	  return Idx >> 8;
	}

	/// Fill \p NewMode and \p Mask for \p MI.
	///
	/// For each of the OpNum slots the named operand from \p Ops is looked up; if
	/// it is not a VGPR and \p Ops2 is given, the operand named by \p Ops2 for
	/// the same slot is tried instead. A slot that resolves to a VGPR gets its
	/// MSBs written to \p NewMode and its field fully set in \p Mask; all other
	/// slots stay zero in both.
	void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, ModeTy &Mask,
	                                          MachineInstr &MI,
	                                          const AMDGPU::OpName Ops[OpNum],
	                                          const AMDGPU::OpName *Ops2) {
	  NewMode = {};
	  Mask = {};

	  for (unsigned I = 0; I < OpNum; ++I) {
	    MachineOperand *Op = TII->getNamedOperand(MI, Ops[I]);

	    std::optional<unsigned> MSBits;
	    if (Op)
	      MSBits = getMSBs(*Op);

	#if !defined(NDEBUG)
	    // When both tables name a VGPR for this slot, they share one mode field
	    // and therefore must agree on the MSBs.
	    if (MSBits.has_value() && Ops2) {
	      auto Op2 = TII->getNamedOperand(MI, Ops2[I]);
	      if (Op2) {
	        std::optional<unsigned> MSBits2;
	        MSBits2 = getMSBs(*Op2);
	        if (MSBits2.has_value() && MSBits != MSBits2)
	          llvm_unreachable("Invalid VOPD pair was created");
	      }
	    }
	#endif

	    // Fall back to the second table if the first gave no VGPR.
	    if (!MSBits.has_value() && Ops2) {
	      Op = TII->getNamedOperand(MI, Ops2[I]);
	      if (Op)
	        MSBits = getMSBs(*Op);
	    }

	    if (!MSBits.has_value())
	      continue;

	    // Skip tied src2 uses of VOP2, and of VOP3 instructions that have a
	    // 32-bit VALU encoding: they are handled along with the def and only the
	    // vdst field affects them. Other tied uses are real uses even though
	    // they must match vdst.
	    if (Ops[I] == AMDGPU::OpName::src2 && !Op->isDef() && Op->isTied() &&
	        (SIInstrInfo::isVOP2(MI) ||
	         (SIInstrInfo::isVOP3(MI) &&
	          TII->hasVALU32BitEncoding(MI.getOpcode()))))
	      continue;

	    NewMode[I] = MSBits.value();
	    Mask[I] = FieldMask;
	  }
	}

	/// Compute the mode \p MI needs and make it effective in front of \p MI.
	/// Instructions without VGPR lowering operand tables are left alone.
	/// \returns true if a mode-setting instruction was inserted or rewritten.
	bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
	  auto Ops = AMDGPU::getVGPRLoweringOperandTables(MI.getDesc());
	  if (Ops.first) {
	    ModeTy NewMode, Mask;
	    computeMode(NewMode, Mask, MI, Ops.first, Ops.second);
	    return setMode(NewMode, Mask, MI.getIterator());
	  }
	  // Without a table the instruction is expected to have no VGPR uses,
	  // unless it is a meta or pseudo instruction.
	  assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo());

	  return false;
	}

	/// Adjust the insertion point \p I of a mode change with respect to the hard
	/// clause currently being walked, if any.
	///
	/// \returns the iterator before which the mode change should be inserted.
	/// May rewrite the S_CLAUSE length or erase the S_CLAUSE altogether.
	MachineBasicBlock::instr_iterator
	AMDGPULowerVGPREncoding::handleClause(MachineBasicBlock::instr_iterator I) {
	  if (!ClauseRemaining)
	    return I;

	  // A clause cannot start with a special instruction, place it right before
	  // the clause.
	  if (ClauseRemaining == ClauseLen) {
	    I = Clause->getPrevNode()->getIterator();
	    assert(I->isBundle());
	    return I;
	  }

	  // If a clause defines breaks each group cannot start with a mode change.
	  // just drop the clause.
	  if (ClauseBreaks) {
	    Clause->eraseFromBundle();
	    ClauseRemaining = 0;
	    return I;
	  }

	  // Otherwise adjust a number of instructions in the clause if it fits.
	  // If it does not clause will just become shorter. Since the length
	  // recorded in the clause is one less, increment the length after the
	  // update. Note that SIMM16[5:0] must be 1-62, not 0 or 63.
	  if (ClauseLen < 63)
	    Clause->getOperand(0).setImm(ClauseLen | (ClauseBreaks << 8));

	  ++ClauseLen;

	  return I;
	}

	/// Choose where to insert a mode change that has to take effect before \p I.
	///
	/// If \p I is immediately preceded by a run of program state instructions
	/// (barriers, wait counts, or other program state SALU instructions except
	/// S_SET_VGPR_MSB itself), the returned iterator points at the first
	/// instruction of that run, so the mode change is placed in front of the
	/// whole run. Otherwise \p I is returned unchanged. The end iterator is
	/// returned as is.
	///
	/// The walk never steps onto an instruction that is not part of the run:
	/// such an instruction may still depend on the previous mode and must keep
	/// executing before the mode change.
	MachineBasicBlock::instr_iterator
	AMDGPULowerVGPREncoding::handleCoissue(MachineBasicBlock::instr_iterator I) {
	  if (I.isEnd())
	    return I;

	  auto isProgramStateSALU = [this](const MachineInstr &MI) {
	    const unsigned Opc = MI.getOpcode();
	    return TII->isBarrier(Opc) || TII->isWaitcnt(Opc) ||
	           (SIInstrInfo::isProgramStateSALU(MI) &&
	            Opc != AMDGPU::S_SET_VGPR_MSB);
	  };

	  // Move the insertion point backwards for as long as the instruction right
	  // before it belongs to the run.
	  const MachineBasicBlock::instr_iterator Begin =
	      I->getParent()->instr_begin();
	  while (I != Begin && isProgramStateSALU(*std::prev(I)))
	    --I;

	  return I;
	}

	/// Convert mode value from S_SET_VGPR_MSB format to MODE register format.
	/// S_SET_VGPR_MSB uses: (src0[0-1], src1[2-3], src2[4-5], dst[6-7])
	/// MODE register uses:  (dst[0-1], src0[2-3], src1[4-5], src2[6-7])
	/// This is a left rotation by 2 bits on an 8-bit value.
	static int64_t convertModeToSetregFormat(int64_t Mode) {
	  assert(isUInt<8>(Mode) && "Mode expected to be 8-bit");
	  return llvm::rotl<uint8_t>(static_cast<uint8_t>(Mode), /*R=*/2);
	}

	/// Overwrite the VGPR MSB field of the `imm` operand of the
	/// S_SETREG_IMM32_B32 instruction \p MI with \p ModeValue, which is given in
	/// S_SET_VGPR_MSB format and converted to MODE register format here. Bits
	/// outside of VGPR_MSB_MASK are preserved.
	/// \returns true if the immediate actually changed.
	bool AMDGPULowerVGPREncoding::updateSetregModeImm(MachineInstr &MI,
	                                                  int64_t ModeValue) {
	  assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32);

	  // Convert from S_SET_VGPR_MSB format to MODE register format
	  int64_t SetregMode = convertModeToSetregFormat(ModeValue);

	  MachineOperand *ImmOp = TII->getNamedOperand(MI, AMDGPU::OpName::imm);
	  int64_t OldImm = ImmOp->getImm();
	  int64_t NewImm =
	      (OldImm & ~AMDGPU::Hwreg::VGPR_MSB_MASK) | (SetregMode << VGPRMSBShift);
	  ImmOp->setImm(NewImm);
	  return NewImm != OldImm;
	}

	/// Keep the VGPR MSB mode intact across the S_SETREG_IMM32_B32 instruction
	/// \p MI. Instructions that do not target the MODE register are ignored.
	///
	/// \returns true if \p MI was modified or an S_SET_VGPR_MSB was inserted
	/// after it.
	bool AMDGPULowerVGPREncoding::handleSetregMode(MachineInstr &MI) {
	  using namespace AMDGPU::Hwreg;

	  assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 &&
	         "only S_SETREG_IMM32_B32 needs to be handled");

	  MachineOperand *SIMM16Op = TII->getNamedOperand(MI, AMDGPU::OpName::simm16);
	  assert(SIMM16Op && "SIMM16Op must be present");

	  auto [HwRegId, Offset, Size] = HwregEncoding::decode(SIMM16Op->getImm());
	  (void)Offset;
	  if (HwRegId != ID_MODE)
	    return false;

	  // Current mode, in S_SET_VGPR_MSB format.
	  const int64_t MSBMode = static_cast<int64_t>(CurrentMode);

	  // Case 1: Size <= 12. The instruction only uses imm32[0:Size-1], so
	  // imm32[12:19] is free and can carry the correct VGPR MSBs. From now on
	  // this instruction acts as MostRecentModeSet, so later mode changes can
	  // piggyback on it.
	  if (Size <= VGPRMSBShift) {
	    MostRecentModeSet = &MI;
	    return updateSetregModeImm(MI, MSBMode);
	  }

	  // Case 2: Size > 12. imm32[12:19] is meaningful to the instruction and
	  // must not be rewritten. It is in MODE register format, so the current
	  // mode is converted before the two are compared.
	  MachineOperand *ImmOp = TII->getNamedOperand(MI, AMDGPU::OpName::imm);
	  assert(ImmOp && "ImmOp must be present");
	  const int64_t WrittenMSBs =
	      (ImmOp->getImm() & VGPR_MSB_MASK) >> VGPRMSBShift;
	  const bool ClobbersMode = WrittenMSBs != convertModeToSetregFormat(MSBMode);

	  if (!ClobbersMode) {
	    // The value written matches the current mode. This instruction cannot be
	    // updated by piggybacking, so forget MostRecentModeSet; a later mode
	    // change will insert a fresh s_set_vgpr_msb after this instruction.
	    MostRecentModeSet = nullptr;
	    return false;
	  }

	  // The instruction writes different MSBs: restore the current mode right
	  // after it.
	  MachineBasicBlock::iterator InsertPt = std::next(MI.getIterator());
	  MostRecentModeSet = BuildMI(*MBB, InsertPt, MI.getDebugLoc(),
	                              TII->get(AMDGPU::S_SET_VGPR_MSB))
	                          .addImm(MSBMode);
	  return true;
	}

	/// Run the lowering over \p MF. Does nothing on subtargets without 1024
	/// addressable VGPRs.
	///
	/// Within each block the mode is tracked instruction by instruction. It is
	/// reset to zero before terminators and calls (except program end), before
	/// inline asm that uses VGPRs, and at the end of every block.
	///
	/// \returns true if a regular instruction or an S_SETREG_IMM32_B32 caused an
	/// instruction to be inserted or modified.
	bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
	  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
	  if (!ST.has1024AddressableVGPRs())
	    return false;

	  TII = ST.getInstrInfo();
	  TRI = ST.getRegisterInfo();

	  bool Changed = false;
	  ClauseLen = ClauseRemaining = 0;
	  CurrentMode.reset();
	  CurrentMask.reset();
	  for (auto &MBB : MF) {
	    // Mode-setting instructions are never updated across block boundaries.
	    MostRecentModeSet = nullptr;
	    this->MBB = &MBB;

	    // Early-inc range: new instructions may be inserted around MI.
	    for (auto &MI : llvm::make_early_inc_range(MBB.instrs())) {
	      if (MI.isMetaInstruction())
	        continue;

	      if (MI.isTerminator() || MI.isCall()) {
	        // No instruction is needed at program end; only forget the tracked
	        // mode. Everywhere else the mode is reset to zero.
	        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
	            MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED)
	          CurrentMode.reset();
	        else
	          resetMode(MI.getIterator());
	        continue;
	      }

	      if (MI.isInlineAsm()) {
	        if (TII->hasVGPRUses(MI))
	          resetMode(MI.getIterator());
	        continue;
	      }

	      if (MI.getOpcode() == AMDGPU::S_CLAUSE) {
	        assert(!ClauseRemaining && "Nested clauses are not supported");
	        // The immediate holds the group breaks in bits [11:8] and the clause
	        // length minus one in bits [5:0].
	        ClauseLen = MI.getOperand(0).getImm();
	        ClauseBreaks = (ClauseLen >> 8) & 15;
	        ClauseLen = ClauseRemaining = (ClauseLen & 63) + 1;
	        Clause = &MI;
	        continue;
	      }

	      if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 &&
	          ST.hasSetregVGPRMSBFixup()) {
	        Changed |= handleSetregMode(MI);
	        continue;
	      }

	      Changed |= runOnMachineInstr(MI);

	      if (ClauseRemaining)
	        --ClauseRemaining;
	    }

	    // Reset the mode if we are falling through.
	    resetMode(MBB.instr_end());
	  }

	  return Changed;
	}

	class AMDGPULowerVGPREncodingLegacy : public MachineFunctionPass {
	public:
	static char ID;

	AMDGPULowerVGPREncodingLegacy() : MachineFunctionPass(ID) {}

	bool runOnMachineFunction(MachineFunction &MF) override {
	return AMDGPULowerVGPREncoding().run(MF);
	}

	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.setPreservesCFG();
	MachineFunctionPass::getAnalysisUsage(AU);
	}
	};

	} // namespace

	// Storage for the legacy pass identifier.
	char AMDGPULowerVGPREncodingLegacy::ID = 0;

	// Externally visible reference to the legacy pass identifier.
	char &llvm::AMDGPULowerVGPREncodingLegacyID = AMDGPULowerVGPREncodingLegacy::ID;

	// Register the legacy pass under the DEBUG_TYPE name.
	INITIALIZE_PASS(AMDGPULowerVGPREncodingLegacy, DEBUG_TYPE,
	"AMDGPU Lower VGPR Encoding", false, false)

	/// New pass manager entry point. Reports all analyses as preserved when the
	/// function was not changed; otherwise reports the default machine function
	/// pass set plus the CFG analyses.
	PreservedAnalyses
	AMDGPULowerVGPREncodingPass::run(MachineFunction &MF,
	                                 MachineFunctionAnalysisManager &MFAM) {
	  const bool Changed = AMDGPULowerVGPREncoding().run(MF);
	  if (Changed)
	    return getMachineFunctionPassPreservedAnalyses()
	        .preserveSet<CFGAnalyses>();

	  return PreservedAnalyses::all();
	}