 // llvm/lib/Target/ARM/ARMLatencyMutations.cpp - llvm-project - Git at Google

 //===- ARMLatencyMutations.cpp - ARM Latency Mutations --------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 /// \file This file contains the ARM definitions of the DAG scheduling
 /// mutations which change inter-instruction latencies.
 //
 //===----------------------------------------------------------------------===//

 #include "ARMLatencyMutations.h"
 #include "ARMSubtarget.h"
 #include "Thumb2InstrInfo.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/CodeGen/ScheduleDAGMutation.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include <algorithm>
 #include <array>
 #include <initializer_list>
 #include <memory>

 namespace llvm {

 namespace {

 // Precompute information about opcodes to speed up pass

 class InstructionInformation {
 protected:
   struct IInfo {
     bool HasBRegAddr : 1;      // B-side of addr gen is a register
     bool HasBRegAddrShift : 1; // B-side of addr gen has a shift
     bool IsDivide : 1;         // Some form of integer divide
     bool IsInlineShiftALU : 1; // Inline shift+ALU
     bool IsMultiply : 1;       // Some form of integer multiply
     bool IsMVEIntMAC : 1;      // MVE 8/16/32-bit integer MAC operation
     bool IsNonSubwordLoad : 1; // Load which is a word or larger
     bool IsShift : 1;          // Shift operation
     bool IsRev : 1;            // REV operation
     bool ProducesQP : 1;       // Produces a vector register result
     bool ProducesDP : 1;       // Produces a double-precision register result
     bool ProducesSP : 1;       // Produces a single-precision register result
     bool ConsumesQP : 1;       // Consumes a vector register result
     bool ConsumesDP : 1;       // Consumes a double-precision register result
     bool ConsumesSP : 1;       // Consumes a single-precision register result
     unsigned MVEIntMACMatched; // Matched operand type (for MVE)
     unsigned AddressOpMask;    // Mask indicating which operands go into AGU
     IInfo()
         : HasBRegAddr(false), HasBRegAddrShift(false), IsDivide(false),
           IsInlineShiftALU(false), IsMultiply(false), IsMVEIntMAC(false),
           IsNonSubwordLoad(false), IsShift(false), IsRev(false),
           ProducesQP(false), ProducesDP(false), ProducesSP(false),
           ConsumesQP(false), ConsumesDP(false), ConsumesSP(false),
           MVEIntMACMatched(0), AddressOpMask(0) {}
   };
   typedef std::array<IInfo, ARM::INSTRUCTION_LIST_END> IInfoArray;
   IInfoArray Info;

 public:
   // Always available information
   unsigned getAddressOpMask(unsigned Op) { return Info[Op].AddressOpMask; }
   bool hasBRegAddr(unsigned Op) { return Info[Op].HasBRegAddr; }
   bool hasBRegAddrShift(unsigned Op) { return Info[Op].HasBRegAddrShift; }
   bool isDivide(unsigned Op) { return Info[Op].IsDivide; }
   bool isInlineShiftALU(unsigned Op) { return Info[Op].IsInlineShiftALU; }
   bool isMultiply(unsigned Op) { return Info[Op].IsMultiply; }
   bool isMVEIntMAC(unsigned Op) { return Info[Op].IsMVEIntMAC; }
   bool isNonSubwordLoad(unsigned Op) { return Info[Op].IsNonSubwordLoad; }
   bool isRev(unsigned Op) { return Info[Op].IsRev; }
   bool isShift(unsigned Op) { return Info[Op].IsShift; }

   // information available if markDPConsumers is called.
   bool producesQP(unsigned Op) { return Info[Op].ProducesQP; }
   bool producesDP(unsigned Op) { return Info[Op].ProducesDP; }
   bool producesSP(unsigned Op) { return Info[Op].ProducesSP; }
   bool consumesQP(unsigned Op) { return Info[Op].ConsumesQP; }
   bool consumesDP(unsigned Op) { return Info[Op].ConsumesDP; }
   bool consumesSP(unsigned Op) { return Info[Op].ConsumesSP; }

   bool isMVEIntMACMatched(unsigned SrcOp, unsigned DstOp) {
     return SrcOp == DstOp || Info[DstOp].MVEIntMACMatched == SrcOp;
   }

   InstructionInformation(const ARMBaseInstrInfo *TII);

 protected:
   void markDPProducersConsumers(const ARMBaseInstrInfo *TII);
 };

 /// Builds the always-available part of the opcode table by switching on the
 /// relevant flag for each opcode named in the lists below.  Opcodes that are
 /// not listed keep the all-false / zero defaults from IInfo's constructor.
 ///
 /// \p TII is not referenced by this constructor.
 InstructionInformation::InstructionInformation(const ARMBaseInstrInfo *TII) {
   using namespace ARM;

   std::initializer_list<unsigned> hasBRegAddrList = {
       t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
       tLDRr,  tLDRBr,  tLDRHr,  tSTRr,  tSTRBr,  tSTRHr,
   };
   for (unsigned Op : hasBRegAddrList)
     Info[Op].HasBRegAddr = true;

   std::initializer_list<unsigned> hasBRegAddrShiftList = {
       t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
   };
   for (unsigned Op : hasBRegAddrShiftList)
     Info[Op].HasBRegAddrShift = true;

   Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;

   std::initializer_list<unsigned> isInlineShiftALUList = {
       t2ADCrs,  t2ADDSrs, t2ADDrs,  t2BICrs, t2EORrs,
       t2ORNrs,  t2RSBSrs, t2RSBrs,  t2SBCrs, t2SUBrs,
       t2SUBSrs, t2CMPrs,  t2CMNzrs, t2TEQrs, t2TSTrs,
   };
   for (unsigned Op : isInlineShiftALUList)
     Info[Op].IsInlineShiftALU = true;

   std::initializer_list<unsigned> isMultiplyList = {
       t2MUL,    t2MLA,     t2MLS,     t2SMLABB, t2SMLABT,  t2SMLAD,   t2SMLADX,
       t2SMLAL,  t2SMLALBB, t2SMLALBT, t2SMLALD, t2SMLALDX, t2SMLALTB, t2SMLALTT,
       t2SMLATB, t2SMLATT,  t2SMLAWT,  t2SMLSD,  t2SMLSDX,  t2SMLSLD,  t2SMLSLDX,
       t2SMMLA,  t2SMMLAR,  t2SMMLS,   t2SMMLSR, t2SMMUL,   t2SMMULR,  t2SMUAD,
       t2SMUADX, t2SMULBB,  t2SMULBT,  t2SMULL,  t2SMULTB,  t2SMULTT,  t2SMULWT,
       t2SMUSD,  t2SMUSDX,  t2UMAAL,   t2UMLAL,  t2UMULL,   tMUL,
   };
   for (unsigned Op : isMultiplyList)
     Info[Op].IsMultiply = true;

   std::initializer_list<unsigned> isMVEIntMACList = {
       MVE_VMLAS_qr_i16,    MVE_VMLAS_qr_i32,    MVE_VMLAS_qr_i8,
       MVE_VMLA_qr_i16,     MVE_VMLA_qr_i32,     MVE_VMLA_qr_i8,
       MVE_VQDMLAH_qrs16,   MVE_VQDMLAH_qrs32,   MVE_VQDMLAH_qrs8,
       MVE_VQDMLASH_qrs16,  MVE_VQDMLASH_qrs32,  MVE_VQDMLASH_qrs8,
       MVE_VQRDMLAH_qrs16,  MVE_VQRDMLAH_qrs32,  MVE_VQRDMLAH_qrs8,
       MVE_VQRDMLASH_qrs16, MVE_VQRDMLASH_qrs32, MVE_VQRDMLASH_qrs8,
       MVE_VQDMLADHXs16,    MVE_VQDMLADHXs32,    MVE_VQDMLADHXs8,
       MVE_VQDMLADHs16,     MVE_VQDMLADHs32,     MVE_VQDMLADHs8,
       MVE_VQDMLSDHXs16,    MVE_VQDMLSDHXs32,    MVE_VQDMLSDHXs8,
       MVE_VQDMLSDHs16,     MVE_VQDMLSDHs32,     MVE_VQDMLSDHs8,
       MVE_VQRDMLADHXs16,   MVE_VQRDMLADHXs32,   MVE_VQRDMLADHXs8,
       MVE_VQRDMLADHs16,    MVE_VQRDMLADHs32,    MVE_VQRDMLADHs8,
       MVE_VQRDMLSDHXs16,   MVE_VQRDMLSDHXs32,   MVE_VQRDMLSDHXs8,
       MVE_VQRDMLSDHs16,    MVE_VQRDMLSDHs32,    MVE_VQRDMLSDHs8,
   };
   for (unsigned Op : isMVEIntMACList)
     Info[Op].IsMVEIntMAC = true;

   std::initializer_list<unsigned> isNonSubwordLoadList = {
       t2LDRi12, t2LDRi8,  t2LDR_POST,  t2LDR_PRE,  t2LDRpci,
       t2LDRs,   t2LDRDi8, t2LDRD_POST, t2LDRD_PRE, tLDRi,
       tLDRpci,  tLDRr,    tLDRspi,
   };
   for (unsigned Op : isNonSubwordLoadList)
     Info[Op].IsNonSubwordLoad = true;

   std::initializer_list<unsigned> isRevList = {
       t2REV, t2REV16, t2REVSH, t2RBIT, tREV, tREV16, tREVSH,
   };
   for (unsigned Op : isRevList)
     Info[Op].IsRev = true;

   std::initializer_list<unsigned> isShiftList = {
       t2ASRri, t2ASRrr, t2LSLri, t2LSLrr, t2LSRri, t2LSRrr, t2RORri, t2RORrr,
       tASRri,  tASRrr,  tLSLSri, tLSLri,  tLSLrr,  tLSRri,  tLSRrr,  tROR,
   };
   for (unsigned Op : isShiftList)
     Info[Op].IsShift = true;

   // Opcodes whose address operands are operands 1 and 2.
   std::initializer_list<unsigned> Address1List = {
       t2LDRBi12,
       t2LDRBi8,
       t2LDRBpci,
       t2LDRBs,
       t2LDRHi12,
       t2LDRHi8,
       t2LDRHpci,
       t2LDRHs,
       t2LDRSBi12,
       t2LDRSBi8,
       t2LDRSBpci,
       t2LDRSBs,
       t2LDRSHi12,
       t2LDRSHi8,
       t2LDRSHpci,
       t2LDRSHs,
       t2LDRi12,
       t2LDRi8,
       t2LDRpci,
       t2LDRs,
       tLDRBi,
       tLDRBr,
       tLDRHi,
       tLDRHr,
       tLDRSB,
       tLDRSH,
       tLDRi,
       tLDRpci,
       tLDRr,
       tLDRspi,
       t2STRBi12,
       t2STRBi8,
       t2STRBs,
       t2STRHi12,
       t2STRHi8,
       t2STRHs,
       t2STRi12,
       t2STRi8,
       t2STRs,
       tSTRBi,
       tSTRBr,
       tSTRHi,
       tSTRHr,
       tSTRi,
       tSTRr,
       tSTRspi,
       VLDRD,
       VLDRH,
       VLDRS,
       VSTRD,
       VSTRH,
       VSTRS,
       MVE_VLD20_16,
       MVE_VLD20_32,
       MVE_VLD20_8,
       MVE_VLD21_16,
       MVE_VLD21_32,
       MVE_VLD21_8,
       MVE_VLD40_16,
       MVE_VLD40_32,
       MVE_VLD40_8,
       MVE_VLD41_16,
       MVE_VLD41_32,
       MVE_VLD41_8,
       MVE_VLD42_16,
       MVE_VLD42_32,
       MVE_VLD42_8,
       MVE_VLD43_16,
       MVE_VLD43_32,
       MVE_VLD43_8,
       MVE_VLDRBS16,
       MVE_VLDRBS16_rq,
       MVE_VLDRBS32,
       MVE_VLDRBS32_rq,
       MVE_VLDRBU16,
       MVE_VLDRBU16_rq,
       MVE_VLDRBU32,
       MVE_VLDRBU32_rq,
       MVE_VLDRBU8,
       MVE_VLDRBU8_rq,
       MVE_VLDRDU64_qi,
       MVE_VLDRDU64_rq,
       MVE_VLDRDU64_rq_u,
       MVE_VLDRHS32,
       MVE_VLDRHS32_rq,
       MVE_VLDRHS32_rq_u,
       MVE_VLDRHU16,
       MVE_VLDRHU16_rq,
       MVE_VLDRHU16_rq_u,
       MVE_VLDRHU32,
       MVE_VLDRHU32_rq,
       MVE_VLDRHU32_rq_u,
       MVE_VLDRWU32,
       MVE_VLDRWU32_qi,
       MVE_VLDRWU32_rq,
       MVE_VLDRWU32_rq_u,
       MVE_VST20_16,
       MVE_VST20_32,
       MVE_VST20_8,
       MVE_VST21_16,
       MVE_VST21_32,
       MVE_VST21_8,
       MVE_VST40_16,
       MVE_VST40_32,
       MVE_VST40_8,
       MVE_VST41_16,
       MVE_VST41_32,
       MVE_VST41_8,
       MVE_VST42_16,
       MVE_VST42_32,
       MVE_VST42_8,
       MVE_VST43_16,
       MVE_VST43_32,
       MVE_VST43_8,
       MVE_VSTRB16,
       MVE_VSTRB16_rq,
       MVE_VSTRB32,
       MVE_VSTRB32_rq,
       MVE_VSTRBU8,
       MVE_VSTRB8_rq,
       MVE_VSTRD64_qi,
       MVE_VSTRD64_rq,
       MVE_VSTRD64_rq_u,
       MVE_VSTRH32,
       MVE_VSTRH32_rq,
       MVE_VSTRH32_rq_u,
       MVE_VSTRHU16,
       MVE_VSTRH16_rq,
       MVE_VSTRH16_rq_u,
       MVE_VSTRWU32,
       MVE_VSTRW32_qi,
       MVE_VSTRW32_rq,
       MVE_VSTRW32_rq_u,
   };
   // Opcodes whose address operands are operands 2 and 3.
   std::initializer_list<unsigned> Address2List = {
       t2LDRB_POST,
       t2LDRB_PRE,
       t2LDRDi8,
       t2LDRH_POST,
       t2LDRH_PRE,
       t2LDRSB_POST,
       t2LDRSB_PRE,
       t2LDRSH_POST,
       t2LDRSH_PRE,
       t2LDR_POST,
       t2LDR_PRE,
       t2STRB_POST,
       t2STRB_PRE,
       t2STRDi8,
       t2STRH_POST,
       t2STRH_PRE,
       t2STR_POST,
       t2STR_PRE,
       MVE_VLD20_16_wb,
       MVE_VLD20_32_wb,
       MVE_VLD20_8_wb,
       MVE_VLD21_16_wb,
       MVE_VLD21_32_wb,
       MVE_VLD21_8_wb,
       MVE_VLD40_16_wb,
       MVE_VLD40_32_wb,
       MVE_VLD40_8_wb,
       MVE_VLD41_16_wb,
       MVE_VLD41_32_wb,
       MVE_VLD41_8_wb,
       MVE_VLD42_16_wb,
       MVE_VLD42_32_wb,
       MVE_VLD42_8_wb,
       MVE_VLD43_16_wb,
       MVE_VLD43_32_wb,
       MVE_VLD43_8_wb,
       MVE_VLDRBS16_post,
       MVE_VLDRBS16_pre,
       MVE_VLDRBS32_post,
       MVE_VLDRBS32_pre,
       MVE_VLDRBU16_post,
       MVE_VLDRBU16_pre,
       MVE_VLDRBU32_post,
       MVE_VLDRBU32_pre,
       MVE_VLDRBU8_post,
       MVE_VLDRBU8_pre,
       MVE_VLDRDU64_qi_pre,
       MVE_VLDRHS32_post,
       MVE_VLDRHS32_pre,
       MVE_VLDRHU16_post,
       MVE_VLDRHU16_pre,
       MVE_VLDRHU32_post,
       MVE_VLDRHU32_pre,
       MVE_VLDRWU32_post,
       MVE_VLDRWU32_pre,
       MVE_VLDRWU32_qi_pre,
       MVE_VST20_16_wb,
       MVE_VST20_32_wb,
       MVE_VST20_8_wb,
       MVE_VST21_16_wb,
       MVE_VST21_32_wb,
       MVE_VST21_8_wb,
       MVE_VST40_16_wb,
       MVE_VST40_32_wb,
       MVE_VST40_8_wb,
       MVE_VST41_16_wb,
       MVE_VST41_32_wb,
       MVE_VST41_8_wb,
       MVE_VST42_16_wb,
       MVE_VST42_32_wb,
       MVE_VST42_8_wb,
       MVE_VST43_16_wb,
       MVE_VST43_32_wb,
       MVE_VST43_8_wb,
       MVE_VSTRB16_post,
       MVE_VSTRB16_pre,
       MVE_VSTRB32_post,
       MVE_VSTRB32_pre,
       MVE_VSTRBU8_post,
       MVE_VSTRBU8_pre,
       MVE_VSTRD64_qi_pre,
       MVE_VSTRH32_post,
       MVE_VSTRH32_pre,
       MVE_VSTRHU16_post,
       MVE_VSTRHU16_pre,
       MVE_VSTRWU32_post,
       MVE_VSTRWU32_pre,
       MVE_VSTRW32_qi_pre,
   };
   // Opcodes whose address operands are operands 3 and 4.
   std::initializer_list<unsigned> Address3List = {
       t2LDRD_POST,
       t2LDRD_PRE,
       t2STRD_POST,
       t2STRD_PRE,
   };
   // Compute a mask of which operands are involved in address computation.
   // Bit N of the mask stands for operand N of the instruction.
   for (unsigned Op : Address1List)
     Info[Op].AddressOpMask = 0x6;
   for (unsigned Op : Address2List)
     Info[Op].AddressOpMask = 0xc;
   for (unsigned Op : Address3List)
     Info[Op].AddressOpMask = 0x18;
   // Register-offset forms with a shift additionally feed operand 3 into the
   // address computation.
   for (unsigned Op : hasBRegAddrShiftList)
     Info[Op].AddressOpMask |= 0x8;
 }

 void InstructionInformation::markDPProducersConsumers(
     const ARMBaseInstrInfo *TII) {
   // Learn about all instructions which have FP source/dest registers
   for (unsigned MI = 0; MI < ARM::INSTRUCTION_LIST_END; ++MI) {
     const MCInstrDesc &MID = TII->get(MI);
     auto Operands = MID.operands();
     for (unsigned OI = 0, OIE = MID.getNumOperands(); OI != OIE; ++OI) {
       bool MarkQP = false, MarkDP = false, MarkSP = false;
       switch (Operands[OI].RegClass) {
       case ARM::MQPRRegClassID:
       case ARM::DPRRegClassID:
       case ARM::DPR_8RegClassID:
       case ARM::DPR_VFP2RegClassID:
       case ARM::DPairRegClassID:
       case ARM::DPairSpcRegClassID:
       case ARM::DQuadRegClassID:
       case ARM::DQuadSpcRegClassID:
       case ARM::DTripleRegClassID:
       case ARM::DTripleSpcRegClassID:
         MarkDP = true;
         break;
       case ARM::QPRRegClassID:
       case ARM::QPR_8RegClassID:
       case ARM::QPR_VFP2RegClassID:
       case ARM::QQPRRegClassID:
       case ARM::QQQQPRRegClassID:
         MarkQP = true;
         break;
       case ARM::SPRRegClassID:
       case ARM::SPR_8RegClassID:
       case ARM::FPWithVPRRegClassID:
         MarkSP = true;
         break;
       default:
         break;
       }
       if (MarkQP) {
         if (OI < MID.getNumDefs())
           Info[MI].ProducesQP = true;
         else
           Info[MI].ConsumesQP = true;
       }
       if (MarkDP) {
         if (OI < MID.getNumDefs())
           Info[MI].ProducesDP = true;
         else
           Info[MI].ConsumesDP = true;
       }
       if (MarkSP) {
         if (OI < MID.getNumDefs())
           Info[MI].ProducesSP = true;
         else
           Info[MI].ConsumesSP = true;
       }
     }
   }
 }

 } // anonymous namespace

 static bool hasImplicitCPSRUse(const MachineInstr *MI) {
   return MI->getDesc().hasImplicitUseOfPhysReg(ARM::CPSR);
 }

 /// Sets the latency of the edge \p SrcDep (a successor edge of \p SrcSU) to
 /// \p latency and applies the same latency to the matching predecessor edge
 /// stored on the node at the other end, if one is found.
 ///
 /// The far node's depth and \p SrcSU's height are marked dirty so that they
 /// are recomputed with the new latency.
 void ARMOverrideBypasses::setBidirLatencies(SUnit &SrcSU, SDep &SrcDep,
                                             unsigned latency) {
   SUnit *FarSU = SrcDep.getSUnit();

   // The mirror edge looks like SrcDep but points back at SrcSU.  It must be
   // built before SrcDep is modified so that it still compares equal.
   SDep Mirror = SrcDep;
   Mirror.setSUnit(&SrcSU);

   auto PredIt = std::find(FarSU->Preds.begin(), FarSU->Preds.end(), Mirror);
   if (PredIt != FarSU->Preds.end()) {
     PredIt->setLatency(latency);
     FarSU->setDepthDirty();
   }

   SrcDep.setLatency(latency);
   SrcSU.setHeightDirty();
 }

 /// Returns true when condition codes \p a and \p b differ in any of bits 1-3;
 /// bit 0 is ignored by the comparison.
 // NOTE(review): presumably bit 0 distinguishes a condition from its inverse,
 // so "mismatched" means "not the same condition pair" -- confirm against the
 // ARMCC::CondCodes encoding.
 static bool mismatchedPred(ARMCC::CondCodes a, ARMCC::CondCodes b) {
   constexpr unsigned PairMask = 0xe;
   return ((a ^ b) & PairMask) != 0;
 }

 // For processors able to issue two writers of the same register at once:
 // force the latency of an output dependence to zero, in both directions.
 // Returns true if \p Dep was an output dependence (and was therefore
 // changed), false otherwise.
 bool ARMOverrideBypasses::zeroOutputDependences(SUnit &ISU, SDep &Dep) {
   if (Dep.getKind() != SDep::Output)
     return false;

   setBidirLatencies(ISU, Dep, 0);
   return true;
 }

 // The graph doesn't look inside of bundles to determine their
 // scheduling boundaries and reports zero latency into and out of them
 // (except for CPSR into the bundle, which has latency 1).
 // Make some better scheduling assumptions for predicated bundles:
 // 1) Into a bundle: a CPSR dependence gets latency 0, anything else gets 1.
 // 2) Out of a bundle: a register dependence other than CPSR gets latency 1;
 //    CPSR dependences are left untouched.
 //
 // Returns 1 if the consumer was a bundle and the edge was changed, 2 if the
 // producer was a bundle and the edge was changed, and 0 if nothing changed.
 unsigned ARMOverrideBypasses::makeBundleAssumptions(SUnit &ISU, SDep &Dep) {
   const MachineInstr *Producer = ISU.getInstr();
   const MachineInstr *Consumer = Dep.getSUnit()->getInstr();

   const bool ProducerIsBundle = Producer->getOpcode() == ARM::BUNDLE;
   const bool ConsumerIsBundle = Consumer->getOpcode() == ARM::BUNDLE;

   // Only query the register once we know the edge actually carries one.
   const bool CarriesReg = Dep.isAssignedRegDep();
   const bool CarriesCPSR = CarriesReg && Dep.getReg() == ARM::CPSR;

   // Case 1: edge into a predicated bundle.
   if (ConsumerIsBundle && TII->isPredicated(*Consumer)) {
     const unsigned Lat = CarriesCPSR ? 0 : 1;
     setBidirLatencies(ISU, Dep, Lat);
     return 1;
   }

   // Case 2: non-CPSR register edge out of a predicated bundle.
   if (ProducerIsBundle && TII->isPredicated(*Producer) && CarriesReg &&
       !CarriesCPSR) {
     setBidirLatencies(ISU, Dep, 1);
     return 2;
   }

   return 0;
 }

 // Determine whether \p Dep is a store-to-load (memory read-after-write)
 // dependence on the same location and, if so, set its latency to \p latency
 // in both directions.
 //
 // Only the first memory operand of each instruction is inspected.  Two
 // accesses are treated as hitting the same location when either
 //  - both have IR values that must-alias and their offsets are equal, or
 //  - both refer to the very same fixed-stack pseudo value (spills/fills).
 //
 // Returns true if the latency was changed, false otherwise.
 bool ARMOverrideBypasses::memoryRAWHazard(SUnit &ISU, SDep &Dep,
                                           unsigned latency) {
   if (!Dep.isNormalMemory())
     return false;
   auto &SrcInst = *ISU.getInstr();
   auto &DstInst = *Dep.getSUnit()->getInstr();
   if (!SrcInst.mayStore() || !DstInst.mayLoad())
     return false;

   // An instruction may access memory without carrying any memory operand;
   // without one there is nothing to compare, and dereferencing begin() of an
   // empty operand list would be undefined behaviour.
   if (SrcInst.memoperands_empty() || DstInst.memoperands_empty())
     return false;

   auto *SrcMO = *SrcInst.memoperands().begin();
   auto *DstMO = *DstInst.memoperands().begin();
   auto *SrcVal = SrcMO->getValue();
   auto *DstVal = DstMO->getValue();
   auto *SrcPseudoVal = SrcMO->getPseudoValue();
   auto *DstPseudoVal = DstMO->getPseudoValue();

   if (SrcVal && DstVal && AA->alias(SrcVal, DstVal) == AliasResult::MustAlias &&
       SrcMO->getOffset() == DstMO->getOffset()) {
     setBidirLatencies(ISU, Dep, latency);
     return true;
   }

   if (SrcPseudoVal && DstPseudoVal &&
       SrcPseudoVal->kind() == DstPseudoVal->kind() &&
       SrcPseudoVal->kind() == PseudoSourceValue::FixedStack) {
     // Spills/fills
     auto *FS0 = cast<FixedStackPseudoSourceValue>(SrcPseudoVal);
     auto *FS1 = cast<FixedStackPseudoSourceValue>(DstPseudoVal);
     if (FS0 == FS1) {
       setBidirLatencies(ISU, Dep, latency);
       return true;
     }
   }
   return false;
 }

 namespace {

 // Opcode information table used by the Cortex-M7 and Cortex-M85 overrides in
 // this file.  It is created on first use by whichever of those override
 // classes is constructed first; within this file it is only assigned while
 // it is still null.
 // NOTE(review): only M85InstructionInformation calls
 // markDPProducersConsumers().  If a CortexM7Overrides object populates this
 // pointer first, a later M85Overrides presumably sees all produces*/consumes*
 // queries return false -- confirm whether both can be created in one process,
 // and whether construction can happen concurrently.
 std::unique_ptr<InstructionInformation> II;

 class CortexM7InstructionInformation : public InstructionInformation {
 public:
   CortexM7InstructionInformation(const ARMBaseInstrInfo *TII)
       : InstructionInformation(TII) {}
 };

 class CortexM7Overrides : public ARMOverrideBypasses {
 public:
   CortexM7Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
       : ARMOverrideBypasses(TII, AA) {
     if (!II)
       II.reset(new CortexM7InstructionInformation(TII));
   }

   void modifyBypasses(SUnit &) override;
 };

 /// Applies the Cortex-M7 latency overrides to every successor edge of \p ISU.
 ///
 /// Output and memory-RAW edges are handled first and end processing of that
 /// edge.  For the remaining data edges the checks below run in order and
 /// several of them may fire for the same edge; later checks read the latency
 /// left behind by earlier ones, so the order of the checks matters.
 void CortexM7Overrides::modifyBypasses(SUnit &ISU) {
   const MachineInstr *SrcMI = ISU.getInstr();
   unsigned SrcOpcode = SrcMI->getOpcode();
   // True when the producer is a load of a word or more.
   bool isNSWload = II->isNonSubwordLoad(SrcOpcode);

   // Walk the successors looking for latency overrides that are needed
   for (SDep &Dep : ISU.Succs) {

     // Output dependences should have 0 latency, as M7 is able to
     // schedule writers to the same register for simultaneous issue.
     if (zeroOutputDependences(ISU, Dep))
       continue;

     // A store feeding a load of the same location gets a latency of 4.
     if (memoryRAWHazard(ISU, Dep, 4))
       continue;

     // Ignore dependencies other than data
     if (Dep.getKind() != SDep::Data)
       continue;

     SUnit &DepSU = *Dep.getSUnit();
     if (DepSU.isBoundaryNode())
       continue;

     // A result of 1 means the consumer is a predicated bundle and the edge
     // has already been given its final latency.  A result of 2 (producer is
     // a bundle) deliberately falls through to the checks below.
     if (makeBundleAssumptions(ISU, Dep) == 1)
       continue;

     const MachineInstr *DstMI = DepSU.getInstr();
     unsigned DstOpcode = DstMI->getOpcode();

     // Word loads feeding any multiply or divide instruction are considered
     // unable to bypass their scheduling stage, so add a cycle.  This is not
     // done in the .td file because we cannot easily create a read advance
     // that is 0 from certain writer classes and 1 from all the rest.
     // (The other way around would have been easy.)
     if (isNSWload && (II->isMultiply(DstOpcode) || II->isDivide(DstOpcode)))
       setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

     // Word loads feeding the B operand (operand 2) of a register-offset
     // load/store are considered unable to bypass their scheduling stage, so
     // add a cycle.  This cannot be done in the .td file because we need to
     // decide between -1 and -2 for ReadAdvance.
     if (isNSWload && II->hasBRegAddr(DstOpcode) &&
         DstMI->getOperand(2).getReg() == Dep.getReg())
       setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

     // Multiplies into any address generation cannot bypass from EX3.  Cannot do
     // in the .td file because need to decide between -1 and -2 for ReadAdvance
     if (II->isMultiply(SrcOpcode)) {
       // Bit N of the address mask stands for operand N.  Operand 0 is
       // skipped, so shift the mask once and start scanning at operand 1.
       unsigned OpMask = II->getAddressOpMask(DstOpcode) >> 1;
       for (unsigned i = 1; OpMask; ++i, OpMask >>= 1) {
         if ((OpMask & 1) && DstMI->getOperand(i).isReg() &&
             DstMI->getOperand(i).getReg() == Dep.getReg()) {
           setBidirLatencies(ISU, Dep, 4); // first legal bypass is EX4->EX1
           break;
         }
       }
     }

     // Mismatched conditional producers take longer on M7; they end up looking
     // like they were produced at EX3 and read at IS.
     if (TII->isPredicated(*SrcMI) && Dep.isAssignedRegDep() &&
         (SrcOpcode == ARM::BUNDLE ||
          mismatchedPred(TII->getPredicate(*SrcMI),
                         TII->getPredicate(*DstMI)))) {
       unsigned Lat = 1;
       // Operand A of shift+ALU is treated as an EX1 read instead of EX2.
       if (II->isInlineShiftALU(DstOpcode) && DstMI->getOperand(3).getImm() &&
           DstMI->getOperand(1).getReg() == Dep.getReg())
         Lat = 2;
       // Add the penalty but never raise the latency beyond 3 because of it,
       // and never lower a latency that is already higher than that.
       Lat = std::min(3u, Dep.getLatency() + Lat);
       setBidirLatencies(ISU, Dep, std::max(Dep.getLatency(), Lat));
     }

     // CC setter into conditional producer shouldn't have a latency of more
     // than 1 unless it's due to an implicit read. (All the "true" readers
     // of the condition code use an implicit read, and predicates use an
     // explicit.)
     if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
         TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
       setBidirLatencies(ISU, Dep, 1);

     // REV instructions cannot bypass directly into the EX1 shifter.  The
     // code is slightly inexact as it doesn't attempt to ensure that the bypass
     // is to the shifter operands.
     if (II->isRev(SrcOpcode)) {
       if (II->isInlineShiftALU(DstOpcode))
         setBidirLatencies(ISU, Dep, 2);
       else if (II->isShift(DstOpcode))
         setBidirLatencies(ISU, Dep, 1);
     }
   }
 }

 class M85InstructionInformation : public InstructionInformation {
 public:
   M85InstructionInformation(const ARMBaseInstrInfo *t)
       : InstructionInformation(t) {
     markDPProducersConsumers(t);
   }
 };

 class M85Overrides : public ARMOverrideBypasses {
 public:
   M85Overrides(const ARMBaseInstrInfo *t, AAResults *a)
       : ARMOverrideBypasses(t, a) {
     if (!II)
       II.reset(new M85InstructionInformation(t));
   }

   void modifyBypasses(SUnit &) override;

 private:
   unsigned computeBypassStage(const MCSchedClassDesc *SCD);
   signed modifyMixedWidthFP(const MachineInstr *SrcMI,
                             const MachineInstr *DstMI, unsigned RegID,
                             const MCSchedClassDesc *SCD);
 };

 unsigned M85Overrides::computeBypassStage(const MCSchedClassDesc *SCDesc) {
   auto SM = DAG->getSchedModel();
   unsigned DefIdx = 0; // just look for the first output's timing
   if (DefIdx < SCDesc->NumWriteLatencyEntries) {
     // Lookup the definition's write latency in SubtargetInfo.
     const MCWriteLatencyEntry *WLEntry =
         SM->getSubtargetInfo()->getWriteLatencyEntry(SCDesc, DefIdx);
     unsigned Latency = WLEntry->Cycles >= 0 ? WLEntry->Cycles : 1000;
     if (Latency == 4)
       return 2;
     else if (Latency == 5)
       return 3;
     else if (Latency > 3)
       return 3;
     else
       return Latency;
   }
   return 2;
 }

 // Latency changes for bypassing between FP registers of different sizes:
 //
 // Note that mixed DP/SP are unlikely because of the semantics
 // of C.  Mixed MVE/SP are quite common when MVE intrinsics are used.
 //
 // \p SrcMI produces the value, \p DstMI consumes it, \p RegID is the register
 // carried by the dependence and \p SCD is the producer's scheduling class
 // (used to derive the bypass stage).  Returns a signed number of cycles to
 // add to the dependence latency, or 0 when no adjustment applies.
 //
 // For virtual registers the affected part of the wider register is found
 // from sub-register indices; for physical registers it is found from the
 // register number.  Both paths are meant to select the same parts: the odd
 // S registers, and the odd D register (dsub_1) of a Q register.
 signed M85Overrides::modifyMixedWidthFP(const MachineInstr *SrcMI,
                                         const MachineInstr *DstMI,
                                         unsigned RegID,
                                         const MCSchedClassDesc *SCD) {
   const unsigned SrcOpc = SrcMI->getOpcode();
   const unsigned DstOpc = DstMI->getOpcode();

   const bool SrcSP = II->producesSP(SrcOpc);
   const bool SrcDP = II->producesDP(SrcOpc);
   const bool SrcQP = II->producesQP(SrcOpc);

   // Nothing to do unless the producer writes some FP/vector register.
   if (!SrcSP && !SrcDP && !SrcQP)
     return 0;

   const bool DstSP = II->consumesSP(DstOpc);
   const bool DstDP = II->consumesDP(DstOpc);
   const bool DstQP = II->consumesQP(DstOpc);

   if (Register::isVirtualRegister(RegID)) {
     if (SrcSP && DstDP) {
       for (const auto &OP : SrcMI->operands())
         if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
             OP.getSubReg() == ARM::ssub_1)
           return 5 - computeBypassStage(SCD);
     } else if (SrcSP && DstQP) {
       // Only the odd S lanes are affected; the odd lane of the upper half
       // (ssub_3) is one cycle cheaper than that of the lower half (ssub_1).
       for (const auto &OP : SrcMI->operands())
         if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
             (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
           return 5 - computeBypassStage(SCD) -
                  (OP.getSubReg() == ARM::ssub_3 ? 1 : 0);
     } else if (SrcDP && DstQP) {
       // The upper D half of a Q register is dsub_1.  This mirrors the
       // physical-register path below, which tests for an odd D register.
       for (const auto &OP : SrcMI->operands())
         if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
             OP.getSubReg() == ARM::dsub_1)
           return -1;
     } else if (SrcDP && DstSP) {
       for (const auto &OP : DstMI->operands())
         if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
             OP.getSubReg() == ARM::ssub_1)
           return 5 - computeBypassStage(SCD);
     } else if (SrcQP && DstSP) {
       // Only the odd S lanes are affected; the odd lane of the upper half
       // (ssub_3) costs one cycle more than that of the lower half (ssub_1).
       for (const auto &OP : DstMI->operands())
         if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
             (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
           return 5 - computeBypassStage(SCD) +
                  (OP.getSubReg() == ARM::ssub_3 ? 1 : 0);
     } else if (SrcQP && DstDP) {
       // The upper D half of a Q register is dsub_1.  This mirrors the
       // physical-register path below, which tests for an odd D register.
       for (const auto &OP : DstMI->operands())
         if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
             OP.getSubReg() == ARM::dsub_1)
           return 1;
     }
   } else if (Register::isPhysicalRegister(RegID)) {
     // Note that when the producer is narrower, not all of the producers
     // may be present in the scheduling graph; somewhere earlier in the
     // compiler, an implicit def/use of the aliased full register gets
     // added to the producer, and so only that producer is seen as *the*
     // single producer.  This behavior also has the unfortunate effect of
     // serializing the producers in the compiler's view of things.
     if (SrcSP && DstDP) {
       // An odd S register defined by the producer which is, or is contained
       // in, the register carried by the dependence.
       for (const auto &OP : SrcMI->operands())
         if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
             OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
             (OP.getReg() == RegID ||
              (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
              (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
           return 5 - computeBypassStage(SCD);
     } else if (SrcSP && DstQP) {
       for (const auto &OP : SrcMI->operands())
         if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
             OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
             (OP.getReg() == RegID ||
              (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
              (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
           return 5 - computeBypassStage(SCD) -
                  (((OP.getReg() - ARM::S0) / 2) % 2 ? 1 : 0);
     } else if (SrcDP && DstQP) {
       // An odd D register defined by the producer which is, or is the upper
       // half of, the register carried by the dependence.
       for (const auto &OP : SrcMI->operands())
         if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::D0 &&
             OP.getReg() <= ARM::D15 && (OP.getReg() - ARM::D0) % 2 &&
             (OP.getReg() == RegID ||
              (OP.getReg() - ARM::D0) / 2 + ARM::Q0 == RegID))
           return -1;
     } else if (SrcDP && DstSP) {
       if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
         return 5 - computeBypassStage(SCD);
     } else if (SrcQP && DstSP) {
       if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
         return 5 - computeBypassStage(SCD) +
                (((RegID - ARM::S0) / 2) % 2 ? 1 : 0);
     } else if (SrcQP && DstDP) {
       if (RegID >= ARM::D1 && RegID <= ARM::D15 && (RegID - ARM::D0) % 2)
         return 1;
     }
   }
   return 0;
 }

 /// Applies the Cortex-M85 latency overrides to every successor edge of
 /// \p ISU.
 ///
 /// Output and memory-RAW edges are handled first and end processing of that
 /// edge.  For the remaining data edges the checks below run in order and
 /// several of them may fire for the same edge, each one starting from the
 /// latency left behind by the previous ones.
 void M85Overrides::modifyBypasses(SUnit &ISU) {
   const MachineInstr *SrcMI = ISU.getInstr();
   unsigned SrcOpcode = SrcMI->getOpcode();
   // True when the producer is a load of a word or more.
   bool isNSWload = II->isNonSubwordLoad(SrcOpcode);

   // Walk the successors looking for latency overrides that are needed
   for (SDep &Dep : ISU.Succs) {

     // Output dependences should have 0 latency, as CortexM85 is able to
     // schedule writers to the same register for simultaneous issue.
     if (zeroOutputDependences(ISU, Dep))
       continue;

     // A store feeding a load of the same location gets a latency of 3.
     if (memoryRAWHazard(ISU, Dep, 3))
       continue;

     // Ignore dependencies other than data.
     if (Dep.getKind() != SDep::Data)
       continue;

     SUnit &DepSU = *Dep.getSUnit();
     if (DepSU.isBoundaryNode())
       continue;

     // A result of 1 means the consumer is a predicated bundle and the edge
     // has already been given its final latency.
     if (makeBundleAssumptions(ISU, Dep) == 1)
       continue;

     const MachineInstr *DstMI = DepSU.getInstr();
     unsigned DstOpcode = DstMI->getOpcode();

     // Word loads feeding the B operand (operand 2) of a load/store that
     // applies a non-zero shift cannot bypass their scheduling stage, so add
     // a cycle.  This cannot be done in the .td file because we need to
     // decide between -1 and -2 for ReadAdvance.
     if (isNSWload && II->hasBRegAddrShift(DstOpcode) &&
         DstMI->getOperand(3).getImm() != 0 && // shift operand
         DstMI->getOperand(2).getReg() == Dep.getReg())
       setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

     // Word loads feeding any MVE vector instruction take an extra cycle.
     if (isNSWload && isMVEVectorInstruction(DstMI))
       setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

     // A matched MVE integer MAC reading the value through its operand 0 can
     // take it one cycle earlier.
     if (II->isMVEIntMAC(DstOpcode) &&
         II->isMVEIntMACMatched(SrcOpcode, DstOpcode) &&
         DstMI->getOperand(0).isReg() &&
         DstMI->getOperand(0).getReg() == Dep.getReg()) {
       // The latency is unsigned: clamp at zero so that a zero-latency edge
       // does not wrap around to a huge value.
       const unsigned CurLat = Dep.getLatency();
       setBidirLatencies(ISU, Dep, CurLat ? CurLat - 1 : 0);
     }

     // CC setter into conditional producer shouldn't have a latency of more
     // than 0 unless it's due to an implicit read.
     if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
         TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
       setBidirLatencies(ISU, Dep, 0);

     // Mixed-width FP/vector bypassing; the adjustment may be negative, so
     // clamp the resulting latency at zero.
     if (signed ALat = modifyMixedWidthFP(SrcMI, DstMI, Dep.getReg(),
                                          DAG->getSchedClass(&ISU)))
       setBidirLatencies(ISU, Dep, std::max(0, signed(Dep.getLatency()) + ALat));

     // REV results feeding a shift or an inline shift+ALU instruction get a
     // latency of exactly 1.
     if (II->isRev(SrcOpcode) &&
         (II->isInlineShiftALU(DstOpcode) || II->isShift(DstOpcode)))
       setBidirLatencies(ISU, Dep, 1);
   }
 }

 // Add M55 specific overrides for latencies between instructions. Currently it:
 //  - Adds an extra cycle latency between MVE VMLAV and scalar instructions.
 class CortexM55Overrides : public ARMOverrideBypasses {
 public:
   CortexM55Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
       : ARMOverrideBypasses(TII, AA) {}

   void modifyBypasses(SUnit &SU) override {
     MachineInstr *SrcMI = SU.getInstr();
     if (!(SrcMI->getDesc().TSFlags & ARMII::HorizontalReduction))
       return;

     for (SDep &Dep : SU.Succs) {
       if (Dep.getKind() != SDep::Data)
         continue;
       SUnit &DepSU = *Dep.getSUnit();
       if (DepSU.isBoundaryNode())
         continue;
       MachineInstr *DstMI = DepSU.getInstr();

       if (!isMVEVectorInstruction(DstMI) && !DstMI->mayStore())
         setBidirLatencies(SU, Dep, 3);
     }
   }
 };

 } // end anonymous namespace

 /// Entry point of the mutation: remembers the DAG being scheduled and runs
 /// modifyBypasses() on every non-boundary scheduling unit, and finally on the
 /// exit unit when that one has an instruction attached.
 void ARMOverrideBypasses::apply(ScheduleDAGInstrs *DAGInstrs) {
   DAG = DAGInstrs;

   for (SUnit &Node : DAGInstrs->SUnits)
     if (!Node.isBoundaryNode())
       modifyBypasses(Node);

   SUnit &Exit = DAGInstrs->ExitSU;
   if (Exit.getInstr())
     modifyBypasses(Exit);
 }

 /// Creates the latency mutation matching the CPU of \p ST.
 ///
 /// Cortex-M85, Cortex-M7 and Cortex-M55 each get their own override class
 /// (checked in that order); for any other subtarget nullptr is returned.
 std::unique_ptr<ScheduleDAGMutation>
 createARMLatencyMutations(const ARMSubtarget &ST, AAResults *AA) {
   if (ST.isCortexM85())
     return std::make_unique<M85Overrides>(ST.getInstrInfo(), AA);

   if (ST.isCortexM7())
     return std::make_unique<CortexM7Overrides>(ST.getInstrInfo(), AA);

   if (ST.isCortexM55())
     return std::make_unique<CortexM55Overrides>(ST.getInstrInfo(), AA);

   // No latency overrides are defined for this subtarget.
   return nullptr;
 }

 } // end namespace llvm