lib/Target/ARM/ARMScheduleM7.td - llvm-project/llvm - Git at Google

 //=- ARMScheduleM7.td - ARM Cortex-M7 Scheduling Definitions -*- tablegen -*-=//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file defines the SchedRead/Write data for the ARM Cortex-M7 processor.
 //
 //===----------------------------------------------------------------------===//

 // Machine model record for the Cortex-M7.  Fields that are not assigned
 // below keep whatever SchedMachineModel provides.
 def CortexM7Model : SchedMachineModel {
   // The Cortex-M7 is in-order.
   let MicroOpBufferSize = 0;
   // Dual issue for most instructions.
   let IssueWidth = 2;
   // Best case for load-use case.
   let LoadLatency = 2;
   // Mispredict cost for forward branches is 6, but 4 works better.
   let MispredictPenalty = 4;
   let CompleteModel = 0;
 }

 //===--------------------------------------------------------------------===//
 // The Cortex-M7 has two ALU, two LOAD, a STORE, a MAC, a BRANCH and a VFP
 // pipe. The stages relevant to scheduling are as follows:
 //
 //   EX1: address generation  shifts
 //   EX2: fast load data      ALUs                  FP operation
 //   EX3: slow load data      integer writeback     FP operation
 //   EX4: store data                                FP writeback
 //
 // There are shifters in both EX1 and EX2, and some instructions can be
 // flexibly allocated between them.  EX2 is used as the "zero" point
 // for scheduling, so simple ALU operations executing in EX2 will have
 // ReadAdvance<0> (the default) for their source operands and Latency = 1.

 // Processor resources referenced by the write definitions of this model.
 // Every resource except M7UnitALU overrides BufferSize to 0.
 let BufferSize = 0 in {
   def M7UnitLoad   : ProcResource<2>;
   def M7UnitStore  : ProcResource<1>;
 }
 def M7UnitALU    : ProcResource<2>;
 let BufferSize = 0 in {
   def M7UnitShift1 : ProcResource<1>;
   def M7UnitShift2 : ProcResource<1>;
   def M7UnitMAC    : ProcResource<1>;
   def M7UnitBranch : ProcResource<1>;
   def M7UnitVFP    : ProcResource<1>;
   def M7UnitVPort  : ProcResource<2>;
   def M7UnitSIMD   : ProcResource<1>;
 }

 //===---------------------------------------------------------------------===//
 // Subtarget-specific SchedWrite types which map ProcResources and set latency.

 let SchedModel = CortexM7Model in {

 // Basic ALU writes, with and without shifts.  All have a latency of one
 // cycle; the shifted forms additionally occupy M7UnitShift1.
 def : WriteRes<WriteALU,    [M7UnitALU]>               { let Latency = 1; }
 def : WriteRes<WriteALUsi,  [M7UnitALU, M7UnitShift1]> { let Latency = 1; }
 def : WriteRes<WriteALUsr,  [M7UnitALU, M7UnitShift1]> { let Latency = 1; }
 def : WriteRes<WriteALUSsr, [M7UnitALU, M7UnitShift1]> { let Latency = 1; }

 // Compares.  The plain form takes one cycle; the shifted forms take two and
 // also occupy M7UnitShift1.
 def : WriteRes<WriteCMP, [M7UnitALU]> { let Latency = 1; }
 let Latency = 2 in {
   def : WriteRes<WriteCMPsi, [M7UnitALU, M7UnitShift1]>;
   def : WriteRes<WriteCMPsr, [M7UnitALU, M7UnitShift1]>;
 }

 // Multiplies.  Every write has a latency of two cycles; the high half of a
 // 64-bit result uses no resource and adds no micro-op.
 def : WriteRes<WriteMUL16,   [M7UnitMAC]> { let Latency = 2; }
 def : WriteRes<WriteMUL32,   [M7UnitMAC]> { let Latency = 2; }
 def : WriteRes<WriteMUL64Lo, [M7UnitMAC]> { let Latency = 2; }
 def : WriteRes<WriteMUL64Hi, []> {
   let Latency = 2;
   let NumMicroOps = 0;
 }

 // Multiply-accumulates.  Every write has a latency of two cycles; the high
 // half of a 64-bit result uses no resource and adds no micro-op.
 def : WriteRes<WriteMAC16,   [M7UnitMAC]> { let Latency = 2; }
 def : WriteRes<WriteMAC32,   [M7UnitMAC]> { let Latency = 2; }
 def : WriteRes<WriteMAC64Lo, [M7UnitMAC]> { let Latency = 2; }
 def : WriteRes<WriteMAC64Hi, []> {
   let Latency = 2;
   let NumMicroOps = 0;
 }

 // Divisions.
 // These cannot be dual-issued with any instructions.
 let SingleIssue = 1 in
 def : WriteRes<WriteDIV, [M7UnitALU]> { let Latency = 7; }

 // Loads/Stores.  Plain loads take one cycle; preloads and stores take two.
 def : WriteRes<WriteLd, [M7UnitLoad]> { let Latency = 1; }
 let Latency = 2 in {
   def : WriteRes<WritePreLd, [M7UnitLoad]>;
   def : WriteRes<WriteST,    [M7UnitStore]>;
 }

 // Branches.  All branch writes share one resource and a latency of two.
 let Latency = 2 in {
   def : WriteRes<WriteBr,    [M7UnitBranch]>;
   def : WriteRes<WriteBrL,   [M7UnitBranch]>;
   def : WriteRes<WriteBrTbl, [M7UnitBranch]>;
 }

 // Noop.  Uses no resource and has zero latency.
 let Latency = 0 in
 def : WriteRes<WriteNoop, []>;

 //===---------------------------------------------------------------------===//
 // Sched definitions for floating-point instructions
 //
 // Floating point conversions and moves.
 let Latency = 3 in {
   def : WriteRes<WriteFPCVT, [M7UnitVFP, M7UnitVPort]>;
   def : WriteRes<WriteFPMOV, [M7UnitVPort]>;
 }

 // The FP pipeline has a latency of 3 cycles.
 // ALU operations (32/64-bit).  These go down the FP pipeline.  The 64-bit
 // form lists M7UnitVPort twice and must begin an issue group.
 def : WriteRes<WriteFPALU32, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; }
 let BeginGroup = 1 in
 def : WriteRes<WriteFPALU64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
   let Latency = 4;
 }

 // Multiplication
 def : WriteRes<WriteFPMUL32, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; }
 let BeginGroup = 1 in
 def : WriteRes<WriteFPMUL64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
   let Latency = 7;
 }

 // Multiply-accumulate.  FPMAC goes down the FP Pipeline.
 def : WriteRes<WriteFPMAC32, [M7UnitVFP, M7UnitVPort]> { let Latency = 6; }
 let BeginGroup = 1 in
 def : WriteRes<WriteFPMAC64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
   let Latency = 11;
 }

 // Division.  Effective scheduling latency is 3, though real latency is
 // larger.
 def : WriteRes<WriteFPDIV32, [M7UnitVFP, M7UnitVPort]> { let Latency = 16; }
 let BeginGroup = 1 in
 def : WriteRes<WriteFPDIV64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
   let Latency = 30;
 }

 // Square-root.  Effective scheduling latency is 3; real latency is larger.
 def : WriteRes<WriteFPSQRT32, [M7UnitVFP, M7UnitVPort]> { let Latency = 16; }
 let BeginGroup = 1 in
 def : WriteRes<WriteFPSQRT64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
   let Latency = 30;
 }

 // Write that occupies an ALU together with M7UnitShift2; no other field is
 // overridden.
 def M7WriteShift2 : SchedWriteRes<[M7UnitALU, M7UnitShift2]>;

 // Not used for M7, but needing definitions anyway.  Each of these writes is
 // given an empty resource list.
 foreach UnusedWrite = [WriteVLD1, WriteVLD2, WriteVLD3, WriteVLD4,
                        WriteVST1, WriteVST2, WriteVST3, WriteVST4] in
   def : WriteRes<UnusedWrite, []>;

 // Resource-free marker writes.  They add no micro-ops; each one only sets
 // an issue constraint (SingleIssue or BeginGroup).
 let NumMicroOps = 0 in {
   def M7SingleIssue : SchedWriteRes<[]> { let SingleIssue = 1; }
   def M7Slot0Only   : SchedWriteRes<[]> { let BeginGroup = 1; }
 }

 // The pipeline stage at which an operand must be ready depends on where it
 // is consumed.  These reads need no adjustment...
 foreach PlainRead = [ReadALUsr, ReadMUL, ReadALU, ReadFPMUL] in
   def : ReadAdvance<PlainRead, 0>;
 // ...while accumulator operands are read later.
 def : ReadAdvance<ReadMAC, 1>;
 def : ReadAdvance<ReadFPMAC, 3>;

 // Model-specific read advances.
 // NOTE(review): M7Read_EX2/M7Read_EX3 are annotated one stage later than
 // their names suggest; the annotations are kept as found.
 def M7Read_ISS : SchedReadAdvance<-1>;  // operands needed at EX1
 def M7Read_EX2 : SchedReadAdvance<1>;   // operands needed at EX3
 def M7Read_EX3 : SchedReadAdvance<2>;   // operands needed at EX4

 // Non general purpose instructions may not be dual issued. These
 // use both issue units.
 def M7NonGeneralPurpose : SchedWriteRes<[]> {
   let SingleIssue = 1;
   // Assume that these will go down the main ALU pipeline.
   // In reality, many look likely to stall the whole pipeline.
   let Latency = 3;
 }

 // List the non general purpose instructions.
 def : InstRW<[M7NonGeneralPurpose],
              (instregex "t2MRS", "tSVC", "tBKPT", "t2MSR", "t2DMB", "t2DSB",
                         "t2ISB", "t2HVC", "t2SMC", "t2UDF", "ERET", "tHINT",
                         "t2HINT", "t2CLREX", "BUNDLE")>;

 //===---------------------------------------------------------------------===//
 // Sched definitions for load/store
 //
 // Mark whether the loads/stores must be single-issue
 // Address operands are needed earlier
 // Data operands are needed later

 // Resource-free latency markers used in the load/store entries below.
 let NumMicroOps = 0 in {
   // Update is bypassable out of EX1.
   def M7BaseUpdate   : SchedWriteRes<[]> { let Latency = 0; }
   def M7LoadLatency1 : SchedWriteRes<[]> { let Latency = 1; }
 }
 // Load that occupies a load unit and has a latency of two cycles.
 def M7SlowLoad : SchedWriteRes<[M7UnitLoad]> { let Latency = 2; }

 // Byte and half-word loads should have greater latency than other loads.
 // So should load exclusive.

 def : InstRW<[M7SlowLoad], (instregex "t2LDR(B|H|SB|SH)pc")>;
 def : InstRW<[M7SlowLoad, M7Read_ISS],
              (instregex "t2LDR(B|H|SB|SH)T",
                         "t2LDR(B|H|SB|SH)i",
                         "tLDR(B|H)i")>;
 def : InstRW<[M7SlowLoad, M7Read_ISS, M7Read_ISS],
              (instregex "t2LDR(B|H|SB|SH)s",
                         "tLDR(B|H)r",
                         "tLDR(SB|SH)")>;
 def : InstRW<[M7SlowLoad, M7BaseUpdate, M7Read_ISS],
              (instregex "t2LDR(B|H|SB|SH)_(POST|PRE)")>;

 // Exclusive loads/stores cannot be dual-issued.  The loads are tagged with
 // M7Slot0Only (BeginGroup), the stores with M7SingleIssue.
 def : InstRW<[WriteLd, M7Slot0Only, M7Read_ISS], (instregex "t2LDREX$")>;
 def : InstRW<[M7SlowLoad, M7Slot0Only, M7Read_ISS],
              (instregex "t2LDREX(B|H)")>;
 def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_ISS],
              (instregex "t2STREX(B|H)?$")>;

 // Load/store multiples cannot be dual-issued.  Note that default scheduling
 // occurs around read/write times of individual registers in the list; read
 // time for STM cannot be overridden because it is a variadic source operand.

 // Forms without base writeback.
 def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS],
              (instregex "(t|t2)LDM(DB|IA)$")>;
 def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS],
              (instregex "(t|t2)STM(DB|IA)$")>;
 // Forms with base writeback, plus push/pop; M7BaseUpdate is listed first.
 def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS],
              (instregex "(t|t2)LDM(DB|IA)_UPD$", "tPOP")>;
 def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS],
              (instregex "(t|t2)STM(DB|IA)_UPD$", "tPUSH")>;

 // Load/store doubles cannot be dual-issued.
 //
 // The writes in an InstRW list are matched to the instruction's defs in
 // order.  M7BaseUpdate (latency 0) therefore has to sit at the position of
 // the written-back base register, not after the resource-free
 // M7SingleIssue marker, or the base update is given M7SingleIssue's latency.
 // NOTE(review): assumes t2LDRD_(PRE|POST) define Rt, Rt2 and then the
 // written-back base, in that order -- confirm against ARMInstrThumb2.td.

 def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue,
               M7Read_EX2, M7Read_EX2, M7Read_ISS],
       (instregex "t2STRD_(PRE|POST)")>;
 def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_EX2, M7Read_ISS],
       (instregex "t2STRDi")>;
 def : InstRW<[WriteLd, M7LoadLatency1, M7BaseUpdate, M7SingleIssue,
               M7Read_ISS],
       (instregex "t2LDRD_(PRE|POST)")>;
 def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7Read_ISS],
       (instregex "t2LDRDi")>;

 // Word load / preload.
 //
 // The optional "W" in the preload patterns is upper case in both the
 // immediate and the register-offset form; a lower-case "(w)?" can never
 // match a name of the form t2PLDW*.
 // NOTE(review): assumes the register-offset PLDW opcode is named t2PLDWs --
 // confirm against ARMInstrThumb2.td.
 def : InstRW<[WriteLd],
       (instregex "t2LDRpc", "t2PL[DI]pci", "tLDRpci")>;
 def : InstRW<[WriteLd, M7Read_ISS],
       (instregex "t2LDR(i|T)", "t2PL[DI](W)?i", "tLDRi", "tLDRspi")>;
 def : InstRW<[WriteLd, M7Read_ISS, M7Read_ISS],
       (instregex "t2LDRs", "t2PL[DI](W)?s", "tLDRr")>;
 def : InstRW<[WriteLd, M7BaseUpdate, M7Read_ISS],
       (instregex "t2LDR_(POST|PRE)")>;

 // Stores.  Writeback forms list M7BaseUpdate first; every form uses
 // M7Read_EX2 for its first read and M7Read_ISS for the remaining reads.
 def : InstRW<[M7BaseUpdate, WriteST, M7Read_EX2, M7Read_ISS],
              (instregex "t2STR(B|H)?_(POST|PRE)")>;
 def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS, M7Read_ISS],
              (instregex "t2STR(B|H)?s$",
                         "tSTR(B|H)?r$")>;
 def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS],
              (instregex "t2STR(B|H)?(i|T)",
                         "tSTR(B|H)?i$",
                         "tSTRspi")>;

 // TBB/TBH - single-issue only; takes two cycles to issue

 let SingleIssue = 1 in
 def M7TableLoad : SchedWriteRes<[M7UnitLoad]> { let NumMicroOps = 2; }

 def : InstRW<[M7TableLoad, M7Read_ISS, M7Read_ISS],
              (instregex "t2TB")>;

 // VFP loads and stores.  The double-precision writes list M7UnitVPort twice
 // and are single-issue.

 def M7LoadSP  : SchedWriteRes<[M7UnitLoad, M7UnitVPort]> { let Latency = 1; }
 let SingleIssue = 1 in
 def M7LoadDP  : SchedWriteRes<[M7UnitLoad, M7UnitVPort, M7UnitVPort]> {
   let Latency = 2;
 }
 def M7StoreSP : SchedWriteRes<[M7UnitStore, M7UnitVPort]>;
 let SingleIssue = 1 in
 def M7StoreDP : SchedWriteRes<[M7UnitStore, M7UnitVPort, M7UnitVPort]>;

 def : InstRW<[M7LoadSP, M7Read_ISS],
              (instregex "VLDR(S|H)$")>;
 def : InstRW<[M7LoadDP, M7Read_ISS],
              (instregex "VLDRD$")>;
 def : InstRW<[M7StoreSP, M7Read_EX3, M7Read_ISS],
              (instregex "VSTR(S|H)$")>;
 def : InstRW<[M7StoreDP, M7Read_EX3, M7Read_ISS],
              (instregex "VSTRD$")>;

 // Load/store multiples cannot be dual-issued.

 // Forms without base writeback.
 def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS],
              (instregex "VLDM(S|D|Q)(DB|IA)$")>;
 def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS],
              (instregex "VSTM(S|D|Q)(DB|IA)$")>;
 // Forms with base writeback; M7BaseUpdate is listed first.
 def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS],
              (instregex "VLDM(S|D|Q)(DB|IA)_UPD$")>;
 def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS],
              (instregex "VSTM(S|D|Q)(DB|IA)_UPD$")>;

 //===---------------------------------------------------------------------===//
 // Sched definitions for ALU
 //

 // Shifted ALU operands are read a cycle early.  This advance of -1 is
 // restricted to values produced by WriteLd or M7LoadLatency1.
 def M7Ex1ReadNoFastBypass
     : SchedReadAdvance<-1, [WriteLd, M7LoadLatency1]>;

 def : InstRW<
     [WriteALUsi, M7Ex1ReadNoFastBypass, M7Read_ISS],
     (instregex "t2(ADC|ADDS|ADD|BIC|EOR|ORN|ORR|RSBS|RSB|SBC|SUBS)rs$",
                "t2(SUB|CMP|CMNz|TEQ|TST)rs$",
                "t2MOVsr(a|l)")>;
 def : InstRW<[WriteALUsi, M7Read_ISS], (instregex "t2MVNs")>;

 // Treat pure shift operations (except for RRX) as if they used the EX1
 // shifter but have timing as if they used the EX2 shifter as they usually
 // can choose the EX2 shifter when needed.  Will miss a few dual-issue cases,
 // but the results prove to be better than trying to get them exact.

 def : InstRW<[M7WriteShift2, M7Read_ISS],
              (instregex "t2RRX$")>;
 def : InstRW<[WriteALUsi],
              (instregex "(t|t2)(LSL|LSR|ASR|ROR)")>;

 // Instructions that use the shifter, but have normal timing.

 def : InstRW<[WriteALUsi, M7Slot0Only],
              (instregex "t2(BFC|BFI)$")>;

 // Instructions which are slot zero only but otherwise normal.

 def : InstRW<[WriteALU, M7Slot0Only],
              (instregex "t2CLZ")>;

 // MAC operations that don't have SchedRW set.

 def : InstRW<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC],
              (instregex "t2SML[AS]D")>;

 // Divides are special because they stall for their latency, and so look like
 // a single-cycle as far as scheduling opportunities go.  By putting WriteALU
 // first, we make the operand latency 1, but keep the instruction latency 7.

 def : InstRW<[WriteALU, WriteDIV],
              (instregex "t2(S|U)DIV")>;

 // DSP extension operations.  Every one of these writes begins an issue
 // group; they differ in latency and in whether M7UnitShift1 is occupied.

 let BeginGroup = 1 in {
   def M7WriteSIMD1   : SchedWriteRes<[M7UnitSIMD, M7UnitALU]> {
     let Latency = 1;
   }
   def M7WriteSIMD2   : SchedWriteRes<[M7UnitSIMD, M7UnitALU]> {
     let Latency = 2;
   }
   def M7WriteShSIMD1 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> {
     let Latency = 1;
   }
   def M7WriteShSIMD0 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> {
     let Latency = 0;      // Bypassable out of EX1
   }
   def M7WriteShSIMD2 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> {
     let Latency = 2;
   }
 }

 // Instruction assignments for the DSP extension writes.
 def : InstRW<[M7WriteShSIMD2, M7Read_ISS], (instregex "t2(S|U)SAT")>;
 def : InstRW<[M7WriteSIMD1, ReadALU], (instregex "(t|t2)(S|U)XT(B|H)")>;
 def : InstRW<
     [M7WriteSIMD1, ReadALU, ReadALU],
     (instregex "t2(S|SH|U|UH)(ADD16|ADD8|ASX|SAX|SUB16|SUB8)", "t2SEL")>;
 def : InstRW<
     [M7WriteSIMD2, ReadALU, ReadALU],
     (instregex "t2(Q|UQ)(ADD|ASX|SAX|SUB)", "t2USAD8")>;
 def : InstRW<
     [M7WriteShSIMD2, M7Read_ISS, M7Read_ISS],
     (instregex "t2QD(ADD|SUB)")>;
 def : InstRW<[M7WriteShSIMD0, M7Read_ISS], (instregex "t2(RBIT|REV)", "tREV")>;
 def : InstRW<[M7WriteShSIMD1, M7Read_ISS], (instregex "t2(SBFX|UBFX)")>;
 def : InstRW<
     [M7WriteShSIMD1, ReadALU, M7Read_ISS],
     (instregex "t2PKH(BT|TB)", "t2(S|U)XTA")>;
 def : InstRW<
     [M7WriteSIMD2, ReadALU, ReadALU, M7Read_EX2],
     (instregex "t2USADA8")>;

 // MSR/MRS are scheduled with the non general purpose write.
 def : InstRW<[M7NonGeneralPurpose],
              (instregex "MSR", "MRS")>;

 //===---------------------------------------------------------------------===//
 // Sched definitions for FP operations
 //

 // Effective scheduling latency is really 3 for nearly all FP operations,
 // even if their true latency is higher.  Both helper writes add no
 // micro-ops; the second one also occupies an M7UnitVPort.
 let Latency = 3, NumMicroOps = 0 in {
   def M7WriteVFPLatOverride : SchedWriteRes<[]>;
   def M7WriteVFPExtraVPort  : SchedWriteRes<[M7UnitVPort]>;
 }

 // Instructions which are missing default schedules.
 def : InstRW<
     [WriteFPALU32],
     (instregex "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)S$")>;
 def : InstRW<
     [M7WriteVFPLatOverride, WriteFPALU64],
     (instregex "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)D$")>;

 // VCMP.  Both writes have zero latency; the double-precision one lists
 // M7UnitVPort twice and begins an issue group.
 let Latency = 0 in {
   def M7WriteVCMPS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]>;
   def M7WriteVCMPD : SchedWriteRes<[M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
     let BeginGroup = 1;
   }
 }
 def : InstRW<[M7WriteVCMPS], (instregex "VCMPS$")>;
 def : InstRW<[M7WriteVCMPD], (instregex "VCMPD$")>;

 // VMRS/VMSR.  Both are single-issue.
 let SingleIssue = 1 in {
   def M7VMRS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]>;
   def M7VMSR : SchedWriteRes<[M7UnitVFP, M7UnitVPort]>;
 }
 def : InstRW<[M7VMRS], (instregex "FMSTAT")>;
 def : InstRW<[M7VMSR], (instregex "VMSR")>;

 // VSEL cannot bypass in its implied $cpsr operand; model as earlier read
 def : InstRW<
     [WriteFPALU32, M7Slot0Only, ReadALU, ReadALU, M7Read_ISS],
     (instregex "VSEL.*S$")>;
 def : InstRW<
     [M7WriteVFPLatOverride, WriteFPALU64, M7Slot0Only,
      ReadALU, ReadALU, M7Read_ISS],
     (instregex "VSEL.*D$")>;

 // VMOV.  The double-precision and core-register forms add
 // M7WriteVFPExtraVPort and an issue constraint.
 def : InstRW<[WriteFPMOV], (instregex "VMOV(H|S)$", "FCONST(H|S)")>;
 def : InstRW<
     [WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only],
     (instregex "VMOVD$")>;
 def : InstRW<
     [WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only],
     (instregex "FCONSTD")>;
 def : InstRW<
     [WriteFPMOV, M7WriteVFPExtraVPort, M7SingleIssue],
     (instregex "VMOV(DRR|RRD|RRS|SRR)")>;

 // Larger-latency overrides.  M7WriteVFPLatOverride is listed first in each
 // entry, ahead of the write carrying the larger latency.

 def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV32],
              (instregex "VDIVS")>;
 def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV64],
              (instregex "VDIVD")>;
 def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT32],
              (instregex "VSQRTS")>;
 def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT64],
              (instregex "VSQRTD")>;
 def : InstRW<[M7WriteVFPLatOverride, WriteFPMUL64],
              (instregex "V(MUL|NMUL)D")>;
 def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64],
              (instregex "V(ADD|SUB)D")>;

 // Multiply-accumulate.  Chained SP timing is correct; rest need overrides
 // Double-precision chained MAC stalls the pipeline behind it for 3 cycles,
 // making it appear to have 3 cycle latency for scheduling.

 def : InstRW<
     [M7WriteVFPLatOverride, WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL],
     (instregex "V(N)?ML(A|S)D$")>;

 // Single-precision fused MACs look like latency 5 with advance of 2.

 let NumMicroOps = 0 in
 def M7WriteVFPLatOverride5 : SchedWriteRes<[]> { let Latency = 5; }
 def M7ReadFPMAC2 : SchedReadAdvance<2>;

 def : InstRW<
     [M7WriteVFPLatOverride5, WriteFPMAC32,
      M7ReadFPMAC2, ReadFPMUL, ReadFPMUL],
     (instregex "VF(N)?M(A|S)S$")>;

 // Double-precision fused MAC stalls the pipeline behind it for 2 cycles,
 // making it appear to have 3 cycle latency for scheduling.

 def : InstRW<
     [M7WriteVFPLatOverride, WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL],
     (instregex "VF(N)?M(A|S)D$")>;

 }  // SchedModel = CortexM7Model
	//=- ARMScheduleM7.td - ARM Cortex-M7 Scheduling Definitions -*- tablegen -*-=//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the SchedRead/Write data for the ARM Cortex-M7 processor.
	//
	//===----------------------------------------------------------------------===//

	// Machine model record for the Cortex-M7.  Fields that are not assigned
	// below keep whatever SchedMachineModel provides.
	def CortexM7Model : SchedMachineModel {
	  // The Cortex-M7 is in-order.
	  let MicroOpBufferSize = 0;
	  // Dual issue for most instructions.
	  let IssueWidth = 2;
	  // Best case for load-use case.
	  let LoadLatency = 2;
	  // Mispredict cost for forward branches is 6, but 4 works better.
	  let MispredictPenalty = 4;
	  let CompleteModel = 0;
	}

	//===--------------------------------------------------------------------===//
	// The Cortex-M7 has two ALU, two LOAD, a STORE, a MAC, a BRANCH and a VFP
	// pipe. The stages relevant to scheduling are as follows:
	//
	//   EX1: address generation  shifts
	//   EX2: fast load data      ALUs                  FP operation
	//   EX3: slow load data      integer writeback     FP operation
	//   EX4: store data                                FP writeback
	//
	// There are shifters in both EX1 and EX2, and some instructions can be
	// flexibly allocated between them. EX2 is used as the "zero" point
	// for scheduling, so simple ALU operations executing in EX2 will have
	// ReadAdvance<0> (the default) for their source operands and Latency = 1.

	// Processor resources referenced by the write definitions of this model.
	// Every resource except M7UnitALU overrides BufferSize to 0.
	let BufferSize = 0 in {
	  def M7UnitLoad   : ProcResource<2>;
	  def M7UnitStore  : ProcResource<1>;
	}
	def M7UnitALU    : ProcResource<2>;
	let BufferSize = 0 in {
	  def M7UnitShift1 : ProcResource<1>;
	  def M7UnitShift2 : ProcResource<1>;
	  def M7UnitMAC    : ProcResource<1>;
	  def M7UnitBranch : ProcResource<1>;
	  def M7UnitVFP    : ProcResource<1>;
	  def M7UnitVPort  : ProcResource<2>;
	  def M7UnitSIMD   : ProcResource<1>;
	}

	//===---------------------------------------------------------------------===//
	// Subtarget-specific SchedWrite types which map ProcResources and set latency.

	let SchedModel = CortexM7Model in {

	// Basic ALU writes, with and without shifts.  All have a latency of one
	// cycle; the shifted forms additionally occupy M7UnitShift1.
	def : WriteRes<WriteALU,    [M7UnitALU]>               { let Latency = 1; }
	def : WriteRes<WriteALUsi,  [M7UnitALU, M7UnitShift1]> { let Latency = 1; }
	def : WriteRes<WriteALUsr,  [M7UnitALU, M7UnitShift1]> { let Latency = 1; }
	def : WriteRes<WriteALUSsr, [M7UnitALU, M7UnitShift1]> { let Latency = 1; }

	// Compares.  The plain form takes one cycle; the shifted forms take two
	// and also occupy M7UnitShift1.
	def : WriteRes<WriteCMP, [M7UnitALU]> { let Latency = 1; }
	let Latency = 2 in {
	  def : WriteRes<WriteCMPsi, [M7UnitALU, M7UnitShift1]>;
	  def : WriteRes<WriteCMPsr, [M7UnitALU, M7UnitShift1]>;
	}

	// Multiplies.  Every write has a latency of two cycles; the high half of
	// a 64-bit result uses no resource and adds no micro-op.
	def : WriteRes<WriteMUL16,   [M7UnitMAC]> { let Latency = 2; }
	def : WriteRes<WriteMUL32,   [M7UnitMAC]> { let Latency = 2; }
	def : WriteRes<WriteMUL64Lo, [M7UnitMAC]> { let Latency = 2; }
	def : WriteRes<WriteMUL64Hi, []> {
	  let Latency = 2;
	  let NumMicroOps = 0;
	}

	// Multiply-accumulates.  Same latency and the same treatment of the high
	// half as the multiplies.
	def : WriteRes<WriteMAC16,   [M7UnitMAC]> { let Latency = 2; }
	def : WriteRes<WriteMAC32,   [M7UnitMAC]> { let Latency = 2; }
	def : WriteRes<WriteMAC64Lo, [M7UnitMAC]> { let Latency = 2; }
	def : WriteRes<WriteMAC64Hi, []> {
	  let Latency = 2;
	  let NumMicroOps = 0;
	}

	// Divisions.
	// These cannot be dual-issued with any instructions.
	let SingleIssue = 1 in
	def : WriteRes<WriteDIV, [M7UnitALU]> { let Latency = 7; }

	// Loads/Stores.  Plain loads take one cycle; preloads and stores take two.
	def : WriteRes<WriteLd, [M7UnitLoad]> { let Latency = 1; }
	let Latency = 2 in {
	  def : WriteRes<WritePreLd, [M7UnitLoad]>;
	  def : WriteRes<WriteST,    [M7UnitStore]>;
	}

	// Branches.  All branch writes share one resource and a latency of two.
	let Latency = 2 in {
	  def : WriteRes<WriteBr,    [M7UnitBranch]>;
	  def : WriteRes<WriteBrL,   [M7UnitBranch]>;
	  def : WriteRes<WriteBrTbl, [M7UnitBranch]>;
	}

	// Noop.  Uses no resource and has zero latency.
	let Latency = 0 in
	def : WriteRes<WriteNoop, []>;

	//===---------------------------------------------------------------------===//
	// Sched definitions for floating-point instructions
	//
	// Floating point conversions and moves.
	let Latency = 3 in {
	  def : WriteRes<WriteFPCVT, [M7UnitVFP, M7UnitVPort]>;
	  def : WriteRes<WriteFPMOV, [M7UnitVPort]>;
	}

	// The FP pipeline has a latency of 3 cycles.
	// ALU operations (32/64-bit).  These go down the FP pipeline.  Each 64-bit
	// write below lists M7UnitVPort twice and must begin an issue group.
	def : WriteRes<WriteFPALU32, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; }
	let BeginGroup = 1 in
	def : WriteRes<WriteFPALU64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
	  let Latency = 4;
	}

	// Multiplication
	def : WriteRes<WriteFPMUL32, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; }
	let BeginGroup = 1 in
	def : WriteRes<WriteFPMUL64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
	  let Latency = 7;
	}

	// Multiply-accumulate.  FPMAC goes down the FP Pipeline.
	def : WriteRes<WriteFPMAC32, [M7UnitVFP, M7UnitVPort]> { let Latency = 6; }
	let BeginGroup = 1 in
	def : WriteRes<WriteFPMAC64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
	  let Latency = 11;
	}

	// Division.  Effective scheduling latency is 3, though real latency is
	// larger.
	def : WriteRes<WriteFPDIV32, [M7UnitVFP, M7UnitVPort]> { let Latency = 16; }
	let BeginGroup = 1 in
	def : WriteRes<WriteFPDIV64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
	  let Latency = 30;
	}

	// Square-root.  Effective scheduling latency is 3; real latency is larger.
	def : WriteRes<WriteFPSQRT32, [M7UnitVFP, M7UnitVPort]> { let Latency = 16; }
	let BeginGroup = 1 in
	def : WriteRes<WriteFPSQRT64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
	  let Latency = 30;
	}

	// Write that occupies an ALU together with M7UnitShift2; no other field
	// is overridden.
	def M7WriteShift2 : SchedWriteRes<[M7UnitALU, M7UnitShift2]>;

	// Not used for M7, but needing definitions anyway.  Each of these writes
	// is given an empty resource list.
	foreach UnusedWrite = [WriteVLD1, WriteVLD2, WriteVLD3, WriteVLD4,
	                       WriteVST1, WriteVST2, WriteVST3, WriteVST4] in
	  def : WriteRes<UnusedWrite, []>;

	// Resource-free marker writes.  They add no micro-ops; each one only sets
	// an issue constraint (SingleIssue or BeginGroup).
	let NumMicroOps = 0 in {
	  def M7SingleIssue : SchedWriteRes<[]> { let SingleIssue = 1; }
	  def M7Slot0Only   : SchedWriteRes<[]> { let BeginGroup = 1; }
	}

	// The pipeline stage at which an operand must be ready depends on where
	// it is consumed.  These reads need no adjustment...
	foreach PlainRead = [ReadALUsr, ReadMUL, ReadALU, ReadFPMUL] in
	  def : ReadAdvance<PlainRead, 0>;
	// ...while accumulator operands are read later.
	def : ReadAdvance<ReadMAC, 1>;
	def : ReadAdvance<ReadFPMAC, 3>;

	// Model-specific read advances.
	// NOTE(review): M7Read_EX2/M7Read_EX3 are annotated one stage later than
	// their names suggest; the annotations are kept as found.
	def M7Read_ISS : SchedReadAdvance<-1>;  // operands needed at EX1
	def M7Read_EX2 : SchedReadAdvance<1>;   // operands needed at EX3
	def M7Read_EX3 : SchedReadAdvance<2>;   // operands needed at EX4

	// Non general purpose instructions may not be dual issued. These
	// use both issue units.
	def M7NonGeneralPurpose : SchedWriteRes<[]> {
	  let SingleIssue = 1;
	  // Assume that these will go down the main ALU pipeline.
	  // In reality, many look likely to stall the whole pipeline.
	  let Latency = 3;
	}

	// List the non general purpose instructions.
	def : InstRW<[M7NonGeneralPurpose],
	             (instregex "t2MRS", "tSVC", "tBKPT", "t2MSR", "t2DMB", "t2DSB",
	                        "t2ISB", "t2HVC", "t2SMC", "t2UDF", "ERET", "tHINT",
	                        "t2HINT", "t2CLREX", "BUNDLE")>;

	//===---------------------------------------------------------------------===//
	// Sched definitions for load/store
	//
	// Mark whether the loads/stores must be single-issue
	// Address operands are needed earlier
	// Data operands are needed later

	// Resource-free latency markers used in the load/store entries below.
	let NumMicroOps = 0 in {
	  // Update is bypassable out of EX1.
	  def M7BaseUpdate   : SchedWriteRes<[]> { let Latency = 0; }
	  def M7LoadLatency1 : SchedWriteRes<[]> { let Latency = 1; }
	}
	// Load that occupies a load unit and has a latency of two cycles.
	def M7SlowLoad : SchedWriteRes<[M7UnitLoad]> { let Latency = 2; }

	// Byte and half-word loads should have greater latency than other loads.
	// So should load exclusive.
	//
	// The alternations use a plain '|': "\|" is not a valid escape in a
	// TableGen string literal.

	def : InstRW<[M7SlowLoad],
	      (instregex "t2LDR(B|H|SB|SH)pc")>;
	def : InstRW<[M7SlowLoad, M7Read_ISS],
	      (instregex "t2LDR(B|H|SB|SH)T", "t2LDR(B|H|SB|SH)i",
	                 "tLDR(B|H)i")>;
	def : InstRW<[M7SlowLoad, M7Read_ISS, M7Read_ISS],
	      (instregex "t2LDR(B|H|SB|SH)s", "tLDR(B|H)r", "tLDR(SB|SH)")>;
	def : InstRW<[M7SlowLoad, M7BaseUpdate, M7Read_ISS],
	      (instregex "t2LDR(B|H|SB|SH)_(POST|PRE)")>;

	// Exclusive loads/stores cannot be dual-issued.  The loads are tagged with
	// M7Slot0Only (BeginGroup), the stores with M7SingleIssue.
	// The alternations use a plain '|': "\|" is not a valid escape in a
	// TableGen string literal.
	def : InstRW<[WriteLd, M7Slot0Only, M7Read_ISS],
	      (instregex "t2LDREX$")>;
	def : InstRW<[M7SlowLoad, M7Slot0Only, M7Read_ISS],
	      (instregex "t2LDREX(B|H)")>;
	def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_ISS],
	      (instregex "t2STREX(B|H)?$")>;

	// Load/store multiples cannot be dual-issued.  Note that default scheduling
	// occurs around read/write times of individual registers in the list; read
	// time for STM cannot be overridden because it is a variadic source operand.
	//
	// The alternations use a plain '|': "\|" is not a valid escape in a
	// TableGen string literal.

	def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS],
	      (instregex "(t|t2)LDM(DB|IA)$")>;
	def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS],
	      (instregex "(t|t2)STM(DB|IA)$")>;
	def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS],
	      (instregex "(t|t2)LDM(DB|IA)_UPD$", "tPOP")>;
	def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS],
	      (instregex "(t|t2)STM(DB|IA)_UPD$", "tPUSH")>;

	// Load/store doubles cannot be dual-issued.
	//
	// The writes in an InstRW list are matched to the instruction's defs in
	// order.  M7BaseUpdate (latency 0) therefore has to sit at the position
	// of the written-back base register, not after the resource-free
	// M7SingleIssue marker, or the base update is given M7SingleIssue's
	// latency.
	// NOTE(review): assumes t2LDRD_(PRE|POST) define Rt, Rt2 and then the
	// written-back base, in that order -- confirm against ARMInstrThumb2.td.
	// The alternations use a plain '|': "\|" is not a valid escape in a
	// TableGen string literal.

	def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue,
	              M7Read_EX2, M7Read_EX2, M7Read_ISS],
	      (instregex "t2STRD_(PRE|POST)")>;
	def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_EX2, M7Read_ISS],
	      (instregex "t2STRDi")>;
	def : InstRW<[WriteLd, M7LoadLatency1, M7BaseUpdate, M7SingleIssue,
	              M7Read_ISS],
	      (instregex "t2LDRD_(PRE|POST)")>;
	def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7Read_ISS],
	      (instregex "t2LDRDi")>;

	// Word load / preload.
	//
	// The optional "W" in the preload patterns is upper case in both the
	// immediate and the register-offset form; a lower-case "(w)?" can never
	// match a name of the form t2PLDW*.
	// NOTE(review): assumes the register-offset PLDW opcode is named
	// t2PLDWs -- confirm against ARMInstrThumb2.td.
	// The alternations use a plain '|': "\|" is not a valid escape in a
	// TableGen string literal.
	def : InstRW<[WriteLd],
	      (instregex "t2LDRpc", "t2PL[DI]pci", "tLDRpci")>;
	def : InstRW<[WriteLd, M7Read_ISS],
	      (instregex "t2LDR(i|T)", "t2PL[DI](W)?i", "tLDRi", "tLDRspi")>;
	def : InstRW<[WriteLd, M7Read_ISS, M7Read_ISS],
	      (instregex "t2LDRs", "t2PL[DI](W)?s", "tLDRr")>;
	def : InstRW<[WriteLd, M7BaseUpdate, M7Read_ISS],
	      (instregex "t2LDR_(POST|PRE)")>;

	// Stores.  Writeback forms list M7BaseUpdate first; every form uses
	// M7Read_EX2 for its first read and M7Read_ISS for the remaining reads.
	// The alternations use a plain '|': "\|" is not a valid escape in a
	// TableGen string literal.
	def : InstRW<[M7BaseUpdate, WriteST, M7Read_EX2, M7Read_ISS],
	      (instregex "t2STR(B|H)?_(POST|PRE)")>;
	def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS, M7Read_ISS],
	      (instregex "t2STR(B|H)?s$", "tSTR(B|H)?r$")>;
	def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS],
	      (instregex "t2STR(B|H)?(i|T)", "tSTR(B|H)?i$", "tSTRspi")>;

	// TBB/TBH - single-issue only; takes two cycles to issue

	let SingleIssue = 1 in
	def M7TableLoad : SchedWriteRes<[M7UnitLoad]> { let NumMicroOps = 2; }

	def : InstRW<[M7TableLoad, M7Read_ISS, M7Read_ISS],
	             (instregex "t2TB")>;

	// VFP loads and stores.  The double-precision writes list M7UnitVPort
	// twice and are single-issue.
	// The alternations use a plain '|': "\|" is not a valid escape in a
	// TableGen string literal.

	def M7LoadSP  : SchedWriteRes<[M7UnitLoad, M7UnitVPort]> { let Latency = 1; }
	def M7LoadDP  : SchedWriteRes<[M7UnitLoad, M7UnitVPort, M7UnitVPort]> {
	  let Latency = 2;
	  let SingleIssue = 1;
	}
	def M7StoreSP : SchedWriteRes<[M7UnitStore, M7UnitVPort]>;
	def M7StoreDP : SchedWriteRes<[M7UnitStore, M7UnitVPort, M7UnitVPort]> {
	  let SingleIssue = 1;
	}

	def : InstRW<[M7LoadSP, M7Read_ISS],                 (instregex "VLDR(S|H)$")>;
	def : InstRW<[M7LoadDP, M7Read_ISS],                 (instregex "VLDRD$")>;
	def : InstRW<[M7StoreSP, M7Read_EX3, M7Read_ISS],    (instregex "VSTR(S|H)$")>;
	def : InstRW<[M7StoreDP, M7Read_EX3, M7Read_ISS],    (instregex "VSTRD$")>;

	// Load/store multiples cannot be dual-issued.  The writeback (_UPD) forms
	// list M7BaseUpdate first.
	// The alternations use a plain '|': "\|" is not a valid escape in a
	// TableGen string literal.

	def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS],
	      (instregex "VLDM(S|D|Q)(DB|IA)$")>;
	def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS],
	      (instregex "VSTM(S|D|Q)(DB|IA)$")>;
	def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS],
	      (instregex "VLDM(S|D|Q)(DB|IA)_UPD$")>;
	def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS],
	      (instregex "VSTM(S|D|Q)(DB|IA)_UPD$")>;

	//===---------------------------------------------------------------------===//
	// Sched definitions for ALU
	//

	// Shifted ALU operands are read a cycle early.  This advance of -1 is
	// restricted to values produced by WriteLd or M7LoadLatency1.
	// The alternations use a plain '|': "\|" is not a valid escape in a
	// TableGen string literal.
	def M7Ex1ReadNoFastBypass : SchedReadAdvance<-1, [WriteLd, M7LoadLatency1]>;

	def : InstRW<[WriteALUsi, M7Ex1ReadNoFastBypass, M7Read_ISS],
	             (instregex "t2(ADC|ADDS|ADD|BIC|EOR|ORN|ORR|RSBS|RSB|SBC|SUBS)rs$",
	                        "t2(SUB|CMP|CMNz|TEQ|TST)rs$",
	                        "t2MOVsr(a|l)")>;
	def : InstRW<[WriteALUsi, M7Read_ISS],
	             (instregex "t2MVNs")>;

	// Treat pure shift operations (except for RRX) as if they used the EX1
	// shifter but have timing as if they used the EX2 shifter as they usually
	// can choose the EX2 shifter when needed. Will miss a few dual-issue cases,
	// but the results prove to be better than trying to get them exact.

	def : InstRW<[M7WriteShift2, M7Read_ISS], (instregex "t2RRX$")>;
	def : InstRW<[WriteALUsi], (instregex "(t\|t2)(LSL\|LSR\|ASR\|ROR)")>;

	// Instructions that use the shifter, but have normal timing.  They are also
	// tagged M7Slot0Only.

	def : InstRW<[WriteALUsi, M7Slot0Only], (instregex "t2(BFC|BFI)$")>;

	// Slot-zero-only instructions whose timing is otherwise that of a plain
	// ALU operation.

	def : InstRW<[WriteALU, M7Slot0Only],
	      (instregex "t2CLZ")>;

	// MAC operations that don't have SchedRW set: give them the 32-bit MAC
	// write with two multiply reads followed by the accumulator read.

	def : InstRW<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC],
	      (instregex "t2SML[AS]D")>;

	// Divides are special because they stall for their latency, and so look like
	// a single-cycle operation as far as scheduling opportunities go. By putting
	// WriteALU first, we make the operand latency 1, but keep the instruction
	// latency 7.

	def : InstRW<[WriteALU, WriteDIV], (instregex "t2(S|U)DIV")>;

	// DSP extension operations

	// SIMD-unit write with latency 1; books M7UnitSIMD and M7UnitALU and is
	// flagged BeginGroup.
	let Latency = 1, BeginGroup = 1 in
	def M7WriteSIMD1 : SchedWriteRes<[M7UnitSIMD, M7UnitALU]>;
	// SIMD-unit write with latency 2; books M7UnitSIMD and M7UnitALU and is
	// flagged BeginGroup.
	let Latency = 2, BeginGroup = 1 in
	def M7WriteSIMD2 : SchedWriteRes<[M7UnitSIMD, M7UnitALU]>;
	// SIMD-unit write that also books M7UnitShift1; latency 1, flagged
	// BeginGroup.
	let Latency = 1, BeginGroup = 1 in
	def M7WriteShSIMD1 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]>;
	// SIMD-unit write that also books M7UnitShift1; flagged BeginGroup.
	// Latency is 0 because the result is bypassable out of EX1.
	let Latency = 0, BeginGroup = 1 in
	def M7WriteShSIMD0 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]>;
	// SIMD-unit write that also books M7UnitShift1; latency 2, flagged
	// BeginGroup.
	let Latency = 2, BeginGroup = 1 in
	def M7WriteShSIMD2 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]>;

	// Bind the DSP-extension opcodes to the SIMD write resources defined above.
	// Each record pairs one write with the per-operand reads for that opcode
	// group.

	// Saturate.
	def : InstRW<[M7WriteShSIMD2, M7Read_ISS],
	      (instregex "t2(S|U)SAT")>;
	// Sign/zero extend.
	def : InstRW<[M7WriteSIMD1, ReadALU],
	      (instregex "(t|t2)(S|U)XT(B|H)")>;
	// Parallel add/subtract (plain and halving forms) and SEL.
	def : InstRW<[M7WriteSIMD1, ReadALU, ReadALU],
	      (instregex "t2(S|SH|U|UH)(ADD16|ADD8|ASX|SAX|SUB16|SUB8)",
	                 "t2SEL")>;
	// Saturating add/subtract and USAD8.
	def : InstRW<[M7WriteSIMD2, ReadALU, ReadALU],
	      (instregex "t2(Q|UQ)(ADD|ASX|SAX|SUB)", "t2USAD8")>;
	// Saturating doubling add/subtract.
	def : InstRW<[M7WriteShSIMD2, M7Read_ISS, M7Read_ISS],
	      (instregex "t2QD(ADD|SUB)")>;
	// Bit/byte reversal.
	def : InstRW<[M7WriteShSIMD0, M7Read_ISS],
	      (instregex "t2(RBIT|REV)", "tREV")>;
	// Bitfield extract.
	def : InstRW<[M7WriteShSIMD1, M7Read_ISS],
	      (instregex "t2(SBFX|UBFX)")>;
	// Pack halfword and extend-and-add.
	def : InstRW<[M7WriteShSIMD1, ReadALU, M7Read_ISS],
	      (instregex "t2PKH(BT|TB)", "t2(S|U)XTA")>;
	// USADA8: the third read is M7Read_EX2.
	def : InstRW<[M7WriteSIMD2, ReadALU, ReadALU, M7Read_EX2],
	      (instregex "t2USADA8")>;

	// MSR/MRS: both opcode patterns are bound to M7NonGeneralPurpose.
	def : InstRW<[M7NonGeneralPurpose],
	      (instregex "MSR", "MRS")>;

	//===---------------------------------------------------------------------===//
	// Sched definitions for FP operations
	//

	// Effective scheduling latency is really 3 for nearly all FP operations,
	// even if their true latency is higher.  This write books no resources and
	// contributes no micro-ops; the InstRW records below list it ahead of the
	// real FP write.
	let Latency = 3, NumMicroOps = 0 in
	def M7WriteVFPLatOverride : SchedWriteRes<[]>;
	// Zero-micro-op write with latency 3 that books one additional
	// M7UnitVPort.
	let Latency = 3, NumMicroOps = 0 in
	def M7WriteVFPExtraVPort : SchedWriteRes<[M7UnitVPort]>;

	// Instructions which are missing default schedules.  The single-precision
	// forms use WriteFPALU32 directly; the double-precision forms put
	// M7WriteVFPLatOverride ahead of WriteFPALU64.
	def : InstRW<[WriteFPALU32],
	      (instregex "V(ABS|CVT.|NEG|FP_VMAX.|FP_VMIN.|RINT.)S$")>;
	def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64],
	      (instregex "V(ABS|CVT.|NEG|FP_VMAX.|FP_VMIN.|RINT.)D$")>;

	// VCMP: both compare writes have latency 0.  The double-precision form
	// books M7UnitVPort twice and is additionally flagged BeginGroup.
	let Latency = 0 in {
	  def M7WriteVCMPS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]>;
	  let BeginGroup = 1 in
	  def M7WriteVCMPD : SchedWriteRes<[M7UnitVFP, M7UnitVPort, M7UnitVPort]>;
	}
	def : InstRW<[M7WriteVCMPS], (instregex "VCMPS$")>;
	def : InstRW<[M7WriteVCMPD], (instregex "VCMPD$")>;

	// VMRS/VMSR: both writes book M7UnitVFP and M7UnitVPort and are flagged
	// SingleIssue.  M7VMRS is bound to the FMSTAT opcode pattern.
	let SingleIssue = 1 in {
	  def M7VMRS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]>;
	  def M7VMSR : SchedWriteRes<[M7UnitVFP, M7UnitVPort]>;
	}
	def : InstRW<[M7VMRS], (instregex "FMSTAT")>;
	def : InstRW<[M7VMSR], (instregex "VMSR")>;

	// VSEL cannot bypass in its implied $cpsr operand; model this as an earlier
	// read (the trailing M7Read_ISS).  Both precisions are tagged M7Slot0Only;
	// the double-precision form also takes the latency override.
	def : InstRW<[WriteFPALU32, M7Slot0Only,
	              ReadALU, ReadALU, M7Read_ISS],
	      (instregex "VSEL.*S$")>;
	def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64, M7Slot0Only,
	              ReadALU, ReadALU, M7Read_ISS],
	      (instregex "VSEL.*D$")>;

	// VMOV
	// Half/single-precision register moves and FP constants: plain WriteFPMOV.
	def : InstRW<[WriteFPMOV],
	      (instregex "VMOV(H|S)$", "FCONST(H|S)")>;
	// Double-precision move and constant: book an extra M7UnitVPort via
	// M7WriteVFPExtraVPort and tag as M7Slot0Only.
	def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only],
	      (instregex "VMOVD$", "FCONSTD")>;
	// Two-register moves: extra M7UnitVPort, tagged M7SingleIssue.
	def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7SingleIssue],
	      (instregex "VMOV(DRR|RRD|RRS|SRR)")>;

	// Larger-latency overrides.  Each record lists M7WriteVFPLatOverride ahead
	// of the operation's own FP write.

	def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV32], (instregex "VDIVS")>;
	def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV64], (instregex "VDIVD")>;
	def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT32], (instregex "VSQRTS")>;
	def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT64], (instregex "VSQRTD")>;
	def : InstRW<[M7WriteVFPLatOverride, WriteFPMUL64],
	      (instregex "V(MUL|NMUL)D")>;
	def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64],
	      (instregex "V(ADD|SUB)D")>;

	// Multiply-accumulate. Chained SP timing is correct; the rest need overrides.
	// Double-precision chained MAC stalls the pipeline behind it for 3 cycles,
	// making it appear to have 3 cycle latency for scheduling.

	def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64,
	              ReadFPMAC, ReadFPMUL, ReadFPMUL],
	      (instregex "V(N)?ML(A|S)D$")>;

	// Single-precision fused MACs look like latency 5 with advance of 2.
	// The write below supplies the latency of 5 while booking no resources and
	// no micro-ops; the read supplies the advance of 2.

	let Latency = 5, NumMicroOps = 0 in
	def M7WriteVFPLatOverride5 : SchedWriteRes<[]>;

	def M7ReadFPMAC2 : SchedReadAdvance<2>;

	// Single-precision fused multiply-accumulate: latency-5 override ahead of
	// WriteFPMAC32, with M7ReadFPMAC2 as the first read and ReadFPMUL for the
	// remaining two.
	def : InstRW<[M7WriteVFPLatOverride5, WriteFPMAC32,
	              M7ReadFPMAC2, ReadFPMUL, ReadFPMUL],
	      (instregex "VF(N)?M(A|S)S$")>;

	// Double-precision fused MAC stalls the pipeline behind it for 2 cycles,
	// making it appear to have 3 cycle latency for scheduling.

	def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64,
	              ReadFPMAC, ReadFPMUL, ReadFPMUL],
	      (instregex "VF(N)?M(A|S)D$")>;

	} // SchedModel = CortexM7Model