| //=- X86ScheduleZnver4.td - X86 Znver4 Scheduling ------------*- tablegen -*-=// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file defines the machine model for Znver4 to support instruction |
| // scheduling and other instruction cost heuristics. |
| // Based on: |
| // * AMD Software Optimization Guide for AMD Family 19h Processors. |
| // https://www.amd.com/system/files/TechDocs/56665.zip |
| //===----------------------------------------------------------------------===// |
| |
// Machine model for AMD Zen 4 (family 19h). All numeric parameters below are
// taken from the AMD Software Optimization Guide (SOG) sections cited inline.
def Znver4Model : SchedMachineModel {
  // AMD SOG 19h, 2.9.6 Dispatch
  // The processor may dispatch up to 6 macro ops per cycle
  // into the execution engine.
  let IssueWidth = 6;
  // AMD SOG 19h, 2.10.3
  // The retire control unit (RCU) tracks the completion status of all
  // outstanding operations (integer, load/store, and floating-point) and is
  // the final arbiter for exception processing and recovery.
  // The unit can receive up to 6 macro ops dispatched per cycle and track up
  // to 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode.
  let MicroOpBufferSize = 320;
  // AMD SOG 19h, 2.9.1 Op Cache
  // The op cache is organized as an associative cache with 64 sets and 8 ways.
  // At each set-way intersection is an entry containing up to 8 macro ops.
  // The maximum capacity of the op cache is 4K ops.
  // Agner, 22.5 µop cache
  // The size of the µop cache is big enough for holding most critical loops.
  // FIXME: PR50584: MachineScheduler/PostRAScheduler have quadratic complexity,
  //        with large values here the compilation of certain loops
  //        ends up taking way too long.
  // Ideally for znver4, we should have 6.75K. However we don't add that
  // considering the compile-time impact, and prefer using the default values
  // instead.
  // let LoopMicroOpBufferSize = 6750;
  // AMD SOG 19h, 2.6.2 L1 Data Cache
  // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
  // AMD SOG 19h, 2.12 L1 Data Cache
  // The AGU and LS pipelines are optimized for simple address generation modes.
  // <...> and can achieve 4-cycle load-to-use integer load latency.
  let LoadLatency = 4;
  // AMD SOG 19h, 2.12 L1 Data Cache
  // The AGU and LS pipelines are optimized for simple address generation modes.
  // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
  // Not a SchedMachineModel field; referenced below as Znver4Model.VecLoadLatency.
  int VecLoadLatency = 7;
  // Latency of a simple store operation. Also a model-local field, not a
  // SchedMachineModel member.
  int StoreLatency = 1;
  // FIXME:
  let HighLatency = 25; // FIXME: any better choice?
  // AMD SOG 19h, 2.8 Optimizing Branching
  // The branch misprediction penalty is in the range from 11 to 18 cycles,
  // <...>. The common case penalty is 13 cycles.
  let MispredictPenalty = 13;

  let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.

  let CompleteModel = 1;
}
| |
| let SchedModel = Znver4Model in { |
| |
| |
//===----------------------------------------------------------------------===//
// RCU
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.3 Retire Control Unit
// The unit can receive up to 6 macro ops dispatched per cycle and track up to
// 320 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode. <...>
// The retire unit handles in-order commit of up to nine macro ops per cycle.
// NOTE(review): the MicroOpBufferSize comment earlier in this file quotes the
// same SOG section as "160 per thread in SMT mode" — confirm which figure the
// SOG actually states.
// First template argument is the buffer capacity (reuses MicroOpBufferSize,
// i.e. 320); second is the per-cycle retire width.
def Zn4RCU : RetireControlUnit<Znver4Model.MicroOpBufferSize, 9>;
| |
//===----------------------------------------------------------------------===//
// Integer Execution Unit
//

// AMD SOG 19h, 2.4 Superscalar Organization
// The processor uses four decoupled independent integer scheduler queues,
// each one servicing one ALU pipeline and one or two other pipelines

//
// Execution pipes
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.2 Execution Units
// The processor contains 4 general purpose integer execution pipes.
// Each pipe has an ALU capable of general purpose integer operations.
// Each ProcResource<1> models one single-issue pipe.
def Zn4ALU0 : ProcResource<1>;
def Zn4ALU1 : ProcResource<1>;
def Zn4ALU2 : ProcResource<1>;
def Zn4ALU3 : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// There is also a separate branch execution unit.
// (The other branch unit, Zn4BRU0, is an alias of Zn4ALU0; see below.)
def Zn4BRU1 : ProcResource<1>;

// AMD SOG 19h, 2.10.2 Execution Units
// There are three Address Generation Units (AGUs) for all load and store
// address generation. There are also 3 store data movement units
// associated with the same schedulers as the AGUs.
def Zn4AGU0 : ProcResource<1>;
def Zn4AGU1 : ProcResource<1>;
def Zn4AGU2 : ProcResource<1>;
| |
//
// Execution Units
//===----------------------------------------------------------------------===//
// The defvar aliases below give capability-oriented names to pipes that carry
// extra execution capability; they do not create new resources.

// AMD SOG 19h, 2.10.2 Execution Units
// ALU0 additionally has divide <...> execution capability.
defvar Zn4Divider = Zn4ALU0;

// AMD SOG 19h, 2.10.2 Execution Units
// ALU0 additionally has <...> branch execution capability.
defvar Zn4BRU0 = Zn4ALU0;

// Integer Multiplication issued on ALU1.
defvar Zn4Multiplier = Zn4ALU1;

// Execution pipeline grouping
//===----------------------------------------------------------------------===//

// General ALU operations
def Zn4ALU0123 : ProcResGroup<[Zn4ALU0, Zn4ALU1, Zn4ALU2, Zn4ALU3]>;

// General AGU operations
def Zn4AGU012 : ProcResGroup<[Zn4AGU0, Zn4AGU1, Zn4AGU2]>;

// Control flow: jumps, calls
def Zn4BRU01 : ProcResGroup<[Zn4BRU0, Zn4BRU1]>;

// Everything that isn't control flow, but still needs to access CC register,
// namely: conditional moves, SETcc.
def Zn4ALU03 : ProcResGroup<[Zn4ALU0, Zn4ALU3]>;

// Zn4ALU1 handles complex bit twiddling: CRC/PDEP/PEXT

// Simple bit twiddling: bit test, shift/rotate, bit extraction
def Zn4ALU12 : ProcResGroup<[Zn4ALU1, Zn4ALU2]>;
| |

//
// Scheduling
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.3 Retire Control Unit
// The integer physical register file (PRF) consists of 224 registers.
// Arguments: #regs, register classes, per-class costs, per-class
// move-elimination permission (GR64 yes, CCR no), then the two limits below.
def Zn4IntegerPRF : RegisterFile<224, [GR64, CCR], [1, 1], [1, 0],
                                 6,  // Max moves that can be eliminated per cycle.
                                 0>; // Restrict move elimination to zero regs.

// anandtech, The integer scheduler has a 4*24 entry macro op capacity.
// AMD SOG 19h, 2.10.1 Schedulers
// The schedulers can receive up to six macro ops per cycle, with a limit of
// two per scheduler. Each scheduler can issue one micro op per cycle into
// each of its associated pipelines
// FIXME-style caveat: this models the four separate scheduler queues as one
// pooled group with the combined buffer capacity.
def Zn4Int : ProcResGroup<[Zn4ALU0, Zn4AGU0, Zn4BRU0, // scheduler 0
                           Zn4ALU1, Zn4AGU1, // scheduler 1
                           Zn4ALU2, Zn4AGU2, // scheduler 2
                           Zn4ALU3, Zn4BRU1 // scheduler 3
                          ]> {
  let BufferSize = !mul(4, 24);
}
| |
| |
//===----------------------------------------------------------------------===//
// Floating-Point Unit
//

// AMD SOG 19h, 2.4 Superscalar Organization
// The processor uses <...> two decoupled independent floating point schedulers
// each servicing two FP pipelines and one store or FP-to-integer pipeline.

//
// Execution pipes
//===----------------------------------------------------------------------===//

// AMD SOG 19h, 2.10.1 Schedulers
// <...>, and six FPU pipes.
// Agner, 22.10 Floating point execution pipes
// There are six floating point/vector execution pipes,
def Zn4FP0 : ProcResource<1>;
def Zn4FP1 : ProcResource<1>;
def Zn4FP2 : ProcResource<1>;
def Zn4FP3 : ProcResource<1>;
// The last two pipes (store / FP-to-integer) are modeled as one resource with
// two units rather than as two separate ProcResource<1> defs.
def Zn4FP45 : ProcResource<2>;
| |
//
// Execution Units
//===----------------------------------------------------------------------===//
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// As with the integer side, these defvar aliases name the capabilities of the
// FP pipes; several capabilities share the same underlying pipe.

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
defvar Zn4FPFMul0 = Zn4FP0;
defvar Zn4FPFMul1 = Zn4FP1;

// (v)FADD*
defvar Zn4FPFAdd0 = Zn4FP2;
defvar Zn4FPFAdd1 = Zn4FP3;

// All convert operations except pack/unpack
defvar Zn4FPFCvt0 = Zn4FP2;
defvar Zn4FPFCvt1 = Zn4FP3;

// All Divide and Square Root except Reciprocal Approximation
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// FDIV unit can support 2 simultaneous operations in flight
// even though it occupies a single pipe.
// FIXME: BufferSize=2 ?
defvar Zn4FPFDiv = Zn4FP1;

// Moves and Logical operations on Floating Point Data Types
defvar Zn4FPFMisc0 = Zn4FP0;
defvar Zn4FPFMisc1 = Zn4FP1;
defvar Zn4FPFMisc2 = Zn4FP2;
defvar Zn4FPFMisc3 = Zn4FP3;

// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
defvar Zn4FPVAdd0 = Zn4FP0;
defvar Zn4FPVAdd1 = Zn4FP1;
defvar Zn4FPVAdd2 = Zn4FP2;
defvar Zn4FPVAdd3 = Zn4FP3;

// Integer Multiplies, SAD, Blendvb
defvar Zn4FPVMul0 = Zn4FP0;
defvar Zn4FPVMul1 = Zn4FP3;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
defvar Zn4FPVShuf = Zn4FP1;
defvar Zn4FPVShufAux = Zn4FP2;

// Bit Shift Left/Right operations
defvar Zn4FPVShift0 = Zn4FP1;
defvar Zn4FPVShift1 = Zn4FP2;

// Moves and Logical operations on Packed Integer Data Types
defvar Zn4FPVMisc0 = Zn4FP0;
defvar Zn4FPVMisc1 = Zn4FP1;
defvar Zn4FPVMisc2 = Zn4FP2;
defvar Zn4FPVMisc3 = Zn4FP3;

// *AES*
defvar Zn4FPAES0 = Zn4FP0;
defvar Zn4FPAES1 = Zn4FP1;

// *CLM* (carry-less multiply, i.e. PCLMULQDQ-class ops)
defvar Zn4FPCLM0 = Zn4FP0;
defvar Zn4FPCLM1 = Zn4FP1;
| |
// Execution pipeline grouping
//===----------------------------------------------------------------------===//
// ProcResGroups built from the capability aliases above; schedulers may pick
// any member pipe of a group.

// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
def Zn4FPU0123 : ProcResGroup<[Zn4FP0, Zn4FP1, Zn4FP2, Zn4FP3]>;

// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
def Zn4FPFMul01 : ProcResGroup<[Zn4FPFMul0, Zn4FPFMul1]>;

// (v)FADD*
// Some complex VADD operations are not available in all pipes.
def Zn4FPFAdd01 : ProcResGroup<[Zn4FPFAdd0, Zn4FPFAdd1]>;

// All convert operations except pack/unpack
def Zn4FPFCvt01 : ProcResGroup<[Zn4FPFCvt0, Zn4FPFCvt1]>;

// All Divide and Square Root except Reciprocal Approximation
// def Zn4FPFDiv : ProcResGroup<[Zn4FPFDiv]>;

// Moves and Logical operations on Floating Point Data Types
def Zn4FPFMisc0123 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1, Zn4FPFMisc2, Zn4FPFMisc3]>;

// FIXUP and RANGE use FP01 pipelines
def Zn4FPFMisc01 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1]>;
def Zn4FPFMisc12 : ProcResGroup<[Zn4FPFMisc1, Zn4FPFMisc2]>;
// SCALE instructions use FP23 pipelines
def Zn4FPFMisc23 : ProcResGroup<[Zn4FPFMisc2, Zn4FPFMisc3]>;
def Zn4FPFMisc123 : ProcResGroup<[Zn4FPFMisc1,Zn4FPFMisc2, Zn4FPFMisc3]>;

// Loads, Stores and Move to General Register (EX) Operations
// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
defvar Zn4FPLd01 = Zn4FP45;

// AMD SOG 19h, 2.11 Floating-Point Unit
// Note that FP stores are supported on two pipelines,
// but throughput is limited to one per cycle.
// Modeled as a 1-unit sub-resource of the 2-unit Zn4FP45.
let Super = Zn4FP45 in
def Zn4FPSt : ProcResource<1>;

// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
def Zn4FPVAdd0123 : ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1, Zn4FPVAdd2, Zn4FPVAdd3]>;

def Zn4FPVAdd01: ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1]>;
def Zn4FPVAdd12: ProcResGroup<[Zn4FPVAdd1, Zn4FPVAdd2]>;

// AVX512 Opmask pipelines
// NOTE(review): despite the "01" suffix, this group is built from FP2/FP3.
def Zn4FPOpMask01: ProcResGroup<[Zn4FP2, Zn4FP3]>;
def Zn4FPOpMask4: ProcResGroup<[Zn4FP45]>;

// Integer Multiplies, SAD, Blendvb
def Zn4FPVMul01 : ProcResGroup<[Zn4FPVMul0, Zn4FPVMul1]>;

// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
def Zn4FPVShuf01 : ProcResGroup<[Zn4FPVShuf, Zn4FPVShufAux]>;

// Bit Shift Left/Right operations
def Zn4FPVShift01 : ProcResGroup<[Zn4FPVShift0, Zn4FPVShift1]>;

// Moves and Logical operations on Packed Integer Data Types
def Zn4FPVMisc0123 : ProcResGroup<[Zn4FPVMisc0, Zn4FPVMisc1, Zn4FPVMisc2, Zn4FPVMisc3]>;

// *AES*
def Zn4FPAES01 : ProcResGroup<[Zn4FPAES0, Zn4FPAES1]>;

// *CLM*
def Zn4FPCLM01 : ProcResGroup<[Zn4FPCLM0, Zn4FPCLM1]>;
| |

//
// Scheduling
//===----------------------------------------------------------------------===//

// Agner, 21.8 Register renaming and out-of-order schedulers
// The floating point register file has 192 vector registers
// of 512b each in zen4.
// NOTE(review): the move-elimination permission list has 3 entries for 4
// register classes (VR64 disallowed, VR128/VR256 allowed, VR512 defaulted) —
// confirm whether VR512 should be listed explicitly.
def Zn4FpPRF : RegisterFile<192, [VR64, VR128, VR256, VR512], [1, 1, 1, 1], [0, 1, 1],
                            6,  // Max moves that can be eliminated per cycle.
                            0>; // Restrict move elimination to zero regs.

// AMD SOG 19h, 2.11 Floating-Point Unit
// The floating-point scheduler has a 2*32 entry macro op capacity.
// AMD SOG 19h, 2.11 Floating-Point Unit
// <...> the scheduler can issue 1 micro op per cycle for each pipe.
// FIXME: those are two separate schedulers, not a single big one.
def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2, /*Zn4FP4,*/ // scheduler 0
                          Zn4FP1, Zn4FP3, Zn4FP45 /*Zn4FP5*/ // scheduler 1
                         ]> {
  let BufferSize = !mul(2, 32);
}

// AMD SOG 19h, 2.11 Floating-Point Unit
// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
// even if floating-point scheduler is full.
// FIXME: how to model this properly?
| |
| |
//===----------------------------------------------------------------------===//
// Load-Store Unit
//

// AMD SOG 19h, 2.12 Load-Store Unit
// The LS unit contains three largely independent pipe-lines
// enabling the execution of three 256-bit memory operations per cycle.
def Zn4LSU : ProcResource<3>;

// AMD SOG 19h, 2.12 Load-Store Unit
// All three memory operations can be loads.
// Sub-resource of Zn4LSU: loads contend with stores for the 3 LSU slots.
let Super = Zn4LSU in
def Zn4Load : ProcResource<3> {
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The LS unit can process up to 72 out-of-order loads.
  let BufferSize = 72;
}

def Zn4LoadQueue : LoadQueue<Zn4Load>;

// AMD SOG 19h, 2.12 Load-Store Unit
// A maximum of two of the memory operations can be stores.
let Super = Zn4LSU in
def Zn4Store : ProcResource<2> {
  // AMD SOG 19h, 2.12 Load-Store Unit
  // The LS unit utilizes a 64-entry store queue (STQ).
  let BufferSize = 64;
}

def Zn4StoreQueue : StoreQueue<Zn4Store>;
| |
//===----------------------------------------------------------------------===//
// Basic helper classes.
//===----------------------------------------------------------------------===//

// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
// This multiclass defines the resource usage for variants with and without
// folded loads.

// Emit a single WriteRes binding SchedRW to ExePorts with the given latency,
// per-port resource cycles (Res; empty means the default of 1 per port), and
// micro-op count.
multiclass __Zn4WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
                         int Lat = 1, list<int> Res = [], int UOps = 1> {
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }
}

// Emit both the register form (as above) and the SchedRW.Folded memory form.
// The folded form prepends [AGU, Zn4Load] to the ports, adds LoadLat to the
// latency and LoadUOps to the uop count. Its ResourceCycles are:
//   - left as the default ([]) when Res is empty AND LoadRes == 1, i.e. every
//     port takes the default 1 cycle anyway;
//   - otherwise [1 /*AGU*/, LoadRes /*Zn4Load*/] concatenated with Res, where
//     an empty Res is expanded to one 1-cycle entry per execution port.
multiclass __Zn4WriteResPair<X86FoldableSchedWrite SchedRW,
                             list<ProcResourceKind> ExePorts, int Lat,
                             list<int> Res, int UOps, int LoadLat, int LoadUOps,
                             ProcResourceKind AGU, int LoadRes> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;

  defm : __Zn4WriteRes<SchedRW.Folded,
                       !listconcat([AGU, Zn4Load], ExePorts),
                       !add(Lat, LoadLat),
                       !if(!and(!empty(Res), !eq(LoadRes, 1)),
                           [],
                           !listconcat([1, LoadRes],
                                       !if(!empty(Res),
                                           !listsplat(1, !size(ExePorts)),
                                           Res))),
                       !add(UOps, LoadUOps)>;
}
| |
// For classes without folded loads.
// The Int/XMM/YMM/ZMM variants are currently identical wrappers around
// __Zn4WriteRes; they exist so call sites document the domain and so the
// variants can diverge later without churn.
multiclass Zn4WriteResInt<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn4WriteResXMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn4WriteResYMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

multiclass Zn4WriteResZMM<SchedWrite SchedRW,
                          list<ProcResourceKind> ExePorts, int Lat = 1,
                          list<int> Res = [], int UOps = 1> {
  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
}

// For classes with folded loads.
// Integer loads fold with the scalar load latency and an integer AGU;
// vector (XMM/YMM/ZMM) loads fold with the FP load latency and the FP
// load pipes (Zn4FPLd01). Note the ZMM variant defaults to UOps = 2.
multiclass Zn4WriteResIntPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver4Model.LoadLatency,
                           LoadUOps, Zn4AGU012, LoadRes>;
}

multiclass Zn4WriteResXMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver4Model.VecLoadLatency,
                           LoadUOps, Zn4FPLd01, LoadRes>;
}

multiclass Zn4WriteResYMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 1,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver4Model.VecLoadLatency,
                           LoadUOps, Zn4FPLd01, LoadRes>;
}

multiclass Zn4WriteResZMMPair<X86FoldableSchedWrite SchedRW,
                              list<ProcResourceKind> ExePorts, int Lat = 1,
                              list<int> Res = [], int UOps = 2,
                              int LoadUOps = 0, int LoadRes = 1> {
  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
                           Znver4Model.VecLoadLatency,
                           LoadUOps, Zn4FPLd01, LoadRes>;
}
| |
//===----------------------------------------------------------------------===//
// Here be dragons.
//===----------------------------------------------------------------------===//

// A consumer of a just-loaded value may issue LoadLatency cycles early,
// hiding the load latency when the operand chain allows it.
def : ReadAdvance<ReadAfterLd, Znver4Model.LoadLatency>;

def : ReadAdvance<ReadAfterVecLd, Znver4Model.VecLoadLatency>;
def : ReadAdvance<ReadAfterVecXLd, Znver4Model.VecLoadLatency>;
def : ReadAdvance<ReadAfterVecYLd, Znver4Model.VecLoadLatency>;

// AMD SOG 19h, 2.11 Floating-Point Unit
// There is 1 cycle of added latency for a result to cross
// from F to I or I to F domain.
// Negative ReadAdvance: the read happens 1 cycle LATER, modeling the
// domain-crossing penalty.
def : ReadAdvance<ReadInt2Fpu, -1>;

// Instructions with both a load and a store folded are modeled as a folded
// load + WriteRMW.
defm : Zn4WriteResInt<WriteRMW, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 1], 0>;

// Loads, stores, and moves, not folded with other operations.
defm : Zn4WriteResInt<WriteLoad, [Zn4AGU012, Zn4Load], !add(Znver4Model.LoadLatency, 1), [1, 1], 1>;

// Model the effect of clobbering the read-write mask operand of the GATHER operation.
// Does not cost anything by itself, only has latency, matching that of the WriteLoad.
defm : Zn4WriteResInt<WriteVecMaskedGatherWriteback, [], !add(Znver4Model.LoadLatency, 1), [], 0>;
| |
// 8/16-bit loads and load-extends merge into the destination register and are
// slower than full-width loads; model extra ALU-side resource pressure.
def Zn4WriteMOVSlow : SchedWriteRes<[Zn4AGU012, Zn4Load]> {
  let Latency = !add(Znver4Model.LoadLatency, 1);
  let ResourceCycles = [3, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteMOVSlow], (instrs MOV8rm, MOV8rm_NOREX, MOV16rm, MOVSX16rm16, MOVSX16rm32, MOVZX16rm16, MOVSX16rm8, MOVZX16rm8)>;

// Note stores take 2 cycles on the store unit (ResourceCycles [1, 2]).
defm : Zn4WriteResInt<WriteStore, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>;
defm : Zn4WriteResInt<WriteStoreNT, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>;
defm : Zn4WriteResInt<WriteMove, [Zn4ALU0123], 1, [4], 1>;

// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;

// MOVBE load: byte swap is done in the load path, single micro-op.
def Zn4WriteMOVBE16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
  let Latency = Znver4Model.LoadLatency;
  let ResourceCycles = [1, 1, 4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteMOVBE16rm], (instrs MOVBE16rm)>;

// MOVBE store: swap on the ALU then store; two micro-ops.
def Zn4WriteMOVBEmr : SchedWriteRes<[Zn4ALU0123, Zn4AGU012, Zn4Store]> {
  let Latency = Znver4Model.StoreLatency;
  let ResourceCycles = [4, 1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>;
| |
// Arithmetic.
defm : Zn4WriteResIntPair<WriteALU, [Zn4ALU0123], 1, [1], 1>; // Simple integer ALU op.

// The AL/AX/EAX/RAX-with-immediate forms occupy the ALU group longer than the
// generic register forms.
def Zn4WriteALUSlow : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteALUSlow], (instrs ADD8i8, ADD16i16, ADD32i32, ADD64i32,
                                        AND8i8, AND16i16, AND32i32, AND64i32,
                                        OR8i8, OR16i16, OR32i32, OR64i32,
                                        SUB8i8, SUB16i16, SUB32i32, SUB64i32,
                                        XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;

// 16-bit-destination sign/zero extends merge into the destination register.
def Zn4WriteMoveExtend : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32, MOVZX16rr16, MOVSX16rr8, MOVZX16rr8)>;

// Materializing a 32-bit immediate into a register.
def Zn4WriteMaterialize32bitImm: SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt, MOV64ri32)>;

// PDEP/PEXT execute only on ALU1 (the complex bit-twiddling pipe).
def Zn4WritePDEP_PEXT : SchedWriteRes<[Zn4ALU1]> {
  let Latency = 3;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr,
                                          PEXT32rr, PEXT64rr)>;
| |
defm : Zn4WriteResIntPair<WriteADC, [Zn4ALU0123], 1, [4], 1>; // Integer ALU + flags op.

// 8-bit ADC/SBB to memory: load + flags-consuming ALU op + store as one
// macro op with heavy ALU-group occupancy.
def Zn4WriteADC8mr_SBB8mr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123, Zn4Store]> {
  let Latency = 1;
  let ResourceCycles = [1, 1, 7, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;

// This is for simple LEAs with one or two input operands.
defm : Zn4WriteResInt<WriteLEA, [Zn4AGU012], 1, [1], 1>; // LEA instructions can't fold loads.

// This write is used for slow LEA instructions.
def Zn4Write3OpsLEA : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 2;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}

// On Znver4, a slow LEA is either a 3Ops LEA (base, index, offset),
// or an LEA with a `Scale` value different than 1.
def Zn4SlowLEAPredicate : MCSchedPredicate<
  CheckAny<[
    // A 3-operand LEA (base, index, offset).
    IsThreeOperandsLEAFn,
    // An LEA with a "Scale" different than 1.
    CheckAll<[
      CheckIsImmOperand<2>,
      CheckNot<CheckImmOperand<2, 1>>
    ]>
  ]>
>;

// Dispatch at scheduling time: slow LEAs get Zn4Write3OpsLEA, the rest the
// generic WriteLEA above.
def Zn4WriteLEA : SchedWriteVariant<[
    SchedVar<Zn4SlowLEAPredicate, [Zn4Write3OpsLEA]>,
    SchedVar<NoSchedPred,         [WriteLEA]>
]>;

def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;

// 16-bit LEA is always slow (operand-size override + merge).
def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ResourceCycles = [4];
  let NumMicroOps = 2;
}

def : InstRW<[Zn4SlowLEA16r], (instrs LEA16r)>;
| |
// Integer multiplication
defm : Zn4WriteResIntPair<WriteIMul8, [Zn4Multiplier], 3, [3], 1>; // Integer 8-bit multiplication.
defm : Zn4WriteResIntPair<WriteIMul16, [Zn4Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-bit multiplication.
defm : Zn4WriteResIntPair<WriteIMul16Imm, [Zn4Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
defm : Zn4WriteResIntPair<WriteIMul16Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
defm : Zn4WriteResIntPair<WriteIMul32, [Zn4Multiplier], 3, [3], 2>; // Integer 32-bit multiplication.
defm : Zn4WriteResIntPair<WriteMULX32, [Zn4Multiplier], 3, [1], 2>; // Integer 32-bit Unsigned Multiply Without Affecting Flags.
defm : Zn4WriteResIntPair<WriteIMul32Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
defm : Zn4WriteResIntPair<WriteIMul32Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
defm : Zn4WriteResIntPair<WriteIMul64, [Zn4Multiplier], 3, [3], 2>; // Integer 64-bit multiplication.
defm : Zn4WriteResIntPair<WriteMULX64, [Zn4Multiplier], 3, [1], 2>; // Integer 64-bit Unsigned Multiply Without Affecting Flags.
defm : Zn4WriteResIntPair<WriteIMul64Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
defm : Zn4WriteResIntPair<WriteIMul64Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.
// High-half results: latency only, no extra resource consumption.
defm : Zn4WriteResInt<WriteIMulHLd, [], !add(4, Znver4Model.LoadLatency), [], 0>; // Integer multiplication, high part.
defm : Zn4WriteResInt<WriteIMulH, [], 4, [], 0>; // Integer multiplication, high part.
| |
defm : Zn4WriteResInt<WriteBSWAP32, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swap.
defm : Zn4WriteResInt<WriteBSWAP64, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swap.

defm : Zn4WriteResIntPair<WriteCMPXCHG, [Zn4ALU0123], 3, [12], 5>; // Compare and set, compare and swap.

// 8-bit register CMPXCHG is cheaper (3 uops) than the generic class (5 uops).
def Zn4WriteCMPXCHG8rr : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 3;
  let ResourceCycles = [12];
  let NumMicroOps = 3;
}
def : InstRW<[Zn4WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;

defm : Zn4WriteResInt<WriteCMPXCHGRMW, [Zn4ALU0123], 3, [12], 6>; // Compare and set, compare and swap.

// 8-bit memory CMPXCHG (plus its implicit-lock pseudo): register-form cost
// plus the folded load.
def Zn4WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteCMPXCHG8rr.Latency);
  let ResourceCycles = [1, 1, 12];
  let NumMicroOps = !add(Zn4WriteCMPXCHG8rr.NumMicroOps, 2);
}
def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;

// CMPXCHG8B/16B are heavily microcoded.
def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 3; // FIXME: not from llvm-exegesis
  let ResourceCycles = [24];
  let NumMicroOps = 19;
}
def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>;

def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 4; // FIXME: not from llvm-exegesis
  let ResourceCycles = [59];
  let NumMicroOps = 28;
}
def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;
| |
// 8/16-bit register exchanges cannot be handled by register renaming and take
// real ALU uops.
def Zn4WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16ar)>;

// Memory-form XCHG is implicitly locked; narrow forms are more expensive.
def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = 5;
}
def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;

def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
| |
// Integer division.
// The divider (an alias for ALU0) is occupied for the full duration,
// blocking subsequent divides.
// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
// FIXME: latency for 8-bit division measures as 10. for others it's a guess.
defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 10, [10], 2>;
defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>;
defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>;
defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>;
defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 10, [10], 2>;
defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>;
defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>;
defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>;

defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.

defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>; // Bit population count.
| |
// 16-bit POPCNT/LZCNT/TZCNT merge into the destination and occupy the ALU
// group longer than their 32/64-bit forms.
def Zn4WritePOPCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WritePOPCNT16rr], (instrs POPCNT16rr)>;

defm : Zn4WriteResIntPair<WriteLZCNT, [Zn4ALU0123], 1, [1], 1>; // Leading zero count.

def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 1;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>;

defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2, [1], 2>; // Trailing zero count.

def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
  let Latency = 2;
  let ResourceCycles = [4];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>;

defm : Zn4WriteResIntPair<WriteCMOV, [Zn4ALU03], 1, [1], 1>; // Conditional move.
defm : Zn4WriteResInt<WriteFCMOV, [Zn4ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move.
defm : Zn4WriteResInt<WriteSETCC, [Zn4ALU03], 1, [2], 1>; // Set register based on condition code.
defm : Zn4WriteResInt<WriteSETCCStore, [Zn4ALU03, Zn4AGU012, Zn4Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
defm : Zn4WriteResInt<WriteLAHFSAHF, [Zn4ALU3], 1, [1], 1>; // Load/Store flags in AH.

defm : Zn4WriteResInt<WriteBitTest, [Zn4ALU12], 1, [1], 1>; // Bit Test
defm : Zn4WriteResInt<WriteBitTestImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 2>;
// Register-indexed bit test against memory is microcoded (7 uops): the bit
// index can address outside the loaded word.
defm : Zn4WriteResInt<WriteBitTestRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 7>;

defm : Zn4WriteResInt<WriteBitTestSet, [Zn4ALU12], 2, [2], 2>; // Bit Test + Set
defm : Zn4WriteResInt<WriteBitTestSetImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 4>;
defm : Zn4WriteResInt<WriteBitTestSetRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 9>;
| |
// Integer shifts and rotates.
defm : Zn4WriteResIntPair<WriteShift, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteShiftCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteRotate, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;

// Rotate-through-carry by one (RCL/RCR r, 1): single µop, 1-cycle latency,
// but holds the ALU pipes for 2 cycles.
def Zn4WriteRotateR1 : SchedWriteRes<[Zn4ALU12]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteRotateR1], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
                                         RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;
| |
// Memory form of rotate-through-carry by one: register-form timing plus the
// address generation / load pipeline and the model's load latency, and one
// extra µop for the load.
def Zn4WriteRotateM1 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateR1.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteRotateR1.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteRotateM1], (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
                                         RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;

// RCR r, imm: microcoded (7 µops), 3-cycle latency.
def Zn4WriteRotateRightRI : SchedWriteRes<[Zn4ALU12]> {
  let Latency = 3;
  let ResourceCycles = [6];
  let NumMicroOps = 7;
}
def : InstRW<[Zn4WriteRotateRightRI], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;
| |
// RCR m, imm: register-form latency plus load latency; 3 extra µops over the
// register form.
def Zn4WriteRotateRightMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRI.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn4WriteRotateRightRI.NumMicroOps, 3);
}
def : InstRW<[Zn4WriteRotateRightMI], (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;

// RCL r, imm: slightly more expensive than RCR r, imm (9 µops, 4 cycles).
def Zn4WriteRotateLeftRI : SchedWriteRes<[Zn4ALU12]> {
  let Latency = 4;
  let ResourceCycles = [8];
  let NumMicroOps = 9;
}
def : InstRW<[Zn4WriteRotateLeftRI], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;

// RCL m, imm: register form plus load latency.
// NOTE(review): this adds 2 µops over the register form while the RCR memory
// form above adds 3 -- possibly intentional asymmetry, possibly a typo;
// confirm against llvm-exegesis measurements.
def Zn4WriteRotateLeftMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRI.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn4WriteRotateLeftRI.NumMicroOps, 2);
}
def : InstRW<[Zn4WriteRotateLeftMI], (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;
| |
// Rotate by CL (variable count). The rotate-through-carry-by-CL forms below
// mirror the immediate-count overrides above: RCR is cheaper than RCL, and
// the memory forms add the load pipeline and LoadLatency.
defm : Zn4WriteResIntPair<WriteRotateCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;

// RCR r, CL: same cost as RCR r, imm (7 µops, 3 cycles).
def Zn4WriteRotateRightRCL : SchedWriteRes<[Zn4ALU12]> {
  let Latency = 3;
  let ResourceCycles = [6];
  let NumMicroOps = 7;
}
def : InstRW<[Zn4WriteRotateRightRCL], (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;

// RCR m, CL: register form plus load latency, 2 extra µops.
def Zn4WriteRotateRightMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRCL.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn4WriteRotateRightRCL.NumMicroOps, 2);
}
def : InstRW<[Zn4WriteRotateRightMCL], (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;

// RCL r, CL: same cost as RCL r, imm (9 µops, 4 cycles).
def Zn4WriteRotateLeftRCL : SchedWriteRes<[Zn4ALU12]> {
  let Latency = 4;
  let ResourceCycles = [8];
  let NumMicroOps = 9;
}
def : InstRW<[Zn4WriteRotateLeftRCL], (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;

// RCL m, CL: register form plus load latency, 2 extra µops.
def Zn4WriteRotateLeftMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRCL.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn4WriteRotateLeftRCL.NumMicroOps, 2);
}
def : InstRW<[Zn4WriteRotateLeftMCL], (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;
| |
// Double shift instructions (SHLD/SHRD). Memory forms add the load pipeline
// and LoadLatency on top of the 2-cycle ALU latency.
defm : Zn4WriteResInt<WriteSHDrri, [Zn4ALU12], 2, [3], 4>;
defm : Zn4WriteResInt<WriteSHDrrcl, [Zn4ALU12], 2, [3], 5>;
defm : Zn4WriteResInt<WriteSHDmri, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;
defm : Zn4WriteResInt<WriteSHDmrcl, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;

// BMI1 BEXTR/BLS, BMI2 BZHI
defm : Zn4WriteResIntPair<WriteBEXTR, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteBLS, [Zn4ALU0123], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteBZHI, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;

// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
// Zero latency and zero resource cycles model the rename-stage elimination.
defm : Zn4WriteResInt<WriteZero, [Zn4ALU0123], 0, [0], 1>;

// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
defm : Zn4WriteResIntPair<WriteJump, [Zn4BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis
| |
// Floating point. This covers both scalar and vector operations.
// X87 constant loads (FLD0/FLD1/FLDC) go through the FP load pipes plus FP1;
// vector loads use VecLoadLatency + 1 cycle of FP-load-pipe occupancy.
defm : Zn4WriteResInt<WriteFLD0, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 4), [1, 1, 1], 1>;
defm : Zn4WriteResInt<WriteFLD1, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;
defm : Zn4WriteResInt<WriteFLDC, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;
defm : Zn4WriteResXMM<WriteFLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;

// MOVHPD/MOVHPS stores (high 64 bits to memory) need 2 µops.
def Zn4WriteWriteFStoreMMX : SchedWriteRes<[Zn4FPSt, Zn4Store]> {
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ResourceCycles = [1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteWriteFStoreMMX], (instrs MOVHPDmr, MOVHPSmr,
                                               VMOVHPDmr, VMOVHPSmr)>;

// Plain and non-temporal FP stores: one µop through the FP-store pipe.
defm : Zn4WriteResXMM<WriteFStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFStoreNTX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;

// Masked FP stores are heavily microcoded; cost scales with element count
// (32-bit-element forms have twice the µops of the 64-bit-element forms).
defm : Zn4WriteResXMM<WriteFMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
defm : Zn4WriteResXMM<WriteFMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
defm : Zn4WriteResYMM<WriteFMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
defm : Zn4WriteResYMM<WriteFMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
| |
defm : Zn4WriteResXMMPair<WriteFAdd, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub.

// X87 arithmetic with an integer-memory operand (FIADD/FISUB/FISUBR/FIMUL):
// microcoded, long FPU occupancy.
def Zn4WriteX87Arith : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let ResourceCycles = [1, 1, 24];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteX87Arith], (instrs ADD_FI16m, ADD_FI32m,
                                         SUB_FI16m, SUB_FI32m,
                                         SUBR_FI16m, SUBR_FI32m,
                                         MUL_FI16m, MUL_FI32m)>;

// X87 division with an integer-memory operand (FIDIV/FIDIVR): even longer
// FPU occupancy than the arith forms above.
def Zn4WriteX87Div : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let ResourceCycles = [1, 1, 62];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
                                       DIVR_FI16m, DIVR_FI32m)>;
| |
// Main FP arithmetic table. General pattern throughout: XMM and YMM forms of
// an op share latency and resource cost (Zen 4 has native 256-bit datapaths),
// while ZMM forms double the ResourceCycles (512-bit ops are double-pumped).
defm : Zn4WriteResXMMPair<WriteFAddX, [Zn4FPFAdd01], 3, [1], 1>;  // Floating point add/sub (XMM).
defm : Zn4WriteResYMMPair<WriteFAddY, [Zn4FPFAdd01], 3, [1], 1>;  // Floating point add/sub (YMM).
defm : Zn4WriteResZMMPair<WriteFAddZ, [Zn4FPFAdd01], 3, [2], 1>;  // Floating point add/sub (ZMM).
defm : Zn4WriteResXMMPair<WriteFAdd64, [Zn4FPFAdd01], 3, [1], 1>;  // Floating point double add/sub.
defm : Zn4WriteResXMMPair<WriteFAdd64X, [Zn4FPFAdd01], 3, [1], 1>;  // Floating point double add/sub (XMM).
defm : Zn4WriteResYMMPair<WriteFAdd64Y, [Zn4FPFAdd01], 3, [1], 1>;  // Floating point double add/sub (YMM).
defm : Zn4WriteResZMMPair<WriteFAdd64Z, [Zn4FPFAdd01], 3, [2], 1>;  // Floating point double add/sub (ZMM).
defm : Zn4WriteResXMMPair<WriteFCmp, [Zn4FPFMul01], 2, [2], 1>;  // Floating point compare.
defm : Zn4WriteResXMMPair<WriteFCmpX, [Zn4FPFMul01], 1, [1], 1>;  // Floating point compare (XMM).
defm : Zn4WriteResYMMPair<WriteFCmpY, [Zn4FPFMul01], 2, [2], 1>;  // Floating point compare (YMM).
defm : Zn4WriteResZMMPair<WriteFCmpZ, [Zn4FPFMul01], 2, [4], 1>;  // Floating point compare (ZMM).
defm : Zn4WriteResXMMPair<WriteFCmp64, [Zn4FPFMul01], 1, [1], 1>;  // Floating point double compare.
defm : Zn4WriteResXMMPair<WriteFCmp64X, [Zn4FPFMul01], 1, [1], 1>;  // Floating point double compare (XMM).
defm : Zn4WriteResYMMPair<WriteFCmp64Y, [Zn4FPFMul01], 2, [2], 1>;  // Floating point double compare (YMM).
defm : Zn4WriteResZMMPair<WriteFCmp64Z, [Zn4FPFMul01], 2, [4], 1>;  // Floating point double compare (ZMM).
defm : Zn4WriteResXMMPair<WriteFCom, [Zn4FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (X87).
defm : Zn4WriteResXMMPair<WriteFComX, [Zn4FPFMul01], 4, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
defm : Zn4WriteResXMMPair<WriteFMul, [Zn4FPFMul01], 3, [1], 1>;  // Floating point multiplication.
defm : Zn4WriteResXMMPair<WriteFMulX, [Zn4FPFMul01], 3, [1], 1>;  // Floating point multiplication (XMM).
defm : Zn4WriteResYMMPair<WriteFMulY, [Zn4FPFMul01], 3, [1], 1>;  // Floating point multiplication (YMM).
defm : Zn4WriteResZMMPair<WriteFMulZ, [Zn4FPFMul01], 3, [2], 1>;  // Floating point multiplication (ZMM).
defm : Zn4WriteResXMMPair<WriteFMul64, [Zn4FPFMul01], 3, [1], 1>;  // Floating point double multiplication.
defm : Zn4WriteResXMMPair<WriteFMul64X, [Zn4FPFMul01], 3, [1], 1>;  // Floating point double multiplication (XMM).
defm : Zn4WriteResYMMPair<WriteFMul64Y, [Zn4FPFMul01], 3, [1], 1>;  // Floating point double multiplication (YMM).
defm : Zn4WriteResZMMPair<WriteFMul64Z, [Zn4FPFMul01], 3, [2], 1>;  // Floating point double multiplication (ZMM).
// Division and square root go through the dedicated (unpipelined) divider.
defm : Zn4WriteResXMMPair<WriteFDiv, [Zn4FPFDiv], 11, [3], 1>;  // Floating point division.
defm : Zn4WriteResXMMPair<WriteFDivX, [Zn4FPFDiv], 11, [3], 1>;  // Floating point division (XMM).
defm : Zn4WriteResYMMPair<WriteFDivY, [Zn4FPFDiv], 11, [3], 1>;  // Floating point division (YMM).
defm : Zn4WriteResZMMPair<WriteFDivZ, [Zn4FPFDiv], 11, [6], 1>;  // Floating point division (ZMM).
defm : Zn4WriteResXMMPair<WriteFDiv64, [Zn4FPFDiv], 13, [5], 1>;  // Floating point double division.
defm : Zn4WriteResXMMPair<WriteFDiv64X, [Zn4FPFDiv], 13, [5], 1>;  // Floating point double division (XMM).
defm : Zn4WriteResYMMPair<WriteFDiv64Y, [Zn4FPFDiv], 13, [5], 1>;  // Floating point double division (YMM).
defm : Zn4WriteResZMMPair<WriteFDiv64Z, [Zn4FPFDiv], 13, [10], 1>;  // Floating point double division (ZMM).
defm : Zn4WriteResXMMPair<WriteFSqrt, [Zn4FPFDiv], 15, [5], 1>;  // Floating point square root.
defm : Zn4WriteResXMMPair<WriteFSqrtX, [Zn4FPFDiv], 15, [5], 1>;  // Floating point square root (XMM).
defm : Zn4WriteResYMMPair<WriteFSqrtY, [Zn4FPFDiv], 15, [5], 1>;  // Floating point square root (YMM).
defm : Zn4WriteResZMMPair<WriteFSqrtZ, [Zn4FPFDiv], 15, [10], 1>;  // Floating point square root (ZMM).
defm : Zn4WriteResXMMPair<WriteFSqrt64, [Zn4FPFDiv], 21, [9], 1>;  // Floating point double square root.
defm : Zn4WriteResXMMPair<WriteFSqrt64X, [Zn4FPFDiv], 21, [9], 1>;  // Floating point double square root (XMM).
defm : Zn4WriteResYMMPair<WriteFSqrt64Y, [Zn4FPFDiv], 21, [9], 1>;  // Floating point double square root (YMM).
defm : Zn4WriteResZMMPair<WriteFSqrt64Z, [Zn4FPFDiv], 21, [18], 1>;  // Floating point double square root (ZMM).
defm : Zn4WriteResXMMPair<WriteFSqrt80, [Zn4FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis // Floating point long double square root.
defm : Zn4WriteResXMMPair<WriteFRcp, [Zn4FPFMul01], 4, [1], 1>;  // Floating point reciprocal estimate.
defm : Zn4WriteResXMMPair<WriteFRcpX, [Zn4FPFMul01], 4, [1], 1>;  // Floating point reciprocal estimate (XMM).
defm : Zn4WriteResYMMPair<WriteFRcpY, [Zn4FPFMul01], 5, [1], 1>;  // Floating point reciprocal estimate (YMM).
defm : Zn4WriteResZMMPair<WriteFRcpZ, [Zn4FPFMul01], 5, [2], 1>;  // Floating point reciprocal estimate (ZMM).
defm : Zn4WriteResXMMPair<WriteFRsqrt, [Zn4FPFDiv], 4, [1], 1>;  // Floating point reciprocal square root estimate.
defm : Zn4WriteResXMMPair<WriteFRsqrtX, [Zn4FPFDiv], 4, [1], 1>;  // Floating point reciprocal square root estimate (XMM).
defm : Zn4WriteResYMMPair<WriteFRsqrtY, [Zn4FPFDiv], 4, [1], 1>;  // Floating point reciprocal square root estimate (YMM).
defm : Zn4WriteResZMMPair<WriteFRsqrtZ, [Zn4FPFDiv], 5, [2], 1>;  // Floating point reciprocal square root estimate (ZMM).
defm : Zn4WriteResXMMPair<WriteFMA, [Zn4FPFMul01], 4, [2], 1>;  // Fused Multiply Add.
defm : Zn4WriteResXMMPair<WriteFMAX, [Zn4FPFMul01], 4, [2], 1>;  // Fused Multiply Add (XMM).
defm : Zn4WriteResYMMPair<WriteFMAY, [Zn4FPFMul01], 4, [2], 1>;  // Fused Multiply Add (YMM).
defm : Zn4WriteResZMMPair<WriteFMAZ, [Zn4FPFMul01], 4, [4], 1>;  // Fused Multiply Add (ZMM).
defm : Zn4WriteResXMMPair<WriteDPPD, [Zn4FPFMul01], 7, [6], 3, /*LoadUOps=*/2>;  // Floating point double dot product.
defm : Zn4WriteResXMMPair<WriteDPPS, [Zn4FPFMul01], 11, [8], 8, /*LoadUOps=*/2>;  // Floating point single dot product.
defm : Zn4WriteResYMMPair<WriteDPPSY, [Zn4FPFMul01], 11, [8], 7, /*LoadUOps=*/1>;  // Floating point single dot product (YMM).
defm : Zn4WriteResXMMPair<WriteFSign, [Zn4FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point fabs/fchs.
defm : Zn4WriteResXMMPair<WriteFRnd, [Zn4FPFCvt01], 3, [1], 1>;  // Floating point rounding.
defm : Zn4WriteResYMMPair<WriteFRndY, [Zn4FPFCvt01], 3, [1], 1>;  // Floating point rounding (YMM).
defm : Zn4WriteResZMMPair<WriteFRndZ, [Zn4FPFCvt01], 3, [2], 1>;  // Floating point rounding (ZMM).
| |
// FP logicals, TEST, shuffles and blends. Same XMM==YMM, ZMM=2x pattern as
// the arithmetic table above.
defm : Zn4WriteResXMMPair<WriteFLogic, [Zn4FPVMisc0123], 1, [1], 1>;  // Floating point and/or/xor logicals.
defm : Zn4WriteResYMMPair<WriteFLogicY, [Zn4FPVMisc0123], 1, [1], 1>;  // Floating point and/or/xor logicals (YMM).
defm : Zn4WriteResZMMPair<WriteFLogicZ, [Zn4FPVMisc0123], 1, [2], 1>;  // Floating point and/or/xor logicals (ZMM).
defm : Zn4WriteResXMMPair<WriteFTest, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
defm : Zn4WriteResYMMPair<WriteFTestY, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
defm : Zn4WriteResZMMPair<WriteFTestZ, [Zn4FPFMisc12], 1, [4], 1>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (ZMM).
defm : Zn4WriteResXMMPair<WriteFShuffle, [Zn4FPVShuf01], 1, [1], 1>;  // Floating point vector shuffles.
defm : Zn4WriteResYMMPair<WriteFShuffleY, [Zn4FPVShuf01], 1, [1], 1>;  // Floating point vector shuffles (YMM).
defm : Zn4WriteResZMMPair<WriteFShuffleZ, [Zn4FPVShuf01], 1, [2], 1>;  // Floating point vector shuffles (ZMM).
defm : Zn4WriteResXMMPair<WriteFVarShuffle, [Zn4FPVShuf01], 3, [1], 1>;  // Floating point vector variable shuffles.
defm : Zn4WriteResYMMPair<WriteFVarShuffleY, [Zn4FPVShuf01], 3, [1], 1>;  // Floating point vector variable shuffles (YMM).
defm : Zn4WriteResZMMPair<WriteFVarShuffleZ, [Zn4FPVShuf01], 3, [2], 1>;  // Floating point vector variable shuffles (ZMM).
defm : Zn4WriteResXMMPair<WriteFBlend, [Zn4FPFMul01], 1, [1], 1>;  // Floating point vector blends.
defm : Zn4WriteResYMMPair<WriteFBlendY, [Zn4FPFMul01], 1, [1], 1>;  // Floating point vector blends (YMM).
defm : Zn4WriteResZMMPair<WriteFBlendZ, [Zn4FPFMul01], 1, [2], 1>;  // Floating point vector blends (ZMM).
defm : Zn4WriteResXMMPair<WriteFVarBlend, [Zn4FPFMul01], 1, [1], 1>;  // Fp vector variable blends.
defm : Zn4WriteResYMMPair<WriteFVarBlendY, [Zn4FPFMul01], 1, [1], 1>;  // Fp vector variable blends (YMM).
defm : Zn4WriteResZMMPair<WriteFVarBlendZ, [Zn4FPFMul01], 1, [2], 1>;  // Fp vector variable blends (ZMM).

// Horizontal Add/Sub (float and integer)
// Microcoded (3 µops): shuffle + add under the hood.
defm : Zn4WriteResXMMPair<WriteFHAdd, [Zn4FPFAdd0], 4, [2], 3>;
defm : Zn4WriteResYMMPair<WriteFHAddY, [Zn4FPFAdd0], 4, [2], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResZMMPair<WriteFHAddZ, [Zn4FPFAdd0], 6, [4], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResXMMPair<WritePHAdd, [Zn4FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResXMMPair<WritePHAddX, [Zn4FPVAdd0], 2, [2], 3>;
defm : Zn4WriteResYMMPair<WritePHAddY, [Zn4FPVAdd0], 3, [3], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResZMMPair<WritePHAddZ, [Zn4FPVAdd0], 2, [4], 3, /*LoadUOps=*/1>;
| |
// Vector integer operations.
// Vector loads: VecLoadLatency plus one cycle on the FP load pipes.
defm : Zn4WriteResXMM<WriteVecLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecLoadNT, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecLoadNTY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;

// VEXTRACTF128/VEXTRACTI128 register forms: single µop, 4-cycle latency.
def Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn4FPFMisc0]> {
  let Latency = 4;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rr, VEXTRACTI128rr)>;

// Memory (store) form of VEXTRACT*128: register form plus one store µop.
// NOTE(review): the latency formula adds LoadLatency even though this is a
// store; StoreLatency would seem more natural -- confirm intent.
def Zn4WriteVEXTRACTI128mr : SchedWriteRes<[Zn4FPFMisc0, Zn4FPSt, Zn4Store]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let ResourceCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteVEXTRACTI128mr], (instrs VEXTRACTI128mr, VEXTRACTF128mr)>;

// Load-folded VINSERTF128: reuses the extract timing plus load latency,
// with no extra µop over the register form (load folds into the op).
def Zn4WriteVINSERTF128rmr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPFMisc0]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let ResourceCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
}
def : InstRW<[Zn4WriteVINSERTF128rmr], (instrs VINSERTF128rm)>;
| |
// Remaining vector stores; masked stores are microcoded like their FP
// counterparts above (cost scales with element count).
defm : Zn4WriteResYMM<WriteVecStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
defm : Zn4WriteResXMM<WriteVecMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
defm : Zn4WriteResYMM<WriteVecMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
defm : Zn4WriteResYMM<WriteVecMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;

// GPR <-> vector register transfers.
defm : Zn4WriteResXMM<WriteVecMoveToGpr, [Zn4FPLd01], 1, [2], 1>;
defm : Zn4WriteResXMM<WriteVecMoveFromGpr, [Zn4FPLd01], 1, [2], 1>;

// MMX -> XMM/GPR moves (MOVQ2DQ and friends): 2 µops.
def Zn4WriteMOVMMX : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
  let Latency = 1;
  let ResourceCycles = [1, 2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteMOVMMX], (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;

// 32-bit MMX moves: same shape but double the FP-misc occupancy.
def Zn4WriteMOVMMXSlow : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
  let Latency = 1;
  let ResourceCycles = [1, 4];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteMOVMMXSlow], (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;
| |
defm : Zn4WriteResXMMPair<WriteVecALU, [Zn4FPVAdd0123], 1, [1], 1>;  // Vector integer ALU op, no logicals.

// SSE4a EXTRQ/INSERTQ (register-count forms): single µop, 3-cycle latency.
def Zn4WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
  let Latency = 3;
  let ResourceCycles = [1, 1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteEXTRQ_INSERTQ], (instrs EXTRQ, INSERTQ)>;

// Immediate forms take an extra µop.
def Zn4WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
  let Latency = 3;
  let ResourceCycles = [1, 1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteEXTRQI_INSERTQI], (instrs EXTRQI, INSERTQI)>;
| |
defm : Zn4WriteResXMMPair<WriteVecALUX, [Zn4FPVAdd0123], 1, [1], 1>;  // Vector integer ALU op, no logicals (XMM).

// Slower subset of the XMM integer ALU ops (saturating add/sub, abs, avg,
// sign, PCMPEQQ): 2-cycle latency, restricted to the FPVAdd01 pipe pair.
def Zn4WriteVecALUXSlow : SchedWriteRes<[Zn4FPVAdd01]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecALUXSlow], (instrs PABSBrr, PABSDrr, PABSWrr,
                                            PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
                                            PAVGBrr, PAVGWrr,
                                            PSIGNBrr, PSIGNDrr, PSIGNWrr,
                                            VPABSBrr, VPABSDrr, VPABSWrr,
                                            VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
                                            VPAVGBrr, VPAVGWrr,
                                            VPCMPEQQrr,
                                            VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
                                            PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr, VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;
| |
// AVX-512 opmask (k-register) ALU ops: single µop, single cycle on the
// opmask pipes.
def Zn4WriteVecOpMask : SchedWriteRes<[Zn4FPOpMask01]> {
  let Latency = 1;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecOpMask], (instrs KADDBrr, KADDDrr, KADDQrr, KADDWrr,
                                          KANDBrr, KANDDrr, KANDQrr, KANDWrr,
                                          KANDNBrr, KANDNDrr, KANDNQrr, KANDNWrr,
                                          KMOVBkk, KMOVDkk, KMOVQkk, KMOVWkk,
                                          KMOVBrk, KMOVDrk, KMOVQrk, KMOVWrk,
                                          KNOTBrr, KNOTDrr, KNOTQrr, KNOTWrr,
                                          KORBrr, KORDrr, KORQrr, KORWrr,
                                          KORTESTBrr, KORTESTDrr, KORTESTQrr, KORTESTWrr,
                                          KTESTBrr, KTESTDrr, KTESTQrr, KTESTWrr,
                                          KUNPCKBWrr, KUNPCKDQrr, KUNPCKWDrr,
                                          KXNORBrr, KXNORDrr, KXNORQrr, KXNORWrr,
                                          KXORBrr, KXORDrr, KXORQrr, KXORWrr)>;

// k-register stores (KMOV*mk) use the dedicated opmask port 4.
def Zn4WriteVecOpMaskMemMov : SchedWriteRes<[Zn4FPOpMask4]> {
  let Latency = 1;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecOpMaskMemMov], (instrs KMOVBmk, KMOVDmk, KMOVQmk, KMOVWmk)>;

// GPR -> k-register moves (KMOV*kr) also use opmask port 4.
def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> {
  let Latency = 1;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecOpMaskKRMov], (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>;
| |
// AVX-512 VALIGND/VALIGNQ immediate forms.
def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
  // TODO: All align instructions are expected to be of 4 cycle latency
  let Latency = 4;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecALU2Slow], (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri,
                                            VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri)
            >;
defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>;  // Vector integer ALU op, no logicals (YMM).

// YMM counterpart of the "slow" XMM ALU subset above, restricted to the
// FPVAdd01 pipes. NOTE(review): latency here is 1 while the XMM variant
// (Zn4WriteVecALUXSlow) uses 2 -- confirm whether the asymmetry is measured.
def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> {
  let Latency = 1;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecALUYSlow], (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
                                            VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
                                            VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
                                            VPAVGBYrr, VPAVGWYrr,
                                            VPCMPEQQYrr,
                                            VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;
| |
// Main vector-integer table. Same convention as the FP table: XMM and YMM
// share cost, ZMM doubles the ResourceCycles.
defm : Zn4WriteResZMMPair<WriteVecALUZ, [Zn4FPVAdd0123], 1, [2], 1>;  // Vector integer ALU op, no logicals (ZMM).

defm : Zn4WriteResXMMPair<WriteVecLogic, [Zn4FPVMisc0123], 1, [1], 1>;  // Vector integer and/or/xor logicals.
defm : Zn4WriteResXMMPair<WriteVecLogicX, [Zn4FPVMisc0123], 1, [1], 1>;  // Vector integer and/or/xor logicals (XMM).
defm : Zn4WriteResYMMPair<WriteVecLogicY, [Zn4FPVMisc0123], 1, [1], 1>;  // Vector integer and/or/xor logicals (YMM).
defm : Zn4WriteResZMMPair<WriteVecLogicZ, [Zn4FPVMisc0123], 1, [2], 1>;  // Vector integer and/or/xor logicals (ZMM).
defm : Zn4WriteResXMMPair<WriteVecTest, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
defm : Zn4WriteResYMMPair<WriteVecTestY, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (YMM).
defm : Zn4WriteResZMMPair<WriteVecTestZ, [Zn4FPVAdd12, Zn4FPSt], 1, [2, 2], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (ZMM).
defm : Zn4WriteResXMMPair<WriteVecShift, [Zn4FPVShift01], 1, [1], 1>;  // Vector integer shifts (default).
defm : Zn4WriteResXMMPair<WriteVecShiftX, [Zn4FPVShift01], 2, [2], 1>;  // Vector integer shifts (XMM).
defm : Zn4WriteResYMMPair<WriteVecShiftY, [Zn4FPVShift01], 1, [1], 1>;  // Vector integer shifts (YMM).
defm : Zn4WriteResZMMPair<WriteVecShiftZ, [Zn4FPVShift01], 1, [2], 1>;  // Vector integer shifts (ZMM).
defm : Zn4WriteResXMMPair<WriteVecShiftImm, [Zn4FPVShift01], 1, [1], 1>;  // Vector integer immediate shifts (default).
defm : Zn4WriteResXMMPair<WriteVecShiftImmX, [Zn4FPVShift01], 1, [1], 1>;  // Vector integer immediate shifts (XMM).
defm : Zn4WriteResYMMPair<WriteVecShiftImmY, [Zn4FPVShift01], 1, [1], 1>;  // Vector integer immediate shifts (YMM).
defm : Zn4WriteResZMMPair<WriteVecShiftImmZ, [Zn4FPVShift01], 1, [2], 1>;  // Vector integer immediate shifts (ZMM).
defm : Zn4WriteResXMMPair<WriteVecIMul, [Zn4FPVMul01], 3, [1], 1>;  // Vector integer multiply (default).
defm : Zn4WriteResXMMPair<WriteVecIMulX, [Zn4FPVMul01], 3, [1], 1>;  // Vector integer multiply (XMM).
defm : Zn4WriteResYMMPair<WriteVecIMulY, [Zn4FPVMul01], 3, [1], 1>;  // Vector integer multiply (YMM).
defm : Zn4WriteResZMMPair<WriteVecIMulZ, [Zn4FPVMul01], 3, [2], 1>;  // Vector integer multiply (ZMM).
defm : Zn4WriteResXMMPair<WritePMULLD, [Zn4FPVMul01], 3, [1], 1>;  // Vector PMULLD.
defm : Zn4WriteResYMMPair<WritePMULLDY, [Zn4FPVMul01], 3, [1], 1>;  // Vector PMULLD (YMM).
defm : Zn4WriteResZMMPair<WritePMULLDZ, [Zn4FPVMul01], 3, [2], 1>;  // Vector PMULLD (ZMM).
defm : Zn4WriteResXMMPair<WriteShuffle, [Zn4FPVShuf01], 1, [1], 1>;  // Vector shuffles.
defm : Zn4WriteResXMMPair<WriteShuffleX, [Zn4FPVShuf01], 1, [1], 1>;  // Vector shuffles (XMM).
defm : Zn4WriteResYMMPair<WriteShuffleY, [Zn4FPVShuf01], 1, [1], 1>;  // Vector shuffles (YMM).
defm : Zn4WriteResZMMPair<WriteShuffleZ, [Zn4FPVShuf01], 1, [2], 1>;  // Vector shuffles (ZMM).
defm : Zn4WriteResXMMPair<WriteVarShuffle, [Zn4FPVShuf01], 1, [1], 1>;  // Vector variable shuffles.
defm : Zn4WriteResXMMPair<WriteVarShuffleX, [Zn4FPVShuf01], 1, [1], 1>;  // Vector variable shuffles (XMM).
defm : Zn4WriteResYMMPair<WriteVarShuffleY, [Zn4FPVShuf01], 1, [1], 1>;  // Vector variable shuffles (YMM).
defm : Zn4WriteResZMMPair<WriteVarShuffleZ, [Zn4FPVShuf01], 1, [2], 1>;  // Vector variable shuffles (ZMM).
defm : Zn4WriteResXMMPair<WriteBlend, [Zn4FPVMisc0123], 1, [1], 1>;  // Vector blends.
defm : Zn4WriteResYMMPair<WriteBlendY, [Zn4FPVMisc0123], 1, [1], 1>;  // Vector blends (YMM).
defm : Zn4WriteResZMMPair<WriteBlendZ, [Zn4FPVMisc0123], 1, [2], 1>;  // Vector blends (ZMM).
defm : Zn4WriteResXMMPair<WriteVarBlend, [Zn4FPVMul01], 1, [1], 1>;  // Vector variable blends.
defm : Zn4WriteResYMMPair<WriteVarBlendY, [Zn4FPVMul01], 1, [1], 1>;  // Vector variable blends (YMM).
defm : Zn4WriteResZMMPair<WriteVarBlendZ, [Zn4FPVMul01], 1, [2], 1>;  // Vector variable blends (ZMM).
defm : Zn4WriteResXMMPair<WritePSADBW, [Zn4FPVAdd0123], 3, [2], 1>;  // Vector PSADBW.
defm : Zn4WriteResXMMPair<WritePSADBWX, [Zn4FPVAdd0123], 3, [2], 1>;  // Vector PSADBW (XMM).
defm : Zn4WriteResYMMPair<WritePSADBWY, [Zn4FPVAdd0123], 3, [2], 1>;  // Vector PSADBW (YMM).
defm : Zn4WriteResZMMPair<WritePSADBWZ, [Zn4FPVAdd0123], 4, [4], 1>;  // Vector PSADBW (ZMM).
defm : Zn4WriteResXMMPair<WriteMPSAD, [Zn4FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>;  // Vector MPSAD.
defm : Zn4WriteResYMMPair<WriteMPSADY, [Zn4FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>;  // Vector MPSAD (YMM).
defm : Zn4WriteResZMMPair<WriteMPSADZ, [Zn4FPVAdd0123], 4, [16], 3, /*LoadUOps=*/1>;  // Vector MPSAD (ZMM).
defm : Zn4WriteResXMMPair<WritePHMINPOS, [Zn4FPVAdd01], 3, [1], 1>;  // Vector PHMINPOS.

// Vector insert/extract operations.
// Negative LoadUOps: the load-folded insert form has one µop fewer than
// register-form count + 1 load µop would suggest.
defm : Zn4WriteResXMMPair<WriteVecInsert, [Zn4FPLd01], 1, [2], 2, /*LoadUOps=*/-1>;  // Insert gpr to vector element.
defm : Zn4WriteResXMM<WriteVecExtract, [Zn4FPLd01], 1, [2], 2>;  // Extract vector element to gpr.
defm : Zn4WriteResXMM<WriteVecExtractSt, [Zn4FPSt, Zn4Store], !add(1, Znver4Model.StoreLatency), [1, 1], 2>;  // Extract vector element and store.

// MOVMSK operations.
defm : Zn4WriteResXMM<WriteFMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
defm : Zn4WriteResXMM<WriteVecMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
defm : Zn4WriteResYMM<WriteVecMOVMSKY, [Zn4FPVMisc2], 1, [1], 1>;
defm : Zn4WriteResXMM<WriteMMXMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
| |
// Conversion between integer and float.
defm : Zn4WriteResXMMPair<WriteCvtSD2I, [Zn4FPFCvt01], 1, [1], 1>;  // Double -> Integer.
defm : Zn4WriteResXMMPair<WriteCvtPD2I, [Zn4FPFCvt01], 3, [2], 1>;  // Double -> Integer (XMM).
defm : Zn4WriteResYMMPair<WriteCvtPD2IY, [Zn4FPFCvt01], 3, [2], 2>;  // Double -> Integer (YMM).
defm : Zn4WriteResZMMPair<WriteCvtPD2IZ, [Zn4FPFCvt01], 3, [4], 2>;  // Double -> Integer (ZMM).

// MMX double->int conversion.
// NOTE(review): no InstRW for this def is visible in this part of the file;
// presumably referenced elsewhere -- verify it is not dead.
def Zn4WriteCvtPD2IMMX : SchedWriteRes<[Zn4FPFCvt01]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 2;
}
defm : Zn4WriteResXMMPair<WriteCvtSS2I, [Zn4FPFCvt01], 5, [5], 2>;  // Float -> Integer.

defm : Zn4WriteResXMMPair<WriteCvtPS2I, [Zn4FPFCvt01], 3, [1], 1>;  // Float -> Integer (XMM).
defm : Zn4WriteResYMMPair<WriteCvtPS2IY, [Zn4FPFCvt01], 4, [1], 1>;  // Float -> Integer (YMM).
defm : Zn4WriteResZMMPair<WriteCvtPS2IZ, [Zn4FPFCvt01], 4, [2], 2>;  // Float -> Integer (ZMM).

defm : Zn4WriteResXMMPair<WriteCvtI2SD, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>;  // Integer -> Double.
defm : Zn4WriteResXMMPair<WriteCvtI2PD, [Zn4FPFCvt01], 3, [1], 1>;  // Integer -> Double (XMM).
defm : Zn4WriteResYMMPair<WriteCvtI2PDY, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>;  // Integer -> Double (YMM).
defm : Zn4WriteResZMMPair<WriteCvtI2PDZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>;  // Integer -> Double (ZMM).

// MMX int->double conversion; same review note as Zn4WriteCvtPD2IMMX above.
def Zn4WriteCvtI2PDMMX : SchedWriteRes<[Zn4FPFCvt01]> {
  let Latency = 2;
  let ResourceCycles = [6];
  let NumMicroOps = 2;
}

defm : Zn4WriteResXMMPair<WriteCvtI2SS, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>;  // Integer -> Float.
defm : Zn4WriteResXMMPair<WriteCvtI2PS, [Zn4FPFCvt01], 3, [1], 1>;  // Integer -> Float (XMM).
defm : Zn4WriteResYMMPair<WriteCvtI2PSY, [Zn4FPFCvt01], 3, [1], 1>;  // Integer -> Float (YMM).
defm : Zn4WriteResZMMPair<WriteCvtI2PSZ, [Zn4FPFCvt01], 3, [2], 2>;  // Integer -> Float (ZMM).

// MMX int->float conversion; same review note as Zn4WriteCvtPD2IMMX above.
def Zn4WriteCvtI2PSMMX : SchedWriteRes<[Zn4FPFCvt01]> {
  let Latency = 3;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}

defm : Zn4WriteResXMMPair<WriteCvtSS2SD, [Zn4FPFCvt01], 3, [1], 1>;  // Float -> Double size conversion.
defm : Zn4WriteResXMMPair<WriteCvtPS2PD, [Zn4FPFCvt01], 3, [1], 1>;  // Float -> Double size conversion (XMM).
defm : Zn4WriteResYMMPair<WriteCvtPS2PDY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>;  // Float -> Double size conversion (YMM).
defm : Zn4WriteResZMMPair<WriteCvtPS2PDZ, [Zn4FPFCvt01], 6, [4], 4, /*LoadUOps=*/-1>;  // Float -> Double size conversion (ZMM).

defm : Zn4WriteResXMMPair<WriteCvtSD2SS, [Zn4FPFCvt01], 3, [1], 1>;  // Double -> Float size conversion.
defm : Zn4WriteResXMMPair<WriteCvtPD2PS, [Zn4FPFCvt01], 3, [1], 1>;  // Double -> Float size conversion (XMM).
defm : Zn4WriteResYMMPair<WriteCvtPD2PSY, [Zn4FPFCvt01], 6, [2], 2>;  // Double -> Float size conversion (YMM).
defm : Zn4WriteResZMMPair<WriteCvtPD2PSZ, [Zn4FPFCvt01], 6, [4], 4>;  // Double -> Float size conversion (ZMM).

defm : Zn4WriteResXMMPair<WriteCvtPH2PS, [Zn4FPFCvt01], 3, [1], 1>;  // Half -> Float size conversion.
defm : Zn4WriteResYMMPair<WriteCvtPH2PSY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>;  // Half -> Float size conversion (YMM).
defm : Zn4WriteResZMMPair<WriteCvtPH2PSZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>;  // Half -> Float size conversion (ZMM).

defm : Zn4WriteResXMM<WriteCvtPS2PH, [Zn4FPFCvt01], 3, [2], 1>;  // Float -> Half size conversion.
defm : Zn4WriteResYMM<WriteCvtPS2PHY, [Zn4FPFCvt01], 6, [2], 2>;  // Float -> Half size conversion (YMM).
defm : Zn4WriteResZMM<WriteCvtPS2PHZ, [Zn4FPFCvt01], 6, [2], 2>;  // Float -> Half size conversion (ZMM).

// Float -> Half + store.
// NOTE(review): the ZMM store variant below is declared via Zn4WriteResYMM,
// not Zn4WriteResZMM, unlike the register variant above -- confirm whether
// that is intentional or a copy/paste leftover.
defm : Zn4WriteResXMM<WriteCvtPS2PHSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(3, Znver4Model.StoreLatency), [1, 1, 1], 2>;  // Float -> Half + store size conversion.
defm : Zn4WriteResYMM<WriteCvtPS2PHYSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>;  // Float -> Half + store size conversion (YMM).
defm : Zn4WriteResYMM<WriteCvtPS2PHZSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>;  // Float -> Half + store size conversion (ZMM).
| |
// CRC32 instruction.
defm : Zn4WriteResIntPair<WriteCRC32, [Zn4ALU1], 3, [1], 1>;

// SHA-NI instructions. Each rr write is paired with an rm write whose
// latency adds the model's load latency and whose uop count is derived from
// the rr record via !add. Note the deltas differ: the SHA1MSG*/SHA256MSG1
// rm forms add 0 extra uops while SHA256MSG2rm adds 1.
// NOTE(review): presumably measured that way with llvm-exegesis -- confirm
// the 0-delta forms are intentional and not a copy-paste of the rr count.
def Zn4WriteSHA1MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;

def Zn4WriteSHA1MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG1rr.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteSHA1MSG1rr.NumMicroOps, 0);
}
def : InstRW<[Zn4WriteSHA1MSG1rm], (instrs SHA1MSG1rm)>;

def Zn4WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 1;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;

def Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
}
def : InstRW<[Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm], (instrs SHA1MSG2rm, SHA1NEXTErm)>;

def Zn4WriteSHA256MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 2;
  let ResourceCycles = [3];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;

def Zn4Writerm_SHA256MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG1rr.Latency);
  let ResourceCycles = [1, 1, 3];
  let NumMicroOps = !add(Zn4WriteSHA256MSG1rr.NumMicroOps, 0);
}
def : InstRW<[Zn4Writerm_SHA256MSG1rm], (instrs SHA256MSG1rm)>;

def Zn4WriteSHA256MSG2rr : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 3;
  let ResourceCycles = [8];
  let NumMicroOps = 4;
}
def : InstRW<[Zn4WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;

def Zn4WriteSHA256MSG2rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG2rr.Latency);
  let ResourceCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn4WriteSHA256MSG2rr.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteSHA256MSG2rm], (instrs SHA256MSG2rm)>;

// Round functions are register-only (no rm variants modeled here).
def Zn4WriteSHA1RNDS4rri : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 6;
  let ResourceCycles = [8];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteSHA1RNDS4rri], (instrs SHA1RNDS4rri)>;

def Zn4WriteSHA256RNDS2rr : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 4;
  let ResourceCycles = [8];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;
| |
// Strings instructions.
// SSE4.2 PCMPxSTRx run on the vector-add pipes; the explicit-length forms
// are considerably more micro-coded than the implicit-length forms.
// Packed Compare Implicit Length Strings, Return Mask
defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
// Packed Compare Explicit Length Strings, Return Mask
defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
// Packed Compare Implicit Length Strings, Return Index
defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>;
// Packed Compare Explicit Length Strings, Return Index
defm : Zn4WriteResXMMPair<WritePCmpEStrI, [Zn4FPVAdd0123], 6, [12], 8, /*LoadUOps=*/4>;

// AES instructions.
// All AES ops go to the dedicated AES pipes with uniform 4-cycle latency.
defm : Zn4WriteResXMMPair<WriteAESDecEnc, [Zn4FPAES01], 4, [1], 1>; // Decryption, encryption.
defm : Zn4WriteResXMMPair<WriteAESIMC, [Zn4FPAES01], 4, [1], 1>; // InvMixColumn.
defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Generation.

// Carry-less multiplication instructions.
defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [4], 4>;
| |
// EMMS/FEMMS
defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis

// Load/store MXCSR
// STMXCSR occupies the ALUs for 60 cycles -- it serializes the FP state.
defm : Zn4WriteResInt<WriteLDMXCSR, [Zn4AGU012, Zn4Load, Zn4ALU0123], !add(Znver4Model.LoadLatency, 1), [1, 1, 6], 1>; // FIXME: latency not from llvm-exegesis
defm : Zn4WriteResInt<WriteSTMXCSR, [Zn4ALU0123, Zn4AGU012, Zn4Store], !add(1, Znver4Model.StoreLatency), [60, 1, 1], 2>; // FIXME: latency not from llvm-exegesis

// Catch-all for expensive system instructions.
// Deliberately pessimistic placeholder numbers (100/100/100).
defm : Zn4WriteResInt<WriteSystem, [Zn4ALU0123], 100, [100], 100>;
| |
// VZEROUPPER is modeled as free; VZEROALL must actually clear all vector
// registers and is heavily micro-coded.
def Zn4WriteVZEROUPPER : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 0; // FIXME: not from llvm-exegesis
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVZEROUPPER], (instrs VZEROUPPER)>;

def Zn4WriteVZEROALL : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 10; // FIXME: not from llvm-exegesis
  let ResourceCycles = [24];
  let NumMicroOps = 18;
}
def : InstRW<[Zn4WriteVZEROALL], (instrs VZEROALL)>;
| |
// AVX2.
// Cross-lane shuffles/permutes run on the FP vector-shuffle pipe; specific
// VPERM* opcodes below override the generic classes with per-opcode InstRW
// records (rm variants derive latency/uops from the matching rr record).
defm : Zn4WriteResYMMPair<WriteFShuffle256, [Zn4FPVShuf], 2, [1], 1, /*LoadUOps=*/2>; // Fp 256-bit width vector shuffles.
defm : Zn4WriteResYMMPair<WriteFVarShuffle256, [Zn4FPVShuf], 7, [1], 2, /*LoadUOps=*/1>; // Fp 256-bit width variable shuffles.
defm : Zn4WriteResYMMPair<WriteShuffle256, [Zn4FPVShuf], 1, [1], 1>; // 256-bit width vector shuffles.

def Zn4WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn4FPVShuf]> {
  let Latency = 3;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rr, VPERM2F128rr)>;

// NOTE(review): only VPERM2F128rm gets this memory write; VPERM2I128rm is
// not bound here -- confirm it is covered elsewhere or falls back to the
// generic WriteShuffle256 load form.
def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency);
  let ResourceCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn4WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
}
def : InstRW<[Zn4WriteVPERM2F128rm], (instrs VPERM2F128rm)>;

def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
  let Latency = 7;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;

def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMPSYrr.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteVPERMPSYrm], (instrs VPERMPSYrm)>;

def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
  let Latency = 6;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;

def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMYri.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteVPERMPDYmi], (instrs VPERMPDYmi)>;

def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
  let Latency = 5;
  let ResourceCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;

def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMDYrr.Latency);
  let ResourceCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0);
}
def : InstRW<[Zn4WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;

defm : Zn4WriteResYMMPair<WriteVPMOV256, [Zn4FPVShuf01], 4, [3], 2, /*LoadUOps=*/-1>; // 256-bit width packed vector width-changing move.
defm : Zn4WriteResYMMPair<WriteVarShuffle256, [Zn4FPVShuf01], 1, [1], 2>; // 256-bit width vector variable shuffles.
defm : Zn4WriteResXMMPair<WriteVarVecShift, [Zn4FPVShift01], 1, [1], 1>; // Variable vector shifts.
defm : Zn4WriteResYMMPair<WriteVarVecShiftY, [Zn4FPVShift01], 1, [1], 1>; // Variable vector shifts (YMM).
defm : Zn4WriteResZMMPair<WriteVarVecShiftZ, [Zn4FPVShift01], 1, [2], 2>; // Variable vector shifts (ZMM).
| |
// Old microcoded instructions that nobody uses.
// Deliberately pessimistic placeholder (100/100/100).
defm : Zn4WriteResInt<WriteMicrocoded, [Zn4ALU0123], 100, [100], 100>;

// Fence instructions.
defm : Zn4WriteResInt<WriteFence, [Zn4ALU0123], 1, [100], 1>;

// LFENCE occupies the load/store unit for 30 cycles, modeling its
// dispatch-serializing behavior; SFENCE is cheap.
def Zn4WriteLFENCE : SchedWriteRes<[Zn4LSU]> {
  let Latency = 1;
  let ResourceCycles = [30];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteLFENCE], (instrs LFENCE)>;

def Zn4WriteSFENCE : SchedWriteRes<[Zn4LSU]> {
  let Latency = 1;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteSFENCE], (instrs SFENCE)>;

// Nop, not very useful except that it provides a model for nops!
defm : Zn4WriteResInt<WriteNop, [Zn4ALU0123], 0, [1], 1>; // FIXME: latency not from llvm-exegesis
| |
| |
| /////////////////////////////////////////////////////////////////////////////// |
| // Zero Cycle Move |
| /////////////////////////////////////////////////////////////////////////////// |
| |
| def Zn4WriteZeroLatency : SchedWriteRes<[]> { |
| let Latency = 0; |
| let ResourceCycles = []; |
| let NumMicroOps = 1; |
| } |
| def : InstRW<[Zn4WriteZeroLatency], (instrs MOV32rr, MOV32rr_REV, |
| MOV64rr, MOV64rr_REV, |
| MOVSX32rr32)>; |
| |
| def Zn4WriteSwapRenameable : SchedWriteRes<[]> { |
| let Latency = 0; |
| let ResourceCycles = []; |
| let NumMicroOps = 2; |
| } |
| def : InstRW<[Zn4WriteSwapRenameable], (instrs XCHG32rr, XCHG32ar, |
| XCHG64rr, XCHG64ar)>; |
| |
| defm : Zn4WriteResInt<WriteXCHG, [Zn4ALU0123], 0, [8], 2>; // Compare+Exchange - TODO RMW support. |
| |
| defm : Zn4WriteResXMM<WriteFMoveX, [], 0, [], 1>; |
| defm : Zn4WriteResYMM<WriteFMoveY, [], 0, [], 1>; |
| defm : Zn4WriteResYMM<WriteFMoveZ, [], 0, [], 1>; |
| |
| defm : Zn4WriteResXMM<WriteVecMove, [Zn4FPFMisc0123], 1, [1], 1>; // MMX |
| defm : Zn4WriteResXMM<WriteVecMoveX, [], 0, [], 1>; |
| defm : Zn4WriteResYMM<WriteVecMoveY, [], 0, [], 1>; |
| defm : Zn4WriteResYMM<WriteVecMoveZ, [], 0, [], 1>; |
| |
// Declares which register-to-register moves the move-elimination machinery
// (e.g. llvm-mca) may treat as eliminable at rename. TruePred: always
// eligible. Keep this list in sync with the zero-latency InstRW records
// above.
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr, MOV32rr_REV,
    MOV64rr, MOV64rr_REV,
    MOVSX32rr32,
    XCHG32rr, XCHG32ar,
    XCHG64rr, XCHG64ar,

    // MMX variants.
    // MMX moves are *NOT* eliminated.

    // SSE variants.
    MOVAPSrr, MOVAPSrr_REV,
    MOVUPSrr, MOVUPSrr_REV,
    MOVAPDrr, MOVAPDrr_REV,
    MOVUPDrr, MOVUPDrr_REV,
    MOVDQArr, MOVDQArr_REV,
    MOVDQUrr, MOVDQUrr_REV,

    // AVX variants.
    VMOVAPSrr, VMOVAPSrr_REV,
    VMOVUPSrr, VMOVUPSrr_REV,
    VMOVAPDrr, VMOVAPDrr_REV,
    VMOVUPDrr, VMOVUPDrr_REV,
    VMOVDQArr, VMOVDQArr_REV,
    VMOVDQUrr, VMOVDQUrr_REV,

    // AVX YMM variants.
    VMOVAPSYrr, VMOVAPSYrr_REV,
    VMOVUPSYrr, VMOVUPSYrr_REV,
    VMOVAPDYrr, VMOVAPDYrr_REV,
    VMOVUPDYrr, VMOVUPDYrr_REV,
    VMOVDQAYrr, VMOVDQAYrr_REV,
    VMOVDQUYrr, VMOVDQUYrr_REV,
  ], TruePred >
]>;
| |
// FIXUP and RANGE Instructions
// AVX-512 fixup/range ops, matched by regex over the masked/unmasked
// EVEX variants and bound to the FP misc pipes 0/1.
def Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr : SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr], (instregex
  "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz",
  "VFIXUPIMM(S|P)(S|D)(Z128|Z256?)rri", "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)",
  "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz"
  )>;

// SCALE & REDUCE instructions
def Zn4WriteSCALErr: SchedWriteRes<[Zn4FPFMisc23]> {
  let Latency = 6;
  let ResourceCycles = [6];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteSCALErr], (instregex
  "V(SCALEF|REDUCE)(S|P)(S|D)(Z?|Z128?|Z256?)(rr|rrb|rrkz|rrik|rrikz|rri)(_Int?|_Intkz?)",
  "(V?)REDUCE(PD|PS|SD|SS)(Z?|Z128?)(rri|rrikz|rrib)"
  )>;

//BF16PS Instructions
def Zn4WriteBF16: SchedWriteRes<[Zn4FPFMisc23]> {
  let Latency = 6;
  let ResourceCycles = [6];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteBF16], (instregex
  "(V?)DPBF16PS(Z?|Z128?|Z256?)(r|rk|rkz)"
  )>;

// BUSD and VPMADD Instructions
// (VPDPBUSD/VPDPWSSD dot-product and VPMADD52 multiply-add.)
def Zn4WriteBUSDr_VPMADDr: SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 4;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteBUSDr_VPMADDr], (instregex
  "VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)",
  "VPMADD52(H|L)UQ(Z|Z128|Z256)(r|rk|rkz)"
  )>;
| |
// SHIFT instructions
// 2-cycle class for funnel shifts, rotates, shuffles and vector-count
// shifts; the 1-cycle class below covers immediate shifts.
def Zn4WriteSHIFTrr: SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
// NOTE(review): "VFMSUB231SSZr_Intkz" looks out of place in a shift class --
// confirm it belongs here rather than with the FMA writes.
def : InstRW<[Zn4WriteSHIFTrr], (instregex
  "VP(LZCNT|SHLD|SHRD?)(D|Q|W|VD|VQ|VW?)(Z?|Z128?|Z256?)(rr|rk|rrk|rrkz|rri|rrik|rrikz)",
  "(V?)P(SLL|SRL|SRA)(D|Q|W|DQ)(Y?|Z?|Z128?|Z256?)(rr|rrk|rrkz)",
  "(V?)P(SLL|SRL|SRA)DQYri",
  "(V?)P(SLL|SRL)DQ(Z?|Z256?)ri",
  "(V?)P(SHUFB)(Y|Z|Z128|Z256?)(rr|rrk|rrkz)",
  "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)",
  "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)",
  "(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)",
  "VPSHUFBITQMBZ128rr", "VFMSUB231SSZr_Intkz"
  )>;

def Zn4WriteSHIFTri: SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 1;
  let ResourceCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteSHIFTri], (instregex
  "VP(SLL|SRL|SRA)(D|Q|W)(Z|Z128|Z256?)(ri|rik|rikz)"
  )>;

// ALIGN Instructions
def Zn4WriteALIGN: SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteALIGN], (instregex
  "(V?)PALIGNR(Z?|Z128?|Z256?)(rri|rrik|rrikz)"
  )>;

//PACK Instructions
def Zn4WritePACK: SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WritePACK], (instregex
  "(V?)PACK(SS|US)(DW|WB)(Z?|Z128?|Z256?)(rr|rrk|rrkz)"
  )>;

// MAX and MIN Instructions
// Also covers register CMPSS/CMPSD/CMPPS/CMPPD compares.
def Zn4WriteFCmp64: SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteFCmp64], (instregex
  "(V?)CMP(S|P)(S|D)(rr|rri|rr_Int)",
  "(V?|VP?)(MAX|MIN|MINC|MAXC)(S|P|U)(S|D|Q)(Z?|Z128?|Z256?)(rr|rri|rrk|rrkz)(_Int?)",
  "VP(MAX|MIN)(SQ|UQ)(Z|Z128|Z256)(rr|rrk|rrkz)",
  "(V?)(MAX|MAXC|MIN|MINC)PD(Z|Z128|Z256?)(rr|rrk|rrkz)"
  )>;
| |
// MOV Instructions
// Packed move/extend/truncate: latency grows with vector width --
// 2 cycles for XMM/YMM forms, 4 for ZMM extends, 5 for ZMM truncates.
def Zn4MOVS: SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4MOVS], (instregex
  "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z128?|Z256?)(rr|rrk|rrkz)",
  "(V?)PMOV(SX|QD|UZ|ZX)(BD|BQ|BW?)(Y|Z128?)(rr|rrk|rrkz)",
  "(V?)PMOV(SX|US|ZX)(DQ|WD|QW|WQ?)(Y|Z128?)(rr|rrk|rrkz)",
  "(V?)VMOVDDUP(Z|Z128|Z256)(rr|rrk|rrkz)",
  "VPMOV(DB|DW|QB|QD|QW|SDB|SDW|SQB|SQD|SQW|SWB|USDB|USDW|USQB|USQD|USWB|WB)(Z128?)(rr|rrk|rrkz)"
  )>;

def Zn4MOVSZ: SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 4;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4MOVSZ], (instregex
  "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z?)(rr|rrk|rrkz)"
  )>;

def Zn4MOVSrr: SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 5;
  let ResourceCycles = [5];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4MOVSrr], (instregex
  "(V?)PMOV(DB|QB|QW|SDB|SQB|SQW|USDB|USQB|USQW)(Z?)(rr|rrk|rrkz)"
  )>;


//VPTEST Instructions
// Mask-producing vector tests: latency 3/4/5 for Z128/Z256/Z forms.
def Zn4VPTESTZ128: SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 3;
  let ResourceCycles = [3];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4VPTESTZ128], (instregex
  "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z128?)(rrk)"
  )>;

def Zn4VPTESTZ256: SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 4;
  let ResourceCycles = [4];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4VPTESTZ256], (instregex
  "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z256?)(rr|rrk)"
  )>;

def Zn4VPTESTZ: SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 5;
  let ResourceCycles = [5];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4VPTESTZ], (instregex
  "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z?)(rrk)"
  )>;
| |
// CONFLICT Instructions
// VPCONFLICT: cheap for 128-bit; the 256/512-bit forms are micro-coded
// across three pipe groups.
def Zn4CONFLICTZ128: SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4CONFLICTZ128], (instregex
  "VPCONFLICT(D|Q)(Z128)(rr|rrk|rrkz)"
  )>;

def Zn4CONFLICTrr: SchedWriteRes<[Zn4FPFMisc01,Zn4FPFMisc12,Zn4FPFMisc23]> {
  let Latency = 6;
  let ResourceCycles = [2,2,2];
  let NumMicroOps = 4;
}
def : InstRW<[Zn4CONFLICTrr], (instregex
  "VPCONFLICT(D|Q)(Z|Z256)(rr|rrkz)"
  )>;

// RSQRT Instructions
def Zn4VRSQRT14PDZ256: SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 5;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4VRSQRT14PDZ256], (instregex
  "VRSQRT14(PD|PS)(Z?|Z128?|Z256?)(r|rr|rk|rrk|rkz|rrkz)"
  )>;
| |
| |
// PERM Instructions
// AVX-512 permutes/compress/expand; latency scales with vector width:
// 2 (in-lane PERMILP), 3 (two-source 128-bit), 4 (256-bit), 5 (512-bit).
def Zn4PERMILP: SchedWriteRes<[Zn4FPFMisc123]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4PERMILP], (instregex
  "VPERMILP(S|D)(Y|Z|Z128|Z256)(rr|rrk|rrkz)"
  )>;

def Zn4PERMIT2_128: SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 3;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4PERMIT2_128], (instregex
  "VPERM(I2|T2)(PS|PD|W)128(rr|rrk|rrkz)",
  "VPERM(I2|T2)(B|D|Q)128(rr|rrk|rrkz)"
  )>;

def Zn4PERMIT2_128rr:SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4PERMIT2_128rr], (instregex
  "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z128(rr|rrk|rrkz)",
  "VPERM(B|D|Q|W)(Z128?)(rr|rrk|rrkz)"
  )>;

def Zn4PERMIT2_256: SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 4;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4PERMIT2_256], (instregex
  "VPERM(I2|T2)(PS|PD|W)256(rr|rrk|rrkz)",
  "VPERMP(S|D)Z256(rr|rrk|rrkz)",
  "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z256(rr|rrk|rrkz)",
  "VPERM(B|D|Q|W)Z256(rr|rrk|rrkz)",
  "VPERM(I2|Q|T2)(B|D|Q)(Z?)256(rr|rrk|rrkz)",
  "VPEXPAND(B|W)Z256(rr|rrk|rrkz)"
  )>;

def Zn4PERMIT2Z: SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 5;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4PERMIT2Z], (instregex
  "VPERM(I2|T2)(PS|PD|W)(rr|rrk|rrkz)",
  "VPERM(B|D|W)Z(rr|rrk|rrkz)",
  "VPERM(I2|Q|T2)(B|D|Q)(Z?)(rr|rrk|rrkz)",
  "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z(rr|rrk|rrkz)",
  "VPEXPAND(B|W)Z(rr|rrk|rrkz)",
  "VPERMP(S|D)Z(rr|rrk|rrkz)"
  )>;
| |
// ALU SLOW Misc Instructions
// EVEX Z128 forms of saturating add/sub, abs, average and popcount take
// 2 cycles (slower than their VEX counterparts); listed explicitly with
// their masked (rrk) and zero-masked (rrkz) variants.
def Zn4VecALUZSlow: SchedWriteRes<[Zn4FPFMisc01]> {
  let Latency = 2;
  let ResourceCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4VecALUZSlow], (instrs
  VPABSBZ128rr, VPABSBZ128rrk, VPABSBZ128rrkz, VPABSDZ128rr,
  VPABSDZ128rrk, VPABSDZ128rrkz, VPABSQZ128rr, VPABSQZ128rrk,
  VPABSQZ128rrkz, VPABSWZ128rr, VPABSWZ128rrk, VPABSWZ128rrkz,
  VPADDSBZ128rr, VPADDSBZ128rrk, VPADDSBZ128rrkz, VPADDSWZ128rr,
  VPADDSWZ128rrk, VPADDSWZ128rrkz,VPADDUSBZ128rr, VPADDUSBZ128rrk,
  VPADDUSBZ128rrkz, VPADDUSWZ128rr, VPADDUSWZ128rrk, VPADDUSWZ128rrkz,
  VPAVGBZ128rr, VPAVGBZ128rrk, VPAVGBZ128rrkz, VPAVGWZ128rr,
  VPAVGWZ128rrk, VPAVGWZ128rrkz, VPOPCNTBZ128rr, VPOPCNTBZ128rrk,
  VPOPCNTBZ128rrkz, VPOPCNTDZ128rr, VPOPCNTDZ128rrk, VPOPCNTDZ128rrkz,
  VPOPCNTQZ128rr, VPOPCNTQZ128rrk,VPOPCNTQZ128rrkz, VPOPCNTWZ128rr,
  VPOPCNTWZ128rrk, VPOPCNTWZ128rrkz,VPSUBSBZ128rr, VPSUBSBZ128rrk,
  VPSUBSBZ128rrkz, VPSUBSWZ128rr, VPSUBSWZ128rrk, VPSUBSWZ128rrkz,
  VPSUBUSBZ128rr, VPSUBUSBZ128rrk, VPSUBUSBZ128rrkz,VPSUBUSWZ128rr,
  VPSUBUSWZ128rrk, VPSUBUSWZ128rrkz
  )>;
| |
| |
| /////////////////////////////////////////////////////////////////////////////// |
| // Dependency breaking instructions. |
| /////////////////////////////////////////////////////////////////////////////// |
| |
| def Zn4WriteZeroIdiom : SchedWriteVariant<[ |
| SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>, |
| SchedVar<NoSchedPred, [WriteALU]> |
| ]>; |
| def : InstRW<[Zn4WriteZeroIdiom], (instrs XOR32rr, XOR32rr_REV, |
| XOR64rr, XOR64rr_REV, |
| SUB32rr, SUB32rr_REV, |
| SUB64rr, SUB64rr_REV)>; |
| |
| def Zn4WriteZeroIdiomEFLAGS : SchedWriteVariant<[ |
| SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn4WriteZeroLatency]>, |
| SchedVar<NoSchedPred, [WriteALU]> |
| ]>; |
| def : InstRW<[Zn4WriteZeroIdiomEFLAGS], (instrs CMP8rr, CMP8rr_REV, |
| CMP16rr, CMP16rr_REV, |
| CMP32rr, CMP32rr_REV, |
| CMP64rr, CMP64rr_REV)>; |
| |
| def Zn4WriteFZeroIdiom : SchedWriteVariant<[ |
| SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>, |
| SchedVar<NoSchedPred, [WriteFLogic]> |
| ]>; |
| // NOTE: XORPSrr, XORPDrr are not zero-cycle! |
| def : InstRW<[Zn4WriteFZeroIdiom], (instrs VXORPSrr, VXORPDrr, |
| VANDNPSrr, VANDNPDrr)>; |
| |
| def Zn4WriteFZeroIdiomY : SchedWriteVariant<[ |
| SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>, |
| SchedVar<NoSchedPred, [WriteFLogicY]> |
| ]>; |
| def : InstRW<[Zn4WriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr, |
| VANDNPSYrr, VANDNPDYrr)>; |
| |
| def Zn4WriteVZeroIdiomLogicX : SchedWriteVariant<[ |
| SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>, |
| SchedVar<NoSchedPred, [WriteVecLogicX]> |
| ]>; |
| // NOTE: PXORrr,PANDNrr are not zero-cycle! |
| def : InstRW<[Zn4WriteVZeroIdiomLogicX], (instrs VPXORrr, VPANDNrr)>; |
| |
| def Zn4WriteVZeroIdiomLogicY : SchedWriteVariant<[ |
| SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>, |
| SchedVar<NoSchedPred, [WriteVecLogicY]> |
| ]>; |
| def : InstRW<[Zn4WriteVZeroIdiomLogicY], (instrs VPXORYrr, VPANDNYrr)>; |
| |
| def Zn4WriteVZeroIdiomALUX : SchedWriteVariant<[ |
| SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>, |
| SchedVar<NoSchedPred, [WriteVecALUX]> |
| ]>; |
| // NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr, |
| // PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle! |
| def : InstRW<[Zn4WriteVZeroIdiomALUX], |
| (instrs VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr, |
| VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>; |
| |
| def Zn4WriteVZeroIdiomALUY : SchedWriteVariant<[ |
| SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>, |
| SchedVar<NoSchedPred, [WriteVecALUY]> |
| ]>; |
| def : InstRW<[Zn4WriteVZeroIdiomALUY], |
| (instrs VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr, |
| VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>; |
| |
// Tables consumed by analysis tools (e.g. llvm-mca): zero idioms always
// produce zero regardless of inputs; dependency-breaking instructions do
// not truly depend on their (identical) input registers.
def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[ XOR32rr, XOR32rr_REV,
                     XOR64rr, XOR64rr_REV,
                     SUB32rr, SUB32rr_REV,
                     SUB64rr, SUB64rr_REV ], ZeroIdiomPredicate>,

  // SSE XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr, XORPDrr,
    ANDNPSrr, ANDNPDrr,

    // int variants.
    PXORrr,
    PANDNrr,
    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
    PSUBSBrr, PSUBSWrr,
    PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSrr, VXORPDrr,
    VANDNPSrr, VANDNPDrr,

    // int variants.
    VPXORrr,
    VPANDNrr,
    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPSUBSBrr, VPSUBSWrr,
    VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
  ], ZeroIdiomPredicate>,

  // AVX YMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSYrr, VXORPDYrr,
    VANDNPSYrr, VANDNPDYrr,

    // int variants.
    VPXORYrr,
    VPANDNYrr,
    VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
    VPSUBSBYrr, VPSUBSWYrr,
    VPSUBUSBYrr, VPSUBUSWYrr,
    VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
  ], ZeroIdiomPredicate>,
]>;

// Dependency-breaking but not zero-producing: SBB r,r yields -CF; PCMPEQ
// r,r yields all-ones; CMP r,r only sets fixed flags.
def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[ SBB32rr, SBB32rr_REV,
                     SBB64rr, SBB64rr_REV ], ZeroIdiomPredicate>,
  DepBreakingClass<[ CMP8rr, CMP8rr_REV,
                     CMP16rr, CMP16rr_REV,
                     CMP32rr, CMP32rr_REV,
                     CMP64rr, CMP64rr_REV ], CheckSameRegOperand<0, 1> >,
  // SSE
  DepBreakingClass<[
    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM
  DepBreakingClass<[
    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX YMM
  DepBreakingClass<[
    VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
  ], ZeroIdiomPredicate>,
]>;
| |
| } // SchedModel |
| |