blob: be7017a7b4260b5a5c1d40ff001589b9989f8eb6 [file] [log] [blame]
//=- ARMScheduleA9.td - ARM Cortex-A9 Scheduling Definitions -*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the itinerary class data for the ARM Cortex A9 processors.
//
//===----------------------------------------------------------------------===//
// ===---------------------------------------------------------------------===//
// This section contains legacy support for itineraries. This is
// required until SD and PostRA schedulers are replaced by MachineScheduler.
//
// Ad-hoc scheduling information derived from pretty vague "Cortex-A9 Technical
// Reference Manual".
//
// Functional units
def A9_Issue0 : FuncUnit; // Issue 0
def A9_Issue1 : FuncUnit; // Issue 1
def A9_Branch : FuncUnit; // Branch
def A9_ALU0 : FuncUnit; // ALU / MUL pipeline 0
def A9_ALU1 : FuncUnit; // ALU pipeline 1
def A9_AGU : FuncUnit; // Address generation unit for ld / st
def A9_NPipe : FuncUnit; // NEON pipeline
def A9_MUX0 : FuncUnit; // AGU + NEON/FPU multiplexer
def A9_LSUnit : FuncUnit; // L/S Unit
def A9_DRegsVFP: FuncUnit; // FP register set, VFP side
def A9_DRegsN : FuncUnit; // FP register set, NEON side
// Bypasses
def A9_LdBypass : Bypass;
def CortexA9Itineraries : ProcessorItineraries<
[A9_Issue0, A9_Issue1, A9_Branch, A9_ALU0, A9_ALU1, A9_AGU, A9_NPipe, A9_MUX0,
A9_LSUnit, A9_DRegsVFP, A9_DRegsN],
[A9_LdBypass], [
// Two fully-pipelined integer ALU pipelines
//
// Move instructions, unconditional
InstrItinData<IIC_iMOVi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>,
InstrItinData<IIC_iMOVr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
InstrItinData<IIC_iMOVsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
InstrItinData<IIC_iMOVsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
InstrItinData<IIC_iMOVix2 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>,
InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>,
InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>,
InstrStage<1, [A9_ALU0, A9_ALU1]>,
InstrStage<1, [A9_ALU0, A9_ALU1]>], [3]>,
InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>,
InstrStage<1, [A9_ALU0, A9_ALU1]>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>], [5]>,
//
// MVN instructions
InstrItinData<IIC_iMVNi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>],
[1]>,
InstrItinData<IIC_iMVNr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>],
[1, 1], [NoBypass, A9_LdBypass]>,
InstrItinData<IIC_iMVNsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<2, [A9_ALU0, A9_ALU1]>],
[2, 1]>,
InstrItinData<IIC_iMVNsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<3, [A9_ALU0, A9_ALU1]>],
[3, 1, 1]>,
//
// No operand cycles
InstrItinData<IIC_iALUx , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>]>,
//
// Binary Instructions that produce a result
InstrItinData<IIC_iALUi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>],
[1, 1], [NoBypass, A9_LdBypass]>,
InstrItinData<IIC_iALUr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>],
[1, 1, 1], [NoBypass, A9_LdBypass, A9_LdBypass]>,
InstrItinData<IIC_iALUsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<2, [A9_ALU0, A9_ALU1]>],
[2, 1, 1], [NoBypass, A9_LdBypass, NoBypass]>,
InstrItinData<IIC_iALUsir,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<2, [A9_ALU0, A9_ALU1]>],
[2, 1, 1], [NoBypass, NoBypass, A9_LdBypass]>,
InstrItinData<IIC_iALUsr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<3, [A9_ALU0, A9_ALU1]>],
[3, 1, 1, 1],
[NoBypass, A9_LdBypass, NoBypass, NoBypass]>,
//
// Bitwise Instructions that produce a result
InstrItinData<IIC_iBITi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
InstrItinData<IIC_iBITr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>,
InstrItinData<IIC_iBITsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
InstrItinData<IIC_iBITsr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>,
//
// Unary Instructions that produce a result
// CLZ, RBIT, etc.
InstrItinData<IIC_iUNAr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
// BFC, BFI, UBFX, SBFX
InstrItinData<IIC_iUNAsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1]>,
//
// Zero and sign extension instructions
InstrItinData<IIC_iEXTr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>], [2, 1]>,
InstrItinData<IIC_iEXTAr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<2, [A9_ALU0, A9_ALU1]>], [3, 1, 1]>,
InstrItinData<IIC_iEXTAsr,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>,
//
// Compare instructions
InstrItinData<IIC_iCMPi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>],
[1], [A9_LdBypass]>,
InstrItinData<IIC_iCMPr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>],
[1, 1], [A9_LdBypass, A9_LdBypass]>,
InstrItinData<IIC_iCMPsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<2, [A9_ALU0, A9_ALU1]>],
[1, 1], [A9_LdBypass, NoBypass]>,
InstrItinData<IIC_iCMPsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<3, [A9_ALU0, A9_ALU1]>],
[1, 1, 1], [A9_LdBypass, NoBypass, NoBypass]>,
//
// Test instructions
InstrItinData<IIC_iTSTi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>,
InstrItinData<IIC_iTSTr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
InstrItinData<IIC_iTSTsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<2, [A9_ALU0, A9_ALU1]>], [1, 1]>,
InstrItinData<IIC_iTSTsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<3, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>,
//
// Move instructions, conditional
// FIXME: Correctly model the extra input dep on the destination.
InstrItinData<IIC_iCMOVi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>,
InstrItinData<IIC_iCMOVr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
InstrItinData<IIC_iCMOVsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
InstrItinData<IIC_iCMOVsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
InstrItinData<IIC_iCMOVix2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>,
InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>,
// Integer multiply pipeline
//
InstrItinData<IIC_iMUL16 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<2, [A9_ALU0]>], [3, 1, 1]>,
InstrItinData<IIC_iMAC16 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<2, [A9_ALU0]>],
[3, 1, 1, 1]>,
InstrItinData<IIC_iMUL32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<2, [A9_ALU0]>], [4, 1, 1]>,
InstrItinData<IIC_iMAC32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<2, [A9_ALU0]>],
[4, 1, 1, 1]>,
InstrItinData<IIC_iMUL64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<3, [A9_ALU0]>], [4, 5, 1, 1]>,
InstrItinData<IIC_iMAC64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<3, [A9_ALU0]>],
[4, 5, 1, 1]>,
// Integer load pipeline
// FIXME: The timings are some rough approximations
//
// Immediate offset
InstrItinData<IIC_iLoad_i , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>],
[3, 1], [A9_LdBypass]>,
InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>],
[4, 1], [A9_LdBypass]>,
// FIXME: If address is 64-bit aligned, AGU cycles is 1.
InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>],
[3, 3, 1], [A9_LdBypass]>,
//
// Register offset
InstrItinData<IIC_iLoad_r , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>],
[3, 1, 1], [A9_LdBypass]>,
InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>],
[4, 1, 1], [A9_LdBypass]>,
InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>],
[3, 3, 1, 1], [A9_LdBypass]>,
//
// Scaled register offset
InstrItinData<IIC_iLoad_si , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit], 0>],
[4, 1, 1], [A9_LdBypass]>,
InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>],
[5, 1, 1], [A9_LdBypass]>,
//
// Immediate offset with update
InstrItinData<IIC_iLoad_iu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>],
[3, 2, 1], [A9_LdBypass]>,
InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>],
[4, 3, 1], [A9_LdBypass]>,
//
// Register offset with update
InstrItinData<IIC_iLoad_ru , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>],
[3, 2, 1, 1], [A9_LdBypass]>,
InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>],
[4, 3, 1, 1], [A9_LdBypass]>,
InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>],
[3, 3, 1, 1], [A9_LdBypass]>,
//
// Scaled register offset with update
InstrItinData<IIC_iLoad_siu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>],
[4, 3, 1, 1], [A9_LdBypass]>,
InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>],
[5, 4, 1, 1], [A9_LdBypass]>,
//
// Load multiple, def is the 5th operand.
// FIXME: This assumes 3 to 4 registers.
InstrItinData<IIC_iLoad_m , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 1>,
InstrStage<2, [A9_LSUnit]>],
[1, 1, 1, 1, 3],
[NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass],
-1>, // dynamic uops
//
// Load multiple + update, defs are the 1st and 5th operands.
InstrItinData<IIC_iLoad_mu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 1>,
InstrStage<2, [A9_LSUnit]>],
[2, 1, 1, 1, 3],
[NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass],
-1>, // dynamic uops
//
// Load multiple plus branch
InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_AGU], 1>,
InstrStage<2, [A9_LSUnit]>,
InstrStage<1, [A9_Branch]>],
[1, 2, 1, 1, 3],
[NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass],
-1>, // dynamic uops
//
// Pop, def is the 3rd operand.
InstrItinData<IIC_iPop , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 1>,
InstrStage<2, [A9_LSUnit]>],
[1, 1, 3],
[NoBypass, NoBypass, A9_LdBypass],
-1>, // dynamic uops
//
// Pop + branch, def is the 3rd operand.
InstrItinData<IIC_iPop_Br, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 1>,
InstrStage<2, [A9_LSUnit]>,
InstrStage<1, [A9_Branch]>],
[1, 1, 3],
[NoBypass, NoBypass, A9_LdBypass],
-1>, // dynamic uops
//
// iLoadi + iALUr for t2LDRpci_pic.
InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>,
InstrStage<1, [A9_ALU0, A9_ALU1]>],
[2, 1]>,
// Integer store pipeline
///
// Immediate offset
InstrItinData<IIC_iStore_i , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>], [1, 1]>,
InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 1>,
InstrStage<1, [A9_LSUnit]>], [1, 1]>,
// FIXME: If address is 64-bit aligned, AGU cycles is 1.
InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 1>,
InstrStage<1, [A9_LSUnit]>], [1, 1]>,
//
// Register offset
InstrItinData<IIC_iStore_r , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 1>,
InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
InstrItinData<IIC_iStore_d_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 1>,
InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
//
// Scaled register offset
InstrItinData<IIC_iStore_si , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 1>,
InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
//
// Immediate offset with update
InstrItinData<IIC_iStore_iu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>], [2, 1, 1]>,
InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 1>,
InstrStage<1, [A9_LSUnit]>], [3, 1, 1]>,
//
// Register offset with update
InstrItinData<IIC_iStore_ru , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>],
[2, 1, 1, 1]>,
InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 1>,
InstrStage<1, [A9_LSUnit]>],
[3, 1, 1, 1]>,
InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 1>,
InstrStage<1, [A9_LSUnit]>],
[3, 1, 1, 1]>,
//
// Scaled register offset with update
InstrItinData<IIC_iStore_siu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_AGU], 0>,
InstrStage<1, [A9_LSUnit]>],
[2, 1, 1, 1]>,
InstrItinData<IIC_iStore_bh_siu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_AGU], 1>,
InstrStage<1, [A9_LSUnit]>],
[3, 1, 1, 1]>,
//
// Store multiple
InstrItinData<IIC_iStore_m , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_AGU], 0>,
InstrStage<2, [A9_LSUnit]>],
[], [], -1>, // dynamic uops
//
// Store multiple + update
InstrItinData<IIC_iStore_mu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_AGU], 0>,
InstrStage<2, [A9_LSUnit]>],
[2], [], -1>, // dynamic uops
//
// Preload
InstrItinData<IIC_Preload, [InstrStage<1, [A9_Issue0, A9_Issue1]>], [1, 1]>,
// Branch
//
// no delay slots, so the latency of a branch is unimportant
InstrItinData<IIC_Br , [InstrStage<1, [A9_Issue0], 0>,
InstrStage<1, [A9_Issue1], 0>,
InstrStage<1, [A9_Branch]>]>,
// VFP and NEON shares the same register file. This means that every VFP
// instruction should wait for full completion of the consecutive NEON
// instruction and vice-versa. We model this behavior with two artificial FUs:
// DRegsVFP and DRegsVFP.
//
// Every VFP instruction:
// - Acquires DRegsVFP resource for 1 cycle
// - Reserves DRegsN resource for the whole duration (including time to
// register file writeback!).
// Every NEON instruction does the same but with FUs swapped.
//
// Since the reserved FU cannot be acquired, this models precisely
// "cross-domain" stalls.
// VFP
// Issue through integer pipeline, and execute in NEON unit.
// FP Special Register to Integer Register File Move
InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<2, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[1]>,
//
// Single-precision FP Unary
InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
// Extra latency cycles since wbck is 2 cycles
InstrStage<3, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[1, 1]>,
//
// Double-precision FP Unary
InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
// Extra latency cycles since wbck is 2 cycles
InstrStage<3, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[1, 1]>,
//
// Single-precision FP Compare
InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
// Extra latency cycles since wbck is 4 cycles
InstrStage<5, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[1, 1]>,
//
// Double-precision FP Compare
InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
// Extra latency cycles since wbck is 4 cycles
InstrStage<5, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[1, 1]>,
//
// Single to Double FP Convert
InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<5, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[4, 1]>,
//
// Double to Single FP Convert
InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<5, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[4, 1]>,
//
// Single to Half FP Convert
InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<5, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[4, 1]>,
//
// Half to Single FP Convert
InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<3, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[2, 1]>,
//
// Single-Precision FP to Integer Convert
InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<5, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[4, 1]>,
//
// Double-Precision FP to Integer Convert
InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<5, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[4, 1]>,
//
// Integer to Single-Precision FP Convert
InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<5, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[4, 1]>,
//
// Integer to Double-Precision FP Convert
InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<5, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[4, 1]>,
//
// Single-precision FP ALU
InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<5, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[4, 1, 1]>,
//
// Double-precision FP ALU
InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<5, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[4, 1, 1]>,
//
// Single-precision FP Multiply
InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<6, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[5, 1, 1]>,
//
// Double-precision FP Multiply
InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<7, [A9_DRegsN], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[6, 1, 1]>,
//
// Single-precision FP MAC
InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<9, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[8, 1, 1, 1]>,
//
// Double-precision FP MAC
InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<10, [A9_DRegsN], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[9, 1, 1, 1]>,
//
// Single-precision Fused FP MAC
InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<9, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[8, 1, 1, 1]>,
//
// Double-precision Fused FP MAC
InstrItinData<IIC_fpFMAC64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<10, [A9_DRegsN], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[9, 1, 1, 1]>,
//
// Single-precision FP DIV
InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<16, [A9_DRegsN], 0, Reserved>,
InstrStage<10, [A9_NPipe]>],
[15, 1, 1]>,
//
// Double-precision FP DIV
InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<26, [A9_DRegsN], 0, Reserved>,
InstrStage<20, [A9_NPipe]>],
[25, 1, 1]>,
//
// Single-precision FP SQRT
InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<18, [A9_DRegsN], 0, Reserved>,
InstrStage<13, [A9_NPipe]>],
[17, 1]>,
//
// Double-precision FP SQRT
InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<33, [A9_DRegsN], 0, Reserved>,
InstrStage<28, [A9_NPipe]>],
[32, 1]>,
//
// Integer to Single-precision Move
InstrItinData<IIC_fpMOVIS, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
// Extra 1 latency cycle since wbck is 2 cycles
InstrStage<3, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[1, 1]>,
//
// Integer to Double-precision Move
InstrItinData<IIC_fpMOVID, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
// Extra 1 latency cycle since wbck is 2 cycles
InstrStage<3, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[1, 1, 1]>,
//
// Single-precision to Integer Move
//
// On A9 move-from-VFP is free to issue with no stall if other VFP
// operations are in flight. I assume it still can't dual-issue though.
InstrItinData<IIC_fpMOVSI, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>],
[2, 1]>,
//
// Double-precision to Integer Move
//
// On A9 move-from-VFP is free to issue with no stall if other VFP
// operations are in flight. I assume it still can't dual-issue though.
InstrItinData<IIC_fpMOVDI, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>],
[2, 1, 1]>,
//
// Single-precision FP Load
InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<2, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[1, 1]>,
//
// Double-precision FP Load
// FIXME: Result latency is 1 if address is 64-bit aligned.
InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<2, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[2, 1]>,
//
// FP Load Multiple
// FIXME: assumes 2 doubles which requires 2 LS cycles.
InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<2, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[1, 1, 1, 1], [], -1>, // dynamic uops
//
// FP Load Multiple + update
// FIXME: assumes 2 doubles which requires 2 LS cycles.
InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<2, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[2, 1, 1, 1], [], -1>, // dynamic uops
//
// Single-precision FP Store
InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<2, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[1, 1]>,
//
// Double-precision FP Store
InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<2, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[1, 1]>,
//
// FP Store Multiple
// FIXME: assumes 2 doubles which requires 2 LS cycles.
InstrItinData<IIC_fpStore_m,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<2, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[1, 1, 1, 1], [], -1>, // dynamic uops
//
// FP Store Multiple + update
// FIXME: assumes 2 doubles which requires 2 LS cycles.
InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsVFP], 0, Required>,
InstrStage<2, [A9_DRegsN], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[2, 1, 1, 1], [], -1>, // dynamic uops
// NEON
// VLD1
InstrItinData<IIC_VLD1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[1, 1]>,
// VLD1x2
InstrItinData<IIC_VLD1x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[1, 1, 1]>,
// VLD1x3
InstrItinData<IIC_VLD1x3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[1, 1, 2, 1]>,
// VLD1x4
InstrItinData<IIC_VLD1x4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[1, 1, 2, 2, 1]>,
// VLD1u
InstrItinData<IIC_VLD1u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[1, 2, 1]>,
// VLD1x2u
InstrItinData<IIC_VLD1x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[1, 1, 2, 1]>,
// VLD1x3u
InstrItinData<IIC_VLD1x3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[1, 1, 2, 2, 1]>,
// VLD1x4u
InstrItinData<IIC_VLD1x4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[1, 1, 2, 2, 2, 1]>,
//
// VLD1ln
InstrItinData<IIC_VLD1ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[3, 1, 1, 1]>,
//
// VLD1lnu
InstrItinData<IIC_VLD1lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[3, 2, 1, 1, 1, 1]>,
//
// VLD1dup
InstrItinData<IIC_VLD1dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[2, 1]>,
//
// VLD1dupu
InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[2, 2, 1, 1]>,
//
// VLD2
InstrItinData<IIC_VLD2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 7 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[2, 2, 1]>,
//
// VLD2x2
InstrItinData<IIC_VLD2x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[2, 3, 2, 3, 1]>,
//
// VLD2ln
InstrItinData<IIC_VLD2ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[3, 3, 1, 1, 1, 1]>,
//
// VLD2u
InstrItinData<IIC_VLD2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 7 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[2, 2, 2, 1, 1, 1]>,
//
// VLD2x2u
InstrItinData<IIC_VLD2x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[2, 3, 2, 3, 2, 1]>,
//
// VLD2lnu
InstrItinData<IIC_VLD2lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[3, 3, 2, 1, 1, 1, 1, 1]>,
//
// VLD2dup
InstrItinData<IIC_VLD2dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[2, 2, 1]>,
//
// VLD2dupu
InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[2, 2, 2, 1, 1]>,
//
// VLD3
InstrItinData<IIC_VLD3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
InstrStage<3, [A9_NPipe], 0>,
InstrStage<3, [A9_LSUnit]>],
[3, 3, 4, 1]>,
//
// VLD3ln
InstrItinData<IIC_VLD3ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<11,[A9_DRegsVFP], 0, Reserved>,
InstrStage<5, [A9_NPipe], 0>,
InstrStage<5, [A9_LSUnit]>],
[5, 5, 6, 1, 1, 1, 1, 2]>,
//
// VLD3u
InstrItinData<IIC_VLD3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
InstrStage<3, [A9_NPipe], 0>,
InstrStage<3, [A9_LSUnit]>],
[3, 3, 4, 2, 1]>,
//
// VLD3lnu
InstrItinData<IIC_VLD3lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<11,[A9_DRegsVFP], 0, Reserved>,
InstrStage<5, [A9_NPipe], 0>,
InstrStage<5, [A9_LSUnit]>],
[5, 5, 6, 2, 1, 1, 1, 1, 1, 2]>,
//
// VLD3dup
InstrItinData<IIC_VLD3dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
InstrStage<3, [A9_NPipe], 0>,
InstrStage<3, [A9_LSUnit]>],
[3, 3, 4, 1]>,
//
// VLD3dupu
InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
InstrStage<3, [A9_NPipe], 0>,
InstrStage<3, [A9_LSUnit]>],
[3, 3, 4, 2, 1, 1]>,
//
// VLD4
InstrItinData<IIC_VLD4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
InstrStage<3, [A9_NPipe], 0>,
InstrStage<3, [A9_LSUnit]>],
[3, 3, 4, 4, 1]>,
//
// VLD4ln
InstrItinData<IIC_VLD4ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
InstrStage<4, [A9_NPipe], 0>,
InstrStage<4, [A9_LSUnit]>],
[4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>,
//
// VLD4u
InstrItinData<IIC_VLD4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
InstrStage<3, [A9_NPipe], 0>,
InstrStage<3, [A9_LSUnit]>],
[3, 3, 4, 4, 2, 1]>,
//
// VLD4lnu
InstrItinData<IIC_VLD4lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
InstrStage<4, [A9_NPipe], 0>,
InstrStage<4, [A9_LSUnit]>],
[4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>,
//
// VLD4dup
InstrItinData<IIC_VLD4dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[2, 2, 3, 3, 1]>,
//
// VLD4dupu
InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[2, 2, 3, 3, 2, 1, 1]>,
//
// VST1
InstrItinData<IIC_VST1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[1, 1, 1]>,
//
// VST1x2
InstrItinData<IIC_VST1x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[1, 1, 1, 1]>,
//
// VST1x3
InstrItinData<IIC_VST1x3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[1, 1, 1, 1, 2]>,
//
// VST1x4
InstrItinData<IIC_VST1x4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[1, 1, 1, 1, 2, 2]>,
//
// VST1u
InstrItinData<IIC_VST1u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[2, 1, 1, 1, 1]>,
//
// VST1x2u
InstrItinData<IIC_VST1x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[2, 1, 1, 1, 1, 1]>,
//
// VST1x3u
InstrItinData<IIC_VST1x3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[2, 1, 1, 1, 1, 1, 2]>,
//
// VST1x4u
InstrItinData<IIC_VST1x4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[2, 1, 1, 1, 1, 1, 2, 2]>,
//
// VST1ln
InstrItinData<IIC_VST1ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[1, 1, 1]>,
//
// VST1lnu
InstrItinData<IIC_VST1lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[2, 1, 1, 1, 1]>,
//
// VST2
InstrItinData<IIC_VST2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[1, 1, 1, 1]>,
//
// VST2x2
InstrItinData<IIC_VST2x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
InstrStage<3, [A9_NPipe], 0>,
InstrStage<3, [A9_LSUnit]>],
[1, 1, 1, 1, 2, 2]>,
//
// VST2u
InstrItinData<IIC_VST2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[2, 1, 1, 1, 1, 1]>,
//
// VST2x2u
InstrItinData<IIC_VST2x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
InstrStage<3, [A9_NPipe], 0>,
InstrStage<3, [A9_LSUnit]>],
[2, 1, 1, 1, 1, 1, 2, 2]>,
//
// VST2ln
InstrItinData<IIC_VST2ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[1, 1, 1, 1]>,
//
// VST2lnu
InstrItinData<IIC_VST2lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe], 0>,
InstrStage<1, [A9_LSUnit]>],
[2, 1, 1, 1, 1, 1]>,
//
// VST3
InstrItinData<IIC_VST3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[1, 1, 1, 1, 2]>,
//
// VST3u
InstrItinData<IIC_VST3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[2, 1, 1, 1, 1, 1, 2]>,
//
// VST3ln
InstrItinData<IIC_VST3ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
InstrStage<3, [A9_NPipe], 0>,
InstrStage<3, [A9_LSUnit]>],
[1, 1, 1, 1, 2]>,
//
// VST3lnu
InstrItinData<IIC_VST3lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
InstrStage<3, [A9_NPipe], 0>,
InstrStage<3, [A9_LSUnit]>],
[2, 1, 1, 1, 1, 1, 2]>,
//
// VST4
InstrItinData<IIC_VST4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[1, 1, 1, 1, 2, 2]>,
//
// VST4u
InstrItinData<IIC_VST4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[2, 1, 1, 1, 1, 1, 2, 2]>,
//
// VST4ln
InstrItinData<IIC_VST4ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[1, 1, 1, 1, 2, 2]>,
//
// VST4lnu
InstrItinData<IIC_VST4lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe], 0>,
InstrStage<2, [A9_LSUnit]>],
[2, 1, 1, 1, 1, 1, 2, 2]>,
//
// Double-register Integer Unary
InstrItinData<IIC_VUNAiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[4, 2]>,
//
// Quad-register Integer Unary
InstrItinData<IIC_VUNAiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[4, 2]>,
//
// Double-register Integer Q-Unary
InstrItinData<IIC_VQUNAiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[4, 1]>,
//
// Quad-register Integer CountQ-Unary
InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[4, 1]>,
//
// Double-register Integer Binary
InstrItinData<IIC_VBINiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[3, 2, 2]>,
//
// Quad-register Integer Binary
InstrItinData<IIC_VBINiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[3, 2, 2]>,
//
// Double-register Integer Subtract
InstrItinData<IIC_VSUBiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[3, 2, 1]>,
//
// Quad-register Integer Subtract
InstrItinData<IIC_VSUBiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[3, 2, 1]>,
//
// Double-register Integer Shift
InstrItinData<IIC_VSHLiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[3, 1, 1]>,
//
// Quad-register Integer Shift
InstrItinData<IIC_VSHLiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[3, 1, 1]>,
//
// Double-register Integer Shift (4 cycle)
InstrItinData<IIC_VSHLi4D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[4, 1, 1]>,
//
// Quad-register Integer Shift (4 cycle)
InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[4, 1, 1]>,
//
// Double-register Integer Binary (4 cycle)
InstrItinData<IIC_VBINi4D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[4, 2, 2]>,
//
// Quad-register Integer Binary (4 cycle)
InstrItinData<IIC_VBINi4Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[4, 2, 2]>,
//
// Double-register Integer Subtract (4 cycle)
InstrItinData<IIC_VSUBi4D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[4, 2, 1]>,
//
// Quad-register Integer Subtract (4 cycle)
InstrItinData<IIC_VSUBi4Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[4, 2, 1]>,
//
// Double-register Integer Count
InstrItinData<IIC_VCNTiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[3, 2, 2]>,
//
// Quad-register Integer Count
// Result written in N3, but that is relative to the last cycle of multicycle,
// so we use 4 for those cases
InstrItinData<IIC_VCNTiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 7 cycles
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[4, 2, 2]>,
//
// Double-register Absolute Difference and Accumulate
InstrItinData<IIC_VABAD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[6, 3, 2, 1]>,
//
// Quad-register Absolute Difference and Accumulate
InstrItinData<IIC_VABAQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[6, 3, 2, 1]>,
//
// Double-register Integer Pair Add Long
InstrItinData<IIC_VPALiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[6, 3, 1]>,
//
// Quad-register Integer Pair Add Long
InstrItinData<IIC_VPALiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[6, 3, 1]>,
//
// Double-register Integer Multiply (.8, .16)
InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[6, 2, 2]>,
//
// Quad-register Integer Multiply (.8, .16)
InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 7 cycles
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[7, 2, 2]>,
//
// Double-register Integer Multiply (.32)
InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 7 cycles
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[7, 2, 1]>,
//
// Quad-register Integer Multiply (.32)
InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 9 cycles
InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
InstrStage<4, [A9_NPipe]>],
[9, 2, 1]>,
//
// Double-register Integer Multiply-Accumulate (.8, .16)
InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[6, 3, 2, 2]>,
//
// Double-register Integer Multiply-Accumulate (.32)
InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 7 cycles
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[7, 3, 2, 1]>,
//
// Quad-register Integer Multiply-Accumulate (.8, .16)
InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 7 cycles
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[7, 3, 2, 2]>,
//
// Quad-register Integer Multiply-Accumulate (.32)
InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 9 cycles
InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
InstrStage<4, [A9_NPipe]>],
[9, 3, 2, 1]>,
//
// Move
InstrItinData<IIC_VMOV, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[1,1]>,
//
// Move Immediate
InstrItinData<IIC_VMOVImm, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[3]>,
//
// Double-register Permute Move
InstrItinData<IIC_VMOVD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[2, 1]>,
//
// Quad-register Permute Move
InstrItinData<IIC_VMOVQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[2, 1]>,
//
// Integer to Single-precision Move
InstrItinData<IIC_VMOVIS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[1, 1]>,
//
// Integer to Double-precision Move
InstrItinData<IIC_VMOVID , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[1, 1, 1]>,
//
// Single-precision to Integer Move
InstrItinData<IIC_VMOVSI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[2, 1]>,
//
// Double-precision to Integer Move
InstrItinData<IIC_VMOVDI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[2, 2, 1]>,
//
// Integer to Lane Move
InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
InstrStage<4, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[3, 1, 1]>,
//
// Vector narrow move
InstrItinData<IIC_VMOVN, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[3, 1]>,
//
// Double-register FP Unary
InstrItinData<IIC_VUNAD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[5, 2]>,
//
// Quad-register FP Unary
// Result written in N5, but that is relative to the last cycle of multicycle,
// so we use 6 for those cases
InstrItinData<IIC_VUNAQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 7 cycles
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[6, 2]>,
//
// Double-register FP Binary
// FIXME: We're using this itin for many instructions and [2, 2] here is too
// optimistic.
InstrItinData<IIC_VBIND, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[5, 2, 2]>,
//
// VPADD, etc.
InstrItinData<IIC_VPBIND, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[5, 1, 1]>,
//
// Double-register FP VMUL
InstrItinData<IIC_VFMULD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[5, 2, 1]>,
//
// Quad-register FP Binary
// Result written in N5, but that is relative to the last cycle of multicycle,
// so we use 6 for those cases
// FIXME: We're using this itin for many instructions and [2, 2] here is too
// optimistic.
InstrItinData<IIC_VBINQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 7 cycles
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[6, 2, 2]>,
//
// Quad-register FP VMUL
InstrItinData<IIC_VFMULQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 7 cycles
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[6, 2, 1]>,
//
// Double-register FP Multiple-Accumulate
InstrItinData<IIC_VMACD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 7 cycles
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[6, 3, 2, 1]>,
//
// Quad-register FP Multiple-Accumulate
// Result written in N9, but that is relative to the last cycle of multicycle,
// so we use 10 for those cases
InstrItinData<IIC_VMACQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 9 cycles
InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
InstrStage<4, [A9_NPipe]>],
[8, 4, 2, 1]>,
//
// Double-register Fused FP Multiple-Accumulate
InstrItinData<IIC_VFMACD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 7 cycles
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[6, 3, 2, 1]>,
//
// Quad-register Fused FP Multiple-Accumulate
// Result written in N9, but that is relative to the last cycle of multicycle,
// so we use 10 for those cases
InstrItinData<IIC_VFMACQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 9 cycles
InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
InstrStage<4, [A9_NPipe]>],
[8, 4, 2, 1]>,
//
// Double-register Reciprical Step
InstrItinData<IIC_VRECSD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 10 cycles
InstrStage<11, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[9, 2, 2]>,
//
// Quad-register Reciprical Step
InstrItinData<IIC_VRECSQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 11 cycles
InstrStage<12, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[10, 2, 2]>,
//
// Double-register Permute
InstrItinData<IIC_VPERMD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[2, 2, 1, 1]>,
//
// Quad-register Permute
// Result written in N2, but that is relative to the last cycle of multicycle,
// so we use 3 for those cases
InstrItinData<IIC_VPERMQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 7 cycles
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[3, 3, 1, 1]>,
//
// Quad-register Permute (3 cycle issue)
// Result written in N2, but that is relative to the last cycle of multicycle,
// so we use 4 for those cases
InstrItinData<IIC_VPERMQ3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 8 cycles
InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
InstrStage<3, [A9_NPipe]>],
[4, 4, 1, 1]>,
//
// Double-register VEXT
InstrItinData<IIC_VEXTD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 6 cycles
InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
InstrStage<1, [A9_NPipe]>],
[2, 1, 1]>,
//
// Quad-register VEXT
InstrItinData<IIC_VEXTQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 7 cycles
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[3, 1, 2]>,
//
// VTB
InstrItinData<IIC_VTB1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 7 cycles
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[3, 2, 1]>,
InstrItinData<IIC_VTB2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 7 cycles
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[3, 2, 2, 1]>,
InstrItinData<IIC_VTB3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<2, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 8 cycles
InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
InstrStage<3, [A9_NPipe]>],
[4, 2, 2, 3, 1]>,
InstrItinData<IIC_VTB4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 8 cycles
InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
InstrStage<3, [A9_NPipe]>],
[4, 2, 2, 3, 3, 1]>,
//
// VTBX
InstrItinData<IIC_VTBX1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 7 cycles
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[3, 1, 2, 1]>,
InstrItinData<IIC_VTBX2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 7 cycles
InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[3, 1, 2, 2, 1]>,
InstrItinData<IIC_VTBX3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 8 cycles
InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
InstrStage<3, [A9_NPipe]>],
[4, 1, 2, 2, 3, 1]>,
InstrItinData<IIC_VTBX4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
InstrStage<1, [A9_DRegsN], 0, Required>,
// Extra latency cycles since wbck is 8 cycles
InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
InstrStage<2, [A9_NPipe]>],
[4, 1, 2, 2, 3, 3, 1]>
]>;
// ===---------------------------------------------------------------------===//
// The following definitions describe the simpler per-operand machine model.
// This works with MachineScheduler and will eventually replace itineraries.
class A9WriteLMOpsListType<list<WriteSequence> writes> {
list <WriteSequence> Writes = writes;
SchedMachineModel SchedModel = ?;
}
// Cortex-A9 machine model for scheduling and other instruction cost heuristics.
def CortexA9Model : SchedMachineModel {
let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
let MicroOpBufferSize = 56; // Based on available renamed registers.
let LoadLatency = 2; // Optimistic load latency assuming bypass.
// This is overriden by OperandCycles if the
// Itineraries are queried instead.
let MispredictPenalty = 8; // Based on estimate of pipeline depth.
let Itineraries = CortexA9Itineraries;
// FIXME: Many vector operations were never given an itinerary. We
// haven't mapped these to the new model either.
let CompleteModel = 0;
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
}
//===----------------------------------------------------------------------===//
// Define each kind of processor resource and number available.
//
// The AGU unit has BufferSize=1 so that the latency between operations
// that use it are considered to stall other operations.
//
// The FP unit has BufferSize=0 so that it is a hard dispatch
// hazard. No instruction may be dispatched while the unit is reserved.
let SchedModel = CortexA9Model in {
def A9UnitALU : ProcResource<2>;
def A9UnitMul : ProcResource<1> { let Super = A9UnitALU; }
def A9UnitAGU : ProcResource<1> { let BufferSize = 1; }
def A9UnitLS : ProcResource<1>;
def A9UnitFP : ProcResource<1> { let BufferSize = 0; }
def A9UnitB : ProcResource<1>;
//===----------------------------------------------------------------------===//
// Define scheduler read/write types with their resources and latency on A9.
// Consume an issue slot, but no processor resources. This is useful when all
// other writes associated with the operand have NumMicroOps = 0.
def A9WriteIssue : SchedWriteRes<[]> { let Latency = 0; }
// Write an integer register.
def A9WriteI : SchedWriteRes<[A9UnitALU]>;
// Write an integer shifted-by register
def A9WriteIsr : SchedWriteRes<[A9UnitALU]> { let Latency = 2; }
// Basic ALU.
def A9WriteALU : SchedWriteRes<[A9UnitALU]>;
// ALU with operand shifted by immediate.
def : WriteRes<WriteALUsi, [A9UnitALU]> { let Latency = 2; }
// ALU with operand shifted by register.
def A9WriteALUsr : SchedWriteRes<[A9UnitALU]> { let Latency = 3; }
// Multiplication
def A9WriteM : SchedWriteRes<[A9UnitMul, A9UnitMul]> { let Latency = 4; }
def A9WriteMHi : SchedWriteRes<[A9UnitMul]> { let Latency = 5;
let NumMicroOps = 0; }
def A9WriteM16 : SchedWriteRes<[A9UnitMul]> { let Latency = 3; }
def A9WriteM16Hi : SchedWriteRes<[A9UnitMul]> { let Latency = 4;
let NumMicroOps = 0; }
def : SchedAlias<WriteMUL16, A9WriteM16>;
def : SchedAlias<WriteMUL32, A9WriteM>;
def : SchedAlias<WriteMUL64Lo, A9WriteM>;
def : SchedAlias<WriteMUL64Hi, A9WriteMHi>;
def : SchedAlias<WriteMAC16, A9WriteM16>;
def : SchedAlias<WriteMAC32, A9WriteM>;
def : SchedAlias<WriteMAC64Lo, A9WriteM>;
def : SchedAlias<WriteMAC64Hi, A9WriteMHi>;
def : ReadAdvance<ReadMUL, 0>;
def : ReadAdvance<ReadMAC, 0>;
// Floating-point
// Only one FP or AGU instruction may issue per cycle. We model this
// by having FP instructions consume the AGU resource.
def A9WriteF : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; }
def A9WriteFMov : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; }
def A9WriteFMulS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; }
def A9WriteFMulD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; }
def A9WriteFMAS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 8; }
def A9WriteFMAD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
def A9WriteFDivS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 15; }
def A9WriteFDivD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 25; }
def A9WriteFSqrtS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 17; }
def A9WriteFSqrtD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 32; }
// NEON has an odd mix of latencies. Simply name the write types by latency.
def A9WriteV1 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; }
def A9WriteV2 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 2; }
def A9WriteV3 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 3; }
def A9WriteV4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; }
def A9WriteV5 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; }
def A9WriteV6 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; }
def A9WriteV7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 7; }
def A9WriteV9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
def A9WriteV10 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 10; }
def : WriteRes<WriteVLD1, []>;
def : WriteRes<WriteVLD2, []>;
def : WriteRes<WriteVLD3, []>;
def : WriteRes<WriteVLD4, []>;
def : WriteRes<WriteVST1, []>;
def : WriteRes<WriteVST2, []>;
def : WriteRes<WriteVST3, []>;
def : WriteRes<WriteVST4, []>;
// Reserve A9UnitFP for 2 consecutive cycles.
def A9Write2V4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
let Latency = 4;
let ResourceCycles = [2, 1];
}
def A9Write2V7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
let Latency = 7;
let ResourceCycles = [2, 1];
}
def A9Write2V9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
let Latency = 9;
let ResourceCycles = [2, 1];
}
// Branches don't have a def operand but still consume resources.
def A9WriteB : SchedWriteRes<[A9UnitB]>;
// Address generation.
def A9WriteAdr : SchedWriteRes<[A9UnitAGU]> { let NumMicroOps = 0; }
// Load Integer.
def A9WriteL : SchedWriteRes<[A9UnitLS]> { let Latency = 3; }
def : SchedAlias<WriteLd, A9WriteL>;
// Load the upper 32-bits using the same micro-op.
def A9WriteLHi : SchedWriteRes<[]> { let Latency = 3;
let NumMicroOps = 0; }
// Offset shifted by register.
def A9WriteLsi : SchedWriteRes<[A9UnitLS]> { let Latency = 4; }
// Load (and zero extend) a byte.
def A9WriteLb : SchedWriteRes<[A9UnitLS]> { let Latency = 4; }
def A9WriteLbsi : SchedWriteRes<[A9UnitLS]> { let Latency = 5; }
// Load or Store Float, aligned.
def A9WriteLSfp : SchedWriteRes<[A9UnitLS, A9UnitFP]> { let Latency = 1; }
// Store Integer.
def A9WriteS : SchedWriteRes<[A9UnitLS]>;
//===----------------------------------------------------------------------===//
// Define resources dynamically for load multiple variants.
// Define helpers for extra latency without consuming resources.
def A9WriteCycle1 : SchedWriteRes<[]> { let Latency = 1; let NumMicroOps = 0; }
foreach NumCycles = 2-8 in {
def A9WriteCycle#NumCycles : WriteSequence<[A9WriteCycle1], NumCycles>;
} // foreach NumCycles
// Define address generation sequences and predicates for 8 flavors of LDMs.
foreach NumAddr = 1-8 in {
// Define A9WriteAdr1-8 as a sequence of A9WriteAdr with additive
// latency for instructions that generate multiple loads or stores.
def A9WriteAdr#NumAddr : WriteSequence<[A9WriteAdr], NumAddr>;
// Define a predicate to select the LDM based on number of memory addresses.
def A9LMAdr#NumAddr#Pred :
SchedPredicate<"(TII->getNumLDMAddresses(*MI)+1)/2 == "#NumAddr>;
} // foreach NumAddr
// Fall-back for unknown LDMs.
def A9LMUnknownPred : SchedPredicate<"TII->getNumLDMAddresses(*MI) == 0">;
// LDM/VLDM/VLDn address generation latency & resources.
// Dynamically select the A9WriteAdrN sequence using a predicate.
def A9WriteLMAdr : SchedWriteVariant<[
SchedVar<A9LMAdr1Pred, [A9WriteAdr1]>,
SchedVar<A9LMAdr2Pred, [A9WriteAdr2]>,
SchedVar<A9LMAdr3Pred, [A9WriteAdr3]>,
SchedVar<A9LMAdr4Pred, [A9WriteAdr4]>,
SchedVar<A9LMAdr5Pred, [A9WriteAdr5]>,
SchedVar<A9LMAdr6Pred, [A9WriteAdr6]>,
SchedVar<A9LMAdr7Pred, [A9WriteAdr7]>,
SchedVar<A9LMAdr8Pred, [A9WriteAdr8]>,
// For unknown LDM/VLDM/VSTM, assume 2 32-bit registers.
SchedVar<A9LMUnknownPred, [A9WriteAdr2]>]>;
// Define LDM Resources.
// These take no issue resource, so they can be combined with other
// writes like WriteB.
// A9WriteLMLo takes a single LS resource and 2 cycles.
def A9WriteLMLo : SchedWriteRes<[A9UnitLS]> { let Latency = 2;
let NumMicroOps = 0; }
// Assuming aligned access, the upper half of each pair is free with
// the same latency.
def A9WriteLMHi : SchedWriteRes<[]> { let Latency = 2;
let NumMicroOps = 0; }
// Each A9WriteL#N variant adds N cycles of latency without consuming
// additional resources.
foreach NumAddr = 1-8 in {
def A9WriteL#NumAddr : WriteSequence<
[A9WriteLMLo, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
def A9WriteL#NumAddr#Hi : WriteSequence<
[A9WriteLMHi, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
}
//===----------------------------------------------------------------------===//
// LDM: Load multiple into 32-bit integer registers.
def A9WriteLMOpsList : A9WriteLMOpsListType<
[A9WriteL1, A9WriteL1Hi,
A9WriteL2, A9WriteL2Hi,
A9WriteL3, A9WriteL3Hi,