| //===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file contains the AArch64 implementation of TargetFrameLowering class. |
| // |
| // On AArch64, stack frames are structured as follows: |
| // |
| // The stack grows downward. |
| // |
| // All of the individual frame areas in the frame below are optional, i.e. it's |
| // possible to create a function such that a particular area isn't present |
| // in the frame. |
| // |
| // At function entry, the "frame" looks as follows: |
| // |
| // | | Higher address |
| // |-----------------------------------| |
| // | | |
| // | arguments passed on the stack | |
| // | | |
| // |-----------------------------------| <- sp |
| // | | Lower address |
| // |
| // |
| // After the prologue has run, the frame has the following general structure. |
| // Note that this doesn't depict the case where a red-zone is used. Also, |
| // technically the last frame area (VLAs) doesn't get created until the main |
| // function body runs, after the prologue. However, it's depicted here |
| // for completeness. |
| // |
| // | | Higher address |
| // |-----------------------------------| |
| // | | |
| // | arguments passed on the stack | |
| // | | |
| // |-----------------------------------| |
| // | | |
| // | (Win64 only) varargs from reg | |
| // | | |
| // |-----------------------------------| |
| // | | |
| // | (Win64 only) callee-saved SVE reg | |
| // | | |
| // |-----------------------------------| |
| // | | |
| // | callee-saved gpr registers | <--. |
| // | | | On Darwin platforms these |
| // |- - - - - - - - - - - - - - - - - -| | callee saves are swapped, |
| // | prev_lr | | (frame record first) |
| // | prev_fp | <--' |
| // | async context if needed | |
| // | (a.k.a. "frame record") | |
| // |-----------------------------------| <- fp(=x29) |
| // Default SVE stack layout Split SVE objects |
| // (aarch64-split-sve-objects=false) (aarch64-split-sve-objects=true) |
| // |-----------------------------------| |-----------------------------------| |
| // | <hazard padding> | | callee-saved PPR registers | |
| // |-----------------------------------| |-----------------------------------| |
| // | | | PPR stack objects | |
| // | callee-saved fp/simd/SVE regs | |-----------------------------------| |
| // | | | <hazard padding> | |
| // |-----------------------------------| |-----------------------------------| |
| // | | | callee-saved ZPR/FPR registers | |
| // | SVE stack objects | |-----------------------------------| |
| // | | | ZPR stack objects | |
| // |-----------------------------------| |-----------------------------------| |
| // ^ NB: FPR CSRs are promoted to ZPRs |
| // |-----------------------------------| |
| // |.empty.space.to.make.part.below....| |
| // |.aligned.in.case.it.needs.more.than| (size of this area is unknown at |
| // |.the.standard.16-byte.alignment....| compile time; if present) |
| // |-----------------------------------| |
| // | local variables of fixed size | |
| // | including spill slots | |
| // | <FPR> | |
| // | <hazard padding> | |
| // | <GPR> | |
| // |-----------------------------------| <- bp(not defined by ABI, |
| // |.variable-sized.local.variables....| LLVM chooses X19) |
| // |.(VLAs)............................| (size of this area is unknown at |
| // |...................................| compile time) |
| // |-----------------------------------| <- sp |
| // | | Lower address |
| // |
| // |
| // To access the data in a frame, a constant offset from one of the pointers |
| // (fp, bp, sp) must be computable at compile time. The sizes of the areas |
| // with a dotted background cannot be computed at compile time if they are |
| // present, so all three of fp, bp and sp must be set up in order to access |
| // all contents of the frame areas, assuming all of the frame areas are |
| // non-empty. |
| // |
| // For most functions, some of the frame areas are empty. For those functions, |
| // it may not be necessary to set up fp or bp: |
| // * A base pointer is definitely needed when there are both VLAs and local |
| // variables with more-than-default alignment requirements. |
| // * A frame pointer is definitely needed when there are local variables with |
| // more-than-default alignment requirements. |
| // |
| // For Darwin platforms the frame-record (fp, lr) is stored at the top of the |
| // callee-saved area, since the unwind encoding does not allow for encoding |
| // this dynamically and existing tools depend on this layout. For other |
| // platforms, the frame-record is stored at the bottom of the (gpr) callee-saved |
| // area to allow SVE stack objects (allocated directly below the callee-saves, |
| // if available) to be accessed directly from the frame pointer. |
| // The SVE spill/fill instructions have VL-scaled addressing modes such |
| // as: |
| // ldr z8, [fp, #-7 mul vl] |
| // For SVE the size of the vector length (VL) is not known at compile-time, so |
| // '#-7 mul vl' is an offset that can only be evaluated at runtime. With this |
| // layout, we don't need to add an unscaled offset to the frame pointer before |
| // accessing the SVE object in the frame. |
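| // |
| // For example (illustrative only), an SVE stack object two vectors below the |
| // SVE callee-saves can be reached directly as: |
| //   ldr z0, [fp, #-2 mul vl] |
| // whereas with the frame record at the top of the callee-saved area (as on |
| // Darwin), reaching the same object would generally require first materialising |
| // fp minus the fixed-size callee-save area in a temporary register. |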
| // |
| // Even when a base pointer is not strictly needed, it is generated anyway when |
| // the offsets from the frame pointer used to access local variables become |
| // so large that they can't be encoded in the immediate fields of loads |
| // or stores. |
| // |
| // Outgoing function arguments must be at the bottom of the stack frame when |
| // calling another function. If we do not have variable-sized stack objects, we |
| // can allocate a "reserved call frame" area at the bottom of the local |
| // variable area, large enough for all outgoing calls. If we do have VLAs, then |
| // the stack pointer must be decremented and incremented around each call to |
| // make space for the arguments below the VLAs. |
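| // |
| // For example (illustrative only), with a reserved call frame the outgoing |
| // argument space is folded into the prologue's single SP adjustment: |
| //   sub sp, sp, #(locals + max_outgoing_args) |
| // whereas with VLAs each call site brackets its own argument area: |
| //   sub sp, sp, #outgoing_args ... bl callee ... add sp, sp, #outgoing_args |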
| // |
| // FIXME: also explain the redzone concept. |
| // |
| // About stack hazards: Under some SME contexts, a coprocessor with its own |
| // separate cache can be used for FP operations. This can create hazards if the CPU |
| // and the SME unit try to access the same area of memory, including if the |
| // access is to an area of the stack. To try to alleviate this we attempt to |
| // introduce extra padding into the stack frame between FP and GPR accesses, |
| // controlled by the aarch64-stack-hazard-size option. Without changing the |
| // layout of the stack frame in the diagram above, a stack object of size |
| // aarch64-stack-hazard-size is added between GPR and FPR CSRs. Another is added |
| // to the stack objects section, and stack objects are sorted so that FPR > |
| // Hazard padding slot > GPRs (where possible). Unfortunately some things are |
| // not handled well (VLA area, arguments on the stack, objects with both GPR and |
| // FPR accesses), but if those are controlled by the user then the entire stack |
| // frame becomes GPR at the start/end with FPR in the middle, surrounded by |
| // Hazard padding. |
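| // |
| // For example (illustrative only), with -aarch64-stack-hazard-size=1024 the |
| // fixed-size local area is sorted roughly as: |
| //   [ GPR objects ][ 1024 bytes of hazard padding ][ FPR objects ] |
| // keeping GPR and FPR accesses at least the hazard size apart on the stack. |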
| // |
| // An example of the prologue: |
| // |
| // .globl __foo |
| // .align 2 |
| // __foo: |
| // Ltmp0: |
| // .cfi_startproc |
| // .cfi_personality 155, ___gxx_personality_v0 |
| // Leh_func_begin: |
| // .cfi_lsda 16, Lexception33 |
| // |
| //     stp xa, xb, [sp, #-offset]! |
| // ... |
| // stp x28, x27, [sp, #offset-32] |
| // stp fp, lr, [sp, #offset-16] |
| // add fp, sp, #offset - 16 |
| // sub sp, sp, #1360 |
| // |
| // The Stack: |
| // +-------------------------------------------+ |
| // 10000 | ........ | ........ | ........ | ........ | |
| // 10004 | ........ | ........ | ........ | ........ | |
| // +-------------------------------------------+ |
| // 10008 | ........ | ........ | ........ | ........ | |
| // 1000c | ........ | ........ | ........ | ........ | |
| // +===========================================+ |
| // 10010 | X28 Register | |
| // 10014 | X28 Register | |
| // +-------------------------------------------+ |
| // 10018 | X27 Register | |
| // 1001c | X27 Register | |
| // +===========================================+ |
| // 10020 | Frame Pointer | |
| // 10024 | Frame Pointer | |
| // +-------------------------------------------+ |
| // 10028 | Link Register | |
| // 1002c | Link Register | |
| // +===========================================+ |
| // 10030 | ........ | ........ | ........ | ........ | |
| // 10034 | ........ | ........ | ........ | ........ | |
| // +-------------------------------------------+ |
| // 10038 | ........ | ........ | ........ | ........ | |
| // 1003c | ........ | ........ | ........ | ........ | |
| // +-------------------------------------------+ |
| // |
| // [sp] = 10030 :: >>initial value<< |
| // sp = 10020 :: stp fp, lr, [sp, #-16]! |
| // fp = sp == 10020 :: mov fp, sp |
| // [sp] == 10020 :: stp x28, x27, [sp, #-16]! |
| // sp == 10010 :: >>final value<< |
| // |
| // The frame pointer (w29) points to address 10020. If we use an offset of |
| // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24 |
| // for w27, and -32 for w28: |
| // |
| // Ltmp1: |
| // .cfi_def_cfa w29, 16 |
| // Ltmp2: |
| // .cfi_offset w30, -8 |
| // Ltmp3: |
| // .cfi_offset w29, -16 |
| // Ltmp4: |
| // .cfi_offset w27, -24 |
| // Ltmp5: |
| // .cfi_offset w28, -32 |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "AArch64FrameLowering.h" |
| #include "AArch64InstrInfo.h" |
| #include "AArch64MachineFunctionInfo.h" |
| #include "AArch64PrologueEpilogue.h" |
| #include "AArch64RegisterInfo.h" |
| #include "AArch64SMEAttributes.h" |
| #include "AArch64Subtarget.h" |
| #include "MCTargetDesc/AArch64AddressingModes.h" |
| #include "MCTargetDesc/AArch64MCTargetDesc.h" |
| #include "llvm/ADT/ScopeExit.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/Analysis/ValueTracking.h" |
| #include "llvm/CodeGen/CFIInstBuilder.h" |
| #include "llvm/CodeGen/LivePhysRegs.h" |
| #include "llvm/CodeGen/MachineBasicBlock.h" |
| #include "llvm/CodeGen/MachineFrameInfo.h" |
| #include "llvm/CodeGen/MachineFunction.h" |
| #include "llvm/CodeGen/MachineInstr.h" |
| #include "llvm/CodeGen/MachineInstrBuilder.h" |
| #include "llvm/CodeGen/MachineMemOperand.h" |
| #include "llvm/CodeGen/MachineModuleInfo.h" |
| #include "llvm/CodeGen/MachineOperand.h" |
| #include "llvm/CodeGen/MachineRegisterInfo.h" |
| #include "llvm/CodeGen/RegisterScavenging.h" |
| #include "llvm/CodeGen/TargetInstrInfo.h" |
| #include "llvm/CodeGen/TargetRegisterInfo.h" |
| #include "llvm/CodeGen/TargetSubtargetInfo.h" |
| #include "llvm/CodeGen/WinEHFuncInfo.h" |
| #include "llvm/IR/Attributes.h" |
| #include "llvm/IR/CallingConv.h" |
| #include "llvm/IR/DataLayout.h" |
| #include "llvm/IR/DebugLoc.h" |
| #include "llvm/IR/Function.h" |
| #include "llvm/MC/MCAsmInfo.h" |
| #include "llvm/MC/MCDwarf.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/ErrorHandling.h" |
| #include "llvm/Support/FormatVariadic.h" |
| #include "llvm/Support/MathExtras.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include "llvm/Target/TargetMachine.h" |
| #include "llvm/Target/TargetOptions.h" |
| #include <cassert> |
| #include <cstdint> |
| #include <iterator> |
| #include <optional> |
| #include <vector> |
| |
| using namespace llvm; |
| |
| #define DEBUG_TYPE "frame-info" |
| |
| static cl::opt<bool> EnableRedZone("aarch64-redzone", |
| cl::desc("enable use of redzone on AArch64"), |
| cl::init(false), cl::Hidden); |
| |
| static cl::opt<bool> StackTaggingMergeSetTag( |
| "stack-tagging-merge-settag", |
| cl::desc("merge settag instruction in function epilog"), cl::init(true), |
| cl::Hidden); |
| |
| static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects", |
| cl::desc("sort stack allocations"), |
| cl::init(true), cl::Hidden); |
| |
| static cl::opt<bool> |
| SplitSVEObjects("aarch64-split-sve-objects", |
| cl::desc("Split allocation of ZPR & PPR objects"), |
| cl::init(true), cl::Hidden); |
| |
| cl::opt<bool> EnableHomogeneousPrologEpilog( |
| "homogeneous-prolog-epilog", cl::Hidden, |
| cl::desc("Emit homogeneous prologue and epilogue for the size " |
| "optimization (default = off)")); |
| |
| // Stack hazard size for analysis remarks. StackHazardSize takes precedence. |
| static cl::opt<unsigned> |
| StackHazardRemarkSize("aarch64-stack-hazard-remark-size", cl::init(0), |
| cl::Hidden); |
| // Whether to insert padding into non-streaming functions (for testing). |
| static cl::opt<bool> |
| StackHazardInNonStreaming("aarch64-stack-hazard-in-non-streaming", |
| cl::init(false), cl::Hidden); |
| |
| static cl::opt<bool> DisableMultiVectorSpillFill( |
| "aarch64-disable-multivector-spill-fill", |
| cl::desc("Disable use of LD/ST pairs for SME2 or SVE2p1"), cl::init(false), |
| cl::Hidden); |
| |
| int64_t |
| AArch64FrameLowering::getArgumentStackToRestore(MachineFunction &MF, |
| MachineBasicBlock &MBB) const { |
| MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); |
| AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| bool IsTailCallReturn = (MBB.end() != MBBI) |
| ? AArch64InstrInfo::isTailCallReturnInst(*MBBI) |
| : false; |
| |
| int64_t ArgumentPopSize = 0; |
| if (IsTailCallReturn) { |
| MachineOperand &StackAdjust = MBBI->getOperand(1); |
| |
| // For a tail-call in a callee-pops-arguments environment, some or all of |
| // the stack may actually be in use for the call's arguments; this is |
| // calculated during LowerCall and consumed here... |
| ArgumentPopSize = StackAdjust.getImm(); |
| } else { |
| // ... otherwise the amount to pop is *all* of the argument space, |
| // conveniently stored in the MachineFunctionInfo by |
| // LowerFormalArguments. This will, of course, be zero for the C calling |
| // convention. |
| ArgumentPopSize = AFI->getArgumentStackToRestore(); |
| } |
| |
| return ArgumentPopSize; |
| } |
| |
| static bool produceCompactUnwindFrame(const AArch64FrameLowering &, |
| MachineFunction &MF); |
| |
| enum class AssignObjectOffsets { No, Yes }; |
| /// Process all the SVE stack objects, determining the SVE stack size and the |
| /// offset of each object. If AssignOffsets is "Yes", the offsets get assigned |
| /// (and SVE stack sizes set). Returns the size of the SVE stack. |
| static SVEStackSizes determineSVEStackSizes(MachineFunction &MF, |
| AssignObjectOffsets AssignOffsets); |
| |
| static unsigned getStackHazardSize(const MachineFunction &MF) { |
| return MF.getSubtarget<AArch64Subtarget>().getStreamingHazardSize(); |
| } |
| |
| StackOffset |
| AArch64FrameLowering::getZPRStackSize(const MachineFunction &MF) const { |
| const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| return StackOffset::getScalable(AFI->getStackSizeZPR()); |
| } |
| |
| StackOffset |
| AArch64FrameLowering::getPPRStackSize(const MachineFunction &MF) const { |
| // With split SVE objects, the hazard padding is added to the PPR region, |
| // which places it between the [GPR, PPR] area and the [ZPR, FPR] area. This |
| // avoids hazards between both GPRs and FPRs and ZPRs and PPRs. |
| const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| return StackOffset::get(AFI->hasSplitSVEObjects() ? getStackHazardSize(MF) |
| : 0, |
| AFI->getStackSizePPR()); |
| } |
| |
| // Conservatively returns true if the function is likely to have SVE vectors |
| // on the stack. This function is safe to call before callee-saves or |
| // object offsets have been determined. |
| static bool isLikelyToHaveSVEStack(const AArch64FrameLowering &AFL, |
| const MachineFunction &MF) { |
| auto *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| if (AFI->isSVECC()) |
| return true; |
| |
| if (AFI->hasCalculatedStackSizeSVE()) |
| return bool(AFL.getSVEStackSize(MF)); |
| |
| const MachineFrameInfo &MFI = MF.getFrameInfo(); |
| for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd(); FI++) { |
| if (MFI.hasScalableStackID(FI)) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| static bool isTargetWindows(const MachineFunction &MF) { |
| return MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); |
| } |
| |
| bool AArch64FrameLowering::hasSVECalleeSavesAboveFrameRecord( |
| const MachineFunction &MF) const { |
| auto *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| return isTargetWindows(MF) && AFI->getSVECalleeSavedStackSize(); |
| } |
| |
| /// Returns true if homogeneous prolog or epilog code can be emitted |
| /// for the size optimization. If possible, a frame helper call is injected. |
| /// When an Exit block is given, this check is for the epilog. |
| bool AArch64FrameLowering::homogeneousPrologEpilog( |
| MachineFunction &MF, MachineBasicBlock *Exit) const { |
| if (!MF.getFunction().hasMinSize()) |
| return false; |
| if (!EnableHomogeneousPrologEpilog) |
| return false; |
| if (EnableRedZone) |
| return false; |
| |
| // TODO: Windows is not supported yet. |
| if (isTargetWindows(MF)) |
| return false; |
| |
| // TODO: SVE is not supported yet. |
| if (isLikelyToHaveSVEStack(*this, MF)) |
| return false; |
| |
| // Bail on stack adjustment needed on return for simplicity. |
| const MachineFrameInfo &MFI = MF.getFrameInfo(); |
| const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); |
| if (MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF)) |
| return false; |
| if (Exit && getArgumentStackToRestore(MF, *Exit)) |
| return false; |
| |
| auto *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| if (AFI->hasSwiftAsyncContext() || AFI->hasStreamingModeChanges()) |
| return false; |
| |
| // If there are an odd number of GPRs before LR and FP in the CSRs list, |
| // they will not be paired into one RegPairInfo, which is incompatible with |
| // the assumption made by the homogeneous prolog epilog pass. |
| const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs(); |
| unsigned NumGPRs = 0; |
| for (unsigned I = 0; CSRegs[I]; ++I) { |
| Register Reg = CSRegs[I]; |
| if (Reg == AArch64::LR) { |
| assert(CSRegs[I + 1] == AArch64::FP); |
| if (NumGPRs % 2 != 0) |
| return false; |
| break; |
| } |
| if (AArch64::GPR64RegClass.contains(Reg)) |
| ++NumGPRs; |
| } |
| |
| return true; |
| } |
| |
| /// Returns true if CSRs should be paired. |
| bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const { |
| return produceCompactUnwindFrame(*this, MF) || homogeneousPrologEpilog(MF); |
| } |
| |
| /// This is the biggest offset to the stack pointer we can encode in aarch64 |
| /// instructions (without using a separate calculation and a temp register). |
| /// Note that the exceptions here are vector stores/loads, which cannot encode any |
| /// displacements (see estimateRSStackSizeLimit(), isAArch64FrameOffsetLegal()). |
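| /// |
| /// For example (illustrative only), 255 is the largest positive offset accepted |
| /// by the unscaled load/store forms, whose signed 9-bit immediates cover |
| /// -256..255: |
| ///   ldur x0, [sp, #255] |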
| static const unsigned DefaultSafeSPDisplacement = 255; |
| |
| /// Look at each instruction that references stack frames and return the stack |
| /// size limit beyond which some of these instructions will require a scratch |
| /// register during their expansion later. |
| static unsigned estimateRSStackSizeLimit(MachineFunction &MF) { |
| // FIXME: For now, just conservatively guesstimate based on unscaled indexing |
| // range. We'll end up allocating an unnecessary spill slot a lot, but |
| // realistically that's not a big deal at this stage of the game. |
| for (MachineBasicBlock &MBB : MF) { |
| for (MachineInstr &MI : MBB) { |
| if (MI.isDebugInstr() || MI.isPseudo() || |
| MI.getOpcode() == AArch64::ADDXri || |
| MI.getOpcode() == AArch64::ADDSXri) |
| continue; |
| |
| for (const MachineOperand &MO : MI.operands()) { |
| if (!MO.isFI()) |
| continue; |
| |
| StackOffset Offset; |
| if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) == |
| AArch64FrameOffsetCannotUpdate) |
| return 0; |
| } |
| } |
| } |
| return DefaultSafeSPDisplacement; |
| } |
| |
| TargetStackID::Value |
| AArch64FrameLowering::getStackIDForScalableVectors() const { |
| return TargetStackID::ScalableVector; |
| } |
| |
| unsigned |
| AArch64FrameLowering::getFixedObjectSize(const MachineFunction &MF, |
| const AArch64FunctionInfo *AFI, |
| bool IsWin64, bool IsFunclet) const { |
| assert(AFI->getTailCallReservedStack() % 16 == 0 && |
| "Tail call reserved stack must be aligned to 16 bytes"); |
| if (!IsWin64 || IsFunclet) { |
| return AFI->getTailCallReservedStack(); |
| } else { |
| if (AFI->getTailCallReservedStack() != 0 && |
| !MF.getFunction().getAttributes().hasAttrSomewhere( |
| Attribute::SwiftAsync)) |
| report_fatal_error("cannot generate ABI-changing tail call for Win64"); |
| unsigned FixedObjectSize = AFI->getTailCallReservedStack(); |
| |
| // Var args are stored here in the primary function. |
| FixedObjectSize += AFI->getVarArgsGPRSize(); |
| |
| if (MF.hasEHFunclets()) { |
| // Catch objects are stored here in the primary function. |
| const MachineFrameInfo &MFI = MF.getFrameInfo(); |
| const WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo(); |
| SmallSetVector<int, 8> CatchObjFrameIndices; |
| for (const WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) { |
| for (const WinEHHandlerType &H : TBME.HandlerArray) { |
| int FrameIndex = H.CatchObj.FrameIndex; |
| if ((FrameIndex != INT_MAX) && |
| CatchObjFrameIndices.insert(FrameIndex)) { |
| FixedObjectSize = alignTo(FixedObjectSize, |
| MFI.getObjectAlign(FrameIndex).value()) + |
| MFI.getObjectSize(FrameIndex); |
| } |
| } |
| } |
| // To support EH funclets we allocate an UnwindHelp object |
| FixedObjectSize += 8; |
| } |
| return alignTo(FixedObjectSize, 16); |
| } |
| } |
| |
| bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { |
| if (!EnableRedZone) |
| return false; |
| |
| // Don't use the red zone if the function explicitly asks us not to. |
| // This is typically used for kernel code. |
| const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); |
| const unsigned RedZoneSize = |
| Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction()); |
| if (!RedZoneSize) |
| return false; |
| |
| const MachineFrameInfo &MFI = MF.getFrameInfo(); |
| const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| uint64_t NumBytes = AFI->getLocalStackSize(); |
| |
| // If neither NEON nor SVE is available, a COPY from one Q-reg to |
| // another requires a spill -> reload sequence. We can do that |
| // using a pre-decrementing store/post-decrementing load, but |
| // if we do so, we can't use the Red Zone. |
| bool LowerQRegCopyThroughMem = Subtarget.hasFPARMv8() && |
| !Subtarget.isNeonAvailable() && |
| !Subtarget.hasSVE(); |
| |
| return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize || |
| AFI->hasSVEStackSize() || LowerQRegCopyThroughMem); |
| } |
| |
| /// hasFPImpl - Return true if the specified function should have a dedicated |
| /// frame pointer register. |
| bool AArch64FrameLowering::hasFPImpl(const MachineFunction &MF) const { |
| const MachineFrameInfo &MFI = MF.getFrameInfo(); |
| const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); |
| const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>(); |
| |
| // Win64 EH requires a frame pointer if funclets are present, as the locals |
| // are accessed off the frame pointer in both the parent function and the |
| // funclets. |
| if (MF.hasEHFunclets()) |
| return true; |
| // Retain behavior of always omitting the FP for leaf functions when possible. |
| if (MF.getTarget().Options.DisableFramePointerElim(MF)) |
| return true; |
| if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() || |
| MFI.hasStackMap() || MFI.hasPatchPoint() || |
| RegInfo->hasStackRealignment(MF)) |
| return true; |
| |
| // If we: |
| // |
| // 1. Have streaming mode changes |
| // OR: |
| // 2. Have a streaming body with SVE stack objects |
| // |
| // Then the value of VG restored when unwinding to this function may not match |
| // the value of VG used to set up the stack. |
| // |
| // This is a problem as the CFA can be described with an expression of the |
| // form: CFA = SP + NumBytes + VG * NumScalableBytes. |
| // |
| // If the value of VG used in that expression does not match the value used to |
| // set up the stack, an incorrect address for the CFA will be computed, and |
| // unwinding will fail. |
| // |
| // We work around this issue by ensuring the frame-pointer can describe the |
| // CFA in either of these cases. |
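| // |
| // For example (illustrative only), with NumBytes = 32 and NumScalableBytes = 16 |
| // the CFA is SP + 32 + 2*16 = SP + 64 when VG = 2 (128-bit vectors), but |
| // SP + 32 + 4*16 = SP + 96 when VG = 4 (256-bit vectors). Unwinding with a |
| // stale VG would therefore compute the wrong CFA, whereas an FP-based CFA |
| // description does not depend on VG. |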
| if (AFI.needsDwarfUnwindInfo(MF) && |
| ((requiresSaveVG(MF) || AFI.getSMEFnAttrs().hasStreamingBody()) && |
| (!AFI.hasCalculatedStackSizeSVE() || AFI.hasSVEStackSize()))) |
| return true; |
| // With large call frames around we may need to use FP to access the |
| // scavenging emergency spill slot. |
| // |
| // Unfortunately some calls to hasFP() like machine verifier -> |
| // getReservedReg() -> hasFP in the middle of global isel are too early |
| // to know the max call frame size. Hopefully conservatively returning "true" |
| // in those cases is fine. |
| // DefaultSafeSPDisplacement is fine as we only emergency spill GP regs. |
| if (!MFI.isMaxCallFrameSizeComputed() || |
| MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement) |
| return true; |
| |
| return false; |
| } |
| |
| /// Should the Frame Pointer be reserved for the current function? |
| bool AArch64FrameLowering::isFPReserved(const MachineFunction &MF) const { |
| const TargetMachine &TM = MF.getTarget(); |
| const Triple &TT = TM.getTargetTriple(); |
| |
| // These OSes require the frame chain to be valid, even if the current frame does |
| // not use a frame pointer. |
| if (TT.isOSDarwin() || TT.isOSWindows()) |
| return true; |
| |
| // If the function has a frame pointer, it is reserved. |
| if (hasFP(MF)) |
| return true; |
| |
| // Frontend has requested to preserve the frame pointer. |
| if (TM.Options.FramePointerIsReserved(MF)) |
| return true; |
| |
| return false; |
| } |
| |
| /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is |
| /// not required, we reserve argument space for call sites in the function |
| /// immediately on entry to the current function. This eliminates the need for |
| /// add/sub sp brackets around call sites. Returns true if the call frame is |
| /// included as part of the stack frame. |
| bool AArch64FrameLowering::hasReservedCallFrame( |
| const MachineFunction &MF) const { |
| // The stack probing code for the dynamically allocated outgoing arguments |
| // area assumes that the stack is probed at the top - either by the prologue |
| // code, which issues a probe if `hasVarSizedObjects` returns true, or by the |
| // most recent variable-sized object allocation. Changing the condition here |
| // may need to be followed up by changes to the probe issuing logic. |
| return !MF.getFrameInfo().hasVarSizedObjects(); |
| } |
| |
| MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( |
| MachineFunction &MF, MachineBasicBlock &MBB, |
| MachineBasicBlock::iterator I) const { |
| |
| const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); |
| const AArch64InstrInfo *TII = Subtarget.getInstrInfo(); |
| const AArch64TargetLowering *TLI = Subtarget.getTargetLowering(); |
| [[maybe_unused]] MachineFrameInfo &MFI = MF.getFrameInfo(); |
| DebugLoc DL = I->getDebugLoc(); |
| unsigned Opc = I->getOpcode(); |
| bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); |
| uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0; |
| |
| if (!hasReservedCallFrame(MF)) { |
| int64_t Amount = I->getOperand(0).getImm(); |
| Amount = alignTo(Amount, getStackAlign()); |
| if (!IsDestroy) |
| Amount = -Amount; |
| |
| // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it |
| // doesn't have to pop anything), then the first operand will be zero too so |
| // this adjustment is a no-op. |
| if (CalleePopAmount == 0) { |
| // FIXME: in-function stack adjustment for calls is limited to 24-bits |
| // because there's no guaranteed temporary register available. |
| // |
| // ADD/SUB (immediate) has only LSL #0 and LSL #12 available. |
| // 1) For offset <= 12-bit, we use LSL #0 |
| // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses |
| // LSL #0, and the other uses LSL #12. |
| // |
| // Most call frames will be allocated at the start of a function so |
| // this is OK, but it is a limitation that needs dealing with. |
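| // |
| // For example (illustrative only), a 0x12345-byte adjustment can be emitted |
| // as two immediates: |
| //   sub sp, sp, #0x12, lsl #12   (subtracts 0x12000) |
| //   sub sp, sp, #0x345           (subtracts the remaining 0x345) |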
| assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); |
| |
| if (TLI->hasInlineStackProbe(MF) && |
| -Amount >= AArch64::StackProbeMaxUnprobedStack) { |
| // When stack probing is enabled, the decrement of SP may need to be |
| // probed. We only need to do this if the call site needs 1024 bytes of |
| // space or more, because a region smaller than that is allowed to be |
| // unprobed at an ABI boundary. We rely on the fact that SP has been |
| // probed exactly at this point, either by the prologue or most recent |
| // dynamic allocation. |
| assert(MFI.hasVarSizedObjects() && |
| "non-reserved call frame without var sized objects?"); |
| Register ScratchReg = |
| MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); |
| inlineStackProbeFixed(I, ScratchReg, -Amount, StackOffset::get(0, 0)); |
| } else { |
| emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, |
| StackOffset::getFixed(Amount), TII); |
| } |
| } |
| } else if (CalleePopAmount != 0) { |
| // If the calling convention demands that the callee pops arguments from the |
| // stack, we want to add it back if we have a reserved call frame. |
| assert(CalleePopAmount < 0xffffff && "call frame too large"); |
| emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, |
| StackOffset::getFixed(-(int64_t)CalleePopAmount), TII); |
| } |
| return MBB.erase(I); |
| } |
| |
| void AArch64FrameLowering::resetCFIToInitialState( |
| MachineBasicBlock &MBB) const { |
| |
| MachineFunction &MF = *MBB.getParent(); |
| const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); |
| const auto &TRI = *Subtarget.getRegisterInfo(); |
| const auto &MFI = *MF.getInfo<AArch64FunctionInfo>(); |
| |
| CFIInstBuilder CFIBuilder(MBB, MBB.begin(), MachineInstr::NoFlags); |
| |
| // Reset the CFA to `SP + 0`. |
| CFIBuilder.buildDefCFA(AArch64::SP, 0); |
| |
| // Flip the RA sign state. |
| if (MFI.shouldSignReturnAddress(MF)) |
| MFI.branchProtectionPAuthLR() ? CFIBuilder.buildNegateRAStateWithPC() |
| : CFIBuilder.buildNegateRAState(); |
| |
| // Shadow call stack uses X18, reset it. |
| if (MFI.needsShadowCallStackPrologueEpilogue(MF)) |
| CFIBuilder.buildSameValue(AArch64::X18); |
| |
| // Emit .cfi_same_value for callee-saved registers. |
| const std::vector<CalleeSavedInfo> &CSI = |
| MF.getFrameInfo().getCalleeSavedInfo(); |
| for (const auto &Info : CSI) { |
| MCRegister Reg = Info.getReg(); |
| if (!TRI.regNeedsCFI(Reg, Reg)) |
| continue; |
| CFIBuilder.buildSameValue(Reg); |
| } |
| } |
| |
| static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) { |
| switch (Reg.id()) { |
| default: |
| // The called routine is expected to preserve x19-x28; |
| // x29 and x30 are used as the frame pointer and link register respectively. |
| return 0; |
| |
| // GPRs |
| #define CASE(n) \ |
| case AArch64::W##n: \ |
| case AArch64::X##n: \ |
| return AArch64::X##n |
| CASE(0); |
| CASE(1); |
| CASE(2); |
| CASE(3); |
| CASE(4); |
| CASE(5); |
| CASE(6); |
| CASE(7); |
| CASE(8); |
| CASE(9); |
| CASE(10); |
| CASE(11); |
| CASE(12); |
| CASE(13); |
| CASE(14); |
| CASE(15); |
| CASE(16); |
| CASE(17); |
| CASE(18); |
| #undef CASE |
| |
| // FPRs |
| #define CASE(n) \ |
| case AArch64::B##n: \ |
| case AArch64::H##n: \ |
| case AArch64::S##n: \ |
| case AArch64::D##n: \ |
| case AArch64::Q##n: \ |
| return HasSVE ? AArch64::Z##n : AArch64::Q##n |
| CASE(0); |
| CASE(1); |
| CASE(2); |
| CASE(3); |
| CASE(4); |
| CASE(5); |
| CASE(6); |
| CASE(7); |
| CASE(8); |
| CASE(9); |
| CASE(10); |
| CASE(11); |
| CASE(12); |
| CASE(13); |
| CASE(14); |
| CASE(15); |
| CASE(16); |
| CASE(17); |
| CASE(18); |
| CASE(19); |
| CASE(20); |
| CASE(21); |
| CASE(22); |
| CASE(23); |
| CASE(24); |
| CASE(25); |
| CASE(26); |
| CASE(27); |
| CASE(28); |
| CASE(29); |
| CASE(30); |
| CASE(31); |
| #undef CASE |
| } |
| } |
| |
| void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero, |
| MachineBasicBlock &MBB) const { |
| // Insertion point. |
| MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); |
| |
| // Fake a debug loc. |
| DebugLoc DL; |
| if (MBBI != MBB.end()) |
| DL = MBBI->getDebugLoc(); |
| |
| const MachineFunction &MF = *MBB.getParent(); |
| const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>(); |
| const AArch64RegisterInfo &TRI = *STI.getRegisterInfo(); |
| |
| BitVector GPRsToZero(TRI.getNumRegs()); |
| BitVector FPRsToZero(TRI.getNumRegs()); |
| bool HasSVE = STI.isSVEorStreamingSVEAvailable(); |
| for (MCRegister Reg : RegsToZero.set_bits()) { |
| if (TRI.isGeneralPurposeRegister(MF, Reg)) { |
| // For GPRs, we only care to clear out the 64-bit register. |
| if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE)) |
| GPRsToZero.set(XReg); |
| } else if (AArch64InstrInfo::isFpOrNEON(Reg)) { |
| // For FPRs, clear out the whole Q register, or the Z register when SVE is |
| // available. |
| if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE)) |
| FPRsToZero.set(XReg); |
| } |
| } |
| |
| const AArch64InstrInfo &TII = *STI.getInstrInfo(); |
| |
| // Zero out GPRs. |
| for (MCRegister Reg : GPRsToZero.set_bits()) |
| TII.buildClearRegister(Reg, MBB, MBBI, DL); |
| |
| // Zero out FP/vector registers. |
| for (MCRegister Reg : FPRsToZero.set_bits()) |
| TII.buildClearRegister(Reg, MBB, MBBI, DL); |
| |
| if (HasSVE) { |
| for (MCRegister PReg : |
| {AArch64::P0, AArch64::P1, AArch64::P2, AArch64::P3, AArch64::P4, |
| AArch64::P5, AArch64::P6, AArch64::P7, AArch64::P8, AArch64::P9, |
| AArch64::P10, AArch64::P11, AArch64::P12, AArch64::P13, AArch64::P14, |
| AArch64::P15}) { |
| if (RegsToZero[PReg]) |
| BuildMI(MBB, MBBI, DL, TII.get(AArch64::PFALSE), PReg); |
| } |
| } |
| } |
| |
| bool AArch64FrameLowering::windowsRequiresStackProbe( |
| const MachineFunction &MF, uint64_t StackSizeInBytes) const { |
| const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); |
| const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>(); |
| // TODO: When implementing stack protectors, take that into account |
| // for the probe threshold. |
| return Subtarget.isTargetWindows() && MFI.hasStackProbing() && |
| StackSizeInBytes >= uint64_t(MFI.getStackProbeSize()); |
| } |
| |
| static void getLiveRegsForEntryMBB(LivePhysRegs &LiveRegs, |
| const MachineBasicBlock &MBB) { |
| const MachineFunction *MF = MBB.getParent(); |
| LiveRegs.addLiveIns(MBB); |
| // Mark callee saved registers as used so we will not choose them. |
| const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs(); |
| for (unsigned i = 0; CSRegs[i]; ++i) |
| LiveRegs.addReg(CSRegs[i]); |
| } |
| |
| Register |
| AArch64FrameLowering::findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB, |
| bool HasCall) const { |
| MachineFunction *MF = MBB->getParent(); |
| |
| // If MBB is an entry block, use X9 as the scratch register. |
| // preserve_none functions may be using X9 to pass arguments, so for them |
| // prefer to pick an available register below. |
| if (&MF->front() == MBB && |
| MF->getFunction().getCallingConv() != CallingConv::PreserveNone) |
| return AArch64::X9; |
| |
| const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>(); |
| const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo(); |
| LivePhysRegs LiveRegs(TRI); |
| getLiveRegsForEntryMBB(LiveRegs, *MBB); |
| if (HasCall) { |
| LiveRegs.addReg(AArch64::X16); |
| LiveRegs.addReg(AArch64::X17); |
| LiveRegs.addReg(AArch64::X18); |
| } |
| |
| // Prefer X9 since it was historically used for the prologue scratch reg. |
| const MachineRegisterInfo &MRI = MF->getRegInfo(); |
| if (LiveRegs.available(MRI, AArch64::X9)) |
| return AArch64::X9; |
| |
| for (unsigned Reg : AArch64::GPR64RegClass) { |
| if (LiveRegs.available(MRI, Reg)) |
| return Reg; |
| } |
| return AArch64::NoRegister; |
| } |
| |
| bool AArch64FrameLowering::canUseAsPrologue( |
| const MachineBasicBlock &MBB) const { |
| const MachineFunction *MF = MBB.getParent(); |
| MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB); |
| const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>(); |
| const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
| const AArch64TargetLowering *TLI = Subtarget.getTargetLowering(); |
| const AArch64FunctionInfo *AFI = MF->getInfo<AArch64FunctionInfo>(); |
| |
| if (AFI->hasSwiftAsyncContext()) { |
| const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo(); |
| const MachineRegisterInfo &MRI = MF->getRegInfo(); |
| LivePhysRegs LiveRegs(TRI); |
| getLiveRegsForEntryMBB(LiveRegs, MBB); |
| // The StoreSwiftAsyncContext clobbers X16 and X17. Make sure they are |
| // available. |
| if (!LiveRegs.available(MRI, AArch64::X16) || |
| !LiveRegs.available(MRI, AArch64::X17)) |
| return false; |
| } |
| |
| // Certain stack probing sequences might clobber flags, so we can't use |
| // the block as a prologue if the flags register is a live-in. |
| if (MF->getInfo<AArch64FunctionInfo>()->hasStackProbing() && |
| MBB.isLiveIn(AArch64::NZCV)) |
| return false; |
| |
| if (RegInfo->hasStackRealignment(*MF) || TLI->hasInlineStackProbe(*MF)) |
| if (findScratchNonCalleeSaveRegister(TmpMBB) == AArch64::NoRegister) |
| return false; |
| |
| // We may need a scratch register (for the return value) if we have to make a |
| // special call. |
| if (requiresSaveVG(*MF) || |
| windowsRequiresStackProbe(*MF, std::numeric_limits<uint64_t>::max())) |
| if (findScratchNonCalleeSaveRegister(TmpMBB, true) == AArch64::NoRegister) |
| return false; |
| |
| return true; |
| } |
| |
| bool AArch64FrameLowering::needsWinCFI(const MachineFunction &MF) const { |
| const Function &F = MF.getFunction(); |
| return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() && |
| F.needsUnwindTableEntry(); |
| } |
| |
| bool AArch64FrameLowering::shouldSignReturnAddressEverywhere( |
| const MachineFunction &MF) const { |
| // FIXME: With WinCFI, extra care should be taken to place SEH_PACSignLR |
| // and SEH_EpilogEnd instructions in the correct order. |
| if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) |
| return false; |
| const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| return AFI->getSignReturnAddressCondition() == SignReturnAddress::All; |
| } |
| |
| // Given a load or a store instruction, generate an appropriate unwinding SEH |
| // code on Windows. |
| MachineBasicBlock::iterator |
| AArch64FrameLowering::insertSEH(MachineBasicBlock::iterator MBBI, |
| const AArch64InstrInfo &TII, |
| MachineInstr::MIFlag Flag) const { |
| unsigned Opc = MBBI->getOpcode(); |
| MachineBasicBlock *MBB = MBBI->getParent(); |
| MachineFunction &MF = *MBB->getParent(); |
| DebugLoc DL = MBBI->getDebugLoc(); |
| unsigned ImmIdx = MBBI->getNumOperands() - 1; |
| int Imm = MBBI->getOperand(ImmIdx).getImm(); |
| MachineInstrBuilder MIB; |
| const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); |
| const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
| |
| switch (Opc) { |
| default: |
| report_fatal_error("No SEH Opcode for this instruction"); |
| case AArch64::STR_ZXI: |
| case AArch64::LDR_ZXI: { |
| unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg()); |
| MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveZReg)) |
| .addImm(Reg0) |
| .addImm(Imm) |
| .setMIFlag(Flag); |
| break; |
| } |
| case AArch64::STR_PXI: |
| case AArch64::LDR_PXI: { |
| unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg()); |
| MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SavePReg)) |
| .addImm(Reg0) |
| .addImm(Imm) |
| .setMIFlag(Flag); |
| break; |
| } |
| case AArch64::LDPDpost: |
| Imm = -Imm; |
| [[fallthrough]]; |
| case AArch64::STPDpre: { |
| unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg()); |
| unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg()); |
| MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP_X)) |
| .addImm(Reg0) |
| .addImm(Reg1) |
| .addImm(Imm * 8) |
| .setMIFlag(Flag); |
| break; |
| } |
| case AArch64::LDPXpost: |
| Imm = -Imm; |
| [[fallthrough]]; |
| case AArch64::STPXpre: { |
| Register Reg0 = MBBI->getOperand(1).getReg(); |
| Register Reg1 = MBBI->getOperand(2).getReg(); |
| if (Reg0 == AArch64::FP && Reg1 == AArch64::LR) |
| MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X)) |
| .addImm(Imm * 8) |
| .setMIFlag(Flag); |
| else |
| MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP_X)) |
| .addImm(RegInfo->getSEHRegNum(Reg0)) |
| .addImm(RegInfo->getSEHRegNum(Reg1)) |
| .addImm(Imm * 8) |
| .setMIFlag(Flag); |
| break; |
| } |
| case AArch64::LDRDpost: |
| Imm = -Imm; |
| [[fallthrough]]; |
| case AArch64::STRDpre: { |
| unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg()); |
| MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg_X)) |
| .addImm(Reg) |
| .addImm(Imm) |
| .setMIFlag(Flag); |
| break; |
| } |
| case AArch64::LDRXpost: |
| Imm = -Imm; |
| [[fallthrough]]; |
| case AArch64::STRXpre: { |
| unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg()); |
| MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg_X)) |
| .addImm(Reg) |
| .addImm(Imm) |
| .setMIFlag(Flag); |
| break; |
| } |
| case AArch64::STPDi: |
| case AArch64::LDPDi: { |
| unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg()); |
| unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg()); |
| MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP)) |
| .addImm(Reg0) |
| .addImm(Reg1) |
| .addImm(Imm * 8) |
| .setMIFlag(Flag); |
| break; |
| } |
| case AArch64::STPXi: |
| case AArch64::LDPXi: { |
| Register Reg0 = MBBI->getOperand(0).getReg(); |
| Register Reg1 = MBBI->getOperand(1).getReg(); |
| |
| int SEHReg0 = RegInfo->getSEHRegNum(Reg0); |
| int SEHReg1 = RegInfo->getSEHRegNum(Reg1); |
| |
| if (Reg0 == AArch64::FP && Reg1 == AArch64::LR) |
| MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR)) |
| .addImm(Imm * 8) |
| .setMIFlag(Flag); |
| else if (SEHReg0 >= 19 && SEHReg1 >= 19) |
| MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP)) |
| .addImm(SEHReg0) |
| .addImm(SEHReg1) |
| .addImm(Imm * 8) |
| .setMIFlag(Flag); |
| else |
| MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveAnyRegIP)) |
| .addImm(SEHReg0) |
| .addImm(SEHReg1) |
| .addImm(Imm * 8) |
| .setMIFlag(Flag); |
| break; |
| } |
| case AArch64::STRXui: |
| case AArch64::LDRXui: { |
| int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg()); |
| if (Reg >= 19) |
| MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg)) |
| .addImm(Reg) |
| .addImm(Imm * 8) |
| .setMIFlag(Flag); |
| else |
| MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveAnyRegI)) |
| .addImm(Reg) |
| .addImm(Imm * 8) |
| .setMIFlag(Flag); |
| break; |
| } |
| case AArch64::STRDui: |
| case AArch64::LDRDui: { |
| unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg()); |
| MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg)) |
| .addImm(Reg) |
| .addImm(Imm * 8) |
| .setMIFlag(Flag); |
| break; |
| } |
| case AArch64::STPQi: |
| case AArch64::LDPQi: { |
| unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg()); |
| unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg()); |
| MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveAnyRegQP)) |
| .addImm(Reg0) |
| .addImm(Reg1) |
| .addImm(Imm * 16) |
| .setMIFlag(Flag); |
| break; |
| } |
| case AArch64::LDPQpost: |
| Imm = -Imm; |
| [[fallthrough]]; |
| case AArch64::STPQpre: { |
| unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg()); |
| unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg()); |
| MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveAnyRegQPX)) |
| .addImm(Reg0) |
| .addImm(Reg1) |
| .addImm(Imm * 16) |
| .setMIFlag(Flag); |
| break; |
| } |
| } |
| auto I = MBB->insertAfter(MBBI, MIB); |
| return I; |
| } |
| |
| bool AArch64FrameLowering::requiresSaveVG(const MachineFunction &MF) const { |
| const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| if (!AFI->needsDwarfUnwindInfo(MF) || !AFI->hasStreamingModeChanges()) |
| return false; |
| // For Darwin platforms we don't save VG for non-SVE functions, even if SME |
| // is enabled with streaming mode changes. |
| auto &ST = MF.getSubtarget<AArch64Subtarget>(); |
| if (ST.isTargetDarwin()) |
| return ST.hasSVE(); |
| return true; |
| } |
| |
| void AArch64FrameLowering::emitPacRetPlusLeafHardening( |
| MachineFunction &MF) const { |
| const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); |
| const AArch64InstrInfo *TII = Subtarget.getInstrInfo(); |
| |
| auto EmitSignRA = [&](MachineBasicBlock &MBB) { |
| DebugLoc DL; // Set debug location to unknown. |
| MachineBasicBlock::iterator MBBI = MBB.begin(); |
| |
| BuildMI(MBB, MBBI, DL, TII->get(AArch64::PAUTH_PROLOGUE)) |
| .setMIFlag(MachineInstr::FrameSetup); |
| }; |
| |
| auto EmitAuthRA = [&](MachineBasicBlock &MBB) { |
| DebugLoc DL; |
| MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); |
| if (MBBI != MBB.end()) |
| DL = MBBI->getDebugLoc(); |
| |
| BuildMI(MBB, MBBI, DL, TII->get(AArch64::PAUTH_EPILOGUE)) |
| .setMIFlag(MachineInstr::FrameDestroy); |
| }; |
| |
| // This should be in sync with PEIImpl::calculateSaveRestoreBlocks. |
| EmitSignRA(MF.front()); |
| for (MachineBasicBlock &MBB : MF) { |
| if (MBB.isEHFuncletEntry()) |
| EmitSignRA(MBB); |
| if (MBB.isReturnBlock()) |
| EmitAuthRA(MBB); |
| } |
| } |
| |
| void AArch64FrameLowering::emitPrologue(MachineFunction &MF, |
| MachineBasicBlock &MBB) const { |
| AArch64PrologueEmitter PrologueEmitter(MF, MBB, *this); |
| PrologueEmitter.emitPrologue(); |
| } |
| |
| void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, |
| MachineBasicBlock &MBB) const { |
| AArch64EpilogueEmitter EpilogueEmitter(MF, MBB, *this); |
| EpilogueEmitter.emitEpilogue(); |
| } |
| |
| bool AArch64FrameLowering::enableCFIFixup(const MachineFunction &MF) const { |
| return TargetFrameLowering::enableCFIFixup(MF) && |
| MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF); |
| } |
| |
| bool AArch64FrameLowering::enableFullCFIFixup(const MachineFunction &MF) const { |
| return enableCFIFixup(MF) && |
| MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo(MF); |
| } |
| |
| /// getFrameIndexReference - Provide a base+offset reference to an FI slot for |
| /// debug info. It's the same as what we use for resolving the code-gen |
| /// references for now. FIXME: This can go wrong when references are |
| /// SP-relative and simple call frames aren't used. |
| StackOffset |
| AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, |
| Register &FrameReg) const { |
| return resolveFrameIndexReference( |
| MF, FI, FrameReg, |
| /*PreferFP=*/ |
| MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress) || |
| MF.getFunction().hasFnAttribute(Attribute::SanitizeMemTag), |
| /*ForSimm=*/false); |
| } |
| |
| StackOffset |
| AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, |
| int FI) const { |
| // This function serves to provide a comparable offset from a single reference |
| // point (the value of SP at function entry) that can be used for analysis, |
| // e.g. the stack-frame-layout analysis pass. It is not guaranteed to be |
| // correct for all objects in the presence of VLA-area objects or dynamic |
| // stack re-alignment. |
| |
| const auto &MFI = MF.getFrameInfo(); |
| |
| int64_t ObjectOffset = MFI.getObjectOffset(FI); |
| StackOffset ZPRStackSize = getZPRStackSize(MF); |
| StackOffset PPRStackSize = getPPRStackSize(MF); |
| StackOffset SVEStackSize = ZPRStackSize + PPRStackSize; |
| |
| // For VLA-area objects, just emit an offset at the end of the stack frame. |
| // Whilst not quite correct, these objects do live at the end of the frame and |
| // so it is more useful for analysis for the offset to reflect this. |
| if (MFI.isVariableSizedObjectIndex(FI)) { |
| return StackOffset::getFixed(-((int64_t)MFI.getStackSize())) - SVEStackSize; |
| } |
| |
| // This is correct in the absence of any SVE stack objects. |
| if (!SVEStackSize) |
| return StackOffset::getFixed(ObjectOffset - getOffsetOfLocalArea()); |
| |
| const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| bool FPAfterSVECalleeSaves = hasSVECalleeSavesAboveFrameRecord(MF); |
| if (MFI.hasScalableStackID(FI)) { |
| if (FPAfterSVECalleeSaves && |
| -ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize()) { |
| assert(!AFI->hasSplitSVEObjects() && |
| "split-sve-objects not supported with FPAfterSVECalleeSaves"); |
| return StackOffset::getScalable(ObjectOffset); |
| } |
| StackOffset AccessOffset{}; |
| // The scalable vectors are below (lower address) the scalable predicates |
| // with split SVE objects, so we must subtract the size of the predicates. |
| if (AFI->hasSplitSVEObjects() && |
| MFI.getStackID(FI) == TargetStackID::ScalableVector) |
| AccessOffset = -PPRStackSize; |
| return AccessOffset + |
| StackOffset::get(-((int64_t)AFI->getCalleeSavedStackSize()), |
| ObjectOffset); |
| } |
| |
| bool IsFixed = MFI.isFixedObjectIndex(FI); |
| bool IsCSR = |
| !IsFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI)); |
| |
| StackOffset ScalableOffset = {}; |
| if (!IsFixed && !IsCSR) { |
| ScalableOffset = -SVEStackSize; |
| } else if (FPAfterSVECalleeSaves && IsCSR) { |
| ScalableOffset = |
| -StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()); |
| } |
| |
| return StackOffset::getFixed(ObjectOffset) + ScalableOffset; |
| } |
| |
| StackOffset |
| AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF, |
| int FI) const { |
| return StackOffset::getFixed(getSEHFrameIndexOffset(MF, FI)); |
| } |
| |
| StackOffset AArch64FrameLowering::getFPOffset(const MachineFunction &MF, |
| int64_t ObjectOffset) const { |
| const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); |
| const Function &F = MF.getFunction(); |
| bool IsWin64 = Subtarget.isCallingConvWin64(F.getCallingConv(), F.isVarArg()); |
| unsigned FixedObject = |
| getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false); |
| int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo()); |
| int64_t FPAdjust = |
| CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset(); |
| return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust); |
| } |
| |
| StackOffset AArch64FrameLowering::getStackOffset(const MachineFunction &MF, |
| int64_t ObjectOffset) const { |
| const auto &MFI = MF.getFrameInfo(); |
| return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize()); |
| } |
| |
| // TODO: This function currently does not work for scalable vectors. |
| int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF, |
| int FI) const { |
| const AArch64RegisterInfo *RegInfo = |
| MF.getSubtarget<AArch64Subtarget>().getRegisterInfo(); |
| int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI); |
| return RegInfo->getLocalAddressRegister(MF) == AArch64::FP |
| ? getFPOffset(MF, ObjectOffset).getFixed() |
| : getStackOffset(MF, ObjectOffset).getFixed(); |
| } |
| |
| StackOffset AArch64FrameLowering::resolveFrameIndexReference( |
| const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP, |
| bool ForSimm) const { |
| const auto &MFI = MF.getFrameInfo(); |
| int64_t ObjectOffset = MFI.getObjectOffset(FI); |
| bool isFixed = MFI.isFixedObjectIndex(FI); |
| auto StackID = static_cast<TargetStackID::Value>(MFI.getStackID(FI)); |
| return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, StackID, |
| FrameReg, PreferFP, ForSimm); |
| } |
| |
| StackOffset AArch64FrameLowering::resolveFrameOffsetReference( |
| const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, |
| TargetStackID::Value StackID, Register &FrameReg, bool PreferFP, |
| bool ForSimm) const { |
| const auto &MFI = MF.getFrameInfo(); |
| const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); |
| const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
| const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| |
| int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed(); |
| int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed(); |
| bool isCSR = |
| !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI)); |
| bool isSVE = MFI.isScalableStackID(StackID); |
| |
| StackOffset ZPRStackSize = getZPRStackSize(MF); |
| StackOffset PPRStackSize = getPPRStackSize(MF); |
| StackOffset SVEStackSize = ZPRStackSize + PPRStackSize; |
| |
| // Use frame pointer to reference fixed objects. Use it for locals if |
| // there are VLAs or a dynamically realigned SP (and thus the SP isn't |
| // reliable as a base). Make sure useFPForScavengingIndex() does the |
| // right thing for the emergency spill slot. |
| bool UseFP = false; |
| if (AFI->hasStackFrame() && !isSVE) { |
| // We shouldn't prefer using the FP to access fixed-sized stack objects when |
| // there are scalable (SVE) objects in between the FP and the fixed-sized |
| // objects. |
| PreferFP &= !SVEStackSize; |
| |
| // Note: Keeping the following as multiple 'if' statements rather than |
| // merging to a single expression for readability. |
| // |
| // Argument access should always use the FP. |
| if (isFixed) { |
| UseFP = hasFP(MF); |
| } else if (isCSR && RegInfo->hasStackRealignment(MF)) { |
| // References to the CSR area must use FP if we're re-aligning the stack |
| // since the dynamically-sized alignment padding is between the SP/BP and |
| // the CSR area. |
| assert(hasFP(MF) && "Re-aligned stack must have frame pointer"); |
| UseFP = true; |
| } else if (hasFP(MF) && !RegInfo->hasStackRealignment(MF)) { |
| // If the FPOffset is negative and we're producing a signed immediate, we |
| // have to keep in mind that the available offset range for negative |
| // offsets is smaller than for positive ones. If an offset is available |
| // via the FP and the SP, use whichever is closest. |
| bool FPOffsetFits = !ForSimm || FPOffset >= -256; |
| PreferFP |= Offset > -FPOffset && !SVEStackSize; |
| |
| if (FPOffset >= 0) { |
| // If the FPOffset is positive, that'll always be best, as the SP/BP |
| // will be even further away. |
| UseFP = true; |
| } else if (MFI.hasVarSizedObjects()) { |
| // If we have variable sized objects, we can use either FP or BP, as the |
| // SP offset is unknown. We can use the base pointer if we have one and |
| // FP is not preferred. If not, we're stuck with using FP. |
| bool CanUseBP = RegInfo->hasBasePointer(MF); |
| if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best. |
| UseFP = PreferFP; |
| else if (!CanUseBP) // Can't use BP. Forced to use FP. |
| UseFP = true; |
| // else we can use BP and FP, but the offset from FP won't fit. |
| // That will make us scavenge registers which we can probably avoid by |
| // using BP. If it won't fit for BP either, we'll scavenge anyway. |
| } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) { |
| // Funclets access the locals contained in the parent's stack frame |
| // via the frame pointer, so we have to use the FP in the parent |
| // function. |
| (void) Subtarget; |
| assert(Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv(), |
| MF.getFunction().isVarArg()) && |
| "Funclets should only be present on Win64"); |
| UseFP = true; |
| } else { |
| // We have the choice between FP and (SP or BP). |
| if (FPOffsetFits && PreferFP) // If FP is the best fit, use it. |
| UseFP = true; |
| } |
| } |
| } |
| |
| assert( |
| ((isFixed || isCSR) || !RegInfo->hasStackRealignment(MF) || !UseFP) && |
| "In the presence of dynamic stack pointer realignment, " |
| "non-argument/CSR objects cannot be accessed through the frame pointer"); |
| |
| bool FPAfterSVECalleeSaves = hasSVECalleeSavesAboveFrameRecord(MF); |
| |
| if (isSVE) { |
| StackOffset FPOffset = StackOffset::get( |
| -AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset); |
| StackOffset SPOffset = |
| SVEStackSize + |
| StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(), |
| ObjectOffset); |
| |
| // With split SVE objects the ObjectOffset is relative to the split area |
| // (i.e. the PPR area or ZPR area respectively). |
| if (AFI->hasSplitSVEObjects() && StackID == TargetStackID::ScalableVector) { |
| // If we're accessing an SVE vector with split SVE objects... |
| // - From the FP we need to move down past the PPR area: |
| FPOffset -= PPRStackSize; |
| // - From the SP we only need to move up to the ZPR area: |
| SPOffset -= PPRStackSize; |
| // Note: `SPOffset = SVEStackSize + ...`, so `-= PPRStackSize` results in |
| // `SPOffset = ZPRStackSize + ...`. |
| } |
| |
| if (FPAfterSVECalleeSaves) { |
| FPOffset += StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()); |
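| // If the object lies within the SVE callee-save area (which in this layout |
| // sits above the fixed-size callee-saves), also step over the fixed-size |
| // callee-save area. |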
| if (-ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize()) { |
| FPOffset += StackOffset::getFixed(AFI->getCalleeSavedStackSize()); |
| SPOffset += StackOffset::getFixed(AFI->getCalleeSavedStackSize()); |
| } |
| } |
| |
| // Always use the FP for SVE accesses if it is available and beneficial, i.e. |
| // when there is fixed-size stack between the SP and the SVE area, the FP's |
| // scalable offset is smaller, or the stack is being realigned. |
| if (hasFP(MF) && (SPOffset.getFixed() || |
| FPOffset.getScalable() < SPOffset.getScalable() || |
| RegInfo->hasStackRealignment(MF))) { |
| FrameReg = RegInfo->getFrameRegister(MF); |
| return FPOffset; |
| } |
| FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister() |
| : MCRegister(AArch64::SP); |
| |
| return SPOffset; |
| } |
| |
| StackOffset SVEAreaOffset = {}; |
| if (FPAfterSVECalleeSaves) { |
| // In this stack layout, the FP is in between the callee saves and other |
| // SVE allocations. |
| StackOffset SVECalleeSavedStack = |
| StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()); |
| if (UseFP) { |
| if (isFixed) |
| SVEAreaOffset = SVECalleeSavedStack; |
| else if (!isCSR) |
| SVEAreaOffset = SVECalleeSavedStack - SVEStackSize; |
| } else { |
| if (isFixed) |
| SVEAreaOffset = SVEStackSize; |
| else if (isCSR) |
| SVEAreaOffset = SVEStackSize - SVECalleeSavedStack; |
| } |
| } else { |
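| // Default layout: the SVE area sits between the callee-saves and the |
| // fixed-size locals, so FP-relative accesses to locals step down past it |
| // and SP-relative accesses to fixed/CSR objects step up past it. |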
| if (UseFP && !(isFixed || isCSR)) |
| SVEAreaOffset = -SVEStackSize; |
| if (!UseFP && (isFixed || isCSR)) |
| SVEAreaOffset = SVEStackSize; |
| } |
| |
| if (UseFP) { |
| FrameReg = RegInfo->getFrameRegister(MF); |
| return StackOffset::getFixed(FPOffset) + SVEAreaOffset; |
| } |
| |
| // Use the base pointer if we have one. |
| if (RegInfo->hasBasePointer(MF)) |
| FrameReg = RegInfo->getBaseRegister(); |
| else { |
| assert(!MFI.hasVarSizedObjects() && |
| "Can't use SP when we have var sized objects."); |
| FrameReg = AArch64::SP; |
| // If we're using the red zone for this function, the SP won't actually |
| // be adjusted, so the offsets will be negative. They're also all |
| // within range of the signed 9-bit immediate instructions. |
| if (canUseRedZone(MF)) |
| Offset -= AFI->getLocalStackSize(); |
| } |
| |
| return StackOffset::getFixed(Offset) + SVEAreaOffset; |
| } |
| |
| static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { |
| // Do not set a kill flag on values that are also marked as live-in. This |
| // happens with the @llvm.returnaddress intrinsic and with arguments passed in |
| // callee saved registers. |
| // Omitting the kill flags is conservatively correct even if the live-in |
| // is not used after all. |
| bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg); |
| return getKillRegState(!IsLiveIn); |
| } |
| |
| static bool produceCompactUnwindFrame(const AArch64FrameLowering &AFL, |
| MachineFunction &MF) { |
| const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); |
| AttributeList Attrs = MF.getFunction().getAttributes(); |
| AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| return Subtarget.isTargetMachO() && |
| !(Subtarget.getTargetLowering()->supportSwiftError() && |
| Attrs.hasAttrSomewhere(Attribute::SwiftError)) && |
| MF.getFunction().getCallingConv() != CallingConv::SwiftTail && |
| !AFL.requiresSaveVG(MF) && !AFI->isSVECC(); |
| } |
| |
| static bool invalidateWindowsRegisterPairing(bool SpillExtendedVolatile, |
| unsigned SpillCount, unsigned Reg1, |
| unsigned Reg2, bool NeedsWinCFI, |
| bool IsFirst, |
| const TargetRegisterInfo *TRI) { |
| // If we are generating register pairs for a Windows function that requires |
| // EH support, then pair consecutive registers only. There are no unwind |
| // opcodes for saves/restores of non-consecutive register pairs. |
| // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_fregp_x, |
| // save_lrpair. |
| // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling |
| |
| if (Reg2 == AArch64::FP) |
| return true; |
| if (!NeedsWinCFI) |
| return false; |
| |
| // ARM64EC introduced `save_any_regp`, which expects 16-byte alignment. |
| // This is handled by only allowing paired spills for registers spilled at |
| // even positions (which should be 16-byte aligned, as the other GPR/FPR saves |
| // are 8 bytes each). We carve out an exception for {FP,LR}, which does not |
| // require 16-byte alignment in the uop representation. |
| if (TRI->getEncodingValue(Reg2) == TRI->getEncodingValue(Reg1) + 1) |
| return SpillExtendedVolatile |
| ? !((Reg1 == AArch64::FP && Reg2 == AArch64::LR) || |
| (SpillCount % 2) == 0) |
| : false; |
| |
| // If pairing a GPR with LR, the pair can be described by the save_lrpair |
| // opcode. If this is the first register pair, it would end up with a |
| // predecrement, but there's no save_lrpair_x opcode, so we can only do this |
| // if LR is paired with a register other than the first one. |
| // The save_lrpair opcode requires the first register to be an odd one. |
| if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 && |
| (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst) |
| return false; |
| return true; |
| } |
| |
| /// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction. |
| /// WindowsCFI requires that only consecutive registers can be paired. |
| /// LR and FP need to be allocated together when the frame needs to save |
| /// the frame-record. This means any other register pairing with LR is invalid. |
| static bool invalidateRegisterPairing(bool SpillExtendedVolatile, |
| unsigned SpillCount, unsigned Reg1, |
| unsigned Reg2, bool UsesWinAAPCS, |
| bool NeedsWinCFI, bool NeedsFrameRecord, |
| bool IsFirst, |
| const TargetRegisterInfo *TRI) { |
| if (UsesWinAAPCS) |
| return invalidateWindowsRegisterPairing(SpillExtendedVolatile, SpillCount, |
| Reg1, Reg2, NeedsWinCFI, IsFirst, |
| TRI); |
| |
| // If we need to store the frame record, don't pair any register |
| // with LR other than FP. |
| if (NeedsFrameRecord) |
| return Reg2 == AArch64::LR; |
| |
| return false; |
| } |
| |
| namespace { |
| |
| struct RegPairInfo { |
| Register Reg1; |
| Register Reg2; |
| int FrameIdx; |
| int Offset; |
| enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type; |
| const TargetRegisterClass *RC; |
| |
| RegPairInfo() = default; |
| |
| bool isPaired() const { return Reg2.isValid(); } |
| |
| bool isScalable() const { return Type == PPR || Type == ZPR; } |
| }; |
| |
| } // end anonymous namespace |
| |
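| // Returns the PN alias of a callee-saved predicate register in P8-P15 that |
| // is already marked for saving (and so is free to use for spill/fill), or |
| // an invalid register if no such predicate is saved. |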
| MCRegister findFreePredicateReg(BitVector &SavedRegs) { |
| for (unsigned PReg = AArch64::P8; PReg <= AArch64::P15; ++PReg) { |
| if (SavedRegs.test(PReg)) { |
| unsigned PNReg = PReg - AArch64::P0 + AArch64::PN0; |
| return MCRegister(PNReg); |
| } |
| } |
| return MCRegister(); |
| } |
| |
| // The multi-vector LD/ST instructions are only available on SME or SVE2p1 targets. |
| bool enableMultiVectorSpillFill(const AArch64Subtarget &Subtarget, |
| MachineFunction &MF) { |
| if (DisableMultiVectorSpillFill) |
| return false; |
| |
| SMEAttrs FuncAttrs = MF.getInfo<AArch64FunctionInfo>()->getSMEFnAttrs(); |
| bool IsLocallyStreaming = |
| FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface(); |
| |
| // SME2 instructions can only be used safely when in streaming mode. |
| // It is not safe to use SME2 instructions in streaming-compatible or |
| // locally-streaming functions. |
| return Subtarget.hasSVE2p1() || |
| (Subtarget.hasSME2() && |
| (!IsLocallyStreaming && Subtarget.isStreaming())); |
| } |
| |
| void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, |
| MachineFunction &MF, |
| ArrayRef<CalleeSavedInfo> CSI, |
| const TargetRegisterInfo *TRI, |
| SmallVectorImpl<RegPairInfo> &RegPairs, |
| bool NeedsFrameRecord) { |
| |
| if (CSI.empty()) |
| return; |
| |
| bool IsWindows = isTargetWindows(MF); |
| AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| unsigned StackHazardSize = getStackHazardSize(MF); |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| CallingConv::ID CC = MF.getFunction().getCallingConv(); |
| unsigned Count = CSI.size(); |
| (void)CC; |
| // MachO's compact unwind format relies on all registers being stored in |
| // pairs. |
| assert((!produceCompactUnwindFrame(AFL, MF) || |
| CC == CallingConv::PreserveMost || CC == CallingConv::PreserveAll || |
| CC == CallingConv::CXX_FAST_TLS || CC == CallingConv::Win64 || |
| (Count & 1) == 0) && |
| "Odd number of callee-saved regs to spill!"); |
| int ByteOffset = AFI->getCalleeSavedStackSize(); |
| int StackFillDir = -1; |
| int RegInc = 1; |
| unsigned FirstReg = 0; |
| if (IsWindows) { |
| // For WinCFI, fill the stack from the bottom up. |
| ByteOffset = 0; |
| StackFillDir = 1; |
| // As the CSI array is reversed to match PrologEpilogInserter, iterate |
| // backwards, to pair up registers starting from lower numbered registers. |
| RegInc = -1; |
| FirstReg = Count - 1; |
| } |
| |
| bool FPAfterSVECalleeSaves = AFL.hasSVECalleeSavesAboveFrameRecord(MF); |
| // Windows AAPCS has x9-x15 as volatile registers, x16-x17 as intra-procedural |
| // scratch, and x18 as platform reserved. However, clang has extended calling |
| // conventions such as preserve_most and preserve_all which treat these as |
| // CSRs. As such, the ARM64 unwind uOPs bias registers by 19. We instead use |
| // the ARM64EC uOPs, which have separate restrictions, so we need to check |
| // for that case. |
| // |
| // NOTE: we currently do not account for the D registers as LLVM does not |
| // support non-ABI compliant D register spills. |
| bool SpillExtendedVolatile = |
| IsWindows && llvm::any_of(CSI, [](const CalleeSavedInfo &CSI) { |
| const auto &Reg = CSI.getReg(); |
| return Reg >= AArch64::X0 && Reg <= AArch64::X18; |
| }); |
| |
| int ZPRByteOffset = 0; |
| int PPRByteOffset = 0; |
| bool SplitPPRs = AFI->hasSplitSVEObjects(); |
| if (SplitPPRs) { |
| ZPRByteOffset = AFI->getZPRCalleeSavedStackSize(); |
| PPRByteOffset = AFI->getPPRCalleeSavedStackSize(); |
| } else if (!FPAfterSVECalleeSaves) { |
| ZPRByteOffset = |
| AFI->getZPRCalleeSavedStackSize() + AFI->getPPRCalleeSavedStackSize(); |
| // Unused: Everything goes in ZPR space. |
| PPRByteOffset = 0; |
| } |
| |
| bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace(); |
| Register LastReg = 0; |
| bool HasCSHazardPadding = AFI->hasStackHazardSlotIndex() && !SplitPPRs; |
| |
| // When iterating backwards, the loop condition relies on unsigned wraparound. |
| for (unsigned i = FirstReg; i < Count; i += RegInc) { |
| RegPairInfo RPI; |
| RPI.Reg1 = CSI[i].getReg(); |
| |
| if (AArch64::GPR64RegClass.contains(RPI.Reg1)) { |
| RPI.Type = RegPairInfo::GPR; |
| RPI.RC = &AArch64::GPR64RegClass; |
| } else if (AArch64::FPR64RegClass.contains(RPI.Reg1)) { |
| RPI.Type = RegPairInfo::FPR64; |
| RPI.RC = &AArch64::FPR64RegClass; |
| } else if (AArch64::FPR128RegClass.contains(RPI.Reg1)) { |
| RPI.Type = RegPairInfo::FPR128; |
| RPI.RC = &AArch64::FPR128RegClass; |
| } else if (AArch64::ZPRRegClass.contains(RPI.Reg1)) { |
| RPI.Type = RegPairInfo::ZPR; |
| RPI.RC = &AArch64::ZPRRegClass; |
| } else if (AArch64::PPRRegClass.contains(RPI.Reg1)) { |
| RPI.Type = RegPairInfo::PPR; |
| RPI.RC = &AArch64::PPRRegClass; |
| } else if (RPI.Reg1 == AArch64::VG) { |
| RPI.Type = RegPairInfo::VG; |
| RPI.RC = &AArch64::FIXED_REGSRegClass; |
| } else { |
| llvm_unreachable("Unsupported register class."); |
| } |
| |
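| // With split SVE objects, PPR saves are laid out in their own area and so |
| // use a separate running offset from the ZPR saves. |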
| int &ScalableByteOffset = RPI.Type == RegPairInfo::PPR && SplitPPRs |
| ? PPRByteOffset |
| : ZPRByteOffset; |
| |
| // Add the stack hazard size as we transition from GPR->FPR CSRs. |
| if (HasCSHazardPadding && |
| (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) && |
| AArch64InstrInfo::isFpOrNEON(RPI.Reg1)) |
| ByteOffset += StackFillDir * StackHazardSize; |
| LastReg = RPI.Reg1; |
| |
| bool NeedsWinCFI = AFL.needsWinCFI(MF); |
| int Scale = TRI->getSpillSize(*RPI.RC); |
| // Add the next reg to the pair if it is in the same register class. |
| if (unsigned(i + RegInc) < Count && !HasCSHazardPadding) { |
| MCRegister NextReg = CSI[i + RegInc].getReg(); |
| bool IsFirst = i == FirstReg; |
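| // Number of CSRs spilled so far; with WinCFI we iterate the CSI list in |
| // reverse, hence FirstReg - i. |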
| unsigned SpillCount = NeedsWinCFI ? FirstReg - i : i; |
| switch (RPI.Type) { |
| case RegPairInfo::GPR: |
| if (AArch64::GPR64RegClass.contains(NextReg) && |
| !invalidateRegisterPairing( |
| SpillExtendedVolatile, SpillCount, RPI.Reg1, NextReg, IsWindows, |
| NeedsWinCFI, NeedsFrameRecord, IsFirst, TRI)) |
| RPI.Reg2 = NextReg; |
| break; |
| case RegPairInfo::FPR64: |
| if (AArch64::FPR64RegClass.contains(NextReg) && |
| !invalidateRegisterPairing( |
| SpillExtendedVolatile, SpillCount, RPI.Reg1, NextReg, IsWindows, |
| NeedsWinCFI, NeedsFrameRecord, IsFirst, TRI)) |
| RPI.Reg2 = NextReg; |
| break; |
| case RegPairInfo::FPR128: |
| if (AArch64::FPR128RegClass.contains(NextReg)) |
| RPI.Reg2 = NextReg; |
| break; |
| case RegPairInfo::PPR: |
| break; |
| case RegPairInfo::ZPR: |
| if (AFI->getPredicateRegForFillSpill() != 0 && |
| ((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1)) { |
| // Calculate offset of register pair to see if pair instruction can be |
| // used. |
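| // The pair instruction's immediate addresses multiples of two vectors and |
| // is limited to [-8, 7], i.e. an even Offset in [-16, 14] here. |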
| int Offset = (ScalableByteOffset + StackFillDir * 2 * Scale) / Scale; |
| if ((-16 <= Offset && Offset <= 14) && (Offset % 2 == 0)) |
| RPI.Reg2 = NextReg; |
| } |
| break; |
| case RegPairInfo::VG: |
| break; |
| } |
| } |
| |
| // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI |
| // list to come in sorted by frame index so that we can issue the store |
| // pair instructions directly. Assert if we see anything otherwise. |
| // |
| // The order of the registers in the list is controlled by |
| // getCalleeSavedRegs(), so they will always be in-order, as well. |
| assert((!RPI.isPaired() || |
| (CSI[i].getFrameIdx() + RegInc == CSI[i + RegInc].getFrameIdx())) && |
| "Out of order callee saved regs!"); |
| |
| assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP || |
| RPI.Reg1 == AArch64::LR) && |
| "FrameRecord must be allocated together with LR"); |
| |
| // Windows AAPCS has FP and LR reversed. |
| assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP || |
| RPI.Reg2 == AArch64::LR) && |
| "FrameRecord must be allocated together with LR"); |
| |
| // MachO's compact unwind format relies on all registers being stored in |
| // adjacent register pairs. |
| assert((!produceCompactUnwindFrame(AFL, MF) || |
| CC == CallingConv::PreserveMost || CC == CallingConv::PreserveAll || |
| CC == CallingConv::CXX_FAST_TLS || CC == CallingConv::Win64 || |
| (RPI.isPaired() && |
| ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) || |
| RPI.Reg1 + 1 == RPI.Reg2))) && |
| "Callee-save registers not saved as adjacent register pair!"); |
| |
| RPI.FrameIdx = CSI[i].getFrameIdx(); |
| if (IsWindows && |
| RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair |
| RPI.FrameIdx = CSI[i + RegInc].getFrameIdx(); |
| |
| // Realign the scalable offset if necessary. This is relevant when |
| // spilling predicates on Windows. |
| if (RPI.isScalable() && ScalableByteOffset % Scale != 0) { |
| ScalableByteOffset = alignTo(ScalableByteOffset, Scale); |
| } |
| |
| int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset; |
| assert(OffsetPre % Scale == 0); |
| |
| if (RPI.isScalable()) |
| ScalableByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale); |
| else |
| ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale); |
| |
| // Swift's async context is directly before FP, so allocate an extra |
| // 8 bytes for it. |
| if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() && |
| ((!IsWindows && RPI.Reg2 == AArch64::FP) || |
| (IsWindows && RPI.Reg2 == AArch64::LR))) |
| ByteOffset += StackFillDir * 8; |
| |
| // Round up size of non-pair to pair size if we need to pad the |
| // callee-save area to ensure 16-byte alignment. |
| if (NeedGapToAlignStack && !IsWindows && !RPI.isScalable() && |
| RPI.Type != RegPairInfo::FPR128 && !RPI.isPaired() && |
| ByteOffset % 16 != 0) { |
| ByteOffset += 8 * StackFillDir; |
| assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16)); |
| // A stack frame with a gap looks like this, bottom up: |
| // d9, d8. x21, gap, x20, x19. |
| // Set extra alignment on the x21 object to create the gap above it. |
| MFI.setObjectAlignment(RPI.FrameIdx, Align(16)); |
| NeedGapToAlignStack = false; |
| } |
| |
| int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset; |
| assert(OffsetPost % Scale == 0); |
| // If filling top down (default), we want the offset after incrementing it. |
| // If filling bottom up (WinCFI) we need the original offset. |
| int Offset = IsWindows ? OffsetPre : OffsetPost; |
| |
| // The FP, LR pair goes 8 bytes into our expanded 24-byte slot so that the |
| // Swift context can directly precede FP. |
| if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() && |
| ((!IsWindows && RPI.Reg2 == AArch64::FP) || |
| (IsWindows && RPI.Reg2 == AArch64::LR))) |
| Offset += 8; |
| RPI.Offset = Offset / Scale; |
| |
| assert((!RPI.isPaired() || |
| (!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) || |
| (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) && |
| "Offset out of bounds for LDP/STP immediate"); |
| |
| auto isFrameRecord = [&] { |
| if (RPI.isPaired()) |
| return IsWindows ? RPI.Reg1 == AArch64::FP && RPI.Reg2 == AArch64::LR |
| : RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP; |
| // Otherwise, look for the frame record as two unpaired registers. This is |
| // needed for -aarch64-stack-hazard-size=<val>, which disables register |
| // pairing (as the padding may be too large for the LDP/STP offset). Note: |
| // On Windows, this check works out as current reg == FP, next reg == LR, |
| // and on other platforms current reg == FP, previous reg == LR. This |
| // works out as the correct pre-increment or post-increment offsets |
| // respectively. |
| return i > 0 && RPI.Reg1 == AArch64::FP && |
| CSI[i - 1].getReg() == AArch64::LR; |
| }; |
| |
| // Save the offset to frame record so that the FP register can point to the |
| // innermost frame record (spilled FP and LR registers). |
| if (NeedsFrameRecord && isFrameRecord()) |
| AFI->setCalleeSaveBaseToFrameRecordOffset(Offset); |
| |
| RegPairs.push_back(RPI); |
| if (RPI.isPaired()) |
| i += RegInc; |
| } |
| if (IsWindows) { |
| // If we need an alignment gap in the stack, align the topmost stack |
| // object. A stack frame with a gap looks like this, bottom up: |
| // x19, d8. d9, gap. |
| // Set extra alignment on the topmost stack object (the first element in |
| // CSI, which goes top down), to create the gap above it. |
| if (AFI->hasCalleeSaveStackFreeSpace()) |
| MFI.setObjectAlignment(CSI[0].getFrameIdx(), Align(16)); |
| // We iterated bottom up over the registers; flip RegPairs back to top |
| // down order. |
| std::reverse(RegPairs.begin(), RegPairs.end()); |
| } |
| } |
| |
| bool AArch64FrameLowering::spillCalleeSavedRegisters( |
| MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, |
| ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { |
| MachineFunction &MF = *MBB.getParent(); |
| const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); |
| auto &TLI = *Subtarget.getTargetLowering(); |
| const AArch64InstrInfo &TII = *Subtarget.getInstrInfo(); |
| bool NeedsWinCFI = needsWinCFI(MF); |
| DebugLoc DL; |
| SmallVector<RegPairInfo, 8> RegPairs; |
| |
| computeCalleeSaveRegisterPairs(*this, MF, CSI, TRI, RegPairs, hasFP(MF)); |
| |
| MachineRegisterInfo &MRI = MF.getRegInfo(); |
| // Refresh the reserved regs in case there are any potential changes since the |
| // last freeze. |
| MRI.freezeReservedRegs(); |
| |
| if (homogeneousPrologEpilog(MF)) { |
| auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog)) |
| .setMIFlag(MachineInstr::FrameSetup); |
| |
| for (auto &RPI : RegPairs) { |
| MIB.addReg(RPI.Reg1); |
| MIB.addReg(RPI.Reg2); |
| |
| // Update register live in. |
| if (!MRI.isReserved(RPI.Reg1)) |
| MBB.addLiveIn(RPI.Reg1); |
| if (RPI.isPaired() && !MRI.isReserved(RPI.Reg2)) |
| MBB.addLiveIn(RPI.Reg2); |
| } |
| return true; |
| } |
| bool PTrueCreated = false; |
| for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) { |
| Register Reg1 = RPI.Reg1; |
| Register Reg2 = RPI.Reg2; |
| unsigned StrOpc; |
| |
| // Issue sequence of spills for cs regs. The first spill may be converted |
| // to a pre-decrement store later by emitPrologue if the callee-save stack |
| // area allocation can't be combined with the local stack area allocation. |
| // For example: |
| // stp x22, x21, [sp, #0] // addImm(+0) |
| // stp x20, x19, [sp, #16] // addImm(+2) |
| // stp fp, lr, [sp, #32] // addImm(+4) |
| // Rationale: This sequence saves uop updates compared to a sequence of |
| // pre-increment spills like stp xi,xj,[sp,#-16]! |
| // Note: Similar rationale and sequence for restores in epilog. |
| unsigned Size = TRI->getSpillSize(*RPI.RC); |
| Align Alignment = TRI->getSpillAlign(*RPI.RC); |
| switch (RPI.Type) { |
| case RegPairInfo::GPR: |
| StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; |
| break; |
| case RegPairInfo::FPR64: |
| StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui; |
| break; |
| case RegPairInfo::FPR128: |
| StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui; |
| break; |
| case RegPairInfo::ZPR: |
| StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI; |
| break; |
| case RegPairInfo::PPR: |
| StrOpc = AArch64::STR_PXI; |
| break; |
| case RegPairInfo::VG: |
| StrOpc = AArch64::STRXui; |
| break; |
| } |
| |
| Register X0Scratch; |
| llvm::scope_exit RestoreX0([&] { |
| if (X0Scratch != AArch64::NoRegister) |
| BuildMI(MBB, MI, DL, TII.get(TargetOpcode::COPY), AArch64::X0) |
| .addReg(X0Scratch) |
| .setMIFlag(MachineInstr::FrameSetup); |
| }); |
| |
| if (Reg1 == AArch64::VG) { |
| // Find an available register to store value of VG to. |
| Reg1 = findScratchNonCalleeSaveRegister(&MBB, true); |
| assert(Reg1 != AArch64::NoRegister); |
| if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) { |
| BuildMI(MBB, MI, DL, TII.get(AArch64::CNTD_XPiI), Reg1) |
| .addImm(31) |
| .addImm(1) |
| .setMIFlag(MachineInstr::FrameSetup); |
| } else { |
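| // Without SVE, obtain the current VG by calling the SME ABI support |
| // routine, which returns it in X0; preserve any live value of X0 across |
| // the call (restored by RestoreX0 above). |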
| const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>(); |
| if (any_of(MBB.liveins(), |
| [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) { |
| return STI.getRegisterInfo()->isSuperOrSubRegisterEq( |
| AArch64::X0, LiveIn.PhysReg); |
| })) { |
| X0Scratch = Reg1; |
| BuildMI(MBB, MI, DL, TII.get(TargetOpcode::COPY), X0Scratch) |
| .addReg(AArch64::X0) |
| .setMIFlag(MachineInstr::FrameSetup); |
| } |
| |
| RTLIB::Libcall LC = RTLIB::SMEABI_GET_CURRENT_VG; |
| const uint32_t *RegMask = |
| TRI->getCallPreservedMask(MF, TLI.getLibcallCallingConv(LC)); |
| BuildMI(MBB, MI, DL, TII.get(AArch64::BL)) |
| .addExternalSymbol(TLI.getLibcallName(LC)) |
| .addRegMask(RegMask) |
| .addReg(AArch64::X0, RegState::ImplicitDefine) |
| .setMIFlag(MachineInstr::FrameSetup); |
| Reg1 = AArch64::X0; |
| } |
| } |
| |
| LLVM_DEBUG({ |
| dbgs() << "CSR spill: (" << printReg(Reg1, TRI); |
| if (RPI.isPaired()) |
| dbgs() << ", " << printReg(Reg2, TRI); |
| dbgs() << ") -> fi#(" << RPI.FrameIdx; |
| if (RPI.isPaired()) |
| dbgs() << ", " << RPI.FrameIdx + 1; |
| dbgs() << ")\n"; |
| }); |
| |
| assert((!isTargetWindows(MF) || |
| !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) && |
| "Windows unwdinding requires a consecutive (FP,LR) pair"); |
| // Windows unwind codes require consecutive registers if registers are |
| // paired. Make the switch here, so that the code below will save (x,x+1) |
| // and not (x+1,x). |
| unsigned FrameIdxReg1 = RPI.FrameIdx; |
| unsigned FrameIdxReg2 = RPI.FrameIdx + 1; |
| if (isTargetWindows(MF) && RPI.isPaired()) { |
| std::swap(Reg1, Reg2); |
| std::swap(FrameIdxReg1, FrameIdxReg2); |
| } |
| |
| if (RPI.isPaired() && RPI.isScalable()) { |
| [[maybe_unused]] const AArch64Subtarget &Subtarget = |
| MF.getSubtarget<AArch64Subtarget>(); |
| AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| unsigned PnReg = AFI->getPredicateRegForFillSpill(); |
| assert((PnReg != 0 && enableMultiVectorSpillFill(Subtarget, MF)) && |
| "Expects SVE2.1 or SME2 target and a predicate register"); |
| #ifdef EXPENSIVE_CHECKS |
| auto IsPPR = [](const RegPairInfo &c) { |
| return c.Type == RegPairInfo::PPR; |
| }; |
| auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR); |
| auto IsZPR = [](const RegPairInfo &c) { |
| return c.Type == RegPairInfo::ZPR; |
| }; |
| auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR); |
| assert(!(PPRBegin < ZPRBegin) && |
| "Expected callee save predicate to be handled first"); |
| #endif |
| if (!PTrueCreated) { |
| PTrueCreated = true; |
| BuildMI(MBB, MI, DL, TII.get(AArch64::PTRUE_C_B), PnReg) |
| .setMIFlags(MachineInstr::FrameSetup); |
| } |
| MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); |
| if (!MRI.isReserved(Reg1)) |
| MBB.addLiveIn(Reg1); |
| if (!MRI.isReserved(Reg2)) |
| MBB.addLiveIn(Reg2); |
| MIB.addReg(/*PairRegs*/ AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0)); |
| MIB.addMemOperand(MF.getMachineMemOperand( |
| MachinePointerInfo::getFixedStack(MF, FrameIdxReg2), |
| MachineMemOperand::MOStore, Size, Alignment)); |
| MIB.addReg(PnReg); |
| MIB.addReg(AArch64::SP) |
| .addImm(RPI.Offset / 2) // [sp, #imm*2*vscale], |
| // where 2*vscale is implicit |
| .setMIFlag(MachineInstr::FrameSetup); |
| MIB.addMemOperand(MF.getMachineMemOperand( |
| MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), |
| MachineMemOperand::MOStore, Size, Alignment)); |
| if (NeedsWinCFI) |
| insertSEH(MIB, TII, MachineInstr::FrameSetup); |
| } else { // The code when the pair of ZReg is not present |
| MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); |
| if (!MRI.isReserved(Reg1)) |
| MBB.addLiveIn(Reg1); |
| if (RPI.isPaired()) { |
| if (!MRI.isReserved(Reg2)) |
| MBB.addLiveIn(Reg2); |
| MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)); |
| MIB.addMemOperand(MF.getMachineMemOperand( |
| MachinePointerInfo::getFixedStack(MF, FrameIdxReg2), |
| MachineMemOperand::MOStore, Size, Alignment)); |
| } |
| MIB.addReg(Reg1, getPrologueDeath(MF, Reg1)) |
| .addReg(AArch64::SP) |
| .addImm(RPI.Offset) // [sp, #offset*vscale], |
| // where factor*vscale is implicit |
| .setMIFlag(MachineInstr::FrameSetup); |
| MIB.addMemOperand(MF.getMachineMemOperand( |
| MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), |
| MachineMemOperand::MOStore, Size, Alignment)); |
| if (NeedsWinCFI) |
| insertSEH(MIB, TII, MachineInstr::FrameSetup); |
| } |
| // Update the StackIDs of the SVE stack slots. |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| if (RPI.Type == RegPairInfo::ZPR) { |
| MFI.setStackID(FrameIdxReg1, TargetStackID::ScalableVector); |
| if (RPI.isPaired()) |
| MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector); |
| } else if (RPI.Type == RegPairInfo::PPR) { |
| MFI.setStackID(FrameIdxReg1, TargetStackID::ScalablePredicateVector); |
| if (RPI.isPaired()) |
| MFI.setStackID(FrameIdxReg2, TargetStackID::ScalablePredicateVector); |
| } |
| } |
| return true; |
| } |
| |
| bool AArch64FrameLowering::restoreCalleeSavedRegisters( |
| MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, |
| MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { |
| MachineFunction &MF = *MBB.getParent(); |
| const AArch64InstrInfo &TII = |
| *MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); |
| DebugLoc DL; |
| SmallVector<RegPairInfo, 8> RegPairs; |
| bool NeedsWinCFI = needsWinCFI(MF); |
| |
| if (MBBI != MBB.end()) |
| DL = MBBI->getDebugLoc(); |
| |
| computeCalleeSaveRegisterPairs(*this, MF, CSI, TRI, RegPairs, hasFP(MF)); |
| if (homogeneousPrologEpilog(MF, &MBB)) { |
| auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog)) |
| .setMIFlag(MachineInstr::FrameDestroy); |
| for (auto &RPI : RegPairs) { |
| MIB.addReg(RPI.Reg1, RegState::Define); |
| MIB.addReg(RPI.Reg2, RegState::Define); |
| } |
| return true; |
| } |
| |
| // For performance reasons, restore SVE registers in increasing order. |
| auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; }; |
| auto PPRBegin = llvm::find_if(RegPairs, IsPPR); |
| auto PPREnd = std::find_if_not(PPRBegin, RegPairs.end(), IsPPR); |
| std::reverse(PPRBegin, PPREnd); |
| auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; }; |
| auto ZPRBegin = llvm::find_if(RegPairs, IsZPR); |
| auto ZPREnd = std::find_if_not(ZPRBegin, RegPairs.end(), IsZPR); |
| std::reverse(ZPRBegin, ZPREnd); |
| |
| bool PTrueCreated = false; |
| for (const RegPairInfo &RPI : RegPairs) { |
| Register Reg1 = RPI.Reg1; |
| Register Reg2 = RPI.Reg2; |
| |
| // Issue sequence of restores for cs regs. The last restore may be converted |
| // to a post-increment load later by emitEpilogue if the callee-save stack |
| // area allocation can't be combined with the local stack area allocation. |
| // For example: |
| // ldp fp, lr, [sp, #32] // addImm(+4) |
| // ldp x20, x19, [sp, #16] // addImm(+2) |
| // ldp x22, x21, [sp, #0] // addImm(+0) |
| // Note: see comment in spillCalleeSavedRegisters() |
| unsigned LdrOpc; |
| unsigned Size = TRI->getSpillSize(*RPI.RC); |
| Align Alignment = TRI->getSpillAlign(*RPI.RC); |
| switch (RPI.Type) { |
| case RegPairInfo::GPR: |
| LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui; |
| break; |
| case RegPairInfo::FPR64: |
| LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui; |
| break; |
| case RegPairInfo::FPR128: |
| LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui; |
| break; |
| case RegPairInfo::ZPR: |
| LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI; |
| break; |
| case RegPairInfo::PPR: |
| LdrOpc = AArch64::LDR_PXI; |
| break; |
| case RegPairInfo::VG: |
| continue; |
| } |
| LLVM_DEBUG({ |
| dbgs() << "CSR restore: (" << printReg(Reg1, TRI); |
| if (RPI.isPaired()) |
| dbgs() << ", " << printReg(Reg2, TRI); |
| dbgs() << ") -> fi#(" << RPI.FrameIdx; |
| if (RPI.isPaired()) |
| dbgs() << ", " << RPI.FrameIdx + 1; |
| dbgs() << ")\n"; |
| }); |
| |
| // Windows unwind codes require consecutive registers if registers are |
| // paired. Make the switch here, so that the code below will restore (x,x+1) |
| // and not (x+1,x). |
| unsigned FrameIdxReg1 = RPI.FrameIdx; |
| unsigned FrameIdxReg2 = RPI.FrameIdx + 1; |
| if (isTargetWindows(MF) && RPI.isPaired()) { |
| std::swap(Reg1, Reg2); |
| std::swap(FrameIdxReg1, FrameIdxReg2); |
| } |
| |
| AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| if (RPI.isPaired() && RPI.isScalable()) { |
| [[maybe_unused]] const AArch64Subtarget &Subtarget = |
| MF.getSubtarget<AArch64Subtarget>(); |
| unsigned PnReg = AFI->getPredicateRegForFillSpill(); |
| assert((PnReg != 0 && enableMultiVectorSpillFill(Subtarget, MF)) && |
| "Expects SVE2.1 or SME2 target and a predicate register"); |
| #ifdef EXPENSIVE_CHECKS |
| assert(!(PPRBegin < ZPRBegin) && |
| "Expected callee save predicate to be handled first"); |
| #endif |
| if (!PTrueCreated) { |
| PTrueCreated = true; |
| BuildMI(MBB, MBBI, DL, TII.get(AArch64::PTRUE_C_B), PnReg) |
| .setMIFlags(MachineInstr::FrameDestroy); |
| } |
| MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc)); |
| MIB.addReg(/*PairRegs*/ AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0), |
| getDefRegState(true)); |
| MIB.addMemOperand(MF.getMachineMemOperand( |
| MachinePointerInfo::getFixedStack(MF, FrameIdxReg2), |
| MachineMemOperand::MOLoad, Size, Alignment)); |
| MIB.addReg(PnReg); |
| MIB.addReg(AArch64::SP) |
| .addImm(RPI.Offset / 2) // [sp, #imm*2*vscale] |
| // where 2*vscale is implicit |
| .setMIFlag(MachineInstr::FrameDestroy); |
| MIB.addMemOperand(MF.getMachineMemOperand( |
| MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), |
| MachineMemOperand::MOLoad, Size, Alignment)); |
| if (NeedsWinCFI) |
| insertSEH(MIB, TII, MachineInstr::FrameDestroy); |
| } else { |
| MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc)); |
| if (RPI.isPaired()) { |
| MIB.addReg(Reg2, getDefRegState(true)); |
| MIB.addMemOperand(MF.getMachineMemOperand( |
| MachinePointerInfo::getFixedStack(MF, FrameIdxReg2), |
| MachineMemOperand::MOLoad, Size, Alignment)); |
| } |
| MIB.addReg(Reg1, getDefRegState(true)); |
| MIB.addReg(AArch64::SP) |
| .addImm(RPI.Offset) // [sp, #offset*vscale] |
| // where factor*vscale is implicit |
| .setMIFlag(MachineInstr::FrameDestroy); |
| MIB.addMemOperand(MF.getMachineMemOperand( |
| MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), |
| MachineMemOperand::MOLoad, Size, Alignment)); |
| if (NeedsWinCFI) |
| insertSEH(MIB, TII, MachineInstr::FrameDestroy); |
| } |
| } |
| return true; |
| } |
| |
| // Return the frame index for an MMO. |
| static std::optional<int> getMMOFrameID(MachineMemOperand *MMO, |
| const MachineFrameInfo &MFI) { |
| auto *PSV = |
| dyn_cast_or_null<FixedStackPseudoSourceValue>(MMO->getPseudoValue()); |
| if (PSV) |
| return std::optional<int>(PSV->getFrameIndex()); |
| |
| if (MMO->getValue()) { |
| if (auto *Al = dyn_cast<AllocaInst>(getUnderlyingObject(MMO->getValue()))) { |
| for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd(); |
| FI++) |
| if (MFI.getObjectAllocation(FI) == Al) |
| return FI; |
| } |
| } |
| |
| return std::nullopt; |
| } |
| |
| // Return the frame index for a load/store instruction by looking at its first MMO. |
| static std::optional<int> getLdStFrameID(const MachineInstr &MI, |
| const MachineFrameInfo &MFI) { |
| if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1) |
| return std::nullopt; |
| |
| return getMMOFrameID(*MI.memoperands_begin(), MFI); |
| } |
| |
| // Returns true if the LDST MachineInstr \p MI is a PPR access. |
| static bool isPPRAccess(const MachineInstr &MI) { |
| return AArch64::PPRRegClass.contains(MI.getOperand(0).getReg()); |
| } |
| |
| // Check if a Hazard slot is needed for the current function, and if so create |
| // one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex, |
| // which can be used to determine if any hazard padding is needed. |
| void AArch64FrameLowering::determineStackHazardSlot( |
| MachineFunction &MF, BitVector &SavedRegs) const { |
| unsigned StackHazardSize = getStackHazardSize(MF); |
| auto *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| if (StackHazardSize == 0 || StackHazardSize % 16 != 0 || |
| AFI->hasStackHazardSlotIndex()) |
| return; |
| |
| // Stack hazards are only needed in streaming functions. |
| SMEAttrs Attrs = AFI->getSMEFnAttrs(); |
| if (!StackHazardInNonStreaming && Attrs.hasNonStreamingInterfaceAndBody()) |
| return; |
| |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| |
| // Add a hazard slot if there are any CSR FPR registers, or any FP-only |
| // stack objects. |
| bool HasFPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) { |
| return AArch64::FPR64RegClass.contains(Reg) || |
| AArch64::FPR128RegClass.contains(Reg) || |
| AArch64::ZPRRegClass.contains(Reg); |
| }); |
| bool HasPPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) { |
| return AArch64::PPRRegClass.contains(Reg); |
| }); |
| bool HasFPRStackObjects = false; |
| bool HasPPRStackObjects = false; |
| if (!HasFPRCSRs || SplitSVEObjects) { |
| enum SlotType : uint8_t { |
| Unknown = 0, |
| ZPRorFPR = 1 << 0, |
| PPR = 1 << 1, |
| GPR = 1 << 2, |
| LLVM_MARK_AS_BITMASK_ENUM(GPR) |
| }; |
| |
| // Find stack slots solely used for one kind of register (ZPR, PPR, etc.), |
| // based on the kinds of accesses used in the function. |
| SmallVector<SlotType> SlotTypes(MFI.getObjectIndexEnd(), SlotType::Unknown); |
| for (auto &MBB : MF) { |
| for (auto &MI : MBB) { |
| std::optional<int> FI = getLdStFrameID(MI, MFI); |
| if (!FI || FI < 0 || FI > int(SlotTypes.size())) |
| continue; |
| if (MFI.hasScalableStackID(*FI)) { |
| SlotTypes[*FI] |= |
| isPPRAccess(MI) ? SlotType::PPR : SlotType::ZPRorFPR; |
| } else { |
| SlotTypes[*FI] |= AArch64InstrInfo::isFpOrNEON(MI) |
| ? SlotType::ZPRorFPR |
| : SlotType::GPR; |
| } |
| } |
| } |
| |
| for (int FI = 0; FI < int(SlotTypes.size()); ++FI) { |
| HasFPRStackObjects |= SlotTypes[FI] == SlotType::ZPRorFPR; |
| // For SplitSVEObjects, remember that this stack slot is a predicate; this |
| // will be needed later when determining the frame layout. |
| if (SlotTypes[FI] == SlotType::PPR) { |
| MFI.setStackID(FI, TargetStackID::ScalablePredicateVector); |
| HasPPRStackObjects = true; |
| } |
| } |
| } |
| |
| if (HasFPRCSRs || HasFPRStackObjects) { |
| int ID = MFI.CreateStackObject(StackHazardSize, Align(16), false); |
| LLVM_DEBUG(dbgs() << "Created Hazard slot at " << ID << " size " |
| << StackHazardSize << "\n"); |
| AFI->setStackHazardSlotIndex(ID); |
| } |
| |
| if (!AFI->hasStackHazardSlotIndex()) |
| return; |
| |
| if (SplitSVEObjects) { |
| CallingConv::ID CC = MF.getFunction().getCallingConv(); |
| if (AFI->isSVECC() || CC == CallingConv::AArch64_SVE_VectorCall) { |
| AFI->setSplitSVEObjects(true); |
| LLVM_DEBUG(dbgs() << "Using SplitSVEObjects for SVE CC function\n"); |
| return; |
| } |
| |
| // We only use SplitSVEObjects in non-SVE CC functions if there's a |
| // possibility of a stack hazard between PPRs and ZPRs/FPRs. |
| LLVM_DEBUG(dbgs() << "Determining if SplitSVEObjects should be used in " |
| "non-SVE CC function...\n"); |
| |
| // If another calling convention is explicitly set, FPRs can't be promoted to |
| // ZPR callee-saves. |
| if (!is_contained({CallingConv::C, CallingConv::Fast}, CC)) { |
| LLVM_DEBUG( |
| dbgs() |
| << "Calling convention is not supported with SplitSVEObjects\n"); |
| return; |
| } |
| |
| if (!HasPPRCSRs && !HasPPRStackObjects) { |
| LLVM_DEBUG( |
| dbgs() << "Not using SplitSVEObjects as no PPRs are on the stack\n"); |
| return; |
| } |
| |
| if (!HasFPRCSRs && !HasFPRStackObjects) { |
| LLVM_DEBUG( |
| dbgs() |
| << "Not using SplitSVEObjects as no FPRs or ZPRs are on the stack\n"); |
| return; |
| } |
| |
| [[maybe_unused]] const AArch64Subtarget &Subtarget = |
| MF.getSubtarget<AArch64Subtarget>(); |
| assert(Subtarget.isSVEorStreamingSVEAvailable() && |
| "Expected SVE to be available for PPRs"); |
| |
| const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); |
| // With SplitSVEObjects the CS hazard padding is placed between the |
| // PPRs and ZPRs. If there were any FPR CSRs there would be a hazard between |
| // them and the CS GPRs. Avoid this by promoting all FPR CSRs to ZPRs. |
| BitVector FPRZRegs(SavedRegs.size()); |
| for (size_t Reg = 0, E = SavedRegs.size(); HasFPRCSRs && Reg < E; ++Reg) { |
| BitVector::reference RegBit = SavedRegs[Reg]; |
| if (!RegBit) |
| continue; |
| unsigned SubRegIdx = 0; |
| if (AArch64::FPR64RegClass.contains(Reg)) |
| SubRegIdx = AArch64::dsub; |
| else if (AArch64::FPR128RegClass.contains(Reg)) |
| SubRegIdx = AArch64::zsub; |
| else |
| continue; |
| // Clear the bit for the FPR save. |
| RegBit = false; |
| // Mark that we should save the corresponding ZPR. |
| Register ZReg = |
| TRI->getMatchingSuperReg(Reg, SubRegIdx, &AArch64::ZPRRegClass); |
| FPRZRegs.set(ZReg); |
| } |
| SavedRegs |= FPRZRegs; |
| |
| AFI->setSplitSVEObjects(true); |
| LLVM_DEBUG(dbgs() << "SplitSVEObjects enabled!\n"); |
| } |
| } |
| |
| void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, |
| BitVector &SavedRegs, |
| RegScavenger *RS) const { |
| // All calls are tail calls in GHC calling conv, and functions have no |
| // prologue/epilogue. |
| if (MF.getFunction().getCallingConv() == CallingConv::GHC) |
| return; |
| |
| const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); |
| |
| TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); |
| const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
| AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| unsigned UnspilledCSGPR = AArch64::NoRegister; |
| unsigned UnspilledCSGPRPaired = AArch64::NoRegister; |
| |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs(); |
| |
| MCRegister BasePointerReg = |
| RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister() : MCRegister(); |
| |
| unsigned ExtraCSSpill = 0; |
| bool HasUnpairedGPR64 = false; |
| bool HasPairZReg = false; |
| BitVector UserReservedRegs = RegInfo->getUserReservedRegs(MF); |
| BitVector ReservedRegs = RegInfo->getReservedRegs(MF); |
| |
| // Figure out which callee-saved registers to save/restore. |
| for (unsigned i = 0; CSRegs[i]; ++i) { |
| const MCRegister Reg = CSRegs[i]; |
| |
| // Add the base pointer register to SavedRegs if it is callee-save. |
| if (Reg == BasePointerReg) |
| SavedRegs.set(Reg); |
| |
| // Don't save manually reserved registers set through +reserve-x#i, |
| // even for callee-saved registers, as per GCC's behavior. |
| if (UserReservedRegs[Reg]) { |
| SavedRegs.reset(Reg); |
| continue; |
| } |
| |
| bool RegUsed = SavedRegs.test(Reg); |
| MCRegister PairedReg; |
| const bool RegIsGPR64 = AArch64::GPR64RegClass.contains(Reg); |
| if (RegIsGPR64 || AArch64::FPR64RegClass.contains(Reg) || |
| AArch64::FPR128RegClass.contains(Reg)) { |
| // Compensate for odd numbers of GP CSRs. |
| // For now, all the known cases of odd number of CSRs are of GPRs. |
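| // CSRegs is laid out in pairs, so the natural partner of entry i is |
| // entry i ^ 1; once an unpaired GPR has been seen, the pairing parity |
| // shifts by one. |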
| if (HasUnpairedGPR64) |
| PairedReg = CSRegs[i % 2 == 0 ? i - 1 : i + 1]; |
| else |
| PairedReg = CSRegs[i ^ 1]; |
| } |
| |
| // If the function requires saving all of the GP CSRs (SavedRegs), |
| // and there is an odd number of GP CSRs at the same time (CSRegs), |
| // PairedReg could be in a different register class from Reg, which would |
| // lead to an FPR (usually D8) accidentally being marked as saved. |
| if (RegIsGPR64 && !AArch64::GPR64RegClass.contains(PairedReg)) { |
| PairedReg = AArch64::NoRegister; |
| HasUnpairedGPR64 = true; |
| } |
| assert(PairedReg == AArch64::NoRegister || |
| AArch64::GPR64RegClass.contains(Reg, PairedReg) || |
| AArch64::FPR64RegClass.contains(Reg, PairedReg) || |
| AArch64::FPR128RegClass.contains(Reg, PairedReg)); |
| |
| if (!RegUsed) { |
| if (AArch64::GPR64RegClass.contains(Reg) && !ReservedRegs[Reg]) { |
| UnspilledCSGPR = Reg; |
| UnspilledCSGPRPaired = PairedReg; |
| } |
| continue; |
| } |
| |
| // MachO's compact unwind format relies on all registers being stored in |
| // pairs. |
| // FIXME: the usual format is actually better if unwinding isn't needed. |
| if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister && |
| !SavedRegs.test(PairedReg)) { |
| SavedRegs.set(PairedReg); |
| if (AArch64::GPR64RegClass.contains(PairedReg) && |
| !ReservedRegs[PairedReg]) |
| ExtraCSSpill = PairedReg; |
| } |
| // Check if there is a pair of ZRegs, so a PReg can be selected for spill/fill. |
| HasPairZReg |= (AArch64::ZPRRegClass.contains(Reg, CSRegs[i ^ 1]) && |
| SavedRegs.test(CSRegs[i ^ 1])); |
| } |
| |
| if (HasPairZReg && enableMultiVectorSpillFill(Subtarget, MF)) { |
| AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| // Find a suitable predicate register for the multi-vector spill/fill |
| // instructions. |
| MCRegister PnReg = findFreePredicateReg(SavedRegs); |
| if (PnReg.isValid()) |
| AFI->setPredicateRegForFillSpill(PnReg); |
| // If no free callee-save predicate has been found, assign one. |
| if (!AFI->getPredicateRegForFillSpill() && |
| MF.getFunction().getCallingConv() == |
| CallingConv::AArch64_SVE_VectorCall) { |
| SavedRegs.set(AArch64::P8); |
| AFI->setPredicateRegForFillSpill(AArch64::PN8); |
| } |
| |
| assert(!ReservedRegs[AFI->getPredicateRegForFillSpill()] && |
| "Predicate cannot be a reserved register"); |
| } |
| |
| if (MF.getFunction().getCallingConv() == CallingConv::Win64 && |
| !Subtarget.isTargetWindows()) { |
| // For the Windows calling convention on a non-Windows OS, where X18 is |
| // treated as reserved, back up X18 when entering non-Windows code (marked |
| // with the Windows calling convention) and restore it when returning, |
| // regardless of whether the individual function uses it - it might call |
| // other functions that clobber it. |
| SavedRegs.set(AArch64::X18); |
| } |
| |
| // Determine if a Hazard slot should be used and where it should go. |
| // If SplitSVEObjects is used, the hazard padding is placed between the PPRs |
| // and ZPRs. Otherwise, it goes in the callee save area. |
| determineStackHazardSlot(MF, SavedRegs); |
| |
| // Calculates the callee saved stack size. |
| unsigned CSStackSize = 0; |
| unsigned ZPRCSStackSize = 0; |
| unsigned PPRCSStackSize = 0; |
| const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); |
| for (unsigned Reg : SavedRegs.set_bits()) { |
| auto *RC = TRI->getMinimalPhysRegClass(MCRegister(Reg)); |
| assert(RC && "expected register class!"); |
| auto SpillSize = TRI->getSpillSize(*RC); |
| bool IsZPR = AArch64::ZPRRegClass.contains(Reg); |
| bool IsPPR = !IsZPR && AArch64::PPRRegClass.contains(Reg); |
| if (IsZPR) |
| ZPRCSStackSize += SpillSize; |
| else if (IsPPR) |
| PPRCSStackSize += SpillSize; |
| else |
| CSStackSize += SpillSize; |
| } |
| |
| // Save number of saved regs, so we can easily update CSStackSize later to |
| // account for any additional 64-bit GPR saves. Note: After this point |
| // only 64-bit GPRs can be added to SavedRegs. |
| unsigned NumSavedRegs = SavedRegs.count(); |
| |
| // If we have hazard padding in the CS area add that to the size. |
| if (AFI->isStackHazardIncludedInCalleeSaveArea()) |
| CSStackSize += getStackHazardSize(MF); |
| |
| // Increase the callee-saved stack size if the function has streaming mode |
| // changes, as we will need to spill the value of the VG register. |
| if (requiresSaveVG(MF)) |
| CSStackSize += 8; |
| |
| // If we must call __arm_get_current_vg in the prologue preserve the LR. |
| if (requiresSaveVG(MF) && !Subtarget.hasSVE()) |
| SavedRegs.set(AArch64::LR); |
| |
| // The frame record needs to be created by saving the appropriate registers. |
| uint64_t EstimatedStackSize = MFI.estimateStackSize(MF); |
| if (hasFP(MF) || |
| windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) { |
| SavedRegs.set(AArch64::FP); |
| SavedRegs.set(AArch64::LR); |
| } |
| |
| LLVM_DEBUG({ |
| dbgs() << "*** determineCalleeSaves\nSaved CSRs:"; |
| for (unsigned Reg : SavedRegs.set_bits()) |
| dbgs() << ' ' << printReg(MCRegister(Reg), RegInfo); |
| dbgs() << "\n"; |
| }); |
| |
| // If any callee-saved registers are used, the frame cannot be eliminated. |
| auto [ZPRLocalStackSize, PPRLocalStackSize] = |
| determineSVEStackSizes(MF, AssignObjectOffsets::No); |
| uint64_t SVELocals = ZPRLocalStackSize + PPRLocalStackSize; |
| uint64_t SVEStackSize = |
| alignTo(ZPRCSStackSize + PPRCSStackSize + SVELocals, 16); |
| bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize; |
| |
| // The CSR spill slots have not been allocated yet, so estimateStackSize |
| // won't include them. |
| unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF); |
| |
| // We may address some of the stack above the canonical frame address, either |
| // for our own arguments or during a call. Include that in calculating whether |
| // we have complicated addressing concerns. |
| int64_t CalleeStackUsed = 0; |
| for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) { |
| int64_t FixedOff = MFI.getObjectOffset(I); |
| if (FixedOff > CalleeStackUsed) |
| CalleeStackUsed = FixedOff; |
| } |
| |
| // Conservatively always assume BigStack when there are SVE spills. |
| bool BigStack = SVEStackSize || (EstimatedStackSize + CSStackSize + |
| CalleeStackUsed) > EstimatedStackSizeLimit; |
| if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) |
| AFI->setHasStackFrame(true); |
| |
| // Estimate if we might need to scavenge a register at some point in order |
| // to materialize a stack offset. If so, either spill one additional |
| // callee-saved register or reserve a special spill slot to facilitate |
| // register scavenging. If we already spilled an extra callee-saved register |
| // above to keep the number of spills even, we don't need to do anything else |
| // here. |
| if (BigStack) { |
| if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) { |
| LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo) |
| << " to get a scratch register.\n"); |
| SavedRegs.set(UnspilledCSGPR); |
| ExtraCSSpill = UnspilledCSGPR; |
| |
| // MachO's compact unwind format relies on all registers being stored in |
| // pairs, so if we need to spill one extra for BigStack, then we need to |
| // store the pair. |
| if (producePairRegisters(MF)) { |
| if (UnspilledCSGPRPaired == AArch64::NoRegister) { |
| // Failed to make a pair for compact unwind format, revert spilling. |
| if (produceCompactUnwindFrame(*this, MF)) { |
| SavedRegs.reset(UnspilledCSGPR); |
| ExtraCSSpill = AArch64::NoRegister; |
| } |
| } else |
| SavedRegs.set(UnspilledCSGPRPaired); |
| } |
| } |
| |
| // If we didn't find an extra callee-saved register to spill, create |
| // an emergency spill slot. |
| if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) { |
| const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); |
| const TargetRegisterClass &RC = AArch64::GPR64RegClass; |
| unsigned Size = TRI->getSpillSize(RC); |
| Align Alignment = TRI->getSpillAlign(RC); |
| int FI = MFI.CreateSpillStackObject(Size, Alignment); |
| RS->addScavengingFrameIndex(FI); |
| LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI |
| << " as the emergency spill slot.\n"); |
| } |
| } |
| |
| // Add the size of any additional 64-bit GPR saves. |
| CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs); |
| |
| // A Swift asynchronous context extends the frame record with a pointer |
| // directly before FP. |
| if (hasFP(MF) && AFI->hasSwiftAsyncContext()) |
| CSStackSize += 8; |
| |
| uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16); |
| LLVM_DEBUG(dbgs() << "Estimated stack frame size: " |
| << EstimatedStackSize + AlignedCSStackSize << " bytes.\n"); |
| |
| assert((!MFI.isCalleeSavedInfoValid() || |
| AFI->getCalleeSavedStackSize() == AlignedCSStackSize) && |
| "Should not invalidate callee saved info"); |
| |
| // Round up to register pair alignment to avoid additional SP adjustment |
| // instructions. |
| AFI->setCalleeSavedStackSize(AlignedCSStackSize); |
| AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize); |
| AFI->setSVECalleeSavedStackSize(ZPRCSStackSize, alignTo(PPRCSStackSize, 16)); |
| } |
| |
| bool AArch64FrameLowering::assignCalleeSavedSpillSlots( |
| MachineFunction &MF, const TargetRegisterInfo *RegInfo, |
| std::vector<CalleeSavedInfo> &CSI) const { |
| bool IsWindows = isTargetWindows(MF); |
| unsigned StackHazardSize = getStackHazardSize(MF); |
| // To match the canonical windows frame layout, reverse the list of |
| // callee saved registers to get them laid out by PrologEpilogInserter |
| // in the right order. (PrologEpilogInserter allocates stack objects top |
| // down. Windows canonical prologs store higher numbered registers at |
| // the top, thus have the CSI array start from the highest registers.) |
| if (IsWindows) |
| std::reverse(CSI.begin(), CSI.end()); |
| |
| if (CSI.empty()) |
| return true; // Early exit if no callee saved registers are modified! |
| |
| // Now that we know which registers need to be saved and restored, allocate |
| // stack slots for them. |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| auto *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| |
| if (IsWindows && hasFP(MF) && AFI->hasSwiftAsyncContext()) { |
| int FrameIdx = MFI.CreateStackObject(8, Align(16), true); |
| AFI->setSwiftAsyncContextFrameIdx(FrameIdx); |
| MFI.setIsCalleeSavedObjectIndex(FrameIdx, true); |
| } |
| |
| // Insert VG into the list of CSRs, immediately before LR if saved. |
| if (requiresSaveVG(MF)) { |
| CalleeSavedInfo VGInfo(AArch64::VG); |
| auto It = |
| find_if(CSI, [](auto &Info) { return Info.getReg() == AArch64::LR; }); |
| if (It != CSI.end()) |
| CSI.insert(It, VGInfo); |
| else |
| CSI.push_back(VGInfo); |
| } |
| |
| Register LastReg = 0; |
| int HazardSlotIndex = std::numeric_limits<int>::max(); |
| for (auto &CS : CSI) { |
| MCRegister Reg = CS.getReg(); |
| const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); |
| |
| // Create a hazard slot as we switch between GPR and FPR CSRs. |
| if (AFI->isStackHazardIncludedInCalleeSaveArea() && |
| (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) && |
| AArch64InstrInfo::isFpOrNEON(Reg)) { |
| assert(HazardSlotIndex == std::numeric_limits<int>::max() && |
| "Unexpected register order for hazard slot"); |
| HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true); |
| LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex |
| << "\n"); |
| AFI->setStackHazardCSRSlotIndex(HazardSlotIndex); |
| MFI.setIsCalleeSavedObjectIndex(HazardSlotIndex, true); |
| } |
| |
| unsigned Size = RegInfo->getSpillSize(*RC); |
| Align Alignment(RegInfo->getSpillAlign(*RC)); |
| int FrameIdx = MFI.CreateStackObject(Size, Alignment, true); |
| CS.setFrameIdx(FrameIdx); |
| MFI.setIsCalleeSavedObjectIndex(FrameIdx, true); |
| |
| // Grab 8 bytes below FP for the extended asynchronous frame info. |
| if (hasFP(MF) && AFI->hasSwiftAsyncContext() && !IsWindows && |
| Reg == AArch64::FP) { |
| FrameIdx = MFI.CreateStackObject(8, Alignment, true); |
| AFI->setSwiftAsyncContextFrameIdx(FrameIdx); |
| MFI.setIsCalleeSavedObjectIndex(FrameIdx, true); |
| } |
| LastReg = Reg; |
| } |
| |
| // Add hazard slot in the case where no FPR CSRs are present. |
| if (AFI->isStackHazardIncludedInCalleeSaveArea() && |
| HazardSlotIndex == std::numeric_limits<int>::max()) { |
| HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true); |
| LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex |
| << "\n"); |
| AFI->setStackHazardCSRSlotIndex(HazardSlotIndex); |
| MFI.setIsCalleeSavedObjectIndex(HazardSlotIndex, true); |
| } |
| |
| return true; |
| } |
| |
| bool AArch64FrameLowering::enableStackSlotScavenging( |
| const MachineFunction &MF) const { |
| const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| // If the function has streaming-mode changes, don't scavenge a |
| // spill slot in the callee-save area, as that might require an |
| // 'addvl' in the streaming-mode-changing call-sequence when the |
| // function doesn't use an FP. |
| if (AFI->hasStreamingModeChanges() && !hasFP(MF)) |
| return false; |
| // Don't allow stack slot scavenging with hazard slots, in case it moves objects |
| // into the wrong place. |
| if (AFI->hasStackHazardSlotIndex()) |
| return false; |
| return AFI->hasCalleeSaveStackFreeSpace(); |
| } |
| |
| /// Returns true if there are any SVE callee saves. |
| static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI, |
| int &Min, int &Max) { |
| Min = std::numeric_limits<int>::max(); |
| Max = std::numeric_limits<int>::min(); |
| |
| if (!MFI.isCalleeSavedInfoValid()) |
| return false; |
| |
| const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); |
| for (auto &CS : CSI) { |
| if (AArch64::ZPRRegClass.contains(CS.getReg()) || |
| AArch64::PPRRegClass.contains(CS.getReg())) { |
| assert((Max == std::numeric_limits<int>::min() || |
| Max + 1 == CS.getFrameIdx()) && |
| "SVE CalleeSaves are not consecutive"); |
| Min = std::min(Min, CS.getFrameIdx()); |
| Max = std::max(Max, CS.getFrameIdx()); |
| } |
| } |
| return Min != std::numeric_limits<int>::max(); |
| } |
| |
| static SVEStackSizes determineSVEStackSizes(MachineFunction &MF, |
| AssignObjectOffsets AssignOffsets) { |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| auto *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| |
| SVEStackSizes SVEStack{}; |
| |
| // With SplitSVEObjects we maintain separate stack offsets for predicates |
| // (PPRs) and SVE vectors (ZPRs). When SplitSVEObjects is disabled predicates |
| // are included in the SVE vector area. |
| uint64_t &ZPRStackTop = SVEStack.ZPRStackSize; |
| uint64_t &PPRStackTop = |
| AFI->hasSplitSVEObjects() ? SVEStack.PPRStackSize : SVEStack.ZPRStackSize; |
| |
| #ifndef NDEBUG |
| // First process all fixed stack objects. |
| for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) |
| assert(!MFI.hasScalableStackID(I) && |
| "SVE vectors should never be passed on the stack by value, only by " |
| "reference."); |
| #endif |
| |
| auto AllocateObject = [&](int FI) { |
| uint64_t &StackTop = MFI.getStackID(FI) == TargetStackID::ScalableVector |
| ? ZPRStackTop |
| : PPRStackTop; |
| |
| // FIXME: Given that the length of SVE vectors is not necessarily a power of |
| // two, we'd need to align every object dynamically at runtime if the |
| // alignment is larger than 16. This is not yet supported. |
| Align Alignment = MFI.getObjectAlign(FI); |
| if (Alignment > Align(16)) |
| report_fatal_error( |
| "Alignment of scalable vectors > 16 bytes is not yet supported"); |
| |
| StackTop += MFI.getObjectSize(FI); |
| StackTop = alignTo(StackTop, Alignment); |
| |
| assert(StackTop < (uint64_t)std::numeric_limits<int64_t>::max() && |
| "SVE StackTop far too large?!"); |
| |
| int64_t Offset = -int64_t(StackTop); |
| if (AssignOffsets == AssignObjectOffsets::Yes) |
| MFI.setObjectOffset(FI, Offset); |
| |
| LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n"); |
| }; |
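| // Illustrative example of AllocateObject above: a 16-byte ZPR object |
| // followed by a 32-byte ZPR object (both 16-byte aligned) advances |
| // ZPRStackTop to 16 and then 48, recording offsets of -16 and -48. These |
| // offsets are in "scalable" bytes, i.e. scaled by vscale when materialized. |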
| |
| // Then process all callee saved slots. |
| int MinCSFrameIndex, MaxCSFrameIndex; |
| if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) { |
| for (int FI = MinCSFrameIndex; FI <= MaxCSFrameIndex; ++FI) |
| AllocateObject(FI); |
| } |
| |
| // Ensure the CS area is 16-byte aligned. |
| PPRStackTop = alignTo(PPRStackTop, Align(16U)); |
| ZPRStackTop = alignTo(ZPRStackTop, Align(16U)); |
| |
| // Create a buffer of SVE objects to allocate and sort it. |
| SmallVector<int, 8> ObjectsToAllocate; |
| // If we have a stack protector, and we've previously decided that it needs |
| // to go in the SVE stack area (because there are SVE objects on the stack), |
| // then it needs to be allocated first. |
| int StackProtectorFI = -1; |
| if (MFI.hasStackProtectorIndex()) { |
| StackProtectorFI = MFI.getStackProtectorIndex(); |
| if (MFI.getStackID(StackProtectorFI) == TargetStackID::ScalableVector) |
| ObjectsToAllocate.push_back(StackProtectorFI); |
| } |
| |
| for (int FI = 0, E = MFI.getObjectIndexEnd(); FI != E; ++FI) { |
| if (FI == StackProtectorFI || MFI.isDeadObjectIndex(FI) || |
| MFI.isCalleeSavedObjectIndex(FI)) |
| continue; |
| |
| if (MFI.getStackID(FI) != TargetStackID::ScalableVector && |
| MFI.getStackID(FI) != TargetStackID::ScalablePredicateVector) |
| continue; |
| |
| ObjectsToAllocate.push_back(FI); |
| } |
| |
| // Allocate all SVE locals and spills |
| for (unsigned FI : ObjectsToAllocate) |
| AllocateObject(FI); |
| |
| PPRStackTop = alignTo(PPRStackTop, Align(16U)); |
| ZPRStackTop = alignTo(ZPRStackTop, Align(16U)); |
| |
| if (AssignOffsets == AssignObjectOffsets::Yes) |
| AFI->setStackSizeSVE(SVEStack.ZPRStackSize, SVEStack.PPRStackSize); |
| |
| return SVEStack; |
| } |
| |
| void AArch64FrameLowering::processFunctionBeforeFrameFinalized( |
| MachineFunction &MF, RegScavenger *RS) const { |
| assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown && |
| "Upwards growing stack unsupported"); |
| |
| (void)determineSVEStackSizes(MF, AssignObjectOffsets::Yes); |
| |
| // If this function isn't doing Win64-style C++ EH, we don't need to do |
| // anything. |
| if (!MF.hasEHFunclets()) |
| return; |
| |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| auto *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| |
| // Win64 C++ EH needs to allocate space for the catch objects in the fixed |
| // object area right next to the UnwindHelp object. |
| WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo(); |
| int64_t CurrentOffset = |
| AFI->getVarArgsGPRSize() + AFI->getTailCallReservedStack(); |
| for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) { |
| for (WinEHHandlerType &H : TBME.HandlerArray) { |
| int FrameIndex = H.CatchObj.FrameIndex; |
| if ((FrameIndex != INT_MAX) && MFI.getObjectOffset(FrameIndex) == 0) { |
| CurrentOffset = |
| alignTo(CurrentOffset, MFI.getObjectAlign(FrameIndex).value()); |
| CurrentOffset += MFI.getObjectSize(FrameIndex); |
| MFI.setObjectOffset(FrameIndex, -CurrentOffset); |
| } |
| } |
| } |
| |
| // Create an UnwindHelp object. |
| // The UnwindHelp object is allocated at the start of the fixed object area |
| int64_t UnwindHelpOffset = alignTo(CurrentOffset + 8, Align(16)); |
| assert(UnwindHelpOffset == getFixedObjectSize(MF, AFI, /*IsWin64*/ true, |
| /*IsFunclet*/ false) && |
| "UnwindHelpOffset must be at the start of the fixed object area"); |
| int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8, -UnwindHelpOffset, |
| /*IsImmutable=*/false); |
| EHInfo.UnwindHelpFrameIdx = UnwindHelpFI; |
| |
| MachineBasicBlock &MBB = MF.front(); |
| auto MBBI = MBB.begin(); |
| while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) |
| ++MBBI; |
| |
| // We need to store -2 into the UnwindHelp object at the start of the |
| // function. |
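| // Roughly, this expands to something like (with Dst being whatever scratch |
| // register the scavenger finds free): |
| //   mov  Dst, #-2 |
| //   stur Dst, [<UnwindHelp slot>] |
| // where the frame index is resolved to a concrete base and offset later. |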
| DebugLoc DL; |
| RS->enterBasicBlockEnd(MBB); |
| RS->backward(MBBI); |
| Register DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass); |
| assert(DstReg && "There must be a free register after frame setup"); |
| const AArch64InstrInfo &TII = |
| *MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); |
| BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2); |
| BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi)) |
| .addReg(DstReg, getKillRegState(true)) |
| .addFrameIndex(UnwindHelpFI) |
| .addImm(0); |
| } |
| |
| namespace { |
| struct TagStoreInstr { |
| MachineInstr *MI; |
| int64_t Offset, Size; |
| explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size) |
| : MI(MI), Offset(Offset), Size(Size) {} |
| }; |
| |
| class TagStoreEdit { |
| MachineFunction *MF; |
| MachineBasicBlock *MBB; |
| MachineRegisterInfo *MRI; |
| // Tag store instructions that are being replaced. |
| SmallVector<TagStoreInstr, 8> TagStores; |
| // Combined memref arguments of the above instructions. |
| SmallVector<MachineMemOperand *, 8> CombinedMemRefs; |
| |
| // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg + |
| // FrameRegOffset + Size) with the address tag of SP. |
| Register FrameReg; |
| StackOffset FrameRegOffset; |
| int64_t Size; |
| // If not std::nullopt, move FrameReg to (FrameReg + FrameRegUpdate) at the |
| // end. |
| std::optional<int64_t> FrameRegUpdate; |
| // MIFlags for any FrameReg updating instructions. |
| unsigned FrameRegUpdateFlags; |
| |
| // Use zeroing instruction variants. |
| bool ZeroData; |
| DebugLoc DL; |
| |
| void emitUnrolled(MachineBasicBlock::iterator InsertI); |
| void emitLoop(MachineBasicBlock::iterator InsertI); |
| |
| public: |
| TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData) |
| : MBB(MBB), ZeroData(ZeroData) { |
| MF = MBB->getParent(); |
| MRI = &MF->getRegInfo(); |
| } |
| // Add an instruction to be replaced. Instructions must be added in the |
| // ascending order of Offset, and have to be adjacent. |
| void addInstruction(TagStoreInstr I) { |
| assert((TagStores.empty() || |
| TagStores.back().Offset + TagStores.back().Size == I.Offset) && |
| "Non-adjacent tag store instructions."); |
| TagStores.push_back(I); |
| } |
| void clear() { TagStores.clear(); } |
| // Emit equivalent code at the given location, and erase the current set of |
| // instructions. May skip if the replacement is not profitable. May invalidate |
| // the input iterator and replace it with a valid one. |
| void emitCode(MachineBasicBlock::iterator &InsertI, |
| const AArch64FrameLowering *TFI, bool TryMergeSPUpdate); |
| }; |
| |
| void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) { |
| const AArch64InstrInfo *TII = |
| MF->getSubtarget<AArch64Subtarget>().getInstrInfo(); |
| |
| const int64_t kMinOffset = -256 * 16; |
| const int64_t kMaxOffset = 255 * 16; |
| |
| Register BaseReg = FrameReg; |
| int64_t BaseRegOffsetBytes = FrameRegOffset.getFixed(); |
| if (BaseRegOffsetBytes < kMinOffset || |
| BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset || |
| // BaseReg can be FP, which is not necessarily aligned to 16 bytes. In |
| // that case, BaseRegOffsetBytes will not be aligned to 16 bytes, which |
| // is required for the offset of ST2G. |
| BaseRegOffsetBytes % 16 != 0) { |
| Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass); |
| emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg, |
| StackOffset::getFixed(BaseRegOffsetBytes), TII); |
| BaseReg = ScratchReg; |
| BaseRegOffsetBytes = 0; |
| } |
| |
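| // Illustrative example: Size == 48 with BaseRegOffsetBytes == 0 emits an |
| // ST2G covering bytes [0, 32) and an STG covering bytes [32, 48); the store |
| // at offset 0 is then moved last (see the splice below) so that a final SP |
| // adjustment in the epilogue can be folded into it. |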
| MachineInstr *LastI = nullptr; |
| while (Size) { |
| int64_t InstrSize = (Size > 16) ? 32 : 16; |
| unsigned Opcode = |
| InstrSize == 16 |
| ? (ZeroData ? AArch64::STZGi : AArch64::STGi) |
| : (ZeroData ? AArch64::STZ2Gi : AArch64::ST2Gi); |
| assert(BaseRegOffsetBytes % 16 == 0); |
| MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode)) |
| .addReg(AArch64::SP) |
| .addReg(BaseReg) |
| .addImm(BaseRegOffsetBytes / 16) |
| .setMemRefs(CombinedMemRefs); |
| // A store to [BaseReg, #0] should go last for an opportunity to fold the |
| // final SP adjustment in the epilogue. |
| if (BaseRegOffsetBytes == 0) |
| LastI = I; |
| BaseRegOffsetBytes += InstrSize; |
| Size -= InstrSize; |
| } |
| |
| if (LastI) |
| MBB->splice(InsertI, MBB, LastI); |
| } |
| |
| void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) { |
| const AArch64InstrInfo *TII = |
| MF->getSubtarget<AArch64Subtarget>().getInstrInfo(); |
| |
| Register BaseReg = FrameRegUpdate |
| ? FrameReg |
| : MRI->createVirtualRegister(&AArch64::GPR64RegClass); |
| Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass); |
| |
| emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII); |
| |
| int64_t LoopSize = Size; |
| // If the loop size is not a multiple of 32, split off one 16-byte store at |
| // the end to fold BaseReg update into. |
| if (FrameRegUpdate && *FrameRegUpdate) |
| LoopSize -= LoopSize % 32; |
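| // Illustrative example: Size == 80 with a foldable FrameReg update gives |
| // LoopSize == 64; the loop below tags the first 64 bytes and the remaining |
| // 16 bytes are tagged by the post-indexed STG emitted further down, which |
| // also applies any leftover base-register adjustment. |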
| MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL, |
| TII->get(ZeroData ? AArch64::STZGloop_wback |
| : AArch64::STGloop_wback)) |
| .addDef(SizeReg) |
| .addDef(BaseReg) |
| .addImm(LoopSize) |
| .addReg(BaseReg) |
| .setMemRefs(CombinedMemRefs); |
| if (FrameRegUpdate) |
| LoopI->setFlags(FrameRegUpdateFlags); |
| |
| int64_t ExtraBaseRegUpdate = |
| FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : 0; |
| LLVM_DEBUG(dbgs() << "TagStoreEdit::emitLoop: LoopSize=" << LoopSize |
| << ", Size=" << Size |
| << ", ExtraBaseRegUpdate=" << ExtraBaseRegUpdate |
| << ", FrameRegUpdate=" << FrameRegUpdate |
| << ", FrameRegOffset.getFixed()=" |
| << FrameRegOffset.getFixed() << "\n"); |
| if (LoopSize < Size) { |
| assert(FrameRegUpdate); |
| assert(Size - LoopSize == 16); |
| // Tag 16 more bytes at BaseReg and update BaseReg. |
| int64_t STGOffset = ExtraBaseRegUpdate + 16; |
| assert(STGOffset % 16 == 0 && STGOffset >= -4096 && STGOffset <= 4080 && |
| "STG immediate out of range"); |
| BuildMI(*MBB, InsertI, DL, |
| TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex)) |
| .addDef(BaseReg) |
| .addReg(BaseReg) |
| .addReg(BaseReg) |
| .addImm(STGOffset / 16) |
| .setMemRefs(CombinedMemRefs) |
| .setMIFlags(FrameRegUpdateFlags); |
| } else if (ExtraBaseRegUpdate) { |
| // Update BaseReg. |
| int64_t AddSubOffset = std::abs(ExtraBaseRegUpdate); |
| assert(AddSubOffset <= 4095 && "ADD/SUB immediate out of range"); |
| BuildMI( |
| *MBB, InsertI, DL, |
| TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri)) |
| .addDef(BaseReg) |
| .addReg(BaseReg) |
| .addImm(AddSubOffset) |
| .addImm(0) |
| .setMIFlags(FrameRegUpdateFlags); |
| } |
| } |
| |
| // Check if *II is a register update that can be merged into the STGloop that |
| // ends at (Reg + Size). If it can, *TotalOffset is set to the full adjustment |
| // of Reg performed by that update instruction. |
| bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg, |
| int64_t Size, int64_t *TotalOffset) { |
| MachineInstr &MI = *II; |
| if ((MI.getOpcode() == AArch64::ADDXri || |
| MI.getOpcode() == AArch64::SUBXri) && |
| MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) { |
| unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm()); |
| int64_t Offset = MI.getOperand(2).getImm() << Shift; |
| if (MI.getOpcode() == AArch64::SUBXri) |
| Offset = -Offset; |
| int64_t PostOffset = Offset - Size; |
| // TagStoreEdit::emitLoop might emit either an ADD/SUB after the loop, or |
| // an STGPostIndex which does the last 16 bytes of tag write. Which one is |
| // chosen depends on the alignment of the loop size, but the difference |
| // between the valid ranges for the two instructions is small, so we |
| // conservatively assume that it could be either case here. |
| // |
| // Max offset of STGPostIndex, minus the 16 byte tag write folded into that |
| // instruction. |
| const int64_t kMaxOffset = 4080 - 16; |
| // Max offset of SUBXri. |
| const int64_t kMinOffset = -4095; |
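| // For example, an epilogue "ADD Reg, Reg, #4016" following a loop that tags |
| // Size == 4000 bytes gives PostOffset == 16, which is in range and 16-byte |
| // aligned, so the update can be folded (*TotalOffset == 4016). |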
| if (PostOffset <= kMaxOffset && PostOffset >= kMinOffset && |
| PostOffset % 16 == 0) { |
| *TotalOffset = Offset; |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE, |
| SmallVectorImpl<MachineMemOperand *> &MemRefs) { |
| MemRefs.clear(); |
| for (auto &TS : TSE) { |
| MachineInstr *MI = TS.MI; |
| // An instruction without memory operands may access anything. Be |
| // conservative and return an empty list. |
| if (MI->memoperands_empty()) { |
| MemRefs.clear(); |
| return; |
| } |
| MemRefs.append(MI->memoperands_begin(), MI->memoperands_end()); |
| } |
| } |
| |
| void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI, |
| const AArch64FrameLowering *TFI, |
| bool TryMergeSPUpdate) { |
| if (TagStores.empty()) |
| return; |
| TagStoreInstr &FirstTagStore = TagStores[0]; |
| TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1]; |
| Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size; |
| DL = TagStores[0].MI->getDebugLoc(); |
| |
| Register Reg; |
| FrameRegOffset = TFI->resolveFrameOffsetReference( |
| *MF, FirstTagStore.Offset, false /*isFixed*/, |
| TargetStackID::Default /*StackID*/, Reg, |
| /*PreferFP=*/false, /*ForSimm=*/true); |
| FrameReg = Reg; |
| FrameRegUpdate = std::nullopt; |
| |
| mergeMemRefs(TagStores, CombinedMemRefs); |
| |
| LLVM_DEBUG({ |
| dbgs() << "Replacing adjacent STG instructions:\n"; |
| for (const auto &Instr : TagStores) { |
| dbgs() << " " << *Instr.MI; |
| } |
| }); |
| |
| // Size threshold where a loop becomes shorter than a linear sequence of |
| // tagging instructions. |
| const int kSetTagLoopThreshold = 176; |
| if (Size < kSetTagLoopThreshold) { |
| if (TagStores.size() < 2) |
| return; |
| emitUnrolled(InsertI); |
| } else { |
| MachineInstr *UpdateInstr = nullptr; |
| int64_t TotalOffset = 0; |
| if (TryMergeSPUpdate) { |
| // See if we can merge base register update into the STGloop. |
| // This is done in AArch64LoadStoreOptimizer for "normal" stores, |
| // but STGloop is way too unusual for that, and also it only |
| // realistically happens in function epilogue. Also, STGloop is expanded |
| // before that pass. |
| if (InsertI != MBB->end() && |
| canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getFixed() + Size, |
| &TotalOffset)) { |
| UpdateInstr = &*InsertI++; |
| LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n " |
| << *UpdateInstr); |
| } |
| } |
| |
| if (!UpdateInstr && TagStores.size() < 2) |
| return; |
| |
| if (UpdateInstr) { |
| FrameRegUpdate = TotalOffset; |
| FrameRegUpdateFlags = UpdateInstr->getFlags(); |
| } |
| emitLoop(InsertI); |
| if (UpdateInstr) |
| UpdateInstr->eraseFromParent(); |
| } |
| |
| for (auto &TS : TagStores) |
| TS.MI->eraseFromParent(); |
| } |
| |
| bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset, |
| int64_t &Size, bool &ZeroData) { |
| MachineFunction &MF = *MI.getParent()->getParent(); |
| const MachineFrameInfo &MFI = MF.getFrameInfo(); |
| |
| unsigned Opcode = MI.getOpcode(); |
| ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGi || |
| Opcode == AArch64::STZ2Gi); |
| |
| if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) { |
| if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead()) |
| return false; |
| if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI()) |
| return false; |
| Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex()); |
| Size = MI.getOperand(2).getImm(); |
| return true; |
| } |
| |
| if (Opcode == AArch64::STGi || Opcode == AArch64::STZGi) |
| Size = 16; |
| else if (Opcode == AArch64::ST2Gi || Opcode == AArch64::STZ2Gi) |
| Size = 32; |
| else |
| return false; |
| |
| if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI()) |
| return false; |
| |
| Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) + |
| 16 * MI.getOperand(2).getImm(); |
| return true; |
| } |
| |
| // Detect a run of memory tagging instructions for adjacent stack frame slots, |
| // and replace them with a shorter instruction sequence: |
| // * replace STG + STG with ST2G |
| // * replace STGloop + STGloop with STGloop |
| // This code needs to run when stack slot offsets are already known, but before |
| // FrameIndex operands in STG instructions are eliminated. |
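| // For example, two adjacent 16-byte STGi stores become a single ST2Gi, and |
| // runs of kSetTagLoopThreshold (176) bytes or more are emitted as a single |
| // STGloop instead (see TagStoreEdit::emitCode). |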
| MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II, |
| const AArch64FrameLowering *TFI, |
| RegScavenger *RS) { |
| bool FirstZeroData; |
| int64_t Size, Offset; |
| MachineInstr &MI = *II; |
| MachineBasicBlock *MBB = MI.getParent(); |
| MachineBasicBlock::iterator NextI = ++II; |
| if (&MI == &MBB->instr_back()) |
| return II; |
| if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData)) |
| return II; |
| |
| SmallVector<TagStoreInstr, 4> Instrs; |
| Instrs.emplace_back(&MI, Offset, Size); |
| |
| constexpr int kScanLimit = 10; |
| int Count = 0; |
| for (MachineBasicBlock::iterator E = MBB->end(); |
| NextI != E && Count < kScanLimit; ++NextI) { |
| MachineInstr &MI = *NextI; |
| bool ZeroData; |
| int64_t Size, Offset; |
| // Collect instructions that update memory tags with a FrameIndex operand |
| // and (when applicable) constant size, and whose output registers are dead |
| // (the latter is almost always the case in practice). Since these |
| // instructions effectively have no inputs or outputs, we are free to skip |
| // any non-aliasing instructions in between without tracking used registers. |
| if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) { |
| if (ZeroData != FirstZeroData) |
| break; |
| Instrs.emplace_back(&MI, Offset, Size); |
| continue; |
| } |
| |
| // Only count non-transient, non-tagging instructions toward the scan |
| // limit. |
| if (!MI.isTransient()) |
| ++Count; |
| |
| // Just in case, stop before the epilogue code starts. |
| if (MI.getFlag(MachineInstr::FrameSetup) || |
| MI.getFlag(MachineInstr::FrameDestroy)) |
| break; |
| |
| // Reject anything that may alias the collected instructions. |
| if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects() || MI.isCall()) |
| break; |
| } |
| |
| // New code will be inserted after the last tagging instruction we've found. |
| MachineBasicBlock::iterator InsertI = Instrs.back().MI; |
| |
| // All the gathered stack tag instructions are merged and placed after the |
| // last tag store in the list. Before inserting, we must check whether the |
| // NZCV flag is live at the insertion point; if it is, any STG loops emitted |
| // by the merge could clobber it. |
| |
| // FIXME: This is conservative: the liveness check is performed even when |
| // the merged sequence contains no STG loops, in which case NZCV is not |
| // clobbered and the check is unnecessary. |
| LivePhysRegs LiveRegs(*(MBB->getParent()->getSubtarget().getRegisterInfo())); |
| LiveRegs.addLiveOuts(*MBB); |
| for (auto I = MBB->rbegin();; ++I) { |
| MachineInstr &MI = *I; |
| if (MI == InsertI) |
| break; |
| LiveRegs.stepBackward(*I); |
| } |
| InsertI++; |
| if (LiveRegs.contains(AArch64::NZCV)) |
| return InsertI; |
| |
| llvm::stable_sort(Instrs, |
| [](const TagStoreInstr &Left, const TagStoreInstr &Right) { |
| return Left.Offset < Right.Offset; |
| }); |
| |
| // Make sure that we don't have any overlapping stores. |
| int64_t CurOffset = Instrs[0].Offset; |
| for (auto &Instr : Instrs) { |
| if (CurOffset > Instr.Offset) |
| return NextI; |
| CurOffset = Instr.Offset + Instr.Size; |
| } |
| |
| // Find contiguous runs of tagged memory and emit shorter instruction |
| // sequences for them when possible. |
| TagStoreEdit TSE(MBB, FirstZeroData); |
| std::optional<int64_t> EndOffset; |
| for (auto &Instr : Instrs) { |
| if (EndOffset && *EndOffset != Instr.Offset) { |
| // Found a gap. |
| TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */ false); |
| TSE.clear(); |
| } |
| |
| TSE.addInstruction(Instr); |
| EndOffset = Instr.Offset + Instr.Size; |
| } |
| |
| const MachineFunction *MF = MBB->getParent(); |
| // Multiple FP/SP updates in a loop cannot be described by CFI instructions. |
| TSE.emitCode( |
| InsertI, TFI, /*TryMergeSPUpdate = */ |
| !MF->getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo(*MF)); |
| |
| return InsertI; |
| } |
| } // namespace |
| |
| void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced( |
| MachineFunction &MF, RegScavenger *RS = nullptr) const { |
| for (auto &BB : MF) |
| for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();) { |
| if (StackTaggingMergeSetTag) |
| II = tryMergeAdjacentSTG(II, this, RS); |
| } |
| |
| // By the time this method is called, most of the prologue/epilogue code is |
| // already emitted, whether its location was affected by the shrink-wrapping |
| // optimization or not. |
| if (!MF.getFunction().hasFnAttribute(Attribute::Naked) && |
| shouldSignReturnAddressEverywhere(MF)) |
| emitPacRetPlusLeafHardening(MF); |
| } |
| |
| /// For Win64 AArch64 EH, the offset to the Unwind object is from the SP |
| /// before the update. This is easily retrieved as it is exactly the offset |
| /// that is set in processFunctionBeforeFrameFinalized. |
| StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP( |
| const MachineFunction &MF, int FI, Register &FrameReg, |
| bool IgnoreSPUpdates) const { |
| const MachineFrameInfo &MFI = MF.getFrameInfo(); |
| if (IgnoreSPUpdates) { |
| LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is " |
| << MFI.getObjectOffset(FI) << "\n"); |
| FrameReg = AArch64::SP; |
| return StackOffset::getFixed(MFI.getObjectOffset(FI)); |
| } |
| |
| // Go to common code if we cannot provide sp + offset. |
| if (MFI.hasVarSizedObjects() || |
| MF.getInfo<AArch64FunctionInfo>()->hasSVEStackSize() || |
| MF.getSubtarget().getRegisterInfo()->hasStackRealignment(MF)) |
| return getFrameIndexReference(MF, FI, FrameReg); |
| |
| FrameReg = AArch64::SP; |
| return getStackOffset(MF, MFI.getObjectOffset(FI)); |
| } |
| |
| /// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve |
| /// the parent's frame pointer. |
| unsigned AArch64FrameLowering::getWinEHParentFrameOffset( |
| const MachineFunction &MF) const { |
| return 0; |
| } |
| |
| /// Funclets only need to account for space for the callee saved registers, |
| /// as the locals are accounted for in the parent's stack frame. |
| unsigned AArch64FrameLowering::getWinEHFuncletFrameSize( |
| const MachineFunction &MF) const { |
| // This is the size of the pushed CSRs. |
| unsigned CSSize = |
| MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize(); |
| // This is the amount of stack a funclet needs to allocate. |
| return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(), |
| getStackAlign()); |
| } |
| |
| namespace { |
| struct FrameObject { |
| bool IsValid = false; |
| // Index of the object in MFI. |
| int ObjectIndex = 0; |
| // Group ID this object belongs to. |
| int GroupIndex = -1; |
| // This object should be placed first (closest to SP). |
| bool ObjectFirst = false; |
| // This object's group (which always contains the object with |
| // ObjectFirst==true) should be placed first. |
| bool GroupFirst = false; |
| |
| // Used to distinguish between FP and GPR accesses. The values are chosen so |
| // that they sort FPR < Hazard < GPR and can be OR'd together. |
| unsigned Accesses = 0; |
| enum { AccessFPR = 1, AccessHazard = 2, AccessGPR = 4 }; |
| }; |
| |
| class GroupBuilder { |
| SmallVector<int, 8> CurrentMembers; |
| int NextGroupIndex = 0; |
| std::vector<FrameObject> &Objects; |
| |
| public: |
| GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {} |
| void AddMember(int Index) { CurrentMembers.push_back(Index); } |
| void EndCurrentGroup() { |
| if (CurrentMembers.size() > 1) { |
| // Create a new group with the current member list. This might remove them |
| // from their pre-existing groups. That's OK, dealing with overlapping |
| // groups is too hard and unlikely to make a difference. |
| LLVM_DEBUG(dbgs() << "group:"); |
| for (int Index : CurrentMembers) { |
| Objects[Index].GroupIndex = NextGroupIndex; |
| LLVM_DEBUG(dbgs() << " " << Index); |
| } |
| LLVM_DEBUG(dbgs() << "\n"); |
| NextGroupIndex++; |
| } |
| CurrentMembers.clear(); |
| } |
| }; |
| |
| bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) { |
| // Objects at a lower index are closer to FP; objects at a higher index are |
| // closer to SP. |
| // |
| // For consistency in our comparison, all invalid objects are placed |
| // at the end. This also allows us to stop walking when we hit the |
| // first invalid item after it's all sorted. |
| // |
| // If we want to include a stack hazard region, order FPR accesses < the |
| // hazard object < GPR accesses in order to create a separation between the |
| // two. For the Accesses field 1 = FPR, 2 = Hazard Object, 4 = GPR. |
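| // With the hazard slot present, the resulting order from FP towards SP is |
| // therefore: FPR-accessed objects, the hazard slot, then GPR-accessed |
| // objects. |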
| // |
| // Otherwise the "first" object goes first (closest to SP), followed by the |
| // members of the "first" group. |
| // |
| // The rest are sorted by the group index to keep the groups together. |
| // Higher numbered groups are more likely to be around longer (i.e. untagged |
| // in the function epilogue and not at some earlier point). Place them closer |
| // to SP. |
| // |
| // If all else equal, sort by the object index to keep the objects in the |
| // original order. |
| return std::make_tuple(!A.IsValid, A.Accesses, A.ObjectFirst, A.GroupFirst, |
| A.GroupIndex, A.ObjectIndex) < |
| std::make_tuple(!B.IsValid, B.Accesses, B.ObjectFirst, B.GroupFirst, |
| B.GroupIndex, B.ObjectIndex); |
| } |
| } // namespace |
| |
| void AArch64FrameLowering::orderFrameObjects( |
| const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const { |
| const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>(); |
| |
| if ((!OrderFrameObjects && !AFI.hasSplitSVEObjects()) || |
| ObjectsToAllocate.empty()) |
| return; |
| |
| const MachineFrameInfo &MFI = MF.getFrameInfo(); |
| std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd()); |
| for (auto &Obj : ObjectsToAllocate) { |
| FrameObjects[Obj].IsValid = true; |
| FrameObjects[Obj].ObjectIndex = Obj; |
| } |
| |
| // Identify FPR vs GPR slots for hazards, and stack slots that are tagged at |
| // the same time. |
| GroupBuilder GB(FrameObjects); |
| for (auto &MBB : MF) { |
| for (auto &MI : MBB) { |
| if (MI.isDebugInstr()) |
| continue; |
| |
| if (AFI.hasStackHazardSlotIndex()) { |
| std::optional<int> FI = getLdStFrameID(MI, MFI); |
| if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) { |
| if (MFI.getStackID(*FI) == TargetStackID::ScalableVector || |
| AArch64InstrInfo::isFpOrNEON(MI)) |
| FrameObjects[*FI].Accesses |= FrameObject::AccessFPR; |
| else |
| FrameObjects[*FI].Accesses |= FrameObject::AccessGPR; |
| } |
| } |
| |
| int OpIndex; |
| switch (MI.getOpcode()) { |
| case AArch64::STGloop: |
| case AArch64::STZGloop: |
| OpIndex = 3; |
| break; |
| case AArch64::STGi: |
| case AArch64::STZGi: |
| case AArch64::ST2Gi: |
| case AArch64::STZ2Gi: |
| OpIndex = 1; |
| break; |
| default: |
| OpIndex = -1; |
| } |
| |
| int TaggedFI = -1; |
| if (OpIndex >= 0) { |
| const MachineOperand &MO = MI.getOperand(OpIndex); |
| if (MO.isFI()) { |
| int FI = MO.getIndex(); |
| if (FI >= 0 && FI < MFI.getObjectIndexEnd() && |
| FrameObjects[FI].IsValid) |
| TaggedFI = FI; |
| } |
| } |
| |
| // If this is a stack tagging instruction for a slot that is not part of a |
| // group yet, either start a new group or add it to the current one. |
| if (TaggedFI >= 0) |
| GB.AddMember(TaggedFI); |
| else |
| GB.EndCurrentGroup(); |
| } |
| // Groups should never span multiple basic blocks. |
| GB.EndCurrentGroup(); |
| } |
| |
| if (AFI.hasStackHazardSlotIndex()) { |
| FrameObjects[AFI.getStackHazardSlotIndex()].Accesses = |
| FrameObject::AccessHazard; |
| // If a stack object is unknown or both GPR and FPR, sort it into GPR. |
| for (auto &Obj : FrameObjects) |
| if (!Obj.Accesses || |
| Obj.Accesses == (FrameObject::AccessGPR | FrameObject::AccessFPR)) |
| Obj.Accesses = FrameObject::AccessGPR; |
| } |
| |
| // If the function's tagged base pointer is pinned to a stack slot, we want to |
| // put that slot first when possible. This will likely place it at SP + 0, |
| // and save one instruction when generating the base pointer because IRG does |
| // not allow an immediate offset. |
| std::optional<int> TBPI = AFI.getTaggedBasePointerIndex(); |
| if (TBPI) { |
| FrameObjects[*TBPI].ObjectFirst = true; |
| FrameObjects[*TBPI].GroupFirst = true; |
| int FirstGroupIndex = FrameObjects[*TBPI].GroupIndex; |
| if (FirstGroupIndex >= 0) |
| for (FrameObject &Object : FrameObjects) |
| if (Object.GroupIndex == FirstGroupIndex) |
| Object.GroupFirst = true; |
| } |
| |
| llvm::stable_sort(FrameObjects, FrameObjectCompare); |
| |
| int i = 0; |
| for (auto &Obj : FrameObjects) { |
| // All invalid items are sorted at the end, so it's safe to stop. |
| if (!Obj.IsValid) |
| break; |
| ObjectsToAllocate[i++] = Obj.ObjectIndex; |
| } |
| |
| LLVM_DEBUG({ |
| dbgs() << "Final frame order:\n"; |
| for (auto &Obj : FrameObjects) { |
| if (!Obj.IsValid) |
| break; |
| dbgs() << " " << Obj.ObjectIndex << ": group " << Obj.GroupIndex; |
| if (Obj.ObjectFirst) |
| dbgs() << ", first"; |
| if (Obj.GroupFirst) |
| dbgs() << ", group-first"; |
| dbgs() << "\n"; |
| } |
| }); |
| } |
| |
| /// Emit a loop to decrement SP until it is equal to TargetReg, with probes at |
| /// least every ProbeSize bytes. Returns an iterator to the first instruction |
| /// after the loop. The difference between SP and TargetReg must be an exact |
| /// multiple of ProbeSize. |
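| /// For example, with ProbeSize == 4096 the emitted loop is: |
| ///   LoopMBB: |
| ///     SUB  SP, SP, #4096 |
| ///     STR  XZR, [SP] |
| ///     CMP  SP, TargetReg |
| ///     B.NE LoopMBB |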
| MachineBasicBlock::iterator |
| AArch64FrameLowering::inlineStackProbeLoopExactMultiple( |
| MachineBasicBlock::iterator MBBI, int64_t ProbeSize, |
| Register TargetReg) const { |
| MachineBasicBlock &MBB = *MBBI->getParent(); |
| MachineFunction &MF = *MBB.getParent(); |
| const AArch64InstrInfo *TII = |
| MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); |
| DebugLoc DL = MBB.findDebugLoc(MBBI); |
| |
| MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator()); |
| MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); |
| MF.insert(MBBInsertPoint, LoopMBB); |
| MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); |
| MF.insert(MBBInsertPoint, ExitMBB); |
| |
| // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not encodable |
| // in SUB). |
| emitFrameOffset(*LoopMBB, LoopMBB->end(), DL, AArch64::SP, AArch64::SP, |
| StackOffset::getFixed(-ProbeSize), TII, |
| MachineInstr::FrameSetup); |
| // STR XZR, [SP] |
| BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::STRXui)) |
| .addReg(AArch64::XZR) |
| .addReg(AArch64::SP) |
| .addImm(0) |
| .setMIFlags(MachineInstr::FrameSetup); |
| // CMP SP, TargetReg |
| BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::SUBSXrx64), |
| AArch64::XZR) |
| .addReg(AArch64::SP) |
| .addReg(TargetReg) |
| .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) |
| .setMIFlags(MachineInstr::FrameSetup); |
| // B.CC Loop |
| BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::Bcc)) |
| .addImm(AArch64CC::NE) |
| .addMBB(LoopMBB) |
| .setMIFlags(MachineInstr::FrameSetup); |
| |
| LoopMBB->addSuccessor(ExitMBB); |
| LoopMBB->addSuccessor(LoopMBB); |
| // Synthesize the exit MBB. |
| ExitMBB->splice(ExitMBB->end(), &MBB, MBBI, MBB.end()); |
| ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); |
| MBB.addSuccessor(LoopMBB); |
| // Update liveins. |
| fullyRecomputeLiveIns({ExitMBB, LoopMBB}); |
| |
| return ExitMBB->begin(); |
| } |
| |
| void AArch64FrameLowering::inlineStackProbeFixed( |
| MachineBasicBlock::iterator MBBI, Register ScratchReg, int64_t FrameSize, |
| StackOffset CFAOffset) const { |
| MachineBasicBlock *MBB = MBBI->getParent(); |
| MachineFunction &MF = *MBB->getParent(); |
| const AArch64InstrInfo *TII = |
| MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); |
| AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF); |
| bool HasFP = hasFP(MF); |
| |
| DebugLoc DL; |
| int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize(); |
| int64_t NumBlocks = FrameSize / ProbeSize; |
| int64_t ResidualSize = FrameSize % ProbeSize; |
| |
| LLVM_DEBUG(dbgs() << "Stack probing: total " << FrameSize << " bytes, " |
| << NumBlocks << " blocks of " << ProbeSize |
| << " bytes, plus " << ResidualSize << " bytes\n"); |
| |
| // Decrement SP by NumBlocks * ProbeSize bytes, either unrolled or with an |
| // ordinary loop. |
| if (NumBlocks <= AArch64::StackProbeMaxLoopUnroll) { |
| for (int i = 0; i < NumBlocks; ++i) { |
| // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not |
| // encodable in a SUB). |
| emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP, |
| StackOffset::getFixed(-ProbeSize), TII, |
| MachineInstr::FrameSetup, false, false, nullptr, |
| EmitAsyncCFI && !HasFP, CFAOffset); |
| CFAOffset += StackOffset::getFixed(ProbeSize); |
| // STR XZR, [SP] |
| BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui)) |
| .addReg(AArch64::XZR) |
| .addReg(AArch64::SP) |
| .addImm(0) |
| .setMIFlags(MachineInstr::FrameSetup); |
| } |
| } else if (NumBlocks != 0) { |
| // SUB ScratchReg, SP, #(NumBlocks * ProbeSize) (or equivalent if the amount |
| // is not encodable in a SUB). ScratchReg may temporarily become the CFA |
| // register. |
| emitFrameOffset(*MBB, MBBI, DL, ScratchReg, AArch64::SP, |
| StackOffset::getFixed(-ProbeSize * NumBlocks), TII, |
| MachineInstr::FrameSetup, false, false, nullptr, |
| EmitAsyncCFI && !HasFP, CFAOffset); |
| CFAOffset += StackOffset::getFixed(ProbeSize * NumBlocks); |
| MBBI = inlineStackProbeLoopExactMultiple(MBBI, ProbeSize, ScratchReg); |
| MBB = MBBI->getParent(); |
| if (EmitAsyncCFI && !HasFP) { |
| // Set the CFA register back to SP. |
| CFIInstBuilder(*MBB, MBBI, MachineInstr::FrameSetup) |
| .buildDefCFARegister(AArch64::SP); |
| } |
| } |
| |
| if (ResidualSize != 0) { |
| // SUB SP, SP, #ResidualSize (or equivalent if ResidualSize is not encodable |
| // in SUB). |
| emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP, |
| StackOffset::getFixed(-ResidualSize), TII, |
| MachineInstr::FrameSetup, false, false, nullptr, |
| EmitAsyncCFI && !HasFP, CFAOffset); |
| if (ResidualSize > AArch64::StackProbeMaxUnprobedStack) { |
| // STR XZR, [SP] |
| BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui)) |
| .addReg(AArch64::XZR) |
| .addReg(AArch64::SP) |
| .addImm(0) |
| .setMIFlags(MachineInstr::FrameSetup); |
| } |
| } |
| } |
| |
| void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF, |
| MachineBasicBlock &MBB) const { |
| // Get the instructions that need to be replaced. We emit at most two of |
| // these. Remember them in order to avoid complications coming from the need |
| // to traverse the block while potentially creating more blocks. |
| SmallVector<MachineInstr *, 4> ToReplace; |
| for (MachineInstr &MI : MBB) |
| if (MI.getOpcode() == AArch64::PROBED_STACKALLOC || |
| MI.getOpcode() == AArch64::PROBED_STACKALLOC_VAR) |
| ToReplace.push_back(&MI); |
| |
| for (MachineInstr *MI : ToReplace) { |
| if (MI->getOpcode() == AArch64::PROBED_STACKALLOC) { |
| Register ScratchReg = MI->getOperand(0).getReg(); |
| int64_t FrameSize = MI->getOperand(1).getImm(); |
| StackOffset CFAOffset = StackOffset::get(MI->getOperand(2).getImm(), |
| MI->getOperand(3).getImm()); |
| inlineStackProbeFixed(MI->getIterator(), ScratchReg, FrameSize, |
| CFAOffset); |
| } else { |
| assert(MI->getOpcode() == AArch64::PROBED_STACKALLOC_VAR && |
| "Stack probe pseudo-instruction expected"); |
| const AArch64InstrInfo *TII = |
| MI->getMF()->getSubtarget<AArch64Subtarget>().getInstrInfo(); |
| Register TargetReg = MI->getOperand(0).getReg(); |
| (void)TII->probedStackAlloc(MI->getIterator(), TargetReg, true); |
| } |
| MI->eraseFromParent(); |
| } |
| } |
| |
| struct StackAccess { |
| enum AccessType { |
| NotAccessed = 0, // Stack object not accessed by load/store instructions. |
| GPR = 1 << 0, // A general purpose register. |
| PPR = 1 << 1, // A predicate register. |
| FPR = 1 << 2, // A floating point/Neon/SVE register. |
| }; |
| |
| int Idx; |
| StackOffset Offset; |
| int64_t Size; |
| unsigned AccessTypes; |
| |
| StackAccess() : Idx(0), Offset(), Size(0), AccessTypes(NotAccessed) {} |
| |
| bool operator<(const StackAccess &Rhs) const { |
| return std::make_tuple(start(), Idx) < |
| std::make_tuple(Rhs.start(), Rhs.Idx); |
| } |
| |
| bool isCPU() const { |
| // Predicate register load and store instructions execute on the CPU. |
| return AccessTypes & (AccessType::GPR | AccessType::PPR); |
| } |
| bool isSME() const { return AccessTypes & AccessType::FPR; } |
| bool isMixed() const { return isCPU() && isSME(); } |
| |
| int64_t start() const { return Offset.getFixed() + Offset.getScalable(); } |
| int64_t end() const { return start() + Size; } |
| |
| std::string getTypeString() const { |
| switch (AccessTypes) { |
| case AccessType::FPR: |
| return "FPR"; |
| case AccessType::PPR: |
| return "PPR"; |
| case AccessType::GPR: |
| return "GPR"; |
| case AccessType::NotAccessed: |
| return "NA"; |
| default: |
| return "Mixed"; |
| } |
| } |
| |
| void print(raw_ostream &OS) const { |
| OS << getTypeString() << " stack object at [SP" |
| << (Offset.getFixed() < 0 ? "" : "+") << Offset.getFixed(); |
| if (Offset.getScalable()) |
| OS << (Offset.getScalable() < 0 ? "" : "+") << Offset.getScalable() |
| << " * vscale"; |
| OS << "]"; |
| } |
| }; |
| |
| static inline raw_ostream &operator<<(raw_ostream &OS, const StackAccess &SA) { |
| SA.print(OS); |
| return OS; |
| } |
| |
| void AArch64FrameLowering::emitRemarks( |
| const MachineFunction &MF, MachineOptimizationRemarkEmitter *ORE) const { |
| |
| auto *AFI = MF.getInfo<AArch64FunctionInfo>(); |
| if (AFI->getSMEFnAttrs().hasNonStreamingInterfaceAndBody()) |
| return; |
| |
| unsigned StackHazardSize = getStackHazardSize(MF); |
| const uint64_t HazardSize = |
| (StackHazardSize) ? StackHazardSize : StackHazardRemarkSize; |
| |
| if (HazardSize == 0) |
| return; |
| |
| const MachineFrameInfo &MFI = MF.getFrameInfo(); |
| // Bail if function has no stack objects. |
| if (!MFI.hasStackObjects()) |
| return; |
| |
| std::vector<StackAccess> StackAccesses(MFI.getNumObjects()); |
| |
| size_t NumFPLdSt = 0; |
| size_t NumNonFPLdSt = 0; |
| |
| // Collect stack accesses via Load/Store instructions. |
| for (const MachineBasicBlock &MBB : MF) { |
| for (const MachineInstr &MI : MBB) { |
| if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1) |
| continue; |
| for (MachineMemOperand *MMO : MI.memoperands()) { |
| std::optional<int> FI = getMMOFrameID(MMO, MFI); |
| if (FI && !MFI.isDeadObjectIndex(*FI)) { |
| int FrameIdx = *FI; |
| |
| size_t ArrIdx = FrameIdx + MFI.getNumFixedObjects(); |
| if (StackAccesses[ArrIdx].AccessTypes == StackAccess::NotAccessed) { |
| StackAccesses[ArrIdx].Idx = FrameIdx; |
| StackAccesses[ArrIdx].Offset = |
| getFrameIndexReferenceFromSP(MF, FrameIdx); |
| StackAccesses[ArrIdx].Size = MFI.getObjectSize(FrameIdx); |
| } |
| |
| unsigned RegTy = StackAccess::AccessType::GPR; |
| if (MFI.hasScalableStackID(FrameIdx)) |
| RegTy = isPPRAccess(MI) ? StackAccess::PPR : StackAccess::FPR; |
| else if (AArch64InstrInfo::isFpOrNEON(MI)) |
| RegTy = StackAccess::FPR; |
| |
| StackAccesses[ArrIdx].AccessTypes |= RegTy; |
| |
| if (RegTy == StackAccess::FPR) |
| ++NumFPLdSt; |
| else |
| ++NumNonFPLdSt; |
| } |
| } |
| } |
| } |
| |
| if (NumFPLdSt == 0 || NumNonFPLdSt == 0) |
| return; |
| |
| llvm::sort(StackAccesses); |
| llvm::erase_if(StackAccesses, [](const StackAccess &S) { |
| return S.AccessTypes == StackAccess::NotAccessed; |
| }); |
| |
| SmallVector<const StackAccess *> MixedObjects; |
| SmallVector<std::pair<const StackAccess *, const StackAccess *>> HazardPairs; |
| |
| if (StackAccesses.front().isMixed()) |
| MixedObjects.push_back(&StackAccesses.front()); |
| |
| for (auto It = StackAccesses.begin(), End = std::prev(StackAccesses.end()); |
| It != End; ++It) { |
| const auto &First = *It; |
| const auto &Second = *(It + 1); |
| |
| if (Second.isMixed()) |
| MixedObjects.push_back(&Second); |
| |
| if ((First.isSME() && Second.isCPU()) || |
| (First.isCPU() && Second.isSME())) { |
| uint64_t Distance = static_cast<uint64_t>(Second.start() - First.end()); |
| if (Distance < HazardSize) |
| HazardPairs.emplace_back(&First, &Second); |
| } |
| } |
| |
| auto EmitRemark = [&](llvm::StringRef Str) { |
| ORE->emit([&]() { |
| auto R = MachineOptimizationRemarkAnalysis( |
| "sme", "StackHazard", MF.getFunction().getSubprogram(), &MF.front()); |
| return R << formatv("stack hazard in '{0}': ", MF.getName()).str() << Str; |
| }); |
| }; |
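| // The emitted remarks then look like, e.g.: |
| //   stack hazard in 'foo': GPR stack object at [SP+8] is too close to FPR |
| //   stack object at [SP+16] |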
| |
| for (const auto &P : HazardPairs) |
| EmitRemark(formatv("{0} is too close to {1}", *P.first, *P.second).str()); |
| |
| for (const auto *Obj : MixedObjects) |
| EmitRemark( |
| formatv("{0} accessed by both GP and FP instructions", *Obj).str()); |
| } |