//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of TargetFrameLowering class.
//
// On AArch64, stack frames are structured as follows:
//
// The stack grows downward.
//
// All of the individual frame areas on the frame below are optional, i.e. it's
// possible to create a function so that the particular area isn't present
// in the frame.
//
// At function entry, the "frame" looks as follows:
//
// |                                   | Higher address
// |-----------------------------------|
// |                                   |
// | arguments passed on the stack     |
// |                                   |
// |-----------------------------------| <- sp
// |                                   | Lower address
//
//
// After the prologue has run, the frame has the following general structure.
// Note that this doesn't depict the case where a red-zone is used. Also,
// technically the last frame area (VLAs) doesn't get created until later, in
// the main function body after the prologue has run. However, it's depicted
// here for completeness.
//
// |                                   | Higher address
// |-----------------------------------|
// |                                   |
// | arguments passed on the stack     |
// |                                   |
// |-----------------------------------|
// |                                   |
// | (Win64 only) varargs from reg     |
// |                                   |
// |-----------------------------------|
// |                                   |
// | (Win64 only) callee-saved SVE reg |
// |                                   |
// |-----------------------------------|
// |                                   |
// | callee-saved gpr registers        | <--.
// |                                   |    | On Darwin platforms these
// |- - - - - - - - - - - - - - - - - -|    | callee saves are swapped,
// | prev_lr                           |    | (frame record first)
// | prev_fp                           | <--'
// | async context if needed           |
// | (a.k.a. "frame record")           |
// |-----------------------------------| <- fp(=x29)
// | <hazard padding>                  |
// |-----------------------------------|
// |                                   |
// | callee-saved fp/simd/SVE regs     |
// |                                   |
// |-----------------------------------|
// |                                   |
// | SVE stack objects                 |
// |                                   |
// |-----------------------------------|
// |.empty.space.to.make.part.below....|
// |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
// |.the.standard.16-byte.alignment....|  compile time; if present)
// |-----------------------------------|
// | local variables of fixed size     |
// | including spill slots             |
// | <FPR>                             |
// | <hazard padding>                  |
// | <GPR>                             |
// |-----------------------------------| <- bp(not defined by ABI,
// |.variable-sized.local.variables....|       LLVM chooses X19)
// |.(VLAs)............................| (size of this area is unknown at
// |...................................|  compile time)
// |-----------------------------------| <- sp
// |                                   | Lower address
//
//
// To access data in a frame, a constant offset to it must be computable at
// compile time from one of the pointers (fp, bp, sp). The size of the areas
// with a dotted background cannot be computed at compile time if they are
// present, so all three of fp, bp and sp must be set up to be able to access
// all contents in the frame areas, assuming all of the frame areas are
// non-empty.
//
// For most functions, some of the frame areas are empty. For those functions,
// it may not be necessary to set up fp or bp:
// * A base pointer is definitely needed when there are both VLAs and local
//   variables with more-than-default alignment requirements.
// * A frame pointer is definitely needed when there are local variables with
//   more-than-default alignment requirements (see the example below).
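//
// As an illustrative (not exhaustive) example of the two rules above, a
// function along the lines of
//   void f(int n) { alignas(32) int a[4]; int v[n]; /* ... */ }
// needs a frame pointer because of the over-aligned local 'a', and a base
// pointer because it combines that over-aligned local with the VLA 'v'.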
//
// For Darwin platforms the frame-record (fp, lr) is stored at the top of the
// callee-saved area, since the unwind encoding does not allow for encoding
// this dynamically and existing tools depend on this layout. For other
// platforms, the frame-record is stored at the bottom of the (gpr) callee-saved
// area to allow SVE stack objects (allocated directly below the callee-saves,
// if available) to be accessed directly from the frame pointer.
// The SVE spill/fill instructions have VL-scaled addressing modes such
// as:
//   ldr z8, [fp, #-7 mul vl]
// For SVE the size of the vector length (VL) is not known at compile-time, so
// '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
// layout, we don't need to add an unscaled offset to the frame pointer before
// accessing the SVE object in the frame.
113//
114// In some cases when a base pointer is not strictly needed, it is generated
115// anyway when offsets from the frame pointer to access local variables become
116// so large that the offset can't be encoded in the immediate fields of loads
117// or stores.
118//
119// Outgoing function arguments must be at the bottom of the stack frame when
120// calling another function. If we do not have variable-sized stack objects, we
121// can allocate a "reserved call frame" area at the bottom of the local
122// variable area, large enough for all outgoing calls. If we do have VLAs, then
123// the stack pointer must be decremented and incremented around each call to
124// make space for the arguments below the VLAs.
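//
// For example (an illustrative instruction sequence, not emitted verbatim), a
// call that needs 32 bytes of outgoing stack arguments in a function with VLAs
// would be bracketed roughly as:
//   sub  sp, sp, #32   // make space for the outgoing arguments
//   bl   callee
//   add  sp, sp, #32   // release it again
// whereas with a reserved call frame the prologue allocates this space once.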
//
// FIXME: also explain the redzone concept.
//
// About stack hazards: Under some SME contexts, a coprocessor with its own
// separate cache can be used for FP operations. This can create hazards if the
// CPU and the SME unit try to access the same area of memory, including if the
// access is to an area of the stack. To try to alleviate this we attempt to
// introduce extra padding into the stack frame between FP and GPR accesses,
// controlled by the aarch64-stack-hazard-size option. Without changing the
// layout of the stack frame in the diagram above, a stack object of size
// aarch64-stack-hazard-size is added between GPR and FPR CSRs. Another is added
// to the stack objects section, and stack objects are sorted so that FPR >
// Hazard padding slot > GPRs (where possible). Unfortunately some things are
// not handled well (VLA area, arguments on the stack, objects with both GPR and
// FPR accesses), but if those are controlled by the user then the entire stack
// frame becomes GPR at the start/end with FPR in the middle, surrounded by
// Hazard padding.
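//
// For example (an illustrative layout, assuming -aarch64-stack-hazard-size=1024
// and that all objects could be sorted as intended), the callee-save and local
// areas end up looking like:
//   | GPR callee-saves | 1024 bytes hazard padding | FPR callee-saves |
//   | FPR locals/spills | 1024-byte hazard padding object | GPR locals |
// so CPU (GPR) and SME-unit (FPR) accesses land in disjoint stack regions.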
//
// An example of the prologue:
//
//     .globl __foo
//     .align 2
//  __foo:
//  Ltmp0:
//     .cfi_startproc
//     .cfi_personality 155, ___gxx_personality_v0
//  Leh_func_begin:
//     .cfi_lsda 16, Lexception33
//
//     stp  xa,bx, [sp, -#offset]!
//     ...
//     stp  x28, x27, [sp, #offset-32]
//     stp  fp, lr, [sp, #offset-16]
//     add  fp, sp, #offset - 16
//     sub  sp, sp, #1360
//
// The Stack:
//       +-------------------------------------------+
// 10000 | ........ | ........ | ........ | ........ |
// 10004 | ........ | ........ | ........ | ........ |
//       +-------------------------------------------+
// 10008 | ........ | ........ | ........ | ........ |
// 1000c | ........ | ........ | ........ | ........ |
//       +===========================================+
// 10010 |                X28 Register               |
// 10014 |                X28 Register               |
//       +-------------------------------------------+
// 10018 |                X27 Register               |
// 1001c |                X27 Register               |
//       +===========================================+
// 10020 |               Frame Pointer               |
// 10024 |               Frame Pointer               |
//       +-------------------------------------------+
// 10028 |               Link Register               |
// 1002c |               Link Register               |
//       +===========================================+
// 10030 | ........ | ........ | ........ | ........ |
// 10034 | ........ | ........ | ........ | ........ |
//       +-------------------------------------------+
// 10038 | ........ | ........ | ........ | ........ |
// 1003c | ........ | ........ | ........ | ........ |
//       +-------------------------------------------+
//
// [sp] = 10030 :: >>initial value<<
// sp = 10020 :: stp fp, lr, [sp, #-16]!
// fp = sp == 10020 :: mov fp, sp
// [sp] == 10020 :: stp x28, x27, [sp, #-16]!
// sp == 10010 :: >>final value<<
//
// The frame pointer (w29) points to address 10020. If we use an offset of
// '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
// for w27, and -32 for w28:
//
//  Ltmp1:
//     .cfi_def_cfa w29, 16
//  Ltmp2:
//     .cfi_offset w30, -8
//  Ltmp3:
//     .cfi_offset w29, -16
//  Ltmp4:
//     .cfi_offset w27, -24
//  Ltmp5:
//     .cfi_offset w28, -32
//
//===----------------------------------------------------------------------===//

211#include "AArch64FrameLowering.h"
212#include "AArch64InstrInfo.h"
213#include "AArch64MachineFunctionInfo.h"
214#include "AArch64RegisterInfo.h"
215#include "AArch64Subtarget.h"
216#include "MCTargetDesc/AArch64AddressingModes.h"
217#include "MCTargetDesc/AArch64MCTargetDesc.h"
218#include "Utils/AArch64SMEAttributes.h"
219#include "llvm/ADT/ScopeExit.h"
220#include "llvm/ADT/SmallVector.h"
221#include "llvm/ADT/Statistic.h"
222#include "llvm/Analysis/ValueTracking.h"
223#include "llvm/CodeGen/CFIInstBuilder.h"
224#include "llvm/CodeGen/LivePhysRegs.h"
225#include "llvm/CodeGen/MachineBasicBlock.h"
226#include "llvm/CodeGen/MachineFrameInfo.h"
227#include "llvm/CodeGen/MachineFunction.h"
228#include "llvm/CodeGen/MachineInstr.h"
229#include "llvm/CodeGen/MachineInstrBuilder.h"
230#include "llvm/CodeGen/MachineMemOperand.h"
231#include "llvm/CodeGen/MachineModuleInfo.h"
232#include "llvm/CodeGen/MachineOperand.h"
233#include "llvm/CodeGen/MachineRegisterInfo.h"
234#include "llvm/CodeGen/RegisterScavenging.h"
235#include "llvm/CodeGen/TargetInstrInfo.h"
236#include "llvm/CodeGen/TargetRegisterInfo.h"
237#include "llvm/CodeGen/TargetSubtargetInfo.h"
238#include "llvm/CodeGen/WinEHFuncInfo.h"
239#include "llvm/IR/Attributes.h"
240#include "llvm/IR/CallingConv.h"
241#include "llvm/IR/DataLayout.h"
242#include "llvm/IR/DebugLoc.h"
243#include "llvm/IR/Function.h"
244#include "llvm/MC/MCAsmInfo.h"
245#include "llvm/MC/MCDwarf.h"
246#include "llvm/Support/CommandLine.h"
247#include "llvm/Support/Debug.h"
248#include "llvm/Support/ErrorHandling.h"
249#include "llvm/Support/FormatVariadic.h"
250#include "llvm/Support/MathExtras.h"
251#include "llvm/Support/raw_ostream.h"
252#include "llvm/Target/TargetMachine.h"
253#include "llvm/Target/TargetOptions.h"
254#include <cassert>
255#include <cstdint>
256#include <iterator>
257#include <optional>
258#include <vector>
259
260using namespace llvm;
261
262#define DEBUG_TYPE "frame-info"
263
264static cl::opt<bool> EnableRedZone("aarch64-redzone",
265 cl::desc("enable use of redzone on AArch64"),
266 cl::init(Val: false), cl::Hidden);
267
268static cl::opt<bool> StackTaggingMergeSetTag(
269 "stack-tagging-merge-settag",
270 cl::desc("merge settag instruction in function epilog"), cl::init(Val: true),
271 cl::Hidden);
272
273static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
274 cl::desc("sort stack allocations"),
275 cl::init(Val: true), cl::Hidden);
276
277cl::opt<bool> EnableHomogeneousPrologEpilog(
278 "homogeneous-prolog-epilog", cl::Hidden,
279 cl::desc("Emit homogeneous prologue and epilogue for the size "
280 "optimization (default = off)"));
281
282// Stack hazard size for analysis remarks. StackHazardSize takes precedence.
283static cl::opt<unsigned>
284 StackHazardRemarkSize("aarch64-stack-hazard-remark-size", cl::init(Val: 0),
285 cl::Hidden);
286// Whether to insert padding into non-streaming functions (for testing).
287static cl::opt<bool>
288 StackHazardInNonStreaming("aarch64-stack-hazard-in-non-streaming",
289 cl::init(Val: false), cl::Hidden);
290
291static cl::opt<bool> DisableMultiVectorSpillFill(
292 "aarch64-disable-multivector-spill-fill",
293 cl::desc("Disable use of LD/ST pairs for SME2 or SVE2p1"), cl::init(Val: false),
294 cl::Hidden);
295
296STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
297
/// Returns how much of the incoming argument stack area (in bytes) we should
/// clean up in an epilogue. For the C calling convention this will be 0, for
/// guaranteed tail call conventions it can be positive (a normal return or a
/// tail call to a function that uses less stack space for arguments) or
/// negative (for a tail call to a function that needs more stack space than we
/// do for arguments).
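/// As an illustrative example (numbers chosen arbitrarily): under a guaranteed
/// tail-call convention, a function that received 16 bytes of stack arguments
/// would return 16 here for a plain return, and roughly 16 - 24 = -8 when
/// tail-calling a function that needs 24 bytes of stack arguments, since the
/// callee then reuses and extends our incoming argument area.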
304static int64_t getArgumentStackToRestore(MachineFunction &MF,
305 MachineBasicBlock &MBB) {
306 MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
307 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
308 bool IsTailCallReturn = (MBB.end() != MBBI)
309 ? AArch64InstrInfo::isTailCallReturnInst(MI: *MBBI)
310 : false;
311
312 int64_t ArgumentPopSize = 0;
313 if (IsTailCallReturn) {
314 MachineOperand &StackAdjust = MBBI->getOperand(i: 1);
315
316 // For a tail-call in a callee-pops-arguments environment, some or all of
317 // the stack may actually be in use for the call's arguments, this is
318 // calculated during LowerCall and consumed here...
319 ArgumentPopSize = StackAdjust.getImm();
320 } else {
321 // ... otherwise the amount to pop is *all* of the argument space,
322 // conveniently stored in the MachineFunctionInfo by
323 // LowerFormalArguments. This will, of course, be zero for the C calling
324 // convention.
325 ArgumentPopSize = AFI->getArgumentStackToRestore();
326 }
327
328 return ArgumentPopSize;
329}
330
331static bool produceCompactUnwindFrame(MachineFunction &MF);
332static bool needsWinCFI(const MachineFunction &MF);
333static StackOffset getSVEStackSize(const MachineFunction &MF);
334static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
335 bool HasCall = false);
336static bool requiresSaveVG(const MachineFunction &MF);
337
/// Returns true if homogeneous prolog or epilog code can be emitted
/// for the size optimization. If possible, a frame helper call is injected.
/// When an Exit block is given, this check is for the epilog.
341bool AArch64FrameLowering::homogeneousPrologEpilog(
342 MachineFunction &MF, MachineBasicBlock *Exit) const {
343 if (!MF.getFunction().hasMinSize())
344 return false;
345 if (!EnableHomogeneousPrologEpilog)
346 return false;
347 if (EnableRedZone)
348 return false;
349
  // TODO: Windows is not supported yet.
351 if (needsWinCFI(MF))
352 return false;
353 // TODO: SVE is not supported yet.
354 if (getSVEStackSize(MF))
355 return false;
356
357 // Bail on stack adjustment needed on return for simplicity.
358 const MachineFrameInfo &MFI = MF.getFrameInfo();
359 const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
360 if (MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF))
361 return false;
362 if (Exit && getArgumentStackToRestore(MF, MBB&: *Exit))
363 return false;
364
365 auto *AFI = MF.getInfo<AArch64FunctionInfo>();
366 if (AFI->hasSwiftAsyncContext() || AFI->hasStreamingModeChanges())
367 return false;
368
369 // If there are an odd number of GPRs before LR and FP in the CSRs list,
370 // they will not be paired into one RegPairInfo, which is incompatible with
371 // the assumption made by the homogeneous prolog epilog pass.
372 const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
373 unsigned NumGPRs = 0;
374 for (unsigned I = 0; CSRegs[I]; ++I) {
375 Register Reg = CSRegs[I];
376 if (Reg == AArch64::LR) {
377 assert(CSRegs[I + 1] == AArch64::FP);
378 if (NumGPRs % 2 != 0)
379 return false;
380 break;
381 }
382 if (AArch64::GPR64RegClass.contains(Reg))
383 ++NumGPRs;
384 }
385
386 return true;
387}
388
389/// Returns true if CSRs should be paired.
390bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const {
391 return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF);
392}
393
/// This is the biggest offset to the stack pointer we can encode in AArch64
/// instructions (without using a separate calculation and a temp register).
/// Note that the exceptions here are vector stores/loads, which cannot encode
/// any displacements (see estimateRSStackSizeLimit(), isAArch64FrameOffsetLegal()).
static const unsigned DefaultSafeSPDisplacement = 255;
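// (The value 255 lines up with the signed 9-bit immediate of the unscaled
// LDUR/STUR addressing mode, whose range is [-256, 255]; offsets beyond that
// may need a separate calculation and a temporary register.)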
399
400/// Look at each instruction that references stack frames and return the stack
401/// size limit beyond which some of these instructions will require a scratch
402/// register during their expansion later.
403static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
404 // FIXME: For now, just conservatively guesstimate based on unscaled indexing
405 // range. We'll end up allocating an unnecessary spill slot a lot, but
406 // realistically that's not a big deal at this stage of the game.
407 for (MachineBasicBlock &MBB : MF) {
408 for (MachineInstr &MI : MBB) {
409 if (MI.isDebugInstr() || MI.isPseudo() ||
410 MI.getOpcode() == AArch64::ADDXri ||
411 MI.getOpcode() == AArch64::ADDSXri)
412 continue;
413
414 for (const MachineOperand &MO : MI.operands()) {
415 if (!MO.isFI())
416 continue;
417
418 StackOffset Offset;
419 if (isAArch64FrameOffsetLegal(MI, Offset, OutUseUnscaledOp: nullptr, OutUnscaledOp: nullptr, EmittableOffset: nullptr) ==
420 AArch64FrameOffsetCannotUpdate)
421 return 0;
422 }
423 }
424 }
425 return DefaultSafeSPDisplacement;
426}
427
428TargetStackID::Value
429AArch64FrameLowering::getStackIDForScalableVectors() const {
430 return TargetStackID::ScalableVector;
431}
432
/// Returns the size of the fixed object area (allocated next to sp on entry).
/// On Win64 this may include a var args area and an UnwindHelp object for EH.
435static unsigned getFixedObjectSize(const MachineFunction &MF,
436 const AArch64FunctionInfo *AFI, bool IsWin64,
437 bool IsFunclet) {
438 if (!IsWin64 || IsFunclet) {
439 return AFI->getTailCallReservedStack();
440 } else {
441 if (AFI->getTailCallReservedStack() != 0 &&
442 !MF.getFunction().getAttributes().hasAttrSomewhere(
443 Kind: Attribute::SwiftAsync))
444 report_fatal_error(reason: "cannot generate ABI-changing tail call for Win64");
445 // Var args are stored here in the primary function.
446 const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
447 // To support EH funclets we allocate an UnwindHelp object
448 const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0);
449 return AFI->getTailCallReservedStack() +
450 alignTo(Value: VarArgsArea + UnwindHelpObject, Align: 16);
451 }
452}
453
454/// Returns the size of the entire SVE stackframe (calleesaves + spills).
455static StackOffset getSVEStackSize(const MachineFunction &MF) {
456 const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
457 return StackOffset::getScalable(Scalable: (int64_t)AFI->getStackSizeSVE());
458}
459
460bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
461 if (!EnableRedZone)
462 return false;
463
464 // Don't use the red zone if the function explicitly asks us not to.
465 // This is typically used for kernel code.
466 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
467 const unsigned RedZoneSize =
468 Subtarget.getTargetLowering()->getRedZoneSize(F: MF.getFunction());
469 if (!RedZoneSize)
470 return false;
471
472 const MachineFrameInfo &MFI = MF.getFrameInfo();
473 const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
474 uint64_t NumBytes = AFI->getLocalStackSize();
475
476 // If neither NEON or SVE are available, a COPY from one Q-reg to
477 // another requires a spill -> reload sequence. We can do that
478 // using a pre-decrementing store/post-decrementing load, but
479 // if we do so, we can't use the Red Zone.
480 bool LowerQRegCopyThroughMem = Subtarget.hasFPARMv8() &&
481 !Subtarget.isNeonAvailable() &&
482 !Subtarget.hasSVE();
483
484 return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize ||
485 getSVEStackSize(MF) || LowerQRegCopyThroughMem);
486}
487
488/// hasFPImpl - Return true if the specified function should have a dedicated
489/// frame pointer register.
490bool AArch64FrameLowering::hasFPImpl(const MachineFunction &MF) const {
491 const MachineFrameInfo &MFI = MF.getFrameInfo();
492 const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
493
494 // Win64 EH requires a frame pointer if funclets are present, as the locals
495 // are accessed off the frame pointer in both the parent function and the
496 // funclets.
497 if (MF.hasEHFunclets())
498 return true;
499 // Retain behavior of always omitting the FP for leaf functions when possible.
500 if (MF.getTarget().Options.DisableFramePointerElim(MF))
501 return true;
502 if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
503 MFI.hasStackMap() || MFI.hasPatchPoint() ||
504 RegInfo->hasStackRealignment(MF))
505 return true;
506 // With large callframes around we may need to use FP to access the scavenging
507 // emergency spillslot.
508 //
509 // Unfortunately some calls to hasFP() like machine verifier ->
510 // getReservedReg() -> hasFP in the middle of global isel are too early
511 // to know the max call frame size. Hopefully conservatively returning "true"
512 // in those cases is fine.
513 // DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
514 if (!MFI.isMaxCallFrameSizeComputed() ||
515 MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
516 return true;
517
518 return false;
519}
520
/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
/// not required, we reserve argument space for call sites immediately on entry
/// to the current function. This eliminates the need for add/sub sp brackets
/// around call sites. Returns true if the call frame is included as part of
/// the stack frame.
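/// For example (an illustrative consequence rather than a separate mechanism):
/// when this returns true, the prologue's single `sub sp, sp, #N` already
/// covers the largest outgoing-argument area, so the ADJCALLSTACKDOWN/UP
/// pseudos around each call can be removed without any extra sp arithmetic
/// (see eliminateCallFramePseudoInstr below).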
526bool AArch64FrameLowering::hasReservedCallFrame(
527 const MachineFunction &MF) const {
528 // The stack probing code for the dynamically allocated outgoing arguments
529 // area assumes that the stack is probed at the top - either by the prologue
530 // code, which issues a probe if `hasVarSizedObjects` return true, or by the
531 // most recent variable-sized object allocation. Changing the condition here
532 // may need to be followed up by changes to the probe issuing logic.
533 return !MF.getFrameInfo().hasVarSizedObjects();
534}
535
536MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
537 MachineFunction &MF, MachineBasicBlock &MBB,
538 MachineBasicBlock::iterator I) const {
539 const AArch64InstrInfo *TII =
540 static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
541 const AArch64TargetLowering *TLI =
542 MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
543 [[maybe_unused]] MachineFrameInfo &MFI = MF.getFrameInfo();
544 DebugLoc DL = I->getDebugLoc();
545 unsigned Opc = I->getOpcode();
546 bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
547 uint64_t CalleePopAmount = IsDestroy ? I->getOperand(i: 1).getImm() : 0;
548
549 if (!hasReservedCallFrame(MF)) {
550 int64_t Amount = I->getOperand(i: 0).getImm();
551 Amount = alignTo(Size: Amount, A: getStackAlign());
552 if (!IsDestroy)
553 Amount = -Amount;
554
555 // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
556 // doesn't have to pop anything), then the first operand will be zero too so
557 // this adjustment is a no-op.
558 if (CalleePopAmount == 0) {
      // FIXME: in-function stack adjustment for calls is limited to 24 bits
      // because there's no guaranteed temporary register available.
      //
      // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
      // 1) For offsets that fit in 12 bits, we use a single instruction with
      //    LSL #0.
      // 2) For offsets of up to 24 bits, we use two instructions: one with
      //    LSL #0 and the other with LSL #12.
      //
      // Most call frames will be allocated at the start of a function so
      // this is OK, but it is a limitation that needs dealing with.
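      //
      // For example (an illustrative split, not code emitted by this
      // function), an adjustment of 0x12345 bytes can be materialised as:
      //   sub sp, sp, #0x12, lsl #12   // subtracts 0x12000
      //   sub sp, sp, #0x345           // subtracts the remaining 0x345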
569 assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
570
571 if (TLI->hasInlineStackProbe(MF) &&
572 -Amount >= AArch64::StackProbeMaxUnprobedStack) {
573 // When stack probing is enabled, the decrement of SP may need to be
574 // probed. We only need to do this if the call site needs 1024 bytes of
575 // space or more, because a region smaller than that is allowed to be
576 // unprobed at an ABI boundary. We rely on the fact that SP has been
577 // probed exactly at this point, either by the prologue or most recent
578 // dynamic allocation.
579 assert(MFI.hasVarSizedObjects() &&
580 "non-reserved call frame without var sized objects?");
581 Register ScratchReg =
582 MF.getRegInfo().createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
583 inlineStackProbeFixed(MBBI: I, ScratchReg, FrameSize: -Amount, CFAOffset: StackOffset::get(Fixed: 0, Scalable: 0));
584 } else {
585 emitFrameOffset(MBB, MBBI: I, DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
586 Offset: StackOffset::getFixed(Fixed: Amount), TII);
587 }
588 }
589 } else if (CalleePopAmount != 0) {
590 // If the calling convention demands that the callee pops arguments from the
591 // stack, we want to add it back if we have a reserved call frame.
592 assert(CalleePopAmount < 0xffffff && "call frame too large");
593 emitFrameOffset(MBB, MBBI: I, DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
594 Offset: StackOffset::getFixed(Fixed: -(int64_t)CalleePopAmount), TII);
595 }
596 return MBB.erase(I);
597}
598
599void AArch64FrameLowering::emitCalleeSavedGPRLocations(
600 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
601 MachineFunction &MF = *MBB.getParent();
602 MachineFrameInfo &MFI = MF.getFrameInfo();
603 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
604 SMEAttrs Attrs = AFI->getSMEFnAttrs();
605 bool LocallyStreaming =
606 Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface();
607
608 const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
609 if (CSI.empty())
610 return;
611
612 CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
613 for (const auto &Info : CSI) {
614 unsigned FrameIdx = Info.getFrameIdx();
615 if (MFI.getStackID(ObjectIdx: FrameIdx) == TargetStackID::ScalableVector)
616 continue;
617
618 assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
619 int64_t Offset = MFI.getObjectOffset(ObjectIdx: FrameIdx) - getOffsetOfLocalArea();
620
621 // The location of VG will be emitted before each streaming-mode change in
622 // the function. Only locally-streaming functions require emitting the
623 // non-streaming VG location here.
624 if ((LocallyStreaming && FrameIdx == AFI->getStreamingVGIdx()) ||
625 (!LocallyStreaming && Info.getReg() == AArch64::VG))
626 continue;
627
628 CFIBuilder.buildOffset(Reg: Info.getReg(), Offset);
629 }
630}
631
632void AArch64FrameLowering::emitCalleeSavedSVELocations(
633 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
634 MachineFunction &MF = *MBB.getParent();
635 MachineFrameInfo &MFI = MF.getFrameInfo();
636
637 // Add callee saved registers to move list.
638 const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
639 if (CSI.empty())
640 return;
641
642 const TargetSubtargetInfo &STI = MF.getSubtarget();
643 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
644 AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
645 CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
646
647 for (const auto &Info : CSI) {
648 if (!(MFI.getStackID(ObjectIdx: Info.getFrameIdx()) == TargetStackID::ScalableVector))
649 continue;
650
651 // Not all unwinders may know about SVE registers, so assume the lowest
652 // common denominator.
653 assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
654 MCRegister Reg = Info.getReg();
655 if (!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, RegToUseForCFI&: Reg))
656 continue;
657
658 StackOffset Offset =
659 StackOffset::getScalable(Scalable: MFI.getObjectOffset(ObjectIdx: Info.getFrameIdx())) -
660 StackOffset::getFixed(Fixed: AFI.getCalleeSavedStackSize(MFI));
661
662 CFIBuilder.insertCFIInst(CFIInst: createCFAOffset(MRI: TRI, Reg, OffsetFromDefCFA: Offset));
663 }
664}
665
666void AArch64FrameLowering::resetCFIToInitialState(
667 MachineBasicBlock &MBB) const {
668
669 MachineFunction &MF = *MBB.getParent();
670 const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
671 const auto &TRI =
672 static_cast<const AArch64RegisterInfo &>(*Subtarget.getRegisterInfo());
673 const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
674
675 CFIInstBuilder CFIBuilder(MBB, MBB.begin(), MachineInstr::NoFlags);
676
677 // Reset the CFA to `SP + 0`.
678 CFIBuilder.buildDefCFA(Reg: AArch64::SP, Offset: 0);
679
680 // Flip the RA sign state.
681 if (MFI.shouldSignReturnAddress(MF))
682 MFI.branchProtectionPAuthLR() ? CFIBuilder.buildNegateRAStateWithPC()
683 : CFIBuilder.buildNegateRAState();
684
685 // Shadow call stack uses X18, reset it.
686 if (MFI.needsShadowCallStackPrologueEpilogue(MF))
687 CFIBuilder.buildSameValue(Reg: AArch64::X18);
688
689 // Emit .cfi_same_value for callee-saved registers.
690 const std::vector<CalleeSavedInfo> &CSI =
691 MF.getFrameInfo().getCalleeSavedInfo();
692 for (const auto &Info : CSI) {
693 MCRegister Reg = Info.getReg();
694 if (!TRI.regNeedsCFI(Reg, RegToUseForCFI&: Reg))
695 continue;
696 CFIBuilder.buildSameValue(Reg);
697 }
698}
699
700static void emitCalleeSavedRestores(MachineBasicBlock &MBB,
701 MachineBasicBlock::iterator MBBI,
702 bool SVE) {
703 MachineFunction &MF = *MBB.getParent();
704 MachineFrameInfo &MFI = MF.getFrameInfo();
705
706 const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
707 if (CSI.empty())
708 return;
709
710 const TargetSubtargetInfo &STI = MF.getSubtarget();
711 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
712 CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameDestroy);
713
714 for (const auto &Info : CSI) {
715 if (SVE !=
716 (MFI.getStackID(ObjectIdx: Info.getFrameIdx()) == TargetStackID::ScalableVector))
717 continue;
718
719 MCRegister Reg = Info.getReg();
720 if (SVE &&
721 !static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, RegToUseForCFI&: Reg))
722 continue;
723
724 if (!Info.isRestored())
725 continue;
726
727 CFIBuilder.buildRestore(Reg: Info.getReg());
728 }
729}
730
731void AArch64FrameLowering::emitCalleeSavedGPRRestores(
732 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
733 emitCalleeSavedRestores(MBB, MBBI, SVE: false);
734}
735
736void AArch64FrameLowering::emitCalleeSavedSVERestores(
737 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
738 emitCalleeSavedRestores(MBB, MBBI, SVE: true);
739}
740
// Return the maximum possible number of bytes for `Size` due to the
// architectural limit on the size of an SVE register.
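// For example, a `Size` with a scalable component of 32 bytes can occupy at
// most 32 * 16 = 512 real bytes, since the architectural maximum SVE vector
// length is 2048 bits, i.e. 16 times the 128-bit minimum that scalable bytes
// are expressed in.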
743static int64_t upperBound(StackOffset Size) {
744 static const int64_t MAX_BYTES_PER_SCALABLE_BYTE = 16;
745 return Size.getScalable() * MAX_BYTES_PER_SCALABLE_BYTE + Size.getFixed();
746}
747
748void AArch64FrameLowering::allocateStackSpace(
749 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
750 int64_t RealignmentPadding, StackOffset AllocSize, bool NeedsWinCFI,
751 bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset,
752 bool FollowupAllocs) const {
753
754 if (!AllocSize)
755 return;
756
757 DebugLoc DL;
758 MachineFunction &MF = *MBB.getParent();
759 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
760 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
761 AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
762 const MachineFrameInfo &MFI = MF.getFrameInfo();
763
764 const int64_t MaxAlign = MFI.getMaxAlign().value();
765 const uint64_t AndMask = ~(MaxAlign - 1);
766
767 if (!Subtarget.getTargetLowering()->hasInlineStackProbe(MF)) {
768 Register TargetReg = RealignmentPadding
769 ? findScratchNonCalleeSaveRegister(MBB: &MBB)
770 : AArch64::SP;
771 // SUB Xd/SP, SP, AllocSize
772 emitFrameOffset(MBB, MBBI, DL, DestReg: TargetReg, SrcReg: AArch64::SP, Offset: -AllocSize, TII: &TII,
773 MachineInstr::FrameSetup, SetNZCV: false, NeedsWinCFI, HasWinCFI,
774 EmitCFAOffset: EmitCFI, InitialOffset);
775
776 if (RealignmentPadding) {
777 // AND SP, X9, 0b11111...0000
778 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: AArch64::ANDXri), DestReg: AArch64::SP)
779 .addReg(RegNo: TargetReg, flags: RegState::Kill)
780 .addImm(Val: AArch64_AM::encodeLogicalImmediate(imm: AndMask, regSize: 64))
781 .setMIFlags(MachineInstr::FrameSetup);
782 AFI.setStackRealigned(true);
783
784 // No need for SEH instructions here; if we're realigning the stack,
785 // we've set a frame pointer and already finished the SEH prologue.
786 assert(!NeedsWinCFI);
787 }
788 return;
789 }
790
791 //
792 // Stack probing allocation.
793 //
794
795 // Fixed length allocation. If we don't need to re-align the stack and don't
796 // have SVE objects, we can use a more efficient sequence for stack probing.
797 if (AllocSize.getScalable() == 0 && RealignmentPadding == 0) {
798 Register ScratchReg = findScratchNonCalleeSaveRegister(MBB: &MBB);
799 assert(ScratchReg != AArch64::NoRegister);
800 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: AArch64::PROBED_STACKALLOC))
801 .addDef(RegNo: ScratchReg)
802 .addImm(Val: AllocSize.getFixed())
803 .addImm(Val: InitialOffset.getFixed())
804 .addImm(Val: InitialOffset.getScalable());
805 // The fixed allocation may leave unprobed bytes at the top of the
806 // stack. If we have subsequent allocation (e.g. if we have variable-sized
807 // objects), we need to issue an extra probe, so these allocations start in
808 // a known state.
809 if (FollowupAllocs) {
810 // STR XZR, [SP]
811 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: AArch64::STRXui))
812 .addReg(RegNo: AArch64::XZR)
813 .addReg(RegNo: AArch64::SP)
814 .addImm(Val: 0)
815 .setMIFlags(MachineInstr::FrameSetup);
816 }
817
818 return;
819 }
820
821 // Variable length allocation.
822
823 // If the (unknown) allocation size cannot exceed the probe size, decrement
824 // the stack pointer right away.
825 int64_t ProbeSize = AFI.getStackProbeSize();
826 if (upperBound(Size: AllocSize) + RealignmentPadding <= ProbeSize) {
827 Register ScratchReg = RealignmentPadding
828 ? findScratchNonCalleeSaveRegister(MBB: &MBB)
829 : AArch64::SP;
830 assert(ScratchReg != AArch64::NoRegister);
831 // SUB Xd, SP, AllocSize
832 emitFrameOffset(MBB, MBBI, DL, DestReg: ScratchReg, SrcReg: AArch64::SP, Offset: -AllocSize, TII: &TII,
833 MachineInstr::FrameSetup, SetNZCV: false, NeedsWinCFI, HasWinCFI,
834 EmitCFAOffset: EmitCFI, InitialOffset);
835 if (RealignmentPadding) {
836 // AND SP, Xn, 0b11111...0000
837 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: AArch64::ANDXri), DestReg: AArch64::SP)
838 .addReg(RegNo: ScratchReg, flags: RegState::Kill)
839 .addImm(Val: AArch64_AM::encodeLogicalImmediate(imm: AndMask, regSize: 64))
840 .setMIFlags(MachineInstr::FrameSetup);
841 AFI.setStackRealigned(true);
842 }
843 if (FollowupAllocs || upperBound(Size: AllocSize) + RealignmentPadding >
844 AArch64::StackProbeMaxUnprobedStack) {
845 // STR XZR, [SP]
846 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: AArch64::STRXui))
847 .addReg(RegNo: AArch64::XZR)
848 .addReg(RegNo: AArch64::SP)
849 .addImm(Val: 0)
850 .setMIFlags(MachineInstr::FrameSetup);
851 }
852 return;
853 }
854
855 // Emit a variable-length allocation probing loop.
856 // TODO: As an optimisation, the loop can be "unrolled" into a few parts,
857 // each of them guaranteed to adjust the stack by less than the probe size.
858 Register TargetReg = findScratchNonCalleeSaveRegister(MBB: &MBB);
859 assert(TargetReg != AArch64::NoRegister);
860 // SUB Xd, SP, AllocSize
861 emitFrameOffset(MBB, MBBI, DL, DestReg: TargetReg, SrcReg: AArch64::SP, Offset: -AllocSize, TII: &TII,
862 MachineInstr::FrameSetup, SetNZCV: false, NeedsWinCFI, HasWinCFI,
863 EmitCFAOffset: EmitCFI, InitialOffset);
864 if (RealignmentPadding) {
865 // AND Xn, Xn, 0b11111...0000
866 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: AArch64::ANDXri), DestReg: TargetReg)
867 .addReg(RegNo: TargetReg, flags: RegState::Kill)
868 .addImm(Val: AArch64_AM::encodeLogicalImmediate(imm: AndMask, regSize: 64))
869 .setMIFlags(MachineInstr::FrameSetup);
870 }
871
872 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: AArch64::PROBED_STACKALLOC_VAR))
873 .addReg(RegNo: TargetReg);
874 if (EmitCFI) {
875 // Set the CFA register back to SP.
876 CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup)
877 .buildDefCFARegister(Reg: AArch64::SP);
878 }
879 if (RealignmentPadding)
880 AFI.setStackRealigned(true);
881}
882
883static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) {
884 switch (Reg.id()) {
885 default:
886 // The called routine is expected to preserve r19-r28
887 // r29 and r30 are used as frame pointer and link register resp.
888 return 0;
889
890 // GPRs
891#define CASE(n) \
892 case AArch64::W##n: \
893 case AArch64::X##n: \
894 return AArch64::X##n
895 CASE(0);
896 CASE(1);
897 CASE(2);
898 CASE(3);
899 CASE(4);
900 CASE(5);
901 CASE(6);
902 CASE(7);
903 CASE(8);
904 CASE(9);
905 CASE(10);
906 CASE(11);
907 CASE(12);
908 CASE(13);
909 CASE(14);
910 CASE(15);
911 CASE(16);
912 CASE(17);
913 CASE(18);
914#undef CASE
915
916 // FPRs
917#define CASE(n) \
918 case AArch64::B##n: \
919 case AArch64::H##n: \
920 case AArch64::S##n: \
921 case AArch64::D##n: \
922 case AArch64::Q##n: \
923 return HasSVE ? AArch64::Z##n : AArch64::Q##n
924 CASE(0);
925 CASE(1);
926 CASE(2);
927 CASE(3);
928 CASE(4);
929 CASE(5);
930 CASE(6);
931 CASE(7);
932 CASE(8);
933 CASE(9);
934 CASE(10);
935 CASE(11);
936 CASE(12);
937 CASE(13);
938 CASE(14);
939 CASE(15);
940 CASE(16);
941 CASE(17);
942 CASE(18);
943 CASE(19);
944 CASE(20);
945 CASE(21);
946 CASE(22);
947 CASE(23);
948 CASE(24);
949 CASE(25);
950 CASE(26);
951 CASE(27);
952 CASE(28);
953 CASE(29);
954 CASE(30);
955 CASE(31);
956#undef CASE
957 }
958}
959
960void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero,
961 MachineBasicBlock &MBB) const {
962 // Insertion point.
963 MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
964
965 // Fake a debug loc.
966 DebugLoc DL;
967 if (MBBI != MBB.end())
968 DL = MBBI->getDebugLoc();
969
970 const MachineFunction &MF = *MBB.getParent();
971 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
972 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
973
974 BitVector GPRsToZero(TRI.getNumRegs());
975 BitVector FPRsToZero(TRI.getNumRegs());
976 bool HasSVE = STI.isSVEorStreamingSVEAvailable();
977 for (MCRegister Reg : RegsToZero.set_bits()) {
978 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
979 // For GPRs, we only care to clear out the 64-bit register.
980 if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE))
981 GPRsToZero.set(XReg);
982 } else if (AArch64InstrInfo::isFpOrNEON(Reg)) {
983 // For FPRs,
984 if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE))
985 FPRsToZero.set(XReg);
986 }
987 }
988
989 const AArch64InstrInfo &TII = *STI.getInstrInfo();
990
991 // Zero out GPRs.
992 for (MCRegister Reg : GPRsToZero.set_bits())
993 TII.buildClearRegister(Reg, MBB, Iter: MBBI, DL);
994
995 // Zero out FP/vector registers.
996 for (MCRegister Reg : FPRsToZero.set_bits())
997 TII.buildClearRegister(Reg, MBB, Iter: MBBI, DL);
998
999 if (HasSVE) {
1000 for (MCRegister PReg :
1001 {AArch64::P0, AArch64::P1, AArch64::P2, AArch64::P3, AArch64::P4,
1002 AArch64::P5, AArch64::P6, AArch64::P7, AArch64::P8, AArch64::P9,
1003 AArch64::P10, AArch64::P11, AArch64::P12, AArch64::P13, AArch64::P14,
1004 AArch64::P15}) {
1005 if (RegsToZero[PReg])
1006 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: AArch64::PFALSE), DestReg: PReg);
1007 }
1008 }
1009}
1010
1011static bool windowsRequiresStackProbe(const MachineFunction &MF,
1012 uint64_t StackSizeInBytes) {
1013 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1014 const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>();
1015 // TODO: When implementing stack protectors, take that into account
1016 // for the probe threshold.
1017 return Subtarget.isTargetWindows() && MFI.hasStackProbing() &&
1018 StackSizeInBytes >= uint64_t(MFI.getStackProbeSize());
1019}
1020
1021static void getLiveRegsForEntryMBB(LivePhysRegs &LiveRegs,
1022 const MachineBasicBlock &MBB) {
1023 const MachineFunction *MF = MBB.getParent();
1024 LiveRegs.addLiveIns(MBB);
1025 // Mark callee saved registers as used so we will not choose them.
1026 const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
1027 for (unsigned i = 0; CSRegs[i]; ++i)
1028 LiveRegs.addReg(Reg: CSRegs[i]);
1029}
1030
1031// Find a scratch register that we can use at the start of the prologue to
1032// re-align the stack pointer. We avoid using callee-save registers since they
1033// may appear to be free when this is called from canUseAsPrologue (during
1034// shrink wrapping), but then no longer be free when this is called from
1035// emitPrologue.
1036//
1037// FIXME: This is a bit conservative, since in the above case we could use one
1038// of the callee-save registers as a scratch temp to re-align the stack pointer,
1039// but we would then have to make sure that we were in fact saving at least one
1040// callee-save register in the prologue, which is additional complexity that
1041// doesn't seem worth the benefit.
1042static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
1043 bool HasCall) {
1044 MachineFunction *MF = MBB->getParent();
1045
  // If MBB is an entry block, use X9 as the scratch register.
  // preserve_none functions may be using X9 to pass arguments, so for those we
  // prefer to pick an available register below instead.
1049 if (&MF->front() == MBB &&
1050 MF->getFunction().getCallingConv() != CallingConv::PreserveNone)
1051 return AArch64::X9;
1052
1053 const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
1054 const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
1055 LivePhysRegs LiveRegs(TRI);
1056 getLiveRegsForEntryMBB(LiveRegs, MBB: *MBB);
1057 if (HasCall) {
1058 LiveRegs.addReg(Reg: AArch64::X16);
1059 LiveRegs.addReg(Reg: AArch64::X17);
1060 LiveRegs.addReg(Reg: AArch64::X18);
1061 }
1062
1063 // Prefer X9 since it was historically used for the prologue scratch reg.
1064 const MachineRegisterInfo &MRI = MF->getRegInfo();
1065 if (LiveRegs.available(MRI, Reg: AArch64::X9))
1066 return AArch64::X9;
1067
1068 for (unsigned Reg : AArch64::GPR64RegClass) {
1069 if (LiveRegs.available(MRI, Reg))
1070 return Reg;
1071 }
1072 return AArch64::NoRegister;
1073}
1074
1075bool AArch64FrameLowering::canUseAsPrologue(
1076 const MachineBasicBlock &MBB) const {
1077 const MachineFunction *MF = MBB.getParent();
1078 MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
1079 const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
1080 const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1081 const AArch64TargetLowering *TLI = Subtarget.getTargetLowering();
1082 const AArch64FunctionInfo *AFI = MF->getInfo<AArch64FunctionInfo>();
1083
1084 if (AFI->hasSwiftAsyncContext()) {
1085 const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
1086 const MachineRegisterInfo &MRI = MF->getRegInfo();
1087 LivePhysRegs LiveRegs(TRI);
1088 getLiveRegsForEntryMBB(LiveRegs, MBB);
1089 // The StoreSwiftAsyncContext clobbers X16 and X17. Make sure they are
1090 // available.
1091 if (!LiveRegs.available(MRI, Reg: AArch64::X16) ||
1092 !LiveRegs.available(MRI, Reg: AArch64::X17))
1093 return false;
1094 }
1095
1096 // Certain stack probing sequences might clobber flags, then we can't use
1097 // the block as a prologue if the flags register is a live-in.
1098 if (MF->getInfo<AArch64FunctionInfo>()->hasStackProbing() &&
1099 MBB.isLiveIn(Reg: AArch64::NZCV))
1100 return false;
1101
1102 if (RegInfo->hasStackRealignment(MF: *MF) || TLI->hasInlineStackProbe(MF: *MF))
1103 if (findScratchNonCalleeSaveRegister(MBB: TmpMBB) == AArch64::NoRegister)
1104 return false;
1105
  // May need a scratch register (for the return value) if we require making a
  // special call.
1108 if (requiresSaveVG(MF: *MF) ||
1109 windowsRequiresStackProbe(MF: *MF, StackSizeInBytes: std::numeric_limits<uint64_t>::max()))
1110 if (findScratchNonCalleeSaveRegister(MBB: TmpMBB, HasCall: true) == AArch64::NoRegister)
1111 return false;
1112
1113 return true;
1114}
1115
1116static bool needsWinCFI(const MachineFunction &MF) {
1117 const Function &F = MF.getFunction();
1118 return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
1119 F.needsUnwindTableEntry();
1120}
1121
1122bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
1123 MachineFunction &MF, uint64_t StackBumpBytes) const {
1124 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1125 const MachineFrameInfo &MFI = MF.getFrameInfo();
1126 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1127 const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1128 if (homogeneousPrologEpilog(MF))
1129 return false;
1130
1131 if (AFI->getLocalStackSize() == 0)
1132 return false;
1133
1134 // For WinCFI, if optimizing for size, prefer to not combine the stack bump
1135 // (to force a stp with predecrement) to match the packed unwind format,
1136 // provided that there actually are any callee saved registers to merge the
1137 // decrement with.
1138 // This is potentially marginally slower, but allows using the packed
1139 // unwind format for functions that both have a local area and callee saved
1140 // registers. Using the packed unwind format notably reduces the size of
1141 // the unwind info.
1142 if (needsWinCFI(MF) && AFI->getCalleeSavedStackSize() > 0 &&
1143 MF.getFunction().hasOptSize())
1144 return false;
1145
1146 // 512 is the maximum immediate for stp/ldp that will be used for
1147 // callee-save save/restores
1148 if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackSizeInBytes: StackBumpBytes))
1149 return false;
1150
1151 if (MFI.hasVarSizedObjects())
1152 return false;
1153
1154 if (RegInfo->hasStackRealignment(MF))
1155 return false;
1156
1157 // This isn't strictly necessary, but it simplifies things a bit since the
1158 // current RedZone handling code assumes the SP is adjusted by the
1159 // callee-save save/restore code.
1160 if (canUseRedZone(MF))
1161 return false;
1162
1163 // When there is an SVE area on the stack, always allocate the
1164 // callee-saves and spills/locals separately.
1165 if (getSVEStackSize(MF))
1166 return false;
1167
1168 return true;
1169}
1170
1171bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
1172 MachineBasicBlock &MBB, uint64_t StackBumpBytes) const {
1173 if (!shouldCombineCSRLocalStackBump(MF&: *MBB.getParent(), StackBumpBytes))
1174 return false;
1175 if (MBB.empty())
1176 return true;
1177
1178 // Disable combined SP bump if the last instruction is an MTE tag store. It
1179 // is almost always better to merge SP adjustment into those instructions.
1180 MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
1181 MachineBasicBlock::iterator Begin = MBB.begin();
1182 while (LastI != Begin) {
1183 --LastI;
1184 if (LastI->isTransient())
1185 continue;
1186 if (!LastI->getFlag(Flag: MachineInstr::FrameDestroy))
1187 break;
1188 }
1189 switch (LastI->getOpcode()) {
1190 case AArch64::STGloop:
1191 case AArch64::STZGloop:
1192 case AArch64::STGi:
1193 case AArch64::STZGi:
1194 case AArch64::ST2Gi:
1195 case AArch64::STZ2Gi:
1196 return false;
1197 default:
1198 return true;
1199 }
1200 llvm_unreachable("unreachable");
1201}
1202
// Given a load or a store instruction, generate an appropriate unwinding SEH
// code on Windows.
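// For example (illustrative mapping, see the cases below for the exact
// operands): a prologue spill `stp x19, x20, [sp, #16]` gets a matching
// `.seh_save_regp x19, 16` opcode emitted right after it, so the Windows
// unwinder can describe and undo the save.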
1205static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
1206 const TargetInstrInfo &TII,
1207 MachineInstr::MIFlag Flag) {
1208 unsigned Opc = MBBI->getOpcode();
1209 MachineBasicBlock *MBB = MBBI->getParent();
1210 MachineFunction &MF = *MBB->getParent();
1211 DebugLoc DL = MBBI->getDebugLoc();
1212 unsigned ImmIdx = MBBI->getNumOperands() - 1;
1213 int Imm = MBBI->getOperand(i: ImmIdx).getImm();
1214 MachineInstrBuilder MIB;
1215 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1216 const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1217
1218 switch (Opc) {
1219 default:
1220 report_fatal_error(reason: "No SEH Opcode for this instruction");
1221 case AArch64::STR_ZXI:
1222 case AArch64::LDR_ZXI: {
1223 unsigned Reg0 = RegInfo->getSEHRegNum(i: MBBI->getOperand(i: 0).getReg());
1224 MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveZReg))
1225 .addImm(Val: Reg0)
1226 .addImm(Val: Imm)
1227 .setMIFlag(Flag);
1228 break;
1229 }
1230 case AArch64::STR_PXI:
1231 case AArch64::LDR_PXI: {
1232 unsigned Reg0 = RegInfo->getSEHRegNum(i: MBBI->getOperand(i: 0).getReg());
1233 MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SavePReg))
1234 .addImm(Val: Reg0)
1235 .addImm(Val: Imm)
1236 .setMIFlag(Flag);
1237 break;
1238 }
1239 case AArch64::LDPDpost:
1240 Imm = -Imm;
1241 [[fallthrough]];
1242 case AArch64::STPDpre: {
1243 unsigned Reg0 = RegInfo->getSEHRegNum(i: MBBI->getOperand(i: 1).getReg());
1244 unsigned Reg1 = RegInfo->getSEHRegNum(i: MBBI->getOperand(i: 2).getReg());
1245 MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveFRegP_X))
1246 .addImm(Val: Reg0)
1247 .addImm(Val: Reg1)
1248 .addImm(Val: Imm * 8)
1249 .setMIFlag(Flag);
1250 break;
1251 }
1252 case AArch64::LDPXpost:
1253 Imm = -Imm;
1254 [[fallthrough]];
1255 case AArch64::STPXpre: {
1256 Register Reg0 = MBBI->getOperand(i: 1).getReg();
1257 Register Reg1 = MBBI->getOperand(i: 2).getReg();
1258 if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
1259 MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveFPLR_X))
1260 .addImm(Val: Imm * 8)
1261 .setMIFlag(Flag);
1262 else
1263 MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveRegP_X))
1264 .addImm(Val: RegInfo->getSEHRegNum(i: Reg0))
1265 .addImm(Val: RegInfo->getSEHRegNum(i: Reg1))
1266 .addImm(Val: Imm * 8)
1267 .setMIFlag(Flag);
1268 break;
1269 }
1270 case AArch64::LDRDpost:
1271 Imm = -Imm;
1272 [[fallthrough]];
1273 case AArch64::STRDpre: {
1274 unsigned Reg = RegInfo->getSEHRegNum(i: MBBI->getOperand(i: 1).getReg());
1275 MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveFReg_X))
1276 .addImm(Val: Reg)
1277 .addImm(Val: Imm)
1278 .setMIFlag(Flag);
1279 break;
1280 }
1281 case AArch64::LDRXpost:
1282 Imm = -Imm;
1283 [[fallthrough]];
1284 case AArch64::STRXpre: {
1285 unsigned Reg = RegInfo->getSEHRegNum(i: MBBI->getOperand(i: 1).getReg());
1286 MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveReg_X))
1287 .addImm(Val: Reg)
1288 .addImm(Val: Imm)
1289 .setMIFlag(Flag);
1290 break;
1291 }
1292 case AArch64::STPDi:
1293 case AArch64::LDPDi: {
1294 unsigned Reg0 = RegInfo->getSEHRegNum(i: MBBI->getOperand(i: 0).getReg());
1295 unsigned Reg1 = RegInfo->getSEHRegNum(i: MBBI->getOperand(i: 1).getReg());
1296 MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveFRegP))
1297 .addImm(Val: Reg0)
1298 .addImm(Val: Reg1)
1299 .addImm(Val: Imm * 8)
1300 .setMIFlag(Flag);
1301 break;
1302 }
1303 case AArch64::STPXi:
1304 case AArch64::LDPXi: {
1305 Register Reg0 = MBBI->getOperand(i: 0).getReg();
1306 Register Reg1 = MBBI->getOperand(i: 1).getReg();
1307 if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
1308 MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveFPLR))
1309 .addImm(Val: Imm * 8)
1310 .setMIFlag(Flag);
1311 else
1312 MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveRegP))
1313 .addImm(Val: RegInfo->getSEHRegNum(i: Reg0))
1314 .addImm(Val: RegInfo->getSEHRegNum(i: Reg1))
1315 .addImm(Val: Imm * 8)
1316 .setMIFlag(Flag);
1317 break;
1318 }
1319 case AArch64::STRXui:
1320 case AArch64::LDRXui: {
1321 int Reg = RegInfo->getSEHRegNum(i: MBBI->getOperand(i: 0).getReg());
1322 MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveReg))
1323 .addImm(Val: Reg)
1324 .addImm(Val: Imm * 8)
1325 .setMIFlag(Flag);
1326 break;
1327 }
1328 case AArch64::STRDui:
1329 case AArch64::LDRDui: {
1330 unsigned Reg = RegInfo->getSEHRegNum(i: MBBI->getOperand(i: 0).getReg());
1331 MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveFReg))
1332 .addImm(Val: Reg)
1333 .addImm(Val: Imm * 8)
1334 .setMIFlag(Flag);
1335 break;
1336 }
1337 case AArch64::STPQi:
1338 case AArch64::LDPQi: {
1339 unsigned Reg0 = RegInfo->getSEHRegNum(i: MBBI->getOperand(i: 0).getReg());
1340 unsigned Reg1 = RegInfo->getSEHRegNum(i: MBBI->getOperand(i: 1).getReg());
1341 MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveAnyRegQP))
1342 .addImm(Val: Reg0)
1343 .addImm(Val: Reg1)
1344 .addImm(Val: Imm * 16)
1345 .setMIFlag(Flag);
1346 break;
1347 }
1348 case AArch64::LDPQpost:
1349 Imm = -Imm;
1350 [[fallthrough]];
1351 case AArch64::STPQpre: {
1352 unsigned Reg0 = RegInfo->getSEHRegNum(i: MBBI->getOperand(i: 1).getReg());
1353 unsigned Reg1 = RegInfo->getSEHRegNum(i: MBBI->getOperand(i: 2).getReg());
1354 MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveAnyRegQPX))
1355 .addImm(Val: Reg0)
1356 .addImm(Val: Reg1)
1357 .addImm(Val: Imm * 16)
1358 .setMIFlag(Flag);
1359 break;
1360 }
1361 }
1362 auto I = MBB->insertAfter(I: MBBI, MI: MIB);
1363 return I;
1364}
1365
1366// Fix up the SEH opcode associated with the save/restore instruction.
1367static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
1368 unsigned LocalStackSize) {
1369 MachineOperand *ImmOpnd = nullptr;
1370 unsigned ImmIdx = MBBI->getNumOperands() - 1;
1371 switch (MBBI->getOpcode()) {
1372 default:
1373 llvm_unreachable("Fix the offset in the SEH instruction");
1374 case AArch64::SEH_SaveFPLR:
1375 case AArch64::SEH_SaveRegP:
1376 case AArch64::SEH_SaveReg:
1377 case AArch64::SEH_SaveFRegP:
1378 case AArch64::SEH_SaveFReg:
1379 case AArch64::SEH_SaveAnyRegQP:
1380 case AArch64::SEH_SaveAnyRegQPX:
1381 ImmOpnd = &MBBI->getOperand(i: ImmIdx);
1382 break;
1383 }
1384 if (ImmOpnd)
1385 ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
1386}
1387
1388bool requiresGetVGCall(MachineFunction &MF) {
1389 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1390 return AFI->hasStreamingModeChanges() &&
1391 !MF.getSubtarget<AArch64Subtarget>().hasSVE();
1392}
1393
1394static bool requiresSaveVG(const MachineFunction &MF) {
1395 const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1396 // For Darwin platforms we don't save VG for non-SVE functions, even if SME
1397 // is enabled with streaming mode changes.
1398 if (!AFI->hasStreamingModeChanges())
1399 return false;
1400 auto &ST = MF.getSubtarget<AArch64Subtarget>();
1401 if (ST.isTargetDarwin())
1402 return ST.hasSVE();
1403 return true;
1404}
1405
1406bool isVGInstruction(MachineBasicBlock::iterator MBBI) {
1407 unsigned Opc = MBBI->getOpcode();
1408 if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI ||
1409 Opc == AArch64::UBFMXri)
1410 return true;
1411
1412 if (requiresGetVGCall(MF&: *MBBI->getMF())) {
1413 if (Opc == AArch64::ORRXrr)
1414 return true;
1415
1416 if (Opc == AArch64::BL) {
1417 auto Op1 = MBBI->getOperand(i: 0);
1418 return Op1.isSymbol() &&
1419 (StringRef(Op1.getSymbolName()) == "__arm_get_current_vg");
1420 }
1421 }
1422
1423 return false;
1424}
1425
// Convert a callee-save register save/restore instruction into one that also
// performs the stack pointer decrement/increment needed to allocate/deallocate
// the callee-save stack area, by switching the store/load to its pre/post
// increment form.
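// For example (illustrative, with CSStackSizeInc == -64 in the prologue):
//   stp x29, x30, [sp]   ->   stp x29, x30, [sp, #-64]!
// so the first callee-save store also allocates the 64-byte callee-save area,
// and the matching epilogue load becomes a post-increment that frees it again.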
1429static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
1430 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
1431 const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
1432 bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI,
1433 MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup,
1434 int CFAOffset = 0) {
1435 unsigned NewOpc;
1436
1437 // If the function contains streaming mode changes, we expect instructions
1438 // to calculate the value of VG before spilling. For locally-streaming
1439 // functions, we need to do this for both the streaming and non-streaming
1440 // vector length. Move past these instructions if necessary.
1441 MachineFunction &MF = *MBB.getParent();
1442 if (requiresSaveVG(MF))
1443 while (isVGInstruction(MBBI))
1444 ++MBBI;
1445
1446 switch (MBBI->getOpcode()) {
1447 default:
1448 llvm_unreachable("Unexpected callee-save save/restore opcode!");
1449 case AArch64::STPXi:
1450 NewOpc = AArch64::STPXpre;
1451 break;
1452 case AArch64::STPDi:
1453 NewOpc = AArch64::STPDpre;
1454 break;
1455 case AArch64::STPQi:
1456 NewOpc = AArch64::STPQpre;
1457 break;
1458 case AArch64::STRXui:
1459 NewOpc = AArch64::STRXpre;
1460 break;
1461 case AArch64::STRDui:
1462 NewOpc = AArch64::STRDpre;
1463 break;
1464 case AArch64::STRQui:
1465 NewOpc = AArch64::STRQpre;
1466 break;
1467 case AArch64::LDPXi:
1468 NewOpc = AArch64::LDPXpost;
1469 break;
1470 case AArch64::LDPDi:
1471 NewOpc = AArch64::LDPDpost;
1472 break;
1473 case AArch64::LDPQi:
1474 NewOpc = AArch64::LDPQpost;
1475 break;
1476 case AArch64::LDRXui:
1477 NewOpc = AArch64::LDRXpost;
1478 break;
1479 case AArch64::LDRDui:
1480 NewOpc = AArch64::LDRDpost;
1481 break;
1482 case AArch64::LDRQui:
1483 NewOpc = AArch64::LDRQpost;
1484 break;
1485 }
1486 TypeSize Scale = TypeSize::getFixed(ExactSize: 1), Width = TypeSize::getFixed(ExactSize: 0);
1487 int64_t MinOffset, MaxOffset;
1488 bool Success = static_cast<const AArch64InstrInfo *>(TII)->getMemOpInfo(
1489 Opcode: NewOpc, Scale, Width, MinOffset, MaxOffset);
1490 (void)Success;
1491 assert(Success && "unknown load/store opcode");
1492
1493 // If the first store isn't right where we want SP (or the offset is out of
1494 // range), we can't fold the update in, so emit a normal SP adjustment instead.
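  // For example, a 528-byte callee-save area does not fit the scaled
  // [-512, 504] pre-index immediate range of stp, so the stores are left
  // untouched and a separate "sub sp, sp, #528" is emitted instead.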
1495 if (MBBI->getOperand(i: MBBI->getNumOperands() - 1).getImm() != 0 ||
1496 CSStackSizeInc < MinOffset * (int64_t)Scale.getFixedValue() ||
1497 CSStackSizeInc > MaxOffset * (int64_t)Scale.getFixedValue()) {
1498 // If we are destroying the frame, make sure we add the increment after the
1499 // last frame operation.
1500 if (FrameFlag == MachineInstr::FrameDestroy) {
1501 ++MBBI;
1502 // Also skip the SEH instruction, if needed
1503 if (NeedsWinCFI && AArch64InstrInfo::isSEHInstruction(MI: *MBBI))
1504 ++MBBI;
1505 }
1506 emitFrameOffset(MBB, MBBI, DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
1507 Offset: StackOffset::getFixed(Fixed: CSStackSizeInc), TII, FrameFlag,
1508 SetNZCV: false, NeedsWinCFI, HasWinCFI, EmitCFAOffset: EmitCFI,
1509 InitialOffset: StackOffset::getFixed(Fixed: CFAOffset));
1510
1511 return std::prev(x: MBBI);
1512 }
1513
1514 // Get rid of the SEH code associated with the old instruction.
1515 if (NeedsWinCFI) {
1516 auto SEH = std::next(x: MBBI);
1517 if (AArch64InstrInfo::isSEHInstruction(MI: *SEH))
1518 SEH->eraseFromParent();
1519 }
1520
1521 MachineInstrBuilder MIB = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: NewOpc));
1522 MIB.addReg(RegNo: AArch64::SP, flags: RegState::Define);
1523
1524 // Copy all operands other than the immediate offset.
1525 unsigned OpndIdx = 0;
1526 for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
1527 ++OpndIdx)
1528 MIB.add(MO: MBBI->getOperand(i: OpndIdx));
1529
1530 assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
1531 "Unexpected immediate offset in first/last callee-save save/restore "
1532 "instruction!");
1533 assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
1534 "Unexpected base register in callee-save save/restore instruction!");
1535 assert(CSStackSizeInc % Scale == 0);
1536 MIB.addImm(Val: CSStackSizeInc / (int)Scale);
1537
1538 MIB.setMIFlags(MBBI->getFlags());
1539 MIB.setMemRefs(MBBI->memoperands());
1540
1541 // Generate a new SEH code that corresponds to the new instruction.
1542 if (NeedsWinCFI) {
1543 *HasWinCFI = true;
1544 InsertSEH(MBBI: *MIB, TII: *TII, Flag: FrameFlag);
1545 }
1546
1547 if (EmitCFI)
1548 CFIInstBuilder(MBB, MBBI, FrameFlag)
1549 .buildDefCFAOffset(Offset: CFAOffset - CSStackSizeInc);
1550
1551 return std::prev(x: MBB.erase(I: MBBI));
1552}
1553
1554// Fixup callee-save register save/restore instructions to take into account
1555// combined SP bump by adding the local stack size to the stack offsets.
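// For example (illustrative), with a combined SP bump and a 32-byte local area,
// a save emitted as "stp x19, x20, [sp, #16]" is rewritten to
// "stp x19, x20, [sp, #48]": the scaled immediate is bumped by
// LocalStackSize / 8 == 4 slots.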
1556static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
1557 uint64_t LocalStackSize,
1558 bool NeedsWinCFI,
1559 bool *HasWinCFI) {
1560 if (AArch64InstrInfo::isSEHInstruction(MI))
1561 return;
1562
1563 unsigned Opc = MI.getOpcode();
1564 unsigned Scale;
1565 switch (Opc) {
1566 case AArch64::STPXi:
1567 case AArch64::STRXui:
1568 case AArch64::STPDi:
1569 case AArch64::STRDui:
1570 case AArch64::LDPXi:
1571 case AArch64::LDRXui:
1572 case AArch64::LDPDi:
1573 case AArch64::LDRDui:
1574 Scale = 8;
1575 break;
1576 case AArch64::STPQi:
1577 case AArch64::STRQui:
1578 case AArch64::LDPQi:
1579 case AArch64::LDRQui:
1580 Scale = 16;
1581 break;
1582 default:
1583 llvm_unreachable("Unexpected callee-save save/restore opcode!");
1584 }
1585
1586 unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
1587 assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
1588 "Unexpected base register in callee-save save/restore instruction!");
1589 // Last operand is immediate offset that needs fixing.
1590 MachineOperand &OffsetOpnd = MI.getOperand(i: OffsetIdx);
1591 // All generated opcodes have scaled offsets.
1592 assert(LocalStackSize % Scale == 0);
1593 OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);
1594
1595 if (NeedsWinCFI) {
1596 *HasWinCFI = true;
1597 auto MBBI = std::next(x: MachineBasicBlock::iterator(MI));
1598 assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction");
1599 assert(AArch64InstrInfo::isSEHInstruction(*MBBI) &&
1600 "Expecting a SEH instruction");
1601 fixupSEHOpcode(MBBI, LocalStackSize);
1602 }
1603}
1604
1605static bool isTargetWindows(const MachineFunction &MF) {
1606 return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
1607}
1608
1609static unsigned getStackHazardSize(const MachineFunction &MF) {
1610 return MF.getSubtarget<AArch64Subtarget>().getStreamingHazardSize();
1611}
1612
1613// Convenience function to determine whether I is an SVE callee save.
1614static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
1615 switch (I->getOpcode()) {
1616 default:
1617 return false;
1618 case AArch64::PTRUE_C_B:
1619 case AArch64::LD1B_2Z_IMM:
1620 case AArch64::ST1B_2Z_IMM:
1621 case AArch64::STR_ZXI:
1622 case AArch64::STR_PXI:
1623 case AArch64::LDR_ZXI:
1624 case AArch64::LDR_PXI:
1625 case AArch64::PTRUE_B:
1626 case AArch64::CPY_ZPzI_B:
1627 case AArch64::CMPNE_PPzZI_B:
1628 return I->getFlag(Flag: MachineInstr::FrameSetup) ||
1629 I->getFlag(Flag: MachineInstr::FrameDestroy);
1630 case AArch64::SEH_SavePReg:
1631 case AArch64::SEH_SaveZReg:
1632 return true;
1633 }
1634}
1635
1636static void emitShadowCallStackPrologue(const TargetInstrInfo &TII,
1637 MachineFunction &MF,
1638 MachineBasicBlock &MBB,
1639 MachineBasicBlock::iterator MBBI,
1640 const DebugLoc &DL, bool NeedsWinCFI,
1641 bool NeedsUnwindInfo) {
1642 // Shadow call stack prolog: str x30, [x18], #8
1643 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: AArch64::STRXpost))
1644 .addReg(RegNo: AArch64::X18, flags: RegState::Define)
1645 .addReg(RegNo: AArch64::LR)
1646 .addReg(RegNo: AArch64::X18)
1647 .addImm(Val: 8)
1648 .setMIFlag(MachineInstr::FrameSetup);
1649
1650 // This instruction also makes x18 live-in to the entry block.
1651 MBB.addLiveIn(PhysReg: AArch64::X18);
1652
1653 if (NeedsWinCFI)
1654 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_Nop))
1655 .setMIFlag(MachineInstr::FrameSetup);
1656
1657 if (NeedsUnwindInfo) {
1658 // Emit a CFI instruction that causes 8 to be subtracted from the value of
1659 // x18 when unwinding past this frame.
1660 static const char CFIInst[] = {
1661 dwarf::DW_CFA_val_expression,
1662 18, // register
1663 2, // length
1664 static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
1665 static_cast<char>(-8) & 0x7f, // addend (sleb128)
1666 };
1667 CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup)
1668 .buildEscape(Bytes: StringRef(CFIInst, sizeof(CFIInst)));
1669 }
1670}
1671
1672static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII,
1673 MachineFunction &MF,
1674 MachineBasicBlock &MBB,
1675 MachineBasicBlock::iterator MBBI,
1676 const DebugLoc &DL) {
1677 // Shadow call stack epilog: ldr x30, [x18, #-8]!
1678 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: AArch64::LDRXpre))
1679 .addReg(RegNo: AArch64::X18, flags: RegState::Define)
1680 .addReg(RegNo: AArch64::LR, flags: RegState::Define)
1681 .addReg(RegNo: AArch64::X18)
1682 .addImm(Val: -8)
1683 .setMIFlag(MachineInstr::FrameDestroy);
1684
1685 if (MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo(MF))
1686 CFIInstBuilder(MBB, MBBI, MachineInstr::FrameDestroy)
1687 .buildRestore(Reg: AArch64::X18);
1688}
1689
1690// Define the current CFA rule to use the provided FP.
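// With the frame record at the bottom of the GPR callee-save area this
// typically produces something like ".cfi_def_cfa w29, 16"; the exact offset
// also accounts for any fixed-object area below the callee saves.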
1691static void emitDefineCFAWithFP(MachineFunction &MF, MachineBasicBlock &MBB,
1692 MachineBasicBlock::iterator MBBI,
1693 unsigned FixedObject) {
1694 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
1695 const AArch64RegisterInfo *TRI = STI.getRegisterInfo();
1696 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1697
1698 const int OffsetToFirstCalleeSaveFromFP =
1699 AFI->getCalleeSaveBaseToFrameRecordOffset() -
1700 AFI->getCalleeSavedStackSize();
1701 Register FramePtr = TRI->getFrameRegister(MF);
1702 CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup)
1703 .buildDefCFA(Reg: FramePtr, Offset: FixedObject - OffsetToFirstCalleeSaveFromFP);
1704}
1705
1706#ifndef NDEBUG
1707/// Collect live registers from the end of \p MI's parent up to (and including)
1708/// \p MI in \p LiveRegs.
1709static void getLivePhysRegsUpTo(MachineInstr &MI, const TargetRegisterInfo &TRI,
1710 LivePhysRegs &LiveRegs) {
1711
1712 MachineBasicBlock &MBB = *MI.getParent();
1713 LiveRegs.addLiveOuts(MBB);
1714 for (const MachineInstr &MI :
1715 reverse(make_range(MI.getIterator(), MBB.instr_end())))
1716 LiveRegs.stepBackward(MI);
1717}
1718#endif
1719
1720void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
1721 MachineBasicBlock &MBB) const {
1722 MachineBasicBlock::iterator MBBI = MBB.begin();
1723 const MachineFrameInfo &MFI = MF.getFrameInfo();
1724 const Function &F = MF.getFunction();
1725 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1726 const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1727 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1728
1729 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1730 bool EmitCFI = AFI->needsDwarfUnwindInfo(MF);
1731 bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
1732 bool HasFP = hasFP(MF);
1733 bool NeedsWinCFI = needsWinCFI(MF);
1734 bool HasWinCFI = false;
1735 auto Cleanup = make_scope_exit(F: [&]() { MF.setHasWinCFI(HasWinCFI); });
1736
1737 MachineBasicBlock::iterator End = MBB.end();
1738#ifndef NDEBUG
1739 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1740 // Collect live registers from the end of MBB up to the start of the
1741 // existing frame setup instructions.
1742 MachineBasicBlock::iterator NonFrameStart = MBB.begin();
1743 while (NonFrameStart != End &&
1744 NonFrameStart->getFlag(MachineInstr::FrameSetup))
1745 ++NonFrameStart;
1746
1747 LivePhysRegs LiveRegs(*TRI);
1748 if (NonFrameStart != MBB.end()) {
1749 getLivePhysRegsUpTo(*NonFrameStart, *TRI, LiveRegs);
1750 // Ignore registers used for stack management for now.
1751 LiveRegs.removeReg(AArch64::SP);
1752 LiveRegs.removeReg(AArch64::X19);
1753 LiveRegs.removeReg(AArch64::FP);
1754 LiveRegs.removeReg(AArch64::LR);
1755
1756 // X0 will be clobbered by a call to __arm_get_current_vg in the prologue.
1757 // This is necessary to spill VG if required where SVE is unavailable, but
1758 // X0 is preserved around this call.
1759 if (requiresGetVGCall(MF))
1760 LiveRegs.removeReg(AArch64::X0);
1761 }
1762
1763 auto VerifyClobberOnExit = make_scope_exit([&]() {
1764 if (NonFrameStart == MBB.end())
1765 return;
1766 // Check if any newly inserted instructions clobber any of the live registers.
1767 for (MachineInstr &MI :
1768 make_range(MBB.instr_begin(), NonFrameStart->getIterator())) {
1769 for (auto &Op : MI.operands())
1770 if (Op.isReg() && Op.isDef())
1771 assert(!LiveRegs.contains(Op.getReg()) &&
1772 "live register clobbered by inserted prologue instructions");
1773 }
1774 });
1775#endif
1776
1777 bool IsFunclet = MBB.isEHFuncletEntry();
1778
1779 // At this point, we're going to decide whether or not the function uses a
1780 // redzone. In most cases, the function doesn't have a redzone so let's
1781 // assume that's false and set it to true in the case that there's a redzone.
1782 AFI->setHasRedZone(false);
1783
1784 // Debug location must be unknown since the first debug location is used
1785 // to determine the end of the prologue.
1786 DebugLoc DL;
1787
1788 const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
1789 if (MFnI.needsShadowCallStackPrologueEpilogue(MF))
1790 emitShadowCallStackPrologue(TII: *TII, MF, MBB, MBBI, DL, NeedsWinCFI,
1791 NeedsUnwindInfo: MFnI.needsDwarfUnwindInfo(MF));
1792
1793 if (MFnI.shouldSignReturnAddress(MF)) {
1794 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::PAUTH_PROLOGUE))
1795 .setMIFlag(MachineInstr::FrameSetup);
1796 if (NeedsWinCFI)
1797 HasWinCFI = true; // AArch64PointerAuth pass will insert SEH_PACSignLR
1798 }
1799
1800 if (EmitCFI && MFnI.isMTETagged()) {
1801 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::EMITMTETAGGED))
1802 .setMIFlag(MachineInstr::FrameSetup);
1803 }
1804
1805 // We signal the presence of a Swift extended frame to external tools by
1806 // storing FP with 0b0001 in bits 63:60. In normal userland operation a simple
1807 // ORR is sufficient; it is assumed a Swift kernel would initialize the TBI
1808 // bits so that this still holds.
1809 if (HasFP && AFI->hasSwiftAsyncContext()) {
1810 switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
1811 case SwiftAsyncFramePointerMode::DeploymentBased:
1812 if (Subtarget.swiftAsyncContextIsDynamicallySet()) {
1813 // The special symbol below is absolute and has a *value* that can be
1814 // combined with the frame pointer to signal an extended frame.
1815 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::LOADgot), DestReg: AArch64::X16)
1816 .addExternalSymbol(FnName: "swift_async_extendedFramePointerFlags",
1817 TargetFlags: AArch64II::MO_GOT);
1818 if (NeedsWinCFI) {
1819 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_Nop))
1820 .setMIFlags(MachineInstr::FrameSetup);
1821 HasWinCFI = true;
1822 }
1823 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::ORRXrs), DestReg: AArch64::FP)
1824 .addUse(RegNo: AArch64::FP)
1825 .addUse(RegNo: AArch64::X16)
1826 .addImm(Val: Subtarget.isTargetILP32() ? 32 : 0);
1827 if (NeedsWinCFI) {
1828 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_Nop))
1829 .setMIFlags(MachineInstr::FrameSetup);
1830 HasWinCFI = true;
1831 }
1832 break;
1833 }
1834 [[fallthrough]];
1835
1836 case SwiftAsyncFramePointerMode::Always:
1837 // ORR x29, x29, #0x1000_0000_0000_0000
1838 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::ORRXri), DestReg: AArch64::FP)
1839 .addUse(RegNo: AArch64::FP)
1840 .addImm(Val: 0x1100)
1841 .setMIFlag(MachineInstr::FrameSetup);
1842 if (NeedsWinCFI) {
1843 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_Nop))
1844 .setMIFlags(MachineInstr::FrameSetup);
1845 HasWinCFI = true;
1846 }
1847 break;
1848
1849 case SwiftAsyncFramePointerMode::Never:
1850 break;
1851 }
1852 }
1853
1854 // All calls are tail calls in GHC calling conv, and functions have no
1855 // prologue/epilogue.
1856 if (MF.getFunction().getCallingConv() == CallingConv::GHC)
1857 return;
1858
1859 // Set tagged base pointer to the requested stack slot.
1860 // Ideally it should match SP value after prologue.
1861 std::optional<int> TBPI = AFI->getTaggedBasePointerIndex();
1862 if (TBPI)
1863 AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(ObjectIdx: *TBPI));
1864 else
1865 AFI->setTaggedBasePointerOffset(MFI.getStackSize());
1866
1867 const StackOffset &SVEStackSize = getSVEStackSize(MF);
1868
1869 // getStackSize() includes all the locals in its size calculation. We don't
1870 // include these locals when computing the stack size of a funclet, as they
1871 // are allocated in the parent's stack frame and accessed via the frame
1872 // pointer from the funclet. We only save the callee saved registers in the
1873 // funclet, which are really the callee saved registers of the parent
1874 // function, including the funclet.
1875 int64_t NumBytes =
1876 IsFunclet ? getWinEHFuncletFrameSize(MF) : MFI.getStackSize();
1877 if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, StackSizeInBytes: NumBytes)) {
1878 assert(!HasFP && "unexpected function without stack frame but with FP");
1879 assert(!SVEStackSize &&
1880 "unexpected function without stack frame but with SVE objects");
1881 // All of the stack allocation is for locals.
1882 AFI->setLocalStackSize(NumBytes);
1883 if (!NumBytes)
1884 return;
1885 // REDZONE: If the stack size is less than 128 bytes, we don't need
1886 // to actually allocate.
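  // For example, a leaf function with 96 bytes of locals can address them at
  // negative offsets from SP (e.g. "stur x0, [sp, #-16]") without emitting any
  // "sub sp, sp, #..." adjustment at all.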
1887 if (canUseRedZone(MF)) {
1888 AFI->setHasRedZone(true);
1889 ++NumRedZoneFunctions;
1890 } else {
1891 emitFrameOffset(MBB, MBBI, DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
1892 Offset: StackOffset::getFixed(Fixed: -NumBytes), TII,
1893 MachineInstr::FrameSetup, SetNZCV: false, NeedsWinCFI, HasWinCFI: &HasWinCFI);
1894 if (EmitCFI) {
1895 // Label used to tie together the PROLOG_LABEL and the MachineMoves.
1896 MCSymbol *FrameLabel = MF.getContext().createTempSymbol();
1897 // Encode the stack size of the leaf function.
1898 CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup)
1899 .buildDefCFAOffset(Offset: NumBytes, Label: FrameLabel);
1900 }
1901 }
1902
1903 if (NeedsWinCFI) {
1904 HasWinCFI = true;
1905 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_PrologEnd))
1906 .setMIFlag(MachineInstr::FrameSetup);
1907 }
1908
1909 return;
1910 }
1911
1912 bool IsWin64 = Subtarget.isCallingConvWin64(CC: F.getCallingConv(), IsVarArg: F.isVarArg());
1913 unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
1914
1915 // Windows unwind can't represent the required stack adjustments if we have
1916 // both SVE callee-saves and dynamic stack allocations, and the frame
1917 // pointer is before the SVE spills. The allocation of the frame pointer
1918 // must be the last instruction in the prologue so the unwinder can restore
1919 // the stack pointer correctly. (And there isn't any unwind opcode for
1920 // `addvl sp, x29, -17`.)
1921 //
1922 // Because of this, we do spills in the opposite order on Windows: first SVE,
1923 // then GPRs. The main side-effect of this is that it makes accessing
1924 // parameters passed on the stack more expensive.
1925 //
1926 // We could consider rearranging the spills for simpler cases.
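  // Sketch of the resulting Windows prologue order (illustrative): allocate the
  // fixed-object area plus the SVE callee-save area, spill the SVE callee
  // saves, then spill the GPR callee saves (including the frame record), and
  // only then set up FP and allocate the remaining locals.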
1927 bool FPAfterSVECalleeSaves =
1928 Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize();
1929
1930 if (FPAfterSVECalleeSaves && AFI->hasStackHazardSlotIndex())
1931 reportFatalUsageError(reason: "SME hazard padding is not supported on Windows");
1932
1933 auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
1934 // All of the remaining stack allocations are for locals.
1935 AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
1936 bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, StackBumpBytes: NumBytes);
1937 bool HomPrologEpilog = homogeneousPrologEpilog(MF);
1938 if (FPAfterSVECalleeSaves) {
1939 // If we're doing SVE saves first, we need to immediately allocate space
1940 // for fixed objects, then space for the SVE callee saves.
1941 //
1942 // Windows unwind requires that the scalable size is a multiple of 16;
1943 // that's handled when the callee-saved size is computed.
1944 auto SaveSize =
1945 StackOffset::getScalable(Scalable: AFI->getSVECalleeSavedStackSize()) +
1946 StackOffset::getFixed(Fixed: FixedObject);
1947 allocateStackSpace(MBB, MBBI, RealignmentPadding: 0, AllocSize: SaveSize, NeedsWinCFI, HasWinCFI: &HasWinCFI,
1948 /*EmitCFI=*/false, InitialOffset: StackOffset{},
1949 /*FollowupAllocs=*/true);
1950 NumBytes -= FixedObject;
1951
1952 // Now allocate space for the GPR callee saves.
1953 while (MBBI != End && IsSVECalleeSave(I: MBBI))
1954 ++MBBI;
1955 MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
1956 MBB, MBBI, DL, TII, CSStackSizeInc: -AFI->getCalleeSavedStackSize(), NeedsWinCFI,
1957 HasWinCFI: &HasWinCFI, EmitCFI: EmitAsyncCFI);
1958 NumBytes -= AFI->getCalleeSavedStackSize();
1959 } else if (CombineSPBump) {
1960 assert(!SVEStackSize && "Cannot combine SP bump with SVE");
1961 emitFrameOffset(MBB, MBBI, DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
1962 Offset: StackOffset::getFixed(Fixed: -NumBytes), TII,
1963 MachineInstr::FrameSetup, SetNZCV: false, NeedsWinCFI, HasWinCFI: &HasWinCFI,
1964 EmitCFAOffset: EmitAsyncCFI);
1965 NumBytes = 0;
1966 } else if (HomPrologEpilog) {
1967 // Stack has been already adjusted.
1968 NumBytes -= PrologueSaveSize;
1969 } else if (PrologueSaveSize != 0) {
1970 MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
1971 MBB, MBBI, DL, TII, CSStackSizeInc: -PrologueSaveSize, NeedsWinCFI, HasWinCFI: &HasWinCFI,
1972 EmitCFI: EmitAsyncCFI);
1973 NumBytes -= PrologueSaveSize;
1974 }
1975 assert(NumBytes >= 0 && "Negative stack allocation size!?");
1976
1977 // Move past the saves of the callee-saved registers, fixing up the offsets
1978 // and pre-inc if we decided to combine the callee-save and local stack
1979 // pointer bump above.
1980 while (MBBI != End && MBBI->getFlag(Flag: MachineInstr::FrameSetup) &&
1981 !IsSVECalleeSave(I: MBBI)) {
1982 if (CombineSPBump &&
1983 // Only fix-up frame-setup load/store instructions.
1984 (!requiresSaveVG(MF) || !isVGInstruction(MBBI)))
1985 fixupCalleeSaveRestoreStackOffset(MI&: *MBBI, LocalStackSize: AFI->getLocalStackSize(),
1986 NeedsWinCFI, HasWinCFI: &HasWinCFI);
1987 ++MBBI;
1988 }
1989
1990 // For funclets the FP belongs to the containing function.
1991 if (!IsFunclet && HasFP) {
1992 // Only set up FP if we actually need to.
1993 int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset();
1994
1995 if (CombineSPBump)
1996 FPOffset += AFI->getLocalStackSize();
1997
1998 if (AFI->hasSwiftAsyncContext()) {
1999 // Before we update the live FP we have to ensure there's a valid (or
2000 // null) asynchronous context in its slot just before FP in the frame
2001 // record, so store it now.
2002 const auto &Attrs = MF.getFunction().getAttributes();
2003 bool HaveInitialContext = Attrs.hasAttrSomewhere(Kind: Attribute::SwiftAsync);
2004 if (HaveInitialContext)
2005 MBB.addLiveIn(PhysReg: AArch64::X22);
2006 Register Reg = HaveInitialContext ? AArch64::X22 : AArch64::XZR;
2007 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::StoreSwiftAsyncContext))
2008 .addUse(RegNo: Reg)
2009 .addUse(RegNo: AArch64::SP)
2010 .addImm(Val: FPOffset - 8)
2011 .setMIFlags(MachineInstr::FrameSetup);
2012 if (NeedsWinCFI) {
2013 // WinCFI and arm64e, where StoreSwiftAsyncContext is expanded
2014 // to multiple instructions, should be mutually-exclusive.
2015 assert(Subtarget.getTargetTriple().getArchName() != "arm64e");
2016 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_Nop))
2017 .setMIFlags(MachineInstr::FrameSetup);
2018 HasWinCFI = true;
2019 }
2020 }
2021
2022 if (HomPrologEpilog) {
2023 auto Prolog = MBBI;
2024 --Prolog;
2025 assert(Prolog->getOpcode() == AArch64::HOM_Prolog);
2026 Prolog->addOperand(Op: MachineOperand::CreateImm(Val: FPOffset));
2027 } else {
2028 // Issue sub fp, sp, FPOffset or
2029 // mov fp, sp when FPOffset is zero.
2030 // Note: All stores of callee-saved registers are marked as "FrameSetup".
2031 // This code marks the instruction(s) that set the FP also.
2032 emitFrameOffset(MBB, MBBI, DL, DestReg: AArch64::FP, SrcReg: AArch64::SP,
2033 Offset: StackOffset::getFixed(Fixed: FPOffset), TII,
2034 MachineInstr::FrameSetup, SetNZCV: false, NeedsWinCFI, HasWinCFI: &HasWinCFI);
2035 if (NeedsWinCFI && HasWinCFI) {
2036 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_PrologEnd))
2037 .setMIFlag(MachineInstr::FrameSetup);
2038 // After setting up the FP, the rest of the prolog doesn't need to be
2039 // included in the SEH unwind info.
2040 NeedsWinCFI = false;
2041 }
2042 }
2043 if (EmitAsyncCFI)
2044 emitDefineCFAWithFP(MF, MBB, MBBI, FixedObject);
2045 }
2046
2047 // Now emit the moves for whatever callee saved regs we have (including FP,
2048 // LR if those are saved). Frame instructions for SVE register are emitted
2049 // later, after the instruction which actually save SVE regs.
2050 if (EmitAsyncCFI)
2051 emitCalleeSavedGPRLocations(MBB, MBBI);
2052
2053 // Alignment is required for the parent frame, not the funclet
2054 const bool NeedsRealignment =
2055 NumBytes && !IsFunclet && RegInfo->hasStackRealignment(MF);
2056 const int64_t RealignmentPadding =
2057 (NeedsRealignment && MFI.getMaxAlign() > Align(16))
2058 ? MFI.getMaxAlign().value() - 16
2059 : 0;
2060
2061 if (windowsRequiresStackProbe(MF, StackSizeInBytes: NumBytes + RealignmentPadding)) {
2062 if (AFI->getSVECalleeSavedStackSize())
2063 report_fatal_error(
2064 reason: "SVE callee saves not yet supported with stack probing");
2065
2066 // Find an available register to spill the value of X15 to, if X15 is being
2067 // used already for nest.
2068 unsigned X15Scratch = AArch64::NoRegister;
2069 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
2070 if (llvm::any_of(Range: MBB.liveins(),
2071 P: [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
2072 return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
2073 RegA: AArch64::X15, RegB: LiveIn.PhysReg);
2074 })) {
2075 X15Scratch = findScratchNonCalleeSaveRegister(MBB: &MBB, HasCall: true);
2076 assert(X15Scratch != AArch64::NoRegister &&
2077 (X15Scratch < AArch64::X15 || X15Scratch > AArch64::X17));
2078#ifndef NDEBUG
2079 LiveRegs.removeReg(AArch64::X15); // ignore X15 since we restore it
2080#endif
2081 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::ORRXrr), DestReg: X15Scratch)
2082 .addReg(RegNo: AArch64::XZR)
2083 .addReg(RegNo: AArch64::X15, flags: RegState::Undef)
2084 .addReg(RegNo: AArch64::X15, flags: RegState::Implicit)
2085 .setMIFlag(MachineInstr::FrameSetup);
2086 }
2087
2088 uint64_t NumWords = (NumBytes + RealignmentPadding) >> 4;
2089 if (NeedsWinCFI) {
2090 HasWinCFI = true;
2091 // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
2092 // exceed this amount. We need to move at most 2^24 - 1 into x15.
2093 // This is at most two instructions, MOVZ followed by MOVK.
2094 // TODO: Fix to use multiple stack alloc unwind codes for stacks
2095 // exceeding 256MB in size.
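  // For example, NumBytes = 0x1234560 gives NumWords = 0x123456, which is
  // materialized roughly as:
  //   movz x15, #0x3456
  //   movk x15, #0x12, lsl #16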
2096 if (NumBytes >= (1 << 28))
2097 report_fatal_error(reason: "Stack size cannot exceed 256MB for stack "
2098 "unwinding purposes");
2099
2100 uint32_t LowNumWords = NumWords & 0xFFFF;
2101 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::MOVZXi), DestReg: AArch64::X15)
2102 .addImm(Val: LowNumWords)
2103 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0))
2104 .setMIFlag(MachineInstr::FrameSetup);
2105 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_Nop))
2106 .setMIFlag(MachineInstr::FrameSetup);
2107 if ((NumWords & 0xFFFF0000) != 0) {
2108 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::MOVKXi), DestReg: AArch64::X15)
2109 .addReg(RegNo: AArch64::X15)
2110 .addImm(Val: (NumWords & 0xFFFF0000) >> 16) // High half
2111 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 16))
2112 .setMIFlag(MachineInstr::FrameSetup);
2113 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_Nop))
2114 .setMIFlag(MachineInstr::FrameSetup);
2115 }
2116 } else {
2117 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::MOVi64imm), DestReg: AArch64::X15)
2118 .addImm(Val: NumWords)
2119 .setMIFlags(MachineInstr::FrameSetup);
2120 }
2121
2122 const char *ChkStk = Subtarget.getChkStkName();
2123 switch (MF.getTarget().getCodeModel()) {
2124 case CodeModel::Tiny:
2125 case CodeModel::Small:
2126 case CodeModel::Medium:
2127 case CodeModel::Kernel:
2128 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::BL))
2129 .addExternalSymbol(FnName: ChkStk)
2130 .addReg(RegNo: AArch64::X15, flags: RegState::Implicit)
2131 .addReg(RegNo: AArch64::X16, flags: RegState::Implicit | RegState::Define | RegState::Dead)
2132 .addReg(RegNo: AArch64::X17, flags: RegState::Implicit | RegState::Define | RegState::Dead)
2133 .addReg(RegNo: AArch64::NZCV, flags: RegState::Implicit | RegState::Define | RegState::Dead)
2134 .setMIFlags(MachineInstr::FrameSetup);
2135 if (NeedsWinCFI) {
2136 HasWinCFI = true;
2137 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_Nop))
2138 .setMIFlag(MachineInstr::FrameSetup);
2139 }
2140 break;
2141 case CodeModel::Large:
2142 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::MOVaddrEXT))
2143 .addReg(RegNo: AArch64::X16, flags: RegState::Define)
2144 .addExternalSymbol(FnName: ChkStk)
2145 .addExternalSymbol(FnName: ChkStk)
2146 .setMIFlags(MachineInstr::FrameSetup);
2147 if (NeedsWinCFI) {
2148 HasWinCFI = true;
2149 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_Nop))
2150 .setMIFlag(MachineInstr::FrameSetup);
2151 }
2152
2153 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: getBLRCallOpcode(MF)))
2154 .addReg(RegNo: AArch64::X16, flags: RegState::Kill)
2155 .addReg(RegNo: AArch64::X15, flags: RegState::Implicit | RegState::Define)
2156 .addReg(RegNo: AArch64::X16, flags: RegState::Implicit | RegState::Define | RegState::Dead)
2157 .addReg(RegNo: AArch64::X17, flags: RegState::Implicit | RegState::Define | RegState::Dead)
2158 .addReg(RegNo: AArch64::NZCV, flags: RegState::Implicit | RegState::Define | RegState::Dead)
2159 .setMIFlags(MachineInstr::FrameSetup);
2160 if (NeedsWinCFI) {
2161 HasWinCFI = true;
2162 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_Nop))
2163 .setMIFlag(MachineInstr::FrameSetup);
2164 }
2165 break;
2166 }
2167
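  // The allocation itself: subtract NumWords * 16 bytes from SP (roughly
  // "sub sp, sp, x15, lsl #4"); the chkstk helper above has already probed
  // every page of that region.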
2168 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SUBXrx64), DestReg: AArch64::SP)
2169 .addReg(RegNo: AArch64::SP, flags: RegState::Kill)
2170 .addReg(RegNo: AArch64::X15, flags: RegState::Kill)
2171 .addImm(Val: AArch64_AM::getArithExtendImm(ET: AArch64_AM::UXTX, Imm: 4))
2172 .setMIFlags(MachineInstr::FrameSetup);
2173 if (NeedsWinCFI) {
2174 HasWinCFI = true;
2175 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_StackAlloc))
2176 .addImm(Val: NumBytes)
2177 .setMIFlag(MachineInstr::FrameSetup);
2178 }
2179 NumBytes = 0;
2180
2181 if (RealignmentPadding > 0) {
2182 if (RealignmentPadding >= 4096) {
2183 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::MOVi64imm))
2184 .addReg(RegNo: AArch64::X16, flags: RegState::Define)
2185 .addImm(Val: RealignmentPadding)
2186 .setMIFlags(MachineInstr::FrameSetup);
2187 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::ADDXrx64), DestReg: AArch64::X15)
2188 .addReg(RegNo: AArch64::SP)
2189 .addReg(RegNo: AArch64::X16, flags: RegState::Kill)
2190 .addImm(Val: AArch64_AM::getArithExtendImm(ET: AArch64_AM::UXTX, Imm: 0))
2191 .setMIFlag(MachineInstr::FrameSetup);
2192 } else {
2193 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::ADDXri), DestReg: AArch64::X15)
2194 .addReg(RegNo: AArch64::SP)
2195 .addImm(Val: RealignmentPadding)
2196 .addImm(Val: 0)
2197 .setMIFlag(MachineInstr::FrameSetup);
2198 }
2199
2200 uint64_t AndMask = ~(MFI.getMaxAlign().value() - 1);
2201 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::ANDXri), DestReg: AArch64::SP)
2202 .addReg(RegNo: AArch64::X15, flags: RegState::Kill)
2203 .addImm(Val: AArch64_AM::encodeLogicalImmediate(imm: AndMask, regSize: 64));
2204 AFI->setStackRealigned(true);
2205
2206 // No need for SEH instructions here; if we're realigning the stack,
2207 // we've set a frame pointer and already finished the SEH prologue.
2208 assert(!NeedsWinCFI);
2209 }
2210 if (X15Scratch != AArch64::NoRegister) {
2211 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::ORRXrr), DestReg: AArch64::X15)
2212 .addReg(RegNo: AArch64::XZR)
2213 .addReg(RegNo: X15Scratch, flags: RegState::Undef)
2214 .addReg(RegNo: X15Scratch, flags: RegState::Implicit)
2215 .setMIFlag(MachineInstr::FrameSetup);
2216 }
2217 }
2218
2219 StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize;
2220 MachineBasicBlock::iterator CalleeSavesEnd = MBBI;
2221
2222 StackOffset CFAOffset =
2223 StackOffset::getFixed(Fixed: (int64_t)MFI.getStackSize() - NumBytes);
2224
2225 // Process the SVE callee-saves to determine what space needs to be
2226 // allocated.
2227 if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
2228 LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize
2229 << "\n");
2230 SVECalleeSavesSize = StackOffset::getScalable(Scalable: CalleeSavedSize);
2231 SVELocalsSize = SVEStackSize - SVECalleeSavesSize;
2232 // Find callee save instructions in frame.
2233 // Note: With FPAfterSVECalleeSaves the callee saves have already been
2234 // allocated.
2235 if (!FPAfterSVECalleeSaves) {
2236 MachineBasicBlock::iterator CalleeSavesBegin = MBBI;
2237 assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
2238 while (IsSVECalleeSave(I: MBBI) && MBBI != MBB.getFirstTerminator())
2239 ++MBBI;
2240 CalleeSavesEnd = MBBI;
2241
2242 StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(Fixed: NumBytes);
2243 // Allocate space for the callee saves (if any).
2244 allocateStackSpace(MBB, MBBI: CalleeSavesBegin, RealignmentPadding: 0, AllocSize: SVECalleeSavesSize, NeedsWinCFI: false,
2245 HasWinCFI: nullptr, EmitCFI: EmitAsyncCFI && !HasFP, InitialOffset: CFAOffset,
2246 FollowupAllocs: MFI.hasVarSizedObjects() || LocalsSize);
2247 }
2248 }
2249 CFAOffset += SVECalleeSavesSize;
2250
2251 if (EmitAsyncCFI)
2252 emitCalleeSavedSVELocations(MBB, MBBI: CalleeSavesEnd);
2253
2254 // Allocate space for the rest of the frame including SVE locals. Align the
2255 // stack as necessary.
2256 assert(!(canUseRedZone(MF) && NeedsRealignment) &&
2257 "Cannot use redzone with stack realignment");
2258 if (!canUseRedZone(MF)) {
2259 // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
2260 // the correct value here, as NumBytes also includes padding bytes,
2261 // which shouldn't be counted here.
2262 allocateStackSpace(MBB, MBBI: CalleeSavesEnd, RealignmentPadding,
2263 AllocSize: SVELocalsSize + StackOffset::getFixed(Fixed: NumBytes),
2264 NeedsWinCFI, HasWinCFI: &HasWinCFI, EmitCFI: EmitAsyncCFI && !HasFP,
2265 InitialOffset: CFAOffset, FollowupAllocs: MFI.hasVarSizedObjects());
2266 }
2267
2268 // If we need a base pointer, set it up here. It's whatever the value of the
2269 // stack pointer is at this point. Any variable size objects will be allocated
2270 // after this, so we can still use the base pointer to reference locals.
2271 //
2272 // FIXME: Clarify FrameSetup flags here.
2273 // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
2274 // needed.
2275 // For funclets the BP belongs to the containing function.
2276 if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
2277 TII->copyPhysReg(MBB, MI: MBBI, DL, DestReg: RegInfo->getBaseRegister(), SrcReg: AArch64::SP,
2278 KillSrc: false);
2279 if (NeedsWinCFI) {
2280 HasWinCFI = true;
2281 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_Nop))
2282 .setMIFlag(MachineInstr::FrameSetup);
2283 }
2284 }
2285
2286 // The very last FrameSetup instruction indicates the end of prologue. Emit a
2287 // SEH opcode indicating the prologue end.
2288 if (NeedsWinCFI && HasWinCFI) {
2289 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_PrologEnd))
2290 .setMIFlag(MachineInstr::FrameSetup);
2291 }
2292
2293 // SEH funclets are passed the frame pointer in X1. If the parent
2294 // function uses the base register, then the base register is used
2295 // directly, and is not retrieved from X1.
2296 if (IsFunclet && F.hasPersonalityFn()) {
2297 EHPersonality Per = classifyEHPersonality(Pers: F.getPersonalityFn());
2298 if (isAsynchronousEHPersonality(Pers: Per)) {
2299 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: AArch64::FP)
2300 .addReg(RegNo: AArch64::X1)
2301 .setMIFlag(MachineInstr::FrameSetup);
2302 MBB.addLiveIn(PhysReg: AArch64::X1);
2303 }
2304 }
2305
2306 if (EmitCFI && !EmitAsyncCFI) {
2307 if (HasFP) {
2308 emitDefineCFAWithFP(MF, MBB, MBBI, FixedObject);
2309 } else {
2310 StackOffset TotalSize =
2311 SVEStackSize + StackOffset::getFixed(Fixed: (int64_t)MFI.getStackSize());
2312 CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
2313 CFIBuilder.insertCFIInst(
2314 CFIInst: createDefCFA(TRI: *RegInfo, /*FrameReg=*/AArch64::SP, /*Reg=*/AArch64::SP,
2315 Offset: TotalSize, /*LastAdjustmentWasScalable=*/false));
2316 }
2317 emitCalleeSavedGPRLocations(MBB, MBBI);
2318 emitCalleeSavedSVELocations(MBB, MBBI);
2319 }
2320}
2321
2322static bool isFuncletReturnInstr(const MachineInstr &MI) {
2323 switch (MI.getOpcode()) {
2324 default:
2325 return false;
2326 case AArch64::CATCHRET:
2327 case AArch64::CLEANUPRET:
2328 return true;
2329 }
2330}
2331
2332void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
2333 MachineBasicBlock &MBB) const {
2334 MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
2335 MachineFrameInfo &MFI = MF.getFrameInfo();
2336 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2337 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2338 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
2339 DebugLoc DL;
2340 bool NeedsWinCFI = needsWinCFI(MF);
2341 bool EmitCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
2342 bool HasWinCFI = false;
2343 bool IsFunclet = false;
2344
2345 if (MBB.end() != MBBI) {
2346 DL = MBBI->getDebugLoc();
2347 IsFunclet = isFuncletReturnInstr(MI: *MBBI);
2348 }
2349
2350 MachineBasicBlock::iterator EpilogStartI = MBB.end();
2351
2352 auto FinishingTouches = make_scope_exit(F: [&]() {
2353 if (AFI->shouldSignReturnAddress(MF)) {
2354 BuildMI(BB&: MBB, I: MBB.getFirstTerminator(), MIMD: DL,
2355 MCID: TII->get(Opcode: AArch64::PAUTH_EPILOGUE))
2356 .setMIFlag(MachineInstr::FrameDestroy);
2357 if (NeedsWinCFI)
2358 HasWinCFI = true; // AArch64PointerAuth pass will insert SEH_PACSignLR
2359 }
2360 if (AFI->needsShadowCallStackPrologueEpilogue(MF))
2361 emitShadowCallStackEpilogue(TII: *TII, MF, MBB, MBBI: MBB.getFirstTerminator(), DL);
2362 if (EmitCFI)
2363 emitCalleeSavedGPRRestores(MBB, MBBI: MBB.getFirstTerminator());
2364 if (HasWinCFI) {
2365 BuildMI(BB&: MBB, I: MBB.getFirstTerminator(), MIMD: DL,
2366 MCID: TII->get(Opcode: AArch64::SEH_EpilogEnd))
2367 .setMIFlag(MachineInstr::FrameDestroy);
2368 if (!MF.hasWinCFI())
2369 MF.setHasWinCFI(true);
2370 }
2371 if (NeedsWinCFI) {
2372 assert(EpilogStartI != MBB.end());
2373 if (!HasWinCFI)
2374 MBB.erase(I: EpilogStartI);
2375 }
2376 });
2377
2378 int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
2379 : MFI.getStackSize();
2380
2381 // All calls are tail calls in GHC calling conv, and functions have no
2382 // prologue/epilogue.
2383 if (MF.getFunction().getCallingConv() == CallingConv::GHC)
2384 return;
2385
2386 // How much of the stack used by incoming arguments this function is expected
2387 // to restore in this particular epilogue.
2388 int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB);
2389 bool IsWin64 = Subtarget.isCallingConvWin64(CC: MF.getFunction().getCallingConv(),
2390 IsVarArg: MF.getFunction().isVarArg());
2391 unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
2392
2393 int64_t AfterCSRPopSize = ArgumentStackToRestore;
2394 auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
2395 // We cannot rely on the local stack size set in emitPrologue if the function
2396 // has funclets, as funclets have different local stack size requirements, and
2397 // the current value set in emitPrologue may be that of the containing
2398 // function.
2399 if (MF.hasEHFunclets())
2400 AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
2401 if (homogeneousPrologEpilog(MF, Exit: &MBB)) {
2402 assert(!NeedsWinCFI);
2403 auto LastPopI = MBB.getFirstTerminator();
2404 if (LastPopI != MBB.begin()) {
2405 auto HomogeneousEpilog = std::prev(x: LastPopI);
2406 if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog)
2407 LastPopI = HomogeneousEpilog;
2408 }
2409
2410 // Adjust local stack
2411 emitFrameOffset(MBB, MBBI: LastPopI, DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
2412 Offset: StackOffset::getFixed(Fixed: AFI->getLocalStackSize()), TII,
2413 MachineInstr::FrameDestroy, SetNZCV: false, NeedsWinCFI, HasWinCFI: &HasWinCFI);
2414
2415 // SP has already been adjusted while restoring the callee-save regs.
2416 // We've already bailed out of the case that adjusts SP for arguments.
2417 assert(AfterCSRPopSize == 0);
2418 return;
2419 }
2420
2421 bool FPAfterSVECalleeSaves =
2422 Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize();
2423
2424 bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, StackBumpBytes: NumBytes);
2425 // Assume we can't combine the last pop with the sp restore.
2426 bool CombineAfterCSRBump = false;
2427 if (FPAfterSVECalleeSaves) {
2428 AfterCSRPopSize += FixedObject;
2429 } else if (!CombineSPBump && PrologueSaveSize != 0) {
2430 MachineBasicBlock::iterator Pop = std::prev(x: MBB.getFirstTerminator());
2431 while (Pop->getOpcode() == TargetOpcode::CFI_INSTRUCTION ||
2432 AArch64InstrInfo::isSEHInstruction(MI: *Pop))
2433 Pop = std::prev(x: Pop);
2434 // Converting the last ldp to a post-index ldp is valid only if the last
2435 // ldp's offset is 0.
2436 const MachineOperand &OffsetOp = Pop->getOperand(i: Pop->getNumOperands() - 1);
2437 // If the offset is 0 and the AfterCSR pop is not actually trying to
2438 // allocate more stack for arguments (in space that an untimely interrupt
2439 // may clobber), convert it to a post-index ldp.
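  // For example (illustrative registers), "ldp x19, x20, [sp]" followed by a
  // separate "add sp, sp, #32" becomes the single post-index restore
  // "ldp x19, x20, [sp], #32".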
2440 if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) {
2441 convertCalleeSaveRestoreToSPPrePostIncDec(
2442 MBB, MBBI: Pop, DL, TII, CSStackSizeInc: PrologueSaveSize, NeedsWinCFI, HasWinCFI: &HasWinCFI, EmitCFI,
2443 FrameFlag: MachineInstr::FrameDestroy, CFAOffset: PrologueSaveSize);
2444 } else {
2445 // If not, make sure to emit an add after the last ldp.
2446 // We're doing this by transferring the size to be restored from the
2447 // adjustment *before* the CSR pops to the adjustment *after* the CSR
2448 // pops.
2449 AfterCSRPopSize += PrologueSaveSize;
2450 CombineAfterCSRBump = true;
2451 }
2452 }
2453
2454 // Move past the restores of the callee-saved registers.
2455 // If we plan on combining the sp bump of the local stack size and the callee
2456 // save stack size, we might need to adjust the CSR save and restore offsets.
2457 MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
2458 MachineBasicBlock::iterator Begin = MBB.begin();
2459 while (LastPopI != Begin) {
2460 --LastPopI;
2461 if (!LastPopI->getFlag(Flag: MachineInstr::FrameDestroy) ||
2462 (!FPAfterSVECalleeSaves && IsSVECalleeSave(I: LastPopI))) {
2463 ++LastPopI;
2464 break;
2465 } else if (CombineSPBump)
2466 fixupCalleeSaveRestoreStackOffset(MI&: *LastPopI, LocalStackSize: AFI->getLocalStackSize(),
2467 NeedsWinCFI, HasWinCFI: &HasWinCFI);
2468 }
2469
2470 if (NeedsWinCFI) {
2471 // Note that there are cases where we insert SEH opcodes in the
2472 // epilogue when we had no SEH opcodes in the prologue. For
2473 // example, when there is no stack frame but there are stack
2474 // arguments. Insert the SEH_EpilogStart and remove it later if we
2475 // didn't emit any SEH opcodes, to avoid generating WinCFI for
2476 // functions that don't need it.
2477 BuildMI(BB&: MBB, I: LastPopI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_EpilogStart))
2478 .setMIFlag(MachineInstr::FrameDestroy);
2479 EpilogStartI = LastPopI;
2480 --EpilogStartI;
2481 }
2482
2483 if (hasFP(MF) && AFI->hasSwiftAsyncContext()) {
2484 switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
2485 case SwiftAsyncFramePointerMode::DeploymentBased:
2486 // Avoid the reload as it is GOT relative, and instead fall back to the
2487 // hardcoded value below. This allows a mismatch between the OS and
2488 // application without immediately terminating on the difference.
2489 [[fallthrough]];
2490 case SwiftAsyncFramePointerMode::Always:
2491 // We need to reset FP to its untagged state on return. Bit 60 is
2492 // currently used to show the presence of an extended frame.
2493
2494 // BIC x29, x29, #0x1000_0000_0000_0000
2495 BuildMI(BB&: MBB, I: MBB.getFirstTerminator(), MIMD: DL, MCID: TII->get(Opcode: AArch64::ANDXri),
2496 DestReg: AArch64::FP)
2497 .addUse(RegNo: AArch64::FP)
2498 .addImm(Val: 0x10fe)
2499 .setMIFlag(MachineInstr::FrameDestroy);
2500 if (NeedsWinCFI) {
2501 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_Nop))
2502 .setMIFlags(MachineInstr::FrameDestroy);
2503 HasWinCFI = true;
2504 }
2505 break;
2506
2507 case SwiftAsyncFramePointerMode::Never:
2508 break;
2509 }
2510 }
2511
2512 const StackOffset &SVEStackSize = getSVEStackSize(MF);
2513
2514 // If there is a single SP update, insert it before the ret and we're done.
2515 if (CombineSPBump) {
2516 assert(!SVEStackSize && "Cannot combine SP bump with SVE");
2517
2518 // When we are about to restore the CSRs, the CFA register is SP again.
2519 if (EmitCFI && hasFP(MF))
2520 CFIInstBuilder(MBB, LastPopI, MachineInstr::FrameDestroy)
2521 .buildDefCFA(Reg: AArch64::SP, Offset: NumBytes);
2522
2523 emitFrameOffset(MBB, MBBI: MBB.getFirstTerminator(), DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
2524 Offset: StackOffset::getFixed(Fixed: NumBytes + (int64_t)AfterCSRPopSize),
2525 TII, MachineInstr::FrameDestroy, SetNZCV: false, NeedsWinCFI,
2526 HasWinCFI: &HasWinCFI, EmitCFAOffset: EmitCFI, InitialOffset: StackOffset::getFixed(Fixed: NumBytes));
2527 return;
2528 }
2529
2530 NumBytes -= PrologueSaveSize;
2531 assert(NumBytes >= 0 && "Negative stack allocation size!?");
2532
2533 // Process the SVE callee-saves to determine what space needs to be
2534 // deallocated.
2535 StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
2536 MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
2537 if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
2538 if (FPAfterSVECalleeSaves)
2539 RestoreEnd = MBB.getFirstTerminator();
2540
2541 RestoreBegin = std::prev(x: RestoreEnd);
2542 while (RestoreBegin != MBB.begin() &&
2543 IsSVECalleeSave(I: std::prev(x: RestoreBegin)))
2544 --RestoreBegin;
2545
2546 assert(IsSVECalleeSave(RestoreBegin) &&
2547 IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
2548
2549 StackOffset CalleeSavedSizeAsOffset =
2550 StackOffset::getScalable(Scalable: CalleeSavedSize);
2551 DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
2552 DeallocateAfter = CalleeSavedSizeAsOffset;
2553 }
2554
2555 // Deallocate the SVE area.
2556 if (FPAfterSVECalleeSaves) {
2557 // If the callee-save area is before FP, restoring the FP implicitly
2558 // deallocates non-callee-save SVE allocations. Otherwise, deallocate
2559 // them explicitly.
2560 if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) {
2561 emitFrameOffset(MBB, MBBI: LastPopI, DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
2562 Offset: DeallocateBefore, TII, MachineInstr::FrameDestroy, SetNZCV: false,
2563 NeedsWinCFI, HasWinCFI: &HasWinCFI);
2564 }
2565
2566 // Deallocate callee-save non-SVE registers.
2567 emitFrameOffset(MBB, MBBI: RestoreBegin, DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
2568 Offset: StackOffset::getFixed(Fixed: AFI->getCalleeSavedStackSize()), TII,
2569 MachineInstr::FrameDestroy, SetNZCV: false, NeedsWinCFI, HasWinCFI: &HasWinCFI);
2570
2571 // Deallocate fixed objects.
2572 emitFrameOffset(MBB, MBBI: RestoreEnd, DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
2573 Offset: StackOffset::getFixed(Fixed: FixedObject), TII,
2574 MachineInstr::FrameDestroy, SetNZCV: false, NeedsWinCFI, HasWinCFI: &HasWinCFI);
2575
2576 // Deallocate callee-save SVE registers.
2577 emitFrameOffset(MBB, MBBI: RestoreEnd, DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
2578 Offset: DeallocateAfter, TII, MachineInstr::FrameDestroy, SetNZCV: false,
2579 NeedsWinCFI, HasWinCFI: &HasWinCFI);
2580 } else if (SVEStackSize) {
2581 int64_t SVECalleeSavedSize = AFI->getSVECalleeSavedStackSize();
2582 // If we have stack realignment or variable-sized objects we must use the
2583 // FP to restore SVE callee saves (as there is an unknown amount of
2584 // data/padding between the SP and SVE CS area).
2585 Register BaseForSVEDealloc =
2586 (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP
2587 : AArch64::SP;
2588 if (SVECalleeSavedSize && BaseForSVEDealloc == AArch64::FP) {
2589 Register CalleeSaveBase = AArch64::FP;
2590 if (int64_t CalleeSaveBaseOffset =
2591 AFI->getCalleeSaveBaseToFrameRecordOffset()) {
2592 // If we have a non-zero offset to the non-SVE CS base we need to
2593 // compute the base address by subtracting the offset in a temporary
2594 // register first (to avoid briefly deallocating the SVE CS).
2595 CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister(
2596 RegClass: &AArch64::GPR64RegClass);
2597 emitFrameOffset(MBB, MBBI: RestoreBegin, DL, DestReg: CalleeSaveBase, SrcReg: AArch64::FP,
2598 Offset: StackOffset::getFixed(Fixed: -CalleeSaveBaseOffset), TII,
2599 MachineInstr::FrameDestroy);
2600 }
2601 // The code below will deallocate the stack space by moving the
2602 // SP to the start of the SVE callee-save area.
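  // e.g. roughly "addvl sp, x29, #-<N>", or the same computed from the
  // temporary base register set up above when the frame record is offset from
  // the CS base.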
2603 emitFrameOffset(MBB, MBBI: RestoreBegin, DL, DestReg: AArch64::SP, SrcReg: CalleeSaveBase,
2604 Offset: StackOffset::getScalable(Scalable: -SVECalleeSavedSize), TII,
2605 MachineInstr::FrameDestroy);
2606 } else if (BaseForSVEDealloc == AArch64::SP) {
2607 if (SVECalleeSavedSize) {
2608 // Deallocate the non-SVE locals first before we can deallocate (and
2609 // restore callee saves) from the SVE area.
2610 emitFrameOffset(
2611 MBB, MBBI: RestoreBegin, DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
2612 Offset: StackOffset::getFixed(Fixed: NumBytes), TII, MachineInstr::FrameDestroy,
2613 SetNZCV: false, NeedsWinCFI, HasWinCFI: &HasWinCFI, EmitCFAOffset: EmitCFI && !hasFP(MF),
2614 InitialOffset: SVEStackSize + StackOffset::getFixed(Fixed: NumBytes + PrologueSaveSize));
2615 NumBytes = 0;
2616 }
2617
2618 emitFrameOffset(MBB, MBBI: RestoreBegin, DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
2619 Offset: DeallocateBefore, TII, MachineInstr::FrameDestroy, SetNZCV: false,
2620 NeedsWinCFI, HasWinCFI: &HasWinCFI, EmitCFAOffset: EmitCFI && !hasFP(MF),
2621 InitialOffset: SVEStackSize +
2622 StackOffset::getFixed(Fixed: NumBytes + PrologueSaveSize));
2623
2624 emitFrameOffset(MBB, MBBI: RestoreEnd, DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
2625 Offset: DeallocateAfter, TII, MachineInstr::FrameDestroy, SetNZCV: false,
2626 NeedsWinCFI, HasWinCFI: &HasWinCFI, EmitCFAOffset: EmitCFI && !hasFP(MF),
2627 InitialOffset: DeallocateAfter +
2628 StackOffset::getFixed(Fixed: NumBytes + PrologueSaveSize));
2629 }
2630 if (EmitCFI)
2631 emitCalleeSavedSVERestores(MBB, MBBI: RestoreEnd);
2632 }
2633
2634 if (!hasFP(MF)) {
2635 bool RedZone = canUseRedZone(MF);
2636 // If this was a redzone leaf function, we don't need to restore the
2637 // stack pointer (but we may need to pop stack args for fastcc).
2638 if (RedZone && AfterCSRPopSize == 0)
2639 return;
2640
2641 // Pop the local variables off the stack. If there are no callee-saved
2642 // registers, it means we are actually positioned at the terminator and can
2643 // combine stack increment for the locals and the stack increment for
2644 // callee-popped arguments into (possibly) a single instruction and be done.
2645 bool NoCalleeSaveRestore = PrologueSaveSize == 0;
2646 int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
2647 if (NoCalleeSaveRestore)
2648 StackRestoreBytes += AfterCSRPopSize;
2649
2650 emitFrameOffset(
2651 MBB, MBBI: LastPopI, DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
2652 Offset: StackOffset::getFixed(Fixed: StackRestoreBytes), TII,
2653 MachineInstr::FrameDestroy, SetNZCV: false, NeedsWinCFI, HasWinCFI: &HasWinCFI, EmitCFAOffset: EmitCFI,
2654 InitialOffset: StackOffset::getFixed(Fixed: (RedZone ? 0 : NumBytes) + PrologueSaveSize));
2655
2656 // If we were able to combine the local stack pop with the argument pop,
2657 // then we're done.
2658 if (NoCalleeSaveRestore || AfterCSRPopSize == 0) {
2659 return;
2660 }
2661
2662 NumBytes = 0;
2663 }
2664
2665 // Restore the original stack pointer.
2666 // FIXME: Rather than doing the math here, we should instead just use
2667 // non-post-indexed loads for the restores if we aren't actually going to
2668 // be able to save any instructions.
2669 if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
2670 emitFrameOffset(
2671 MBB, MBBI: LastPopI, DL, DestReg: AArch64::SP, SrcReg: AArch64::FP,
2672 Offset: StackOffset::getFixed(Fixed: -AFI->getCalleeSaveBaseToFrameRecordOffset()),
2673 TII, MachineInstr::FrameDestroy, SetNZCV: false, NeedsWinCFI, HasWinCFI: &HasWinCFI);
2674 } else if (NumBytes)
2675 emitFrameOffset(MBB, MBBI: LastPopI, DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
2676 Offset: StackOffset::getFixed(Fixed: NumBytes), TII,
2677 MachineInstr::FrameDestroy, SetNZCV: false, NeedsWinCFI, HasWinCFI: &HasWinCFI);
2678
2679 // When we are about to restore the CSRs, the CFA register is SP again.
2680 if (EmitCFI && hasFP(MF))
2681 CFIInstBuilder(MBB, LastPopI, MachineInstr::FrameDestroy)
2682 .buildDefCFA(Reg: AArch64::SP, Offset: PrologueSaveSize);
2683
2684 // This must be placed after the callee-save restore code because that code
2685 // assumes the SP is at the same location as it was after the callee-save save
2686 // code in the prologue.
2687 if (AfterCSRPopSize) {
2688 assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an "
2689 "interrupt may have clobbered");
2690
2691 emitFrameOffset(
2692 MBB, MBBI: MBB.getFirstTerminator(), DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
2693 Offset: StackOffset::getFixed(Fixed: AfterCSRPopSize), TII, MachineInstr::FrameDestroy,
2694 SetNZCV: false, NeedsWinCFI, HasWinCFI: &HasWinCFI, EmitCFAOffset: EmitCFI,
2695 InitialOffset: StackOffset::getFixed(Fixed: CombineAfterCSRBump ? PrologueSaveSize : 0));
2696 }
2697}
2698
2699bool AArch64FrameLowering::enableCFIFixup(const MachineFunction &MF) const {
2700 return TargetFrameLowering::enableCFIFixup(MF) &&
2701 MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF);
2702}
2703
2704bool AArch64FrameLowering::enableFullCFIFixup(const MachineFunction &MF) const {
2705 return enableCFIFixup(MF) &&
2706 MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo(MF);
2707}
2708
2709/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
2710/// debug info. It's the same as what we use for resolving the code-gen
2711/// references for now. FIXME: This can go wrong when references are
2712/// SP-relative and simple call frames aren't used.
2713StackOffset
2714AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
2715 Register &FrameReg) const {
2716 return resolveFrameIndexReference(
2717 MF, FI, FrameReg,
2718 /*PreferFP=*/
2719 MF.getFunction().hasFnAttribute(Kind: Attribute::SanitizeHWAddress) ||
2720 MF.getFunction().hasFnAttribute(Kind: Attribute::SanitizeMemTag),
2721 /*ForSimm=*/false);
2722}
2723
2724StackOffset
2725AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF,
2726 int FI) const {
2727 // This function serves to provide a comparable offset from a single reference
2728 // point (the value of SP at function entry) that can be used for analysis,
2729 // e.g. the stack-frame-layout analysis pass. It is not guaranteed to be
2730 // correct for all objects in the presence of VLA-area objects or dynamic
2731 // stack re-alignment.
2732
2733 const auto &MFI = MF.getFrameInfo();
2734
2735 int64_t ObjectOffset = MFI.getObjectOffset(ObjectIdx: FI);
2736 StackOffset SVEStackSize = getSVEStackSize(MF);
2737
2738 // For VLA-area objects, just emit an offset at the end of the stack frame.
2739 // Whilst not quite correct, these objects do live at the end of the frame, and
2740 // so it is more useful for analysis if the offset reflects this.
2741 if (MFI.isVariableSizedObjectIndex(ObjectIdx: FI)) {
2742 return StackOffset::getFixed(Fixed: -((int64_t)MFI.getStackSize())) - SVEStackSize;
2743 }
2744
2745 // This is correct in the absence of any SVE stack objects.
2746 if (!SVEStackSize)
2747 return StackOffset::getFixed(Fixed: ObjectOffset - getOffsetOfLocalArea());
2748
2749 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2750 bool FPAfterSVECalleeSaves =
2751 isTargetWindows(MF) && AFI->getSVECalleeSavedStackSize();
2752 if (MFI.getStackID(ObjectIdx: FI) == TargetStackID::ScalableVector) {
2753 if (FPAfterSVECalleeSaves &&
2754 -ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize())
2755 return StackOffset::getScalable(Scalable: ObjectOffset);
2756 return StackOffset::get(Fixed: -((int64_t)AFI->getCalleeSavedStackSize()),
2757 Scalable: ObjectOffset);
2758 }
2759
2760 bool IsFixed = MFI.isFixedObjectIndex(ObjectIdx: FI);
2761 bool IsCSR =
2762 !IsFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
2763
2764 StackOffset ScalableOffset = {};
2765 if (!IsFixed && !IsCSR) {
2766 ScalableOffset = -SVEStackSize;
2767 } else if (FPAfterSVECalleeSaves && IsCSR) {
2768 ScalableOffset =
2769 -StackOffset::getScalable(Scalable: AFI->getSVECalleeSavedStackSize());
2770 }
2771
2772 return StackOffset::getFixed(Fixed: ObjectOffset) + ScalableOffset;
2773}
2774
2775StackOffset
2776AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF,
2777 int FI) const {
2778 return StackOffset::getFixed(Fixed: getSEHFrameIndexOffset(MF, FI));
2779}
2780
2781static StackOffset getFPOffset(const MachineFunction &MF,
2782 int64_t ObjectOffset) {
2783 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2784 const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2785 const Function &F = MF.getFunction();
2786 bool IsWin64 = Subtarget.isCallingConvWin64(CC: F.getCallingConv(), IsVarArg: F.isVarArg());
2787 unsigned FixedObject =
2788 getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
2789 int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MFI: MF.getFrameInfo());
2790 int64_t FPAdjust =
2791 CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset();
2792 return StackOffset::getFixed(Fixed: ObjectOffset + FixedObject + FPAdjust);
2793}
2794
2795static StackOffset getStackOffset(const MachineFunction &MF,
2796 int64_t ObjectOffset) {
2797 const auto &MFI = MF.getFrameInfo();
2798 return StackOffset::getFixed(Fixed: ObjectOffset + (int64_t)MFI.getStackSize());
2799}
2800
2801// TODO: This function currently does not work for scalable vectors.
2802int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
2803 int FI) const {
2804 const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
2805 MF.getSubtarget().getRegisterInfo());
2806 int ObjectOffset = MF.getFrameInfo().getObjectOffset(ObjectIdx: FI);
2807 return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
2808 ? getFPOffset(MF, ObjectOffset).getFixed()
2809 : getStackOffset(MF, ObjectOffset).getFixed();
2810}
2811
2812StackOffset AArch64FrameLowering::resolveFrameIndexReference(
2813 const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
2814 bool ForSimm) const {
2815 const auto &MFI = MF.getFrameInfo();
2816 int64_t ObjectOffset = MFI.getObjectOffset(ObjectIdx: FI);
2817 bool isFixed = MFI.isFixedObjectIndex(ObjectIdx: FI);
2818 bool isSVE = MFI.getStackID(ObjectIdx: FI) == TargetStackID::ScalableVector;
2819 return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
2820 PreferFP, ForSimm);
2821}
2822
2823StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
2824 const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
2825 Register &FrameReg, bool PreferFP, bool ForSimm) const {
2826 const auto &MFI = MF.getFrameInfo();
2827 const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
2828 MF.getSubtarget().getRegisterInfo());
2829 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2830 const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2831
2832 int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed();
2833 int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed();
2834 bool isCSR =
2835 !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
2836
2837 const StackOffset &SVEStackSize = getSVEStackSize(MF);
2838
2839 // Use frame pointer to reference fixed objects. Use it for locals if
2840 // there are VLAs or a dynamically realigned SP (and thus the SP isn't
2841 // reliable as a base). Make sure useFPForScavengingIndex() does the
2842 // right thing for the emergency spill slot.
2843 bool UseFP = false;
2844 if (AFI->hasStackFrame() && !isSVE) {
2845 // We shouldn't prefer using the FP to access fixed-sized stack objects when
2846 // there are scalable (SVE) objects in between the FP and the fixed-sized
2847 // objects.
2848 PreferFP &= !SVEStackSize;
2849
2850 // Note: Keeping the following as multiple 'if' statements rather than
2851 // merging to a single expression for readability.
2852 //
2853 // Argument access should always use the FP.
2854 if (isFixed) {
2855 UseFP = hasFP(MF);
2856 } else if (isCSR && RegInfo->hasStackRealignment(MF)) {
2857 // References to the CSR area must use FP if we're re-aligning the stack
2858 // since the dynamically-sized alignment padding is between the SP/BP and
2859 // the CSR area.
2860 assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
2861 UseFP = true;
2862 } else if (hasFP(MF) && !RegInfo->hasStackRealignment(MF)) {
2863 // If the FPOffset is negative and we're producing a signed immediate, we
2864 // have to keep in mind that the available offset range for negative
2865 // offsets is smaller than for positive ones. If an offset is available
2866 // via the FP and the SP, use whichever is closest.
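      // For example (illustrative, assuming ForSimm and no variable-sized
      // objects): with an FP-relative offset of -280 (outside the signed 9-bit
      // range) and an SP-relative offset of +24, the SP-relative access is
      // preferred.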
2867 bool FPOffsetFits = !ForSimm || FPOffset >= -256;
2868 PreferFP |= Offset > -FPOffset && !SVEStackSize;
2869
2870 if (FPOffset >= 0) {
2871 // If the FPOffset is positive, that'll always be best, as the SP/BP
2872 // will be even further away.
2873 UseFP = true;
2874 } else if (MFI.hasVarSizedObjects()) {
2875 // If we have variable sized objects, we can use either FP or BP, as the
2876 // SP offset is unknown. We can use the base pointer if we have one and
2877 // FP is not preferred. If not, we're stuck with using FP.
2878 bool CanUseBP = RegInfo->hasBasePointer(MF);
2879 if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
2880 UseFP = PreferFP;
2881 else if (!CanUseBP) // Can't use BP. Forced to use FP.
2882 UseFP = true;
2883 // else we can use BP and FP, but the offset from FP won't fit.
2884 // That will make us scavenge registers which we can probably avoid by
2885 // using BP. If it won't fit for BP either, we'll scavenge anyway.
2886 } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
2887 // Funclets access the locals contained in the parent's stack frame
2888 // via the frame pointer, so we have to use the FP in the parent
2889 // function.
2890 (void) Subtarget;
2891 assert(Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv(),
2892 MF.getFunction().isVarArg()) &&
2893 "Funclets should only be present on Win64");
2894 UseFP = true;
2895 } else {
2896 // We have the choice between FP and (SP or BP).
2897 if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
2898 UseFP = true;
2899 }
2900 }
2901 }
2902
2903 assert(
2904 ((isFixed || isCSR) || !RegInfo->hasStackRealignment(MF) || !UseFP) &&
2905 "In the presence of dynamic stack pointer realignment, "
2906 "non-argument/CSR objects cannot be accessed through the frame pointer");
2907
2908 bool FPAfterSVECalleeSaves =
2909 isTargetWindows(MF) && AFI->getSVECalleeSavedStackSize();
2910
2911 if (isSVE) {
2912 StackOffset FPOffset =
2913 StackOffset::get(Fixed: -AFI->getCalleeSaveBaseToFrameRecordOffset(), Scalable: ObjectOffset);
2914 StackOffset SPOffset =
2915 SVEStackSize +
2916 StackOffset::get(Fixed: MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
2917 Scalable: ObjectOffset);
2918 if (FPAfterSVECalleeSaves) {
2919 assert(-ObjectOffset > (int64_t)AFI->getSVECalleeSavedStackSize() &&
2920 "Math isn't correct for CSRs with FPAfterSVECalleeSaves");
2921 FPOffset += StackOffset::getScalable(Scalable: AFI->getSVECalleeSavedStackSize());
2922 }
2923 // Always use the FP for SVE spills if available and beneficial.
2924 if (hasFP(MF) && (SPOffset.getFixed() ||
2925 FPOffset.getScalable() < SPOffset.getScalable() ||
2926 RegInfo->hasStackRealignment(MF))) {
2927 FrameReg = RegInfo->getFrameRegister(MF);
2928 return FPOffset;
2929 }
2930
2931 FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
2932 : (unsigned)AArch64::SP;
2933 return SPOffset;
2934 }
2935
2936 StackOffset ScalableOffset = {};
2937 if (FPAfterSVECalleeSaves) {
2938 // In this stack layout, the FP is in between the callee saves and other
2939 // SVE allocations.
2940 StackOffset SVECalleeSavedStack =
2941 StackOffset::getScalable(Scalable: AFI->getSVECalleeSavedStackSize());
2942 if (UseFP) {
2943 if (isFixed)
2944 ScalableOffset = SVECalleeSavedStack;
2945 else if (!isCSR)
2946 ScalableOffset = SVECalleeSavedStack - SVEStackSize;
2947 } else {
2948 if (isFixed)
2949 ScalableOffset = SVEStackSize;
2950 else if (isCSR)
2951 ScalableOffset = SVEStackSize - SVECalleeSavedStack;
2952 }
2953 } else {
2954 if (UseFP && !(isFixed || isCSR))
2955 ScalableOffset = -SVEStackSize;
2956 if (!UseFP && (isFixed || isCSR))
2957 ScalableOffset = SVEStackSize;
2958 }
2959
2960 if (UseFP) {
2961 FrameReg = RegInfo->getFrameRegister(MF);
2962 return StackOffset::getFixed(Fixed: FPOffset) + ScalableOffset;
2963 }
2964
2965 // Use the base pointer if we have one.
2966 if (RegInfo->hasBasePointer(MF))
2967 FrameReg = RegInfo->getBaseRegister();
2968 else {
2969 assert(!MFI.hasVarSizedObjects() &&
2970 "Can't use SP when we have var sized objects.");
2971 FrameReg = AArch64::SP;
2972 // If we're using the red zone for this function, the SP won't actually
2973 // be adjusted, so the offsets will be negative. They're also all
2974 // within range of the signed 9-bit immediate instructions.
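    // For example (illustrative), a leaf function with 16 bytes of locals and
    // a usable red zone may address them as [sp, #-16] without having
    // adjusted SP in the prologue.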
2975 if (canUseRedZone(MF))
2976 Offset -= AFI->getLocalStackSize();
2977 }
2978
2979 return StackOffset::getFixed(Fixed: Offset) + ScalableOffset;
2980}
2981
2982static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
2983 // Do not set a kill flag on values that are also marked as live-in. This
2984  // happens with the @llvm.returnaddress intrinsic and with arguments passed in
2985  // callee-saved registers.
2986 // Omitting the kill flags is conservatively correct even if the live-in
2987 // is not used after all.
2988 bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
2989 return getKillRegState(B: !IsLiveIn);
2990}
2991
2992static bool produceCompactUnwindFrame(MachineFunction &MF) {
2993 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2994 AttributeList Attrs = MF.getFunction().getAttributes();
2995 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2996 return Subtarget.isTargetMachO() &&
2997 !(Subtarget.getTargetLowering()->supportSwiftError() &&
2998 Attrs.hasAttrSomewhere(Kind: Attribute::SwiftError)) &&
2999 MF.getFunction().getCallingConv() != CallingConv::SwiftTail &&
3000 !requiresSaveVG(MF) && AFI->getSVECalleeSavedStackSize() == 0;
3001}
3002
3003static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
3004 bool NeedsWinCFI, bool IsFirst,
3005 const TargetRegisterInfo *TRI) {
3006 // If we are generating register pairs for a Windows function that requires
3007 // EH support, then pair consecutive registers only. There are no unwind
3008 // opcodes for saves/restores of non-consecutive register pairs.
3009  // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_fregp_x,
3010 // save_lrpair.
3011 // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
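  // For example (illustrative): x19/x20 have consecutive encodings and can be
  // described by save_regp, whereas x19/x22 cannot and must be saved
  // individually.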
3012
3013 if (Reg2 == AArch64::FP)
3014 return true;
3015 if (!NeedsWinCFI)
3016 return false;
3017 if (TRI->getEncodingValue(Reg: Reg2) == TRI->getEncodingValue(Reg: Reg1) + 1)
3018 return false;
3019 // If pairing a GPR with LR, the pair can be described by the save_lrpair
3020 // opcode. If this is the first register pair, it would end up with a
3021 // predecrement, but there's no save_lrpair_x opcode, so we can only do this
3022  // if LR is paired with a register other than the first one.
3023 // The save_lrpair opcode requires the first register to be an odd one.
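  // For example (illustrative): x21/lr can use save_lrpair when it is not the
  // first (pre-decrementing) pair, whereas x20/lr cannot, and neither can an
  // LR pairing in the very first pair.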
3024 if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 &&
3025 (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst)
3026 return false;
3027 return true;
3028}
3029
3030/// Returns true if Reg1 and Reg2 cannot be paired using an ldp/stp instruction.
3031/// WindowsCFI requires that only consecutive registers can be paired.
3032/// LR and FP need to be allocated together when the frame needs to save
3033/// the frame-record. This means any other register pairing with LR is invalid.
3034static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
3035 bool UsesWinAAPCS, bool NeedsWinCFI,
3036 bool NeedsFrameRecord, bool IsFirst,
3037 const TargetRegisterInfo *TRI) {
3038 if (UsesWinAAPCS)
3039 return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst,
3040 TRI);
3041
3042 // If we need to store the frame record, don't pair any register
3043 // with LR other than FP.
3044 if (NeedsFrameRecord)
3045 return Reg2 == AArch64::LR;
3046
3047 return false;
3048}
3049
3050namespace {
3051
3052struct RegPairInfo {
3053 unsigned Reg1 = AArch64::NoRegister;
3054 unsigned Reg2 = AArch64::NoRegister;
3055 int FrameIdx;
3056 int Offset;
3057 enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type;
3058 const TargetRegisterClass *RC;
3059
3060 RegPairInfo() = default;
3061
3062 bool isPaired() const { return Reg2 != AArch64::NoRegister; }
3063
3064 bool isScalable() const { return Type == PPR || Type == ZPR; }
3065};
3066
3067} // end anonymous namespace
3068
3069unsigned findFreePredicateReg(BitVector &SavedRegs) {
3070 for (unsigned PReg = AArch64::P8; PReg <= AArch64::P15; ++PReg) {
3071 if (SavedRegs.test(Idx: PReg)) {
3072 unsigned PNReg = PReg - AArch64::P0 + AArch64::PN0;
3073 return PNReg;
3074 }
3075 }
3076 return AArch64::NoRegister;
3077}
3078
3079// The multi-vector LD/ST instructions are available only for SME or SVE2p1 targets.
3080bool enableMultiVectorSpillFill(const AArch64Subtarget &Subtarget,
3081 MachineFunction &MF) {
3082 if (DisableMultiVectorSpillFill)
3083 return false;
3084
3085 SMEAttrs FuncAttrs = MF.getInfo<AArch64FunctionInfo>()->getSMEFnAttrs();
3086 bool IsLocallyStreaming =
3087 FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface();
3088
3089  // SME2 instructions can only be used safely when in streaming mode.
3090  // It is not safe to use them when in streaming-compatible or locally
3091  // streaming mode.
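  // If multi-vector spill/fill is usable, the callee-save code later in this
  // file emits the ST1B_2Z_IMM / LD1B_2Z_IMM forms with a PTRUE_C_B all-true
  // predicate for ZPR pairs (see spillCalleeSavedRegisters and
  // restoreCalleeSavedRegisters).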
3092 return Subtarget.hasSVE2p1() ||
3093 (Subtarget.hasSME2() &&
3094 (!IsLocallyStreaming && Subtarget.isStreaming()));
3095}
3096
3097static void computeCalleeSaveRegisterPairs(
3098 MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
3099 const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
3100 bool NeedsFrameRecord) {
3101
3102 if (CSI.empty())
3103 return;
3104
3105 bool IsWindows = isTargetWindows(MF);
3106 bool NeedsWinCFI = needsWinCFI(MF);
3107 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3108 unsigned StackHazardSize = getStackHazardSize(MF);
3109 MachineFrameInfo &MFI = MF.getFrameInfo();
3110 CallingConv::ID CC = MF.getFunction().getCallingConv();
3111 unsigned Count = CSI.size();
3112 (void)CC;
3113 // MachO's compact unwind format relies on all registers being stored in
3114 // pairs.
3115 assert((!produceCompactUnwindFrame(MF) || CC == CallingConv::PreserveMost ||
3116 CC == CallingConv::PreserveAll || CC == CallingConv::CXX_FAST_TLS ||
3117 CC == CallingConv::Win64 || (Count & 1) == 0) &&
3118 "Odd number of callee-saved regs to spill!");
3119 int ByteOffset = AFI->getCalleeSavedStackSize();
3120 int StackFillDir = -1;
3121 int RegInc = 1;
3122 unsigned FirstReg = 0;
3123 if (NeedsWinCFI) {
3124 // For WinCFI, fill the stack from the bottom up.
3125 ByteOffset = 0;
3126 StackFillDir = 1;
3127 // As the CSI array is reversed to match PrologEpilogInserter, iterate
3128 // backwards, to pair up registers starting from lower numbered registers.
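    // Illustrative: since the CSI list starts from the highest-numbered
    // registers for WinCFI, walking it backwards visits the lowest-numbered
    // registers first while ByteOffset grows from 0 upwards.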
3129 RegInc = -1;
3130 FirstReg = Count - 1;
3131 }
3132 bool FPAfterSVECalleeSaves = IsWindows && AFI->getSVECalleeSavedStackSize();
3133 int ScalableByteOffset =
3134 FPAfterSVECalleeSaves ? 0 : AFI->getSVECalleeSavedStackSize();
3135 bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();
3136 Register LastReg = 0;
3137
3138 // When iterating backwards, the loop condition relies on unsigned wraparound.
3139 for (unsigned i = FirstReg; i < Count; i += RegInc) {
3140 RegPairInfo RPI;
3141 RPI.Reg1 = CSI[i].getReg();
3142
3143 if (AArch64::GPR64RegClass.contains(Reg: RPI.Reg1)) {
3144 RPI.Type = RegPairInfo::GPR;
3145 RPI.RC = &AArch64::GPR64RegClass;
3146 } else if (AArch64::FPR64RegClass.contains(Reg: RPI.Reg1)) {
3147 RPI.Type = RegPairInfo::FPR64;
3148 RPI.RC = &AArch64::FPR64RegClass;
3149 } else if (AArch64::FPR128RegClass.contains(Reg: RPI.Reg1)) {
3150 RPI.Type = RegPairInfo::FPR128;
3151 RPI.RC = &AArch64::FPR128RegClass;
3152 } else if (AArch64::ZPRRegClass.contains(Reg: RPI.Reg1)) {
3153 RPI.Type = RegPairInfo::ZPR;
3154 RPI.RC = &AArch64::ZPRRegClass;
3155 } else if (AArch64::PPRRegClass.contains(Reg: RPI.Reg1)) {
3156 RPI.Type = RegPairInfo::PPR;
3157 RPI.RC = &AArch64::PPRRegClass;
3158 } else if (RPI.Reg1 == AArch64::VG) {
3159 RPI.Type = RegPairInfo::VG;
3160 RPI.RC = &AArch64::FIXED_REGSRegClass;
3161 } else {
3162 llvm_unreachable("Unsupported register class.");
3163 }
3164
3165 // Add the stack hazard size as we transition from GPR->FPR CSRs.
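    // Illustrative resulting layout (higher addresses first): GPR saves such
    // as x19/x20, then StackHazardSize bytes of padding, then FPR saves such
    // as d8/d9 below it.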
3166 if (AFI->hasStackHazardSlotIndex() &&
3167 (!LastReg || !AArch64InstrInfo::isFpOrNEON(Reg: LastReg)) &&
3168 AArch64InstrInfo::isFpOrNEON(Reg: RPI.Reg1))
3169 ByteOffset += StackFillDir * StackHazardSize;
3170 LastReg = RPI.Reg1;
3171
3172 int Scale = TRI->getSpillSize(RC: *RPI.RC);
3173 // Add the next reg to the pair if it is in the same register class.
3174 if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) {
3175 MCRegister NextReg = CSI[i + RegInc].getReg();
3176 bool IsFirst = i == FirstReg;
3177 switch (RPI.Type) {
3178 case RegPairInfo::GPR:
3179 if (AArch64::GPR64RegClass.contains(Reg: NextReg) &&
3180 !invalidateRegisterPairing(Reg1: RPI.Reg1, Reg2: NextReg, UsesWinAAPCS: IsWindows,
3181 NeedsWinCFI, NeedsFrameRecord, IsFirst,
3182 TRI))
3183 RPI.Reg2 = NextReg;
3184 break;
3185 case RegPairInfo::FPR64:
3186 if (AArch64::FPR64RegClass.contains(Reg: NextReg) &&
3187 !invalidateWindowsRegisterPairing(Reg1: RPI.Reg1, Reg2: NextReg, NeedsWinCFI,
3188 IsFirst, TRI))
3189 RPI.Reg2 = NextReg;
3190 break;
3191 case RegPairInfo::FPR128:
3192 if (AArch64::FPR128RegClass.contains(Reg: NextReg))
3193 RPI.Reg2 = NextReg;
3194 break;
3195 case RegPairInfo::PPR:
3196 break;
3197 case RegPairInfo::ZPR:
3198 if (AFI->getPredicateRegForFillSpill() != 0 &&
3199 ((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1)) {
3200 // Calculate offset of register pair to see if pair instruction can be
3201 // used.
3202 int Offset = (ScalableByteOffset + StackFillDir * 2 * Scale) / Scale;
3203 if ((-16 <= Offset && Offset <= 14) && (Offset % 2 == 0))
3204 RPI.Reg2 = NextReg;
3205 }
3206 break;
3207 case RegPairInfo::VG:
3208 break;
3209 }
3210 }
3211
3212 // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
3213 // list to come in sorted by frame index so that we can issue the store
3214  // pair instructions directly. Assert if we see anything else.
3215 //
3216 // The order of the registers in the list is controlled by
3217 // getCalleeSavedRegs(), so they will always be in-order, as well.
3218 assert((!RPI.isPaired() ||
3219 (CSI[i].getFrameIdx() + RegInc == CSI[i + RegInc].getFrameIdx())) &&
3220 "Out of order callee saved regs!");
3221
3222 assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP ||
3223 RPI.Reg1 == AArch64::LR) &&
3224 "FrameRecord must be allocated together with LR");
3225
3226 // Windows AAPCS has FP and LR reversed.
3227 assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP ||
3228 RPI.Reg2 == AArch64::LR) &&
3229 "FrameRecord must be allocated together with LR");
3230
3231 // MachO's compact unwind format relies on all registers being stored in
3232 // adjacent register pairs.
3233 assert((!produceCompactUnwindFrame(MF) || CC == CallingConv::PreserveMost ||
3234 CC == CallingConv::PreserveAll || CC == CallingConv::CXX_FAST_TLS ||
3235 CC == CallingConv::Win64 ||
3236 (RPI.isPaired() &&
3237 ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
3238 RPI.Reg1 + 1 == RPI.Reg2))) &&
3239 "Callee-save registers not saved as adjacent register pair!");
3240
3241 RPI.FrameIdx = CSI[i].getFrameIdx();
3242 if (NeedsWinCFI &&
3243 RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
3244 RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
3245
3246 // Realign the scalable offset if necessary. This is relevant when
3247 // spilling predicates on Windows.
3248 if (RPI.isScalable() && ScalableByteOffset % Scale != 0) {
3249 ScalableByteOffset = alignTo(Value: ScalableByteOffset, Align: Scale);
3250 }
3251
3252 int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
3253 assert(OffsetPre % Scale == 0);
3254
3255 if (RPI.isScalable())
3256 ScalableByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
3257 else
3258 ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
3259
3260 // Swift's async context is directly before FP, so allocate an extra
3261 // 8 bytes for it.
3262 if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
3263 ((!IsWindows && RPI.Reg2 == AArch64::FP) ||
3264 (IsWindows && RPI.Reg2 == AArch64::LR)))
3265 ByteOffset += StackFillDir * 8;
3266
3267 // Round up size of non-pair to pair size if we need to pad the
3268 // callee-save area to ensure 16-byte alignment.
3269 if (NeedGapToAlignStack && !NeedsWinCFI && !RPI.isScalable() &&
3270 RPI.Type != RegPairInfo::FPR128 && !RPI.isPaired() &&
3271 ByteOffset % 16 != 0) {
3272 ByteOffset += 8 * StackFillDir;
3273 assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
3274 // A stack frame with a gap looks like this, bottom up:
3275 // d9, d8. x21, gap, x20, x19.
3276 // Set extra alignment on the x21 object to create the gap above it.
3277 MFI.setObjectAlignment(ObjectIdx: RPI.FrameIdx, Alignment: Align(16));
3278 NeedGapToAlignStack = false;
3279 }
3280
3281 int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
3282 assert(OffsetPost % Scale == 0);
3283 // If filling top down (default), we want the offset after incrementing it.
3284 // If filling bottom up (WinCFI) we need the original offset.
3285 int Offset = NeedsWinCFI ? OffsetPre : OffsetPost;
3286
3287 // The FP, LR pair goes 8 bytes into our expanded 24-byte slot so that the
3288 // Swift context can directly precede FP.
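    // Illustrative slot layout (lowest address first): <async context>, fp,
    // lr, so the fp/lr store lands 8 bytes into the slot.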
3289 if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
3290 ((!IsWindows && RPI.Reg2 == AArch64::FP) ||
3291 (IsWindows && RPI.Reg2 == AArch64::LR)))
3292 Offset += 8;
3293 RPI.Offset = Offset / Scale;
3294
3295 assert((!RPI.isPaired() ||
3296 (!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
3297 (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
3298 "Offset out of bounds for LDP/STP immediate");
3299
3300 auto isFrameRecord = [&] {
3301 if (RPI.isPaired())
3302 return IsWindows ? RPI.Reg1 == AArch64::FP && RPI.Reg2 == AArch64::LR
3303 : RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP;
3304 // Otherwise, look for the frame record as two unpaired registers. This is
3305 // needed for -aarch64-stack-hazard-size=<val>, which disables register
3306 // pairing (as the padding may be too large for the LDP/STP offset). Note:
3307 // On Windows, this check works out as current reg == FP, next reg == LR,
3308 // and on other platforms current reg == FP, previous reg == LR. This
3309 // works out as the correct pre-increment or post-increment offsets
3310 // respectively.
3311 return i > 0 && RPI.Reg1 == AArch64::FP &&
3312 CSI[i - 1].getReg() == AArch64::LR;
3313 };
3314
3315 // Save the offset to frame record so that the FP register can point to the
3316 // innermost frame record (spilled FP and LR registers).
3317 if (NeedsFrameRecord && isFrameRecord())
3318 AFI->setCalleeSaveBaseToFrameRecordOffset(Offset);
3319
3320 RegPairs.push_back(Elt: RPI);
3321 if (RPI.isPaired())
3322 i += RegInc;
3323 }
3324 if (NeedsWinCFI) {
3325 // If we need an alignment gap in the stack, align the topmost stack
3326 // object. A stack frame with a gap looks like this, bottom up:
3327 // x19, d8. d9, gap.
3328 // Set extra alignment on the topmost stack object (the first element in
3329 // CSI, which goes top down), to create the gap above it.
3330 if (AFI->hasCalleeSaveStackFreeSpace())
3331 MFI.setObjectAlignment(ObjectIdx: CSI[0].getFrameIdx(), Alignment: Align(16));
3332 // We iterated bottom up over the registers; flip RegPairs back to top
3333 // down order.
3334 std::reverse(first: RegPairs.begin(), last: RegPairs.end());
3335 }
3336}
3337
3338bool AArch64FrameLowering::spillCalleeSavedRegisters(
3339 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
3340 ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
3341 MachineFunction &MF = *MBB.getParent();
3342 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
3343 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3344 bool NeedsWinCFI = needsWinCFI(MF);
3345 DebugLoc DL;
3346 SmallVector<RegPairInfo, 8> RegPairs;
3347
3348 computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, NeedsFrameRecord: hasFP(MF));
3349
3350 MachineRegisterInfo &MRI = MF.getRegInfo();
3351 // Refresh the reserved regs in case there are any potential changes since the
3352 // last freeze.
3353 MRI.freezeReservedRegs();
3354
3355 if (homogeneousPrologEpilog(MF)) {
3356 auto MIB = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AArch64::HOM_Prolog))
3357 .setMIFlag(MachineInstr::FrameSetup);
3358
3359 for (auto &RPI : RegPairs) {
3360 MIB.addReg(RegNo: RPI.Reg1);
3361 MIB.addReg(RegNo: RPI.Reg2);
3362
3363 // Update register live in.
3364 if (!MRI.isReserved(PhysReg: RPI.Reg1))
3365 MBB.addLiveIn(PhysReg: RPI.Reg1);
3366 if (RPI.isPaired() && !MRI.isReserved(PhysReg: RPI.Reg2))
3367 MBB.addLiveIn(PhysReg: RPI.Reg2);
3368 }
3369 return true;
3370 }
3371 bool PTrueCreated = false;
3372 for (const RegPairInfo &RPI : llvm::reverse(C&: RegPairs)) {
3373 unsigned Reg1 = RPI.Reg1;
3374 unsigned Reg2 = RPI.Reg2;
3375 unsigned StrOpc;
3376
3377 // Issue sequence of spills for cs regs. The first spill may be converted
3378 // to a pre-decrement store later by emitPrologue if the callee-save stack
3379 // area allocation can't be combined with the local stack area allocation.
3380 // For example:
3381 // stp x22, x21, [sp, #0] // addImm(+0)
3382 // stp x20, x19, [sp, #16] // addImm(+2)
3383 // stp fp, lr, [sp, #32] // addImm(+4)
3384 // Rationale: This sequence saves uop updates compared to a sequence of
3385 // pre-increment spills like stp xi,xj,[sp,#-16]!
3386 // Note: Similar rationale and sequence for restores in epilog.
3387 unsigned Size = TRI->getSpillSize(RC: *RPI.RC);
3388 Align Alignment = TRI->getSpillAlign(RC: *RPI.RC);
3389 switch (RPI.Type) {
3390 case RegPairInfo::GPR:
3391 StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
3392 break;
3393 case RegPairInfo::FPR64:
3394 StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
3395 break;
3396 case RegPairInfo::FPR128:
3397 StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
3398 break;
3399 case RegPairInfo::ZPR:
3400 StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI;
3401 break;
3402 case RegPairInfo::PPR:
3403 StrOpc =
3404 Size == 16 ? AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO : AArch64::STR_PXI;
3405 break;
3406 case RegPairInfo::VG:
3407 StrOpc = AArch64::STRXui;
3408 break;
3409 }
3410
3411 unsigned X0Scratch = AArch64::NoRegister;
3412 if (Reg1 == AArch64::VG) {
3413      // Find an available register to store the value of VG to.
3414 Reg1 = findScratchNonCalleeSaveRegister(MBB: &MBB, HasCall: true);
3415 assert(Reg1 != AArch64::NoRegister);
3416 SMEAttrs Attrs = AFI->getSMEFnAttrs();
3417
3418 if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface() &&
3419 AFI->getStreamingVGIdx() == std::numeric_limits<int>::max()) {
3420 // For locally-streaming functions, we need to store both the streaming
3421 // & non-streaming VG. Spill the streaming value first.
3422 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AArch64::RDSVLI_XI), DestReg: Reg1)
3423 .addImm(Val: 1)
3424 .setMIFlag(MachineInstr::FrameSetup);
3425 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AArch64::UBFMXri), DestReg: Reg1)
3426 .addReg(RegNo: Reg1)
3427 .addImm(Val: 3)
3428 .addImm(Val: 63)
3429 .setMIFlag(MachineInstr::FrameSetup);
3430
3431 AFI->setStreamingVGIdx(RPI.FrameIdx);
3432 } else if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) {
3433 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AArch64::CNTD_XPiI), DestReg: Reg1)
3434 .addImm(Val: 31)
3435 .addImm(Val: 1)
3436 .setMIFlag(MachineInstr::FrameSetup);
3437 AFI->setVGIdx(RPI.FrameIdx);
3438 } else {
3439 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
3440 if (llvm::any_of(
3441 Range: MBB.liveins(),
3442 P: [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
3443 return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
3444 RegA: AArch64::X0, RegB: LiveIn.PhysReg);
3445 }))
3446 X0Scratch = Reg1;
3447
3448 if (X0Scratch != AArch64::NoRegister)
3449 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AArch64::ORRXrr), DestReg: Reg1)
3450 .addReg(RegNo: AArch64::XZR)
3451 .addReg(RegNo: AArch64::X0, flags: RegState::Undef)
3452 .addReg(RegNo: AArch64::X0, flags: RegState::Implicit)
3453 .setMIFlag(MachineInstr::FrameSetup);
3454
3455 const uint32_t *RegMask = TRI->getCallPreservedMask(
3456 MF,
3457 CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1);
3458 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AArch64::BL))
3459 .addExternalSymbol(FnName: "__arm_get_current_vg")
3460 .addRegMask(Mask: RegMask)
3461 .addReg(RegNo: AArch64::X0, flags: RegState::ImplicitDefine)
3462 .setMIFlag(MachineInstr::FrameSetup);
3463 Reg1 = AArch64::X0;
3464 AFI->setVGIdx(RPI.FrameIdx);
3465 }
3466 }
3467
3468 LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
3469 if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
3470 dbgs() << ") -> fi#(" << RPI.FrameIdx;
3471 if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
3472 dbgs() << ")\n");
3473
3474 assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
3475           "Windows unwinding requires a consecutive (FP,LR) pair");
3476 // Windows unwind codes require consecutive registers if registers are
3477 // paired. Make the switch here, so that the code below will save (x,x+1)
3478 // and not (x+1,x).
3479 unsigned FrameIdxReg1 = RPI.FrameIdx;
3480 unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
3481 if (NeedsWinCFI && RPI.isPaired()) {
3482 std::swap(a&: Reg1, b&: Reg2);
3483 std::swap(a&: FrameIdxReg1, b&: FrameIdxReg2);
3484 }
3485
3486 if (RPI.isPaired() && RPI.isScalable()) {
3487 [[maybe_unused]] const AArch64Subtarget &Subtarget =
3488 MF.getSubtarget<AArch64Subtarget>();
3489 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3490 unsigned PnReg = AFI->getPredicateRegForFillSpill();
3491 assert((PnReg != 0 && enableMultiVectorSpillFill(Subtarget, MF)) &&
3492 "Expects SVE2.1 or SME2 target and a predicate register");
3493#ifdef EXPENSIVE_CHECKS
3494 auto IsPPR = [](const RegPairInfo &c) {
3495      return c.Type == RegPairInfo::PPR;
3496 };
3497 auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR);
3498 auto IsZPR = [](const RegPairInfo &c) {
3499 return c.Type == RegPairInfo::ZPR;
3500 };
3501 auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR);
3502 assert(!(PPRBegin < ZPRBegin) &&
3503 "Expected callee save predicate to be handled first");
3504#endif
3505 if (!PTrueCreated) {
3506 PTrueCreated = true;
3507 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AArch64::PTRUE_C_B), DestReg: PnReg)
3508 .setMIFlags(MachineInstr::FrameSetup);
3509 }
3510 MachineInstrBuilder MIB = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: StrOpc));
3511 if (!MRI.isReserved(PhysReg: Reg1))
3512 MBB.addLiveIn(PhysReg: Reg1);
3513 if (!MRI.isReserved(PhysReg: Reg2))
3514 MBB.addLiveIn(PhysReg: Reg2);
3515 MIB.addReg(/*PairRegs*/ RegNo: AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0));
3516 MIB.addMemOperand(MMO: MF.getMachineMemOperand(
3517 PtrInfo: MachinePointerInfo::getFixedStack(MF, FI: FrameIdxReg2),
3518 F: MachineMemOperand::MOStore, Size, BaseAlignment: Alignment));
3519 MIB.addReg(RegNo: PnReg);
3520 MIB.addReg(RegNo: AArch64::SP)
3521 .addImm(Val: RPI.Offset / 2) // [sp, #imm*2*vscale],
3522 // where 2*vscale is implicit
3523 .setMIFlag(MachineInstr::FrameSetup);
3524 MIB.addMemOperand(MMO: MF.getMachineMemOperand(
3525 PtrInfo: MachinePointerInfo::getFixedStack(MF, FI: FrameIdxReg1),
3526 F: MachineMemOperand::MOStore, Size, BaseAlignment: Alignment));
3527 if (NeedsWinCFI)
3528 InsertSEH(MBBI: MIB, TII, Flag: MachineInstr::FrameSetup);
3529    } else { // The code path when no paired ZPR spill is present.
3530 MachineInstrBuilder MIB = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: StrOpc));
3531 if (!MRI.isReserved(PhysReg: Reg1))
3532 MBB.addLiveIn(PhysReg: Reg1);
3533 if (RPI.isPaired()) {
3534 if (!MRI.isReserved(PhysReg: Reg2))
3535 MBB.addLiveIn(PhysReg: Reg2);
3536 MIB.addReg(RegNo: Reg2, flags: getPrologueDeath(MF, Reg: Reg2));
3537 MIB.addMemOperand(MMO: MF.getMachineMemOperand(
3538 PtrInfo: MachinePointerInfo::getFixedStack(MF, FI: FrameIdxReg2),
3539 F: MachineMemOperand::MOStore, Size, BaseAlignment: Alignment));
3540 }
3541 MIB.addReg(RegNo: Reg1, flags: getPrologueDeath(MF, Reg: Reg1))
3542 .addReg(RegNo: AArch64::SP)
3543 .addImm(Val: RPI.Offset) // [sp, #offset*vscale],
3544 // where factor*vscale is implicit
3545 .setMIFlag(MachineInstr::FrameSetup);
3546 MIB.addMemOperand(MMO: MF.getMachineMemOperand(
3547 PtrInfo: MachinePointerInfo::getFixedStack(MF, FI: FrameIdxReg1),
3548 F: MachineMemOperand::MOStore, Size, BaseAlignment: Alignment));
3549 if (NeedsWinCFI)
3550 InsertSEH(MBBI: MIB, TII, Flag: MachineInstr::FrameSetup);
3551 }
3552 // Update the StackIDs of the SVE stack slots.
3553 MachineFrameInfo &MFI = MF.getFrameInfo();
3554 if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR) {
3555 MFI.setStackID(ObjectIdx: FrameIdxReg1, ID: TargetStackID::ScalableVector);
3556 if (RPI.isPaired())
3557 MFI.setStackID(ObjectIdx: FrameIdxReg2, ID: TargetStackID::ScalableVector);
3558 }
3559
3560 if (X0Scratch != AArch64::NoRegister)
3561 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AArch64::ORRXrr), DestReg: AArch64::X0)
3562 .addReg(RegNo: AArch64::XZR)
3563 .addReg(RegNo: X0Scratch, flags: RegState::Undef)
3564 .addReg(RegNo: X0Scratch, flags: RegState::Implicit)
3565 .setMIFlag(MachineInstr::FrameSetup);
3566 }
3567 return true;
3568}
3569
3570bool AArch64FrameLowering::restoreCalleeSavedRegisters(
3571 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
3572 MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
3573 MachineFunction &MF = *MBB.getParent();
3574 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
3575 DebugLoc DL;
3576 SmallVector<RegPairInfo, 8> RegPairs;
3577 bool NeedsWinCFI = needsWinCFI(MF);
3578
3579 if (MBBI != MBB.end())
3580 DL = MBBI->getDebugLoc();
3581
3582 computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, NeedsFrameRecord: hasFP(MF));
3583 if (homogeneousPrologEpilog(MF, Exit: &MBB)) {
3584 auto MIB = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: AArch64::HOM_Epilog))
3585 .setMIFlag(MachineInstr::FrameDestroy);
3586 for (auto &RPI : RegPairs) {
3587 MIB.addReg(RegNo: RPI.Reg1, flags: RegState::Define);
3588 MIB.addReg(RegNo: RPI.Reg2, flags: RegState::Define);
3589 }
3590 return true;
3591 }
3592
3593  // For performance reasons, restore the SVE registers in increasing order.
3594 auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; };
3595 auto PPRBegin = llvm::find_if(Range&: RegPairs, P: IsPPR);
3596 auto PPREnd = std::find_if_not(first: PPRBegin, last: RegPairs.end(), pred: IsPPR);
3597 std::reverse(first: PPRBegin, last: PPREnd);
3598 auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; };
3599 auto ZPRBegin = llvm::find_if(Range&: RegPairs, P: IsZPR);
3600 auto ZPREnd = std::find_if_not(first: ZPRBegin, last: RegPairs.end(), pred: IsZPR);
3601 std::reverse(first: ZPRBegin, last: ZPREnd);
3602
3603 bool PTrueCreated = false;
3604 for (const RegPairInfo &RPI : RegPairs) {
3605 unsigned Reg1 = RPI.Reg1;
3606 unsigned Reg2 = RPI.Reg2;
3607
3608 // Issue sequence of restores for cs regs. The last restore may be converted
3609 // to a post-increment load later by emitEpilogue if the callee-save stack
3610 // area allocation can't be combined with the local stack area allocation.
3611 // For example:
3612 // ldp fp, lr, [sp, #32] // addImm(+4)
3613 // ldp x20, x19, [sp, #16] // addImm(+2)
3614 // ldp x22, x21, [sp, #0] // addImm(+0)
3615 // Note: see comment in spillCalleeSavedRegisters()
3616 unsigned LdrOpc;
3617 unsigned Size = TRI->getSpillSize(RC: *RPI.RC);
3618 Align Alignment = TRI->getSpillAlign(RC: *RPI.RC);
3619 switch (RPI.Type) {
3620 case RegPairInfo::GPR:
3621 LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
3622 break;
3623 case RegPairInfo::FPR64:
3624 LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
3625 break;
3626 case RegPairInfo::FPR128:
3627 LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
3628 break;
3629 case RegPairInfo::ZPR:
3630 LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
3631 break;
3632 case RegPairInfo::PPR:
3633 LdrOpc = Size == 16 ? AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO
3634 : AArch64::LDR_PXI;
3635 break;
3636 case RegPairInfo::VG:
3637 continue;
3638 }
3639 LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
3640 if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
3641 dbgs() << ") -> fi#(" << RPI.FrameIdx;
3642 if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
3643 dbgs() << ")\n");
3644
3645 // Windows unwind codes require consecutive registers if registers are
3646  // paired. Make the switch here, so that the code below will restore (x,x+1)
3647 // and not (x+1,x).
3648 unsigned FrameIdxReg1 = RPI.FrameIdx;
3649 unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
3650 if (NeedsWinCFI && RPI.isPaired()) {
3651 std::swap(a&: Reg1, b&: Reg2);
3652 std::swap(a&: FrameIdxReg1, b&: FrameIdxReg2);
3653 }
3654
3655 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3656 if (RPI.isPaired() && RPI.isScalable()) {
3657 [[maybe_unused]] const AArch64Subtarget &Subtarget =
3658 MF.getSubtarget<AArch64Subtarget>();
3659 unsigned PnReg = AFI->getPredicateRegForFillSpill();
3660 assert((PnReg != 0 && enableMultiVectorSpillFill(Subtarget, MF)) &&
3661 "Expects SVE2.1 or SME2 target and a predicate register");
3662#ifdef EXPENSIVE_CHECKS
3663 assert(!(PPRBegin < ZPRBegin) &&
3664 "Expected callee save predicate to be handled first");
3665#endif
3666 if (!PTrueCreated) {
3667 PTrueCreated = true;
3668 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: AArch64::PTRUE_C_B), DestReg: PnReg)
3669 .setMIFlags(MachineInstr::FrameDestroy);
3670 }
3671 MachineInstrBuilder MIB = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: LdrOpc));
3672 MIB.addReg(/*PairRegs*/ RegNo: AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0),
3673 flags: getDefRegState(B: true));
3674 MIB.addMemOperand(MMO: MF.getMachineMemOperand(
3675 PtrInfo: MachinePointerInfo::getFixedStack(MF, FI: FrameIdxReg2),
3676 F: MachineMemOperand::MOLoad, Size, BaseAlignment: Alignment));
3677 MIB.addReg(RegNo: PnReg);
3678 MIB.addReg(RegNo: AArch64::SP)
3679 .addImm(Val: RPI.Offset / 2) // [sp, #imm*2*vscale]
3680 // where 2*vscale is implicit
3681 .setMIFlag(MachineInstr::FrameDestroy);
3682 MIB.addMemOperand(MMO: MF.getMachineMemOperand(
3683 PtrInfo: MachinePointerInfo::getFixedStack(MF, FI: FrameIdxReg1),
3684 F: MachineMemOperand::MOLoad, Size, BaseAlignment: Alignment));
3685 if (NeedsWinCFI)
3686 InsertSEH(MBBI: MIB, TII, Flag: MachineInstr::FrameDestroy);
3687 } else {
3688 MachineInstrBuilder MIB = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: LdrOpc));
3689 if (RPI.isPaired()) {
3690 MIB.addReg(RegNo: Reg2, flags: getDefRegState(B: true));
3691 MIB.addMemOperand(MMO: MF.getMachineMemOperand(
3692 PtrInfo: MachinePointerInfo::getFixedStack(MF, FI: FrameIdxReg2),
3693 F: MachineMemOperand::MOLoad, Size, BaseAlignment: Alignment));
3694 }
3695 MIB.addReg(RegNo: Reg1, flags: getDefRegState(B: true));
3696 MIB.addReg(RegNo: AArch64::SP)
3697 .addImm(Val: RPI.Offset) // [sp, #offset*vscale]
3698 // where factor*vscale is implicit
3699 .setMIFlag(MachineInstr::FrameDestroy);
3700 MIB.addMemOperand(MMO: MF.getMachineMemOperand(
3701 PtrInfo: MachinePointerInfo::getFixedStack(MF, FI: FrameIdxReg1),
3702 F: MachineMemOperand::MOLoad, Size, BaseAlignment: Alignment));
3703 if (NeedsWinCFI)
3704 InsertSEH(MBBI: MIB, TII, Flag: MachineInstr::FrameDestroy);
3705 }
3706 }
3707 return true;
3708}
3709
3710// Return the FrameID for an MMO.
3711static std::optional<int> getMMOFrameID(MachineMemOperand *MMO,
3712 const MachineFrameInfo &MFI) {
3713 auto *PSV =
3714 dyn_cast_or_null<FixedStackPseudoSourceValue>(Val: MMO->getPseudoValue());
3715 if (PSV)
3716 return std::optional<int>(PSV->getFrameIndex());
3717
3718 if (MMO->getValue()) {
3719 if (auto *Al = dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: MMO->getValue()))) {
3720 for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd();
3721 FI++)
3722 if (MFI.getObjectAllocation(ObjectIdx: FI) == Al)
3723 return FI;
3724 }
3725 }
3726
3727 return std::nullopt;
3728}
3729
3730// Return the FrameID for a Load/Store instruction by looking at the first MMO.
3731static std::optional<int> getLdStFrameID(const MachineInstr &MI,
3732 const MachineFrameInfo &MFI) {
3733 if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
3734 return std::nullopt;
3735
3736 return getMMOFrameID(MMO: *MI.memoperands_begin(), MFI);
3737}
3738
3739// Check if a Hazard slot is needed for the current function, and if so create
3740// one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex,
3741// which can be used to determine if any hazard padding is needed.
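// For example (illustrative), with -aarch64-stack-hazard-size=1024 the FPR
// callee-saves and FP-accessed locals are separated from the GPR saves and
// GPR-accessed locals by the hazard padding shown in the layout diagram at
// the top of this file.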
3742void AArch64FrameLowering::determineStackHazardSlot(
3743 MachineFunction &MF, BitVector &SavedRegs) const {
3744 unsigned StackHazardSize = getStackHazardSize(MF);
3745 auto *AFI = MF.getInfo<AArch64FunctionInfo>();
3746 if (StackHazardSize == 0 || StackHazardSize % 16 != 0 ||
3747 AFI->hasStackHazardSlotIndex())
3748 return;
3749
3750 // Stack hazards are only needed in streaming functions.
3751 SMEAttrs Attrs = AFI->getSMEFnAttrs();
3752 if (!StackHazardInNonStreaming && Attrs.hasNonStreamingInterfaceAndBody())
3753 return;
3754
3755 MachineFrameInfo &MFI = MF.getFrameInfo();
3756
3757  // Add a hazard slot if there are any CSR FPR registers, or any FP-only
3758  // stack objects.
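  // A stack object is treated as FP-only if every load/store referencing it
  // is an FPR/NEON or SVE access; this is tracked with a small per-object
  // bitmask in the loop below.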
3759 bool HasFPRCSRs = any_of(Range: SavedRegs.set_bits(), P: [](unsigned Reg) {
3760 return AArch64::FPR64RegClass.contains(Reg) ||
3761 AArch64::FPR128RegClass.contains(Reg) ||
3762 AArch64::ZPRRegClass.contains(Reg) ||
3763 AArch64::PPRRegClass.contains(Reg);
3764 });
3765 bool HasFPRStackObjects = false;
3766 if (!HasFPRCSRs) {
3767 std::vector<unsigned> FrameObjects(MFI.getObjectIndexEnd());
3768 for (auto &MBB : MF) {
3769 for (auto &MI : MBB) {
3770 std::optional<int> FI = getLdStFrameID(MI, MFI);
3771 if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) {
3772 if (MFI.getStackID(ObjectIdx: *FI) == TargetStackID::ScalableVector ||
3773 AArch64InstrInfo::isFpOrNEON(MI))
3774 FrameObjects[*FI] |= 2;
3775 else
3776 FrameObjects[*FI] |= 1;
3777 }
3778 }
3779 }
3780 HasFPRStackObjects =
3781 any_of(Range&: FrameObjects, P: [](unsigned B) { return (B & 3) == 2; });
3782 }
3783
3784 if (HasFPRCSRs || HasFPRStackObjects) {
3785 int ID = MFI.CreateStackObject(Size: StackHazardSize, Alignment: Align(16), isSpillSlot: false);
3786 LLVM_DEBUG(dbgs() << "Created Hazard slot at " << ID << " size "
3787 << StackHazardSize << "\n");
3788 AFI->setStackHazardSlotIndex(ID);
3789 }
3790}
3791
3792void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
3793 BitVector &SavedRegs,
3794 RegScavenger *RS) const {
3795 // All calls are tail calls in GHC calling conv, and functions have no
3796 // prologue/epilogue.
3797 if (MF.getFunction().getCallingConv() == CallingConv::GHC)
3798 return;
3799
3800 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
3801 const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
3802 MF.getSubtarget().getRegisterInfo());
3803 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
3804 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3805 unsigned UnspilledCSGPR = AArch64::NoRegister;
3806 unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
3807
3808 MachineFrameInfo &MFI = MF.getFrameInfo();
3809 const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
3810
3811 unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
3812 ? RegInfo->getBaseRegister()
3813 : (unsigned)AArch64::NoRegister;
3814
3815 unsigned ExtraCSSpill = 0;
3816 bool HasUnpairedGPR64 = false;
3817 bool HasPairZReg = false;
3818 BitVector UserReservedRegs = RegInfo->getUserReservedRegs(MF);
3819 BitVector ReservedRegs = RegInfo->getReservedRegs(MF);
3820
3821 // Figure out which callee-saved registers to save/restore.
3822 for (unsigned i = 0; CSRegs[i]; ++i) {
3823 const unsigned Reg = CSRegs[i];
3824
3825 // Add the base pointer register to SavedRegs if it is callee-save.
3826 if (Reg == BasePointerReg)
3827 SavedRegs.set(Reg);
3828
3829 // Don't save manually reserved registers set through +reserve-x#i,
3830 // even for callee-saved registers, as per GCC's behavior.
3831 if (UserReservedRegs[Reg]) {
3832 SavedRegs.reset(Idx: Reg);
3833 continue;
3834 }
3835
3836 bool RegUsed = SavedRegs.test(Idx: Reg);
3837 unsigned PairedReg = AArch64::NoRegister;
3838 const bool RegIsGPR64 = AArch64::GPR64RegClass.contains(Reg);
3839 if (RegIsGPR64 || AArch64::FPR64RegClass.contains(Reg) ||
3840 AArch64::FPR128RegClass.contains(Reg)) {
3841 // Compensate for odd numbers of GP CSRs.
3842 // For now, all the known cases of odd number of CSRs are of GPRs.
3843 if (HasUnpairedGPR64)
3844 PairedReg = CSRegs[i % 2 == 0 ? i - 1 : i + 1];
3845 else
3846 PairedReg = CSRegs[i ^ 1];
3847 }
3848
3849    // If the function requires saving all the GP registers (SavedRegs), and
3850    // there is an odd number of GP CSRs at the same time (CSRegs), PairedReg
3851    // could be in a different register class from Reg, which would lead to an
3852    // FPR (usually D8) accidentally being marked as saved.
3853 if (RegIsGPR64 && !AArch64::GPR64RegClass.contains(Reg: PairedReg)) {
3854 PairedReg = AArch64::NoRegister;
3855 HasUnpairedGPR64 = true;
3856 }
3857 assert(PairedReg == AArch64::NoRegister ||
3858 AArch64::GPR64RegClass.contains(Reg, PairedReg) ||
3859 AArch64::FPR64RegClass.contains(Reg, PairedReg) ||
3860 AArch64::FPR128RegClass.contains(Reg, PairedReg));
3861
3862 if (!RegUsed) {
3863 if (AArch64::GPR64RegClass.contains(Reg) && !ReservedRegs[Reg]) {
3864 UnspilledCSGPR = Reg;
3865 UnspilledCSGPRPaired = PairedReg;
3866 }
3867 continue;
3868 }
3869
3870 // Always save P4 when PPR spills are ZPR-sized and a predicate above p8 is
3871    // spilled. If all of p0-p3 are used as return values, p4 must be free
3872 // to reload p8-p15.
3873 if (RegInfo->getSpillSize(RC: AArch64::PPRRegClass) == 16 &&
3874 AArch64::PPR_p8to15RegClass.contains(Reg)) {
3875 SavedRegs.set(AArch64::P4);
3876 }
3877
3878 // MachO's compact unwind format relies on all registers being stored in
3879 // pairs.
3880 // FIXME: the usual format is actually better if unwinding isn't needed.
3881 if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister &&
3882 !SavedRegs.test(Idx: PairedReg)) {
3883 SavedRegs.set(PairedReg);
3884 if (AArch64::GPR64RegClass.contains(Reg: PairedReg) &&
3885 !ReservedRegs[PairedReg])
3886 ExtraCSSpill = PairedReg;
3887 }
3888    // Check if there is a pair of ZRegs, so that a PReg can be selected for spill/fill.
3889 HasPairZReg |= (AArch64::ZPRRegClass.contains(Reg1: Reg, Reg2: CSRegs[i ^ 1]) &&
3890 SavedRegs.test(Idx: CSRegs[i ^ 1]));
3891 }
3892
3893 if (HasPairZReg && enableMultiVectorSpillFill(Subtarget, MF)) {
3894 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3895 // Find a suitable predicate register for the multi-vector spill/fill
3896 // instructions.
3897 unsigned PnReg = findFreePredicateReg(SavedRegs);
3898 if (PnReg != AArch64::NoRegister)
3899 AFI->setPredicateRegForFillSpill(PnReg);
3900    // If no free callee-saved register has been found, assign one.
3901 if (!AFI->getPredicateRegForFillSpill() &&
3902 MF.getFunction().getCallingConv() ==
3903 CallingConv::AArch64_SVE_VectorCall) {
3904 SavedRegs.set(AArch64::P8);
3905 AFI->setPredicateRegForFillSpill(AArch64::PN8);
3906 }
3907
3908 assert(!ReservedRegs[AFI->getPredicateRegForFillSpill()] &&
3909 "Predicate cannot be a reserved register");
3910 }
3911
3912 if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
3913 !Subtarget.isTargetWindows()) {
3914    // For the Windows calling convention on a non-Windows OS, where X18 is
3915    // treated as reserved, back up X18 when entering non-Windows code (marked
3916    // with the Windows calling convention) and restore it when returning,
3917    // regardless of whether the individual function uses it - it might call
3918    // other functions that clobber it.
3919 SavedRegs.set(AArch64::X18);
3920 }
3921
3922  // Calculate the callee-saved stack size.
3923 unsigned CSStackSize = 0;
3924 unsigned SVECSStackSize = 0;
3925 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3926 for (unsigned Reg : SavedRegs.set_bits()) {
3927 auto *RC = TRI->getMinimalPhysRegClass(Reg);
3928 assert(RC && "expected register class!");
3929 auto SpillSize = TRI->getSpillSize(RC: *RC);
3930 if (AArch64::PPRRegClass.contains(Reg) ||
3931 AArch64::ZPRRegClass.contains(Reg))
3932 SVECSStackSize += SpillSize;
3933 else
3934 CSStackSize += SpillSize;
3935 }
3936
3937 // Save number of saved regs, so we can easily update CSStackSize later to
3938 // account for any additional 64-bit GPR saves. Note: After this point
3939 // only 64-bit GPRs can be added to SavedRegs.
3940 unsigned NumSavedRegs = SavedRegs.count();
3941
3942 // Increase the callee-saved stack size if the function has streaming mode
3943 // changes, as we will need to spill the value of the VG register.
3944 // For locally streaming functions, we spill both the streaming and
3945 // non-streaming VG value.
3946 SMEAttrs Attrs = AFI->getSMEFnAttrs();
3947 if (requiresSaveVG(MF)) {
3948 if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface())
3949 CSStackSize += 16;
3950 else
3951 CSStackSize += 8;
3952 }
3953
3954 // Determine if a Hazard slot should be used, and increase the CSStackSize by
3955 // StackHazardSize if so.
3956 determineStackHazardSlot(MF, SavedRegs);
3957 if (AFI->hasStackHazardSlotIndex())
3958 CSStackSize += getStackHazardSize(MF);
3959
3960  // If we must call __arm_get_current_vg in the prologue, preserve the LR.
3961 if (requiresSaveVG(MF) && !Subtarget.hasSVE())
3962 SavedRegs.set(AArch64::LR);
3963
3964  // The frame record needs to be created by saving the appropriate registers.
3965 uint64_t EstimatedStackSize = MFI.estimateStackSize(MF);
3966 if (hasFP(MF) ||
3967 windowsRequiresStackProbe(MF, StackSizeInBytes: EstimatedStackSize + CSStackSize + 16)) {
3968 SavedRegs.set(AArch64::FP);
3969 SavedRegs.set(AArch64::LR);
3970 }
3971
3972 LLVM_DEBUG({
3973 dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
3974 for (unsigned Reg : SavedRegs.set_bits())
3975 dbgs() << ' ' << printReg(Reg, RegInfo);
3976 dbgs() << "\n";
3977 });
3978
3979 // If any callee-saved registers are used, the frame cannot be eliminated.
3980 int64_t SVEStackSize =
3981 alignTo(Value: SVECSStackSize + estimateSVEStackObjectOffsets(MF&: MFI), Align: 16);
3982 bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
3983
3984 // The CSR spill slots have not been allocated yet, so estimateStackSize
3985 // won't include them.
3986 unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
3987
3988 // We may address some of the stack above the canonical frame address, either
3989 // for our own arguments or during a call. Include that in calculating whether
3990 // we have complicated addressing concerns.
3991 int64_t CalleeStackUsed = 0;
3992 for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) {
3993 int64_t FixedOff = MFI.getObjectOffset(ObjectIdx: I);
3994 if (FixedOff > CalleeStackUsed)
3995 CalleeStackUsed = FixedOff;
3996 }
3997
3998 // Conservatively always assume BigStack when there are SVE spills.
3999 bool BigStack = SVEStackSize || (EstimatedStackSize + CSStackSize +
4000 CalleeStackUsed) > EstimatedStackSizeLimit;
4001 if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
4002 AFI->setHasStackFrame(true);
4003
4004 // Estimate if we might need to scavenge a register at some point in order
4005 // to materialize a stack offset. If so, either spill one additional
4006 // callee-saved register or reserve a special spill slot to facilitate
4007 // register scavenging. If we already spilled an extra callee-saved register
4008 // above to keep the number of spills even, we don't need to do anything else
4009 // here.
4010 if (BigStack) {
4011 if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
4012 LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
4013 << " to get a scratch register.\n");
4014 SavedRegs.set(UnspilledCSGPR);
4015 ExtraCSSpill = UnspilledCSGPR;
4016
4017 // MachO's compact unwind format relies on all registers being stored in
4018 // pairs, so if we need to spill one extra for BigStack, then we need to
4019 // store the pair.
4020 if (producePairRegisters(MF)) {
4021 if (UnspilledCSGPRPaired == AArch64::NoRegister) {
4022 // Failed to make a pair for compact unwind format, revert spilling.
4023 if (produceCompactUnwindFrame(MF)) {
4024 SavedRegs.reset(Idx: UnspilledCSGPR);
4025 ExtraCSSpill = AArch64::NoRegister;
4026 }
4027 } else
4028 SavedRegs.set(UnspilledCSGPRPaired);
4029 }
4030 }
4031
4032 // If we didn't find an extra callee-saved register to spill, create
4033 // an emergency spill slot.
4034 if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(PhysReg: ExtraCSSpill)) {
4035 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
4036 const TargetRegisterClass &RC = AArch64::GPR64RegClass;
4037 unsigned Size = TRI->getSpillSize(RC);
4038 Align Alignment = TRI->getSpillAlign(RC);
4039 int FI = MFI.CreateSpillStackObject(Size, Alignment);
4040 RS->addScavengingFrameIndex(FI);
4041 LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
4042 << " as the emergency spill slot.\n");
4043 }
4044 }
4045
4046  // Add the size of any additional 64-bit GPR saves.
4047 CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);
4048
4049 // A Swift asynchronous context extends the frame record with a pointer
4050 // directly before FP.
4051 if (hasFP(MF) && AFI->hasSwiftAsyncContext())
4052 CSStackSize += 8;
4053
4054 uint64_t AlignedCSStackSize = alignTo(Value: CSStackSize, Align: 16);
4055 LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
4056 << EstimatedStackSize + AlignedCSStackSize << " bytes.\n");
4057
4058 assert((!MFI.isCalleeSavedInfoValid() ||
4059 AFI->getCalleeSavedStackSize() == AlignedCSStackSize) &&
4060 "Should not invalidate callee saved info");
4061
4062 // Round up to register pair alignment to avoid additional SP adjustment
4063 // instructions.
4064 AFI->setCalleeSavedStackSize(AlignedCSStackSize);
4065 AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
4066 AFI->setSVECalleeSavedStackSize(alignTo(Value: SVECSStackSize, Align: 16));
4067}
4068
4069bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
4070 MachineFunction &MF, const TargetRegisterInfo *RegInfo,
4071 std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
4072 unsigned &MaxCSFrameIndex) const {
4073 bool NeedsWinCFI = needsWinCFI(MF);
4074 unsigned StackHazardSize = getStackHazardSize(MF);
4075 // To match the canonical windows frame layout, reverse the list of
4076 // callee saved registers to get them laid out by PrologEpilogInserter
4077 // in the right order. (PrologEpilogInserter allocates stack objects top
4078 // down. Windows canonical prologs store higher numbered registers at
4079 // the top, thus have the CSI array start from the highest registers.)
4080 if (NeedsWinCFI)
4081 std::reverse(first: CSI.begin(), last: CSI.end());
4082
4083 if (CSI.empty())
4084 return true; // Early exit if no callee saved registers are modified!
4085
4086 // Now that we know which registers need to be saved and restored, allocate
4087 // stack slots for them.
4088 MachineFrameInfo &MFI = MF.getFrameInfo();
4089 auto *AFI = MF.getInfo<AArch64FunctionInfo>();
4090
4091 bool UsesWinAAPCS = isTargetWindows(MF);
4092 if (UsesWinAAPCS && hasFP(MF) && AFI->hasSwiftAsyncContext()) {
4093 int FrameIdx = MFI.CreateStackObject(Size: 8, Alignment: Align(16), isSpillSlot: true);
4094 AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
4095 if ((unsigned)FrameIdx < MinCSFrameIndex)
4096 MinCSFrameIndex = FrameIdx;
4097 if ((unsigned)FrameIdx > MaxCSFrameIndex)
4098 MaxCSFrameIndex = FrameIdx;
4099 }
4100
4101 // Insert VG into the list of CSRs, immediately before LR if saved.
4102 if (requiresSaveVG(MF)) {
4103 std::vector<CalleeSavedInfo> VGSaves;
4104 SMEAttrs Attrs = AFI->getSMEFnAttrs();
4105
4106 auto VGInfo = CalleeSavedInfo(AArch64::VG);
4107 VGInfo.setRestored(false);
4108 VGSaves.push_back(x: VGInfo);
4109
4110 // Add VG again if the function is locally-streaming, as we will spill two
4111 // values.
4112 if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface())
4113 VGSaves.push_back(x: VGInfo);
4114
4115 bool InsertBeforeLR = false;
4116
4117 for (unsigned I = 0; I < CSI.size(); I++)
4118 if (CSI[I].getReg() == AArch64::LR) {
4119 InsertBeforeLR = true;
4120 CSI.insert(position: CSI.begin() + I, first: VGSaves.begin(), last: VGSaves.end());
4121 break;
4122 }
4123
4124 if (!InsertBeforeLR)
4125 llvm::append_range(C&: CSI, R&: VGSaves);
4126 }
4127
4128 Register LastReg = 0;
4129 int HazardSlotIndex = std::numeric_limits<int>::max();
4130 for (auto &CS : CSI) {
4131 MCRegister Reg = CS.getReg();
4132 const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
4133
4134 // Create a hazard slot as we switch between GPR and FPR CSRs.
4135 if (AFI->hasStackHazardSlotIndex() &&
4136 (!LastReg || !AArch64InstrInfo::isFpOrNEON(Reg: LastReg)) &&
4137 AArch64InstrInfo::isFpOrNEON(Reg)) {
4138 assert(HazardSlotIndex == std::numeric_limits<int>::max() &&
4139 "Unexpected register order for hazard slot");
4140 HazardSlotIndex = MFI.CreateStackObject(Size: StackHazardSize, Alignment: Align(8), isSpillSlot: true);
4141 LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex
4142 << "\n");
4143 AFI->setStackHazardCSRSlotIndex(HazardSlotIndex);
4144 if ((unsigned)HazardSlotIndex < MinCSFrameIndex)
4145 MinCSFrameIndex = HazardSlotIndex;
4146 if ((unsigned)HazardSlotIndex > MaxCSFrameIndex)
4147 MaxCSFrameIndex = HazardSlotIndex;
4148 }
4149
4150 unsigned Size = RegInfo->getSpillSize(RC: *RC);
4151 Align Alignment(RegInfo->getSpillAlign(RC: *RC));
4152 int FrameIdx = MFI.CreateStackObject(Size, Alignment, isSpillSlot: true);
4153 CS.setFrameIdx(FrameIdx);
4154
4155 if ((unsigned)FrameIdx < MinCSFrameIndex)
4156 MinCSFrameIndex = FrameIdx;
4157 if ((unsigned)FrameIdx > MaxCSFrameIndex)
4158 MaxCSFrameIndex = FrameIdx;
4159
4160 // Grab 8 bytes below FP for the extended asynchronous frame info.
4161 if (hasFP(MF) && AFI->hasSwiftAsyncContext() && !UsesWinAAPCS &&
4162 Reg == AArch64::FP) {
4163 FrameIdx = MFI.CreateStackObject(Size: 8, Alignment, isSpillSlot: true);
4164 AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
4165 if ((unsigned)FrameIdx < MinCSFrameIndex)
4166 MinCSFrameIndex = FrameIdx;
4167 if ((unsigned)FrameIdx > MaxCSFrameIndex)
4168 MaxCSFrameIndex = FrameIdx;
4169 }
4170 LastReg = Reg;
4171 }
4172
4173 // Add hazard slot in the case where no FPR CSRs are present.
4174 if (AFI->hasStackHazardSlotIndex() &&
4175 HazardSlotIndex == std::numeric_limits<int>::max()) {
4176 HazardSlotIndex = MFI.CreateStackObject(Size: StackHazardSize, Alignment: Align(8), isSpillSlot: true);
4177 LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex
4178 << "\n");
4179 AFI->setStackHazardCSRSlotIndex(HazardSlotIndex);
4180 if ((unsigned)HazardSlotIndex < MinCSFrameIndex)
4181 MinCSFrameIndex = HazardSlotIndex;
4182 if ((unsigned)HazardSlotIndex > MaxCSFrameIndex)
4183 MaxCSFrameIndex = HazardSlotIndex;
4184 }
4185
4186 return true;
4187}
4188
4189bool AArch64FrameLowering::enableStackSlotScavenging(
4190 const MachineFunction &MF) const {
4191 const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
4192 // If the function has streaming-mode changes, don't scavenge a
4193 // spillslot in the callee-save area, as that might require an
4194 // 'addvl' in the streaming-mode-changing call-sequence when the
4195 // function doesn't use a FP.
4196 if (AFI->hasStreamingModeChanges() && !hasFP(MF))
4197 return false;
4198   // Don't allow register scavenging with hazard slots, in case it moves
4199   // objects into the wrong place.
4200 if (AFI->hasStackHazardSlotIndex())
4201 return false;
4202 return AFI->hasCalleeSaveStackFreeSpace();
4203}
4204
4205 /// Returns true if there are any SVE callee saves.
4206static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
4207 int &Min, int &Max) {
4208 Min = std::numeric_limits<int>::max();
4209 Max = std::numeric_limits<int>::min();
4210
4211 if (!MFI.isCalleeSavedInfoValid())
4212 return false;
4213
4214 const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
4215 for (auto &CS : CSI) {
4216 if (AArch64::ZPRRegClass.contains(Reg: CS.getReg()) ||
4217 AArch64::PPRRegClass.contains(Reg: CS.getReg())) {
4218 assert((Max == std::numeric_limits<int>::min() ||
4219 Max + 1 == CS.getFrameIdx()) &&
4220 "SVE CalleeSaves are not consecutive");
4221
4222 Min = std::min(a: Min, b: CS.getFrameIdx());
4223 Max = std::max(a: Max, b: CS.getFrameIdx());
4224 }
4225 }
4226 return Min != std::numeric_limits<int>::max();
4227}
4228
4229// Process all the SVE stack objects and determine offsets for each
4230// object. If AssignOffsets is true, the offsets get assigned.
4231// Fills in the first and last callee-saved frame indices into
4232// Min/MaxCSFrameIndex, respectively.
4233 // Returns the total size of the SVE stack area.
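// A rough worked example (sizes are hypothetical): two 16-byte SVE callee-save
// slots followed by a 32-byte, 16-byte-aligned SVE local would be assigned the
// offsets -16, -32 and -64 (in scalable bytes), with the callee-save area
// padded to a multiple of 16 bytes before the locals are laid out.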
4234static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
4235 int &MinCSFrameIndex,
4236 int &MaxCSFrameIndex,
4237 bool AssignOffsets) {
4238#ifndef NDEBUG
4239 // First process all fixed stack objects.
4240 for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
4241 assert(MFI.getStackID(I) != TargetStackID::ScalableVector &&
4242 "SVE vectors should never be passed on the stack by value, only by "
4243 "reference.");
4244#endif
4245
4246 auto Assign = [&MFI](int FI, int64_t Offset) {
4247 LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
4248 MFI.setObjectOffset(ObjectIdx: FI, SPOffset: Offset);
4249 };
4250
4251 int64_t Offset = 0;
4252
4253 // Then process all callee saved slots.
4254 if (getSVECalleeSaveSlotRange(MFI, Min&: MinCSFrameIndex, Max&: MaxCSFrameIndex)) {
4255 // Assign offsets to the callee save slots.
4256 for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
4257 Offset += MFI.getObjectSize(ObjectIdx: I);
4258 Offset = alignTo(Size: Offset, A: MFI.getObjectAlign(ObjectIdx: I));
4259 if (AssignOffsets)
4260 Assign(I, -Offset);
4261 }
4262 }
4263
4264   // Ensure that the callee-save area is aligned to 16 bytes.
4265 Offset = alignTo(Size: Offset, A: Align(16U));
4266
4267 // Create a buffer of SVE objects to allocate and sort it.
4268 SmallVector<int, 8> ObjectsToAllocate;
4269 // If we have a stack protector, and we've previously decided that we have SVE
4270 // objects on the stack and thus need it to go in the SVE stack area, then it
4271 // needs to go first.
4272 int StackProtectorFI = -1;
4273 if (MFI.hasStackProtectorIndex()) {
4274 StackProtectorFI = MFI.getStackProtectorIndex();
4275 if (MFI.getStackID(ObjectIdx: StackProtectorFI) == TargetStackID::ScalableVector)
4276 ObjectsToAllocate.push_back(Elt: StackProtectorFI);
4277 }
4278 for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
4279 unsigned StackID = MFI.getStackID(ObjectIdx: I);
4280 if (StackID != TargetStackID::ScalableVector)
4281 continue;
4282 if (I == StackProtectorFI)
4283 continue;
4284 if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
4285 continue;
4286 if (MFI.isDeadObjectIndex(ObjectIdx: I))
4287 continue;
4288
4289 ObjectsToAllocate.push_back(Elt: I);
4290 }
4291
4292 // Allocate all SVE locals and spills
4293 for (unsigned FI : ObjectsToAllocate) {
4294 Align Alignment = MFI.getObjectAlign(ObjectIdx: FI);
4295 // FIXME: Given that the length of SVE vectors is not necessarily a power of
4296 // two, we'd need to align every object dynamically at runtime if the
4297 // alignment is larger than 16. This is not yet supported.
4298 if (Alignment > Align(16))
4299 report_fatal_error(
4300 reason: "Alignment of scalable vectors > 16 bytes is not yet supported");
4301
4302 Offset = alignTo(Size: Offset + MFI.getObjectSize(ObjectIdx: FI), A: Alignment);
4303 if (AssignOffsets)
4304 Assign(FI, -Offset);
4305 }
4306
4307 return Offset;
4308}
4309
4310int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
4311 MachineFrameInfo &MFI) const {
4312 int MinCSFrameIndex, MaxCSFrameIndex;
4313 return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, AssignOffsets: false);
4314}
4315
4316int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
4317 MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
4318 return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
4319 AssignOffsets: true);
4320}
4321
4322/// Attempts to scavenge a register from \p ScavengeableRegs given the used
4323/// registers in \p UsedRegs.
4324static Register tryScavengeRegister(LiveRegUnits const &UsedRegs,
4325 BitVector const &ScavengeableRegs,
4326 Register PreferredReg) {
4327 if (PreferredReg != AArch64::NoRegister && UsedRegs.available(Reg: PreferredReg))
4328 return PreferredReg;
4329 for (auto Reg : ScavengeableRegs.set_bits()) {
4330 if (UsedRegs.available(Reg))
4331 return Reg;
4332 }
4333 return AArch64::NoRegister;
4334}
4335
4336/// Propagates frame-setup/destroy flags from \p SourceMI to all instructions in
4337/// \p MachineInstrs.
4338static void propagateFrameFlags(MachineInstr &SourceMI,
4339 ArrayRef<MachineInstr *> MachineInstrs) {
4340 for (MachineInstr *MI : MachineInstrs) {
4341 if (SourceMI.getFlag(Flag: MachineInstr::FrameSetup))
4342 MI->setFlag(MachineInstr::FrameSetup);
4343 if (SourceMI.getFlag(Flag: MachineInstr::FrameDestroy))
4344 MI->setFlag(MachineInstr::FrameDestroy);
4345 }
4346}
4347
4348 /// RAII helper class for scavenging or spilling a register. On construction
4349 /// it attempts to find a free register of class \p RC (given \p UsedRegs and
4350 /// \p AllocatableRegs); if none can be found, it spills \p SpillCandidate to
4351 /// \p MaybeSpillFI to free one. The freed register can be accessed via
4352 /// freeRegister() or operator*. On destruction, if there was a spill, its
4353 /// previous value is reloaded. The spilling and scavenging are only valid at
4354 /// the insertion point \p MBBI; this class should _not_ be used in places
4355 /// that create or manipulate basic blocks, moving the expected insertion point.
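///
/// A rough usage sketch (the names here are illustrative, not taken from the
/// actual callers):
/// ```
/// ScopedScavengeOrSpill Scratch(MF, MBB, MI, AArch64::X0,
///                               AArch64::GPR64RegClass, UsedRegs, GPRRegs,
///                               &SpillSlots.GPRSpillFI);
/// // ... build instructions that use *Scratch ...
/// // On scope exit the spilled value (if any) is reloaded.
/// ```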
4356struct ScopedScavengeOrSpill {
4357 ScopedScavengeOrSpill(const ScopedScavengeOrSpill &) = delete;
4358 ScopedScavengeOrSpill(ScopedScavengeOrSpill &&) = delete;
4359
4360 ScopedScavengeOrSpill(MachineFunction &MF, MachineBasicBlock &MBB,
4361 MachineBasicBlock::iterator MBBI,
4362 Register SpillCandidate, const TargetRegisterClass &RC,
4363 LiveRegUnits const &UsedRegs,
4364 BitVector const &AllocatableRegs,
4365 std::optional<int> *MaybeSpillFI,
4366 Register PreferredReg = AArch64::NoRegister)
4367 : MBB(MBB), MBBI(MBBI), RC(RC), TII(static_cast<const AArch64InstrInfo &>(
4368 *MF.getSubtarget().getInstrInfo())),
4369 TRI(*MF.getSubtarget().getRegisterInfo()) {
4370 FreeReg = tryScavengeRegister(UsedRegs, ScavengeableRegs: AllocatableRegs, PreferredReg);
4371 if (FreeReg != AArch64::NoRegister)
4372 return;
4373 assert(MaybeSpillFI && "Expected emergency spill slot FI information "
4374 "(attempted to spill in prologue/epilogue?)");
4375 if (!MaybeSpillFI->has_value()) {
4376 MachineFrameInfo &MFI = MF.getFrameInfo();
4377 *MaybeSpillFI = MFI.CreateSpillStackObject(Size: TRI.getSpillSize(RC),
4378 Alignment: TRI.getSpillAlign(RC));
4379 }
4380 FreeReg = SpillCandidate;
4381 SpillFI = MaybeSpillFI->value();
4382 TII.storeRegToStackSlot(MBB, MBBI, SrcReg: FreeReg, isKill: false, FrameIndex: *SpillFI, RC: &RC, TRI: &TRI,
4383 VReg: Register());
4384 }
4385
4386 bool hasSpilled() const { return SpillFI.has_value(); }
4387
4388 /// Returns the free register (found from scavenging or spilling a register).
4389 Register freeRegister() const { return FreeReg; }
4390
4391 Register operator*() const { return freeRegister(); }
4392
4393 ~ScopedScavengeOrSpill() {
4394 if (hasSpilled())
4395 TII.loadRegFromStackSlot(MBB, MBBI, DestReg: FreeReg, FrameIndex: *SpillFI, RC: &RC, TRI: &TRI,
4396 VReg: Register());
4397 }
4398
4399private:
4400 MachineBasicBlock &MBB;
4401 MachineBasicBlock::iterator MBBI;
4402 const TargetRegisterClass &RC;
4403 const AArch64InstrInfo &TII;
4404 const TargetRegisterInfo &TRI;
4405 Register FreeReg = AArch64::NoRegister;
4406 std::optional<int> SpillFI;
4407};
4408
4409/// Emergency stack slots for expanding SPILL_PPR_TO_ZPR_SLOT_PSEUDO and
4410/// FILL_PPR_FROM_ZPR_SLOT_PSEUDO.
4411struct EmergencyStackSlots {
4412 std::optional<int> ZPRSpillFI;
4413 std::optional<int> PPRSpillFI;
4414 std::optional<int> GPRSpillFI;
4415};
4416
4417/// Registers available for scavenging (ZPR, PPR3b, GPR).
4418struct ScavengeableRegs {
4419 BitVector ZPRRegs;
4420 BitVector PPR3bRegs;
4421 BitVector GPRRegs;
4422};
4423
4424static bool isInPrologueOrEpilogue(const MachineInstr &MI) {
4425 return MI.getFlag(Flag: MachineInstr::FrameSetup) ||
4426 MI.getFlag(Flag: MachineInstr::FrameDestroy);
4427}
4428
4429/// Expands:
4430/// ```
4431/// SPILL_PPR_TO_ZPR_SLOT_PSEUDO $p0, %stack.0, 0
4432/// ```
4433/// To:
4434/// ```
4435/// $z0 = CPY_ZPzI_B $p0, 1, 0
4436/// STR_ZXI $z0, $stack.0, 0
4437/// ```
4438 /// While ensuring a ZPR ($z0 in this example) is free for the predicate
4439 /// (spilling if necessary).
4440static void expandSpillPPRToZPRSlotPseudo(MachineBasicBlock &MBB,
4441 MachineInstr &MI,
4442 const TargetRegisterInfo &TRI,
4443 LiveRegUnits const &UsedRegs,
4444 ScavengeableRegs const &SR,
4445 EmergencyStackSlots &SpillSlots) {
4446 MachineFunction &MF = *MBB.getParent();
4447 auto *TII =
4448 static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
4449
4450 ScopedScavengeOrSpill ZPredReg(
4451 MF, MBB, MI, AArch64::Z0, AArch64::ZPRRegClass, UsedRegs, SR.ZPRRegs,
4452 isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.ZPRSpillFI);
4453
4454 SmallVector<MachineInstr *, 2> MachineInstrs;
4455 const DebugLoc &DL = MI.getDebugLoc();
4456 MachineInstrs.push_back(Elt: BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AArch64::CPY_ZPzI_B))
4457 .addReg(RegNo: *ZPredReg, flags: RegState::Define)
4458 .add(MO: MI.getOperand(i: 0))
4459 .addImm(Val: 1)
4460 .addImm(Val: 0)
4461 .getInstr());
4462 MachineInstrs.push_back(Elt: BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AArch64::STR_ZXI))
4463 .addReg(RegNo: *ZPredReg)
4464 .add(MO: MI.getOperand(i: 1))
4465 .addImm(Val: MI.getOperand(i: 2).getImm())
4466 .setMemRefs(MI.memoperands())
4467 .getInstr());
4468 propagateFrameFlags(SourceMI&: MI, MachineInstrs);
4469}
4470
4471/// Expands:
4472/// ```
4473/// $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0
4474/// ```
4475/// To:
4476/// ```
4477/// $z0 = LDR_ZXI %stack.0, 0
4478/// $p0 = PTRUE_B 31, implicit $vg
4479/// $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
4480/// ```
4481 /// While ensuring a ZPR ($z0 in this example) is free for the predicate
4482 /// (spilling if necessary). If the status flags are in use at the point of
4483/// expansion they are preserved (by moving them to/from a GPR). This may cause
4484/// an additional spill if no GPR is free at the expansion point.
4485static bool expandFillPPRFromZPRSlotPseudo(
4486 MachineBasicBlock &MBB, MachineInstr &MI, const TargetRegisterInfo &TRI,
4487 LiveRegUnits const &UsedRegs, ScavengeableRegs const &SR,
4488 MachineInstr *&LastPTrue, EmergencyStackSlots &SpillSlots) {
4489 MachineFunction &MF = *MBB.getParent();
4490 auto *TII =
4491 static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
4492
4493 ScopedScavengeOrSpill ZPredReg(
4494 MF, MBB, MI, AArch64::Z0, AArch64::ZPRRegClass, UsedRegs, SR.ZPRRegs,
4495 isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.ZPRSpillFI);
4496
4497 ScopedScavengeOrSpill PredReg(
4498 MF, MBB, MI, AArch64::P0, AArch64::PPR_3bRegClass, UsedRegs, SR.PPR3bRegs,
4499 isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.PPRSpillFI,
4500 /*PreferredReg=*/
4501 LastPTrue ? LastPTrue->getOperand(i: 0).getReg() : AArch64::NoRegister);
4502
4503 // Elide NZCV spills if we know it is not used.
4504 bool IsNZCVUsed = !UsedRegs.available(Reg: AArch64::NZCV);
4505 std::optional<ScopedScavengeOrSpill> NZCVSaveReg;
4506 if (IsNZCVUsed)
4507 NZCVSaveReg.emplace(
4508 args&: MF, args&: MBB, args&: MI, args: AArch64::X0, args: AArch64::GPR64RegClass, args: UsedRegs, args: SR.GPRRegs,
4509 args: isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.GPRSpillFI);
4510 SmallVector<MachineInstr *, 4> MachineInstrs;
4511 const DebugLoc &DL = MI.getDebugLoc();
4512 MachineInstrs.push_back(Elt: BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AArch64::LDR_ZXI))
4513 .addReg(RegNo: *ZPredReg, flags: RegState::Define)
4514 .add(MO: MI.getOperand(i: 1))
4515 .addImm(Val: MI.getOperand(i: 2).getImm())
4516 .setMemRefs(MI.memoperands())
4517 .getInstr());
4518 if (IsNZCVUsed)
4519 MachineInstrs.push_back(
4520 Elt: BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AArch64::MRS))
4521 .addReg(RegNo: NZCVSaveReg->freeRegister(), flags: RegState::Define)
4522 .addImm(Val: AArch64SysReg::NZCV)
4523 .addReg(RegNo: AArch64::NZCV, flags: RegState::Implicit)
4524 .getInstr());
4525
4526 // Reuse previous ptrue if we know it has not been clobbered.
4527 if (LastPTrue) {
4528 assert(*PredReg == LastPTrue->getOperand(0).getReg());
4529 LastPTrue->moveBefore(MovePos: &MI);
4530 } else {
4531 LastPTrue = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AArch64::PTRUE_B))
4532 .addReg(RegNo: *PredReg, flags: RegState::Define)
4533 .addImm(Val: 31);
4534 }
4535 MachineInstrs.push_back(Elt: LastPTrue);
4536 MachineInstrs.push_back(
4537 Elt: BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AArch64::CMPNE_PPzZI_B))
4538 .addReg(RegNo: MI.getOperand(i: 0).getReg(), flags: RegState::Define)
4539 .addReg(RegNo: *PredReg)
4540 .addReg(RegNo: *ZPredReg)
4541 .addImm(Val: 0)
4542 .addReg(RegNo: AArch64::NZCV, flags: RegState::ImplicitDefine)
4543 .getInstr());
4544 if (IsNZCVUsed)
4545 MachineInstrs.push_back(Elt: BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AArch64::MSR))
4546 .addImm(Val: AArch64SysReg::NZCV)
4547 .addReg(RegNo: NZCVSaveReg->freeRegister())
4548 .addReg(RegNo: AArch64::NZCV, flags: RegState::ImplicitDefine)
4549 .getInstr());
4550
4551 propagateFrameFlags(SourceMI&: MI, MachineInstrs);
4552 return PredReg.hasSpilled();
4553}
4554
4555/// Expands all FILL_PPR_FROM_ZPR_SLOT_PSEUDO and SPILL_PPR_TO_ZPR_SLOT_PSEUDO
4556/// operations within the MachineBasicBlock \p MBB.
4557static bool expandSMEPPRToZPRSpillPseudos(MachineBasicBlock &MBB,
4558 const TargetRegisterInfo &TRI,
4559 ScavengeableRegs const &SR,
4560 EmergencyStackSlots &SpillSlots) {
4561 LiveRegUnits UsedRegs(TRI);
4562 UsedRegs.addLiveOuts(MBB);
4563 bool HasPPRSpills = false;
4564 MachineInstr *LastPTrue = nullptr;
4565 for (MachineInstr &MI : make_early_inc_range(Range: reverse(C&: MBB))) {
4566 UsedRegs.stepBackward(MI);
4567 switch (MI.getOpcode()) {
4568 case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
4569 if (LastPTrue &&
4570 MI.definesRegister(Reg: LastPTrue->getOperand(i: 0).getReg(), TRI: &TRI))
4571 LastPTrue = nullptr;
4572 HasPPRSpills |= expandFillPPRFromZPRSlotPseudo(MBB, MI, TRI, UsedRegs, SR,
4573 LastPTrue, SpillSlots);
4574 MI.eraseFromParent();
4575 break;
4576 case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
4577 expandSpillPPRToZPRSlotPseudo(MBB, MI, TRI, UsedRegs, SR, SpillSlots);
4578 MI.eraseFromParent();
4579 [[fallthrough]];
4580 default:
4581 LastPTrue = nullptr;
4582 break;
4583 }
4584 }
4585
4586 return HasPPRSpills;
4587}
4588
4589void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
4590 MachineFunction &MF, RegScavenger *RS) const {
4591
4592 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
4593 const TargetSubtargetInfo &TSI = MF.getSubtarget();
4594 const TargetRegisterInfo &TRI = *TSI.getRegisterInfo();
4595
4596   // If predicate spills are 16 bytes, we may need to expand
4597   // SPILL_PPR_TO_ZPR_SLOT_PSEUDO/FILL_PPR_FROM_ZPR_SLOT_PSEUDO.
4598 if (AFI->hasStackFrame() && TRI.getSpillSize(RC: AArch64::PPRRegClass) == 16) {
4599 auto ComputeScavengeableRegisters = [&](unsigned RegClassID) {
4600 BitVector Regs = TRI.getAllocatableSet(MF, RC: TRI.getRegClass(i: RegClassID));
4601 assert(Regs.count() > 0 && "Expected scavengeable registers");
4602 return Regs;
4603 };
4604
4605 ScavengeableRegs SR{};
4606 SR.ZPRRegs = ComputeScavengeableRegisters(AArch64::ZPRRegClassID);
4607 // Only p0-7 are possible as the second operand of cmpne (needed for fills).
4608 SR.PPR3bRegs = ComputeScavengeableRegisters(AArch64::PPR_3bRegClassID);
4609 SR.GPRRegs = ComputeScavengeableRegisters(AArch64::GPR64RegClassID);
4610
4611 EmergencyStackSlots SpillSlots;
4612 for (MachineBasicBlock &MBB : MF) {
4613 // In the case we had to spill a predicate (in the range p0-p7) to reload
4614 // a predicate (>= p8), additional spill/fill pseudos will be created.
4615 // These need an additional expansion pass. Note: There will only be at
4616 // most two expansion passes, as spilling/filling a predicate in the range
4617 // p0-p7 never requires spilling another predicate.
4618 for (int Pass = 0; Pass < 2; Pass++) {
4619 bool HasPPRSpills =
4620 expandSMEPPRToZPRSpillPseudos(MBB, TRI, SR, SpillSlots);
4621 assert((Pass == 0 || !HasPPRSpills) && "Did not expect PPR spills");
4622 if (!HasPPRSpills)
4623 break;
4624 }
4625 }
4626 }
4627
4628 MachineFrameInfo &MFI = MF.getFrameInfo();
4629
4630 assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
4631 "Upwards growing stack unsupported");
4632
4633 int MinCSFrameIndex, MaxCSFrameIndex;
4634 int64_t SVEStackSize =
4635 assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
4636
4637 AFI->setStackSizeSVE(alignTo(Value: SVEStackSize, Align: 16U));
4638 AFI->setMinMaxSVECSFrameIndex(Min: MinCSFrameIndex, Max: MaxCSFrameIndex);
4639
4640 // If this function isn't doing Win64-style C++ EH, we don't need to do
4641 // anything.
4642 if (!MF.hasEHFunclets())
4643 return;
4644 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
4645 WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
4646
4647 MachineBasicBlock &MBB = MF.front();
4648 auto MBBI = MBB.begin();
4649 while (MBBI != MBB.end() && MBBI->getFlag(Flag: MachineInstr::FrameSetup))
4650 ++MBBI;
4651
4652 // Create an UnwindHelp object.
4653   // The UnwindHelp object is allocated at the start of the fixed object area.
4654 int64_t FixedObject =
4655 getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false);
4656 int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8,
4657 /*SPOffset*/ -FixedObject,
4658 /*IsImmutable=*/false);
4659 EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
4660
4661 // We need to store -2 into the UnwindHelp object at the start of the
4662 // function.
4663 DebugLoc DL;
4664 RS->enterBasicBlockEnd(MBB);
4665 RS->backward(I: MBBI);
4666 Register DstReg = RS->FindUnusedReg(RC: &AArch64::GPR64commonRegClass);
4667 assert(DstReg && "There must be a free register after frame setup");
4668 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: AArch64::MOVi64imm), DestReg: DstReg).addImm(Val: -2);
4669 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: AArch64::STURXi))
4670 .addReg(RegNo: DstReg, flags: getKillRegState(B: true))
4671 .addFrameIndex(Idx: UnwindHelpFI)
4672 .addImm(Val: 0);
4673}
4674
4675namespace {
4676struct TagStoreInstr {
4677 MachineInstr *MI;
4678 int64_t Offset, Size;
4679 explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
4680 : MI(MI), Offset(Offset), Size(Size) {}
4681};
4682
4683class TagStoreEdit {
4684 MachineFunction *MF;
4685 MachineBasicBlock *MBB;
4686 MachineRegisterInfo *MRI;
4687 // Tag store instructions that are being replaced.
4688 SmallVector<TagStoreInstr, 8> TagStores;
4689 // Combined memref arguments of the above instructions.
4690 SmallVector<MachineMemOperand *, 8> CombinedMemRefs;
4691
4692 // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
4693 // FrameRegOffset + Size) with the address tag of SP.
4694 Register FrameReg;
4695 StackOffset FrameRegOffset;
4696 int64_t Size;
4697 // If not std::nullopt, move FrameReg to (FrameReg + FrameRegUpdate) at the
4698 // end.
4699 std::optional<int64_t> FrameRegUpdate;
4700 // MIFlags for any FrameReg updating instructions.
4701 unsigned FrameRegUpdateFlags;
4702
4703 // Use zeroing instruction variants.
4704 bool ZeroData;
4705 DebugLoc DL;
4706
4707 void emitUnrolled(MachineBasicBlock::iterator InsertI);
4708 void emitLoop(MachineBasicBlock::iterator InsertI);
4709
4710public:
4711 TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
4712 : MBB(MBB), ZeroData(ZeroData) {
4713 MF = MBB->getParent();
4714 MRI = &MF->getRegInfo();
4715 }
4716   // Add an instruction to be replaced. Instructions must be added in
4717   // ascending order of Offset and have to be adjacent.
4718 void addInstruction(TagStoreInstr I) {
4719 assert((TagStores.empty() ||
4720 TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
4721 "Non-adjacent tag store instructions.");
4722 TagStores.push_back(Elt: I);
4723 }
4724 void clear() { TagStores.clear(); }
4725 // Emit equivalent code at the given location, and erase the current set of
4726 // instructions. May skip if the replacement is not profitable. May invalidate
4727 // the input iterator and replace it with a valid one.
4728 void emitCode(MachineBasicBlock::iterator &InsertI,
4729 const AArch64FrameLowering *TFI, bool TryMergeSPUpdate);
4730};
4731
4732void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
4733 const AArch64InstrInfo *TII =
4734 MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
4735
4736 const int64_t kMinOffset = -256 * 16;
4737 const int64_t kMaxOffset = 255 * 16;
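  // Note: these bounds correspond to a signed, 16-byte-scaled 9-bit immediate
  // ([-256, 255] * 16), the offset form used by the STG/ST2G instructions.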
4738
4739 Register BaseReg = FrameReg;
4740 int64_t BaseRegOffsetBytes = FrameRegOffset.getFixed();
4741 if (BaseRegOffsetBytes < kMinOffset ||
4742 BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset ||
4743 // BaseReg can be FP, which is not necessarily aligned to 16-bytes. In
4744 // that case, BaseRegOffsetBytes will not be aligned to 16 bytes, which
4745 // is required for the offset of ST2G.
4746 BaseRegOffsetBytes % 16 != 0) {
4747 Register ScratchReg = MRI->createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
4748 emitFrameOffset(MBB&: *MBB, MBBI: InsertI, DL, DestReg: ScratchReg, SrcReg: BaseReg,
4749 Offset: StackOffset::getFixed(Fixed: BaseRegOffsetBytes), TII);
4750 BaseReg = ScratchReg;
4751 BaseRegOffsetBytes = 0;
4752 }
4753
4754 MachineInstr *LastI = nullptr;
4755 while (Size) {
4756 int64_t InstrSize = (Size > 16) ? 32 : 16;
4757 unsigned Opcode =
4758 InstrSize == 16
4759 ? (ZeroData ? AArch64::STZGi : AArch64::STGi)
4760 : (ZeroData ? AArch64::STZ2Gi : AArch64::ST2Gi);
4761 assert(BaseRegOffsetBytes % 16 == 0);
4762 MachineInstr *I = BuildMI(BB&: *MBB, I: InsertI, MIMD: DL, MCID: TII->get(Opcode))
4763 .addReg(RegNo: AArch64::SP)
4764 .addReg(RegNo: BaseReg)
4765 .addImm(Val: BaseRegOffsetBytes / 16)
4766 .setMemRefs(CombinedMemRefs);
4767 // A store to [BaseReg, #0] should go last for an opportunity to fold the
4768 // final SP adjustment in the epilogue.
4769 if (BaseRegOffsetBytes == 0)
4770 LastI = I;
4771 BaseRegOffsetBytes += InstrSize;
4772 Size -= InstrSize;
4773 }
4774
4775 if (LastI)
4776 MBB->splice(Where: InsertI, Other: MBB, From: LastI);
4777}
4778
4779void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
4780 const AArch64InstrInfo *TII =
4781 MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
4782
4783 Register BaseReg = FrameRegUpdate
4784 ? FrameReg
4785 : MRI->createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
4786 Register SizeReg = MRI->createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
4787
4788 emitFrameOffset(MBB&: *MBB, MBBI: InsertI, DL, DestReg: BaseReg, SrcReg: FrameReg, Offset: FrameRegOffset, TII);
4789
4790 int64_t LoopSize = Size;
4791 // If the loop size is not a multiple of 32, split off one 16-byte store at
4792 // the end to fold BaseReg update into.
4793 if (FrameRegUpdate && *FrameRegUpdate)
4794 LoopSize -= LoopSize % 32;
4795 MachineInstr *LoopI = BuildMI(BB&: *MBB, I: InsertI, MIMD: DL,
4796 MCID: TII->get(Opcode: ZeroData ? AArch64::STZGloop_wback
4797 : AArch64::STGloop_wback))
4798 .addDef(RegNo: SizeReg)
4799 .addDef(RegNo: BaseReg)
4800 .addImm(Val: LoopSize)
4801 .addReg(RegNo: BaseReg)
4802 .setMemRefs(CombinedMemRefs);
4803 if (FrameRegUpdate)
4804 LoopI->setFlags(FrameRegUpdateFlags);
4805
4806 int64_t ExtraBaseRegUpdate =
4807 FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : 0;
4808 LLVM_DEBUG(dbgs() << "TagStoreEdit::emitLoop: LoopSize=" << LoopSize
4809 << ", Size=" << Size
4810 << ", ExtraBaseRegUpdate=" << ExtraBaseRegUpdate
4811 << ", FrameRegUpdate=" << FrameRegUpdate
4812 << ", FrameRegOffset.getFixed()="
4813 << FrameRegOffset.getFixed() << "\n");
4814 if (LoopSize < Size) {
4815 assert(FrameRegUpdate);
4816 assert(Size - LoopSize == 16);
4817 // Tag 16 more bytes at BaseReg and update BaseReg.
4818 int64_t STGOffset = ExtraBaseRegUpdate + 16;
4819 assert(STGOffset % 16 == 0 && STGOffset >= -4096 && STGOffset <= 4080 &&
4820 "STG immediate out of range");
4821 BuildMI(BB&: *MBB, I: InsertI, MIMD: DL,
4822 MCID: TII->get(Opcode: ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
4823 .addDef(RegNo: BaseReg)
4824 .addReg(RegNo: BaseReg)
4825 .addReg(RegNo: BaseReg)
4826 .addImm(Val: STGOffset / 16)
4827 .setMemRefs(CombinedMemRefs)
4828 .setMIFlags(FrameRegUpdateFlags);
4829 } else if (ExtraBaseRegUpdate) {
4830 // Update BaseReg.
4831 int64_t AddSubOffset = std::abs(i: ExtraBaseRegUpdate);
4832 assert(AddSubOffset <= 4095 && "ADD/SUB immediate out of range");
4833 BuildMI(
4834 BB&: *MBB, I: InsertI, MIMD: DL,
4835 MCID: TII->get(Opcode: ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
4836 .addDef(RegNo: BaseReg)
4837 .addReg(RegNo: BaseReg)
4838 .addImm(Val: AddSubOffset)
4839 .addImm(Val: 0)
4840 .setMIFlags(FrameRegUpdateFlags);
4841 }
4842}
4843
4844 // Check if *II is a register update that can be merged into the STGloop that
4845 // ends at (Reg + Size). *TotalOffset is set to the required adjustment to Reg
4846 // after the end of the loop.
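// For illustration (registers and immediates are hypothetical): with Reg = $sp
// and Size = 48, a following
//   $sp = ADDXri $sp, 64, 0
// gives Offset = 64 and PostOffset = 16, which fits the ranges checked below,
// so it can be merged with *TotalOffset = 64.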
4847bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
4848 int64_t Size, int64_t *TotalOffset) {
4849 MachineInstr &MI = *II;
4850 if ((MI.getOpcode() == AArch64::ADDXri ||
4851 MI.getOpcode() == AArch64::SUBXri) &&
4852 MI.getOperand(i: 0).getReg() == Reg && MI.getOperand(i: 1).getReg() == Reg) {
4853 unsigned Shift = AArch64_AM::getShiftValue(Imm: MI.getOperand(i: 3).getImm());
4854 int64_t Offset = MI.getOperand(i: 2).getImm() << Shift;
4855 if (MI.getOpcode() == AArch64::SUBXri)
4856 Offset = -Offset;
4857 int64_t PostOffset = Offset - Size;
4858 // TagStoreEdit::emitLoop might emit either an ADD/SUB after the loop, or
4859 // an STGPostIndex which does the last 16 bytes of tag write. Which one is
4860 // chosen depends on the alignment of the loop size, but the difference
4861 // between the valid ranges for the two instructions is small, so we
4862 // conservatively assume that it could be either case here.
4863 //
4864 // Max offset of STGPostIndex, minus the 16 byte tag write folded into that
4865 // instruction.
4866 const int64_t kMaxOffset = 4080 - 16;
4867 // Max offset of SUBXri.
4868 const int64_t kMinOffset = -4095;
4869 if (PostOffset <= kMaxOffset && PostOffset >= kMinOffset &&
4870 PostOffset % 16 == 0) {
4871 *TotalOffset = Offset;
4872 return true;
4873 }
4874 }
4875 return false;
4876}
4877
4878void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
4879 SmallVectorImpl<MachineMemOperand *> &MemRefs) {
4880 MemRefs.clear();
4881 for (auto &TS : TSE) {
4882 MachineInstr *MI = TS.MI;
4883 // An instruction without memory operands may access anything. Be
4884 // conservative and return an empty list.
4885 if (MI->memoperands_empty()) {
4886 MemRefs.clear();
4887 return;
4888 }
4889 MemRefs.append(in_start: MI->memoperands_begin(), in_end: MI->memoperands_end());
4890 }
4891}
4892
4893void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
4894 const AArch64FrameLowering *TFI,
4895 bool TryMergeSPUpdate) {
4896 if (TagStores.empty())
4897 return;
4898 TagStoreInstr &FirstTagStore = TagStores[0];
4899 TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
4900 Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
4901 DL = TagStores[0].MI->getDebugLoc();
4902
4903 Register Reg;
4904 FrameRegOffset = TFI->resolveFrameOffsetReference(
4905 MF: *MF, ObjectOffset: FirstTagStore.Offset, isFixed: false /*isFixed*/, isSVE: false /*isSVE*/, FrameReg&: Reg,
4906 /*PreferFP=*/false, /*ForSimm=*/true);
4907 FrameReg = Reg;
4908 FrameRegUpdate = std::nullopt;
4909
4910 mergeMemRefs(TSE: TagStores, MemRefs&: CombinedMemRefs);
4911
4912 LLVM_DEBUG({
4913 dbgs() << "Replacing adjacent STG instructions:\n";
4914 for (const auto &Instr : TagStores) {
4915 dbgs() << " " << *Instr.MI;
4916 }
4917 });
4918
4919 // Size threshold where a loop becomes shorter than a linear sequence of
4920 // tagging instructions.
4921 const int kSetTagLoopThreshold = 176;
4922 if (Size < kSetTagLoopThreshold) {
4923 if (TagStores.size() < 2)
4924 return;
4925 emitUnrolled(InsertI);
4926 } else {
4927 MachineInstr *UpdateInstr = nullptr;
4928 int64_t TotalOffset = 0;
4929 if (TryMergeSPUpdate) {
4930 // See if we can merge base register update into the STGloop.
4931 // This is done in AArch64LoadStoreOptimizer for "normal" stores,
4932 // but STGloop is way too unusual for that, and also it only
4933 // realistically happens in function epilogue. Also, STGloop is expanded
4934 // before that pass.
4935 if (InsertI != MBB->end() &&
4936 canMergeRegUpdate(II: InsertI, Reg: FrameReg, Size: FrameRegOffset.getFixed() + Size,
4937 TotalOffset: &TotalOffset)) {
4938 UpdateInstr = &*InsertI++;
4939 LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n "
4940 << *UpdateInstr);
4941 }
4942 }
4943
4944 if (!UpdateInstr && TagStores.size() < 2)
4945 return;
4946
4947 if (UpdateInstr) {
4948 FrameRegUpdate = TotalOffset;
4949 FrameRegUpdateFlags = UpdateInstr->getFlags();
4950 }
4951 emitLoop(InsertI);
4952 if (UpdateInstr)
4953 UpdateInstr->eraseFromParent();
4954 }
4955
4956 for (auto &TS : TagStores)
4957 TS.MI->eraseFromParent();
4958}
4959
4960bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
4961 int64_t &Size, bool &ZeroData) {
4962 MachineFunction &MF = *MI.getParent()->getParent();
4963 const MachineFrameInfo &MFI = MF.getFrameInfo();
4964
4965 unsigned Opcode = MI.getOpcode();
4966 ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGi ||
4967 Opcode == AArch64::STZ2Gi);
4968
4969 if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
4970 if (!MI.getOperand(i: 0).isDead() || !MI.getOperand(i: 1).isDead())
4971 return false;
4972 if (!MI.getOperand(i: 2).isImm() || !MI.getOperand(i: 3).isFI())
4973 return false;
4974 Offset = MFI.getObjectOffset(ObjectIdx: MI.getOperand(i: 3).getIndex());
4975 Size = MI.getOperand(i: 2).getImm();
4976 return true;
4977 }
4978
4979 if (Opcode == AArch64::STGi || Opcode == AArch64::STZGi)
4980 Size = 16;
4981 else if (Opcode == AArch64::ST2Gi || Opcode == AArch64::STZ2Gi)
4982 Size = 32;
4983 else
4984 return false;
4985
4986 if (MI.getOperand(i: 0).getReg() != AArch64::SP || !MI.getOperand(i: 1).isFI())
4987 return false;
4988
4989 Offset = MFI.getObjectOffset(ObjectIdx: MI.getOperand(i: 1).getIndex()) +
4990 16 * MI.getOperand(i: 2).getImm();
4991 return true;
4992}
4993
4994// Detect a run of memory tagging instructions for adjacent stack frame slots,
4995// and replace them with a shorter instruction sequence:
4996// * replace STG + STG with ST2G
4997// * replace STGloop + STGloop with STGloop
4998// This code needs to run when stack slot offsets are already known, but before
4999// FrameIndex operands in STG instructions are eliminated.
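// For illustration (frame indices are hypothetical): two 16-byte tag stores to
// adjacent slots, e.g.
//   STGi $sp, %stack.0, 0
//   STGi $sp, %stack.1, 0
// can be rewritten as a single
//   ST2Gi $sp, %stack.0, 0
// once their offsets are known to be contiguous.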
5000MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
5001 const AArch64FrameLowering *TFI,
5002 RegScavenger *RS) {
5003 bool FirstZeroData;
5004 int64_t Size, Offset;
5005 MachineInstr &MI = *II;
5006 MachineBasicBlock *MBB = MI.getParent();
5007 MachineBasicBlock::iterator NextI = ++II;
5008 if (&MI == &MBB->instr_back())
5009 return II;
5010 if (!isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData&: FirstZeroData))
5011 return II;
5012
5013 SmallVector<TagStoreInstr, 4> Instrs;
5014 Instrs.emplace_back(Args: &MI, Args&: Offset, Args&: Size);
5015
5016 constexpr int kScanLimit = 10;
5017 int Count = 0;
5018 for (MachineBasicBlock::iterator E = MBB->end();
5019 NextI != E && Count < kScanLimit; ++NextI) {
5020 MachineInstr &MI = *NextI;
5021 bool ZeroData;
5022 int64_t Size, Offset;
5023 // Collect instructions that update memory tags with a FrameIndex operand
5024 // and (when applicable) constant size, and whose output registers are dead
5025 // (the latter is almost always the case in practice). Since these
5026 // instructions effectively have no inputs or outputs, we are free to skip
5027 // any non-aliasing instructions in between without tracking used registers.
5028 if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
5029 if (ZeroData != FirstZeroData)
5030 break;
5031 Instrs.emplace_back(Args: &MI, Args&: Offset, Args&: Size);
5032 continue;
5033 }
5034
5035 // Only count non-transient, non-tagging instructions toward the scan
5036 // limit.
5037 if (!MI.isTransient())
5038 ++Count;
5039
5040 // Just in case, stop before the epilogue code starts.
5041 if (MI.getFlag(Flag: MachineInstr::FrameSetup) ||
5042 MI.getFlag(Flag: MachineInstr::FrameDestroy))
5043 break;
5044
5045 // Reject anything that may alias the collected instructions.
5046 if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects() || MI.isCall())
5047 break;
5048 }
5049
5050 // New code will be inserted after the last tagging instruction we've found.
5051 MachineBasicBlock::iterator InsertI = Instrs.back().MI;
5052
5053   // All the gathered stack tag instructions are merged and placed after the
5054   // last tag store in the list. Before inserting there, check whether the
5055   // NZCV flags are live at that point; otherwise they might get clobbered
5056   // if any STG loops are emitted.
5057
5058   // FIXME: This approach of bailing out of the merge is conservative: the
5059   // liveness check is performed even if no STG loops would be emitted after
5060   // merging the insert list, in which case it is not needed.
5061 LivePhysRegs LiveRegs(*(MBB->getParent()->getSubtarget().getRegisterInfo()));
5062 LiveRegs.addLiveOuts(MBB: *MBB);
5063 for (auto I = MBB->rbegin();; ++I) {
5064 MachineInstr &MI = *I;
5065 if (MI == InsertI)
5066 break;
5067 LiveRegs.stepBackward(MI: *I);
5068 }
5069 InsertI++;
5070 if (LiveRegs.contains(Reg: AArch64::NZCV))
5071 return InsertI;
5072
5073 llvm::stable_sort(Range&: Instrs,
5074 C: [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
5075 return Left.Offset < Right.Offset;
5076 });
5077
5078 // Make sure that we don't have any overlapping stores.
5079 int64_t CurOffset = Instrs[0].Offset;
5080 for (auto &Instr : Instrs) {
5081 if (CurOffset > Instr.Offset)
5082 return NextI;
5083 CurOffset = Instr.Offset + Instr.Size;
5084 }
5085
5086 // Find contiguous runs of tagged memory and emit shorter instruction
5087 // sequences for them when possible.
5088 TagStoreEdit TSE(MBB, FirstZeroData);
5089 std::optional<int64_t> EndOffset;
5090 for (auto &Instr : Instrs) {
5091 if (EndOffset && *EndOffset != Instr.Offset) {
5092 // Found a gap.
5093 TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */ false);
5094 TSE.clear();
5095 }
5096
5097 TSE.addInstruction(I: Instr);
5098 EndOffset = Instr.Offset + Instr.Size;
5099 }
5100
5101 const MachineFunction *MF = MBB->getParent();
5102 // Multiple FP/SP updates in a loop cannot be described by CFI instructions.
5103 TSE.emitCode(
5104 InsertI, TFI, /*TryMergeSPUpdate = */
5105 !MF->getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo(MF: *MF));
5106
5107 return InsertI;
5108}
5109} // namespace
5110
5111static void emitVGSaveRestore(MachineBasicBlock::iterator II,
5112 const AArch64FrameLowering *TFI) {
5113 MachineInstr &MI = *II;
5114 MachineBasicBlock *MBB = MI.getParent();
5115 MachineFunction *MF = MBB->getParent();
5116
5117 if (MI.getOpcode() != AArch64::VGSavePseudo &&
5118 MI.getOpcode() != AArch64::VGRestorePseudo)
5119 return;
5120
5121 auto *AFI = MF->getInfo<AArch64FunctionInfo>();
5122 SMEAttrs FuncAttrs = AFI->getSMEFnAttrs();
5123 bool LocallyStreaming =
5124 FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface();
5125
5126 int64_t VGFrameIdx =
5127 LocallyStreaming ? AFI->getStreamingVGIdx() : AFI->getVGIdx();
5128 assert(VGFrameIdx != std::numeric_limits<int>::max() &&
5129 "Expected FrameIdx for VG");
5130
5131 CFIInstBuilder CFIBuilder(*MBB, II, MachineInstr::NoFlags);
5132 if (MI.getOpcode() == AArch64::VGSavePseudo) {
5133 const MachineFrameInfo &MFI = MF->getFrameInfo();
5134 int64_t Offset =
5135 MFI.getObjectOffset(ObjectIdx: VGFrameIdx) - TFI->getOffsetOfLocalArea();
5136 CFIBuilder.buildOffset(Reg: AArch64::VG, Offset);
5137 } else {
5138 CFIBuilder.buildRestore(Reg: AArch64::VG);
5139 }
5140
5141 MI.eraseFromParent();
5142}
5143
5144void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
5145 MachineFunction &MF, RegScavenger *RS = nullptr) const {
5146 for (auto &BB : MF)
5147 for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();) {
5148 if (requiresSaveVG(MF))
5149 emitVGSaveRestore(II: II++, TFI: this);
5150 else if (StackTaggingMergeSetTag)
5151 II = tryMergeAdjacentSTG(II, TFI: this, RS);
5152 }
5153}
5154
5155/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
5156/// before the update. This is easily retrieved as it is exactly the offset
5157/// that is set in processFunctionBeforeFrameFinalized.
5158StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP(
5159 const MachineFunction &MF, int FI, Register &FrameReg,
5160 bool IgnoreSPUpdates) const {
5161 const MachineFrameInfo &MFI = MF.getFrameInfo();
5162 if (IgnoreSPUpdates) {
5163 LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
5164 << MFI.getObjectOffset(FI) << "\n");
5165 FrameReg = AArch64::SP;
5166 return StackOffset::getFixed(Fixed: MFI.getObjectOffset(ObjectIdx: FI));
5167 }
5168
5169 // Go to common code if we cannot provide sp + offset.
5170 if (MFI.hasVarSizedObjects() ||
5171 MF.getInfo<AArch64FunctionInfo>()->getStackSizeSVE() ||
5172 MF.getSubtarget().getRegisterInfo()->hasStackRealignment(MF))
5173 return getFrameIndexReference(MF, FI, FrameReg);
5174
5175 FrameReg = AArch64::SP;
5176 return getStackOffset(MF, ObjectOffset: MFI.getObjectOffset(ObjectIdx: FI));
5177}
5178
5179/// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
5180 /// the parent's frame pointer.
5181unsigned AArch64FrameLowering::getWinEHParentFrameOffset(
5182 const MachineFunction &MF) const {
5183 return 0;
5184}
5185
5186/// Funclets only need to account for space for the callee saved registers,
5187/// as the locals are accounted for in the parent's stack frame.
5188unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
5189 const MachineFunction &MF) const {
5190 // This is the size of the pushed CSRs.
5191 unsigned CSSize =
5192 MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
5193 // This is the amount of stack a funclet needs to allocate.
5194 return alignTo(Size: CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
5195 A: getStackAlign());
5196}
5197
5198namespace {
5199struct FrameObject {
5200 bool IsValid = false;
5201 // Index of the object in MFI.
5202 int ObjectIndex = 0;
5203 // Group ID this object belongs to.
5204 int GroupIndex = -1;
5205 // This object should be placed first (closest to SP).
5206 bool ObjectFirst = false;
5207 // This object's group (which always contains the object with
5208 // ObjectFirst==true) should be placed first.
5209 bool GroupFirst = false;
5210
5211 // Used to distinguish between FP and GPR accesses. The values are decided so
5212 // that they sort FPR < Hazard < GPR and they can be or'd together.
5213 unsigned Accesses = 0;
5214 enum { AccessFPR = 1, AccessHazard = 2, AccessGPR = 4 };
5215};
5216
5217class GroupBuilder {
5218 SmallVector<int, 8> CurrentMembers;
5219 int NextGroupIndex = 0;
5220 std::vector<FrameObject> &Objects;
5221
5222public:
5223 GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {}
5224 void AddMember(int Index) { CurrentMembers.push_back(Elt: Index); }
5225 void EndCurrentGroup() {
5226 if (CurrentMembers.size() > 1) {
5227 // Create a new group with the current member list. This might remove them
5228 // from their pre-existing groups. That's OK, dealing with overlapping
5229 // groups is too hard and unlikely to make a difference.
5230 LLVM_DEBUG(dbgs() << "group:");
5231 for (int Index : CurrentMembers) {
5232 Objects[Index].GroupIndex = NextGroupIndex;
5233 LLVM_DEBUG(dbgs() << " " << Index);
5234 }
5235 LLVM_DEBUG(dbgs() << "\n");
5236 NextGroupIndex++;
5237 }
5238 CurrentMembers.clear();
5239 }
5240};
5241
5242bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
5243 // Objects at a lower index are closer to FP; objects at a higher index are
5244 // closer to SP.
5245 //
5246 // For consistency in our comparison, all invalid objects are placed
5247 // at the end. This also allows us to stop walking when we hit the
5248 // first invalid item after it's all sorted.
5249 //
5250 // If we want to include a stack hazard region, order FPR accesses < the
5251 // hazard object < GPRs accesses in order to create a separation between the
5252 // two. For the Accesses field 1 = FPR, 2 = Hazard Object, 4 = GPR.
5253 //
5254 // Otherwise the "first" object goes first (closest to SP), followed by the
5255 // members of the "first" group.
5256 //
5257 // The rest are sorted by the group index to keep the groups together.
5258 // Higher numbered groups are more likely to be around longer (i.e. untagged
5259 // in the function epilogue and not at some earlier point). Place them closer
5260 // to SP.
5261 //
5262 // If all else equal, sort by the object index to keep the objects in the
5263 // original order.
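  // For example (hypothetical objects): an FPR-accessed slot (Accesses == 1)
  // sorts before the hazard slot (Accesses == 2), which sorts before any
  // GPR-accessed slot (Accesses == 4), regardless of their group indices.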
5264 return std::make_tuple(args: !A.IsValid, args: A.Accesses, args: A.ObjectFirst, args: A.GroupFirst,
5265 args: A.GroupIndex, args: A.ObjectIndex) <
5266 std::make_tuple(args: !B.IsValid, args: B.Accesses, args: B.ObjectFirst, args: B.GroupFirst,
5267 args: B.GroupIndex, args: B.ObjectIndex);
5268}
5269} // namespace
5270
5271void AArch64FrameLowering::orderFrameObjects(
5272 const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
5273 if (!OrderFrameObjects || ObjectsToAllocate.empty())
5274 return;
5275
5276 const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
5277 const MachineFrameInfo &MFI = MF.getFrameInfo();
5278 std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
5279 for (auto &Obj : ObjectsToAllocate) {
5280 FrameObjects[Obj].IsValid = true;
5281 FrameObjects[Obj].ObjectIndex = Obj;
5282 }
5283
5284 // Identify FPR vs GPR slots for hazards, and stack slots that are tagged at
5285 // the same time.
5286 GroupBuilder GB(FrameObjects);
5287 for (auto &MBB : MF) {
5288 for (auto &MI : MBB) {
5289 if (MI.isDebugInstr())
5290 continue;
5291
5292 if (AFI.hasStackHazardSlotIndex()) {
5293 std::optional<int> FI = getLdStFrameID(MI, MFI);
5294 if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) {
5295 if (MFI.getStackID(ObjectIdx: *FI) == TargetStackID::ScalableVector ||
5296 AArch64InstrInfo::isFpOrNEON(MI))
5297 FrameObjects[*FI].Accesses |= FrameObject::AccessFPR;
5298 else
5299 FrameObjects[*FI].Accesses |= FrameObject::AccessGPR;
5300 }
5301 }
5302
5303 int OpIndex;
5304 switch (MI.getOpcode()) {
5305 case AArch64::STGloop:
5306 case AArch64::STZGloop:
5307 OpIndex = 3;
5308 break;
5309 case AArch64::STGi:
5310 case AArch64::STZGi:
5311 case AArch64::ST2Gi:
5312 case AArch64::STZ2Gi:
5313 OpIndex = 1;
5314 break;
5315 default:
5316 OpIndex = -1;
5317 }
5318
5319 int TaggedFI = -1;
5320 if (OpIndex >= 0) {
5321 const MachineOperand &MO = MI.getOperand(i: OpIndex);
5322 if (MO.isFI()) {
5323 int FI = MO.getIndex();
5324 if (FI >= 0 && FI < MFI.getObjectIndexEnd() &&
5325 FrameObjects[FI].IsValid)
5326 TaggedFI = FI;
5327 }
5328 }
5329
5330 // If this is a stack tagging instruction for a slot that is not part of a
5331 // group yet, either start a new group or add it to the current one.
5332 if (TaggedFI >= 0)
5333 GB.AddMember(Index: TaggedFI);
5334 else
5335 GB.EndCurrentGroup();
5336 }
5337 // Groups should never span multiple basic blocks.
5338 GB.EndCurrentGroup();
5339 }
5340
5341 if (AFI.hasStackHazardSlotIndex()) {
5342 FrameObjects[AFI.getStackHazardSlotIndex()].Accesses =
5343 FrameObject::AccessHazard;
5344 // If a stack object is unknown or both GPR and FPR, sort it into GPR.
5345 for (auto &Obj : FrameObjects)
5346 if (!Obj.Accesses ||
5347 Obj.Accesses == (FrameObject::AccessGPR | FrameObject::AccessFPR))
5348 Obj.Accesses = FrameObject::AccessGPR;
5349 }
5350
5351 // If the function's tagged base pointer is pinned to a stack slot, we want to
5352 // put that slot first when possible. This will likely place it at SP + 0,
5353 // and save one instruction when generating the base pointer because IRG does
5354 // not allow an immediate offset.
5355 std::optional<int> TBPI = AFI.getTaggedBasePointerIndex();
5356 if (TBPI) {
5357 FrameObjects[*TBPI].ObjectFirst = true;
5358 FrameObjects[*TBPI].GroupFirst = true;
5359 int FirstGroupIndex = FrameObjects[*TBPI].GroupIndex;
5360 if (FirstGroupIndex >= 0)
5361 for (FrameObject &Object : FrameObjects)
5362 if (Object.GroupIndex == FirstGroupIndex)
5363 Object.GroupFirst = true;
5364 }
5365
5366 llvm::stable_sort(Range&: FrameObjects, C: FrameObjectCompare);
5367
5368 int i = 0;
5369 for (auto &Obj : FrameObjects) {
5370 // All invalid items are sorted at the end, so it's safe to stop.
5371 if (!Obj.IsValid)
5372 break;
5373 ObjectsToAllocate[i++] = Obj.ObjectIndex;
5374 }
5375
5376 LLVM_DEBUG({
5377 dbgs() << "Final frame order:\n";
5378 for (auto &Obj : FrameObjects) {
5379 if (!Obj.IsValid)
5380 break;
5381 dbgs() << " " << Obj.ObjectIndex << ": group " << Obj.GroupIndex;
5382 if (Obj.ObjectFirst)
5383 dbgs() << ", first";
5384 if (Obj.GroupFirst)
5385 dbgs() << ", group-first";
5386 dbgs() << "\n";
5387 }
5388 });
5389}
5390
5391/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at
5392/// least every ProbeSize bytes. Returns an iterator of the first instruction
5393/// after the loop. The difference between SP and TargetReg must be an exact
5394/// multiple of ProbeSize.
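///
/// A sketch of the emitted loop (assuming ProbeSize fits a single SUB):
/// ```
/// LoopMBB:
///   SUB  SP, SP, #ProbeSize
///   STR  XZR, [SP]
///   CMP  SP, TargetReg
///   B.NE LoopMBB
/// ExitMBB:
/// ```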
5395MachineBasicBlock::iterator
5396AArch64FrameLowering::inlineStackProbeLoopExactMultiple(
5397 MachineBasicBlock::iterator MBBI, int64_t ProbeSize,
5398 Register TargetReg) const {
5399 MachineBasicBlock &MBB = *MBBI->getParent();
5400 MachineFunction &MF = *MBB.getParent();
5401 const AArch64InstrInfo *TII =
5402 MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
5403 DebugLoc DL = MBB.findDebugLoc(MBBI);
5404
5405 MachineFunction::iterator MBBInsertPoint = std::next(x: MBB.getIterator());
5406 MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(BB: MBB.getBasicBlock());
5407 MF.insert(MBBI: MBBInsertPoint, MBB: LoopMBB);
5408 MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(BB: MBB.getBasicBlock());
5409 MF.insert(MBBI: MBBInsertPoint, MBB: ExitMBB);
5410
5411 // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not encodable
5412 // in SUB).
5413 emitFrameOffset(MBB&: *LoopMBB, MBBI: LoopMBB->end(), DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
5414 Offset: StackOffset::getFixed(Fixed: -ProbeSize), TII,
5415 MachineInstr::FrameSetup);
5416 // STR XZR, [SP]
5417 BuildMI(BB&: *LoopMBB, I: LoopMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::STRXui))
5418 .addReg(RegNo: AArch64::XZR)
5419 .addReg(RegNo: AArch64::SP)
5420 .addImm(Val: 0)
5421 .setMIFlags(MachineInstr::FrameSetup);
5422 // CMP SP, TargetReg
5423 BuildMI(BB&: *LoopMBB, I: LoopMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::SUBSXrx64),
5424 DestReg: AArch64::XZR)
5425 .addReg(RegNo: AArch64::SP)
5426 .addReg(RegNo: TargetReg)
5427 .addImm(Val: AArch64_AM::getArithExtendImm(ET: AArch64_AM::UXTX, Imm: 0))
5428 .setMIFlags(MachineInstr::FrameSetup);
5429 // B.CC Loop
5430 BuildMI(BB&: *LoopMBB, I: LoopMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::Bcc))
5431 .addImm(Val: AArch64CC::NE)
5432 .addMBB(MBB: LoopMBB)
5433 .setMIFlags(MachineInstr::FrameSetup);
5434
5435 LoopMBB->addSuccessor(Succ: ExitMBB);
5436 LoopMBB->addSuccessor(Succ: LoopMBB);
5437 // Synthesize the exit MBB.
5438 ExitMBB->splice(Where: ExitMBB->end(), Other: &MBB, From: MBBI, To: MBB.end());
5439 ExitMBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
5440 MBB.addSuccessor(Succ: LoopMBB);
5441 // Update liveins.
5442 fullyRecomputeLiveIns(MBBs: {ExitMBB, LoopMBB});
5443
5444 return ExitMBB->begin();
5445}
5446
5447void AArch64FrameLowering::inlineStackProbeFixed(
5448 MachineBasicBlock::iterator MBBI, Register ScratchReg, int64_t FrameSize,
5449 StackOffset CFAOffset) const {
5450 MachineBasicBlock *MBB = MBBI->getParent();
5451 MachineFunction &MF = *MBB->getParent();
5452 const AArch64InstrInfo *TII =
5453 MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
5454 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
5455 bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
5456 bool HasFP = hasFP(MF);
5457
5458 DebugLoc DL;
5459 int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
5460 int64_t NumBlocks = FrameSize / ProbeSize;
5461 int64_t ResidualSize = FrameSize % ProbeSize;
5462
5463 LLVM_DEBUG(dbgs() << "Stack probing: total " << FrameSize << " bytes, "
5464 << NumBlocks << " blocks of " << ProbeSize
5465 << " bytes, plus " << ResidualSize << " bytes\n");
5466
5467 // Decrement SP by NumBlock * ProbeSize bytes, with either unrolled or
5468 // ordinary loop.
5469 if (NumBlocks <= AArch64::StackProbeMaxLoopUnroll) {
5470 for (int i = 0; i < NumBlocks; ++i) {
5471 // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not
5472 // encodable in a SUB).
5473 emitFrameOffset(MBB&: *MBB, MBBI, DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
5474 Offset: StackOffset::getFixed(Fixed: -ProbeSize), TII,
5475 MachineInstr::FrameSetup, SetNZCV: false, NeedsWinCFI: false, HasWinCFI: nullptr,
5476 EmitCFAOffset: EmitAsyncCFI && !HasFP, InitialOffset: CFAOffset);
5477 CFAOffset += StackOffset::getFixed(Fixed: ProbeSize);
5478 // STR XZR, [SP]
5479 BuildMI(BB&: *MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::STRXui))
5480 .addReg(RegNo: AArch64::XZR)
5481 .addReg(RegNo: AArch64::SP)
5482 .addImm(Val: 0)
5483 .setMIFlags(MachineInstr::FrameSetup);
5484 }
5485 } else if (NumBlocks != 0) {
5486     // SUB ScratchReg, SP, #(ProbeSize * NumBlocks) (or equivalent if not
5487     // encodable in a SUB). ScratchReg may temporarily become the CFA register.
5488 emitFrameOffset(MBB&: *MBB, MBBI, DL, DestReg: ScratchReg, SrcReg: AArch64::SP,
5489 Offset: StackOffset::getFixed(Fixed: -ProbeSize * NumBlocks), TII,
5490 MachineInstr::FrameSetup, SetNZCV: false, NeedsWinCFI: false, HasWinCFI: nullptr,
5491 EmitCFAOffset: EmitAsyncCFI && !HasFP, InitialOffset: CFAOffset);
5492 CFAOffset += StackOffset::getFixed(Fixed: ProbeSize * NumBlocks);
5493 MBBI = inlineStackProbeLoopExactMultiple(MBBI, ProbeSize, TargetReg: ScratchReg);
5494 MBB = MBBI->getParent();
5495 if (EmitAsyncCFI && !HasFP) {
5496 // Set the CFA register back to SP.
5497 CFIInstBuilder(*MBB, MBBI, MachineInstr::FrameSetup)
5498 .buildDefCFARegister(Reg: AArch64::SP);
5499 }
5500 }

  if (ResidualSize != 0) {
    // SUB SP, SP, #ResidualSize (or equivalent if ResidualSize is not
    // encodable in SUB).
    emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(-ResidualSize), TII,
                    MachineInstr::FrameSetup, false, false, nullptr,
                    EmitAsyncCFI && !HasFP, CFAOffset);
    if (ResidualSize > AArch64::StackProbeMaxUnprobedStack) {
      // STR XZR, [SP]
      BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui))
          .addReg(AArch64::XZR)
          .addReg(AArch64::SP)
          .addImm(0)
          .setMIFlags(MachineInstr::FrameSetup);
    }
  }
}

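// Replace the PROBED_STACKALLOC and PROBED_STACKALLOC_VAR pseudo-instructions
// in MBB with real probing sequences: fixed-size allocations are expanded via
// inlineStackProbeFixed, variable-size ones via
// AArch64InstrInfo::probedStackAlloc. The pseudos are erased once expanded.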
void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF,
                                            MachineBasicBlock &MBB) const {
  // Get the instructions that need to be replaced. We emit at most two of
  // these. Remember them in order to avoid complications coming from the need
  // to traverse the block while potentially creating more blocks.
  SmallVector<MachineInstr *, 4> ToReplace;
  for (MachineInstr &MI : MBB)
    if (MI.getOpcode() == AArch64::PROBED_STACKALLOC ||
        MI.getOpcode() == AArch64::PROBED_STACKALLOC_VAR)
      ToReplace.push_back(&MI);

  for (MachineInstr *MI : ToReplace) {
    if (MI->getOpcode() == AArch64::PROBED_STACKALLOC) {
      Register ScratchReg = MI->getOperand(0).getReg();
      int64_t FrameSize = MI->getOperand(1).getImm();
      StackOffset CFAOffset = StackOffset::get(MI->getOperand(2).getImm(),
                                               MI->getOperand(3).getImm());
      inlineStackProbeFixed(MI->getIterator(), ScratchReg, FrameSize,
                            CFAOffset);
    } else {
      assert(MI->getOpcode() == AArch64::PROBED_STACKALLOC_VAR &&
             "Stack probe pseudo-instruction expected");
      const AArch64InstrInfo *TII =
          MI->getMF()->getSubtarget<AArch64Subtarget>().getInstrInfo();
      Register TargetReg = MI->getOperand(0).getReg();
      (void)TII->probedStackAlloc(MI->getIterator(), TargetReg, true);
    }
    MI->eraseFromParent();
  }
}

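// Bookkeeping for the stack hazard remarks emitted below: for a single stack
// object, records which kinds of registers (GPR, predicate, FP/NEON/SVE) were
// used to load from or store to it, along with its offset from SP and its
// size.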
struct StackAccess {
  enum AccessType {
    NotAccessed = 0, // Stack object not accessed by load/store instructions.
    GPR = 1 << 0,    // A general purpose register.
    PPR = 1 << 1,    // A predicate register.
    FPR = 1 << 2,    // A floating point/Neon/SVE register.
  };

  int Idx;
  StackOffset Offset;
  int64_t Size;
  unsigned AccessTypes;

  StackAccess() : Idx(0), Offset(), Size(0), AccessTypes(NotAccessed) {}

  bool operator<(const StackAccess &Rhs) const {
    return std::make_tuple(start(), Idx) <
           std::make_tuple(Rhs.start(), Rhs.Idx);
  }

  bool isCPU() const {
    // Predicate register load and store instructions execute on the CPU.
    return AccessTypes & (AccessType::GPR | AccessType::PPR);
  }
  bool isSME() const { return AccessTypes & AccessType::FPR; }
  bool isMixed() const { return isCPU() && isSME(); }

  int64_t start() const { return Offset.getFixed() + Offset.getScalable(); }
  int64_t end() const { return start() + Size; }

  std::string getTypeString() const {
    switch (AccessTypes) {
    case AccessType::FPR:
      return "FPR";
    case AccessType::PPR:
      return "PPR";
    case AccessType::GPR:
      return "GPR";
    case AccessType::NotAccessed:
      return "NA";
    default:
      return "Mixed";
    }
  }

  void print(raw_ostream &OS) const {
    OS << getTypeString() << " stack object at [SP"
       << (Offset.getFixed() < 0 ? "" : "+") << Offset.getFixed();
    if (Offset.getScalable())
      OS << (Offset.getScalable() < 0 ? "" : "+") << Offset.getScalable()
         << " * vscale";
    OS << "]";
  }
};

static inline raw_ostream &operator<<(raw_ostream &OS, const StackAccess &SA) {
  SA.print(OS);
  return OS;
}

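// Emit optimization remarks flagging potential SME streaming-memory hazards:
// classify every stack object by the kind of register used to access it, then
// report objects accessed by both CPU (GPR/PPR) and FP/SVE instructions, as
// well as CPU- and FP-accessed objects that lie closer together than the
// hazard size.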
void AArch64FrameLowering::emitRemarks(
    const MachineFunction &MF, MachineOptimizationRemarkEmitter *ORE) const {

  auto *AFI = MF.getInfo<AArch64FunctionInfo>();
  if (AFI->getSMEFnAttrs().hasNonStreamingInterfaceAndBody())
    return;

  unsigned StackHazardSize = getStackHazardSize(MF);
  const uint64_t HazardSize =
      (StackHazardSize) ? StackHazardSize : StackHazardRemarkSize;

  if (HazardSize == 0)
    return;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  // Bail if the function has no stack objects.
  if (!MFI.hasStackObjects())
    return;

  std::vector<StackAccess> StackAccesses(MFI.getNumObjects());

  size_t NumFPLdSt = 0;
  size_t NumNonFPLdSt = 0;

  // Collect stack accesses via load/store instructions.
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
        continue;
      for (MachineMemOperand *MMO : MI.memoperands()) {
        std::optional<int> FI = getMMOFrameID(MMO, MFI);
        if (FI && !MFI.isDeadObjectIndex(*FI)) {
          int FrameIdx = *FI;

          size_t ArrIdx = FrameIdx + MFI.getNumFixedObjects();
          if (StackAccesses[ArrIdx].AccessTypes == StackAccess::NotAccessed) {
            StackAccesses[ArrIdx].Idx = FrameIdx;
            StackAccesses[ArrIdx].Offset =
                getFrameIndexReferenceFromSP(MF, FrameIdx);
            StackAccesses[ArrIdx].Size = MFI.getObjectSize(FrameIdx);
          }

          unsigned RegTy = StackAccess::AccessType::GPR;
          if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) {
            // SPILL_PPR_TO_ZPR_SLOT_PSEUDO and FILL_PPR_FROM_ZPR_SLOT_PSEUDO
            // spill/fill the predicate as a data vector (so they count as FPR
            // accesses).
            if (MI.getOpcode() != AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO &&
                MI.getOpcode() != AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO &&
                AArch64::PPRRegClass.contains(MI.getOperand(0).getReg())) {
              RegTy = StackAccess::PPR;
            } else
              RegTy = StackAccess::FPR;
          } else if (AArch64InstrInfo::isFpOrNEON(MI)) {
            RegTy = StackAccess::FPR;
          }

          StackAccesses[ArrIdx].AccessTypes |= RegTy;

          if (RegTy == StackAccess::FPR)
            ++NumFPLdSt;
          else
            ++NumNonFPLdSt;
        }
      }
    }
  }

  if (NumFPLdSt == 0 || NumNonFPLdSt == 0)
    return;

  llvm::sort(StackAccesses);
  llvm::erase_if(StackAccesses, [](const StackAccess &S) {
    return S.AccessTypes == StackAccess::NotAccessed;
  });

  SmallVector<const StackAccess *> MixedObjects;
  SmallVector<std::pair<const StackAccess *, const StackAccess *>> HazardPairs;

  if (StackAccesses.front().isMixed())
    MixedObjects.push_back(&StackAccesses.front());

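  // StackAccesses is sorted by start offset, so it is enough to scan adjacent
  // pairs: collect objects accessed by both CPU and FP/SVE instructions, and
  // CPU/FP neighbors separated by less than the hazard size.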
  for (auto It = StackAccesses.begin(), End = std::prev(StackAccesses.end());
       It != End; ++It) {
    const auto &First = *It;
    const auto &Second = *(It + 1);

    if (Second.isMixed())
      MixedObjects.push_back(&Second);

    if ((First.isSME() && Second.isCPU()) ||
        (First.isCPU() && Second.isSME())) {
      uint64_t Distance = static_cast<uint64_t>(Second.start() - First.end());
      if (Distance < HazardSize)
        HazardPairs.emplace_back(&First, &Second);
    }
  }

  auto EmitRemark = [&](llvm::StringRef Str) {
    ORE->emit([&]() {
      auto R = MachineOptimizationRemarkAnalysis(
          "sme", "StackHazard", MF.getFunction().getSubprogram(), &MF.front());
      return R << formatv("stack hazard in '{0}': ", MF.getName()).str() << Str;
    });
  };

  for (const auto &P : HazardPairs)
    EmitRemark(formatv("{0} is too close to {1}", *P.first, *P.second).str());

  for (const auto *Obj : MixedObjects)
    EmitRemark(
        formatv("{0} accessed by both GP and FP instructions", *Obj).str());
}