AArch64FrameLowering.cpp source code [llvm_projects/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp]

1	//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------- C++ --====//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file contains the AArch64 implementation of TargetFrameLowering class.
10	//
11	// On AArch64, stack frames are structured as follows:
12	//
13	// The stack grows downward.
14	//
15	// All of the individual frame areas on the frame below are optional, i.e. it's
16	// possible to create a function so that the particular area isn't present
17	// in the frame.
18	//
19	// At function entry, the "frame" looks as follows:
20	//
// |                                   | Higher address
// |-----------------------------------|
// |                                   |
// | arguments passed on the stack     |
// |                                   |
// |-----------------------------------| <- sp
// |                                   | Lower address
28	//
29	//
30	// After the prologue has run, the frame has the following general structure.
31	// Note that this doesn't depict the case where a red-zone is used. Also,
32	// technically the last frame area (VLAs) doesn't get created until in the
33	// main function body, after the prologue is run. However, it's depicted here
34	// for completeness.
35	//
// |                                   | Higher address
// |-----------------------------------|
// |                                   |
// | arguments passed on the stack     |
// |                                   |
// |-----------------------------------|
// |                                   |
// | (Win64 only) varargs from reg     |
// |                                   |
// |-----------------------------------|
// |                                   |
// | (Win64 only) callee-saved SVE reg |
// |                                   |
// |-----------------------------------|
// |                                   |
// | callee-saved gpr registers        | <--.
// |                                   |    | On Darwin platforms these
// |- - - - - - - - - - - - - - - - - -|    | callee saves are swapped,
// | prev_lr                           |    | (frame record first)
// | prev_fp                           | <--'
// | async context if needed           |
// | (a.k.a. "frame record")           |
// |-----------------------------------| <- fp(=x29)
//        Default SVE stack layout                 Split SVE objects
//  (aarch64-split-sve-objects=false)      (aarch64-split-sve-objects=true)
// |-----------------------------------|  |-----------------------------------|
// |         <hazard padding>          |  |    callee-saved PPR registers     |
// |-----------------------------------|  |-----------------------------------|
// |                                   |  |         PPR stack objects         |
// |   callee-saved fp/simd/SVE regs   |  |-----------------------------------|
// |                                   |  |         <hazard padding>          |
// |-----------------------------------|  |-----------------------------------|
// |                                   |  |  callee-saved ZPR/FPR registers   |
// |         SVE stack objects         |  |-----------------------------------|
// |                                   |  |         ZPR stack objects         |
// |-----------------------------------|  |-----------------------------------|
//                                         ^ NB: FPR CSRs are promoted to ZPRs
// |-----------------------------------|
// |.empty.space.to.make.part.below....|
// |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
// |.the.standard.16-byte.alignment....|  compile time; if present)
// |-----------------------------------|
// | local variables of fixed size     |
// | including spill slots             |
// |   <FPR>                           |
// |   <hazard padding>                |
// |   <GPR>                           |
// |-----------------------------------| <- bp(not defined by ABI,
// |.variable-sized.local.variables....|       LLVM chooses X19)
// |.(VLAs)............................| (size of this area is unknown at
// |...................................|  compile time)
// |-----------------------------------| <- sp
// |                                   | Lower address
89	//
90	//
91	// To access the data in a frame, at-compile time, a constant offset must be
92	// computable from one of the pointers (fp, bp, sp) to access it. The size
93	// of the areas with a dotted background cannot be computed at compile-time
94	// if they are present, making it required to have all three of fp, bp and
95	// sp to be set up to be able to access all contents in the frame areas,
96	// assuming all of the frame areas are non-empty.
97	//
98	// For most functions, some of the frame areas are empty. For those functions,
99	// it may not be necessary to set up fp or bp:
// * A base pointer is definitely needed when there are both VLAs and local
//   variables with more-than-default alignment requirements.
// * A frame pointer is definitely needed when there are local variables with
//   more-than-default alignment requirements.
104	//
105	// For Darwin platforms the frame-record (fp, lr) is stored at the top of the
106	// callee-saved area, since the unwind encoding does not allow for encoding
107	// this dynamically and existing tools depend on this layout. For other
108	// platforms, the frame-record is stored at the bottom of the (gpr) callee-saved
109	// area to allow SVE stack objects (allocated directly below the callee-saves,
110	// if available) to be accessed directly from the framepointer.
111	// The SVE spill/fill instructions have VL-scaled addressing modes such
112	// as:
113	// ldr z8, [fp, #-7 mul vl]
114	// For SVE the size of the vector length (VL) is not known at compile-time, so
115	// '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
116	// layout, we don't need to add an unscaled offset to the framepointer before
117	// accessing the SVE object in the frame.
118	//
119	// In some cases when a base pointer is not strictly needed, it is generated
120	// anyway when offsets from the frame pointer to access local variables become
121	// so large that the offset can't be encoded in the immediate fields of loads
122	// or stores.
123	//
124	// Outgoing function arguments must be at the bottom of the stack frame when
125	// calling another function. If we do not have variable-sized stack objects, we
126	// can allocate a "reserved call frame" area at the bottom of the local
127	// variable area, large enough for all outgoing calls. If we do have VLAs, then
128	// the stack pointer must be decremented and incremented around each call to
129	// make space for the arguments below the VLAs.
130	//
131	// FIXME: also explain the redzone concept.
132	//
// About stack hazards: Under some SME contexts, a coprocessor with its own
// separate cache can be used for FP operations. This can create hazards if the
// CPU and the SME unit try to access the same area of memory, including if the
136	// access is to an area of the stack. To try to alleviate this we attempt to
137	// introduce extra padding into the stack frame between FP and GPR accesses,
138	// controlled by the aarch64-stack-hazard-size option. Without changing the
139	// layout of the stack frame in the diagram above, a stack object of size
140	// aarch64-stack-hazard-size is added between GPR and FPR CSRs. Another is added
141	// to the stack objects section, and stack objects are sorted so that FPR >
142	// Hazard padding slot > GPRs (where possible). Unfortunately some things are
143	// not handled well (VLA area, arguments on the stack, objects with both GPR and
144	// FPR accesses), but if those are controlled by the user then the entire stack
145	// frame becomes GPR at the start/end with FPR in the middle, surrounded by
146	// Hazard padding.
147	//
148	// An example of the prologue:
149	//
150	// .globl __foo
151	// .align 2
152	// __foo:
153	// Ltmp0:
154	// .cfi_startproc
155	// .cfi_personality 155, ___gxx_personality_v0
156	// Leh_func_begin:
157	// .cfi_lsda 16, Lexception33
158	//
159	// stp xa,bx, [sp, -#offset]!
160	// ...
161	// stp x28, x27, [sp, #offset-32]
162	// stp fp, lr, [sp, #offset-16]
163	// add fp, sp, #offset - 16
164	// sub sp, sp, #1360
165	//
166	// The Stack:
//       +-------------------------------------------+
// 10000 | ........ | ........ | ........ | ........ |
// 10004 | ........ | ........ | ........ | ........ |
//       +-------------------------------------------+
// 10008 | ........ | ........ | ........ | ........ |
// 1000c | ........ | ........ | ........ | ........ |
//       +===========================================+
// 10010 |                X28 Register               |
// 10014 |                X28 Register               |
//       +-------------------------------------------+
// 10018 |                X27 Register               |
// 1001c |                X27 Register               |
//       +===========================================+
// 10020 |                Frame Pointer              |
// 10024 |                Frame Pointer              |
//       +-------------------------------------------+
// 10028 |                Link Register              |
// 1002c |                Link Register              |
//       +===========================================+
// 10030 | ........ | ........ | ........ | ........ |
// 10034 | ........ | ........ | ........ | ........ |
//       +-------------------------------------------+
// 10038 | ........ | ........ | ........ | ........ |
// 1003c | ........ | ........ | ........ | ........ |
//       +-------------------------------------------+
192	//
193	// [sp] = 10030 :: >>initial value<<
194	// sp = 10020 :: stp fp, lr, [sp, #-16]!
195	// fp = sp == 10020 :: mov fp, sp
196	// [sp] == 10020 :: stp x28, x27, [sp, #-16]!
197	// sp == 10010 :: >>final value<<
198	//
199	// The frame pointer (w29) points to address 10020. If we use an offset of
200	// '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
201	// for w27, and -32 for w28:
202	//
203	// Ltmp1:
204	// .cfi_def_cfa w29, 16
205	// Ltmp2:
206	// .cfi_offset w30, -8
207	// Ltmp3:
208	// .cfi_offset w29, -16
209	// Ltmp4:
210	// .cfi_offset w27, -24
211	// Ltmp5:
212	// .cfi_offset w28, -32
213	//
214	//===----------------------------------------------------------------------===//
215
216	#include "AArch64FrameLowering.h"
217	#include "AArch64InstrInfo.h"
218	#include "AArch64MachineFunctionInfo.h"
219	#include "AArch64PrologueEpilogue.h"
220	#include "AArch64RegisterInfo.h"
221	#include "AArch64SMEAttributes.h"
222	#include "AArch64Subtarget.h"
223	#include "MCTargetDesc/AArch64AddressingModes.h"
224	#include "MCTargetDesc/AArch64MCTargetDesc.h"
225	#include "llvm/ADT/ScopeExit.h"
226	#include "llvm/ADT/SmallVector.h"
227	#include "llvm/Analysis/ValueTracking.h"
228	#include "llvm/CodeGen/CFIInstBuilder.h"
229	#include "llvm/CodeGen/LivePhysRegs.h"
230	#include "llvm/CodeGen/MachineBasicBlock.h"
231	#include "llvm/CodeGen/MachineFrameInfo.h"
232	#include "llvm/CodeGen/MachineFunction.h"
233	#include "llvm/CodeGen/MachineInstr.h"
234	#include "llvm/CodeGen/MachineInstrBuilder.h"
235	#include "llvm/CodeGen/MachineMemOperand.h"
236	#include "llvm/CodeGen/MachineModuleInfo.h"
237	#include "llvm/CodeGen/MachineOperand.h"
238	#include "llvm/CodeGen/MachineRegisterInfo.h"
239	#include "llvm/CodeGen/RegisterScavenging.h"
240	#include "llvm/CodeGen/TargetInstrInfo.h"
241	#include "llvm/CodeGen/TargetRegisterInfo.h"
242	#include "llvm/CodeGen/TargetSubtargetInfo.h"
243	#include "llvm/CodeGen/WinEHFuncInfo.h"
244	#include "llvm/IR/Attributes.h"
245	#include "llvm/IR/CallingConv.h"
246	#include "llvm/IR/DataLayout.h"
247	#include "llvm/IR/DebugLoc.h"
248	#include "llvm/IR/Function.h"
249	#include "llvm/MC/MCAsmInfo.h"
250	#include "llvm/MC/MCDwarf.h"
251	#include "llvm/Support/CommandLine.h"
252	#include "llvm/Support/Debug.h"
253	#include "llvm/Support/ErrorHandling.h"
254	#include "llvm/Support/FormatVariadic.h"
255	#include "llvm/Support/MathExtras.h"
256	#include "llvm/Support/raw_ostream.h"
257	#include "llvm/Target/TargetMachine.h"
258	#include "llvm/Target/TargetOptions.h"
259	#include <cassert>
260	#include <cstdint>
261	#include <iterator>
262	#include <optional>
263	#include <vector>
264
265	using namespace llvm;
266
267	#define DEBUG_TYPE "frame-info"
268
269	static cl::opt<bool> EnableRedZone("aarch64-redzone",
270	cl::desc ("enable use of redzone on AArch64"),
271	cl::init(Val: false), cl::Hidden);
272
273	static cl::opt<bool> StackTaggingMergeSetTag(
274	"stack-tagging-merge-settag",
275	cl::desc ("merge settag instruction in function epilog"), cl::init(Val: true),
276	cl::Hidden);
277
278	static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
279	cl::desc ("sort stack allocations"),
280	cl::init(Val: true), cl::Hidden);
281
282	static cl::opt<bool>
283	SplitSVEObjects("aarch64-split-sve-objects",
284	cl::desc ("Split allocation of ZPR & PPR objects"),
285	cl::init(Val: true), cl::Hidden);
286
287	cl::opt<bool> EnableHomogeneousPrologEpilog(
288	"homogeneous-prolog-epilog", cl::Hidden,
289	cl::desc ("Emit homogeneous prologue and epilogue for the size "
290	"optimization (default = off)"));
291
292	// Stack hazard size for analysis remarks. StackHazardSize takes precedence.
293	static cl::opt<unsigned>
294	StackHazardRemarkSize("aarch64-stack-hazard-remark-size", cl::init(Val: `0`),
295	cl::Hidden);
296	// Whether to insert padding into non-streaming functions (for testing).
297	static cl::opt<bool>
298	StackHazardInNonStreaming("aarch64-stack-hazard-in-non-streaming",
299	cl::init(Val: false), cl::Hidden);
300
301	static cl::opt<bool> DisableMultiVectorSpillFill(
302	"aarch64-disable-multivector-spill-fill",
303	cl::desc ("Disable use of LD/ST pairs for SME2 or SVE2p1"), cl::init(Val: false),
304	cl::Hidden);
305
306	int64_t
307	AArch64FrameLowering::getArgumentStackToRestore(MachineFunction &MF,
308	MachineBasicBlock &MBB) const {
309	MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
310	AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
311	bool IsTailCallReturn = (MBB.end() != MBBI)
312	? AArch64InstrInfo::isTailCallReturnInst(MI: *MBBI)
313	: false;
314
315	int64_t ArgumentPopSize = `0`;
316	if (IsTailCallReturn) {
317	MachineOperand &StackAdjust = MBBI ->getOperand(i: `1`);
318
319	// For a tail-call in a callee-pops-arguments environment, some or all of
320	// the stack may actually be in use for the call's arguments, this is
321	// calculated during LowerCall and consumed here...
322	ArgumentPopSize = StackAdjust.getImm();
323	} else {
324	// ... otherwise the amount to pop is all* of the argument space,*
325	// conveniently stored in the MachineFunctionInfo by
326	// LowerFormalArguments. This will, of course, be zero for the C calling
327	// convention.
328	ArgumentPopSize = AFI->getArgumentStackToRestore();
329	}
330
331	return ArgumentPopSize;
332	}
333
334	static bool produceCompactUnwindFrame(const AArch64FrameLowering &,
335	MachineFunction &MF);
336
337	enum class AssignObjectOffsets { No, Yes };
338	/// Process all the SVE stack objects and the SVE stack size and offsets for
339	/// each object. If AssignOffsets is "Yes", the offsets get assigned (and SVE
340	/// stack sizes set). Returns the size of the SVE stack.
341	static SVEStackSizes determineSVEStackSizes(MachineFunction &MF,
342	AssignObjectOffsets AssignOffsets);
343
344	static unsigned getStackHazardSize(const MachineFunction &MF) {
345	return MF.getSubtarget<AArch64Subtarget>().getStreamingHazardSize();
346	}
347
348	StackOffset
349	AArch64FrameLowering::getZPRStackSize(const MachineFunction &MF) const {
350	const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
351	return StackOffset::getScalable(Scalable: AFI->getStackSizeZPR());
352	}
353
354	StackOffset
355	AArch64FrameLowering::getPPRStackSize(const MachineFunction &MF) const {
356	// With split SVE objects, the hazard padding is added to the PPR region,
357	// which places it between the [GPR, PPR] area and the [ZPR, FPR] area. This
358	// avoids hazards between both GPRs and FPRs and ZPRs and PPRs.
359	const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
360	return StackOffset::get(Fixed: AFI->hasSplitSVEObjects() ? getStackHazardSize(MF)
361	: `0`,
362	Scalable: AFI->getStackSizePPR());
363	}
364
365	// Conservatively, returns true if the function is likely to have SVE vectors
366	// on the stack. This function is safe to be called before callee-saves or
367	// object offsets have been determined.
368	static bool isLikelyToHaveSVEStack(const AArch64FrameLowering &AFL,
369	const MachineFunction &MF) {
370	auto *AFI = MF.getInfo<AArch64FunctionInfo>();
371	if (AFI->isSVECC())
372	return true;
373
374	if (AFI->hasCalculatedStackSizeSVE())
375	return bool(AFL.getSVEStackSize(MF));
376
377	const MachineFrameInfo &MFI = MF.getFrameInfo();
378	for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd(); FI++) {
379	if (MFI.hasScalableStackID(ObjectIdx: FI))
380	return true;
381	}
382
383	return false;
384	}
385
386	static bool isTargetWindows(const MachineFunction &MF) {
387	return MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
388	}
389
390	bool AArch64FrameLowering::hasSVECalleeSavesAboveFrameRecord(
391	const MachineFunction &MF) const {
392	auto *AFI = MF.getInfo<AArch64FunctionInfo>();
393	return isTargetWindows(MF) && AFI->getSVECalleeSavedStackSize();
394	}
395
396	/// Returns true if a homogeneous prolog or epilog code can be emitted
397	/// for the size optimization. If possible, a frame helper call is injected.
398	/// When Exit block is given, this check is for epilog.
399	bool AArch64FrameLowering::homogeneousPrologEpilog(
400	MachineFunction &MF, MachineBasicBlock Exit) const* {
401	if (!MF.getFunction().hasMinSize())
402	return false;
403	if (!EnableHomogeneousPrologEpilog)
404	return false;
405	if (EnableRedZone)
406	return false;
407
408	// TODO: Window is supported yet.
409	if (isTargetWindows(MF))
410	return false;
411
412	// TODO: SVE is not supported yet.
413	if (isLikelyToHaveSVEStack(AFL: *this, MF))
414	return false;
415
416	// Bail on stack adjustment needed on return for simplicity.
417	const MachineFrameInfo &MFI = MF.getFrameInfo();
418	const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
419	if (MFI.hasVarSizedObjects() \|\| RegInfo->hasStackRealignment(MF))
420	return false;
421	if (Exit && getArgumentStackToRestore(MF, MBB&: *Exit))
422	return false;
423
424	auto *AFI = MF.getInfo<AArch64FunctionInfo>();
425	if (AFI->hasSwiftAsyncContext() \|\| AFI->hasStreamingModeChanges())
426	return false;
427
428	// If there are an odd number of GPRs before LR and FP in the CSRs list,
429	// they will not be paired into one RegPairInfo, which is incompatible with
430	// the assumption made by the homogeneous prolog epilog pass.
431	const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
432	unsigned NumGPRs = `0`;
433	for (unsigned I = `0`; CSRegs[I]; ++I) {
434	Register Reg = CSRegs[I];
435	if (Reg == AArch64::LR) {
436	assert(CSRegs[I + `1`] == AArch64::FP);
437	if (NumGPRs % `2` != `0`)
438	return false;
439	break;
440	}
441	if (AArch64::GPR64RegClass.contains(Reg))
442	++NumGPRs;
443	}
444
445	return true;
446	}
447
448	/// Returns true if CSRs should be paired.
449	bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const {
450	return produceCompactUnwindFrame(*this, MF) \|\| homogeneousPrologEpilog(MF);
451	}
452
/// The largest stack-pointer offset that can be encoded in AArch64
/// instructions without a separate calculation and a temporary register.
/// Vector stores/loads are the exception: they cannot encode any displacement
/// (see estimateRSStackSizeLimit(), isAArch64FrameOffsetLegal()).
static constexpr unsigned DefaultSafeSPDisplacement = 255;
458
459	/// Look at each instruction that references stack frames and return the stack
460	/// size limit beyond which some of these instructions will require a scratch
461	/// register during their expansion later.
462	static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
463	// FIXME: For now, just conservatively guesstimate based on unscaled indexing
464	// range. We'll end up allocating an unnecessary spill slot a lot, but
465	// realistically that's not a big deal at this stage of the game.
466	for (MachineBasicBlock &MBB : MF) {
467	for (MachineInstr &MI : MBB) {
468	if (MI.isDebugInstr() \|\| MI.isPseudo() \|\|
469	MI.getOpcode() == AArch64::ADDXri \|\|
470	MI.getOpcode() == AArch64::ADDSXri)
471	continue;
472
473	for (const MachineOperand &MO : MI.operands()) {
474	if (!MO.isFI())
475	continue;
476
477	StackOffset Offset;
478	if (isAArch64FrameOffsetLegal(MI, Offset, OutUseUnscaledOp: nullptr, OutUnscaledOp: nullptr, EmittableOffset: nullptr) ==
479	AArch64FrameOffsetCannotUpdate)
480	return `0`;
481	}
482	}
483	}
484	return DefaultSafeSPDisplacement;
485	}
486
487	TargetStackID::Value
488	AArch64FrameLowering::getStackIDForScalableVectors() const {
489	return TargetStackID::ScalableVector;
490	}
491
492	unsigned
493	AArch64FrameLowering::getFixedObjectSize(const MachineFunction &MF,
494	const AArch64FunctionInfo *AFI,
495	bool IsWin64, bool IsFunclet) const {
496	assert(AFI->getTailCallReservedStack() % `16` == `0` &&
497	"Tail call reserved stack must be aligned to 16 bytes");
498	if (!IsWin64 \|\| IsFunclet) {
499	return AFI->getTailCallReservedStack();
500	} else {
501	if (AFI->getTailCallReservedStack() != `0` &&
502	!MF.getFunction().getAttributes().hasAttrSomewhere(
503	Kind: Attribute::SwiftAsync))
504	report_fatal_error(reason: "cannot generate ABI-changing tail call for Win64");
505	unsigned FixedObjectSize = AFI->getTailCallReservedStack();
506
507	// Var args are stored here in the primary function.
508	FixedObjectSize += AFI->getVarArgsGPRSize();
509
510	if (MF.hasEHFunclets()) {
511	// Catch objects are stored here in the primary function.
512	const MachineFrameInfo &MFI = MF.getFrameInfo();
513	const WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
514	SmallSetVector<int, `8`> CatchObjFrameIndices;
515	for (const WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
516	for (const WinEHHandlerType &H : TBME.HandlerArray) {
517	int FrameIndex = H.CatchObj.FrameIndex;
518	if ((FrameIndex != INT_MAX) &&
519	CatchObjFrameIndices.insert(X: FrameIndex)) {
520	FixedObjectSize = alignTo(Value: FixedObjectSize,
521	Align: MFI.getObjectAlign(ObjectIdx: FrameIndex).value()) +
522	MFI.getObjectSize(ObjectIdx: FrameIndex);
523	}
524	}
525	}
526	// To support EH funclets we allocate an UnwindHelp object
527	FixedObjectSize += `8`;
528	}
529	return alignTo(Value: FixedObjectSize, Align: `16`);
530	}
531	}
532
533	bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
534	if (!EnableRedZone)
535	return false;
536
537	// Don't use the red zone if the function explicitly asks us not to.
538	// This is typically used for kernel code.
539	const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
540	const unsigned RedZoneSize =
541	Subtarget.getTargetLowering()->getRedZoneSize(F: MF.getFunction());
542	if (!RedZoneSize)
543	return false;
544
545	const MachineFrameInfo &MFI = MF.getFrameInfo();
546	const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
547	uint64_t NumBytes = AFI->getLocalStackSize();
548
549	// If neither NEON or SVE are available, a COPY from one Q-reg to
550	// another requires a spill -> reload sequence. We can do that
551	// using a pre-decrementing store/post-decrementing load, but
552	// if we do so, we can't use the Red Zone.
553	bool LowerQRegCopyThroughMem = Subtarget.hasFPARMv8() &&
554	!Subtarget.isNeonAvailable() &&
555	!Subtarget.hasSVE();
556
557	return !(MFI.hasCalls() \|\| hasFP(MF) \|\| NumBytes > RedZoneSize \|\|
558	AFI->hasSVEStackSize() \|\| LowerQRegCopyThroughMem);
559	}
560
561	/// hasFPImpl - Return true if the specified function should have a dedicated
562	/// frame pointer register.
563	bool AArch64FrameLowering::hasFPImpl(const MachineFunction &MF) const {
564	const MachineFrameInfo &MFI = MF.getFrameInfo();
565	const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
566	const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
567
568	// Win64 EH requires a frame pointer if funclets are present, as the locals
569	// are accessed off the frame pointer in both the parent function and the
570	// funclets.
571	if (MF.hasEHFunclets())
572	return true;
573	// Retain behavior of always omitting the FP for leaf functions when possible.
574	if (MF.getTarget().Options.DisableFramePointerElim(MF))
575	return true;
576	if (MFI.hasVarSizedObjects() \|\| MFI.isFrameAddressTaken() \|\|
577	MFI.hasStackMap() \|\| MFI.hasPatchPoint() \|\|
578	RegInfo->hasStackRealignment(MF))
579	return true;
580
581	// If we:
582	//
583	// 1. Have streaming mode changes
584	// OR:
585	// 2. Have a streaming body with SVE stack objects
586	//
587	// Then the value of VG restored when unwinding to this function may not match
588	// the value of VG used to set up the stack.
589	//
590	// This is a problem as the CFA can be described with an expression of the
591	// form: CFA = SP + NumBytes + VG NumScalableBytes.*
592	//
593	// If the value of VG used in that expression does not match the value used to
594	// set up the stack, an incorrect address for the CFA will be computed, and
595	// unwinding will fail.
596	//
597	// We work around this issue by ensuring the frame-pointer can describe the
598	// CFA in either of these cases.
599	if (AFI.needsDwarfUnwindInfo(MF) &&
600	((requiresSaveVG(MF) \|\| AFI.getSMEFnAttrs().hasStreamingBody()) &&
601	(!AFI.hasCalculatedStackSizeSVE() \|\| AFI.hasSVEStackSize())))
602	return true;
603	// With large callframes around we may need to use FP to access the scavenging
604	// emergency spillslot.
605	//
606	// Unfortunately some calls to hasFP() like machine verifier ->
607	// getReservedReg() -> hasFP in the middle of global isel are too early
608	// to know the max call frame size. Hopefully conservatively returning "true"
609	// in those cases is fine.
610	// DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
611	if (!MFI.isMaxCallFrameSizeComputed() \|\|
612	MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
613	return true;
614
615	return false;
616	}
617
618	/// Should the Frame Pointer be reserved for the current function?
619	bool AArch64FrameLowering::isFPReserved(const MachineFunction &MF) const {
620	const TargetMachine &TM = MF.getTarget();
621	const Triple &TT = TM.getTargetTriple();
622
623	// These OSes require the frame chain is valid, even if the current frame does
624	// not use a frame pointer.
625	if (TT.isOSDarwin() \|\| TT.isOSWindows())
626	return true;
627
628	// If the function has a frame pointer, it is reserved.
629	if (hasFP(MF))
630	return true;
631
632	// Frontend has requested to preserve the frame pointer.
633	if (TM.Options.FramePointerIsReserved(MF))
634	return true;
635
636	return false;
637	}
638
639	/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
640	/// not required, we reserve argument space for call sites in the function
641	/// immediately on entry to the current function. This eliminates the need for
642	/// add/sub sp brackets around call sites. Returns true if the call frame is
643	/// included as part of the stack frame.
644	bool AArch64FrameLowering::hasReservedCallFrame(
645	const MachineFunction &MF) const {
646	// The stack probing code for the dynamically allocated outgoing arguments
647	// area assumes that the stack is probed at the top - either by the prologue
648	// code, which issues a probe if `hasVarSizedObjects` return true, or by the
649	// most recent variable-sized object allocation. Changing the condition here
650	// may need to be followed up by changes to the probe issuing logic.
651	return !MF.getFrameInfo().hasVarSizedObjects();
652	}
653
654	MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
655	MachineFunction &MF, MachineBasicBlock &MBB,
656	MachineBasicBlock::iterator I) const {
657
658	const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
659	const AArch64InstrInfo *TII = Subtarget.getInstrInfo();
660	const AArch64TargetLowering *TLI = Subtarget.getTargetLowering();
661	[[maybe_unused]] MachineFrameInfo &MFI = MF.getFrameInfo();
662	DebugLoc DL = I ->getDebugLoc();
663	unsigned Opc = I ->getOpcode();
664	bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
665	uint64_t CalleePopAmount = IsDestroy ? I ->getOperand(i: `1`).getImm() : `0`;
666
667	if (!hasReservedCallFrame(MF)) {
668	int64_t Amount = I ->getOperand(i: `0`).getImm();
669	Amount = alignTo(Size: Amount, A: getStackAlign());
670	if (!IsDestroy)
671	Amount = -Amount;
672
673	// N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
674	// doesn't have to pop anything), then the first operand will be zero too so
675	// this adjustment is a no-op.
676	if (CalleePopAmount == `0`) {
677	// FIXME: in-function stack adjustment for calls is limited to 24-bits
678	// because there's no guaranteed temporary register available.
679	//
680	// ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
681	// 1) For offset <= 12-bit, we use LSL #0
682	// 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
683	// LSL #0, and the other uses LSL #12.
684	//
685	// Most call frames will be allocated at the start of a function so
686	// this is OK, but it is a limitation that needs dealing with.
687	assert(Amount > -`0xffffff` && Amount < `0xffffff` && "call frame too large");
688
689	if (TLI->hasInlineStackProbe(MF) &&
690	-Amount >= AArch64::StackProbeMaxUnprobedStack) {
691	// When stack probing is enabled, the decrement of SP may need to be
692	// probed. We only need to do this if the call site needs 1024 bytes of
693	// space or more, because a region smaller than that is allowed to be
694	// unprobed at an ABI boundary. We rely on the fact that SP has been
695	// probed exactly at this point, either by the prologue or most recent
696	// dynamic allocation.
697	assert(MFI.hasVarSizedObjects() &&
698	"non-reserved call frame without var sized objects?");
699	Register ScratchReg =
700	MF.getRegInfo().createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
701	inlineStackProbeFixed(MBBI: I, ScratchReg, FrameSize: -Amount, CFAOffset: StackOffset::get(Fixed: `0`, Scalable: `0`));
702	} else {
703	emitFrameOffset(MBB, MBBI: I, DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
704	Offset: StackOffset::getFixed(Fixed: Amount), TII);
705	}
706	}
707	} else if (CalleePopAmount != `0`) {
708	// If the calling convention demands that the callee pops arguments from the
709	// stack, we want to add it back if we have a reserved call frame.
710	assert(CalleePopAmount < `0xffffff` && "call frame too large");
711	emitFrameOffset(MBB, MBBI: I, DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
712	Offset: StackOffset::getFixed(Fixed: -(int64_t)CalleePopAmount), TII);
713	}
714	return MBB.erase(I);
715	}
716
717	void AArch64FrameLowering::resetCFIToInitialState(
718	MachineBasicBlock &MBB) const {
719
720	MachineFunction &MF = *MBB.getParent();
721	const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
722	const auto &TRI = *Subtarget.getRegisterInfo();
723	const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
724
725	CFIInstBuilder CFIBuilder(MBB, MBB.begin(), MachineInstr::NoFlags);
726
727	// Reset the CFA to `SP + 0`.
728	CFIBuilder.buildDefCFA(Reg: AArch64::SP, Offset: `0`);
729
730	// Flip the RA sign state.
731	if (MFI.shouldSignReturnAddress(MF))
732	MFI.branchProtectionPAuthLR() ? CFIBuilder.buildNegateRAStateWithPC()
733	: CFIBuilder.buildNegateRAState();
734
735	// Shadow call stack uses X18, reset it.
736	if (MFI.needsShadowCallStackPrologueEpilogue(MF))
737	CFIBuilder.buildSameValue(Reg: AArch64::X18);
738
739	// Emit .cfi_same_value for callee-saved registers.
740	const std::vector<CalleeSavedInfo> &CSI =
741	MF.getFrameInfo().getCalleeSavedInfo();
742	for (const auto &Info : CSI) {
743	MCRegister Reg = Info.getReg();
744	if (!TRI.regNeedsCFI(Reg, RegToUseForCFI&: Reg))
745	continue;
746	CFIBuilder.buildSameValue(Reg);
747	}
748	}
749
750	static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) {
751	switch (Reg.id()) {
752	default:
753	// The called routine is expected to preserve r19-r28
754	// r29 and r30 are used as frame pointer and link register resp.
755	return `0`;
756
757	// GPRs
758	#define CASE(n) \
759	case AArch64::W##n: \
760	case AArch64::X##n: \
761	return AArch64::X##n
762	CASE(`0`);
763	CASE(`1`);
764	CASE(`2`);
765	CASE(`3`);
766	CASE(`4`);
767	CASE(`5`);
768	CASE(`6`);
769	CASE(`7`);
770	CASE(`8`);
771	CASE(`9`);
772	CASE(`10`);
773	CASE(`11`);
774	CASE(`12`);
775	CASE(`13`);
776	CASE(`14`);
777	CASE(`15`);
778	CASE(`16`);
779	CASE(`17`);
780	CASE(`18`);
781	#undef CASE
782
783	// FPRs
784	#define CASE(n) \
785	case AArch64::B##n: \
786	case AArch64::H##n: \
787	case AArch64::S##n: \
788	case AArch64::D##n: \
789	case AArch64::Q##n: \
790	return HasSVE ? AArch64::Z##n : AArch64::Q##n
791	CASE(`0`);
792	CASE(`1`);
793	CASE(`2`);
794	CASE(`3`);
795	CASE(`4`);
796	CASE(`5`);
797	CASE(`6`);
798	CASE(`7`);
799	CASE(`8`);
800	CASE(`9`);
801	CASE(`10`);
802	CASE(`11`);
803	CASE(`12`);
804	CASE(`13`);
805	CASE(`14`);
806	CASE(`15`);
807	CASE(`16`);
808	CASE(`17`);
809	CASE(`18`);
810	CASE(`19`);
811	CASE(`20`);
812	CASE(`21`);
813	CASE(`22`);
814	CASE(`23`);
815	CASE(`24`);
816	CASE(`25`);
817	CASE(`26`);
818	CASE(`27`);
819	CASE(`28`);
820	CASE(`29`);
821	CASE(`30`);
822	CASE(`31`);
823	#undef CASE
824	}
825	}
826
827	void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero,
828	MachineBasicBlock &MBB) const {
829	// Insertion point.
830	MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
831
832	// Fake a debug loc.
833	DebugLoc DL;
834	if (MBBI != MBB.end())
835	DL = MBBI ->getDebugLoc();
836
837	const MachineFunction &MF = *MBB.getParent();
838	const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
839	const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
840
841	BitVector GPRsToZero(TRI.getNumRegs());
842	BitVector FPRsToZero(TRI.getNumRegs());
843	bool HasSVE = STI.isSVEorStreamingSVEAvailable();
844	for (MCRegister Reg : RegsToZero.set_bits()) {
845	if (TRI.isGeneralPurposeRegister(MF, Reg)) {
846	// For GPRs, we only care to clear out the 64-bit register.
847	if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE))
848	GPRsToZero.set(XReg);
849	} else if (AArch64InstrInfo::isFpOrNEON(Reg)) {
850	// For FPRs,
851	if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE))
852	FPRsToZero.set(XReg);
853	}
854	}
855
856	const AArch64InstrInfo &TII = *STI.getInstrInfo();
857
858	// Zero out GPRs.
859	for (MCRegister Reg : GPRsToZero.set_bits())
860	TII.buildClearRegister(Reg, MBB, Iter: MBBI, DL);
861
862	// Zero out FP/vector registers.
863	for (MCRegister Reg : FPRsToZero.set_bits())
864	TII.buildClearRegister(Reg, MBB, Iter: MBBI, DL);
865
866	if (HasSVE) {
867	for (MCRegister PReg :
868	{AArch64::P0, AArch64::P1, AArch64::P2, AArch64::P3, AArch64::P4,
869	AArch64::P5, AArch64::P6, AArch64::P7, AArch64::P8, AArch64::P9,
870	AArch64::P10, AArch64::P11, AArch64::P12, AArch64::P13, AArch64::P14,
871	AArch64::P15}) {
872	if (RegsToZero [PReg])
873	BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: AArch64::PFALSE), DestReg: PReg);
874	}
875	}
876	}
877
878	bool AArch64FrameLowering::windowsRequiresStackProbe(
879	const MachineFunction &MF, uint64_t StackSizeInBytes) const {
880	const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
881	const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>();
882	// TODO: When implementing stack protectors, take that into account
883	// for the probe threshold.
884	return Subtarget.isTargetWindows() && MFI.hasStackProbing() &&
885	StackSizeInBytes >= uint64_t(MFI.getStackProbeSize());
886	}
887
888	static void getLiveRegsForEntryMBB(LivePhysRegs &LiveRegs,
889	const MachineBasicBlock &MBB) {
890	const MachineFunction *MF = MBB.getParent();
891	LiveRegs.addLiveIns(MBB);
892	// Mark callee saved registers as used so we will not choose them.
893	const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
894	for (unsigned i = `0`; CSRegs[i]; ++i)
895	LiveRegs.addReg(Reg: CSRegs[i]);
896	}
897
898	Register
899	AArch64FrameLowering::findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
900	bool HasCall) const {
901	MachineFunction *MF = MBB->getParent();
902
903	// If MBB is an entry block, use X9 as the scratch register
904	// preserve_none functions may be using X9 to pass arguments,
905	// so prefer to pick an available register below.
906	if (&MF->front() == MBB &&
907	MF->getFunction().getCallingConv() != CallingConv::PreserveNone)
908	return AArch64::X9;
909
910	const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
911	const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
912	LivePhysRegs LiveRegs(TRI);
913	getLiveRegsForEntryMBB(LiveRegs, MBB: *MBB);
914	if (HasCall) {
915	LiveRegs.addReg(Reg: AArch64::X16);
916	LiveRegs.addReg(Reg: AArch64::X17);
917	LiveRegs.addReg(Reg: AArch64::X18);
918	}
919
920	// Prefer X9 since it was historically used for the prologue scratch reg.
921	const MachineRegisterInfo &MRI = MF->getRegInfo();
922	if (LiveRegs.available(MRI, Reg: AArch64::X9))
923	return AArch64::X9;
924
925	for (unsigned Reg : AArch64::GPR64RegClass) {
926	if (LiveRegs.available(MRI, Reg))
927	return Reg;
928	}
929	return AArch64::NoRegister;
930	}
931
932	bool AArch64FrameLowering::canUseAsPrologue(
933	const MachineBasicBlock &MBB) const {
934	const MachineFunction *MF = MBB.getParent();
935	MachineBasicBlock TmpMBB = const_cast<MachineBasicBlock >(&MBB);
936	const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
937	const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
938	const AArch64TargetLowering *TLI = Subtarget.getTargetLowering();
939	const AArch64FunctionInfo *AFI = MF->getInfo<AArch64FunctionInfo>();
940
941	if (AFI->hasSwiftAsyncContext()) {
942	const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
943	const MachineRegisterInfo &MRI = MF->getRegInfo();
944	LivePhysRegs LiveRegs(TRI);
945	getLiveRegsForEntryMBB(LiveRegs, MBB);
946	// The StoreSwiftAsyncContext clobbers X16 and X17. Make sure they are
947	// available.
948	if (!LiveRegs.available(MRI, Reg: AArch64::X16) \|\|
949	!LiveRegs.available(MRI, Reg: AArch64::X17))
950	return false;
951	}
952
953	// Certain stack probing sequences might clobber flags, then we can't use
954	// the block as a prologue if the flags register is a live-in.
955	if (MF->getInfo<AArch64FunctionInfo>()->hasStackProbing() &&
956	MBB.isLiveIn(Reg: AArch64::NZCV))
957	return false;
958
959	if (RegInfo->hasStackRealignment(MF: MF) \|\| TLI->hasInlineStackProbe(MF: MF))
960	if (findScratchNonCalleeSaveRegister(MBB: TmpMBB) == AArch64::NoRegister)
961	return false;
962
963	// May need a scratch register (for return value) if require making a special
964	// call
965	if (requiresSaveVG(MF: *MF) \|\|
966	windowsRequiresStackProbe(MF: *MF, StackSizeInBytes: std::numeric_limits<uint64_t>::max()))
967	if (findScratchNonCalleeSaveRegister(MBB: TmpMBB, HasCall: true) == AArch64::NoRegister)
968	return false;
969
970	return true;
971	}
972
973	bool AArch64FrameLowering::needsWinCFI(const MachineFunction &MF) const {
974	const Function &F = MF.getFunction();
975	return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
976	F.needsUnwindTableEntry();
977	}
978
979	bool AArch64FrameLowering::shouldSignReturnAddressEverywhere(
980	const MachineFunction &MF) const {
981	// FIXME: With WinCFI, extra care should be taken to place SEH_PACSignLR
982	// and SEH_EpilogEnd instructions in the correct order.
983	if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
984	return false;
985	const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
986	return AFI->getSignReturnAddressCondition() == SignReturnAddress::All;
987	}
988
989	// Given a load or a store instruction, generate an appropriate unwinding SEH
990	// code on Windows.
991	MachineBasicBlock::iterator
992	AArch64FrameLowering::insertSEH(MachineBasicBlock::iterator MBBI,
993	const AArch64InstrInfo &TII,
994	MachineInstr::MIFlag Flag) const {
995	unsigned Opc = MBBI ->getOpcode();
996	MachineBasicBlock *MBB = MBBI ->getParent();
997	MachineFunction &MF = *MBB->getParent();
998	DebugLoc DL = MBBI ->getDebugLoc();
999	unsigned ImmIdx = MBBI ->getNumOperands() - `1`;
1000	int Imm = MBBI ->getOperand(i: ImmIdx).getImm();
1001	MachineInstrBuilder MIB;
1002	const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1003	const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1004
1005	switch (Opc) {
1006	default:
1007	report_fatal_error(reason: "No SEH Opcode for this instruction");
1008	case AArch64::STR_ZXI:
1009	case AArch64::LDR_ZXI: {
1010	unsigned Reg0 = RegInfo->getSEHRegNum(i: MBBI ->getOperand(i: `0`).getReg());
1011	MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveZReg))
1012	.addImm(Val: Reg0)
1013	.addImm(Val: Imm)
1014	.setMIFlag(Flag);
1015	break;
1016	}
1017	case AArch64::STR_PXI:
1018	case AArch64::LDR_PXI: {
1019	unsigned Reg0 = RegInfo->getSEHRegNum(i: MBBI ->getOperand(i: `0`).getReg());
1020	MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SavePReg))
1021	.addImm(Val: Reg0)
1022	.addImm(Val: Imm)
1023	.setMIFlag(Flag);
1024	break;
1025	}
1026	case AArch64::LDPDpost:
1027	Imm = -Imm;
1028	[[fallthrough]];
1029	case AArch64::STPDpre: {
1030	unsigned Reg0 = RegInfo->getSEHRegNum(i: MBBI ->getOperand(i: `1`).getReg());
1031	unsigned Reg1 = RegInfo->getSEHRegNum(i: MBBI ->getOperand(i: `2`).getReg());
1032	MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveFRegP_X))
1033	.addImm(Val: Reg0)
1034	.addImm(Val: Reg1)
1035	.addImm(Val: Imm * `8`)
1036	.setMIFlag(Flag);
1037	break;
1038	}
1039	case AArch64::LDPXpost:
1040	Imm = -Imm;
1041	[[fallthrough]];
1042	case AArch64::STPXpre: {
1043	Register Reg0 = MBBI ->getOperand(i: `1`).getReg();
1044	Register Reg1 = MBBI ->getOperand(i: `2`).getReg();
1045	if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
1046	MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveFPLR_X))
1047	.addImm(Val: Imm * `8`)
1048	.setMIFlag(Flag);
1049	else
1050	MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveRegP_X))
1051	.addImm(Val: RegInfo->getSEHRegNum(i: Reg0))
1052	.addImm(Val: RegInfo->getSEHRegNum(i: Reg1))
1053	.addImm(Val: Imm * `8`)
1054	.setMIFlag(Flag);
1055	break;
1056	}
1057	case AArch64::LDRDpost:
1058	Imm = -Imm;
1059	[[fallthrough]];
1060	case AArch64::STRDpre: {
1061	unsigned Reg = RegInfo->getSEHRegNum(i: MBBI ->getOperand(i: `1`).getReg());
1062	MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveFReg_X))
1063	.addImm(Val: Reg)
1064	.addImm(Val: Imm)
1065	.setMIFlag(Flag);
1066	break;
1067	}
1068	case AArch64::LDRXpost:
1069	Imm = -Imm;
1070	[[fallthrough]];
1071	case AArch64::STRXpre: {
1072	unsigned Reg = RegInfo->getSEHRegNum(i: MBBI ->getOperand(i: `1`).getReg());
1073	MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveReg_X))
1074	.addImm(Val: Reg)
1075	.addImm(Val: Imm)
1076	.setMIFlag(Flag);
1077	break;
1078	}
1079	case AArch64::STPDi:
1080	case AArch64::LDPDi: {
1081	unsigned Reg0 = RegInfo->getSEHRegNum(i: MBBI ->getOperand(i: `0`).getReg());
1082	unsigned Reg1 = RegInfo->getSEHRegNum(i: MBBI ->getOperand(i: `1`).getReg());
1083	MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveFRegP))
1084	.addImm(Val: Reg0)
1085	.addImm(Val: Reg1)
1086	.addImm(Val: Imm * `8`)
1087	.setMIFlag(Flag);
1088	break;
1089	}
1090	case AArch64::STPXi:
1091	case AArch64::LDPXi: {
1092	Register Reg0 = MBBI ->getOperand(i: `0`).getReg();
1093	Register Reg1 = MBBI ->getOperand(i: `1`).getReg();
1094
1095	int SEHReg0 = RegInfo->getSEHRegNum(i: Reg0);
1096	int SEHReg1 = RegInfo->getSEHRegNum(i: Reg1);
1097
1098	if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
1099	MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveFPLR))
1100	.addImm(Val: Imm * `8`)
1101	.setMIFlag(Flag);
1102	else if (SEHReg0 >= `19` && SEHReg1 >= `19`)
1103	MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveRegP))
1104	.addImm(Val: SEHReg0)
1105	.addImm(Val: SEHReg1)
1106	.addImm(Val: Imm * `8`)
1107	.setMIFlag(Flag);
1108	else
1109	MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveAnyRegIP))
1110	.addImm(Val: SEHReg0)
1111	.addImm(Val: SEHReg1)
1112	.addImm(Val: Imm * `8`)
1113	.setMIFlag(Flag);
1114	break;
1115	}
1116	case AArch64::STRXui:
1117	case AArch64::LDRXui: {
1118	int Reg = RegInfo->getSEHRegNum(i: MBBI ->getOperand(i: `0`).getReg());
1119	if (Reg >= `19`)
1120	MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveReg))
1121	.addImm(Val: Reg)
1122	.addImm(Val: Imm * `8`)
1123	.setMIFlag(Flag);
1124	else
1125	MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveAnyRegI))
1126	.addImm(Val: Reg)
1127	.addImm(Val: Imm * `8`)
1128	.setMIFlag(Flag);
1129	break;
1130	}
1131	case AArch64::STRDui:
1132	case AArch64::LDRDui: {
1133	unsigned Reg = RegInfo->getSEHRegNum(i: MBBI ->getOperand(i: `0`).getReg());
1134	MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveFReg))
1135	.addImm(Val: Reg)
1136	.addImm(Val: Imm * `8`)
1137	.setMIFlag(Flag);
1138	break;
1139	}
1140	case AArch64::STPQi:
1141	case AArch64::LDPQi: {
1142	unsigned Reg0 = RegInfo->getSEHRegNum(i: MBBI ->getOperand(i: `0`).getReg());
1143	unsigned Reg1 = RegInfo->getSEHRegNum(i: MBBI ->getOperand(i: `1`).getReg());
1144	MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveAnyRegQP))
1145	.addImm(Val: Reg0)
1146	.addImm(Val: Reg1)
1147	.addImm(Val: Imm * `16`)
1148	.setMIFlag(Flag);
1149	break;
1150	}
1151	case AArch64::LDPQpost:
1152	Imm = -Imm;
1153	[[fallthrough]];
1154	case AArch64::STPQpre: {
1155	unsigned Reg0 = RegInfo->getSEHRegNum(i: MBBI ->getOperand(i: `1`).getReg());
1156	unsigned Reg1 = RegInfo->getSEHRegNum(i: MBBI ->getOperand(i: `2`).getReg());
1157	MIB = BuildMI(MF, MIMD: DL, MCID: TII.get(Opcode: AArch64::SEH_SaveAnyRegQPX))
1158	.addImm(Val: Reg0)
1159	.addImm(Val: Reg1)
1160	.addImm(Val: Imm * `16`)
1161	.setMIFlag(Flag);
1162	break;
1163	}
1164	}
1165	auto I = MBB->insertAfter(I: MBBI, MI: MIB);
1166	return I;
1167	}
1168
1169	bool AArch64FrameLowering::requiresSaveVG(const MachineFunction &MF) const {
1170	const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1171	if (!AFI->needsDwarfUnwindInfo(MF) \|\| !AFI->hasStreamingModeChanges())
1172	return false;
1173	// For Darwin platforms we don't save VG for non-SVE functions, even if SME
1174	// is enabled with streaming mode changes.
1175	auto &ST = MF.getSubtarget<AArch64Subtarget>();
1176	if (ST.isTargetDarwin())
1177	return ST.hasSVE();
1178	return true;
1179	}
1180
1181	void AArch64FrameLowering::emitPacRetPlusLeafHardening(
1182	MachineFunction &MF) const {
1183	const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1184	const AArch64InstrInfo *TII = Subtarget.getInstrInfo();
1185
1186	auto EmitSignRA = [&](MachineBasicBlock &MBB) {
1187	DebugLoc DL; // Set debug location to unknown.
1188	MachineBasicBlock::iterator MBBI = MBB.begin();
1189
1190	BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::PAUTH_PROLOGUE))
1191	.setMIFlag(MachineInstr::FrameSetup);
1192	};
1193
1194	auto EmitAuthRA = [&](MachineBasicBlock &MBB) {
1195	DebugLoc DL;
1196	MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
1197	if (MBBI != MBB.end())
1198	DL = MBBI ->getDebugLoc();
1199
1200	BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::PAUTH_EPILOGUE))
1201	.setMIFlag(MachineInstr::FrameDestroy);
1202	};
1203
1204	// This should be in sync with PEIImpl::calculateSaveRestoreBlocks.
1205	EmitSignRA (MF.front());
1206	for (MachineBasicBlock &MBB : MF) {
1207	if (MBB.isEHFuncletEntry())
1208	EmitSignRA (MBB);
1209	if (MBB.isReturnBlock())
1210	EmitAuthRA (MBB);
1211	}
1212	}
1213
1214	void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
1215	MachineBasicBlock &MBB) const {
1216	AArch64PrologueEmitter PrologueEmitter(MF, MBB, *this);
1217	PrologueEmitter.emitPrologue();
1218	}
1219
1220	void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
1221	MachineBasicBlock &MBB) const {
1222	AArch64EpilogueEmitter EpilogueEmitter(MF, MBB, *this);
1223	EpilogueEmitter.emitEpilogue();
1224	}
1225
1226	bool AArch64FrameLowering::enableCFIFixup(const MachineFunction &MF) const {
1227	return TargetFrameLowering::enableCFIFixup(MF) &&
1228	MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF);
1229	}
1230
1231	bool AArch64FrameLowering::enableFullCFIFixup(const MachineFunction &MF) const {
1232	return enableCFIFixup(MF) &&
1233	MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo(MF);
1234	}
1235
1236	/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
1237	/// debug info. It's the same as what we use for resolving the code-gen
1238	/// references for now. FIXME: This can go wrong when references are
1239	/// SP-relative and simple call frames aren't used.
1240	StackOffset
1241	AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
1242	Register &FrameReg) const {
1243	return resolveFrameIndexReference(
1244	MF, FI, FrameReg,
1245	/PreferFP=/
1246	MF.getFunction().hasFnAttribute(Kind: Attribute::SanitizeHWAddress) \|\|
1247	MF.getFunction().hasFnAttribute(Kind: Attribute::SanitizeMemTag),
1248	/ForSimm=/false);
1249	}
1250
1251	StackOffset
1252	AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF,
1253	int FI) const {
1254	// This function serves to provide a comparable offset from a single reference
1255	// point (the value of SP at function entry) that can be used for analysis,
1256	// e.g. the stack-frame-layout analysis pass. It is not guaranteed to be
1257	// correct for all objects in the presence of VLA-area objects or dynamic
1258	// stack re-alignment.
1259
1260	const auto &MFI = MF.getFrameInfo();
1261
1262	int64_t ObjectOffset = MFI.getObjectOffset(ObjectIdx: FI);
1263	StackOffset ZPRStackSize = getZPRStackSize(MF);
1264	StackOffset PPRStackSize = getPPRStackSize(MF);
1265	StackOffset SVEStackSize = ZPRStackSize + PPRStackSize;
1266
1267	// For VLA-area objects, just emit an offset at the end of the stack frame.
1268	// Whilst not quite correct, these objects do live at the end of the frame and
1269	// so it is more useful for analysis for the offset to reflect this.
1270	if (MFI.isVariableSizedObjectIndex(ObjectIdx: FI)) {
1271	return StackOffset::getFixed(Fixed: -((int64_t)MFI.getStackSize())) - SVEStackSize;
1272	}
1273
1274	// This is correct in the absence of any SVE stack objects.
1275	if (!SVEStackSize)
1276	return StackOffset::getFixed(Fixed: ObjectOffset - getOffsetOfLocalArea());
1277
1278	const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
1279	bool FPAfterSVECalleeSaves = hasSVECalleeSavesAboveFrameRecord(MF);
1280	if (MFI.hasScalableStackID(ObjectIdx: FI)) {
1281	if (FPAfterSVECalleeSaves &&
1282	-ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize()) {
1283	assert(!AFI->hasSplitSVEObjects() &&
1284	"split-sve-objects not supported with FPAfterSVECalleeSaves");
1285	return StackOffset::getScalable(Scalable: ObjectOffset);
1286	}
1287	StackOffset AccessOffset{};
1288	// The scalable vectors are below (lower address) the scalable predicates
1289	// with split SVE objects, so we must subtract the size of the predicates.
1290	if (AFI->hasSplitSVEObjects() &&
1291	MFI.getStackID(ObjectIdx: FI) == TargetStackID::ScalableVector)
1292	AccessOffset = -PPRStackSize;
1293	return AccessOffset +
1294	StackOffset::get(Fixed: -((int64_t)AFI->getCalleeSavedStackSize()),
1295	Scalable: ObjectOffset);
1296	}
1297
1298	bool IsFixed = MFI.isFixedObjectIndex(ObjectIdx: FI);
1299	bool IsCSR =
1300	!IsFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
1301
1302	StackOffset ScalableOffset = {};
1303	if (!IsFixed && !IsCSR) {
1304	ScalableOffset = -SVEStackSize;
1305	} else if (FPAfterSVECalleeSaves && IsCSR) {
1306	ScalableOffset =
1307	-StackOffset::getScalable(Scalable: AFI->getSVECalleeSavedStackSize());
1308	}
1309
1310	return StackOffset::getFixed(Fixed: ObjectOffset) + ScalableOffset;
1311	}
1312
1313	StackOffset
1314	AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF,
1315	int FI) const {
1316	return StackOffset::getFixed(Fixed: getSEHFrameIndexOffset(MF, FI));
1317	}
1318
1319	StackOffset AArch64FrameLowering::getFPOffset(const MachineFunction &MF,
1320	int64_t ObjectOffset) const {
1321	const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
1322	const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1323	const Function &F = MF.getFunction();
1324	bool IsWin64 = Subtarget.isCallingConvWin64(CC: F.getCallingConv(), IsVarArg: F.isVarArg());
1325	unsigned FixedObject =
1326	getFixedObjectSize(MF, AFI, IsWin64, /IsFunclet=/false);
1327	int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MFI: MF.getFrameInfo());
1328	int64_t FPAdjust =
1329	CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset();
1330	return StackOffset::getFixed(Fixed: ObjectOffset + FixedObject + FPAdjust);
1331	}
1332
1333	StackOffset AArch64FrameLowering::getStackOffset(const MachineFunction &MF,
1334	int64_t ObjectOffset) const {
1335	const auto &MFI = MF.getFrameInfo();
1336	return StackOffset::getFixed(Fixed: ObjectOffset + (int64_t)MFI.getStackSize());
1337	}
1338
1339	// TODO: This function currently does not work for scalable vectors.
1340	int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
1341	int FI) const {
1342	const AArch64RegisterInfo *RegInfo =
1343	MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
1344	int ObjectOffset = MF.getFrameInfo().getObjectOffset(ObjectIdx: FI);
1345	return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
1346	? getFPOffset(MF, ObjectOffset).getFixed()
1347	: getStackOffset(MF, ObjectOffset).getFixed();
1348	}
1349
1350	StackOffset AArch64FrameLowering::resolveFrameIndexReference(
1351	const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
1352	bool ForSimm) const {
1353	const auto &MFI = MF.getFrameInfo();
1354	int64_t ObjectOffset = MFI.getObjectOffset(ObjectIdx: FI);
1355	bool isFixed = MFI.isFixedObjectIndex(ObjectIdx: FI);
1356	auto StackID = static_cast<TargetStackID::Value>(MFI.getStackID(ObjectIdx: FI));
1357	return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, StackID,
1358	FrameReg, PreferFP, ForSimm);
1359	}
1360
1361	StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
1362	const MachineFunction &MF, int64_t ObjectOffset, bool isFixed,
1363	TargetStackID::Value StackID, Register &FrameReg, bool PreferFP,
1364	bool ForSimm) const {
1365	const auto &MFI = MF.getFrameInfo();
1366	const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1367	const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1368	const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
1369
1370	int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed();
1371	int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed();
1372	bool isCSR =
1373	!isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
1374	bool isSVE = MFI.isScalableStackID(StackID);
1375
1376	StackOffset ZPRStackSize = getZPRStackSize(MF);
1377	StackOffset PPRStackSize = getPPRStackSize(MF);
1378	StackOffset SVEStackSize = ZPRStackSize + PPRStackSize;
1379
1380	// Use frame pointer to reference fixed objects. Use it for locals if
1381	// there are VLAs or a dynamically realigned SP (and thus the SP isn't
1382	// reliable as a base). Make sure useFPForScavengingIndex() does the
1383	// right thing for the emergency spill slot.
1384	bool UseFP = false;
1385	if (AFI->hasStackFrame() && !isSVE) {
1386	// We shouldn't prefer using the FP to access fixed-sized stack objects when
1387	// there are scalable (SVE) objects in between the FP and the fixed-sized
1388	// objects.
1389	PreferFP &= !SVEStackSize;
1390
1391	// Note: Keeping the following as multiple 'if' statements rather than
1392	// merging to a single expression for readability.
1393	//
1394	// Argument access should always use the FP.
1395	if (isFixed) {
1396	UseFP = hasFP(MF);
1397	} else if (isCSR && RegInfo->hasStackRealignment(MF)) {
1398	// References to the CSR area must use FP if we're re-aligning the stack
1399	// since the dynamically-sized alignment padding is between the SP/BP and
1400	// the CSR area.
1401	assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
1402	UseFP = true;
1403	} else if (hasFP(MF) && !RegInfo->hasStackRealignment(MF)) {
1404	// If the FPOffset is negative and we're producing a signed immediate, we
1405	// have to keep in mind that the available offset range for negative
1406	// offsets is smaller than for positive ones. If an offset is available
1407	// via the FP and the SP, use whichever is closest.
1408	bool FPOffsetFits = !ForSimm \|\| FPOffset >= -`256`;
1409	PreferFP \|= Offset > -FPOffset && !SVEStackSize;
1410
1411	if (FPOffset >= `0`) {
1412	// If the FPOffset is positive, that'll always be best, as the SP/BP
1413	// will be even further away.
1414	UseFP = true;
1415	} else if (MFI.hasVarSizedObjects()) {
1416	// If we have variable sized objects, we can use either FP or BP, as the
1417	// SP offset is unknown. We can use the base pointer if we have one and
1418	// FP is not preferred. If not, we're stuck with using FP.
1419	bool CanUseBP = RegInfo->hasBasePointer(MF);
1420	if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
1421	UseFP = PreferFP;
1422	else if (!CanUseBP) // Can't use BP. Forced to use FP.
1423	UseFP = true;
1424	// else we can use BP and FP, but the offset from FP won't fit.
1425	// That will make us scavenge registers which we can probably avoid by
1426	// using BP. If it won't fit for BP either, we'll scavenge anyway.
1427	} else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
1428	// Funclets access the locals contained in the parent's stack frame
1429	// via the frame pointer, so we have to use the FP in the parent
1430	// function.
1431	(void) Subtarget;
1432	assert(Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv(),
1433	MF.getFunction().isVarArg()) &&
1434	"Funclets should only be present on Win64");
1435	UseFP = true;
1436	} else {
1437	// We have the choice between FP and (SP or BP).
1438	if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
1439	UseFP = true;
1440	}
1441	}
1442	}
1443
1444	assert(
1445	((isFixed \|\| isCSR) \|\| !RegInfo->hasStackRealignment(MF) \|\| !UseFP) &&
1446	"In the presence of dynamic stack pointer realignment, "
1447	"non-argument/CSR objects cannot be accessed through the frame pointer");
1448
1449	bool FPAfterSVECalleeSaves = hasSVECalleeSavesAboveFrameRecord(MF);
1450
1451	if (isSVE) {
1452	StackOffset FPOffset = StackOffset::get(
1453	Fixed: -AFI->getCalleeSaveBaseToFrameRecordOffset(), Scalable: ObjectOffset);
1454	StackOffset SPOffset =
1455	SVEStackSize +
1456	StackOffset::get(Fixed: MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
1457	Scalable: ObjectOffset);
1458
1459	// With split SVE objects the ObjectOffset is relative to the split area
1460	// (i.e. the PPR area or ZPR area respectively).
1461	if (AFI->hasSplitSVEObjects() && StackID == TargetStackID::ScalableVector) {
1462	// If we're accessing an SVE vector with split SVE objects...
1463	// - From the FP we need to move down past the PPR area:
1464	FPOffset -= PPRStackSize;
1465	// - From the SP we only need to move up to the ZPR area:
1466	SPOffset -= PPRStackSize;
1467	// Note: `SPOffset = SVEStackSize + ...`, so `-= PPRStackSize` results in
1468	// `SPOffset = ZPRStackSize + ...`.
1469	}
1470
1471	if (FPAfterSVECalleeSaves) {
1472	FPOffset += StackOffset::getScalable(Scalable: AFI->getSVECalleeSavedStackSize());
1473	if (-ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize()) {
1474	FPOffset += StackOffset::getFixed(Fixed: AFI->getCalleeSavedStackSize());
1475	SPOffset += StackOffset::getFixed(Fixed: AFI->getCalleeSavedStackSize());
1476	}
1477	}
1478
1479	// Always use the FP for SVE spills if available and beneficial.
1480	if (hasFP(MF) && (SPOffset.getFixed() \|\|
1481	FPOffset.getScalable() < SPOffset.getScalable() \|\|
1482	RegInfo->hasStackRealignment(MF))) {
1483	FrameReg = RegInfo->getFrameRegister(MF);
1484	return FPOffset;
1485	}
1486	FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
1487	: MCRegister (AArch64::SP);
1488
1489	return SPOffset;
1490	}
1491
1492	StackOffset SVEAreaOffset = {};
1493	if (FPAfterSVECalleeSaves) {
1494	// In this stack layout, the FP is in between the callee saves and other
1495	// SVE allocations.
1496	StackOffset SVECalleeSavedStack =
1497	StackOffset::getScalable(Scalable: AFI->getSVECalleeSavedStackSize());
1498	if (UseFP) {
1499	if (isFixed)
1500	SVEAreaOffset = SVECalleeSavedStack;
1501	else if (!isCSR)
1502	SVEAreaOffset = SVECalleeSavedStack - SVEStackSize;
1503	} else {
1504	if (isFixed)
1505	SVEAreaOffset = SVEStackSize;
1506	else if (isCSR)
1507	SVEAreaOffset = SVEStackSize - SVECalleeSavedStack;
1508	}
1509	} else {
1510	if (UseFP && !(isFixed \|\| isCSR))
1511	SVEAreaOffset = -SVEStackSize;
1512	if (!UseFP && (isFixed \|\| isCSR))
1513	SVEAreaOffset = SVEStackSize;
1514	}
1515
1516	if (UseFP) {
1517	FrameReg = RegInfo->getFrameRegister(MF);
1518	return StackOffset::getFixed(Fixed: FPOffset) + SVEAreaOffset;
1519	}
1520
1521	// Use the base pointer if we have one.
1522	if (RegInfo->hasBasePointer(MF))
1523	FrameReg = RegInfo->getBaseRegister();
1524	else {
1525	assert(!MFI.hasVarSizedObjects() &&
1526	"Can't use SP when we have var sized objects.");
1527	FrameReg = AArch64::SP;
1528	// If we're using the red zone for this function, the SP won't actually
1529	// be adjusted, so the offsets will be negative. They're also all
1530	// within range of the signed 9-bit immediate instructions.
1531	if (canUseRedZone(MF))
1532	Offset -= AFI->getLocalStackSize();
1533	}
1534
1535	return StackOffset::getFixed(Fixed: Offset) + SVEAreaOffset;
1536	}
1537
1538	static RegState getPrologueDeath(MachineFunction &MF, unsigned Reg) {
1539	// Do not set a kill flag on values that are also marked as live-in. This
1540	// happens with the @llvm-returnaddress intrinsic and with arguments passed in
1541	// callee saved registers.
1542	// Omitting the kill flags is conservatively correct even if the live-in
1543	// is not used after all.
1544	bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
1545	return getKillRegState(B: !IsLiveIn);
1546	}
1547
1548	static bool produceCompactUnwindFrame(const AArch64FrameLowering &AFL,
1549	MachineFunction &MF) {
1550	const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1551	AttributeList Attrs = MF.getFunction().getAttributes();
1552	AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1553	return Subtarget.isTargetMachO() &&
1554	!(Subtarget.getTargetLowering()->supportSwiftError() &&
1555	Attrs.hasAttrSomewhere(Kind: Attribute::SwiftError)) &&
1556	MF.getFunction().getCallingConv() != CallingConv::SwiftTail &&
1557	!AFL.requiresSaveVG(MF) && !AFI->isSVECC();
1558	}
1559
1560	static bool invalidateWindowsRegisterPairing(bool SpillExtendedVolatile,
1561	unsigned SpillCount, unsigned Reg1,
1562	unsigned Reg2, bool NeedsWinCFI,
1563	const TargetRegisterInfo *TRI) {
1564	// If we are generating register pairs for a Windows function that requires
1565	// EH support, then pair consecutive registers only. There are no unwind
1566	// opcodes for saves/restores of non-consecutive register pairs.
1567	// The unwind opcodes are save_regp, save_regp_x, save_fregp, save_frepg_x,
1568	// save_lrpair.
1569	// https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
1570
1571	if (Reg2 == AArch64::FP)
1572	return true;
1573	if (!NeedsWinCFI)
1574	return false;
1575
1576	// ARM64EC introduced `save_any_regp`, which expects 16-byte alignment.
1577	// This is handled by only allowing paired spills for registers spilled at
1578	// even positions (which should be 16-byte aligned, as other GPRs/FPRs are
1579	// 8-bytes). We carve out an exception for {FP,LR}, which does not require
1580	// 16-byte alignment in the uop representation.
1581	if (TRI->getEncodingValue(Reg: Reg2) == TRI->getEncodingValue(Reg: Reg1) + `1`)
1582	return SpillExtendedVolatile
1583	? !((Reg1 == AArch64::FP && Reg2 == AArch64::LR) \|\|
1584	(SpillCount % `2`) == `0`)
1585	: false;
1586
1587	// If pairing a GPR with LR, the pair can be described by the save_lrpair
1588	// opcode. The save_lrpair opcode requires the first register to be odd.
1589	if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 &&
1590	(Reg1 - AArch64::X19) % `2` == `0` && Reg2 == AArch64::LR)
1591	return false;
1592	return true;
1593	}
1594
1595	/// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction.
1596	/// WindowsCFI requires that only consecutive registers can be paired.
1597	/// LR and FP need to be allocated together when the frame needs to save
1598	/// the frame-record. This means any other register pairing with LR is invalid.
1599	static bool invalidateRegisterPairing(bool SpillExtendedVolatile,
1600	unsigned SpillCount, unsigned Reg1,
1601	unsigned Reg2, bool UsesWinAAPCS,
1602	bool NeedsWinCFI, bool NeedsFrameRecord,
1603	const TargetRegisterInfo *TRI) {
1604	if (UsesWinAAPCS)
1605	return invalidateWindowsRegisterPairing(SpillExtendedVolatile, SpillCount,
1606	Reg1, Reg2, NeedsWinCFI, TRI);
1607
1608	// If we need to store the frame record, don't pair any register
1609	// with LR other than FP.
1610	if (NeedsFrameRecord)
1611	return Reg2 == AArch64::LR;
1612
1613	return false;
1614	}
1615
1616	namespace {
1617
1618	struct RegPairInfo {
1619	Register Reg1;
1620	Register Reg2;
1621	int FrameIdx;
1622	int Offset;
1623	enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type;
1624	const TargetRegisterClass *RC;
1625
1626	RegPairInfo() = default;
1627
1628	bool isPaired() const { return Reg2.isValid(); }
1629
1630	bool isScalable() const { return Type == PPR \|\| Type == ZPR; }
1631	};
1632
1633	} // end anonymous namespace
1634
1635	MCRegister findFreePredicateReg(BitVector &SavedRegs) {
1636	for (unsigned PReg = AArch64::P8; PReg <= AArch64::P15; ++PReg) {
1637	if (SavedRegs.test(Idx: PReg)) {
1638	unsigned PNReg = PReg - AArch64::P0 + AArch64::PN0;
1639	return MCRegister (PNReg);
1640	}
1641	}
1642	return MCRegister ();
1643	}
1644
1645	// The multivector LD/ST are available only for SME or SVE2p1 targets
1646	bool enableMultiVectorSpillFill(const AArch64Subtarget &Subtarget,
1647	MachineFunction &MF) {
1648	if (DisableMultiVectorSpillFill)
1649	return false;
1650
1651	SMEAttrs FuncAttrs = MF.getInfo<AArch64FunctionInfo>()->getSMEFnAttrs();
1652	bool IsLocallyStreaming =
1653	FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface();
1654
1655	// Only when in streaming mode SME2 instructions can be safely used.
1656	// It is not safe to use SME2 instructions when in streaming compatible or
1657	// locally streaming mode.
1658	return Subtarget.hasSVE2p1() \|\|
1659	(Subtarget.hasSME2() &&
1660	(!IsLocallyStreaming && Subtarget.isStreaming()));
1661	}
1662
// Groups the callee-saved registers listed in CSI into spill/fill units and
// appends one RegPairInfo per unit to RegPairs.
//
// For every unit this records the register(s), the register kind and class,
// the frame index, and the SP-relative offset expressed in multiples of the
// register class' spill size. Two adjacent CSI entries are merged into a pair
// only when the per-kind checks in the switch below allow it and no
// callee-save hazard padding is in use.
//
// Side effects visible in this function:
//  - may raise the alignment of a callee-save stack object to 16 bytes to
//    create the alignment gap in the callee-save area;
//  - stores the offset of the frame record through
//    AFI->setCalleeSaveBaseToFrameRecordOffset() when NeedsFrameRecord is set
//    and the FP/LR entry is found.
//
// On Windows the CSI array is walked backwards and the stack is filled bottom
// up; RegPairs is reversed at the end so callers see top-down order.
// Does nothing when CSI is empty.
1663	void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL,
1664	MachineFunction &MF,
1665	ArrayRef<CalleeSavedInfo> CSI,
1666	const TargetRegisterInfo *TRI,
1667	SmallVectorImpl<RegPairInfo> &RegPairs,
1668	bool NeedsFrameRecord) {
1669
1670	if (CSI.empty())
1671	return;
1672
1673	bool IsWindows = isTargetWindows(MF);
1674	AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1675	unsigned StackHazardSize = getStackHazardSize(MF);
1676	MachineFrameInfo &MFI = MF.getFrameInfo();
1677	CallingConv::ID CC = MF.getFunction().getCallingConv();
1678	unsigned Count = CSI.size();
// CC is only read inside assertions; the cast silences unused-variable
// warnings in builds where asserts are compiled out.
1679	(void)CC;
1680	// MachO's compact unwind format relies on all registers being stored in
1681	// pairs.
1682	assert((!produceCompactUnwindFrame(AFL, MF) \|\|
1683	CC == CallingConv::PreserveMost \|\| CC == CallingConv::PreserveAll \|\|
1684	CC == CallingConv::CXX_FAST_TLS \|\| CC == CallingConv::Win64 \|\|
1685	(Count & `1`) == `0`) &&
1686	"Odd number of callee-saved regs to spill!");
// Default (non-Windows): start at the top of the callee-save area and fill
// downwards, walking CSI forwards.
1687	int ByteOffset = AFI->getCalleeSavedStackSize();
1688	int StackFillDir = -`1`;
1689	int RegInc = `1`;
1690	unsigned FirstReg = `0`;
1691	if (IsWindows) {
1692	// For WinCFI, fill the stack from the bottom up.
1693	ByteOffset = `0`;
1694	StackFillDir = `1`;
1695	// As the CSI array is reversed to match PrologEpilogInserter, iterate
1696	// backwards, to pair up registers starting from lower numbered registers.
1697	RegInc = -`1`;
1698	FirstReg = Count - `1`;
1699	}
1700
1701	bool FPAfterSVECalleeSaves = AFL.hasSVECalleeSavesAboveFrameRecord(MF);
1702	// Windows AAPCS has x9-x15 as volatile registers, x16-x17 as intra-procedural
1703	// scratch, x18 as platform reserved. However, clang has extended calling
1704	// conventions such as preserve_most and preserve_all which treat these as
1705	// CSR. As such, the ARM64 unwind uOPs bias registers by 19. We use ARM64EC
1706	// uOPs which have separate restrictions. We need to check for that.
1707	//
1708	// NOTE: we currently do not account for the D registers as LLVM does not
1709	// support non-ABI compliant D register spills.
1710	bool SpillExtendedVolatile =
1711	IsWindows && llvm::any_of(Range&: CSI, P: [](const CalleeSavedInfo &CSI) {
1712	const auto &Reg = CSI.getReg();
1713	return Reg >= AArch64::X0 && Reg <= AArch64::X18;
1714	});
1715
// Running byte offsets for the scalable callee-saves. With split SVE objects
// ZPRs and PPRs are tracked separately; otherwise a single offset
// (ZPRByteOffset) is used for both.
1716	int ZPRByteOffset = `0`;
1717	int PPRByteOffset = `0`;
1718	bool SplitPPRs = AFI->hasSplitSVEObjects();
1719	if (SplitPPRs) {
1720	ZPRByteOffset = AFI->getZPRCalleeSavedStackSize();
1721	PPRByteOffset = AFI->getPPRCalleeSavedStackSize();
1722	} else if (!FPAfterSVECalleeSaves) {
1723	ZPRByteOffset =
1724	AFI->getZPRCalleeSavedStackSize() + AFI->getPPRCalleeSavedStackSize();
1725	// Unused: Everything goes in ZPR space.
1726	PPRByteOffset = `0`;
1727	}
1728
1729	bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();
1730	Register LastReg = `0`;
1731	bool HasCSHazardPadding = AFI->hasStackHazardSlotIndex() && !SplitPPRs;
1732
// Rounds an offset in the direction the stack is being filled: down when
// filling top-down, up when filling bottom-up.
1733	auto AlignOffset = [StackFillDir](int Offset, int Align) {
1734	if (StackFillDir < `0`)
1735	return alignDown(Value: Offset, Align);
1736	return alignTo(Value: Offset, Align);
1737	};
1738
1739	// When iterating backwards, the loop condition relies on unsigned wraparound.
1740	for (unsigned i = FirstReg; i < Count; i += RegInc) {
1741	RegPairInfo RPI;
1742	RPI.Reg1 = CSI [i].getReg();
1743
// Classify the register; the class determines spill size and opcode choice.
1744	if (AArch64::GPR64RegClass.contains(Reg: RPI.Reg1)) {
1745	RPI.Type = RegPairInfo::GPR;
1746	RPI.RC = &AArch64::GPR64RegClass;
1747	} else if (AArch64::FPR64RegClass.contains(Reg: RPI.Reg1)) {
1748	RPI.Type = RegPairInfo::FPR64;
1749	RPI.RC = &AArch64::FPR64RegClass;
1750	} else if (AArch64::FPR128RegClass.contains(Reg: RPI.Reg1)) {
1751	RPI.Type = RegPairInfo::FPR128;
1752	RPI.RC = &AArch64::FPR128RegClass;
1753	} else if (AArch64::ZPRRegClass.contains(Reg: RPI.Reg1)) {
1754	RPI.Type = RegPairInfo::ZPR;
1755	RPI.RC = &AArch64::ZPRRegClass;
1756	} else if (AArch64::PPRRegClass.contains(Reg: RPI.Reg1)) {
1757	RPI.Type = RegPairInfo::PPR;
1758	RPI.RC = &AArch64::PPRRegClass;
1759	} else if (RPI.Reg1 == AArch64::VG) {
1760	RPI.Type = RegPairInfo::VG;
1761	RPI.RC = &AArch64::FIXED_REGSRegClass;
1762	} else {
1763	llvm_unreachable("Unsupported register class.");
1764	}
1765
// Select which scalable running offset this entry updates (only meaningful
// for scalable entries).
1766	int &ScalableByteOffset = RPI.Type == RegPairInfo::PPR && SplitPPRs
1767	? PPRByteOffset
1768	: ZPRByteOffset;
1769
1770	// Add the stack hazard size as we transition from GPR->FPR CSRs.
1771	if (HasCSHazardPadding &&
1772	(!LastReg \|\| !AArch64InstrInfo::isFpOrNEON(Reg: LastReg)) &&
1773	AArch64InstrInfo::isFpOrNEON(Reg: RPI.Reg1))
1774	ByteOffset += StackFillDir * StackHazardSize;
1775	LastReg = RPI.Reg1;
1776
1777	bool NeedsWinCFI = AFL.needsWinCFI(MF);
1778	int Scale = TRI->getSpillSize(RC: *RPI.RC);
1779	// Add the next reg to the pair if it is in the same register class.
1780	if (unsigned(i + RegInc) < Count && !HasCSHazardPadding) {
1781	MCRegister NextReg = CSI [i + RegInc].getReg();
1782	unsigned SpillCount = NeedsWinCFI ? FirstReg - i : i;
1783	switch (RPI.Type) {
1784	case RegPairInfo::GPR:
1785	if (AArch64::GPR64RegClass.contains(Reg: NextReg) &&
1786	!invalidateRegisterPairing(SpillExtendedVolatile, SpillCount,
1787	Reg1: RPI.Reg1, Reg2: NextReg, UsesWinAAPCS: IsWindows,
1788	NeedsWinCFI, NeedsFrameRecord, TRI))
1789	RPI.Reg2 = NextReg;
1790	break;
1791	case RegPairInfo::FPR64:
1792	if (AArch64::FPR64RegClass.contains(Reg: NextReg) &&
1793	!invalidateRegisterPairing(SpillExtendedVolatile, SpillCount,
1794	Reg1: RPI.Reg1, Reg2: NextReg, UsesWinAAPCS: IsWindows,
1795	NeedsWinCFI, NeedsFrameRecord, TRI))
1796	RPI.Reg2 = NextReg;
1797	break;
1798	case RegPairInfo::FPR128:
1799	if (AArch64::FPR128RegClass.contains(Reg: NextReg))
1800	RPI.Reg2 = NextReg;
1801	break;
1802	case RegPairInfo::PPR:
1803	break;
1804	case RegPairInfo::ZPR:
// ZPRs are paired only when a predicate register is available for the
// fill/spill, Reg1 is an even-numbered Z register and NextReg is the
// register immediately after it.
1805	if (AFI->getPredicateRegForFillSpill() != `0` &&
1806	((RPI.Reg1 - AArch64::Z0) & `1`) == `0` && (NextReg == RPI.Reg1 + `1`)) {
1807	// Calculate offset of register pair to see if pair instruction can be
1808	// used.
1809	int Offset = (ScalableByteOffset + StackFillDir * `2` * Scale) / Scale;
1810	if ((-`16` <= Offset && Offset <= `14`) && (Offset % `2` == `0`))
1811	RPI.Reg2 = NextReg;
1812	}
1813	break;
1814	case RegPairInfo::VG:
1815	break;
1816	}
1817	}
1818
1819	// GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
1820	// list to come in sorted by frame index so that we can issue the store
1821	// pair instructions directly. Assert if we see anything otherwise.
1822	//
1823	// The order of the registers in the list is controlled by
1824	// getCalleeSavedRegs(), so they will always be in-order, as well.
1825	assert((!RPI.isPaired() \|\|
1826	(CSI[i].getFrameIdx() + RegInc == CSI[i + RegInc].getFrameIdx())) &&
1827	"Out of order callee saved regs!");
1828
1829	assert((!RPI.isPaired() \|\| !NeedsFrameRecord \|\| RPI.Reg2 != AArch64::FP \|\|
1830	RPI.Reg1 == AArch64::LR) &&
1831	"FrameRecord must be allocated together with LR");
1832
1833	// Windows AAPCS has FP and LR reversed.
1834	assert((!RPI.isPaired() \|\| !NeedsFrameRecord \|\| RPI.Reg1 != AArch64::FP \|\|
1835	RPI.Reg2 == AArch64::LR) &&
1836	"FrameRecord must be allocated together with LR");
1837
1838	// MachO's compact unwind format relies on all registers being stored in
1839	// adjacent register pairs.
1840	assert((!produceCompactUnwindFrame(AFL, MF) \|\|
1841	CC == CallingConv::PreserveMost \|\| CC == CallingConv::PreserveAll \|\|
1842	CC == CallingConv::CXX_FAST_TLS \|\| CC == CallingConv::Win64 \|\|
1843	(RPI.isPaired() &&
1844	((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) \|\|
1845	RPI.Reg1 + `1` == RPI.Reg2))) &&
1846	"Callee-save registers not saved as adjacent register pair!");
1847
1848	RPI.FrameIdx = CSI [i].getFrameIdx();
1849	if (IsWindows &&
1850	RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
1851	RPI.FrameIdx = CSI [i + RegInc].getFrameIdx();
1852
1853	// Realign the scalable offset if necessary. This is relevant when spilling
1854	// predicates on Windows.
1855	if (RPI.isScalable() && ScalableByteOffset % Scale != `0`)
1856	ScalableByteOffset = AlignOffset (ScalableByteOffset, Scale);
1857
1858	// Realign the fixed offset if necessary. This is relevant when spilling Q
1859	// registers after spilling an odd amount of X registers.
1860	if (!RPI.isScalable() && ByteOffset % Scale != `0`)
1861	ByteOffset = AlignOffset (ByteOffset, Scale);
1862
// Snapshot the offset before this entry is accounted for; the Windows
// (bottom-up) layout uses this value as the entry's offset.
1863	int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
1864	assert(OffsetPre % Scale == `0`);
1865
// Advance the relevant running offset by one or two spill slots.
1866	if (RPI.isScalable())
1867	ScalableByteOffset += StackFillDir * (RPI.isPaired() ? `2` * Scale : Scale);
1868	else
1869	ByteOffset += StackFillDir * (RPI.isPaired() ? `2` * Scale : Scale);
1870
1871	// Swift's async context is directly before FP, so allocate an extra
1872	// 8 bytes for it.
1873	if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
1874	((!IsWindows && RPI.Reg2 == AArch64::FP) \|\|
1875	(IsWindows && RPI.Reg2 == AArch64::LR)))
1876	ByteOffset += StackFillDir * `8`;
1877
1878	// Round up size of non-pair to pair size if we need to pad the
1879	// callee-save area to ensure 16-byte alignment.
1880	if (NeedGapToAlignStack && !IsWindows && !RPI.isScalable() &&
1881	RPI.Type != RegPairInfo::FPR128 && !RPI.isPaired() &&
1882	ByteOffset % `16` != `0`) {
1883	ByteOffset += `8` * StackFillDir;
1884	assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(`16`));
1885	// A stack frame with a gap looks like this, bottom up:
1886	// d9, d8. x21, gap, x20, x19.
1887	// Set extra alignment on the x21 object to create the gap above it.
1888	MFI.setObjectAlignment(ObjectIdx: RPI.FrameIdx, Alignment: Align (`16`));
// Only one gap is inserted per function.
1889	NeedGapToAlignStack = false;
1890	}
1891
1892	int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
1893	assert(OffsetPost % Scale == `0`);
1894	// If filling top down (default), we want the offset after incrementing it.
1895	// If filling bottom up (WinCFI) we need the original offset.
1896	int Offset = IsWindows ? OffsetPre : OffsetPost;
1897
1898	// The FP, LR pair goes 8 bytes into our expanded 24-byte slot so that the
1899	// Swift context can directly precede FP.
1900	if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
1901	((!IsWindows && RPI.Reg2 == AArch64::FP) \|\|
1902	(IsWindows && RPI.Reg2 == AArch64::LR)))
1903	Offset += `8`;
// The stored offset is in units of the spill size, not bytes.
1904	RPI.Offset = Offset / Scale;
1905
1906	assert((!RPI.isPaired() \|\|
1907	(!RPI.isScalable() && RPI.Offset >= -`64` && RPI.Offset <= `63`) \|\|
1908	(RPI.isScalable() && RPI.Offset >= -`256` && RPI.Offset <= `255`)) &&
1909	"Offset out of bounds for LDP/STP immediate");
1910
1911	auto isFrameRecord = [&] {
1912	if (RPI.isPaired())
1913	return IsWindows ? RPI.Reg1 == AArch64::FP && RPI.Reg2 == AArch64::LR
1914	: RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP;
1915	// Otherwise, look for the frame record as two unpaired registers. This is
1916	// needed for -aarch64-stack-hazard-size=<val>, which disables register
1917	// pairing (as the padding may be too large for the LDP/STP offset). Note:
1918	// On Windows, this check works out as current reg == FP, next reg == LR,
1919	// and on other platforms current reg == FP, previous reg == LR. This
1920	// works out as the correct pre-increment or post-increment offsets
1921	// respectively.
1922	return i > `0` && RPI.Reg1 == AArch64::FP &&
1923	CSI [i - `1`].getReg() == AArch64::LR;
1924	};
1925
1926	// Save the offset to frame record so that the FP register can point to the
1927	// innermost frame record (spilled FP and LR registers).
1928	if (NeedsFrameRecord && isFrameRecord ())
1929	AFI->setCalleeSaveBaseToFrameRecordOffset(Offset);
1930
1931	RegPairs.push_back(Elt: RPI);
// A pair consumed two CSI entries; skip the second one.
1932	if (RPI.isPaired())
1933	i += RegInc;
1934	}
1935	if (IsWindows) {
1936	// If we need an alignment gap in the stack, align the topmost stack
1937	// object. A stack frame with a gap looks like this, bottom up:
1938	// x19, d8. d9, gap.
1939	// Set extra alignment on the topmost stack object (the first element in
1940	// CSI, which goes top down), to create the gap above it.
1941	if (AFI->hasCalleeSaveStackFreeSpace())
1942	MFI.setObjectAlignment(ObjectIdx: CSI [`0`].getFrameIdx(), Alignment: Align (`16`));
1943	// We iterated bottom up over the registers; flip RegPairs back to top
1944	// down order.
1945	std::reverse(first: RegPairs.begin(), last: RegPairs.end());
1946	}
1947	}
1948
1949	bool AArch64FrameLowering::spillCalleeSavedRegisters(
1950	MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
1951	ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo TRI) const* {
1952	MachineFunction &MF = *MBB.getParent();
1953	const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1954	auto &TLI = *Subtarget.getTargetLowering();
1955	const AArch64InstrInfo &TII = *Subtarget.getInstrInfo();
1956	bool NeedsWinCFI = needsWinCFI(MF);
1957	DebugLoc DL;
1958	SmallVector<RegPairInfo, `8`> RegPairs;
1959
1960	computeCalleeSaveRegisterPairs(AFL: *this, MF, CSI, TRI, RegPairs, NeedsFrameRecord: hasFP(MF));
1961
1962	MachineRegisterInfo &MRI = MF.getRegInfo();
1963	// Refresh the reserved regs in case there are any potential changes since the
1964	// last freeze.
1965	MRI.freezeReservedRegs();
1966
1967	if (homogeneousPrologEpilog(MF)) {
1968	auto MIB = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AArch64::HOM_Prolog))
1969	.setMIFlag(MachineInstr::FrameSetup);
1970
1971	for (auto &RPI : RegPairs) {
1972	MIB.addReg(RegNo: RPI.Reg1);
1973	MIB.addReg(RegNo: RPI.Reg2);
1974
1975	// Update register live in.
1976	if (!MRI.isReserved(PhysReg: RPI.Reg1))
1977	MBB.addLiveIn(PhysReg: RPI.Reg1);
1978	if (RPI.isPaired() && !MRI.isReserved(PhysReg: RPI.Reg2))
1979	MBB.addLiveIn(PhysReg: RPI.Reg2);
1980	}
1981	return true;
1982	}
1983	bool PTrueCreated = false;
1984	for (const RegPairInfo &RPI : llvm::reverse(C&: RegPairs)) {
1985	Register Reg1 = RPI.Reg1;
1986	Register Reg2 = RPI.Reg2;
1987	unsigned StrOpc;
1988
1989	// Issue sequence of spills for cs regs. The first spill may be converted
1990	// to a pre-decrement store later by emitPrologue if the callee-save stack
1991	// area allocation can't be combined with the local stack area allocation.
1992	// For example:
1993	// stp x22, x21, [sp, #0] // addImm(+0)
1994	// stp x20, x19, [sp, #16] // addImm(+2)
1995	// stp fp, lr, [sp, #32] // addImm(+4)
1996	// Rationale: This sequence saves uop updates compared to a sequence of
1997	// pre-increment spills like stp xi,xj,[sp,#-16]!
1998	// Note: Similar rationale and sequence for restores in epilog.
1999	unsigned Size = TRI->getSpillSize(RC: *RPI.RC);
2000	Align Alignment = TRI->getSpillAlign(RC: *RPI.RC);
2001	switch (RPI.Type) {
2002	case RegPairInfo::GPR:
2003	StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
2004	break;
2005	case RegPairInfo::FPR64:
2006	StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
2007	break;
2008	case RegPairInfo::FPR128:
2009	StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
2010	break;
2011	case RegPairInfo::ZPR:
2012	StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI;
2013	break;
2014	case RegPairInfo::PPR:
2015	StrOpc = AArch64::STR_PXI;
2016	break;
2017	case RegPairInfo::VG:
2018	StrOpc = AArch64::STRXui;
2019	break;
2020	}
2021
2022	Register X0Scratch;
2023	llvm::scope_exit RestoreX0([&] {
2024	if (X0Scratch != AArch64::NoRegister)
2025	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: TargetOpcode::COPY), DestReg: AArch64::X0)
2026	.addReg(RegNo: X0Scratch)
2027	.setMIFlag(MachineInstr::FrameSetup);
2028	});
2029
2030	if (Reg1 == AArch64::VG) {
2031	// Find an available register to store value of VG to.
2032	Reg1 = findScratchNonCalleeSaveRegister(MBB: &MBB, HasCall: true);
2033	assert(Reg1 != AArch64::NoRegister);
2034	if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) {
2035	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AArch64::CNTD_XPiI), DestReg: Reg1)
2036	.addImm(Val: `31`)
2037	.addImm(Val: `1`)
2038	.setMIFlag(MachineInstr::FrameSetup);
2039	} else {
2040	const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
2041	if (any_of(Range: MBB.liveins(),
2042	P: [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
2043	return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
2044	RegA: AArch64::X0, RegB: LiveIn.PhysReg);
2045	})) {
2046	X0Scratch = Reg1;
2047	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: TargetOpcode::COPY), DestReg: X0Scratch)
2048	.addReg(RegNo: AArch64::X0)
2049	.setMIFlag(MachineInstr::FrameSetup);
2050	}
2051
2052	RTLIB::Libcall LC = RTLIB::SMEABI_GET_CURRENT_VG;
2053	const uint32_t *RegMask =
2054	TRI->getCallPreservedMask(MF, TLI.getLibcallCallingConv(Call: LC));
2055	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AArch64::BL))
2056	.addExternalSymbol(FnName: TLI.getLibcallName(Call: LC))
2057	.addRegMask(Mask: RegMask)
2058	.addReg(RegNo: AArch64::X0, Flags: RegState::ImplicitDefine)
2059	.setMIFlag(MachineInstr::FrameSetup);
2060	Reg1 = AArch64::X0;
2061	}
2062	}
2063
2064	LLVM_DEBUG({
2065	dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
2066	if (RPI.isPaired())
2067	dbgs() << ", " << printReg(Reg2, TRI);
2068	dbgs() << ") -> fi#(" << RPI.FrameIdx;
2069	if (RPI.isPaired())
2070	dbgs() << ", " << RPI.FrameIdx + `1`;
2071	dbgs() << ")\n";
2072	});
2073
2074	assert((!isTargetWindows(MF) \|\|
2075	!(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
2076	"Windows unwdinding requires a consecutive (FP,LR) pair");
2077	// Windows unwind codes require consecutive registers if registers are
2078	// paired. Make the switch here, so that the code below will save (x,x+1)
2079	// and not (x+1,x).
2080	unsigned FrameIdxReg1 = RPI.FrameIdx;
2081	unsigned FrameIdxReg2 = RPI.FrameIdx + `1`;
2082	if (isTargetWindows(MF) && RPI.isPaired()) {
2083	std::swap(a&: Reg1, b&: Reg2);
2084	std::swap(a&: FrameIdxReg1, b&: FrameIdxReg2);
2085	}
2086
2087	if (RPI.isPaired() && RPI.isScalable()) {
2088	[[maybe_unused]] const AArch64Subtarget &Subtarget =
2089	MF.getSubtarget<AArch64Subtarget>();
2090	AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2091	unsigned PnReg = AFI->getPredicateRegForFillSpill();
2092	assert((PnReg != `0` && enableMultiVectorSpillFill(Subtarget, MF)) &&
2093	"Expects SVE2.1 or SME2 target and a predicate register");
2094	#ifdef EXPENSIVE_CHECKS
2095	auto IsPPR = [](const RegPairInfo &c) {
2096	return c.Reg1 == RegPairInfo::PPR;
2097	};
2098	auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR);
2099	auto IsZPR = [](const RegPairInfo &c) {
2100	return c.Type == RegPairInfo::ZPR;
2101	};
2102	auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR);
2103	assert(!(PPRBegin < ZPRBegin) &&
2104	"Expected callee save predicate to be handled first");
2105	#endif
2106	if (!PTrueCreated) {
2107	PTrueCreated = true;
2108	BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AArch64::PTRUE_C_B), DestReg: PnReg)
2109	.setMIFlags(MachineInstr::FrameSetup);
2110	}
2111	MachineInstrBuilder MIB = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: StrOpc));
2112	if (!MRI.isReserved(PhysReg: Reg1))
2113	MBB.addLiveIn(PhysReg: Reg1);
2114	if (!MRI.isReserved(PhysReg: Reg2))
2115	MBB.addLiveIn(PhysReg: Reg2);
2116	MIB.addReg(/PairRegs/ RegNo: AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0));
2117	MIB.addMemOperand(MMO: MF.getMachineMemOperand(
2118	PtrInfo: MachinePointerInfo::getFixedStack(MF, FI: FrameIdxReg2),
2119	F: MachineMemOperand::MOStore, Size, BaseAlignment: Alignment));
2120	MIB.addReg(RegNo: PnReg);
2121	MIB.addReg(RegNo: AArch64::SP)
2122	.addImm(Val: RPI.Offset / `2`) // [sp, #imm2vscale],
2123	// where 2vscale is implicit*
2124	.setMIFlag(MachineInstr::FrameSetup);
2125	MIB.addMemOperand(MMO: MF.getMachineMemOperand(
2126	PtrInfo: MachinePointerInfo::getFixedStack(MF, FI: FrameIdxReg1),
2127	F: MachineMemOperand::MOStore, Size, BaseAlignment: Alignment));
2128	if (NeedsWinCFI)
2129	insertSEH(MBBI: MIB, TII, Flag: MachineInstr::FrameSetup);
2130	} else { // The code when the pair of ZReg is not present
2131	MachineInstrBuilder MIB = BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: StrOpc));
2132	if (!MRI.isReserved(PhysReg: Reg1))
2133	MBB.addLiveIn(PhysReg: Reg1);
2134	if (RPI.isPaired()) {
2135	if (!MRI.isReserved(PhysReg: Reg2))
2136	MBB.addLiveIn(PhysReg: Reg2);
2137	MIB.addReg(RegNo: Reg2, Flags: getPrologueDeath(MF, Reg: Reg2));
2138	MIB.addMemOperand(MMO: MF.getMachineMemOperand(
2139	PtrInfo: MachinePointerInfo::getFixedStack(MF, FI: FrameIdxReg2),
2140	F: MachineMemOperand::MOStore, Size, BaseAlignment: Alignment));
2141	}
2142	MIB.addReg(RegNo: Reg1, Flags: getPrologueDeath(MF, Reg: Reg1))
2143	.addReg(RegNo: AArch64::SP)
2144	.addImm(Val: RPI.Offset) // [sp, #offsetvscale],*
2145	// where factorvscale is implicit*
2146	.setMIFlag(MachineInstr::FrameSetup);
2147	MIB.addMemOperand(MMO: MF.getMachineMemOperand(
2148	PtrInfo: MachinePointerInfo::getFixedStack(MF, FI: FrameIdxReg1),
2149	F: MachineMemOperand::MOStore, Size, BaseAlignment: Alignment));
2150	if (NeedsWinCFI)
2151	insertSEH(MBBI: MIB, TII, Flag: MachineInstr::FrameSetup);
2152	}
2153	// Update the StackIDs of the SVE stack slots.
2154	MachineFrameInfo &MFI = MF.getFrameInfo();
2155	if (RPI.Type == RegPairInfo::ZPR) {
2156	MFI.setStackID(ObjectIdx: FrameIdxReg1, ID: TargetStackID::ScalableVector);
2157	if (RPI.isPaired())
2158	MFI.setStackID(ObjectIdx: FrameIdxReg2, ID: TargetStackID::ScalableVector);
2159	} else if (RPI.Type == RegPairInfo::PPR) {
2160	MFI.setStackID(ObjectIdx: FrameIdxReg1, ID: TargetStackID::ScalablePredicateVector);
2161	if (RPI.isPaired())
2162	MFI.setStackID(ObjectIdx: FrameIdxReg2, ID: TargetStackID::ScalablePredicateVector);
2163	}
2164	}
2165	return true;
2166	}
2167
2168	bool AArch64FrameLowering::restoreCalleeSavedRegisters(
2169	MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
2170	MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo TRI) const* {
2171	MachineFunction &MF = *MBB.getParent();
2172	const AArch64InstrInfo &TII =
2173	*MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2174	DebugLoc DL;
2175	SmallVector<RegPairInfo, `8`> RegPairs;
2176	bool NeedsWinCFI = needsWinCFI(MF);
2177
2178	if (MBBI != MBB.end())
2179	DL = MBBI ->getDebugLoc();
2180
2181	computeCalleeSaveRegisterPairs(AFL: *this, MF, CSI, TRI, RegPairs, NeedsFrameRecord: hasFP(MF));
2182	if (homogeneousPrologEpilog(MF, Exit: &MBB)) {
2183	auto MIB = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: AArch64::HOM_Epilog))
2184	.setMIFlag(MachineInstr::FrameDestroy);
2185	for (auto &RPI : RegPairs) {
2186	MIB.addReg(RegNo: RPI.Reg1, Flags: RegState::Define);
2187	MIB.addReg(RegNo: RPI.Reg2, Flags: RegState::Define);
2188	}
2189	return true;
2190	}
2191
2192	// For performance reasons restore SVE register in increasing order
2193	auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; };
2194	auto PPRBegin = llvm::find_if(Range&: RegPairs, P: IsPPR);
2195	auto PPREnd = std::find_if_not(first: PPRBegin, last: RegPairs.end(), pred: IsPPR);
2196	std::reverse(first: PPRBegin, last: PPREnd);
2197	auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; };
2198	auto ZPRBegin = llvm::find_if(Range&: RegPairs, P: IsZPR);
2199	auto ZPREnd = std::find_if_not(first: ZPRBegin, last: RegPairs.end(), pred: IsZPR);
2200	std::reverse(first: ZPRBegin, last: ZPREnd);
2201
2202	bool PTrueCreated = false;
2203	for (const RegPairInfo &RPI : RegPairs) {
2204	Register Reg1 = RPI.Reg1;
2205	Register Reg2 = RPI.Reg2;
2206
2207	// Issue sequence of restores for cs regs. The last restore may be converted
2208	// to a post-increment load later by emitEpilogue if the callee-save stack
2209	// area allocation can't be combined with the local stack area allocation.
2210	// For example:
2211	// ldp fp, lr, [sp, #32] // addImm(+4)
2212	// ldp x20, x19, [sp, #16] // addImm(+2)
2213	// ldp x22, x21, [sp, #0] // addImm(+0)
2214	// Note: see comment in spillCalleeSavedRegisters()
2215	unsigned LdrOpc;
2216	unsigned Size = TRI->getSpillSize(RC: *RPI.RC);
2217	Align Alignment = TRI->getSpillAlign(RC: *RPI.RC);
2218	switch (RPI.Type) {
2219	case RegPairInfo::GPR:
2220	LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
2221	break;
2222	case RegPairInfo::FPR64:
2223	LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
2224	break;
2225	case RegPairInfo::FPR128:
2226	LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
2227	break;
2228	case RegPairInfo::ZPR:
2229	LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
2230	break;
2231	case RegPairInfo::PPR:
2232	LdrOpc = AArch64::LDR_PXI;
2233	break;
2234	case RegPairInfo::VG:
2235	continue;
2236	}
2237	LLVM_DEBUG({
2238	dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
2239	if (RPI.isPaired())
2240	dbgs() << ", " << printReg(Reg2, TRI);
2241	dbgs() << ") -> fi#(" << RPI.FrameIdx;
2242	if (RPI.isPaired())
2243	dbgs() << ", " << RPI.FrameIdx + `1`;
2244	dbgs() << ")\n";
2245	});
2246
2247	// Windows unwind codes require consecutive registers if registers are
2248	// paired. Make the switch here, so that the code below will save (x,x+1)
2249	// and not (x+1,x).
2250	unsigned FrameIdxReg1 = RPI.FrameIdx;
2251	unsigned FrameIdxReg2 = RPI.FrameIdx + `1`;
2252	if (isTargetWindows(MF) && RPI.isPaired()) {
2253	std::swap(a&: Reg1, b&: Reg2);
2254	std::swap(a&: FrameIdxReg1, b&: FrameIdxReg2);
2255	}
2256
2257	AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2258	if (RPI.isPaired() && RPI.isScalable()) {
2259	[[maybe_unused]] const AArch64Subtarget &Subtarget =
2260	MF.getSubtarget<AArch64Subtarget>();
2261	unsigned PnReg = AFI->getPredicateRegForFillSpill();
2262	assert((PnReg != `0` && enableMultiVectorSpillFill(Subtarget, MF)) &&
2263	"Expects SVE2.1 or SME2 target and a predicate register");
2264	#ifdef EXPENSIVE_CHECKS
2265	assert(!(PPRBegin < ZPRBegin) &&
2266	"Expected callee save predicate to be handled first");
2267	#endif
2268	if (!PTrueCreated) {
2269	PTrueCreated = true;
2270	BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: AArch64::PTRUE_C_B), DestReg: PnReg)
2271	.setMIFlags(MachineInstr::FrameDestroy);
2272	}
2273	MachineInstrBuilder MIB = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: LdrOpc));
2274	MIB.addReg(/PairRegs/ RegNo: AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0),
2275	Flags: getDefRegState(B: true));
2276	MIB.addMemOperand(MMO: MF.getMachineMemOperand(
2277	PtrInfo: MachinePointerInfo::getFixedStack(MF, FI: FrameIdxReg2),
2278	F: MachineMemOperand::MOLoad, Size, BaseAlignment: Alignment));
2279	MIB.addReg(RegNo: PnReg);
2280	MIB.addReg(RegNo: AArch64::SP)
2281	.addImm(Val: RPI.Offset / `2`) // [sp, #imm2vscale]
2282	// where 2vscale is implicit*
2283	.setMIFlag(MachineInstr::FrameDestroy);
2284	MIB.addMemOperand(MMO: MF.getMachineMemOperand(
2285	PtrInfo: MachinePointerInfo::getFixedStack(MF, FI: FrameIdxReg1),
2286	F: MachineMemOperand::MOLoad, Size, BaseAlignment: Alignment));
2287	if (NeedsWinCFI)
2288	insertSEH(MBBI: MIB, TII, Flag: MachineInstr::FrameDestroy);
2289	} else {
2290	MachineInstrBuilder MIB = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: LdrOpc));
2291	if (RPI.isPaired()) {
2292	MIB.addReg(RegNo: Reg2, Flags: getDefRegState(B: true));
2293	MIB.addMemOperand(MMO: MF.getMachineMemOperand(
2294	PtrInfo: MachinePointerInfo::getFixedStack(MF, FI: FrameIdxReg2),
2295	F: MachineMemOperand::MOLoad, Size, BaseAlignment: Alignment));
2296	}
2297	MIB.addReg(RegNo: Reg1, Flags: getDefRegState(B: true));
2298	MIB.addReg(RegNo: AArch64::SP)
2299	.addImm(Val: RPI.Offset) // [sp, #offsetvscale]*
2300	// where factorvscale is implicit*
2301	.setMIFlag(MachineInstr::FrameDestroy);
2302	MIB.addMemOperand(MMO: MF.getMachineMemOperand(
2303	PtrInfo: MachinePointerInfo::getFixedStack(MF, FI: FrameIdxReg1),
2304	F: MachineMemOperand::MOLoad, Size, BaseAlignment: Alignment));
2305	if (NeedsWinCFI)
2306	insertSEH(MBBI: MIB, TII, Flag: MachineInstr::FrameDestroy);
2307	}
2308	}
2309	return true;
2310	}
2311
2312	// Return the FrameID for a MMO.
2313	static std::optional<int> getMMOFrameID(MachineMemOperand *MMO,
2314	const MachineFrameInfo &MFI) {
2315	auto *PSV =
2316	dyn_cast_or_null<FixedStackPseudoSourceValue>(Val: MMO->getPseudoValue());
2317	if (PSV)
2318	return std::optional<int>(PSV->getFrameIndex());
2319
2320	if (MMO->getValue()) {
2321	if (auto *Al = dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: MMO->getValue()))) {
2322	for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd();
2323	FI++)
2324	if (MFI.getObjectAllocation(ObjectIdx: FI) == Al)
2325	return FI;
2326	}
2327	}
2328
2329	return std::nullopt;
2330	}
2331
2332	// Return the FrameID for a Load/Store instruction by looking at the first MMO.
2333	static std::optional<int> getLdStFrameID(const MachineInstr &MI,
2334	const MachineFrameInfo &MFI) {
2335	if (!MI.mayLoadOrStore() \|\| MI.getNumMemOperands() < `1`)
2336	return std::nullopt;
2337
2338	return getMMOFrameID(MMO: *MI.memoperands_begin(), MFI);
2339	}
2340
2341	// Returns true if the LDST MachineInstr \p MI is a PPR access.
2342	static bool isPPRAccess(const MachineInstr &MI) {
2343	return AArch64::PPRRegClass.contains(Reg: MI.getOperand(i: `0`).getReg());
2344	}
2345
2346	// Check if a Hazard slot is needed for the current function, and if so create
2347	// one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex,
2348	// which can be used to determine if any hazard padding is needed.
2349	void AArch64FrameLowering::determineStackHazardSlot(
2350	MachineFunction &MF, BitVector &SavedRegs) const {
2351	unsigned StackHazardSize = getStackHazardSize(MF);
2352	auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2353	if (StackHazardSize == `0` \|\| StackHazardSize % `16` != `0` \|\|
2354	AFI->hasStackHazardSlotIndex())
2355	return;
2356
2357	// Stack hazards are only needed in streaming functions.
2358	SMEAttrs Attrs = AFI->getSMEFnAttrs();
2359	if (!StackHazardInNonStreaming && Attrs.hasNonStreamingInterfaceAndBody())
2360	return;
2361
2362	MachineFrameInfo &MFI = MF.getFrameInfo();
2363
2364	// Add a hazard slot if there are any CSR FPR registers, or are any fp-only
2365	// stack objects.
2366	bool HasFPRCSRs = any_of(Range: SavedRegs.set_bits(), P: [](unsigned Reg) {
2367	return AArch64::FPR64RegClass.contains(Reg) \|\|
2368	AArch64::FPR128RegClass.contains(Reg) \|\|
2369	AArch64::ZPRRegClass.contains(Reg);
2370	});
2371	bool HasPPRCSRs = any_of(Range: SavedRegs.set_bits(), P: [](unsigned Reg) {
2372	return AArch64::PPRRegClass.contains(Reg);
2373	});
2374	bool HasFPRStackObjects = false;
2375	bool HasPPRStackObjects = false;
2376	if (!HasFPRCSRs \|\| SplitSVEObjects) {
2377	enum SlotType : uint8_t {
2378	Unknown = `0`,
2379	ZPRorFPR = `1` << `0`,
2380	PPR = `1` << `1`,
2381	GPR = `1` << `2`,
2382	LLVM_MARK_AS_BITMASK_ENUM(GPR)
2383	};
2384
2385	// Find stack slots solely used for one kind of register (ZPR, PPR, etc.),
2386	// based on the kinds of accesses used in the function.
2387	SmallVector<SlotType> SlotTypes(MFI.getObjectIndexEnd(), SlotType::Unknown);
2388	for (auto &MBB : MF) {
2389	for (auto &MI : MBB) {
2390	std::optional<int> FI = getLdStFrameID(MI, MFI);
2391	if (!FI \|\| FI < `0` \|\| FI > int(SlotTypes.size()))
2392	continue;
2393	if (MFI.hasScalableStackID(ObjectIdx: *FI)) {
2394	SlotTypes [*FI] \|=
2395	isPPRAccess(MI) ? SlotType::PPR : SlotType::ZPRorFPR;
2396	} else {
2397	SlotTypes [*FI] \|= AArch64InstrInfo::isFpOrNEON(MI)
2398	? SlotType::ZPRorFPR
2399	: SlotType::GPR;
2400	}
2401	}
2402	}
2403
2404	for (int FI = `0`; FI < int(SlotTypes.size()); ++FI) {
2405	HasFPRStackObjects \|= SlotTypes [FI] == SlotType::ZPRorFPR;
2406	// For SplitSVEObjects remember that this stack slot is a predicate, this
2407	// will be needed later when determining the frame layout.
2408	if (SlotTypes [FI] == SlotType::PPR) {
2409	MFI.setStackID(ObjectIdx: FI, ID: TargetStackID::ScalablePredicateVector);
2410	HasPPRStackObjects = true;
2411	}
2412	}
2413	}
2414
2415	if (HasFPRCSRs \|\| HasFPRStackObjects) {
2416	int ID = MFI.CreateStackObject(Size: StackHazardSize, Alignment: Align (`16`), isSpillSlot: false);
2417	LLVM_DEBUG(dbgs() << "Created Hazard slot at " << ID << " size "
2418	<< StackHazardSize << "\n");
2419	AFI->setStackHazardSlotIndex(ID);
2420	}
2421
2422	if (!AFI->hasStackHazardSlotIndex())
2423	return;
2424
2425	if (SplitSVEObjects) {
2426	CallingConv::ID CC = MF.getFunction().getCallingConv();
2427	if (AFI->isSVECC() \|\| CC == CallingConv::AArch64_SVE_VectorCall) {
2428	AFI->setSplitSVEObjects(true);
2429	LLVM_DEBUG(dbgs() << "Using SplitSVEObjects for SVE CC function\n");
2430	return;
2431	}
2432
2433	// We only use SplitSVEObjects in non-SVE CC functions if there's a
2434	// possibility of a stack hazard between PPRs and ZPRs/FPRs.
2435	LLVM_DEBUG(dbgs() << "Determining if SplitSVEObjects should be used in "
2436	"non-SVE CC function...\n");
2437
2438	// If another calling convention is explicitly set FPRs can't be promoted to
2439	// ZPR callee-saves.
2440	if (!is_contained(Set: {CallingConv::C, CallingConv::Fast}, Element: CC)) {
2441	LLVM_DEBUG(
2442	dbgs()
2443	<< "Calling convention is not supported with SplitSVEObjects\n");
2444	return;
2445	}
2446
2447	if (!HasPPRCSRs && !HasPPRStackObjects) {
2448	LLVM_DEBUG(
2449	dbgs() << "Not using SplitSVEObjects as no PPRs are on the stack\n");
2450	return;
2451	}
2452
2453	if (!HasFPRCSRs && !HasFPRStackObjects) {
2454	LLVM_DEBUG(
2455	dbgs()
2456	<< "Not using SplitSVEObjects as no FPRs or ZPRs are on the stack\n");
2457	return;
2458	}
2459
2460	[[maybe_unused]] const AArch64Subtarget &Subtarget =
2461	MF.getSubtarget<AArch64Subtarget>();
2462	assert(Subtarget.isSVEorStreamingSVEAvailable() &&
2463	"Expected SVE to be available for PPRs");
2464
2465	const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2466	// With SplitSVEObjects the CS hazard padding is placed between the
2467	// PPRs and ZPRs. If there are any FPR CS there would be a hazard between
2468	// them and the CS GRPs. Avoid this by promoting all FPR CS to ZPRs.
2469	BitVector FPRZRegs(SavedRegs.size());
2470	for (size_t Reg = `0`, E = SavedRegs.size(); HasFPRCSRs && Reg < E; ++Reg) {
2471	BitVector::reference RegBit = SavedRegs [Reg];
2472	if (!RegBit)
2473	continue;
2474	unsigned SubRegIdx = `0`;
2475	if (AArch64::FPR64RegClass.contains(Reg))
2476	SubRegIdx = AArch64::dsub;
2477	else if (AArch64::FPR128RegClass.contains(Reg))
2478	SubRegIdx = AArch64::zsub;
2479	else
2480	continue;
2481	// Clear the bit for the FPR save.
2482	RegBit = false;
2483	// Mark that we should save the corresponding ZPR.
2484	Register ZReg =
2485	TRI->getMatchingSuperReg(Reg, SubIdx: SubRegIdx, RC: &AArch64::ZPRRegClass);
2486	FPRZRegs.set(ZReg);
2487	}
2488	SavedRegs \|= FPRZRegs;
2489
2490	AFI->setSplitSVEObjects(true);
2491	LLVM_DEBUG(dbgs() << "SplitSVEObjects enabled!\n");
2492	}
2493	}
2494
2495	void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
2496	BitVector &SavedRegs,
2497	RegScavenger RS) const* {
2498	// All calls are tail calls in GHC calling conv, and functions have no
2499	// prologue/epilogue.
2500	if (MF.getFunction().getCallingConv() == CallingConv::GHC)
2501	return;
2502
2503	const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2504
2505	TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
2506	const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2507	AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2508	unsigned UnspilledCSGPR = AArch64::NoRegister;
2509	unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
2510
2511	MachineFrameInfo &MFI = MF.getFrameInfo();
2512	const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
2513
2514	MCRegister BasePointerReg =
2515	RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister() : MCRegister ();
2516
2517	unsigned ExtraCSSpill = `0`;
2518	bool HasUnpairedGPR64 = false;
2519	bool HasPairZReg = false;
2520	BitVector UserReservedRegs = RegInfo->getUserReservedRegs(MF);
2521	BitVector ReservedRegs = RegInfo->getReservedRegs(MF);
2522
2523	// Figure out which callee-saved registers to save/restore.
2524	for (unsigned i = `0`; CSRegs[i]; ++i) {
2525	const MCRegister Reg = CSRegs[i];
2526
2527	// Add the base pointer register to SavedRegs if it is callee-save.
2528	if (Reg == BasePointerReg)
2529	SavedRegs.set(Reg);
2530
2531	// Don't save manually reserved registers set through +reserve-x#i,
2532	// even for callee-saved registers, as per GCC's behavior.
2533	if (UserReservedRegs [Reg]) {
2534	SavedRegs.reset(Idx: Reg);
2535	continue;
2536	}
2537
2538	bool RegUsed = SavedRegs.test(Idx: Reg);
2539	MCRegister PairedReg;
2540	const bool RegIsGPR64 = AArch64::GPR64RegClass.contains(Reg);
2541	if (RegIsGPR64 \|\| AArch64::FPR64RegClass.contains(Reg) \|\|
2542	AArch64::FPR128RegClass.contains(Reg)) {
2543	// Compensate for odd numbers of GP CSRs.
2544	// For now, all the known cases of odd number of CSRs are of GPRs.
2545	if (HasUnpairedGPR64)
2546	PairedReg = CSRegs[i % `2` == `0` ? i - `1` : i + `1`];
2547	else
2548	PairedReg = CSRegs[i ^ `1`];
2549	}
2550
2551	// If the function requires all the GP registers to save (SavedRegs),
2552	// and there are an odd number of GP CSRs at the same time (CSRegs),
2553	// PairedReg could be in a different register class from Reg, which would
2554	// lead to a FPR (usually D8) accidentally being marked saved.
2555	if (RegIsGPR64 && !AArch64::GPR64RegClass.contains(Reg: PairedReg)) {
2556	PairedReg = AArch64::NoRegister;
2557	HasUnpairedGPR64 = true;
2558	}
2559	assert(PairedReg == AArch64::NoRegister \|\|
2560	AArch64::GPR64RegClass.contains(Reg, PairedReg) \|\|
2561	AArch64::FPR64RegClass.contains(Reg, PairedReg) \|\|
2562	AArch64::FPR128RegClass.contains(Reg, PairedReg));
2563
2564	if (!RegUsed) {
2565	if (AArch64::GPR64RegClass.contains(Reg) && !ReservedRegs [Reg]) {
2566	UnspilledCSGPR = Reg;
2567	UnspilledCSGPRPaired = PairedReg;
2568	}
2569	continue;
2570	}
2571
2572	// MachO's compact unwind format relies on all registers being stored in
2573	// pairs.
2574	// FIXME: the usual format is actually better if unwinding isn't needed.
2575	if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister &&
2576	!SavedRegs.test(Idx: PairedReg)) {
2577	SavedRegs.set(PairedReg);
2578	if (AArch64::GPR64RegClass.contains(Reg: PairedReg) &&
2579	!ReservedRegs [PairedReg])
2580	ExtraCSSpill = PairedReg;
2581	}
2582	// Check if there is a pair of ZRegs, so it can select PReg for spill/fill
2583	HasPairZReg \|= (AArch64::ZPRRegClass.contains(Reg1: Reg, Reg2: CSRegs[i ^ `1`]) &&
2584	SavedRegs.test(Idx: CSRegs[i ^ `1`]));
2585	}
2586
2587	if (HasPairZReg && enableMultiVectorSpillFill(Subtarget, MF)) {
2588	AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2589	// Find a suitable predicate register for the multi-vector spill/fill
2590	// instructions.
2591	MCRegister PnReg = findFreePredicateReg(SavedRegs);
2592	if (PnReg.isValid())
2593	AFI->setPredicateRegForFillSpill(PnReg);
2594	// If no free callee-save has been found assign one.
2595	if (!AFI->getPredicateRegForFillSpill() &&
2596	MF.getFunction().getCallingConv() ==
2597	CallingConv::AArch64_SVE_VectorCall) {
2598	SavedRegs.set(AArch64::P8);
2599	AFI->setPredicateRegForFillSpill(AArch64::PN8);
2600	}
2601
2602	assert(!ReservedRegs[AFI->getPredicateRegForFillSpill()] &&
2603	"Predicate cannot be a reserved register");
2604	}
2605
2606	if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
2607	!Subtarget.isTargetWindows()) {
2608	// For Windows calling convention on a non-windows OS, where X18 is treated
2609	// as reserved, back up X18 when entering non-windows code (marked with the
2610	// Windows calling convention) and restore when returning regardless of
2611	// whether the individual function uses it - it might call other functions
2612	// that clobber it.
2613	SavedRegs.set(AArch64::X18);
2614	}
2615
2616	// Determine if a Hazard slot should be used and where it should go.
2617	// If SplitSVEObjects is used, the hazard padding is placed between the PPRs
2618	// and ZPRs. Otherwise, it goes in the callee save area.
2619	determineStackHazardSlot(MF, SavedRegs);
2620
2621	// Calculates the callee saved stack size.
2622	unsigned CSStackSize = `0`;
2623	unsigned ZPRCSStackSize = `0`;
2624	unsigned PPRCSStackSize = `0`;
2625	const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2626	for (unsigned Reg : SavedRegs.set_bits()) {
2627	auto *RC = TRI->getMinimalPhysRegClass(Reg: MCRegister (Reg));
2628	assert(RC && "expected register class!");
2629	auto SpillSize = TRI->getSpillSize(RC: *RC);
2630	bool IsZPR = AArch64::ZPRRegClass.contains(Reg);
2631	bool IsPPR = !IsZPR && AArch64::PPRRegClass.contains(Reg);
2632	if (IsZPR)
2633	ZPRCSStackSize += SpillSize;
2634	else if (IsPPR)
2635	PPRCSStackSize += SpillSize;
2636	else
2637	CSStackSize += SpillSize;
2638	}
2639
2640	// Save number of saved regs, so we can easily update CSStackSize later to
2641	// account for any additional 64-bit GPR saves. Note: After this point
2642	// only 64-bit GPRs can be added to SavedRegs.
2643	unsigned NumSavedRegs = SavedRegs.count();
2644
2645	// If we have hazard padding in the CS area add that to the size.
2646	if (AFI->isStackHazardIncludedInCalleeSaveArea())
2647	CSStackSize += getStackHazardSize(MF);
2648
2649	// Increase the callee-saved stack size if the function has streaming mode
2650	// changes, as we will need to spill the value of the VG register.
2651	if (requiresSaveVG(MF))
2652	CSStackSize += `8`;
2653
2654	// If we must call __arm_get_current_vg in the prologue preserve the LR.
2655	if (requiresSaveVG(MF) && !Subtarget.hasSVE())
2656	SavedRegs.set(AArch64::LR);
2657
2658	// The frame record needs to be created by saving the appropriate registers
2659	uint64_t EstimatedStackSize = MFI.estimateStackSize(MF);
2660	if (hasFP(MF) \|\|
2661	windowsRequiresStackProbe(MF, StackSizeInBytes: EstimatedStackSize + CSStackSize + `16`)) {
2662	SavedRegs.set(AArch64::FP);
2663	SavedRegs.set(AArch64::LR);
2664	}
2665
2666	LLVM_DEBUG({
2667	dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
2668	for (unsigned Reg : SavedRegs.set_bits())
2669	dbgs() << `' '` << printReg(MCRegister(Reg), RegInfo);
2670	dbgs() << "\n";
2671	});
2672
2673	// If any callee-saved registers are used, the frame cannot be eliminated.
2674	auto [ZPRLocalStackSize, PPRLocalStackSize] =
2675	determineSVEStackSizes(MF, AssignOffsets: AssignObjectOffsets::No);
2676	uint64_t SVELocals = ZPRLocalStackSize + PPRLocalStackSize;
2677	uint64_t SVEStackSize =
2678	alignTo(Value: ZPRCSStackSize + PPRCSStackSize + SVELocals, Align: `16`);
2679	bool CanEliminateFrame = (SavedRegs.count() == `0`) && !SVEStackSize;
2680
2681	// The CSR spill slots have not been allocated yet, so estimateStackSize
2682	// won't include them.
2683	unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
2684
2685	// We may address some of the stack above the canonical frame address, either
2686	// for our own arguments or during a call. Include that in calculating whether
2687	// we have complicated addressing concerns.
2688	int64_t CalleeStackUsed = `0`;
2689	for (int I = MFI.getObjectIndexBegin(); I != `0`; ++I) {
2690	int64_t FixedOff = MFI.getObjectOffset(ObjectIdx: I);
2691	if (FixedOff > CalleeStackUsed)
2692	CalleeStackUsed = FixedOff;
2693	}
2694
2695	// Conservatively always assume BigStack when there are SVE spills.
2696	bool BigStack = SVEStackSize \|\| (EstimatedStackSize + CSStackSize +
2697	CalleeStackUsed) > EstimatedStackSizeLimit;
2698	if (BigStack \|\| !CanEliminateFrame \|\| RegInfo->cannotEliminateFrame(MF))
2699	AFI->setHasStackFrame(true);
2700
2701	// Estimate if we might need to scavenge a register at some point in order
2702	// to materialize a stack offset. If so, either spill one additional
2703	// callee-saved register or reserve a special spill slot to facilitate
2704	// register scavenging. If we already spilled an extra callee-saved register
2705	// above to keep the number of spills even, we don't need to do anything else
2706	// here.
2707	if (BigStack) {
2708	if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
2709	LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
2710	<< " to get a scratch register.\n");
2711	SavedRegs.set(UnspilledCSGPR);
2712	ExtraCSSpill = UnspilledCSGPR;
2713
2714	// MachO's compact unwind format relies on all registers being stored in
2715	// pairs, so if we need to spill one extra for BigStack, then we need to
2716	// store the pair.
2717	if (producePairRegisters(MF)) {
2718	if (UnspilledCSGPRPaired == AArch64::NoRegister) {
2719	// Failed to make a pair for compact unwind format, revert spilling.
2720	if (produceCompactUnwindFrame(AFL: *this, MF)) {
2721	SavedRegs.reset(Idx: UnspilledCSGPR);
2722	ExtraCSSpill = AArch64::NoRegister;
2723	}
2724	} else
2725	SavedRegs.set(UnspilledCSGPRPaired);
2726	}
2727	}
2728
2729	// If we didn't find an extra callee-saved register to spill, create
2730	// an emergency spill slot.
2731	if (!ExtraCSSpill \|\| MF.getRegInfo().isPhysRegUsed(PhysReg: ExtraCSSpill)) {
2732	const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2733	const TargetRegisterClass &RC = AArch64::GPR64RegClass;
2734	unsigned Size = TRI->getSpillSize(RC);
2735	Align Alignment = TRI->getSpillAlign(RC);
2736	int FI = MFI.CreateSpillStackObject(Size, Alignment);
2737	RS->addScavengingFrameIndex(FI);
2738	LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
2739	<< " as the emergency spill slot.\n");
2740	}
2741	}
2742
2743	// Adding the size of additional 64bit GPR saves.
2744	CSStackSize += `8` * (SavedRegs.count() - NumSavedRegs);
2745
2746	// A Swift asynchronous context extends the frame record with a pointer
2747	// directly before FP.
2748	if (hasFP(MF) && AFI->hasSwiftAsyncContext())
2749	CSStackSize += `8`;
2750
2751	uint64_t AlignedCSStackSize = alignTo(Value: CSStackSize, Align: `16`);
2752	LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
2753	<< EstimatedStackSize + AlignedCSStackSize << " bytes.\n");
2754
2755	assert((!MFI.isCalleeSavedInfoValid() \|\|
2756	AFI->getCalleeSavedStackSize() == AlignedCSStackSize) &&
2757	"Should not invalidate callee saved info");
2758
2759	// Round up to register pair alignment to avoid additional SP adjustment
2760	// instructions.
2761	AFI->setCalleeSavedStackSize(AlignedCSStackSize);
2762	AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
2763	AFI->setSVECalleeSavedStackSize(ZPR: ZPRCSStackSize, PPR: alignTo(Value: PPRCSStackSize, Align: `16`));
2764	}
2765
2766	bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
2767	MachineFunction &MF, const TargetRegisterInfo *RegInfo,
2768	std::vector<CalleeSavedInfo> &CSI) const {
2769	bool IsWindows = isTargetWindows(MF);
2770	unsigned StackHazardSize = getStackHazardSize(MF);
2771	// To match the canonical windows frame layout, reverse the list of
2772	// callee saved registers to get them laid out by PrologEpilogInserter
2773	// in the right order. (PrologEpilogInserter allocates stack objects top
2774	// down. Windows canonical prologs store higher numbered registers at
2775	// the top, thus have the CSI array start from the highest registers.)
2776	if (IsWindows)
2777	std::reverse(first: CSI.begin(), last: CSI.end());
2778
2779	if (CSI.empty())
2780	return true; // Early exit if no callee saved registers are modified!
2781
2782	// Now that we know which registers need to be saved and restored, allocate
2783	// stack slots for them.
2784	MachineFrameInfo &MFI = MF.getFrameInfo();
2785	auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2786
2787	if (IsWindows && hasFP(MF) && AFI->hasSwiftAsyncContext()) {
2788	int FrameIdx = MFI.CreateStackObject(Size: `8`, Alignment: Align (`16`), isSpillSlot: true);
2789	AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
2790	MFI.setIsCalleeSavedObjectIndex(ObjectIdx: FrameIdx, IsCalleeSaved: true);
2791	}
2792
2793	// Insert VG into the list of CSRs, immediately before LR if saved.
2794	if (requiresSaveVG(MF)) {
2795	CalleeSavedInfo VGInfo(AArch64::VG);
2796	auto It =
2797	find_if(Range&: CSI, P: [](auto &Info) { return Info.getReg() == AArch64::LR; });
2798	if (It != CSI.end())
2799	CSI.insert(position: It, x: VGInfo);
2800	else
2801	CSI.push_back(x: VGInfo);
2802	}
2803
2804	Register LastReg = `0`;
2805	int HazardSlotIndex = std::numeric_limits<int>::max();
2806	for (auto &CS : CSI) {
2807	MCRegister Reg = CS.getReg();
2808	const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
2809
2810	// Create a hazard slot as we switch between GPR and FPR CSRs.
2811	if (AFI->isStackHazardIncludedInCalleeSaveArea() &&
2812	(!LastReg \|\| !AArch64InstrInfo::isFpOrNEON(Reg: LastReg)) &&
2813	AArch64InstrInfo::isFpOrNEON(Reg)) {
2814	assert(HazardSlotIndex == std::numeric_limits<int>::max() &&
2815	"Unexpected register order for hazard slot");
2816	HazardSlotIndex = MFI.CreateStackObject(Size: StackHazardSize, Alignment: Align (`8`), isSpillSlot: true);
2817	LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex
2818	<< "\n");
2819	AFI->setStackHazardCSRSlotIndex(HazardSlotIndex);
2820	MFI.setIsCalleeSavedObjectIndex(ObjectIdx: HazardSlotIndex, IsCalleeSaved: true);
2821	}
2822
2823	unsigned Size = RegInfo->getSpillSize(RC: *RC);
2824	Align Alignment(RegInfo->getSpillAlign(RC: *RC));
2825	int FrameIdx = MFI.CreateStackObject(Size, Alignment, isSpillSlot: true);
2826	CS.setFrameIdx(FrameIdx);
2827	MFI.setIsCalleeSavedObjectIndex(ObjectIdx: FrameIdx, IsCalleeSaved: true);
2828
2829	// Grab 8 bytes below FP for the extended asynchronous frame info.
2830	if (hasFP(MF) && AFI->hasSwiftAsyncContext() && !IsWindows &&
2831	Reg == AArch64::FP) {
2832	FrameIdx = MFI.CreateStackObject(Size: `8`, Alignment, isSpillSlot: true);
2833	AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
2834	MFI.setIsCalleeSavedObjectIndex(ObjectIdx: FrameIdx, IsCalleeSaved: true);
2835	}
2836	LastReg = Reg;
2837	}
2838
2839	// Add hazard slot in the case where no FPR CSRs are present.
2840	if (AFI->isStackHazardIncludedInCalleeSaveArea() &&
2841	HazardSlotIndex == std::numeric_limits<int>::max()) {
2842	HazardSlotIndex = MFI.CreateStackObject(Size: StackHazardSize, Alignment: Align (`8`), isSpillSlot: true);
2843	LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex
2844	<< "\n");
2845	AFI->setStackHazardCSRSlotIndex(HazardSlotIndex);
2846	MFI.setIsCalleeSavedObjectIndex(ObjectIdx: HazardSlotIndex, IsCalleeSaved: true);
2847	}
2848
2849	return true;
2850	}
2851
2852	bool AArch64FrameLowering::enableStackSlotScavenging(
2853	const MachineFunction &MF) const {
2854	const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2855	// If the function has streaming-mode changes, don't scavenge a
2856	// spillslot in the callee-save area, as that might require an
2857	// 'addvl' in the streaming-mode-changing call-sequence when the
2858	// function doesn't use a FP.
2859	if (AFI->hasStreamingModeChanges() && !hasFP(MF))
2860	return false;
2861	// Don't allow register salvaging with hazard slots, in case it moves objects
2862	// into the wrong place.
2863	if (AFI->hasStackHazardSlotIndex())
2864	return false;
2865	return AFI->hasCalleeSaveStackFreeSpace();
2866	}
2867
2868	/// returns true if there are any SVE callee saves.
2869	static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
2870	int &Min, int &Max) {
2871	Min = std::numeric_limits<int>::max();
2872	Max = std::numeric_limits<int>::min();
2873
2874	if (!MFI.isCalleeSavedInfoValid())
2875	return false;
2876
2877	const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
2878	for (auto &CS : CSI) {
2879	if (AArch64::ZPRRegClass.contains(Reg: CS.getReg()) \|\|
2880	AArch64::PPRRegClass.contains(Reg: CS.getReg())) {
2881	assert((Max == std::numeric_limits<int>::min() \|\|
2882	Max + `1` == CS.getFrameIdx()) &&
2883	"SVE CalleeSaves are not consecutive");
2884	Min = std::min(a: Min, b: CS.getFrameIdx());
2885	Max = std::max(a: Max, b: CS.getFrameIdx());
2886	}
2887	}
2888	return Min != std::numeric_limits<int>::max();
2889	}
2890
2891	static SVEStackSizes determineSVEStackSizes(MachineFunction &MF,
2892	AssignObjectOffsets AssignOffsets) {
2893	MachineFrameInfo &MFI = MF.getFrameInfo();
2894	auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2895
2896	SVEStackSizes SVEStack{};
2897
2898	// With SplitSVEObjects we maintain separate stack offsets for predicates
2899	// (PPRs) and SVE vectors (ZPRs). When SplitSVEObjects is disabled predicates
2900	// are included in the SVE vector area.
2901	uint64_t &ZPRStackTop = SVEStack.ZPRStackSize;
2902	uint64_t &PPRStackTop =
2903	AFI->hasSplitSVEObjects() ? SVEStack.PPRStackSize : SVEStack.ZPRStackSize;
2904
2905	#ifndef NDEBUG
2906	// First process all fixed stack objects.
2907	for (int I = MFI.getObjectIndexBegin(); I != `0`; ++I)
2908	assert(!MFI.hasScalableStackID(I) &&
2909	"SVE vectors should never be passed on the stack by value, only by "
2910	"reference.");
2911	#endif
2912
2913	auto AllocateObject = [&](int FI) {
2914	uint64_t &StackTop = MFI.getStackID(ObjectIdx: FI) == TargetStackID::ScalableVector
2915	? ZPRStackTop
2916	: PPRStackTop;
2917
2918	// FIXME: Given that the length of SVE vectors is not necessarily a power of
2919	// two, we'd need to align every object dynamically at runtime if the
2920	// alignment is larger than 16. This is not yet supported.
2921	Align Alignment = MFI.getObjectAlign(ObjectIdx: FI);
2922	if (Alignment > Align (`16`))
2923	report_fatal_error(
2924	reason: "Alignment of scalable vectors > 16 bytes is not yet supported");
2925
2926	StackTop += MFI.getObjectSize(ObjectIdx: FI);
2927	StackTop = alignTo(Size: StackTop, A: Alignment);
2928
2929	assert(StackTop < (uint64_t)std::numeric_limits<int64_t>::max() &&
2930	"SVE StackTop far too large?!");
2931
2932	int64_t Offset = -int64_t(StackTop);
2933	if (AssignOffsets == AssignObjectOffsets::Yes)
2934	MFI.setObjectOffset(ObjectIdx: FI, SPOffset: Offset);
2935
2936	LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
2937	};
2938
2939	// Then process all callee saved slots.
2940	int MinCSFrameIndex, MaxCSFrameIndex;
2941	if (getSVECalleeSaveSlotRange(MFI, Min&: MinCSFrameIndex, Max&: MaxCSFrameIndex)) {
2942	for (int FI = MinCSFrameIndex; FI <= MaxCSFrameIndex; ++FI)
2943	AllocateObject (FI);
2944	}
2945
2946	// Ensure the CS area is 16-byte aligned.
2947	PPRStackTop = alignTo(Size: PPRStackTop, A: Align (`16U`));
2948	ZPRStackTop = alignTo(Size: ZPRStackTop, A: Align (`16U`));
2949
2950	// Create a buffer of SVE objects to allocate and sort it.
2951	SmallVector<int, `8`> ObjectsToAllocate;
2952	// If we have a stack protector, and we've previously decided that we have SVE
2953	// objects on the stack and thus need it to go in the SVE stack area, then it
2954	// needs to go first.
2955	int StackProtectorFI = -`1`;
2956	if (MFI.hasStackProtectorIndex()) {
2957	StackProtectorFI = MFI.getStackProtectorIndex();
2958	if (MFI.getStackID(ObjectIdx: StackProtectorFI) == TargetStackID::ScalableVector)
2959	ObjectsToAllocate.push_back(Elt: StackProtectorFI);
2960	}
2961
2962	for (int FI = `0`, E = MFI.getObjectIndexEnd(); FI != E; ++FI) {
2963	if (FI == StackProtectorFI \|\| MFI.isDeadObjectIndex(ObjectIdx: FI) \|\|
2964	MFI.isCalleeSavedObjectIndex(ObjectIdx: FI))
2965	continue;
2966
2967	if (MFI.getStackID(ObjectIdx: FI) != TargetStackID::ScalableVector &&
2968	MFI.getStackID(ObjectIdx: FI) != TargetStackID::ScalablePredicateVector)
2969	continue;
2970
2971	ObjectsToAllocate.push_back(Elt: FI);
2972	}
2973
2974	// Allocate all SVE locals and spills
2975	for (unsigned FI : ObjectsToAllocate)
2976	AllocateObject (FI);
2977
2978	PPRStackTop = alignTo(Size: PPRStackTop, A: Align (`16U`));
2979	ZPRStackTop = alignTo(Size: ZPRStackTop, A: Align (`16U`));
2980
2981	if (AssignOffsets == AssignObjectOffsets::Yes)
2982	AFI->setStackSizeSVE(ZPR: SVEStack.ZPRStackSize, PPR: SVEStack.PPRStackSize);
2983
2984	return SVEStack;
2985	}
2986
2987	void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
2988	MachineFunction &MF, RegScavenger RS) const* {
2989	assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
2990	"Upwards growing stack unsupported");
2991
2992	(void)determineSVEStackSizes(MF, AssignOffsets: AssignObjectOffsets::Yes);
2993
2994	// If this function isn't doing Win64-style C++ EH, we don't need to do
2995	// anything.
2996	if (!MF.hasEHFunclets())
2997	return;
2998
2999	MachineFrameInfo &MFI = MF.getFrameInfo();
3000	auto *AFI = MF.getInfo<AArch64FunctionInfo>();
3001
3002	// Win64 C++ EH needs to allocate space for the catch objects in the fixed
3003	// object area right next to the UnwindHelp object.
3004	WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
3005	int64_t CurrentOffset =
3006	AFI->getVarArgsGPRSize() + AFI->getTailCallReservedStack();
3007	for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
3008	for (WinEHHandlerType &H : TBME.HandlerArray) {
3009	int FrameIndex = H.CatchObj.FrameIndex;
3010	if ((FrameIndex != INT_MAX) && MFI.getObjectOffset(ObjectIdx: FrameIndex) == `0`) {
3011	CurrentOffset =
3012	alignTo(Value: CurrentOffset, Align: MFI.getObjectAlign(ObjectIdx: FrameIndex).value());
3013	CurrentOffset += MFI.getObjectSize(ObjectIdx: FrameIndex);
3014	MFI.setObjectOffset(ObjectIdx: FrameIndex, SPOffset: -CurrentOffset);
3015	}
3016	}
3017	}
3018
3019	// Create an UnwindHelp object.
3020	// The UnwindHelp object is allocated at the start of the fixed object area
3021	int64_t UnwindHelpOffset = alignTo(Size: CurrentOffset + `8`, A: Align (`16`));
3022	assert(UnwindHelpOffset == getFixedObjectSize(MF, AFI, /IsWin64/ true,
3023	/IsFunclet/ false) &&
3024	"UnwindHelpOffset must be at the start of the fixed object area");
3025	int UnwindHelpFI = MFI.CreateFixedObject(/Size/ `8`, SPOffset: -UnwindHelpOffset,
3026	/IsImmutable=/false);
3027	EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
3028
3029	MachineBasicBlock &MBB = MF.front();
3030	auto MBBI = MBB.begin();
3031	while (MBBI != MBB.end() && MBBI ->getFlag(Flag: MachineInstr::FrameSetup))
3032	++MBBI;
3033
3034	// We need to store -2 into the UnwindHelp object at the start of the
3035	// function.
3036	DebugLoc DL;
3037	RS->enterBasicBlockEnd(MBB);
3038	RS->backward(I: MBBI);
3039	Register DstReg = RS->FindUnusedReg(RC: &AArch64::GPR64commonRegClass);
3040	assert(DstReg && "There must be a free register after frame setup");
3041	const AArch64InstrInfo &TII =
3042	*MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
3043	BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: AArch64::MOVi64imm), DestReg: DstReg).addImm(Val: -`2`);
3044	BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII.get(Opcode: AArch64::STURXi))
3045	.addReg(RegNo: DstReg, Flags: getKillRegState(B: true))
3046	.addFrameIndex(Idx: UnwindHelpFI)
3047	.addImm(Val: `0`);
3048	}
3049
3050	namespace {
3051	struct TagStoreInstr {
3052	MachineInstr *MI;
3053	int64_t Offset, Size;
3054	explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
3055	: MI(MI), Offset(Offset), Size(Size) {}
3056	};
3057
3058	class TagStoreEdit {
3059	MachineFunction *MF;
3060	MachineBasicBlock *MBB;
3061	MachineRegisterInfo *MRI;
3062	// Tag store instructions that are being replaced.
3063	SmallVector<TagStoreInstr, `8`> TagStores;
3064	// Combined memref arguments of the above instructions.
3065	SmallVector<MachineMemOperand *, `8`> CombinedMemRefs;
3066
3067	// Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
3068	// FrameRegOffset + Size) with the address tag of SP.
3069	Register FrameReg;
3070	StackOffset FrameRegOffset;
3071	int64_t Size;
3072	// If not std::nullopt, move FrameReg to (FrameReg + FrameRegUpdate) at the
3073	// end.
3074	std::optional<int64_t> FrameRegUpdate;
3075	// MIFlags for any FrameReg updating instructions.
3076	unsigned FrameRegUpdateFlags;
3077
3078	// Use zeroing instruction variants.
3079	bool ZeroData;
3080	DebugLoc DL;
3081
3082	void emitUnrolled(MachineBasicBlock::iterator InsertI);
3083	void emitLoop(MachineBasicBlock::iterator InsertI);
3084
3085	public:
3086	TagStoreEdit(MachineBasicBlock MBB, bool* ZeroData)
3087	: MBB(MBB), ZeroData(ZeroData) {
3088	MF = MBB->getParent();
3089	MRI = &MF->getRegInfo();
3090	}
3091	// Add an instruction to be replaced. Instructions must be added in the
3092	// ascending order of Offset, and have to be adjacent.
3093	void addInstruction(TagStoreInstr I) {
3094	assert((TagStores.empty() \|\|
3095	TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
3096	"Non-adjacent tag store instructions.");
3097	TagStores.push_back(Elt: I);
3098	}
3099	void clear() { TagStores.clear(); }
3100	// Emit equivalent code at the given location, and erase the current set of
3101	// instructions. May skip if the replacement is not profitable. May invalidate
3102	// the input iterator and replace it with a valid one.
3103	void emitCode(MachineBasicBlock::iterator &InsertI,
3104	const AArch64FrameLowering TFI, bool* TryMergeSPUpdate);
3105	};
3106
3107	void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
3108	const AArch64InstrInfo *TII =
3109	MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
3110
3111	const int64_t kMinOffset = -`256` * `16`;
3112	const int64_t kMaxOffset = `255` * `16`;
3113
3114	Register BaseReg = FrameReg;
3115	int64_t BaseRegOffsetBytes = FrameRegOffset.getFixed();
3116	if (BaseRegOffsetBytes < kMinOffset \|\|
3117	BaseRegOffsetBytes + (Size - Size % `32`) > kMaxOffset \|\|
3118	// BaseReg can be FP, which is not necessarily aligned to 16-bytes. In
3119	// that case, BaseRegOffsetBytes will not be aligned to 16 bytes, which
3120	// is required for the offset of ST2G.
3121	BaseRegOffsetBytes % `16` != `0`) {
3122	Register ScratchReg = MRI->createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
3123	emitFrameOffset(MBB&: *MBB, MBBI: InsertI, DL, DestReg: ScratchReg, SrcReg: BaseReg,
3124	Offset: StackOffset::getFixed(Fixed: BaseRegOffsetBytes), TII);
3125	BaseReg = ScratchReg;
3126	BaseRegOffsetBytes = `0`;
3127	}
3128
3129	MachineInstr LastI = nullptr*;
3130	while (Size) {
3131	int64_t InstrSize = (Size > `16`) ? `32` : `16`;
3132	unsigned Opcode =
3133	InstrSize == `16`
3134	? (ZeroData ? AArch64::STZGi : AArch64::STGi)
3135	: (ZeroData ? AArch64::STZ2Gi : AArch64::ST2Gi);
3136	assert(BaseRegOffsetBytes % `16` == `0`);
3137	MachineInstr I = BuildMI(BB&: MBB, I: InsertI, MIMD: DL, MCID: TII->get(Opcode))
3138	.addReg(RegNo: AArch64::SP)
3139	.addReg(RegNo: BaseReg)
3140	.addImm(Val: BaseRegOffsetBytes / `16`)
3141	.setMemRefs(CombinedMemRefs);
3142	// A store to [BaseReg, #0] should go last for an opportunity to fold the
3143	// final SP adjustment in the epilogue.
3144	if (BaseRegOffsetBytes == `0`)
3145	LastI = I;
3146	BaseRegOffsetBytes += InstrSize;
3147	Size -= InstrSize;
3148	}
3149
3150	if (LastI)
3151	MBB->splice(Where: InsertI, Other: MBB, From: LastI);
3152	}
3153
3154	void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
3155	const AArch64InstrInfo *TII =
3156	MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
3157
3158	Register BaseReg = FrameRegUpdate
3159	? FrameReg
3160	: MRI->createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
3161	Register SizeReg = MRI->createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
3162
3163	emitFrameOffset(MBB&: *MBB, MBBI: InsertI, DL, DestReg: BaseReg, SrcReg: FrameReg, Offset: FrameRegOffset, TII);
3164
3165	int64_t LoopSize = Size;
3166	// If the loop size is not a multiple of 32, split off one 16-byte store at
3167	// the end to fold BaseReg update into.
3168	if (FrameRegUpdate && *FrameRegUpdate)
3169	LoopSize -= LoopSize % `32`;
3170	MachineInstr LoopI = BuildMI(BB&: MBB, I: InsertI, MIMD: DL,
3171	MCID: TII->get(Opcode: ZeroData ? AArch64::STZGloop_wback
3172	: AArch64::STGloop_wback))
3173	.addDef(RegNo: SizeReg)
3174	.addDef(RegNo: BaseReg)
3175	.addImm(Val: LoopSize)
3176	.addReg(RegNo: BaseReg)
3177	.setMemRefs(CombinedMemRefs);
3178	if (FrameRegUpdate)
3179	LoopI->setFlags(FrameRegUpdateFlags);
3180
3181	int64_t ExtraBaseRegUpdate =
3182	FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : `0`;
3183	LLVM_DEBUG(dbgs() << "TagStoreEdit::emitLoop: LoopSize=" << LoopSize
3184	<< ", Size=" << Size
3185	<< ", ExtraBaseRegUpdate=" << ExtraBaseRegUpdate
3186	<< ", FrameRegUpdate=" << FrameRegUpdate
3187	<< ", FrameRegOffset.getFixed()="
3188	<< FrameRegOffset.getFixed() << "\n");
3189	if (LoopSize < Size) {
3190	assert(FrameRegUpdate);
3191	assert(Size - LoopSize == `16`);
3192	// Tag 16 more bytes at BaseReg and update BaseReg.
3193	int64_t STGOffset = ExtraBaseRegUpdate + `16`;
3194	assert(STGOffset % `16` == `0` && STGOffset >= -`4096` && STGOffset <= `4080` &&
3195	"STG immediate out of range");
3196	BuildMI(BB&: *MBB, I: InsertI, MIMD: DL,
3197	MCID: TII->get(Opcode: ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
3198	.addDef(RegNo: BaseReg)
3199	.addReg(RegNo: BaseReg)
3200	.addReg(RegNo: BaseReg)
3201	.addImm(Val: STGOffset / `16`)
3202	.setMemRefs(CombinedMemRefs)
3203	.setMIFlags(FrameRegUpdateFlags);
3204	} else if (ExtraBaseRegUpdate) {
3205	// Update BaseReg.
3206	int64_t AddSubOffset = std::abs(i: ExtraBaseRegUpdate);
3207	assert(AddSubOffset <= `4095` && "ADD/SUB immediate out of range");
3208	BuildMI(
3209	BB&: *MBB, I: InsertI, MIMD: DL,
3210	MCID: TII->get(Opcode: ExtraBaseRegUpdate > `0` ? AArch64::ADDXri : AArch64::SUBXri))
3211	.addDef(RegNo: BaseReg)
3212	.addReg(RegNo: BaseReg)
3213	.addImm(Val: AddSubOffset)
3214	.addImm(Val: `0`)
3215	.setMIFlags(FrameRegUpdateFlags);
3216	}
3217	}
3218
3219	// Check if II is a register update that can be merged into STGloop that ends*
3220	// at (Reg + Size). RemainingOffset is the required adjustment to Reg after the
3221	// end of the loop.
3222	bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
3223	int64_t Size, int64_t *TotalOffset) {
3224	MachineInstr &MI = *II;
3225	if ((MI.getOpcode() == AArch64::ADDXri \|\|
3226	MI.getOpcode() == AArch64::SUBXri) &&
3227	MI.getOperand(i: `0`).getReg() == Reg && MI.getOperand(i: `1`).getReg() == Reg) {
3228	unsigned Shift = AArch64_AM::getShiftValue(Imm: MI.getOperand(i: `3`).getImm());
3229	int64_t Offset = MI.getOperand(i: `2`).getImm() << Shift;
3230	if (MI.getOpcode() == AArch64::SUBXri)
3231	Offset = -Offset;
3232	int64_t PostOffset = Offset - Size;
3233	// TagStoreEdit::emitLoop might emit either an ADD/SUB after the loop, or
3234	// an STGPostIndex which does the last 16 bytes of tag write. Which one is
3235	// chosen depends on the alignment of the loop size, but the difference
3236	// between the valid ranges for the two instructions is small, so we
3237	// conservatively assume that it could be either case here.
3238	//
3239	// Max offset of STGPostIndex, minus the 16 byte tag write folded into that
3240	// instruction.
3241	const int64_t kMaxOffset = `4080` - `16`;
3242	// Max offset of SUBXri.
3243	const int64_t kMinOffset = -`4095`;
3244	if (PostOffset <= kMaxOffset && PostOffset >= kMinOffset &&
3245	PostOffset % `16` == `0`) {
3246	*TotalOffset = Offset;
3247	return true;
3248	}
3249	}
3250	return false;
3251	}
3252
3253	void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
3254	SmallVectorImpl<MachineMemOperand *> &MemRefs) {
3255	MemRefs.clear();
3256	for (auto &TS : TSE) {
3257	MachineInstr *MI = TS.MI;
3258	// An instruction without memory operands may access anything. Be
3259	// conservative and return an empty list.
3260	if (MI->memoperands_empty()) {
3261	MemRefs.clear();
3262	return;
3263	}
3264	MemRefs.append(in_start: MI->memoperands_begin(), in_end: MI->memoperands_end());
3265	}
3266	}
3267
3268	void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
3269	const AArch64FrameLowering *TFI,
3270	bool TryMergeSPUpdate) {
3271	if (TagStores.empty())
3272	return;
3273	TagStoreInstr &FirstTagStore = TagStores [`0`];
3274	TagStoreInstr &LastTagStore = TagStores [TagStores.size() - `1`];
3275	Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
3276	DL = TagStores [`0`].MI->getDebugLoc();
3277
3278	Register Reg;
3279	FrameRegOffset = TFI->resolveFrameOffsetReference(
3280	MF: MF, ObjectOffset: FirstTagStore.Offset, isFixed: false* /isFixed/,
3281	StackID: TargetStackID::Default /StackID/, FrameReg&: Reg,
3282	/PreferFP=/false, /ForSimm=/true);
3283	FrameReg = Reg;
3284	FrameRegUpdate = std::nullopt;
3285
3286	mergeMemRefs(TSE: TagStores, MemRefs&: CombinedMemRefs);
3287
3288	LLVM_DEBUG({
3289	dbgs() << "Replacing adjacent STG instructions:\n";
3290	for (const auto &Instr : TagStores) {
3291	dbgs() << " " << *Instr.MI;
3292	}
3293	});
3294
3295	// Size threshold where a loop becomes shorter than a linear sequence of
3296	// tagging instructions.
3297	const int kSetTagLoopThreshold = `176`;
3298	if (Size < kSetTagLoopThreshold) {
3299	if (TagStores.size() < `2`)
3300	return;
3301	emitUnrolled(InsertI);
3302	} else {
3303	MachineInstr UpdateInstr = nullptr*;
3304	int64_t TotalOffset = `0`;
3305	if (TryMergeSPUpdate) {
3306	// See if we can merge base register update into the STGloop.
3307	// This is done in AArch64LoadStoreOptimizer for "normal" stores,
3308	// but STGloop is way too unusual for that, and also it only
3309	// realistically happens in function epilogue. Also, STGloop is expanded
3310	// before that pass.
3311	if (InsertI != MBB->end() &&
3312	canMergeRegUpdate(II: InsertI, Reg: FrameReg, Size: FrameRegOffset.getFixed() + Size,
3313	TotalOffset: &TotalOffset)) {
3314	UpdateInstr = &*InsertI ++;
3315	LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n "
3316	<< *UpdateInstr);
3317	}
3318	}
3319
3320	if (!UpdateInstr && TagStores.size() < `2`)
3321	return;
3322
3323	if (UpdateInstr) {
3324	FrameRegUpdate = TotalOffset;
3325	FrameRegUpdateFlags = UpdateInstr->getFlags();
3326	}
3327	emitLoop(InsertI);
3328	if (UpdateInstr)
3329	UpdateInstr->eraseFromParent();
3330	}
3331
3332	for (auto &TS : TagStores)
3333	TS.MI->eraseFromParent();
3334	}
3335
3336	bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
3337	int64_t &Size, bool &ZeroData) {
3338	MachineFunction &MF = *MI.getParent()->getParent();
3339	const MachineFrameInfo &MFI = MF.getFrameInfo();
3340
3341	unsigned Opcode = MI.getOpcode();
3342	ZeroData = (Opcode == AArch64::STZGloop \|\| Opcode == AArch64::STZGi \|\|
3343	Opcode == AArch64::STZ2Gi);
3344
3345	if (Opcode == AArch64::STGloop \|\| Opcode == AArch64::STZGloop) {
3346	if (!MI.getOperand(i: `0`).isDead() \|\| !MI.getOperand(i: `1`).isDead())
3347	return false;
3348	if (!MI.getOperand(i: `2`).isImm() \|\| !MI.getOperand(i: `3`).isFI())
3349	return false;
3350	Offset = MFI.getObjectOffset(ObjectIdx: MI.getOperand(i: `3`).getIndex());
3351	Size = MI.getOperand(i: `2`).getImm();
3352	return true;
3353	}
3354
3355	if (Opcode == AArch64::STGi \|\| Opcode == AArch64::STZGi)
3356	Size = `16`;
3357	else if (Opcode == AArch64::ST2Gi \|\| Opcode == AArch64::STZ2Gi)
3358	Size = `32`;
3359	else
3360	return false;
3361
3362	if (MI.getOperand(i: `0`).getReg() != AArch64::SP \|\| !MI.getOperand(i: `1`).isFI())
3363	return false;
3364
3365	Offset = MFI.getObjectOffset(ObjectIdx: MI.getOperand(i: `1`).getIndex()) +
3366	`16` * MI.getOperand(i: `2`).getImm();
3367	return true;
3368	}
3369
3370	static size_t countAvailableScavengerSlots(LivePhysRegs &LiveRegs,
3371	MachineRegisterInfo &MRI,
3372	RegScavenger *RS) {
3373	auto FreeGPRs =
3374	llvm::count_if(Range: AArch64::GPR64RegClass, P: [&LiveRegs, &MRI](auto Reg) {
3375	return LiveRegs.available(MRI, Reg);
3376	});
3377
3378	size_t NumEmergencySlots = `0`;
3379	if (RS)
3380	NumEmergencySlots = RS->getNumScavengingFrameIndices();
3381
3382	return FreeGPRs + NumEmergencySlots;
3383	}
3384
3385	// Detect a run of memory tagging instructions for adjacent stack frame slots,
3386	// and replace them with a shorter instruction sequence:
3387	// replace STG + STG with ST2G*
3388	// replace STGloop + STGloop with STGloop*
3389	// This code needs to run when stack slot offsets are already known, but before
3390	// FrameIndex operands in STG instructions are eliminated.
3391	MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
3392	const AArch64FrameLowering *TFI,
3393	RegScavenger *RS) {
3394	bool FirstZeroData;
3395	int64_t Size, Offset;
3396	MachineInstr &MI = *II;
3397	MachineBasicBlock *MBB = MI.getParent();
3398	MachineBasicBlock::iterator NextI = ++II;
3399	if (&MI == &MBB->instr_back())
3400	return II;
3401	if (!isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData&: FirstZeroData))
3402	return II;
3403
3404	SmallVector<TagStoreInstr, `4`> Instrs;
3405	Instrs.emplace_back(Args: &MI, Args&: Offset, Args&: Size);
3406
3407	constexpr int kScanLimit = `10`;
3408	int Count = `0`;
3409	for (MachineBasicBlock::iterator E = MBB->end();
3410	NextI != E && Count < kScanLimit; ++NextI) {
3411	MachineInstr &MI = *NextI;
3412	bool ZeroData;
3413	int64_t Size, Offset;
3414	// Collect instructions that update memory tags with a FrameIndex operand
3415	// and (when applicable) constant size, and whose output registers are dead
3416	// (the latter is almost always the case in practice). Since these
3417	// instructions effectively have no inputs or outputs, we are free to skip
3418	// any non-aliasing instructions in between without tracking used registers.
3419	if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
3420	if (ZeroData != FirstZeroData)
3421	break;
3422	Instrs.emplace_back(Args: &MI, Args&: Offset, Args&: Size);
3423	continue;
3424	}
3425
3426	// Only count non-transient, non-tagging instructions toward the scan
3427	// limit.
3428	if (!MI.isTransient())
3429	++Count;
3430
3431	// Just in case, stop before the epilogue code starts.
3432	if (MI.getFlag(Flag: MachineInstr::FrameSetup) \|\|
3433	MI.getFlag(Flag: MachineInstr::FrameDestroy))
3434	break;
3435
3436	// Reject anything that may alias the collected instructions.
3437	if (MI.mayLoadOrStore() \|\| MI.hasUnmodeledSideEffects() \|\| MI.isCall())
3438	break;
3439	}
3440
3441	// New code will be inserted after the last tagging instruction we've found.
3442	MachineBasicBlock::iterator InsertI = Instrs.back().MI;
3443
3444	// All the gathered stack tag instructions are merged and placed after
3445	// last tag store in the list. The check should be made if the nzcv
3446	// flag is live at the point where we are trying to insert. Otherwise
3447	// the nzcv flag might get clobbered if any stg loops are present.
3448
3449	// FIXME : This approach of bailing out from merge is conservative in
3450	// some ways like even if stg loops are not present after merge the
3451	// insert list, this liveness check is done (which is not needed).
3452	LivePhysRegs LiveRegs(*(MBB->getParent()->getSubtarget().getRegisterInfo()));
3453	LiveRegs.addLiveOuts(MBB: *MBB);
3454	for (auto I = MBB->rbegin();; ++I) {
3455	MachineInstr &MI = *I;
3456	if (MI == InsertI)
3457	break;
3458	LiveRegs.stepBackward(MI: *I);
3459	}
3460	InsertI ++;
3461	if (LiveRegs.contains(Reg: AArch64::NZCV))
3462	return InsertI;
3463
3464	// Emitting an MTE loop requires two physical registers (BaseReg and
3465	// SizeReg). If the function is under register pressure, the register
3466	// scavenger will crash trying to allocate them. If we don't have at least
3467	// two free slots (free registers + emergency slots), bail out and fall back
3468	// to the unrolled sequence.
3469	if (countAvailableScavengerSlots(LiveRegs, MRI&: MBB->getParent()->getRegInfo(),
3470	RS) < `2`) {
3471	LLVM_DEBUG(
3472	dbgs() << "Failed to merge MTE stack tagging instructions into loop "
3473	<< "due to high register pressure.\n");
3474	return InsertI;
3475	}
3476
3477	llvm::stable_sort(Range&: Instrs,
3478	C: [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
3479	return Left.Offset < Right.Offset;
3480	});
3481
3482	// Make sure that we don't have any overlapping stores.
3483	int64_t CurOffset = Instrs [`0`].Offset;
3484	for (auto &Instr : Instrs) {
3485	if (CurOffset > Instr.Offset)
3486	return NextI;
3487	CurOffset = Instr.Offset + Instr.Size;
3488	}
3489
3490	// Find contiguous runs of tagged memory and emit shorter instruction
3491	// sequences for them when possible.
3492	TagStoreEdit TSE(MBB, FirstZeroData);
3493	std::optional<int64_t> EndOffset;
3494	for (auto &Instr : Instrs) {
3495	if (EndOffset && *EndOffset != Instr.Offset) {
3496	// Found a gap.
3497	TSE.emitCode(InsertI, TFI, /TryMergeSPUpdate = / false);
3498	TSE.clear();
3499	}
3500
3501	TSE.addInstruction(I: Instr);
3502	EndOffset = Instr.Offset + Instr.Size;
3503	}
3504
3505	const MachineFunction *MF = MBB->getParent();
3506	// Multiple FP/SP updates in a loop cannot be described by CFI instructions.
3507	TSE.emitCode(
3508	InsertI, TFI, /TryMergeSPUpdate = /
3509	!MF->getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo(MF: *MF));
3510
3511	return InsertI;
3512	}
3513	} // namespace
3514
3515	void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
3516	MachineFunction &MF, RegScavenger RS = nullptr) const* {
3517	for (auto &BB : MF)
3518	for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();) {
3519	if (StackTaggingMergeSetTag)
3520	II = tryMergeAdjacentSTG(II, TFI: this, RS);
3521	}
3522
3523	// By the time this method is called, most of the prologue/epilogue code is
3524	// already emitted, whether its location was affected by the shrink-wrapping
3525	// optimization or not.
3526	if (!MF.getFunction().hasFnAttribute(Kind: Attribute::Naked) &&
3527	shouldSignReturnAddressEverywhere(MF))
3528	emitPacRetPlusLeafHardening(MF);
3529	}
3530
3531	/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
3532	/// before the update. This is easily retrieved as it is exactly the offset
3533	/// that is set in processFunctionBeforeFrameFinalized.
3534	StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP(
3535	const MachineFunction &MF, int FI, Register &FrameReg,
3536	bool IgnoreSPUpdates) const {
3537	const MachineFrameInfo &MFI = MF.getFrameInfo();
3538	if (IgnoreSPUpdates) {
3539	LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
3540	<< MFI.getObjectOffset(FI) << "\n");
3541	FrameReg = AArch64::SP;
3542	return StackOffset::getFixed(Fixed: MFI.getObjectOffset(ObjectIdx: FI));
3543	}
3544
3545	// Go to common code if we cannot provide sp + offset.
3546	if (MFI.hasVarSizedObjects() \|\|
3547	MF.getInfo<AArch64FunctionInfo>()->hasSVEStackSize() \|\|
3548	MF.getSubtarget().getRegisterInfo()->hasStackRealignment(MF))
3549	return getFrameIndexReference(MF, FI, FrameReg);
3550
3551	FrameReg = AArch64::SP;
3552	return getStackOffset(MF, ObjectOffset: MFI.getObjectOffset(ObjectIdx: FI));
3553	}
3554
3555	/// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
3556	/// the parent's frame pointer
3557	unsigned AArch64FrameLowering::getWinEHParentFrameOffset(
3558	const MachineFunction &MF) const {
3559	return `0`;
3560	}
3561
3562	/// Funclets only need to account for space for the callee saved registers,
3563	/// as the locals are accounted for in the parent's stack frame.
3564	unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
3565	const MachineFunction &MF) const {
3566	// This is the size of the pushed CSRs.
3567	unsigned CSSize =
3568	MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
3569	// This is the amount of stack a funclet needs to allocate.
3570	return alignTo(Size: CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
3571	A: getStackAlign());
3572	}
3573
3574	namespace {
// Per-stack-object bookkeeping used while ordering frame objects.
struct FrameObject {
  // Bit values for Accesses, used to tell FP accesses from GPR accesses. They
  // are chosen so that they sort FPR < Hazard < GPR and can be or'd together.
  enum { AccessFPR = 1, AccessHazard = 2, AccessGPR = 4 };

  // False for entries that do not take part in the ordering.
  bool IsValid = false;
  // Index of the object in MFI.
  int ObjectIndex = 0;
  // ID of the group this object belongs to; -1 means no group.
  int GroupIndex = -1;
  // This object should be placed first (closest to SP).
  bool ObjectFirst = false;
  // This object's group (which always contains the object with
  // ObjectFirst==true) should be placed first.
  bool GroupFirst = false;
  // Combination of the Access* bits above; 0 means unknown.
  unsigned Accesses = 0;
};
3592
3593	class GroupBuilder {
3594	SmallVector<int, `8`> CurrentMembers;
3595	int NextGroupIndex = `0`;
3596	std::vector<FrameObject> &Objects;
3597
3598	public:
3599	GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {}
3600	void AddMember(int Index) { CurrentMembers.push_back(Elt: Index); }
3601	void EndCurrentGroup() {
3602	if (CurrentMembers.size() > `1`) {
3603	// Create a new group with the current member list. This might remove them
3604	// from their pre-existing groups. That's OK, dealing with overlapping
3605	// groups is too hard and unlikely to make a difference.
3606	LLVM_DEBUG(dbgs() << "group:");
3607	for (int Index : CurrentMembers) {
3608	Objects [Index].GroupIndex = NextGroupIndex;
3609	LLVM_DEBUG(dbgs() << " " << Index);
3610	}
3611	LLVM_DEBUG(dbgs() << "\n");
3612	NextGroupIndex++;
3613	}
3614	CurrentMembers.clear();
3615	}
3616	};
3617
3618	bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
3619	// Objects at a lower index are closer to FP; objects at a higher index are
3620	// closer to SP.
3621	//
3622	// For consistency in our comparison, all invalid objects are placed
3623	// at the end. This also allows us to stop walking when we hit the
3624	// first invalid item after it's all sorted.
3625	//
3626	// If we want to include a stack hazard region, order FPR accesses < the
3627	// hazard object < GPRs accesses in order to create a separation between the
3628	// two. For the Accesses field 1 = FPR, 2 = Hazard Object, 4 = GPR.
3629	//
3630	// Otherwise the "first" object goes first (closest to SP), followed by the
3631	// members of the "first" group.
3632	//
3633	// The rest are sorted by the group index to keep the groups together.
3634	// Higher numbered groups are more likely to be around longer (i.e. untagged
3635	// in the function epilogue and not at some earlier point). Place them closer
3636	// to SP.
3637	//
3638	// If all else equal, sort by the object index to keep the objects in the
3639	// original order.
3640	return std::make_tuple(args: !A.IsValid, args: A.Accesses, args: A.ObjectFirst, args: A.GroupFirst,
3641	args: A.GroupIndex, args: A.ObjectIndex) <
3642	std::make_tuple(args: !B.IsValid, args: B.Accesses, args: B.ObjectFirst, args: B.GroupFirst,
3643	args: B.GroupIndex, args: B.ObjectIndex);
3644	}
3645	} // namespace
3646
3647	void AArch64FrameLowering::orderFrameObjects(
3648	const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
3649	const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
3650
3651	if ((!OrderFrameObjects && !AFI.hasSplitSVEObjects()) \|\|
3652	ObjectsToAllocate.empty())
3653	return;
3654
3655	const MachineFrameInfo &MFI = MF.getFrameInfo();
3656	std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
3657	for (auto &Obj : ObjectsToAllocate) {
3658	FrameObjects [Obj].IsValid = true;
3659	FrameObjects [Obj].ObjectIndex = Obj;
3660	}
3661
3662	// Identify FPR vs GPR slots for hazards, and stack slots that are tagged at
3663	// the same time.
3664	GroupBuilder GB(FrameObjects);
3665	for (auto &MBB : MF) {
3666	for (auto &MI : MBB) {
3667	if (MI.isDebugInstr())
3668	continue;
3669
3670	if (AFI.hasStackHazardSlotIndex()) {
3671	std::optional<int> FI = getLdStFrameID(MI, MFI);
3672	if (FI && FI >= `0` && FI < (int)FrameObjects.size()) {
3673	if (MFI.getStackID(ObjectIdx: *FI) == TargetStackID::ScalableVector \|\|
3674	AArch64InstrInfo::isFpOrNEON(MI))
3675	FrameObjects [*FI].Accesses \|= FrameObject::AccessFPR;
3676	else
3677	FrameObjects [*FI].Accesses \|= FrameObject::AccessGPR;
3678	}
3679	}
3680
3681	int OpIndex;
3682	switch (MI.getOpcode()) {
3683	case AArch64::STGloop:
3684	case AArch64::STZGloop:
3685	OpIndex = `3`;
3686	break;
3687	case AArch64::STGi:
3688	case AArch64::STZGi:
3689	case AArch64::ST2Gi:
3690	case AArch64::STZ2Gi:
3691	OpIndex = `1`;
3692	break;
3693	default:
3694	OpIndex = -`1`;
3695	}
3696
3697	int TaggedFI = -`1`;
3698	if (OpIndex >= `0`) {
3699	const MachineOperand &MO = MI.getOperand(i: OpIndex);
3700	if (MO.isFI()) {
3701	int FI = MO.getIndex();
3702	if (FI >= `0` && FI < MFI.getObjectIndexEnd() &&
3703	FrameObjects [FI].IsValid)
3704	TaggedFI = FI;
3705	}
3706	}
3707
3708	// If this is a stack tagging instruction for a slot that is not part of a
3709	// group yet, either start a new group or add it to the current one.
3710	if (TaggedFI >= `0`)
3711	GB.AddMember(Index: TaggedFI);
3712	else
3713	GB.EndCurrentGroup();
3714	}
3715	// Groups should never span multiple basic blocks.
3716	GB.EndCurrentGroup();
3717	}
3718
3719	if (AFI.hasStackHazardSlotIndex()) {
3720	FrameObjects [AFI.getStackHazardSlotIndex()].Accesses =
3721	FrameObject::AccessHazard;
3722	// If a stack object is unknown or both GPR and FPR, sort it into GPR.
3723	for (auto &Obj : FrameObjects)
3724	if (!Obj.Accesses \|\|
3725	Obj.Accesses == (FrameObject::AccessGPR \| FrameObject::AccessFPR))
3726	Obj.Accesses = FrameObject::AccessGPR;
3727	}
3728
3729	// If the function's tagged base pointer is pinned to a stack slot, we want to
3730	// put that slot first when possible. This will likely place it at SP + 0,
3731	// and save one instruction when generating the base pointer because IRG does
3732	// not allow an immediate offset.
3733	std::optional<int> TBPI = AFI.getTaggedBasePointerIndex();
3734	if (TBPI) {
3735	FrameObjects [TBPI].ObjectFirst = true*;
3736	FrameObjects [TBPI].GroupFirst = true*;
3737	int FirstGroupIndex = FrameObjects [*TBPI].GroupIndex;
3738	if (FirstGroupIndex >= `0`)
3739	for (FrameObject &Object : FrameObjects)
3740	if (Object.GroupIndex == FirstGroupIndex)
3741	Object.GroupFirst = true;
3742	}
3743
3744	llvm::stable_sort(Range&: FrameObjects, C: FrameObjectCompare);
3745
3746	int i = `0`;
3747	for (auto &Obj : FrameObjects) {
3748	// All invalid items are sorted at the end, so it's safe to stop.
3749	if (!Obj.IsValid)
3750	break;
3751	ObjectsToAllocate [i++] = Obj.ObjectIndex;
3752	}
3753
3754	LLVM_DEBUG({
3755	dbgs() << "Final frame order:\n";
3756	for (auto &Obj : FrameObjects) {
3757	if (!Obj.IsValid)
3758	break;
3759	dbgs() << " " << Obj.ObjectIndex << ": group " << Obj.GroupIndex;
3760	if (Obj.ObjectFirst)
3761	dbgs() << ", first";
3762	if (Obj.GroupFirst)
3763	dbgs() << ", group-first";
3764	dbgs() << "\n";
3765	}
3766	});
3767	}
3768
3769	/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at
3770	/// least every ProbeSize bytes. Returns an iterator of the first instruction
3771	/// after the loop. The difference between SP and TargetReg must be an exact
3772	/// multiple of ProbeSize.
3773	MachineBasicBlock::iterator
3774	AArch64FrameLowering::inlineStackProbeLoopExactMultiple(
3775	MachineBasicBlock::iterator MBBI, int64_t ProbeSize,
3776	Register TargetReg) const {
3777	MachineBasicBlock &MBB = *MBBI ->getParent();
3778	MachineFunction &MF = *MBB.getParent();
3779	const AArch64InstrInfo *TII =
3780	MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
3781	DebugLoc DL = MBB.findDebugLoc(MBBI);
3782
3783	MachineFunction::iterator MBBInsertPoint = std::next(x: MBB.getIterator());
3784	MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(BB: MBB.getBasicBlock());
3785	MF.insert(MBBI: MBBInsertPoint, MBB: LoopMBB);
3786	MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(BB: MBB.getBasicBlock());
3787	MF.insert(MBBI: MBBInsertPoint, MBB: ExitMBB);
3788
3789	// SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not encodable
3790	// in SUB).
3791	emitFrameOffset(MBB&: *LoopMBB, MBBI: LoopMBB->end(), DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
3792	Offset: StackOffset::getFixed(Fixed: -ProbeSize), TII,
3793	MachineInstr::FrameSetup);
3794	// LDR XZR, [SP]
3795	BuildMI(BB&: *LoopMBB, I: LoopMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::LDRXui))
3796	.addDef(RegNo: AArch64::XZR)
3797	.addReg(RegNo: AArch64::SP)
3798	.addImm(Val: `0`)
3799	.addMemOperand(MMO: MF.getMachineMemOperand(
3800	PtrInfo: MachinePointerInfo::getUnknownStack(MF),
3801	F: MachineMemOperand::MOLoad \| MachineMemOperand::MOVolatile, Size: `8`,
3802	BaseAlignment: Align (`8`)))
3803	.setMIFlags(MachineInstr::FrameSetup);
3804	// CMP SP, TargetReg
3805	BuildMI(BB&: *LoopMBB, I: LoopMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::SUBSXrx64),
3806	DestReg: AArch64::XZR)
3807	.addReg(RegNo: AArch64::SP)
3808	.addReg(RegNo: TargetReg)
3809	.addImm(Val: AArch64_AM::getArithExtendImm(ET: AArch64_AM::UXTX, Imm: `0`))
3810	.setMIFlags(MachineInstr::FrameSetup);
3811	// B.CC Loop
3812	BuildMI(BB&: *LoopMBB, I: LoopMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::Bcc))
3813	.addImm(Val: AArch64CC::NE)
3814	.addMBB(MBB: LoopMBB)
3815	.setMIFlags(MachineInstr::FrameSetup);
3816
3817	LoopMBB->addSuccessor(Succ: ExitMBB);
3818	LoopMBB->addSuccessor(Succ: LoopMBB);
3819	// Synthesize the exit MBB.
3820	ExitMBB->splice(Where: ExitMBB->end(), Other: &MBB, From: MBBI, To: MBB.end());
3821	ExitMBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
3822	MBB.addSuccessor(Succ: LoopMBB);
3823	// Update liveins.
3824	fullyRecomputeLiveIns(MBBs: {ExitMBB, LoopMBB});
3825
3826	return ExitMBB->begin();
3827	}
3828
3829	void AArch64FrameLowering::inlineStackProbeFixed(
3830	MachineBasicBlock::iterator MBBI, Register ScratchReg, int64_t FrameSize,
3831	StackOffset CFAOffset) const {
3832	MachineBasicBlock *MBB = MBBI ->getParent();
3833	MachineFunction &MF = *MBB->getParent();
3834	const AArch64InstrInfo *TII =
3835	MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
3836	AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3837	bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
3838	bool HasFP = hasFP(MF);
3839
3840	DebugLoc DL;
3841	int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
3842	int64_t NumBlocks = FrameSize / ProbeSize;
3843	int64_t ResidualSize = FrameSize % ProbeSize;
3844
3845	LLVM_DEBUG(dbgs() << "Stack probing: total " << FrameSize << " bytes, "
3846	<< NumBlocks << " blocks of " << ProbeSize
3847	<< " bytes, plus " << ResidualSize << " bytes\n");
3848
3849	// Decrement SP by NumBlock ProbeSize bytes, with either unrolled or*
3850	// ordinary loop.
3851	if (NumBlocks <= AArch64::StackProbeMaxLoopUnroll) {
3852	for (int i = `0`; i < NumBlocks; ++i) {
3853	// SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not
3854	// encodable in a SUB).
3855	emitFrameOffset(MBB&: *MBB, MBBI, DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
3856	Offset: StackOffset::getFixed(Fixed: -ProbeSize), TII,
3857	MachineInstr::FrameSetup, SetNZCV: false, NeedsWinCFI: false, HasWinCFI: nullptr,
3858	EmitCFAOffset: EmitAsyncCFI && !HasFP, InitialOffset: CFAOffset);
3859	CFAOffset += StackOffset::getFixed(Fixed: ProbeSize);
3860	// LDR XZR, [SP]
3861	BuildMI(BB&: *MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::LDRXui))
3862	.addDef(RegNo: AArch64::XZR)
3863	.addReg(RegNo: AArch64::SP)
3864	.addImm(Val: `0`)
3865	.addMemOperand(MMO: MF.getMachineMemOperand(
3866	PtrInfo: MachinePointerInfo::getUnknownStack(MF),
3867	F: MachineMemOperand::MOLoad \| MachineMemOperand::MOVolatile, Size: `8`,
3868	BaseAlignment: Align (`8`)))
3869	.setMIFlags(MachineInstr::FrameSetup);
3870	}
3871	} else if (NumBlocks != `0`) {
3872	// SUB ScratchReg, SP, #FrameSize (or equivalent if FrameSize is not
3873	// encodable in ADD). ScrathReg may temporarily become the CFA register.
3874	emitFrameOffset(MBB&: *MBB, MBBI, DL, DestReg: ScratchReg, SrcReg: AArch64::SP,
3875	Offset: StackOffset::getFixed(Fixed: -ProbeSize * NumBlocks), TII,
3876	MachineInstr::FrameSetup, SetNZCV: false, NeedsWinCFI: false, HasWinCFI: nullptr,
3877	EmitCFAOffset: EmitAsyncCFI && !HasFP, InitialOffset: CFAOffset);
3878	CFAOffset += StackOffset::getFixed(Fixed: ProbeSize * NumBlocks);
3879	MBBI = inlineStackProbeLoopExactMultiple(MBBI, ProbeSize, TargetReg: ScratchReg);
3880	MBB = MBBI ->getParent();
3881	if (EmitAsyncCFI && !HasFP) {
3882	// Set the CFA register back to SP.
3883	CFIInstBuilder (*MBB, MBBI, MachineInstr::FrameSetup)
3884	.buildDefCFARegister(Reg: AArch64::SP);
3885	}
3886	}
3887
3888	if (ResidualSize != `0`) {
3889	// SUB SP, SP, #ResidualSize (or equivalent if ResidualSize is not encodable
3890	// in SUB).
3891	emitFrameOffset(MBB&: *MBB, MBBI, DL, DestReg: AArch64::SP, SrcReg: AArch64::SP,
3892	Offset: StackOffset::getFixed(Fixed: -ResidualSize), TII,
3893	MachineInstr::FrameSetup, SetNZCV: false, NeedsWinCFI: false, HasWinCFI: nullptr,
3894	EmitCFAOffset: EmitAsyncCFI && !HasFP, InitialOffset: CFAOffset);
3895	if (ResidualSize > AArch64::StackProbeMaxUnprobedStack) {
3896	// LDR XZR, [SP]
3897	BuildMI(BB&: *MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::LDRXui))
3898	.addDef(RegNo: AArch64::XZR)
3899	.addReg(RegNo: AArch64::SP)
3900	.addImm(Val: `0`)
3901	.addMemOperand(MMO: MF.getMachineMemOperand(
3902	PtrInfo: MachinePointerInfo::getUnknownStack(MF),
3903	F: MachineMemOperand::MOLoad \| MachineMemOperand::MOVolatile, Size: `8`,
3904	BaseAlignment: Align (`8`)))
3905	.setMIFlags(MachineInstr::FrameSetup);
3906	}
3907	}
3908	}
3909
3910	void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF,
3911	MachineBasicBlock &MBB) const {
3912	// Get the instructions that need to be replaced. We emit at most two of
3913	// these. Remember them in order to avoid complications coming from the need
3914	// to traverse the block while potentially creating more blocks.
3915	SmallVector<MachineInstr *, `4`> ToReplace;
3916	for (MachineInstr &MI : MBB)
3917	if (MI.getOpcode() == AArch64::PROBED_STACKALLOC \|\|
3918	MI.getOpcode() == AArch64::PROBED_STACKALLOC_VAR)
3919	ToReplace.push_back(Elt: &MI);
3920
3921	for (MachineInstr *MI : ToReplace) {
3922	if (MI->getOpcode() == AArch64::PROBED_STACKALLOC) {
3923	Register ScratchReg = MI->getOperand(i: `0`).getReg();
3924	int64_t FrameSize = MI->getOperand(i: `1`).getImm();
3925	StackOffset CFAOffset = StackOffset::get(Fixed: MI->getOperand(i: `2`).getImm(),
3926	Scalable: MI->getOperand(i: `3`).getImm());
3927	inlineStackProbeFixed(MBBI: MI->getIterator(), ScratchReg, FrameSize,
3928	CFAOffset);
3929	} else {
3930	assert(MI->getOpcode() == AArch64::PROBED_STACKALLOC_VAR &&
3931	"Stack probe pseudo-instruction expected");
3932	const AArch64InstrInfo *TII =
3933	MI->getMF()->getSubtarget<AArch64Subtarget>().getInstrInfo();
3934	Register TargetReg = MI->getOperand(i: `0`).getReg();
3935	(void)TII->probedStackAlloc(MBBI: MI->getIterator(), TargetReg, FrameSetup: true);
3936	}
3937	MI->eraseFromParent();
3938	}
3939	}
3940
3941	struct StackAccess {
3942	enum AccessType {
3943	NotAccessed = `0`, // Stack object not accessed by load/store instructions.
3944	GPR = `1` << `0`, // A general purpose register.
3945	PPR = `1` << `1`, // A predicate register.
3946	FPR = `1` << `2`, // A floating point/Neon/SVE register.
3947	};
3948
3949	int Idx;
3950	StackOffset Offset;
3951	int64_t Size;
3952	unsigned AccessTypes;
3953
3954	StackAccess() : Idx(`0`), Offset (), Size(`0`), AccessTypes(NotAccessed) {}
3955
3956	bool operator<(const StackAccess &Rhs) const {
3957	return std::make_tuple(args: start(), args: Idx) <
3958	std::make_tuple(args: Rhs.start(), args: Rhs.Idx);
3959	}
3960
3961	bool isCPU() const {
3962	// Predicate register load and store instructions execute on the CPU.
3963	return AccessTypes & (AccessType::GPR \| AccessType::PPR);
3964	}
3965	bool isSME() const { return AccessTypes & AccessType::FPR; }
3966	bool isMixed() const { return isCPU() && isSME(); }
3967
3968	int64_t start() const { return Offset.getFixed() + Offset.getScalable(); }
3969	int64_t end() const { return start() + Size; }
3970
3971	std::string getTypeString() const {
3972	switch (AccessTypes) {
3973	case AccessType::FPR:
3974	return "FPR";
3975	case AccessType::PPR:
3976	return "PPR";
3977	case AccessType::GPR:
3978	return "GPR";
3979	case AccessType::NotAccessed:
3980	return "NA";
3981	default:
3982	return "Mixed";
3983	}
3984	}
3985
3986	void print(raw_ostream &OS) const {
3987	OS << getTypeString() << " stack object at [SP"
3988	<< (Offset.getFixed() < `0` ? "" : "+") << Offset.getFixed();
3989	if (Offset.getScalable())
3990	OS << (Offset.getScalable() < `0` ? "" : "+") << Offset.getScalable()
3991	<< " * vscale";
3992	OS << "]";
3993	}
3994	};
3995
3996	static inline raw_ostream &operator<<(raw_ostream &OS, const StackAccess &SA) {
3997	SA.print(OS);
3998	return OS;
3999	}
4000
4001	void AArch64FrameLowering::emitRemarks(
4002	const MachineFunction &MF, MachineOptimizationRemarkEmitter ORE) const* {
4003
4004	auto *AFI = MF.getInfo<AArch64FunctionInfo>();
4005	if (AFI->getSMEFnAttrs().hasNonStreamingInterfaceAndBody())
4006	return;
4007
4008	unsigned StackHazardSize = getStackHazardSize(MF);
4009	const uint64_t HazardSize =
4010	(StackHazardSize) ? StackHazardSize : StackHazardRemarkSize;
4011
4012	if (HazardSize == `0`)
4013	return;
4014
4015	const MachineFrameInfo &MFI = MF.getFrameInfo();
4016	// Bail if function has no stack objects.
4017	if (!MFI.hasStackObjects())
4018	return;
4019
4020	std::vector<StackAccess> StackAccesses(MFI.getNumObjects());
4021
4022	size_t NumFPLdSt = `0`;
4023	size_t NumNonFPLdSt = `0`;
4024
4025	// Collect stack accesses via Load/Store instructions.
4026	for (const MachineBasicBlock &MBB : MF) {
4027	for (const MachineInstr &MI : MBB) {
4028	if (!MI.mayLoadOrStore() \|\| MI.getNumMemOperands() < `1`)
4029	continue;
4030	for (MachineMemOperand *MMO : MI.memoperands()) {
4031	std::optional<int> FI = getMMOFrameID(MMO, MFI);
4032	if (FI && !MFI.isDeadObjectIndex(ObjectIdx: *FI)) {
4033	int FrameIdx = *FI;
4034
4035	size_t ArrIdx = FrameIdx + MFI.getNumFixedObjects();
4036	if (StackAccesses [ArrIdx].AccessTypes == StackAccess::NotAccessed) {
4037	StackAccesses [ArrIdx].Idx = FrameIdx;
4038	StackAccesses [ArrIdx].Offset =
4039	getFrameIndexReferenceFromSP(MF, FI: FrameIdx);
4040	StackAccesses [ArrIdx].Size = MFI.getObjectSize(ObjectIdx: FrameIdx);
4041	}
4042
4043	unsigned RegTy = StackAccess::AccessType::GPR;
4044	if (MFI.hasScalableStackID(ObjectIdx: FrameIdx))
4045	RegTy = isPPRAccess(MI) ? StackAccess::PPR : StackAccess::FPR;
4046	else if (AArch64InstrInfo::isFpOrNEON(MI))
4047	RegTy = StackAccess::FPR;
4048
4049	StackAccesses [ArrIdx].AccessTypes \|= RegTy;
4050
4051	if (RegTy == StackAccess::FPR)
4052	++NumFPLdSt;
4053	else
4054	++NumNonFPLdSt;
4055	}
4056	}
4057	}
4058	}
4059
4060	if (NumFPLdSt == `0` \|\| NumNonFPLdSt == `0`)
4061	return;
4062
4063	llvm::sort(C&: StackAccesses);
4064	llvm::erase_if(C&: StackAccesses, P: [](const StackAccess &S) {
4065	return S.AccessTypes == StackAccess::NotAccessed;
4066	});
4067
4068	SmallVector<const StackAccess *> MixedObjects;
4069	SmallVector<std::pair<const StackAccess , const* StackAccess *>> HazardPairs;
4070
4071	if (StackAccesses.front().isMixed())
4072	MixedObjects.push_back(Elt: &StackAccesses.front());
4073
4074	for (auto It = StackAccesses.begin(), End = std::prev(x: StackAccesses.end());
4075	It != End; ++It) {
4076	const auto &First = *It;
4077	const auto &Second = *(It + `1`);
4078
4079	if (Second.isMixed())
4080	MixedObjects.push_back(Elt: &Second);
4081
4082	if ((First.isSME() && Second.isCPU()) \|\|
4083	(First.isCPU() && Second.isSME())) {
4084	uint64_t Distance = static_cast<uint64_t>(Second.start() - First.end());
4085	if (Distance < HazardSize)
4086	HazardPairs.emplace_back(Args: &First, Args: &Second);
4087	}
4088	}
4089
4090	auto EmitRemark = [&](llvm::StringRef Str) {
4091	ORE->emit(RemarkBuilder: [&]() {
4092	auto R = MachineOptimizationRemarkAnalysis (
4093	"sme", "StackHazard", MF.getFunction().getSubprogram(), &MF.front());
4094	return R << formatv(Fmt: "stack hazard in '{0}': ", Vals: MF.getName()).str() << Str;
4095	});
4096	};
4097
4098	for (const auto &P : HazardPairs)
4099	EmitRemark (formatv(Fmt: "{0} is too close to {1}", Vals: P.first, Vals: P.second).str());
4100
4101	for (const auto *Obj : MixedObjects)
4102	EmitRemark (
4103	formatv(Fmt: "{0} accessed by both GP and FP instructions", Vals: *Obj).str());
4104	}
4105

Browse the source code of llvm_projects/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp