AArch64LoadStoreOptimizer.cpp source code [llvm_projects/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp]

1	//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file contains a pass that performs load / store related peephole
10	// optimizations. This pass should be run after register allocation.
11	//
// The pass runs after the PrologEpilogInserter where we emit the CFI
// instructions. In order to preserve the correctness of the unwind information,
// the pass should not change the order of any two instructions, one of which
// has the FrameSetup/FrameDestroy flag or, alternatively, apply an ad hoc fix
// to unwind information.
17	//
18	//===----------------------------------------------------------------------===//
19
20	#include "AArch64InstrInfo.h"
21	#include "AArch64MachineFunctionInfo.h"
22	#include "AArch64Subtarget.h"
23	#include "MCTargetDesc/AArch64AddressingModes.h"
24	#include "llvm/ADT/SmallVector.h"
25	#include "llvm/ADT/Statistic.h"
26	#include "llvm/ADT/StringRef.h"
27	#include "llvm/ADT/iterator_range.h"
28	#include "llvm/Analysis/AliasAnalysis.h"
29	#include "llvm/CodeGen/MachineBasicBlock.h"
30	#include "llvm/CodeGen/MachineFunction.h"
31	#include "llvm/CodeGen/MachineFunctionPass.h"
32	#include "llvm/CodeGen/MachineInstr.h"
33	#include "llvm/CodeGen/MachineInstrBuilder.h"
34	#include "llvm/CodeGen/MachineOperand.h"
35	#include "llvm/CodeGen/MachineRegisterInfo.h"
36	#include "llvm/CodeGen/TargetRegisterInfo.h"
37	#include "llvm/IR/DebugLoc.h"
38	#include "llvm/MC/MCAsmInfo.h"
39	#include "llvm/MC/MCDwarf.h"
40	#include "llvm/Pass.h"
41	#include "llvm/Support/CommandLine.h"
42	#include "llvm/Support/Debug.h"
43	#include "llvm/Support/DebugCounter.h"
44	#include "llvm/Support/ErrorHandling.h"
45	#include <cassert>
46	#include <cstdint>
47	#include <functional>
48	#include <iterator>
49	#include <limits>
50	#include <optional>
51
52	using namespace llvm;
53
54	#define DEBUG_TYPE "aarch64-ldst-opt"
55
56	STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
57	STATISTIC(NumPostFolded, "Number of post-index updates folded");
58	STATISTIC(NumPreFolded, "Number of pre-index updates folded");
59	STATISTIC(NumUnscaledPairCreated,
60	"Number of load/store from unscaled generated");
61	STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
62	STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
63	STATISTIC(NumFailedAlignmentCheck, "Number of load/store pair transformation "
64	"not passed the alignment check");
65	STATISTIC(NumConstOffsetFolded,
66	"Number of const offset of index address folded");
67
68	DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
69	"Controls which pairs are considered for renaming");
70
71	// The LdStLimit limits how far we search for load/store pairs.
72	static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
73	cl::init(Val: `20`), cl::Hidden);
74
75	// The UpdateLimit limits how far we search for update instructions when we form
76	// pre-/post-index instructions.
77	static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(Val: `100`),
78	cl::Hidden);
79
80	// The LdStConstLimit limits how far we search for const offset instructions
81	// when we form index address load/store instructions.
82	static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
83	cl::init(Val: `10`), cl::Hidden);
84
85	// Enable register renaming to find additional store pairing opportunities.
86	static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
87	cl::init(Val: true), cl::Hidden);
88
89	#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
90
91	namespace {
92
93	using LdStPairFlags = struct LdStPairFlags {
94	// If a matching instruction is found, MergeForward is set to true if the
95	// merge is to remove the first instruction and replace the second with
96	// a pair-wise insn, and false if the reverse is true.
97	bool MergeForward = false;
98
99	// SExtIdx gives the index of the result of the load pair that must be
100	// extended. The value of SExtIdx assumes that the paired load produces the
101	// value in this order: (I, returned iterator), i.e., -1 means no value has
102	// to be extended, 0 means I, and 1 means the returned iterator.
103	int SExtIdx = -`1`;
104
105	// If not none, RenameReg can be used to rename the result register of the
106	// first store in a pair. Currently this only works when merging stores
107	// forward.
108	std::optional<MCPhysReg> RenameReg;
109
110	LdStPairFlags() = default;
111
112	void setMergeForward(bool V = true) { MergeForward = V; }
113	bool getMergeForward() const { return MergeForward; }
114
115	void setSExtIdx(int V) { SExtIdx = V; }
116	int getSExtIdx() const { return SExtIdx; }
117
118	void setRenameReg(MCPhysReg R) { RenameReg = R; }
119	void clearRenameReg() { RenameReg = std::nullopt; }
120	std::optional<MCPhysReg> getRenameReg() const { return RenameReg; }
121	};
122
// Implementation of the load/store optimizer. Holds the per-function analysis
// and target handles plus the register-unit scratch state, and declares the
// search ("find*"), rewrite ("merge*"/"promote*") and driver ("tryTo*",
// optimizeBlock, runOnMachineFunction) routines.
struct AArch64LoadStoreOpt {
  AliasAnalysis *AA;
  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const AArch64Subtarget *Subtarget;

  // Track which register units have been modified and used.
  LiveRegUnits ModifiedRegUnits, UsedRegUnits;
  LiveRegUnits DefinedInBB;

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  // Flags is an output describing how the pair should be merged.
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit,
                                               bool FindNarrowMerge);

  // Scan the instructions looking for a store that writes to the address from
  // which the current load instruction reads. Return true if one is found;
  // StoreI is the output iterator for the store.
  bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
                         MachineBasicBlock::iterator &StoreI);

  // Merge the two narrow zero-store instructions indicated into a single
  // wider store instruction. Returns the iterator to continue scanning from.
  MachineBasicBlock::iterator
  mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                        MachineBasicBlock::iterator MergeMI,
                        const LdStPairFlags &Flags);

  // Merge the two instructions indicated into a single pair-wise instruction.
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Promote the load that reads directly from the address stored to.
  MachineBasicBlock::iterator
  promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                       MachineBasicBlock::iterator StoreI);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                int UnscaledOffset, unsigned Limit);

  // Scan the instruction list to find a register assigned with a const
  // value that can be combined with the current instruction (a load or store)
  // using base addressing with writeback. Scan backwards.
  // Offset is an output parameter.
  MachineBasicBlock::iterator
  findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
                                  unsigned &Offset);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  // `MergeEither` is set to true if the combined instruction may be placed
  // either at the location of the load/store instruction or at the location of
  // the update instruction.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit,
                                 bool &MergeEither);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                            unsigned BaseReg, int Offset);

  // NOTE(review): presumably checks whether MI materializes a constant into
  // IndexReg and reports it through Offset - confirm against the definition.
  bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
                              unsigned IndexReg, unsigned &Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  std::optional<MachineBasicBlock::iterator>
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsForward,
                  bool IsPreIdx, bool MergeEither);

  // NOTE(review): counterpart of mergeUpdateInsn for the constant-offset
  // (index register) case - confirm against the definition.
  MachineBasicBlock::iterator
  mergeConstOffsetInsn(MachineBasicBlock::iterator I,
                       MachineBasicBlock::iterator Update, unsigned Offset,
                       int Scale);

  // Find and merge zero store instructions.
  bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);

  // Find and pair ldr/str instructions.
  bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);

  // Find and promote load instructions which read directly from store.
  bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);

  // Find and merge a base register updates before or after a ld/st instruction.
  bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);

  // Find and merge an index ldr/st instruction into a base ld/st instruction.
  bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);

  // Per-block driver.
  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

  // Per-function entry point.
  bool runOnMachineFunction(MachineFunction &MF);
};
225
226	struct AArch64LoadStoreOptLegacy : public MachineFunctionPass {
227	static char ID;
228
229	AArch64LoadStoreOptLegacy() : MachineFunctionPass (ID) {}
230
231	bool runOnMachineFunction(MachineFunction &Fn) override;
232
233	void getAnalysisUsage(AnalysisUsage &AU) const override {
234	AU.addRequired<AAResultsWrapperPass>();
235	MachineFunctionPass::getAnalysisUsage(AU);
236	}
237
238	MachineFunctionProperties getRequiredProperties() const override {
239	return MachineFunctionProperties ().setNoVRegs();
240	}
241
242	StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
243	};
244
245	char AArch64LoadStoreOptLegacy::ID = `0`;
246
247	} // end anonymous namespace
248
// Pass registration: associates AArch64LoadStoreOptLegacy with the
// "aarch64-ldst-opt" name and the human-readable pass description.
INITIALIZE_PASS(AArch64LoadStoreOptLegacy, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)
251
252	static bool isNarrowStore(unsigned Opc) {
253	switch (Opc) {
254	default:
255	return false;
256	case AArch64::STRBBui:
257	case AArch64::STURBBi:
258	case AArch64::STRHHui:
259	case AArch64::STURHHi:
260	return true;
261	}
262	}
263
264	// These instruction set memory tag and either keep memory contents unchanged or
265	// set it to zero, ignoring the address part of the source register.
266	static bool isTagStore(const MachineInstr &MI) {
267	switch (MI.getOpcode()) {
268	default:
269	return false;
270	case AArch64::STGi:
271	case AArch64::STZGi:
272	case AArch64::ST2Gi:
273	case AArch64::STZ2Gi:
274	return true;
275	}
276	}
277
278	static unsigned getMatchingNonSExtOpcode(unsigned Opc,
279	bool IsValidLdStrOpc = nullptr*) {
280	if (IsValidLdStrOpc)
281	IsValidLdStrOpc = true*;
282	switch (Opc) {
283	default:
284	if (IsValidLdStrOpc)
285	IsValidLdStrOpc = false*;
286	return std::numeric_limits<unsigned>::max();
287	case AArch64::STRDui:
288	case AArch64::STURDi:
289	case AArch64::STRDpre:
290	case AArch64::STRQui:
291	case AArch64::STURQi:
292	case AArch64::STRQpre:
293	case AArch64::STRBBui:
294	case AArch64::STURBBi:
295	case AArch64::STRHHui:
296	case AArch64::STURHHi:
297	case AArch64::STRWui:
298	case AArch64::STRWpre:
299	case AArch64::STURWi:
300	case AArch64::STRXui:
301	case AArch64::STRXpre:
302	case AArch64::STURXi:
303	case AArch64::STR_ZXI:
304	case AArch64::LDRDui:
305	case AArch64::LDURDi:
306	case AArch64::LDRDpre:
307	case AArch64::LDRQui:
308	case AArch64::LDURQi:
309	case AArch64::LDRQpre:
310	case AArch64::LDRWui:
311	case AArch64::LDURWi:
312	case AArch64::LDRWpre:
313	case AArch64::LDRXui:
314	case AArch64::LDURXi:
315	case AArch64::LDRXpre:
316	case AArch64::STRSui:
317	case AArch64::STURSi:
318	case AArch64::STRSpre:
319	case AArch64::LDRSui:
320	case AArch64::LDURSi:
321	case AArch64::LDRSpre:
322	case AArch64::LDR_ZXI:
323	return Opc;
324	case AArch64::LDRSWui:
325	return AArch64::LDRWui;
326	case AArch64::LDURSWi:
327	return AArch64::LDURWi;
328	case AArch64::LDRSWpre:
329	return AArch64::LDRWpre;
330	}
331	}
332
333	static unsigned getMatchingWideOpcode(unsigned Opc) {
334	switch (Opc) {
335	default:
336	llvm_unreachable("Opcode has no wide equivalent!");
337	case AArch64::STRBBui:
338	return AArch64::STRHHui;
339	case AArch64::STRHHui:
340	return AArch64::STRWui;
341	case AArch64::STURBBi:
342	return AArch64::STURHHi;
343	case AArch64::STURHHi:
344	return AArch64::STURWi;
345	case AArch64::STURWi:
346	return AArch64::STURXi;
347	case AArch64::STRWui:
348	return AArch64::STRXui;
349	}
350	}
351
352	static unsigned getMatchingPairOpcode(unsigned Opc) {
353	switch (Opc) {
354	default:
355	llvm_unreachable("Opcode has no pairwise equivalent!");
356	case AArch64::STRSui:
357	case AArch64::STURSi:
358	return AArch64::STPSi;
359	case AArch64::STRSpre:
360	return AArch64::STPSpre;
361	case AArch64::STRDui:
362	case AArch64::STURDi:
363	return AArch64::STPDi;
364	case AArch64::STRDpre:
365	return AArch64::STPDpre;
366	case AArch64::STRQui:
367	case AArch64::STURQi:
368	case AArch64::STR_ZXI:
369	return AArch64::STPQi;
370	case AArch64::STRQpre:
371	return AArch64::STPQpre;
372	case AArch64::STRWui:
373	case AArch64::STURWi:
374	return AArch64::STPWi;
375	case AArch64::STRWpre:
376	return AArch64::STPWpre;
377	case AArch64::STRXui:
378	case AArch64::STURXi:
379	return AArch64::STPXi;
380	case AArch64::STRXpre:
381	return AArch64::STPXpre;
382	case AArch64::LDRSui:
383	case AArch64::LDURSi:
384	return AArch64::LDPSi;
385	case AArch64::LDRSpre:
386	return AArch64::LDPSpre;
387	case AArch64::LDRDui:
388	case AArch64::LDURDi:
389	return AArch64::LDPDi;
390	case AArch64::LDRDpre:
391	return AArch64::LDPDpre;
392	case AArch64::LDRQui:
393	case AArch64::LDURQi:
394	case AArch64::LDR_ZXI:
395	return AArch64::LDPQi;
396	case AArch64::LDRQpre:
397	return AArch64::LDPQpre;
398	case AArch64::LDRWui:
399	case AArch64::LDURWi:
400	return AArch64::LDPWi;
401	case AArch64::LDRWpre:
402	return AArch64::LDPWpre;
403	case AArch64::LDRXui:
404	case AArch64::LDURXi:
405	return AArch64::LDPXi;
406	case AArch64::LDRXpre:
407	return AArch64::LDPXpre;
408	case AArch64::LDRSWui:
409	case AArch64::LDURSWi:
410	return AArch64::LDPSWi;
411	case AArch64::LDRSWpre:
412	return AArch64::LDPSWpre;
413	}
414	}
415
416	static unsigned isMatchingStore(MachineInstr &LoadInst,
417	MachineInstr &StoreInst) {
418	unsigned LdOpc = LoadInst.getOpcode();
419	unsigned StOpc = StoreInst.getOpcode();
420	switch (LdOpc) {
421	default:
422	llvm_unreachable("Unsupported load instruction!");
423	case AArch64::LDRBBui:
424	return StOpc == AArch64::STRBBui \|\| StOpc == AArch64::STRHHui \|\|
425	StOpc == AArch64::STRWui \|\| StOpc == AArch64::STRXui;
426	case AArch64::LDURBBi:
427	return StOpc == AArch64::STURBBi \|\| StOpc == AArch64::STURHHi \|\|
428	StOpc == AArch64::STURWi \|\| StOpc == AArch64::STURXi;
429	case AArch64::LDRHHui:
430	return StOpc == AArch64::STRHHui \|\| StOpc == AArch64::STRWui \|\|
431	StOpc == AArch64::STRXui;
432	case AArch64::LDURHHi:
433	return StOpc == AArch64::STURHHi \|\| StOpc == AArch64::STURWi \|\|
434	StOpc == AArch64::STURXi;
435	case AArch64::LDRWui:
436	return StOpc == AArch64::STRWui \|\| StOpc == AArch64::STRXui;
437	case AArch64::LDURWi:
438	return StOpc == AArch64::STURWi \|\| StOpc == AArch64::STURXi;
439	case AArch64::LDRXui:
440	return StOpc == AArch64::STRXui;
441	case AArch64::LDURXi:
442	return StOpc == AArch64::STURXi;
443	}
444	}
445
446	static unsigned getPreIndexedOpcode(unsigned Opc) {
447	// FIXME: We don't currently support creating pre-indexed loads/stores when
448	// the load or store is the unscaled version. If we decide to perform such an
449	// optimization in the future the cases for the unscaled loads/stores will
450	// need to be added here.
451	switch (Opc) {
452	default:
453	llvm_unreachable("Opcode has no pre-indexed equivalent!");
454	case AArch64::STRBui:
455	return AArch64::STRBpre;
456	case AArch64::STRHui:
457	return AArch64::STRHpre;
458	case AArch64::STRSui:
459	return AArch64::STRSpre;
460	case AArch64::STRDui:
461	return AArch64::STRDpre;
462	case AArch64::STRQui:
463	return AArch64::STRQpre;
464	case AArch64::STRBBui:
465	return AArch64::STRBBpre;
466	case AArch64::STRHHui:
467	return AArch64::STRHHpre;
468	case AArch64::STRWui:
469	return AArch64::STRWpre;
470	case AArch64::STRXui:
471	return AArch64::STRXpre;
472	case AArch64::LDRBui:
473	return AArch64::LDRBpre;
474	case AArch64::LDRHui:
475	return AArch64::LDRHpre;
476	case AArch64::LDRSui:
477	return AArch64::LDRSpre;
478	case AArch64::LDRDui:
479	return AArch64::LDRDpre;
480	case AArch64::LDRQui:
481	return AArch64::LDRQpre;
482	case AArch64::LDRBBui:
483	return AArch64::LDRBBpre;
484	case AArch64::LDRHHui:
485	return AArch64::LDRHHpre;
486	case AArch64::LDRWui:
487	return AArch64::LDRWpre;
488	case AArch64::LDRXui:
489	return AArch64::LDRXpre;
490	case AArch64::LDRSWui:
491	return AArch64::LDRSWpre;
492	case AArch64::LDPSi:
493	return AArch64::LDPSpre;
494	case AArch64::LDPSWi:
495	return AArch64::LDPSWpre;
496	case AArch64::LDPDi:
497	return AArch64::LDPDpre;
498	case AArch64::LDPQi:
499	return AArch64::LDPQpre;
500	case AArch64::LDPWi:
501	return AArch64::LDPWpre;
502	case AArch64::LDPXi:
503	return AArch64::LDPXpre;
504	case AArch64::STPSi:
505	return AArch64::STPSpre;
506	case AArch64::STPDi:
507	return AArch64::STPDpre;
508	case AArch64::STPQi:
509	return AArch64::STPQpre;
510	case AArch64::STPWi:
511	return AArch64::STPWpre;
512	case AArch64::STPXi:
513	return AArch64::STPXpre;
514	case AArch64::STGi:
515	return AArch64::STGPreIndex;
516	case AArch64::STZGi:
517	return AArch64::STZGPreIndex;
518	case AArch64::ST2Gi:
519	return AArch64::ST2GPreIndex;
520	case AArch64::STZ2Gi:
521	return AArch64::STZ2GPreIndex;
522	case AArch64::STGPi:
523	return AArch64::STGPpre;
524	}
525	}
526
527	static unsigned getBaseAddressOpcode(unsigned Opc) {
528	// TODO: Add more index address stores.
529	switch (Opc) {
530	default:
531	llvm_unreachable("Opcode has no base address equivalent!");
532	case AArch64::LDRBroX:
533	return AArch64::LDRBui;
534	case AArch64::LDRBBroX:
535	return AArch64::LDRBBui;
536	case AArch64::LDRSBXroX:
537	return AArch64::LDRSBXui;
538	case AArch64::LDRSBWroX:
539	return AArch64::LDRSBWui;
540	case AArch64::LDRHroX:
541	return AArch64::LDRHui;
542	case AArch64::LDRHHroX:
543	return AArch64::LDRHHui;
544	case AArch64::LDRSHXroX:
545	return AArch64::LDRSHXui;
546	case AArch64::LDRSHWroX:
547	return AArch64::LDRSHWui;
548	case AArch64::LDRWroX:
549	return AArch64::LDRWui;
550	case AArch64::LDRSroX:
551	return AArch64::LDRSui;
552	case AArch64::LDRSWroX:
553	return AArch64::LDRSWui;
554	case AArch64::LDRDroX:
555	return AArch64::LDRDui;
556	case AArch64::LDRXroX:
557	return AArch64::LDRXui;
558	case AArch64::LDRQroX:
559	return AArch64::LDRQui;
560	}
561	}
562
563	static unsigned getPostIndexedOpcode(unsigned Opc) {
564	switch (Opc) {
565	default:
566	llvm_unreachable("Opcode has no post-indexed wise equivalent!");
567	case AArch64::STRBui:
568	return AArch64::STRBpost;
569	case AArch64::STRHui:
570	return AArch64::STRHpost;
571	case AArch64::STRSui:
572	case AArch64::STURSi:
573	return AArch64::STRSpost;
574	case AArch64::STRDui:
575	case AArch64::STURDi:
576	return AArch64::STRDpost;
577	case AArch64::STRQui:
578	case AArch64::STURQi:
579	return AArch64::STRQpost;
580	case AArch64::STRBBui:
581	return AArch64::STRBBpost;
582	case AArch64::STRHHui:
583	return AArch64::STRHHpost;
584	case AArch64::STRWui:
585	case AArch64::STURWi:
586	return AArch64::STRWpost;
587	case AArch64::STRXui:
588	case AArch64::STURXi:
589	return AArch64::STRXpost;
590	case AArch64::LDRBui:
591	return AArch64::LDRBpost;
592	case AArch64::LDRHui:
593	return AArch64::LDRHpost;
594	case AArch64::LDRSui:
595	case AArch64::LDURSi:
596	return AArch64::LDRSpost;
597	case AArch64::LDRDui:
598	case AArch64::LDURDi:
599	return AArch64::LDRDpost;
600	case AArch64::LDRQui:
601	case AArch64::LDURQi:
602	return AArch64::LDRQpost;
603	case AArch64::LDRBBui:
604	return AArch64::LDRBBpost;
605	case AArch64::LDRHHui:
606	return AArch64::LDRHHpost;
607	case AArch64::LDRWui:
608	case AArch64::LDURWi:
609	return AArch64::LDRWpost;
610	case AArch64::LDRXui:
611	case AArch64::LDURXi:
612	return AArch64::LDRXpost;
613	case AArch64::LDRSWui:
614	return AArch64::LDRSWpost;
615	case AArch64::LDPSi:
616	return AArch64::LDPSpost;
617	case AArch64::LDPSWi:
618	return AArch64::LDPSWpost;
619	case AArch64::LDPDi:
620	return AArch64::LDPDpost;
621	case AArch64::LDPQi:
622	return AArch64::LDPQpost;
623	case AArch64::LDPWi:
624	return AArch64::LDPWpost;
625	case AArch64::LDPXi:
626	return AArch64::LDPXpost;
627	case AArch64::STPSi:
628	return AArch64::STPSpost;
629	case AArch64::STPDi:
630	return AArch64::STPDpost;
631	case AArch64::STPQi:
632	return AArch64::STPQpost;
633	case AArch64::STPWi:
634	return AArch64::STPWpost;
635	case AArch64::STPXi:
636	return AArch64::STPXpost;
637	case AArch64::STGi:
638	return AArch64::STGPostIndex;
639	case AArch64::STZGi:
640	return AArch64::STZGPostIndex;
641	case AArch64::ST2Gi:
642	return AArch64::ST2GPostIndex;
643	case AArch64::STZ2Gi:
644	return AArch64::STZ2GPostIndex;
645	case AArch64::STGPi:
646	return AArch64::STGPpost;
647	}
648	}
649
650	static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
651
652	unsigned OpcA = FirstMI.getOpcode();
653	unsigned OpcB = MI.getOpcode();
654
655	switch (OpcA) {
656	default:
657	return false;
658	case AArch64::STRSpre:
659	return (OpcB == AArch64::STRSui) \|\| (OpcB == AArch64::STURSi);
660	case AArch64::STRDpre:
661	return (OpcB == AArch64::STRDui) \|\| (OpcB == AArch64::STURDi);
662	case AArch64::STRQpre:
663	return (OpcB == AArch64::STRQui) \|\| (OpcB == AArch64::STURQi);
664	case AArch64::STRWpre:
665	return (OpcB == AArch64::STRWui) \|\| (OpcB == AArch64::STURWi);
666	case AArch64::STRXpre:
667	return (OpcB == AArch64::STRXui) \|\| (OpcB == AArch64::STURXi);
668	case AArch64::LDRSpre:
669	return (OpcB == AArch64::LDRSui) \|\| (OpcB == AArch64::LDURSi);
670	case AArch64::LDRDpre:
671	return (OpcB == AArch64::LDRDui) \|\| (OpcB == AArch64::LDURDi);
672	case AArch64::LDRQpre:
673	return (OpcB == AArch64::LDRQui) \|\| (OpcB == AArch64::LDURQi);
674	case AArch64::LDRWpre:
675	return (OpcB == AArch64::LDRWui) \|\| (OpcB == AArch64::LDURWi);
676	case AArch64::LDRXpre:
677	return (OpcB == AArch64::LDRXui) \|\| (OpcB == AArch64::LDURXi);
678	case AArch64::LDRSWpre:
679	return (OpcB == AArch64::LDRSWui) \|\| (OpcB == AArch64::LDURSWi);
680	}
681	}
682
683	// Returns the scale and offset range of pre/post indexed variants of MI.
684	static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
685	int &MinOffset, int &MaxOffset) {
686	bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
687	bool IsTagStore = isTagStore(MI);
688	// STG and all paired ldst have the same scale in pre/post-indexed variants*
689	// as in the "unsigned offset" variant.
690	// All other pre/post indexed ldst instructions are unscaled.
691	Scale = (IsTagStore \|\| IsPaired) ? AArch64InstrInfo::getMemScale(MI) : `1`;
692
693	if (IsPaired) {
694	MinOffset = -`64`;
695	MaxOffset = `63`;
696	} else {
697	MinOffset = -`256`;
698	MaxOffset = `255`;
699	}
700	}
701
702	static MachineOperand &getLdStRegOp(MachineInstr &MI,
703	unsigned PairedRegOp = `0`) {
704	assert(PairedRegOp < `2` && "Unexpected register operand idx.");
705	bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
706	if (IsPreLdSt)
707	PairedRegOp += `1`;
708	unsigned Idx =
709	AArch64InstrInfo::isPairedLdSt(MI) \|\| IsPreLdSt ? PairedRegOp : `0`;
710	return MI.getOperand(i: Idx);
711	}
712
713	static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
714	MachineInstr &StoreInst,
715	const AArch64InstrInfo *TII) {
716	assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
717	int LoadSize = TII->getMemScale(MI: LoadInst);
718	int StoreSize = TII->getMemScale(MI: StoreInst);
719	int UnscaledStOffset =
720	TII->hasUnscaledLdStOffset(MI&: StoreInst)
721	? AArch64InstrInfo::getLdStOffsetOp(MI: StoreInst).getImm()
722	: AArch64InstrInfo::getLdStOffsetOp(MI: StoreInst).getImm() * StoreSize;
723	int UnscaledLdOffset =
724	TII->hasUnscaledLdStOffset(MI&: LoadInst)
725	? AArch64InstrInfo::getLdStOffsetOp(MI: LoadInst).getImm()
726	: AArch64InstrInfo::getLdStOffsetOp(MI: LoadInst).getImm() * LoadSize;
727	return (UnscaledStOffset <= UnscaledLdOffset) &&
728	(UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
729	}
730
731	static bool isPromotableZeroStoreInst(MachineInstr &MI) {
732	unsigned Opc = MI.getOpcode();
733	return (Opc == AArch64::STRWui \|\| Opc == AArch64::STURWi \|\|
734	isNarrowStore(Opc)) &&
735	getLdStRegOp(MI).getReg() == AArch64::WZR;
736	}
737
738	static bool isPromotableLoadFromStore(MachineInstr &MI) {
739	switch (MI.getOpcode()) {
740	default:
741	return false;
742	// Scaled instructions.
743	case AArch64::LDRBBui:
744	case AArch64::LDRHHui:
745	case AArch64::LDRWui:
746	case AArch64::LDRXui:
747	// Unscaled instructions.
748	case AArch64::LDURBBi:
749	case AArch64::LDURHHi:
750	case AArch64::LDURWi:
751	case AArch64::LDURXi:
752	return true;
753	}
754	}
755
756	static bool isMergeableLdStUpdate(MachineInstr &MI, AArch64FunctionInfo &AFI) {
757	unsigned Opc = MI.getOpcode();
758	switch (Opc) {
759	default:
760	return false;
761	// Scaled instructions.
762	case AArch64::STRBui:
763	case AArch64::STRHui:
764	case AArch64::STRSui:
765	case AArch64::STRDui:
766	case AArch64::STRQui:
767	case AArch64::STRXui:
768	case AArch64::STRWui:
769	case AArch64::STRHHui:
770	case AArch64::STRBBui:
771	case AArch64::LDRBui:
772	case AArch64::LDRHui:
773	case AArch64::LDRSui:
774	case AArch64::LDRDui:
775	case AArch64::LDRQui:
776	case AArch64::LDRXui:
777	case AArch64::LDRWui:
778	case AArch64::LDRHHui:
779	case AArch64::LDRBBui:
780	case AArch64::STGi:
781	case AArch64::STZGi:
782	case AArch64::ST2Gi:
783	case AArch64::STZ2Gi:
784	case AArch64::STGPi:
785	// Unscaled instructions.
786	case AArch64::STURSi:
787	case AArch64::STURDi:
788	case AArch64::STURQi:
789	case AArch64::STURWi:
790	case AArch64::STURXi:
791	case AArch64::LDURSi:
792	case AArch64::LDURDi:
793	case AArch64::LDURQi:
794	case AArch64::LDURWi:
795	case AArch64::LDURXi:
796	// Paired instructions.
797	case AArch64::LDPSi:
798	case AArch64::LDPSWi:
799	case AArch64::LDPDi:
800	case AArch64::LDPQi:
801	case AArch64::LDPWi:
802	case AArch64::LDPXi:
803	case AArch64::STPSi:
804	case AArch64::STPDi:
805	case AArch64::STPQi:
806	case AArch64::STPWi:
807	case AArch64::STPXi:
808	// Make sure this is a reg+imm (as opposed to an address reloc).
809	if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
810	return false;
811
812	// When using stack tagging, simple sp+imm loads and stores are not
813	// tag-checked, but pre- and post-indexed versions of them are, so we can't
814	// replace the former with the latter. This transformation would be valid
815	// if the load/store accesses an untagged stack slot, but we don't have
816	// that information available after frame indices have been eliminated.
817	if (AFI.isMTETagged() &&
818	AArch64InstrInfo::getLdStBaseOp(MI).getReg() == AArch64::SP)
819	return false;
820
821	return true;
822	}
823	}
824
825	// Make sure this is a reg+reg Ld/St
826	static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
827	unsigned Opc = MI.getOpcode();
828	switch (Opc) {
829	default:
830	return false;
831	// Scaled instructions.
832	// TODO: Add more index address stores.
833	case AArch64::LDRBroX:
834	case AArch64::LDRBBroX:
835	case AArch64::LDRSBXroX:
836	case AArch64::LDRSBWroX:
837	Scale = `1`;
838	return true;
839	case AArch64::LDRHroX:
840	case AArch64::LDRHHroX:
841	case AArch64::LDRSHXroX:
842	case AArch64::LDRSHWroX:
843	Scale = `2`;
844	return true;
845	case AArch64::LDRWroX:
846	case AArch64::LDRSroX:
847	case AArch64::LDRSWroX:
848	Scale = `4`;
849	return true;
850	case AArch64::LDRDroX:
851	case AArch64::LDRXroX:
852	Scale = `8`;
853	return true;
854	case AArch64::LDRQroX:
855	Scale = `16`;
856	return true;
857	}
858	}
859
860	static bool isRewritableImplicitDef(const MachineOperand &MO) {
861	switch (MO.getParent()->getOpcode()) {
862	default:
863	return MO.isRenamable();
864	case AArch64::ORRWrs:
865	case AArch64::ADDWri:
866	return true;
867	}
868	}
869
870	MachineBasicBlock::iterator
871	AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
872	MachineBasicBlock::iterator MergeMI,
873	const LdStPairFlags &Flags) {
874	assert(isPromotableZeroStoreInst(I) && isPromotableZeroStoreInst(MergeMI) &&
875	"Expected promotable zero stores.");
876
877	MachineBasicBlock::iterator E = I ->getParent()->end();
878	MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
879	// If NextI is the second of the two instructions to be merged, we need
880	// to skip one further. Either way we merge will invalidate the iterator,
881	// and we don't need to scan the new instruction, as it's a pairwise
882	// instruction, which we're not considering for further action anyway.
883	if (NextI == MergeMI)
884	NextI = next_nodbg(It: NextI, End: E);
885
886	unsigned Opc = I ->getOpcode();
887	unsigned MergeMIOpc = MergeMI ->getOpcode();
888	bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
889	bool IsMergedMIScaled = !TII->hasUnscaledLdStOffset(Opc: MergeMIOpc);
890	int OffsetStride = IsScaled ? TII->getMemScale(MI: *I) : `1`;
891	int MergeMIOffsetStride = IsMergedMIScaled ? TII->getMemScale(MI: *MergeMI) : `1`;
892
893	bool MergeForward = Flags.getMergeForward();
894	// Insert our new paired instruction after whichever of the paired
895	// instructions MergeForward indicates.
896	MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
897	// Also based on MergeForward is from where we copy the base register operand
898	// so we get the flags compatible with the input code.
899	const MachineOperand &BaseRegOp =
900	MergeForward ? AArch64InstrInfo::getLdStBaseOp(MI: *MergeMI)
901	: AArch64InstrInfo::getLdStBaseOp(MI: *I);
902
903	// Which register is Rt and which is Rt2 depends on the offset order.
904	int64_t IOffsetInBytes =
905	AArch64InstrInfo::getLdStOffsetOp(MI: I).getImm() OffsetStride;
906	int64_t MIOffsetInBytes =
907	AArch64InstrInfo::getLdStOffsetOp(MI: MergeMI).getImm()
908	MergeMIOffsetStride;
909	// Select final offset based on the offset order.
910	int64_t OffsetImm;
911	if (IOffsetInBytes > MIOffsetInBytes)
912	OffsetImm = MIOffsetInBytes;
913	else
914	OffsetImm = IOffsetInBytes;
915
916	int NewOpcode = getMatchingWideOpcode(Opc);
917	// Adjust final offset on scaled stores because the new instruction
918	// has a different scale.
919	if (!TII->hasUnscaledLdStOffset(Opc: NewOpcode)) {
920	int NewOffsetStride = TII->getMemScale(Opc: NewOpcode);
921	assert(((OffsetImm % NewOffsetStride) == `0`) &&
922	"Offset should be a multiple of the store memory scale");
923	OffsetImm = OffsetImm / NewOffsetStride;
924	}
925
926	// Construct the new instruction.
927	DebugLoc DL = I ->getDebugLoc();
928	MachineBasicBlock *MBB = I ->getParent();
929	MachineInstrBuilder MIB;
930	MIB = BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: NewOpcode))
931	.addReg(RegNo: isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
932	.add(MO: BaseRegOp)
933	.addImm(Val: OffsetImm)
934	.cloneMergedMemRefs(OtherMIs: {&I, &MergeMI})
935	.setMIFlags(I ->mergeFlagsWith(Other: *MergeMI));
936	(void)MIB;
937
938	LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
939	LLVM_DEBUG(I->print(dbgs()));
940	LLVM_DEBUG(dbgs() << " ");
941	LLVM_DEBUG(MergeMI->print(dbgs()));
942	LLVM_DEBUG(dbgs() << " with instruction:\n ");
943	LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
944	LLVM_DEBUG(dbgs() << "\n");
945
946	// Erase the old instructions.
947	I ->eraseFromParent();
948	MergeMI ->eraseFromParent();
949	return NextI;
950	}
951
952	// Apply Fn to all instructions between MI and the beginning of the block, until
953	// a def for DefReg is reached. Returns true, iff Fn returns true for all
954	// visited instructions. Stop after visiting Limit iterations.
955	static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
956	const TargetRegisterInfo TRI, unsigned* Limit,
957	std::function<bool(MachineInstr &, bool)> &Fn) {
958	auto MBB = MI.getParent();
959	for (MachineInstr &I :
960	instructionsWithoutDebug(It: MI.getReverseIterator(), End: MBB->instr_rend())) {
961	if (!Limit)
962	return false;
963	--Limit;
964
965	bool isDef = any_of(Range: I.operands(), P: [DefReg, TRI](MachineOperand &MOP) {
966	return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
967	TRI->regsOverlap(RegA: MOP.getReg(), RegB: DefReg);
968	});
969	if (!Fn (I, isDef))
970	return false;
971	if (isDef)
972	break;
973	}
974	return true;
975	}
976
977	static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
978	const TargetRegisterInfo *TRI) {
979
980	for (const MachineOperand &MOP : phys_regs_and_masks(MI))
981	if (MOP.isReg() && MOP.isKill())
982	Units.removeReg(Reg: MOP.getReg());
983
984	for (const MachineOperand &MOP : phys_regs_and_masks(MI))
985	if (MOP.isReg() && !MOP.isKill())
986	Units.addReg(Reg: MOP.getReg());
987	}
988
989	/// This function will add a new entry into the debugValueSubstitutions table
990	/// when two instruction have been merged into a new one represented by \p
991	/// MergedInstr.
992	static void addDebugSubstitutionsToTable(MachineFunction *MF,
993	unsigned InstrNumToSet,
994	MachineInstr &OriginalInstr,
995	MachineInstr &MergedInstr) {
996
997	// Figure out the Operand Index of the destination register of the
998	// OriginalInstr in the new MergedInstr.
999	auto Reg = OriginalInstr.getOperand(i: `0`).getReg();
1000	unsigned OperandNo = `0`;
1001	bool RegFound = false;
1002	for (const auto Op : MergedInstr.operands()) {
1003	if (Op.getReg() == Reg) {
1004	RegFound = true;
1005	break;
1006	}
1007	OperandNo++;
1008	}
1009
1010	if (RegFound)
1011	MF->makeDebugValueSubstitution({OriginalInstr.peekDebugInstrNum(), `0`},
1012	{InstrNumToSet, OperandNo});
1013	}
1014
1015	MachineBasicBlock::iterator
1016	AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
1017	MachineBasicBlock::iterator Paired,
1018	const LdStPairFlags &Flags) {
1019	MachineBasicBlock::iterator E = I ->getParent()->end();
1020	MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
1021	// If NextI is the second of the two instructions to be merged, we need
1022	// to skip one further. Either way we merge will invalidate the iterator,
1023	// and we don't need to scan the new instruction, as it's a pairwise
1024	// instruction, which we're not considering for further action anyway.
1025	if (NextI == Paired)
1026	NextI = next_nodbg(It: NextI, End: E);
1027
1028	int SExtIdx = Flags.getSExtIdx();
1029	unsigned Opc =
1030	SExtIdx == -`1` ? I ->getOpcode() : getMatchingNonSExtOpcode(Opc: I ->getOpcode());
1031	bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
1032	int OffsetStride = IsUnscaled ? TII->getMemScale(MI: *I) : `1`;
1033
1034	bool MergeForward = Flags.getMergeForward();
1035
1036	std::optional<MCPhysReg> RenameReg = Flags.getRenameReg();
1037	if (RenameReg) {
1038	MCRegister RegToRename = getLdStRegOp(MI&: *I).getReg();
1039	DefinedInBB.addReg(Reg: *RenameReg);
1040
1041	// Return the sub/super register for RenameReg, matching the size of
1042	// OriginalReg.
1043	auto GetMatchingSubReg =
1044	[this, RenameReg](const TargetRegisterClass *C) -> MCPhysReg {
1045	for (MCPhysReg SubOrSuper :
1046	TRI->sub_and_superregs_inclusive(Reg: *RenameReg)) {
1047	if (C->contains(Reg: SubOrSuper))
1048	return SubOrSuper;
1049	}
1050	llvm_unreachable("Should have found matching sub or super register!");
1051	};
1052
1053	std::function<bool(MachineInstr &, bool)> UpdateMIs =
1054	[this, RegToRename, GetMatchingSubReg, MergeForward](MachineInstr &MI,
1055	bool IsDef) {
1056	if (IsDef) {
1057	bool SeenDef = false;
1058	for (unsigned OpIdx = `0`; OpIdx < MI.getNumOperands(); ++OpIdx) {
1059	MachineOperand &MOP = MI.getOperand(i: OpIdx);
1060	// Rename the first explicit definition and all implicit
1061	// definitions matching RegToRename.
1062	if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1063	(!MergeForward \|\| !SeenDef \|\|
1064	(MOP.isDef() && MOP.isImplicit())) &&
1065	TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename)) {
1066	assert((MOP.isImplicit() \|\|
1067	(MOP.isRenamable() && !MOP.isEarlyClobber())) &&
1068	"Need renamable operands");
1069	Register MatchingReg;
1070	if (const TargetRegisterClass *RC =
1071	MI.getRegClassConstraint(OpIdx, TII, TRI))
1072	MatchingReg = GetMatchingSubReg (RC);
1073	else {
1074	if (!isRewritableImplicitDef(MO: MOP))
1075	continue;
1076	MatchingReg = GetMatchingSubReg (
1077	TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1078	}
1079	MOP.setReg(MatchingReg);
1080	SeenDef = true;
1081	}
1082	}
1083	} else {
1084	for (unsigned OpIdx = `0`; OpIdx < MI.getNumOperands(); ++OpIdx) {
1085	MachineOperand &MOP = MI.getOperand(i: OpIdx);
1086	if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1087	TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename)) {
1088	assert((MOP.isImplicit() \|\|
1089	(MOP.isRenamable() && !MOP.isEarlyClobber())) &&
1090	"Need renamable operands");
1091	Register MatchingReg;
1092	if (const TargetRegisterClass *RC =
1093	MI.getRegClassConstraint(OpIdx, TII, TRI))
1094	MatchingReg = GetMatchingSubReg (RC);
1095	else
1096	MatchingReg = GetMatchingSubReg (
1097	TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1098	assert(MatchingReg != AArch64::NoRegister &&
1099	"Cannot find matching regs for renaming");
1100	MOP.setReg(MatchingReg);
1101	}
1102	}
1103	}
1104	LLVM_DEBUG(dbgs() << "Renamed " << MI);
1105	return true;
1106	};
1107	forAllMIsUntilDef(MI&: MergeForward ? I : Paired ->getPrevNode(), DefReg: RegToRename,
1108	TRI, UINT32_MAX, Fn&: UpdateMIs);
1109
1110	#if !defined(NDEBUG)
1111	// For forward merging store:
1112	// Make sure the register used for renaming is not used between the
1113	// paired instructions. That would trash the content before the new
1114	// paired instruction.
1115	MCPhysReg RegToCheck = *RenameReg;
1116	// For backward merging load:
1117	// Make sure the register being renamed is not used between the
1118	// paired instructions. That would trash the content after the new
1119	// paired instruction.
1120	if (!MergeForward)
1121	RegToCheck = RegToRename;
1122	for (auto &MI :
1123	iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
1124	MergeForward ? std::next(I) : I,
1125	MergeForward ? std::next(Paired) : Paired))
1126	assert(all_of(MI.operands(),
1127	[this, RegToCheck](const MachineOperand &MOP) {
1128	return !MOP.isReg() \|\| MOP.isDebug() \|\| !MOP.getReg() \|\|
1129	MOP.isUndef() \|\|
1130	!TRI->regsOverlap(MOP.getReg(), RegToCheck);
1131	}) &&
1132	"Rename register used between paired instruction, trashing the "
1133	"content");
1134	#endif
1135	}
1136
1137	// Insert our new paired instruction after whichever of the paired
1138	// instructions MergeForward indicates.
1139	MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
1140	// Also based on MergeForward is from where we copy the base register operand
1141	// so we get the flags compatible with the input code.
1142	const MachineOperand &BaseRegOp =
1143	MergeForward ? AArch64InstrInfo::getLdStBaseOp(MI: *Paired)
1144	: AArch64InstrInfo::getLdStBaseOp(MI: *I);
1145
1146	int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: *I).getImm();
1147	int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(MI: *Paired).getImm();
1148	bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Opc: Paired ->getOpcode());
1149	if (IsUnscaled != PairedIsUnscaled) {
1150	// We're trying to pair instructions that differ in how they are scaled. If
1151	// I is scaled then scale the offset of Paired accordingly. Otherwise, do
1152	// the opposite (i.e., make Paired's offset unscaled).
1153	int MemSize = TII->getMemScale(MI: *Paired);
1154	if (PairedIsUnscaled) {
1155	// If the unscaled offset isn't a multiple of the MemSize, we can't
1156	// pair the operations together.
1157	assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
1158	"Offset should be a multiple of the stride!");
1159	PairedOffset /= MemSize;
1160	} else {
1161	PairedOffset *= MemSize;
1162	}
1163	}
1164
1165	// Which register is Rt and which is Rt2 depends on the offset order.
1166	// However, for pre load/stores the Rt should be the one of the pre
1167	// load/store.
1168	MachineInstr RtMI, Rt2MI;
1169	if (Offset == PairedOffset + OffsetStride &&
1170	!AArch64InstrInfo::isPreLdSt(MI: *I)) {
1171	RtMI = &*Paired;
1172	Rt2MI = &*I;
1173	// Here we swapped the assumption made for SExtIdx.
1174	// I.e., we turn ldp I, Paired into ldp Paired, I.
1175	// Update the index accordingly.
1176	if (SExtIdx != -`1`)
1177	SExtIdx = (SExtIdx + `1`) % `2`;
1178	} else {
1179	RtMI = &*I;
1180	Rt2MI = &*Paired;
1181	}
1182	int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(MI: *RtMI).getImm();
1183	// Scale the immediate offset, if necessary.
1184	if (TII->hasUnscaledLdStOffset(Opc: RtMI->getOpcode())) {
1185	assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
1186	"Unscaled offset cannot be scaled.");
1187	OffsetImm /= TII->getMemScale(MI: *RtMI);
1188	}
1189
1190	// Construct the new instruction.
1191	MachineInstrBuilder MIB;
1192	DebugLoc DL = I ->getDebugLoc();
1193	MachineBasicBlock *MBB = I ->getParent();
1194	MachineOperand RegOp0 = getLdStRegOp(MI&: *RtMI);
1195	MachineOperand RegOp1 = getLdStRegOp(MI&: *Rt2MI);
1196	MachineOperand &PairedRegOp = RtMI == &*Paired ? RegOp0 : RegOp1;
1197	// Kill flags may become invalid when moving stores for pairing.
1198	if (RegOp0.isUse()) {
1199	if (!MergeForward) {
1200	// Clear kill flags on store if moving upwards. Example:
1201	// STRWui kill %w0, ...
1202	// USE %w1
1203	// STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards
1204	// We are about to move the store of w1, so its kill flag may become
1205	// invalid; not the case for w0.
1206	// Since w1 is used between the stores, the kill flag on w1 is cleared
1207	// after merging.
1208	// STPWi kill %w0, %w1, ...
1209	// USE %w1
1210	for (auto It = std::next(x: I); It != Paired && PairedRegOp.isKill(); ++It)
1211	if (It ->readsRegister(Reg: PairedRegOp.getReg(), TRI))
1212	PairedRegOp.setIsKill(false);
1213	} else {
1214	// Clear kill flags of the first stores register. Example:
1215	// STRWui %w1, ...
1216	// USE kill %w1 ; need to clear kill flag when moving STRWui downwards
1217	// STRW %w0
1218	Register Reg = getLdStRegOp(MI&: *I).getReg();
1219	for (MachineInstr &MI :
1220	make_range(x: std::next(x: I ->getIterator()), y: Paired ->getIterator()))
1221	MI.clearRegisterKills(Reg, RegInfo: TRI);
1222	}
1223	}
1224
1225	unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);
1226	MIB = BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: MatchPairOpcode));
1227
1228	// Adds the pre-index operand for pre-indexed ld/st pairs.
1229	if (AArch64InstrInfo::isPreLdSt(MI: *RtMI))
1230	MIB.addReg(RegNo: BaseRegOp.getReg(), Flags: RegState::Define);
1231
1232	MIB.add(MO: RegOp0)
1233	.add(MO: RegOp1)
1234	.add(MO: BaseRegOp)
1235	.addImm(Val: OffsetImm)
1236	.cloneMergedMemRefs(OtherMIs: {&I, &Paired})
1237	.setMIFlags(I ->mergeFlagsWith(Other: *Paired));
1238
1239	(void)MIB;
1240
1241	LLVM_DEBUG(
1242	dbgs() << "Creating pair load/store. Replacing instructions:\n ");
1243	LLVM_DEBUG(I->print(dbgs()));
1244	LLVM_DEBUG(dbgs() << " ");
1245	LLVM_DEBUG(Paired->print(dbgs()));
1246	LLVM_DEBUG(dbgs() << " with instruction:\n ");
1247	if (SExtIdx != -`1`) {
1248	// Generate the sign extension for the proper result of the ldp.
1249	// I.e., with X1, that would be:
1250	// %w1 = KILL %w1, implicit-def %x1
1251	// %x1 = SBFMXri killed %x1, 0, 31
1252	MachineOperand &DstMO = MIB ->getOperand(i: SExtIdx);
1253	// Right now, DstMO has the extended register, since it comes from an
1254	// extended opcode.
1255	Register DstRegX = DstMO.getReg();
1256	// Get the W variant of that register.
1257	Register DstRegW = TRI->getSubReg(Reg: DstRegX, Idx: AArch64::sub_32);
1258	// Update the result of LDP to use the W instead of the X variant.
1259	DstMO.setReg(DstRegW);
1260	LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1261	LLVM_DEBUG(dbgs() << "\n");
1262	// Make the machine verifier happy by providing a definition for
1263	// the X register.
1264	// Insert this definition right after the generated LDP, i.e., before
1265	// InsertionPoint.
1266	MachineInstrBuilder MIBKill =
1267	BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::KILL), DestReg: DstRegW)
1268	.addReg(RegNo: DstRegW)
1269	.addReg(RegNo: DstRegX, Flags: RegState::Define);
1270	MIBKill ->getOperand(i: `2`).setImplicit();
1271	// Create the sign extension.
1272	MachineInstrBuilder MIBSXTW =
1273	BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: AArch64::SBFMXri), DestReg: DstRegX)
1274	.addReg(RegNo: DstRegX)
1275	.addImm(Val: `0`)
1276	.addImm(Val: `31`);
1277	(void)MIBSXTW;
1278
1279	// In the case of a sign-extend, where we have something like:
1280	// debugValueSubstitutions:[]
1281	// $w1 = LDRWui $x0, 1, debug-instr-number 1
1282	// DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
1283	// $x0 = LDRSWui $x0, 0, debug-instr-number 2
1284	// DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9
1285
1286	// It will be converted to:
1287	// debugValueSubstitutions:[]
1288	// $w0, $w1 = LDPWi $x0, 0
1289	// $w0 = KILL $w0, implicit-def $x0
1290	// $x0 = SBFMXri $x0, 0, 31
1291	// DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
1292	// DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9
1293
1294	// We want the final result to look like:
1295	// debugValueSubstitutions:
1296	// - { srcinst: 1, srcop: 0, dstinst: 4, dstop: 1, subreg: 0 }
1297	// - { srcinst: 2, srcop: 0, dstinst: 3, dstop: 0, subreg: 0 }
1298	// $w0, $w1 = LDPWi $x0, 0, debug-instr-number 4
1299	// $w0 = KILL $w0, implicit-def $x0
1300	// $x0 = SBFMXri $x0, 0, 31, debug-instr-number 3
1301	// DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
1302	// DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9
1303
1304	// $x0 is where the final value is stored, so the sign extend (SBFMXri)
1305	// instruction contains the final value we care about we give it a new
1306	// debug-instr-number 3. Whereas, $w1 contains the final value that we care
1307	// about, therefore the LDP instruction is also given a new
1308	// debug-instr-number 4. We have to add these substitutions to the
1309	// debugValueSubstitutions table. However, we also have to ensure that the
1310	// OpIndex that pointed to debug-instr-number 1 gets updated to 1, because
1311	// $w1 is the second operand of the LDP instruction.
1312
1313	if (I ->peekDebugInstrNum()) {
1314	// If I is the instruction which got sign extended and has a
1315	// debug-instr-number, give the SBFMXri instruction a new
1316	// debug-instr-number, and update the debugValueSubstitutions table with
1317	// the new debug-instr-number and OpIndex pair. Otherwise, give the Merged
1318	// instruction a new debug-instr-number, and update the
1319	// debugValueSubstitutions table with the new debug-instr-number and
1320	// OpIndex pair.
1321	unsigned NewInstrNum;
1322	if (DstRegX == I ->getOperand(i: `0`).getReg()) {
1323	NewInstrNum = MIBSXTW ->getDebugInstrNum();
1324	addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *I,
1325	MergedInstr&: *MIBSXTW);
1326	} else {
1327	NewInstrNum = MIB ->getDebugInstrNum();
1328	addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: I, MergedInstr&: MIB);
1329	}
1330	}
1331	if (Paired ->peekDebugInstrNum()) {
1332	// If Paired is the instruction which got sign extended and has a
1333	// debug-instr-number, give the SBFMXri instruction a new
1334	// debug-instr-number, and update the debugValueSubstitutions table with
1335	// the new debug-instr-number and OpIndex pair. Otherwise, give the Merged
1336	// instruction a new debug-instr-number, and update the
1337	// debugValueSubstitutions table with the new debug-instr-number and
1338	// OpIndex pair.
1339	unsigned NewInstrNum;
1340	if (DstRegX == Paired ->getOperand(i: `0`).getReg()) {
1341	NewInstrNum = MIBSXTW ->getDebugInstrNum();
1342	addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *Paired,
1343	MergedInstr&: *MIBSXTW);
1344	} else {
1345	NewInstrNum = MIB ->getDebugInstrNum();
1346	addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *Paired,
1347	MergedInstr&: *MIB);
1348	}
1349	}
1350
1351	LLVM_DEBUG(dbgs() << " Extend operand:\n ");
1352	LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
1353	} else if (Opc == AArch64::LDR_ZXI \|\| Opc == AArch64::STR_ZXI) {
1354	// We are combining SVE fill/spill to LDP/STP, so we need to use the Q
1355	// variant of the registers.
1356	MachineOperand &MOp0 = MIB ->getOperand(i: `0`);
1357	MachineOperand &MOp1 = MIB ->getOperand(i: `1`);
1358	assert(AArch64::ZPRRegClass.contains(MOp0.getReg()) &&
1359	AArch64::ZPRRegClass.contains(MOp1.getReg()) && "Invalid register.");
1360	MOp0.setReg(AArch64::Q0 + (MOp0.getReg() - AArch64::Z0));
1361	MOp1.setReg(AArch64::Q0 + (MOp1.getReg() - AArch64::Z0));
1362	LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1363	} else {
1364
1365	// In the case that the merge doesn't result in a sign-extend, if we have
1366	// something like:
1367	// debugValueSubstitutions:[]
1368	// $x1 = LDRXui $x0, 1, debug-instr-number 1
1369	// DBG_INSTR_REF !13, dbg-instr-ref(1, 0), debug-location !11
1370	// $x0 = LDRXui killed $x0, 0, debug-instr-number 2
1371	// DBG_INSTR_REF !14, dbg-instr-ref(2, 0), debug-location !11
1372
1373	// It will be converted to:
1374	// debugValueSubstitutions: []
1375	// $x0, $x1 = LDPXi $x0, 0
1376	// DBG_INSTR_REF !12, dbg-instr-ref(1, 0), debug-location !14
1377	// DBG_INSTR_REF !13, dbg-instr-ref(2, 0), debug-location !14
1378
1379	// We want the final result to look like:
1380	// debugValueSubstitutions:
1381	// - { srcinst: 1, srcop: 0, dstinst: 3, dstop: 1, subreg: 0 }
1382	// - { srcinst: 2, srcop: 0, dstinst: 3, dstop: 0, subreg: 0 }
1383	// $x0, $x1 = LDPXi $x0, 0, debug-instr-number 3
1384	// DBG_INSTR_REF !12, dbg-instr-ref(1, 0), debug-location !14
1385	// DBG_INSTR_REF !12, dbg-instr-ref(2, 0), debug-location !14
1386
1387	// Here all that needs to be done is, that the LDP instruction needs to be
1388	// updated with a new debug-instr-number, we then need to add entries into
1389	// the debugSubstitutions table to map the old instr-refs to the new ones.
1390
1391	// Assign new DebugInstrNum to the Paired instruction.
1392	if (I ->peekDebugInstrNum()) {
1393	unsigned NewDebugInstrNum = MIB ->getDebugInstrNum();
1394	addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewDebugInstrNum, OriginalInstr&: *I,
1395	MergedInstr&: *MIB);
1396	}
1397	if (Paired ->peekDebugInstrNum()) {
1398	unsigned NewDebugInstrNum = MIB ->getDebugInstrNum();
1399	addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewDebugInstrNum, OriginalInstr&: *Paired,
1400	MergedInstr&: *MIB);
1401	}
1402
1403	LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1404	}
1405	LLVM_DEBUG(dbgs() << "\n");
1406
1407	if (MergeForward)
1408	for (const MachineOperand &MOP : phys_regs_and_masks(MI: *I))
1409	if (MOP.isReg() && MOP.isKill())
1410	DefinedInBB.addReg(Reg: MOP.getReg());
1411
1412	// Copy over any implicit-def operands. This is like MI.copyImplicitOps, but
1413	// only copies implicit defs and makes sure that each operand is only added
1414	// once in case of duplicates.
1415	auto CopyImplicitOps = [&](MachineBasicBlock::iterator MI1,
1416	MachineBasicBlock::iterator MI2) {
1417	SmallSetVector<Register, `4`> Ops;
1418	for (const MachineOperand &MO :
1419	llvm::drop_begin(RangeOrContainer: MI1 ->operands(), N: MI1 ->getDesc().getNumOperands()))
1420	if (MO.isReg() && MO.isImplicit() && MO.isDef())
1421	Ops.insert(X: MO.getReg());
1422	for (const MachineOperand &MO :
1423	llvm::drop_begin(RangeOrContainer: MI2 ->operands(), N: MI2 ->getDesc().getNumOperands()))
1424	if (MO.isReg() && MO.isImplicit() && MO.isDef())
1425	Ops.insert(X: MO.getReg());
1426	for (auto Op : Ops)
1427	MIB.addDef(RegNo: Op, Flags: RegState::Implicit);
1428	};
1429	CopyImplicitOps (I, Paired);
1430
1431	// Erase the old instructions.
1432	I ->eraseFromParent();
1433	Paired ->eraseFromParent();
1434
1435	return NextI;
1436	}
1437
1438	MachineBasicBlock::iterator
1439	AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
1440	MachineBasicBlock::iterator StoreI) {
1441	MachineBasicBlock::iterator NextI =
1442	next_nodbg(It: LoadI, End: LoadI ->getParent()->end());
1443
1444	int LoadSize = TII->getMemScale(MI: *LoadI);
1445	int StoreSize = TII->getMemScale(MI: *StoreI);
1446	Register LdRt = getLdStRegOp(MI&: *LoadI).getReg();
1447	const MachineOperand &StMO = getLdStRegOp(MI&: *StoreI);
1448	Register StRt = getLdStRegOp(MI&: *StoreI).getReg();
1449	bool IsStoreXReg = TRI->getRegClass(i: AArch64::GPR64RegClassID)->contains(Reg: StRt);
1450
1451	assert((IsStoreXReg \|\|
1452	TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
1453	"Unexpected RegClass");
1454
1455	MachineInstr *BitExtMI;
1456	if (LoadSize == StoreSize && (LoadSize == `4` \|\| LoadSize == `8`)) {
1457	// Remove the load, if the destination register of the loads is the same
1458	// register for stored value.
1459	if (StRt == LdRt && LoadSize == `8`) {
1460	for (MachineInstr &MI : make_range(x: StoreI ->getIterator(),
1461	y: LoadI ->getIterator())) {
1462	if (MI.killsRegister(Reg: StRt, TRI)) {
1463	MI.clearRegisterKills(Reg: StRt, RegInfo: TRI);
1464	break;
1465	}
1466	}
1467	LLVM_DEBUG(dbgs() << "Remove load instruction:\n ");
1468	LLVM_DEBUG(LoadI->print(dbgs()));
1469	LLVM_DEBUG(dbgs() << "\n");
1470	LoadI ->eraseFromParent();
1471	return NextI;
1472	}
1473	// Replace the load with a mov if the load and store are in the same size.
1474	BitExtMI =
1475	BuildMI(BB&: *LoadI ->getParent(), I: LoadI, MIMD: LoadI ->getDebugLoc(),
1476	MCID: TII->get(Opcode: IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), DestReg: LdRt)
1477	.addReg(RegNo: IsStoreXReg ? AArch64::XZR : AArch64::WZR)
1478	.add(MO: StMO)
1479	.addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: `0`))
1480	.setMIFlags(LoadI ->getFlags());
1481	} else {
1482	// FIXME: Currently we disable this transformation in big-endian targets as
1483	// performance and correctness are verified only in little-endian.
1484	if (!Subtarget->isLittleEndian())
1485	return NextI;
1486	bool IsUnscaled = TII->hasUnscaledLdStOffset(MI&: *LoadI);
1487	assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
1488	"Unsupported ld/st match");
1489	assert(LoadSize <= StoreSize && "Invalid load size");
1490	int UnscaledLdOffset =
1491	IsUnscaled
1492	? AArch64InstrInfo::getLdStOffsetOp(MI: *LoadI).getImm()
1493	: AArch64InstrInfo::getLdStOffsetOp(MI: LoadI).getImm() LoadSize;
1494	int UnscaledStOffset =
1495	IsUnscaled
1496	? AArch64InstrInfo::getLdStOffsetOp(MI: *StoreI).getImm()
1497	: AArch64InstrInfo::getLdStOffsetOp(MI: StoreI).getImm() StoreSize;
1498	int Width = LoadSize * `8`;
1499	Register DestReg =
1500	IsStoreXReg ? Register (TRI->getMatchingSuperReg(
1501	Reg: LdRt, SubIdx: AArch64::sub_32, RC: &AArch64::GPR64RegClass))
1502	: LdRt;
1503
1504	assert((UnscaledLdOffset >= UnscaledStOffset &&
1505	(UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
1506	"Invalid offset");
1507
1508	int Immr = `8` * (UnscaledLdOffset - UnscaledStOffset);
1509	int Imms = Immr + Width - `1`;
1510	if (UnscaledLdOffset == UnscaledStOffset) {
1511	uint32_t AndMaskEncoded = ((IsStoreXReg ? `1` : `0`) << `12`) // N
1512	\| ((Immr) << `6`) // immr
1513	\| ((Imms) << `0`) // imms
1514	;
1515
1516	BitExtMI =
1517	BuildMI(BB&: *LoadI ->getParent(), I: LoadI, MIMD: LoadI ->getDebugLoc(),
1518	MCID: TII->get(Opcode: IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
1519	DestReg)
1520	.add(MO: StMO)
1521	.addImm(Val: AndMaskEncoded)
1522	.setMIFlags(LoadI ->getFlags());
1523	} else if (IsStoreXReg && Imms == `31`) {
1524	// Use the 32 bit variant of UBFM if it's the LSR alias of the
1525	// instruction.
1526	assert(Immr <= Imms && "Expected LSR alias of UBFM");
1527	BitExtMI = BuildMI(BB&: *LoadI ->getParent(), I: LoadI, MIMD: LoadI ->getDebugLoc(),
1528	MCID: TII->get(Opcode: AArch64::UBFMWri),
1529	DestReg: TRI->getSubReg(Reg: DestReg, Idx: AArch64::sub_32))
1530	.addReg(RegNo: TRI->getSubReg(Reg: StRt, Idx: AArch64::sub_32))
1531	.addImm(Val: Immr)
1532	.addImm(Val: Imms)
1533	.setMIFlags(LoadI ->getFlags());
1534	} else {
1535	BitExtMI =
1536	BuildMI(BB&: *LoadI ->getParent(), I: LoadI, MIMD: LoadI ->getDebugLoc(),
1537	MCID: TII->get(Opcode: IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
1538	DestReg)
1539	.add(MO: StMO)
1540	.addImm(Val: Immr)
1541	.addImm(Val: Imms)
1542	.setMIFlags(LoadI ->getFlags());
1543	}
1544	}
1545
1546	// Clear kill flags between store and load.
1547	for (MachineInstr &MI : make_range(x: StoreI ->getIterator(),
1548	y: BitExtMI->getIterator()))
1549	if (MI.killsRegister(Reg: StRt, TRI)) {
1550	MI.clearRegisterKills(Reg: StRt, RegInfo: TRI);
1551	break;
1552	}
1553
1554	LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n ");
1555	LLVM_DEBUG(StoreI->print(dbgs()));
1556	LLVM_DEBUG(dbgs() << " ");
1557	LLVM_DEBUG(LoadI->print(dbgs()));
1558	LLVM_DEBUG(dbgs() << " with instructions:\n ");
1559	LLVM_DEBUG(StoreI->print(dbgs()));
1560	LLVM_DEBUG(dbgs() << " ");
1561	LLVM_DEBUG((BitExtMI)->print(dbgs()));
1562	LLVM_DEBUG(dbgs() << "\n");
1563
1564	// Erase the old instructions.
1565	LoadI ->eraseFromParent();
1566	return NextI;
1567	}
1568
/// Check whether \p Offset fits the signed 7-bit element-offset range
/// [-64, 63] accepted by this pass for paired load/store instructions.
///
/// \param IsUnscaled   True if \p Offset is a byte offset (unscaled form).
/// \param Offset       Byte offset if \p IsUnscaled, element offset otherwise.
/// \param OffsetStride Element size in bytes; only used when \p IsUnscaled.
/// \returns false if an unscaled offset is not a multiple of
/// \p OffsetStride or if the resulting element offset is out of range.
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  if (IsUnscaled) {
    // If the byte-offset isn't a multiple of the stride, there's no point
    // trying to match it.
    if (Offset % OffsetStride)
      return false;
    Offset /= OffsetStride;
  }
  return Offset <= 63 && Offset >= -64;
}
1581
// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint_64t to int when
// using alignTo from include/llvm/Support/MathExtras.h.
// Rounds Num up to the next multiple of PowOf2; PowOf2 must be a power of
// two for the mask arithmetic below to be meaningful.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}
1589
1590	static bool mayAlias(MachineInstr &MIa,
1591	SmallVectorImpl<MachineInstr *> &MemInsns,
1592	AliasAnalysis *AA) {
1593	for (MachineInstr *MIb : MemInsns) {
1594	if (MIa.mayAlias(AA, Other: MIb, /UseTBAA/* false)) {
1595	LLVM_DEBUG(dbgs() << "Aliasing with: "; MIb->dump());
1596	return true;
1597	}
1598	}
1599
1600	LLVM_DEBUG(dbgs() << "No aliases found\n");
1601	return false;
1602	}
1603
1604	bool AArch64LoadStoreOpt::findMatchingStore(
1605	MachineBasicBlock::iterator I, unsigned Limit,
1606	MachineBasicBlock::iterator &StoreI) {
1607	MachineBasicBlock::iterator B = I ->getParent()->begin();
1608	MachineBasicBlock::iterator MBBI = I;
1609	MachineInstr &LoadMI = *I;
1610	Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: LoadMI).getReg();
1611
1612	// If the load is the first instruction in the block, there's obviously
1613	// not any matching store.
1614	if (MBBI == B)
1615	return false;
1616
1617	// Track which register units have been modified and used between the first
1618	// insn and the second insn.
1619	ModifiedRegUnits.clear();
1620	UsedRegUnits.clear();
1621
1622	unsigned Count = `0`;
1623	do {
1624	MBBI = prev_nodbg(It: MBBI, Begin: B);
1625	MachineInstr &MI = *MBBI;
1626
1627	// Don't count transient instructions towards the search limit since there
1628	// may be different numbers of them if e.g. debug information is present.
1629	if (!MI.isTransient())
1630	++Count;
1631
1632	// If the load instruction reads directly from the address to which the
1633	// store instruction writes and the stored value is not modified, we can
1634	// promote the load. Since we do not handle stores with pre-/post-index,
1635	// it's unnecessary to check if BaseReg is modified by the store itself.
1636	// Also we can't handle stores without an immediate offset operand,
1637	// while the operand might be the address for a global variable.
1638	if (MI.mayStore() && isMatchingStore(LoadInst&: LoadMI, StoreInst&: MI) &&
1639	BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
1640	AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
1641	isLdOffsetInRangeOfSt(LoadInst&: LoadMI, StoreInst&: MI, TII) &&
1642	ModifiedRegUnits.available(Reg: getLdStRegOp(MI).getReg())) {
1643	StoreI = MBBI;
1644	return true;
1645	}
1646
1647	if (MI.isCall())
1648	return false;
1649
1650	// Update modified / uses register units.
1651	LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1652
1653	// Otherwise, if the base register is modified, we have no match, so
1654	// return early.
1655	if (!ModifiedRegUnits.available(Reg: BaseReg))
1656	return false;
1657
1658	// If we encounter a store aliased with the load, return early.
1659	if (MI.mayStore() && LoadMI.mayAlias(AA, Other: MI, /UseTBAA/ false))
1660	return false;
1661	} while (MBBI != B && Count < Limit);
1662	return false;
1663	}
1664
1665	static bool needsWinCFI(const MachineFunction *MF) {
1666	return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
1667	MF->getFunction().needsUnwindTableEntry();
1668	}
1669
1670	// Returns true if FirstMI and MI are candidates for merging or pairing.
1671	// Otherwise, returns false.
1672	static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
1673	LdStPairFlags &Flags,
1674	const AArch64InstrInfo *TII) {
1675	// If this is volatile or if pairing is suppressed, not a candidate.
1676	if (MI.hasOrderedMemoryRef() \|\| TII->isLdStPairSuppressed(MI))
1677	return false;
1678
1679	// We should have already checked FirstMI for pair suppression and volatility.
1680	assert(!FirstMI.hasOrderedMemoryRef() &&
1681	!TII->isLdStPairSuppressed(FirstMI) &&
1682	"FirstMI shouldn't get here if either of these checks are true.");
1683
1684	if (needsWinCFI(MF: MI.getMF()) && (MI.getFlag(Flag: MachineInstr::FrameSetup) \|\|
1685	MI.getFlag(Flag: MachineInstr::FrameDestroy)))
1686	return false;
1687
1688	unsigned OpcA = FirstMI.getOpcode();
1689	unsigned OpcB = MI.getOpcode();
1690
1691	// Opcodes match: If the opcodes are pre ld/st there is nothing more to check.
1692	if (OpcA == OpcB)
1693	return !AArch64InstrInfo::isPreLdSt(MI: FirstMI);
1694
1695	// Bail out if one of the opcodes is SVE fill/spill, as we currently don't
1696	// allow pairing them with other instructions.
1697	if (OpcA == AArch64::LDR_ZXI \|\| OpcA == AArch64::STR_ZXI \|\|
1698	OpcB == AArch64::LDR_ZXI \|\| OpcB == AArch64::STR_ZXI)
1699	return false;
1700
1701	// Two pre ld/st of different opcodes cannot be merged either
1702	if (AArch64InstrInfo::isPreLdSt(MI: FirstMI) && AArch64InstrInfo::isPreLdSt(MI))
1703	return false;
1704
1705	// Try to match a sign-extended load/store with a zero-extended load/store.
1706	bool IsValidLdStrOpc, PairIsValidLdStrOpc;
1707	unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc: OpcA, IsValidLdStrOpc: &IsValidLdStrOpc);
1708	assert(IsValidLdStrOpc &&
1709	"Given Opc should be a Load or Store with an immediate");
1710	// OpcA will be the first instruction in the pair.
1711	if (NonSExtOpc == getMatchingNonSExtOpcode(Opc: OpcB, IsValidLdStrOpc: &PairIsValidLdStrOpc)) {
1712	Flags.setSExtIdx(NonSExtOpc == OpcA ? `1` : `0`);
1713	return true;
1714	}
1715
1716	// If the second instruction isn't even a mergable/pairable load/store, bail
1717	// out.
1718	if (!PairIsValidLdStrOpc)
1719	return false;
1720
1721	// Narrow stores do not have a matching pair opcodes, so constrain their
1722	// merging to zero stores.
1723	if (isNarrowStore(Opc: OpcA) \|\| isNarrowStore(Opc: OpcB))
1724	return getLdStRegOp(MI&: FirstMI).getReg() == AArch64::WZR &&
1725	getLdStRegOp(MI).getReg() == AArch64::WZR &&
1726	TII->getMemScale(MI: FirstMI) == TII->getMemScale(MI);
1727
1728	// The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
1729	// LDR<S,D,Q,W,X,SW>pre-LDR<S,D,Q,W,X,SW>ui
1730	// are candidate pairs that can be merged.
1731	if (isPreLdStPairCandidate(FirstMI, MI))
1732	return true;
1733
1734	// Try to match an unscaled load/store with a scaled load/store.
1735	return TII->hasUnscaledLdStOffset(Opc: OpcA) != TII->hasUnscaledLdStOffset(Opc: OpcB) &&
1736	getMatchingPairOpcode(Opc: OpcA) == getMatchingPairOpcode(Opc: OpcB);
1737
1738	// FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
1739	}
1740
1741	static bool canRenameMOP(const MachineOperand &MOP,
1742	const TargetRegisterInfo *TRI) {
1743	if (MOP.isReg()) {
1744	auto *RegClass = TRI->getMinimalPhysRegClass(Reg: MOP.getReg());
1745	// Renaming registers with multiple disjunct sub-registers (e.g. the
1746	// result of a LD3) means that all sub-registers are renamed, potentially
1747	// impacting other instructions we did not check. Bail out.
1748	// Note that this relies on the structure of the AArch64 register file. In
1749	// particular, a subregister cannot be written without overwriting the
1750	// whole register.
1751	if (RegClass->HasDisjunctSubRegs && RegClass->CoveredBySubRegs &&
1752	(TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::dsub0) \|\|
1753	TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::qsub0) \|\|
1754	TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::zsub0))) {
1755	LLVM_DEBUG(
1756	dbgs()
1757	<< " Cannot rename operands with multiple disjunct subregisters ("
1758	<< MOP << ")\n");
1759	return false;
1760	}
1761
1762	// We cannot rename arbitrary implicit-defs, the specific rule to rewrite
1763	// them must be known. For example, in ORRWrs the implicit-def
1764	// corresponds to the result register.
1765	if (MOP.isImplicit() && MOP.isDef()) {
1766	if (!isRewritableImplicitDef(MO: MOP))
1767	return false;
1768	return TRI->isSuperOrSubRegisterEq(
1769	RegA: MOP.getParent()->getOperand(i: `0`).getReg(), RegB: MOP.getReg());
1770	}
1771	}
1772	return MOP.isImplicit() \|\|
1773	(MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
1774	}
1775
1776	static bool
1777	canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
1778	SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1779	const TargetRegisterInfo *TRI) {
1780	if (!FirstMI.mayStore())
1781	return false;
1782
1783	// Check if we can find an unused register which we can use to rename
1784	// the register used by the first load/store.
1785
1786	auto RegToRename = getLdStRegOp(MI&: FirstMI).getReg();
1787	// For now, we only rename if the store operand gets killed at the store.
1788	if (!getLdStRegOp(MI&: FirstMI).isKill() &&
1789	!any_of(Range: FirstMI.operands(),
1790	P: [TRI, RegToRename](const MachineOperand &MOP) {
1791	return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1792	MOP.isImplicit() && MOP.isKill() &&
1793	TRI->regsOverlap(RegA: RegToRename, RegB: MOP.getReg());
1794	})) {
1795	LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI);
1796	return false;
1797	}
1798
1799	bool FoundDef = false;
1800
1801	// For each instruction between FirstMI and the previous def for RegToRename,
1802	// we
1803	// check if we can rename RegToRename in this instruction*
1804	// collect the registers used and required register classes for RegToRename.*
1805	std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
1806	bool IsDef) {
1807	LLVM_DEBUG(dbgs() << "Checking " << MI);
1808	// Currently we do not try to rename across frame-setup instructions.
1809	if (MI.getFlag(Flag: MachineInstr::FrameSetup)) {
1810	LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1811	<< "currently\n");
1812	return false;
1813	}
1814
1815	UsedInBetween.accumulate(MI);
1816
1817	// For a definition, check that we can rename the definition and exit the
1818	// loop.
1819	FoundDef = IsDef;
1820
1821	// For defs, check if we can rename the first def of RegToRename.
1822	if (FoundDef) {
1823	// For some pseudo instructions, we might not generate code in the end
1824	// (e.g. KILL) and we would end up without a correct def for the rename
1825	// register.
1826	// TODO: This might be overly conservative and we could handle those cases
1827	// in multiple ways:
1828	// 1. Insert an extra copy, to materialize the def.
1829	// 2. Skip pseudo-defs until we find an non-pseudo def.
1830	if (MI.isPseudo()) {
1831	LLVM_DEBUG(dbgs() << " Cannot rename pseudo/bundle instruction\n");
1832	return false;
1833	}
1834
1835	for (auto &MOP : MI.operands()) {
1836	if (!MOP.isReg() \|\| !MOP.isDef() \|\| MOP.isDebug() \|\| !MOP.getReg() \|\|
1837	!TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
1838	continue;
1839	if (!canRenameMOP(MOP, TRI)) {
1840	LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1841	return false;
1842	}
1843	RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1844	}
1845	return true;
1846	} else {
1847	for (auto &MOP : MI.operands()) {
1848	if (!MOP.isReg() \|\| MOP.isDebug() \|\| !MOP.getReg() \|\|
1849	!TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
1850	continue;
1851
1852	if (!canRenameMOP(MOP, TRI)) {
1853	LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1854	return false;
1855	}
1856	RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1857	}
1858	}
1859	return true;
1860	};
1861
1862	if (!forAllMIsUntilDef(MI&: FirstMI, DefReg: RegToRename, TRI, Limit: LdStLimit, Fn&: CheckMIs))
1863	return false;
1864
1865	if (!FoundDef) {
1866	LLVM_DEBUG(dbgs() << " Did not find definition for register in BB\n");
1867	return false;
1868	}
1869	return true;
1870	}
1871
1872	// We want to merge the second load into the first by rewriting the usages of
1873	// the same reg between first (incl.) and second (excl.). We don't need to care
1874	// about any insns before FirstLoad or after SecondLoad.
1875	// 1. The second load writes new value into the same reg.
1876	// - The renaming is impossible to impact later use of the reg.
1877	// - The second load always trash the value written by the first load which
1878	// means the reg must be killed before the second load.
1879	// 2. The first load must be a def for the same reg so we don't need to look
1880	// into anything before it.
1881	static bool canRenameUntilSecondLoad(
1882	MachineInstr &FirstLoad, MachineInstr &SecondLoad,
1883	LiveRegUnits &UsedInBetween,
1884	SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1885	const TargetRegisterInfo *TRI) {
1886	if (FirstLoad.isPseudo())
1887	return false;
1888
1889	UsedInBetween.accumulate(MI: FirstLoad);
1890	auto RegToRename = getLdStRegOp(MI&: FirstLoad).getReg();
1891	bool Success = std::all_of(
1892	first: FirstLoad.getIterator(), last: SecondLoad.getIterator(),
1893	pred: [&](MachineInstr &MI) {
1894	LLVM_DEBUG(dbgs() << "Checking " << MI);
1895	// Currently we do not try to rename across frame-setup instructions.
1896	if (MI.getFlag(Flag: MachineInstr::FrameSetup)) {
1897	LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1898	<< "currently\n");
1899	return false;
1900	}
1901
1902	for (auto &MOP : MI.operands()) {
1903	if (!MOP.isReg() \|\| MOP.isDebug() \|\| !MOP.getReg() \|\|
1904	!TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
1905	continue;
1906	if (!canRenameMOP(MOP, TRI)) {
1907	LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1908	return false;
1909	}
1910	RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1911	}
1912
1913	return true;
1914	});
1915	return Success;
1916	}
1917
1918	// Check if we can find a physical register for renaming \p Reg. This register
1919	// must:
1920	// not be defined already in \p DefinedInBB; DefinedInBB must contain all*
1921	// defined registers up to the point where the renamed register will be used,
1922	// not used in \p UsedInBetween; UsedInBetween must contain all accessed*
1923	// registers in the range the rename register will be used,
1924	// is available in all used register classes (checked using RequiredClasses).*
1925	static std::optional<MCPhysReg> tryToFindRegisterToRename(
1926	const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
1927	LiveRegUnits &UsedInBetween,
1928	SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1929	const TargetRegisterInfo *TRI) {
1930	const MachineRegisterInfo &RegInfo = MF.getRegInfo();
1931
1932	// Checks if any sub- or super-register of PR is callee saved.
1933	auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
1934	return any_of(Range: TRI->sub_and_superregs_inclusive(Reg: PR),
1935	P: [&MF, TRI](MCPhysReg SubOrSuper) {
1936	return TRI->isCalleeSavedPhysReg(PhysReg: SubOrSuper, MF);
1937	});
1938	};
1939
1940	// Check if PR or one of its sub- or super-registers can be used for all
1941	// required register classes.
1942	auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
1943	return all_of(Range&: RequiredClasses, P: [PR, TRI](const TargetRegisterClass *C) {
1944	return any_of(
1945	Range: TRI->sub_and_superregs_inclusive(Reg: PR),
1946	P: [C](MCPhysReg SubOrSuper) { return C->contains(Reg: SubOrSuper); });
1947	});
1948	};
1949
1950	auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
1951	for (const MCPhysReg &PR : *RegClass) {
1952	if (DefinedInBB.available(Reg: PR) && UsedInBetween.available(Reg: PR) &&
1953	!RegInfo.isReserved(PhysReg: PR) && !AnySubOrSuperRegCalleePreserved (PR) &&
1954	CanBeUsedForAllClasses (PR)) {
1955	DefinedInBB.addReg(Reg: PR);
1956	LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
1957	<< "\n");
1958	return {PR};
1959	}
1960	}
1961	LLVM_DEBUG(dbgs() << "No rename register found from "
1962	<< TRI->getRegClassName(RegClass) << "\n");
1963	return std::nullopt;
1964	}
1965
1966	// For store pairs: returns a register from FirstMI to the beginning of the
1967	// block that can be renamed.
1968	// For load pairs: returns a register from FirstMI to MI that can be renamed.
1969	static std::optional<MCPhysReg> findRenameRegForSameLdStRegPair(
1970	std::optional<bool> MaybeCanRename, MachineInstr &FirstMI, MachineInstr &MI,
1971	Register Reg, LiveRegUnits &DefinedInBB, LiveRegUnits &UsedInBetween,
1972	SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1973	const TargetRegisterInfo *TRI) {
1974	std::optional<MCPhysReg> RenameReg;
1975	if (!DebugCounter::shouldExecute(Counter&: RegRenamingCounter))
1976	return RenameReg;
1977
1978	auto *RegClass = TRI->getMinimalPhysRegClass(Reg: getLdStRegOp(MI&: FirstMI).getReg());
1979	MachineFunction &MF = *FirstMI.getParent()->getParent();
1980	if (!RegClass \|\| !MF.getRegInfo().tracksLiveness())
1981	return RenameReg;
1982
1983	const bool IsLoad = FirstMI.mayLoad();
1984
1985	if (!MaybeCanRename) {
1986	if (IsLoad)
1987	MaybeCanRename = {canRenameUntilSecondLoad(FirstLoad&: FirstMI, SecondLoad&: MI, UsedInBetween,
1988	RequiredClasses, TRI)};
1989	else
1990	MaybeCanRename = {
1991	canRenameUpToDef(FirstMI, UsedInBetween, RequiredClasses, TRI)};
1992	}
1993
1994	if (*MaybeCanRename) {
1995	RenameReg = tryToFindRegisterToRename(MF, Reg, DefinedInBB, UsedInBetween,
1996	RequiredClasses, TRI);
1997	}
1998	return RenameReg;
1999	}
2000
2001	/// Scan the instructions looking for a load/store that can be combined with the
2002	/// current instruction into a wider equivalent or a load/store pair.
2003	MachineBasicBlock::iterator
2004	AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
2005	LdStPairFlags &Flags, unsigned Limit,
2006	bool FindNarrowMerge) {
2007	MachineBasicBlock::iterator E = I ->getParent()->end();
2008	MachineBasicBlock::iterator MBBI = I;
2009	MachineBasicBlock::iterator MBBIWithRenameReg;
2010	MachineInstr &FirstMI = *I;
2011	MBBI = next_nodbg(It: MBBI, End: E);
2012
2013	bool MayLoad = FirstMI.mayLoad();
2014	bool IsUnscaled = TII->hasUnscaledLdStOffset(MI&: FirstMI);
2015	Register Reg = getLdStRegOp(MI&: FirstMI).getReg();
2016	Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: FirstMI).getReg();
2017	int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: FirstMI).getImm();
2018	int OffsetStride = IsUnscaled ? TII->getMemScale(MI: FirstMI) : `1`;
2019	bool IsPromotableZeroStore = isPromotableZeroStoreInst(MI&: FirstMI);
2020
2021	std::optional<bool> MaybeCanRename;
2022	if (!EnableRenaming)
2023	MaybeCanRename = {false};
2024
2025	SmallPtrSet<const TargetRegisterClass *, `5`> RequiredClasses;
2026	LiveRegUnits UsedInBetween;
2027	UsedInBetween.init(TRI: *TRI);
2028
2029	Flags.clearRenameReg();
2030
2031	// Track which register units have been modified and used between the first
2032	// insn (inclusive) and the second insn.
2033	ModifiedRegUnits.clear();
2034	UsedRegUnits.clear();
2035
2036	// Remember any instructions that read/write memory between FirstMI and MI.
2037	SmallVector<MachineInstr *, `4`> MemInsns;
2038
2039	LLVM_DEBUG(dbgs() << "Find match for: "; FirstMI.dump());
2040	for (unsigned Count = `0`; MBBI != E && Count < Limit;
2041	MBBI = next_nodbg(It: MBBI, End: E)) {
2042	MachineInstr &MI = *MBBI;
2043	LLVM_DEBUG(dbgs() << "Analysing 2nd insn: "; MI.dump());
2044
2045	UsedInBetween.accumulate(MI);
2046
2047	// Don't count transient instructions towards the search limit since there
2048	// may be different numbers of them if e.g. debug information is present.
2049	if (!MI.isTransient())
2050	++Count;
2051
2052	Flags.setSExtIdx(-`1`);
2053	if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
2054	AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
2055	assert(MI.mayLoadOrStore() && "Expected memory operation.");
2056	// If we've found another instruction with the same opcode, check to see
2057	// if the base and offset are compatible with our starting instruction.
2058	// These instructions all have scaled immediate operands, so we just
2059	// check for +1/-1. Make sure to check the new instruction offset is
2060	// actually an immediate and not a symbolic reference destined for
2061	// a relocation.
2062	Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
2063	int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
2064	bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
2065	if (IsUnscaled != MIIsUnscaled) {
2066	// We're trying to pair instructions that differ in how they are scaled.
2067	// If FirstMI is scaled then scale the offset of MI accordingly.
2068	// Otherwise, do the opposite (i.e., make MI's offset unscaled).
2069	int MemSize = TII->getMemScale(MI);
2070	if (MIIsUnscaled) {
2071	// If the unscaled offset isn't a multiple of the MemSize, we can't
2072	// pair the operations together: bail and keep looking.
2073	if (MIOffset % MemSize) {
2074	LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2075	UsedRegUnits, TRI);
2076	MemInsns.push_back(Elt: &MI);
2077	continue;
2078	}
2079	MIOffset /= MemSize;
2080	} else {
2081	MIOffset *= MemSize;
2082	}
2083	}
2084
2085	bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);
2086
2087	if (BaseReg == MIBaseReg) {
2088	// If the offset of the second ld/st is not equal to the size of the
2089	// destination register it can’t be paired with a pre-index ld/st
2090	// pair. Additionally if the base reg is used or modified the operations
2091	// can't be paired: bail and keep looking.
2092	if (IsPreLdSt) {
2093	bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
2094	bool IsBaseRegUsed = !UsedRegUnits.available(
2095	Reg: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
2096	bool IsBaseRegModified = !ModifiedRegUnits.available(
2097	Reg: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
2098	// If the stored value and the address of the second instruction is
2099	// the same, it needs to be using the updated register and therefore
2100	// it must not be folded.
2101	bool IsMIRegTheSame =
2102	TRI->regsOverlap(RegA: getLdStRegOp(MI).getReg(),
2103	RegB: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
2104	if (IsOutOfBounds \|\| IsBaseRegUsed \|\| IsBaseRegModified \|\|
2105	IsMIRegTheSame) {
2106	LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2107	UsedRegUnits, TRI);
2108	MemInsns.push_back(Elt: &MI);
2109	continue;
2110	}
2111	} else {
2112	if ((Offset != MIOffset + OffsetStride) &&
2113	(Offset + OffsetStride != MIOffset)) {
2114	LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2115	UsedRegUnits, TRI);
2116	MemInsns.push_back(Elt: &MI);
2117	continue;
2118	}
2119	}
2120
2121	int MinOffset = Offset < MIOffset ? Offset : MIOffset;
2122	if (FindNarrowMerge) {
2123	// If the alignment requirements of the scaled wide load/store
2124	// instruction can't express the offset of the scaled narrow input,
2125	// bail and keep looking. For promotable zero stores, allow only when
2126	// the stored value is the same (i.e., WZR).
2127	if ((!IsUnscaled && alignTo(Num: MinOffset, PowOf2: `2`) != MinOffset) \|\|
2128	(IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
2129	LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2130	UsedRegUnits, TRI);
2131	MemInsns.push_back(Elt: &MI);
2132	continue;
2133	}
2134	} else {
2135	// Pairwise instructions have a 7-bit signed offset field. Single
2136	// insns have a 12-bit unsigned offset field. If the resultant
2137	// immediate offset of merging these instructions is out of range for
2138	// a pairwise instruction, bail and keep looking.
2139	if (!inBoundsForPair(IsUnscaled, Offset: MinOffset, OffsetStride)) {
2140	LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2141	UsedRegUnits, TRI);
2142	MemInsns.push_back(Elt: &MI);
2143	LLVM_DEBUG(dbgs() << "Offset doesn't fit in immediate, "
2144	<< "keep looking.\n");
2145	continue;
2146	}
2147	// If the alignment requirements of the paired (scaled) instruction
2148	// can't express the offset of the unscaled input, bail and keep
2149	// looking.
2150	if (IsUnscaled && (alignTo(Num: MinOffset, PowOf2: OffsetStride) != MinOffset)) {
2151	LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2152	UsedRegUnits, TRI);
2153	MemInsns.push_back(Elt: &MI);
2154	LLVM_DEBUG(dbgs()
2155	<< "Offset doesn't fit due to alignment requirements, "
2156	<< "keep looking.\n");
2157	continue;
2158	}
2159	}
2160
2161	// If the BaseReg has been modified, then we cannot do the optimization.
2162	// For example, in the following pattern
2163	// ldr x1 [x2]
2164	// ldr x2 [x3]
2165	// ldr x4 [x2, #8],
2166	// the first and third ldr cannot be converted to ldp x1, x4, [x2]
2167	if (!ModifiedRegUnits.available(Reg: BaseReg))
2168	return E;
2169
2170	const bool SameLoadReg = MayLoad && TRI->isSuperOrSubRegisterEq(
2171	RegA: Reg, RegB: getLdStRegOp(MI).getReg());
2172
2173	// If the Rt of the second instruction (destination register of the
2174	// load) was not modified or used between the two instructions and none
2175	// of the instructions between the second and first alias with the
2176	// second, we can combine the second into the first.
2177	bool RtNotModified =
2178	ModifiedRegUnits.available(Reg: getLdStRegOp(MI).getReg());
2179	bool RtNotUsed = !(MI.mayLoad() && !SameLoadReg &&
2180	!UsedRegUnits.available(Reg: getLdStRegOp(MI).getReg()));
2181
2182	LLVM_DEBUG(dbgs() << "Checking, can combine 2nd into 1st insn:\n"
2183	<< "Reg '" << getLdStRegOp(MI) << "' not modified: "
2184	<< (RtNotModified ? "true" : "false") << "\n"
2185	<< "Reg '" << getLdStRegOp(MI) << "' not used: "
2186	<< (RtNotUsed ? "true" : "false") << "\n");
2187
2188	if (RtNotModified && RtNotUsed && !mayAlias(MIa&: MI, MemInsns, AA)) {
2189	// For pairs loading into the same reg, try to find a renaming
2190	// opportunity to allow the renaming of Reg between FirstMI and MI
2191	// and combine MI into FirstMI; otherwise bail and keep looking.
2192	if (SameLoadReg) {
2193	std::optional<MCPhysReg> RenameReg =
2194	findRenameRegForSameLdStRegPair(MaybeCanRename, FirstMI, MI,
2195	Reg, DefinedInBB, UsedInBetween,
2196	RequiredClasses, TRI);
2197	if (!RenameReg) {
2198	LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2199	UsedRegUnits, TRI);
2200	MemInsns.push_back(Elt: &MI);
2201	LLVM_DEBUG(dbgs() << "Can't find reg for renaming, "
2202	<< "keep looking.\n");
2203	continue;
2204	}
2205	Flags.setRenameReg(*RenameReg);
2206	}
2207
2208	Flags.setMergeForward(false);
2209	if (!SameLoadReg)
2210	Flags.clearRenameReg();
2211	return MBBI;
2212	}
2213
2214	// Likewise, if the Rt of the first instruction is not modified or used
2215	// between the two instructions and none of the instructions between the
2216	// first and the second alias with the first, we can combine the first
2217	// into the second.
2218	RtNotModified = !(
2219	MayLoad && !UsedRegUnits.available(Reg: getLdStRegOp(MI&: FirstMI).getReg()));
2220
2221	LLVM_DEBUG(dbgs() << "Checking, can combine 1st into 2nd insn:\n"
2222	<< "Reg '" << getLdStRegOp(FirstMI)
2223	<< "' not modified: "
2224	<< (RtNotModified ? "true" : "false") << "\n");
2225
2226	if (RtNotModified && !mayAlias(MIa&: FirstMI, MemInsns, AA)) {
2227	if (ModifiedRegUnits.available(Reg: getLdStRegOp(MI&: FirstMI).getReg())) {
2228	Flags.setMergeForward(true);
2229	Flags.clearRenameReg();
2230	return MBBI;
2231	}
2232
2233	std::optional<MCPhysReg> RenameReg = findRenameRegForSameLdStRegPair(
2234	MaybeCanRename, FirstMI, MI, Reg, DefinedInBB, UsedInBetween,
2235	RequiredClasses, TRI);
2236	if (RenameReg) {
2237	Flags.setMergeForward(true);
2238	Flags.setRenameReg(*RenameReg);
2239	MBBIWithRenameReg = MBBI;
2240	}
2241	}
2242	LLVM_DEBUG(dbgs() << "Unable to combine these instructions due to "
2243	<< "interference in between, keep looking.\n");
2244	}
2245	}
2246
2247	if (Flags.getRenameReg())
2248	return MBBIWithRenameReg;
2249
2250	// If the instruction wasn't a matching load or store. Stop searching if we
2251	// encounter a call instruction that might modify memory.
2252	if (MI.isCall()) {
2253	LLVM_DEBUG(dbgs() << "Found a call, stop looking.\n");
2254	return E;
2255	}
2256
2257	// Update modified / uses register units.
2258	LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2259
2260	// Otherwise, if the base register is modified, we have no match, so
2261	// return early.
2262	if (!ModifiedRegUnits.available(Reg: BaseReg)) {
2263	LLVM_DEBUG(dbgs() << "Base reg is modified, stop looking.\n");
2264	return E;
2265	}
2266
2267	// Update list of instructions that read/write memory.
2268	if (MI.mayLoadOrStore())
2269	MemInsns.push_back(Elt: &MI);
2270	}
2271	return E;
2272	}
2273
2274	static MachineBasicBlock::iterator
2275	maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
2276	assert((MI.getOpcode() == AArch64::SUBXri \|\|
2277	MI.getOpcode() == AArch64::ADDXri) &&
2278	"Expected a register update instruction");
2279	auto End = MI.getParent()->end();
2280	if (MaybeCFI == End \|\|
2281	MaybeCFI ->getOpcode() != TargetOpcode::CFI_INSTRUCTION \|\|
2282	!(MI.getFlag(Flag: MachineInstr::FrameSetup) \|\|
2283	MI.getFlag(Flag: MachineInstr::FrameDestroy)) \|\|
2284	MI.getOperand(i: `0`).getReg() != AArch64::SP)
2285	return End;
2286
2287	const MachineFunction &MF = *MI.getParent()->getParent();
2288	unsigned CFIIndex = MaybeCFI ->getOperand(i: `0`).getCFIIndex();
2289	const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
2290	switch (CFI.getOperation()) {
2291	case MCCFIInstruction::OpDefCfa:
2292	case MCCFIInstruction::OpDefCfaOffset:
2293	return MaybeCFI;
2294	default:
2295	return End;
2296	}
2297	}
2298
2299	std::optional<MachineBasicBlock::iterator> AArch64LoadStoreOpt::mergeUpdateInsn(
2300	MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update,
2301	bool IsForward, bool IsPreIdx, bool MergeEither) {
2302	assert((Update->getOpcode() == AArch64::ADDXri \|\|
2303	Update->getOpcode() == AArch64::SUBXri) &&
2304	"Unexpected base register update instruction to merge!");
2305	MachineBasicBlock::iterator E = I ->getParent()->end();
2306	MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
2307
2308	// If updating the SP and the following instruction is CFA offset related CFI,
2309	// make sure the CFI follows the SP update either by merging at the location
2310	// of the update or by moving the CFI after the merged instruction. If unable
2311	// to do so, bail.
2312	MachineBasicBlock::iterator InsertPt = I;
2313	if (IsForward) {
2314	assert(IsPreIdx);
2315	if (auto CFI = maybeMoveCFI(MI&: *Update, MaybeCFI: next_nodbg(It: Update, End: E)); CFI != E) {
2316	if (MergeEither) {
2317	InsertPt = Update;
2318	} else {
2319	// Take care not to reorder CFIs.
2320	if (std::any_of(first: std::next(x: CFI), last: I, pred: [](const auto &Insn) {
2321	return Insn.getOpcode() == TargetOpcode::CFI_INSTRUCTION;
2322	}))
2323	return std::nullopt;
2324
2325	MachineBasicBlock *MBB = InsertPt ->getParent();
2326	MBB->splice(Where: std::next(x: InsertPt), Other: MBB, From: CFI);
2327	}
2328	}
2329	}
2330
2331	// Return the instruction following the merged instruction, which is
2332	// the instruction following our unmerged load. Unless that's the add/sub
2333	// instruction we're merging, in which case it's the one after that.
2334	if (NextI == Update)
2335	NextI = next_nodbg(It: NextI, End: E);
2336
2337	int Value = Update ->getOperand(i: `2`).getImm();
2338	assert(AArch64_AM::getShiftValue(Update->getOperand(`3`).getImm()) == `0` &&
2339	"Can't merge 1 << 12 offset into pre-/post-indexed load / store");
2340	if (Update ->getOpcode() == AArch64::SUBXri)
2341	Value = -Value;
2342
2343	unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(Opc: I ->getOpcode())
2344	: getPostIndexedOpcode(Opc: I ->getOpcode());
2345	MachineInstrBuilder MIB;
2346	int Scale, MinOffset, MaxOffset;
2347	getPrePostIndexedMemOpInfo(MI: *I, Scale, MinOffset, MaxOffset);
2348	if (!AArch64InstrInfo::isPairedLdSt(MI: *I)) {
2349	// Non-paired instruction.
2350	MIB = BuildMI(BB&: *InsertPt ->getParent(), I: InsertPt, MIMD: InsertPt ->getDebugLoc(),
2351	MCID: TII->get(Opcode: NewOpc))
2352	.add(MO: Update ->getOperand(i: `0`))
2353	.add(MO: getLdStRegOp(MI&: *I))
2354	.add(MO: AArch64InstrInfo::getLdStBaseOp(MI: *I))
2355	.addImm(Val: Value / Scale)
2356	.setMemRefs(I ->memoperands())
2357	.setMIFlags(I ->mergeFlagsWith(Other: *Update));
2358	} else {
2359	// Paired instruction.
2360	MIB = BuildMI(BB&: *InsertPt ->getParent(), I: InsertPt, MIMD: InsertPt ->getDebugLoc(),
2361	MCID: TII->get(Opcode: NewOpc))
2362	.add(MO: Update ->getOperand(i: `0`))
2363	.add(MO: getLdStRegOp(MI&: *I, PairedRegOp: `0`))
2364	.add(MO: getLdStRegOp(MI&: *I, PairedRegOp: `1`))
2365	.add(MO: AArch64InstrInfo::getLdStBaseOp(MI: *I))
2366	.addImm(Val: Value / Scale)
2367	.setMemRefs(I ->memoperands())
2368	.setMIFlags(I ->mergeFlagsWith(Other: *Update));
2369	}
2370
2371	if (IsPreIdx) {
2372	++NumPreFolded;
2373	LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
2374	} else {
2375	++NumPostFolded;
2376	LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
2377	}
2378	LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
2379	LLVM_DEBUG(I->print(dbgs()));
2380	LLVM_DEBUG(dbgs() << " ");
2381	LLVM_DEBUG(Update->print(dbgs()));
2382	LLVM_DEBUG(dbgs() << " with instruction:\n ");
2383	LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
2384	LLVM_DEBUG(dbgs() << "\n");
2385
2386	// Erase the old instructions for the block.
2387	I ->eraseFromParent();
2388	Update ->eraseFromParent();
2389
2390	return NextI;
2391	}
2392
2393	MachineBasicBlock::iterator
2394	AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
2395	MachineBasicBlock::iterator Update,
2396	unsigned Offset, int Scale) {
2397	assert((Update->getOpcode() == AArch64::MOVKWi) &&
2398	"Unexpected const mov instruction to merge!");
2399	MachineBasicBlock::iterator E = I ->getParent()->end();
2400	MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
2401	MachineBasicBlock::iterator PrevI = prev_nodbg(It: Update, Begin: E);
2402	MachineInstr &MemMI = *I;
2403	unsigned Mask = (`1` << `12`) * Scale - `1`;
2404	unsigned Low = Offset & Mask;
2405	unsigned High = Offset - Low;
2406	Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
2407	Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getReg();
2408	MachineInstrBuilder AddMIB, MemMIB;
2409
2410	// Add IndexReg, BaseReg, High (the BaseReg may be SP)
2411	AddMIB =
2412	BuildMI(BB&: *I ->getParent(), I, MIMD: I ->getDebugLoc(), MCID: TII->get(Opcode: AArch64::ADDXri))
2413	.addDef(RegNo: IndexReg)
2414	.addUse(RegNo: BaseReg)
2415	.addImm(Val: High >> `12`) // shifted value
2416	.addImm(Val: `12`); // shift 12
2417	(void)AddMIB;
2418	// Ld/St DestReg, IndexReg, Imm12
2419	unsigned NewOpc = getBaseAddressOpcode(Opc: I ->getOpcode());
2420	MemMIB = BuildMI(BB&: *I ->getParent(), I, MIMD: I ->getDebugLoc(), MCID: TII->get(Opcode: NewOpc))
2421	.add(MO: getLdStRegOp(MI&: MemMI))
2422	.add(MO: AArch64InstrInfo::getLdStOffsetOp(MI: MemMI))
2423	.addImm(Val: Low / Scale)
2424	.setMemRefs(I ->memoperands())
2425	.setMIFlags(I ->mergeFlagsWith(Other: *Update));
2426	(void)MemMIB;
2427
2428	++NumConstOffsetFolded;
2429	LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
2430	LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
2431	LLVM_DEBUG(PrevI->print(dbgs()));
2432	LLVM_DEBUG(dbgs() << " ");
2433	LLVM_DEBUG(Update->print(dbgs()));
2434	LLVM_DEBUG(dbgs() << " ");
2435	LLVM_DEBUG(I->print(dbgs()));
2436	LLVM_DEBUG(dbgs() << " with instruction:\n ");
2437	LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs()));
2438	LLVM_DEBUG(dbgs() << " ");
2439	LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs()));
2440	LLVM_DEBUG(dbgs() << "\n");
2441
2442	// Erase the old instructions for the block.
2443	I ->eraseFromParent();
2444	PrevI ->eraseFromParent();
2445	Update ->eraseFromParent();
2446
2447	return NextI;
2448	}
2449
2450	bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
2451	MachineInstr &MI,
2452	unsigned BaseReg, int Offset) {
2453	switch (MI.getOpcode()) {
2454	default:
2455	break;
2456	case AArch64::SUBXri:
2457	case AArch64::ADDXri:
2458	// Make sure it's a vanilla immediate operand, not a relocation or
2459	// anything else we can't handle.
2460	if (!MI.getOperand(i: `2`).isImm())
2461	break;
2462	// Watch out for 1 << 12 shifted value.
2463	if (AArch64_AM::getShiftValue(Imm: MI.getOperand(i: `3`).getImm()))
2464	break;
2465
2466	// The update instruction source and destination register must be the
2467	// same as the load/store base register.
2468	if (MI.getOperand(i: `0`).getReg() != BaseReg \|\|
2469	MI.getOperand(i: `1`).getReg() != BaseReg)
2470	break;
2471
2472	int UpdateOffset = MI.getOperand(i: `2`).getImm();
2473	if (MI.getOpcode() == AArch64::SUBXri)
2474	UpdateOffset = -UpdateOffset;
2475
2476	// The immediate must be a multiple of the scaling factor of the pre/post
2477	// indexed instruction.
2478	int Scale, MinOffset, MaxOffset;
2479	getPrePostIndexedMemOpInfo(MI: MemMI, Scale, MinOffset, MaxOffset);
2480	if (UpdateOffset % Scale != `0`)
2481	break;
2482
2483	// Scaled offset must fit in the instruction immediate.
2484	int ScaledOffset = UpdateOffset / Scale;
2485	if (ScaledOffset > MaxOffset \|\| ScaledOffset < MinOffset)
2486	break;
2487
2488	// If we have a non-zero Offset, we check that it matches the amount
2489	// we're adding to the register.
2490	if (!Offset \|\| Offset == UpdateOffset)
2491	return true;
2492	break;
2493	}
2494	return false;
2495	}
2496
2497	bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
2498	MachineInstr &MI,
2499	unsigned IndexReg,
2500	unsigned &Offset) {
2501	// The update instruction source and destination register must be the
2502	// same as the load/store index register.
2503	if (MI.getOpcode() == AArch64::MOVKWi &&
2504	TRI->isSuperOrSubRegisterEq(RegA: IndexReg, RegB: MI.getOperand(i: `1`).getReg())) {
2505
2506	// movz + movk hold a large offset of a Ld/St instruction.
2507	MachineBasicBlock::iterator B = MI.getParent()->begin();
2508	MachineBasicBlock::iterator MBBI = &MI;
2509	// Skip the scene when the MI is the first instruction of a block.
2510	if (MBBI == B)
2511	return false;
2512	MBBI = prev_nodbg(It: MBBI, Begin: B);
2513	MachineInstr &MovzMI = *MBBI;
2514	// Make sure the MOVKWi and MOVZWi set the same register.
2515	if (MovzMI.getOpcode() == AArch64::MOVZWi &&
2516	MovzMI.getOperand(i: `0`).getReg() == MI.getOperand(i: `0`).getReg()) {
2517	unsigned Low = MovzMI.getOperand(i: `1`).getImm();
2518	unsigned High = MI.getOperand(i: `2`).getImm() << MI.getOperand(i: `3`).getImm();
2519	Offset = High + Low;
2520	// 12-bit optionally shifted immediates are legal for adds.
2521	return Offset >> `24` == `0`;
2522	}
2523	}
2524	return false;
2525	}
2526
2527	MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
2528	MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
2529	MachineBasicBlock::iterator E = I ->getParent()->end();
2530	MachineInstr &MemMI = *I;
2531	MachineBasicBlock::iterator MBBI = I;
2532
2533	Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
2534	int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getImm() *
2535	TII->getMemScale(MI: MemMI);
2536
2537	// Scan forward looking for post-index opportunities. Updating instructions
2538	// can't be formed if the memory instruction doesn't have the offset we're
2539	// looking for.
2540	if (MIUnscaledOffset != UnscaledOffset)
2541	return E;
2542
2543	// If the base register overlaps a source/destination register, we can't
2544	// merge the update. This does not apply to tag store instructions which
2545	// ignore the address part of the source register.
2546	// This does not apply to STGPi as well, which does not have unpredictable
2547	// behavior in this case unlike normal stores, and always performs writeback
2548	// after reading the source register value.
2549	if (!isTagStore(MI: MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
2550	bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MI: MemMI);
2551	for (unsigned i = `0`, e = IsPairedInsn ? `2` : `1`; i != e; ++i) {
2552	Register DestReg = getLdStRegOp(MI&: MemMI, PairedRegOp: i).getReg();
2553	if (DestReg == BaseReg \|\| TRI->isSubRegister(RegA: BaseReg, RegB: DestReg))
2554	return E;
2555	}
2556	}
2557
2558	// Track which register units have been modified and used between the first
2559	// insn (inclusive) and the second insn.
2560	ModifiedRegUnits.clear();
2561	UsedRegUnits.clear();
2562	MBBI = next_nodbg(It: MBBI, End: E);
2563
2564	// We can't post-increment the stack pointer if any instruction between
2565	// the memory access (I) and the increment (MBBI) can access the memory
2566	// region defined by [SP, MBBI].
2567	const bool BaseRegSP = BaseReg == AArch64::SP;
2568	if (BaseRegSP && needsWinCFI(MF: I ->getMF())) {
2569	// FIXME: For now, we always block the optimization over SP in windows
2570	// targets as it requires to adjust the unwind/debug info, messing up
2571	// the unwind info can actually cause a miscompile.
2572	return E;
2573	}
2574
2575	unsigned Count = `0`;
2576	MachineBasicBlock *CurMBB = I ->getParent();
2577	// choice of next block to visit is liveins-based
2578	bool VisitSucc = CurMBB->getParent()->getRegInfo().tracksLiveness();
2579
2580	while (true) {
2581	for (MachineBasicBlock::iterator CurEnd = CurMBB->end();
2582	MBBI != CurEnd && Count < Limit; MBBI = next_nodbg(It: MBBI, End: CurEnd)) {
2583	MachineInstr &MI = *MBBI;
2584
2585	// Don't count transient instructions towards the search limit since there
2586	// may be different numbers of them if e.g. debug information is present.
2587	if (!MI.isTransient())
2588	++Count;
2589
2590	// If we found a match, return it.
2591	if (isMatchingUpdateInsn(MemMI&: *I, MI, BaseReg, Offset: UnscaledOffset))
2592	return MBBI;
2593
2594	// Update the status of what the instruction clobbered and used.
2595	LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
2596	TRI);
2597
2598	// Otherwise, if the base register is used or modified, we have no match,
2599	// so return early. If we are optimizing SP, do not allow instructions
2600	// that may load or store in between the load and the optimized value
2601	// update.
2602	if (!ModifiedRegUnits.available(Reg: BaseReg) \|\|
2603	!UsedRegUnits.available(Reg: BaseReg) \|\|
2604	(BaseRegSP && MBBI ->mayLoadOrStore()))
2605	return E;
2606	}
2607
2608	if (!VisitSucc \|\| Limit <= Count)
2609	break;
2610
2611	// Try to go downward to successors along a CF path w/o side enters
2612	// such that BaseReg is alive along it but not at its exits
2613	MachineBasicBlock SuccToVisit = nullptr*;
2614	unsigned LiveSuccCount = `0`;
2615	for (MachineBasicBlock *Succ : CurMBB->successors()) {
2616	for (MCRegAliasIterator AI(BaseReg, TRI, true); AI.isValid(); ++AI) {
2617	if (Succ->isLiveIn(Reg: *AI)) {
2618	if (LiveSuccCount++)
2619	return E;
2620	if (Succ->pred_size() == `1`)
2621	SuccToVisit = Succ;
2622	break;
2623	}
2624	}
2625	}
2626	if (!SuccToVisit)
2627	break;
2628	CurMBB = SuccToVisit;
2629	MBBI = CurMBB->begin();
2630	}
2631
2632	return E;
2633	}
2634
2635	MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
2636	MachineBasicBlock::iterator I, unsigned Limit, bool &MergeEither) {
2637	MachineBasicBlock::iterator B = I ->getParent()->begin();
2638	MachineBasicBlock::iterator E = I ->getParent()->end();
2639	MachineInstr &MemMI = *I;
2640	MachineBasicBlock::iterator MBBI = I;
2641	MachineFunction &MF = *MemMI.getMF();
2642
2643	Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
2644	int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getImm();
2645
2646	bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MI: MemMI);
2647	Register DestReg[] = {getLdStRegOp(MI&: MemMI, PairedRegOp: `0`).getReg(),
2648	IsPairedInsn ? getLdStRegOp(MI&: MemMI, PairedRegOp: `1`).getReg()
2649	: AArch64::NoRegister};
2650
2651	// If the load/store is the first instruction in the block, there's obviously
2652	// not any matching update. Ditto if the memory offset isn't zero.
2653	if (MBBI == B \|\| Offset != `0`)
2654	return E;
2655	// If the base register overlaps a destination register, we can't
2656	// merge the update.
2657	if (!isTagStore(MI: MemMI)) {
2658	for (unsigned i = `0`, e = IsPairedInsn ? `2` : `1`; i != e; ++i)
2659	if (DestReg[i] == BaseReg \|\| TRI->isSubRegister(RegA: BaseReg, RegB: DestReg[i]))
2660	return E;
2661	}
2662
2663	const bool BaseRegSP = BaseReg == AArch64::SP;
2664	if (BaseRegSP && needsWinCFI(MF: I ->getMF())) {
2665	// FIXME: For now, we always block the optimization over SP in windows
2666	// targets as it requires to adjust the unwind/debug info, messing up
2667	// the unwind info can actually cause a miscompile.
2668	return E;
2669	}
2670
2671	const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2672	unsigned RedZoneSize =
2673	Subtarget.getTargetLowering()->getRedZoneSize(F: MF.getFunction());
2674
2675	// Track which register units have been modified and used between the first
2676	// insn (inclusive) and the second insn.
2677	ModifiedRegUnits.clear();
2678	UsedRegUnits.clear();
2679	unsigned Count = `0`;
2680	bool MemAccessBeforeSPPreInc = false;
2681	MergeEither = true;
2682	do {
2683	MBBI = prev_nodbg(It: MBBI, Begin: B);
2684	MachineInstr &MI = *MBBI;
2685
2686	// Don't count transient instructions towards the search limit since there
2687	// may be different numbers of them if e.g. debug information is present.
2688	if (!MI.isTransient())
2689	++Count;
2690
2691	// If we found a match, return it.
2692	if (isMatchingUpdateInsn(MemMI&: *I, MI, BaseReg, Offset)) {
2693	// Check that the update value is within our red zone limit (which may be
2694	// zero).
2695	if (MemAccessBeforeSPPreInc && MBBI ->getOperand(i: `2`).getImm() > RedZoneSize)
2696	return E;
2697	return MBBI;
2698	}
2699
2700	// Update the status of what the instruction clobbered and used.
2701	LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2702
2703	// Otherwise, if the base register is used or modified, we have no match, so
2704	// return early.
2705	if (!ModifiedRegUnits.available(Reg: BaseReg) \|\|
2706	!UsedRegUnits.available(Reg: BaseReg))
2707	return E;
2708
2709	// If we have a destination register (i.e. a load instruction) and a
2710	// destination register is used or modified, then we can only merge forward,
2711	// i.e. the combined instruction is put in the place of the memory
2712	// instruction. Same applies if we see a memory access or side effects.
2713	if (MI.mayLoadOrStore() \|\| MI.hasUnmodeledSideEffects() \|\|
2714	(DestReg[`0`] != AArch64::NoRegister &&
2715	!(ModifiedRegUnits.available(Reg: DestReg[`0`]) &&
2716	UsedRegUnits.available(Reg: DestReg[`0`]))) \|\|
2717	(DestReg[`1`] != AArch64::NoRegister &&
2718	!(ModifiedRegUnits.available(Reg: DestReg[`1`]) &&
2719	UsedRegUnits.available(Reg: DestReg[`1`]))))
2720	MergeEither = false;
2721
2722	// Keep track if we have a memory access before an SP pre-increment, in this
2723	// case we need to validate later that the update amount respects the red
2724	// zone.
2725	if (BaseRegSP && MBBI ->mayLoadOrStore())
2726	MemAccessBeforeSPPreInc = true;
2727	} while (MBBI != B && Count < Limit);
2728	return E;
2729	}
2730
2731	MachineBasicBlock::iterator
2732	AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
2733	MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
2734	MachineBasicBlock::iterator B = I ->getParent()->begin();
2735	MachineBasicBlock::iterator E = I ->getParent()->end();
2736	MachineInstr &MemMI = *I;
2737	MachineBasicBlock::iterator MBBI = I;
2738
2739	// If the load is the first instruction in the block, there's obviously
2740	// not any matching load or store.
2741	if (MBBI == B)
2742	return E;
2743
2744	// Make sure the IndexReg is killed and the shift amount is zero.
2745	// TODO: Relex this restriction to extend, simplify processing now.
2746	if (!AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).isKill() \|\|
2747	!AArch64InstrInfo::getLdStAmountOp(MI: MemMI).isImm() \|\|
2748	(AArch64InstrInfo::getLdStAmountOp(MI: MemMI).getImm() != `0`))
2749	return E;
2750
2751	Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getReg();
2752
2753	// Track which register units have been modified and used between the first
2754	// insn (inclusive) and the second insn.
2755	ModifiedRegUnits.clear();
2756	UsedRegUnits.clear();
2757	unsigned Count = `0`;
2758	do {
2759	MBBI = prev_nodbg(It: MBBI, Begin: B);
2760	MachineInstr &MI = *MBBI;
2761
2762	// Don't count transient instructions towards the search limit since there
2763	// may be different numbers of them if e.g. debug information is present.
2764	if (!MI.isTransient())
2765	++Count;
2766
2767	// If we found a match, return it.
2768	if (isMatchingMovConstInsn(MemMI&: *I, MI, IndexReg, Offset)) {
2769	return MBBI;
2770	}
2771
2772	// Update the status of what the instruction clobbered and used.
2773	LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2774
2775	// Otherwise, if the index register is used or modified, we have no match,
2776	// so return early.
2777	if (!ModifiedRegUnits.available(Reg: IndexReg) \|\|
2778	!UsedRegUnits.available(Reg: IndexReg))
2779	return E;
2780
2781	} while (MBBI != B && Count < Limit);
2782	return E;
2783	}
2784
2785	bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
2786	MachineBasicBlock::iterator &MBBI) {
2787	MachineInstr &MI = *MBBI;
2788	// If this is a volatile load, don't mess with it.
2789	if (MI.hasOrderedMemoryRef())
2790	return false;
2791
2792	if (needsWinCFI(MF: MI.getMF()) && MI.getFlag(Flag: MachineInstr::FrameDestroy))
2793	return false;
2794
2795	// Make sure this is a reg+imm.
2796	// FIXME: It is possible to extend it to handle reg+reg cases.
2797	if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
2798	return false;
2799
2800	// Look backward up to LdStLimit instructions.
2801	MachineBasicBlock::iterator StoreI;
2802	if (findMatchingStore(I: MBBI, Limit: LdStLimit, StoreI)) {
2803	++NumLoadsFromStoresPromoted;
2804	// Promote the load. Keeping the iterator straight is a
2805	// pain, so we let the merge routine tell us what the next instruction
2806	// is after it's done mucking about.
2807	MBBI = promoteLoadFromStore(LoadI: MBBI, StoreI);
2808	return true;
2809	}
2810	return false;
2811	}
2812
2813	// Merge adjacent zero stores into a wider store.
2814	bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
2815	MachineBasicBlock::iterator &MBBI) {
2816	assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
2817	MachineInstr &MI = *MBBI;
2818	MachineBasicBlock::iterator E = MI.getParent()->end();
2819
2820	if (!TII->isCandidateToMergeOrPair(MI))
2821	return false;
2822
2823	// Look ahead up to LdStLimit instructions for a mergeable instruction.
2824	LdStPairFlags Flags;
2825	MachineBasicBlock::iterator MergeMI =
2826	findMatchingInsn(I: MBBI, Flags, Limit: LdStLimit, / FindNarrowMerge = / true);
2827	if (MergeMI != E) {
2828	++NumZeroStoresPromoted;
2829
2830	// Keeping the iterator straight is a pain, so we let the merge routine tell
2831	// us what the next instruction is after it's done mucking about.
2832	MBBI = mergeNarrowZeroStores(I: MBBI, MergeMI, Flags);
2833	return true;
2834	}
2835	return false;
2836	}
2837
2838	// Find loads and stores that can be merged into a single load or store pair
2839	// instruction.
2840	bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
2841	MachineInstr &MI = *MBBI;
2842	MachineBasicBlock::iterator E = MI.getParent()->end();
2843
2844	if (!TII->isCandidateToMergeOrPair(MI))
2845	return false;
2846
2847	// If disable-ldp feature is opted, do not emit ldp.
2848	if (MI.mayLoad() && Subtarget->hasDisableLdp())
2849	return false;
2850
2851	// If disable-stp feature is opted, do not emit stp.
2852	if (MI.mayStore() && Subtarget->hasDisableStp())
2853	return false;
2854
2855	// Early exit if the offset is not possible to match. (6 bits of positive
2856	// range, plus allow an extra one in case we find a later insn that matches
2857	// with Offset-1)
2858	bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
2859	int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
2860	int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : `1`;
2861	// Allow one more for offset.
2862	if (Offset > `0`)
2863	Offset -= OffsetStride;
2864	if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
2865	return false;
2866
2867	// Look ahead up to LdStLimit instructions for a pairable instruction.
2868	LdStPairFlags Flags;
2869	MachineBasicBlock::iterator Paired =
2870	findMatchingInsn(I: MBBI, Flags, Limit: LdStLimit, / FindNarrowMerge = / false);
2871	if (Paired != E) {
2872	// Keeping the iterator straight is a pain, so we let the merge routine tell
2873	// us what the next instruction is after it's done mucking about.
2874	auto Prev = std::prev(x: MBBI);
2875
2876	// Fetch the memoperand of the load/store that is a candidate for
2877	// combination.
2878	MachineMemOperand *MemOp =
2879	MI.memoperands_empty() ? nullptr : MI.memoperands().front();
2880
2881	// If a load/store arrives and ldp/stp-aligned-only feature is opted, check
2882	// that the alignment of the source pointer is at least double the alignment
2883	// of the type.
2884	if ((MI.mayLoad() && Subtarget->hasLdpAlignedOnly()) \|\|
2885	(MI.mayStore() && Subtarget->hasStpAlignedOnly())) {
2886	// If there is no size/align information, cancel the transformation.
2887	if (!MemOp \|\| !MemOp->getMemoryType().isValid()) {
2888	NumFailedAlignmentCheck ++;
2889	return false;
2890	}
2891
2892	// Get the needed alignments to check them if
2893	// ldp-aligned-only/stp-aligned-only features are opted.
2894	uint64_t MemAlignment = MemOp->getAlign().value();
2895	uint64_t TypeAlignment =
2896	Align (MemOp->getSize().getValue().getKnownMinValue()).value();
2897
2898	if (MemAlignment < `2` * TypeAlignment) {
2899	NumFailedAlignmentCheck ++;
2900	return false;
2901	}
2902	}
2903
2904	++NumPairCreated;
2905	if (TII->hasUnscaledLdStOffset(MI))
2906	++NumUnscaledPairCreated;
2907
2908	MBBI = mergePairedInsns(I: MBBI, Paired, Flags);
2909	// Collect liveness info for instructions between Prev and the new position
2910	// MBBI.
2911	for (auto I = std::next(x: Prev); I != MBBI; I ++)
2912	updateDefinedRegisters(MI&: *I, Units&: DefinedInBB, TRI);
2913
2914	return true;
2915	}
2916	return false;
2917	}
2918
2919	bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
2920	(MachineBasicBlock::iterator &MBBI) {
2921	MachineInstr &MI = *MBBI;
2922	MachineBasicBlock::iterator E = MI.getParent()->end();
2923	MachineBasicBlock::iterator Update;
2924
2925	// Look forward to try to form a post-index instruction. For example,
2926	// ldr x0, [x20]
2927	// add x20, x20, #32
2928	// merged into:
2929	// ldr x0, [x20], #32
2930	Update = findMatchingUpdateInsnForward(I: MBBI, UnscaledOffset: `0`, Limit: UpdateLimit);
2931	if (Update != E) {
2932	// Merge the update into the ld/st.
2933	if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /IsForward=/false,
2934	/IsPreIdx=/false,
2935	/MergeEither=/false)) {
2936	MBBI = *NextI;
2937	return true;
2938	}
2939	}
2940
2941	// Don't know how to handle unscaled pre/post-index versions below, so bail.
2942	if (TII->hasUnscaledLdStOffset(Opc: MI.getOpcode()))
2943	return false;
2944
2945	// Look back to try to find a pre-index instruction. For example,
2946	// add x0, x0, #8
2947	// ldr x1, [x0]
2948	// merged into:
2949	// ldr x1, [x0, #8]!
2950	bool MergeEither;
2951	Update = findMatchingUpdateInsnBackward(I: MBBI, Limit: UpdateLimit, MergeEither);
2952	if (Update != E) {
2953	// Merge the update into the ld/st.
2954	if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /IsForward=/true,
2955	/IsPreIdx=/true, MergeEither)) {
2956	MBBI = *NextI;
2957	return true;
2958	}
2959	}
2960
2961	// The immediate in the load/store is scaled by the size of the memory
2962	// operation. The immediate in the add we're looking for,
2963	// however, is not, so adjust here.
2964	int UnscaledOffset =
2965	AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
2966
2967	// Look forward to try to find a pre-index instruction. For example,
2968	// ldr x1, [x0, #64]
2969	// add x0, x0, #64
2970	// merged into:
2971	// ldr x1, [x0, #64]!
2972	Update = findMatchingUpdateInsnForward(I: MBBI, UnscaledOffset, Limit: UpdateLimit);
2973	if (Update != E) {
2974	// Merge the update into the ld/st.
2975	if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /IsForward=/false,
2976	/IsPreIdx=/true,
2977	/MergeEither=/false)) {
2978	MBBI = *NextI;
2979	return true;
2980	}
2981	}
2982
2983	return false;
2984	}
2985
2986	bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
2987	int Scale) {
2988	MachineInstr &MI = *MBBI;
2989	MachineBasicBlock::iterator E = MI.getParent()->end();
2990	MachineBasicBlock::iterator Update;
2991
2992	// Don't know how to handle unscaled pre/post-index versions below, so bail.
2993	if (TII->hasUnscaledLdStOffset(Opc: MI.getOpcode()))
2994	return false;
2995
2996	// Look back to try to find a const offset for index LdSt instruction. For
2997	// example,
2998	// mov x8, #LargeImm ; = a (1<<12) + imm12*
2999	// ldr x1, [x0, x8]
3000	// merged into:
3001	// add x8, x0, a (1<<12)*
3002	// ldr x1, [x8, imm12]
3003	unsigned Offset;
3004	Update = findMatchingConstOffsetBackward(I: MBBI, Limit: LdStConstLimit, Offset);
3005	if (Update != E && (Offset & (Scale - `1`)) == `0`) {
3006	// Merge the imm12 into the ld/st.
3007	MBBI = mergeConstOffsetInsn(I: MBBI, Update, Offset, Scale);
3008	return true;
3009	}
3010
3011	return false;
3012	}
3013
3014	bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
3015	bool EnableNarrowZeroStOpt) {
3016	AArch64FunctionInfo &AFI = *MBB.getParent()->getInfo<AArch64FunctionInfo>();
3017
3018	bool Modified = false;
3019	// Four transformations to do here:
3020	// 1) Find loads that directly read from stores and promote them by
3021	// replacing with mov instructions. If the store is wider than the load,
3022	// the load will be replaced with a bitfield extract.
3023	// e.g.,
3024	// str w1, [x0, #4]
3025	// ldrh w2, [x0, #6]
3026	// ; becomes
3027	// str w1, [x0, #4]
3028	// lsr w2, w1, #16
3029	for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
3030	MBBI != E;) {
3031	if (isPromotableLoadFromStore(MI&: *MBBI) && tryToPromoteLoadFromStore(MBBI))
3032	Modified = true;
3033	else
3034	++MBBI;
3035	}
3036	// 2) Merge adjacent zero stores into a wider store.
3037	// e.g.,
3038	// strh wzr, [x0]
3039	// strh wzr, [x0, #2]
3040	// ; becomes
3041	// str wzr, [x0]
3042	// e.g.,
3043	// str wzr, [x0]
3044	// str wzr, [x0, #4]
3045	// ; becomes
3046	// str xzr, [x0]
3047	if (EnableNarrowZeroStOpt)
3048	for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
3049	MBBI != E;) {
3050	if (isPromotableZeroStoreInst(MI&: *MBBI) && tryToMergeZeroStInst(MBBI))
3051	Modified = true;
3052	else
3053	++MBBI;
3054	}
3055	// 3) Find loads and stores that can be merged into a single load or store
3056	// pair instruction.
3057	// When compiling for SVE 128, also try to combine SVE fill/spill
3058	// instructions into LDP/STP.
3059	// e.g.,
3060	// ldr x0, [x2]
3061	// ldr x1, [x2, #8]
3062	// ; becomes
3063	// ldp x0, x1, [x2]
3064	// e.g.,
3065	// ldr z0, [x2]
3066	// ldr z1, [x2, #1, mul vl]
3067	// ; becomes
3068	// ldp q0, q1, [x2]
3069
3070	if (MBB.getParent()->getRegInfo().tracksLiveness()) {
3071	DefinedInBB.clear();
3072	DefinedInBB.addLiveIns(MBB);
3073	}
3074
3075	for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
3076	MBBI != E;) {
3077	// Track currently live registers up to this point, to help with
3078	// searching for a rename register on demand.
3079	updateDefinedRegisters(MI&: *MBBI, Units&: DefinedInBB, TRI);
3080	if (TII->isPairableLdStInst(MI: *MBBI) && tryToPairLdStInst(MBBI))
3081	Modified = true;
3082	else
3083	++MBBI;
3084	}
3085	// 4) Find base register updates that can be merged into the load or store
3086	// as a base-reg writeback.
3087	// e.g.,
3088	// ldr x0, [x2]
3089	// add x2, x2, #4
3090	// ; becomes
3091	// ldr x0, [x2], #4
3092	for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
3093	MBBI != E;) {
3094	if (isMergeableLdStUpdate(MI&: *MBBI, AFI) && tryToMergeLdStUpdate(MBBI))
3095	Modified = true;
3096	else
3097	++MBBI;
3098	}
3099
3100	// 5) Find a register assigned with a const value that can be combined with
3101	// into the load or store. e.g.,
3102	// mov x8, #LargeImm ; = a (1<<12) + imm12*
3103	// ldr x1, [x0, x8]
3104	// ; becomes
3105	// add x8, x0, a (1<<12)*
3106	// ldr x1, [x8, imm12]
3107	for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
3108	MBBI != E;) {
3109	int Scale;
3110	if (isMergeableIndexLdSt(MI&: *MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale))
3111	Modified = true;
3112	else
3113	++MBBI;
3114	}
3115
3116	return Modified;
3117	}
3118
3119	bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
3120	Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
3121	TII = Subtarget->getInstrInfo();
3122	TRI = Subtarget->getRegisterInfo();
3123
3124	// Resize the modified and used register unit trackers. We do this once
3125	// per function and then clear the register units each time we optimize a load
3126	// or store.
3127	ModifiedRegUnits.init(TRI: *TRI);
3128	UsedRegUnits.init(TRI: *TRI);
3129	DefinedInBB.init(TRI: *TRI);
3130
3131	bool Modified = false;
3132	bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
3133	for (auto &MBB : Fn) {
3134	auto M = optimizeBlock(MBB, EnableNarrowZeroStOpt: enableNarrowZeroStOpt);
3135	Modified \|= M;
3136	}
3137
3138	return Modified;
3139	}
3140
3141	// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
3142	// stores near one another? Note: The pre-RA instruction scheduler already has
3143	// hooks to try and schedule pairable loads/stores together to improve pairing
3144	// opportunities. Thus, pre-RA pairing pass may not be worth the effort.
3145
3146	// FIXME: When pairing store instructions it's very possible for this pass to
3147	// hoist a store with a KILL marker above another use (without a KILL marker).
3148	// The resulting IR is invalid, but nothing uses the KILL markers after this
3149	// pass, so it's never caused a problem in practice.
3150
3151	bool AArch64LoadStoreOptLegacy::runOnMachineFunction(MachineFunction &MF) {
3152	if (skipFunction(F: MF.getFunction()))
3153	return false;
3154	AArch64LoadStoreOpt Impl;
3155	Impl.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
3156	return Impl.runOnMachineFunction(Fn&: MF);
3157	}
3158
3159	/// createAArch64LoadStoreOptimizationPass - returns an instance of the
3160	/// load / store optimization pass.
3161	FunctionPass *llvm::createAArch64LoadStoreOptLegacyPass() {
3162	return new AArch64LoadStoreOptLegacy ();
3163	}
3164
3165	PreservedAnalyses
3166	AArch64LoadStoreOptPass::run(MachineFunction &MF,
3167	MachineFunctionAnalysisManager &MFAM) {
3168	AArch64LoadStoreOpt Impl;
3169	Impl.AA = &MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(IR&: MF)
3170	.getManager()
3171	.getResult<AAManager>(IR&: MF.getFunction());
3172	bool Changed = Impl.runOnMachineFunction(Fn&: MF);
3173	if (!Changed)
3174	return PreservedAnalyses::all();
3175	PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
3176	PA.preserveSet<CFGAnalyses>();
3177	return PA;
3178	}
3179

Browse the source code of llvm_projects/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp