1//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains a pass that performs load / store related peephole
10// optimizations. This pass should be run after register allocation.
11//
// The pass runs after the PrologEpilogInserter where we emit the CFI
// instructions. In order to preserve the correctness of the unwind
// information, the pass must either not change the relative order of any two
// instructions where one of them has the FrameSetup/FrameDestroy flag, or
// apply an ad-hoc fix to the unwind information.
17//
18//===----------------------------------------------------------------------===//
19
20#include "AArch64InstrInfo.h"
21#include "AArch64MachineFunctionInfo.h"
22#include "AArch64Subtarget.h"
23#include "MCTargetDesc/AArch64AddressingModes.h"
24#include "llvm/ADT/SmallVector.h"
25#include "llvm/ADT/Statistic.h"
26#include "llvm/ADT/StringRef.h"
27#include "llvm/ADT/iterator_range.h"
28#include "llvm/Analysis/AliasAnalysis.h"
29#include "llvm/CodeGen/MachineBasicBlock.h"
30#include "llvm/CodeGen/MachineFunction.h"
31#include "llvm/CodeGen/MachineFunctionPass.h"
32#include "llvm/CodeGen/MachineInstr.h"
33#include "llvm/CodeGen/MachineInstrBuilder.h"
34#include "llvm/CodeGen/MachineOperand.h"
35#include "llvm/CodeGen/MachineRegisterInfo.h"
36#include "llvm/CodeGen/TargetRegisterInfo.h"
37#include "llvm/IR/DebugLoc.h"
38#include "llvm/MC/MCAsmInfo.h"
39#include "llvm/MC/MCDwarf.h"
40#include "llvm/Pass.h"
41#include "llvm/Support/CommandLine.h"
42#include "llvm/Support/Debug.h"
43#include "llvm/Support/DebugCounter.h"
44#include "llvm/Support/ErrorHandling.h"
45#include "llvm/Support/raw_ostream.h"
46#include <cassert>
47#include <cstdint>
48#include <functional>
49#include <iterator>
50#include <limits>
51#include <optional>
52
53using namespace llvm;
54
55#define DEBUG_TYPE "aarch64-ldst-opt"
56
57STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
58STATISTIC(NumPostFolded, "Number of post-index updates folded");
59STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store pairs generated from unscaled load/stores");
62STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
63STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
STATISTIC(NumFailedAlignmentCheck,
          "Number of load/store pair transformations that failed the "
          "alignment check");
66STATISTIC(NumConstOffsetFolded,
67 "Number of const offset of index address folded");
68
69DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
70 "Controls which pairs are considered for renaming");
71
// The LdStLimit limits how far we search for load/store pairs.
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

// The UpdateLimit limits how far we search for update instructions when we
// form pre-/post-index instructions.
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                     cl::Hidden);

// The LdStConstLimit limits how far we search for const offset instructions
// when we form index address load/store instructions.
static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
                                        cl::init(10), cl::Hidden);

// Enable register renaming to find additional store pairing opportunities.
static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
                                    cl::init(true), cl::Hidden);
89
90#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
91
92namespace {
93
94using LdStPairFlags = struct LdStPairFlags {
95 // If a matching instruction is found, MergeForward is set to true if the
96 // merge is to remove the first instruction and replace the second with
97 // a pair-wise insn, and false if the reverse is true.
98 bool MergeForward = false;
99
100 // SExtIdx gives the index of the result of the load pair that must be
101 // extended. The value of SExtIdx assumes that the paired load produces the
102 // value in this order: (I, returned iterator), i.e., -1 means no value has
103 // to be extended, 0 means I, and 1 means the returned iterator.
104 int SExtIdx = -1;
105
106 // If not none, RenameReg can be used to rename the result register of the
107 // first store in a pair. Currently this only works when merging stores
108 // forward.
109 std::optional<MCPhysReg> RenameReg;
110
111 LdStPairFlags() = default;
112
113 void setMergeForward(bool V = true) { MergeForward = V; }
114 bool getMergeForward() const { return MergeForward; }
115
116 void setSExtIdx(int V) { SExtIdx = V; }
117 int getSExtIdx() const { return SExtIdx; }
118
119 void setRenameReg(MCPhysReg R) { RenameReg = R; }
120 void clearRenameReg() { RenameReg = std::nullopt; }
121 std::optional<MCPhysReg> getRenameReg() const { return RenameReg; }
122};
123
124struct AArch64LoadStoreOpt : public MachineFunctionPass {
125 static char ID;
126
127 AArch64LoadStoreOpt() : MachineFunctionPass(ID) {}
128
129 AliasAnalysis *AA;
130 const AArch64InstrInfo *TII;
131 const TargetRegisterInfo *TRI;
132 const AArch64Subtarget *Subtarget;
133
134 // Track which register units have been modified and used.
135 LiveRegUnits ModifiedRegUnits, UsedRegUnits;
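  // Track register units that have been defined so far within the basic block
  // currently being optimized.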
136 LiveRegUnits DefinedInBB;
137
138 void getAnalysisUsage(AnalysisUsage &AU) const override {
139 AU.addRequired<AAResultsWrapperPass>();
140 MachineFunctionPass::getAnalysisUsage(AU);
141 }
142
143 // Scan the instructions looking for a load/store that can be combined
144 // with the current instruction into a load/store pair.
145 // Return the matching instruction if one is found, else MBB->end().
146 MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
147 LdStPairFlags &Flags,
148 unsigned Limit,
149 bool FindNarrowMerge);
150
151 // Scan the instructions looking for a store that writes to the address from
152 // which the current load instruction reads. Return true if one is found.
153 bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
154 MachineBasicBlock::iterator &StoreI);
155
  // Merge the two narrow zero stores indicated into a single wider store
  // instruction.
157 MachineBasicBlock::iterator
158 mergeNarrowZeroStores(MachineBasicBlock::iterator I,
159 MachineBasicBlock::iterator MergeMI,
160 const LdStPairFlags &Flags);
161
162 // Merge the two instructions indicated into a single pair-wise instruction.
163 MachineBasicBlock::iterator
164 mergePairedInsns(MachineBasicBlock::iterator I,
165 MachineBasicBlock::iterator Paired,
166 const LdStPairFlags &Flags);
167
168 // Promote the load that reads directly from the address stored to.
169 MachineBasicBlock::iterator
170 promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
171 MachineBasicBlock::iterator StoreI);
172
173 // Scan the instruction list to find a base register update that can
174 // be combined with the current instruction (a load or store) using
175 // pre or post indexed addressing with writeback. Scan forwards.
176 MachineBasicBlock::iterator
177 findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
178 int UnscaledOffset, unsigned Limit);
179
180 // Scan the instruction list to find a register assigned with a const
181 // value that can be combined with the current instruction (a load or store)
182 // using base addressing with writeback. Scan backwards.
183 MachineBasicBlock::iterator
184 findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
185 unsigned &Offset);
186
187 // Scan the instruction list to find a base register update that can
188 // be combined with the current instruction (a load or store) using
189 // pre or post indexed addressing with writeback. Scan backwards.
190 // `MergeEither` is set to true if the combined instruction may be placed
191 // either at the location of the load/store instruction or at the location of
192 // the update instruction.
193 MachineBasicBlock::iterator
194 findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit,
195 bool &MergeEither);
196
197 // Find an instruction that updates the base register of the ld/st
198 // instruction.
199 bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
200 unsigned BaseReg, int Offset);
201
202 bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
203 unsigned IndexReg, unsigned &Offset);
204
205 // Merge a pre- or post-index base register update into a ld/st instruction.
206 std::optional<MachineBasicBlock::iterator>
207 mergeUpdateInsn(MachineBasicBlock::iterator I,
208 MachineBasicBlock::iterator Update, bool IsForward,
209 bool IsPreIdx, bool MergeEither);
210
211 MachineBasicBlock::iterator
212 mergeConstOffsetInsn(MachineBasicBlock::iterator I,
213 MachineBasicBlock::iterator Update, unsigned Offset,
214 int Scale);
215
216 // Find and merge zero store instructions.
217 bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
218
219 // Find and pair ldr/str instructions.
220 bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
221
222 // Find and promote load instructions which read directly from store.
223 bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
224
  // Find and merge base register updates before or after a ld/st instruction.
226 bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
227
228 // Find and merge an index ldr/st instruction into a base ld/st instruction.
229 bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);
230
231 bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
232
233 bool runOnMachineFunction(MachineFunction &Fn) override;
234
235 MachineFunctionProperties getRequiredProperties() const override {
236 return MachineFunctionProperties().setNoVRegs();
237 }
238
239 StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
240};
241
242char AArch64LoadStoreOpt::ID = 0;
243
244} // end anonymous namespace
245
246INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
247 AARCH64_LOAD_STORE_OPT_NAME, false, false)
248
249static bool isNarrowStore(unsigned Opc) {
250 switch (Opc) {
251 default:
252 return false;
253 case AArch64::STRBBui:
254 case AArch64::STURBBi:
255 case AArch64::STRHHui:
256 case AArch64::STURHHi:
257 return true;
258 }
259}
260
// These instructions set the memory tag and either keep the memory contents
// unchanged or set them to zero, ignoring the address part of the source
// register.
263static bool isTagStore(const MachineInstr &MI) {
264 switch (MI.getOpcode()) {
265 default:
266 return false;
267 case AArch64::STGi:
268 case AArch64::STZGi:
269 case AArch64::ST2Gi:
270 case AArch64::STZ2Gi:
271 return true;
272 }
273}
274
275static unsigned getMatchingNonSExtOpcode(unsigned Opc,
276 bool *IsValidLdStrOpc = nullptr) {
277 if (IsValidLdStrOpc)
278 *IsValidLdStrOpc = true;
279 switch (Opc) {
280 default:
281 if (IsValidLdStrOpc)
282 *IsValidLdStrOpc = false;
283 return std::numeric_limits<unsigned>::max();
284 case AArch64::STRDui:
285 case AArch64::STURDi:
286 case AArch64::STRDpre:
287 case AArch64::STRQui:
288 case AArch64::STURQi:
289 case AArch64::STRQpre:
290 case AArch64::STRBBui:
291 case AArch64::STURBBi:
292 case AArch64::STRHHui:
293 case AArch64::STURHHi:
294 case AArch64::STRWui:
295 case AArch64::STRWpre:
296 case AArch64::STURWi:
297 case AArch64::STRXui:
298 case AArch64::STRXpre:
299 case AArch64::STURXi:
300 case AArch64::STR_ZXI:
301 case AArch64::LDRDui:
302 case AArch64::LDURDi:
303 case AArch64::LDRDpre:
304 case AArch64::LDRQui:
305 case AArch64::LDURQi:
306 case AArch64::LDRQpre:
307 case AArch64::LDRWui:
308 case AArch64::LDURWi:
309 case AArch64::LDRWpre:
310 case AArch64::LDRXui:
311 case AArch64::LDURXi:
312 case AArch64::LDRXpre:
313 case AArch64::STRSui:
314 case AArch64::STURSi:
315 case AArch64::STRSpre:
316 case AArch64::LDRSui:
317 case AArch64::LDURSi:
318 case AArch64::LDRSpre:
319 case AArch64::LDR_ZXI:
320 return Opc;
321 case AArch64::LDRSWui:
322 return AArch64::LDRWui;
323 case AArch64::LDURSWi:
324 return AArch64::LDURWi;
325 case AArch64::LDRSWpre:
326 return AArch64::LDRWpre;
327 }
328}
329
330static unsigned getMatchingWideOpcode(unsigned Opc) {
331 switch (Opc) {
332 default:
333 llvm_unreachable("Opcode has no wide equivalent!");
334 case AArch64::STRBBui:
335 return AArch64::STRHHui;
336 case AArch64::STRHHui:
337 return AArch64::STRWui;
338 case AArch64::STURBBi:
339 return AArch64::STURHHi;
340 case AArch64::STURHHi:
341 return AArch64::STURWi;
342 case AArch64::STURWi:
343 return AArch64::STURXi;
344 case AArch64::STRWui:
345 return AArch64::STRXui;
346 }
347}
348
349static unsigned getMatchingPairOpcode(unsigned Opc) {
350 switch (Opc) {
351 default:
352 llvm_unreachable("Opcode has no pairwise equivalent!");
353 case AArch64::STRSui:
354 case AArch64::STURSi:
355 return AArch64::STPSi;
356 case AArch64::STRSpre:
357 return AArch64::STPSpre;
358 case AArch64::STRDui:
359 case AArch64::STURDi:
360 return AArch64::STPDi;
361 case AArch64::STRDpre:
362 return AArch64::STPDpre;
363 case AArch64::STRQui:
364 case AArch64::STURQi:
365 case AArch64::STR_ZXI:
366 return AArch64::STPQi;
367 case AArch64::STRQpre:
368 return AArch64::STPQpre;
369 case AArch64::STRWui:
370 case AArch64::STURWi:
371 return AArch64::STPWi;
372 case AArch64::STRWpre:
373 return AArch64::STPWpre;
374 case AArch64::STRXui:
375 case AArch64::STURXi:
376 return AArch64::STPXi;
377 case AArch64::STRXpre:
378 return AArch64::STPXpre;
379 case AArch64::LDRSui:
380 case AArch64::LDURSi:
381 return AArch64::LDPSi;
382 case AArch64::LDRSpre:
383 return AArch64::LDPSpre;
384 case AArch64::LDRDui:
385 case AArch64::LDURDi:
386 return AArch64::LDPDi;
387 case AArch64::LDRDpre:
388 return AArch64::LDPDpre;
389 case AArch64::LDRQui:
390 case AArch64::LDURQi:
391 case AArch64::LDR_ZXI:
392 return AArch64::LDPQi;
393 case AArch64::LDRQpre:
394 return AArch64::LDPQpre;
395 case AArch64::LDRWui:
396 case AArch64::LDURWi:
397 return AArch64::LDPWi;
398 case AArch64::LDRWpre:
399 return AArch64::LDPWpre;
400 case AArch64::LDRXui:
401 case AArch64::LDURXi:
402 return AArch64::LDPXi;
403 case AArch64::LDRXpre:
404 return AArch64::LDPXpre;
405 case AArch64::LDRSWui:
406 case AArch64::LDURSWi:
407 return AArch64::LDPSWi;
408 case AArch64::LDRSWpre:
409 return AArch64::LDPSWpre;
410 }
411}
412
413static unsigned isMatchingStore(MachineInstr &LoadInst,
414 MachineInstr &StoreInst) {
415 unsigned LdOpc = LoadInst.getOpcode();
416 unsigned StOpc = StoreInst.getOpcode();
417 switch (LdOpc) {
418 default:
419 llvm_unreachable("Unsupported load instruction!");
420 case AArch64::LDRBBui:
421 return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
422 StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
423 case AArch64::LDURBBi:
424 return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
425 StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
426 case AArch64::LDRHHui:
427 return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
428 StOpc == AArch64::STRXui;
429 case AArch64::LDURHHi:
430 return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
431 StOpc == AArch64::STURXi;
432 case AArch64::LDRWui:
433 return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
434 case AArch64::LDURWi:
435 return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
436 case AArch64::LDRXui:
437 return StOpc == AArch64::STRXui;
438 case AArch64::LDURXi:
439 return StOpc == AArch64::STURXi;
440 }
441}
442
443static unsigned getPreIndexedOpcode(unsigned Opc) {
444 // FIXME: We don't currently support creating pre-indexed loads/stores when
445 // the load or store is the unscaled version. If we decide to perform such an
446 // optimization in the future the cases for the unscaled loads/stores will
447 // need to be added here.
448 switch (Opc) {
449 default:
450 llvm_unreachable("Opcode has no pre-indexed equivalent!");
451 case AArch64::STRSui:
452 return AArch64::STRSpre;
453 case AArch64::STRDui:
454 return AArch64::STRDpre;
455 case AArch64::STRQui:
456 return AArch64::STRQpre;
457 case AArch64::STRBBui:
458 return AArch64::STRBBpre;
459 case AArch64::STRHHui:
460 return AArch64::STRHHpre;
461 case AArch64::STRWui:
462 return AArch64::STRWpre;
463 case AArch64::STRXui:
464 return AArch64::STRXpre;
465 case AArch64::LDRSui:
466 return AArch64::LDRSpre;
467 case AArch64::LDRDui:
468 return AArch64::LDRDpre;
469 case AArch64::LDRQui:
470 return AArch64::LDRQpre;
471 case AArch64::LDRBBui:
472 return AArch64::LDRBBpre;
473 case AArch64::LDRHHui:
474 return AArch64::LDRHHpre;
475 case AArch64::LDRWui:
476 return AArch64::LDRWpre;
477 case AArch64::LDRXui:
478 return AArch64::LDRXpre;
479 case AArch64::LDRSWui:
480 return AArch64::LDRSWpre;
481 case AArch64::LDPSi:
482 return AArch64::LDPSpre;
483 case AArch64::LDPSWi:
484 return AArch64::LDPSWpre;
485 case AArch64::LDPDi:
486 return AArch64::LDPDpre;
487 case AArch64::LDPQi:
488 return AArch64::LDPQpre;
489 case AArch64::LDPWi:
490 return AArch64::LDPWpre;
491 case AArch64::LDPXi:
492 return AArch64::LDPXpre;
493 case AArch64::STPSi:
494 return AArch64::STPSpre;
495 case AArch64::STPDi:
496 return AArch64::STPDpre;
497 case AArch64::STPQi:
498 return AArch64::STPQpre;
499 case AArch64::STPWi:
500 return AArch64::STPWpre;
501 case AArch64::STPXi:
502 return AArch64::STPXpre;
503 case AArch64::STGi:
504 return AArch64::STGPreIndex;
505 case AArch64::STZGi:
506 return AArch64::STZGPreIndex;
507 case AArch64::ST2Gi:
508 return AArch64::ST2GPreIndex;
509 case AArch64::STZ2Gi:
510 return AArch64::STZ2GPreIndex;
511 case AArch64::STGPi:
512 return AArch64::STGPpre;
513 }
514}
515
516static unsigned getBaseAddressOpcode(unsigned Opc) {
517 // TODO: Add more index address stores.
518 switch (Opc) {
519 default:
520 llvm_unreachable("Opcode has no base address equivalent!");
521 case AArch64::LDRBroX:
522 return AArch64::LDRBui;
523 case AArch64::LDRBBroX:
524 return AArch64::LDRBBui;
525 case AArch64::LDRSBXroX:
526 return AArch64::LDRSBXui;
527 case AArch64::LDRSBWroX:
528 return AArch64::LDRSBWui;
529 case AArch64::LDRHroX:
530 return AArch64::LDRHui;
531 case AArch64::LDRHHroX:
532 return AArch64::LDRHHui;
533 case AArch64::LDRSHXroX:
534 return AArch64::LDRSHXui;
535 case AArch64::LDRSHWroX:
536 return AArch64::LDRSHWui;
537 case AArch64::LDRWroX:
538 return AArch64::LDRWui;
539 case AArch64::LDRSroX:
540 return AArch64::LDRSui;
541 case AArch64::LDRSWroX:
542 return AArch64::LDRSWui;
543 case AArch64::LDRDroX:
544 return AArch64::LDRDui;
545 case AArch64::LDRXroX:
546 return AArch64::LDRXui;
547 case AArch64::LDRQroX:
548 return AArch64::LDRQui;
549 }
550}
551
552static unsigned getPostIndexedOpcode(unsigned Opc) {
553 switch (Opc) {
554 default:
    llvm_unreachable("Opcode has no post-indexed equivalent!");
556 case AArch64::STRSui:
557 case AArch64::STURSi:
558 return AArch64::STRSpost;
559 case AArch64::STRDui:
560 case AArch64::STURDi:
561 return AArch64::STRDpost;
562 case AArch64::STRQui:
563 case AArch64::STURQi:
564 return AArch64::STRQpost;
565 case AArch64::STRBBui:
566 return AArch64::STRBBpost;
567 case AArch64::STRHHui:
568 return AArch64::STRHHpost;
569 case AArch64::STRWui:
570 case AArch64::STURWi:
571 return AArch64::STRWpost;
572 case AArch64::STRXui:
573 case AArch64::STURXi:
574 return AArch64::STRXpost;
575 case AArch64::LDRSui:
576 case AArch64::LDURSi:
577 return AArch64::LDRSpost;
578 case AArch64::LDRDui:
579 case AArch64::LDURDi:
580 return AArch64::LDRDpost;
581 case AArch64::LDRQui:
582 case AArch64::LDURQi:
583 return AArch64::LDRQpost;
584 case AArch64::LDRBBui:
585 return AArch64::LDRBBpost;
586 case AArch64::LDRHHui:
587 return AArch64::LDRHHpost;
588 case AArch64::LDRWui:
589 case AArch64::LDURWi:
590 return AArch64::LDRWpost;
591 case AArch64::LDRXui:
592 case AArch64::LDURXi:
593 return AArch64::LDRXpost;
594 case AArch64::LDRSWui:
595 return AArch64::LDRSWpost;
596 case AArch64::LDPSi:
597 return AArch64::LDPSpost;
598 case AArch64::LDPSWi:
599 return AArch64::LDPSWpost;
600 case AArch64::LDPDi:
601 return AArch64::LDPDpost;
602 case AArch64::LDPQi:
603 return AArch64::LDPQpost;
604 case AArch64::LDPWi:
605 return AArch64::LDPWpost;
606 case AArch64::LDPXi:
607 return AArch64::LDPXpost;
608 case AArch64::STPSi:
609 return AArch64::STPSpost;
610 case AArch64::STPDi:
611 return AArch64::STPDpost;
612 case AArch64::STPQi:
613 return AArch64::STPQpost;
614 case AArch64::STPWi:
615 return AArch64::STPWpost;
616 case AArch64::STPXi:
617 return AArch64::STPXpost;
618 case AArch64::STGi:
619 return AArch64::STGPostIndex;
620 case AArch64::STZGi:
621 return AArch64::STZGPostIndex;
622 case AArch64::ST2Gi:
623 return AArch64::ST2GPostIndex;
624 case AArch64::STZ2Gi:
625 return AArch64::STZ2GPostIndex;
626 case AArch64::STGPi:
627 return AArch64::STGPpost;
628 }
629}
630
631static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
632
633 unsigned OpcA = FirstMI.getOpcode();
634 unsigned OpcB = MI.getOpcode();
635
636 switch (OpcA) {
637 default:
638 return false;
639 case AArch64::STRSpre:
640 return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
641 case AArch64::STRDpre:
642 return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
643 case AArch64::STRQpre:
644 return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
645 case AArch64::STRWpre:
646 return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
647 case AArch64::STRXpre:
648 return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
649 case AArch64::LDRSpre:
650 return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
651 case AArch64::LDRDpre:
652 return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
653 case AArch64::LDRQpre:
654 return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
655 case AArch64::LDRWpre:
656 return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
657 case AArch64::LDRXpre:
658 return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
659 case AArch64::LDRSWpre:
660 return (OpcB == AArch64::LDRSWui) || (OpcB == AArch64::LDURSWi);
661 }
662}
663
664// Returns the scale and offset range of pre/post indexed variants of MI.
665static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
666 int &MinOffset, int &MaxOffset) {
667 bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
668 bool IsTagStore = isTagStore(MI);
669 // ST*G and all paired ldst have the same scale in pre/post-indexed variants
670 // as in the "unsigned offset" variant.
671 // All other pre/post indexed ldst instructions are unscaled.
672 Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
673
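  // Paired instructions use a 7-bit signed immediate offset field; all other
  // pre/post-indexed instructions use a 9-bit signed immediate field.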
674 if (IsPaired) {
675 MinOffset = -64;
676 MaxOffset = 63;
677 } else {
678 MinOffset = -256;
679 MaxOffset = 255;
680 }
681}
682
683static MachineOperand &getLdStRegOp(MachineInstr &MI,
684 unsigned PairedRegOp = 0) {
685 assert(PairedRegOp < 2 && "Unexpected register operand idx.");
686 bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
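  // Pre-indexed ld/st instructions define the written-back base register as
  // their first operand, so the transfer register operands start one position
  // later.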
687 if (IsPreLdSt)
688 PairedRegOp += 1;
689 unsigned Idx =
690 AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
691 return MI.getOperand(i: Idx);
692}
693
694static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
695 MachineInstr &StoreInst,
696 const AArch64InstrInfo *TII) {
697 assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
698 int LoadSize = TII->getMemScale(MI: LoadInst);
699 int StoreSize = TII->getMemScale(MI: StoreInst);
700 int UnscaledStOffset =
701 TII->hasUnscaledLdStOffset(MI&: StoreInst)
702 ? AArch64InstrInfo::getLdStOffsetOp(MI: StoreInst).getImm()
703 : AArch64InstrInfo::getLdStOffsetOp(MI: StoreInst).getImm() * StoreSize;
704 int UnscaledLdOffset =
705 TII->hasUnscaledLdStOffset(MI&: LoadInst)
706 ? AArch64InstrInfo::getLdStOffsetOp(MI: LoadInst).getImm()
707 : AArch64InstrInfo::getLdStOffsetOp(MI: LoadInst).getImm() * LoadSize;
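  // The load must read only bytes that the store writes.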
708 return (UnscaledStOffset <= UnscaledLdOffset) &&
709 (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
710}
711
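// A promotable zero store is a 32-bit or narrower store of WZR; two such
// stores can later be merged into a single wider store of zero.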
712static bool isPromotableZeroStoreInst(MachineInstr &MI) {
713 unsigned Opc = MI.getOpcode();
714 return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
715 isNarrowStore(Opc)) &&
716 getLdStRegOp(MI).getReg() == AArch64::WZR;
717}
718
719static bool isPromotableLoadFromStore(MachineInstr &MI) {
720 switch (MI.getOpcode()) {
721 default:
722 return false;
723 // Scaled instructions.
724 case AArch64::LDRBBui:
725 case AArch64::LDRHHui:
726 case AArch64::LDRWui:
727 case AArch64::LDRXui:
728 // Unscaled instructions.
729 case AArch64::LDURBBi:
730 case AArch64::LDURHHi:
731 case AArch64::LDURWi:
732 case AArch64::LDURXi:
733 return true;
734 }
735}
736
737static bool isMergeableLdStUpdate(MachineInstr &MI, AArch64FunctionInfo &AFI) {
738 unsigned Opc = MI.getOpcode();
739 switch (Opc) {
740 default:
741 return false;
742 // Scaled instructions.
743 case AArch64::STRSui:
744 case AArch64::STRDui:
745 case AArch64::STRQui:
746 case AArch64::STRXui:
747 case AArch64::STRWui:
748 case AArch64::STRHHui:
749 case AArch64::STRBBui:
750 case AArch64::LDRSui:
751 case AArch64::LDRDui:
752 case AArch64::LDRQui:
753 case AArch64::LDRXui:
754 case AArch64::LDRWui:
755 case AArch64::LDRHHui:
756 case AArch64::LDRBBui:
757 case AArch64::STGi:
758 case AArch64::STZGi:
759 case AArch64::ST2Gi:
760 case AArch64::STZ2Gi:
761 case AArch64::STGPi:
762 // Unscaled instructions.
763 case AArch64::STURSi:
764 case AArch64::STURDi:
765 case AArch64::STURQi:
766 case AArch64::STURWi:
767 case AArch64::STURXi:
768 case AArch64::LDURSi:
769 case AArch64::LDURDi:
770 case AArch64::LDURQi:
771 case AArch64::LDURWi:
772 case AArch64::LDURXi:
773 // Paired instructions.
774 case AArch64::LDPSi:
775 case AArch64::LDPSWi:
776 case AArch64::LDPDi:
777 case AArch64::LDPQi:
778 case AArch64::LDPWi:
779 case AArch64::LDPXi:
780 case AArch64::STPSi:
781 case AArch64::STPDi:
782 case AArch64::STPQi:
783 case AArch64::STPWi:
784 case AArch64::STPXi:
785 // Make sure this is a reg+imm (as opposed to an address reloc).
786 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
787 return false;
788
789 // When using stack tagging, simple sp+imm loads and stores are not
790 // tag-checked, but pre- and post-indexed versions of them are, so we can't
791 // replace the former with the latter. This transformation would be valid
792 // if the load/store accesses an untagged stack slot, but we don't have
793 // that information available after frame indices have been eliminated.
794 if (AFI.isMTETagged() &&
795 AArch64InstrInfo::getLdStBaseOp(MI).getReg() == AArch64::SP)
796 return false;
797
798 return true;
799 }
800}
801
802// Make sure this is a reg+reg Ld/St
803static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
804 unsigned Opc = MI.getOpcode();
805 switch (Opc) {
806 default:
807 return false;
808 // Scaled instructions.
809 // TODO: Add more index address stores.
810 case AArch64::LDRBroX:
811 case AArch64::LDRBBroX:
812 case AArch64::LDRSBXroX:
813 case AArch64::LDRSBWroX:
814 Scale = 1;
815 return true;
816 case AArch64::LDRHroX:
817 case AArch64::LDRHHroX:
818 case AArch64::LDRSHXroX:
819 case AArch64::LDRSHWroX:
820 Scale = 2;
821 return true;
822 case AArch64::LDRWroX:
823 case AArch64::LDRSroX:
824 case AArch64::LDRSWroX:
825 Scale = 4;
826 return true;
827 case AArch64::LDRDroX:
828 case AArch64::LDRXroX:
829 Scale = 8;
830 return true;
831 case AArch64::LDRQroX:
832 Scale = 16;
833 return true;
834 }
835}
836
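// Opcodes whose implicit operands (e.g. the implicit def of the 64-bit
// super-register by a 32-bit ORR/ADD) may safely be rewritten when renaming
// registers.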
837static bool isRewritableImplicitDef(unsigned Opc) {
838 switch (Opc) {
839 default:
840 return false;
841 case AArch64::ORRWrs:
842 case AArch64::ADDWri:
843 return true;
844 }
845}
846
847MachineBasicBlock::iterator
848AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
849 MachineBasicBlock::iterator MergeMI,
850 const LdStPairFlags &Flags) {
851 assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
852 "Expected promotable zero stores.");
853
854 MachineBasicBlock::iterator E = I->getParent()->end();
855 MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
856 // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way, the merge will invalidate the iterator,
858 // and we don't need to scan the new instruction, as it's a pairwise
859 // instruction, which we're not considering for further action anyway.
860 if (NextI == MergeMI)
861 NextI = next_nodbg(It: NextI, End: E);
862
863 unsigned Opc = I->getOpcode();
864 unsigned MergeMIOpc = MergeMI->getOpcode();
865 bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
866 bool IsMergedMIScaled = !TII->hasUnscaledLdStOffset(Opc: MergeMIOpc);
867 int OffsetStride = IsScaled ? TII->getMemScale(MI: *I) : 1;
868 int MergeMIOffsetStride = IsMergedMIScaled ? TII->getMemScale(MI: *MergeMI) : 1;
869
870 bool MergeForward = Flags.getMergeForward();
871 // Insert our new paired instruction after whichever of the paired
872 // instructions MergeForward indicates.
873 MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
  // MergeForward also determines which instruction we copy the base register
  // operand from, so that its flags (e.g. kill) stay compatible with the input
  // code.
876 const MachineOperand &BaseRegOp =
877 MergeForward ? AArch64InstrInfo::getLdStBaseOp(MI: *MergeMI)
878 : AArch64InstrInfo::getLdStBaseOp(MI: *I);
879
  // The merged wide store covers both narrow stores, so compute both offsets
  // in bytes and use the lower of the two.
881 int64_t IOffsetInBytes =
882 AArch64InstrInfo::getLdStOffsetOp(MI: *I).getImm() * OffsetStride;
883 int64_t MIOffsetInBytes =
884 AArch64InstrInfo::getLdStOffsetOp(MI: *MergeMI).getImm() *
885 MergeMIOffsetStride;
886 // Select final offset based on the offset order.
887 int64_t OffsetImm;
888 if (IOffsetInBytes > MIOffsetInBytes)
889 OffsetImm = MIOffsetInBytes;
890 else
891 OffsetImm = IOffsetInBytes;
892
893 int NewOpcode = getMatchingWideOpcode(Opc);
894 // Adjust final offset on scaled stores because the new instruction
895 // has a different scale.
896 if (!TII->hasUnscaledLdStOffset(Opc: NewOpcode)) {
897 int NewOffsetStride = TII->getMemScale(Opc: NewOpcode);
898 assert(((OffsetImm % NewOffsetStride) == 0) &&
899 "Offset should be a multiple of the store memory scale");
900 OffsetImm = OffsetImm / NewOffsetStride;
901 }
902
903 // Construct the new instruction.
904 DebugLoc DL = I->getDebugLoc();
905 MachineBasicBlock *MBB = I->getParent();
906 MachineInstrBuilder MIB;
907 MIB = BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: NewOpcode))
908 .addReg(RegNo: isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
909 .add(MO: BaseRegOp)
910 .addImm(Val: OffsetImm)
911 .cloneMergedMemRefs(OtherMIs: {&*I, &*MergeMI})
912 .setMIFlags(I->mergeFlagsWith(Other: *MergeMI));
913 (void)MIB;
914
915 LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
916 LLVM_DEBUG(I->print(dbgs()));
917 LLVM_DEBUG(dbgs() << " ");
918 LLVM_DEBUG(MergeMI->print(dbgs()));
919 LLVM_DEBUG(dbgs() << " with instruction:\n ");
920 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
921 LLVM_DEBUG(dbgs() << "\n");
922
923 // Erase the old instructions.
924 I->eraseFromParent();
925 MergeMI->eraseFromParent();
926 return NextI;
927}
928
// Apply Fn to all instructions between MI and the beginning of the block, until
// a def for DefReg is reached. Returns true iff Fn returns true for all
// visited instructions. Stops after visiting Limit instructions.
932static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
933 const TargetRegisterInfo *TRI, unsigned Limit,
934 std::function<bool(MachineInstr &, bool)> &Fn) {
935 auto MBB = MI.getParent();
936 for (MachineInstr &I :
937 instructionsWithoutDebug(It: MI.getReverseIterator(), End: MBB->instr_rend())) {
938 if (!Limit)
939 return false;
940 --Limit;
941
942 bool isDef = any_of(Range: I.operands(), P: [DefReg, TRI](MachineOperand &MOP) {
943 return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
944 TRI->regsOverlap(RegA: MOP.getReg(), RegB: DefReg);
945 });
946 if (!Fn(I, isDef))
947 return false;
948 if (isDef)
949 break;
950 }
951 return true;
952}
953
954static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
955 const TargetRegisterInfo *TRI) {
956
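  // Remove the units of registers killed by MI first, then add the units of
  // all of MI's other register operands (defs and non-killed uses).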
957 for (const MachineOperand &MOP : phys_regs_and_masks(MI))
958 if (MOP.isReg() && MOP.isKill())
959 Units.removeReg(Reg: MOP.getReg());
960
961 for (const MachineOperand &MOP : phys_regs_and_masks(MI))
962 if (MOP.isReg() && !MOP.isKill())
963 Units.addReg(Reg: MOP.getReg());
964}
965
/// This function will add a new entry into the debugValueSubstitutions table
/// when two instructions have been merged into a new one represented by \p
/// MergedInstr.
969static void addDebugSubstitutionsToTable(MachineFunction *MF,
970 unsigned InstrNumToSet,
971 MachineInstr &OriginalInstr,
972 MachineInstr &MergedInstr) {
973
974 // Figure out the Operand Index of the destination register of the
975 // OriginalInstr in the new MergedInstr.
976 auto Reg = OriginalInstr.getOperand(i: 0).getReg();
977 unsigned OperandNo = 0;
978 bool RegFound = false;
979 for (const auto Op : MergedInstr.operands()) {
980 if (Op.getReg() == Reg) {
981 RegFound = true;
982 break;
983 }
984 OperandNo++;
985 }
986
987 if (RegFound)
988 MF->makeDebugValueSubstitution({OriginalInstr.peekDebugInstrNum(), 0},
989 {InstrNumToSet, OperandNo});
990}
991
992MachineBasicBlock::iterator
993AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
994 MachineBasicBlock::iterator Paired,
995 const LdStPairFlags &Flags) {
996 MachineBasicBlock::iterator E = I->getParent()->end();
997 MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
998 // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way, the merge will invalidate the iterator,
1000 // and we don't need to scan the new instruction, as it's a pairwise
1001 // instruction, which we're not considering for further action anyway.
1002 if (NextI == Paired)
1003 NextI = next_nodbg(It: NextI, End: E);
1004
1005 int SExtIdx = Flags.getSExtIdx();
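  // When one of the two instructions is a sign-extending load (SExtIdx != -1),
  // form the pair using the non-extending opcode and re-create the sign
  // extension with the SBFMXri emitted further below.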
1006 unsigned Opc =
1007 SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(Opc: I->getOpcode());
1008 bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
1009 int OffsetStride = IsUnscaled ? TII->getMemScale(MI: *I) : 1;
1010
1011 bool MergeForward = Flags.getMergeForward();
1012
1013 std::optional<MCPhysReg> RenameReg = Flags.getRenameReg();
1014 if (RenameReg) {
1015 MCRegister RegToRename = getLdStRegOp(MI&: *I).getReg();
1016 DefinedInBB.addReg(Reg: *RenameReg);
1017
1018 // Return the sub/super register for RenameReg, matching the size of
1019 // OriginalReg.
1020 auto GetMatchingSubReg =
1021 [this, RenameReg](const TargetRegisterClass *C) -> MCPhysReg {
1022 for (MCPhysReg SubOrSuper :
1023 TRI->sub_and_superregs_inclusive(Reg: *RenameReg)) {
1024 if (C->contains(Reg: SubOrSuper))
1025 return SubOrSuper;
1026 }
1027 llvm_unreachable("Should have found matching sub or super register!");
1028 };
1029
1030 std::function<bool(MachineInstr &, bool)> UpdateMIs =
1031 [this, RegToRename, GetMatchingSubReg, MergeForward](MachineInstr &MI,
1032 bool IsDef) {
1033 if (IsDef) {
1034 bool SeenDef = false;
1035 for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
1036 MachineOperand &MOP = MI.getOperand(i: OpIdx);
1037 // Rename the first explicit definition and all implicit
1038 // definitions matching RegToRename.
1039 if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1040 (!MergeForward || !SeenDef ||
1041 (MOP.isDef() && MOP.isImplicit())) &&
1042 TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename)) {
1043 assert((MOP.isImplicit() ||
1044 (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
1045 "Need renamable operands");
1046 Register MatchingReg;
1047 if (const TargetRegisterClass *RC =
1048 MI.getRegClassConstraint(OpIdx, TII, TRI))
1049 MatchingReg = GetMatchingSubReg(RC);
1050 else {
1051 if (!isRewritableImplicitDef(Opc: MI.getOpcode()))
1052 continue;
1053 MatchingReg = GetMatchingSubReg(
1054 TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1055 }
1056 MOP.setReg(MatchingReg);
1057 SeenDef = true;
1058 }
1059 }
1060 } else {
1061 for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
1062 MachineOperand &MOP = MI.getOperand(i: OpIdx);
1063 if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1064 TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename)) {
1065 assert((MOP.isImplicit() ||
1066 (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
1067 "Need renamable operands");
1068 Register MatchingReg;
1069 if (const TargetRegisterClass *RC =
1070 MI.getRegClassConstraint(OpIdx, TII, TRI))
1071 MatchingReg = GetMatchingSubReg(RC);
1072 else
1073 MatchingReg = GetMatchingSubReg(
1074 TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1075 assert(MatchingReg != AArch64::NoRegister &&
1076 "Cannot find matching regs for renaming");
1077 MOP.setReg(MatchingReg);
1078 }
1079 }
1080 }
1081 LLVM_DEBUG(dbgs() << "Renamed " << MI);
1082 return true;
1083 };
1084 forAllMIsUntilDef(MI&: MergeForward ? *I : *Paired->getPrevNode(), DefReg: RegToRename,
1085 TRI, UINT32_MAX, Fn&: UpdateMIs);
1086
1087#if !defined(NDEBUG)
1088 // For forward merging store:
1089 // Make sure the register used for renaming is not used between the
1090 // paired instructions. That would trash the content before the new
1091 // paired instruction.
1092 MCPhysReg RegToCheck = *RenameReg;
1093 // For backward merging load:
1094 // Make sure the register being renamed is not used between the
1095 // paired instructions. That would trash the content after the new
1096 // paired instruction.
1097 if (!MergeForward)
1098 RegToCheck = RegToRename;
1099 for (auto &MI :
1100 iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
1101 MergeForward ? std::next(I) : I,
1102 MergeForward ? std::next(Paired) : Paired))
1103 assert(all_of(MI.operands(),
1104 [this, RegToCheck](const MachineOperand &MOP) {
1105 return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1106 MOP.isUndef() ||
1107 !TRI->regsOverlap(MOP.getReg(), RegToCheck);
1108 }) &&
1109 "Rename register used between paired instruction, trashing the "
1110 "content");
1111#endif
1112 }
1113
1114 // Insert our new paired instruction after whichever of the paired
1115 // instructions MergeForward indicates.
1116 MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // MergeForward also determines which instruction we copy the base register
  // operand from, so that its flags (e.g. kill) stay compatible with the input
  // code.
1119 const MachineOperand &BaseRegOp =
1120 MergeForward ? AArch64InstrInfo::getLdStBaseOp(MI: *Paired)
1121 : AArch64InstrInfo::getLdStBaseOp(MI: *I);
1122
1123 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: *I).getImm();
1124 int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(MI: *Paired).getImm();
1125 bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Opc: Paired->getOpcode());
1126 if (IsUnscaled != PairedIsUnscaled) {
1127 // We're trying to pair instructions that differ in how they are scaled. If
1128 // I is scaled then scale the offset of Paired accordingly. Otherwise, do
1129 // the opposite (i.e., make Paired's offset unscaled).
1130 int MemSize = TII->getMemScale(MI: *Paired);
1131 if (PairedIsUnscaled) {
1132 // If the unscaled offset isn't a multiple of the MemSize, we can't
1133 // pair the operations together.
1134 assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
1135 "Offset should be a multiple of the stride!");
1136 PairedOffset /= MemSize;
1137 } else {
1138 PairedOffset *= MemSize;
1139 }
1140 }
1141
1142 // Which register is Rt and which is Rt2 depends on the offset order.
1143 // However, for pre load/stores the Rt should be the one of the pre
1144 // load/store.
1145 MachineInstr *RtMI, *Rt2MI;
1146 if (Offset == PairedOffset + OffsetStride &&
1147 !AArch64InstrInfo::isPreLdSt(MI: *I)) {
1148 RtMI = &*Paired;
1149 Rt2MI = &*I;
1150 // Here we swapped the assumption made for SExtIdx.
1151 // I.e., we turn ldp I, Paired into ldp Paired, I.
1152 // Update the index accordingly.
1153 if (SExtIdx != -1)
1154 SExtIdx = (SExtIdx + 1) % 2;
1155 } else {
1156 RtMI = &*I;
1157 Rt2MI = &*Paired;
1158 }
1159 int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(MI: *RtMI).getImm();
1160 // Scale the immediate offset, if necessary.
1161 if (TII->hasUnscaledLdStOffset(Opc: RtMI->getOpcode())) {
1162 assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
1163 "Unscaled offset cannot be scaled.");
1164 OffsetImm /= TII->getMemScale(MI: *RtMI);
1165 }
1166
1167 // Construct the new instruction.
1168 MachineInstrBuilder MIB;
1169 DebugLoc DL = I->getDebugLoc();
1170 MachineBasicBlock *MBB = I->getParent();
1171 MachineOperand RegOp0 = getLdStRegOp(MI&: *RtMI);
1172 MachineOperand RegOp1 = getLdStRegOp(MI&: *Rt2MI);
1173 MachineOperand &PairedRegOp = RtMI == &*Paired ? RegOp0 : RegOp1;
1174 // Kill flags may become invalid when moving stores for pairing.
1175 if (RegOp0.isUse()) {
1176 if (!MergeForward) {
1177 // Clear kill flags on store if moving upwards. Example:
1178 // STRWui kill %w0, ...
1179 // USE %w1
1180 // STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards
1181 // We are about to move the store of w1, so its kill flag may become
1182 // invalid; not the case for w0.
1183 // Since w1 is used between the stores, the kill flag on w1 is cleared
1184 // after merging.
1185 // STPWi kill %w0, %w1, ...
1186 // USE %w1
1187 for (auto It = std::next(x: I); It != Paired && PairedRegOp.isKill(); ++It)
1188 if (It->readsRegister(Reg: PairedRegOp.getReg(), TRI))
1189 PairedRegOp.setIsKill(false);
1190 } else {
1191 // Clear kill flags of the first stores register. Example:
1192 // STRWui %w1, ...
1193 // USE kill %w1 ; need to clear kill flag when moving STRWui downwards
1194 // STRW %w0
1195 Register Reg = getLdStRegOp(MI&: *I).getReg();
1196 for (MachineInstr &MI : make_range(x: std::next(x: I), y: Paired))
1197 MI.clearRegisterKills(Reg, RegInfo: TRI);
1198 }
1199 }
1200
1201 unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);
1202 MIB = BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: MatchPairOpcode));
1203
1204 // Adds the pre-index operand for pre-indexed ld/st pairs.
1205 if (AArch64InstrInfo::isPreLdSt(MI: *RtMI))
1206 MIB.addReg(RegNo: BaseRegOp.getReg(), flags: RegState::Define);
1207
1208 MIB.add(MO: RegOp0)
1209 .add(MO: RegOp1)
1210 .add(MO: BaseRegOp)
1211 .addImm(Val: OffsetImm)
1212 .cloneMergedMemRefs(OtherMIs: {&*I, &*Paired})
1213 .setMIFlags(I->mergeFlagsWith(Other: *Paired));
1214
1215 (void)MIB;
1216
1217 LLVM_DEBUG(
1218 dbgs() << "Creating pair load/store. Replacing instructions:\n ");
1219 LLVM_DEBUG(I->print(dbgs()));
1220 LLVM_DEBUG(dbgs() << " ");
1221 LLVM_DEBUG(Paired->print(dbgs()));
1222 LLVM_DEBUG(dbgs() << " with instruction:\n ");
1223 if (SExtIdx != -1) {
1224 // Generate the sign extension for the proper result of the ldp.
1225 // I.e., with X1, that would be:
1226 // %w1 = KILL %w1, implicit-def %x1
1227 // %x1 = SBFMXri killed %x1, 0, 31
1228 MachineOperand &DstMO = MIB->getOperand(i: SExtIdx);
1229 // Right now, DstMO has the extended register, since it comes from an
1230 // extended opcode.
1231 Register DstRegX = DstMO.getReg();
1232 // Get the W variant of that register.
1233 Register DstRegW = TRI->getSubReg(Reg: DstRegX, Idx: AArch64::sub_32);
1234 // Update the result of LDP to use the W instead of the X variant.
1235 DstMO.setReg(DstRegW);
1236 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1237 LLVM_DEBUG(dbgs() << "\n");
1238 // Make the machine verifier happy by providing a definition for
1239 // the X register.
1240 // Insert this definition right after the generated LDP, i.e., before
1241 // InsertionPoint.
1242 MachineInstrBuilder MIBKill =
1243 BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::KILL), DestReg: DstRegW)
1244 .addReg(RegNo: DstRegW)
1245 .addReg(RegNo: DstRegX, flags: RegState::Define);
1246 MIBKill->getOperand(i: 2).setImplicit();
1247 // Create the sign extension.
1248 MachineInstrBuilder MIBSXTW =
1249 BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: AArch64::SBFMXri), DestReg: DstRegX)
1250 .addReg(RegNo: DstRegX)
1251 .addImm(Val: 0)
1252 .addImm(Val: 31);
1253 (void)MIBSXTW;
1254
1255 // In the case of a sign-extend, where we have something like:
1256 // debugValueSubstitutions:[]
1257 // $w1 = LDRWui $x0, 1, debug-instr-number 1
1258 // DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
1259 // $x0 = LDRSWui $x0, 0, debug-instr-number 2
1260 // DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9
1261
1262 // It will be converted to:
1263 // debugValueSubstitutions:[]
1264 // $w0, $w1 = LDPWi $x0, 0
1265 // $w0 = KILL $w0, implicit-def $x0
1266 // $x0 = SBFMXri $x0, 0, 31
1267 // DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
1268 // DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9
1269
1270 // We want the final result to look like:
1271 // debugValueSubstitutions:
1272 // - { srcinst: 1, srcop: 0, dstinst: 4, dstop: 1, subreg: 0 }
1273 // - { srcinst: 2, srcop: 0, dstinst: 3, dstop: 0, subreg: 0 }
1274 // $w0, $w1 = LDPWi $x0, 0, debug-instr-number 4
1275 // $w0 = KILL $w0, implicit-def $x0
1276 // $x0 = SBFMXri $x0, 0, 31, debug-instr-number 3
1277 // DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
1278 // DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9
1279
    // $x0 is where the final value is stored, so the sign extend (SBFMXri)
    // instruction contains the final value we care about, and we give it a new
    // debug-instr-number 3. $w1 also contains a final value we care about, so
    // the LDP instruction is likewise given a new debug-instr-number 4. We have
    // to add these substitutions to the debugValueSubstitutions table. We also
    // have to ensure that the OpIndex that pointed to debug-instr-number 1 gets
    // updated to 1, because $w1 is the second operand of the LDP instruction.
1288
1289 if (I->peekDebugInstrNum()) {
1290 // If I is the instruction which got sign extended and has a
1291 // debug-instr-number, give the SBFMXri instruction a new
1292 // debug-instr-number, and update the debugValueSubstitutions table with
1293 // the new debug-instr-number and OpIndex pair. Otherwise, give the Merged
1294 // instruction a new debug-instr-number, and update the
1295 // debugValueSubstitutions table with the new debug-instr-number and
1296 // OpIndex pair.
1297 unsigned NewInstrNum;
1298 if (DstRegX == I->getOperand(i: 0).getReg()) {
1299 NewInstrNum = MIBSXTW->getDebugInstrNum();
1300 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *I,
1301 MergedInstr&: *MIBSXTW);
1302 } else {
1303 NewInstrNum = MIB->getDebugInstrNum();
1304 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *I, MergedInstr&: *MIB);
1305 }
1306 }
1307 if (Paired->peekDebugInstrNum()) {
1308 // If Paired is the instruction which got sign extended and has a
1309 // debug-instr-number, give the SBFMXri instruction a new
1310 // debug-instr-number, and update the debugValueSubstitutions table with
1311 // the new debug-instr-number and OpIndex pair. Otherwise, give the Merged
1312 // instruction a new debug-instr-number, and update the
1313 // debugValueSubstitutions table with the new debug-instr-number and
1314 // OpIndex pair.
1315 unsigned NewInstrNum;
1316 if (DstRegX == Paired->getOperand(i: 0).getReg()) {
1317 NewInstrNum = MIBSXTW->getDebugInstrNum();
1318 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *Paired,
1319 MergedInstr&: *MIBSXTW);
1320 } else {
1321 NewInstrNum = MIB->getDebugInstrNum();
1322 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *Paired,
1323 MergedInstr&: *MIB);
1324 }
1325 }
1326
1327 LLVM_DEBUG(dbgs() << " Extend operand:\n ");
1328 LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
1329 } else if (Opc == AArch64::LDR_ZXI || Opc == AArch64::STR_ZXI) {
1330 // We are combining SVE fill/spill to LDP/STP, so we need to use the Q
1331 // variant of the registers.
1332 MachineOperand &MOp0 = MIB->getOperand(i: 0);
1333 MachineOperand &MOp1 = MIB->getOperand(i: 1);
1334 assert(AArch64::ZPRRegClass.contains(MOp0.getReg()) &&
1335 AArch64::ZPRRegClass.contains(MOp1.getReg()) && "Invalid register.");
1336 MOp0.setReg(AArch64::Q0 + (MOp0.getReg() - AArch64::Z0));
1337 MOp1.setReg(AArch64::Q0 + (MOp1.getReg() - AArch64::Z0));
1338 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1339 } else {
1340
1341 // In the case that the merge doesn't result in a sign-extend, if we have
1342 // something like:
1343 // debugValueSubstitutions:[]
1344 // $x1 = LDRXui $x0, 1, debug-instr-number 1
1345 // DBG_INSTR_REF !13, dbg-instr-ref(1, 0), debug-location !11
1346 // $x0 = LDRXui killed $x0, 0, debug-instr-number 2
1347 // DBG_INSTR_REF !14, dbg-instr-ref(2, 0), debug-location !11
1348
1349 // It will be converted to:
1350 // debugValueSubstitutions: []
1351 // $x0, $x1 = LDPXi $x0, 0
1352 // DBG_INSTR_REF !12, dbg-instr-ref(1, 0), debug-location !14
1353 // DBG_INSTR_REF !13, dbg-instr-ref(2, 0), debug-location !14
1354
1355 // We want the final result to look like:
1356 // debugValueSubstitutions:
1357 // - { srcinst: 1, srcop: 0, dstinst: 3, dstop: 1, subreg: 0 }
1358 // - { srcinst: 2, srcop: 0, dstinst: 3, dstop: 0, subreg: 0 }
1359 // $x0, $x1 = LDPXi $x0, 0, debug-instr-number 3
1360 // DBG_INSTR_REF !12, dbg-instr-ref(1, 0), debug-location !14
1361 // DBG_INSTR_REF !12, dbg-instr-ref(2, 0), debug-location !14
1362
1363 // Here all that needs to be done is, that the LDP instruction needs to be
1364 // updated with a new debug-instr-number, we then need to add entries into
1365 // the debugSubstitutions table to map the old instr-refs to the new ones.
1366
1367 // Assign new DebugInstrNum to the Paired instruction.
1368 if (I->peekDebugInstrNum()) {
1369 unsigned NewDebugInstrNum = MIB->getDebugInstrNum();
1370 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewDebugInstrNum, OriginalInstr&: *I,
1371 MergedInstr&: *MIB);
1372 }
1373 if (Paired->peekDebugInstrNum()) {
1374 unsigned NewDebugInstrNum = MIB->getDebugInstrNum();
1375 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewDebugInstrNum, OriginalInstr&: *Paired,
1376 MergedInstr&: *MIB);
1377 }
1378
1379 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1380 }
1381 LLVM_DEBUG(dbgs() << "\n");
1382
1383 if (MergeForward)
1384 for (const MachineOperand &MOP : phys_regs_and_masks(MI: *I))
1385 if (MOP.isReg() && MOP.isKill())
1386 DefinedInBB.addReg(Reg: MOP.getReg());
1387
1388 // Erase the old instructions.
1389 I->eraseFromParent();
1390 Paired->eraseFromParent();
1391
1392 return NextI;
1393}
1394
1395MachineBasicBlock::iterator
1396AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
1397 MachineBasicBlock::iterator StoreI) {
1398 MachineBasicBlock::iterator NextI =
1399 next_nodbg(It: LoadI, End: LoadI->getParent()->end());
1400
1401 int LoadSize = TII->getMemScale(MI: *LoadI);
1402 int StoreSize = TII->getMemScale(MI: *StoreI);
1403 Register LdRt = getLdStRegOp(MI&: *LoadI).getReg();
1404 const MachineOperand &StMO = getLdStRegOp(MI&: *StoreI);
1405 Register StRt = getLdStRegOp(MI&: *StoreI).getReg();
1406 bool IsStoreXReg = TRI->getRegClass(i: AArch64::GPR64RegClassID)->contains(Reg: StRt);
1407
1408 assert((IsStoreXReg ||
1409 TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
1410 "Unexpected RegClass");
1411
1412 MachineInstr *BitExtMI;
1413 if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
1414 // Remove the load, if the destination register of the loads is the same
1415 // register for stored value.
1416 if (StRt == LdRt && LoadSize == 8) {
1417 for (MachineInstr &MI : make_range(x: StoreI->getIterator(),
1418 y: LoadI->getIterator())) {
1419 if (MI.killsRegister(Reg: StRt, TRI)) {
1420 MI.clearRegisterKills(Reg: StRt, RegInfo: TRI);
1421 break;
1422 }
1423 }
1424 LLVM_DEBUG(dbgs() << "Remove load instruction:\n ");
1425 LLVM_DEBUG(LoadI->print(dbgs()));
1426 LLVM_DEBUG(dbgs() << "\n");
1427 LoadI->eraseFromParent();
1428 return NextI;
1429 }
1430 // Replace the load with a mov if the load and store are in the same size.
1431 BitExtMI =
1432 BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
1433 MCID: TII->get(Opcode: IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), DestReg: LdRt)
1434 .addReg(RegNo: IsStoreXReg ? AArch64::XZR : AArch64::WZR)
1435 .add(MO: StMO)
1436 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0))
1437 .setMIFlags(LoadI->getFlags());
1438 } else {
1439 // FIXME: Currently we disable this transformation in big-endian targets as
1440 // performance and correctness are verified only in little-endian.
1441 if (!Subtarget->isLittleEndian())
1442 return NextI;
1443 bool IsUnscaled = TII->hasUnscaledLdStOffset(MI&: *LoadI);
1444 assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
1445 "Unsupported ld/st match");
1446 assert(LoadSize <= StoreSize && "Invalid load size");
1447 int UnscaledLdOffset =
1448 IsUnscaled
1449 ? AArch64InstrInfo::getLdStOffsetOp(MI: *LoadI).getImm()
1450 : AArch64InstrInfo::getLdStOffsetOp(MI: *LoadI).getImm() * LoadSize;
1451 int UnscaledStOffset =
1452 IsUnscaled
1453 ? AArch64InstrInfo::getLdStOffsetOp(MI: *StoreI).getImm()
1454 : AArch64InstrInfo::getLdStOffsetOp(MI: *StoreI).getImm() * StoreSize;
1455 int Width = LoadSize * 8;
1456 Register DestReg =
1457 IsStoreXReg ? Register(TRI->getMatchingSuperReg(
1458 Reg: LdRt, SubIdx: AArch64::sub_32, RC: &AArch64::GPR64RegClass))
1459 : LdRt;
1460
1461 assert((UnscaledLdOffset >= UnscaledStOffset &&
1462 (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
1463 "Invalid offset");
1464
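    // The load reads Width bits of the stored value starting at bit
    // 8 * (UnscaledLdOffset - UnscaledStOffset), so extract that bit range from
    // the stored register: an AND when the range starts at bit 0, a UBFM
    // otherwise.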
1465 int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
1466 int Imms = Immr + Width - 1;
1467 if (UnscaledLdOffset == UnscaledStOffset) {
1468 uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
1469 | ((Immr) << 6) // immr
1470 | ((Imms) << 0) // imms
1471 ;
1472
1473 BitExtMI =
1474 BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
1475 MCID: TII->get(Opcode: IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
1476 DestReg)
1477 .add(MO: StMO)
1478 .addImm(Val: AndMaskEncoded)
1479 .setMIFlags(LoadI->getFlags());
1480 } else if (IsStoreXReg && Imms == 31) {
1481 // Use the 32 bit variant of UBFM if it's the LSR alias of the
1482 // instruction.
1483 assert(Immr <= Imms && "Expected LSR alias of UBFM");
1484 BitExtMI = BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
1485 MCID: TII->get(Opcode: AArch64::UBFMWri),
1486 DestReg: TRI->getSubReg(Reg: DestReg, Idx: AArch64::sub_32))
1487 .addReg(RegNo: TRI->getSubReg(Reg: StRt, Idx: AArch64::sub_32))
1488 .addImm(Val: Immr)
1489 .addImm(Val: Imms)
1490 .setMIFlags(LoadI->getFlags());
1491 } else {
1492 BitExtMI =
1493 BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
1494 MCID: TII->get(Opcode: IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
1495 DestReg)
1496 .add(MO: StMO)
1497 .addImm(Val: Immr)
1498 .addImm(Val: Imms)
1499 .setMIFlags(LoadI->getFlags());
1500 }
1501 }
1502
1503 // Clear kill flags between store and load.
1504 for (MachineInstr &MI : make_range(x: StoreI->getIterator(),
1505 y: BitExtMI->getIterator()))
1506 if (MI.killsRegister(Reg: StRt, TRI)) {
1507 MI.clearRegisterKills(Reg: StRt, RegInfo: TRI);
1508 break;
1509 }
1510
1511 LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n ");
1512 LLVM_DEBUG(StoreI->print(dbgs()));
1513 LLVM_DEBUG(dbgs() << " ");
1514 LLVM_DEBUG(LoadI->print(dbgs()));
1515 LLVM_DEBUG(dbgs() << " with instructions:\n ");
1516 LLVM_DEBUG(StoreI->print(dbgs()));
1517 LLVM_DEBUG(dbgs() << " ");
1518 LLVM_DEBUG((BitExtMI)->print(dbgs()));
1519 LLVM_DEBUG(dbgs() << "\n");
1520
1521 // Erase the old instructions.
1522 LoadI->eraseFromParent();
1523 return NextI;
1524}
1525
1526static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
1527 // Convert the byte-offset used by unscaled into an "element" offset used
1528 // by the scaled pair load/store instructions.
1529 if (IsUnscaled) {
1530 // If the byte-offset isn't a multiple of the stride, there's no point
1531 // trying to match it.
1532 if (Offset % OffsetStride)
1533 return false;
1534 Offset /= OffsetStride;
1535 }
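  // Paired instructions have a 7-bit signed, scaled immediate, so the element
  // offset must fit in [-64, 63] (e.g. byte offsets -512..504 for a 64-bit
  // pair).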
1536 return Offset <= 63 && Offset >= -64;
1537}
1538
1539// Do alignment, specialized to power of 2 and for signed ints,
1540 // avoiding having to do a C-style cast from uint64_t to int when
1541// using alignTo from include/llvm/Support/MathExtras.h.
1542// FIXME: Move this function to include/MathExtras.h?
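// For example, alignTo(3, 4) == 4 and alignTo(10, 8) == 16.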
1543static int alignTo(int Num, int PowOf2) {
1544 return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
1545}
1546
1547static bool mayAlias(MachineInstr &MIa,
1548 SmallVectorImpl<MachineInstr *> &MemInsns,
1549 AliasAnalysis *AA) {
1550 for (MachineInstr *MIb : MemInsns) {
1551 if (MIa.mayAlias(AA, Other: *MIb, /*UseTBAA*/ false)) {
1552 LLVM_DEBUG(dbgs() << "Aliasing with: "; MIb->dump());
1553 return true;
1554 }
1555 }
1556
1557 LLVM_DEBUG(dbgs() << "No aliases found\n");
1558 return false;
1559}
1560
1561bool AArch64LoadStoreOpt::findMatchingStore(
1562 MachineBasicBlock::iterator I, unsigned Limit,
1563 MachineBasicBlock::iterator &StoreI) {
1564 MachineBasicBlock::iterator B = I->getParent()->begin();
1565 MachineBasicBlock::iterator MBBI = I;
1566 MachineInstr &LoadMI = *I;
1567 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: LoadMI).getReg();
1568
1569 // If the load is the first instruction in the block, there's obviously
1570 // not any matching store.
1571 if (MBBI == B)
1572 return false;
1573
1574 // Track which register units have been modified and used between the first
1575 // insn and the second insn.
1576 ModifiedRegUnits.clear();
1577 UsedRegUnits.clear();
1578
1579 unsigned Count = 0;
1580 do {
1581 MBBI = prev_nodbg(It: MBBI, Begin: B);
1582 MachineInstr &MI = *MBBI;
1583
1584 // Don't count transient instructions towards the search limit since there
1585 // may be different numbers of them if e.g. debug information is present.
1586 if (!MI.isTransient())
1587 ++Count;
1588
1589 // If the load instruction reads directly from the address to which the
1590 // store instruction writes and the stored value is not modified, we can
1591 // promote the load. Since we do not handle stores with pre-/post-index,
1592 // it's unnecessary to check if BaseReg is modified by the store itself.
1593 // Also we can't handle stores without an immediate offset operand,
1594     // since the operand might instead be the address of a global variable.
1595 if (MI.mayStore() && isMatchingStore(LoadInst&: LoadMI, StoreInst&: MI) &&
1596 BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
1597 AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
1598 isLdOffsetInRangeOfSt(LoadInst&: LoadMI, StoreInst&: MI, TII) &&
1599 ModifiedRegUnits.available(Reg: getLdStRegOp(MI).getReg())) {
1600 StoreI = MBBI;
1601 return true;
1602 }
1603
1604 if (MI.isCall())
1605 return false;
1606
1607     // Update modified / used register units.
1608 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1609
1610 // Otherwise, if the base register is modified, we have no match, so
1611 // return early.
1612 if (!ModifiedRegUnits.available(Reg: BaseReg))
1613 return false;
1614
1615 // If we encounter a store aliased with the load, return early.
1616 if (MI.mayStore() && LoadMI.mayAlias(AA, Other: MI, /*UseTBAA*/ false))
1617 return false;
1618 } while (MBBI != B && Count < Limit);
1619 return false;
1620}
1621
1622static bool needsWinCFI(const MachineFunction *MF) {
1623 return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
1624 MF->getFunction().needsUnwindTableEntry();
1625}
1626
1627// Returns true if FirstMI and MI are candidates for merging or pairing.
1628// Otherwise, returns false.
1629static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
1630 LdStPairFlags &Flags,
1631 const AArch64InstrInfo *TII) {
1632 // If this is volatile or if pairing is suppressed, not a candidate.
1633 if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
1634 return false;
1635
1636 // We should have already checked FirstMI for pair suppression and volatility.
1637 assert(!FirstMI.hasOrderedMemoryRef() &&
1638 !TII->isLdStPairSuppressed(FirstMI) &&
1639          "FirstMI shouldn't get here if either of these checks is true.");
1640
1641 if (needsWinCFI(MF: MI.getMF()) && (MI.getFlag(Flag: MachineInstr::FrameSetup) ||
1642 MI.getFlag(Flag: MachineInstr::FrameDestroy)))
1643 return false;
1644
1645 unsigned OpcA = FirstMI.getOpcode();
1646 unsigned OpcB = MI.getOpcode();
1647
1648 // Opcodes match: If the opcodes are pre ld/st there is nothing more to check.
1649 if (OpcA == OpcB)
1650 return !AArch64InstrInfo::isPreLdSt(MI: FirstMI);
1651
1652 // Bail out if one of the opcodes is SVE fill/spill, as we currently don't
1653 // allow pairing them with other instructions.
1654 if (OpcA == AArch64::LDR_ZXI || OpcA == AArch64::STR_ZXI ||
1655 OpcB == AArch64::LDR_ZXI || OpcB == AArch64::STR_ZXI)
1656 return false;
1657
1658 // Two pre ld/st of different opcodes cannot be merged either
1659 if (AArch64InstrInfo::isPreLdSt(MI: FirstMI) && AArch64InstrInfo::isPreLdSt(MI))
1660 return false;
1661
1662 // Try to match a sign-extended load/store with a zero-extended load/store.
1663 bool IsValidLdStrOpc, PairIsValidLdStrOpc;
1664 unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc: OpcA, IsValidLdStrOpc: &IsValidLdStrOpc);
1665 assert(IsValidLdStrOpc &&
1666 "Given Opc should be a Load or Store with an immediate");
1667 // OpcA will be the first instruction in the pair.
1668 if (NonSExtOpc == getMatchingNonSExtOpcode(Opc: OpcB, IsValidLdStrOpc: &PairIsValidLdStrOpc)) {
1669 Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
1670 return true;
1671 }
1672
1673 // If the second instruction isn't even a mergable/pairable load/store, bail
1674 // out.
1675 if (!PairIsValidLdStrOpc)
1676 return false;
1677
1678   // Narrow stores do not have matching pair opcodes, so constrain their
1679 // merging to zero stores.
1680 if (isNarrowStore(Opc: OpcA) || isNarrowStore(Opc: OpcB))
1681 return getLdStRegOp(MI&: FirstMI).getReg() == AArch64::WZR &&
1682 getLdStRegOp(MI).getReg() == AArch64::WZR &&
1683 TII->getMemScale(MI: FirstMI) == TII->getMemScale(MI);
1684
1685 // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
1686 // LDR<S,D,Q,W,X,SW>pre-LDR<S,D,Q,W,X,SW>ui
1687 // are candidate pairs that can be merged.
1688 if (isPreLdStPairCandidate(FirstMI, MI))
1689 return true;
1690
1691 // Try to match an unscaled load/store with a scaled load/store.
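  // For example (illustrative), STRXui with scaled offset #1 (byte offset 8)
  // and STURXi with byte offset #16 access adjacent doublewords and map to
  // the same pair opcode (STP).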
1692 return TII->hasUnscaledLdStOffset(Opc: OpcA) != TII->hasUnscaledLdStOffset(Opc: OpcB) &&
1693 getMatchingPairOpcode(Opc: OpcA) == getMatchingPairOpcode(Opc: OpcB);
1694
1695 // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
1696}
1697
1698static bool canRenameMOP(const MachineOperand &MOP,
1699 const TargetRegisterInfo *TRI) {
1700 if (MOP.isReg()) {
1701 auto *RegClass = TRI->getMinimalPhysRegClass(Reg: MOP.getReg());
1702 // Renaming registers with multiple disjunct sub-registers (e.g. the
1703 // result of a LD3) means that all sub-registers are renamed, potentially
1704 // impacting other instructions we did not check. Bail out.
1705 // Note that this relies on the structure of the AArch64 register file. In
1706 // particular, a subregister cannot be written without overwriting the
1707 // whole register.
1708 if (RegClass->HasDisjunctSubRegs && RegClass->CoveredBySubRegs &&
1709 (TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::dsub0) ||
1710 TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::qsub0) ||
1711 TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::zsub0))) {
1712 LLVM_DEBUG(
1713 dbgs()
1714 << " Cannot rename operands with multiple disjunct subregisters ("
1715 << MOP << ")\n");
1716 return false;
1717 }
1718
1719     // We cannot rename arbitrary implicit-defs; the specific rule to rewrite
1720 // them must be known. For example, in ORRWrs the implicit-def
1721 // corresponds to the result register.
1722 if (MOP.isImplicit() && MOP.isDef()) {
1723 if (!isRewritableImplicitDef(Opc: MOP.getParent()->getOpcode()))
1724 return false;
1725 return TRI->isSuperOrSubRegisterEq(
1726 RegA: MOP.getParent()->getOperand(i: 0).getReg(), RegB: MOP.getReg());
1727 }
1728 }
1729 return MOP.isImplicit() ||
1730 (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
1731}
1732
1733static bool
1734canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
1735 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1736 const TargetRegisterInfo *TRI) {
1737 if (!FirstMI.mayStore())
1738 return false;
1739
1740 // Check if we can find an unused register which we can use to rename
1741 // the register used by the first load/store.
1742
1743 auto RegToRename = getLdStRegOp(MI&: FirstMI).getReg();
1744 // For now, we only rename if the store operand gets killed at the store.
1745 if (!getLdStRegOp(MI&: FirstMI).isKill() &&
1746 !any_of(Range: FirstMI.operands(),
1747 P: [TRI, RegToRename](const MachineOperand &MOP) {
1748 return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1749 MOP.isImplicit() && MOP.isKill() &&
1750 TRI->regsOverlap(RegA: RegToRename, RegB: MOP.getReg());
1751 })) {
1752 LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI);
1753 return false;
1754 }
1755
1756 bool FoundDef = false;
1757
1758   // For each instruction between FirstMI and the previous def for
1759   // RegToRename, we:
1760 // * check if we can rename RegToRename in this instruction
1761 // * collect the registers used and required register classes for RegToRename.
1762 std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
1763 bool IsDef) {
1764 LLVM_DEBUG(dbgs() << "Checking " << MI);
1765 // Currently we do not try to rename across frame-setup instructions.
1766 if (MI.getFlag(Flag: MachineInstr::FrameSetup)) {
1767 LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1768 << "currently\n");
1769 return false;
1770 }
1771
1772 UsedInBetween.accumulate(MI);
1773
1774 // For a definition, check that we can rename the definition and exit the
1775 // loop.
1776 FoundDef = IsDef;
1777
1778 // For defs, check if we can rename the first def of RegToRename.
1779 if (FoundDef) {
1780 // For some pseudo instructions, we might not generate code in the end
1781 // (e.g. KILL) and we would end up without a correct def for the rename
1782 // register.
1783 // TODO: This might be overly conservative and we could handle those cases
1784 // in multiple ways:
1785 // 1. Insert an extra copy, to materialize the def.
1786       //    2. Skip pseudo-defs until we find a non-pseudo def.
1787 if (MI.isPseudo()) {
1788 LLVM_DEBUG(dbgs() << " Cannot rename pseudo/bundle instruction\n");
1789 return false;
1790 }
1791
1792 for (auto &MOP : MI.operands()) {
1793 if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
1794 !TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
1795 continue;
1796 if (!canRenameMOP(MOP, TRI)) {
1797 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1798 return false;
1799 }
1800 RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1801 }
1802 return true;
1803 } else {
1804 for (auto &MOP : MI.operands()) {
1805 if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1806 !TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
1807 continue;
1808
1809 if (!canRenameMOP(MOP, TRI)) {
1810 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1811 return false;
1812 }
1813 RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1814 }
1815 }
1816 return true;
1817 };
1818
1819 if (!forAllMIsUntilDef(MI&: FirstMI, DefReg: RegToRename, TRI, Limit: LdStLimit, Fn&: CheckMIs))
1820 return false;
1821
1822 if (!FoundDef) {
1823 LLVM_DEBUG(dbgs() << " Did not find definition for register in BB\n");
1824 return false;
1825 }
1826 return true;
1827}
1828
1829// We want to merge the second load into the first by rewriting the usages of
1830// the same reg between first (incl.) and second (excl.). We don't need to care
1831// about any insns before FirstLoad or after SecondLoad.
1832 // 1. The second load writes a new value into the same reg.
1833 //    - The renaming cannot affect any later use of the reg.
1834 //    - The second load always trashes the value written by the first load, which
1835// means the reg must be killed before the second load.
1836// 2. The first load must be a def for the same reg so we don't need to look
1837// into anything before it.
1838static bool canRenameUntilSecondLoad(
1839 MachineInstr &FirstLoad, MachineInstr &SecondLoad,
1840 LiveRegUnits &UsedInBetween,
1841 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1842 const TargetRegisterInfo *TRI) {
1843 if (FirstLoad.isPseudo())
1844 return false;
1845
1846 UsedInBetween.accumulate(MI: FirstLoad);
1847 auto RegToRename = getLdStRegOp(MI&: FirstLoad).getReg();
1848 bool Success = std::all_of(
1849 first: FirstLoad.getIterator(), last: SecondLoad.getIterator(),
1850 pred: [&](MachineInstr &MI) {
1851 LLVM_DEBUG(dbgs() << "Checking " << MI);
1852 // Currently we do not try to rename across frame-setup instructions.
1853 if (MI.getFlag(Flag: MachineInstr::FrameSetup)) {
1854 LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1855 << "currently\n");
1856 return false;
1857 }
1858
1859 for (auto &MOP : MI.operands()) {
1860 if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1861 !TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
1862 continue;
1863 if (!canRenameMOP(MOP, TRI)) {
1864 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1865 return false;
1866 }
1867 RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1868 }
1869
1870 return true;
1871 });
1872 return Success;
1873}
1874
1875// Check if we can find a physical register for renaming \p Reg. This register
1876// must:
1877// * not be defined already in \p DefinedInBB; DefinedInBB must contain all
1878// defined registers up to the point where the renamed register will be used,
1879 // * not be used in \p UsedInBetween; UsedInBetween must contain all accessed
1880 //   registers in the range over which the rename register will be used,
1881 // * be available in all used register classes (checked using RequiredClasses).
1882static std::optional<MCPhysReg> tryToFindRegisterToRename(
1883 const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
1884 LiveRegUnits &UsedInBetween,
1885 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1886 const TargetRegisterInfo *TRI) {
1887 const MachineRegisterInfo &RegInfo = MF.getRegInfo();
1888
1889 // Checks if any sub- or super-register of PR is callee saved.
1890 auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
1891 return any_of(Range: TRI->sub_and_superregs_inclusive(Reg: PR),
1892 P: [&MF, TRI](MCPhysReg SubOrSuper) {
1893 return TRI->isCalleeSavedPhysReg(PhysReg: SubOrSuper, MF);
1894 });
1895 };
1896
1897 // Check if PR or one of its sub- or super-registers can be used for all
1898 // required register classes.
1899 auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
1900 return all_of(Range&: RequiredClasses, P: [PR, TRI](const TargetRegisterClass *C) {
1901 return any_of(
1902 Range: TRI->sub_and_superregs_inclusive(Reg: PR),
1903 P: [C](MCPhysReg SubOrSuper) { return C->contains(Reg: SubOrSuper); });
1904 });
1905 };
1906
1907 auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
1908 for (const MCPhysReg &PR : *RegClass) {
1909 if (DefinedInBB.available(Reg: PR) && UsedInBetween.available(Reg: PR) &&
1910 !RegInfo.isReserved(PhysReg: PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
1911 CanBeUsedForAllClasses(PR)) {
1912 DefinedInBB.addReg(Reg: PR);
1913 LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
1914 << "\n");
1915 return {PR};
1916 }
1917 }
1918 LLVM_DEBUG(dbgs() << "No rename register found from "
1919 << TRI->getRegClassName(RegClass) << "\n");
1920 return std::nullopt;
1921}
1922
1923 // For store pairs: returns a register that can be renamed in the range from
1924 // FirstMI back to the beginning of the block.
1925 // For load pairs: returns a register that can be renamed between FirstMI and MI.
1926static std::optional<MCPhysReg> findRenameRegForSameLdStRegPair(
1927 std::optional<bool> MaybeCanRename, MachineInstr &FirstMI, MachineInstr &MI,
1928 Register Reg, LiveRegUnits &DefinedInBB, LiveRegUnits &UsedInBetween,
1929 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1930 const TargetRegisterInfo *TRI) {
1931 std::optional<MCPhysReg> RenameReg;
1932 if (!DebugCounter::shouldExecute(CounterName: RegRenamingCounter))
1933 return RenameReg;
1934
1935 auto *RegClass = TRI->getMinimalPhysRegClass(Reg: getLdStRegOp(MI&: FirstMI).getReg());
1936 MachineFunction &MF = *FirstMI.getParent()->getParent();
1937 if (!RegClass || !MF.getRegInfo().tracksLiveness())
1938 return RenameReg;
1939
1940 const bool IsLoad = FirstMI.mayLoad();
1941
1942 if (!MaybeCanRename) {
1943 if (IsLoad)
1944 MaybeCanRename = {canRenameUntilSecondLoad(FirstLoad&: FirstMI, SecondLoad&: MI, UsedInBetween,
1945 RequiredClasses, TRI)};
1946 else
1947 MaybeCanRename = {
1948 canRenameUpToDef(FirstMI, UsedInBetween, RequiredClasses, TRI)};
1949 }
1950
1951 if (*MaybeCanRename) {
1952 RenameReg = tryToFindRegisterToRename(MF, Reg, DefinedInBB, UsedInBetween,
1953 RequiredClasses, TRI);
1954 }
1955 return RenameReg;
1956}
1957
1958/// Scan the instructions looking for a load/store that can be combined with the
1959/// current instruction into a wider equivalent or a load/store pair.
1960MachineBasicBlock::iterator
1961AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
1962 LdStPairFlags &Flags, unsigned Limit,
1963 bool FindNarrowMerge) {
1964 MachineBasicBlock::iterator E = I->getParent()->end();
1965 MachineBasicBlock::iterator MBBI = I;
1966 MachineBasicBlock::iterator MBBIWithRenameReg;
1967 MachineInstr &FirstMI = *I;
1968 MBBI = next_nodbg(It: MBBI, End: E);
1969
1970 bool MayLoad = FirstMI.mayLoad();
1971 bool IsUnscaled = TII->hasUnscaledLdStOffset(MI&: FirstMI);
1972 Register Reg = getLdStRegOp(MI&: FirstMI).getReg();
1973 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: FirstMI).getReg();
1974 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: FirstMI).getImm();
1975 int OffsetStride = IsUnscaled ? TII->getMemScale(MI: FirstMI) : 1;
1976 bool IsPromotableZeroStore = isPromotableZeroStoreInst(MI&: FirstMI);
1977
1978 std::optional<bool> MaybeCanRename;
1979 if (!EnableRenaming)
1980 MaybeCanRename = {false};
1981
1982 SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
1983 LiveRegUnits UsedInBetween;
1984 UsedInBetween.init(TRI: *TRI);
1985
1986 Flags.clearRenameReg();
1987
1988 // Track which register units have been modified and used between the first
1989 // insn (inclusive) and the second insn.
1990 ModifiedRegUnits.clear();
1991 UsedRegUnits.clear();
1992
1993 // Remember any instructions that read/write memory between FirstMI and MI.
1994 SmallVector<MachineInstr *, 4> MemInsns;
1995
1996 LLVM_DEBUG(dbgs() << "Find match for: "; FirstMI.dump());
1997 for (unsigned Count = 0; MBBI != E && Count < Limit;
1998 MBBI = next_nodbg(It: MBBI, End: E)) {
1999 MachineInstr &MI = *MBBI;
2000 LLVM_DEBUG(dbgs() << "Analysing 2nd insn: "; MI.dump());
2001
2002 UsedInBetween.accumulate(MI);
2003
2004 // Don't count transient instructions towards the search limit since there
2005 // may be different numbers of them if e.g. debug information is present.
2006 if (!MI.isTransient())
2007 ++Count;
2008
2009 Flags.setSExtIdx(-1);
2010 if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
2011 AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
2012 assert(MI.mayLoadOrStore() && "Expected memory operation.");
2013 // If we've found another instruction with the same opcode, check to see
2014 // if the base and offset are compatible with our starting instruction.
2015 // These instructions all have scaled immediate operands, so we just
2016 // check for +1/-1. Make sure to check the new instruction offset is
2017 // actually an immediate and not a symbolic reference destined for
2018 // a relocation.
2019 Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
2020 int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
2021 bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
2022 if (IsUnscaled != MIIsUnscaled) {
2023 // We're trying to pair instructions that differ in how they are scaled.
2024 // If FirstMI is scaled then scale the offset of MI accordingly.
2025 // Otherwise, do the opposite (i.e., make MI's offset unscaled).
2026 int MemSize = TII->getMemScale(MI);
2027 if (MIIsUnscaled) {
2028 // If the unscaled offset isn't a multiple of the MemSize, we can't
2029 // pair the operations together: bail and keep looking.
2030 if (MIOffset % MemSize) {
2031 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2032 UsedRegUnits, TRI);
2033 MemInsns.push_back(Elt: &MI);
2034 continue;
2035 }
2036 MIOffset /= MemSize;
2037 } else {
2038 MIOffset *= MemSize;
2039 }
2040 }
2041
2042 bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);
2043
2044 if (BaseReg == MIBaseReg) {
2045 // If the offset of the second ld/st is not equal to the size of the
2046         // destination register, it can't be paired with a pre-index ld/st
2047 // pair. Additionally if the base reg is used or modified the operations
2048 // can't be paired: bail and keep looking.
2049 if (IsPreLdSt) {
2050 bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
2051 bool IsBaseRegUsed = !UsedRegUnits.available(
2052 Reg: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
2053 bool IsBaseRegModified = !ModifiedRegUnits.available(
2054 Reg: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
2055           // If the stored value and the address of the second instruction are
2056 // the same, it needs to be using the updated register and therefore
2057 // it must not be folded.
2058 bool IsMIRegTheSame =
2059 TRI->regsOverlap(RegA: getLdStRegOp(MI).getReg(),
2060 RegB: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
2061 if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
2062 IsMIRegTheSame) {
2063 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2064 UsedRegUnits, TRI);
2065 MemInsns.push_back(Elt: &MI);
2066 continue;
2067 }
2068 } else {
2069 if ((Offset != MIOffset + OffsetStride) &&
2070 (Offset + OffsetStride != MIOffset)) {
2071 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2072 UsedRegUnits, TRI);
2073 MemInsns.push_back(Elt: &MI);
2074 continue;
2075 }
2076 }
2077
2078 int MinOffset = Offset < MIOffset ? Offset : MIOffset;
2079 if (FindNarrowMerge) {
2080 // If the alignment requirements of the scaled wide load/store
2081 // instruction can't express the offset of the scaled narrow input,
2082 // bail and keep looking. For promotable zero stores, allow only when
2083 // the stored value is the same (i.e., WZR).
2084 if ((!IsUnscaled && alignTo(Num: MinOffset, PowOf2: 2) != MinOffset) ||
2085 (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
2086 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2087 UsedRegUnits, TRI);
2088 MemInsns.push_back(Elt: &MI);
2089 continue;
2090 }
2091 } else {
2092 // Pairwise instructions have a 7-bit signed offset field. Single
2093 // insns have a 12-bit unsigned offset field. If the resultant
2094 // immediate offset of merging these instructions is out of range for
2095 // a pairwise instruction, bail and keep looking.
2096 if (!inBoundsForPair(IsUnscaled, Offset: MinOffset, OffsetStride)) {
2097 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2098 UsedRegUnits, TRI);
2099 MemInsns.push_back(Elt: &MI);
2100 LLVM_DEBUG(dbgs() << "Offset doesn't fit in immediate, "
2101 << "keep looking.\n");
2102 continue;
2103 }
2104 // If the alignment requirements of the paired (scaled) instruction
2105 // can't express the offset of the unscaled input, bail and keep
2106 // looking.
2107 if (IsUnscaled && (alignTo(Num: MinOffset, PowOf2: OffsetStride) != MinOffset)) {
2108 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2109 UsedRegUnits, TRI);
2110 MemInsns.push_back(Elt: &MI);
2111 LLVM_DEBUG(dbgs()
2112 << "Offset doesn't fit due to alignment requirements, "
2113 << "keep looking.\n");
2114 continue;
2115 }
2116 }
2117
2118 // If the BaseReg has been modified, then we cannot do the optimization.
2119 // For example, in the following pattern
2120 // ldr x1 [x2]
2121 // ldr x2 [x3]
2122 // ldr x4 [x2, #8],
2123 // the first and third ldr cannot be converted to ldp x1, x4, [x2]
2124 if (!ModifiedRegUnits.available(Reg: BaseReg))
2125 return E;
2126
2127 const bool SameLoadReg = MayLoad && TRI->isSuperOrSubRegisterEq(
2128 RegA: Reg, RegB: getLdStRegOp(MI).getReg());
2129
2130 // If the Rt of the second instruction (destination register of the
2131 // load) was not modified or used between the two instructions and none
2132 // of the instructions between the second and first alias with the
2133 // second, we can combine the second into the first.
2134 bool RtNotModified =
2135 ModifiedRegUnits.available(Reg: getLdStRegOp(MI).getReg());
2136 bool RtNotUsed = !(MI.mayLoad() && !SameLoadReg &&
2137 !UsedRegUnits.available(Reg: getLdStRegOp(MI).getReg()));
2138
2139 LLVM_DEBUG(dbgs() << "Checking, can combine 2nd into 1st insn:\n"
2140 << "Reg '" << getLdStRegOp(MI) << "' not modified: "
2141 << (RtNotModified ? "true" : "false") << "\n"
2142 << "Reg '" << getLdStRegOp(MI) << "' not used: "
2143 << (RtNotUsed ? "true" : "false") << "\n");
2144
2145 if (RtNotModified && RtNotUsed && !mayAlias(MIa&: MI, MemInsns, AA)) {
2146 // For pairs loading into the same reg, try to find a renaming
2147 // opportunity to allow the renaming of Reg between FirstMI and MI
2148 // and combine MI into FirstMI; otherwise bail and keep looking.
2149 if (SameLoadReg) {
2150 std::optional<MCPhysReg> RenameReg =
2151 findRenameRegForSameLdStRegPair(MaybeCanRename, FirstMI, MI,
2152 Reg, DefinedInBB, UsedInBetween,
2153 RequiredClasses, TRI);
2154 if (!RenameReg) {
2155 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2156 UsedRegUnits, TRI);
2157 MemInsns.push_back(Elt: &MI);
2158 LLVM_DEBUG(dbgs() << "Can't find reg for renaming, "
2159 << "keep looking.\n");
2160 continue;
2161 }
2162 Flags.setRenameReg(*RenameReg);
2163 }
2164
2165 Flags.setMergeForward(false);
2166 if (!SameLoadReg)
2167 Flags.clearRenameReg();
2168 return MBBI;
2169 }
2170
2171 // Likewise, if the Rt of the first instruction is not modified or used
2172 // between the two instructions and none of the instructions between the
2173 // first and the second alias with the first, we can combine the first
2174 // into the second.
2175 RtNotModified = !(
2176 MayLoad && !UsedRegUnits.available(Reg: getLdStRegOp(MI&: FirstMI).getReg()));
2177
2178 LLVM_DEBUG(dbgs() << "Checking, can combine 1st into 2nd insn:\n"
2179 << "Reg '" << getLdStRegOp(FirstMI)
2180 << "' not modified: "
2181 << (RtNotModified ? "true" : "false") << "\n");
2182
2183 if (RtNotModified && !mayAlias(MIa&: FirstMI, MemInsns, AA)) {
2184 if (ModifiedRegUnits.available(Reg: getLdStRegOp(MI&: FirstMI).getReg())) {
2185 Flags.setMergeForward(true);
2186 Flags.clearRenameReg();
2187 return MBBI;
2188 }
2189
2190 std::optional<MCPhysReg> RenameReg = findRenameRegForSameLdStRegPair(
2191 MaybeCanRename, FirstMI, MI, Reg, DefinedInBB, UsedInBetween,
2192 RequiredClasses, TRI);
2193 if (RenameReg) {
2194 Flags.setMergeForward(true);
2195 Flags.setRenameReg(*RenameReg);
2196 MBBIWithRenameReg = MBBI;
2197 }
2198 }
2199 LLVM_DEBUG(dbgs() << "Unable to combine these instructions due to "
2200 << "interference in between, keep looking.\n");
2201 }
2202 }
2203
2204 if (Flags.getRenameReg())
2205 return MBBIWithRenameReg;
2206
2207     // The instruction wasn't a matching load or store. Stop searching if we
2208 // encounter a call instruction that might modify memory.
2209 if (MI.isCall()) {
2210 LLVM_DEBUG(dbgs() << "Found a call, stop looking.\n");
2211 return E;
2212 }
2213
2214     // Update modified / used register units.
2215 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2216
2217 // Otherwise, if the base register is modified, we have no match, so
2218 // return early.
2219 if (!ModifiedRegUnits.available(Reg: BaseReg)) {
2220 LLVM_DEBUG(dbgs() << "Base reg is modified, stop looking.\n");
2221 return E;
2222 }
2223
2224 // Update list of instructions that read/write memory.
2225 if (MI.mayLoadOrStore())
2226 MemInsns.push_back(Elt: &MI);
2227 }
2228 return E;
2229}
2230
2231static MachineBasicBlock::iterator
2232maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
2233 assert((MI.getOpcode() == AArch64::SUBXri ||
2234 MI.getOpcode() == AArch64::ADDXri) &&
2235 "Expected a register update instruction");
2236 auto End = MI.getParent()->end();
2237 if (MaybeCFI == End ||
2238 MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
2239 !(MI.getFlag(Flag: MachineInstr::FrameSetup) ||
2240 MI.getFlag(Flag: MachineInstr::FrameDestroy)) ||
2241 MI.getOperand(i: 0).getReg() != AArch64::SP)
2242 return End;
2243
2244 const MachineFunction &MF = *MI.getParent()->getParent();
2245 unsigned CFIIndex = MaybeCFI->getOperand(i: 0).getCFIIndex();
2246 const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
2247 switch (CFI.getOperation()) {
2248 case MCCFIInstruction::OpDefCfa:
2249 case MCCFIInstruction::OpDefCfaOffset:
2250 return MaybeCFI;
2251 default:
2252 return End;
2253 }
2254}
2255
2256std::optional<MachineBasicBlock::iterator> AArch64LoadStoreOpt::mergeUpdateInsn(
2257 MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update,
2258 bool IsForward, bool IsPreIdx, bool MergeEither) {
2259 assert((Update->getOpcode() == AArch64::ADDXri ||
2260 Update->getOpcode() == AArch64::SUBXri) &&
2261 "Unexpected base register update instruction to merge!");
2262 MachineBasicBlock::iterator E = I->getParent()->end();
2263 MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
2264
2265   // If updating the SP and the following instruction is a CFA-offset-related
2266   // CFI, make sure the CFI follows the SP update, either by merging at the
2267   // location of the update or by moving the CFI after the merged instruction.
2268   // If unable to do so, bail.
2269 MachineBasicBlock::iterator InsertPt = I;
2270 if (IsForward) {
2271 assert(IsPreIdx);
2272 if (auto CFI = maybeMoveCFI(MI&: *Update, MaybeCFI: next_nodbg(It: Update, End: E)); CFI != E) {
2273 if (MergeEither) {
2274 InsertPt = Update;
2275 } else {
2276 // Take care not to reorder CFIs.
2277 if (std::any_of(first: std::next(x: CFI), last: I, pred: [](const auto &Insn) {
2278 return Insn.getOpcode() == TargetOpcode::CFI_INSTRUCTION;
2279 }))
2280 return std::nullopt;
2281
2282 MachineBasicBlock *MBB = InsertPt->getParent();
2283 MBB->splice(Where: std::next(x: InsertPt), Other: MBB, From: CFI);
2284 }
2285 }
2286 }
2287
2288 // Return the instruction following the merged instruction, which is
2289 // the instruction following our unmerged load. Unless that's the add/sub
2290 // instruction we're merging, in which case it's the one after that.
2291 if (NextI == Update)
2292 NextI = next_nodbg(It: NextI, End: E);
2293
2294 int Value = Update->getOperand(i: 2).getImm();
2295 assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
2296 "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
2297 if (Update->getOpcode() == AArch64::SUBXri)
2298 Value = -Value;
2299
2300 unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(Opc: I->getOpcode())
2301 : getPostIndexedOpcode(Opc: I->getOpcode());
2302 MachineInstrBuilder MIB;
2303 int Scale, MinOffset, MaxOffset;
2304 getPrePostIndexedMemOpInfo(MI: *I, Scale, MinOffset, MaxOffset);
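  // Note: the pre/post-index writeback immediate is unscaled (Scale == 1) for
  // ordinary unpaired accesses, while paired (and tag-store) forms keep their
  // element scale; e.g. merging "add x20, x20, #32" into a 64-bit ldp encodes
  // Value / Scale == 4 here.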
2305 if (!AArch64InstrInfo::isPairedLdSt(MI: *I)) {
2306 // Non-paired instruction.
2307 MIB = BuildMI(BB&: *InsertPt->getParent(), I: InsertPt, MIMD: InsertPt->getDebugLoc(),
2308 MCID: TII->get(Opcode: NewOpc))
2309 .add(MO: Update->getOperand(i: 0))
2310 .add(MO: getLdStRegOp(MI&: *I))
2311 .add(MO: AArch64InstrInfo::getLdStBaseOp(MI: *I))
2312 .addImm(Val: Value / Scale)
2313 .setMemRefs(I->memoperands())
2314 .setMIFlags(I->mergeFlagsWith(Other: *Update));
2315 } else {
2316 // Paired instruction.
2317 MIB = BuildMI(BB&: *InsertPt->getParent(), I: InsertPt, MIMD: InsertPt->getDebugLoc(),
2318 MCID: TII->get(Opcode: NewOpc))
2319 .add(MO: Update->getOperand(i: 0))
2320 .add(MO: getLdStRegOp(MI&: *I, PairedRegOp: 0))
2321 .add(MO: getLdStRegOp(MI&: *I, PairedRegOp: 1))
2322 .add(MO: AArch64InstrInfo::getLdStBaseOp(MI: *I))
2323 .addImm(Val: Value / Scale)
2324 .setMemRefs(I->memoperands())
2325 .setMIFlags(I->mergeFlagsWith(Other: *Update));
2326 }
2327
2328 if (IsPreIdx) {
2329 ++NumPreFolded;
2330 LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
2331 } else {
2332 ++NumPostFolded;
2333 LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
2334 }
2335 LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
2336 LLVM_DEBUG(I->print(dbgs()));
2337 LLVM_DEBUG(dbgs() << " ");
2338 LLVM_DEBUG(Update->print(dbgs()));
2339 LLVM_DEBUG(dbgs() << " with instruction:\n ");
2340 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
2341 LLVM_DEBUG(dbgs() << "\n");
2342
2343 // Erase the old instructions for the block.
2344 I->eraseFromParent();
2345 Update->eraseFromParent();
2346
2347 return NextI;
2348}
2349
2350MachineBasicBlock::iterator
2351AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
2352 MachineBasicBlock::iterator Update,
2353 unsigned Offset, int Scale) {
2354 assert((Update->getOpcode() == AArch64::MOVKWi) &&
2355 "Unexpected const mov instruction to merge!");
2356 MachineBasicBlock::iterator E = I->getParent()->end();
2357 MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
2358 MachineBasicBlock::iterator PrevI = prev_nodbg(It: Update, Begin: E);
2359 MachineInstr &MemMI = *I;
2360 unsigned Mask = (1 << 12) * Scale - 1;
2361 unsigned Low = Offset & Mask;
2362 unsigned High = Offset - Low;
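  // For example, with Scale == 1 and Offset == 0x12345: Low == 0x345 and
  // High == 0x12000, i.e. "add IndexReg, BaseReg, #0x12, lsl #12" followed by
  // a load/store with immediate #0x345.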
2363 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
2364 Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getReg();
2365 MachineInstrBuilder AddMIB, MemMIB;
2366
2367 // Add IndexReg, BaseReg, High (the BaseReg may be SP)
2368 AddMIB =
2369 BuildMI(BB&: *I->getParent(), I, MIMD: I->getDebugLoc(), MCID: TII->get(Opcode: AArch64::ADDXri))
2370 .addDef(RegNo: IndexReg)
2371 .addUse(RegNo: BaseReg)
2372 .addImm(Val: High >> 12) // shifted value
2373 .addImm(Val: 12); // shift 12
2374 (void)AddMIB;
2375 // Ld/St DestReg, IndexReg, Imm12
2376 unsigned NewOpc = getBaseAddressOpcode(Opc: I->getOpcode());
2377 MemMIB = BuildMI(BB&: *I->getParent(), I, MIMD: I->getDebugLoc(), MCID: TII->get(Opcode: NewOpc))
2378 .add(MO: getLdStRegOp(MI&: MemMI))
2379 .add(MO: AArch64InstrInfo::getLdStOffsetOp(MI: MemMI))
2380 .addImm(Val: Low / Scale)
2381 .setMemRefs(I->memoperands())
2382 .setMIFlags(I->mergeFlagsWith(Other: *Update));
2383 (void)MemMIB;
2384
2385 ++NumConstOffsetFolded;
2386 LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
2387 LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
2388 LLVM_DEBUG(PrevI->print(dbgs()));
2389 LLVM_DEBUG(dbgs() << " ");
2390 LLVM_DEBUG(Update->print(dbgs()));
2391 LLVM_DEBUG(dbgs() << " ");
2392 LLVM_DEBUG(I->print(dbgs()));
2393 LLVM_DEBUG(dbgs() << " with instruction:\n ");
2394 LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs()));
2395 LLVM_DEBUG(dbgs() << " ");
2396 LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs()));
2397 LLVM_DEBUG(dbgs() << "\n");
2398
2399 // Erase the old instructions for the block.
2400 I->eraseFromParent();
2401 PrevI->eraseFromParent();
2402 Update->eraseFromParent();
2403
2404 return NextI;
2405}
2406
2407bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
2408 MachineInstr &MI,
2409 unsigned BaseReg, int Offset) {
2410 switch (MI.getOpcode()) {
2411 default:
2412 break;
2413 case AArch64::SUBXri:
2414 case AArch64::ADDXri:
2415 // Make sure it's a vanilla immediate operand, not a relocation or
2416 // anything else we can't handle.
2417 if (!MI.getOperand(i: 2).isImm())
2418 break;
2419 // Watch out for 1 << 12 shifted value.
2420 if (AArch64_AM::getShiftValue(Imm: MI.getOperand(i: 3).getImm()))
2421 break;
2422
2423 // The update instruction source and destination register must be the
2424 // same as the load/store base register.
2425 if (MI.getOperand(i: 0).getReg() != BaseReg ||
2426 MI.getOperand(i: 1).getReg() != BaseReg)
2427 break;
2428
2429 int UpdateOffset = MI.getOperand(i: 2).getImm();
2430 if (MI.getOpcode() == AArch64::SUBXri)
2431 UpdateOffset = -UpdateOffset;
2432
2433 // The immediate must be a multiple of the scaling factor of the pre/post
2434 // indexed instruction.
2435 int Scale, MinOffset, MaxOffset;
2436 getPrePostIndexedMemOpInfo(MI: MemMI, Scale, MinOffset, MaxOffset);
2437 if (UpdateOffset % Scale != 0)
2438 break;
2439
2440 // Scaled offset must fit in the instruction immediate.
2441 int ScaledOffset = UpdateOffset / Scale;
2442 if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
2443 break;
2444
2445 // If we have a non-zero Offset, we check that it matches the amount
2446 // we're adding to the register.
2447 if (!Offset || Offset == UpdateOffset)
2448 return true;
2449 break;
2450 }
2451 return false;
2452}
2453
2454bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
2455 MachineInstr &MI,
2456 unsigned IndexReg,
2457 unsigned &Offset) {
2458 // The update instruction source and destination register must be the
2459 // same as the load/store index register.
2460 if (MI.getOpcode() == AArch64::MOVKWi &&
2461 TRI->isSuperOrSubRegisterEq(RegA: IndexReg, RegB: MI.getOperand(i: 1).getReg())) {
2462
2463     // A movz + movk pair holds a large offset for a Ld/St instruction.
2464 MachineBasicBlock::iterator B = MI.getParent()->begin();
2465 MachineBasicBlock::iterator MBBI = &MI;
2466     // Bail out if MI is the first instruction of the block.
2467 if (MBBI == B)
2468 return false;
2469 MBBI = prev_nodbg(It: MBBI, Begin: B);
2470 MachineInstr &MovzMI = *MBBI;
2471 // Make sure the MOVKWi and MOVZWi set the same register.
2472 if (MovzMI.getOpcode() == AArch64::MOVZWi &&
2473 MovzMI.getOperand(i: 0).getReg() == MI.getOperand(i: 0).getReg()) {
2474 unsigned Low = MovzMI.getOperand(i: 1).getImm();
2475 unsigned High = MI.getOperand(i: 2).getImm() << MI.getOperand(i: 3).getImm();
2476 Offset = High + Low;
2477 // 12-bit optionally shifted immediates are legal for adds.
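      // For example, "movz w8, #0x345" + "movk w8, #0x12, lsl #16" yields
      // Offset == 0x120345, which passes the 24-bit check below and can later
      // be split into an add of #0x120, lsl #12 plus a 12-bit ld/st immediate.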
2478 return Offset >> 24 == 0;
2479 }
2480 }
2481 return false;
2482}
2483
2484MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
2485 MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
2486 MachineBasicBlock::iterator E = I->getParent()->end();
2487 MachineInstr &MemMI = *I;
2488 MachineBasicBlock::iterator MBBI = I;
2489
2490 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
2491 int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getImm() *
2492 TII->getMemScale(MI: MemMI);
2493
2494 // Scan forward looking for post-index opportunities. Updating instructions
2495 // can't be formed if the memory instruction doesn't have the offset we're
2496 // looking for.
2497 if (MIUnscaledOffset != UnscaledOffset)
2498 return E;
2499
2500 // If the base register overlaps a source/destination register, we can't
2501 // merge the update. This does not apply to tag store instructions which
2502 // ignore the address part of the source register.
2503   // Nor does this apply to STGPi, which, unlike normal stores, has no
2504   // unpredictable behavior in this case and always performs writeback
2505   // after reading the source register value.
2506 if (!isTagStore(MI: MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
2507 bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MI: MemMI);
2508 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
2509 Register DestReg = getLdStRegOp(MI&: MemMI, PairedRegOp: i).getReg();
2510 if (DestReg == BaseReg || TRI->isSubRegister(RegA: BaseReg, RegB: DestReg))
2511 return E;
2512 }
2513 }
2514
2515 // Track which register units have been modified and used between the first
2516 // insn (inclusive) and the second insn.
2517 ModifiedRegUnits.clear();
2518 UsedRegUnits.clear();
2519 MBBI = next_nodbg(It: MBBI, End: E);
2520
2521 // We can't post-increment the stack pointer if any instruction between
2522 // the memory access (I) and the increment (MBBI) can access the memory
2523 // region defined by [SP, MBBI].
2524 const bool BaseRegSP = BaseReg == AArch64::SP;
2525 if (BaseRegSP && needsWinCFI(MF: I->getMF())) {
2526     // FIXME: For now, we always block the optimization over SP on Windows
2527     // targets, as it would require adjusting the unwind/debug info; messing
2528     // up the unwind info can actually cause a miscompile.
2529 return E;
2530 }
2531
2532 for (unsigned Count = 0; MBBI != E && Count < Limit;
2533 MBBI = next_nodbg(It: MBBI, End: E)) {
2534 MachineInstr &MI = *MBBI;
2535
2536 // Don't count transient instructions towards the search limit since there
2537 // may be different numbers of them if e.g. debug information is present.
2538 if (!MI.isTransient())
2539 ++Count;
2540
2541 // If we found a match, return it.
2542 if (isMatchingUpdateInsn(MemMI&: *I, MI, BaseReg, Offset: UnscaledOffset))
2543 return MBBI;
2544
2545 // Update the status of what the instruction clobbered and used.
2546 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2547
2548 // Otherwise, if the base register is used or modified, we have no match, so
2549 // return early.
2550 // If we are optimizing SP, do not allow instructions that may load or store
2551 // in between the load and the optimized value update.
2552 if (!ModifiedRegUnits.available(Reg: BaseReg) ||
2553 !UsedRegUnits.available(Reg: BaseReg) ||
2554 (BaseRegSP && MBBI->mayLoadOrStore()))
2555 return E;
2556 }
2557 return E;
2558}
2559
2560MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
2561 MachineBasicBlock::iterator I, unsigned Limit, bool &MergeEither) {
2562 MachineBasicBlock::iterator B = I->getParent()->begin();
2563 MachineBasicBlock::iterator E = I->getParent()->end();
2564 MachineInstr &MemMI = *I;
2565 MachineBasicBlock::iterator MBBI = I;
2566 MachineFunction &MF = *MemMI.getMF();
2567
2568 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
2569 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getImm();
2570
2571 bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MI: MemMI);
2572 Register DestReg[] = {getLdStRegOp(MI&: MemMI, PairedRegOp: 0).getReg(),
2573 IsPairedInsn ? getLdStRegOp(MI&: MemMI, PairedRegOp: 1).getReg()
2574 : AArch64::NoRegister};
2575
2576 // If the load/store is the first instruction in the block, there's obviously
2577 // not any matching update. Ditto if the memory offset isn't zero.
2578 if (MBBI == B || Offset != 0)
2579 return E;
2580 // If the base register overlaps a destination register, we can't
2581 // merge the update.
2582 if (!isTagStore(MI: MemMI)) {
2583 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i)
2584 if (DestReg[i] == BaseReg || TRI->isSubRegister(RegA: BaseReg, RegB: DestReg[i]))
2585 return E;
2586 }
2587
2588 const bool BaseRegSP = BaseReg == AArch64::SP;
2589 if (BaseRegSP && needsWinCFI(MF: I->getMF())) {
2590     // FIXME: For now, we always block the optimization over SP on Windows
2591     // targets, as it would require adjusting the unwind/debug info; messing
2592     // up the unwind info can actually cause a miscompile.
2593 return E;
2594 }
2595
2596 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2597 unsigned RedZoneSize =
2598 Subtarget.getTargetLowering()->getRedZoneSize(F: MF.getFunction());
2599
2600 // Track which register units have been modified and used between the first
2601 // insn (inclusive) and the second insn.
2602 ModifiedRegUnits.clear();
2603 UsedRegUnits.clear();
2604 unsigned Count = 0;
2605 bool MemAccessBeforeSPPreInc = false;
2606 MergeEither = true;
2607 do {
2608 MBBI = prev_nodbg(It: MBBI, Begin: B);
2609 MachineInstr &MI = *MBBI;
2610
2611 // Don't count transient instructions towards the search limit since there
2612 // may be different numbers of them if e.g. debug information is present.
2613 if (!MI.isTransient())
2614 ++Count;
2615
2616 // If we found a match, return it.
2617 if (isMatchingUpdateInsn(MemMI&: *I, MI, BaseReg, Offset)) {
2618 // Check that the update value is within our red zone limit (which may be
2619 // zero).
2620 if (MemAccessBeforeSPPreInc && MBBI->getOperand(i: 2).getImm() > RedZoneSize)
2621 return E;
2622 return MBBI;
2623 }
2624
2625 // Update the status of what the instruction clobbered and used.
2626 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2627
2628 // Otherwise, if the base register is used or modified, we have no match, so
2629 // return early.
2630 if (!ModifiedRegUnits.available(Reg: BaseReg) ||
2631 !UsedRegUnits.available(Reg: BaseReg))
2632 return E;
2633
2634 // If we have a destination register (i.e. a load instruction) and a
2635 // destination register is used or modified, then we can only merge forward,
2636 // i.e. the combined instruction is put in the place of the memory
2637     // instruction. The same applies if we see a memory access or side effects.
2638 if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects() ||
2639 (DestReg[0] != AArch64::NoRegister &&
2640 !(ModifiedRegUnits.available(Reg: DestReg[0]) &&
2641 UsedRegUnits.available(Reg: DestReg[0]))) ||
2642 (DestReg[1] != AArch64::NoRegister &&
2643 !(ModifiedRegUnits.available(Reg: DestReg[1]) &&
2644 UsedRegUnits.available(Reg: DestReg[1]))))
2645 MergeEither = false;
2646
2647     // Keep track of whether we have a memory access before an SP pre-increment;
2648     // in that case we need to validate later that the update amount respects
2649     // the red zone.
2650 if (BaseRegSP && MBBI->mayLoadOrStore())
2651 MemAccessBeforeSPPreInc = true;
2652 } while (MBBI != B && Count < Limit);
2653 return E;
2654}
2655
2656MachineBasicBlock::iterator
2657AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
2658 MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
2659 MachineBasicBlock::iterator B = I->getParent()->begin();
2660 MachineBasicBlock::iterator E = I->getParent()->end();
2661 MachineInstr &MemMI = *I;
2662 MachineBasicBlock::iterator MBBI = I;
2663
2664 // If the load is the first instruction in the block, there's obviously
2665 // not any matching load or store.
2666 if (MBBI == B)
2667 return E;
2668
2669 // Make sure the IndexReg is killed and the shift amount is zero.
2670   // TODO: Relax this restriction to also handle extends; kept simple for now.
2671 if (!AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).isKill() ||
2672 !AArch64InstrInfo::getLdStAmountOp(MI: MemMI).isImm() ||
2673 (AArch64InstrInfo::getLdStAmountOp(MI: MemMI).getImm() != 0))
2674 return E;
2675
2676 Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getReg();
2677
2678 // Track which register units have been modified and used between the first
2679 // insn (inclusive) and the second insn.
2680 ModifiedRegUnits.clear();
2681 UsedRegUnits.clear();
2682 unsigned Count = 0;
2683 do {
2684 MBBI = prev_nodbg(It: MBBI, Begin: B);
2685 MachineInstr &MI = *MBBI;
2686
2687 // Don't count transient instructions towards the search limit since there
2688 // may be different numbers of them if e.g. debug information is present.
2689 if (!MI.isTransient())
2690 ++Count;
2691
2692 // If we found a match, return it.
2693 if (isMatchingMovConstInsn(MemMI&: *I, MI, IndexReg, Offset)) {
2694 return MBBI;
2695 }
2696
2697 // Update the status of what the instruction clobbered and used.
2698 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2699
2700 // Otherwise, if the index register is used or modified, we have no match,
2701 // so return early.
2702 if (!ModifiedRegUnits.available(Reg: IndexReg) ||
2703 !UsedRegUnits.available(Reg: IndexReg))
2704 return E;
2705
2706 } while (MBBI != B && Count < Limit);
2707 return E;
2708}
2709
2710bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
2711 MachineBasicBlock::iterator &MBBI) {
2712 MachineInstr &MI = *MBBI;
2713 // If this is a volatile load, don't mess with it.
2714 if (MI.hasOrderedMemoryRef())
2715 return false;
2716
2717 if (needsWinCFI(MF: MI.getMF()) && MI.getFlag(Flag: MachineInstr::FrameDestroy))
2718 return false;
2719
2720 // Make sure this is a reg+imm.
2721 // FIXME: It is possible to extend it to handle reg+reg cases.
2722 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
2723 return false;
2724
2725 // Look backward up to LdStLimit instructions.
2726 MachineBasicBlock::iterator StoreI;
2727 if (findMatchingStore(I: MBBI, Limit: LdStLimit, StoreI)) {
2728 ++NumLoadsFromStoresPromoted;
2729 // Promote the load. Keeping the iterator straight is a
2730 // pain, so we let the merge routine tell us what the next instruction
2731 // is after it's done mucking about.
2732 MBBI = promoteLoadFromStore(LoadI: MBBI, StoreI);
2733 return true;
2734 }
2735 return false;
2736}
2737
2738// Merge adjacent zero stores into a wider store.
2739bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
2740 MachineBasicBlock::iterator &MBBI) {
2741 assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
2742 MachineInstr &MI = *MBBI;
2743 MachineBasicBlock::iterator E = MI.getParent()->end();
2744
2745 if (!TII->isCandidateToMergeOrPair(MI))
2746 return false;
2747
2748 // Look ahead up to LdStLimit instructions for a mergeable instruction.
2749 LdStPairFlags Flags;
2750 MachineBasicBlock::iterator MergeMI =
2751 findMatchingInsn(I: MBBI, Flags, Limit: LdStLimit, /* FindNarrowMerge = */ true);
2752 if (MergeMI != E) {
2753 ++NumZeroStoresPromoted;
2754
2755 // Keeping the iterator straight is a pain, so we let the merge routine tell
2756 // us what the next instruction is after it's done mucking about.
2757 MBBI = mergeNarrowZeroStores(I: MBBI, MergeMI, Flags);
2758 return true;
2759 }
2760 return false;
2761}
2762
2763// Find loads and stores that can be merged into a single load or store pair
2764// instruction.
2765bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
2766 MachineInstr &MI = *MBBI;
2767 MachineBasicBlock::iterator E = MI.getParent()->end();
2768
2769 if (!TII->isCandidateToMergeOrPair(MI))
2770 return false;
2771
2772   // If the disable-ldp feature is set, do not emit ldp.
2773 if (MI.mayLoad() && Subtarget->hasDisableLdp())
2774 return false;
2775
2776   // If the disable-stp feature is set, do not emit stp.
2777 if (MI.mayStore() && Subtarget->hasDisableStp())
2778 return false;
2779
2780 // Early exit if the offset is not possible to match. (6 bits of positive
2781 // range, plus allow an extra one in case we find a later insn that matches
2782 // with Offset-1)
2783 bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
2784 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
2785 int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
2786 // Allow one more for offset.
2787 if (Offset > 0)
2788 Offset -= OffsetStride;
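  // E.g. a scaled access at offset #64 cannot be paired at that offset, but it
  // can still pair with a neighboring access at offset #63, so only reject
  // offsets for which even Offset - OffsetStride is out of pairing range.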
2789 if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
2790 return false;
2791
2792 // Look ahead up to LdStLimit instructions for a pairable instruction.
2793 LdStPairFlags Flags;
2794 MachineBasicBlock::iterator Paired =
2795 findMatchingInsn(I: MBBI, Flags, Limit: LdStLimit, /* FindNarrowMerge = */ false);
2796 if (Paired != E) {
2797 // Keeping the iterator straight is a pain, so we let the merge routine tell
2798 // us what the next instruction is after it's done mucking about.
2799 auto Prev = std::prev(x: MBBI);
2800
2801 // Fetch the memoperand of the load/store that is a candidate for
2802 // combination.
2803 MachineMemOperand *MemOp =
2804 MI.memoperands_empty() ? nullptr : MI.memoperands().front();
2805
2806     // If a load/store arrives and the ldp/stp-aligned-only feature is set, check
2807 // that the alignment of the source pointer is at least double the alignment
2808 // of the type.
2809 if ((MI.mayLoad() && Subtarget->hasLdpAlignedOnly()) ||
2810 (MI.mayStore() && Subtarget->hasStpAlignedOnly())) {
2811 // If there is no size/align information, cancel the transformation.
2812 if (!MemOp || !MemOp->getMemoryType().isValid()) {
2813 NumFailedAlignmentCheck++;
2814 return false;
2815 }
2816
2817 // Get the needed alignments to check them if
2818       // ldp-aligned-only/stp-aligned-only features are set.
2819 uint64_t MemAlignment = MemOp->getAlign().value();
2820 uint64_t TypeAlignment =
2821 Align(MemOp->getSize().getValue().getKnownMinValue()).value();
2822
2823 if (MemAlignment < 2 * TypeAlignment) {
2824 NumFailedAlignmentCheck++;
2825 return false;
2826 }
2827 }
2828
2829 ++NumPairCreated;
2830 if (TII->hasUnscaledLdStOffset(MI))
2831 ++NumUnscaledPairCreated;
2832
2833 MBBI = mergePairedInsns(I: MBBI, Paired, Flags);
2834 // Collect liveness info for instructions between Prev and the new position
2835 // MBBI.
2836 for (auto I = std::next(x: Prev); I != MBBI; I++)
2837 updateDefinedRegisters(MI&: *I, Units&: DefinedInBB, TRI);
2838
2839 return true;
2840 }
2841 return false;
2842}
2843
2844bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
2845 (MachineBasicBlock::iterator &MBBI) {
2846 MachineInstr &MI = *MBBI;
2847 MachineBasicBlock::iterator E = MI.getParent()->end();
2848 MachineBasicBlock::iterator Update;
2849
2850 // Look forward to try to form a post-index instruction. For example,
2851 // ldr x0, [x20]
2852 // add x20, x20, #32
2853 // merged into:
2854 // ldr x0, [x20], #32
2855 Update = findMatchingUpdateInsnForward(I: MBBI, UnscaledOffset: 0, Limit: UpdateLimit);
2856 if (Update != E) {
2857 // Merge the update into the ld/st.
2858 if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /*IsForward=*/false,
2859 /*IsPreIdx=*/false,
2860 /*MergeEither=*/false)) {
2861 MBBI = *NextI;
2862 return true;
2863 }
2864 }
2865
2866 // Don't know how to handle unscaled pre/post-index versions below, so bail.
2867 if (TII->hasUnscaledLdStOffset(Opc: MI.getOpcode()))
2868 return false;
2869
2870 // Look back to try to find a pre-index instruction. For example,
2871 // add x0, x0, #8
2872 // ldr x1, [x0]
2873 // merged into:
2874 // ldr x1, [x0, #8]!
2875 bool MergeEither;
2876 Update = findMatchingUpdateInsnBackward(I: MBBI, Limit: UpdateLimit, MergeEither);
2877 if (Update != E) {
2878 // Merge the update into the ld/st.
2879 if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /*IsForward=*/true,
2880 /*IsPreIdx=*/true, MergeEither)) {
2881 MBBI = *NextI;
2882 return true;
2883 }
2884 }
2885
2886 // The immediate in the load/store is scaled by the size of the memory
2887 // operation. The immediate in the add we're looking for,
2888 // however, is not, so adjust here.
2889 int UnscaledOffset =
2890 AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
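  // For example, "ldr x1, [x0, #64]" stores the scaled immediate 8, so
  // UnscaledOffset == 64 here, matching the byte amount in the add below.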
2891
2892 // Look forward to try to find a pre-index instruction. For example,
2893 // ldr x1, [x0, #64]
2894 // add x0, x0, #64
2895 // merged into:
2896 // ldr x1, [x0, #64]!
2897 Update = findMatchingUpdateInsnForward(I: MBBI, UnscaledOffset, Limit: UpdateLimit);
2898 if (Update != E) {
2899 // Merge the update into the ld/st.
2900 if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /*IsForward=*/false,
2901 /*IsPreIdx=*/true,
2902 /*MergeEither=*/false)) {
2903 MBBI = *NextI;
2904 return true;
2905 }
2906 }
2907
2908 return false;
2909}
2910
2911bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
2912 int Scale) {
2913 MachineInstr &MI = *MBBI;
2914 MachineBasicBlock::iterator E = MI.getParent()->end();
2915 MachineBasicBlock::iterator Update;
2916
2917 // Don't know how to handle unscaled pre/post-index versions below, so bail.
2918 if (TII->hasUnscaledLdStOffset(Opc: MI.getOpcode()))
2919 return false;
2920
2921 // Look back to try to find a const offset for index LdSt instruction. For
2922 // example,
2923 // mov x8, #LargeImm ; = a * (1<<12) + imm12
2924 // ldr x1, [x0, x8]
2925 // merged into:
2926 // add x8, x0, a * (1<<12)
2927 // ldr x1, [x8, imm12]
2928 unsigned Offset;
2929 Update = findMatchingConstOffsetBackward(I: MBBI, Limit: LdStConstLimit, Offset);
2930 if (Update != E && (Offset & (Scale - 1)) == 0) {
2931 // Merge the imm12 into the ld/st.
2932 MBBI = mergeConstOffsetInsn(I: MBBI, Update, Offset, Scale);
2933 return true;
2934 }
2935
2936 return false;
2937}
2938
2939bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
2940 bool EnableNarrowZeroStOpt) {
2941 AArch64FunctionInfo &AFI = *MBB.getParent()->getInfo<AArch64FunctionInfo>();
2942
2943 bool Modified = false;
2944   // Five transformations to do here:
2945 // 1) Find loads that directly read from stores and promote them by
2946 // replacing with mov instructions. If the store is wider than the load,
2947 // the load will be replaced with a bitfield extract.
2948 // e.g.,
2949 // str w1, [x0, #4]
2950 // ldrh w2, [x0, #6]
2951 // ; becomes
2952 // str w1, [x0, #4]
2953 // lsr w2, w1, #16
2954 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2955 MBBI != E;) {
2956 if (isPromotableLoadFromStore(MI&: *MBBI) && tryToPromoteLoadFromStore(MBBI))
2957 Modified = true;
2958 else
2959 ++MBBI;
2960 }
2961 // 2) Merge adjacent zero stores into a wider store.
2962 // e.g.,
2963 // strh wzr, [x0]
2964 // strh wzr, [x0, #2]
2965 // ; becomes
2966 // str wzr, [x0]
2967 // e.g.,
2968 // str wzr, [x0]
2969 // str wzr, [x0, #4]
2970 // ; becomes
2971 // str xzr, [x0]
2972 if (EnableNarrowZeroStOpt)
2973 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2974 MBBI != E;) {
2975 if (isPromotableZeroStoreInst(MI&: *MBBI) && tryToMergeZeroStInst(MBBI))
2976 Modified = true;
2977 else
2978 ++MBBI;
2979 }
2980 // 3) Find loads and stores that can be merged into a single load or store
2981 // pair instruction.
2982 // When compiling for SVE 128, also try to combine SVE fill/spill
2983 // instructions into LDP/STP.
2984 // e.g.,
2985 // ldr x0, [x2]
2986 // ldr x1, [x2, #8]
2987 // ; becomes
2988 // ldp x0, x1, [x2]
2989 // e.g.,
2990 // ldr z0, [x2]
2991 // ldr z1, [x2, #1, mul vl]
2992 // ; becomes
2993 // ldp q0, q1, [x2]
2994
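  // Seed the per-block register tracker with this block's live-ins so that the
  // rename-register search below starts from an accurate picture of which
  // registers are already live.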
  if (MBB.getParent()->getRegInfo().tracksLiveness()) {
    DefinedInBB.clear();
    DefinedInBB.addLiveIns(MBB);
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    // Track currently live registers up to this point, to help with
    // searching for a rename register on demand.
    updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
    if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 4) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isMergeableLdStUpdate(*MBBI, AFI) && tryToMergeLdStUpdate(MBBI))
      Modified = true;
    else
      ++MBBI;
  }

  // 5) Find a register assigned with a const value that can be folded into
  //    the load or store.
  //      e.g.,
  //        mov x8, #LargeImm ; = a * (1<<12) + imm12
  //        ldr x1, [x0, x8]
  //        ; becomes
  //        add x8, x0, a * (1<<12)
  //        ldr x1, [x8, imm12]
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    int Scale;
    if (isMergeableIndexLdSt(*MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale))
      Modified = true;
    else
      ++MBBI;
  }

  return Modified;
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  if (skipFunction(Fn.getFunction()))
    return false;

  Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
  TRI = Subtarget->getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  // Resize the modified and used register unit trackers. We do this once per
  // function and then clear the register units each time we optimize a load
  // or store.
  ModifiedRegUnits.init(*TRI);
  UsedRegUnits.init(*TRI);
  DefinedInBB.init(*TRI);

  bool Modified = false;
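  // Widening narrow zero stores may produce accesses that are no longer
  // naturally aligned, so only enable that transformation when the target
  // does not require strict alignment.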
  bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
  for (auto &MBB : Fn) {
    auto M = optimizeBlock(MBB, enableNarrowZeroStOpt);
    Modified |= M;
  }

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads
// and stores near one another? Note: The pre-RA instruction scheduler already
// has hooks to try and schedule pairable loads/stores together to improve
// pairing opportunities. Thus, a pre-RA pairing pass may not be worth the
// effort.

// FIXME: When pairing store instructions it's very possible for this pass to
// hoist a store with a KILL marker above another use (without a KILL marker).
// The resulting IR is invalid, but nothing uses the KILL markers after this
// pass, so it's never caused a problem in practice.

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}