1//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains a pass that performs load / store related peephole
10// optimizations. This pass should be run after register allocation.
11//
12// The pass runs after the PrologEpilogInserter where we emit the CFI
13// instructions. In order to preserve the correctness of the unwind information,
14// the pass should not change the order of any two instructions, one of which
// has the FrameSetup/FrameDestroy flag or, alternatively, apply an ad-hoc fix
16// to unwind information.
17//
18//===----------------------------------------------------------------------===//
19
20#include "AArch64InstrInfo.h"
21#include "AArch64MachineFunctionInfo.h"
22#include "AArch64Subtarget.h"
23#include "MCTargetDesc/AArch64AddressingModes.h"
24#include "llvm/ADT/SmallVector.h"
25#include "llvm/ADT/Statistic.h"
26#include "llvm/ADT/StringRef.h"
27#include "llvm/ADT/iterator_range.h"
28#include "llvm/Analysis/AliasAnalysis.h"
29#include "llvm/CodeGen/MachineBasicBlock.h"
30#include "llvm/CodeGen/MachineFunction.h"
31#include "llvm/CodeGen/MachineFunctionPass.h"
32#include "llvm/CodeGen/MachineInstr.h"
33#include "llvm/CodeGen/MachineInstrBuilder.h"
34#include "llvm/CodeGen/MachineOperand.h"
35#include "llvm/CodeGen/MachineRegisterInfo.h"
36#include "llvm/CodeGen/TargetRegisterInfo.h"
37#include "llvm/IR/DebugLoc.h"
38#include "llvm/MC/MCAsmInfo.h"
39#include "llvm/MC/MCDwarf.h"
40#include "llvm/Pass.h"
41#include "llvm/Support/CommandLine.h"
42#include "llvm/Support/Debug.h"
43#include "llvm/Support/DebugCounter.h"
44#include "llvm/Support/ErrorHandling.h"
45#include <cassert>
46#include <cstdint>
47#include <functional>
48#include <iterator>
49#include <limits>
50#include <optional>
51
52using namespace llvm;
53
54#define DEBUG_TYPE "aarch64-ldst-opt"
55
56STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
57STATISTIC(NumPostFolded, "Number of post-index updates folded");
58STATISTIC(NumPreFolded, "Number of pre-index updates folded");
59STATISTIC(NumUnscaledPairCreated,
60 "Number of load/store from unscaled generated");
61STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
62STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
63STATISTIC(NumFailedAlignmentCheck, "Number of load/store pair transformation "
64 "not passed the alignment check");
65STATISTIC(NumConstOffsetFolded,
66 "Number of const offset of index address folded");
67
68DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
69 "Controls which pairs are considered for renaming");
70
71// The LdStLimit limits how far we search for load/store pairs.
72static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
73 cl::init(Val: 20), cl::Hidden);
74
75// The UpdateLimit limits how far we search for update instructions when we form
76// pre-/post-index instructions.
77static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(Val: 100),
78 cl::Hidden);
79
80// The LdStConstLimit limits how far we search for const offset instructions
81// when we form index address load/store instructions.
82static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
83 cl::init(Val: 10), cl::Hidden);
84
85// Enable register renaming to find additional store pairing opportunities.
86static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
87 cl::init(Val: true), cl::Hidden);
88
89#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
90
91namespace {
92
// Bundle of flags communicated from findMatchingInsn to the merge routines,
// describing how a discovered pair of load/store instructions should be
// combined.
using LdStPairFlags = struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  bool MergeForward = false;

  // SExtIdx gives the index of the result of the load pair that must be
  // extended. The value of SExtIdx assumes that the paired load produces the
  // value in this order: (I, returned iterator), i.e., -1 means no value has
  // to be extended, 0 means I, and 1 means the returned iterator.
  int SExtIdx = -1;

  // If not none, RenameReg can be used to rename the result register of the
  // first store in a pair. Currently this only works when merging stores
  // forward.
  std::optional<MCPhysReg> RenameReg;

  LdStPairFlags() = default;

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }

  void setRenameReg(MCPhysReg R) { RenameReg = R; }
  void clearRenameReg() { RenameReg = std::nullopt; }
  std::optional<MCPhysReg> getRenameReg() const { return RenameReg; }
};
122
// The machine-function pass that performs the load/store peephole
// optimizations described in the file header. Runs post-RA, so it operates
// purely on physical registers.
struct AArch64LoadStoreOpt : public MachineFunctionPass {
  // Pass identification, replacement for typeid.
  static char ID;

  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {}

  // Analyses and target hooks, cached for the duration of a function run.
  AliasAnalysis *AA;
  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const AArch64Subtarget *Subtarget;

  // Track which register units have been modified and used.
  LiveRegUnits ModifiedRegUnits, UsedRegUnits;
  // Register units defined within the current basic block (presumably
  // consulted by the register-renaming logic; the uses are outside this
  // chunk — confirm).
  LiveRegUnits DefinedInBB;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit,
                                               bool FindNarrowMerge);

  // Scan the instructions looking for a store that writes to the address from
  // which the current load instruction reads. Return true if one is found.
  bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
                         MachineBasicBlock::iterator &StoreI);

  // Merge two adjacent narrow zero stores into one store of twice the width.
  MachineBasicBlock::iterator
  mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                        MachineBasicBlock::iterator MergeMI,
                        const LdStPairFlags &Flags);

  // Merge the two instructions indicated into a single pair-wise instruction.
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Promote the load that reads directly from the address stored to.
  MachineBasicBlock::iterator
  promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                       MachineBasicBlock::iterator StoreI);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                int UnscaledOffset, unsigned Limit);

  // Scan the instruction list to find a register assigned with a const
  // value that can be combined with the current instruction (a load or store)
  // using base addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
                                  unsigned &Offset);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  // `MergeEither` is set to true if the combined instruction may be placed
  // either at the location of the load/store instruction or at the location of
  // the update instruction.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit,
                                 bool &MergeEither);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                            unsigned BaseReg, int Offset);

  // Check whether MI is a constant move whose value can be folded as an
  // immediate offset (returned in Offset) into a ld/st that indexes with
  // IndexReg.
  bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
                              unsigned IndexReg, unsigned &Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  std::optional<MachineBasicBlock::iterator>
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsForward,
                  bool IsPreIdx, bool MergeEither);

  // Fold a constant offset (from Update) into the reg+reg ld/st at I,
  // rewriting it to the base-address (reg+imm) form.
  MachineBasicBlock::iterator
  mergeConstOffsetInsn(MachineBasicBlock::iterator I,
                       MachineBasicBlock::iterator Update, unsigned Offset,
                       int Scale);

  // Find and merge zero store instructions.
  bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);

  // Find and pair ldr/str instructions.
  bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);

  // Find and promote load instructions which read directly from store.
  bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);

  // Find and merge a base register updates before or after a ld/st instruction.
  bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);

  // Find and merge an index ldr/st instruction into a base ld/st instruction.
  bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);

  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  // This pass runs after register allocation, so virtual registers must be
  // gone.
  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().setNoVRegs();
  }

  StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
};
240
// Out-of-class definition of the pass-identification anchor.
char AArch64LoadStoreOpt::ID = 0;
242
243} // end anonymous namespace
244
// Register the pass with the legacy pass manager under -aarch64-ldst-opt.
INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)
247
248static bool isNarrowStore(unsigned Opc) {
249 switch (Opc) {
250 default:
251 return false;
252 case AArch64::STRBBui:
253 case AArch64::STURBBi:
254 case AArch64::STRHHui:
255 case AArch64::STURHHi:
256 return true;
257 }
258}
259
260// These instruction set memory tag and either keep memory contents unchanged or
261// set it to zero, ignoring the address part of the source register.
262static bool isTagStore(const MachineInstr &MI) {
263 switch (MI.getOpcode()) {
264 default:
265 return false;
266 case AArch64::STGi:
267 case AArch64::STZGi:
268 case AArch64::ST2Gi:
269 case AArch64::STZ2Gi:
270 return true;
271 }
272}
273
// Maps \p Opc to the equivalent opcode without sign extension: the
// sign-extending word loads (LDRSW*) map to the plain word loads, and every
// other supported load/store opcode maps to itself. If \p IsValidLdStrOpc is
// non-null, it is set to false when \p Opc is not a load/store opcode this
// pass understands.
static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    // Sentinel return for "not a load/store we handle".
    return std::numeric_limits<unsigned>::max();
  // These opcodes are already in non-sign-extending form.
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRDpre:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRQpre:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
  case AArch64::STRWui:
  case AArch64::STRWpre:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STRXpre:
  case AArch64::STURXi:
  case AArch64::STR_ZXI:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRDpre:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRQpre:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRWpre:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::LDRXpre:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRSpre:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSpre:
  case AArch64::LDR_ZXI:
    return Opc;
  // Sign-extending word loads map to the corresponding plain word loads.
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  case AArch64::LDRSWpre:
    return AArch64::LDRWpre;
  }
}
328
// Maps a store opcode to the store opcode of twice the width, used when
// merging two adjacent narrow zero stores into one wider store.
static unsigned getMatchingWideOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no wide equivalent!");
  case AArch64::STRBBui:
    return AArch64::STRHHui;
  case AArch64::STRHHui:
    return AArch64::STRWui;
  case AArch64::STURBBi:
    return AArch64::STURHHi;
  case AArch64::STURHHi:
    return AArch64::STURWi;
  case AArch64::STURWi:
    return AArch64::STURXi;
  case AArch64::STRWui:
    return AArch64::STRXui;
  }
}
347
// Maps a single load/store opcode (scaled, unscaled, or pre-indexed form) to
// the corresponding load/store-pair opcode used when two such accesses are
// merged.
static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRSpre:
    return AArch64::STPSpre;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRDpre:
    return AArch64::STPDpre;
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STR_ZXI:
    return AArch64::STPQi;
  case AArch64::STRQpre:
    return AArch64::STPQpre;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRWpre:
    return AArch64::STPWpre;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::STRXpre:
    return AArch64::STPXpre;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRSpre:
    return AArch64::LDPSpre;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRDpre:
    return AArch64::LDPDpre;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDR_ZXI:
    return AArch64::LDPQi;
  case AArch64::LDRQpre:
    return AArch64::LDPQpre;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRWpre:
    return AArch64::LDPWpre;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRXpre:
    return AArch64::LDPXpre;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  case AArch64::LDRSWpre:
    return AArch64::LDPSWpre;
  }
}
411
412static unsigned isMatchingStore(MachineInstr &LoadInst,
413 MachineInstr &StoreInst) {
414 unsigned LdOpc = LoadInst.getOpcode();
415 unsigned StOpc = StoreInst.getOpcode();
416 switch (LdOpc) {
417 default:
418 llvm_unreachable("Unsupported load instruction!");
419 case AArch64::LDRBBui:
420 return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
421 StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
422 case AArch64::LDURBBi:
423 return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
424 StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
425 case AArch64::LDRHHui:
426 return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
427 StOpc == AArch64::STRXui;
428 case AArch64::LDURHHi:
429 return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
430 StOpc == AArch64::STURXi;
431 case AArch64::LDRWui:
432 return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
433 case AArch64::LDURWi:
434 return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
435 case AArch64::LDRXui:
436 return StOpc == AArch64::STRXui;
437 case AArch64::LDURXi:
438 return StOpc == AArch64::STURXi;
439 }
440}
441
// Maps a scaled ("unsigned offset") load/store or pair opcode to its
// pre-indexed (writeback) equivalent.
static unsigned getPreIndexedOpcode(unsigned Opc) {
  // FIXME: We don't currently support creating pre-indexed loads/stores when
  // the load or store is the unscaled version. If we decide to perform such an
  // optimization in the future the cases for the unscaled loads/stores will
  // need to be added here.
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRBui:
    return AArch64::STRBpre;
  case AArch64::STRHui:
    return AArch64::STRHpre;
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRBBui:
    return AArch64::STRBBpre;
  case AArch64::STRHHui:
    return AArch64::STRHHpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRBui:
    return AArch64::LDRBpre;
  case AArch64::LDRHui:
    return AArch64::LDRHpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpre;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  case AArch64::LDPSi:
    return AArch64::LDPSpre;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpre;
  case AArch64::LDPDi:
    return AArch64::LDPDpre;
  case AArch64::LDPQi:
    return AArch64::LDPQpre;
  case AArch64::LDPWi:
    return AArch64::LDPWpre;
  case AArch64::LDPXi:
    return AArch64::LDPXpre;
  case AArch64::STPSi:
    return AArch64::STPSpre;
  case AArch64::STPDi:
    return AArch64::STPDpre;
  case AArch64::STPQi:
    return AArch64::STPQpre;
  case AArch64::STPWi:
    return AArch64::STPWpre;
  case AArch64::STPXi:
    return AArch64::STPXpre;
  case AArch64::STGi:
    return AArch64::STGPreIndex;
  case AArch64::STZGi:
    return AArch64::STZGPreIndex;
  case AArch64::ST2Gi:
    return AArch64::ST2GPreIndex;
  case AArch64::STZ2Gi:
    return AArch64::STZ2GPreIndex;
  case AArch64::STGPi:
    return AArch64::STGPpre;
  }
}
522
// Maps a register-offset (reg+reg, "roX") load opcode to the equivalent
// unsigned-immediate-offset ("ui") opcode, used when folding a constant index
// into the immediate field.
static unsigned getBaseAddressOpcode(unsigned Opc) {
  // TODO: Add more index address stores.
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no base address equivalent!");
  case AArch64::LDRBroX:
    return AArch64::LDRBui;
  case AArch64::LDRBBroX:
    return AArch64::LDRBBui;
  case AArch64::LDRSBXroX:
    return AArch64::LDRSBXui;
  case AArch64::LDRSBWroX:
    return AArch64::LDRSBWui;
  case AArch64::LDRHroX:
    return AArch64::LDRHui;
  case AArch64::LDRHHroX:
    return AArch64::LDRHHui;
  case AArch64::LDRSHXroX:
    return AArch64::LDRSHXui;
  case AArch64::LDRSHWroX:
    return AArch64::LDRSHWui;
  case AArch64::LDRWroX:
    return AArch64::LDRWui;
  case AArch64::LDRSroX:
    return AArch64::LDRSui;
  case AArch64::LDRSWroX:
    return AArch64::LDRSWui;
  case AArch64::LDRDroX:
    return AArch64::LDRDui;
  case AArch64::LDRXroX:
    return AArch64::LDRXui;
  case AArch64::LDRQroX:
    return AArch64::LDRQui;
  }
}
558
// Maps a load/store or pair opcode (scaled or unscaled form) to its
// post-indexed (writeback) equivalent. Unlike getPreIndexedOpcode, unscaled
// forms are handled here because a post-index update does not change the
// address used by the access itself.
static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed wise equivalent!");
  case AArch64::STRBui:
    return AArch64::STRBpost;
  case AArch64::STRHui:
    return AArch64::STRHpost;
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STRSpost;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STRDpost;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STRQpost;
  case AArch64::STRBBui:
    return AArch64::STRBBpost;
  case AArch64::STRHHui:
    return AArch64::STRHHpost;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STRWpost;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STRXpost;
  case AArch64::LDRBui:
    return AArch64::LDRBpost;
  case AArch64::LDRHui:
    return AArch64::LDRHpost;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDRQpost;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpost;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpost;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  case AArch64::LDPSi:
    return AArch64::LDPSpost;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpost;
  case AArch64::LDPDi:
    return AArch64::LDPDpost;
  case AArch64::LDPQi:
    return AArch64::LDPQpost;
  case AArch64::LDPWi:
    return AArch64::LDPWpost;
  case AArch64::LDPXi:
    return AArch64::LDPXpost;
  case AArch64::STPSi:
    return AArch64::STPSpost;
  case AArch64::STPDi:
    return AArch64::STPDpost;
  case AArch64::STPQi:
    return AArch64::STPQpost;
  case AArch64::STPWi:
    return AArch64::STPWpost;
  case AArch64::STPXi:
    return AArch64::STPXpost;
  case AArch64::STGi:
    return AArch64::STGPostIndex;
  case AArch64::STZGi:
    return AArch64::STZGPostIndex;
  case AArch64::ST2Gi:
    return AArch64::ST2GPostIndex;
  case AArch64::STZ2Gi:
    return AArch64::STZ2GPostIndex;
  case AArch64::STGPi:
    return AArch64::STGPpost;
  }
}
645
// Returns true if \p FirstMI (a pre-indexed load/store) and \p MI (a plain
// scaled or unscaled load/store of the same width and direction) are of
// compatible opcodes for forming a pre-indexed pair.
static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {

  unsigned OpcA = FirstMI.getOpcode();
  unsigned OpcB = MI.getOpcode();

  switch (OpcA) {
  default:
    return false;
  case AArch64::STRSpre:
    return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
  case AArch64::STRDpre:
    return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
  case AArch64::STRQpre:
    return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
  case AArch64::STRWpre:
    return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
  case AArch64::STRXpre:
    return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
  case AArch64::LDRSpre:
    return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
  case AArch64::LDRDpre:
    return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
  case AArch64::LDRQpre:
    return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
  case AArch64::LDRWpre:
    return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
  case AArch64::LDRXpre:
    return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
  case AArch64::LDRSWpre:
    return (OpcB == AArch64::LDRSWui) || (OpcB == AArch64::LDURSWi);
  }
}
678
679// Returns the scale and offset range of pre/post indexed variants of MI.
680static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
681 int &MinOffset, int &MaxOffset) {
682 bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
683 bool IsTagStore = isTagStore(MI);
684 // ST*G and all paired ldst have the same scale in pre/post-indexed variants
685 // as in the "unsigned offset" variant.
686 // All other pre/post indexed ldst instructions are unscaled.
687 Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
688
689 if (IsPaired) {
690 MinOffset = -64;
691 MaxOffset = 63;
692 } else {
693 MinOffset = -256;
694 MaxOffset = 255;
695 }
696}
697
698static MachineOperand &getLdStRegOp(MachineInstr &MI,
699 unsigned PairedRegOp = 0) {
700 assert(PairedRegOp < 2 && "Unexpected register operand idx.");
701 bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
702 if (IsPreLdSt)
703 PairedRegOp += 1;
704 unsigned Idx =
705 AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
706 return MI.getOperand(i: Idx);
707}
708
709static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
710 MachineInstr &StoreInst,
711 const AArch64InstrInfo *TII) {
712 assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
713 int LoadSize = TII->getMemScale(MI: LoadInst);
714 int StoreSize = TII->getMemScale(MI: StoreInst);
715 int UnscaledStOffset =
716 TII->hasUnscaledLdStOffset(MI&: StoreInst)
717 ? AArch64InstrInfo::getLdStOffsetOp(MI: StoreInst).getImm()
718 : AArch64InstrInfo::getLdStOffsetOp(MI: StoreInst).getImm() * StoreSize;
719 int UnscaledLdOffset =
720 TII->hasUnscaledLdStOffset(MI&: LoadInst)
721 ? AArch64InstrInfo::getLdStOffsetOp(MI: LoadInst).getImm()
722 : AArch64InstrInfo::getLdStOffsetOp(MI: LoadInst).getImm() * LoadSize;
723 return (UnscaledStOffset <= UnscaledLdOffset) &&
724 (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
725}
726
727static bool isPromotableZeroStoreInst(MachineInstr &MI) {
728 unsigned Opc = MI.getOpcode();
729 return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
730 isNarrowStore(Opc)) &&
731 getLdStRegOp(MI).getReg() == AArch64::WZR;
732}
733
// Returns true for the integer load opcodes whose value may be forwarded
// directly from an earlier store to the same address (see
// promoteLoadFromStore).
static bool isPromotableLoadFromStore(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  // Scaled instructions.
  case AArch64::LDRBBui:
  case AArch64::LDRHHui:
  case AArch64::LDRWui:
  case AArch64::LDRXui:
  // Unscaled instructions.
  case AArch64::LDURBBi:
  case AArch64::LDURHHi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
    return true;
  }
}
751
// Returns true if \p MI is a load/store whose base register update (an
// add/sub of an immediate) could be folded into it as a pre- or post-index
// writeback (see tryToMergeLdStUpdate).
static bool isMergeableLdStUpdate(MachineInstr &MI, AArch64FunctionInfo &AFI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  // Scaled instructions.
  case AArch64::STRBui:
  case AArch64::STRHui:
  case AArch64::STRSui:
  case AArch64::STRDui:
  case AArch64::STRQui:
  case AArch64::STRXui:
  case AArch64::STRWui:
  case AArch64::STRHHui:
  case AArch64::STRBBui:
  case AArch64::LDRBui:
  case AArch64::LDRHui:
  case AArch64::LDRSui:
  case AArch64::LDRDui:
  case AArch64::LDRQui:
  case AArch64::LDRXui:
  case AArch64::LDRWui:
  case AArch64::LDRHHui:
  case AArch64::LDRBBui:
  case AArch64::STGi:
  case AArch64::STZGi:
  case AArch64::ST2Gi:
  case AArch64::STZ2Gi:
  case AArch64::STGPi:
  // Unscaled instructions.
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  // Paired instructions.
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    // Make sure this is a reg+imm (as opposed to an address reloc).
    if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
      return false;

    // When using stack tagging, simple sp+imm loads and stores are not
    // tag-checked, but pre- and post-indexed versions of them are, so we can't
    // replace the former with the latter. This transformation would be valid
    // if the load/store accesses an untagged stack slot, but we don't have
    // that information available after frame indices have been eliminated.
    if (AFI.isMTETagged() &&
        AArch64InstrInfo::getLdStBaseOp(MI).getReg() == AArch64::SP)
      return false;

    return true;
  }
}
820
// Make sure this is a reg+reg Ld/St. On success, \p Scale is set to the
// access size in bytes (the factor applied to the index register).
static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  // Scaled instructions.
  // TODO: Add more index address stores.
  case AArch64::LDRBroX:
  case AArch64::LDRBBroX:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSBWroX:
    Scale = 1;
    return true;
  case AArch64::LDRHroX:
  case AArch64::LDRHHroX:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSHWroX:
    Scale = 2;
    return true;
  case AArch64::LDRWroX:
  case AArch64::LDRSroX:
  case AArch64::LDRSWroX:
    Scale = 4;
    return true;
  case AArch64::LDRDroX:
  case AArch64::LDRXroX:
    Scale = 8;
    return true;
  case AArch64::LDRQroX:
    Scale = 16;
    return true;
  }
}
855
856static bool isRewritableImplicitDef(const MachineOperand &MO) {
857 switch (MO.getParent()->getOpcode()) {
858 default:
859 return MO.isRenamable();
860 case AArch64::ORRWrs:
861 case AArch64::ADDWri:
862 return true;
863 }
864}
865
866MachineBasicBlock::iterator
867AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
868 MachineBasicBlock::iterator MergeMI,
869 const LdStPairFlags &Flags) {
870 assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
871 "Expected promotable zero stores.");
872
873 MachineBasicBlock::iterator E = I->getParent()->end();
874 MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
875 // If NextI is the second of the two instructions to be merged, we need
876 // to skip one further. Either way we merge will invalidate the iterator,
877 // and we don't need to scan the new instruction, as it's a pairwise
878 // instruction, which we're not considering for further action anyway.
879 if (NextI == MergeMI)
880 NextI = next_nodbg(It: NextI, End: E);
881
882 unsigned Opc = I->getOpcode();
883 unsigned MergeMIOpc = MergeMI->getOpcode();
884 bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
885 bool IsMergedMIScaled = !TII->hasUnscaledLdStOffset(Opc: MergeMIOpc);
886 int OffsetStride = IsScaled ? TII->getMemScale(MI: *I) : 1;
887 int MergeMIOffsetStride = IsMergedMIScaled ? TII->getMemScale(MI: *MergeMI) : 1;
888
889 bool MergeForward = Flags.getMergeForward();
890 // Insert our new paired instruction after whichever of the paired
891 // instructions MergeForward indicates.
892 MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
893 // Also based on MergeForward is from where we copy the base register operand
894 // so we get the flags compatible with the input code.
895 const MachineOperand &BaseRegOp =
896 MergeForward ? AArch64InstrInfo::getLdStBaseOp(MI: *MergeMI)
897 : AArch64InstrInfo::getLdStBaseOp(MI: *I);
898
899 // Which register is Rt and which is Rt2 depends on the offset order.
900 int64_t IOffsetInBytes =
901 AArch64InstrInfo::getLdStOffsetOp(MI: *I).getImm() * OffsetStride;
902 int64_t MIOffsetInBytes =
903 AArch64InstrInfo::getLdStOffsetOp(MI: *MergeMI).getImm() *
904 MergeMIOffsetStride;
905 // Select final offset based on the offset order.
906 int64_t OffsetImm;
907 if (IOffsetInBytes > MIOffsetInBytes)
908 OffsetImm = MIOffsetInBytes;
909 else
910 OffsetImm = IOffsetInBytes;
911
912 int NewOpcode = getMatchingWideOpcode(Opc);
913 // Adjust final offset on scaled stores because the new instruction
914 // has a different scale.
915 if (!TII->hasUnscaledLdStOffset(Opc: NewOpcode)) {
916 int NewOffsetStride = TII->getMemScale(Opc: NewOpcode);
917 assert(((OffsetImm % NewOffsetStride) == 0) &&
918 "Offset should be a multiple of the store memory scale");
919 OffsetImm = OffsetImm / NewOffsetStride;
920 }
921
922 // Construct the new instruction.
923 DebugLoc DL = I->getDebugLoc();
924 MachineBasicBlock *MBB = I->getParent();
925 MachineInstrBuilder MIB;
926 MIB = BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: NewOpcode))
927 .addReg(RegNo: isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
928 .add(MO: BaseRegOp)
929 .addImm(Val: OffsetImm)
930 .cloneMergedMemRefs(OtherMIs: {&*I, &*MergeMI})
931 .setMIFlags(I->mergeFlagsWith(Other: *MergeMI));
932 (void)MIB;
933
934 LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
935 LLVM_DEBUG(I->print(dbgs()));
936 LLVM_DEBUG(dbgs() << " ");
937 LLVM_DEBUG(MergeMI->print(dbgs()));
938 LLVM_DEBUG(dbgs() << " with instruction:\n ");
939 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
940 LLVM_DEBUG(dbgs() << "\n");
941
942 // Erase the old instructions.
943 I->eraseFromParent();
944 MergeMI->eraseFromParent();
945 return NextI;
946}
947
948// Apply Fn to all instructions between MI and the beginning of the block, until
949// a def for DefReg is reached. Returns true, iff Fn returns true for all
950// visited instructions. Stop after visiting Limit iterations.
951static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
952 const TargetRegisterInfo *TRI, unsigned Limit,
953 std::function<bool(MachineInstr &, bool)> &Fn) {
954 auto MBB = MI.getParent();
955 for (MachineInstr &I :
956 instructionsWithoutDebug(It: MI.getReverseIterator(), End: MBB->instr_rend())) {
957 if (!Limit)
958 return false;
959 --Limit;
960
961 bool isDef = any_of(Range: I.operands(), P: [DefReg, TRI](MachineOperand &MOP) {
962 return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
963 TRI->regsOverlap(RegA: MOP.getReg(), RegB: DefReg);
964 });
965 if (!Fn(I, isDef))
966 return false;
967 if (isDef)
968 break;
969 }
970 return true;
971}
972
973static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
974 const TargetRegisterInfo *TRI) {
975
976 for (const MachineOperand &MOP : phys_regs_and_masks(MI))
977 if (MOP.isReg() && MOP.isKill())
978 Units.removeReg(Reg: MOP.getReg());
979
980 for (const MachineOperand &MOP : phys_regs_and_masks(MI))
981 if (MOP.isReg() && !MOP.isKill())
982 Units.addReg(Reg: MOP.getReg());
983}
984
985/// This function will add a new entry into the debugValueSubstitutions table
986/// when two instruction have been merged into a new one represented by \p
987/// MergedInstr.
988static void addDebugSubstitutionsToTable(MachineFunction *MF,
989 unsigned InstrNumToSet,
990 MachineInstr &OriginalInstr,
991 MachineInstr &MergedInstr) {
992
993 // Figure out the Operand Index of the destination register of the
994 // OriginalInstr in the new MergedInstr.
995 auto Reg = OriginalInstr.getOperand(i: 0).getReg();
996 unsigned OperandNo = 0;
997 bool RegFound = false;
998 for (const auto Op : MergedInstr.operands()) {
999 if (Op.getReg() == Reg) {
1000 RegFound = true;
1001 break;
1002 }
1003 OperandNo++;
1004 }
1005
1006 if (RegFound)
1007 MF->makeDebugValueSubstitution({OriginalInstr.peekDebugInstrNum(), 0},
1008 {InstrNumToSet, OperandNo});
1009}
1010
// Merge the two load/store instructions at I and Paired into a single paired
// instruction (e.g. two LDRWui into one LDPWi), inserted at Paired's position
// when Flags.getMergeForward() is set and at I's position otherwise. Also
// performs register renaming when Flags supplies a rename register, fixes up
// a sign-extending member of the pair with KILL + SBFMXri, rewrites SVE
// fill/spill registers to their Q variants, maintains kill flags, and records
// debug-instr-number substitutions for instruction referencing. Returns an
// iterator to the next instruction to consider after the merge.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    NextI = next_nodbg(It: NextI, End: E);

  int SExtIdx = Flags.getSExtIdx();
  // When one member of the pair is sign-extending, work with the non-SExt
  // opcode and fix the extension up after the pair is built (see SExtIdx
  // handling below).
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(Opc: I->getOpcode());
  bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
  int OffsetStride = IsUnscaled ? TII->getMemScale(MI: *I) : 1;

  bool MergeForward = Flags.getMergeForward();

  // If pairing requires renaming one of the registers, rewrite all matching
  // operands between the merge point and the previous def of the register
  // being renamed.
  std::optional<MCPhysReg> RenameReg = Flags.getRenameReg();
  if (RenameReg) {
    MCRegister RegToRename = getLdStRegOp(MI&: *I).getReg();
    DefinedInBB.addReg(Reg: *RenameReg);

    // Return the sub/super register for RenameReg, matching the size of
    // OriginalReg.
    auto GetMatchingSubReg =
        [this, RenameReg](const TargetRegisterClass *C) -> MCPhysReg {
      for (MCPhysReg SubOrSuper :
           TRI->sub_and_superregs_inclusive(Reg: *RenameReg)) {
        if (C->contains(Reg: SubOrSuper))
          return SubOrSuper;
      }
      llvm_unreachable("Should have found matching sub or super register!");
    };

    std::function<bool(MachineInstr &, bool)> UpdateMIs =
        [this, RegToRename, GetMatchingSubReg, MergeForward](MachineInstr &MI,
                                                             bool IsDef) {
          if (IsDef) {
            bool SeenDef = false;
            for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
              MachineOperand &MOP = MI.getOperand(i: OpIdx);
              // Rename the first explicit definition and all implicit
              // definitions matching RegToRename.
              if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                  (!MergeForward || !SeenDef ||
                   (MOP.isDef() && MOP.isImplicit())) &&
                  TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename)) {
                assert((MOP.isImplicit() ||
                        (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
                       "Need renamable operands");
                Register MatchingReg;
                if (const TargetRegisterClass *RC =
                        MI.getRegClassConstraint(OpIdx, TII, TRI))
                  // Pick a sub/super register of RenameReg matching the
                  // operand's register class constraint.
                  MatchingReg = GetMatchingSubReg(RC);
                else {
                  if (!isRewritableImplicitDef(MO: MOP))
                    continue;
                  MatchingReg = GetMatchingSubReg(
                      TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
                }
                MOP.setReg(MatchingReg);
                SeenDef = true;
              }
            }
          } else {
            // Not a def: rename every operand overlapping RegToRename.
            for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
              MachineOperand &MOP = MI.getOperand(i: OpIdx);
              if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                  TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename)) {
                assert((MOP.isImplicit() ||
                        (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
                       "Need renamable operands");
                Register MatchingReg;
                if (const TargetRegisterClass *RC =
                        MI.getRegClassConstraint(OpIdx, TII, TRI))
                  MatchingReg = GetMatchingSubReg(RC);
                else
                  MatchingReg = GetMatchingSubReg(
                      TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
                assert(MatchingReg != AArch64::NoRegister &&
                       "Cannot find matching regs for renaming");
                MOP.setReg(MatchingReg);
              }
            }
          }
          LLVM_DEBUG(dbgs() << "Renamed " << MI);
          return true;
        };
    forAllMIsUntilDef(MI&: MergeForward ? *I : *Paired->getPrevNode(), DefReg: RegToRename,
                      TRI, UINT32_MAX, Fn&: UpdateMIs);

#if !defined(NDEBUG)
    // For forward merging store:
    // Make sure the register used for renaming is not used between the
    // paired instructions. That would trash the content before the new
    // paired instruction.
    MCPhysReg RegToCheck = *RenameReg;
    // For backward merging load:
    // Make sure the register being renamed is not used between the
    // paired instructions. That would trash the content after the new
    // paired instruction.
    if (!MergeForward)
      RegToCheck = RegToRename;
    for (auto &MI :
         iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
             MergeForward ? std::next(I) : I,
             MergeForward ? std::next(Paired) : Paired))
      assert(all_of(MI.operands(),
                    [this, RegToCheck](const MachineOperand &MOP) {
                      return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
                             MOP.isUndef() ||
                             !TRI->regsOverlap(MOP.getReg(), RegToCheck);
                    }) &&
             "Rename register used between paired instruction, trashing the "
             "content");
#endif
  }

  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // Also based on MergeForward is from where we copy the base register operand
  // so we get the flags compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? AArch64InstrInfo::getLdStBaseOp(MI: *Paired)
                   : AArch64InstrInfo::getLdStBaseOp(MI: *I);

  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: *I).getImm();
  int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(MI: *Paired).getImm();
  bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Opc: Paired->getOpcode());
  if (IsUnscaled != PairedIsUnscaled) {
    // We're trying to pair instructions that differ in how they are scaled. If
    // I is scaled then scale the offset of Paired accordingly. Otherwise, do
    // the opposite (i.e., make Paired's offset unscaled).
    int MemSize = TII->getMemScale(MI: *Paired);
    if (PairedIsUnscaled) {
      // If the unscaled offset isn't a multiple of the MemSize, we can't
      // pair the operations together.
      assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
             "Offset should be a multiple of the stride!");
      PairedOffset /= MemSize;
    } else {
      PairedOffset *= MemSize;
    }
  }

  // Which register is Rt and which is Rt2 depends on the offset order.
  // However, for pre load/stores the Rt should be the one of the pre
  // load/store.
  MachineInstr *RtMI, *Rt2MI;
  if (Offset == PairedOffset + OffsetStride &&
      !AArch64InstrInfo::isPreLdSt(MI: *I)) {
    RtMI = &*Paired;
    Rt2MI = &*I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = &*I;
    Rt2MI = &*Paired;
  }
  int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(MI: *RtMI).getImm();
  // Scale the immediate offset, if necessary.
  if (TII->hasUnscaledLdStOffset(Opc: RtMI->getOpcode())) {
    assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
           "Unscaled offset cannot be scaled.");
    OffsetImm /= TII->getMemScale(MI: *RtMI);
  }

  // Construct the new instruction.
  MachineInstrBuilder MIB;
  DebugLoc DL = I->getDebugLoc();
  MachineBasicBlock *MBB = I->getParent();
  MachineOperand RegOp0 = getLdStRegOp(MI&: *RtMI);
  MachineOperand RegOp1 = getLdStRegOp(MI&: *Rt2MI);
  MachineOperand &PairedRegOp = RtMI == &*Paired ? RegOp0 : RegOp1;
  // Kill flags may become invalid when moving stores for pairing.
  if (RegOp0.isUse()) {
    if (!MergeForward) {
      // Clear kill flags on store if moving upwards. Example:
      //   STRWui kill %w0, ...
      //   USE %w1
      //   STRWui kill %w1  ; need to clear kill flag when moving STRWui upwards
      // We are about to move the store of w1, so its kill flag may become
      // invalid; not the case for w0.
      // Since w1 is used between the stores, the kill flag on w1 is cleared
      // after merging.
      //   STPWi kill %w0, %w1, ...
      //   USE %w1
      for (auto It = std::next(x: I); It != Paired && PairedRegOp.isKill(); ++It)
        if (It->readsRegister(Reg: PairedRegOp.getReg(), TRI))
          PairedRegOp.setIsKill(false);
    } else {
      // Clear kill flags of the first stores register. Example:
      //   STRWui %w1, ...
      //   USE kill %w1   ; need to clear kill flag when moving STRWui downwards
      //   STRW %w0
      Register Reg = getLdStRegOp(MI&: *I).getReg();
      for (MachineInstr &MI :
           make_range(x: std::next(x: I->getIterator()), y: Paired->getIterator()))
        MI.clearRegisterKills(Reg, RegInfo: TRI);
    }
  }

  unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);
  MIB = BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: MatchPairOpcode));

  // Adds the pre-index operand for pre-indexed ld/st pairs.
  if (AArch64InstrInfo::isPreLdSt(MI: *RtMI))
    MIB.addReg(RegNo: BaseRegOp.getReg(), Flags: RegState::Define);

  MIB.add(MO: RegOp0)
      .add(MO: RegOp1)
      .add(MO: BaseRegOp)
      .addImm(Val: OffsetImm)
      .cloneMergedMemRefs(OtherMIs: {&*I, &*Paired})
      .setMIFlags(I->mergeFlagsWith(Other: *Paired));

  (void)MIB;

  LLVM_DEBUG(
      dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Paired->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    // %w1 = KILL %w1, implicit-def %x1
    // %x1 = SBFMXri killed %x1, 0, 31
    MachineOperand &DstMO = MIB->getOperand(i: SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // extended opcode.
    Register DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    Register DstRegW = TRI->getSubReg(Reg: DstRegX, Idx: AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    LLVM_DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::KILL), DestReg: DstRegW)
            .addReg(RegNo: DstRegW)
            .addReg(RegNo: DstRegX, Flags: RegState::Define);
    MIBKill->getOperand(i: 2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: AArch64::SBFMXri), DestReg: DstRegX)
            .addReg(RegNo: DstRegX)
            .addImm(Val: 0)
            .addImm(Val: 31);
    (void)MIBSXTW;

    // In the case of a sign-extend, where we have something like:
    // debugValueSubstitutions:[]
    // $w1 = LDRWui $x0, 1, debug-instr-number 1
    // DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
    // $x0 = LDRSWui $x0, 0, debug-instr-number 2
    // DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9

    // It will be converted to:
    // debugValueSubstitutions:[]
    // $w0, $w1 = LDPWi $x0, 0
    // $w0 = KILL $w0, implicit-def $x0
    // $x0 = SBFMXri $x0, 0, 31
    // DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
    // DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9

    // We want the final result to look like:
    // debugValueSubstitutions:
    // - { srcinst: 1, srcop: 0, dstinst: 4, dstop: 1, subreg: 0 }
    // - { srcinst: 2, srcop: 0, dstinst: 3, dstop: 0, subreg: 0 }
    // $w0, $w1 = LDPWi $x0, 0, debug-instr-number 4
    // $w0 = KILL $w0, implicit-def $x0
    // $x0 = SBFMXri $x0, 0, 31, debug-instr-number 3
    // DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
    // DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9

    // $x0 is where the final value is stored, so the sign extend (SBFMXri)
    // instruction contains the final value we care about we give it a new
    // debug-instr-number 3. Whereas, $w1 contains the final value that we care
    // about, therefore the LDP instruction is also given a new
    // debug-instr-number 4. We have to add these substitutions to the
    // debugValueSubstitutions table. However, we also have to ensure that the
    // OpIndex that pointed to debug-instr-number 1 gets updated to 1, because
    // $w1 is the second operand of the LDP instruction.

    if (I->peekDebugInstrNum()) {
      // If I is the instruction which got sign extended and has a
      // debug-instr-number, give the SBFMXri instruction a new
      // debug-instr-number, and update the debugValueSubstitutions table with
      // the new debug-instr-number and OpIndex pair. Otherwise, give the Merged
      // instruction a new debug-instr-number, and update the
      // debugValueSubstitutions table with the new debug-instr-number and
      // OpIndex pair.
      unsigned NewInstrNum;
      if (DstRegX == I->getOperand(i: 0).getReg()) {
        NewInstrNum = MIBSXTW->getDebugInstrNum();
        addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *I,
                                     MergedInstr&: *MIBSXTW);
      } else {
        NewInstrNum = MIB->getDebugInstrNum();
        addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *I, MergedInstr&: *MIB);
      }
    }
    if (Paired->peekDebugInstrNum()) {
      // If Paired is the instruction which got sign extended and has a
      // debug-instr-number, give the SBFMXri instruction a new
      // debug-instr-number, and update the debugValueSubstitutions table with
      // the new debug-instr-number and OpIndex pair. Otherwise, give the Merged
      // instruction a new debug-instr-number, and update the
      // debugValueSubstitutions table with the new debug-instr-number and
      // OpIndex pair.
      unsigned NewInstrNum;
      if (DstRegX == Paired->getOperand(i: 0).getReg()) {
        NewInstrNum = MIBSXTW->getDebugInstrNum();
        addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *Paired,
                                     MergedInstr&: *MIBSXTW);
      } else {
        NewInstrNum = MIB->getDebugInstrNum();
        addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *Paired,
                                     MergedInstr&: *MIB);
      }
    }

    LLVM_DEBUG(dbgs() << "  Extend operand:\n    ");
    LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
  } else if (Opc == AArch64::LDR_ZXI || Opc == AArch64::STR_ZXI) {
    // We are combining SVE fill/spill to LDP/STP, so we need to use the Q
    // variant of the registers.
    MachineOperand &MOp0 = MIB->getOperand(i: 0);
    MachineOperand &MOp1 = MIB->getOperand(i: 1);
    assert(AArch64::ZPRRegClass.contains(MOp0.getReg()) &&
           AArch64::ZPRRegClass.contains(MOp1.getReg()) && "Invalid register.");
    MOp0.setReg(AArch64::Q0 + (MOp0.getReg() - AArch64::Z0));
    MOp1.setReg(AArch64::Q0 + (MOp1.getReg() - AArch64::Z0));
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  } else {

    // In the case that the merge doesn't result in a sign-extend, if we have
    // something like:
    // debugValueSubstitutions:[]
    // $x1 = LDRXui $x0, 1, debug-instr-number 1
    // DBG_INSTR_REF !13, dbg-instr-ref(1, 0), debug-location !11
    // $x0 = LDRXui killed $x0, 0, debug-instr-number 2
    // DBG_INSTR_REF !14, dbg-instr-ref(2, 0), debug-location !11

    // It will be converted to:
    // debugValueSubstitutions: []
    // $x0, $x1 = LDPXi $x0, 0
    // DBG_INSTR_REF !12, dbg-instr-ref(1, 0), debug-location !14
    // DBG_INSTR_REF !13, dbg-instr-ref(2, 0), debug-location !14

    // We want the final result to look like:
    // debugValueSubstitutions:
    // - { srcinst: 1, srcop: 0, dstinst: 3, dstop: 1, subreg: 0 }
    // - { srcinst: 2, srcop: 0, dstinst: 3, dstop: 0, subreg: 0 }
    // $x0, $x1 = LDPXi $x0, 0, debug-instr-number 3
    // DBG_INSTR_REF !12, dbg-instr-ref(1, 0), debug-location !14
    // DBG_INSTR_REF !12, dbg-instr-ref(2, 0), debug-location !14

    // Here all that needs to be done is, that the LDP instruction needs to be
    // updated with a new debug-instr-number, we then need to add entries into
    // the debugSubstitutions table to map the old instr-refs to the new ones.

    // Assign new DebugInstrNum to the Paired instruction.
    if (I->peekDebugInstrNum()) {
      unsigned NewDebugInstrNum = MIB->getDebugInstrNum();
      addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewDebugInstrNum, OriginalInstr&: *I,
                                   MergedInstr&: *MIB);
    }
    if (Paired->peekDebugInstrNum()) {
      unsigned NewDebugInstrNum = MIB->getDebugInstrNum();
      addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewDebugInstrNum, OriginalInstr&: *Paired,
                                   MergedInstr&: *MIB);
    }

    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  }
  LLVM_DEBUG(dbgs() << "\n");

  if (MergeForward)
    for (const MachineOperand &MOP : phys_regs_and_masks(MI: *I))
      if (MOP.isReg() && MOP.isKill())
        DefinedInBB.addReg(Reg: MOP.getReg());

  // Copy over any implicit-def operands. This is like MI.copyImplicitOps, but
  // only copies implicit defs and makes sure that each operand is only added
  // once in case of duplicates.
  auto CopyImplicitOps = [&](MachineBasicBlock::iterator MI1,
                             MachineBasicBlock::iterator MI2) {
    SmallSetVector<Register, 4> Ops;
    for (const MachineOperand &MO :
         llvm::drop_begin(RangeOrContainer: MI1->operands(), N: MI1->getDesc().getNumOperands()))
      if (MO.isReg() && MO.isImplicit() && MO.isDef())
        Ops.insert(X: MO.getReg());
    for (const MachineOperand &MO :
         llvm::drop_begin(RangeOrContainer: MI2->operands(), N: MI2->getDesc().getNumOperands()))
      if (MO.isReg() && MO.isImplicit() && MO.isDef())
        Ops.insert(X: MO.getReg());
    for (auto Op : Ops)
      MIB.addDef(RegNo: Op, Flags: RegState::Implicit);
  };
  CopyImplicitOps(I, Paired);

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}
1433
// Forward the value stored by StoreI into the load at LoadI. When load and
// store access the same size (32 or 64 bit), the load becomes a register move
// (ORR with the zero register) or is deleted outright if it targets the
// stored register itself. When the load reads a subrange of the stored value
// (little-endian only), it becomes a bitfield extract (ANDri when the ranges
// start at the same byte, otherwise UBFMWri/UBFMXri). Returns an iterator to
// the instruction following the original load.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                                          MachineBasicBlock::iterator StoreI) {
  MachineBasicBlock::iterator NextI =
      next_nodbg(It: LoadI, End: LoadI->getParent()->end());

  int LoadSize = TII->getMemScale(MI: *LoadI);
  int StoreSize = TII->getMemScale(MI: *StoreI);
  Register LdRt = getLdStRegOp(MI&: *LoadI).getReg();
  const MachineOperand &StMO = getLdStRegOp(MI&: *StoreI);
  Register StRt = getLdStRegOp(MI&: *StoreI).getReg();
  bool IsStoreXReg = TRI->getRegClass(i: AArch64::GPR64RegClassID)->contains(Reg: StRt);

  assert((IsStoreXReg ||
          TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
         "Unexpected RegClass");

  MachineInstr *BitExtMI;
  if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
    // Remove the load, if the destination register of the loads is the same
    // register for stored value.
    if (StRt == LdRt && LoadSize == 8) {
      // StRt stays live up to (what was) the load, so drop any kill flag on
      // it between the two instructions before erasing the load.
      for (MachineInstr &MI : make_range(x: StoreI->getIterator(),
                                         y: LoadI->getIterator())) {
        if (MI.killsRegister(Reg: StRt, TRI)) {
          MI.clearRegisterKills(Reg: StRt, RegInfo: TRI);
          break;
        }
      }
      LLVM_DEBUG(dbgs() << "Remove load instruction:\n    ");
      LLVM_DEBUG(LoadI->print(dbgs()));
      LLVM_DEBUG(dbgs() << "\n");
      LoadI->eraseFromParent();
      return NextI;
    }
    // Replace the load with a mov if the load and store are in the same size.
    BitExtMI =
        BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
                MCID: TII->get(Opcode: IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), DestReg: LdRt)
            .addReg(RegNo: IsStoreXReg ? AArch64::XZR : AArch64::WZR)
            .add(MO: StMO)
            .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0))
            .setMIFlags(LoadI->getFlags());
  } else {
    // FIXME: Currently we disable this transformation in big-endian targets as
    // performance and correctness are verified only in little-endian.
    if (!Subtarget->isLittleEndian())
      return NextI;
    bool IsUnscaled = TII->hasUnscaledLdStOffset(MI&: *LoadI);
    assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
           "Unsupported ld/st match");
    assert(LoadSize <= StoreSize && "Invalid load size");
    // Normalize both offsets to byte units so they can be compared directly.
    int UnscaledLdOffset =
        IsUnscaled
            ? AArch64InstrInfo::getLdStOffsetOp(MI: *LoadI).getImm()
            : AArch64InstrInfo::getLdStOffsetOp(MI: *LoadI).getImm() * LoadSize;
    int UnscaledStOffset =
        IsUnscaled
            ? AArch64InstrInfo::getLdStOffsetOp(MI: *StoreI).getImm()
            : AArch64InstrInfo::getLdStOffsetOp(MI: *StoreI).getImm() * StoreSize;
    int Width = LoadSize * 8;
    Register DestReg =
        IsStoreXReg ? Register(TRI->getMatchingSuperReg(
                          Reg: LdRt, SubIdx: AArch64::sub_32, RC: &AArch64::GPR64RegClass))
                    : LdRt;

    assert((UnscaledLdOffset >= UnscaledStOffset &&
            (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
           "Invalid offset");

    // The load reads Width bits of the stored register, starting at bit
    // 8 * (UnscaledLdOffset - UnscaledStOffset).
    int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
    int Imms = Immr + Width - 1;
    if (UnscaledLdOffset == UnscaledStOffset) {
      // Same start byte: masking off the high bits with ANDri suffices.
      uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
                                | ((Immr) << 6)               // immr
                                | ((Imms) << 0)               // imms
          ;

      BitExtMI =
          BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
                  MCID: TII->get(Opcode: IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
                  DestReg)
              .add(MO: StMO)
              .addImm(Val: AndMaskEncoded)
              .setMIFlags(LoadI->getFlags());
    } else if (IsStoreXReg && Imms == 31) {
      // Use the 32 bit variant of UBFM if it's the LSR alias of the
      // instruction.
      assert(Immr <= Imms && "Expected LSR alias of UBFM");
      BitExtMI = BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
                         MCID: TII->get(Opcode: AArch64::UBFMWri),
                         DestReg: TRI->getSubReg(Reg: DestReg, Idx: AArch64::sub_32))
                     .addReg(RegNo: TRI->getSubReg(Reg: StRt, Idx: AArch64::sub_32))
                     .addImm(Val: Immr)
                     .addImm(Val: Imms)
                     .setMIFlags(LoadI->getFlags());
    } else {
      // General case: unsigned bitfield extract of [Immr, Imms].
      BitExtMI =
          BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
                  MCID: TII->get(Opcode: IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
                  DestReg)
              .add(MO: StMO)
              .addImm(Val: Immr)
              .addImm(Val: Imms)
              .setMIFlags(LoadI->getFlags());
    }
  }

  // Clear kill flags between store and load.
  for (MachineInstr &MI : make_range(x: StoreI->getIterator(),
                                     y: BitExtMI->getIterator()))
    if (MI.killsRegister(Reg: StRt, TRI)) {
      MI.clearRegisterKills(Reg: StRt, RegInfo: TRI);
      break;
    }

  LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n    ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(LoadI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instructions:\n    ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG((BitExtMI)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  LoadI->eraseFromParent();
  return NextI;
}
1564
// Check whether an offset fits the signed 7-bit scaled immediate field of a
// paired load/store. Unscaled instructions carry a byte offset, so it is
// first converted into units of OffsetStride; a byte offset that is not a
// multiple of the stride can never be encoded.
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  if (IsUnscaled) {
    if (Offset % OffsetStride != 0)
      return false;
    Offset /= OffsetStride;
  }
  return -64 <= Offset && Offset <= 63;
}
1577
// Round Num up to the next multiple of PowOf2 (which must be a power of 2).
// Signed specialization kept here to avoid a C-style cast from uint64_t to
// int when using alignTo from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  int Mask = PowOf2 - 1;
  return (Num + Mask) & ~Mask;
}
1585
1586static bool mayAlias(MachineInstr &MIa,
1587 SmallVectorImpl<MachineInstr *> &MemInsns,
1588 AliasAnalysis *AA) {
1589 for (MachineInstr *MIb : MemInsns) {
1590 if (MIa.mayAlias(AA, Other: *MIb, /*UseTBAA*/ false)) {
1591 LLVM_DEBUG(dbgs() << "Aliasing with: "; MIb->dump());
1592 return true;
1593 }
1594 }
1595
1596 LLVM_DEBUG(dbgs() << "No aliases found\n");
1597 return false;
1598}
1599
// Scan backwards from the load at I (visiting at most Limit non-transient
// instructions, and never leaving the block) for a store to the same base
// register whose stored bytes fully cover the loaded bytes and whose value
// has not been modified since. On success, StoreI is set to the matching
// store and true is returned; any aliasing store, call, or modification of
// the base register ends the search with false.
bool AArch64LoadStoreOpt::findMatchingStore(
    MachineBasicBlock::iterator I, unsigned Limit,
    MachineBasicBlock::iterator &StoreI) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr &LoadMI = *I;
  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: LoadMI).getReg();

  // If the load is the first instruction in the block, there's obviously
  // not any matching store.
  if (MBBI == B)
    return false;

  // Track which register units have been modified and used between the first
  // insn and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  unsigned Count = 0;
  do {
    MBBI = prev_nodbg(It: MBBI, Begin: B);
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If the load instruction reads directly from the address to which the
    // store instruction writes and the stored value is not modified, we can
    // promote the load. Since we do not handle stores with pre-/post-index,
    // it's unnecessary to check if BaseReg is modified by the store itself.
    // Also we can't handle stores without an immediate offset operand,
    // while the operand might be the address for a global variable.
    if (MI.mayStore() && isMatchingStore(LoadInst&: LoadMI, StoreInst&: MI) &&
        BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
        AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
        isLdOffsetInRangeOfSt(LoadInst&: LoadMI, StoreInst&: MI, TII) &&
        ModifiedRegUnits.available(Reg: getLdStRegOp(MI).getReg())) {
      StoreI = MBBI;
      return true;
    }

    // Stop the search at calls.
    if (MI.isCall())
      return false;

    // Update modified / uses register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(Reg: BaseReg))
      return false;

    // If we encounter a store aliased with the load, return early.
    if (MI.mayStore() && LoadMI.mayAlias(AA, Other: MI, /*UseTBAA*/ false))
      return false;
  } while (MBBI != B && Count < Limit);
  return false;
}
1660
1661static bool needsWinCFI(const MachineFunction *MF) {
1662 return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
1663 MF->getFunction().needsUnwindTableEntry();
1664}
1665
// Returns true if FirstMI and MI are candidates for merging or pairing.
// Otherwise, returns false. The checks are ordered: volatility/suppression
// first, then WinCFI frame-flag restrictions, then opcode compatibility
// (identical, sign/zero-extend variants, narrow zero-stores, pre-index
// pairs, or scaled/unscaled variants of the same pair opcode). On a SExt/ZExt
// match, Flags.setSExtIdx records which member of the pair sign-extends.
static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
                                       LdStPairFlags &Flags,
                                       const AArch64InstrInfo *TII) {
  // If this is volatile or if pairing is suppressed, not a candidate.
  if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
    return false;

  // We should have already checked FirstMI for pair suppression and volatility.
  assert(!FirstMI.hasOrderedMemoryRef() &&
         !TII->isLdStPairSuppressed(FirstMI) &&
         "FirstMI shouldn't get here if either of these checks are true.");

  // Do not merge frame setup/destroy instructions under Windows CFI (see the
  // file header comment: reordering would break unwind info).
  if (needsWinCFI(MF: MI.getMF()) && (MI.getFlag(Flag: MachineInstr::FrameSetup) ||
                                   MI.getFlag(Flag: MachineInstr::FrameDestroy)))
    return false;

  unsigned OpcA = FirstMI.getOpcode();
  unsigned OpcB = MI.getOpcode();

  // Opcodes match: If the opcodes are pre ld/st there is nothing more to check.
  if (OpcA == OpcB)
    return !AArch64InstrInfo::isPreLdSt(MI: FirstMI);

  // Bail out if one of the opcodes is SVE fill/spill, as we currently don't
  // allow pairing them with other instructions.
  if (OpcA == AArch64::LDR_ZXI || OpcA == AArch64::STR_ZXI ||
      OpcB == AArch64::LDR_ZXI || OpcB == AArch64::STR_ZXI)
    return false;

  // Two pre ld/st of different opcodes cannot be merged either
  if (AArch64InstrInfo::isPreLdSt(MI: FirstMI) && AArch64InstrInfo::isPreLdSt(MI))
    return false;

  // Try to match a sign-extended load/store with a zero-extended load/store.
  bool IsValidLdStrOpc, PairIsValidLdStrOpc;
  unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc: OpcA, IsValidLdStrOpc: &IsValidLdStrOpc);
  assert(IsValidLdStrOpc &&
         "Given Opc should be a Load or Store with an immediate");
  // OpcA will be the first instruction in the pair.
  if (NonSExtOpc == getMatchingNonSExtOpcode(Opc: OpcB, IsValidLdStrOpc: &PairIsValidLdStrOpc)) {
    Flags.setSExtIdx(NonSExtOpc == OpcA ? 1 : 0);
    return true;
  }

  // If the second instruction isn't even a mergable/pairable load/store, bail
  // out.
  if (!PairIsValidLdStrOpc)
    return false;

  // Narrow stores do not have a matching pair opcodes, so constrain their
  // merging to zero stores.
  if (isNarrowStore(Opc: OpcA) || isNarrowStore(Opc: OpcB))
    return getLdStRegOp(MI&: FirstMI).getReg() == AArch64::WZR &&
           getLdStRegOp(MI).getReg() == AArch64::WZR &&
           TII->getMemScale(MI: FirstMI) == TII->getMemScale(MI);

  // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
  // LDR<S,D,Q,W,X,SW>pre-LDR<S,D,Q,W,X,SW>ui
  // are candidate pairs that can be merged.
  if (isPreLdStPairCandidate(FirstMI, MI))
    return true;

  // Try to match an unscaled load/store with a scaled load/store.
  return TII->hasUnscaledLdStOffset(Opc: OpcA) != TII->hasUnscaledLdStOffset(Opc: OpcB) &&
         getMatchingPairOpcode(Opc: OpcA) == getMatchingPairOpcode(Opc: OpcB);

  // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
}
1736
1737static bool canRenameMOP(const MachineOperand &MOP,
1738 const TargetRegisterInfo *TRI) {
1739 if (MOP.isReg()) {
1740 auto *RegClass = TRI->getMinimalPhysRegClass(Reg: MOP.getReg());
1741 // Renaming registers with multiple disjunct sub-registers (e.g. the
1742 // result of a LD3) means that all sub-registers are renamed, potentially
1743 // impacting other instructions we did not check. Bail out.
1744 // Note that this relies on the structure of the AArch64 register file. In
1745 // particular, a subregister cannot be written without overwriting the
1746 // whole register.
1747 if (RegClass->HasDisjunctSubRegs && RegClass->CoveredBySubRegs &&
1748 (TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::dsub0) ||
1749 TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::qsub0) ||
1750 TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::zsub0))) {
1751 LLVM_DEBUG(
1752 dbgs()
1753 << " Cannot rename operands with multiple disjunct subregisters ("
1754 << MOP << ")\n");
1755 return false;
1756 }
1757
1758 // We cannot rename arbitrary implicit-defs, the specific rule to rewrite
1759 // them must be known. For example, in ORRWrs the implicit-def
1760 // corresponds to the result register.
1761 if (MOP.isImplicit() && MOP.isDef()) {
1762 if (!isRewritableImplicitDef(MO: MOP))
1763 return false;
1764 return TRI->isSuperOrSubRegisterEq(
1765 RegA: MOP.getParent()->getOperand(i: 0).getReg(), RegB: MOP.getReg());
1766 }
1767 }
1768 return MOP.isImplicit() ||
1769 (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
1770}
1771
// Check whether the register stored by \p FirstMI can be renamed in every
// instruction from \p FirstMI backwards up to (and including) the previous
// definition of that register in the block. On the way, accumulate into
// \p UsedInBetween the register units accessed in that range and collect in
// \p RequiredClasses the register classes a replacement register must be
// usable in. Returns false for non-stores, for stores whose operand is not
// killed at the store, or when any instruction in the range cannot be
// rewritten.
static bool
canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
                 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
                 const TargetRegisterInfo *TRI) {
  // Only stores are handled here; loads are handled by
  // canRenameUntilSecondLoad.
  if (!FirstMI.mayStore())
    return false;

  // Check if we can find an unused register which we can use to rename
  // the register used by the first load/store.

  auto RegToRename = getLdStRegOp(MI&: FirstMI).getReg();
  // For now, we only rename if the store operand gets killed at the store.
  // The kill may also be recorded on an implicit operand that overlaps the
  // stored register, so scan all operands.
  if (!getLdStRegOp(MI&: FirstMI).isKill() &&
      !any_of(Range: FirstMI.operands(),
              P: [TRI, RegToRename](const MachineOperand &MOP) {
                return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                       MOP.isImplicit() && MOP.isKill() &&
                       TRI->regsOverlap(RegA: RegToRename, RegB: MOP.getReg());
              })) {
    LLVM_DEBUG(dbgs() << "  Operand not killed at " << FirstMI);
    return false;
  }

  bool FoundDef = false;

  // For each instruction between FirstMI and the previous def for RegToRename,
  // we
  // * check if we can rename RegToRename in this instruction
  // * collect the registers used and required register classes for RegToRename.
  std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
                                                           bool IsDef) {
    LLVM_DEBUG(dbgs() << "Checking " << MI);
    // Currently we do not try to rename across frame-setup instructions.
    if (MI.getFlag(Flag: MachineInstr::FrameSetup)) {
      LLVM_DEBUG(dbgs() << "  Cannot rename framesetup instructions "
                        << "currently\n");
      return false;
    }

    UsedInBetween.accumulate(MI);

    // For a definition, check that we can rename the definition and exit the
    // loop.
    // NOTE(review): plain assignment (not |=) — presumably forAllMIsUntilDef
    // visits the defining instruction last and stops there; confirm against
    // its implementation.
    FoundDef = IsDef;

    // For defs, check if we can rename the first def of RegToRename.
    if (FoundDef) {
      // For some pseudo instructions, we might not generate code in the end
      // (e.g. KILL) and we would end up without a correct def for the rename
      // register.
      // TODO: This might be overly conservative and we could handle those cases
      // in multiple ways:
      //       1. Insert an extra copy, to materialize the def.
      //       2. Skip pseudo-defs until we find an non-pseudo def.
      if (MI.isPseudo()) {
        LLVM_DEBUG(dbgs() << "  Cannot rename pseudo/bundle instruction\n");
        return false;
      }

      // Only def operands overlapping RegToRename need to be rewritable at
      // the defining instruction.
      for (auto &MOP : MI.operands()) {
        if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
            !TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
          continue;
        if (!canRenameMOP(MOP, TRI)) {
          LLVM_DEBUG(dbgs() << "  Cannot rename " << MOP << " in " << MI);
          return false;
        }
        RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
      }
      return true;
    } else {
      // For non-defining instructions, every operand (use or def) that
      // overlaps RegToRename must be rewritable.
      for (auto &MOP : MI.operands()) {
        if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
            !TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
          continue;

        if (!canRenameMOP(MOP, TRI)) {
          LLVM_DEBUG(dbgs() << "  Cannot rename " << MOP << " in " << MI);
          return false;
        }
        RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
      }
    }
    return true;
  };

  if (!forAllMIsUntilDef(MI&: FirstMI, DefReg: RegToRename, TRI, Limit: LdStLimit, Fn&: CheckMIs))
    return false;

  // A renamable def must actually exist in this block; otherwise we cannot
  // prove the rename is sound.
  if (!FoundDef) {
    LLVM_DEBUG(dbgs() << "  Did not find definition for register in BB\n");
    return false;
  }
  return true;
}
1867
1868// We want to merge the second load into the first by rewriting the usages of
1869// the same reg between first (incl.) and second (excl.). We don't need to care
1870// about any insns before FirstLoad or after SecondLoad.
1871// 1. The second load writes new value into the same reg.
1872// - The renaming is impossible to impact later use of the reg.
1873// - The second load always trash the value written by the first load which
1874// means the reg must be killed before the second load.
1875// 2. The first load must be a def for the same reg so we don't need to look
1876// into anything before it.
1877static bool canRenameUntilSecondLoad(
1878 MachineInstr &FirstLoad, MachineInstr &SecondLoad,
1879 LiveRegUnits &UsedInBetween,
1880 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1881 const TargetRegisterInfo *TRI) {
1882 if (FirstLoad.isPseudo())
1883 return false;
1884
1885 UsedInBetween.accumulate(MI: FirstLoad);
1886 auto RegToRename = getLdStRegOp(MI&: FirstLoad).getReg();
1887 bool Success = std::all_of(
1888 first: FirstLoad.getIterator(), last: SecondLoad.getIterator(),
1889 pred: [&](MachineInstr &MI) {
1890 LLVM_DEBUG(dbgs() << "Checking " << MI);
1891 // Currently we do not try to rename across frame-setup instructions.
1892 if (MI.getFlag(Flag: MachineInstr::FrameSetup)) {
1893 LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1894 << "currently\n");
1895 return false;
1896 }
1897
1898 for (auto &MOP : MI.operands()) {
1899 if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1900 !TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
1901 continue;
1902 if (!canRenameMOP(MOP, TRI)) {
1903 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1904 return false;
1905 }
1906 RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1907 }
1908
1909 return true;
1910 });
1911 return Success;
1912}
1913
1914// Check if we can find a physical register for renaming \p Reg. This register
1915// must:
1916// * not be defined already in \p DefinedInBB; DefinedInBB must contain all
1917// defined registers up to the point where the renamed register will be used,
1918// * not used in \p UsedInBetween; UsedInBetween must contain all accessed
1919// registers in the range the rename register will be used,
1920// * is available in all used register classes (checked using RequiredClasses).
1921static std::optional<MCPhysReg> tryToFindRegisterToRename(
1922 const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
1923 LiveRegUnits &UsedInBetween,
1924 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1925 const TargetRegisterInfo *TRI) {
1926 const MachineRegisterInfo &RegInfo = MF.getRegInfo();
1927
1928 // Checks if any sub- or super-register of PR is callee saved.
1929 auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
1930 return any_of(Range: TRI->sub_and_superregs_inclusive(Reg: PR),
1931 P: [&MF, TRI](MCPhysReg SubOrSuper) {
1932 return TRI->isCalleeSavedPhysReg(PhysReg: SubOrSuper, MF);
1933 });
1934 };
1935
1936 // Check if PR or one of its sub- or super-registers can be used for all
1937 // required register classes.
1938 auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
1939 return all_of(Range&: RequiredClasses, P: [PR, TRI](const TargetRegisterClass *C) {
1940 return any_of(
1941 Range: TRI->sub_and_superregs_inclusive(Reg: PR),
1942 P: [C](MCPhysReg SubOrSuper) { return C->contains(Reg: SubOrSuper); });
1943 });
1944 };
1945
1946 auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
1947 for (const MCPhysReg &PR : *RegClass) {
1948 if (DefinedInBB.available(Reg: PR) && UsedInBetween.available(Reg: PR) &&
1949 !RegInfo.isReserved(PhysReg: PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
1950 CanBeUsedForAllClasses(PR)) {
1951 DefinedInBB.addReg(Reg: PR);
1952 LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
1953 << "\n");
1954 return {PR};
1955 }
1956 }
1957 LLVM_DEBUG(dbgs() << "No rename register found from "
1958 << TRI->getRegClassName(RegClass) << "\n");
1959 return std::nullopt;
1960}
1961
1962// For store pairs: returns a register from FirstMI to the beginning of the
1963// block that can be renamed.
1964// For load pairs: returns a register from FirstMI to MI that can be renamed.
1965static std::optional<MCPhysReg> findRenameRegForSameLdStRegPair(
1966 std::optional<bool> MaybeCanRename, MachineInstr &FirstMI, MachineInstr &MI,
1967 Register Reg, LiveRegUnits &DefinedInBB, LiveRegUnits &UsedInBetween,
1968 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1969 const TargetRegisterInfo *TRI) {
1970 std::optional<MCPhysReg> RenameReg;
1971 if (!DebugCounter::shouldExecute(Counter&: RegRenamingCounter))
1972 return RenameReg;
1973
1974 auto *RegClass = TRI->getMinimalPhysRegClass(Reg: getLdStRegOp(MI&: FirstMI).getReg());
1975 MachineFunction &MF = *FirstMI.getParent()->getParent();
1976 if (!RegClass || !MF.getRegInfo().tracksLiveness())
1977 return RenameReg;
1978
1979 const bool IsLoad = FirstMI.mayLoad();
1980
1981 if (!MaybeCanRename) {
1982 if (IsLoad)
1983 MaybeCanRename = {canRenameUntilSecondLoad(FirstLoad&: FirstMI, SecondLoad&: MI, UsedInBetween,
1984 RequiredClasses, TRI)};
1985 else
1986 MaybeCanRename = {
1987 canRenameUpToDef(FirstMI, UsedInBetween, RequiredClasses, TRI)};
1988 }
1989
1990 if (*MaybeCanRename) {
1991 RenameReg = tryToFindRegisterToRename(MF, Reg, DefinedInBB, UsedInBetween,
1992 RequiredClasses, TRI);
1993 }
1994 return RenameReg;
1995}
1996
/// Scan the instructions looking for a load/store that can be combined with the
/// current instruction into a wider equivalent or a load/store pair.
/// \p I is the candidate first instruction; \p Limit bounds how many
/// non-transient instructions are scanned; \p FindNarrowMerge selects merging
/// narrow (zero) stores instead of forming pairs.
/// Returns an iterator to the matching instruction, or the block's end
/// iterator if none is found. On success, \p Flags describes how to merge
/// (direction, sign-extension index, optional rename register).
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit,
                                      bool FindNarrowMerge) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  // Position of a match that is only possible via register renaming; returned
  // at the end of the scan if Flags still carries the rename register.
  MachineBasicBlock::iterator MBBIWithRenameReg;
  MachineInstr &FirstMI = *I;
  MBBI = next_nodbg(It: MBBI, End: E);

  bool MayLoad = FirstMI.mayLoad();
  bool IsUnscaled = TII->hasUnscaledLdStOffset(MI&: FirstMI);
  Register Reg = getLdStRegOp(MI&: FirstMI).getReg();
  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: FirstMI).getReg();
  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: FirstMI).getImm();
  // For unscaled accesses, offsets are compared in byte units.
  int OffsetStride = IsUnscaled ? TII->getMemScale(MI: FirstMI) : 1;
  bool IsPromotableZeroStore = isPromotableZeroStoreInst(MI&: FirstMI);

  // Unset until renaming feasibility is computed lazily by
  // findRenameRegForSameLdStRegPair; forced to false when renaming is
  // disabled.
  std::optional<bool> MaybeCanRename;
  if (!EnableRenaming)
    MaybeCanRename = {false};

  SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
  LiveRegUnits UsedInBetween;
  UsedInBetween.init(TRI: *TRI);

  Flags.clearRenameReg();

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  LLVM_DEBUG(dbgs() << "Find match for: "; FirstMI.dump());
  for (unsigned Count = 0; MBBI != E && Count < Limit;
       MBBI = next_nodbg(It: MBBI, End: E)) {
    MachineInstr &MI = *MBBI;
    LLVM_DEBUG(dbgs() << "Analysing 2nd insn: "; MI.dump());

    UsedInBetween.accumulate(MI);

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    Flags.setSExtIdx(-1);
    if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
        AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
      assert(MI.mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
      int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
      bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
      if (IsUnscaled != MIIsUnscaled) {
        // We're trying to pair instructions that differ in how they are scaled.
        // If FirstMI is scaled then scale the offset of MI accordingly.
        // Otherwise, do the opposite (i.e., make MI's offset unscaled).
        int MemSize = TII->getMemScale(MI);
        if (MIIsUnscaled) {
          // If the unscaled offset isn't a multiple of the MemSize, we can't
          // pair the operations together: bail and keep looking.
          if (MIOffset % MemSize) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(Elt: &MI);
            continue;
          }
          MIOffset /= MemSize;
        } else {
          MIOffset *= MemSize;
        }
      }

      bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);

      if (BaseReg == MIBaseReg) {
        // If the offset of the second ld/st is not equal to the size of the
        // destination register it can’t be paired with a pre-index ld/st
        // pair. Additionally if the base reg is used or modified the operations
        // can't be paired: bail and keep looking.
        if (IsPreLdSt) {
          bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
          bool IsBaseRegUsed = !UsedRegUnits.available(
              Reg: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          bool IsBaseRegModified = !ModifiedRegUnits.available(
              Reg: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          // If the stored value and the address of the second instruction is
          // the same, it needs to be using the updated register and therefore
          // it must not be folded.
          bool IsMIRegTheSame =
              TRI->regsOverlap(RegA: getLdStRegOp(MI).getReg(),
                               RegB: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
              IsMIRegTheSame) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(Elt: &MI);
            continue;
          }
        } else {
          // Normal pairing: the two offsets must be adjacent (+/- one access).
          if ((Offset != MIOffset + OffsetStride) &&
              (Offset + OffsetStride != MIOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(Elt: &MI);
            continue;
          }
        }

        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        if (FindNarrowMerge) {
          // If the alignment requirements of the scaled wide load/store
          // instruction can't express the offset of the scaled narrow input,
          // bail and keep looking. For promotable zero stores, allow only when
          // the stored value is the same (i.e., WZR).
          if ((!IsUnscaled && alignTo(Num: MinOffset, PowOf2: 2) != MinOffset) ||
              (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(Elt: &MI);
            continue;
          }
        } else {
          // Pairwise instructions have a 7-bit signed offset field. Single
          // insns have a 12-bit unsigned offset field. If the resultant
          // immediate offset of merging these instructions is out of range for
          // a pairwise instruction, bail and keep looking.
          if (!inBoundsForPair(IsUnscaled, Offset: MinOffset, OffsetStride)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(Elt: &MI);
            LLVM_DEBUG(dbgs() << "Offset doesn't fit in immediate, "
                              << "keep looking.\n");
            continue;
          }
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(Num: MinOffset, PowOf2: OffsetStride) != MinOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(Elt: &MI);
            LLVM_DEBUG(dbgs()
                       << "Offset doesn't fit due to alignment requirements, "
                       << "keep looking.\n");
            continue;
          }
        }

        // If the BaseReg has been modified, then we cannot do the optimization.
        // For example, in the following pattern
        //   ldr x1 [x2]
        //   ldr x2 [x3]
        //   ldr x4 [x2, #8],
        // the first and third ldr cannot be converted to ldp x1, x4, [x2]
        if (!ModifiedRegUnits.available(Reg: BaseReg))
          return E;

        // Loads targeting the same destination register can only be merged if
        // the register can be renamed in between.
        const bool SameLoadReg = MayLoad && TRI->isSuperOrSubRegisterEq(
                                                RegA: Reg, RegB: getLdStRegOp(MI).getReg());

        // If the Rt of the second instruction (destination register of the
        // load) was not modified or used between the two instructions and none
        // of the instructions between the second and first alias with the
        // second, we can combine the second into the first.
        bool RtNotModified =
            ModifiedRegUnits.available(Reg: getLdStRegOp(MI).getReg());
        bool RtNotUsed = !(MI.mayLoad() && !SameLoadReg &&
                           !UsedRegUnits.available(Reg: getLdStRegOp(MI).getReg()));

        LLVM_DEBUG(dbgs() << "Checking, can combine 2nd into 1st insn:\n"
                          << "Reg '" << getLdStRegOp(MI) << "' not modified: "
                          << (RtNotModified ? "true" : "false") << "\n"
                          << "Reg '" << getLdStRegOp(MI) << "' not used: "
                          << (RtNotUsed ? "true" : "false") << "\n");

        if (RtNotModified && RtNotUsed && !mayAlias(MIa&: MI, MemInsns, AA)) {
          // For pairs loading into the same reg, try to find a renaming
          // opportunity to allow the renaming of Reg between FirstMI and MI
          // and combine MI into FirstMI; otherwise bail and keep looking.
          if (SameLoadReg) {
            std::optional<MCPhysReg> RenameReg =
                findRenameRegForSameLdStRegPair(MaybeCanRename, FirstMI, MI,
                                                Reg, DefinedInBB, UsedInBetween,
                                                RequiredClasses, TRI);
            if (!RenameReg) {
              LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                                UsedRegUnits, TRI);
              MemInsns.push_back(Elt: &MI);
              LLVM_DEBUG(dbgs() << "Can't find reg for renaming, "
                                << "keep looking.\n");
              continue;
            }
            Flags.setRenameReg(*RenameReg);
          }

          Flags.setMergeForward(false);
          if (!SameLoadReg)
            Flags.clearRenameReg();
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between the
        // first and the second alias with the first, we can combine the first
        // into the second.
        // NOTE(review): despite the name, this re-assignment checks that
        // FirstMI's Rt is not *used* in between; modification is checked
        // separately just below via ModifiedRegUnits.
        RtNotModified = !(
            MayLoad && !UsedRegUnits.available(Reg: getLdStRegOp(MI&: FirstMI).getReg()));

        LLVM_DEBUG(dbgs() << "Checking, can combine 1st into 2nd insn:\n"
                          << "Reg '" << getLdStRegOp(FirstMI)
                          << "' not modified: "
                          << (RtNotModified ? "true" : "false") << "\n");

        if (RtNotModified && !mayAlias(MIa&: FirstMI, MemInsns, AA)) {
          if (ModifiedRegUnits.available(Reg: getLdStRegOp(MI&: FirstMI).getReg())) {
            Flags.setMergeForward(true);
            Flags.clearRenameReg();
            return MBBI;
          }

          // First Rt was modified in between: merging forward is still
          // possible if we can rename it.
          std::optional<MCPhysReg> RenameReg = findRenameRegForSameLdStRegPair(
              MaybeCanRename, FirstMI, MI, Reg, DefinedInBB, UsedInBetween,
              RequiredClasses, TRI);
          if (RenameReg) {
            Flags.setMergeForward(true);
            Flags.setRenameReg(*RenameReg);
            MBBIWithRenameReg = MBBI;
          }
        }
        LLVM_DEBUG(dbgs() << "Unable to combine these instructions due to "
                          << "interference in between, keep looking.\n");
      }
    }

    // A rename-based forward merge was recorded above; return its position.
    if (Flags.getRenameReg())
      return MBBIWithRenameReg;

    // If the instruction wasn't a matching load or store. Stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI.isCall()) {
      LLVM_DEBUG(dbgs() << "Found a call, stop looking.\n");
      return E;
    }

    // Update modified / uses register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(Reg: BaseReg)) {
      LLVM_DEBUG(dbgs() << "Base reg is modified, stop looking.\n");
      return E;
    }

    // Update list of instructions that read/write memory.
    if (MI.mayLoadOrStore())
      MemInsns.push_back(Elt: &MI);
  }
  return E;
}
2269
2270static MachineBasicBlock::iterator
2271maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
2272 assert((MI.getOpcode() == AArch64::SUBXri ||
2273 MI.getOpcode() == AArch64::ADDXri) &&
2274 "Expected a register update instruction");
2275 auto End = MI.getParent()->end();
2276 if (MaybeCFI == End ||
2277 MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
2278 !(MI.getFlag(Flag: MachineInstr::FrameSetup) ||
2279 MI.getFlag(Flag: MachineInstr::FrameDestroy)) ||
2280 MI.getOperand(i: 0).getReg() != AArch64::SP)
2281 return End;
2282
2283 const MachineFunction &MF = *MI.getParent()->getParent();
2284 unsigned CFIIndex = MaybeCFI->getOperand(i: 0).getCFIIndex();
2285 const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
2286 switch (CFI.getOperation()) {
2287 case MCCFIInstruction::OpDefCfa:
2288 case MCCFIInstruction::OpDefCfaOffset:
2289 return MaybeCFI;
2290 default:
2291 return End;
2292 }
2293}
2294
// Merge the base-register update \p Update (an ADDXri/SUBXri) into the memory
// instruction \p I, forming a pre-indexed (\p IsPreIdx) or post-indexed
// load/store. When \p IsForward is set, the update precedes the memory
// instruction (pre-index only); \p MergeEither then allows emitting the
// merged instruction at the update's position instead of at \p I. Both
// original instructions are erased. Returns an iterator past the merged
// instruction, or std::nullopt when CFI ordering constraints prevent the
// merge.
std::optional<MachineBasicBlock::iterator> AArch64LoadStoreOpt::mergeUpdateInsn(
    MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update,
    bool IsForward, bool IsPreIdx, bool MergeEither) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);

  // If updating the SP and the following instruction is CFA offset related CFI,
  // make sure the CFI follows the SP update either by merging at the location
  // of the update or by moving the CFI after the merged instruction. If unable
  // to do so, bail.
  MachineBasicBlock::iterator InsertPt = I;
  if (IsForward) {
    assert(IsPreIdx);
    if (auto CFI = maybeMoveCFI(MI&: *Update, MaybeCFI: next_nodbg(It: Update, End: E)); CFI != E) {
      if (MergeEither) {
        // Emitting at the update's position keeps the CFI right after the
        // merged instruction without moving it.
        InsertPt = Update;
      } else {
        // Take care not to reorder CFIs.
        if (std::any_of(first: std::next(x: CFI), last: I, pred: [](const auto &Insn) {
              return Insn.getOpcode() == TargetOpcode::CFI_INSTRUCTION;
            }))
          return std::nullopt;

        // Move the CFA-defining CFI to directly after the merged instruction.
        MachineBasicBlock *MBB = InsertPt->getParent();
        MBB->splice(Where: std::next(x: InsertPt), Other: MBB, From: CFI);
      }
    }
  }

  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (NextI == Update)
    NextI = next_nodbg(It: NextI, End: E);

  int Value = Update->getOperand(i: 2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  // SUBXri decrements the base register.
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(Opc: I->getOpcode())
                             : getPostIndexedOpcode(Opc: I->getOpcode());
  MachineInstrBuilder MIB;
  int Scale, MinOffset, MaxOffset;
  getPrePostIndexedMemOpInfo(MI: *I, Scale, MinOffset, MaxOffset);
  if (!AArch64InstrInfo::isPairedLdSt(MI: *I)) {
    // Non-paired instruction.
    // Operands: writeback base def, transfer register, base, scaled offset.
    MIB = BuildMI(BB&: *InsertPt->getParent(), I: InsertPt, MIMD: InsertPt->getDebugLoc(),
                  MCID: TII->get(Opcode: NewOpc))
              .add(MO: Update->getOperand(i: 0))
              .add(MO: getLdStRegOp(MI&: *I))
              .add(MO: AArch64InstrInfo::getLdStBaseOp(MI: *I))
              .addImm(Val: Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(Other: *Update));
  } else {
    // Paired instruction.
    // Operands: writeback base def, both transfer registers, base, offset.
    MIB = BuildMI(BB&: *InsertPt->getParent(), I: InsertPt, MIMD: InsertPt->getDebugLoc(),
                  MCID: TII->get(Opcode: NewOpc))
              .add(MO: Update->getOperand(i: 0))
              .add(MO: getLdStRegOp(MI&: *I, PairedRegOp: 0))
              .add(MO: getLdStRegOp(MI&: *I, PairedRegOp: 1))
              .add(MO: AArch64InstrInfo::getLdStBaseOp(MI: *I))
              .addImm(Val: Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(Other: *Update));
  }

  if (IsPreIdx) {
    ++NumPreFolded;
    LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
  } else {
    ++NumPostFolded;
    LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
  }
  LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Update->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}
2388
// Rewrite the load/store \p I, whose large constant \p Offset was
// materialized by a MOVZWi followed by the MOVKWi \p Update (as validated by
// isMatchingMovConstInsn), into:
//   ADDXri IndexReg, BaseReg, High >> 12 (LSL #12)
//   Ld/St  DestReg, [IndexReg, Low / Scale]
// where Offset = High + Low and Low fits the scaled 12-bit unsigned
// immediate. The mov pair and the original memory instruction are erased.
// Returns the iterator following the original memory instruction.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
                                          MachineBasicBlock::iterator Update,
                                          unsigned Offset, int Scale) {
  assert((Update->getOpcode() == AArch64::MOVKWi) &&
         "Unexpected const mov instruction to merge!");
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
  // PrevI is the instruction before Update — the MOVZWi that
  // isMatchingMovConstInsn matched.
  MachineBasicBlock::iterator PrevI = prev_nodbg(It: Update, Begin: E);
  MachineInstr &MemMI = *I;
  // Mask selects the low bits that fit the scaled 12-bit unsigned immediate.
  unsigned Mask = (1 << 12) * Scale - 1;
  unsigned Low = Offset & Mask;
  unsigned High = Offset - Low;
  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
  Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getReg();
  MachineInstrBuilder AddMIB, MemMIB;

  // Add IndexReg, BaseReg, High (the BaseReg may be SP)
  AddMIB =
      BuildMI(BB&: *I->getParent(), I, MIMD: I->getDebugLoc(), MCID: TII->get(Opcode: AArch64::ADDXri))
          .addDef(RegNo: IndexReg)
          .addUse(RegNo: BaseReg)
          .addImm(Val: High >> 12) // shifted value
          .addImm(Val: 12); // shift 12
  (void)AddMIB;
  // Ld/St DestReg, IndexReg, Imm12
  unsigned NewOpc = getBaseAddressOpcode(Opc: I->getOpcode());
  MemMIB = BuildMI(BB&: *I->getParent(), I, MIMD: I->getDebugLoc(), MCID: TII->get(Opcode: NewOpc))
               .add(MO: getLdStRegOp(MI&: MemMI))
               .add(MO: AArch64InstrInfo::getLdStOffsetOp(MI: MemMI))
               .addImm(Val: Low / Scale)
               .setMemRefs(I->memoperands())
               .setMIFlags(I->mergeFlagsWith(Other: *Update));
  (void)MemMIB;

  ++NumConstOffsetFolded;
  LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
  LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
  LLVM_DEBUG(PrevI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Update->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  PrevI->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}
2445
// Return true if \p MI is an ADDXri/SUBXri that updates \p BaseReg (the base
// register of the memory instruction \p MemMI) by an amount foldable into a
// pre/post-indexed form of MemMI. A non-zero \p Offset requires the update
// amount to match exactly; zero accepts any in-range amount. Any failed check
// breaks out of the switch to the final `return false`.
bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
                                               MachineInstr &MI,
                                               unsigned BaseReg, int Offset) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI.getOperand(i: 2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(Imm: MI.getOperand(i: 3).getImm()))
      break;

    // The update instruction source and destination register must be the
    // same as the load/store base register.
    if (MI.getOperand(i: 0).getReg() != BaseReg ||
        MI.getOperand(i: 1).getReg() != BaseReg)
      break;

    // Normalize to a signed delta: SUBXri decrements the base.
    int UpdateOffset = MI.getOperand(i: 2).getImm();
    if (MI.getOpcode() == AArch64::SUBXri)
      UpdateOffset = -UpdateOffset;

    // The immediate must be a multiple of the scaling factor of the pre/post
    // indexed instruction.
    int Scale, MinOffset, MaxOffset;
    getPrePostIndexedMemOpInfo(MI: MemMI, Scale, MinOffset, MaxOffset);
    if (UpdateOffset % Scale != 0)
      break;

    // Scaled offset must fit in the instruction immediate.
    int ScaledOffset = UpdateOffset / Scale;
    if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
      break;

    // If we have a non-zero Offset, we check that it matches the amount
    // we're adding to the register.
    if (!Offset || Offset == UpdateOffset)
      return true;
    break;
  }
  return false;
}
2492
2493bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
2494 MachineInstr &MI,
2495 unsigned IndexReg,
2496 unsigned &Offset) {
2497 // The update instruction source and destination register must be the
2498 // same as the load/store index register.
2499 if (MI.getOpcode() == AArch64::MOVKWi &&
2500 TRI->isSuperOrSubRegisterEq(RegA: IndexReg, RegB: MI.getOperand(i: 1).getReg())) {
2501
2502 // movz + movk hold a large offset of a Ld/St instruction.
2503 MachineBasicBlock::iterator B = MI.getParent()->begin();
2504 MachineBasicBlock::iterator MBBI = &MI;
2505 // Skip the scene when the MI is the first instruction of a block.
2506 if (MBBI == B)
2507 return false;
2508 MBBI = prev_nodbg(It: MBBI, Begin: B);
2509 MachineInstr &MovzMI = *MBBI;
2510 // Make sure the MOVKWi and MOVZWi set the same register.
2511 if (MovzMI.getOpcode() == AArch64::MOVZWi &&
2512 MovzMI.getOperand(i: 0).getReg() == MI.getOperand(i: 0).getReg()) {
2513 unsigned Low = MovzMI.getOperand(i: 1).getImm();
2514 unsigned High = MI.getOperand(i: 2).getImm() << MI.getOperand(i: 3).getImm();
2515 Offset = High + Low;
2516 // 12-bit optionally shifted immediates are legal for adds.
2517 return Offset >> 24 == 0;
2518 }
2519 }
2520 return false;
2521}
2522
// Scan forward from the load/store I for an add/sub immediate of its base
// register that can be folded in as a post-index writeback. UnscaledOffset is
// the byte offset the memory instruction must have for the fold to be valid.
// Returns the iterator of the matching update instruction, or the block's end
// iterator if none is found within Limit non-transient instructions. When
// liveness is tracked, the search may continue into a unique successor along
// which the base register is live.
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;

  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
  int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getImm() *
                         TII->getMemScale(MI: MemMI);

  // Scan forward looking for post-index opportunities. Updating instructions
  // can't be formed if the memory instruction doesn't have the offset we're
  // looking for.
  if (MIUnscaledOffset != UnscaledOffset)
    return E;

  // If the base register overlaps a source/destination register, we can't
  // merge the update. This does not apply to tag store instructions which
  // ignore the address part of the source register.
  // This does not apply to STGPi as well, which does not have unpredictable
  // behavior in this case unlike normal stores, and always performs writeback
  // after reading the source register value.
  if (!isTagStore(MI: MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MI: MemMI);
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
      Register DestReg = getLdStRegOp(MI&: MemMI, PairedRegOp: i).getReg();
      if (DestReg == BaseReg || TRI->isSubRegister(RegA: BaseReg, RegB: DestReg))
        return E;
    }
  }

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  MBBI = next_nodbg(It: MBBI, End: E);

  // We can't post-increment the stack pointer if any instruction between
  // the memory access (I) and the increment (MBBI) can access the memory
  // region defined by [SP, MBBI].
  const bool BaseRegSP = BaseReg == AArch64::SP;
  if (BaseRegSP && needsWinCFI(MF: I->getMF())) {
    // FIXME: For now, we always block the optimization over SP in windows
    // targets as it requires to adjust the unwind/debug info, messing up
    // the unwind info can actually cause a miscompile.
    return E;
  }

  unsigned Count = 0;
  MachineBasicBlock *CurMBB = I->getParent();
  // Choice of the next block to visit is liveins-based, so it is only valid
  // when liveness information is tracked.
  bool VisitSucc = CurMBB->getParent()->getRegInfo().tracksLiveness();

  while (true) {
    // Walk the current block looking for a matching update instruction.
    for (MachineBasicBlock::iterator CurEnd = CurMBB->end();
         MBBI != CurEnd && Count < Limit; MBBI = next_nodbg(It: MBBI, End: CurEnd)) {
      MachineInstr &MI = *MBBI;

      // Don't count transient instructions towards the search limit since there
      // may be different numbers of them if e.g. debug information is present.
      if (!MI.isTransient())
        ++Count;

      // If we found a match, return it.
      if (isMatchingUpdateInsn(MemMI&: *I, MI, BaseReg, Offset: UnscaledOffset))
        return MBBI;

      // Update the status of what the instruction clobbered and used.
      LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
                                        TRI);

      // Otherwise, if the base register is used or modified, we have no match,
      // so return early. If we are optimizing SP, do not allow instructions
      // that may load or store in between the load and the optimized value
      // update.
      if (!ModifiedRegUnits.available(Reg: BaseReg) ||
          !UsedRegUnits.available(Reg: BaseReg) ||
          (BaseRegSP && MBBI->mayLoadOrStore()))
        return E;
    }

    if (!VisitSucc || Limit <= Count)
      break;

    // Try to go downward to successors along a CF path w/o side enters
    // such that BaseReg is alive along it but not at its exits. Give up if
    // BaseReg is live into more than one successor.
    MachineBasicBlock *SuccToVisit = nullptr;
    unsigned LiveSuccCount = 0;
    for (MachineBasicBlock *Succ : CurMBB->successors()) {
      for (MCRegAliasIterator AI(BaseReg, TRI, true); AI.isValid(); ++AI) {
        if (Succ->isLiveIn(Reg: *AI)) {
          if (LiveSuccCount++)
            return E;
          // Only descend into a successor that cannot be entered from
          // elsewhere (single predecessor).
          if (Succ->pred_size() == 1)
            SuccToVisit = Succ;
          break;
        }
      }
    }
    if (!SuccToVisit)
      break;
    CurMBB = SuccToVisit;
    MBBI = CurMBB->begin();
  }

  return E;
}
2630
// Scan backward from the load/store I (whose immediate offset must be zero)
// for an add/sub immediate of its base register that can be folded in as a
// pre-index writeback. MergeEither is set to true when the combined
// instruction may be placed at either the update's or the memory
// instruction's position. Returns the update instruction, or the block's end
// iterator if none is found within Limit non-transient instructions.
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit, bool &MergeEither) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;
  MachineFunction &MF = *MemMI.getMF();

  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getImm();

  // DestReg[1] is NoRegister for non-paired instructions.
  bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MI: MemMI);
  Register DestReg[] = {getLdStRegOp(MI&: MemMI, PairedRegOp: 0).getReg(),
                        IsPairedInsn ? getLdStRegOp(MI&: MemMI, PairedRegOp: 1).getReg()
                                     : AArch64::NoRegister};

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;
  // If the base register overlaps a destination register, we can't
  // merge the update.
  if (!isTagStore(MI: MemMI)) {
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i)
      if (DestReg[i] == BaseReg || TRI->isSubRegister(RegA: BaseReg, RegB: DestReg[i]))
        return E;
  }

  const bool BaseRegSP = BaseReg == AArch64::SP;
  if (BaseRegSP && needsWinCFI(MF: I->getMF())) {
    // FIXME: For now, we always block the optimization over SP in windows
    // targets as it requires to adjust the unwind/debug info, messing up
    // the unwind info can actually cause a miscompile.
    return E;
  }

  // The red zone limits how far below SP a pre-increment may be moved when a
  // memory access to the stack occurs before it (validated below on match).
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  unsigned RedZoneSize =
      Subtarget.getTargetLowering()->getRedZoneSize(F: MF.getFunction());

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  unsigned Count = 0;
  bool MemAccessBeforeSPPreInc = false;
  MergeEither = true;
  do {
    MBBI = prev_nodbg(It: MBBI, Begin: B);
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(MemMI&: *I, MI, BaseReg, Offset)) {
      // Check that the update value is within our red zone limit (which may be
      // zero).
      if (MemAccessBeforeSPPreInc && MBBI->getOperand(i: 2).getImm() > RedZoneSize)
        return E;
      return MBBI;
    }

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is used or modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(Reg: BaseReg) ||
        !UsedRegUnits.available(Reg: BaseReg))
      return E;

    // If we have a destination register (i.e. a load instruction) and a
    // destination register is used or modified, then we can only merge forward,
    // i.e. the combined instruction is put in the place of the memory
    // instruction. Same applies if we see a memory access or side effects.
    if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects() ||
        (DestReg[0] != AArch64::NoRegister &&
         !(ModifiedRegUnits.available(Reg: DestReg[0]) &&
           UsedRegUnits.available(Reg: DestReg[0]))) ||
        (DestReg[1] != AArch64::NoRegister &&
         !(ModifiedRegUnits.available(Reg: DestReg[1]) &&
           UsedRegUnits.available(Reg: DestReg[1]))))
      MergeEither = false;

    // Keep track if we have a memory access before an SP pre-increment, in this
    // case we need to validate later that the update amount respects the red
    // zone.
    if (BaseRegSP && MBBI->mayLoadOrStore())
      MemAccessBeforeSPPreInc = true;
  } while (MBBI != B && Count < Limit);
  return E;
}
2726
// Scan backward from the reg+reg load/store I for a MOVZWi/MOVKWi pair that
// materializes the constant held in its (killed, unshifted) index register.
// On success, return the MOVKWi iterator and set Offset to the materialized
// constant; otherwise return the block's end iterator.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
    MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;

  // If the load is the first instruction in the block, there's obviously
  // not any matching load or store.
  if (MBBI == B)
    return E;

  // Make sure the IndexReg is killed and the shift amount is zero.
  // TODO: Relax this restriction to extend, simplify processing now.
  if (!AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).isKill() ||
      !AArch64InstrInfo::getLdStAmountOp(MI: MemMI).isImm() ||
      (AArch64InstrInfo::getLdStAmountOp(MI: MemMI).getImm() != 0))
    return E;

  Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getReg();

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  unsigned Count = 0;
  do {
    MBBI = prev_nodbg(It: MBBI, Begin: B);
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingMovConstInsn(MemMI&: *I, MI, IndexReg, Offset)) {
      return MBBI;
    }

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the index register is used or modified, we have no match,
    // so return early.
    if (!ModifiedRegUnits.available(Reg: IndexReg) ||
        !UsedRegUnits.available(Reg: IndexReg))
      return E;

  } while (MBBI != B && Count < Limit);
  return E;
}
2780
2781bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
2782 MachineBasicBlock::iterator &MBBI) {
2783 MachineInstr &MI = *MBBI;
2784 // If this is a volatile load, don't mess with it.
2785 if (MI.hasOrderedMemoryRef())
2786 return false;
2787
2788 if (needsWinCFI(MF: MI.getMF()) && MI.getFlag(Flag: MachineInstr::FrameDestroy))
2789 return false;
2790
2791 // Make sure this is a reg+imm.
2792 // FIXME: It is possible to extend it to handle reg+reg cases.
2793 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
2794 return false;
2795
2796 // Look backward up to LdStLimit instructions.
2797 MachineBasicBlock::iterator StoreI;
2798 if (findMatchingStore(I: MBBI, Limit: LdStLimit, StoreI)) {
2799 ++NumLoadsFromStoresPromoted;
2800 // Promote the load. Keeping the iterator straight is a
2801 // pain, so we let the merge routine tell us what the next instruction
2802 // is after it's done mucking about.
2803 MBBI = promoteLoadFromStore(LoadI: MBBI, StoreI);
2804 return true;
2805 }
2806 return false;
2807}
2808
2809// Merge adjacent zero stores into a wider store.
2810bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
2811 MachineBasicBlock::iterator &MBBI) {
2812 assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
2813 MachineInstr &MI = *MBBI;
2814 MachineBasicBlock::iterator E = MI.getParent()->end();
2815
2816 if (!TII->isCandidateToMergeOrPair(MI))
2817 return false;
2818
2819 // Look ahead up to LdStLimit instructions for a mergeable instruction.
2820 LdStPairFlags Flags;
2821 MachineBasicBlock::iterator MergeMI =
2822 findMatchingInsn(I: MBBI, Flags, Limit: LdStLimit, /* FindNarrowMerge = */ true);
2823 if (MergeMI != E) {
2824 ++NumZeroStoresPromoted;
2825
2826 // Keeping the iterator straight is a pain, so we let the merge routine tell
2827 // us what the next instruction is after it's done mucking about.
2828 MBBI = mergeNarrowZeroStores(I: MBBI, MergeMI, Flags);
2829 return true;
2830 }
2831 return false;
2832}
2833
// Find loads and stores that can be merged into a single load or store pair
// instruction. On success, repositions MBBI past the merged instruction,
// updates the per-block liveness tracker, and returns true.
bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // If disable-ldp feature is opted, do not emit ldp.
  if (MI.mayLoad() && Subtarget->hasDisableLdp())
    return false;

  // If disable-stp feature is opted, do not emit stp.
  if (MI.mayStore() && Subtarget->hasDisableStp())
    return false;

  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1)
  bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
  int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
  // Allow one more for offset.
  if (Offset > 0)
    Offset -= OffsetStride;
  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return false;

  // Look ahead up to LdStLimit instructions for a pairable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired =
      findMatchingInsn(I: MBBI, Flags, Limit: LdStLimit, /* FindNarrowMerge = */ false);
  if (Paired != E) {
    // Keeping the iterator straight is a pain, so we let the merge routine tell
    // us what the next instruction is after it's done mucking about.
    auto Prev = std::prev(x: MBBI);

    // Fetch the memoperand of the load/store that is a candidate for
    // combination.
    MachineMemOperand *MemOp =
        MI.memoperands_empty() ? nullptr : MI.memoperands().front();

    // If a load/store arrives and ldp/stp-aligned-only feature is opted, check
    // that the alignment of the source pointer is at least double the alignment
    // of the type.
    if ((MI.mayLoad() && Subtarget->hasLdpAlignedOnly()) ||
        (MI.mayStore() && Subtarget->hasStpAlignedOnly())) {
      // If there is no size/align information, cancel the transformation.
      if (!MemOp || !MemOp->getMemoryType().isValid()) {
        NumFailedAlignmentCheck++;
        return false;
      }

      // Get the needed alignments to check them if
      // ldp-aligned-only/stp-aligned-only features are opted.
      uint64_t MemAlignment = MemOp->getAlign().value();
      uint64_t TypeAlignment =
          Align(MemOp->getSize().getValue().getKnownMinValue()).value();

      if (MemAlignment < 2 * TypeAlignment) {
        NumFailedAlignmentCheck++;
        return false;
      }
    }

    ++NumPairCreated;
    if (TII->hasUnscaledLdStOffset(MI))
      ++NumUnscaledPairCreated;

    MBBI = mergePairedInsns(I: MBBI, Paired, Flags);
    // Collect liveness info for instructions between Prev and the new position
    // MBBI.
    for (auto I = std::next(x: Prev); I != MBBI; I++)
      updateDefinedRegisters(MI&: *I, Units&: DefinedInBB, TRI);

    return true;
  }
  return false;
}
2914
// Try to fold a base-register add/sub into the load/store at MBBI, producing
// a pre- or post-indexed form. Three patterns are tried in turn: a following
// update (post-index), a preceding update (pre-index), and a following update
// whose amount matches the memory offset (pre-index). On success, repositions
// MBBI and returns true.
bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
    (MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();
  MachineBasicBlock::iterator Update;

  // Look forward to try to form a post-index instruction. For example,
  // ldr x0, [x20]
  // add x20, x20, #32
  //   merged into:
  // ldr x0, [x20], #32
  Update = findMatchingUpdateInsnForward(I: MBBI, UnscaledOffset: 0, Limit: UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /*IsForward=*/false,
                                     /*IsPreIdx=*/false,
                                     /*MergeEither=*/false)) {
      MBBI = *NextI;
      return true;
    }
  }

  // Don't know how to handle unscaled pre/post-index versions below, so bail.
  if (TII->hasUnscaledLdStOffset(Opc: MI.getOpcode()))
    return false;

  // Look back to try to find a pre-index instruction. For example,
  // add x0, x0, #8
  // ldr x1, [x0]
  //   merged into:
  // ldr x1, [x0, #8]!
  bool MergeEither;
  Update = findMatchingUpdateInsnBackward(I: MBBI, Limit: UpdateLimit, MergeEither);
  if (Update != E) {
    // Merge the update into the ld/st.
    if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /*IsForward=*/true,
                                     /*IsPreIdx=*/true, MergeEither)) {
      MBBI = *NextI;
      return true;
    }
  }

  // The immediate in the load/store is scaled by the size of the memory
  // operation. The immediate in the add we're looking for,
  // however, is not, so adjust here.
  int UnscaledOffset =
      AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);

  // Look forward to try to find a pre-index instruction. For example,
  // ldr x1, [x0, #64]
  // add x0, x0, #64
  //   merged into:
  // ldr x1, [x0, #64]!
  Update = findMatchingUpdateInsnForward(I: MBBI, UnscaledOffset, Limit: UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /*IsForward=*/false,
                                     /*IsPreIdx=*/true,
                                     /*MergeEither=*/false)) {
      MBBI = *NextI;
      return true;
    }
  }

  return false;
}
2981
2982bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
2983 int Scale) {
2984 MachineInstr &MI = *MBBI;
2985 MachineBasicBlock::iterator E = MI.getParent()->end();
2986 MachineBasicBlock::iterator Update;
2987
2988 // Don't know how to handle unscaled pre/post-index versions below, so bail.
2989 if (TII->hasUnscaledLdStOffset(Opc: MI.getOpcode()))
2990 return false;
2991
2992 // Look back to try to find a const offset for index LdSt instruction. For
2993 // example,
2994 // mov x8, #LargeImm ; = a * (1<<12) + imm12
2995 // ldr x1, [x0, x8]
2996 // merged into:
2997 // add x8, x0, a * (1<<12)
2998 // ldr x1, [x8, imm12]
2999 unsigned Offset;
3000 Update = findMatchingConstOffsetBackward(I: MBBI, Limit: LdStConstLimit, Offset);
3001 if (Update != E && (Offset & (Scale - 1)) == 0) {
3002 // Merge the imm12 into the ld/st.
3003 MBBI = mergeConstOffsetInsn(I: MBBI, Update, Offset, Scale);
3004 return true;
3005 }
3006
3007 return false;
3008}
3009
// Run the load/store peephole transformations over a single basic block.
// EnableNarrowZeroStOpt gates the merging of adjacent narrow zero stores.
// Returns true if any instruction in the block was changed.
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool EnableNarrowZeroStOpt) {
  AArch64FunctionInfo &AFI = *MBB.getParent()->getInfo<AArch64FunctionInfo>();

  bool Modified = false;
  // Five transformations to do here:
  // 1) Find loads that directly read from stores and promote them by
  //    replacing with mov instructions. If the store is wider than the load,
  //    the load will be replaced with a bitfield extract.
  //      e.g.,
  //        str w1, [x0, #4]
  //        ldrh w2, [x0, #6]
  //        ; becomes
  //        str w1, [x0, #4]
  //        lsr w2, w1, #16
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isPromotableLoadFromStore(MI&: *MBBI) && tryToPromoteLoadFromStore(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 2) Merge adjacent zero stores into a wider store.
  //      e.g.,
  //        strh wzr, [x0]
  //        strh wzr, [x0, #2]
  //        ; becomes
  //        str wzr, [x0]
  //      e.g.,
  //        str wzr, [x0]
  //        str wzr, [x0, #4]
  //        ; becomes
  //        str xzr, [x0]
  if (EnableNarrowZeroStOpt)
    for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
         MBBI != E;) {
      if (isPromotableZeroStoreInst(MI&: *MBBI) && tryToMergeZeroStInst(MBBI))
        Modified = true;
      else
        ++MBBI;
    }
  // 3) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //    When compiling for SVE 128, also try to combine SVE fill/spill
  //    instructions into LDP/STP.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]
  //      e.g.,
  //        ldr z0, [x2]
  //        ldr z1, [x2, #1, mul vl]
  //        ; becomes
  //        ldp q0, q1, [x2]

  // Seed the set of registers defined so far in this block with its
  // live-ins; only meaningful when liveness is tracked.
  if (MBB.getParent()->getRegInfo().tracksLiveness()) {
    DefinedInBB.clear();
    DefinedInBB.addLiveIns(MBB);
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    // Track currently live registers up to this point, to help with
    // searching for a rename register on demand.
    updateDefinedRegisters(MI&: *MBBI, Units&: DefinedInBB, TRI);
    if (TII->isPairableLdStInst(MI: *MBBI) && tryToPairLdStInst(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 4) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isMergeableLdStUpdate(MI&: *MBBI, AFI) && tryToMergeLdStUpdate(MBBI))
      Modified = true;
    else
      ++MBBI;
  }

  // 5) Find a register assigned with a const value that can be combined
  //    into the load or store. e.g.,
  //        mov x8, #LargeImm   ; = a * (1<<12) + imm12
  //        ldr x1, [x0, x8]
  //        ; becomes
  //        add x8, x0, a * (1<<12)
  //        ldr x1, [x8, imm12]
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    int Scale;
    if (isMergeableIndexLdSt(MI&: *MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale))
      Modified = true;
    else
      ++MBBI;
  }

  return Modified;
}
3114
3115bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
3116 if (skipFunction(F: Fn.getFunction()))
3117 return false;
3118
3119 Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
3120 TII = Subtarget->getInstrInfo();
3121 TRI = Subtarget->getRegisterInfo();
3122 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
3123
3124 // Resize the modified and used register unit trackers. We do this once
3125 // per function and then clear the register units each time we optimize a load
3126 // or store.
3127 ModifiedRegUnits.init(TRI: *TRI);
3128 UsedRegUnits.init(TRI: *TRI);
3129 DefinedInBB.init(TRI: *TRI);
3130
3131 bool Modified = false;
3132 bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
3133 for (auto &MBB : Fn) {
3134 auto M = optimizeBlock(MBB, EnableNarrowZeroStOpt: enableNarrowZeroStOpt);
3135 Modified |= M;
3136 }
3137
3138 return Modified;
3139}
3140
3141// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
3142// stores near one another? Note: The pre-RA instruction scheduler already has
3143// hooks to try and schedule pairable loads/stores together to improve pairing
3144// opportunities. Thus, pre-RA pairing pass may not be worth the effort.
3145
3146// FIXME: When pairing store instructions it's very possible for this pass to
3147// hoist a store with a KILL marker above another use (without a KILL marker).
3148// The resulting IR is invalid, but nothing uses the KILL markers after this
3149// pass, so it's never caused a problem in practice.
3150
/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass. Per the file header, the pass is intended
/// to run after register allocation.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}
3156