1//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains a pass that performs load / store related peephole
10// optimizations. This pass should be run after register allocation.
11//
12// The pass runs after the PrologEpilogInserter where we emit the CFI
13// instructions. In order to preserve the correctness of the unwind information,
14// the pass should not change the order of any two instructions, one of which
// has the FrameSetup/FrameDestroy flag or, alternatively, apply an ad-hoc fix
16// to unwind information.
17//
18//===----------------------------------------------------------------------===//
19
20#include "AArch64InstrInfo.h"
21#include "AArch64MachineFunctionInfo.h"
22#include "AArch64Subtarget.h"
23#include "MCTargetDesc/AArch64AddressingModes.h"
24#include "llvm/ADT/SmallVector.h"
25#include "llvm/ADT/Statistic.h"
26#include "llvm/ADT/StringRef.h"
27#include "llvm/ADT/iterator_range.h"
28#include "llvm/Analysis/AliasAnalysis.h"
29#include "llvm/CodeGen/MachineBasicBlock.h"
30#include "llvm/CodeGen/MachineFunction.h"
31#include "llvm/CodeGen/MachineFunctionPass.h"
32#include "llvm/CodeGen/MachineInstr.h"
33#include "llvm/CodeGen/MachineInstrBuilder.h"
34#include "llvm/CodeGen/MachineOperand.h"
35#include "llvm/CodeGen/MachineRegisterInfo.h"
36#include "llvm/CodeGen/TargetRegisterInfo.h"
37#include "llvm/IR/DebugLoc.h"
38#include "llvm/MC/MCAsmInfo.h"
39#include "llvm/MC/MCDwarf.h"
40#include "llvm/Pass.h"
41#include "llvm/Support/CommandLine.h"
42#include "llvm/Support/Debug.h"
43#include "llvm/Support/DebugCounter.h"
44#include "llvm/Support/ErrorHandling.h"
45#include <cassert>
46#include <cstdint>
47#include <functional>
48#include <iterator>
49#include <limits>
50#include <optional>
51
52using namespace llvm;
53
54#define DEBUG_TYPE "aarch64-ldst-opt"
55
// Pass statistics, reported with -stats; one counter per transformation kind
// this pass performs.
STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store from unscaled generated");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
STATISTIC(NumFailedAlignmentCheck, "Number of load/store pair transformation "
                                   "not passed the alignment check");
STATISTIC(NumConstOffsetFolded,
          "Number of const offset of index address folded");

// Debug counter, useful for bisecting which rename decision introduces a
// miscompile (-debug-counter=aarch64-ldst-opt-reg-renaming=...).
DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
              "Controls which pairs are considered for renaming");

// The LdStLimit limits how far we search for load/store pairs.
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
                                   cl::init(Val: 20), cl::Hidden);

// The UpdateLimit limits how far we search for update instructions when we form
// pre-/post-index instructions.
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(Val: 100),
                                     cl::Hidden);

// The LdStConstLimit limits how far we search for const offset instructions
// when we form index address load/store instructions.
static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
                                        cl::init(Val: 10), cl::Hidden);

// Enable register renaming to find additional store pairing opportunities.
static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
                                    cl::init(Val: true), cl::Hidden);
89#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
90
91namespace {
92
93using LdStPairFlags = struct LdStPairFlags {
94 // If a matching instruction is found, MergeForward is set to true if the
95 // merge is to remove the first instruction and replace the second with
96 // a pair-wise insn, and false if the reverse is true.
97 bool MergeForward = false;
98
99 // SExtIdx gives the index of the result of the load pair that must be
100 // extended. The value of SExtIdx assumes that the paired load produces the
101 // value in this order: (I, returned iterator), i.e., -1 means no value has
102 // to be extended, 0 means I, and 1 means the returned iterator.
103 int SExtIdx = -1;
104
105 // If not none, RenameReg can be used to rename the result register of the
106 // first store in a pair. Currently this only works when merging stores
107 // forward.
108 std::optional<MCPhysReg> RenameReg;
109
110 LdStPairFlags() = default;
111
112 void setMergeForward(bool V = true) { MergeForward = V; }
113 bool getMergeForward() const { return MergeForward; }
114
115 void setSExtIdx(int V) { SExtIdx = V; }
116 int getSExtIdx() const { return SExtIdx; }
117
118 void setRenameReg(MCPhysReg R) { RenameReg = R; }
119 void clearRenameReg() { RenameReg = std::nullopt; }
120 std::optional<MCPhysReg> getRenameReg() const { return RenameReg; }
121};
122
// Implementation of the load/store optimizer. Holds per-function analysis
// pointers and liveness scratch state used while scanning blocks.
struct AArch64LoadStoreOpt {
  AliasAnalysis *AA;
  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const AArch64Subtarget *Subtarget;

  // Track which register units have been modified and used.
  LiveRegUnits ModifiedRegUnits, UsedRegUnits;
  // Register units defined so far in the current block — presumably consulted
  // by the store-renaming logic; confirm against the implementations below.
  LiveRegUnits DefinedInBB;

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit,
                                               bool FindNarrowMerge);

  // Scan the instructions looking for a store that writes to the address from
  // which the current load instruction reads. Return true if one is found.
  bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
                         MachineBasicBlock::iterator &StoreI);

  // Merge the two instructions indicated into a wider narrow store instruction.
  MachineBasicBlock::iterator
  mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                        MachineBasicBlock::iterator MergeMI,
                        const LdStPairFlags &Flags);

  // Merge the two instructions indicated into a single pair-wise instruction.
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Promote the load that reads directly from the address stored to.
  MachineBasicBlock::iterator
  promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                       MachineBasicBlock::iterator StoreI);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                int UnscaledOffset, unsigned Limit);

  // Scan the instruction list to find a register assigned with a const
  // value that can be combined with the current instruction (a load or store)
  // using base addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
                                  unsigned &Offset);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  // `MergeEither` is set to true if the combined instruction may be placed
  // either at the location of the load/store instruction or at the location of
  // the update instruction.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit,
                                 bool &MergeEither);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                            unsigned BaseReg, int Offset);

  // Check whether MI moves a constant into IndexReg that can be folded into
  // MemMI's addressing (exact contract defined by the implementation).
  bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
                              unsigned IndexReg, unsigned &Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  std::optional<MachineBasicBlock::iterator>
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsForward,
                  bool IsPreIdx, bool MergeEither);

  // Fold the constant offset provided by Update into the addressing of the
  // ld/st at I, scaled by Scale.
  MachineBasicBlock::iterator
  mergeConstOffsetInsn(MachineBasicBlock::iterator I,
                       MachineBasicBlock::iterator Update, unsigned Offset,
                       int Scale);

  // Find and merge zero store instructions.
  bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);

  // Find and pair ldr/str instructions.
  bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);

  // Find and promote load instructions which read directly from store.
  bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);

  // Find and merge a base register updates before or after a ld/st instruction.
  bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);

  // Find and merge an index ldr/st instruction into a base ld/st instruction.
  bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);

  // Run the peephole optimizations over a single basic block.
  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

  bool runOnMachineFunction(MachineFunction &MF);
};
225
// Legacy pass-manager wrapper around AArch64LoadStoreOpt.
struct AArch64LoadStoreOptLegacy : public MachineFunctionPass {
  static char ID;

  AArch64LoadStoreOptLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &Fn) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // The optimizer consumes alias analysis (the AA member of
    // AArch64LoadStoreOpt), so require it here.
    AU.addRequired<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    // This pass runs after register allocation: no virtual registers remain.
    return MachineFunctionProperties().setNoVRegs();
  }

  StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
};

char AArch64LoadStoreOptLegacy::ID = 0;
246
247} // end anonymous namespace
248
// Register the legacy pass under the command-line name "aarch64-ldst-opt".
INITIALIZE_PASS(AArch64LoadStoreOptLegacy, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)
251
252static bool isNarrowStore(unsigned Opc) {
253 switch (Opc) {
254 default:
255 return false;
256 case AArch64::STRBBui:
257 case AArch64::STURBBi:
258 case AArch64::STRHHui:
259 case AArch64::STURHHi:
260 return true;
261 }
262}
263
264// These instruction set memory tag and either keep memory contents unchanged or
265// set it to zero, ignoring the address part of the source register.
266static bool isTagStore(const MachineInstr &MI) {
267 switch (MI.getOpcode()) {
268 default:
269 return false;
270 case AArch64::STGi:
271 case AArch64::STZGi:
272 case AArch64::ST2Gi:
273 case AArch64::STZ2Gi:
274 return true;
275 }
276}
277
// Map an opcode to its non-sign-extending equivalent: LDRSW* forms become the
// corresponding LDRW* forms, and opcodes without sign-extending semantics map
// to themselves. If IsValidLdStrOpc is non-null it is set to false (and
// UINT_MAX returned) for opcodes this pass does not handle.
static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return std::numeric_limits<unsigned>::max();
  // No sign-extending variant exists for these opcodes; they map to
  // themselves.
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRDpre:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRQpre:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
  case AArch64::STRWui:
  case AArch64::STRWpre:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STRXpre:
  case AArch64::STURXi:
  case AArch64::STR_ZXI:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRDpre:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRQpre:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRWpre:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::LDRXpre:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRSpre:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSpre:
  case AArch64::LDR_ZXI:
    return Opc;
  // Sign-extending word loads map to the plain word-load forms.
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  case AArch64::LDRSWpre:
    return AArch64::LDRWpre;
  }
}
332
333static unsigned getMatchingWideOpcode(unsigned Opc) {
334 switch (Opc) {
335 default:
336 llvm_unreachable("Opcode has no wide equivalent!");
337 case AArch64::STRBBui:
338 return AArch64::STRHHui;
339 case AArch64::STRHHui:
340 return AArch64::STRWui;
341 case AArch64::STURBBi:
342 return AArch64::STURHHi;
343 case AArch64::STURHHi:
344 return AArch64::STURWi;
345 case AArch64::STURWi:
346 return AArch64::STURXi;
347 case AArch64::STRWui:
348 return AArch64::STRXui;
349 }
350}
351
// Map a single load/store opcode to the LDP/STP opcode used when two such
// instructions are fused into a pair. Unscaled (LDUR/STUR) forms share the
// pair opcode of their scaled counterparts, and *pre forms map to the
// pre-indexed pair. NOTE(review): STR_ZXI/LDR_ZXI (SVE spill/fill) pair as
// Q-register pairs — presumably callers only allow this when the access is
// effectively 128 bits; confirm at the call sites.
static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRSpre:
    return AArch64::STPSpre;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRDpre:
    return AArch64::STPDpre;
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STR_ZXI:
    return AArch64::STPQi;
  case AArch64::STRQpre:
    return AArch64::STPQpre;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRWpre:
    return AArch64::STPWpre;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::STRXpre:
    return AArch64::STPXpre;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRSpre:
    return AArch64::LDPSpre;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRDpre:
    return AArch64::LDPDpre;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDR_ZXI:
    return AArch64::LDPQi;
  case AArch64::LDRQpre:
    return AArch64::LDPQpre;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRWpre:
    return AArch64::LDPWpre;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRXpre:
    return AArch64::LDPXpre;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  case AArch64::LDRSWpre:
    return AArch64::LDPSWpre;
  }
}
415
416static unsigned isMatchingStore(MachineInstr &LoadInst,
417 MachineInstr &StoreInst) {
418 unsigned LdOpc = LoadInst.getOpcode();
419 unsigned StOpc = StoreInst.getOpcode();
420 switch (LdOpc) {
421 default:
422 llvm_unreachable("Unsupported load instruction!");
423 case AArch64::LDRBBui:
424 return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
425 StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
426 case AArch64::LDURBBi:
427 return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
428 StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
429 case AArch64::LDRHHui:
430 return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
431 StOpc == AArch64::STRXui;
432 case AArch64::LDURHHi:
433 return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
434 StOpc == AArch64::STURXi;
435 case AArch64::LDRWui:
436 return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
437 case AArch64::LDURWi:
438 return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
439 case AArch64::LDRXui:
440 return StOpc == AArch64::STRXui;
441 case AArch64::LDURXi:
442 return StOpc == AArch64::STURXi;
443 }
444}
445
// Map a scaled (unsigned-offset) load/store or pair opcode to its pre-indexed
// (writeback before access) variant.
static unsigned getPreIndexedOpcode(unsigned Opc) {
  // FIXME: We don't currently support creating pre-indexed loads/stores when
  // the load or store is the unscaled version. If we decide to perform such an
  // optimization in the future the cases for the unscaled loads/stores will
  // need to be added here.
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  // Single stores.
  case AArch64::STRBui:
    return AArch64::STRBpre;
  case AArch64::STRHui:
    return AArch64::STRHpre;
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRBBui:
    return AArch64::STRBBpre;
  case AArch64::STRHHui:
    return AArch64::STRHHpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  // Single loads.
  case AArch64::LDRBui:
    return AArch64::LDRBpre;
  case AArch64::LDRHui:
    return AArch64::LDRHpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpre;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  // Load pairs.
  case AArch64::LDPSi:
    return AArch64::LDPSpre;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpre;
  case AArch64::LDPDi:
    return AArch64::LDPDpre;
  case AArch64::LDPQi:
    return AArch64::LDPQpre;
  case AArch64::LDPWi:
    return AArch64::LDPWpre;
  case AArch64::LDPXi:
    return AArch64::LDPXpre;
  // Store pairs.
  case AArch64::STPSi:
    return AArch64::STPSpre;
  case AArch64::STPDi:
    return AArch64::STPDpre;
  case AArch64::STPQi:
    return AArch64::STPQpre;
  case AArch64::STPWi:
    return AArch64::STPWpre;
  case AArch64::STPXi:
    return AArch64::STPXpre;
  // Memory-tagging stores.
  case AArch64::STGi:
    return AArch64::STGPreIndex;
  case AArch64::STZGi:
    return AArch64::STZGPreIndex;
  case AArch64::ST2Gi:
    return AArch64::ST2GPreIndex;
  case AArch64::STZ2Gi:
    return AArch64::STZ2GPreIndex;
  case AArch64::STGPi:
    return AArch64::STGPpre;
  }
}
526
// Map a register-offset (roX) load opcode to its base + unsigned-immediate
// (ui) form; used when a constant index register is folded into the offset.
static unsigned getBaseAddressOpcode(unsigned Opc) {
  // TODO: Add more index address stores.
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no base address equivalent!");
  case AArch64::LDRBroX:
    return AArch64::LDRBui;
  case AArch64::LDRBBroX:
    return AArch64::LDRBBui;
  case AArch64::LDRSBXroX:
    return AArch64::LDRSBXui;
  case AArch64::LDRSBWroX:
    return AArch64::LDRSBWui;
  case AArch64::LDRHroX:
    return AArch64::LDRHui;
  case AArch64::LDRHHroX:
    return AArch64::LDRHHui;
  case AArch64::LDRSHXroX:
    return AArch64::LDRSHXui;
  case AArch64::LDRSHWroX:
    return AArch64::LDRSHWui;
  case AArch64::LDRWroX:
    return AArch64::LDRWui;
  case AArch64::LDRSroX:
    return AArch64::LDRSui;
  case AArch64::LDRSWroX:
    return AArch64::LDRSWui;
  case AArch64::LDRDroX:
    return AArch64::LDRDui;
  case AArch64::LDRXroX:
    return AArch64::LDRXui;
  case AArch64::LDRQroX:
    return AArch64::LDRQui;
  }
}
562
// Map a load/store or pair opcode to its post-indexed (writeback after
// access) variant. Unlike getPreIndexedOpcode, unscaled (LDUR/STUR) forms are
// handled here as well, mapping to the same post-indexed opcode as their
// scaled counterparts.
static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed wise equivalent!");
  // Single stores.
  case AArch64::STRBui:
    return AArch64::STRBpost;
  case AArch64::STRHui:
    return AArch64::STRHpost;
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STRSpost;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STRDpost;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STRQpost;
  case AArch64::STRBBui:
    return AArch64::STRBBpost;
  case AArch64::STRHHui:
    return AArch64::STRHHpost;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STRWpost;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STRXpost;
  // Single loads.
  case AArch64::LDRBui:
    return AArch64::LDRBpost;
  case AArch64::LDRHui:
    return AArch64::LDRHpost;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDRQpost;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpost;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpost;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  // Load pairs.
  case AArch64::LDPSi:
    return AArch64::LDPSpost;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpost;
  case AArch64::LDPDi:
    return AArch64::LDPDpost;
  case AArch64::LDPQi:
    return AArch64::LDPQpost;
  case AArch64::LDPWi:
    return AArch64::LDPWpost;
  case AArch64::LDPXi:
    return AArch64::LDPXpost;
  // Store pairs.
  case AArch64::STPSi:
    return AArch64::STPSpost;
  case AArch64::STPDi:
    return AArch64::STPDpost;
  case AArch64::STPQi:
    return AArch64::STPQpost;
  case AArch64::STPWi:
    return AArch64::STPWpost;
  case AArch64::STPXi:
    return AArch64::STPXpost;
  // Memory-tagging stores.
  case AArch64::STGi:
    return AArch64::STGPostIndex;
  case AArch64::STZGi:
    return AArch64::STZGPostIndex;
  case AArch64::ST2Gi:
    return AArch64::ST2GPostIndex;
  case AArch64::STZ2Gi:
    return AArch64::STZ2GPostIndex;
  case AArch64::STGPi:
    return AArch64::STGPpost;
  }
}
649
// Return true if FirstMI is a pre-indexed load/store whose width and
// direction match MI (the scaled or unscaled plain form of the same
// operation), making the two candidates for a pre-indexed pair.
static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {

  unsigned OpcA = FirstMI.getOpcode();
  unsigned OpcB = MI.getOpcode();

  switch (OpcA) {
  default:
    return false;
  case AArch64::STRSpre:
    return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
  case AArch64::STRDpre:
    return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
  case AArch64::STRQpre:
    return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
  case AArch64::STRWpre:
    return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
  case AArch64::STRXpre:
    return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
  case AArch64::LDRSpre:
    return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
  case AArch64::LDRDpre:
    return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
  case AArch64::LDRQpre:
    return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
  case AArch64::LDRWpre:
    return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
  case AArch64::LDRXpre:
    return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
  case AArch64::LDRSWpre:
    return (OpcB == AArch64::LDRSWui) || (OpcB == AArch64::LDURSWi);
  }
}
682
683// Returns the scale and offset range of pre/post indexed variants of MI.
684static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
685 int &MinOffset, int &MaxOffset) {
686 bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
687 bool IsTagStore = isTagStore(MI);
688 // ST*G and all paired ldst have the same scale in pre/post-indexed variants
689 // as in the "unsigned offset" variant.
690 // All other pre/post indexed ldst instructions are unscaled.
691 Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
692
693 if (IsPaired) {
694 MinOffset = -64;
695 MaxOffset = 63;
696 } else {
697 MinOffset = -256;
698 MaxOffset = 255;
699 }
700}
701
702static MachineOperand &getLdStRegOp(MachineInstr &MI,
703 unsigned PairedRegOp = 0) {
704 assert(PairedRegOp < 2 && "Unexpected register operand idx.");
705 bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
706 if (IsPreLdSt)
707 PairedRegOp += 1;
708 unsigned Idx =
709 AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
710 return MI.getOperand(i: Idx);
711}
712
713static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
714 MachineInstr &StoreInst,
715 const AArch64InstrInfo *TII) {
716 assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
717 int LoadSize = TII->getMemScale(MI: LoadInst);
718 int StoreSize = TII->getMemScale(MI: StoreInst);
719 int UnscaledStOffset =
720 TII->hasUnscaledLdStOffset(MI&: StoreInst)
721 ? AArch64InstrInfo::getLdStOffsetOp(MI: StoreInst).getImm()
722 : AArch64InstrInfo::getLdStOffsetOp(MI: StoreInst).getImm() * StoreSize;
723 int UnscaledLdOffset =
724 TII->hasUnscaledLdStOffset(MI&: LoadInst)
725 ? AArch64InstrInfo::getLdStOffsetOp(MI: LoadInst).getImm()
726 : AArch64InstrInfo::getLdStOffsetOp(MI: LoadInst).getImm() * LoadSize;
727 return (UnscaledStOffset <= UnscaledLdOffset) &&
728 (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
729}
730
731static bool isPromotableZeroStoreInst(MachineInstr &MI) {
732 unsigned Opc = MI.getOpcode();
733 return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
734 isNarrowStore(Opc)) &&
735 getLdStRegOp(MI).getReg() == AArch64::WZR;
736}
737
738static bool isPromotableLoadFromStore(MachineInstr &MI) {
739 switch (MI.getOpcode()) {
740 default:
741 return false;
742 // Scaled instructions.
743 case AArch64::LDRBBui:
744 case AArch64::LDRHHui:
745 case AArch64::LDRWui:
746 case AArch64::LDRXui:
747 // Unscaled instructions.
748 case AArch64::LDURBBi:
749 case AArch64::LDURHHi:
750 case AArch64::LDURWi:
751 case AArch64::LDURXi:
752 return true;
753 }
754}
755
// Return true if MI is a load/store whose base-register add/sub update could
// be folded in, forming a pre- or post-indexed variant.
static bool isMergeableLdStUpdate(MachineInstr &MI, AArch64FunctionInfo &AFI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  // Scaled instructions.
  case AArch64::STRBui:
  case AArch64::STRHui:
  case AArch64::STRSui:
  case AArch64::STRDui:
  case AArch64::STRQui:
  case AArch64::STRXui:
  case AArch64::STRWui:
  case AArch64::STRHHui:
  case AArch64::STRBBui:
  case AArch64::LDRBui:
  case AArch64::LDRHui:
  case AArch64::LDRSui:
  case AArch64::LDRDui:
  case AArch64::LDRQui:
  case AArch64::LDRXui:
  case AArch64::LDRWui:
  case AArch64::LDRHHui:
  case AArch64::LDRBBui:
  case AArch64::STGi:
  case AArch64::STZGi:
  case AArch64::ST2Gi:
  case AArch64::STZ2Gi:
  case AArch64::STGPi:
  // Unscaled instructions.
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  // Paired instructions.
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    // Make sure this is a reg+imm (as opposed to an address reloc).
    if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
      return false;

    // When using stack tagging, simple sp+imm loads and stores are not
    // tag-checked, but pre- and post-indexed versions of them are, so we can't
    // replace the former with the latter. This transformation would be valid
    // if the load/store accesses an untagged stack slot, but we don't have
    // that information available after frame indices have been eliminated.
    if (AFI.isMTETagged() &&
        AArch64InstrInfo::getLdStBaseOp(MI).getReg() == AArch64::SP)
      return false;

    return true;
  }
}
824
825// Make sure this is a reg+reg Ld/St
// Make sure this is a reg+reg Ld/St. On success, Scale is set to the byte
// width of the access (the factor by which a folded constant index must be
// divisible to fit a scaled immediate).
static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  // Scaled instructions.
  // TODO: Add more index address stores.
  // 1-byte accesses.
  case AArch64::LDRBroX:
  case AArch64::LDRBBroX:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSBWroX:
    Scale = 1;
    return true;
  // 2-byte accesses.
  case AArch64::LDRHroX:
  case AArch64::LDRHHroX:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSHWroX:
    Scale = 2;
    return true;
  // 4-byte accesses.
  case AArch64::LDRWroX:
  case AArch64::LDRSroX:
  case AArch64::LDRSWroX:
    Scale = 4;
    return true;
  // 8-byte accesses.
  case AArch64::LDRDroX:
  case AArch64::LDRXroX:
    Scale = 8;
    return true;
  // 16-byte accesses.
  case AArch64::LDRQroX:
    Scale = 16;
    return true;
  }
}
859
860static bool isRewritableImplicitDef(const MachineOperand &MO) {
861 switch (MO.getParent()->getOpcode()) {
862 default:
863 return MO.isRenamable();
864 case AArch64::ORRWrs:
865 case AArch64::ADDWri:
866 return true;
867 }
868}
869
// Merge two adjacent zero stores (I and MergeMI) into one zero store of twice
// the width, inserted at whichever of the two positions Flags.MergeForward
// selects. Returns the iterator to resume scanning from.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                                           MachineBasicBlock::iterator MergeMI,
                                           const LdStPairFlags &Flags) {
  assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
         "Expected promotable zero stores.");

  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == MergeMI)
    NextI = next_nodbg(It: NextI, End: E);

  unsigned Opc = I->getOpcode();
  unsigned MergeMIOpc = MergeMI->getOpcode();
  bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
  bool IsMergedMIScaled = !TII->hasUnscaledLdStOffset(Opc: MergeMIOpc);
  // Stride (bytes per immediate unit) of each input; 1 when already unscaled.
  int OffsetStride = IsScaled ? TII->getMemScale(MI: *I) : 1;
  int MergeMIOffsetStride = IsMergedMIScaled ? TII->getMemScale(MI: *MergeMI) : 1;

  bool MergeForward = Flags.getMergeForward();
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
  // Also based on MergeForward is from where we copy the base register operand
  // so we get the flags compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? AArch64InstrInfo::getLdStBaseOp(MI: *MergeMI)
                   : AArch64InstrInfo::getLdStBaseOp(MI: *I);

  // Which register is Rt and which is Rt2 depends on the offset order.
  // Compare both offsets in bytes so scaled and unscaled inputs mix safely.
  int64_t IOffsetInBytes =
      AArch64InstrInfo::getLdStOffsetOp(MI: *I).getImm() * OffsetStride;
  int64_t MIOffsetInBytes =
      AArch64InstrInfo::getLdStOffsetOp(MI: *MergeMI).getImm() *
      MergeMIOffsetStride;
  // Select final offset based on the offset order: the wide store starts at
  // the lower of the two addresses.
  int64_t OffsetImm;
  if (IOffsetInBytes > MIOffsetInBytes)
    OffsetImm = MIOffsetInBytes;
  else
    OffsetImm = IOffsetInBytes;

  int NewOpcode = getMatchingWideOpcode(Opc);
  // Adjust final offset on scaled stores because the new instruction
  // has a different scale.
  if (!TII->hasUnscaledLdStOffset(Opc: NewOpcode)) {
    int NewOffsetStride = TII->getMemScale(Opc: NewOpcode);
    assert(((OffsetImm % NewOffsetStride) == 0) &&
           "Offset should be a multiple of the store memory scale");
    OffsetImm = OffsetImm / NewOffsetStride;
  }

  // Construct the new instruction: a single wide store of WZR/XZR with the
  // merged memory operands and flags of both inputs.
  DebugLoc DL = I->getDebugLoc();
  MachineBasicBlock *MBB = I->getParent();
  MachineInstrBuilder MIB;
  MIB = BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: NewOpcode))
            .addReg(RegNo: isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
            .add(MO: BaseRegOp)
            .addImm(Val: OffsetImm)
            .cloneMergedMemRefs(OtherMIs: {&*I, &*MergeMI})
            .setMIFlags(I->mergeFlagsWith(Other: *MergeMI));
  (void)MIB;

  LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(MergeMI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  I->eraseFromParent();
  MergeMI->eraseFromParent();
  return NextI;
}
951
952// Apply Fn to all instructions between MI and the beginning of the block, until
953// a def for DefReg is reached. Returns true, iff Fn returns true for all
954// visited instructions. Stop after visiting Limit iterations.
955static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
956 const TargetRegisterInfo *TRI, unsigned Limit,
957 std::function<bool(MachineInstr &, bool)> &Fn) {
958 auto MBB = MI.getParent();
959 for (MachineInstr &I :
960 instructionsWithoutDebug(It: MI.getReverseIterator(), End: MBB->instr_rend())) {
961 if (!Limit)
962 return false;
963 --Limit;
964
965 bool isDef = any_of(Range: I.operands(), P: [DefReg, TRI](MachineOperand &MOP) {
966 return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
967 TRI->regsOverlap(RegA: MOP.getReg(), RegB: DefReg);
968 });
969 if (!Fn(I, isDef))
970 return false;
971 if (isDef)
972 break;
973 }
974 return true;
975}
976
977static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
978 const TargetRegisterInfo *TRI) {
979
980 for (const MachineOperand &MOP : phys_regs_and_masks(MI))
981 if (MOP.isReg() && MOP.isKill())
982 Units.removeReg(Reg: MOP.getReg());
983
984 for (const MachineOperand &MOP : phys_regs_and_masks(MI))
985 if (MOP.isReg() && !MOP.isKill())
986 Units.addReg(Reg: MOP.getReg());
987}
988
989/// This function will add a new entry into the debugValueSubstitutions table
990/// when two instruction have been merged into a new one represented by \p
991/// MergedInstr.
992static void addDebugSubstitutionsToTable(MachineFunction *MF,
993 unsigned InstrNumToSet,
994 MachineInstr &OriginalInstr,
995 MachineInstr &MergedInstr) {
996
997 // Figure out the Operand Index of the destination register of the
998 // OriginalInstr in the new MergedInstr.
999 auto Reg = OriginalInstr.getOperand(i: 0).getReg();
1000 unsigned OperandNo = 0;
1001 bool RegFound = false;
1002 for (const auto Op : MergedInstr.operands()) {
1003 if (Op.getReg() == Reg) {
1004 RegFound = true;
1005 break;
1006 }
1007 OperandNo++;
1008 }
1009
1010 if (RegFound)
1011 MF->makeDebugValueSubstitution({OriginalInstr.peekDebugInstrNum(), 0},
1012 {InstrNumToSet, OperandNo});
1013}
1014
/// Merge the two load/store instructions \p I and \p Paired into a single
/// paired instruction (e.g. two LDRs into one LDP), inserted at whichever of
/// the two positions \p Flags.getMergeForward() selects. Handles register
/// renaming (when Flags carries a rename register), sign-extension fixup via
/// a KILL + SBFMXri sequence, SVE fill/spill conversion to the Q-register
/// form, kill-flag maintenance, and the debug-instr-number substitution
/// table. Returns an iterator to the next instruction to consider.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    NextI = next_nodbg(It: NextI, End: E);

  // If one of the two instructions is a sign-extending load (SExtIdx != -1),
  // pair using the non-sign-extending opcode; the extension is re-created
  // after the pair below.
  int SExtIdx = Flags.getSExtIdx();
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(Opc: I->getOpcode());
  bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
  int OffsetStride = IsUnscaled ? TII->getMemScale(MI: *I) : 1;

  bool MergeForward = Flags.getMergeForward();

  // If the pair search decided a register must be renamed to make the merge
  // legal, rewrite all operands overlapping the renamed register first.
  std::optional<MCPhysReg> RenameReg = Flags.getRenameReg();
  if (RenameReg) {
    MCRegister RegToRename = getLdStRegOp(MI&: *I).getReg();
    DefinedInBB.addReg(Reg: *RenameReg);

    // Return the sub/super register for RenameReg, matching the size of
    // OriginalReg.
    auto GetMatchingSubReg =
        [this, RenameReg](const TargetRegisterClass *C) -> MCPhysReg {
      for (MCPhysReg SubOrSuper :
           TRI->sub_and_superregs_inclusive(Reg: *RenameReg)) {
        if (C->contains(Reg: SubOrSuper))
          return SubOrSuper;
      }
      llvm_unreachable("Should have found matching sub or super register!");
    };

    std::function<bool(MachineInstr &, bool)> UpdateMIs =
        [this, RegToRename, GetMatchingSubReg, MergeForward](MachineInstr &MI,
                                                             bool IsDef) {
          if (IsDef) {
            bool SeenDef = false;
            for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
              MachineOperand &MOP = MI.getOperand(i: OpIdx);
              // Rename the first explicit definition and all implicit
              // definitions matching RegToRename.
              if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                  (!MergeForward || !SeenDef ||
                   (MOP.isDef() && MOP.isImplicit())) &&
                  TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename)) {
                assert((MOP.isImplicit() ||
                        (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
                       "Need renamable operands");
                Register MatchingReg;
                if (const TargetRegisterClass *RC =
                        MI.getRegClassConstraint(OpIdx, TII, TRI))
                  MatchingReg = GetMatchingSubReg(RC);
                else {
                  if (!isRewritableImplicitDef(MO: MOP))
                    continue;
                  MatchingReg = GetMatchingSubReg(
                      TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
                }
                MOP.setReg(MatchingReg);
                SeenDef = true;
              }
            }
          } else {
            for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
              MachineOperand &MOP = MI.getOperand(i: OpIdx);
              if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                  TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename)) {
                assert((MOP.isImplicit() ||
                        (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
                       "Need renamable operands");
                Register MatchingReg;
                if (const TargetRegisterClass *RC =
                        MI.getRegClassConstraint(OpIdx, TII, TRI))
                  MatchingReg = GetMatchingSubReg(RC);
                else
                  MatchingReg = GetMatchingSubReg(
                      TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
                assert(MatchingReg != AArch64::NoRegister &&
                       "Cannot find matching regs for renaming");
                MOP.setReg(MatchingReg);
              }
            }
          }
          LLVM_DEBUG(dbgs() << "Renamed " << MI);
          return true;
        };
    forAllMIsUntilDef(MI&: MergeForward ? *I : *Paired->getPrevNode(), DefReg: RegToRename,
                      TRI, UINT32_MAX, Fn&: UpdateMIs);

#if !defined(NDEBUG)
    // For forward merging store:
    // Make sure the register used for renaming is not used between the
    // paired instructions. That would trash the content before the new
    // paired instruction.
    MCPhysReg RegToCheck = *RenameReg;
    // For backward merging load:
    // Make sure the register being renamed is not used between the
    // paired instructions. That would trash the content after the new
    // paired instruction.
    if (!MergeForward)
      RegToCheck = RegToRename;
    for (auto &MI :
         iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
             MergeForward ? std::next(I) : I,
             MergeForward ? std::next(Paired) : Paired))
      assert(all_of(MI.operands(),
                    [this, RegToCheck](const MachineOperand &MOP) {
                      return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
                             MOP.isUndef() ||
                             !TRI->regsOverlap(MOP.getReg(), RegToCheck);
                    }) &&
             "Rename register used between paired instruction, trashing the "
             "content");
#endif
  }

  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // Also based on MergeForward is from where we copy the base register operand
  // so we get the flags compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? AArch64InstrInfo::getLdStBaseOp(MI: *Paired)
                   : AArch64InstrInfo::getLdStBaseOp(MI: *I);

  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: *I).getImm();
  int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(MI: *Paired).getImm();
  bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Opc: Paired->getOpcode());
  if (IsUnscaled != PairedIsUnscaled) {
    // We're trying to pair instructions that differ in how they are scaled. If
    // I is scaled then scale the offset of Paired accordingly. Otherwise, do
    // the opposite (i.e., make Paired's offset unscaled).
    int MemSize = TII->getMemScale(MI: *Paired);
    if (PairedIsUnscaled) {
      // If the unscaled offset isn't a multiple of the MemSize, we can't
      // pair the operations together.
      assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
             "Offset should be a multiple of the stride!");
      PairedOffset /= MemSize;
    } else {
      PairedOffset *= MemSize;
    }
  }

  // Which register is Rt and which is Rt2 depends on the offset order.
  // However, for pre load/stores the Rt should be the one of the pre
  // load/store.
  MachineInstr *RtMI, *Rt2MI;
  if (Offset == PairedOffset + OffsetStride &&
      !AArch64InstrInfo::isPreLdSt(MI: *I)) {
    RtMI = &*Paired;
    Rt2MI = &*I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = &*I;
    Rt2MI = &*Paired;
  }
  int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(MI: *RtMI).getImm();
  // Scale the immediate offset, if necessary.
  if (TII->hasUnscaledLdStOffset(Opc: RtMI->getOpcode())) {
    assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
           "Unscaled offset cannot be scaled.");
    OffsetImm /= TII->getMemScale(MI: *RtMI);
  }

  // Construct the new instruction.
  MachineInstrBuilder MIB;
  DebugLoc DL = I->getDebugLoc();
  MachineBasicBlock *MBB = I->getParent();
  MachineOperand RegOp0 = getLdStRegOp(MI&: *RtMI);
  MachineOperand RegOp1 = getLdStRegOp(MI&: *Rt2MI);
  MachineOperand &PairedRegOp = RtMI == &*Paired ? RegOp0 : RegOp1;
  // Kill flags may become invalid when moving stores for pairing.
  if (RegOp0.isUse()) {
    if (!MergeForward) {
      // Clear kill flags on store if moving upwards. Example:
      // STRWui kill %w0, ...
      // USE %w1
      // STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards
      // We are about to move the store of w1, so its kill flag may become
      // invalid; not the case for w0.
      // Since w1 is used between the stores, the kill flag on w1 is cleared
      // after merging.
      // STPWi kill %w0, %w1, ...
      // USE %w1
      for (auto It = std::next(x: I); It != Paired && PairedRegOp.isKill(); ++It)
        if (It->readsRegister(Reg: PairedRegOp.getReg(), TRI))
          PairedRegOp.setIsKill(false);
    } else {
      // Clear kill flags of the first stores register. Example:
      // STRWui %w1, ...
      // USE kill %w1 ; need to clear kill flag when moving STRWui downwards
      // STRW %w0
      Register Reg = getLdStRegOp(MI&: *I).getReg();
      for (MachineInstr &MI :
           make_range(x: std::next(x: I->getIterator()), y: Paired->getIterator()))
        MI.clearRegisterKills(Reg, RegInfo: TRI);
    }
  }

  unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);
  MIB = BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: MatchPairOpcode));

  // Adds the pre-index operand for pre-indexed ld/st pairs.
  if (AArch64InstrInfo::isPreLdSt(MI: *RtMI))
    MIB.addReg(RegNo: BaseRegOp.getReg(), Flags: RegState::Define);

  MIB.add(MO: RegOp0)
      .add(MO: RegOp1)
      .add(MO: BaseRegOp)
      .addImm(Val: OffsetImm)
      .cloneMergedMemRefs(OtherMIs: {&*I, &*Paired})
      .setMIFlags(I->mergeFlagsWith(Other: *Paired));

  (void)MIB;

  LLVM_DEBUG(
      dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Paired->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    // %w1 = KILL %w1, implicit-def %x1
    // %x1 = SBFMXri killed %x1, 0, 31
    MachineOperand &DstMO = MIB->getOperand(i: SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // extended opcode.
    Register DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    Register DstRegW = TRI->getSubReg(Reg: DstRegX, Idx: AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    LLVM_DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::KILL), DestReg: DstRegW)
            .addReg(RegNo: DstRegW)
            .addReg(RegNo: DstRegX, Flags: RegState::Define);
    MIBKill->getOperand(i: 2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: AArch64::SBFMXri), DestReg: DstRegX)
            .addReg(RegNo: DstRegX)
            .addImm(Val: 0)
            .addImm(Val: 31);
    (void)MIBSXTW;

    // In the case of a sign-extend, where we have something like:
    // debugValueSubstitutions:[]
    // $w1 = LDRWui $x0, 1, debug-instr-number 1
    // DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
    // $x0 = LDRSWui $x0, 0, debug-instr-number 2
    // DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9

    // It will be converted to:
    // debugValueSubstitutions:[]
    // $w0, $w1 = LDPWi $x0, 0
    // $w0 = KILL $w0, implicit-def $x0
    // $x0 = SBFMXri $x0, 0, 31
    // DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
    // DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9

    // We want the final result to look like:
    // debugValueSubstitutions:
    // - { srcinst: 1, srcop: 0, dstinst: 4, dstop: 1, subreg: 0 }
    // - { srcinst: 2, srcop: 0, dstinst: 3, dstop: 0, subreg: 0 }
    // $w0, $w1 = LDPWi $x0, 0, debug-instr-number 4
    // $w0 = KILL $w0, implicit-def $x0
    // $x0 = SBFMXri $x0, 0, 31, debug-instr-number 3
    // DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
    // DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9

    // $x0 is where the final value is stored, so the sign extend (SBFMXri)
    // instruction contains the final value we care about we give it a new
    // debug-instr-number 3. Whereas, $w1 contains the final value that we care
    // about, therefore the LDP instruction is also given a new
    // debug-instr-number 4. We have to add these substitutions to the
    // debugValueSubstitutions table. However, we also have to ensure that the
    // OpIndex that pointed to debug-instr-number 1 gets updated to 1, because
    // $w1 is the second operand of the LDP instruction.

    if (I->peekDebugInstrNum()) {
      // If I is the instruction which got sign extended and has a
      // debug-instr-number, give the SBFMXri instruction a new
      // debug-instr-number, and update the debugValueSubstitutions table with
      // the new debug-instr-number and OpIndex pair. Otherwise, give the Merged
      // instruction a new debug-instr-number, and update the
      // debugValueSubstitutions table with the new debug-instr-number and
      // OpIndex pair.
      unsigned NewInstrNum;
      if (DstRegX == I->getOperand(i: 0).getReg()) {
        NewInstrNum = MIBSXTW->getDebugInstrNum();
        addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *I,
                                     MergedInstr&: *MIBSXTW);
      } else {
        NewInstrNum = MIB->getDebugInstrNum();
        addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *I, MergedInstr&: *MIB);
      }
    }
    if (Paired->peekDebugInstrNum()) {
      // If Paired is the instruction which got sign extended and has a
      // debug-instr-number, give the SBFMXri instruction a new
      // debug-instr-number, and update the debugValueSubstitutions table with
      // the new debug-instr-number and OpIndex pair. Otherwise, give the Merged
      // instruction a new debug-instr-number, and update the
      // debugValueSubstitutions table with the new debug-instr-number and
      // OpIndex pair.
      unsigned NewInstrNum;
      if (DstRegX == Paired->getOperand(i: 0).getReg()) {
        NewInstrNum = MIBSXTW->getDebugInstrNum();
        addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *Paired,
                                     MergedInstr&: *MIBSXTW);
      } else {
        NewInstrNum = MIB->getDebugInstrNum();
        addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *Paired,
                                     MergedInstr&: *MIB);
      }
    }

    LLVM_DEBUG(dbgs() << "  Extend operand:\n    ");
    LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
  } else if (Opc == AArch64::LDR_ZXI || Opc == AArch64::STR_ZXI) {
    // We are combining SVE fill/spill to LDP/STP, so we need to use the Q
    // variant of the registers.
    MachineOperand &MOp0 = MIB->getOperand(i: 0);
    MachineOperand &MOp1 = MIB->getOperand(i: 1);
    assert(AArch64::ZPRRegClass.contains(MOp0.getReg()) &&
           AArch64::ZPRRegClass.contains(MOp1.getReg()) && "Invalid register.");
    MOp0.setReg(AArch64::Q0 + (MOp0.getReg() - AArch64::Z0));
    MOp1.setReg(AArch64::Q0 + (MOp1.getReg() - AArch64::Z0));
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  } else {

    // In the case that the merge doesn't result in a sign-extend, if we have
    // something like:
    // debugValueSubstitutions:[]
    // $x1 = LDRXui $x0, 1, debug-instr-number 1
    // DBG_INSTR_REF !13, dbg-instr-ref(1, 0), debug-location !11
    // $x0 = LDRXui killed $x0, 0, debug-instr-number 2
    // DBG_INSTR_REF !14, dbg-instr-ref(2, 0), debug-location !11

    // It will be converted to:
    // debugValueSubstitutions: []
    // $x0, $x1 = LDPXi $x0, 0
    // DBG_INSTR_REF !12, dbg-instr-ref(1, 0), debug-location !14
    // DBG_INSTR_REF !13, dbg-instr-ref(2, 0), debug-location !14

    // We want the final result to look like:
    // debugValueSubstitutions:
    // - { srcinst: 1, srcop: 0, dstinst: 3, dstop: 1, subreg: 0 }
    // - { srcinst: 2, srcop: 0, dstinst: 3, dstop: 0, subreg: 0 }
    // $x0, $x1 = LDPXi $x0, 0, debug-instr-number 3
    // DBG_INSTR_REF !12, dbg-instr-ref(1, 0), debug-location !14
    // DBG_INSTR_REF !12, dbg-instr-ref(2, 0), debug-location !14

    // Here all that needs to be done is, that the LDP instruction needs to be
    // updated with a new debug-instr-number, we then need to add entries into
    // the debugSubstitutions table to map the old instr-refs to the new ones.

    // Assign new DebugInstrNum to the Paired instruction.
    if (I->peekDebugInstrNum()) {
      unsigned NewDebugInstrNum = MIB->getDebugInstrNum();
      addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewDebugInstrNum, OriginalInstr&: *I,
                                   MergedInstr&: *MIB);
    }
    if (Paired->peekDebugInstrNum()) {
      unsigned NewDebugInstrNum = MIB->getDebugInstrNum();
      addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewDebugInstrNum, OriginalInstr&: *Paired,
                                   MergedInstr&: *MIB);
    }

    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  }
  LLVM_DEBUG(dbgs() << "\n");

  if (MergeForward)
    for (const MachineOperand &MOP : phys_regs_and_masks(MI: *I))
      if (MOP.isReg() && MOP.isKill())
        DefinedInBB.addReg(Reg: MOP.getReg());

  // Copy over any implicit-def operands. This is like MI.copyImplicitOps, but
  // only copies implicit defs and makes sure that each operand is only added
  // once in case of duplicates.
  auto CopyImplicitOps = [&](MachineBasicBlock::iterator MI1,
                             MachineBasicBlock::iterator MI2) {
    SmallSetVector<Register, 4> Ops;
    for (const MachineOperand &MO :
         llvm::drop_begin(RangeOrContainer: MI1->operands(), N: MI1->getDesc().getNumOperands()))
      if (MO.isReg() && MO.isImplicit() && MO.isDef())
        Ops.insert(X: MO.getReg());
    for (const MachineOperand &MO :
         llvm::drop_begin(RangeOrContainer: MI2->operands(), N: MI2->getDesc().getNumOperands()))
      if (MO.isReg() && MO.isImplicit() && MO.isDef())
        Ops.insert(X: MO.getReg());
    for (auto Op : Ops)
      MIB.addDef(RegNo: Op, Flags: RegState::Implicit);
  };
  CopyImplicitOps(I, Paired);

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}
1437
/// Replace the load \p LoadI, which reads memory just written by \p StoreI,
/// with an instruction that forwards the stored value directly: either a
/// register move (ORR with the zero register) when load and store access the
/// same size, or a bitfield extract (ANDri/UBFM) of the stored register when
/// the load reads a sub-range of the stored bytes. If the load's destination
/// already equals the stored register (same 8-byte access), the load is
/// simply erased. Returns an iterator to the next instruction to consider.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                                          MachineBasicBlock::iterator StoreI) {
  MachineBasicBlock::iterator NextI =
      next_nodbg(It: LoadI, End: LoadI->getParent()->end());

  int LoadSize = TII->getMemScale(MI: *LoadI);
  int StoreSize = TII->getMemScale(MI: *StoreI);
  Register LdRt = getLdStRegOp(MI&: *LoadI).getReg();
  const MachineOperand &StMO = getLdStRegOp(MI&: *StoreI);
  Register StRt = getLdStRegOp(MI&: *StoreI).getReg();
  bool IsStoreXReg = TRI->getRegClass(i: AArch64::GPR64RegClassID)->contains(Reg: StRt);

  assert((IsStoreXReg ||
          TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
         "Unexpected RegClass");

  MachineInstr *BitExtMI;
  if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
    // Remove the load, if the destination register of the loads is the same
    // register for stored value.
    if (StRt == LdRt && LoadSize == 8) {
      // The value lives on in StRt, so any kill of StRt between the store and
      // the (removed) load is no longer valid.
      for (MachineInstr &MI : make_range(x: StoreI->getIterator(),
                                         y: LoadI->getIterator())) {
        if (MI.killsRegister(Reg: StRt, TRI)) {
          MI.clearRegisterKills(Reg: StRt, RegInfo: TRI);
          break;
        }
      }
      LLVM_DEBUG(dbgs() << "Remove load instruction:\n    ");
      LLVM_DEBUG(LoadI->print(dbgs()));
      LLVM_DEBUG(dbgs() << "\n");
      LoadI->eraseFromParent();
      return NextI;
    }
    // Replace the load with a mov if the load and store are in the same size.
    BitExtMI =
        BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
                MCID: TII->get(Opcode: IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), DestReg: LdRt)
            .addReg(RegNo: IsStoreXReg ? AArch64::XZR : AArch64::WZR)
            .add(MO: StMO)
            .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0))
            .setMIFlags(LoadI->getFlags());
  } else {
    // FIXME: Currently we disable this transformation in big-endian targets as
    // performance and correctness are verified only in little-endian.
    if (!Subtarget->isLittleEndian())
      return NextI;
    bool IsUnscaled = TII->hasUnscaledLdStOffset(MI&: *LoadI);
    assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
           "Unsupported ld/st match");
    assert(LoadSize <= StoreSize && "Invalid load size");
    // Normalize both offsets to byte offsets so they can be compared.
    int UnscaledLdOffset =
        IsUnscaled
            ? AArch64InstrInfo::getLdStOffsetOp(MI: *LoadI).getImm()
            : AArch64InstrInfo::getLdStOffsetOp(MI: *LoadI).getImm() * LoadSize;
    int UnscaledStOffset =
        IsUnscaled
            ? AArch64InstrInfo::getLdStOffsetOp(MI: *StoreI).getImm()
            : AArch64InstrInfo::getLdStOffsetOp(MI: *StoreI).getImm() * StoreSize;
    int Width = LoadSize * 8;
    Register DestReg =
        IsStoreXReg ? Register(TRI->getMatchingSuperReg(
                          Reg: LdRt, SubIdx: AArch64::sub_32, RC: &AArch64::GPR64RegClass))
                    : LdRt;

    assert((UnscaledLdOffset >= UnscaledStOffset &&
            (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
           "Invalid offset");

    // Bitfield parameters for the extract: Immr is the bit position within
    // the stored register where the loaded bytes start, Imms the last bit.
    int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
    int Imms = Immr + Width - 1;
    if (UnscaledLdOffset == UnscaledStOffset) {
      // Load reads from the start of the stored value: mask the low Width
      // bits with an immediate AND.
      uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
                                | ((Immr) << 6)               // immr
                                | ((Imms) << 0)               // imms
          ;

      BitExtMI =
          BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
                  MCID: TII->get(Opcode: IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
                  DestReg)
              .add(MO: StMO)
              .addImm(Val: AndMaskEncoded)
              .setMIFlags(LoadI->getFlags());
    } else if (IsStoreXReg && Imms == 31) {
      // Use the 32 bit variant of UBFM if it's the LSR alias of the
      // instruction.
      assert(Immr <= Imms && "Expected LSR alias of UBFM");
      BitExtMI = BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
                         MCID: TII->get(Opcode: AArch64::UBFMWri),
                         DestReg: TRI->getSubReg(Reg: DestReg, Idx: AArch64::sub_32))
                     .addReg(RegNo: TRI->getSubReg(Reg: StRt, Idx: AArch64::sub_32))
                     .addImm(Val: Immr)
                     .addImm(Val: Imms)
                     .setMIFlags(LoadI->getFlags());
    } else {
      BitExtMI =
          BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
                  MCID: TII->get(Opcode: IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
                  DestReg)
              .add(MO: StMO)
              .addImm(Val: Immr)
              .addImm(Val: Imms)
              .setMIFlags(LoadI->getFlags());
    }
  }

  // Clear kill flags between store and load.
  for (MachineInstr &MI : make_range(x: StoreI->getIterator(),
                                     y: BitExtMI->getIterator()))
    if (MI.killsRegister(Reg: StRt, TRI)) {
      MI.clearRegisterKills(Reg: StRt, RegInfo: TRI);
      break;
    }

  LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n    ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(LoadI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instructions:\n    ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG((BitExtMI)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  LoadI->eraseFromParent();
  return NextI;
}
1568
// Return true if a load/store with the given (possibly unscaled byte) offset
// can be encoded as a paired instruction, whose immediate is an element
// offset in the range [-64, 63].
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  int ElementOffset = Offset;
  // Unscaled instructions carry byte offsets; convert to the "element"
  // offset used by the scaled pair load/store instructions. A byte offset
  // that is not a multiple of the stride can never match.
  if (IsUnscaled) {
    if (ElementOffset % OffsetStride != 0)
      return false;
    ElementOffset /= OffsetStride;
  }
  return -64 <= ElementOffset && ElementOffset <= 63;
}
1581
// Round Num up to the next multiple of PowOf2 (which must be a power of 2).
// Specialized to signed ints so callers avoid a C-style cast from uint64_t
// to int when using alignTo from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  const int Mask = PowOf2 - 1;
  return (Num + Mask) & ~Mask;
}
1589
1590static bool mayAlias(MachineInstr &MIa,
1591 SmallVectorImpl<MachineInstr *> &MemInsns,
1592 AliasAnalysis *AA) {
1593 for (MachineInstr *MIb : MemInsns) {
1594 if (MIa.mayAlias(AA, Other: *MIb, /*UseTBAA*/ false)) {
1595 LLVM_DEBUG(dbgs() << "Aliasing with: "; MIb->dump());
1596 return true;
1597 }
1598 }
1599
1600 LLVM_DEBUG(dbgs() << "No aliases found\n");
1601 return false;
1602}
1603
/// Scan backwards from the load \p I (visiting at most \p Limit non-transient
/// instructions) for a store to the same base register whose value could be
/// forwarded to the load. On success, sets \p StoreI to the store and returns
/// true. The scan aborts on calls, on modification of the base register, or
/// on any store that may alias the load.
bool AArch64LoadStoreOpt::findMatchingStore(
    MachineBasicBlock::iterator I, unsigned Limit,
    MachineBasicBlock::iterator &StoreI) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr &LoadMI = *I;
  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: LoadMI).getReg();

  // If the load is the first instruction in the block, there's obviously
  // not any matching store.
  if (MBBI == B)
    return false;

  // Track which register units have been modified and used between the first
  // insn and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  unsigned Count = 0;
  do {
    MBBI = prev_nodbg(It: MBBI, Begin: B);
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If the load instruction reads directly from the address to which the
    // store instruction writes and the stored value is not modified, we can
    // promote the load. Since we do not handle stores with pre-/post-index,
    // it's unnecessary to check if BaseReg is modified by the store itself.
    // Also we can't handle stores without an immediate offset operand,
    // while the operand might be the address for a global variable.
    if (MI.mayStore() && isMatchingStore(LoadInst&: LoadMI, StoreInst&: MI) &&
        BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
        AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
        isLdOffsetInRangeOfSt(LoadInst&: LoadMI, StoreInst&: MI, TII) &&
        ModifiedRegUnits.available(Reg: getLdStRegOp(MI).getReg())) {
      StoreI = MBBI;
      return true;
    }

    // Calls may clobber memory and registers arbitrarily; give up.
    if (MI.isCall())
      return false;

    // Update modified / uses register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(Reg: BaseReg))
      return false;

    // If we encounter a store aliased with the load, return early.
    if (MI.mayStore() && LoadMI.mayAlias(AA, Other: MI, /*UseTBAA*/ false))
      return false;
  } while (MBBI != B && Count < Limit);
  return false;
}
1664
1665static bool needsWinCFI(const MachineFunction *MF) {
1666 return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
1667 MF->getFunction().needsUnwindTableEntry();
1668}
1669
// Returns true if FirstMI and MI are candidates for merging or pairing.
// Otherwise, returns false. As a side effect, records in \p Flags which of
// the two (if either) is the sign-extending variant.
static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
                                       LdStPairFlags &Flags,
                                       const AArch64InstrInfo *TII) {
  // If this is volatile or if pairing is suppressed, not a candidate.
  if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
    return false;

  // We should have already checked FirstMI for pair suppression and volatility.
  assert(!FirstMI.hasOrderedMemoryRef() &&
         !TII->isLdStPairSuppressed(FirstMI) &&
         "FirstMI shouldn't get here if either of these checks are true.");

  // Reordering prologue/epilogue instructions would break Windows CFI
  // unwind info (see the pass-level comment at the top of the file).
  if (needsWinCFI(MF: MI.getMF()) && (MI.getFlag(Flag: MachineInstr::FrameSetup) ||
                                   MI.getFlag(Flag: MachineInstr::FrameDestroy)))
    return false;

  unsigned OpcA = FirstMI.getOpcode();
  unsigned OpcB = MI.getOpcode();

  // Opcodes match: If the opcodes are pre ld/st there is nothing more to check.
  if (OpcA == OpcB)
    return !AArch64InstrInfo::isPreLdSt(MI: FirstMI);

  // Bail out if one of the opcodes is SVE fill/spill, as we currently don't
  // allow pairing them with other instructions.
  if (OpcA == AArch64::LDR_ZXI || OpcA == AArch64::STR_ZXI ||
      OpcB == AArch64::LDR_ZXI || OpcB == AArch64::STR_ZXI)
    return false;

  // Two pre ld/st of different opcodes cannot be merged either
  if (AArch64InstrInfo::isPreLdSt(MI: FirstMI) && AArch64InstrInfo::isPreLdSt(MI))
    return false;

  // Try to match a sign-extended load/store with a zero-extended load/store.
  bool IsValidLdStrOpc, PairIsValidLdStrOpc;
  unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc: OpcA, IsValidLdStrOpc: &IsValidLdStrOpc);
  assert(IsValidLdStrOpc &&
         "Given Opc should be a Load or Store with an immediate");
  // OpcA will be the first instruction in the pair.
  if (NonSExtOpc == getMatchingNonSExtOpcode(Opc: OpcB, IsValidLdStrOpc: &PairIsValidLdStrOpc)) {
    // Record which of the two is the sign-extending one (1 when it is MI,
    // 0 when it is FirstMI).
    Flags.setSExtIdx(NonSExtOpc == OpcA ? 1 : 0);
    return true;
  }

  // If the second instruction isn't even a mergable/pairable load/store, bail
  // out.
  if (!PairIsValidLdStrOpc)
    return false;

  // Narrow stores do not have a matching pair opcodes, so constrain their
  // merging to zero stores.
  if (isNarrowStore(Opc: OpcA) || isNarrowStore(Opc: OpcB))
    return getLdStRegOp(MI&: FirstMI).getReg() == AArch64::WZR &&
           getLdStRegOp(MI).getReg() == AArch64::WZR &&
           TII->getMemScale(MI: FirstMI) == TII->getMemScale(MI);

  // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
  // LDR<S,D,Q,W,X,SW>pre-LDR<S,D,Q,W,X,SW>ui
  // are candidate pairs that can be merged.
  if (isPreLdStPairCandidate(FirstMI, MI))
    return true;

  // Try to match an unscaled load/store with a scaled load/store.
  return TII->hasUnscaledLdStOffset(Opc: OpcA) != TII->hasUnscaledLdStOffset(Opc: OpcB) &&
         getMatchingPairOpcode(Opc: OpcA) == getMatchingPairOpcode(Opc: OpcB);

  // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
}
1740
1741static bool canRenameMOP(const MachineOperand &MOP,
1742 const TargetRegisterInfo *TRI) {
1743 if (MOP.isReg()) {
1744 auto *RegClass = TRI->getMinimalPhysRegClass(Reg: MOP.getReg());
1745 // Renaming registers with multiple disjunct sub-registers (e.g. the
1746 // result of a LD3) means that all sub-registers are renamed, potentially
1747 // impacting other instructions we did not check. Bail out.
1748 // Note that this relies on the structure of the AArch64 register file. In
1749 // particular, a subregister cannot be written without overwriting the
1750 // whole register.
1751 if (RegClass->HasDisjunctSubRegs && RegClass->CoveredBySubRegs &&
1752 (TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::dsub0) ||
1753 TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::qsub0) ||
1754 TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::zsub0))) {
1755 LLVM_DEBUG(
1756 dbgs()
1757 << " Cannot rename operands with multiple disjunct subregisters ("
1758 << MOP << ")\n");
1759 return false;
1760 }
1761
1762 // We cannot rename arbitrary implicit-defs, the specific rule to rewrite
1763 // them must be known. For example, in ORRWrs the implicit-def
1764 // corresponds to the result register.
1765 if (MOP.isImplicit() && MOP.isDef()) {
1766 if (!isRewritableImplicitDef(MO: MOP))
1767 return false;
1768 return TRI->isSuperOrSubRegisterEq(
1769 RegA: MOP.getParent()->getOperand(i: 0).getReg(), RegB: MOP.getReg());
1770 }
1771 }
1772 return MOP.isImplicit() ||
1773 (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
1774}
1775
// Check whether the register stored by \p FirstMI can be renamed in all
// instructions from \p FirstMI back to the closest preceding definition of
// that register in the block.
//
// While scanning, register units touched by the visited instructions are
// accumulated into \p UsedInBetween, and the register classes a replacement
// register must support are collected into \p RequiredClasses.
static bool
canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
                 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
                 const TargetRegisterInfo *TRI) {
  // This path only handles stores (loads are handled by
  // canRenameUntilSecondLoad).
  if (!FirstMI.mayStore())
    return false;

  // Check if we can find an unused register which we can use to rename
  // the register used by the first load/store.

  auto RegToRename = getLdStRegOp(MI&: FirstMI).getReg();
  // For now, we only rename if the store operand gets killed at the store.
  if (!getLdStRegOp(MI&: FirstMI).isKill() &&
      !any_of(Range: FirstMI.operands(),
              P: [TRI, RegToRename](const MachineOperand &MOP) {
                return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                       MOP.isImplicit() && MOP.isKill() &&
                       TRI->regsOverlap(RegA: RegToRename, RegB: MOP.getReg());
              })) {
    LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI);
    return false;
  }

  // Set by the callback when the walk reaches the defining instruction;
  // inspected after the walk to make sure a def was actually found.
  bool FoundDef = false;

  // For each instruction between FirstMI and the previous def for RegToRename,
  // we
  // * check if we can rename RegToRename in this instruction
  // * collect the registers used and required register classes for RegToRename.
  std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
                                                           bool IsDef) {
    LLVM_DEBUG(dbgs() << "Checking " << MI);
    // Currently we do not try to rename across frame-setup instructions.
    if (MI.getFlag(Flag: MachineInstr::FrameSetup)) {
      LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
                        << "currently\n");
      return false;
    }

    UsedInBetween.accumulate(MI);

    // For a definition, check that we can rename the definition and exit the
    // loop.
    FoundDef = IsDef;

    // For defs, check if we can rename the first def of RegToRename.
    if (FoundDef) {
      // For some pseudo instructions, we might not generate code in the end
      // (e.g. KILL) and we would end up without a correct def for the rename
      // register.
      // TODO: This might be overly conservative and we could handle those cases
      // in multiple ways:
      //       1. Insert an extra copy, to materialize the def.
      //       2. Skip pseudo-defs until we find an non-pseudo def.
      if (MI.isPseudo()) {
        LLVM_DEBUG(dbgs() << " Cannot rename pseudo/bundle instruction\n");
        return false;
      }

      // Only the def operands overlapping RegToRename matter here.
      for (auto &MOP : MI.operands()) {
        if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
            !TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
          continue;
        if (!canRenameMOP(MOP, TRI)) {
          LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
          return false;
        }
        RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
      }
      return true;
    } else {
      // Not the def: every operand (use or def) overlapping RegToRename must
      // be renamable.
      for (auto &MOP : MI.operands()) {
        if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
            !TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
          continue;

        if (!canRenameMOP(MOP, TRI)) {
          LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
          return false;
        }
        RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
      }
    }
    return true;
  };

  // Apply the check to every instruction from FirstMI up to the previous def
  // of RegToRename, bounded by LdStLimit.
  if (!forAllMIsUntilDef(MI&: FirstMI, DefReg: RegToRename, TRI, Limit: LdStLimit, Fn&: CheckMIs))
    return false;

  if (!FoundDef) {
    LLVM_DEBUG(dbgs() << " Did not find definition for register in BB\n");
    return false;
  }
  return true;
}
1871
// We want to merge the second load into the first by rewriting the usages of
// the same reg between first (incl.) and second (excl.). We don't need to care
// about any insns before FirstLoad or after SecondLoad.
// 1. The second load writes new value into the same reg.
//    - The renaming is impossible to impact later use of the reg.
//    - The second load always trash the value written by the first load which
//      means the reg must be killed before the second load.
// 2. The first load must be a def for the same reg so we don't need to look
//    into anything before it.
//
// Register units touched in the range are accumulated into \p UsedInBetween;
// register classes a replacement register must support are collected into
// \p RequiredClasses.
static bool canRenameUntilSecondLoad(
    MachineInstr &FirstLoad, MachineInstr &SecondLoad,
    LiveRegUnits &UsedInBetween,
    SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
    const TargetRegisterInfo *TRI) {
  // Pseudo instructions may not produce a real def to rename (cf.
  // canRenameUpToDef); bail out.
  if (FirstLoad.isPseudo())
    return false;

  UsedInBetween.accumulate(MI: FirstLoad);
  auto RegToRename = getLdStRegOp(MI&: FirstLoad).getReg();
  bool Success = std::all_of(
      first: FirstLoad.getIterator(), last: SecondLoad.getIterator(),
      pred: [&](MachineInstr &MI) {
        LLVM_DEBUG(dbgs() << "Checking " << MI);
        // Currently we do not try to rename across frame-setup instructions.
        if (MI.getFlag(Flag: MachineInstr::FrameSetup)) {
          LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
                            << "currently\n");
          return false;
        }

        // Every operand overlapping RegToRename must itself be renamable.
        for (auto &MOP : MI.operands()) {
          if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
              !TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
            continue;
          if (!canRenameMOP(MOP, TRI)) {
            LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
            return false;
          }
          RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
        }

        return true;
      });
  return Success;
}
1917
1918// Check if we can find a physical register for renaming \p Reg. This register
1919// must:
1920// * not be defined already in \p DefinedInBB; DefinedInBB must contain all
1921// defined registers up to the point where the renamed register will be used,
1922// * not used in \p UsedInBetween; UsedInBetween must contain all accessed
1923// registers in the range the rename register will be used,
1924// * is available in all used register classes (checked using RequiredClasses).
1925static std::optional<MCPhysReg> tryToFindRegisterToRename(
1926 const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
1927 LiveRegUnits &UsedInBetween,
1928 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1929 const TargetRegisterInfo *TRI) {
1930 const MachineRegisterInfo &RegInfo = MF.getRegInfo();
1931
1932 // Checks if any sub- or super-register of PR is callee saved.
1933 auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
1934 return any_of(Range: TRI->sub_and_superregs_inclusive(Reg: PR),
1935 P: [&MF, TRI](MCPhysReg SubOrSuper) {
1936 return TRI->isCalleeSavedPhysReg(PhysReg: SubOrSuper, MF);
1937 });
1938 };
1939
1940 // Check if PR or one of its sub- or super-registers can be used for all
1941 // required register classes.
1942 auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
1943 return all_of(Range&: RequiredClasses, P: [PR, TRI](const TargetRegisterClass *C) {
1944 return any_of(
1945 Range: TRI->sub_and_superregs_inclusive(Reg: PR),
1946 P: [C](MCPhysReg SubOrSuper) { return C->contains(Reg: SubOrSuper); });
1947 });
1948 };
1949
1950 auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
1951 for (const MCPhysReg &PR : *RegClass) {
1952 if (DefinedInBB.available(Reg: PR) && UsedInBetween.available(Reg: PR) &&
1953 !RegInfo.isReserved(PhysReg: PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
1954 CanBeUsedForAllClasses(PR)) {
1955 DefinedInBB.addReg(Reg: PR);
1956 LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
1957 << "\n");
1958 return {PR};
1959 }
1960 }
1961 LLVM_DEBUG(dbgs() << "No rename register found from "
1962 << TRI->getRegClassName(RegClass) << "\n");
1963 return std::nullopt;
1964}
1965
1966// For store pairs: returns a register from FirstMI to the beginning of the
1967// block that can be renamed.
1968// For load pairs: returns a register from FirstMI to MI that can be renamed.
1969static std::optional<MCPhysReg> findRenameRegForSameLdStRegPair(
1970 std::optional<bool> MaybeCanRename, MachineInstr &FirstMI, MachineInstr &MI,
1971 Register Reg, LiveRegUnits &DefinedInBB, LiveRegUnits &UsedInBetween,
1972 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1973 const TargetRegisterInfo *TRI) {
1974 std::optional<MCPhysReg> RenameReg;
1975 if (!DebugCounter::shouldExecute(Counter&: RegRenamingCounter))
1976 return RenameReg;
1977
1978 auto *RegClass = TRI->getMinimalPhysRegClass(Reg: getLdStRegOp(MI&: FirstMI).getReg());
1979 MachineFunction &MF = *FirstMI.getParent()->getParent();
1980 if (!RegClass || !MF.getRegInfo().tracksLiveness())
1981 return RenameReg;
1982
1983 const bool IsLoad = FirstMI.mayLoad();
1984
1985 if (!MaybeCanRename) {
1986 if (IsLoad)
1987 MaybeCanRename = {canRenameUntilSecondLoad(FirstLoad&: FirstMI, SecondLoad&: MI, UsedInBetween,
1988 RequiredClasses, TRI)};
1989 else
1990 MaybeCanRename = {
1991 canRenameUpToDef(FirstMI, UsedInBetween, RequiredClasses, TRI)};
1992 }
1993
1994 if (*MaybeCanRename) {
1995 RenameReg = tryToFindRegisterToRename(MF, Reg, DefinedInBB, UsedInBetween,
1996 RequiredClasses, TRI);
1997 }
1998 return RenameReg;
1999}
2000
/// Scan the instructions looking for a load/store that can be combined with the
/// current instruction into a wider equivalent or a load/store pair.
///
/// Scans forward from \p I over at most \p Limit non-transient instructions.
/// Returns an iterator to the matching instruction, or the block's end
/// iterator when no match is found. \p Flags receives the details of the
/// match (merge direction, sext index, rename register).
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit,
                                      bool FindNarrowMerge) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  // Remembers the position of a rename-requiring forward-merge candidate
  // found earlier in the scan (see below).
  MachineBasicBlock::iterator MBBIWithRenameReg;
  MachineInstr &FirstMI = *I;
  MBBI = next_nodbg(It: MBBI, End: E);

  bool MayLoad = FirstMI.mayLoad();
  bool IsUnscaled = TII->hasUnscaledLdStOffset(MI&: FirstMI);
  Register Reg = getLdStRegOp(MI&: FirstMI).getReg();
  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: FirstMI).getReg();
  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: FirstMI).getImm();
  // Distance between pairable immediates: the access size for unscaled
  // instructions, 1 (adjacent scaled immediates) otherwise.
  int OffsetStride = IsUnscaled ? TII->getMemScale(MI: FirstMI) : 1;
  bool IsPromotableZeroStore = isPromotableZeroStoreInst(MI&: FirstMI);

  // Tri-state: unset until the (expensive) renaming feasibility check has
  // been computed; forced to false when renaming is disabled.
  std::optional<bool> MaybeCanRename;
  if (!EnableRenaming)
    MaybeCanRename = {false};

  SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
  LiveRegUnits UsedInBetween;
  UsedInBetween.init(TRI: *TRI);

  Flags.clearRenameReg();

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  LLVM_DEBUG(dbgs() << "Find match for: "; FirstMI.dump());
  for (unsigned Count = 0; MBBI != E && Count < Limit;
       MBBI = next_nodbg(It: MBBI, End: E)) {
    MachineInstr &MI = *MBBI;
    LLVM_DEBUG(dbgs() << "Analysing 2nd insn: "; MI.dump());

    UsedInBetween.accumulate(MI);

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    Flags.setSExtIdx(-1);
    if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
        AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
      assert(MI.mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
      int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
      bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
      if (IsUnscaled != MIIsUnscaled) {
        // We're trying to pair instructions that differ in how they are scaled.
        // If FirstMI is scaled then scale the offset of MI accordingly.
        // Otherwise, do the opposite (i.e., make MI's offset unscaled).
        int MemSize = TII->getMemScale(MI);
        if (MIIsUnscaled) {
          // If the unscaled offset isn't a multiple of the MemSize, we can't
          // pair the operations together: bail and keep looking.
          if (MIOffset % MemSize) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(Elt: &MI);
            continue;
          }
          MIOffset /= MemSize;
        } else {
          MIOffset *= MemSize;
        }
      }

      bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);

      if (BaseReg == MIBaseReg) {
        // If the offset of the second ld/st is not equal to the size of the
        // destination register it can’t be paired with a pre-index ld/st
        // pair. Additionally if the base reg is used or modified the operations
        // can't be paired: bail and keep looking.
        if (IsPreLdSt) {
          bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
          bool IsBaseRegUsed = !UsedRegUnits.available(
              Reg: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          bool IsBaseRegModified = !ModifiedRegUnits.available(
              Reg: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          // If the stored value and the address of the second instruction is
          // the same, it needs to be using the updated register and therefore
          // it must not be folded.
          bool IsMIRegTheSame =
              TRI->regsOverlap(RegA: getLdStRegOp(MI).getReg(),
                               RegB: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
              IsMIRegTheSame) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(Elt: &MI);
            continue;
          }
        } else {
          // Regular pair: the two immediates must be exactly one stride apart
          // in either direction.
          if ((Offset != MIOffset + OffsetStride) &&
              (Offset + OffsetStride != MIOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(Elt: &MI);
            continue;
          }
        }

        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        if (FindNarrowMerge) {
          // If the alignment requirements of the scaled wide load/store
          // instruction can't express the offset of the scaled narrow input,
          // bail and keep looking. For promotable zero stores, allow only when
          // the stored value is the same (i.e., WZR).
          if ((!IsUnscaled && alignTo(Num: MinOffset, PowOf2: 2) != MinOffset) ||
              (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(Elt: &MI);
            continue;
          }
        } else {
          // Pairwise instructions have a 7-bit signed offset field. Single
          // insns have a 12-bit unsigned offset field. If the resultant
          // immediate offset of merging these instructions is out of range for
          // a pairwise instruction, bail and keep looking.
          if (!inBoundsForPair(IsUnscaled, Offset: MinOffset, OffsetStride)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(Elt: &MI);
            LLVM_DEBUG(dbgs() << "Offset doesn't fit in immediate, "
                              << "keep looking.\n");
            continue;
          }
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(Num: MinOffset, PowOf2: OffsetStride) != MinOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(Elt: &MI);
            LLVM_DEBUG(dbgs()
                       << "Offset doesn't fit due to alignment requirements, "
                       << "keep looking.\n");
            continue;
          }
        }

        // If the BaseReg has been modified, then we cannot do the optimization.
        // For example, in the following pattern
        //   ldr x1 [x2]
        //   ldr x2 [x3]
        //   ldr x4 [x2, #8],
        // the first and third ldr cannot be converted to ldp x1, x4, [x2]
        if (!ModifiedRegUnits.available(Reg: BaseReg))
          return E;

        // True when both loads target (overlapping copies of) the same
        // register; pairing is then only possible with renaming (below).
        const bool SameLoadReg = MayLoad && TRI->isSuperOrSubRegisterEq(
                                                RegA: Reg, RegB: getLdStRegOp(MI).getReg());

        // If the Rt of the second instruction (destination register of the
        // load) was not modified or used between the two instructions and none
        // of the instructions between the second and first alias with the
        // second, we can combine the second into the first.
        bool RtNotModified =
            ModifiedRegUnits.available(Reg: getLdStRegOp(MI).getReg());
        bool RtNotUsed = !(MI.mayLoad() && !SameLoadReg &&
                           !UsedRegUnits.available(Reg: getLdStRegOp(MI).getReg()));

        LLVM_DEBUG(dbgs() << "Checking, can combine 2nd into 1st insn:\n"
                          << "Reg '" << getLdStRegOp(MI) << "' not modified: "
                          << (RtNotModified ? "true" : "false") << "\n"
                          << "Reg '" << getLdStRegOp(MI) << "' not used: "
                          << (RtNotUsed ? "true" : "false") << "\n");

        if (RtNotModified && RtNotUsed && !mayAlias(MIa&: MI, MemInsns, AA)) {
          // For pairs loading into the same reg, try to find a renaming
          // opportunity to allow the renaming of Reg between FirstMI and MI
          // and combine MI into FirstMI; otherwise bail and keep looking.
          if (SameLoadReg) {
            std::optional<MCPhysReg> RenameReg =
                findRenameRegForSameLdStRegPair(MaybeCanRename, FirstMI, MI,
                                                Reg, DefinedInBB, UsedInBetween,
                                                RequiredClasses, TRI);
            if (!RenameReg) {
              LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                                UsedRegUnits, TRI);
              MemInsns.push_back(Elt: &MI);
              LLVM_DEBUG(dbgs() << "Can't find reg for renaming, "
                                << "keep looking.\n");
              continue;
            }
            Flags.setRenameReg(*RenameReg);
          }

          Flags.setMergeForward(false);
          if (!SameLoadReg)
            Flags.clearRenameReg();
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between the
        // first and the second alias with the first, we can combine the first
        // into the second.
        RtNotModified = !(
            MayLoad && !UsedRegUnits.available(Reg: getLdStRegOp(MI&: FirstMI).getReg()));

        LLVM_DEBUG(dbgs() << "Checking, can combine 1st into 2nd insn:\n"
                          << "Reg '" << getLdStRegOp(FirstMI)
                          << "' not modified: "
                          << (RtNotModified ? "true" : "false") << "\n");

        if (RtNotModified && !mayAlias(MIa&: FirstMI, MemInsns, AA)) {
          if (ModifiedRegUnits.available(Reg: getLdStRegOp(MI&: FirstMI).getReg())) {
            Flags.setMergeForward(true);
            Flags.clearRenameReg();
            return MBBI;
          }

          // FirstMI's Rt was modified in between: see if renaming makes a
          // forward merge possible. Don't return immediately — remember the
          // position in MBBIWithRenameReg and keep scanning for a candidate
          // that needs no renaming.
          std::optional<MCPhysReg> RenameReg = findRenameRegForSameLdStRegPair(
              MaybeCanRename, FirstMI, MI, Reg, DefinedInBB, UsedInBetween,
              RequiredClasses, TRI);
          if (RenameReg) {
            Flags.setMergeForward(true);
            Flags.setRenameReg(*RenameReg);
            MBBIWithRenameReg = MBBI;
          }
        }
        LLVM_DEBUG(dbgs() << "Unable to combine these instructions due to "
                          << "interference in between, keep looking.\n");
      }
    }

    // Fall back to the rename-based forward merge recorded above, if any.
    if (Flags.getRenameReg())
      return MBBIWithRenameReg;

    // If the instruction wasn't a matching load or store.  Stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI.isCall()) {
      LLVM_DEBUG(dbgs() << "Found a call, stop looking.\n");
      return E;
    }

    // Update modified / uses register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(Reg: BaseReg)) {
      LLVM_DEBUG(dbgs() << "Base reg is modified, stop looking.\n");
      return E;
    }

    // Update list of instructions that read/write memory.
    if (MI.mayLoadOrStore())
      MemInsns.push_back(Elt: &MI);
  }
  return E;
}
2273
2274static MachineBasicBlock::iterator
2275maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
2276 assert((MI.getOpcode() == AArch64::SUBXri ||
2277 MI.getOpcode() == AArch64::ADDXri) &&
2278 "Expected a register update instruction");
2279 auto End = MI.getParent()->end();
2280 if (MaybeCFI == End ||
2281 MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
2282 !(MI.getFlag(Flag: MachineInstr::FrameSetup) ||
2283 MI.getFlag(Flag: MachineInstr::FrameDestroy)) ||
2284 MI.getOperand(i: 0).getReg() != AArch64::SP)
2285 return End;
2286
2287 const MachineFunction &MF = *MI.getParent()->getParent();
2288 unsigned CFIIndex = MaybeCFI->getOperand(i: 0).getCFIIndex();
2289 const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
2290 switch (CFI.getOperation()) {
2291 case MCCFIInstruction::OpDefCfa:
2292 case MCCFIInstruction::OpDefCfaOffset:
2293 return MaybeCFI;
2294 default:
2295 return End;
2296 }
2297}
2298
// Fold the base-register update \p Update (ADDXri/SUBXri) into the load/store
// \p I, forming a pre-indexed (\p IsPreIdx) or post-indexed access.
// \p IsForward indicates \p Update precedes \p I; a CFA-related CFI directive
// following \p Update is then kept adjacent either by emitting the merged
// instruction at \p Update's position (\p MergeEither) or by moving the CFI.
// Returns the iterator following the merged instruction, or std::nullopt when
// the merge would require reordering CFI directives.
std::optional<MachineBasicBlock::iterator> AArch64LoadStoreOpt::mergeUpdateInsn(
    MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update,
    bool IsForward, bool IsPreIdx, bool MergeEither) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);

  // If updating the SP and the following instruction is CFA offset related CFI,
  // make sure the CFI follows the SP update either by merging at the location
  // of the update or by moving the CFI after the merged instruction. If unable
  // to do so, bail.
  MachineBasicBlock::iterator InsertPt = I;
  if (IsForward) {
    assert(IsPreIdx);
    if (auto CFI = maybeMoveCFI(MI&: *Update, MaybeCFI: next_nodbg(It: Update, End: E)); CFI != E) {
      if (MergeEither) {
        InsertPt = Update;
      } else {
        // Take care not to reorder CFIs.
        if (std::any_of(first: std::next(x: CFI), last: I, pred: [](const auto &Insn) {
              return Insn.getOpcode() == TargetOpcode::CFI_INSTRUCTION;
            }))
          return std::nullopt;

        MachineBasicBlock *MBB = InsertPt->getParent();
        MBB->splice(Where: std::next(x: InsertPt), Other: MBB, From: CFI);
      }
    }
  }

  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (NextI == Update)
    NextI = next_nodbg(It: NextI, End: E);

  int Value = Update->getOperand(i: 2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  // A SUBXri subtracts, so the folded immediate is negated.
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(Opc: I->getOpcode())
                             : getPostIndexedOpcode(Opc: I->getOpcode());
  MachineInstrBuilder MIB;
  int Scale, MinOffset, MaxOffset;
  getPrePostIndexedMemOpInfo(MI: *I, Scale, MinOffset, MaxOffset);
  // Build the indexed instruction from I's operands plus the written-back
  // base (Update's destination) and the scaled immediate.
  if (!AArch64InstrInfo::isPairedLdSt(MI: *I)) {
    // Non-paired instruction.
    MIB = BuildMI(BB&: *InsertPt->getParent(), I: InsertPt, MIMD: InsertPt->getDebugLoc(),
                  MCID: TII->get(Opcode: NewOpc))
              .add(MO: Update->getOperand(i: 0))
              .add(MO: getLdStRegOp(MI&: *I))
              .add(MO: AArch64InstrInfo::getLdStBaseOp(MI: *I))
              .addImm(Val: Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(Other: *Update));
  } else {
    // Paired instruction.
    MIB = BuildMI(BB&: *InsertPt->getParent(), I: InsertPt, MIMD: InsertPt->getDebugLoc(),
                  MCID: TII->get(Opcode: NewOpc))
              .add(MO: Update->getOperand(i: 0))
              .add(MO: getLdStRegOp(MI&: *I, PairedRegOp: 0))
              .add(MO: getLdStRegOp(MI&: *I, PairedRegOp: 1))
              .add(MO: AArch64InstrInfo::getLdStBaseOp(MI: *I))
              .addImm(Val: Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(Other: *Update));
  }

  if (IsPreIdx) {
    ++NumPreFolded;
    LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
  } else {
    ++NumPostFolded;
    LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
  }
  LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Update->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}
2392
// Fold a MOVZWi+MOVKWi-materialized constant \p Offset into the load/store
// \p I: emit an ADDXri that adds the high part of the offset to the base
// register (into the index register), and rewrite the memory instruction to
// use that register with the low part as its scaled immediate. Erases the mov
// pair and the original memory instruction; returns the iterator following
// the original memory instruction.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
                                          MachineBasicBlock::iterator Update,
                                          unsigned Offset, int Scale) {
  assert((Update->getOpcode() == AArch64::MOVKWi) &&
         "Unexpected const mov instruction to merge!");
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
  // PrevI is the MOVZWi providing the low half (checked by the caller via
  // isMatchingMovConstInsn).
  MachineBasicBlock::iterator PrevI = prev_nodbg(It: Update, Begin: E);
  MachineInstr &MemMI = *I;
  // Split the offset: Low must fit the scaled 12-bit immediate field of the
  // memory instruction, the remainder (High) goes into the ADDXri.
  unsigned Mask = (1 << 12) * Scale - 1;
  unsigned Low = Offset & Mask;
  unsigned High = Offset - Low;
  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
  Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getReg();
  MachineInstrBuilder AddMIB, MemMIB;

  // Add IndexReg, BaseReg, High (the BaseReg may be SP)
  AddMIB =
      BuildMI(BB&: *I->getParent(), I, MIMD: I->getDebugLoc(), MCID: TII->get(Opcode: AArch64::ADDXri))
          .addDef(RegNo: IndexReg)
          .addUse(RegNo: BaseReg)
          .addImm(Val: High >> 12) // shifted value
          .addImm(Val: 12);        // shift 12
  (void)AddMIB;
  // Ld/St DestReg, IndexReg, Imm12
  unsigned NewOpc = getBaseAddressOpcode(Opc: I->getOpcode());
  MemMIB = BuildMI(BB&: *I->getParent(), I, MIMD: I->getDebugLoc(), MCID: TII->get(Opcode: NewOpc))
               .add(MO: getLdStRegOp(MI&: MemMI))
               .add(MO: AArch64InstrInfo::getLdStOffsetOp(MI: MemMI))
               .addImm(Val: Low / Scale)
               .setMemRefs(I->memoperands())
               .setMIFlags(I->mergeFlagsWith(Other: *Update));
  (void)MemMIB;

  ++NumConstOffsetFolded;
  LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
  LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
  LLVM_DEBUG(PrevI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Update->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  PrevI->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}
2449
// Return true if \p MI is an ADDXri/SUBXri that updates \p BaseReg (same
// source and destination) of the memory instruction \p MemMI by an amount
// foldable into a pre/post-indexed form. When \p Offset is non-zero the
// update amount must additionally equal \p Offset exactly.
bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
                                               MachineInstr &MI,
                                               unsigned BaseReg, int Offset) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI.getOperand(i: 2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(Imm: MI.getOperand(i: 3).getImm()))
      break;

    // The update instruction source and destination register must be the
    // same as the load/store base register.
    if (MI.getOperand(i: 0).getReg() != BaseReg ||
        MI.getOperand(i: 1).getReg() != BaseReg)
      break;

    // Normalize to a signed delta: SUBXri decrements the base register.
    int UpdateOffset = MI.getOperand(i: 2).getImm();
    if (MI.getOpcode() == AArch64::SUBXri)
      UpdateOffset = -UpdateOffset;

    // The immediate must be a multiple of the scaling factor of the pre/post
    // indexed instruction.
    int Scale, MinOffset, MaxOffset;
    getPrePostIndexedMemOpInfo(MI: MemMI, Scale, MinOffset, MaxOffset);
    if (UpdateOffset % Scale != 0)
      break;

    // Scaled offset must fit in the instruction immediate.
    int ScaledOffset = UpdateOffset / Scale;
    if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
      break;

    // If we have a non-zero Offset, we check that it matches the amount
    // we're adding to the register.
    if (!Offset || Offset == UpdateOffset)
      return true;
    break;
  }
  return false;
}
2496
// Check whether MI (expected to be a MOVKWi), together with a MOVZWi that
// must immediately precede it, materializes in IndexReg a constant that can
// be folded into MemMI's addressing via an ADD-immediate. On success, Offset
// receives the combined constant value.
bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
                                                 MachineInstr &MI,
                                                 unsigned IndexReg,
                                                 unsigned &Offset) {
  // The update instruction source and destination register must be the
  // same as the load/store index register.
  if (MI.getOpcode() == AArch64::MOVKWi &&
      TRI->isSuperOrSubRegisterEq(RegA: IndexReg, RegB: MI.getOperand(i: 1).getReg())) {

    // movz + movk hold a large offset of a Ld/St instruction.
    MachineBasicBlock::iterator B = MI.getParent()->begin();
    MachineBasicBlock::iterator MBBI = &MI;
    // Skip the scene when the MI is the first instruction of a block.
    if (MBBI == B)
      return false;
    MBBI = prev_nodbg(It: MBBI, Begin: B);
    MachineInstr &MovzMI = *MBBI;
    // Make sure the MOVKWi and MOVZWi set the same register.
    if (MovzMI.getOpcode() == AArch64::MOVZWi &&
        MovzMI.getOperand(i: 0).getReg() == MI.getOperand(i: 0).getReg()) {
      // Combine the MOVZ payload (low bits) with the MOVK payload shifted
      // into place by its hw-shift operand.
      unsigned Low = MovzMI.getOperand(i: 1).getImm();
      unsigned High = MI.getOperand(i: 2).getImm() << MI.getOperand(i: 3).getImm();
      Offset = High + Low;
      // 12-bit optionally shifted immediates are legal for adds, so the
      // constant must fit in 24 bits (imm12 plus imm12 << 12).
      return Offset >> 24 == 0;
    }
  }
  return false;
}
2526
// Scan forward from the memory instruction I for an add/sub that updates I's
// base register by UnscaledOffset, so it can be folded into I as a writeback.
// Searches up to Limit non-transient instructions, optionally following a
// unique successor chain when liveness is tracked. Returns the matching
// update instruction, or the parent block's end() if none is found.
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;

  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
  int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getImm() *
                         TII->getMemScale(MI: MemMI);

  // Scan forward looking for post-index opportunities. Updating instructions
  // can't be formed if the memory instruction doesn't have the offset we're
  // looking for.
  if (MIUnscaledOffset != UnscaledOffset)
    return E;

  // If the base register overlaps a source/destination register, we can't
  // merge the update. This does not apply to tag store instructions which
  // ignore the address part of the source register.
  // This does not apply to STGPi as well, which does not have unpredictable
  // behavior in this case unlike normal stores, and always performs writeback
  // after reading the source register value.
  if (!isTagStore(MI: MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MI: MemMI);
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
      Register DestReg = getLdStRegOp(MI&: MemMI, PairedRegOp: i).getReg();
      if (DestReg == BaseReg || TRI->isSubRegister(RegA: BaseReg, RegB: DestReg))
        return E;
    }
  }

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  MBBI = next_nodbg(It: MBBI, End: E);

  // We can't post-increment the stack pointer if any instruction between
  // the memory access (I) and the increment (MBBI) can access the memory
  // region defined by [SP, MBBI].
  const bool BaseRegSP = BaseReg == AArch64::SP;
  if (BaseRegSP && needsWinCFI(MF: I->getMF())) {
    // FIXME: For now, we always block the optimization over SP in windows
    // targets as it requires to adjust the unwind/debug info, messing up
    // the unwind info can actually cause a miscompile.
    return E;
  }

  unsigned Count = 0;
  MachineBasicBlock *CurMBB = I->getParent();
  // choice of next block to visit is liveins-based; without tracked liveness
  // we must stay within the current block.
  bool VisitSucc = CurMBB->getParent()->getRegInfo().tracksLiveness();

  while (true) {
    // Walk the (current) block forward looking for the update instruction.
    for (MachineBasicBlock::iterator CurEnd = CurMBB->end();
         MBBI != CurEnd && Count < Limit; MBBI = next_nodbg(It: MBBI, End: CurEnd)) {
      MachineInstr &MI = *MBBI;

      // Don't count transient instructions towards the search limit since there
      // may be different numbers of them if e.g. debug information is present.
      if (!MI.isTransient())
        ++Count;

      // If we found a match, return it.
      if (isMatchingUpdateInsn(MemMI&: *I, MI, BaseReg, Offset: UnscaledOffset))
        return MBBI;

      // Update the status of what the instruction clobbered and used.
      LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
                                        TRI);

      // Otherwise, if the base register is used or modified, we have no match,
      // so return early. If we are optimizing SP, do not allow instructions
      // that may load or store in between the load and the optimized value
      // update.
      if (!ModifiedRegUnits.available(Reg: BaseReg) ||
          !UsedRegUnits.available(Reg: BaseReg) ||
          (BaseRegSP && MBBI->mayLoadOrStore()))
        return E;
    }

    if (!VisitSucc || Limit <= Count)
      break;

    // Try to go downward to successors along a CF path w/o side enters
    // such that BaseReg is alive along it but not at its exits
    MachineBasicBlock *SuccToVisit = nullptr;
    unsigned LiveSuccCount = 0;
    for (MachineBasicBlock *Succ : CurMBB->successors()) {
      for (MCRegAliasIterator AI(BaseReg, TRI, true); AI.isValid(); ++AI) {
        if (Succ->isLiveIn(Reg: *AI)) {
          // BaseReg (or an alias) live into more than one successor: no
          // unique continuation path exists, so give up.
          if (LiveSuccCount++)
            return E;
          // Only follow a successor that has no other predecessors (no side
          // entries that could see a stale base register).
          if (Succ->pred_size() == 1)
            SuccToVisit = Succ;
          break;
        }
      }
    }
    if (!SuccToVisit)
      break;
    CurMBB = SuccToVisit;
    MBBI = CurMBB->begin();
  }

  return E;
}
2634
// Scan backward from the memory instruction I for an add/sub of its base
// register that can be folded into I as a pre-index writeback. MergeEither is
// set to true when the combined instruction may be placed at either the
// update's or the memory instruction's position; it is cleared once any
// intervening instruction makes the backward placement unsafe. Returns the
// update instruction, or the block's end() if none is found within Limit.
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit, bool &MergeEither) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;
  MachineFunction &MF = *MemMI.getMF();

  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getImm();

  bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MI: MemMI);
  Register DestReg[] = {getLdStRegOp(MI&: MemMI, PairedRegOp: 0).getReg(),
                        IsPairedInsn ? getLdStRegOp(MI&: MemMI, PairedRegOp: 1).getReg()
                                     : AArch64::NoRegister};

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;
  // If the base register overlaps a destination register, we can't
  // merge the update.
  if (!isTagStore(MI: MemMI)) {
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i)
      if (DestReg[i] == BaseReg || TRI->isSubRegister(RegA: BaseReg, RegB: DestReg[i]))
        return E;
  }

  const bool BaseRegSP = BaseReg == AArch64::SP;
  if (BaseRegSP && needsWinCFI(MF: I->getMF())) {
    // FIXME: For now, we always block the optimization over SP in windows
    // targets as it requires to adjust the unwind/debug info, messing up
    // the unwind info can actually cause a miscompile.
    return E;
  }

  // Red zone size limits how much we may move a memory access above an SP
  // decrement (see the MemAccessBeforeSPPreInc check below).
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  unsigned RedZoneSize =
      Subtarget.getTargetLowering()->getRedZoneSize(F: MF.getFunction());

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  unsigned Count = 0;
  bool MemAccessBeforeSPPreInc = false;
  MergeEither = true;
  do {
    MBBI = prev_nodbg(It: MBBI, Begin: B);
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(MemMI&: *I, MI, BaseReg, Offset)) {
      // Check that the update value is within our red zone limit (which may be
      // zero).
      if (MemAccessBeforeSPPreInc && MBBI->getOperand(i: 2).getImm() > RedZoneSize)
        return E;
      return MBBI;
    }

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is used or modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(Reg: BaseReg) ||
        !UsedRegUnits.available(Reg: BaseReg))
      return E;

    // If we have a destination register (i.e. a load instruction) and a
    // destination register is used or modified, then we can only merge forward,
    // i.e. the combined instruction is put in the place of the memory
    // instruction. Same applies if we see a memory access or side effects.
    if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects() ||
        (DestReg[0] != AArch64::NoRegister &&
         !(ModifiedRegUnits.available(Reg: DestReg[0]) &&
           UsedRegUnits.available(Reg: DestReg[0]))) ||
        (DestReg[1] != AArch64::NoRegister &&
         !(ModifiedRegUnits.available(Reg: DestReg[1]) &&
           UsedRegUnits.available(Reg: DestReg[1]))))
      MergeEither = false;

    // Keep track if we have a memory access before an SP pre-increment, in this
    // case we need to validate later that the update amount respects the red
    // zone.
    if (BaseRegSP && MBBI->mayLoadOrStore())
      MemAccessBeforeSPPreInc = true;
  } while (MBBI != B && Count < Limit);
  return E;
}
2730
// Scan backward from the register-offset load/store I for a MOVKWi (with a
// preceding MOVZWi) that materializes the killed index register with a large
// constant. On success, Offset receives the constant and the MOVKWi is
// returned so the caller can fold the imm12 part into the addressing mode;
// otherwise the block's end() iterator is returned.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
    MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;

  // If the load is the first instruction in the block, there's obviously
  // not any matching load or store.
  if (MBBI == B)
    return E;

  // Make sure the IndexReg is killed and the shift amount is zero.
  // TODO: Relax this restriction to extend, simplify processing now.
  if (!AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).isKill() ||
      !AArch64InstrInfo::getLdStAmountOp(MI: MemMI).isImm() ||
      (AArch64InstrInfo::getLdStAmountOp(MI: MemMI).getImm() != 0))
    return E;

  Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getReg();

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  unsigned Count = 0;
  do {
    MBBI = prev_nodbg(It: MBBI, Begin: B);
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingMovConstInsn(MemMI&: *I, MI, IndexReg, Offset)) {
      return MBBI;
    }

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the index register is used or modified, we have no match,
    // so return early.
    if (!ModifiedRegUnits.available(Reg: IndexReg) ||
        !UsedRegUnits.available(Reg: IndexReg))
      return E;

  } while (MBBI != B && Count < Limit);
  return E;
}
2784
2785bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
2786 MachineBasicBlock::iterator &MBBI) {
2787 MachineInstr &MI = *MBBI;
2788 // If this is a volatile load, don't mess with it.
2789 if (MI.hasOrderedMemoryRef())
2790 return false;
2791
2792 if (needsWinCFI(MF: MI.getMF()) && MI.getFlag(Flag: MachineInstr::FrameDestroy))
2793 return false;
2794
2795 // Make sure this is a reg+imm.
2796 // FIXME: It is possible to extend it to handle reg+reg cases.
2797 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
2798 return false;
2799
2800 // Look backward up to LdStLimit instructions.
2801 MachineBasicBlock::iterator StoreI;
2802 if (findMatchingStore(I: MBBI, Limit: LdStLimit, StoreI)) {
2803 ++NumLoadsFromStoresPromoted;
2804 // Promote the load. Keeping the iterator straight is a
2805 // pain, so we let the merge routine tell us what the next instruction
2806 // is after it's done mucking about.
2807 MBBI = promoteLoadFromStore(LoadI: MBBI, StoreI);
2808 return true;
2809 }
2810 return false;
2811}
2812
2813// Merge adjacent zero stores into a wider store.
2814bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
2815 MachineBasicBlock::iterator &MBBI) {
2816 assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
2817 MachineInstr &MI = *MBBI;
2818 MachineBasicBlock::iterator E = MI.getParent()->end();
2819
2820 if (!TII->isCandidateToMergeOrPair(MI))
2821 return false;
2822
2823 // Look ahead up to LdStLimit instructions for a mergeable instruction.
2824 LdStPairFlags Flags;
2825 MachineBasicBlock::iterator MergeMI =
2826 findMatchingInsn(I: MBBI, Flags, Limit: LdStLimit, /* FindNarrowMerge = */ true);
2827 if (MergeMI != E) {
2828 ++NumZeroStoresPromoted;
2829
2830 // Keeping the iterator straight is a pain, so we let the merge routine tell
2831 // us what the next instruction is after it's done mucking about.
2832 MBBI = mergeNarrowZeroStores(I: MBBI, MergeMI, Flags);
2833 return true;
2834 }
2835 return false;
2836}
2837
// Find loads and stores that can be merged into a single load or store
// pair instruction. On success, MBBI is advanced past the merged code and
// true is returned; liveness bookkeeping (DefinedInBB) is refreshed for the
// instructions the merge routine skipped over.
bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // If disable-ldp feature is opted, do not emit ldp.
  if (MI.mayLoad() && Subtarget->hasDisableLdp())
    return false;

  // If disable-stp feature is opted, do not emit stp.
  if (MI.mayStore() && Subtarget->hasDisableStp())
    return false;

  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1)
  bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
  int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
  // Allow one more for offset.
  if (Offset > 0)
    Offset -= OffsetStride;
  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return false;

  // Look ahead up to LdStLimit instructions for a pairable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired =
      findMatchingInsn(I: MBBI, Flags, Limit: LdStLimit, /* FindNarrowMerge = */ false);
  if (Paired != E) {
    // Keeping the iterator straight is a pain, so we let the merge routine tell
    // us what the next instruction is after it's done mucking about.
    // Remember the instruction before MI so we can walk the range the merge
    // routine skipped and update DefinedInBB afterwards.
    auto Prev = std::prev(x: MBBI);

    // Fetch the memoperand of the load/store that is a candidate for
    // combination.
    MachineMemOperand *MemOp =
        MI.memoperands_empty() ? nullptr : MI.memoperands().front();

    // If a load/store arrives and ldp/stp-aligned-only feature is opted, check
    // that the alignment of the source pointer is at least double the alignment
    // of the type.
    if ((MI.mayLoad() && Subtarget->hasLdpAlignedOnly()) ||
        (MI.mayStore() && Subtarget->hasStpAlignedOnly())) {
      // If there is no size/align information, cancel the transformation.
      if (!MemOp || !MemOp->getMemoryType().isValid()) {
        NumFailedAlignmentCheck++;
        return false;
      }

      // Get the needed alignments to check them if
      // ldp-aligned-only/stp-aligned-only features are opted.
      uint64_t MemAlignment = MemOp->getAlign().value();
      uint64_t TypeAlignment =
          Align(MemOp->getSize().getValue().getKnownMinValue()).value();

      if (MemAlignment < 2 * TypeAlignment) {
        NumFailedAlignmentCheck++;
        return false;
      }
    }

    ++NumPairCreated;
    if (TII->hasUnscaledLdStOffset(MI))
      ++NumUnscaledPairCreated;

    MBBI = mergePairedInsns(I: MBBI, Paired, Flags);
    // Collect liveness info for instructions between Prev and the new position
    // MBBI.
    for (auto I = std::next(x: Prev); I != MBBI; I++)
      updateDefinedRegisters(MI&: *I, Units&: DefinedInBB, TRI);

    return true;
  }
  return false;
}
2918
2919bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
2920 (MachineBasicBlock::iterator &MBBI) {
2921 MachineInstr &MI = *MBBI;
2922 MachineBasicBlock::iterator E = MI.getParent()->end();
2923 MachineBasicBlock::iterator Update;
2924
2925 // Look forward to try to form a post-index instruction. For example,
2926 // ldr x0, [x20]
2927 // add x20, x20, #32
2928 // merged into:
2929 // ldr x0, [x20], #32
2930 Update = findMatchingUpdateInsnForward(I: MBBI, UnscaledOffset: 0, Limit: UpdateLimit);
2931 if (Update != E) {
2932 // Merge the update into the ld/st.
2933 if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /*IsForward=*/false,
2934 /*IsPreIdx=*/false,
2935 /*MergeEither=*/false)) {
2936 MBBI = *NextI;
2937 return true;
2938 }
2939 }
2940
2941 // Don't know how to handle unscaled pre/post-index versions below, so bail.
2942 if (TII->hasUnscaledLdStOffset(Opc: MI.getOpcode()))
2943 return false;
2944
2945 // Look back to try to find a pre-index instruction. For example,
2946 // add x0, x0, #8
2947 // ldr x1, [x0]
2948 // merged into:
2949 // ldr x1, [x0, #8]!
2950 bool MergeEither;
2951 Update = findMatchingUpdateInsnBackward(I: MBBI, Limit: UpdateLimit, MergeEither);
2952 if (Update != E) {
2953 // Merge the update into the ld/st.
2954 if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /*IsForward=*/true,
2955 /*IsPreIdx=*/true, MergeEither)) {
2956 MBBI = *NextI;
2957 return true;
2958 }
2959 }
2960
2961 // The immediate in the load/store is scaled by the size of the memory
2962 // operation. The immediate in the add we're looking for,
2963 // however, is not, so adjust here.
2964 int UnscaledOffset =
2965 AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
2966
2967 // Look forward to try to find a pre-index instruction. For example,
2968 // ldr x1, [x0, #64]
2969 // add x0, x0, #64
2970 // merged into:
2971 // ldr x1, [x0, #64]!
2972 Update = findMatchingUpdateInsnForward(I: MBBI, UnscaledOffset, Limit: UpdateLimit);
2973 if (Update != E) {
2974 // Merge the update into the ld/st.
2975 if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /*IsForward=*/false,
2976 /*IsPreIdx=*/true,
2977 /*MergeEither=*/false)) {
2978 MBBI = *NextI;
2979 return true;
2980 }
2981 }
2982
2983 return false;
2984}
2985
2986bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
2987 int Scale) {
2988 MachineInstr &MI = *MBBI;
2989 MachineBasicBlock::iterator E = MI.getParent()->end();
2990 MachineBasicBlock::iterator Update;
2991
2992 // Don't know how to handle unscaled pre/post-index versions below, so bail.
2993 if (TII->hasUnscaledLdStOffset(Opc: MI.getOpcode()))
2994 return false;
2995
2996 // Look back to try to find a const offset for index LdSt instruction. For
2997 // example,
2998 // mov x8, #LargeImm ; = a * (1<<12) + imm12
2999 // ldr x1, [x0, x8]
3000 // merged into:
3001 // add x8, x0, a * (1<<12)
3002 // ldr x1, [x8, imm12]
3003 unsigned Offset;
3004 Update = findMatchingConstOffsetBackward(I: MBBI, Limit: LdStConstLimit, Offset);
3005 if (Update != E && (Offset & (Scale - 1)) == 0) {
3006 // Merge the imm12 into the ld/st.
3007 MBBI = mergeConstOffsetInsn(I: MBBI, Update, Offset, Scale);
3008 return true;
3009 }
3010
3011 return false;
3012}
3013
// Run all load/store peephole transformations over a single basic block.
// EnableNarrowZeroStOpt gates the narrow zero-store widening (which may
// create accesses the target must tolerate as unaligned). Returns true if
// any instruction was changed.
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool EnableNarrowZeroStOpt) {
  AArch64FunctionInfo &AFI = *MBB.getParent()->getInfo<AArch64FunctionInfo>();

  bool Modified = false;
  // Five transformations to do here:
  // 1) Find loads that directly read from stores and promote them by
  //    replacing with mov instructions. If the store is wider than the load,
  //    the load will be replaced with a bitfield extract.
  //      e.g.,
  //        str w1, [x0, #4]
  //        ldrh w2, [x0, #6]
  //        ; becomes
  //        str w1, [x0, #4]
  //        lsr w2, w1, #16
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isPromotableLoadFromStore(MI&: *MBBI) && tryToPromoteLoadFromStore(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 2) Merge adjacent zero stores into a wider store.
  //      e.g.,
  //        strh wzr, [x0]
  //        strh wzr, [x0, #2]
  //        ; becomes
  //        str wzr, [x0]
  //      e.g.,
  //        str wzr, [x0]
  //        str wzr, [x0, #4]
  //        ; becomes
  //        str xzr, [x0]
  if (EnableNarrowZeroStOpt)
    for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
         MBBI != E;) {
      if (isPromotableZeroStoreInst(MI&: *MBBI) && tryToMergeZeroStInst(MBBI))
        Modified = true;
      else
        ++MBBI;
    }
  // 3) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //    When compiling for SVE 128, also try to combine SVE fill/spill
  //    instructions into LDP/STP.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]
  //      e.g.,
  //        ldr z0, [x2]
  //        ldr z1, [x2, #1, mul vl]
  //        ; becomes
  //        ldp q0, q1, [x2]

  // Seed DefinedInBB with the block live-ins; it feeds the rename-register
  // search in the pairing transformation and is only sound when liveness is
  // tracked.
  if (MBB.getParent()->getRegInfo().tracksLiveness()) {
    DefinedInBB.clear();
    DefinedInBB.addLiveIns(MBB);
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    // Track currently live registers up to this point, to help with
    // searching for a rename register on demand.
    updateDefinedRegisters(MI&: *MBBI, Units&: DefinedInBB, TRI);
    if (TII->isPairableLdStInst(MI: *MBBI) && tryToPairLdStInst(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 4) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isMergeableLdStUpdate(MI&: *MBBI, AFI) && tryToMergeLdStUpdate(MBBI))
      Modified = true;
    else
      ++MBBI;
  }

  // 5) Find a register assigned with a const value that can be folded into
  //    the load or store. e.g.,
  //        mov x8, #LargeImm ; = a * (1<<12) + imm12
  //        ldr x1, [x0, x8]
  //        ; becomes
  //        add x8, x0, a * (1<<12)
  //        ldr x1, [x8, imm12]
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    int Scale;
    if (isMergeableIndexLdSt(MI&: *MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale))
      Modified = true;
    else
      ++MBBI;
  }

  return Modified;
}
3118
3119bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
3120 Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
3121 TII = Subtarget->getInstrInfo();
3122 TRI = Subtarget->getRegisterInfo();
3123
3124 // Resize the modified and used register unit trackers. We do this once
3125 // per function and then clear the register units each time we optimize a load
3126 // or store.
3127 ModifiedRegUnits.init(TRI: *TRI);
3128 UsedRegUnits.init(TRI: *TRI);
3129 DefinedInBB.init(TRI: *TRI);
3130
3131 bool Modified = false;
3132 bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
3133 for (auto &MBB : Fn) {
3134 auto M = optimizeBlock(MBB, EnableNarrowZeroStOpt: enableNarrowZeroStOpt);
3135 Modified |= M;
3136 }
3137
3138 return Modified;
3139}
3140
3141// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
3142// stores near one another? Note: The pre-RA instruction scheduler already has
3143// hooks to try and schedule pairable loads/stores together to improve pairing
3144// opportunities. Thus, pre-RA pairing pass may not be worth the effort.
3145
3146// FIXME: When pairing store instructions it's very possible for this pass to
3147// hoist a store with a KILL marker above another use (without a KILL marker).
3148// The resulting IR is invalid, but nothing uses the KILL markers after this
3149// pass, so it's never caused a problem in practice.
3150
3151bool AArch64LoadStoreOptLegacy::runOnMachineFunction(MachineFunction &MF) {
3152 if (skipFunction(F: MF.getFunction()))
3153 return false;
3154 AArch64LoadStoreOpt Impl;
3155 Impl.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
3156 return Impl.runOnMachineFunction(Fn&: MF);
3157}
3158
/// createAArch64LoadStoreOptimizationPass - returns a new instance of the
/// load / store optimization pass (legacy pass-manager wrapper). The caller
/// (the pass manager) takes ownership of the returned pass.
FunctionPass *llvm::createAArch64LoadStoreOptLegacyPass() {
  return new AArch64LoadStoreOptLegacy();
}
3164
3165PreservedAnalyses
3166AArch64LoadStoreOptPass::run(MachineFunction &MF,
3167 MachineFunctionAnalysisManager &MFAM) {
3168 AArch64LoadStoreOpt Impl;
3169 Impl.AA = &MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(IR&: MF)
3170 .getManager()
3171 .getResult<AAManager>(IR&: MF.getFunction());
3172 bool Changed = Impl.runOnMachineFunction(Fn&: MF);
3173 if (!Changed)
3174 return PreservedAnalyses::all();
3175 PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
3176 PA.preserveSet<CFGAnalyses>();
3177 return PA;
3178}
3179