1//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains a pass that performs load / store related peephole
10// optimizations. This pass should be run after register allocation.
11//
12// The pass runs after the PrologEpilogInserter where we emit the CFI
13// instructions. In order to preserve the correctness of the unwind information,
14// the pass should not change the order of any two instructions, one of which
// has the FrameSetup/FrameDestroy flag or, alternatively, apply an ad-hoc fix
16// to unwind information.
17//
18//===----------------------------------------------------------------------===//
19
20#include "AArch64InstrInfo.h"
21#include "AArch64MachineFunctionInfo.h"
22#include "AArch64Subtarget.h"
23#include "MCTargetDesc/AArch64AddressingModes.h"
24#include "llvm/ADT/SetVector.h"
25#include "llvm/ADT/SmallVector.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringRef.h"
28#include "llvm/ADT/iterator_range.h"
29#include "llvm/Analysis/AliasAnalysis.h"
30#include "llvm/CodeGen/MachineBasicBlock.h"
31#include "llvm/CodeGen/MachineFunction.h"
32#include "llvm/CodeGen/MachineFunctionPass.h"
33#include "llvm/CodeGen/MachineInstr.h"
34#include "llvm/CodeGen/MachineInstrBuilder.h"
35#include "llvm/CodeGen/MachineOperand.h"
36#include "llvm/CodeGen/MachineRegisterInfo.h"
37#include "llvm/CodeGen/TargetRegisterInfo.h"
38#include "llvm/IR/DebugLoc.h"
39#include "llvm/MC/MCAsmInfo.h"
40#include "llvm/MC/MCDwarf.h"
41#include "llvm/Pass.h"
42#include "llvm/Support/CommandLine.h"
43#include "llvm/Support/Debug.h"
44#include "llvm/Support/DebugCounter.h"
45#include "llvm/Support/ErrorHandling.h"
46#include <cassert>
47#include <cstdint>
48#include <functional>
49#include <iterator>
50#include <limits>
51#include <optional>
52
53using namespace llvm;
54
55#define DEBUG_TYPE "aarch64-ldst-opt"
56
57STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
58STATISTIC(NumPostFolded, "Number of post-index updates folded");
59STATISTIC(NumPreFolded, "Number of pre-index updates folded");
60STATISTIC(NumUnscaledPairCreated,
61 "Number of load/store from unscaled generated");
62STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
63STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
64STATISTIC(NumFailedAlignmentCheck, "Number of load/store pair transformation "
65 "not passed the alignment check");
66STATISTIC(NumConstOffsetFolded,
67 "Number of const offset of index address folded");
68
69DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
70 "Controls which pairs are considered for renaming");
71
72// The LdStLimit limits how far we search for load/store pairs.
73static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
74 cl::init(Val: 20), cl::Hidden);
75
76// The UpdateLimit limits how far we search for update instructions when we form
77// pre-/post-index instructions.
78static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(Val: 100),
79 cl::Hidden);
80
81// The LdStConstLimit limits how far we search for const offset instructions
82// when we form index address load/store instructions.
83static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
84 cl::init(Val: 10), cl::Hidden);
85
86// Enable register renaming to find additional store pairing opportunities.
87static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
88 cl::init(Val: true), cl::Hidden);
89
90#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
91
92namespace {
93
// Result flags passed between findMatchingInsn and the merge routines,
// describing how a found pair should be combined.
using LdStPairFlags = struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  bool MergeForward = false;

  // SExtIdx gives the index of the result of the load pair that must be
  // extended. The value of SExtIdx assumes that the paired load produces the
  // value in this order: (I, returned iterator), i.e., -1 means no value has
  // to be extended, 0 means I, and 1 means the returned iterator.
  int SExtIdx = -1;

  // If not none, RenameReg can be used to rename the result register of the
  // first store in a pair. Currently this only works when merging stores
  // forward.
  std::optional<MCPhysReg> RenameReg;

  LdStPairFlags() = default;

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }

  void setRenameReg(MCPhysReg R) { RenameReg = R; }
  void clearRenameReg() { RenameReg = std::nullopt; }
  std::optional<MCPhysReg> getRenameReg() const { return RenameReg; }
};
123
// Implements the AArch64 load/store peephole optimizations. One instance is
// driven per machine function via runOnMachineFunction; the tryTo* methods
// are the individual transformation entry points invoked from optimizeBlock.
struct AArch64LoadStoreOpt {
  AliasAnalysis *AA;
  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const AArch64Subtarget *Subtarget;

  // Track which register units have been modified and used.
  LiveRegUnits ModifiedRegUnits, UsedRegUnits;
  // NOTE(review): presumably the register units defined so far in the
  // current block, used by the store register-renaming support — confirm
  // against the method definitions later in this file.
  LiveRegUnits DefinedInBB;

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit,
                                               bool FindNarrowMerge);

  // Scan the instructions looking for a store that writes to the address from
  // which the current load instruction reads. Return true if one is found.
  bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
                         MachineBasicBlock::iterator &StoreI);

  // Merge the two instructions indicated into a wider narrow store instruction.
  MachineBasicBlock::iterator
  mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                        MachineBasicBlock::iterator MergeMI,
                        const LdStPairFlags &Flags);

  // Merge the two instructions indicated into a single pair-wise instruction.
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Promote the load that reads directly from the address stored to.
  MachineBasicBlock::iterator
  promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                       MachineBasicBlock::iterator StoreI);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                int UnscaledOffset, unsigned Limit);

  // Scan the instruction list to find a register assigned with a const
  // value that can be combined with the current instruction (a load or store)
  // using base addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
                                  unsigned &Offset);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  // `MergeEither` is set to true if the combined instruction may be placed
  // either at the location of the load/store instruction or at the location of
  // the update instruction.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit,
                                 bool &MergeEither);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                            unsigned BaseReg, int Offset);

  // Check whether MI is a MOV of a constant that can be folded into the
  // index register of MemMI; on success Offset receives the constant.
  bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
                              unsigned IndexReg, unsigned &Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  std::optional<MachineBasicBlock::iterator>
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsForward,
                  bool IsPreIdx, bool MergeEither);

  // Fold a constant offset (from Update) into the addressing of the
  // load/store at I, rewriting it to a base-address form.
  MachineBasicBlock::iterator
  mergeConstOffsetInsn(MachineBasicBlock::iterator I,
                       MachineBasicBlock::iterator Update, unsigned Offset,
                       int Scale);

  // Find and merge zero store instructions.
  bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);

  // Find and pair ldr/str instructions.
  bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);

  // Find and promote load instructions which read directly from store.
  bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);

  // Find and merge a base register updates before or after a ld/st instruction.
  bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);

  // Find and merge an index ldr/st instruction into a base ld/st instruction.
  bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);

  // Run all of the above transformations over one basic block. Returns true
  // if anything changed.
  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

  bool runOnMachineFunction(MachineFunction &MF);
};
226
// Legacy pass-manager wrapper around AArch64LoadStoreOpt.
struct AArch64LoadStoreOptLegacy : public MachineFunctionPass {
  static char ID; // Pass identification.

  AArch64LoadStoreOptLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &Fn) override;

  // Requires alias analysis results (the AA member of AArch64LoadStoreOpt).
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  // The pass runs after register allocation, so no virtual registers may
  // remain.
  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().setNoVRegs();
  }

  StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
};

char AArch64LoadStoreOptLegacy::ID = 0;
247
248} // end anonymous namespace
249
// Register the legacy pass with the PassRegistry under the command-line
// name "aarch64-ldst-opt".
INITIALIZE_PASS(AArch64LoadStoreOptLegacy, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)
252
253static bool isNarrowStore(unsigned Opc) {
254 switch (Opc) {
255 default:
256 return false;
257 case AArch64::STRBBui:
258 case AArch64::STURBBi:
259 case AArch64::STRHHui:
260 case AArch64::STURHHi:
261 return true;
262 }
263}
264
265// These instruction set memory tag and either keep memory contents unchanged or
266// set it to zero, ignoring the address part of the source register.
267static bool isTagStore(const MachineInstr &MI) {
268 switch (MI.getOpcode()) {
269 default:
270 return false;
271 case AArch64::STGi:
272 case AArch64::STZGi:
273 case AArch64::ST2Gi:
274 case AArch64::STZ2Gi:
275 return true;
276 }
277}
278
// Returns the non-sign-extending variant of a load/store opcode: the
// sign-extending word loads (LDRSW*) map to their plain word-load
// counterparts, every other recognized pairable opcode maps to itself, and
// anything else returns UINT_MAX. When IsValidLdStrOpc is non-null it is set
// to whether Opc was recognized at all.
static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    // Unrecognized opcode: report invalid and return a sentinel.
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return std::numeric_limits<unsigned>::max();
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRDpre:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRQpre:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
  case AArch64::STRWui:
  case AArch64::STRWpre:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STRXpre:
  case AArch64::STURXi:
  case AArch64::STR_ZXI:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRDpre:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRQpre:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRWpre:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::LDRXpre:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRSpre:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSpre:
  case AArch64::LDR_ZXI:
    // Already non-extending: identity mapping.
    return Opc;
  // Sign-extending word loads map to the plain word loads.
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  case AArch64::LDRSWpre:
    return AArch64::LDRWpre;
  }
}
333
334static unsigned getMatchingWideOpcode(unsigned Opc) {
335 switch (Opc) {
336 default:
337 llvm_unreachable("Opcode has no wide equivalent!");
338 case AArch64::STRBBui:
339 return AArch64::STRHHui;
340 case AArch64::STRHHui:
341 return AArch64::STRWui;
342 case AArch64::STURBBi:
343 return AArch64::STURHHi;
344 case AArch64::STURHHi:
345 return AArch64::STURWi;
346 case AArch64::STURWi:
347 return AArch64::STURXi;
348 case AArch64::STRWui:
349 return AArch64::STRXui;
350 }
351}
352
// Maps a single load/store opcode to its paired (LDP/STP) equivalent. Both
// the scaled (ui) and unscaled (ur) single forms map to the scaled pair
// form; pre-indexed singles map to pre-indexed pairs.
// NOTE(review): the SVE fill/spill opcodes (LDR_ZXI/STR_ZXI) map to Q-pair
// opcodes — presumably only valid when the vector length is 128 bits;
// confirm the caller guards on that.
static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRSpre:
    return AArch64::STPSpre;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRDpre:
    return AArch64::STPDpre;
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STR_ZXI:
    return AArch64::STPQi;
  case AArch64::STRQpre:
    return AArch64::STPQpre;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRWpre:
    return AArch64::STPWpre;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::STRXpre:
    return AArch64::STPXpre;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRSpre:
    return AArch64::LDPSpre;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRDpre:
    return AArch64::LDPDpre;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDR_ZXI:
    return AArch64::LDPQi;
  case AArch64::LDRQpre:
    return AArch64::LDPQpre;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRWpre:
    return AArch64::LDPWpre;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRXpre:
    return AArch64::LDPXpre;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  case AArch64::LDRSWpre:
    return AArch64::LDPSWpre;
  }
}
416
// Returns nonzero (used as a boolean) when StoreInst is a store of the same
// addressing style (scaled vs. unscaled) that is at least as wide as
// LoadInst — i.e. a store the load could potentially be forwarded from.
// Whether the load's byte range actually lies within the store's is checked
// separately by isLdOffsetInRangeOfSt.
static unsigned isMatchingStore(MachineInstr &LoadInst,
                                MachineInstr &StoreInst) {
  unsigned LdOpc = LoadInst.getOpcode();
  unsigned StOpc = StoreInst.getOpcode();
  switch (LdOpc) {
  default:
    llvm_unreachable("Unsupported load instruction!");
  case AArch64::LDRBBui:
    return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
           StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  case AArch64::LDURBBi:
    return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
           StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  case AArch64::LDRHHui:
    return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
           StOpc == AArch64::STRXui;
  case AArch64::LDURHHi:
    return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
           StOpc == AArch64::STURXi;
  case AArch64::LDRWui:
    return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  case AArch64::LDURWi:
    return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  case AArch64::LDRXui:
    return StOpc == AArch64::STRXui;
  case AArch64::LDURXi:
    return StOpc == AArch64::STURXi;
  }
}
446
// Maps a scaled (unsigned-offset) load/store or pair opcode to its
// pre-indexed (writeback) equivalent.
static unsigned getPreIndexedOpcode(unsigned Opc) {
  // FIXME: We don't currently support creating pre-indexed loads/stores when
  // the load or store is the unscaled version. If we decide to perform such an
  // optimization in the future the cases for the unscaled loads/stores will
  // need to be added here.
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRBui:
    return AArch64::STRBpre;
  case AArch64::STRHui:
    return AArch64::STRHpre;
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRBBui:
    return AArch64::STRBBpre;
  case AArch64::STRHHui:
    return AArch64::STRHHpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRBui:
    return AArch64::LDRBpre;
  case AArch64::LDRHui:
    return AArch64::LDRHpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpre;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  case AArch64::LDPSi:
    return AArch64::LDPSpre;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpre;
  case AArch64::LDPDi:
    return AArch64::LDPDpre;
  case AArch64::LDPQi:
    return AArch64::LDPQpre;
  case AArch64::LDPWi:
    return AArch64::LDPWpre;
  case AArch64::LDPXi:
    return AArch64::LDPXpre;
  case AArch64::STPSi:
    return AArch64::STPSpre;
  case AArch64::STPDi:
    return AArch64::STPDpre;
  case AArch64::STPQi:
    return AArch64::STPQpre;
  case AArch64::STPWi:
    return AArch64::STPWpre;
  case AArch64::STPXi:
    return AArch64::STPXpre;
  // Memory-tagging stores also have pre-indexed variants.
  case AArch64::STGi:
    return AArch64::STGPreIndex;
  case AArch64::STZGi:
    return AArch64::STZGPreIndex;
  case AArch64::ST2Gi:
    return AArch64::ST2GPreIndex;
  case AArch64::STZ2Gi:
    return AArch64::STZ2GPreIndex;
  case AArch64::STGPi:
    return AArch64::STGPpre;
  }
}
527
// Maps a register-offset (reg+reg, "roX") load opcode to the corresponding
// base-plus-unsigned-immediate ("ui") form. Used when a constant index can
// be folded into the immediate offset.
static unsigned getBaseAddressOpcode(unsigned Opc) {
  // TODO: Add more index address stores.
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no base address equivalent!");
  case AArch64::LDRBroX:
    return AArch64::LDRBui;
  case AArch64::LDRBBroX:
    return AArch64::LDRBBui;
  case AArch64::LDRSBXroX:
    return AArch64::LDRSBXui;
  case AArch64::LDRSBWroX:
    return AArch64::LDRSBWui;
  case AArch64::LDRHroX:
    return AArch64::LDRHui;
  case AArch64::LDRHHroX:
    return AArch64::LDRHHui;
  case AArch64::LDRSHXroX:
    return AArch64::LDRSHXui;
  case AArch64::LDRSHWroX:
    return AArch64::LDRSHWui;
  case AArch64::LDRWroX:
    return AArch64::LDRWui;
  case AArch64::LDRSroX:
    return AArch64::LDRSui;
  case AArch64::LDRSWroX:
    return AArch64::LDRSWui;
  case AArch64::LDRDroX:
    return AArch64::LDRDui;
  case AArch64::LDRXroX:
    return AArch64::LDRXui;
  case AArch64::LDRQroX:
    return AArch64::LDRQui;
  }
}
563
// Maps a load/store or pair opcode to its post-indexed (writeback)
// equivalent. Unlike getPreIndexedOpcode, the unscaled (STUR*/LDUR*) single
// forms are accepted here as well and map to the same post-indexed opcodes
// as their scaled counterparts.
static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed wise equivalent!");
  case AArch64::STRBui:
    return AArch64::STRBpost;
  case AArch64::STRHui:
    return AArch64::STRHpost;
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STRSpost;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STRDpost;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STRQpost;
  case AArch64::STRBBui:
    return AArch64::STRBBpost;
  case AArch64::STRHHui:
    return AArch64::STRHHpost;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STRWpost;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STRXpost;
  case AArch64::LDRBui:
    return AArch64::LDRBpost;
  case AArch64::LDRHui:
    return AArch64::LDRHpost;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDRQpost;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpost;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpost;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  case AArch64::LDPSi:
    return AArch64::LDPSpost;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpost;
  case AArch64::LDPDi:
    return AArch64::LDPDpost;
  case AArch64::LDPQi:
    return AArch64::LDPQpost;
  case AArch64::LDPWi:
    return AArch64::LDPWpost;
  case AArch64::LDPXi:
    return AArch64::LDPXpost;
  case AArch64::STPSi:
    return AArch64::STPSpost;
  case AArch64::STPDi:
    return AArch64::STPDpost;
  case AArch64::STPQi:
    return AArch64::STPQpost;
  case AArch64::STPWi:
    return AArch64::STPWpost;
  case AArch64::STPXi:
    return AArch64::STPXpost;
  // Memory-tagging stores also have post-indexed variants.
  case AArch64::STGi:
    return AArch64::STGPostIndex;
  case AArch64::STZGi:
    return AArch64::STZGPostIndex;
  case AArch64::ST2Gi:
    return AArch64::ST2GPostIndex;
  case AArch64::STZ2Gi:
    return AArch64::STZ2GPostIndex;
  case AArch64::STGPi:
    return AArch64::STGPpost;
  }
}
650
651static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
652
653 unsigned OpcA = FirstMI.getOpcode();
654 unsigned OpcB = MI.getOpcode();
655
656 switch (OpcA) {
657 default:
658 return false;
659 case AArch64::STRSpre:
660 return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
661 case AArch64::STRDpre:
662 return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
663 case AArch64::STRQpre:
664 return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
665 case AArch64::STRWpre:
666 return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
667 case AArch64::STRXpre:
668 return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
669 case AArch64::LDRSpre:
670 return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
671 case AArch64::LDRDpre:
672 return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
673 case AArch64::LDRQpre:
674 return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
675 case AArch64::LDRWpre:
676 return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
677 case AArch64::LDRXpre:
678 return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
679 case AArch64::LDRSWpre:
680 return (OpcB == AArch64::LDRSWui) || (OpcB == AArch64::LDURSWi);
681 }
682}
683
684// Returns the scale and offset range of pre/post indexed variants of MI.
685static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
686 int &MinOffset, int &MaxOffset) {
687 bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
688 bool IsTagStore = isTagStore(MI);
689 // ST*G and all paired ldst have the same scale in pre/post-indexed variants
690 // as in the "unsigned offset" variant.
691 // All other pre/post indexed ldst instructions are unscaled.
692 Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
693
694 if (IsPaired) {
695 MinOffset = -64;
696 MaxOffset = 63;
697 } else {
698 MinOffset = -256;
699 MaxOffset = 255;
700 }
701}
702
703static MachineOperand &getLdStRegOp(MachineInstr &MI,
704 unsigned PairedRegOp = 0) {
705 assert(PairedRegOp < 2 && "Unexpected register operand idx.");
706 bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
707 if (IsPreLdSt)
708 PairedRegOp += 1;
709 unsigned Idx =
710 AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
711 return MI.getOperand(i: Idx);
712}
713
714static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
715 MachineInstr &StoreInst,
716 const AArch64InstrInfo *TII) {
717 assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
718 int LoadSize = TII->getMemScale(MI: LoadInst);
719 int StoreSize = TII->getMemScale(MI: StoreInst);
720 int UnscaledStOffset =
721 TII->hasUnscaledLdStOffset(MI&: StoreInst)
722 ? AArch64InstrInfo::getLdStOffsetOp(MI: StoreInst).getImm()
723 : AArch64InstrInfo::getLdStOffsetOp(MI: StoreInst).getImm() * StoreSize;
724 int UnscaledLdOffset =
725 TII->hasUnscaledLdStOffset(MI&: LoadInst)
726 ? AArch64InstrInfo::getLdStOffsetOp(MI: LoadInst).getImm()
727 : AArch64InstrInfo::getLdStOffsetOp(MI: LoadInst).getImm() * LoadSize;
728 return (UnscaledStOffset <= UnscaledLdOffset) &&
729 (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
730}
731
732static bool isPromotableZeroStoreInst(MachineInstr &MI) {
733 unsigned Opc = MI.getOpcode();
734 return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
735 isNarrowStore(Opc)) &&
736 getLdStRegOp(MI).getReg() == AArch64::WZR;
737}
738
739static bool isPromotableLoadFromStore(MachineInstr &MI) {
740 switch (MI.getOpcode()) {
741 default:
742 return false;
743 // Scaled instructions.
744 case AArch64::LDRBBui:
745 case AArch64::LDRHHui:
746 case AArch64::LDRWui:
747 case AArch64::LDRXui:
748 // Unscaled instructions.
749 case AArch64::LDURBBi:
750 case AArch64::LDURHHi:
751 case AArch64::LDURWi:
752 case AArch64::LDURXi:
753 return true;
754 }
755}
756
// Returns true if MI is a load/store whose base-register update could be
// folded to form a pre-/post-indexed instruction: a supported opcode, a
// reg+imm addressing form (no relocation), and not an sp-relative access in
// an MTE-tagged function.
static bool isMergeableLdStUpdate(MachineInstr &MI, AArch64FunctionInfo &AFI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  // Scaled instructions.
  case AArch64::STRBui:
  case AArch64::STRHui:
  case AArch64::STRSui:
  case AArch64::STRDui:
  case AArch64::STRQui:
  case AArch64::STRXui:
  case AArch64::STRWui:
  case AArch64::STRHHui:
  case AArch64::STRBBui:
  case AArch64::LDRBui:
  case AArch64::LDRHui:
  case AArch64::LDRSui:
  case AArch64::LDRDui:
  case AArch64::LDRQui:
  case AArch64::LDRXui:
  case AArch64::LDRWui:
  case AArch64::LDRHHui:
  case AArch64::LDRBBui:
  case AArch64::STGi:
  case AArch64::STZGi:
  case AArch64::ST2Gi:
  case AArch64::STZ2Gi:
  case AArch64::STGPi:
  // Unscaled instructions.
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  // Paired instructions.
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    // Make sure this is a reg+imm (as opposed to an address reloc).
    if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
      return false;

    // When using stack tagging, simple sp+imm loads and stores are not
    // tag-checked, but pre- and post-indexed versions of them are, so we can't
    // replace the former with the latter. This transformation would be valid
    // if the load/store accesses an untagged stack slot, but we don't have
    // that information available after frame indices have been eliminated.
    if (AFI.isMTETagged() &&
        AArch64InstrInfo::getLdStBaseOp(MI).getReg() == AArch64::SP)
      return false;

    return true;
  }
}
825
826// Make sure this is a reg+reg Ld/St
827static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
828 unsigned Opc = MI.getOpcode();
829 switch (Opc) {
830 default:
831 return false;
832 // Scaled instructions.
833 // TODO: Add more index address stores.
834 case AArch64::LDRBroX:
835 case AArch64::LDRBBroX:
836 case AArch64::LDRSBXroX:
837 case AArch64::LDRSBWroX:
838 Scale = 1;
839 return true;
840 case AArch64::LDRHroX:
841 case AArch64::LDRHHroX:
842 case AArch64::LDRSHXroX:
843 case AArch64::LDRSHWroX:
844 Scale = 2;
845 return true;
846 case AArch64::LDRWroX:
847 case AArch64::LDRSroX:
848 case AArch64::LDRSWroX:
849 Scale = 4;
850 return true;
851 case AArch64::LDRDroX:
852 case AArch64::LDRXroX:
853 Scale = 8;
854 return true;
855 case AArch64::LDRQroX:
856 Scale = 16;
857 return true;
858 }
859}
860
861static bool isRewritableImplicitDef(const MachineOperand &MO) {
862 switch (MO.getParent()->getOpcode()) {
863 default:
864 return MO.isRenamable();
865 case AArch64::ORRWrs:
866 case AArch64::ADDWri:
867 return true;
868 }
869}
870
871MachineBasicBlock::iterator
872AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
873 MachineBasicBlock::iterator MergeMI,
874 const LdStPairFlags &Flags) {
875 assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
876 "Expected promotable zero stores.");
877
878 MachineBasicBlock::iterator E = I->getParent()->end();
879 MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
880 // If NextI is the second of the two instructions to be merged, we need
881 // to skip one further. Either way we merge will invalidate the iterator,
882 // and we don't need to scan the new instruction, as it's a pairwise
883 // instruction, which we're not considering for further action anyway.
884 if (NextI == MergeMI)
885 NextI = next_nodbg(It: NextI, End: E);
886
887 unsigned Opc = I->getOpcode();
888 unsigned MergeMIOpc = MergeMI->getOpcode();
889 bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
890 bool IsMergedMIScaled = !TII->hasUnscaledLdStOffset(Opc: MergeMIOpc);
891 int OffsetStride = IsScaled ? TII->getMemScale(MI: *I) : 1;
892 int MergeMIOffsetStride = IsMergedMIScaled ? TII->getMemScale(MI: *MergeMI) : 1;
893
894 bool MergeForward = Flags.getMergeForward();
895 // Insert our new paired instruction after whichever of the paired
896 // instructions MergeForward indicates.
897 MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
898 // Also based on MergeForward is from where we copy the base register operand
899 // so we get the flags compatible with the input code.
900 const MachineOperand &BaseRegOp =
901 MergeForward ? AArch64InstrInfo::getLdStBaseOp(MI: *MergeMI)
902 : AArch64InstrInfo::getLdStBaseOp(MI: *I);
903
904 // Which register is Rt and which is Rt2 depends on the offset order.
905 int64_t IOffsetInBytes =
906 AArch64InstrInfo::getLdStOffsetOp(MI: *I).getImm() * OffsetStride;
907 int64_t MIOffsetInBytes =
908 AArch64InstrInfo::getLdStOffsetOp(MI: *MergeMI).getImm() *
909 MergeMIOffsetStride;
910 // Select final offset based on the offset order.
911 int64_t OffsetImm;
912 if (IOffsetInBytes > MIOffsetInBytes)
913 OffsetImm = MIOffsetInBytes;
914 else
915 OffsetImm = IOffsetInBytes;
916
917 int NewOpcode = getMatchingWideOpcode(Opc);
918 // Adjust final offset on scaled stores because the new instruction
919 // has a different scale.
920 if (!TII->hasUnscaledLdStOffset(Opc: NewOpcode)) {
921 int NewOffsetStride = TII->getMemScale(Opc: NewOpcode);
922 assert(((OffsetImm % NewOffsetStride) == 0) &&
923 "Offset should be a multiple of the store memory scale");
924 OffsetImm = OffsetImm / NewOffsetStride;
925 }
926
927 // Construct the new instruction.
928 DebugLoc DL = I->getDebugLoc();
929 MachineBasicBlock *MBB = I->getParent();
930 MachineInstrBuilder MIB;
931 MIB = BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: NewOpcode))
932 .addReg(RegNo: isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
933 .add(MO: BaseRegOp)
934 .addImm(Val: OffsetImm)
935 .cloneMergedMemRefs(OtherMIs: {&*I, &*MergeMI})
936 .setMIFlags(I->mergeFlagsWith(Other: *MergeMI));
937 (void)MIB;
938
939 LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
940 LLVM_DEBUG(I->print(dbgs()));
941 LLVM_DEBUG(dbgs() << " ");
942 LLVM_DEBUG(MergeMI->print(dbgs()));
943 LLVM_DEBUG(dbgs() << " with instruction:\n ");
944 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
945 LLVM_DEBUG(dbgs() << "\n");
946
947 // Erase the old instructions.
948 I->eraseFromParent();
949 MergeMI->eraseFromParent();
950 return NextI;
951}
952
953// Apply Fn to all instructions between MI and the beginning of the block, until
954// a def for DefReg is reached. Returns true, iff Fn returns true for all
955// visited instructions. Stop after visiting Limit iterations.
956static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
957 const TargetRegisterInfo *TRI, unsigned Limit,
958 std::function<bool(MachineInstr &, bool)> &Fn) {
959 auto MBB = MI.getParent();
960 for (MachineInstr &I :
961 instructionsWithoutDebug(It: MI.getReverseIterator(), End: MBB->instr_rend())) {
962 if (!Limit)
963 return false;
964 --Limit;
965
966 bool isDef = any_of(Range: I.operands(), P: [DefReg, TRI](MachineOperand &MOP) {
967 return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
968 TRI->regsOverlap(RegA: MOP.getReg(), RegB: DefReg);
969 });
970 if (!Fn(I, isDef))
971 return false;
972 if (isDef)
973 break;
974 }
975 return true;
976}
977
978static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
979 const TargetRegisterInfo *TRI) {
980
981 for (const MachineOperand &MOP : phys_regs_and_masks(MI))
982 if (MOP.isReg() && MOP.isKill())
983 Units.removeReg(Reg: MOP.getReg());
984
985 for (const MachineOperand &MOP : phys_regs_and_masks(MI))
986 if (MOP.isReg() && !MOP.isKill())
987 Units.addReg(Reg: MOP.getReg());
988}
989
990/// This function will add a new entry into the debugValueSubstitutions table
991/// when two instruction have been merged into a new one represented by \p
992/// MergedInstr.
993static void addDebugSubstitutionsToTable(MachineFunction *MF,
994 unsigned InstrNumToSet,
995 MachineInstr &OriginalInstr,
996 MachineInstr &MergedInstr) {
997
998 // Figure out the Operand Index of the destination register of the
999 // OriginalInstr in the new MergedInstr.
1000 auto Reg = OriginalInstr.getOperand(i: 0).getReg();
1001 unsigned OperandNo = 0;
1002 bool RegFound = false;
1003 for (const auto Op : MergedInstr.operands()) {
1004 if (Op.getReg() == Reg) {
1005 RegFound = true;
1006 break;
1007 }
1008 OperandNo++;
1009 }
1010
1011 if (RegFound)
1012 MF->makeDebugValueSubstitution({OriginalInstr.peekDebugInstrNum(), 0},
1013 {InstrNumToSet, OperandNo});
1014}
1015
/// Merge the two adjacent-memory load/store instructions \p I and \p Paired
/// into a single paired instruction (e.g. two LDRWui become one LDPWi). The
/// new instruction is inserted at the position of either \p I or \p Paired,
/// chosen by Flags.getMergeForward(). Handles register renaming (when \p
/// Flags requests it), offset rescaling between scaled/unscaled forms,
/// sign-extend fix-up (KILL + SBFMXri), SVE fill/spill conversion to Q
/// registers, kill-flag maintenance and debug-instr-number bookkeeping.
/// Returns an iterator to the next instruction to consider after the merge.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    NextI = next_nodbg(It: NextI, End: E);

  int SExtIdx = Flags.getSExtIdx();
  // When merging a sign-extending load with a zero-extending one, work with
  // the non-sign-extending opcode and patch up the result afterwards.
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(Opc: I->getOpcode());
  bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
  int OffsetStride = IsUnscaled ? TII->getMemScale(MI: *I) : 1;

  bool MergeForward = Flags.getMergeForward();

  std::optional<MCPhysReg> RenameReg = Flags.getRenameReg();
  if (RenameReg) {
    MCRegister RegToRename = getLdStRegOp(MI&: *I).getReg();
    DefinedInBB.addReg(Reg: *RenameReg);

    // Return the sub/super register for RenameReg, matching the size of
    // OriginalReg.
    auto GetMatchingSubReg =
        [this, RenameReg](const TargetRegisterClass *C) -> MCPhysReg {
      for (MCPhysReg SubOrSuper :
           TRI->sub_and_superregs_inclusive(Reg: *RenameReg)) {
        if (C->contains(Reg: SubOrSuper))
          return SubOrSuper;
      }
      llvm_unreachable("Should have found matching sub or super register!");
    };

    // Callback run over every instruction between the rename point and the
    // paired instruction, rewriting occurrences of RegToRename to RenameReg.
    std::function<bool(MachineInstr &, bool)> UpdateMIs =
        [this, RegToRename, GetMatchingSubReg, MergeForward](MachineInstr &MI,
                                                             bool IsDef) {
          if (IsDef) {
            bool SeenDef = false;
            for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
              MachineOperand &MOP = MI.getOperand(i: OpIdx);
              // Rename the first explicit definition and all implicit
              // definitions matching RegToRename.
              if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                  (!MergeForward || !SeenDef ||
                   (MOP.isDef() && MOP.isImplicit())) &&
                  TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename)) {
                assert((MOP.isImplicit() ||
                        (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
                       "Need renamable operands");
                Register MatchingReg;
                if (const TargetRegisterClass *RC =
                        MI.getRegClassConstraint(OpIdx, TII, TRI))
                  MatchingReg = GetMatchingSubReg(RC);
                else {
                  if (!isRewritableImplicitDef(MO: MOP))
                    continue;
                  MatchingReg = GetMatchingSubReg(
                      TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
                }
                MOP.setReg(MatchingReg);
                SeenDef = true;
              }
            }
          } else {
            // Not a def: rename every operand overlapping RegToRename.
            for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
              MachineOperand &MOP = MI.getOperand(i: OpIdx);
              if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                  TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename)) {
                assert((MOP.isImplicit() ||
                        (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
                       "Need renamable operands");
                Register MatchingReg;
                if (const TargetRegisterClass *RC =
                        MI.getRegClassConstraint(OpIdx, TII, TRI))
                  MatchingReg = GetMatchingSubReg(RC);
                else
                  MatchingReg = GetMatchingSubReg(
                      TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
                assert(MatchingReg != AArch64::NoRegister &&
                       "Cannot find matching regs for renaming");
                MOP.setReg(MatchingReg);
              }
            }
          }
          LLVM_DEBUG(dbgs() << "Renamed " << MI);
          return true;
        };
    forAllMIsUntilDef(MI&: MergeForward ? *I : *Paired->getPrevNode(), DefReg: RegToRename,
                      TRI, UINT32_MAX, Fn&: UpdateMIs);

#if !defined(NDEBUG)
    // For forward merging store:
    // Make sure the register used for renaming is not used between the
    // paired instructions. That would trash the content before the new
    // paired instruction.
    MCPhysReg RegToCheck = *RenameReg;
    // For backward merging load:
    // Make sure the register being renamed is not used between the
    // paired instructions. That would trash the content after the new
    // paired instruction.
    if (!MergeForward)
      RegToCheck = RegToRename;
    for (auto &MI :
         iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
             MergeForward ? std::next(I) : I,
             MergeForward ? std::next(Paired) : Paired))
      assert(all_of(MI.operands(),
                    [this, RegToCheck](const MachineOperand &MOP) {
                      return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
                             MOP.isUndef() ||
                             !TRI->regsOverlap(MOP.getReg(), RegToCheck);
                    }) &&
             "Rename register used between paired instruction, trashing the "
             "content");
#endif
  }

  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // Also based on MergeForward is from where we copy the base register operand
  // so we get the flags compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? AArch64InstrInfo::getLdStBaseOp(MI: *Paired)
                   : AArch64InstrInfo::getLdStBaseOp(MI: *I);

  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: *I).getImm();
  int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(MI: *Paired).getImm();
  bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Opc: Paired->getOpcode());
  if (IsUnscaled != PairedIsUnscaled) {
    // We're trying to pair instructions that differ in how they are scaled. If
    // I is scaled then scale the offset of Paired accordingly. Otherwise, do
    // the opposite (i.e., make Paired's offset unscaled).
    int MemSize = TII->getMemScale(MI: *Paired);
    if (PairedIsUnscaled) {
      // If the unscaled offset isn't a multiple of the MemSize, we can't
      // pair the operations together.
      assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
             "Offset should be a multiple of the stride!");
      PairedOffset /= MemSize;
    } else {
      PairedOffset *= MemSize;
    }
  }

  // Which register is Rt and which is Rt2 depends on the offset order.
  // However, for pre load/stores the Rt should be the one of the pre
  // load/store.
  MachineInstr *RtMI, *Rt2MI;
  if (Offset == PairedOffset + OffsetStride &&
      !AArch64InstrInfo::isPreLdSt(MI: *I)) {
    RtMI = &*Paired;
    Rt2MI = &*I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = &*I;
    Rt2MI = &*Paired;
  }
  int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(MI: *RtMI).getImm();
  // Scale the immediate offset, if necessary.
  if (TII->hasUnscaledLdStOffset(Opc: RtMI->getOpcode())) {
    assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
           "Unscaled offset cannot be scaled.");
    OffsetImm /= TII->getMemScale(MI: *RtMI);
  }

  // Construct the new instruction.
  MachineInstrBuilder MIB;
  DebugLoc DL = I->getDebugLoc();
  MachineBasicBlock *MBB = I->getParent();
  MachineOperand RegOp0 = getLdStRegOp(MI&: *RtMI);
  MachineOperand RegOp1 = getLdStRegOp(MI&: *Rt2MI);
  MachineOperand &PairedRegOp = RtMI == &*Paired ? RegOp0 : RegOp1;
  // Kill flags may become invalid when moving stores for pairing.
  if (RegOp0.isUse()) {
    if (!MergeForward) {
      // Clear kill flags on store if moving upwards. Example:
      //   STRWui kill %w0, ...
      //   USE %w1
      //   STRWui kill %w1  ; need to clear kill flag when moving STRWui upwards
      // We are about to move the store of w1, so its kill flag may become
      // invalid; not the case for w0.
      // Since w1 is used between the stores, the kill flag on w1 is cleared
      // after merging.
      //   STPWi kill %w0, %w1, ...
      //   USE %w1
      for (auto It = std::next(x: I); It != Paired && PairedRegOp.isKill(); ++It)
        if (It->readsRegister(Reg: PairedRegOp.getReg(), TRI))
          PairedRegOp.setIsKill(false);
    } else {
      // Clear kill flags of the first stores register. Example:
      //   STRWui %w1, ...
      //   USE kill %w1   ; need to clear kill flag when moving STRWui downwards
      //   STRW %w0
      Register Reg = getLdStRegOp(MI&: *I).getReg();
      for (MachineInstr &MI :
           make_range(x: std::next(x: I->getIterator()), y: Paired->getIterator()))
        MI.clearRegisterKills(Reg, RegInfo: TRI);
    }
  }

  unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);
  MIB = BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: MatchPairOpcode));

  // Adds the pre-index operand for pre-indexed ld/st pairs.
  if (AArch64InstrInfo::isPreLdSt(MI: *RtMI))
    MIB.addReg(RegNo: BaseRegOp.getReg(), Flags: RegState::Define);

  MIB.add(MO: RegOp0)
      .add(MO: RegOp1)
      .add(MO: BaseRegOp)
      .addImm(Val: OffsetImm)
      .cloneMergedMemRefs(OtherMIs: {&*I, &*Paired})
      .setMIFlags(I->mergeFlagsWith(Other: *Paired));

  (void)MIB;

  LLVM_DEBUG(
      dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Paired->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    // %w1 = KILL %w1, implicit-def %x1
    // %x1 = SBFMXri killed %x1, 0, 31
    MachineOperand &DstMO = MIB->getOperand(i: SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // extended opcode.
    Register DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    Register DstRegW = TRI->getSubReg(Reg: DstRegX, Idx: AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    LLVM_DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::KILL), DestReg: DstRegW)
            .addReg(RegNo: DstRegW)
            .addReg(RegNo: DstRegX, Flags: RegState::Define);
    MIBKill->getOperand(i: 2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: AArch64::SBFMXri), DestReg: DstRegX)
            .addReg(RegNo: DstRegX)
            .addImm(Val: 0)
            .addImm(Val: 31);
    (void)MIBSXTW;

    // In the case of a sign-extend, where we have something like:
    // debugValueSubstitutions:[]
    // $w1 = LDRWui $x0, 1, debug-instr-number 1
    // DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
    // $x0 = LDRSWui $x0, 0, debug-instr-number 2
    // DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9

    // It will be converted to:
    // debugValueSubstitutions:[]
    // $w0, $w1 = LDPWi $x0, 0
    // $w0 = KILL $w0, implicit-def $x0
    // $x0 = SBFMXri $x0, 0, 31
    // DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
    // DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9

    // We want the final result to look like:
    // debugValueSubstitutions:
    // - { srcinst: 1, srcop: 0, dstinst: 4, dstop: 1, subreg: 0 }
    // - { srcinst: 2, srcop: 0, dstinst: 3, dstop: 0, subreg: 0 }
    // $w0, $w1 = LDPWi $x0, 0, debug-instr-number 4
    // $w0 = KILL $w0, implicit-def $x0
    // $x0 = SBFMXri $x0, 0, 31, debug-instr-number 3
    // DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
    // DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9

    // $x0 is where the final value is stored, so the sign extend (SBFMXri)
    // instruction contains the final value we care about we give it a new
    // debug-instr-number 3. Whereas, $w1 contains the final value that we care
    // about, therefore the LDP instruction is also given a new
    // debug-instr-number 4. We have to add these substitutions to the
    // debugValueSubstitutions table. However, we also have to ensure that the
    // OpIndex that pointed to debug-instr-number 1 gets updated to 1, because
    // $w1 is the second operand of the LDP instruction.

    if (I->peekDebugInstrNum()) {
      // If I is the instruction which got sign extended and has a
      // debug-instr-number, give the SBFMXri instruction a new
      // debug-instr-number, and update the debugValueSubstitutions table with
      // the new debug-instr-number and OpIndex pair. Otherwise, give the Merged
      // instruction a new debug-instr-number, and update the
      // debugValueSubstitutions table with the new debug-instr-number and
      // OpIndex pair.
      unsigned NewInstrNum;
      if (DstRegX == I->getOperand(i: 0).getReg()) {
        NewInstrNum = MIBSXTW->getDebugInstrNum();
        addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *I,
                                     MergedInstr&: *MIBSXTW);
      } else {
        NewInstrNum = MIB->getDebugInstrNum();
        addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *I, MergedInstr&: *MIB);
      }
    }
    if (Paired->peekDebugInstrNum()) {
      // If Paired is the instruction which got sign extended and has a
      // debug-instr-number, give the SBFMXri instruction a new
      // debug-instr-number, and update the debugValueSubstitutions table with
      // the new debug-instr-number and OpIndex pair. Otherwise, give the Merged
      // instruction a new debug-instr-number, and update the
      // debugValueSubstitutions table with the new debug-instr-number and
      // OpIndex pair.
      unsigned NewInstrNum;
      if (DstRegX == Paired->getOperand(i: 0).getReg()) {
        NewInstrNum = MIBSXTW->getDebugInstrNum();
        addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *Paired,
                                     MergedInstr&: *MIBSXTW);
      } else {
        NewInstrNum = MIB->getDebugInstrNum();
        addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *Paired,
                                     MergedInstr&: *MIB);
      }
    }

    LLVM_DEBUG(dbgs() << "  Extend operand:\n    ");
    LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
  } else if (Opc == AArch64::LDR_ZXI || Opc == AArch64::STR_ZXI) {
    // We are combining SVE fill/spill to LDP/STP, so we need to use the Q
    // variant of the registers.
    MachineOperand &MOp0 = MIB->getOperand(i: 0);
    MachineOperand &MOp1 = MIB->getOperand(i: 1);
    assert(AArch64::ZPRRegClass.contains(MOp0.getReg()) &&
           AArch64::ZPRRegClass.contains(MOp1.getReg()) && "Invalid register.");
    MOp0.setReg(AArch64::Q0 + (MOp0.getReg() - AArch64::Z0));
    MOp1.setReg(AArch64::Q0 + (MOp1.getReg() - AArch64::Z0));
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  } else {

    // In the case that the merge doesn't result in a sign-extend, if we have
    // something like:
    // debugValueSubstitutions:[]
    // $x1 = LDRXui $x0, 1, debug-instr-number 1
    // DBG_INSTR_REF !13, dbg-instr-ref(1, 0), debug-location !11
    // $x0 = LDRXui killed $x0, 0, debug-instr-number 2
    // DBG_INSTR_REF !14, dbg-instr-ref(2, 0), debug-location !11

    // It will be converted to:
    // debugValueSubstitutions: []
    // $x0, $x1 = LDPXi $x0, 0
    // DBG_INSTR_REF !12, dbg-instr-ref(1, 0), debug-location !14
    // DBG_INSTR_REF !13, dbg-instr-ref(2, 0), debug-location !14

    // We want the final result to look like:
    // debugValueSubstitutions:
    // - { srcinst: 1, srcop: 0, dstinst: 3, dstop: 1, subreg: 0 }
    // - { srcinst: 2, srcop: 0, dstinst: 3, dstop: 0, subreg: 0 }
    // $x0, $x1 = LDPXi $x0, 0, debug-instr-number 3
    // DBG_INSTR_REF !12, dbg-instr-ref(1, 0), debug-location !14
    // DBG_INSTR_REF !12, dbg-instr-ref(2, 0), debug-location !14

    // Here all that needs to be done is, that the LDP instruction needs to be
    // updated with a new debug-instr-number, we then need to add entries into
    // the debugSubstitutions table to map the old instr-refs to the new ones.

    // Assign new DebugInstrNum to the Paired instruction.
    if (I->peekDebugInstrNum()) {
      unsigned NewDebugInstrNum = MIB->getDebugInstrNum();
      addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewDebugInstrNum, OriginalInstr&: *I,
                                   MergedInstr&: *MIB);
    }
    if (Paired->peekDebugInstrNum()) {
      unsigned NewDebugInstrNum = MIB->getDebugInstrNum();
      addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewDebugInstrNum, OriginalInstr&: *Paired,
                                   MergedInstr&: *MIB);
    }

    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  }
  LLVM_DEBUG(dbgs() << "\n");

  if (MergeForward)
    for (const MachineOperand &MOP : phys_regs_and_masks(MI: *I))
      if (MOP.isReg() && MOP.isKill())
        DefinedInBB.addReg(Reg: MOP.getReg());

  // Copy over any implicit-def operands. This is like MI.copyImplicitOps, but
  // only copies implicit defs and makes sure that each operand is only added
  // once in case of duplicates.
  auto CopyImplicitOps = [&](MachineBasicBlock::iterator MI1,
                             MachineBasicBlock::iterator MI2) {
    SmallSetVector<Register, 4> Ops;
    for (const MachineOperand &MO :
         llvm::drop_begin(RangeOrContainer: MI1->operands(), N: MI1->getDesc().getNumOperands()))
      if (MO.isReg() && MO.isImplicit() && MO.isDef())
        Ops.insert(X: MO.getReg());
    for (const MachineOperand &MO :
         llvm::drop_begin(RangeOrContainer: MI2->operands(), N: MI2->getDesc().getNumOperands()))
      if (MO.isReg() && MO.isImplicit() && MO.isDef())
        Ops.insert(X: MO.getReg());
    for (auto Op : Ops)
      MIB.addDef(RegNo: Op, Flags: RegState::Implicit);
  };
  CopyImplicitOps(I, Paired);

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}
1438
/// Perform store-to-load forwarding: replace the load \p LoadI with the value
/// stored by the earlier matching store \p StoreI. When load and store have
/// the same size the load becomes a register move (ORR) or is erased outright;
/// when the load reads a sub-range of the stored value it becomes a bitfield
/// extract (ANDri / UBFM) of the stored register. Returns an iterator to the
/// instruction following the (erased) load.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                                          MachineBasicBlock::iterator StoreI) {
  MachineBasicBlock::iterator NextI =
      next_nodbg(It: LoadI, End: LoadI->getParent()->end());

  int LoadSize = TII->getMemScale(MI: *LoadI);
  int StoreSize = TII->getMemScale(MI: *StoreI);
  Register LdRt = getLdStRegOp(MI&: *LoadI).getReg();
  const MachineOperand &StMO = getLdStRegOp(MI&: *StoreI);
  Register StRt = getLdStRegOp(MI&: *StoreI).getReg();
  bool IsStoreXReg = TRI->getRegClass(i: AArch64::GPR64RegClassID)->contains(Reg: StRt);

  assert((IsStoreXReg ||
          TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
         "Unexpected RegClass");

  MachineInstr *BitExtMI;
  if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
    // Remove the load, if the destination register of the loads is the same
    // register for stored value.
    if (StRt == LdRt && LoadSize == 8) {
      // The stored register stays live past the store now; drop any stale
      // kill flag on it between the store and the load.
      for (MachineInstr &MI : make_range(x: StoreI->getIterator(),
                                         y: LoadI->getIterator())) {
        if (MI.killsRegister(Reg: StRt, TRI)) {
          MI.clearRegisterKills(Reg: StRt, RegInfo: TRI);
          break;
        }
      }
      LLVM_DEBUG(dbgs() << "Remove load instruction:\n    ");
      LLVM_DEBUG(LoadI->print(dbgs()));
      LLVM_DEBUG(dbgs() << "\n");
      LoadI->eraseFromParent();
      return NextI;
    }
    // Replace the load with a mov if the load and store are in the same size.
    BitExtMI =
        BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
                MCID: TII->get(Opcode: IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), DestReg: LdRt)
            .addReg(RegNo: IsStoreXReg ? AArch64::XZR : AArch64::WZR)
            .add(MO: StMO)
            .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0))
            .setMIFlags(LoadI->getFlags());
  } else {
    // FIXME: Currently we disable this transformation in big-endian targets as
    // performance and correctness are verified only in little-endian.
    if (!Subtarget->isLittleEndian())
      return NextI;
    bool IsUnscaled = TII->hasUnscaledLdStOffset(MI&: *LoadI);
    assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
           "Unsupported ld/st match");
    assert(LoadSize <= StoreSize && "Invalid load size");
    // Normalize both offsets to bytes so the extracted bit range can be
    // computed uniformly.
    int UnscaledLdOffset =
        IsUnscaled
            ? AArch64InstrInfo::getLdStOffsetOp(MI: *LoadI).getImm()
            : AArch64InstrInfo::getLdStOffsetOp(MI: *LoadI).getImm() * LoadSize;
    int UnscaledStOffset =
        IsUnscaled
            ? AArch64InstrInfo::getLdStOffsetOp(MI: *StoreI).getImm()
            : AArch64InstrInfo::getLdStOffsetOp(MI: *StoreI).getImm() * StoreSize;
    int Width = LoadSize * 8;
    Register DestReg =
        IsStoreXReg ? Register(TRI->getMatchingSuperReg(
                          Reg: LdRt, SubIdx: AArch64::sub_32, RC: &AArch64::GPR64RegClass))
                    : LdRt;

    assert((UnscaledLdOffset >= UnscaledStOffset &&
            (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
           "Invalid offset");

    // Immr is the bit position (little-endian) of the loaded range within the
    // stored value; Imms is its last bit.
    int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
    int Imms = Immr + Width - 1;
    if (UnscaledLdOffset == UnscaledStOffset) {
      uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
                                | ((Immr) << 6)               // immr
                                | ((Imms) << 0)               // imms
          ;

      BitExtMI =
          BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
                  MCID: TII->get(Opcode: IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
                  DestReg)
              .add(MO: StMO)
              .addImm(Val: AndMaskEncoded)
              .setMIFlags(LoadI->getFlags());
    } else if (IsStoreXReg && Imms == 31) {
      // Use the 32 bit variant of UBFM if it's the LSR alias of the
      // instruction.
      assert(Immr <= Imms && "Expected LSR alias of UBFM");
      BitExtMI = BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
                         MCID: TII->get(Opcode: AArch64::UBFMWri),
                         DestReg: TRI->getSubReg(Reg: DestReg, Idx: AArch64::sub_32))
                     .addReg(RegNo: TRI->getSubReg(Reg: StRt, Idx: AArch64::sub_32))
                     .addImm(Val: Immr)
                     .addImm(Val: Imms)
                     .setMIFlags(LoadI->getFlags());
    } else {
      BitExtMI =
          BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
                  MCID: TII->get(Opcode: IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
                  DestReg)
              .add(MO: StMO)
              .addImm(Val: Immr)
              .addImm(Val: Imms)
              .setMIFlags(LoadI->getFlags());
    }
  }

  // Clear kill flags between store and load.
  for (MachineInstr &MI : make_range(x: StoreI->getIterator(),
                                     y: BitExtMI->getIterator()))
    if (MI.killsRegister(Reg: StRt, TRI)) {
      MI.clearRegisterKills(Reg: StRt, RegInfo: TRI);
      break;
    }

  LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n    ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(LoadI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instructions:\n    ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG((BitExtMI)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  LoadI->eraseFromParent();
  return NextI;
}
1569
// Returns true if \p Offset can be encoded in a paired load/store
// instruction: the scaled element offset must fit in a signed 7-bit
// immediate (-64..63). For unscaled instructions the byte offset is first
// converted to an element offset, and must be an exact multiple of
// \p OffsetStride for the pairing to be possible at all.
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  int ElementOffset = Offset;
  if (IsUnscaled) {
    // A byte offset that isn't stride-aligned can never match.
    if (ElementOffset % OffsetStride != 0)
      return false;
    ElementOffset /= OffsetStride;
  }
  return ElementOffset >= -64 && ElementOffset <= 63;
}
1582
// Round \p Num up to the next multiple of \p PowOf2 (which must be a power
// of two). Specialized to signed ints so callers avoid a C-style cast from
// uint64_t when using alignTo from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  int Mask = PowOf2 - 1;
  return (Num + Mask) & ~Mask;
}
1590
1591static bool mayAlias(MachineInstr &MIa,
1592 SmallVectorImpl<MachineInstr *> &MemInsns,
1593 AliasAnalysis *AA) {
1594 for (MachineInstr *MIb : MemInsns) {
1595 if (MIa.mayAlias(AA, Other: *MIb, /*UseTBAA*/ false)) {
1596 LLVM_DEBUG(dbgs() << "Aliasing with: "; MIb->dump());
1597 return true;
1598 }
1599 }
1600
1601 LLVM_DEBUG(dbgs() << "No aliases found\n");
1602 return false;
1603}
1604
/// Scan backwards from the load \p I (visiting at most \p Limit non-transient
/// instructions) looking for a store to the same address whose stored value
/// is still unmodified, so the load can be promoted via store-to-load
/// forwarding. On success, sets \p StoreI to the matching store and returns
/// true. Gives up on calls, on modification of the base register, or on any
/// aliasing store encountered before a match.
bool AArch64LoadStoreOpt::findMatchingStore(
    MachineBasicBlock::iterator I, unsigned Limit,
    MachineBasicBlock::iterator &StoreI) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr &LoadMI = *I;
  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: LoadMI).getReg();

  // If the load is the first instruction in the block, there's obviously
  // not any matching store.
  if (MBBI == B)
    return false;

  // Track which register units have been modified and used between the first
  // insn and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  unsigned Count = 0;
  do {
    MBBI = prev_nodbg(It: MBBI, Begin: B);
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If the load instruction reads directly from the address to which the
    // store instruction writes and the stored value is not modified, we can
    // promote the load. Since we do not handle stores with pre-/post-index,
    // it's unnecessary to check if BaseReg is modified by the store itself.
    // Also we can't handle stores without an immediate offset operand,
    // while the operand might be the address for a global variable.
    if (MI.mayStore() && isMatchingStore(LoadInst&: LoadMI, StoreInst&: MI) &&
        BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
        AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
        isLdOffsetInRangeOfSt(LoadInst&: LoadMI, StoreInst&: MI, TII) &&
        ModifiedRegUnits.available(Reg: getLdStRegOp(MI).getReg())) {
      StoreI = MBBI;
      return true;
    }

    // A call could clobber memory and registers arbitrarily; stop searching.
    if (MI.isCall())
      return false;

    // Update modified / uses register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(Reg: BaseReg))
      return false;

    // If we encounter a store aliased with the load, return early.
    if (MI.mayStore() && LoadMI.mayAlias(AA, Other: MI, /*UseTBAA*/ false))
      return false;
  } while (MBBI != B && Count < Limit);
  return false;
}
1665
1666static bool needsWinCFI(const MachineFunction *MF) {
1667 return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
1668 MF->getFunction().needsUnwindTableEntry();
1669}
1670
// Returns true if FirstMI and MI are candidates for merging or pairing.
// Otherwise, returns false. On a sext/zext match, records which of the two
// is the sign-extending one in Flags.setSExtIdx.
static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
                                       LdStPairFlags &Flags,
                                       const AArch64InstrInfo *TII) {
  // If this is volatile or if pairing is suppressed, not a candidate.
  if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
    return false;

  // We should have already checked FirstMI for pair suppression and volatility.
  assert(!FirstMI.hasOrderedMemoryRef() &&
         !TII->isLdStPairSuppressed(FirstMI) &&
         "FirstMI shouldn't get here if either of these checks are true.");

  // Reordering frame-setup/destroy instructions would invalidate Windows CFI
  // unwind info (see the file header comment).
  if (needsWinCFI(MF: MI.getMF()) && (MI.getFlag(Flag: MachineInstr::FrameSetup) ||
                                   MI.getFlag(Flag: MachineInstr::FrameDestroy)))
    return false;

  unsigned OpcA = FirstMI.getOpcode();
  unsigned OpcB = MI.getOpcode();

  // Opcodes match: If the opcodes are pre ld/st there is nothing more to check.
  if (OpcA == OpcB)
    return !AArch64InstrInfo::isPreLdSt(MI: FirstMI);

  // Bail out if one of the opcodes is SVE fill/spill, as we currently don't
  // allow pairing them with other instructions.
  if (OpcA == AArch64::LDR_ZXI || OpcA == AArch64::STR_ZXI ||
      OpcB == AArch64::LDR_ZXI || OpcB == AArch64::STR_ZXI)
    return false;

  // Two pre ld/st of different opcodes cannot be merged either
  if (AArch64InstrInfo::isPreLdSt(MI: FirstMI) && AArch64InstrInfo::isPreLdSt(MI))
    return false;

  // Try to match a sign-extended load/store with a zero-extended load/store.
  bool IsValidLdStrOpc, PairIsValidLdStrOpc;
  unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc: OpcA, IsValidLdStrOpc: &IsValidLdStrOpc);
  assert(IsValidLdStrOpc &&
         "Given Opc should be a Load or Store with an immediate");
  // OpcA will be the first instruction in the pair.
  if (NonSExtOpc == getMatchingNonSExtOpcode(Opc: OpcB, IsValidLdStrOpc: &PairIsValidLdStrOpc)) {
    // Record which of the two is the sext instruction (0 = FirstMI, 1 = MI).
    Flags.setSExtIdx(NonSExtOpc == OpcA ? 1 : 0);
    return true;
  }

  // If the second instruction isn't even a mergable/pairable load/store, bail
  // out.
  if (!PairIsValidLdStrOpc)
    return false;

  // Narrow stores do not have a matching pair opcodes, so constrain their
  // merging to zero stores.
  if (isNarrowStore(Opc: OpcA) || isNarrowStore(Opc: OpcB))
    return getLdStRegOp(MI&: FirstMI).getReg() == AArch64::WZR &&
           getLdStRegOp(MI).getReg() == AArch64::WZR &&
           TII->getMemScale(MI: FirstMI) == TII->getMemScale(MI);

  // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
  // LDR<S,D,Q,W,X,SW>pre-LDR<S,D,Q,W,X,SW>ui
  // are candidate pairs that can be merged.
  if (isPreLdStPairCandidate(FirstMI, MI))
    return true;

  // Try to match an unscaled load/store with a scaled load/store.
  return TII->hasUnscaledLdStOffset(Opc: OpcA) != TII->hasUnscaledLdStOffset(Opc: OpcB) &&
         getMatchingPairOpcode(Opc: OpcA) == getMatchingPairOpcode(Opc: OpcB);

  // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
}
1741
1742static bool canRenameMOP(const MachineOperand &MOP,
1743 const TargetRegisterInfo *TRI) {
1744 if (MOP.isReg()) {
1745 auto *RegClass = TRI->getMinimalPhysRegClass(Reg: MOP.getReg());
1746 // Renaming registers with multiple disjunct sub-registers (e.g. the
1747 // result of a LD3) means that all sub-registers are renamed, potentially
1748 // impacting other instructions we did not check. Bail out.
1749 // Note that this relies on the structure of the AArch64 register file. In
1750 // particular, a subregister cannot be written without overwriting the
1751 // whole register.
1752 if (RegClass->HasDisjunctSubRegs && RegClass->CoveredBySubRegs &&
1753 (TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::dsub0) ||
1754 TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::qsub0) ||
1755 TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::zsub0))) {
1756 LLVM_DEBUG(
1757 dbgs()
1758 << " Cannot rename operands with multiple disjunct subregisters ("
1759 << MOP << ")\n");
1760 return false;
1761 }
1762
1763 // We cannot rename arbitrary implicit-defs, the specific rule to rewrite
1764 // them must be known. For example, in ORRWrs the implicit-def
1765 // corresponds to the result register.
1766 if (MOP.isImplicit() && MOP.isDef()) {
1767 if (!isRewritableImplicitDef(MO: MOP))
1768 return false;
1769 return TRI->isSuperOrSubRegisterEq(
1770 RegA: MOP.getParent()->getOperand(i: 0).getReg(), RegB: MOP.getReg());
1771 }
1772 }
1773 return MOP.isImplicit() ||
1774 (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
1775}
1776
// Check whether the register stored by \p FirstMI can be renamed in every
// instruction from FirstMI backwards up to (and including) the previous
// definition of that register in the block.
// \p UsedInBetween accumulates the register units touched by each scanned
// instruction; \p RequiredClasses collects the register classes the eventual
// rename register must be compatible with.
// Returns false if renaming is not possible anywhere in the range, or if no
// definition is found within the scan limit.
static bool
canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
                 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
                 const TargetRegisterInfo *TRI) {
  if (!FirstMI.mayStore())
    return false;

  // Check if we can find an unused register which we can use to rename
  // the register used by the first load/store.

  auto RegToRename = getLdStRegOp(MI&: FirstMI).getReg();
  // For now, we only rename if the store operand gets killed at the store.
  // The kill may be on the explicit store operand itself or on an implicit
  // operand overlapping the stored register.
  if (!getLdStRegOp(MI&: FirstMI).isKill() &&
      !any_of(Range: FirstMI.operands(),
              P: [TRI, RegToRename](const MachineOperand &MOP) {
                return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                       MOP.isImplicit() && MOP.isKill() &&
                       TRI->regsOverlap(RegA: RegToRename, RegB: MOP.getReg());
              })) {
    LLVM_DEBUG(dbgs() << "  Operand not killed at " << FirstMI);
    return false;
  }

  bool FoundDef = false;

  // For each instruction between FirstMI and the previous def for RegToRename,
  // we
  // * check if we can rename RegToRename in this instruction
  // * collect the registers used and required register classes for RegToRename.
  std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
                                                           bool IsDef) {
    LLVM_DEBUG(dbgs() << "Checking " << MI);
    // Currently we do not try to rename across frame-setup instructions.
    if (MI.getFlag(Flag: MachineInstr::FrameSetup)) {
      LLVM_DEBUG(dbgs() << "  Cannot rename framesetup instructions "
                        << "currently\n");
      return false;
    }

    UsedInBetween.accumulate(MI);

    // For a definition, check that we can rename the definition and exit the
    // loop.
    FoundDef = IsDef;

    // For defs, check if we can rename the first def of RegToRename.
    if (FoundDef) {
      // For some pseudo instructions, we might not generate code in the end
      // (e.g. KILL) and we would end up without a correct def for the rename
      // register.
      // TODO: This might be overly conservative and we could handle those cases
      // in multiple ways:
      //       1. Insert an extra copy, to materialize the def.
      //       2. Skip pseudo-defs until we find an non-pseudo def.
      if (MI.isPseudo()) {
        LLVM_DEBUG(dbgs() << "  Cannot rename pseudo/bundle instruction\n");
        return false;
      }

      // At the def, only the defining operands overlapping RegToRename are
      // relevant.
      for (auto &MOP : MI.operands()) {
        if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
            !TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
          continue;
        if (!canRenameMOP(MOP, TRI)) {
          LLVM_DEBUG(dbgs() << "  Cannot rename " << MOP << " in " << MI);
          return false;
        }
        RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
      }
      return true;
    } else {
      // Between the def and the store, every overlapping operand (uses and
      // defs) must be renamable.
      for (auto &MOP : MI.operands()) {
        if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
            !TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
          continue;

        if (!canRenameMOP(MOP, TRI)) {
          LLVM_DEBUG(dbgs() << "  Cannot rename " << MOP << " in " << MI);
          return false;
        }
        RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
      }
    }
    return true;
  };

  if (!forAllMIsUntilDef(MI&: FirstMI, DefReg: RegToRename, TRI, Limit: LdStLimit, Fn&: CheckMIs))
    return false;

  // Without a def in this block we cannot prove the rename register gets a
  // correct value, so bail.
  if (!FoundDef) {
    LLVM_DEBUG(dbgs() << "  Did not find definition for register in BB\n");
    return false;
  }
  return true;
}
1872
1873// We want to merge the second load into the first by rewriting the usages of
1874// the same reg between first (incl.) and second (excl.). We don't need to care
1875// about any insns before FirstLoad or after SecondLoad.
1876// 1. The second load writes new value into the same reg.
1877// - The renaming is impossible to impact later use of the reg.
1878// - The second load always trash the value written by the first load which
1879// means the reg must be killed before the second load.
1880// 2. The first load must be a def for the same reg so we don't need to look
1881// into anything before it.
1882static bool canRenameUntilSecondLoad(
1883 MachineInstr &FirstLoad, MachineInstr &SecondLoad,
1884 LiveRegUnits &UsedInBetween,
1885 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1886 const TargetRegisterInfo *TRI) {
1887 if (FirstLoad.isPseudo())
1888 return false;
1889
1890 UsedInBetween.accumulate(MI: FirstLoad);
1891 auto RegToRename = getLdStRegOp(MI&: FirstLoad).getReg();
1892 bool Success = std::all_of(
1893 first: FirstLoad.getIterator(), last: SecondLoad.getIterator(),
1894 pred: [&](MachineInstr &MI) {
1895 LLVM_DEBUG(dbgs() << "Checking " << MI);
1896 // Currently we do not try to rename across frame-setup instructions.
1897 if (MI.getFlag(Flag: MachineInstr::FrameSetup)) {
1898 LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1899 << "currently\n");
1900 return false;
1901 }
1902
1903 for (auto &MOP : MI.operands()) {
1904 if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1905 !TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
1906 continue;
1907 if (!canRenameMOP(MOP, TRI)) {
1908 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1909 return false;
1910 }
1911 RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1912 }
1913
1914 return true;
1915 });
1916 return Success;
1917}
1918
// Check if we can find a physical register for renaming \p Reg. This register
// must:
// * not be defined already in \p DefinedInBB; DefinedInBB must contain all
//   defined registers up to the point where the renamed register will be used,
// * not used in \p UsedInBetween; UsedInBetween must contain all accessed
//   registers in the range the rename register will be used,
// * is available in all used register classes (checked using RequiredClasses).
// On success the chosen register is also marked defined in DefinedInBB so a
// subsequent query cannot pick it again.
static std::optional<MCPhysReg> tryToFindRegisterToRename(
    const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
    LiveRegUnits &UsedInBetween,
    SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
    const TargetRegisterInfo *TRI) {
  const MachineRegisterInfo &RegInfo = MF.getRegInfo();

  // Checks if any sub- or super-register of PR is callee saved.
  // Callee-saved registers are skipped: using one would require save/restore
  // bookkeeping this pass does not perform.
  auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
    return any_of(Range: TRI->sub_and_superregs_inclusive(Reg: PR),
                  P: [&MF, TRI](MCPhysReg SubOrSuper) {
                    return TRI->isCalleeSavedPhysReg(PhysReg: SubOrSuper, MF);
                  });
  };

  // Check if PR or one of its sub- or super-registers can be used for all
  // required register classes.
  auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
    return all_of(Range&: RequiredClasses, P: [PR, TRI](const TargetRegisterClass *C) {
      return any_of(
          Range: TRI->sub_and_superregs_inclusive(Reg: PR),
          P: [C](MCPhysReg SubOrSuper) { return C->contains(Reg: SubOrSuper); });
    });
  };

  // Scan the candidates of Reg's own minimal class in order and take the
  // first register that satisfies all constraints.
  auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
  for (const MCPhysReg &PR : *RegClass) {
    if (DefinedInBB.available(Reg: PR) && UsedInBetween.available(Reg: PR) &&
        !RegInfo.isReserved(PhysReg: PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
        CanBeUsedForAllClasses(PR)) {
      DefinedInBB.addReg(Reg: PR);
      LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
                        << "\n");
      return {PR};
    }
  }
  LLVM_DEBUG(dbgs() << "No rename register found from "
                    << TRI->getRegClassName(RegClass) << "\n");
  return std::nullopt;
}
1966
1967// For store pairs: returns a register from FirstMI to the beginning of the
1968// block that can be renamed.
1969// For load pairs: returns a register from FirstMI to MI that can be renamed.
1970static std::optional<MCPhysReg> findRenameRegForSameLdStRegPair(
1971 std::optional<bool> MaybeCanRename, MachineInstr &FirstMI, MachineInstr &MI,
1972 Register Reg, LiveRegUnits &DefinedInBB, LiveRegUnits &UsedInBetween,
1973 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1974 const TargetRegisterInfo *TRI) {
1975 std::optional<MCPhysReg> RenameReg;
1976 if (!DebugCounter::shouldExecute(Counter&: RegRenamingCounter))
1977 return RenameReg;
1978
1979 auto *RegClass = TRI->getMinimalPhysRegClass(Reg: getLdStRegOp(MI&: FirstMI).getReg());
1980 MachineFunction &MF = *FirstMI.getParent()->getParent();
1981 if (!RegClass || !MF.getRegInfo().tracksLiveness())
1982 return RenameReg;
1983
1984 const bool IsLoad = FirstMI.mayLoad();
1985
1986 if (!MaybeCanRename) {
1987 if (IsLoad)
1988 MaybeCanRename = {canRenameUntilSecondLoad(FirstLoad&: FirstMI, SecondLoad&: MI, UsedInBetween,
1989 RequiredClasses, TRI)};
1990 else
1991 MaybeCanRename = {
1992 canRenameUpToDef(FirstMI, UsedInBetween, RequiredClasses, TRI)};
1993 }
1994
1995 if (*MaybeCanRename) {
1996 RenameReg = tryToFindRegisterToRename(MF, Reg, DefinedInBB, UsedInBetween,
1997 RequiredClasses, TRI);
1998 }
1999 return RenameReg;
2000}
2001
/// Scan the instructions looking for a load/store that can be combined with the
/// current instruction into a wider equivalent or a load/store pair.
///
/// Scans forward from \p I up to \p Limit non-transient instructions. Returns
/// the iterator of the matching instruction, or the block end iterator if no
/// match is found. \p Flags is updated with the pairing decision (sext index,
/// merge direction, and an optional rename register). When \p FindNarrowMerge
/// is set, the search looks for a narrow-store merge instead of a pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit,
                                      bool FindNarrowMerge) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  // Remembers the best candidate found that requires register renaming, used
  // only if no rename-free match is found first.
  MachineBasicBlock::iterator MBBIWithRenameReg;
  MachineInstr &FirstMI = *I;
  MBBI = next_nodbg(It: MBBI, End: E);

  bool MayLoad = FirstMI.mayLoad();
  bool IsUnscaled = TII->hasUnscaledLdStOffset(MI&: FirstMI);
  Register Reg = getLdStRegOp(MI&: FirstMI).getReg();
  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: FirstMI).getReg();
  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: FirstMI).getImm();
  int OffsetStride = IsUnscaled ? TII->getMemScale(MI: FirstMI) : 1;
  bool IsPromotableZeroStore = isPromotableZeroStoreInst(MI&: FirstMI);

  // std::nullopt means "not computed yet"; {false} disables renaming outright.
  std::optional<bool> MaybeCanRename;
  if (!EnableRenaming)
    MaybeCanRename = {false};

  SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
  LiveRegUnits UsedInBetween;
  UsedInBetween.init(TRI: *TRI);

  Flags.clearRenameReg();

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  LLVM_DEBUG(dbgs() << "Find match for: "; FirstMI.dump());
  for (unsigned Count = 0; MBBI != E && Count < Limit;
       MBBI = next_nodbg(It: MBBI, End: E)) {
    MachineInstr &MI = *MBBI;
    LLVM_DEBUG(dbgs() << "Analysing 2nd insn: "; MI.dump());

    UsedInBetween.accumulate(MI);

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    Flags.setSExtIdx(-1);
    if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
        AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
      assert(MI.mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
      int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
      bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
      if (IsUnscaled != MIIsUnscaled) {
        // We're trying to pair instructions that differ in how they are scaled.
        // If FirstMI is scaled then scale the offset of MI accordingly.
        // Otherwise, do the opposite (i.e., make MI's offset unscaled).
        int MemSize = TII->getMemScale(MI);
        if (MIIsUnscaled) {
          // If the unscaled offset isn't a multiple of the MemSize, we can't
          // pair the operations together: bail and keep looking.
          if (MIOffset % MemSize) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(Elt: &MI);
            continue;
          }
          MIOffset /= MemSize;
        } else {
          MIOffset *= MemSize;
        }
      }

      bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);

      if (BaseReg == MIBaseReg) {
        // If the offset of the second ld/st is not equal to the size of the
        // destination register it can't be paired with a pre-index ld/st
        // pair. Additionally if the base reg is used or modified the operations
        // can't be paired: bail and keep looking.
        if (IsPreLdSt) {
          bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
          bool IsBaseRegUsed = !UsedRegUnits.available(
              Reg: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          bool IsBaseRegModified = !ModifiedRegUnits.available(
              Reg: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          // If the stored value and the address of the second instruction is
          // the same, it needs to be using the updated register and therefore
          // it must not be folded.
          bool IsMIRegTheSame =
              TRI->regsOverlap(RegA: getLdStRegOp(MI).getReg(),
                               RegB: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
              IsMIRegTheSame) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(Elt: &MI);
            continue;
          }
        } else {
          // Non-pre-indexed candidates must be exactly adjacent (offset
          // differs by one stride in either direction).
          if ((Offset != MIOffset + OffsetStride) &&
              (Offset + OffsetStride != MIOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(Elt: &MI);
            continue;
          }
        }

        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        if (FindNarrowMerge) {
          // If the alignment requirements of the scaled wide load/store
          // instruction can't express the offset of the scaled narrow input,
          // bail and keep looking. For promotable zero stores, allow only when
          // the stored value is the same (i.e., WZR).
          if ((!IsUnscaled && alignTo(Num: MinOffset, PowOf2: 2) != MinOffset) ||
              (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(Elt: &MI);
            continue;
          }
        } else {
          // Pairwise instructions have a 7-bit signed offset field. Single
          // insns have a 12-bit unsigned offset field. If the resultant
          // immediate offset of merging these instructions is out of range for
          // a pairwise instruction, bail and keep looking.
          if (!inBoundsForPair(IsUnscaled, Offset: MinOffset, OffsetStride)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(Elt: &MI);
            LLVM_DEBUG(dbgs() << "Offset doesn't fit in immediate, "
                              << "keep looking.\n");
            continue;
          }
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(Num: MinOffset, PowOf2: OffsetStride) != MinOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(Elt: &MI);
            LLVM_DEBUG(dbgs()
                       << "Offset doesn't fit due to alignment requirements, "
                       << "keep looking.\n");
            continue;
          }
        }

        // If the BaseReg has been modified, then we cannot do the optimization.
        // For example, in the following pattern
        //   ldr x1 [x2]
        //   ldr x2 [x3]
        //   ldr x4 [x2, #8],
        // the first and third ldr cannot be converted to ldp x1, x4, [x2]
        if (!ModifiedRegUnits.available(Reg: BaseReg))
          return E;

        const bool SameLoadReg = MayLoad && TRI->isSuperOrSubRegisterEq(
                                                RegA: Reg, RegB: getLdStRegOp(MI).getReg());

        // If the Rt of the second instruction (destination register of the
        // load) was not modified or used between the two instructions and none
        // of the instructions between the second and first alias with the
        // second, we can combine the second into the first.
        bool RtNotModified =
            ModifiedRegUnits.available(Reg: getLdStRegOp(MI).getReg());
        bool RtNotUsed = !(MI.mayLoad() && !SameLoadReg &&
                           !UsedRegUnits.available(Reg: getLdStRegOp(MI).getReg()));

        LLVM_DEBUG(dbgs() << "Checking, can combine 2nd into 1st insn:\n"
                          << "Reg '" << getLdStRegOp(MI) << "' not modified: "
                          << (RtNotModified ? "true" : "false") << "\n"
                          << "Reg '" << getLdStRegOp(MI) << "' not used: "
                          << (RtNotUsed ? "true" : "false") << "\n");

        if (RtNotModified && RtNotUsed && !mayAlias(MIa&: MI, MemInsns, AA)) {
          // For pairs loading into the same reg, try to find a renaming
          // opportunity to allow the renaming of Reg between FirstMI and MI
          // and combine MI into FirstMI; otherwise bail and keep looking.
          if (SameLoadReg) {
            std::optional<MCPhysReg> RenameReg =
                findRenameRegForSameLdStRegPair(MaybeCanRename, FirstMI, MI,
                                                Reg, DefinedInBB, UsedInBetween,
                                                RequiredClasses, TRI);
            if (!RenameReg) {
              LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                                UsedRegUnits, TRI);
              MemInsns.push_back(Elt: &MI);
              LLVM_DEBUG(dbgs() << "Can't find reg for renaming, "
                                << "keep looking.\n");
              continue;
            }
            Flags.setRenameReg(*RenameReg);
          }

          Flags.setMergeForward(false);
          if (!SameLoadReg)
            Flags.clearRenameReg();
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between the
        // first and the second alias with the first, we can combine the first
        // into the second.
        RtNotModified = !(
            MayLoad && !UsedRegUnits.available(Reg: getLdStRegOp(MI&: FirstMI).getReg()));

        LLVM_DEBUG(dbgs() << "Checking, can combine 1st into 2nd insn:\n"
                          << "Reg '" << getLdStRegOp(FirstMI)
                          << "' not modified: "
                          << (RtNotModified ? "true" : "false") << "\n");

        if (RtNotModified && !mayAlias(MIa&: FirstMI, MemInsns, AA)) {
          if (ModifiedRegUnits.available(Reg: getLdStRegOp(MI&: FirstMI).getReg())) {
            Flags.setMergeForward(true);
            Flags.clearRenameReg();
            return MBBI;
          }

          // FirstMI's Rt was modified in between: only mergeable if it can be
          // renamed. Record the candidate but keep scanning for a cheaper,
          // rename-free match.
          std::optional<MCPhysReg> RenameReg = findRenameRegForSameLdStRegPair(
              MaybeCanRename, FirstMI, MI, Reg, DefinedInBB, UsedInBetween,
              RequiredClasses, TRI);
          if (RenameReg) {
            Flags.setMergeForward(true);
            Flags.setRenameReg(*RenameReg);
            MBBIWithRenameReg = MBBI;
          }
        }
        LLVM_DEBUG(dbgs() << "Unable to combine these instructions due to "
                          << "interference in between, keep looking.\n");
      }
    }

    // Fall back to the best rename-requiring candidate recorded above.
    if (Flags.getRenameReg())
      return MBBIWithRenameReg;

    // If the instruction wasn't a matching load or store.  Stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI.isCall()) {
      LLVM_DEBUG(dbgs() << "Found a call, stop looking.\n");
      return E;
    }

    // Update modified / uses register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(Reg: BaseReg)) {
      LLVM_DEBUG(dbgs() << "Base reg is modified, stop looking.\n");
      return E;
    }

    // Update list of instructions that read/write memory.
    if (MI.mayLoadOrStore())
      MemInsns.push_back(Elt: &MI);
  }
  return E;
}
2274
2275static MachineBasicBlock::iterator
2276maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
2277 assert((MI.getOpcode() == AArch64::SUBXri ||
2278 MI.getOpcode() == AArch64::ADDXri) &&
2279 "Expected a register update instruction");
2280 auto End = MI.getParent()->end();
2281 if (MaybeCFI == End ||
2282 MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
2283 !(MI.getFlag(Flag: MachineInstr::FrameSetup) ||
2284 MI.getFlag(Flag: MachineInstr::FrameDestroy)) ||
2285 MI.getOperand(i: 0).getReg() != AArch64::SP)
2286 return End;
2287
2288 const MachineFunction &MF = *MI.getParent()->getParent();
2289 unsigned CFIIndex = MaybeCFI->getOperand(i: 0).getCFIIndex();
2290 const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
2291 switch (CFI.getOperation()) {
2292 case MCCFIInstruction::OpDefCfa:
2293 case MCCFIInstruction::OpDefCfaOffset:
2294 return MaybeCFI;
2295 default:
2296 return End;
2297 }
2298}
2299
// Fold the base-register update \p Update (ADDXri/SUBXri) into the load/store
// at \p I, producing a pre-indexed (\p IsPreIdx true) or post-indexed
// instruction. \p IsForward indicates Update precedes I in the block; in that
// case \p MergeEither allows emitting the merged instruction at Update's
// position when a CFA-related CFI would otherwise have to be moved. Returns
// the iterator following the merged instruction, or std::nullopt if the
// merge had to be abandoned to avoid reordering CFI.
std::optional<MachineBasicBlock::iterator> AArch64LoadStoreOpt::mergeUpdateInsn(
    MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update,
    bool IsForward, bool IsPreIdx, bool MergeEither) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);

  // If updating the SP and the following instruction is CFA offset related CFI,
  // make sure the CFI follows the SP update either by merging at the location
  // of the update or by moving the CFI after the merged instruction. If unable
  // to do so, bail.
  MachineBasicBlock::iterator InsertPt = I;
  if (IsForward) {
    assert(IsPreIdx);
    if (auto CFI = maybeMoveCFI(MI: *Update, MaybeCFI: next_nodbg(It: Update, End: E)); CFI != E) {
      if (MergeEither) {
        // Emit the merged instruction where Update was, so the CFI already
        // follows it and nothing needs to move.
        InsertPt = Update;
      } else {
        // Take care not to reorder CFIs.
        if (std::any_of(first: std::next(x: CFI), last: I, pred: [](const auto &Insn) {
              return Insn.getOpcode() == TargetOpcode::CFI_INSTRUCTION;
            }))
          return std::nullopt;

        // Move the CFI down so it still follows the (now later) SP update.
        MachineBasicBlock *MBB = InsertPt->getParent();
        MBB->splice(Where: std::next(x: InsertPt), Other: MBB, From: CFI);
      }
    }
  }

  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (NextI == Update)
    NextI = next_nodbg(It: NextI, End: E);

  int Value = Update->getOperand(i: 2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(Opc: I->getOpcode())
                             : getPostIndexedOpcode(Opc: I->getOpcode());
  MachineInstrBuilder MIB;
  int Scale, MinOffset, MaxOffset;
  getPrePostIndexedMemOpInfo(MI: *I, Scale, MinOffset, MaxOffset);
  if (!AArch64InstrInfo::isPairedLdSt(MI: *I)) {
    // Non-paired instruction.
    MIB = BuildMI(BB&: *InsertPt->getParent(), I: InsertPt, MIMD: InsertPt->getDebugLoc(),
                  MCID: TII->get(Opcode: NewOpc))
              .add(MO: Update->getOperand(i: 0))
              .add(MO: getLdStRegOp(MI&: *I))
              .add(MO: AArch64InstrInfo::getLdStBaseOp(MI: *I))
              .addImm(Val: Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(Other: *Update));
  } else {
    // Paired instruction.
    MIB = BuildMI(BB&: *InsertPt->getParent(), I: InsertPt, MIMD: InsertPt->getDebugLoc(),
                  MCID: TII->get(Opcode: NewOpc))
              .add(MO: Update->getOperand(i: 0))
              .add(MO: getLdStRegOp(MI&: *I, PairedRegOp: 0))
              .add(MO: getLdStRegOp(MI&: *I, PairedRegOp: 1))
              .add(MO: AArch64InstrInfo::getLdStBaseOp(MI: *I))
              .addImm(Val: Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(Other: *Update));
  }

  if (IsPreIdx) {
    ++NumPreFolded;
    LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
  } else {
    ++NumPostFolded;
    LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
  }
  LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Update->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}
2393
// Fold a large constant offset (built by a MOVZWi/MOVKWi pair, with \p Update
// being the MOVKWi) into the load/store at \p I. The offset is split into a
// high part, added to the base register with an ADDXri (shifted-by-12
// immediate), and a low part that becomes the scaled 12-bit immediate of the
// rewritten memory instruction. Returns the iterator following the original
// memory instruction.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
                                          MachineBasicBlock::iterator Update,
                                          unsigned Offset, int Scale) {
  assert((Update->getOpcode() == AArch64::MOVKWi) &&
         "Unexpected const mov instruction to merge!");
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
  // PrevI is the MOVZWi paired with Update (guaranteed by
  // isMatchingMovConstInsn).
  // NOTE(review): the second argument to prev_nodbg here is the block *end*
  // iterator rather than begin() — looks suspicious as a Begin bound; confirm
  // against the prev_nodbg contract.
  MachineBasicBlock::iterator PrevI = prev_nodbg(It: Update, Begin: E);
  MachineInstr &MemMI = *I;
  // Low keeps the bits expressible as the memory instruction's scaled 12-bit
  // immediate; High is the remainder that goes into the ADDXri.
  unsigned Mask = (1 << 12) * Scale - 1;
  unsigned Low = Offset & Mask;
  unsigned High = Offset - Low;
  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
  Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getReg();
  MachineInstrBuilder AddMIB, MemMIB;

  // Add IndexReg, BaseReg, High (the BaseReg may be SP)
  AddMIB =
      BuildMI(BB&: *I->getParent(), I, MIMD: I->getDebugLoc(), MCID: TII->get(Opcode: AArch64::ADDXri))
          .addDef(RegNo: IndexReg)
          .addUse(RegNo: BaseReg)
          .addImm(Val: High >> 12) // shifted value
          .addImm(Val: 12);        // shift 12
  (void)AddMIB;
  // Ld/St DestReg, IndexReg, Imm12
  unsigned NewOpc = getBaseAddressOpcode(Opc: I->getOpcode());
  MemMIB = BuildMI(BB&: *I->getParent(), I, MIMD: I->getDebugLoc(), MCID: TII->get(Opcode: NewOpc))
               .add(MO: getLdStRegOp(MI&: MemMI))
               .add(MO: AArch64InstrInfo::getLdStOffsetOp(MI: MemMI))
               .addImm(Val: Low / Scale)
               .setMemRefs(I->memoperands())
               .setMIFlags(I->mergeFlagsWith(Other: *Update));
  (void)MemMIB;

  ++NumConstOffsetFolded;
  LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
  LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
  LLVM_DEBUG(PrevI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Update->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  PrevI->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}
2450
// Return true if \p MI is an ADDXri/SUBXri that updates \p BaseReg (same
// source and destination) by an amount that can be folded into a pre-/post-
// indexed form of the memory instruction \p MemMI. A non-zero \p Offset
// additionally requires the update amount to match it exactly; Offset == 0
// accepts any in-range multiple of the memory scale.
bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
                                               MachineInstr &MI,
                                               unsigned BaseReg, int Offset) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI.getOperand(i: 2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(Imm: MI.getOperand(i: 3).getImm()))
      break;

    // The update instruction source and destination register must be the
    // same as the load/store base register.
    if (MI.getOperand(i: 0).getReg() != BaseReg ||
        MI.getOperand(i: 1).getReg() != BaseReg)
      break;

    // Normalize SUB to a negative ADD amount.
    int UpdateOffset = MI.getOperand(i: 2).getImm();
    if (MI.getOpcode() == AArch64::SUBXri)
      UpdateOffset = -UpdateOffset;

    // The immediate must be a multiple of the scaling factor of the pre/post
    // indexed instruction.
    int Scale, MinOffset, MaxOffset;
    getPrePostIndexedMemOpInfo(MI: MemMI, Scale, MinOffset, MaxOffset);
    if (UpdateOffset % Scale != 0)
      break;

    // Scaled offset must fit in the instruction immediate.
    int ScaledOffset = UpdateOffset / Scale;
    if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
      break;

    // If we have a non-zero Offset, we check that it matches the amount
    // we're adding to the register.
    if (!Offset || Offset == UpdateOffset)
      return true;
    break;
  }
  return false;
}
2497
// Return true if \p MI is a MOVKWi that, together with the immediately
// preceding MOVZWi writing the same register, materializes a large constant
// offset for the register-offset load/store \p MemMI whose index register is
// \p IndexReg. On success \p Offset is set to the combined constant.
bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
                                                 MachineInstr &MI,
                                                 unsigned IndexReg,
                                                 unsigned &Offset) {
  // The update instruction source and destination register must be the
  // same as the load/store index register.
  if (MI.getOpcode() == AArch64::MOVKWi &&
      TRI->isSuperOrSubRegisterEq(RegA: IndexReg, RegB: MI.getOperand(i: 1).getReg())) {

    // movz + movk hold a large offset of a Ld/St instruction.
    MachineBasicBlock::iterator B = MI.getParent()->begin();
    MachineBasicBlock::iterator MBBI = &MI;
    // Skip the scene when the MI is the first instruction of a block.
    if (MBBI == B)
      return false;
    MBBI = prev_nodbg(It: MBBI, Begin: B);
    MachineInstr &MovzMI = *MBBI;
    // Make sure the MOVKWi and MOVZWi set the same register.
    if (MovzMI.getOpcode() == AArch64::MOVZWi &&
        MovzMI.getOperand(i: 0).getReg() == MI.getOperand(i: 0).getReg()) {
      // Combine the MOVZ low half with the MOVK half shifted into place.
      unsigned Low = MovzMI.getOperand(i: 1).getImm();
      unsigned High = MI.getOperand(i: 2).getImm() << MI.getOperand(i: 3).getImm();
      Offset = High + Low;
      // 12-bit optionally shifted immediates are legal for adds.
      // NOTE(review): the >> 24 bound limits the combined offset to 24 bits so
      // the high part fits an ADD imm12-lsl12 — confirm against
      // mergeConstOffsetInsn's High/Low split.
      return Offset >> 24 == 0;
    }
  }
  return false;
}
2527
// Scan forward from the load/store I for an add/sub that post-increments its
// base register by UnscaledOffset, looking at no more than Limit
// non-transient instructions. When register liveness is tracked, the search
// may continue into a unique successor block along which the base register
// stays live. Returns the update instruction, or the block's end iterator if
// no suitable update is found.
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;

  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
  int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getImm() *
                         TII->getMemScale(MI: MemMI);

  // Scan forward looking for post-index opportunities. Updating instructions
  // can't be formed if the memory instruction doesn't have the offset we're
  // looking for.
  if (MIUnscaledOffset != UnscaledOffset)
    return E;

  // If the base register overlaps a source/destination register, we can't
  // merge the update. This does not apply to tag store instructions which
  // ignore the address part of the source register.
  // This does not apply to STGPi as well, which does not have unpredictable
  // behavior in this case unlike normal stores, and always performs writeback
  // after reading the source register value.
  if (!isTagStore(MI: MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MI: MemMI);
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
      Register DestReg = getLdStRegOp(MI&: MemMI, PairedRegOp: i).getReg();
      if (DestReg == BaseReg || TRI->isSubRegister(RegA: BaseReg, RegB: DestReg))
        return E;
    }
  }

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  MBBI = next_nodbg(It: MBBI, End: E);

  // We can't post-increment the stack pointer if any instruction between
  // the memory access (I) and the increment (MBBI) can access the memory
  // region defined by [SP, MBBI].
  const bool BaseRegSP = BaseReg == AArch64::SP;
  if (BaseRegSP && needsWinCFI(MF: I->getMF())) {
    // FIXME: For now, we always block the optimization over SP in windows
    // targets as it requires to adjust the unwind/debug info, messing up
    // the unwind info can actually cause a miscompile.
    return E;
  }

  unsigned Count = 0;
  MachineBasicBlock *CurMBB = I->getParent();
  // choice of next block to visit is liveins-based
  bool VisitSucc = CurMBB->getParent()->getRegInfo().tracksLiveness();

  while (true) {
    for (MachineBasicBlock::iterator CurEnd = CurMBB->end();
         MBBI != CurEnd && Count < Limit; MBBI = next_nodbg(It: MBBI, End: CurEnd)) {
      MachineInstr &MI = *MBBI;

      // Don't count transient instructions towards the search limit since there
      // may be different numbers of them if e.g. debug information is present.
      if (!MI.isTransient())
        ++Count;

      // If we found a match, return it.
      if (isMatchingUpdateInsn(MemMI&: *I, MI, BaseReg, Offset: UnscaledOffset))
        return MBBI;

      // Update the status of what the instruction clobbered and used.
      LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
                                        TRI);

      // Otherwise, if the base register is used or modified, we have no match,
      // so return early. If we are optimizing SP, do not allow instructions
      // that may load or store in between the load and the optimized value
      // update.
      if (!ModifiedRegUnits.available(Reg: BaseReg) ||
          !UsedRegUnits.available(Reg: BaseReg) ||
          (BaseRegSP && MBBI->mayLoadOrStore()))
        return E;
    }

    if (!VisitSucc || Limit <= Count)
      break;

    // Try to go downward to successors along a CF path w/o side enters
    // such that BaseReg is alive along it but not at its exits
    MachineBasicBlock *SuccToVisit = nullptr;
    unsigned LiveSuccCount = 0;
    for (MachineBasicBlock *Succ : CurMBB->successors()) {
      for (MCRegAliasIterator AI(BaseReg, TRI, true); AI.isValid(); ++AI) {
        if (Succ->isLiveIn(Reg: *AI)) {
          // More than one successor keeps BaseReg live: ambiguous, give up.
          if (LiveSuccCount++)
            return E;
          // Only descend into a successor with a single predecessor so no
          // other path can reach the update without passing the memory insn.
          if (Succ->pred_size() == 1)
            SuccToVisit = Succ;
          break;
        }
      }
    }
    if (!SuccToVisit)
      break;
    CurMBB = SuccToVisit;
    MBBI = CurMBB->begin();
  }

  return E;
}
2635
// Scan backwards from the zero-offset load/store I for an add/sub of the
// base register that can be folded in as a pre-index writeback, looking at
// no more than Limit non-transient instructions. MergeEither is set to false
// when the combined instruction may only be placed at the position of the
// memory instruction (not at the update's position). Returns the update
// instruction, or the block's end iterator on failure.
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit, bool &MergeEither) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;
  MachineFunction &MF = *MemMI.getMF();

  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getImm();

  bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MI: MemMI);
  Register DestReg[] = {getLdStRegOp(MI&: MemMI, PairedRegOp: 0).getReg(),
                        IsPairedInsn ? getLdStRegOp(MI&: MemMI, PairedRegOp: 1).getReg()
                                     : AArch64::NoRegister};

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;
  // If the base register overlaps a destination register, we can't
  // merge the update.
  if (!isTagStore(MI: MemMI)) {
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i)
      if (DestReg[i] == BaseReg || TRI->isSubRegister(RegA: BaseReg, RegB: DestReg[i]))
        return E;
  }

  const bool BaseRegSP = BaseReg == AArch64::SP;
  if (BaseRegSP && needsWinCFI(MF: I->getMF())) {
    // FIXME: For now, we always block the optimization over SP in windows
    // targets as it requires to adjust the unwind/debug info, messing up
    // the unwind info can actually cause a miscompile.
    return E;
  }

  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  unsigned RedZoneSize =
      Subtarget.getTargetLowering()->getRedZoneSize(F: MF.getFunction());

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  unsigned Count = 0;
  bool MemAccessBeforeSPPreInc = false;
  // Assume either placement is legal until the scan proves otherwise.
  MergeEither = true;
  do {
    MBBI = prev_nodbg(It: MBBI, Begin: B);
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(MemMI&: *I, MI, BaseReg, Offset)) {
      // Check that the update value is within our red zone limit (which may be
      // zero).
      if (MemAccessBeforeSPPreInc && MBBI->getOperand(i: 2).getImm() > RedZoneSize)
        return E;
      return MBBI;
    }

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is used or modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(Reg: BaseReg) ||
        !UsedRegUnits.available(Reg: BaseReg))
      return E;

    // If we have a destination register (i.e. a load instruction) and a
    // destination register is used or modified, then we can only merge forward,
    // i.e. the combined instruction is put in the place of the memory
    // instruction. Same applies if we see a memory access or side effects.
    if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects() ||
        (DestReg[0] != AArch64::NoRegister &&
         !(ModifiedRegUnits.available(Reg: DestReg[0]) &&
           UsedRegUnits.available(Reg: DestReg[0]))) ||
        (DestReg[1] != AArch64::NoRegister &&
         !(ModifiedRegUnits.available(Reg: DestReg[1]) &&
           UsedRegUnits.available(Reg: DestReg[1]))))
      MergeEither = false;

    // Keep track if we have a memory access before an SP pre-increment, in this
    // case we need to validate later that the update amount respects the red
    // zone.
    if (BaseRegSP && MBBI->mayLoadOrStore())
      MemAccessBeforeSPPreInc = true;
  } while (MBBI != B && Count < Limit);
  return E;
}
2731
// Scan backwards from the register-offset load/store I for a movz/movk pair
// that materializes its (killed, unshifted) index register as a constant,
// looking at no more than Limit non-transient instructions. On success,
// Offset holds the constant and the MOVK instruction is returned; otherwise
// the block's end iterator is returned.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
    MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;

  // If the load is the first instruction in the block, there's obviously
  // not any matching load or store.
  if (MBBI == B)
    return E;

  // Make sure the IndexReg is killed and the shift amount is zero.
  // TODO: Relax this restriction to extend, simplify processing now.
  if (!AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).isKill() ||
      !AArch64InstrInfo::getLdStAmountOp(MI: MemMI).isImm() ||
      (AArch64InstrInfo::getLdStAmountOp(MI: MemMI).getImm() != 0))
    return E;

  Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getReg();

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  unsigned Count = 0;
  do {
    MBBI = prev_nodbg(It: MBBI, Begin: B);
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingMovConstInsn(MemMI&: *I, MI, IndexReg, Offset)) {
      return MBBI;
    }

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the index register is used or modified, we have no match,
    // so return early.
    if (!ModifiedRegUnits.available(Reg: IndexReg) ||
        !UsedRegUnits.available(Reg: IndexReg))
      return E;

  } while (MBBI != B && Count < Limit);
  return E;
}
2785
2786bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
2787 MachineBasicBlock::iterator &MBBI) {
2788 MachineInstr &MI = *MBBI;
2789 // If this is a volatile load, don't mess with it.
2790 if (MI.hasOrderedMemoryRef())
2791 return false;
2792
2793 if (needsWinCFI(MF: MI.getMF()) && MI.getFlag(Flag: MachineInstr::FrameDestroy))
2794 return false;
2795
2796 // Make sure this is a reg+imm.
2797 // FIXME: It is possible to extend it to handle reg+reg cases.
2798 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
2799 return false;
2800
2801 // Look backward up to LdStLimit instructions.
2802 MachineBasicBlock::iterator StoreI;
2803 if (findMatchingStore(I: MBBI, Limit: LdStLimit, StoreI)) {
2804 ++NumLoadsFromStoresPromoted;
2805 // Promote the load. Keeping the iterator straight is a
2806 // pain, so we let the merge routine tell us what the next instruction
2807 // is after it's done mucking about.
2808 MBBI = promoteLoadFromStore(LoadI: MBBI, StoreI);
2809 return true;
2810 }
2811 return false;
2812}
2813
2814// Merge adjacent zero stores into a wider store.
2815bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
2816 MachineBasicBlock::iterator &MBBI) {
2817 assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
2818 MachineInstr &MI = *MBBI;
2819 MachineBasicBlock::iterator E = MI.getParent()->end();
2820
2821 if (!TII->isCandidateToMergeOrPair(MI))
2822 return false;
2823
2824 // Look ahead up to LdStLimit instructions for a mergeable instruction.
2825 LdStPairFlags Flags;
2826 MachineBasicBlock::iterator MergeMI =
2827 findMatchingInsn(I: MBBI, Flags, Limit: LdStLimit, /* FindNarrowMerge = */ true);
2828 if (MergeMI != E) {
2829 ++NumZeroStoresPromoted;
2830
2831 // Keeping the iterator straight is a pain, so we let the merge routine tell
2832 // us what the next instruction is after it's done mucking about.
2833 MBBI = mergeNarrowZeroStores(I: MBBI, MergeMI, Flags);
2834 return true;
2835 }
2836 return false;
2837}
2838
// Find loads and stores that can be merged into a single load or store pair
// instruction. Returns true and advances MBBI past the merged pair on
// success.
bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // If disable-ldp feature is opted, do not emit ldp.
  if (MI.mayLoad() && Subtarget->hasDisableLdp())
    return false;

  // If disable-stp feature is opted, do not emit stp.
  if (MI.mayStore() && Subtarget->hasDisableStp())
    return false;

  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1)
  bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
  int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
  // Allow one more for offset.
  if (Offset > 0)
    Offset -= OffsetStride;
  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return false;

  // Look ahead up to LdStLimit instructions for a pairable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired =
      findMatchingInsn(I: MBBI, Flags, Limit: LdStLimit, /* FindNarrowMerge = */ false);
  if (Paired != E) {
    // Keeping the iterator straight is a pain, so we let the merge routine tell
    // us what the next instruction is after it's done mucking about.
    // Remember the instruction before MBBI so the liveness info for everything
    // between it and the post-merge position can be refreshed afterwards.
    auto Prev = std::prev(x: MBBI);

    // Fetch the memoperand of the load/store that is a candidate for
    // combination.
    MachineMemOperand *MemOp =
        MI.memoperands_empty() ? nullptr : MI.memoperands().front();

    // If a load/store arrives and ldp/stp-aligned-only feature is opted, check
    // that the alignment of the source pointer is at least double the alignment
    // of the type.
    if ((MI.mayLoad() && Subtarget->hasLdpAlignedOnly()) ||
        (MI.mayStore() && Subtarget->hasStpAlignedOnly())) {
      // If there is no size/align information, cancel the transformation.
      if (!MemOp || !MemOp->getMemoryType().isValid()) {
        NumFailedAlignmentCheck++;
        return false;
      }

      // Get the needed alignments to check them if
      // ldp-aligned-only/stp-aligned-only features are opted.
      uint64_t MemAlignment = MemOp->getAlign().value();
      uint64_t TypeAlignment =
          Align(MemOp->getSize().getValue().getKnownMinValue()).value();

      if (MemAlignment < 2 * TypeAlignment) {
        NumFailedAlignmentCheck++;
        return false;
      }
    }

    ++NumPairCreated;
    if (TII->hasUnscaledLdStOffset(MI))
      ++NumUnscaledPairCreated;

    MBBI = mergePairedInsns(I: MBBI, Paired, Flags);
    // Collect liveness info for instructions between Prev and the new position
    // MBBI.
    for (auto I = std::next(x: Prev); I != MBBI; I++)
      updateDefinedRegisters(MI&: *I, Units&: DefinedInBB, TRI);

    return true;
  }
  return false;
}
2919
2920bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
2921 (MachineBasicBlock::iterator &MBBI) {
2922 MachineInstr &MI = *MBBI;
2923 MachineBasicBlock::iterator E = MI.getParent()->end();
2924 MachineBasicBlock::iterator Update;
2925
2926 // Look forward to try to form a post-index instruction. For example,
2927 // ldr x0, [x20]
2928 // add x20, x20, #32
2929 // merged into:
2930 // ldr x0, [x20], #32
2931 Update = findMatchingUpdateInsnForward(I: MBBI, UnscaledOffset: 0, Limit: UpdateLimit);
2932 if (Update != E) {
2933 // Merge the update into the ld/st.
2934 if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /*IsForward=*/false,
2935 /*IsPreIdx=*/false,
2936 /*MergeEither=*/false)) {
2937 MBBI = *NextI;
2938 return true;
2939 }
2940 }
2941
2942 // Don't know how to handle unscaled pre/post-index versions below, so bail.
2943 if (TII->hasUnscaledLdStOffset(Opc: MI.getOpcode()))
2944 return false;
2945
2946 // Look back to try to find a pre-index instruction. For example,
2947 // add x0, x0, #8
2948 // ldr x1, [x0]
2949 // merged into:
2950 // ldr x1, [x0, #8]!
2951 bool MergeEither;
2952 Update = findMatchingUpdateInsnBackward(I: MBBI, Limit: UpdateLimit, MergeEither);
2953 if (Update != E) {
2954 // Merge the update into the ld/st.
2955 if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /*IsForward=*/true,
2956 /*IsPreIdx=*/true, MergeEither)) {
2957 MBBI = *NextI;
2958 return true;
2959 }
2960 }
2961
2962 // The immediate in the load/store is scaled by the size of the memory
2963 // operation. The immediate in the add we're looking for,
2964 // however, is not, so adjust here.
2965 int UnscaledOffset =
2966 AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
2967
2968 // Look forward to try to find a pre-index instruction. For example,
2969 // ldr x1, [x0, #64]
2970 // add x0, x0, #64
2971 // merged into:
2972 // ldr x1, [x0, #64]!
2973 Update = findMatchingUpdateInsnForward(I: MBBI, UnscaledOffset, Limit: UpdateLimit);
2974 if (Update != E) {
2975 // Merge the update into the ld/st.
2976 if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /*IsForward=*/false,
2977 /*IsPreIdx=*/true,
2978 /*MergeEither=*/false)) {
2979 MBBI = *NextI;
2980 return true;
2981 }
2982 }
2983
2984 return false;
2985}
2986
2987bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
2988 int Scale) {
2989 MachineInstr &MI = *MBBI;
2990 MachineBasicBlock::iterator E = MI.getParent()->end();
2991 MachineBasicBlock::iterator Update;
2992
2993 // Don't know how to handle unscaled pre/post-index versions below, so bail.
2994 if (TII->hasUnscaledLdStOffset(Opc: MI.getOpcode()))
2995 return false;
2996
2997 // Look back to try to find a const offset for index LdSt instruction. For
2998 // example,
2999 // mov x8, #LargeImm ; = a * (1<<12) + imm12
3000 // ldr x1, [x0, x8]
3001 // merged into:
3002 // add x8, x0, a * (1<<12)
3003 // ldr x1, [x8, imm12]
3004 unsigned Offset;
3005 Update = findMatchingConstOffsetBackward(I: MBBI, Limit: LdStConstLimit, Offset);
3006 if (Update != E && (Offset & (Scale - 1)) == 0) {
3007 // Merge the imm12 into the ld/st.
3008 MBBI = mergeConstOffsetInsn(I: MBBI, Update, Offset, Scale);
3009 return true;
3010 }
3011
3012 return false;
3013}
3014
// Run every peephole transformation over a single basic block. Returns true
// if anything changed. EnableNarrowZeroStOpt gates the zero-store widening,
// which may create accesses with less natural alignment.
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool EnableNarrowZeroStOpt) {
  AArch64FunctionInfo &AFI = *MBB.getParent()->getInfo<AArch64FunctionInfo>();

  bool Modified = false;
  // Five transformations to do here:
  // 1) Find loads that directly read from stores and promote them by
  //    replacing with mov instructions. If the store is wider than the load,
  //    the load will be replaced with a bitfield extract.
  //      e.g.,
  //        str w1, [x0, #4]
  //        ldrh w2, [x0, #6]
  //        ; becomes
  //        str w1, [x0, #4]
  //        lsr w2, w1, #16
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isPromotableLoadFromStore(MI&: *MBBI) && tryToPromoteLoadFromStore(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 2) Merge adjacent zero stores into a wider store.
  //      e.g.,
  //        strh wzr, [x0]
  //        strh wzr, [x0, #2]
  //        ; becomes
  //        str wzr, [x0]
  //      e.g.,
  //        str wzr, [x0]
  //        str wzr, [x0, #4]
  //        ; becomes
  //        str xzr, [x0]
  if (EnableNarrowZeroStOpt)
    for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
         MBBI != E;) {
      if (isPromotableZeroStoreInst(MI&: *MBBI) && tryToMergeZeroStInst(MBBI))
        Modified = true;
      else
        ++MBBI;
    }
  // 3) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //    When compiling for SVE 128, also try to combine SVE fill/spill
  //    instructions into LDP/STP.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]
  //      e.g.,
  //        ldr z0, [x2]
  //        ldr z1, [x2, #1, mul vl]
  //        ; becomes
  //        ldp q0, q1, [x2]

  // Seed the per-block liveness tracker with the block's live-ins; it is
  // updated instruction-by-instruction in the pairing loop below.
  if (MBB.getParent()->getRegInfo().tracksLiveness()) {
    DefinedInBB.clear();
    DefinedInBB.addLiveIns(MBB);
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    // Track currently live registers up to this point, to help with
    // searching for a rename register on demand.
    updateDefinedRegisters(MI&: *MBBI, Units&: DefinedInBB, TRI);
    if (TII->isPairableLdStInst(MI: *MBBI) && tryToPairLdStInst(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 4) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isMergeableLdStUpdate(MI&: *MBBI, AFI) && tryToMergeLdStUpdate(MBBI))
      Modified = true;
    else
      ++MBBI;
  }

  // 5) Find a register assigned with a const value that can be combined with
  //    into the load or store. e.g.,
  //        mov x8, #LargeImm ; = a * (1<<12) + imm12
  //        ldr x1, [x0, x8]
  //        ; becomes
  //        add x8, x0, a * (1<<12)
  //        ldr x1, [x8, imm12]
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    int Scale;
    if (isMergeableIndexLdSt(MI&: *MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale))
      Modified = true;
    else
      ++MBBI;
  }

  return Modified;
}
3119
3120bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
3121 Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
3122 TII = Subtarget->getInstrInfo();
3123 TRI = Subtarget->getRegisterInfo();
3124
3125 // Resize the modified and used register unit trackers. We do this once
3126 // per function and then clear the register units each time we optimize a load
3127 // or store.
3128 ModifiedRegUnits.init(TRI: *TRI);
3129 UsedRegUnits.init(TRI: *TRI);
3130 DefinedInBB.init(TRI: *TRI);
3131
3132 bool Modified = false;
3133 bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
3134 for (auto &MBB : Fn) {
3135 auto M = optimizeBlock(MBB, EnableNarrowZeroStOpt: enableNarrowZeroStOpt);
3136 Modified |= M;
3137 }
3138
3139 return Modified;
3140}
3141
3142// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
3143// stores near one another? Note: The pre-RA instruction scheduler already has
3144// hooks to try and schedule pairable loads/stores together to improve pairing
3145// opportunities. Thus, pre-RA pairing pass may not be worth the effort.
3146
3147// FIXME: When pairing store instructions it's very possible for this pass to
3148// hoist a store with a KILL marker above another use (without a KILL marker).
3149// The resulting IR is invalid, but nothing uses the KILL markers after this
3150// pass, so it's never caused a problem in practice.
3151
// Legacy pass-manager entry point: honors skipFunction (e.g. when
// optimization is disabled for the function), then delegates to the shared
// implementation with alias analysis wired in from the legacy analysis pass.
bool AArch64LoadStoreOptLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(F: MF.getFunction()))
    return false;
  AArch64LoadStoreOpt Impl;
  Impl.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  return Impl.runOnMachineFunction(Fn&: MF);
}
3159
/// createAArch64LoadStoreOptLegacyPass - returns an instance of the
/// load / store optimization pass (legacy pass manager).
FunctionPass *llvm::createAArch64LoadStoreOptLegacyPass() {
  return new AArch64LoadStoreOptLegacy();
}
3165
3166PreservedAnalyses
3167AArch64LoadStoreOptPass::run(MachineFunction &MF,
3168 MachineFunctionAnalysisManager &MFAM) {
3169 AArch64LoadStoreOpt Impl;
3170 Impl.AA = &MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(IR&: MF)
3171 .getManager()
3172 .getResult<AAManager>(IR&: MF.getFunction());
3173 bool Changed = Impl.runOnMachineFunction(Fn&: MF);
3174 if (!Changed)
3175 return PreservedAnalyses::all();
3176 PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
3177 PA.preserveSet<CFGAnalyses>();
3178 return PA;
3179}
3180