1//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains a pass that performs load / store related peephole
10// optimizations. This pass should be run after register allocation.
11//
// The pass runs after the PrologEpilogInserter, where the CFI instructions are
// emitted. To preserve the correctness of the unwind information, the pass
// must not change the relative order of any two instructions when at least one
// of them carries the FrameSetup/FrameDestroy flag, or, alternatively, it must
// apply an ad-hoc fix to the unwind information.
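//
// For example, two loads from adjacent stack slots such as
//   ldr x0, [sp, #8]
//   ldr x1, [sp, #16]
// can be rewritten as the single pair instruction
//   ldp x0, x1, [sp, #8]
// and a separate base-register update such as
//   add x2, x2, #16
//   ldr x0, [x2]
// can be folded into the pre-indexed form
//   ldr x0, [x2, #16]!
// (illustrative registers and offsets; the pass handles many more variants).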
17//
18//===----------------------------------------------------------------------===//
19
20#include "AArch64InstrInfo.h"
21#include "AArch64MachineFunctionInfo.h"
22#include "AArch64Subtarget.h"
23#include "MCTargetDesc/AArch64AddressingModes.h"
24#include "llvm/ADT/SmallVector.h"
25#include "llvm/ADT/Statistic.h"
26#include "llvm/ADT/StringRef.h"
27#include "llvm/ADT/iterator_range.h"
28#include "llvm/Analysis/AliasAnalysis.h"
29#include "llvm/CodeGen/MachineBasicBlock.h"
30#include "llvm/CodeGen/MachineFunction.h"
31#include "llvm/CodeGen/MachineFunctionPass.h"
32#include "llvm/CodeGen/MachineInstr.h"
33#include "llvm/CodeGen/MachineInstrBuilder.h"
34#include "llvm/CodeGen/MachineOperand.h"
35#include "llvm/CodeGen/MachineRegisterInfo.h"
36#include "llvm/CodeGen/TargetRegisterInfo.h"
37#include "llvm/IR/DebugLoc.h"
38#include "llvm/MC/MCAsmInfo.h"
39#include "llvm/MC/MCDwarf.h"
40#include "llvm/Pass.h"
41#include "llvm/Support/CommandLine.h"
42#include "llvm/Support/Debug.h"
43#include "llvm/Support/DebugCounter.h"
44#include "llvm/Support/ErrorHandling.h"
45#include <cassert>
46#include <cstdint>
47#include <functional>
48#include <iterator>
49#include <limits>
50#include <optional>
51
52using namespace llvm;
53
54#define DEBUG_TYPE "aarch64-ldst-opt"
55
56STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
57STATISTIC(NumPostFolded, "Number of post-index updates folded");
58STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store pairs generated from unscaled instructions");
61STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
62STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
STATISTIC(NumFailedAlignmentCheck, "Number of load/store pair transformations "
                                   "that failed the alignment check");
STATISTIC(NumConstOffsetFolded,
          "Number of constant offsets of indexed addresses folded");
67
68DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
69 "Controls which pairs are considered for renaming");
70
// The LdStLimit limits how far we search for load/store pairs.
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

// The UpdateLimit limits how far we search for update instructions when we
// form pre-/post-index instructions.
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                     cl::Hidden);

// The LdStConstLimit limits how far we search for const offset instructions
// when we form index address load/store instructions.
static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
                                        cl::init(10), cl::Hidden);

// Enable register renaming to find additional store pairing opportunities.
static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
                                    cl::init(true), cl::Hidden);
88
89#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
90
91namespace {
92
93using LdStPairFlags = struct LdStPairFlags {
94 // If a matching instruction is found, MergeForward is set to true if the
95 // merge is to remove the first instruction and replace the second with
96 // a pair-wise insn, and false if the reverse is true.
97 bool MergeForward = false;
98
99 // SExtIdx gives the index of the result of the load pair that must be
100 // extended. The value of SExtIdx assumes that the paired load produces the
101 // value in this order: (I, returned iterator), i.e., -1 means no value has
102 // to be extended, 0 means I, and 1 means the returned iterator.
103 int SExtIdx = -1;
104
105 // If not none, RenameReg can be used to rename the result register of the
106 // first store in a pair. Currently this only works when merging stores
107 // forward.
108 std::optional<MCPhysReg> RenameReg;
109
110 LdStPairFlags() = default;
111
112 void setMergeForward(bool V = true) { MergeForward = V; }
113 bool getMergeForward() const { return MergeForward; }
114
115 void setSExtIdx(int V) { SExtIdx = V; }
116 int getSExtIdx() const { return SExtIdx; }
117
118 void setRenameReg(MCPhysReg R) { RenameReg = R; }
119 void clearRenameReg() { RenameReg = std::nullopt; }
120 std::optional<MCPhysReg> getRenameReg() const { return RenameReg; }
121};
122
123struct AArch64LoadStoreOpt : public MachineFunctionPass {
124 static char ID;
125
126 AArch64LoadStoreOpt() : MachineFunctionPass(ID) {}
127
128 AliasAnalysis *AA;
129 const AArch64InstrInfo *TII;
130 const TargetRegisterInfo *TRI;
131 const AArch64Subtarget *Subtarget;
132
133 // Track which register units have been modified and used.
134 LiveRegUnits ModifiedRegUnits, UsedRegUnits;
135 LiveRegUnits DefinedInBB;
136
137 void getAnalysisUsage(AnalysisUsage &AU) const override {
138 AU.addRequired<AAResultsWrapperPass>();
139 MachineFunctionPass::getAnalysisUsage(AU);
140 }
141
142 // Scan the instructions looking for a load/store that can be combined
143 // with the current instruction into a load/store pair.
144 // Return the matching instruction if one is found, else MBB->end().
145 MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
146 LdStPairFlags &Flags,
147 unsigned Limit,
148 bool FindNarrowMerge);
149
150 // Scan the instructions looking for a store that writes to the address from
151 // which the current load instruction reads. Return true if one is found.
152 bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
153 MachineBasicBlock::iterator &StoreI);
154
  // Merge the two indicated narrow stores into a single wider store
  // instruction.
156 MachineBasicBlock::iterator
157 mergeNarrowZeroStores(MachineBasicBlock::iterator I,
158 MachineBasicBlock::iterator MergeMI,
159 const LdStPairFlags &Flags);
160
161 // Merge the two instructions indicated into a single pair-wise instruction.
162 MachineBasicBlock::iterator
163 mergePairedInsns(MachineBasicBlock::iterator I,
164 MachineBasicBlock::iterator Paired,
165 const LdStPairFlags &Flags);
166
167 // Promote the load that reads directly from the address stored to.
168 MachineBasicBlock::iterator
169 promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
170 MachineBasicBlock::iterator StoreI);
171
172 // Scan the instruction list to find a base register update that can
173 // be combined with the current instruction (a load or store) using
174 // pre or post indexed addressing with writeback. Scan forwards.
175 MachineBasicBlock::iterator
176 findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
177 int UnscaledOffset, unsigned Limit);
178
  // Scan the instruction list to find a register assigned a constant value
  // that can be folded into the current instruction (a register-offset load
  // or store) to form a base-plus-immediate access. Scan backwards.
182 MachineBasicBlock::iterator
183 findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
184 unsigned &Offset);
185
186 // Scan the instruction list to find a base register update that can
187 // be combined with the current instruction (a load or store) using
188 // pre or post indexed addressing with writeback. Scan backwards.
189 // `MergeEither` is set to true if the combined instruction may be placed
190 // either at the location of the load/store instruction or at the location of
191 // the update instruction.
192 MachineBasicBlock::iterator
193 findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit,
194 bool &MergeEither);
195
196 // Find an instruction that updates the base register of the ld/st
197 // instruction.
198 bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
199 unsigned BaseReg, int Offset);
200
201 bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
202 unsigned IndexReg, unsigned &Offset);
203
204 // Merge a pre- or post-index base register update into a ld/st instruction.
205 std::optional<MachineBasicBlock::iterator>
206 mergeUpdateInsn(MachineBasicBlock::iterator I,
207 MachineBasicBlock::iterator Update, bool IsForward,
208 bool IsPreIdx, bool MergeEither);
209
210 MachineBasicBlock::iterator
211 mergeConstOffsetInsn(MachineBasicBlock::iterator I,
212 MachineBasicBlock::iterator Update, unsigned Offset,
213 int Scale);
214
215 // Find and merge zero store instructions.
216 bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
217
218 // Find and pair ldr/str instructions.
219 bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
220
221 // Find and promote load instructions which read directly from store.
222 bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
223
  // Find and merge base register updates before or after a ld/st instruction.
225 bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
226
  // Find and merge a register-offset (index) ld/st instruction into a
  // base-plus-immediate ld/st instruction.
228 bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);
229
230 bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
231
232 bool runOnMachineFunction(MachineFunction &Fn) override;
233
234 MachineFunctionProperties getRequiredProperties() const override {
235 return MachineFunctionProperties().setNoVRegs();
236 }
237
238 StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
239};
240
241char AArch64LoadStoreOpt::ID = 0;
242
243} // end anonymous namespace
244
245INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
246 AARCH64_LOAD_STORE_OPT_NAME, false, false)
247
248static bool isNarrowStore(unsigned Opc) {
249 switch (Opc) {
250 default:
251 return false;
252 case AArch64::STRBBui:
253 case AArch64::STURBBi:
254 case AArch64::STRHHui:
255 case AArch64::STURHHi:
256 return true;
257 }
258}
259
// These instructions set the memory tag and either keep the memory contents
// unchanged or set them to zero, ignoring the address part of the source
// register.
262static bool isTagStore(const MachineInstr &MI) {
263 switch (MI.getOpcode()) {
264 default:
265 return false;
266 case AArch64::STGi:
267 case AArch64::STZGi:
268 case AArch64::ST2Gi:
269 case AArch64::STZ2Gi:
270 return true;
271 }
272}
273
274static unsigned getMatchingNonSExtOpcode(unsigned Opc,
275 bool *IsValidLdStrOpc = nullptr) {
276 if (IsValidLdStrOpc)
277 *IsValidLdStrOpc = true;
278 switch (Opc) {
279 default:
280 if (IsValidLdStrOpc)
281 *IsValidLdStrOpc = false;
282 return std::numeric_limits<unsigned>::max();
283 case AArch64::STRDui:
284 case AArch64::STURDi:
285 case AArch64::STRDpre:
286 case AArch64::STRQui:
287 case AArch64::STURQi:
288 case AArch64::STRQpre:
289 case AArch64::STRBBui:
290 case AArch64::STURBBi:
291 case AArch64::STRHHui:
292 case AArch64::STURHHi:
293 case AArch64::STRWui:
294 case AArch64::STRWpre:
295 case AArch64::STURWi:
296 case AArch64::STRXui:
297 case AArch64::STRXpre:
298 case AArch64::STURXi:
299 case AArch64::STR_ZXI:
300 case AArch64::LDRDui:
301 case AArch64::LDURDi:
302 case AArch64::LDRDpre:
303 case AArch64::LDRQui:
304 case AArch64::LDURQi:
305 case AArch64::LDRQpre:
306 case AArch64::LDRWui:
307 case AArch64::LDURWi:
308 case AArch64::LDRWpre:
309 case AArch64::LDRXui:
310 case AArch64::LDURXi:
311 case AArch64::LDRXpre:
312 case AArch64::STRSui:
313 case AArch64::STURSi:
314 case AArch64::STRSpre:
315 case AArch64::LDRSui:
316 case AArch64::LDURSi:
317 case AArch64::LDRSpre:
318 case AArch64::LDR_ZXI:
319 return Opc;
320 case AArch64::LDRSWui:
321 return AArch64::LDRWui;
322 case AArch64::LDURSWi:
323 return AArch64::LDURWi;
324 case AArch64::LDRSWpre:
325 return AArch64::LDRWpre;
326 }
327}
328
329static unsigned getMatchingWideOpcode(unsigned Opc) {
330 switch (Opc) {
331 default:
332 llvm_unreachable("Opcode has no wide equivalent!");
333 case AArch64::STRBBui:
334 return AArch64::STRHHui;
335 case AArch64::STRHHui:
336 return AArch64::STRWui;
337 case AArch64::STURBBi:
338 return AArch64::STURHHi;
339 case AArch64::STURHHi:
340 return AArch64::STURWi;
341 case AArch64::STURWi:
342 return AArch64::STURXi;
343 case AArch64::STRWui:
344 return AArch64::STRXui;
345 }
346}
347
348static unsigned getMatchingPairOpcode(unsigned Opc) {
349 switch (Opc) {
350 default:
351 llvm_unreachable("Opcode has no pairwise equivalent!");
352 case AArch64::STRSui:
353 case AArch64::STURSi:
354 return AArch64::STPSi;
355 case AArch64::STRSpre:
356 return AArch64::STPSpre;
357 case AArch64::STRDui:
358 case AArch64::STURDi:
359 return AArch64::STPDi;
360 case AArch64::STRDpre:
361 return AArch64::STPDpre;
362 case AArch64::STRQui:
363 case AArch64::STURQi:
364 case AArch64::STR_ZXI:
365 return AArch64::STPQi;
366 case AArch64::STRQpre:
367 return AArch64::STPQpre;
368 case AArch64::STRWui:
369 case AArch64::STURWi:
370 return AArch64::STPWi;
371 case AArch64::STRWpre:
372 return AArch64::STPWpre;
373 case AArch64::STRXui:
374 case AArch64::STURXi:
375 return AArch64::STPXi;
376 case AArch64::STRXpre:
377 return AArch64::STPXpre;
378 case AArch64::LDRSui:
379 case AArch64::LDURSi:
380 return AArch64::LDPSi;
381 case AArch64::LDRSpre:
382 return AArch64::LDPSpre;
383 case AArch64::LDRDui:
384 case AArch64::LDURDi:
385 return AArch64::LDPDi;
386 case AArch64::LDRDpre:
387 return AArch64::LDPDpre;
388 case AArch64::LDRQui:
389 case AArch64::LDURQi:
390 case AArch64::LDR_ZXI:
391 return AArch64::LDPQi;
392 case AArch64::LDRQpre:
393 return AArch64::LDPQpre;
394 case AArch64::LDRWui:
395 case AArch64::LDURWi:
396 return AArch64::LDPWi;
397 case AArch64::LDRWpre:
398 return AArch64::LDPWpre;
399 case AArch64::LDRXui:
400 case AArch64::LDURXi:
401 return AArch64::LDPXi;
402 case AArch64::LDRXpre:
403 return AArch64::LDPXpre;
404 case AArch64::LDRSWui:
405 case AArch64::LDURSWi:
406 return AArch64::LDPSWi;
407 case AArch64::LDRSWpre:
408 return AArch64::LDPSWpre;
409 }
410}
411
412static unsigned isMatchingStore(MachineInstr &LoadInst,
413 MachineInstr &StoreInst) {
414 unsigned LdOpc = LoadInst.getOpcode();
415 unsigned StOpc = StoreInst.getOpcode();
416 switch (LdOpc) {
417 default:
418 llvm_unreachable("Unsupported load instruction!");
419 case AArch64::LDRBBui:
420 return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
421 StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
422 case AArch64::LDURBBi:
423 return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
424 StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
425 case AArch64::LDRHHui:
426 return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
427 StOpc == AArch64::STRXui;
428 case AArch64::LDURHHi:
429 return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
430 StOpc == AArch64::STURXi;
431 case AArch64::LDRWui:
432 return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
433 case AArch64::LDURWi:
434 return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
435 case AArch64::LDRXui:
436 return StOpc == AArch64::STRXui;
437 case AArch64::LDURXi:
438 return StOpc == AArch64::STURXi;
439 }
440}
441
442static unsigned getPreIndexedOpcode(unsigned Opc) {
443 // FIXME: We don't currently support creating pre-indexed loads/stores when
444 // the load or store is the unscaled version. If we decide to perform such an
445 // optimization in the future the cases for the unscaled loads/stores will
446 // need to be added here.
447 switch (Opc) {
448 default:
449 llvm_unreachable("Opcode has no pre-indexed equivalent!");
450 case AArch64::STRSui:
451 return AArch64::STRSpre;
452 case AArch64::STRDui:
453 return AArch64::STRDpre;
454 case AArch64::STRQui:
455 return AArch64::STRQpre;
456 case AArch64::STRBBui:
457 return AArch64::STRBBpre;
458 case AArch64::STRHHui:
459 return AArch64::STRHHpre;
460 case AArch64::STRWui:
461 return AArch64::STRWpre;
462 case AArch64::STRXui:
463 return AArch64::STRXpre;
464 case AArch64::LDRSui:
465 return AArch64::LDRSpre;
466 case AArch64::LDRDui:
467 return AArch64::LDRDpre;
468 case AArch64::LDRQui:
469 return AArch64::LDRQpre;
470 case AArch64::LDRBBui:
471 return AArch64::LDRBBpre;
472 case AArch64::LDRHHui:
473 return AArch64::LDRHHpre;
474 case AArch64::LDRWui:
475 return AArch64::LDRWpre;
476 case AArch64::LDRXui:
477 return AArch64::LDRXpre;
478 case AArch64::LDRSWui:
479 return AArch64::LDRSWpre;
480 case AArch64::LDPSi:
481 return AArch64::LDPSpre;
482 case AArch64::LDPSWi:
483 return AArch64::LDPSWpre;
484 case AArch64::LDPDi:
485 return AArch64::LDPDpre;
486 case AArch64::LDPQi:
487 return AArch64::LDPQpre;
488 case AArch64::LDPWi:
489 return AArch64::LDPWpre;
490 case AArch64::LDPXi:
491 return AArch64::LDPXpre;
492 case AArch64::STPSi:
493 return AArch64::STPSpre;
494 case AArch64::STPDi:
495 return AArch64::STPDpre;
496 case AArch64::STPQi:
497 return AArch64::STPQpre;
498 case AArch64::STPWi:
499 return AArch64::STPWpre;
500 case AArch64::STPXi:
501 return AArch64::STPXpre;
502 case AArch64::STGi:
503 return AArch64::STGPreIndex;
504 case AArch64::STZGi:
505 return AArch64::STZGPreIndex;
506 case AArch64::ST2Gi:
507 return AArch64::ST2GPreIndex;
508 case AArch64::STZ2Gi:
509 return AArch64::STZ2GPreIndex;
510 case AArch64::STGPi:
511 return AArch64::STGPpre;
512 }
513}
514
515static unsigned getBaseAddressOpcode(unsigned Opc) {
516 // TODO: Add more index address stores.
517 switch (Opc) {
518 default:
519 llvm_unreachable("Opcode has no base address equivalent!");
520 case AArch64::LDRBroX:
521 return AArch64::LDRBui;
522 case AArch64::LDRBBroX:
523 return AArch64::LDRBBui;
524 case AArch64::LDRSBXroX:
525 return AArch64::LDRSBXui;
526 case AArch64::LDRSBWroX:
527 return AArch64::LDRSBWui;
528 case AArch64::LDRHroX:
529 return AArch64::LDRHui;
530 case AArch64::LDRHHroX:
531 return AArch64::LDRHHui;
532 case AArch64::LDRSHXroX:
533 return AArch64::LDRSHXui;
534 case AArch64::LDRSHWroX:
535 return AArch64::LDRSHWui;
536 case AArch64::LDRWroX:
537 return AArch64::LDRWui;
538 case AArch64::LDRSroX:
539 return AArch64::LDRSui;
540 case AArch64::LDRSWroX:
541 return AArch64::LDRSWui;
542 case AArch64::LDRDroX:
543 return AArch64::LDRDui;
544 case AArch64::LDRXroX:
545 return AArch64::LDRXui;
546 case AArch64::LDRQroX:
547 return AArch64::LDRQui;
548 }
549}
550
551static unsigned getPostIndexedOpcode(unsigned Opc) {
552 switch (Opc) {
553 default:
    llvm_unreachable("Opcode has no post-indexed equivalent!");
555 case AArch64::STRSui:
556 case AArch64::STURSi:
557 return AArch64::STRSpost;
558 case AArch64::STRDui:
559 case AArch64::STURDi:
560 return AArch64::STRDpost;
561 case AArch64::STRQui:
562 case AArch64::STURQi:
563 return AArch64::STRQpost;
564 case AArch64::STRBBui:
565 return AArch64::STRBBpost;
566 case AArch64::STRHHui:
567 return AArch64::STRHHpost;
568 case AArch64::STRWui:
569 case AArch64::STURWi:
570 return AArch64::STRWpost;
571 case AArch64::STRXui:
572 case AArch64::STURXi:
573 return AArch64::STRXpost;
574 case AArch64::LDRSui:
575 case AArch64::LDURSi:
576 return AArch64::LDRSpost;
577 case AArch64::LDRDui:
578 case AArch64::LDURDi:
579 return AArch64::LDRDpost;
580 case AArch64::LDRQui:
581 case AArch64::LDURQi:
582 return AArch64::LDRQpost;
583 case AArch64::LDRBBui:
584 return AArch64::LDRBBpost;
585 case AArch64::LDRHHui:
586 return AArch64::LDRHHpost;
587 case AArch64::LDRWui:
588 case AArch64::LDURWi:
589 return AArch64::LDRWpost;
590 case AArch64::LDRXui:
591 case AArch64::LDURXi:
592 return AArch64::LDRXpost;
593 case AArch64::LDRSWui:
594 return AArch64::LDRSWpost;
595 case AArch64::LDPSi:
596 return AArch64::LDPSpost;
597 case AArch64::LDPSWi:
598 return AArch64::LDPSWpost;
599 case AArch64::LDPDi:
600 return AArch64::LDPDpost;
601 case AArch64::LDPQi:
602 return AArch64::LDPQpost;
603 case AArch64::LDPWi:
604 return AArch64::LDPWpost;
605 case AArch64::LDPXi:
606 return AArch64::LDPXpost;
607 case AArch64::STPSi:
608 return AArch64::STPSpost;
609 case AArch64::STPDi:
610 return AArch64::STPDpost;
611 case AArch64::STPQi:
612 return AArch64::STPQpost;
613 case AArch64::STPWi:
614 return AArch64::STPWpost;
615 case AArch64::STPXi:
616 return AArch64::STPXpost;
617 case AArch64::STGi:
618 return AArch64::STGPostIndex;
619 case AArch64::STZGi:
620 return AArch64::STZGPostIndex;
621 case AArch64::ST2Gi:
622 return AArch64::ST2GPostIndex;
623 case AArch64::STZ2Gi:
624 return AArch64::STZ2GPostIndex;
625 case AArch64::STGPi:
626 return AArch64::STGPpost;
627 }
628}
629
630static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
631
632 unsigned OpcA = FirstMI.getOpcode();
633 unsigned OpcB = MI.getOpcode();
634
635 switch (OpcA) {
636 default:
637 return false;
638 case AArch64::STRSpre:
639 return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
640 case AArch64::STRDpre:
641 return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
642 case AArch64::STRQpre:
643 return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
644 case AArch64::STRWpre:
645 return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
646 case AArch64::STRXpre:
647 return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
648 case AArch64::LDRSpre:
649 return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
650 case AArch64::LDRDpre:
651 return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
652 case AArch64::LDRQpre:
653 return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
654 case AArch64::LDRWpre:
655 return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
656 case AArch64::LDRXpre:
657 return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
658 case AArch64::LDRSWpre:
659 return (OpcB == AArch64::LDRSWui) || (OpcB == AArch64::LDURSWi);
660 }
661}
662
663// Returns the scale and offset range of pre/post indexed variants of MI.
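// For example, for a paired STPXi the scale is 8, so the representable
// pre/post-index immediates cover byte offsets in [-512, 504] in steps of 8,
// whereas an unpaired STRXui's pre/post-indexed forms use a scale of 1 with
// byte offsets in [-256, 255].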
664static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
665 int &MinOffset, int &MaxOffset) {
666 bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
667 bool IsTagStore = isTagStore(MI);
668 // ST*G and all paired ldst have the same scale in pre/post-indexed variants
669 // as in the "unsigned offset" variant.
670 // All other pre/post indexed ldst instructions are unscaled.
671 Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
672
673 if (IsPaired) {
674 MinOffset = -64;
675 MaxOffset = 63;
676 } else {
677 MinOffset = -256;
678 MaxOffset = 255;
679 }
680}
681
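// Returns the register operand that holds the transferred data. For paired
// forms, PairedRegOp selects Rt (0) or Rt2 (1); for pre-indexed forms the
// first operand is the write-back base register definition, so the data
// operands start one index later.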
682static MachineOperand &getLdStRegOp(MachineInstr &MI,
683 unsigned PairedRegOp = 0) {
684 assert(PairedRegOp < 2 && "Unexpected register operand idx.");
685 bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
686 if (IsPreLdSt)
687 PairedRegOp += 1;
688 unsigned Idx =
689 AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
690 return MI.getOperand(i: Idx);
691}
692
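// Returns true when the bytes read by the load lie entirely within the bytes
// written by the store, comparing unscaled byte offsets. For example, an
// LDRHHui with immediate #1 (bytes 2-3) is covered by an STRWui with
// immediate #0 (bytes 0-3) from the same base register.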
693static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
694 MachineInstr &StoreInst,
695 const AArch64InstrInfo *TII) {
696 assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
697 int LoadSize = TII->getMemScale(MI: LoadInst);
698 int StoreSize = TII->getMemScale(MI: StoreInst);
699 int UnscaledStOffset =
700 TII->hasUnscaledLdStOffset(MI&: StoreInst)
701 ? AArch64InstrInfo::getLdStOffsetOp(MI: StoreInst).getImm()
702 : AArch64InstrInfo::getLdStOffsetOp(MI: StoreInst).getImm() * StoreSize;
703 int UnscaledLdOffset =
704 TII->hasUnscaledLdStOffset(MI&: LoadInst)
705 ? AArch64InstrInfo::getLdStOffsetOp(MI: LoadInst).getImm()
706 : AArch64InstrInfo::getLdStOffsetOp(MI: LoadInst).getImm() * LoadSize;
707 return (UnscaledStOffset <= UnscaledLdOffset) &&
708 (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
709}
710
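// A 32-bit or narrower store of WZR is a candidate for merging with an
// adjacent zero store into a single wider zero store.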
711static bool isPromotableZeroStoreInst(MachineInstr &MI) {
712 unsigned Opc = MI.getOpcode();
713 return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
714 isNarrowStore(Opc)) &&
715 getLdStRegOp(MI).getReg() == AArch64::WZR;
716}
717
718static bool isPromotableLoadFromStore(MachineInstr &MI) {
719 switch (MI.getOpcode()) {
720 default:
721 return false;
722 // Scaled instructions.
723 case AArch64::LDRBBui:
724 case AArch64::LDRHHui:
725 case AArch64::LDRWui:
726 case AArch64::LDRXui:
727 // Unscaled instructions.
728 case AArch64::LDURBBi:
729 case AArch64::LDURHHi:
730 case AArch64::LDURWi:
731 case AArch64::LDURXi:
732 return true;
733 }
734}
735
736static bool isMergeableLdStUpdate(MachineInstr &MI, AArch64FunctionInfo &AFI) {
737 unsigned Opc = MI.getOpcode();
738 switch (Opc) {
739 default:
740 return false;
741 // Scaled instructions.
742 case AArch64::STRSui:
743 case AArch64::STRDui:
744 case AArch64::STRQui:
745 case AArch64::STRXui:
746 case AArch64::STRWui:
747 case AArch64::STRHHui:
748 case AArch64::STRBBui:
749 case AArch64::LDRSui:
750 case AArch64::LDRDui:
751 case AArch64::LDRQui:
752 case AArch64::LDRXui:
753 case AArch64::LDRWui:
754 case AArch64::LDRHHui:
755 case AArch64::LDRBBui:
756 case AArch64::STGi:
757 case AArch64::STZGi:
758 case AArch64::ST2Gi:
759 case AArch64::STZ2Gi:
760 case AArch64::STGPi:
761 // Unscaled instructions.
762 case AArch64::STURSi:
763 case AArch64::STURDi:
764 case AArch64::STURQi:
765 case AArch64::STURWi:
766 case AArch64::STURXi:
767 case AArch64::LDURSi:
768 case AArch64::LDURDi:
769 case AArch64::LDURQi:
770 case AArch64::LDURWi:
771 case AArch64::LDURXi:
772 // Paired instructions.
773 case AArch64::LDPSi:
774 case AArch64::LDPSWi:
775 case AArch64::LDPDi:
776 case AArch64::LDPQi:
777 case AArch64::LDPWi:
778 case AArch64::LDPXi:
779 case AArch64::STPSi:
780 case AArch64::STPDi:
781 case AArch64::STPQi:
782 case AArch64::STPWi:
783 case AArch64::STPXi:
784 // Make sure this is a reg+imm (as opposed to an address reloc).
785 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
786 return false;
787
788 // When using stack tagging, simple sp+imm loads and stores are not
789 // tag-checked, but pre- and post-indexed versions of them are, so we can't
790 // replace the former with the latter. This transformation would be valid
791 // if the load/store accesses an untagged stack slot, but we don't have
792 // that information available after frame indices have been eliminated.
793 if (AFI.isMTETagged() &&
794 AArch64InstrInfo::getLdStBaseOp(MI).getReg() == AArch64::SP)
795 return false;
796
797 return true;
798 }
799}
800
801// Make sure this is a reg+reg Ld/St
802static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
803 unsigned Opc = MI.getOpcode();
804 switch (Opc) {
805 default:
806 return false;
807 // Scaled instructions.
808 // TODO: Add more index address stores.
809 case AArch64::LDRBroX:
810 case AArch64::LDRBBroX:
811 case AArch64::LDRSBXroX:
812 case AArch64::LDRSBWroX:
813 Scale = 1;
814 return true;
815 case AArch64::LDRHroX:
816 case AArch64::LDRHHroX:
817 case AArch64::LDRSHXroX:
818 case AArch64::LDRSHWroX:
819 Scale = 2;
820 return true;
821 case AArch64::LDRWroX:
822 case AArch64::LDRSroX:
823 case AArch64::LDRSWroX:
824 Scale = 4;
825 return true;
826 case AArch64::LDRDroX:
827 case AArch64::LDRXroX:
828 Scale = 8;
829 return true;
830 case AArch64::LDRQroX:
831 Scale = 16;
832 return true;
833 }
834}
835
836static bool isRewritableImplicitDef(const MachineOperand &MO) {
837 switch (MO.getParent()->getOpcode()) {
838 default:
839 return MO.isRenamable();
840 case AArch64::ORRWrs:
841 case AArch64::ADDWri:
842 return true;
843 }
844}
845
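// For example, the two adjacent narrow zero stores
//   strh wzr, [x0]
//   strh wzr, [x0, #2]
// are merged below into the single wider zero store
//   str wzr, [x0]
// (illustrative registers; see getMatchingWideOpcode for the opcode mapping).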
846MachineBasicBlock::iterator
847AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
848 MachineBasicBlock::iterator MergeMI,
849 const LdStPairFlags &Flags) {
850 assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
851 "Expected promotable zero stores.");
852
853 MachineBasicBlock::iterator E = I->getParent()->end();
854 MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way, the merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
859 if (NextI == MergeMI)
860 NextI = next_nodbg(It: NextI, End: E);
861
862 unsigned Opc = I->getOpcode();
863 unsigned MergeMIOpc = MergeMI->getOpcode();
864 bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
865 bool IsMergedMIScaled = !TII->hasUnscaledLdStOffset(Opc: MergeMIOpc);
866 int OffsetStride = IsScaled ? TII->getMemScale(MI: *I) : 1;
867 int MergeMIOffsetStride = IsMergedMIScaled ? TII->getMemScale(MI: *MergeMI) : 1;
868
869 bool MergeForward = Flags.getMergeForward();
870 // Insert our new paired instruction after whichever of the paired
871 // instructions MergeForward indicates.
872 MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
  // MergeForward also determines from which instruction we copy the base
  // register operand, so that we get flags compatible with the input code.
875 const MachineOperand &BaseRegOp =
876 MergeForward ? AArch64InstrInfo::getLdStBaseOp(MI: *MergeMI)
877 : AArch64InstrInfo::getLdStBaseOp(MI: *I);
878
879 // Which register is Rt and which is Rt2 depends on the offset order.
880 int64_t IOffsetInBytes =
881 AArch64InstrInfo::getLdStOffsetOp(MI: *I).getImm() * OffsetStride;
882 int64_t MIOffsetInBytes =
883 AArch64InstrInfo::getLdStOffsetOp(MI: *MergeMI).getImm() *
884 MergeMIOffsetStride;
885 // Select final offset based on the offset order.
886 int64_t OffsetImm;
887 if (IOffsetInBytes > MIOffsetInBytes)
888 OffsetImm = MIOffsetInBytes;
889 else
890 OffsetImm = IOffsetInBytes;
891
892 int NewOpcode = getMatchingWideOpcode(Opc);
893 // Adjust final offset on scaled stores because the new instruction
894 // has a different scale.
895 if (!TII->hasUnscaledLdStOffset(Opc: NewOpcode)) {
896 int NewOffsetStride = TII->getMemScale(Opc: NewOpcode);
897 assert(((OffsetImm % NewOffsetStride) == 0) &&
898 "Offset should be a multiple of the store memory scale");
899 OffsetImm = OffsetImm / NewOffsetStride;
900 }
901
902 // Construct the new instruction.
903 DebugLoc DL = I->getDebugLoc();
904 MachineBasicBlock *MBB = I->getParent();
905 MachineInstrBuilder MIB;
906 MIB = BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: NewOpcode))
907 .addReg(RegNo: isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
908 .add(MO: BaseRegOp)
909 .addImm(Val: OffsetImm)
910 .cloneMergedMemRefs(OtherMIs: {&*I, &*MergeMI})
911 .setMIFlags(I->mergeFlagsWith(Other: *MergeMI));
912 (void)MIB;
913
914 LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
915 LLVM_DEBUG(I->print(dbgs()));
916 LLVM_DEBUG(dbgs() << " ");
917 LLVM_DEBUG(MergeMI->print(dbgs()));
918 LLVM_DEBUG(dbgs() << " with instruction:\n ");
919 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
920 LLVM_DEBUG(dbgs() << "\n");
921
922 // Erase the old instructions.
923 I->eraseFromParent();
924 MergeMI->eraseFromParent();
925 return NextI;
926}
927
// Apply Fn to all instructions between MI and the beginning of the block,
// until a def for DefReg is reached. Returns true iff Fn returns true for all
// visited instructions. Stops after visiting Limit instructions.
931static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
932 const TargetRegisterInfo *TRI, unsigned Limit,
933 std::function<bool(MachineInstr &, bool)> &Fn) {
934 auto MBB = MI.getParent();
935 for (MachineInstr &I :
936 instructionsWithoutDebug(It: MI.getReverseIterator(), End: MBB->instr_rend())) {
937 if (!Limit)
938 return false;
939 --Limit;
940
941 bool isDef = any_of(Range: I.operands(), P: [DefReg, TRI](MachineOperand &MOP) {
942 return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
943 TRI->regsOverlap(RegA: MOP.getReg(), RegB: DefReg);
944 });
945 if (!Fn(I, isDef))
946 return false;
947 if (isDef)
948 break;
949 }
950 return true;
951}
952
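// Update Units after visiting MI: first drop the units of registers that MI
// kills, then add the units of every remaining register operand (definitions
// and live-through uses).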
953static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
954 const TargetRegisterInfo *TRI) {
955
956 for (const MachineOperand &MOP : phys_regs_and_masks(MI))
957 if (MOP.isReg() && MOP.isKill())
958 Units.removeReg(Reg: MOP.getReg());
959
960 for (const MachineOperand &MOP : phys_regs_and_masks(MI))
961 if (MOP.isReg() && !MOP.isKill())
962 Units.addReg(Reg: MOP.getReg());
963}
964
/// This function will add a new entry into the debugValueSubstitutions table
/// when two instructions have been merged into a new one represented by \p
/// MergedInstr.
968static void addDebugSubstitutionsToTable(MachineFunction *MF,
969 unsigned InstrNumToSet,
970 MachineInstr &OriginalInstr,
971 MachineInstr &MergedInstr) {
972
973 // Figure out the Operand Index of the destination register of the
974 // OriginalInstr in the new MergedInstr.
975 auto Reg = OriginalInstr.getOperand(i: 0).getReg();
976 unsigned OperandNo = 0;
977 bool RegFound = false;
978 for (const auto Op : MergedInstr.operands()) {
979 if (Op.getReg() == Reg) {
980 RegFound = true;
981 break;
982 }
983 OperandNo++;
984 }
985
986 if (RegFound)
987 MF->makeDebugValueSubstitution({OriginalInstr.peekDebugInstrNum(), 0},
988 {InstrNumToSet, OperandNo});
989}
990
991MachineBasicBlock::iterator
992AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
993 MachineBasicBlock::iterator Paired,
994 const LdStPairFlags &Flags) {
995 MachineBasicBlock::iterator E = I->getParent()->end();
996 MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way, the merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
1001 if (NextI == Paired)
1002 NextI = next_nodbg(It: NextI, End: E);
1003
1004 int SExtIdx = Flags.getSExtIdx();
1005 unsigned Opc =
1006 SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(Opc: I->getOpcode());
1007 bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
1008 int OffsetStride = IsUnscaled ? TII->getMemScale(MI: *I) : 1;
1009
1010 bool MergeForward = Flags.getMergeForward();
1011
1012 std::optional<MCPhysReg> RenameReg = Flags.getRenameReg();
1013 if (RenameReg) {
1014 MCRegister RegToRename = getLdStRegOp(MI&: *I).getReg();
1015 DefinedInBB.addReg(Reg: *RenameReg);
1016
1017 // Return the sub/super register for RenameReg, matching the size of
1018 // OriginalReg.
1019 auto GetMatchingSubReg =
1020 [this, RenameReg](const TargetRegisterClass *C) -> MCPhysReg {
1021 for (MCPhysReg SubOrSuper :
1022 TRI->sub_and_superregs_inclusive(Reg: *RenameReg)) {
1023 if (C->contains(Reg: SubOrSuper))
1024 return SubOrSuper;
1025 }
1026 llvm_unreachable("Should have found matching sub or super register!");
1027 };
1028
1029 std::function<bool(MachineInstr &, bool)> UpdateMIs =
1030 [this, RegToRename, GetMatchingSubReg, MergeForward](MachineInstr &MI,
1031 bool IsDef) {
1032 if (IsDef) {
1033 bool SeenDef = false;
1034 for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
1035 MachineOperand &MOP = MI.getOperand(i: OpIdx);
1036 // Rename the first explicit definition and all implicit
1037 // definitions matching RegToRename.
1038 if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1039 (!MergeForward || !SeenDef ||
1040 (MOP.isDef() && MOP.isImplicit())) &&
1041 TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename)) {
1042 assert((MOP.isImplicit() ||
1043 (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
1044 "Need renamable operands");
1045 Register MatchingReg;
1046 if (const TargetRegisterClass *RC =
1047 MI.getRegClassConstraint(OpIdx, TII, TRI))
1048 MatchingReg = GetMatchingSubReg(RC);
1049 else {
1050 if (!isRewritableImplicitDef(MO: MOP))
1051 continue;
1052 MatchingReg = GetMatchingSubReg(
1053 TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1054 }
1055 MOP.setReg(MatchingReg);
1056 SeenDef = true;
1057 }
1058 }
1059 } else {
1060 for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
1061 MachineOperand &MOP = MI.getOperand(i: OpIdx);
1062 if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1063 TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename)) {
1064 assert((MOP.isImplicit() ||
1065 (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
1066 "Need renamable operands");
1067 Register MatchingReg;
1068 if (const TargetRegisterClass *RC =
1069 MI.getRegClassConstraint(OpIdx, TII, TRI))
1070 MatchingReg = GetMatchingSubReg(RC);
1071 else
1072 MatchingReg = GetMatchingSubReg(
1073 TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1074 assert(MatchingReg != AArch64::NoRegister &&
1075 "Cannot find matching regs for renaming");
1076 MOP.setReg(MatchingReg);
1077 }
1078 }
1079 }
1080 LLVM_DEBUG(dbgs() << "Renamed " << MI);
1081 return true;
1082 };
1083 forAllMIsUntilDef(MI&: MergeForward ? *I : *Paired->getPrevNode(), DefReg: RegToRename,
1084 TRI, UINT32_MAX, Fn&: UpdateMIs);
1085
1086#if !defined(NDEBUG)
1087 // For forward merging store:
1088 // Make sure the register used for renaming is not used between the
1089 // paired instructions. That would trash the content before the new
1090 // paired instruction.
1091 MCPhysReg RegToCheck = *RenameReg;
1092 // For backward merging load:
1093 // Make sure the register being renamed is not used between the
1094 // paired instructions. That would trash the content after the new
1095 // paired instruction.
1096 if (!MergeForward)
1097 RegToCheck = RegToRename;
1098 for (auto &MI :
1099 iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
1100 MergeForward ? std::next(I) : I,
1101 MergeForward ? std::next(Paired) : Paired))
1102 assert(all_of(MI.operands(),
1103 [this, RegToCheck](const MachineOperand &MOP) {
1104 return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1105 MOP.isUndef() ||
1106 !TRI->regsOverlap(MOP.getReg(), RegToCheck);
1107 }) &&
1108 "Rename register used between paired instruction, trashing the "
1109 "content");
1110#endif
1111 }
1112
1113 // Insert our new paired instruction after whichever of the paired
1114 // instructions MergeForward indicates.
1115 MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // MergeForward also determines from which instruction we copy the base
  // register operand, so that we get flags compatible with the input code.
1118 const MachineOperand &BaseRegOp =
1119 MergeForward ? AArch64InstrInfo::getLdStBaseOp(MI: *Paired)
1120 : AArch64InstrInfo::getLdStBaseOp(MI: *I);
1121
1122 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: *I).getImm();
1123 int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(MI: *Paired).getImm();
1124 bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Opc: Paired->getOpcode());
1125 if (IsUnscaled != PairedIsUnscaled) {
1126 // We're trying to pair instructions that differ in how they are scaled. If
1127 // I is scaled then scale the offset of Paired accordingly. Otherwise, do
1128 // the opposite (i.e., make Paired's offset unscaled).
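    // For example, pairing an unscaled LDURXi (byte immediate) with a scaled
    // LDRXui (immediate in 8-byte units) multiplies the scaled immediate by 8
    // so that both offsets are expressed in bytes.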
1129 int MemSize = TII->getMemScale(MI: *Paired);
1130 if (PairedIsUnscaled) {
1131 // If the unscaled offset isn't a multiple of the MemSize, we can't
1132 // pair the operations together.
1133 assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
1134 "Offset should be a multiple of the stride!");
1135 PairedOffset /= MemSize;
1136 } else {
1137 PairedOffset *= MemSize;
1138 }
1139 }
1140
  // Which register is Rt and which is Rt2 depends on the offset order.
  // However, for pre-indexed load/stores, Rt should be the register of the
  // pre-indexed instruction.
1144 MachineInstr *RtMI, *Rt2MI;
1145 if (Offset == PairedOffset + OffsetStride &&
1146 !AArch64InstrInfo::isPreLdSt(MI: *I)) {
1147 RtMI = &*Paired;
1148 Rt2MI = &*I;
1149 // Here we swapped the assumption made for SExtIdx.
1150 // I.e., we turn ldp I, Paired into ldp Paired, I.
1151 // Update the index accordingly.
1152 if (SExtIdx != -1)
1153 SExtIdx = (SExtIdx + 1) % 2;
1154 } else {
1155 RtMI = &*I;
1156 Rt2MI = &*Paired;
1157 }
1158 int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(MI: *RtMI).getImm();
1159 // Scale the immediate offset, if necessary.
1160 if (TII->hasUnscaledLdStOffset(Opc: RtMI->getOpcode())) {
1161 assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
1162 "Unscaled offset cannot be scaled.");
1163 OffsetImm /= TII->getMemScale(MI: *RtMI);
1164 }
1165
1166 // Construct the new instruction.
1167 MachineInstrBuilder MIB;
1168 DebugLoc DL = I->getDebugLoc();
1169 MachineBasicBlock *MBB = I->getParent();
1170 MachineOperand RegOp0 = getLdStRegOp(MI&: *RtMI);
1171 MachineOperand RegOp1 = getLdStRegOp(MI&: *Rt2MI);
1172 MachineOperand &PairedRegOp = RtMI == &*Paired ? RegOp0 : RegOp1;
1173 // Kill flags may become invalid when moving stores for pairing.
1174 if (RegOp0.isUse()) {
1175 if (!MergeForward) {
1176 // Clear kill flags on store if moving upwards. Example:
1177 // STRWui kill %w0, ...
1178 // USE %w1
1179 // STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards
1180 // We are about to move the store of w1, so its kill flag may become
1181 // invalid; not the case for w0.
1182 // Since w1 is used between the stores, the kill flag on w1 is cleared
1183 // after merging.
1184 // STPWi kill %w0, %w1, ...
1185 // USE %w1
1186 for (auto It = std::next(x: I); It != Paired && PairedRegOp.isKill(); ++It)
1187 if (It->readsRegister(Reg: PairedRegOp.getReg(), TRI))
1188 PairedRegOp.setIsKill(false);
1189 } else {
      // Clear kill flags of the first store's register. Example:
1191 // STRWui %w1, ...
1192 // USE kill %w1 ; need to clear kill flag when moving STRWui downwards
1193 // STRW %w0
1194 Register Reg = getLdStRegOp(MI&: *I).getReg();
1195 for (MachineInstr &MI :
1196 make_range(x: std::next(x: I->getIterator()), y: Paired->getIterator()))
1197 MI.clearRegisterKills(Reg, RegInfo: TRI);
1198 }
1199 }
1200
1201 unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);
1202 MIB = BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: MatchPairOpcode));
1203
1204 // Adds the pre-index operand for pre-indexed ld/st pairs.
1205 if (AArch64InstrInfo::isPreLdSt(MI: *RtMI))
1206 MIB.addReg(RegNo: BaseRegOp.getReg(), Flags: RegState::Define);
1207
1208 MIB.add(MO: RegOp0)
1209 .add(MO: RegOp1)
1210 .add(MO: BaseRegOp)
1211 .addImm(Val: OffsetImm)
1212 .cloneMergedMemRefs(OtherMIs: {&*I, &*Paired})
1213 .setMIFlags(I->mergeFlagsWith(Other: *Paired));
1214
1215 (void)MIB;
1216
1217 LLVM_DEBUG(
1218 dbgs() << "Creating pair load/store. Replacing instructions:\n ");
1219 LLVM_DEBUG(I->print(dbgs()));
1220 LLVM_DEBUG(dbgs() << " ");
1221 LLVM_DEBUG(Paired->print(dbgs()));
1222 LLVM_DEBUG(dbgs() << " with instruction:\n ");
1223 if (SExtIdx != -1) {
1224 // Generate the sign extension for the proper result of the ldp.
1225 // I.e., with X1, that would be:
1226 // %w1 = KILL %w1, implicit-def %x1
1227 // %x1 = SBFMXri killed %x1, 0, 31
1228 MachineOperand &DstMO = MIB->getOperand(i: SExtIdx);
1229 // Right now, DstMO has the extended register, since it comes from an
1230 // extended opcode.
1231 Register DstRegX = DstMO.getReg();
1232 // Get the W variant of that register.
1233 Register DstRegW = TRI->getSubReg(Reg: DstRegX, Idx: AArch64::sub_32);
1234 // Update the result of LDP to use the W instead of the X variant.
1235 DstMO.setReg(DstRegW);
1236 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1237 LLVM_DEBUG(dbgs() << "\n");
1238 // Make the machine verifier happy by providing a definition for
1239 // the X register.
1240 // Insert this definition right after the generated LDP, i.e., before
1241 // InsertionPoint.
1242 MachineInstrBuilder MIBKill =
1243 BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::KILL), DestReg: DstRegW)
1244 .addReg(RegNo: DstRegW)
1245 .addReg(RegNo: DstRegX, Flags: RegState::Define);
1246 MIBKill->getOperand(i: 2).setImplicit();
1247 // Create the sign extension.
1248 MachineInstrBuilder MIBSXTW =
1249 BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: AArch64::SBFMXri), DestReg: DstRegX)
1250 .addReg(RegNo: DstRegX)
1251 .addImm(Val: 0)
1252 .addImm(Val: 31);
1253 (void)MIBSXTW;
1254
1255 // In the case of a sign-extend, where we have something like:
1256 // debugValueSubstitutions:[]
1257 // $w1 = LDRWui $x0, 1, debug-instr-number 1
1258 // DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
1259 // $x0 = LDRSWui $x0, 0, debug-instr-number 2
1260 // DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9
1261
1262 // It will be converted to:
1263 // debugValueSubstitutions:[]
1264 // $w0, $w1 = LDPWi $x0, 0
1265 // $w0 = KILL $w0, implicit-def $x0
1266 // $x0 = SBFMXri $x0, 0, 31
1267 // DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
1268 // DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9
1269
1270 // We want the final result to look like:
1271 // debugValueSubstitutions:
1272 // - { srcinst: 1, srcop: 0, dstinst: 4, dstop: 1, subreg: 0 }
1273 // - { srcinst: 2, srcop: 0, dstinst: 3, dstop: 0, subreg: 0 }
1274 // $w0, $w1 = LDPWi $x0, 0, debug-instr-number 4
1275 // $w0 = KILL $w0, implicit-def $x0
1276 // $x0 = SBFMXri $x0, 0, 31, debug-instr-number 3
1277 // DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
1278 // DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9
1279
    // $x0 is where the final value is stored, so the sign-extend (SBFMXri)
    // instruction contains the final value we care about, and we give it a new
    // debug-instr-number 3. Likewise, $w1 contains a final value that we care
    // about, so the LDP instruction is also given a new debug-instr-number 4.
    // We have to add these substitutions to the debugValueSubstitutions table.
    // However, we also have to ensure that the OpIndex that pointed to
    // debug-instr-number 1 gets updated to 1, because $w1 is the second
    // operand of the LDP instruction.
1288
1289 if (I->peekDebugInstrNum()) {
1290 // If I is the instruction which got sign extended and has a
1291 // debug-instr-number, give the SBFMXri instruction a new
1292 // debug-instr-number, and update the debugValueSubstitutions table with
1293 // the new debug-instr-number and OpIndex pair. Otherwise, give the Merged
1294 // instruction a new debug-instr-number, and update the
1295 // debugValueSubstitutions table with the new debug-instr-number and
1296 // OpIndex pair.
1297 unsigned NewInstrNum;
1298 if (DstRegX == I->getOperand(i: 0).getReg()) {
1299 NewInstrNum = MIBSXTW->getDebugInstrNum();
1300 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *I,
1301 MergedInstr&: *MIBSXTW);
1302 } else {
1303 NewInstrNum = MIB->getDebugInstrNum();
1304 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *I, MergedInstr&: *MIB);
1305 }
1306 }
1307 if (Paired->peekDebugInstrNum()) {
1308 // If Paired is the instruction which got sign extended and has a
1309 // debug-instr-number, give the SBFMXri instruction a new
1310 // debug-instr-number, and update the debugValueSubstitutions table with
1311 // the new debug-instr-number and OpIndex pair. Otherwise, give the Merged
1312 // instruction a new debug-instr-number, and update the
1313 // debugValueSubstitutions table with the new debug-instr-number and
1314 // OpIndex pair.
1315 unsigned NewInstrNum;
1316 if (DstRegX == Paired->getOperand(i: 0).getReg()) {
1317 NewInstrNum = MIBSXTW->getDebugInstrNum();
1318 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *Paired,
1319 MergedInstr&: *MIBSXTW);
1320 } else {
1321 NewInstrNum = MIB->getDebugInstrNum();
1322 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *Paired,
1323 MergedInstr&: *MIB);
1324 }
1325 }
1326
1327 LLVM_DEBUG(dbgs() << " Extend operand:\n ");
1328 LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
1329 } else if (Opc == AArch64::LDR_ZXI || Opc == AArch64::STR_ZXI) {
1330 // We are combining SVE fill/spill to LDP/STP, so we need to use the Q
1331 // variant of the registers.
1332 MachineOperand &MOp0 = MIB->getOperand(i: 0);
1333 MachineOperand &MOp1 = MIB->getOperand(i: 1);
1334 assert(AArch64::ZPRRegClass.contains(MOp0.getReg()) &&
1335 AArch64::ZPRRegClass.contains(MOp1.getReg()) && "Invalid register.");
1336 MOp0.setReg(AArch64::Q0 + (MOp0.getReg() - AArch64::Z0));
1337 MOp1.setReg(AArch64::Q0 + (MOp1.getReg() - AArch64::Z0));
1338 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1339 } else {
1340
1341 // In the case that the merge doesn't result in a sign-extend, if we have
1342 // something like:
1343 // debugValueSubstitutions:[]
1344 // $x1 = LDRXui $x0, 1, debug-instr-number 1
1345 // DBG_INSTR_REF !13, dbg-instr-ref(1, 0), debug-location !11
1346 // $x0 = LDRXui killed $x0, 0, debug-instr-number 2
1347 // DBG_INSTR_REF !14, dbg-instr-ref(2, 0), debug-location !11
1348
1349 // It will be converted to:
1350 // debugValueSubstitutions: []
1351 // $x0, $x1 = LDPXi $x0, 0
1352 // DBG_INSTR_REF !12, dbg-instr-ref(1, 0), debug-location !14
1353 // DBG_INSTR_REF !13, dbg-instr-ref(2, 0), debug-location !14
1354
1355 // We want the final result to look like:
1356 // debugValueSubstitutions:
1357 // - { srcinst: 1, srcop: 0, dstinst: 3, dstop: 1, subreg: 0 }
1358 // - { srcinst: 2, srcop: 0, dstinst: 3, dstop: 0, subreg: 0 }
1359 // $x0, $x1 = LDPXi $x0, 0, debug-instr-number 3
1360 // DBG_INSTR_REF !12, dbg-instr-ref(1, 0), debug-location !14
1361 // DBG_INSTR_REF !12, dbg-instr-ref(2, 0), debug-location !14
1362
    // Here all that needs to be done is to update the LDP instruction with a
    // new debug-instr-number and then add entries to the
    // debugValueSubstitutions table mapping the old instr-refs to the new
    // ones.
1366
1367 // Assign new DebugInstrNum to the Paired instruction.
1368 if (I->peekDebugInstrNum()) {
1369 unsigned NewDebugInstrNum = MIB->getDebugInstrNum();
1370 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewDebugInstrNum, OriginalInstr&: *I,
1371 MergedInstr&: *MIB);
1372 }
1373 if (Paired->peekDebugInstrNum()) {
1374 unsigned NewDebugInstrNum = MIB->getDebugInstrNum();
1375 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewDebugInstrNum, OriginalInstr&: *Paired,
1376 MergedInstr&: *MIB);
1377 }
1378
1379 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1380 }
1381 LLVM_DEBUG(dbgs() << "\n");
1382
1383 if (MergeForward)
1384 for (const MachineOperand &MOP : phys_regs_and_masks(MI: *I))
1385 if (MOP.isReg() && MOP.isKill())
1386 DefinedInBB.addReg(Reg: MOP.getReg());
1387
1388 // Copy over any implicit-def operands. This is like MI.copyImplicitOps, but
1389 // only copies implicit defs and makes sure that each operand is only added
1390 // once in case of duplicates.
1391 auto CopyImplicitOps = [&](MachineBasicBlock::iterator MI1,
1392 MachineBasicBlock::iterator MI2) {
1393 SmallSetVector<Register, 4> Ops;
1394 for (const MachineOperand &MO :
1395 llvm::drop_begin(RangeOrContainer: MI1->operands(), N: MI1->getDesc().getNumOperands()))
1396 if (MO.isReg() && MO.isImplicit() && MO.isDef())
1397 Ops.insert(X: MO.getReg());
1398 for (const MachineOperand &MO :
1399 llvm::drop_begin(RangeOrContainer: MI2->operands(), N: MI2->getDesc().getNumOperands()))
1400 if (MO.isReg() && MO.isImplicit() && MO.isDef())
1401 Ops.insert(X: MO.getReg());
1402 for (auto Op : Ops)
1403 MIB.addDef(RegNo: Op, Flags: RegState::Implicit);
1404 };
1405 CopyImplicitOps(I, Paired);
1406
1407 // Erase the old instructions.
1408 I->eraseFromParent();
1409 Paired->eraseFromParent();
1410
1411 return NextI;
1412}
1413
1414MachineBasicBlock::iterator
1415AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
1416 MachineBasicBlock::iterator StoreI) {
1417 MachineBasicBlock::iterator NextI =
1418 next_nodbg(It: LoadI, End: LoadI->getParent()->end());
1419
1420 int LoadSize = TII->getMemScale(MI: *LoadI);
1421 int StoreSize = TII->getMemScale(MI: *StoreI);
1422 Register LdRt = getLdStRegOp(MI&: *LoadI).getReg();
1423 const MachineOperand &StMO = getLdStRegOp(MI&: *StoreI);
1424 Register StRt = getLdStRegOp(MI&: *StoreI).getReg();
1425 bool IsStoreXReg = TRI->getRegClass(i: AArch64::GPR64RegClassID)->contains(Reg: StRt);
1426
1427 assert((IsStoreXReg ||
1428 TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
1429 "Unexpected RegClass");
1430
1431 MachineInstr *BitExtMI;
1432 if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
    // Remove the load if the destination register of the load is the same
    // register as the stored value.
1435 if (StRt == LdRt && LoadSize == 8) {
1436 for (MachineInstr &MI : make_range(x: StoreI->getIterator(),
1437 y: LoadI->getIterator())) {
1438 if (MI.killsRegister(Reg: StRt, TRI)) {
1439 MI.clearRegisterKills(Reg: StRt, RegInfo: TRI);
1440 break;
1441 }
1442 }
1443 LLVM_DEBUG(dbgs() << "Remove load instruction:\n ");
1444 LLVM_DEBUG(LoadI->print(dbgs()));
1445 LLVM_DEBUG(dbgs() << "\n");
1446 LoadI->eraseFromParent();
1447 return NextI;
1448 }
    // Replace the load with a mov if the load and store are of the same size.
1450 BitExtMI =
1451 BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
1452 MCID: TII->get(Opcode: IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), DestReg: LdRt)
1453 .addReg(RegNo: IsStoreXReg ? AArch64::XZR : AArch64::WZR)
1454 .add(MO: StMO)
1455 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0))
1456 .setMIFlags(LoadI->getFlags());
1457 } else {
1458 // FIXME: Currently we disable this transformation in big-endian targets as
1459 // performance and correctness are verified only in little-endian.
1460 if (!Subtarget->isLittleEndian())
1461 return NextI;
1462 bool IsUnscaled = TII->hasUnscaledLdStOffset(MI&: *LoadI);
1463 assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
1464 "Unsupported ld/st match");
1465 assert(LoadSize <= StoreSize && "Invalid load size");
1466 int UnscaledLdOffset =
1467 IsUnscaled
1468 ? AArch64InstrInfo::getLdStOffsetOp(MI: *LoadI).getImm()
1469 : AArch64InstrInfo::getLdStOffsetOp(MI: *LoadI).getImm() * LoadSize;
1470 int UnscaledStOffset =
1471 IsUnscaled
1472 ? AArch64InstrInfo::getLdStOffsetOp(MI: *StoreI).getImm()
1473 : AArch64InstrInfo::getLdStOffsetOp(MI: *StoreI).getImm() * StoreSize;
1474 int Width = LoadSize * 8;
1475 Register DestReg =
1476 IsStoreXReg ? Register(TRI->getMatchingSuperReg(
1477 Reg: LdRt, SubIdx: AArch64::sub_32, RC: &AArch64::GPR64RegClass))
1478 : LdRt;
1479
1480 assert((UnscaledLdOffset >= UnscaledStOffset &&
1481 (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
1482 "Invalid offset");
1483
1484 int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
1485 int Imms = Immr + Width - 1;
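    // For example, forwarding an LDRHHui that reads bytes 2-3 of an STRWui's
    // 4-byte store gives Immr = 16 and Imms = 31, i.e. a UBFM that extracts
    // bits [31:16] of the stored W register.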
1486 if (UnscaledLdOffset == UnscaledStOffset) {
1487 uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
1488 | ((Immr) << 6) // immr
1489 | ((Imms) << 0) // imms
1490 ;
1491
1492 BitExtMI =
1493 BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
1494 MCID: TII->get(Opcode: IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
1495 DestReg)
1496 .add(MO: StMO)
1497 .addImm(Val: AndMaskEncoded)
1498 .setMIFlags(LoadI->getFlags());
1499 } else if (IsStoreXReg && Imms == 31) {
1500 // Use the 32 bit variant of UBFM if it's the LSR alias of the
1501 // instruction.
1502 assert(Immr <= Imms && "Expected LSR alias of UBFM");
1503 BitExtMI = BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
1504 MCID: TII->get(Opcode: AArch64::UBFMWri),
1505 DestReg: TRI->getSubReg(Reg: DestReg, Idx: AArch64::sub_32))
1506 .addReg(RegNo: TRI->getSubReg(Reg: StRt, Idx: AArch64::sub_32))
1507 .addImm(Val: Immr)
1508 .addImm(Val: Imms)
1509 .setMIFlags(LoadI->getFlags());
1510 } else {
1511 BitExtMI =
1512 BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
1513 MCID: TII->get(Opcode: IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
1514 DestReg)
1515 .add(MO: StMO)
1516 .addImm(Val: Immr)
1517 .addImm(Val: Imms)
1518 .setMIFlags(LoadI->getFlags());
1519 }
1520 }
1521
1522 // Clear kill flags between store and load.
1523 for (MachineInstr &MI : make_range(x: StoreI->getIterator(),
1524 y: BitExtMI->getIterator()))
1525 if (MI.killsRegister(Reg: StRt, TRI)) {
1526 MI.clearRegisterKills(Reg: StRt, RegInfo: TRI);
1527 break;
1528 }
1529
1530 LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n ");
1531 LLVM_DEBUG(StoreI->print(dbgs()));
1532 LLVM_DEBUG(dbgs() << " ");
1533 LLVM_DEBUG(LoadI->print(dbgs()));
1534 LLVM_DEBUG(dbgs() << " with instructions:\n ");
1535 LLVM_DEBUG(StoreI->print(dbgs()));
1536 LLVM_DEBUG(dbgs() << " ");
1537 LLVM_DEBUG((BitExtMI)->print(dbgs()));
1538 LLVM_DEBUG(dbgs() << "\n");
1539
1540 // Erase the old instructions.
1541 LoadI->eraseFromParent();
1542 return NextI;
1543}
1544
1545static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
1546 // Convert the byte-offset used by unscaled load/store instructions into an
1547 // "element" offset used by the scaled pair load/store instructions.
1548 if (IsUnscaled) {
1549 // If the byte-offset isn't a multiple of the stride, there's no point
1550 // trying to match it.
1551 if (Offset % OffsetStride)
1552 return false;
1553 Offset /= OffsetStride;
1554 }
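// Paired instructions have a 7-bit signed scaled immediate, so the element
// offset must fit in [-64, 63].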
1555 return Offset <= 63 && Offset >= -64;
1556}
1557
1558 // Do alignment, specialized for powers of 2 and for signed ints,
1559 // avoiding having to do a C-style cast from uint64_t to int when
1560 // using alignTo from include/llvm/Support/MathExtras.h.
1561// FIXME: Move this function to include/MathExtras.h?
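// For example, alignTo(5, 4) == 8 and alignTo(-3, 4) == 0.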
1562static int alignTo(int Num, int PowOf2) {
1563 return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
1564}
1565
1566static bool mayAlias(MachineInstr &MIa,
1567 SmallVectorImpl<MachineInstr *> &MemInsns,
1568 AliasAnalysis *AA) {
1569 for (MachineInstr *MIb : MemInsns) {
1570 if (MIa.mayAlias(AA, Other: *MIb, /*UseTBAA*/ false)) {
1571 LLVM_DEBUG(dbgs() << "Aliasing with: "; MIb->dump());
1572 return true;
1573 }
1574 }
1575
1576 LLVM_DEBUG(dbgs() << "No aliases found\n");
1577 return false;
1578}
1579
1580bool AArch64LoadStoreOpt::findMatchingStore(
1581 MachineBasicBlock::iterator I, unsigned Limit,
1582 MachineBasicBlock::iterator &StoreI) {
1583 MachineBasicBlock::iterator B = I->getParent()->begin();
1584 MachineBasicBlock::iterator MBBI = I;
1585 MachineInstr &LoadMI = *I;
1586 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: LoadMI).getReg();
1587
1588 // If the load is the first instruction in the block, there's obviously
1589 // not any matching store.
1590 if (MBBI == B)
1591 return false;
1592
1593 // Track which register units have been modified and used between the first
1594 // insn and the second insn.
1595 ModifiedRegUnits.clear();
1596 UsedRegUnits.clear();
1597
1598 unsigned Count = 0;
1599 do {
1600 MBBI = prev_nodbg(It: MBBI, Begin: B);
1601 MachineInstr &MI = *MBBI;
1602
1603 // Don't count transient instructions towards the search limit since there
1604 // may be different numbers of them if e.g. debug information is present.
1605 if (!MI.isTransient())
1606 ++Count;
1607
1608 // If the load instruction reads directly from the address to which the
1609 // store instruction writes and the stored value is not modified, we can
1610 // promote the load. Since we do not handle stores with pre-/post-index,
1611 // it's unnecessary to check if BaseReg is modified by the store itself.
1612 // Also, we can't handle stores whose offset operand is not an immediate,
1613 // e.g. when it is the address of a global variable.
1614 if (MI.mayStore() && isMatchingStore(LoadInst&: LoadMI, StoreInst&: MI) &&
1615 BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
1616 AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
1617 isLdOffsetInRangeOfSt(LoadInst&: LoadMI, StoreInst&: MI, TII) &&
1618 ModifiedRegUnits.available(Reg: getLdStRegOp(MI).getReg())) {
1619 StoreI = MBBI;
1620 return true;
1621 }
1622
1623 if (MI.isCall())
1624 return false;
1625
1626 // Update modified / uses register units.
1627 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1628
1629 // Otherwise, if the base register is modified, we have no match, so
1630 // return early.
1631 if (!ModifiedRegUnits.available(Reg: BaseReg))
1632 return false;
1633
1634 // If we encounter a store aliased with the load, return early.
1635 if (MI.mayStore() && LoadMI.mayAlias(AA, Other: MI, /*UseTBAA*/ false))
1636 return false;
1637 } while (MBBI != B && Count < Limit);
1638 return false;
1639}
1640
1641static bool needsWinCFI(const MachineFunction *MF) {
1642 return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
1643 MF->getFunction().needsUnwindTableEntry();
1644}
1645
1646// Returns true if FirstMI and MI are candidates for merging or pairing.
1647// Otherwise, returns false.
1648static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
1649 LdStPairFlags &Flags,
1650 const AArch64InstrInfo *TII) {
1651 // If this is volatile or if pairing is suppressed, not a candidate.
1652 if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
1653 return false;
1654
1655 // We should have already checked FirstMI for pair suppression and volatility.
1656 assert(!FirstMI.hasOrderedMemoryRef() &&
1657 !TII->isLdStPairSuppressed(FirstMI) &&
1658 "FirstMI shouldn't get here if either of these checks are true.");
1659
1660 if (needsWinCFI(MF: MI.getMF()) && (MI.getFlag(Flag: MachineInstr::FrameSetup) ||
1661 MI.getFlag(Flag: MachineInstr::FrameDestroy)))
1662 return false;
1663
1664 unsigned OpcA = FirstMI.getOpcode();
1665 unsigned OpcB = MI.getOpcode();
1666
1667 // Opcodes match: the only remaining check is that they are not pre ld/st.
1668 if (OpcA == OpcB)
1669 return !AArch64InstrInfo::isPreLdSt(MI: FirstMI);
1670
1671 // Bail out if one of the opcodes is SVE fill/spill, as we currently don't
1672 // allow pairing them with other instructions.
1673 if (OpcA == AArch64::LDR_ZXI || OpcA == AArch64::STR_ZXI ||
1674 OpcB == AArch64::LDR_ZXI || OpcB == AArch64::STR_ZXI)
1675 return false;
1676
1677 // Two pre ld/st with different opcodes cannot be merged either.
1678 if (AArch64InstrInfo::isPreLdSt(MI: FirstMI) && AArch64InstrInfo::isPreLdSt(MI))
1679 return false;
1680
1681 // Try to match a sign-extended load/store with a zero-extended load/store.
1682 bool IsValidLdStrOpc, PairIsValidLdStrOpc;
1683 unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc: OpcA, IsValidLdStrOpc: &IsValidLdStrOpc);
1684 assert(IsValidLdStrOpc &&
1685 "Given Opc should be a Load or Store with an immediate");
1686 // OpcA will be the first instruction in the pair.
1687 if (NonSExtOpc == getMatchingNonSExtOpcode(Opc: OpcB, IsValidLdStrOpc: &PairIsValidLdStrOpc)) {
1688 Flags.setSExtIdx(NonSExtOpc == OpcA ? 1 : 0);
1689 return true;
1690 }
1691
1692 // If the second instruction isn't even a mergable/pairable load/store, bail
1693 // out.
1694 if (!PairIsValidLdStrOpc)
1695 return false;
1696
1697 // Narrow stores do not have matching pair opcodes, so constrain their
1698 // merging to zero stores.
1699 if (isNarrowStore(Opc: OpcA) || isNarrowStore(Opc: OpcB))
1700 return getLdStRegOp(MI&: FirstMI).getReg() == AArch64::WZR &&
1701 getLdStRegOp(MI).getReg() == AArch64::WZR &&
1702 TII->getMemScale(MI: FirstMI) == TII->getMemScale(MI);
1703
1704 // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
1705 // LDR<S,D,Q,W,X,SW>pre-LDR<S,D,Q,W,X,SW>ui
1706 // are candidate pairs that can be merged.
1707 if (isPreLdStPairCandidate(FirstMI, MI))
1708 return true;
1709
1710 // Try to match an unscaled load/store with a scaled load/store.
1711 return TII->hasUnscaledLdStOffset(Opc: OpcA) != TII->hasUnscaledLdStOffset(Opc: OpcB) &&
1712 getMatchingPairOpcode(Opc: OpcA) == getMatchingPairOpcode(Opc: OpcB);
1713
1714 // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
1715}
1716
1717static bool canRenameMOP(const MachineOperand &MOP,
1718 const TargetRegisterInfo *TRI) {
1719 if (MOP.isReg()) {
1720 auto *RegClass = TRI->getMinimalPhysRegClass(Reg: MOP.getReg());
1721 // Renaming registers with multiple disjunct sub-registers (e.g. the
1722 // result of a LD3) means that all sub-registers are renamed, potentially
1723 // impacting other instructions we did not check. Bail out.
1724 // Note that this relies on the structure of the AArch64 register file. In
1725 // particular, a subregister cannot be written without overwriting the
1726 // whole register.
1727 if (RegClass->HasDisjunctSubRegs && RegClass->CoveredBySubRegs &&
1728 (TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::dsub0) ||
1729 TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::qsub0) ||
1730 TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::zsub0))) {
1731 LLVM_DEBUG(
1732 dbgs()
1733 << " Cannot rename operands with multiple disjunct subregisters ("
1734 << MOP << ")\n");
1735 return false;
1736 }
1737
1738 // We cannot rename arbitrary implicit-defs; the specific rule for rewriting
1739 // them must be known. For example, in ORRWrs the implicit-def
1740 // corresponds to the result register.
1741 if (MOP.isImplicit() && MOP.isDef()) {
1742 if (!isRewritableImplicitDef(MO: MOP))
1743 return false;
1744 return TRI->isSuperOrSubRegisterEq(
1745 RegA: MOP.getParent()->getOperand(i: 0).getReg(), RegB: MOP.getReg());
1746 }
1747 }
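// Any other implicit operand is fine to rename; explicit operands must be
// marked renamable and be neither early-clobber nor tied.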
1748 return MOP.isImplicit() ||
1749 (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
1750}
1751
1752static bool
1753canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
1754 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1755 const TargetRegisterInfo *TRI) {
1756 if (!FirstMI.mayStore())
1757 return false;
1758
1759 // Check if we can find an unused register which we can use to rename
1760 // the register used by the first load/store.
1761
1762 auto RegToRename = getLdStRegOp(MI&: FirstMI).getReg();
1763 // For now, we only rename if the store operand gets killed at the store.
1764 if (!getLdStRegOp(MI&: FirstMI).isKill() &&
1765 !any_of(Range: FirstMI.operands(),
1766 P: [TRI, RegToRename](const MachineOperand &MOP) {
1767 return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1768 MOP.isImplicit() && MOP.isKill() &&
1769 TRI->regsOverlap(RegA: RegToRename, RegB: MOP.getReg());
1770 })) {
1771 LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI);
1772 return false;
1773 }
1774
1775 bool FoundDef = false;
1776
1777 // For each instruction between FirstMI and the previous def of RegToRename,
1778 // we:
1779 // * check if we can rename RegToRename in this instruction, and
1780 // * collect the registers used and the required register classes for RegToRename.
1781 std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
1782 bool IsDef) {
1783 LLVM_DEBUG(dbgs() << "Checking " << MI);
1784 // Currently we do not try to rename across frame-setup instructions.
1785 if (MI.getFlag(Flag: MachineInstr::FrameSetup)) {
1786 LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1787 << "currently\n");
1788 return false;
1789 }
1790
1791 UsedInBetween.accumulate(MI);
1792
1793 // For a definition, check that we can rename the definition and exit the
1794 // loop.
1795 FoundDef = IsDef;
1796
1797 // For defs, check if we can rename the first def of RegToRename.
1798 if (FoundDef) {
1799 // For some pseudo instructions, we might not generate code in the end
1800 // (e.g. KILL) and we would end up without a correct def for the rename
1801 // register.
1802 // TODO: This might be overly conservative and we could handle those cases
1803 // in multiple ways:
1804 // 1. Insert an extra copy, to materialize the def.
1805 // 2. Skip pseudo-defs until we find a non-pseudo def.
1806 if (MI.isPseudo()) {
1807 LLVM_DEBUG(dbgs() << " Cannot rename pseudo/bundle instruction\n");
1808 return false;
1809 }
1810
1811 for (auto &MOP : MI.operands()) {
1812 if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
1813 !TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
1814 continue;
1815 if (!canRenameMOP(MOP, TRI)) {
1816 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1817 return false;
1818 }
1819 RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1820 }
1821 return true;
1822 } else {
1823 for (auto &MOP : MI.operands()) {
1824 if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1825 !TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
1826 continue;
1827
1828 if (!canRenameMOP(MOP, TRI)) {
1829 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1830 return false;
1831 }
1832 RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1833 }
1834 }
1835 return true;
1836 };
1837
1838 if (!forAllMIsUntilDef(MI&: FirstMI, DefReg: RegToRename, TRI, Limit: LdStLimit, Fn&: CheckMIs))
1839 return false;
1840
1841 if (!FoundDef) {
1842 LLVM_DEBUG(dbgs() << " Did not find definition for register in BB\n");
1843 return false;
1844 }
1845 return true;
1846}
1847
1848 // We want to merge the second load into the first by rewriting the usages of
1849 // the same reg between the first (incl.) and the second (excl.). We don't need
1850 // to care about any insns before FirstLoad or after SecondLoad.
1851 // 1. The second load writes a new value into the same reg.
1852 // - The renaming cannot affect any use of the reg after the second load.
1853 // - The second load always clobbers the value written by the first load,
1854 // which means the reg must be killed before the second load.
1855 // 2. The first load must be a def of the same reg, so we don't need to look
1856 // at anything before it.
1857static bool canRenameUntilSecondLoad(
1858 MachineInstr &FirstLoad, MachineInstr &SecondLoad,
1859 LiveRegUnits &UsedInBetween,
1860 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1861 const TargetRegisterInfo *TRI) {
1862 if (FirstLoad.isPseudo())
1863 return false;
1864
1865 UsedInBetween.accumulate(MI: FirstLoad);
1866 auto RegToRename = getLdStRegOp(MI&: FirstLoad).getReg();
1867 bool Success = std::all_of(
1868 first: FirstLoad.getIterator(), last: SecondLoad.getIterator(),
1869 pred: [&](MachineInstr &MI) {
1870 LLVM_DEBUG(dbgs() << "Checking " << MI);
1871 // Currently we do not try to rename across frame-setup instructions.
1872 if (MI.getFlag(Flag: MachineInstr::FrameSetup)) {
1873 LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1874 << "currently\n");
1875 return false;
1876 }
1877
1878 for (auto &MOP : MI.operands()) {
1879 if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1880 !TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
1881 continue;
1882 if (!canRenameMOP(MOP, TRI)) {
1883 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1884 return false;
1885 }
1886 RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1887 }
1888
1889 return true;
1890 });
1891 return Success;
1892}
1893
1894// Check if we can find a physical register for renaming \p Reg. This register
1895// must:
1896// * not be defined already in \p DefinedInBB; DefinedInBB must contain all
1897// defined registers up to the point where the renamed register will be used,
1898 // * not be used in \p UsedInBetween; UsedInBetween must contain all accessed
1899 // registers in the range in which the rename register will be used, and
1900 // * be available in all used register classes (checked using RequiredClasses).
1901static std::optional<MCPhysReg> tryToFindRegisterToRename(
1902 const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
1903 LiveRegUnits &UsedInBetween,
1904 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1905 const TargetRegisterInfo *TRI) {
1906 const MachineRegisterInfo &RegInfo = MF.getRegInfo();
1907
1908 // Checks if any sub- or super-register of PR is callee saved.
1909 auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
1910 return any_of(Range: TRI->sub_and_superregs_inclusive(Reg: PR),
1911 P: [&MF, TRI](MCPhysReg SubOrSuper) {
1912 return TRI->isCalleeSavedPhysReg(PhysReg: SubOrSuper, MF);
1913 });
1914 };
1915
1916 // Check if PR or one of its sub- or super-registers can be used for all
1917 // required register classes.
1918 auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
1919 return all_of(Range&: RequiredClasses, P: [PR, TRI](const TargetRegisterClass *C) {
1920 return any_of(
1921 Range: TRI->sub_and_superregs_inclusive(Reg: PR),
1922 P: [C](MCPhysReg SubOrSuper) { return C->contains(Reg: SubOrSuper); });
1923 });
1924 };
1925
1926 auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
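// Scan the candidate registers in Reg's minimal register class and pick the
// first one that is not yet defined in the block, not used in between, not
// reserved, not (even partially) callee-saved, and usable for all required
// register classes.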
1927 for (const MCPhysReg &PR : *RegClass) {
1928 if (DefinedInBB.available(Reg: PR) && UsedInBetween.available(Reg: PR) &&
1929 !RegInfo.isReserved(PhysReg: PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
1930 CanBeUsedForAllClasses(PR)) {
1931 DefinedInBB.addReg(Reg: PR);
1932 LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
1933 << "\n");
1934 return {PR};
1935 }
1936 }
1937 LLVM_DEBUG(dbgs() << "No rename register found from "
1938 << TRI->getRegClassName(RegClass) << "\n");
1939 return std::nullopt;
1940}
1941
1942// For store pairs: returns a register from FirstMI to the beginning of the
1943// block that can be renamed.
1944// For load pairs: returns a register from FirstMI to MI that can be renamed.
1945static std::optional<MCPhysReg> findRenameRegForSameLdStRegPair(
1946 std::optional<bool> MaybeCanRename, MachineInstr &FirstMI, MachineInstr &MI,
1947 Register Reg, LiveRegUnits &DefinedInBB, LiveRegUnits &UsedInBetween,
1948 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1949 const TargetRegisterInfo *TRI) {
1950 std::optional<MCPhysReg> RenameReg;
1951 if (!DebugCounter::shouldExecute(Counter&: RegRenamingCounter))
1952 return RenameReg;
1953
1954 auto *RegClass = TRI->getMinimalPhysRegClass(Reg: getLdStRegOp(MI&: FirstMI).getReg());
1955 MachineFunction &MF = *FirstMI.getParent()->getParent();
1956 if (!RegClass || !MF.getRegInfo().tracksLiveness())
1957 return RenameReg;
1958
1959 const bool IsLoad = FirstMI.mayLoad();
1960
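// For a load pair it is enough to be able to rename up to the second load,
// which redefines the register; for a store pair we have to be able to
// rename all the way back to the closest preceding def.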
1961 if (!MaybeCanRename) {
1962 if (IsLoad)
1963 MaybeCanRename = {canRenameUntilSecondLoad(FirstLoad&: FirstMI, SecondLoad&: MI, UsedInBetween,
1964 RequiredClasses, TRI)};
1965 else
1966 MaybeCanRename = {
1967 canRenameUpToDef(FirstMI, UsedInBetween, RequiredClasses, TRI)};
1968 }
1969
1970 if (*MaybeCanRename) {
1971 RenameReg = tryToFindRegisterToRename(MF, Reg, DefinedInBB, UsedInBetween,
1972 RequiredClasses, TRI);
1973 }
1974 return RenameReg;
1975}
1976
1977/// Scan the instructions looking for a load/store that can be combined with the
1978/// current instruction into a wider equivalent or a load/store pair.
1979MachineBasicBlock::iterator
1980AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
1981 LdStPairFlags &Flags, unsigned Limit,
1982 bool FindNarrowMerge) {
1983 MachineBasicBlock::iterator E = I->getParent()->end();
1984 MachineBasicBlock::iterator MBBI = I;
1985 MachineBasicBlock::iterator MBBIWithRenameReg;
1986 MachineInstr &FirstMI = *I;
1987 MBBI = next_nodbg(It: MBBI, End: E);
1988
1989 bool MayLoad = FirstMI.mayLoad();
1990 bool IsUnscaled = TII->hasUnscaledLdStOffset(MI&: FirstMI);
1991 Register Reg = getLdStRegOp(MI&: FirstMI).getReg();
1992 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: FirstMI).getReg();
1993 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: FirstMI).getImm();
1994 int OffsetStride = IsUnscaled ? TII->getMemScale(MI: FirstMI) : 1;
1995 bool IsPromotableZeroStore = isPromotableZeroStoreInst(MI&: FirstMI);
1996
1997 std::optional<bool> MaybeCanRename;
1998 if (!EnableRenaming)
1999 MaybeCanRename = {false};
2000
2001 SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
2002 LiveRegUnits UsedInBetween;
2003 UsedInBetween.init(TRI: *TRI);
2004
2005 Flags.clearRenameReg();
2006
2007 // Track which register units have been modified and used between the first
2008 // insn (inclusive) and the second insn.
2009 ModifiedRegUnits.clear();
2010 UsedRegUnits.clear();
2011
2012 // Remember any instructions that read/write memory between FirstMI and MI.
2013 SmallVector<MachineInstr *, 4> MemInsns;
2014
2015 LLVM_DEBUG(dbgs() << "Find match for: "; FirstMI.dump());
2016 for (unsigned Count = 0; MBBI != E && Count < Limit;
2017 MBBI = next_nodbg(It: MBBI, End: E)) {
2018 MachineInstr &MI = *MBBI;
2019 LLVM_DEBUG(dbgs() << "Analysing 2nd insn: "; MI.dump());
2020
2021 UsedInBetween.accumulate(MI);
2022
2023 // Don't count transient instructions towards the search limit since there
2024 // may be different numbers of them if e.g. debug information is present.
2025 if (!MI.isTransient())
2026 ++Count;
2027
2028 Flags.setSExtIdx(-1);
2029 if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
2030 AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
2031 assert(MI.mayLoadOrStore() && "Expected memory operation.");
2032 // If we've found another instruction with the same opcode, check to see
2033 // if the base and offset are compatible with our starting instruction.
2034 // These instructions all have scaled immediate operands, so we just
2035 // check for +1/-1. Make sure to check the new instruction offset is
2036 // actually an immediate and not a symbolic reference destined for
2037 // a relocation.
2038 Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
2039 int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
2040 bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
2041 if (IsUnscaled != MIIsUnscaled) {
2042 // We're trying to pair instructions that differ in how they are scaled.
2043 // If FirstMI is scaled then scale the offset of MI accordingly.
2044 // Otherwise, do the opposite (i.e., make MI's offset unscaled).
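// For example (illustrative), an LDRXui offset is counted in 8-byte units
// whereas the equivalent LDURXi offset is counted in bytes.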
2045 int MemSize = TII->getMemScale(MI);
2046 if (MIIsUnscaled) {
2047 // If the unscaled offset isn't a multiple of the MemSize, we can't
2048 // pair the operations together: bail and keep looking.
2049 if (MIOffset % MemSize) {
2050 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2051 UsedRegUnits, TRI);
2052 MemInsns.push_back(Elt: &MI);
2053 continue;
2054 }
2055 MIOffset /= MemSize;
2056 } else {
2057 MIOffset *= MemSize;
2058 }
2059 }
2060
2061 bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);
2062
2063 if (BaseReg == MIBaseReg) {
2064 // If the offset of the second ld/st is not equal to the size of the
2065 // destination register, it can't be paired with a pre-index ld/st
2066 // pair. Additionally, if the base reg is used or modified, the operations
2067 // can't be paired: bail and keep looking.
2068 if (IsPreLdSt) {
2069 bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
2070 bool IsBaseRegUsed = !UsedRegUnits.available(
2071 Reg: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
2072 bool IsBaseRegModified = !ModifiedRegUnits.available(
2073 Reg: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
2074 // If the stored value and the address of the second instruction are
2075 // the same register, it needs to be using the updated register and
2076 // therefore it must not be folded.
2077 bool IsMIRegTheSame =
2078 TRI->regsOverlap(RegA: getLdStRegOp(MI).getReg(),
2079 RegB: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
2080 if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
2081 IsMIRegTheSame) {
2082 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2083 UsedRegUnits, TRI);
2084 MemInsns.push_back(Elt: &MI);
2085 continue;
2086 }
2087 } else {
2088 if ((Offset != MIOffset + OffsetStride) &&
2089 (Offset + OffsetStride != MIOffset)) {
2090 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2091 UsedRegUnits, TRI);
2092 MemInsns.push_back(Elt: &MI);
2093 continue;
2094 }
2095 }
2096
2097 int MinOffset = Offset < MIOffset ? Offset : MIOffset;
2098 if (FindNarrowMerge) {
2099 // If the alignment requirements of the scaled wide load/store
2100 // instruction can't express the offset of the scaled narrow input,
2101 // bail and keep looking. For promotable zero stores, allow only when
2102 // the stored value is the same (i.e., WZR).
2103 if ((!IsUnscaled && alignTo(Num: MinOffset, PowOf2: 2) != MinOffset) ||
2104 (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
2105 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2106 UsedRegUnits, TRI);
2107 MemInsns.push_back(Elt: &MI);
2108 continue;
2109 }
2110 } else {
2111 // Pairwise instructions have a 7-bit signed offset field. Single
2112 // insns have a 12-bit unsigned offset field. If the resultant
2113 // immediate offset of merging these instructions is out of range for
2114 // a pairwise instruction, bail and keep looking.
2115 if (!inBoundsForPair(IsUnscaled, Offset: MinOffset, OffsetStride)) {
2116 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2117 UsedRegUnits, TRI);
2118 MemInsns.push_back(Elt: &MI);
2119 LLVM_DEBUG(dbgs() << "Offset doesn't fit in immediate, "
2120 << "keep looking.\n");
2121 continue;
2122 }
2123 // If the alignment requirements of the paired (scaled) instruction
2124 // can't express the offset of the unscaled input, bail and keep
2125 // looking.
2126 if (IsUnscaled && (alignTo(Num: MinOffset, PowOf2: OffsetStride) != MinOffset)) {
2127 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2128 UsedRegUnits, TRI);
2129 MemInsns.push_back(Elt: &MI);
2130 LLVM_DEBUG(dbgs()
2131 << "Offset doesn't fit due to alignment requirements, "
2132 << "keep looking.\n");
2133 continue;
2134 }
2135 }
2136
2137 // If the BaseReg has been modified, then we cannot do the optimization.
2138 // For example, in the following pattern
2139 // ldr x1, [x2]
2140 // ldr x2, [x3]
2141 // ldr x4, [x2, #8],
2142 // the first and third ldr cannot be converted to ldp x1, x4, [x2]
2143 if (!ModifiedRegUnits.available(Reg: BaseReg))
2144 return E;
2145
2146 const bool SameLoadReg = MayLoad && TRI->isSuperOrSubRegisterEq(
2147 RegA: Reg, RegB: getLdStRegOp(MI).getReg());
2148
2149 // If the Rt of the second instruction (destination register of the
2150 // load) was not modified or used between the two instructions and none
2151 // of the instructions between the second and first alias with the
2152 // second, we can combine the second into the first.
2153 bool RtNotModified =
2154 ModifiedRegUnits.available(Reg: getLdStRegOp(MI).getReg());
2155 bool RtNotUsed = !(MI.mayLoad() && !SameLoadReg &&
2156 !UsedRegUnits.available(Reg: getLdStRegOp(MI).getReg()));
2157
2158 LLVM_DEBUG(dbgs() << "Checking, can combine 2nd into 1st insn:\n"
2159 << "Reg '" << getLdStRegOp(MI) << "' not modified: "
2160 << (RtNotModified ? "true" : "false") << "\n"
2161 << "Reg '" << getLdStRegOp(MI) << "' not used: "
2162 << (RtNotUsed ? "true" : "false") << "\n");
2163
2164 if (RtNotModified && RtNotUsed && !mayAlias(MIa&: MI, MemInsns, AA)) {
2165 // For pairs loading into the same reg, try to find a renaming
2166 // opportunity to allow the renaming of Reg between FirstMI and MI
2167 // and combine MI into FirstMI; otherwise bail and keep looking.
2168 if (SameLoadReg) {
2169 std::optional<MCPhysReg> RenameReg =
2170 findRenameRegForSameLdStRegPair(MaybeCanRename, FirstMI, MI,
2171 Reg, DefinedInBB, UsedInBetween,
2172 RequiredClasses, TRI);
2173 if (!RenameReg) {
2174 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2175 UsedRegUnits, TRI);
2176 MemInsns.push_back(Elt: &MI);
2177 LLVM_DEBUG(dbgs() << "Can't find reg for renaming, "
2178 << "keep looking.\n");
2179 continue;
2180 }
2181 Flags.setRenameReg(*RenameReg);
2182 }
2183
2184 Flags.setMergeForward(false);
2185 if (!SameLoadReg)
2186 Flags.clearRenameReg();
2187 return MBBI;
2188 }
2189
2190 // Likewise, if the Rt of the first instruction is not modified or used
2191 // between the two instructions and none of the instructions between the
2192 // first and the second alias with the first, we can combine the first
2193 // into the second.
2194 RtNotModified = !(
2195 MayLoad && !UsedRegUnits.available(Reg: getLdStRegOp(MI&: FirstMI).getReg()));
2196
2197 LLVM_DEBUG(dbgs() << "Checking, can combine 1st into 2nd insn:\n"
2198 << "Reg '" << getLdStRegOp(FirstMI)
2199 << "' not modified: "
2200 << (RtNotModified ? "true" : "false") << "\n");
2201
2202 if (RtNotModified && !mayAlias(MIa&: FirstMI, MemInsns, AA)) {
2203 if (ModifiedRegUnits.available(Reg: getLdStRegOp(MI&: FirstMI).getReg())) {
2204 Flags.setMergeForward(true);
2205 Flags.clearRenameReg();
2206 return MBBI;
2207 }
2208
2209 std::optional<MCPhysReg> RenameReg = findRenameRegForSameLdStRegPair(
2210 MaybeCanRename, FirstMI, MI, Reg, DefinedInBB, UsedInBetween,
2211 RequiredClasses, TRI);
2212 if (RenameReg) {
2213 Flags.setMergeForward(true);
2214 Flags.setRenameReg(*RenameReg);
2215 MBBIWithRenameReg = MBBI;
2216 }
2217 }
2218 LLVM_DEBUG(dbgs() << "Unable to combine these instructions due to "
2219 << "interference in between, keep looking.\n");
2220 }
2221 }
2222
2223 if (Flags.getRenameReg())
2224 return MBBIWithRenameReg;
2225
2226 // The instruction wasn't a matching load or store. Stop searching if we
2227 // encounter a call instruction that might modify memory.
2228 if (MI.isCall()) {
2229 LLVM_DEBUG(dbgs() << "Found a call, stop looking.\n");
2230 return E;
2231 }
2232
2233 // Update modified / uses register units.
2234 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2235
2236 // Otherwise, if the base register is modified, we have no match, so
2237 // return early.
2238 if (!ModifiedRegUnits.available(Reg: BaseReg)) {
2239 LLVM_DEBUG(dbgs() << "Base reg is modified, stop looking.\n");
2240 return E;
2241 }
2242
2243 // Update list of instructions that read/write memory.
2244 if (MI.mayLoadOrStore())
2245 MemInsns.push_back(Elt: &MI);
2246 }
2247 return E;
2248}
2249
2250static MachineBasicBlock::iterator
2251maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
2252 assert((MI.getOpcode() == AArch64::SUBXri ||
2253 MI.getOpcode() == AArch64::ADDXri) &&
2254 "Expected a register update instruction");
2255 auto End = MI.getParent()->end();
2256 if (MaybeCFI == End ||
2257 MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
2258 !(MI.getFlag(Flag: MachineInstr::FrameSetup) ||
2259 MI.getFlag(Flag: MachineInstr::FrameDestroy)) ||
2260 MI.getOperand(i: 0).getReg() != AArch64::SP)
2261 return End;
2262
2263 const MachineFunction &MF = *MI.getParent()->getParent();
2264 unsigned CFIIndex = MaybeCFI->getOperand(i: 0).getCFIIndex();
2265 const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
2266 switch (CFI.getOperation()) {
2267 case MCCFIInstruction::OpDefCfa:
2268 case MCCFIInstruction::OpDefCfaOffset:
2269 return MaybeCFI;
2270 default:
2271 return End;
2272 }
2273}
2274
2275std::optional<MachineBasicBlock::iterator> AArch64LoadStoreOpt::mergeUpdateInsn(
2276 MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update,
2277 bool IsForward, bool IsPreIdx, bool MergeEither) {
2278 assert((Update->getOpcode() == AArch64::ADDXri ||
2279 Update->getOpcode() == AArch64::SUBXri) &&
2280 "Unexpected base register update instruction to merge!");
2281 MachineBasicBlock::iterator E = I->getParent()->end();
2282 MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
2283
2284 // If we are updating the SP and the following instruction is a CFA-offset-
2285 // related CFI, make sure the CFI follows the SP update, either by merging it
2286 // at the location of the update or by moving the CFI after the merged
2287 // instruction. If unable to do so, bail.
2288 MachineBasicBlock::iterator InsertPt = I;
2289 if (IsForward) {
2290 assert(IsPreIdx);
2291 if (auto CFI = maybeMoveCFI(MI&: *Update, MaybeCFI: next_nodbg(It: Update, End: E)); CFI != E) {
2292 if (MergeEither) {
2293 InsertPt = Update;
2294 } else {
2295 // Take care not to reorder CFIs.
2296 if (std::any_of(first: std::next(x: CFI), last: I, pred: [](const auto &Insn) {
2297 return Insn.getOpcode() == TargetOpcode::CFI_INSTRUCTION;
2298 }))
2299 return std::nullopt;
2300
2301 MachineBasicBlock *MBB = InsertPt->getParent();
2302 MBB->splice(Where: std::next(x: InsertPt), Other: MBB, From: CFI);
2303 }
2304 }
2305 }
2306
2307 // Return the instruction following the merged instruction, which is
2308 // the instruction following our unmerged load. Unless that's the add/sub
2309 // instruction we're merging, in which case it's the one after that.
2310 if (NextI == Update)
2311 NextI = next_nodbg(It: NextI, End: E);
2312
2313 int Value = Update->getOperand(i: 2).getImm();
2314 assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
2315 "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
2316 if (Update->getOpcode() == AArch64::SUBXri)
2317 Value = -Value;
2318
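// Rewrite the memory instruction into its pre- or post-indexed form, folding
// the (scaled) update amount into the writeback immediate. For example
// (illustrative), for a post-index merge:
//   ldr x0, [x20]
//   add x20, x20, #32
// becomes
//   ldr x0, [x20], #32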
2319 unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(Opc: I->getOpcode())
2320 : getPostIndexedOpcode(Opc: I->getOpcode());
2321 MachineInstrBuilder MIB;
2322 int Scale, MinOffset, MaxOffset;
2323 getPrePostIndexedMemOpInfo(MI: *I, Scale, MinOffset, MaxOffset);
2324 if (!AArch64InstrInfo::isPairedLdSt(MI: *I)) {
2325 // Non-paired instruction.
2326 MIB = BuildMI(BB&: *InsertPt->getParent(), I: InsertPt, MIMD: InsertPt->getDebugLoc(),
2327 MCID: TII->get(Opcode: NewOpc))
2328 .add(MO: Update->getOperand(i: 0))
2329 .add(MO: getLdStRegOp(MI&: *I))
2330 .add(MO: AArch64InstrInfo::getLdStBaseOp(MI: *I))
2331 .addImm(Val: Value / Scale)
2332 .setMemRefs(I->memoperands())
2333 .setMIFlags(I->mergeFlagsWith(Other: *Update));
2334 } else {
2335 // Paired instruction.
2336 MIB = BuildMI(BB&: *InsertPt->getParent(), I: InsertPt, MIMD: InsertPt->getDebugLoc(),
2337 MCID: TII->get(Opcode: NewOpc))
2338 .add(MO: Update->getOperand(i: 0))
2339 .add(MO: getLdStRegOp(MI&: *I, PairedRegOp: 0))
2340 .add(MO: getLdStRegOp(MI&: *I, PairedRegOp: 1))
2341 .add(MO: AArch64InstrInfo::getLdStBaseOp(MI: *I))
2342 .addImm(Val: Value / Scale)
2343 .setMemRefs(I->memoperands())
2344 .setMIFlags(I->mergeFlagsWith(Other: *Update));
2345 }
2346
2347 if (IsPreIdx) {
2348 ++NumPreFolded;
2349 LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
2350 } else {
2351 ++NumPostFolded;
2352 LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
2353 }
2354 LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
2355 LLVM_DEBUG(I->print(dbgs()));
2356 LLVM_DEBUG(dbgs() << " ");
2357 LLVM_DEBUG(Update->print(dbgs()));
2358 LLVM_DEBUG(dbgs() << " with instruction:\n ");
2359 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
2360 LLVM_DEBUG(dbgs() << "\n");
2361
2362 // Erase the old instructions for the block.
2363 I->eraseFromParent();
2364 Update->eraseFromParent();
2365
2366 return NextI;
2367}
2368
2369MachineBasicBlock::iterator
2370AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
2371 MachineBasicBlock::iterator Update,
2372 unsigned Offset, int Scale) {
2373 assert((Update->getOpcode() == AArch64::MOVKWi) &&
2374 "Unexpected const mov instruction to merge!");
2375 MachineBasicBlock::iterator E = I->getParent()->end();
2376 MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
2377 MachineBasicBlock::iterator PrevI = prev_nodbg(It: Update, Begin: E);
2378 MachineInstr &MemMI = *I;
2379 unsigned Mask = (1 << 12) * Scale - 1;
2380 unsigned Low = Offset & Mask;
2381 unsigned High = Offset - Low;
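// Split the constant offset: High is folded into an ADD with a shifted imm12
// (AddMIB below) and Low becomes the scaled imm12 of the rewritten load/store
// (MemMIB below).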
2382 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
2383 Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getReg();
2384 MachineInstrBuilder AddMIB, MemMIB;
2385
2386 // Add IndexReg, BaseReg, High (the BaseReg may be SP)
2387 AddMIB =
2388 BuildMI(BB&: *I->getParent(), I, MIMD: I->getDebugLoc(), MCID: TII->get(Opcode: AArch64::ADDXri))
2389 .addDef(RegNo: IndexReg)
2390 .addUse(RegNo: BaseReg)
2391 .addImm(Val: High >> 12) // shifted value
2392 .addImm(Val: 12); // shift 12
2393 (void)AddMIB;
2394 // Ld/St DestReg, IndexReg, Imm12
2395 unsigned NewOpc = getBaseAddressOpcode(Opc: I->getOpcode());
2396 MemMIB = BuildMI(BB&: *I->getParent(), I, MIMD: I->getDebugLoc(), MCID: TII->get(Opcode: NewOpc))
2397 .add(MO: getLdStRegOp(MI&: MemMI))
2398 .add(MO: AArch64InstrInfo::getLdStOffsetOp(MI: MemMI))
2399 .addImm(Val: Low / Scale)
2400 .setMemRefs(I->memoperands())
2401 .setMIFlags(I->mergeFlagsWith(Other: *Update));
2402 (void)MemMIB;
2403
2404 ++NumConstOffsetFolded;
2405 LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
2406 LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
2407 LLVM_DEBUG(PrevI->print(dbgs()));
2408 LLVM_DEBUG(dbgs() << " ");
2409 LLVM_DEBUG(Update->print(dbgs()));
2410 LLVM_DEBUG(dbgs() << " ");
2411 LLVM_DEBUG(I->print(dbgs()));
2412 LLVM_DEBUG(dbgs() << " with instruction:\n ");
2413 LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs()));
2414 LLVM_DEBUG(dbgs() << " ");
2415 LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs()));
2416 LLVM_DEBUG(dbgs() << "\n");
2417
2418 // Erase the old instructions for the block.
2419 I->eraseFromParent();
2420 PrevI->eraseFromParent();
2421 Update->eraseFromParent();
2422
2423 return NextI;
2424}
2425
2426bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
2427 MachineInstr &MI,
2428 unsigned BaseReg, int Offset) {
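// Check whether MI is an ADDXri/SUBXri of BaseReg whose immediate can be
// folded into MemMI as a pre-/post-index writeback: it must use BaseReg as
// both source and destination, have a plain immediate that is a multiple of
// the memory scale and fits the writeback range, and (for a non-zero Offset)
// match the expected update amount.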
2429 switch (MI.getOpcode()) {
2430 default:
2431 break;
2432 case AArch64::SUBXri:
2433 case AArch64::ADDXri:
2434 // Make sure it's a vanilla immediate operand, not a relocation or
2435 // anything else we can't handle.
2436 if (!MI.getOperand(i: 2).isImm())
2437 break;
2438 // Watch out for 1 << 12 shifted value.
2439 if (AArch64_AM::getShiftValue(Imm: MI.getOperand(i: 3).getImm()))
2440 break;
2441
2442 // The update instruction source and destination register must be the
2443 // same as the load/store base register.
2444 if (MI.getOperand(i: 0).getReg() != BaseReg ||
2445 MI.getOperand(i: 1).getReg() != BaseReg)
2446 break;
2447
2448 int UpdateOffset = MI.getOperand(i: 2).getImm();
2449 if (MI.getOpcode() == AArch64::SUBXri)
2450 UpdateOffset = -UpdateOffset;
2451
2452 // The immediate must be a multiple of the scaling factor of the pre/post
2453 // indexed instruction.
2454 int Scale, MinOffset, MaxOffset;
2455 getPrePostIndexedMemOpInfo(MI: MemMI, Scale, MinOffset, MaxOffset);
2456 if (UpdateOffset % Scale != 0)
2457 break;
2458
2459 // Scaled offset must fit in the instruction immediate.
2460 int ScaledOffset = UpdateOffset / Scale;
2461 if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
2462 break;
2463
2464 // If we have a non-zero Offset, we check that it matches the amount
2465 // we're adding to the register.
2466 if (!Offset || Offset == UpdateOffset)
2467 return true;
2468 break;
2469 }
2470 return false;
2471}
2472
2473bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
2474 MachineInstr &MI,
2475 unsigned IndexReg,
2476 unsigned &Offset) {
2477 // The update instruction source and destination register must be the
2478 // same as the load/store index register.
2479 if (MI.getOpcode() == AArch64::MOVKWi &&
2480 TRI->isSuperOrSubRegisterEq(RegA: IndexReg, RegB: MI.getOperand(i: 1).getReg())) {
2481
2482 // A movz + movk pair holds a large offset for a Ld/St instruction.
2483 MachineBasicBlock::iterator B = MI.getParent()->begin();
2484 MachineBasicBlock::iterator MBBI = &MI;
2485 // Bail out if MI is the first instruction of the block.
2486 if (MBBI == B)
2487 return false;
2488 MBBI = prev_nodbg(It: MBBI, Begin: B);
2489 MachineInstr &MovzMI = *MBBI;
2490 // Make sure the MOVKWi and MOVZWi set the same register.
2491 if (MovzMI.getOpcode() == AArch64::MOVZWi &&
2492 MovzMI.getOperand(i: 0).getReg() == MI.getOperand(i: 0).getReg()) {
2493 unsigned Low = MovzMI.getOperand(i: 1).getImm();
2494 unsigned High = MI.getOperand(i: 2).getImm() << MI.getOperand(i: 3).getImm();
2495 Offset = High + Low;
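// The combined offset is later split into an ADD with a shifted imm12 plus a
// load/store imm12 (see mergeConstOffsetInsn), so it must fit in 24 bits.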
2496 // 12-bit optionally shifted immediates are legal for adds.
2497 return Offset >> 24 == 0;
2498 }
2499 }
2500 return false;
2501}
2502
2503MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
2504 MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
2505 MachineBasicBlock::iterator E = I->getParent()->end();
2506 MachineInstr &MemMI = *I;
2507 MachineBasicBlock::iterator MBBI = I;
2508
2509 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
2510 int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getImm() *
2511 TII->getMemScale(MI: MemMI);
2512
2513 // Scan forward looking for post-index opportunities. Updating instructions
2514 // can't be formed if the memory instruction doesn't have the offset we're
2515 // looking for.
2516 if (MIUnscaledOffset != UnscaledOffset)
2517 return E;
2518
2519 // If the base register overlaps a source/destination register, we can't
2520 // merge the update. This does not apply to tag store instructions which
2521 // ignore the address part of the source register.
2522 // Nor does it apply to STGPi, which, unlike normal stores, does not have
2523 // unpredictable behavior in this case and always performs the writeback
2524 // after reading the source register value.
2525 if (!isTagStore(MI: MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
2526 bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MI: MemMI);
2527 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
2528 Register DestReg = getLdStRegOp(MI&: MemMI, PairedRegOp: i).getReg();
2529 if (DestReg == BaseReg || TRI->isSubRegister(RegA: BaseReg, RegB: DestReg))
2530 return E;
2531 }
2532 }
2533
2534 // Track which register units have been modified and used between the first
2535 // insn (inclusive) and the second insn.
2536 ModifiedRegUnits.clear();
2537 UsedRegUnits.clear();
2538 MBBI = next_nodbg(It: MBBI, End: E);
2539
2540 // We can't post-increment the stack pointer if any instruction between
2541 // the memory access (I) and the increment (MBBI) can access the memory
2542 // region defined by [SP, MBBI].
2543 const bool BaseRegSP = BaseReg == AArch64::SP;
2544 if (BaseRegSP && needsWinCFI(MF: I->getMF())) {
2545 // FIXME: For now, we always block the optimization over SP on Windows
2546 // targets, as it requires adjusting the unwind/debug info; messing up
2547 // the unwind info can actually cause a miscompile.
2548 return E;
2549 }
2550
2551 unsigned Count = 0;
2552 MachineBasicBlock *CurMBB = I->getParent();
2553 // The choice of the next block to visit is based on live-ins.
2554 bool VisitSucc = CurMBB->getParent()->getRegInfo().tracksLiveness();
2555
2556 while (true) {
2557 for (MachineBasicBlock::iterator CurEnd = CurMBB->end();
2558 MBBI != CurEnd && Count < Limit; MBBI = next_nodbg(It: MBBI, End: CurEnd)) {
2559 MachineInstr &MI = *MBBI;
2560
2561 // Don't count transient instructions towards the search limit since there
2562 // may be different numbers of them if e.g. debug information is present.
2563 if (!MI.isTransient())
2564 ++Count;
2565
2566 // If we found a match, return it.
2567 if (isMatchingUpdateInsn(MemMI&: *I, MI, BaseReg, Offset: UnscaledOffset))
2568 return MBBI;
2569
2570 // Update the status of what the instruction clobbered and used.
2571 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
2572 TRI);
2573
2574 // Otherwise, if the base register is used or modified, we have no match,
2575 // so return early. If we are optimizing SP, do not allow instructions
2576 // that may load or store in between the load and the optimized value
2577 // update.
2578 if (!ModifiedRegUnits.available(Reg: BaseReg) ||
2579 !UsedRegUnits.available(Reg: BaseReg) ||
2580 (BaseRegSP && MBBI->mayLoadOrStore()))
2581 return E;
2582 }
2583
2584 if (!VisitSucc || Limit <= Count)
2585 break;
2586
2587 // Try to walk down to a successor along a control-flow path without side
2588 // entries, such that BaseReg is live along it but not at its exits.
2589 MachineBasicBlock *SuccToVisit = nullptr;
2590 unsigned LiveSuccCount = 0;
2591 for (MachineBasicBlock *Succ : CurMBB->successors()) {
2592 for (MCRegAliasIterator AI(BaseReg, TRI, true); AI.isValid(); ++AI) {
2593 if (Succ->isLiveIn(Reg: *AI)) {
2594 if (LiveSuccCount++)
2595 return E;
2596 if (Succ->pred_size() == 1)
2597 SuccToVisit = Succ;
2598 break;
2599 }
2600 }
2601 }
2602 if (!SuccToVisit)
2603 break;
2604 CurMBB = SuccToVisit;
2605 MBBI = CurMBB->begin();
2606 }
2607
2608 return E;
2609}
2610
2611MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
2612 MachineBasicBlock::iterator I, unsigned Limit, bool &MergeEither) {
2613 MachineBasicBlock::iterator B = I->getParent()->begin();
2614 MachineBasicBlock::iterator E = I->getParent()->end();
2615 MachineInstr &MemMI = *I;
2616 MachineBasicBlock::iterator MBBI = I;
2617 MachineFunction &MF = *MemMI.getMF();
2618
2619 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
2620 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getImm();
2621
2622 bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MI: MemMI);
2623 Register DestReg[] = {getLdStRegOp(MI&: MemMI, PairedRegOp: 0).getReg(),
2624 IsPairedInsn ? getLdStRegOp(MI&: MemMI, PairedRegOp: 1).getReg()
2625 : AArch64::NoRegister};
2626
2627 // If the load/store is the first instruction in the block, there's obviously
2628 // not any matching update. Ditto if the memory offset isn't zero.
2629 if (MBBI == B || Offset != 0)
2630 return E;
2631 // If the base register overlaps a destination register, we can't
2632 // merge the update.
2633 if (!isTagStore(MI: MemMI)) {
2634 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i)
2635 if (DestReg[i] == BaseReg || TRI->isSubRegister(RegA: BaseReg, RegB: DestReg[i]))
2636 return E;
2637 }
2638
2639 const bool BaseRegSP = BaseReg == AArch64::SP;
2640 if (BaseRegSP && needsWinCFI(MF: I->getMF())) {
2641 // FIXME: For now, we always block the optimization over SP on Windows
2642 // targets, as it requires adjusting the unwind/debug info; messing up
2643 // the unwind info can actually cause a miscompile.
2644 return E;
2645 }
2646
2647 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2648 unsigned RedZoneSize =
2649 Subtarget.getTargetLowering()->getRedZoneSize(F: MF.getFunction());
2650
2651 // Track which register units have been modified and used between the first
2652 // insn (inclusive) and the second insn.
2653 ModifiedRegUnits.clear();
2654 UsedRegUnits.clear();
2655 unsigned Count = 0;
2656 bool MemAccessBeforeSPPreInc = false;
2657 MergeEither = true;
2658 do {
2659 MBBI = prev_nodbg(It: MBBI, Begin: B);
2660 MachineInstr &MI = *MBBI;
2661
2662 // Don't count transient instructions towards the search limit since there
2663 // may be different numbers of them if e.g. debug information is present.
2664 if (!MI.isTransient())
2665 ++Count;
2666
2667 // If we found a match, return it.
2668 if (isMatchingUpdateInsn(MemMI&: *I, MI, BaseReg, Offset)) {
2669 // Check that the update value is within our red zone limit (which may be
2670 // zero).
2671 if (MemAccessBeforeSPPreInc && MBBI->getOperand(i: 2).getImm() > RedZoneSize)
2672 return E;
2673 return MBBI;
2674 }
2675
2676 // Update the status of what the instruction clobbered and used.
2677 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2678
2679 // Otherwise, if the base register is used or modified, we have no match, so
2680 // return early.
2681 if (!ModifiedRegUnits.available(Reg: BaseReg) ||
2682 !UsedRegUnits.available(Reg: BaseReg))
2683 return E;
2684
2685 // If we have a destination register (i.e. a load instruction) and a
2686 // destination register is used or modified, then we can only merge forward,
2687 // i.e. the combined instruction is put in the place of the memory
2688 // instruction. The same applies if we see a memory access or side effects.
2689 if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects() ||
2690 (DestReg[0] != AArch64::NoRegister &&
2691 !(ModifiedRegUnits.available(Reg: DestReg[0]) &&
2692 UsedRegUnits.available(Reg: DestReg[0]))) ||
2693 (DestReg[1] != AArch64::NoRegister &&
2694 !(ModifiedRegUnits.available(Reg: DestReg[1]) &&
2695 UsedRegUnits.available(Reg: DestReg[1]))))
2696 MergeEither = false;
2697
2698 // Keep track of whether we have a memory access before an SP pre-increment;
2699 // in this case we need to validate later that the update amount respects
2700 // the red zone.
2701 if (BaseRegSP && MBBI->mayLoadOrStore())
2702 MemAccessBeforeSPPreInc = true;
2703 } while (MBBI != B && Count < Limit);
2704 return E;
2705}
2706
2707MachineBasicBlock::iterator
2708AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
2709 MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
2710 MachineBasicBlock::iterator B = I->getParent()->begin();
2711 MachineBasicBlock::iterator E = I->getParent()->end();
2712 MachineInstr &MemMI = *I;
2713 MachineBasicBlock::iterator MBBI = I;
2714
2715 // If the load/store is the first instruction in the block, there's obviously
2716 // no matching constant-offset update to be found.
2717 if (MBBI == B)
2718 return E;
2719
2720 // Make sure the IndexReg is killed and the shift amount is zero.
2721 // TODO: Relax this restriction (e.g. to handle extends); kept simple for now.
2722 if (!AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).isKill() ||
2723 !AArch64InstrInfo::getLdStAmountOp(MI: MemMI).isImm() ||
2724 (AArch64InstrInfo::getLdStAmountOp(MI: MemMI).getImm() != 0))
2725 return E;
2726
2727 Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getReg();
2728
2729 // Track which register units have been modified and used between the first
2730 // insn (inclusive) and the second insn.
2731 ModifiedRegUnits.clear();
2732 UsedRegUnits.clear();
2733 unsigned Count = 0;
2734 do {
2735 MBBI = prev_nodbg(It: MBBI, Begin: B);
2736 MachineInstr &MI = *MBBI;
2737
2738 // Don't count transient instructions towards the search limit since there
2739 // may be different numbers of them if e.g. debug information is present.
2740 if (!MI.isTransient())
2741 ++Count;
2742
2743 // If we found a match, return it.
2744 if (isMatchingMovConstInsn(MemMI&: *I, MI, IndexReg, Offset)) {
2745 return MBBI;
2746 }
2747
2748 // Update the status of what the instruction clobbered and used.
2749 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2750
2751 // Otherwise, if the index register is used or modified, we have no match,
2752 // so return early.
2753 if (!ModifiedRegUnits.available(Reg: IndexReg) ||
2754 !UsedRegUnits.available(Reg: IndexReg))
2755 return E;
2756
2757 } while (MBBI != B && Count < Limit);
2758 return E;
2759}
2760
2761bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
2762 MachineBasicBlock::iterator &MBBI) {
2763 MachineInstr &MI = *MBBI;
2764 // If this is a volatile load, don't mess with it.
2765 if (MI.hasOrderedMemoryRef())
2766 return false;
2767
2768 if (needsWinCFI(MF: MI.getMF()) && MI.getFlag(Flag: MachineInstr::FrameDestroy))
2769 return false;
2770
2771 // Make sure this is a reg+imm.
2772 // FIXME: It is possible to extend it to handle reg+reg cases.
2773 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
2774 return false;
2775
2776 // Look backward up to LdStLimit instructions.
2777 MachineBasicBlock::iterator StoreI;
2778 if (findMatchingStore(I: MBBI, Limit: LdStLimit, StoreI)) {
2779 ++NumLoadsFromStoresPromoted;
2780 // Promote the load. Keeping the iterator straight is a
2781 // pain, so we let the merge routine tell us what the next instruction
2782 // is after it's done mucking about.
2783 MBBI = promoteLoadFromStore(LoadI: MBBI, StoreI);
2784 return true;
2785 }
2786 return false;
2787}
2788
2789// Merge adjacent zero stores into a wider store.
2790bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
2791 MachineBasicBlock::iterator &MBBI) {
2792 assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
2793 MachineInstr &MI = *MBBI;
2794 MachineBasicBlock::iterator E = MI.getParent()->end();
2795
2796 if (!TII->isCandidateToMergeOrPair(MI))
2797 return false;
2798
2799 // Look ahead up to LdStLimit instructions for a mergeable instruction.
2800 LdStPairFlags Flags;
2801 MachineBasicBlock::iterator MergeMI =
2802 findMatchingInsn(I: MBBI, Flags, Limit: LdStLimit, /* FindNarrowMerge = */ true);
2803 if (MergeMI != E) {
2804 ++NumZeroStoresPromoted;
2805
2806 // Keeping the iterator straight is a pain, so we let the merge routine tell
2807 // us what the next instruction is after it's done mucking about.
2808 MBBI = mergeNarrowZeroStores(I: MBBI, MergeMI, Flags);
2809 return true;
2810 }
2811 return false;
2812}
2813
2814// Find loads and stores that can be merged into a single load or store pair
2815// instruction.
2816bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
2817 MachineInstr &MI = *MBBI;
2818 MachineBasicBlock::iterator E = MI.getParent()->end();
2819
2820 if (!TII->isCandidateToMergeOrPair(MI))
2821 return false;
2822
2823 // If the disable-ldp feature is set, do not emit ldp.
2824 if (MI.mayLoad() && Subtarget->hasDisableLdp())
2825 return false;
2826
2827 // If the disable-stp feature is set, do not emit stp.
2828 if (MI.mayStore() && Subtarget->hasDisableStp())
2829 return false;
2830
2831 // Early exit if the offset is not possible to match. (6 bits of positive
2832 // range, plus allow an extra one in case we find a later insn that matches
2833 // with Offset-1)
2834 bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
2835 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
2836 int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
2837 // Allow one more for offset.
2838 if (Offset > 0)
2839 Offset -= OffsetStride;
2840 if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
2841 return false;
2842
2843 // Look ahead up to LdStLimit instructions for a pairable instruction.
2844 LdStPairFlags Flags;
2845 MachineBasicBlock::iterator Paired =
2846 findMatchingInsn(I: MBBI, Flags, Limit: LdStLimit, /* FindNarrowMerge = */ false);
2847 if (Paired != E) {
2848 // Keeping the iterator straight is a pain, so we let the merge routine tell
2849 // us what the next instruction is after it's done mucking about.
2850 auto Prev = std::prev(x: MBBI);
2851
2852 // Fetch the memoperand of the load/store that is a candidate for
2853 // combination.
2854 MachineMemOperand *MemOp =
2855 MI.memoperands_empty() ? nullptr : MI.memoperands().front();
2856
2857 // If a load/store arrives and the ldp/stp-aligned-only feature is set, check
2858 // that the alignment of the source pointer is at least double the alignment
2859 // of the type.
2860 if ((MI.mayLoad() && Subtarget->hasLdpAlignedOnly()) ||
2861 (MI.mayStore() && Subtarget->hasStpAlignedOnly())) {
2862 // If there is no size/align information, cancel the transformation.
2863 if (!MemOp || !MemOp->getMemoryType().isValid()) {
2864 NumFailedAlignmentCheck++;
2865 return false;
2866 }
2867
2868 // Get the needed alignments to check them if the
2869 // ldp-aligned-only/stp-aligned-only features are set.
2870 uint64_t MemAlignment = MemOp->getAlign().value();
2871 uint64_t TypeAlignment =
2872 Align(MemOp->getSize().getValue().getKnownMinValue()).value();
2873
2874 if (MemAlignment < 2 * TypeAlignment) {
2875 NumFailedAlignmentCheck++;
2876 return false;
2877 }
2878 }
2879
2880 ++NumPairCreated;
2881 if (TII->hasUnscaledLdStOffset(MI))
2882 ++NumUnscaledPairCreated;
2883
2884 MBBI = mergePairedInsns(I: MBBI, Paired, Flags);
2885 // Collect liveness info for instructions between Prev and the new position
2886 // MBBI.
2887 for (auto I = std::next(x: Prev); I != MBBI; I++)
2888 updateDefinedRegisters(MI&: *I, Units&: DefinedInBB, TRI);
2889
2890 return true;
2891 }
2892 return false;
2893}
2894
2895 bool AArch64LoadStoreOpt::tryToMergeLdStUpdate(
2896 MachineBasicBlock::iterator &MBBI) {
2897 MachineInstr &MI = *MBBI;
2898 MachineBasicBlock::iterator E = MI.getParent()->end();
2899 MachineBasicBlock::iterator Update;
2900
2901 // Look forward to try to form a post-index instruction. For example,
2902 // ldr x0, [x20]
2903 // add x20, x20, #32
2904 // merged into:
2905 // ldr x0, [x20], #32
2906 Update = findMatchingUpdateInsnForward(I: MBBI, UnscaledOffset: 0, Limit: UpdateLimit);
2907 if (Update != E) {
2908 // Merge the update into the ld/st.
2909 if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /*IsForward=*/false,
2910 /*IsPreIdx=*/false,
2911 /*MergeEither=*/false)) {
2912 MBBI = *NextI;
2913 return true;
2914 }
2915 }
2916
2917 // Don't know how to handle unscaled pre/post-index versions below, so bail.
2918 if (TII->hasUnscaledLdStOffset(Opc: MI.getOpcode()))
2919 return false;
2920
2921 // Look back to try to find a pre-index instruction. For example,
2922 // add x0, x0, #8
2923 // ldr x1, [x0]
2924 // merged into:
2925 // ldr x1, [x0, #8]!
2926 bool MergeEither;
2927 Update = findMatchingUpdateInsnBackward(I: MBBI, Limit: UpdateLimit, MergeEither);
2928 if (Update != E) {
2929 // Merge the update into the ld/st.
2930 if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /*IsForward=*/true,
2931 /*IsPreIdx=*/true, MergeEither)) {
2932 MBBI = *NextI;
2933 return true;
2934 }
2935 }
2936
2937 // The immediate in the load/store is scaled by the size of the memory
2938 // operation. The immediate in the add we're looking for,
2939 // however, is not, so adjust here.
2940 int UnscaledOffset =
2941 AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);

  // Look forward to try to find a pre-index instruction. For example,
  // ldr x1, [x0, #64]
  // add x0, x0, #64
  // merged into:
  // ldr x1, [x0, #64]!
  Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    if (auto NextI = mergeUpdateInsn(MBBI, Update, /*IsForward=*/false,
                                     /*IsPreIdx=*/true,
                                     /*MergeEither=*/false)) {
      MBBI = *NextI;
      return true;
    }
  }

  return false;
}

bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
                                              int Scale) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();
  MachineBasicBlock::iterator Update;

  // Don't know how to handle unscaled pre/post-index versions below, so bail.
  if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
    return false;

  // Look back to try to find a const offset for index LdSt instruction. For
  // example,
  // mov x8, #LargeImm ; = a * (1<<12) + imm12
  // ldr x1, [x0, x8]
  // merged into:
  // add x8, x0, a * (1<<12)
  // ldr x1, [x8, imm12]
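  // As an illustration (made-up values): "mov x8, #0x101008; ldr x1, [x0, x8]"
  // could become "add x8, x0, #0x101, lsl #12; ldr x1, [x8, #8]", provided
  // the low imm12 part (8 here) is a multiple of the access size.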
  unsigned Offset;
  Update = findMatchingConstOffsetBackward(MBBI, LdStConstLimit, Offset);
  if (Update != E && (Offset & (Scale - 1)) == 0) {
    // Merge the imm12 into the ld/st.
    MBBI = mergeConstOffsetInsn(MBBI, Update, Offset, Scale);
    return true;
  }

  return false;
}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool EnableNarrowZeroStOpt) {
  AArch64FunctionInfo &AFI = *MBB.getParent()->getInfo<AArch64FunctionInfo>();

  bool Modified = false;
  // Five transformations to do here:
  // 1) Find loads that directly read from stores and promote them by
  //    replacing with mov instructions. If the store is wider than the load,
  //    the load will be replaced with a bitfield extract.
  //      e.g.,
  //        str w1, [x0, #4]
  //        ldrh w2, [x0, #6]
  //        ; becomes
  //        str w1, [x0, #4]
  //        lsr w2, w1, #16
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 2) Merge adjacent zero stores into a wider store.
  //      e.g.,
  //        strh wzr, [x0]
  //        strh wzr, [x0, #2]
  //        ; becomes
  //        str wzr, [x0]
  //      e.g.,
  //        str wzr, [x0]
  //        str wzr, [x0, #4]
  //        ; becomes
  //        str xzr, [x0]
  if (EnableNarrowZeroStOpt)
    for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
         MBBI != E;) {
      if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
        Modified = true;
      else
        ++MBBI;
    }
  // 3) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //    When the SVE vector length is known to be 128 bits, also try to
  //    combine SVE fill/spill instructions into LDP/STP.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]
  //      e.g.,
  //        ldr z0, [x2]
  //        ldr z1, [x2, #1, mul vl]
  //        ; becomes
  //        ldp q0, q1, [x2]

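  // Start the tracker from the block's live-ins so that registers which are
  // live on entry count as defined before the first instruction.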
  if (MBB.getParent()->getRegInfo().tracksLiveness()) {
    DefinedInBB.clear();
    DefinedInBB.addLiveIns(MBB);
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    // Track currently live registers up to this point, to help with
    // searching for a rename register on demand.
    updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
    if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 4) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isMergeableLdStUpdate(*MBBI, AFI) && tryToMergeLdStUpdate(MBBI))
      Modified = true;
    else
      ++MBBI;
  }

  // 5) Find a register assigned a constant value that can be folded into
  //    the load or store.
  //      e.g.,
  //        mov x8, #LargeImm ; = a * (1<<12) + imm12
  //        ldr x1, [x0, x8]
  //        ; becomes
  //        add x8, x0, a * (1<<12)
  //        ldr x1, [x8, imm12]
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    int Scale;
    if (isMergeableIndexLdSt(*MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale))
      Modified = true;
    else
      ++MBBI;
  }

  return Modified;
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  if (skipFunction(Fn.getFunction()))
    return false;

  Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
  TII = Subtarget->getInstrInfo();
  TRI = Subtarget->getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  // Resize the modified and used register unit trackers. We do this once per
  // function and then clear the register units each time we optimize a load
  // or store.
  ModifiedRegUnits.init(*TRI);
  UsedRegUnits.init(*TRI);
  DefinedInBB.init(*TRI);

  bool Modified = false;
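  // Widening narrow zero stores can create a store that is under-aligned for
  // its new size, so only enable that transformation when unaligned accesses
  // are permitted (i.e. the target does not require strict alignment).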
  bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
  for (auto &MBB : Fn) {
    auto M = optimizeBlock(MBB, enableNarrowZeroStOpt);
    Modified |= M;
  }

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads
// and stores near one another? Note: The pre-RA instruction scheduler already
// has hooks to try and schedule pairable loads/stores together to improve
// pairing opportunities. Thus, a pre-RA pairing pass may not be worth the
// effort.

// FIXME: When pairing store instructions it's very possible for this pass to
// hoist a store with a KILL marker above another use (without a KILL marker).
// The resulting IR is invalid, but nothing uses the KILL markers after this
// pass, so it has never caused a problem in practice.

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}