1//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains a pass that performs load / store related peephole
10// optimizations. This pass should be run after register allocation.
11//
// The pass runs after the PrologEpilogInserter where we emit the CFI
// instructions. In order to preserve the correctness of the unwind
// information, the pass must either not change the relative order of any two
// instructions where one of them has the FrameSetup/FrameDestroy flag, or
// apply an ad-hoc fix to the unwind information.
17//
18//===----------------------------------------------------------------------===//
19
20#include "AArch64InstrInfo.h"
21#include "AArch64MachineFunctionInfo.h"
22#include "AArch64Subtarget.h"
23#include "MCTargetDesc/AArch64AddressingModes.h"
24#include "llvm/ADT/SmallVector.h"
25#include "llvm/ADT/Statistic.h"
26#include "llvm/ADT/StringRef.h"
27#include "llvm/ADT/iterator_range.h"
28#include "llvm/Analysis/AliasAnalysis.h"
29#include "llvm/CodeGen/MachineBasicBlock.h"
30#include "llvm/CodeGen/MachineFunction.h"
31#include "llvm/CodeGen/MachineFunctionPass.h"
32#include "llvm/CodeGen/MachineInstr.h"
33#include "llvm/CodeGen/MachineInstrBuilder.h"
34#include "llvm/CodeGen/MachineOperand.h"
35#include "llvm/CodeGen/MachineRegisterInfo.h"
36#include "llvm/CodeGen/TargetRegisterInfo.h"
37#include "llvm/IR/DebugLoc.h"
38#include "llvm/MC/MCAsmInfo.h"
39#include "llvm/MC/MCDwarf.h"
40#include "llvm/Pass.h"
41#include "llvm/Support/CommandLine.h"
42#include "llvm/Support/Debug.h"
43#include "llvm/Support/DebugCounter.h"
44#include "llvm/Support/ErrorHandling.h"
45#include "llvm/Support/raw_ostream.h"
46#include <cassert>
47#include <cstdint>
48#include <functional>
49#include <iterator>
50#include <limits>
51#include <optional>
52
53using namespace llvm;
54
55#define DEBUG_TYPE "aarch64-ldst-opt"
56
57STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
58STATISTIC(NumPostFolded, "Number of post-index updates folded");
59STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store pairs generated from unscaled load/stores");
62STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
63STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
STATISTIC(NumFailedAlignmentCheck,
          "Number of load/store pair transformations that failed the "
          "alignment check");
66STATISTIC(NumConstOffsetFolded,
67 "Number of const offset of index address folded");
68
69DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
70 "Controls which pairs are considered for renaming");
71
// The LdStLimit limits how far we search for load/store pairs.
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

// The UpdateLimit limits how far we search for update instructions when we
// form pre-/post-index instructions.
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                     cl::Hidden);

// The LdStConstLimit limits how far we search for const offset instructions
// when we form index address load/store instructions.
static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
                                        cl::init(10), cl::Hidden);

// Enable register renaming to find additional store pairing opportunities.
static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
                                    cl::init(true), cl::Hidden);
89
90#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
91
92namespace {
93
94using LdStPairFlags = struct LdStPairFlags {
95 // If a matching instruction is found, MergeForward is set to true if the
96 // merge is to remove the first instruction and replace the second with
97 // a pair-wise insn, and false if the reverse is true.
98 bool MergeForward = false;
99
100 // SExtIdx gives the index of the result of the load pair that must be
101 // extended. The value of SExtIdx assumes that the paired load produces the
102 // value in this order: (I, returned iterator), i.e., -1 means no value has
103 // to be extended, 0 means I, and 1 means the returned iterator.
104 int SExtIdx = -1;
105
106 // If not none, RenameReg can be used to rename the result register of the
107 // first store in a pair. Currently this only works when merging stores
108 // forward.
109 std::optional<MCPhysReg> RenameReg;
110
111 LdStPairFlags() = default;
112
113 void setMergeForward(bool V = true) { MergeForward = V; }
114 bool getMergeForward() const { return MergeForward; }
115
116 void setSExtIdx(int V) { SExtIdx = V; }
117 int getSExtIdx() const { return SExtIdx; }
118
119 void setRenameReg(MCPhysReg R) { RenameReg = R; }
120 void clearRenameReg() { RenameReg = std::nullopt; }
121 std::optional<MCPhysReg> getRenameReg() const { return RenameReg; }
122};
123
124struct AArch64LoadStoreOpt : public MachineFunctionPass {
125 static char ID;
126
127 AArch64LoadStoreOpt() : MachineFunctionPass(ID) {}
128
129 AliasAnalysis *AA;
130 const AArch64InstrInfo *TII;
131 const TargetRegisterInfo *TRI;
132 const AArch64Subtarget *Subtarget;
133
134 // Track which register units have been modified and used.
135 LiveRegUnits ModifiedRegUnits, UsedRegUnits;
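  // Track register units that have been defined so far within the basic block
  // currently being optimized.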
136 LiveRegUnits DefinedInBB;
137
138 void getAnalysisUsage(AnalysisUsage &AU) const override {
139 AU.addRequired<AAResultsWrapperPass>();
140 MachineFunctionPass::getAnalysisUsage(AU);
141 }
142
143 // Scan the instructions looking for a load/store that can be combined
144 // with the current instruction into a load/store pair.
145 // Return the matching instruction if one is found, else MBB->end().
146 MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
147 LdStPairFlags &Flags,
148 unsigned Limit,
149 bool FindNarrowMerge);
150
151 // Scan the instructions looking for a store that writes to the address from
152 // which the current load instruction reads. Return true if one is found.
153 bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
154 MachineBasicBlock::iterator &StoreI);
155
  // Merge the two narrow zero stores indicated into a single wider store
  // instruction.
157 MachineBasicBlock::iterator
158 mergeNarrowZeroStores(MachineBasicBlock::iterator I,
159 MachineBasicBlock::iterator MergeMI,
160 const LdStPairFlags &Flags);
161
162 // Merge the two instructions indicated into a single pair-wise instruction.
163 MachineBasicBlock::iterator
164 mergePairedInsns(MachineBasicBlock::iterator I,
165 MachineBasicBlock::iterator Paired,
166 const LdStPairFlags &Flags);
167
168 // Promote the load that reads directly from the address stored to.
169 MachineBasicBlock::iterator
170 promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
171 MachineBasicBlock::iterator StoreI);
172
173 // Scan the instruction list to find a base register update that can
174 // be combined with the current instruction (a load or store) using
175 // pre or post indexed addressing with writeback. Scan forwards.
176 MachineBasicBlock::iterator
177 findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
178 int UnscaledOffset, unsigned Limit);
179
180 // Scan the instruction list to find a register assigned with a const
181 // value that can be combined with the current instruction (a load or store)
182 // using base addressing with writeback. Scan backwards.
183 MachineBasicBlock::iterator
184 findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
185 unsigned &Offset);
186
187 // Scan the instruction list to find a base register update that can
188 // be combined with the current instruction (a load or store) using
189 // pre or post indexed addressing with writeback. Scan backwards.
190 // `MergeEither` is set to true if the combined instruction may be placed
191 // either at the location of the load/store instruction or at the location of
192 // the update instruction.
193 MachineBasicBlock::iterator
194 findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit,
195 bool &MergeEither);
196
197 // Find an instruction that updates the base register of the ld/st
198 // instruction.
199 bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
200 unsigned BaseReg, int Offset);
201
202 bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
203 unsigned IndexReg, unsigned &Offset);
204
205 // Merge a pre- or post-index base register update into a ld/st instruction.
206 std::optional<MachineBasicBlock::iterator>
207 mergeUpdateInsn(MachineBasicBlock::iterator I,
208 MachineBasicBlock::iterator Update, bool IsForward,
209 bool IsPreIdx, bool MergeEither);
210
211 MachineBasicBlock::iterator
212 mergeConstOffsetInsn(MachineBasicBlock::iterator I,
213 MachineBasicBlock::iterator Update, unsigned Offset,
214 int Scale);
215
216 // Find and merge zero store instructions.
217 bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
218
219 // Find and pair ldr/str instructions.
220 bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
221
222 // Find and promote load instructions which read directly from store.
223 bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
224
  // Find and merge base register updates before or after a ld/st instruction.
226 bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
227
228 // Find and merge an index ldr/st instruction into a base ld/st instruction.
229 bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);
230
231 bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
232
233 bool runOnMachineFunction(MachineFunction &Fn) override;
234
235 MachineFunctionProperties getRequiredProperties() const override {
236 return MachineFunctionProperties().setNoVRegs();
237 }
238
239 StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
240};
241
242char AArch64LoadStoreOpt::ID = 0;
243
244} // end anonymous namespace
245
246INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
247 AARCH64_LOAD_STORE_OPT_NAME, false, false)
248
249static bool isNarrowStore(unsigned Opc) {
250 switch (Opc) {
251 default:
252 return false;
253 case AArch64::STRBBui:
254 case AArch64::STURBBi:
255 case AArch64::STRHHui:
256 case AArch64::STURHHi:
257 return true;
258 }
259}
260
// These instructions set the memory tag and either keep the memory contents
// unchanged or set them to zero, ignoring the address part of the source
// register.
263static bool isTagStore(const MachineInstr &MI) {
264 switch (MI.getOpcode()) {
265 default:
266 return false;
267 case AArch64::STGi:
268 case AArch64::STZGi:
269 case AArch64::ST2Gi:
270 case AArch64::STZ2Gi:
271 return true;
272 }
273}
274
275static unsigned getMatchingNonSExtOpcode(unsigned Opc,
276 bool *IsValidLdStrOpc = nullptr) {
277 if (IsValidLdStrOpc)
278 *IsValidLdStrOpc = true;
279 switch (Opc) {
280 default:
281 if (IsValidLdStrOpc)
282 *IsValidLdStrOpc = false;
283 return std::numeric_limits<unsigned>::max();
284 case AArch64::STRDui:
285 case AArch64::STURDi:
286 case AArch64::STRDpre:
287 case AArch64::STRQui:
288 case AArch64::STURQi:
289 case AArch64::STRQpre:
290 case AArch64::STRBBui:
291 case AArch64::STURBBi:
292 case AArch64::STRHHui:
293 case AArch64::STURHHi:
294 case AArch64::STRWui:
295 case AArch64::STRWpre:
296 case AArch64::STURWi:
297 case AArch64::STRXui:
298 case AArch64::STRXpre:
299 case AArch64::STURXi:
300 case AArch64::STR_ZXI:
301 case AArch64::LDRDui:
302 case AArch64::LDURDi:
303 case AArch64::LDRDpre:
304 case AArch64::LDRQui:
305 case AArch64::LDURQi:
306 case AArch64::LDRQpre:
307 case AArch64::LDRWui:
308 case AArch64::LDURWi:
309 case AArch64::LDRWpre:
310 case AArch64::LDRXui:
311 case AArch64::LDURXi:
312 case AArch64::LDRXpre:
313 case AArch64::STRSui:
314 case AArch64::STURSi:
315 case AArch64::STRSpre:
316 case AArch64::LDRSui:
317 case AArch64::LDURSi:
318 case AArch64::LDRSpre:
319 case AArch64::LDR_ZXI:
320 return Opc;
321 case AArch64::LDRSWui:
322 return AArch64::LDRWui;
323 case AArch64::LDURSWi:
324 return AArch64::LDURWi;
325 case AArch64::LDRSWpre:
326 return AArch64::LDRWpre;
327 }
328}
329
330static unsigned getMatchingWideOpcode(unsigned Opc) {
331 switch (Opc) {
332 default:
333 llvm_unreachable("Opcode has no wide equivalent!");
334 case AArch64::STRBBui:
335 return AArch64::STRHHui;
336 case AArch64::STRHHui:
337 return AArch64::STRWui;
338 case AArch64::STURBBi:
339 return AArch64::STURHHi;
340 case AArch64::STURHHi:
341 return AArch64::STURWi;
342 case AArch64::STURWi:
343 return AArch64::STURXi;
344 case AArch64::STRWui:
345 return AArch64::STRXui;
346 }
347}
348
349static unsigned getMatchingPairOpcode(unsigned Opc) {
350 switch (Opc) {
351 default:
352 llvm_unreachable("Opcode has no pairwise equivalent!");
353 case AArch64::STRSui:
354 case AArch64::STURSi:
355 return AArch64::STPSi;
356 case AArch64::STRSpre:
357 return AArch64::STPSpre;
358 case AArch64::STRDui:
359 case AArch64::STURDi:
360 return AArch64::STPDi;
361 case AArch64::STRDpre:
362 return AArch64::STPDpre;
363 case AArch64::STRQui:
364 case AArch64::STURQi:
365 case AArch64::STR_ZXI:
366 return AArch64::STPQi;
367 case AArch64::STRQpre:
368 return AArch64::STPQpre;
369 case AArch64::STRWui:
370 case AArch64::STURWi:
371 return AArch64::STPWi;
372 case AArch64::STRWpre:
373 return AArch64::STPWpre;
374 case AArch64::STRXui:
375 case AArch64::STURXi:
376 return AArch64::STPXi;
377 case AArch64::STRXpre:
378 return AArch64::STPXpre;
379 case AArch64::LDRSui:
380 case AArch64::LDURSi:
381 return AArch64::LDPSi;
382 case AArch64::LDRSpre:
383 return AArch64::LDPSpre;
384 case AArch64::LDRDui:
385 case AArch64::LDURDi:
386 return AArch64::LDPDi;
387 case AArch64::LDRDpre:
388 return AArch64::LDPDpre;
389 case AArch64::LDRQui:
390 case AArch64::LDURQi:
391 case AArch64::LDR_ZXI:
392 return AArch64::LDPQi;
393 case AArch64::LDRQpre:
394 return AArch64::LDPQpre;
395 case AArch64::LDRWui:
396 case AArch64::LDURWi:
397 return AArch64::LDPWi;
398 case AArch64::LDRWpre:
399 return AArch64::LDPWpre;
400 case AArch64::LDRXui:
401 case AArch64::LDURXi:
402 return AArch64::LDPXi;
403 case AArch64::LDRXpre:
404 return AArch64::LDPXpre;
405 case AArch64::LDRSWui:
406 case AArch64::LDURSWi:
407 return AArch64::LDPSWi;
408 case AArch64::LDRSWpre:
409 return AArch64::LDPSWpre;
410 }
411}
412
413static unsigned isMatchingStore(MachineInstr &LoadInst,
414 MachineInstr &StoreInst) {
415 unsigned LdOpc = LoadInst.getOpcode();
416 unsigned StOpc = StoreInst.getOpcode();
417 switch (LdOpc) {
418 default:
419 llvm_unreachable("Unsupported load instruction!");
420 case AArch64::LDRBBui:
421 return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
422 StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
423 case AArch64::LDURBBi:
424 return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
425 StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
426 case AArch64::LDRHHui:
427 return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
428 StOpc == AArch64::STRXui;
429 case AArch64::LDURHHi:
430 return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
431 StOpc == AArch64::STURXi;
432 case AArch64::LDRWui:
433 return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
434 case AArch64::LDURWi:
435 return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
436 case AArch64::LDRXui:
437 return StOpc == AArch64::STRXui;
438 case AArch64::LDURXi:
439 return StOpc == AArch64::STURXi;
440 }
441}
442
443static unsigned getPreIndexedOpcode(unsigned Opc) {
444 // FIXME: We don't currently support creating pre-indexed loads/stores when
445 // the load or store is the unscaled version. If we decide to perform such an
446 // optimization in the future the cases for the unscaled loads/stores will
447 // need to be added here.
448 switch (Opc) {
449 default:
450 llvm_unreachable("Opcode has no pre-indexed equivalent!");
451 case AArch64::STRSui:
452 return AArch64::STRSpre;
453 case AArch64::STRDui:
454 return AArch64::STRDpre;
455 case AArch64::STRQui:
456 return AArch64::STRQpre;
457 case AArch64::STRBBui:
458 return AArch64::STRBBpre;
459 case AArch64::STRHHui:
460 return AArch64::STRHHpre;
461 case AArch64::STRWui:
462 return AArch64::STRWpre;
463 case AArch64::STRXui:
464 return AArch64::STRXpre;
465 case AArch64::LDRSui:
466 return AArch64::LDRSpre;
467 case AArch64::LDRDui:
468 return AArch64::LDRDpre;
469 case AArch64::LDRQui:
470 return AArch64::LDRQpre;
471 case AArch64::LDRBBui:
472 return AArch64::LDRBBpre;
473 case AArch64::LDRHHui:
474 return AArch64::LDRHHpre;
475 case AArch64::LDRWui:
476 return AArch64::LDRWpre;
477 case AArch64::LDRXui:
478 return AArch64::LDRXpre;
479 case AArch64::LDRSWui:
480 return AArch64::LDRSWpre;
481 case AArch64::LDPSi:
482 return AArch64::LDPSpre;
483 case AArch64::LDPSWi:
484 return AArch64::LDPSWpre;
485 case AArch64::LDPDi:
486 return AArch64::LDPDpre;
487 case AArch64::LDPQi:
488 return AArch64::LDPQpre;
489 case AArch64::LDPWi:
490 return AArch64::LDPWpre;
491 case AArch64::LDPXi:
492 return AArch64::LDPXpre;
493 case AArch64::STPSi:
494 return AArch64::STPSpre;
495 case AArch64::STPDi:
496 return AArch64::STPDpre;
497 case AArch64::STPQi:
498 return AArch64::STPQpre;
499 case AArch64::STPWi:
500 return AArch64::STPWpre;
501 case AArch64::STPXi:
502 return AArch64::STPXpre;
503 case AArch64::STGi:
504 return AArch64::STGPreIndex;
505 case AArch64::STZGi:
506 return AArch64::STZGPreIndex;
507 case AArch64::ST2Gi:
508 return AArch64::ST2GPreIndex;
509 case AArch64::STZ2Gi:
510 return AArch64::STZ2GPreIndex;
511 case AArch64::STGPi:
512 return AArch64::STGPpre;
513 }
514}
515
516static unsigned getBaseAddressOpcode(unsigned Opc) {
517 // TODO: Add more index address stores.
518 switch (Opc) {
519 default:
520 llvm_unreachable("Opcode has no base address equivalent!");
521 case AArch64::LDRBroX:
522 return AArch64::LDRBui;
523 case AArch64::LDRBBroX:
524 return AArch64::LDRBBui;
525 case AArch64::LDRSBXroX:
526 return AArch64::LDRSBXui;
527 case AArch64::LDRSBWroX:
528 return AArch64::LDRSBWui;
529 case AArch64::LDRHroX:
530 return AArch64::LDRHui;
531 case AArch64::LDRHHroX:
532 return AArch64::LDRHHui;
533 case AArch64::LDRSHXroX:
534 return AArch64::LDRSHXui;
535 case AArch64::LDRSHWroX:
536 return AArch64::LDRSHWui;
537 case AArch64::LDRWroX:
538 return AArch64::LDRWui;
539 case AArch64::LDRSroX:
540 return AArch64::LDRSui;
541 case AArch64::LDRSWroX:
542 return AArch64::LDRSWui;
543 case AArch64::LDRDroX:
544 return AArch64::LDRDui;
545 case AArch64::LDRXroX:
546 return AArch64::LDRXui;
547 case AArch64::LDRQroX:
548 return AArch64::LDRQui;
549 }
550}
551
552static unsigned getPostIndexedOpcode(unsigned Opc) {
553 switch (Opc) {
554 default:
    llvm_unreachable("Opcode has no post-indexed equivalent!");
556 case AArch64::STRSui:
557 case AArch64::STURSi:
558 return AArch64::STRSpost;
559 case AArch64::STRDui:
560 case AArch64::STURDi:
561 return AArch64::STRDpost;
562 case AArch64::STRQui:
563 case AArch64::STURQi:
564 return AArch64::STRQpost;
565 case AArch64::STRBBui:
566 return AArch64::STRBBpost;
567 case AArch64::STRHHui:
568 return AArch64::STRHHpost;
569 case AArch64::STRWui:
570 case AArch64::STURWi:
571 return AArch64::STRWpost;
572 case AArch64::STRXui:
573 case AArch64::STURXi:
574 return AArch64::STRXpost;
575 case AArch64::LDRSui:
576 case AArch64::LDURSi:
577 return AArch64::LDRSpost;
578 case AArch64::LDRDui:
579 case AArch64::LDURDi:
580 return AArch64::LDRDpost;
581 case AArch64::LDRQui:
582 case AArch64::LDURQi:
583 return AArch64::LDRQpost;
584 case AArch64::LDRBBui:
585 return AArch64::LDRBBpost;
586 case AArch64::LDRHHui:
587 return AArch64::LDRHHpost;
588 case AArch64::LDRWui:
589 case AArch64::LDURWi:
590 return AArch64::LDRWpost;
591 case AArch64::LDRXui:
592 case AArch64::LDURXi:
593 return AArch64::LDRXpost;
594 case AArch64::LDRSWui:
595 return AArch64::LDRSWpost;
596 case AArch64::LDPSi:
597 return AArch64::LDPSpost;
598 case AArch64::LDPSWi:
599 return AArch64::LDPSWpost;
600 case AArch64::LDPDi:
601 return AArch64::LDPDpost;
602 case AArch64::LDPQi:
603 return AArch64::LDPQpost;
604 case AArch64::LDPWi:
605 return AArch64::LDPWpost;
606 case AArch64::LDPXi:
607 return AArch64::LDPXpost;
608 case AArch64::STPSi:
609 return AArch64::STPSpost;
610 case AArch64::STPDi:
611 return AArch64::STPDpost;
612 case AArch64::STPQi:
613 return AArch64::STPQpost;
614 case AArch64::STPWi:
615 return AArch64::STPWpost;
616 case AArch64::STPXi:
617 return AArch64::STPXpost;
618 case AArch64::STGi:
619 return AArch64::STGPostIndex;
620 case AArch64::STZGi:
621 return AArch64::STZGPostIndex;
622 case AArch64::ST2Gi:
623 return AArch64::ST2GPostIndex;
624 case AArch64::STZ2Gi:
625 return AArch64::STZ2GPostIndex;
626 case AArch64::STGPi:
627 return AArch64::STGPpost;
628 }
629}
630
631static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
632
633 unsigned OpcA = FirstMI.getOpcode();
634 unsigned OpcB = MI.getOpcode();
635
636 switch (OpcA) {
637 default:
638 return false;
639 case AArch64::STRSpre:
640 return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
641 case AArch64::STRDpre:
642 return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
643 case AArch64::STRQpre:
644 return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
645 case AArch64::STRWpre:
646 return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
647 case AArch64::STRXpre:
648 return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
649 case AArch64::LDRSpre:
650 return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
651 case AArch64::LDRDpre:
652 return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
653 case AArch64::LDRQpre:
654 return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
655 case AArch64::LDRWpre:
656 return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
657 case AArch64::LDRXpre:
658 return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
659 case AArch64::LDRSWpre:
660 return (OpcB == AArch64::LDRSWui) || (OpcB == AArch64::LDURSWi);
661 }
662}
663
664// Returns the scale and offset range of pre/post indexed variants of MI.
665static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
666 int &MinOffset, int &MaxOffset) {
667 bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
668 bool IsTagStore = isTagStore(MI);
669 // ST*G and all paired ldst have the same scale in pre/post-indexed variants
670 // as in the "unsigned offset" variant.
671 // All other pre/post indexed ldst instructions are unscaled.
672 Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
673
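  // Paired instructions use a 7-bit signed immediate offset field; all other
  // pre/post-indexed instructions use a 9-bit signed immediate field.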
674 if (IsPaired) {
675 MinOffset = -64;
676 MaxOffset = 63;
677 } else {
678 MinOffset = -256;
679 MaxOffset = 255;
680 }
681}
682
683static MachineOperand &getLdStRegOp(MachineInstr &MI,
684 unsigned PairedRegOp = 0) {
685 assert(PairedRegOp < 2 && "Unexpected register operand idx.");
686 bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
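  // Pre-indexed ld/st instructions define the written-back base register as
  // their first operand, so the transfer register operands start one position
  // later.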
687 if (IsPreLdSt)
688 PairedRegOp += 1;
689 unsigned Idx =
690 AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
691 return MI.getOperand(i: Idx);
692}
693
694static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
695 MachineInstr &StoreInst,
696 const AArch64InstrInfo *TII) {
697 assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
698 int LoadSize = TII->getMemScale(MI: LoadInst);
699 int StoreSize = TII->getMemScale(MI: StoreInst);
700 int UnscaledStOffset =
701 TII->hasUnscaledLdStOffset(MI&: StoreInst)
702 ? AArch64InstrInfo::getLdStOffsetOp(MI: StoreInst).getImm()
703 : AArch64InstrInfo::getLdStOffsetOp(MI: StoreInst).getImm() * StoreSize;
704 int UnscaledLdOffset =
705 TII->hasUnscaledLdStOffset(MI&: LoadInst)
706 ? AArch64InstrInfo::getLdStOffsetOp(MI: LoadInst).getImm()
707 : AArch64InstrInfo::getLdStOffsetOp(MI: LoadInst).getImm() * LoadSize;
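  // The load must read only bytes that the store writes.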
708 return (UnscaledStOffset <= UnscaledLdOffset) &&
709 (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
710}
711
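// A promotable zero store is a 32-bit or narrower store of WZR; two such
// stores can later be merged into a single wider store of zero.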
712static bool isPromotableZeroStoreInst(MachineInstr &MI) {
713 unsigned Opc = MI.getOpcode();
714 return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
715 isNarrowStore(Opc)) &&
716 getLdStRegOp(MI).getReg() == AArch64::WZR;
717}
718
719static bool isPromotableLoadFromStore(MachineInstr &MI) {
720 switch (MI.getOpcode()) {
721 default:
722 return false;
723 // Scaled instructions.
724 case AArch64::LDRBBui:
725 case AArch64::LDRHHui:
726 case AArch64::LDRWui:
727 case AArch64::LDRXui:
728 // Unscaled instructions.
729 case AArch64::LDURBBi:
730 case AArch64::LDURHHi:
731 case AArch64::LDURWi:
732 case AArch64::LDURXi:
733 return true;
734 }
735}
736
737static bool isMergeableLdStUpdate(MachineInstr &MI, AArch64FunctionInfo &AFI) {
738 unsigned Opc = MI.getOpcode();
739 switch (Opc) {
740 default:
741 return false;
742 // Scaled instructions.
743 case AArch64::STRSui:
744 case AArch64::STRDui:
745 case AArch64::STRQui:
746 case AArch64::STRXui:
747 case AArch64::STRWui:
748 case AArch64::STRHHui:
749 case AArch64::STRBBui:
750 case AArch64::LDRSui:
751 case AArch64::LDRDui:
752 case AArch64::LDRQui:
753 case AArch64::LDRXui:
754 case AArch64::LDRWui:
755 case AArch64::LDRHHui:
756 case AArch64::LDRBBui:
757 case AArch64::STGi:
758 case AArch64::STZGi:
759 case AArch64::ST2Gi:
760 case AArch64::STZ2Gi:
761 case AArch64::STGPi:
762 // Unscaled instructions.
763 case AArch64::STURSi:
764 case AArch64::STURDi:
765 case AArch64::STURQi:
766 case AArch64::STURWi:
767 case AArch64::STURXi:
768 case AArch64::LDURSi:
769 case AArch64::LDURDi:
770 case AArch64::LDURQi:
771 case AArch64::LDURWi:
772 case AArch64::LDURXi:
773 // Paired instructions.
774 case AArch64::LDPSi:
775 case AArch64::LDPSWi:
776 case AArch64::LDPDi:
777 case AArch64::LDPQi:
778 case AArch64::LDPWi:
779 case AArch64::LDPXi:
780 case AArch64::STPSi:
781 case AArch64::STPDi:
782 case AArch64::STPQi:
783 case AArch64::STPWi:
784 case AArch64::STPXi:
785 // Make sure this is a reg+imm (as opposed to an address reloc).
786 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
787 return false;
788
789 // When using stack tagging, simple sp+imm loads and stores are not
790 // tag-checked, but pre- and post-indexed versions of them are, so we can't
791 // replace the former with the latter. This transformation would be valid
792 // if the load/store accesses an untagged stack slot, but we don't have
793 // that information available after frame indices have been eliminated.
794 if (AFI.isMTETagged() &&
795 AArch64InstrInfo::getLdStBaseOp(MI).getReg() == AArch64::SP)
796 return false;
797
798 return true;
799 }
800}
801
802// Make sure this is a reg+reg Ld/St
803static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
804 unsigned Opc = MI.getOpcode();
805 switch (Opc) {
806 default:
807 return false;
808 // Scaled instructions.
809 // TODO: Add more index address stores.
810 case AArch64::LDRBroX:
811 case AArch64::LDRBBroX:
812 case AArch64::LDRSBXroX:
813 case AArch64::LDRSBWroX:
814 Scale = 1;
815 return true;
816 case AArch64::LDRHroX:
817 case AArch64::LDRHHroX:
818 case AArch64::LDRSHXroX:
819 case AArch64::LDRSHWroX:
820 Scale = 2;
821 return true;
822 case AArch64::LDRWroX:
823 case AArch64::LDRSroX:
824 case AArch64::LDRSWroX:
825 Scale = 4;
826 return true;
827 case AArch64::LDRDroX:
828 case AArch64::LDRXroX:
829 Scale = 8;
830 return true;
831 case AArch64::LDRQroX:
832 Scale = 16;
833 return true;
834 }
835}
836
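// Opcodes whose implicit operands (e.g. the implicit def of the 64-bit
// super-register by a 32-bit ORR/ADD) may safely be rewritten when renaming
// registers.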
837static bool isRewritableImplicitDef(unsigned Opc) {
838 switch (Opc) {
839 default:
840 return false;
841 case AArch64::ORRWrs:
842 case AArch64::ADDWri:
843 return true;
844 }
845}
846
847MachineBasicBlock::iterator
848AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
849 MachineBasicBlock::iterator MergeMI,
850 const LdStPairFlags &Flags) {
851 assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
852 "Expected promotable zero stores.");
853
854 MachineBasicBlock::iterator E = I->getParent()->end();
855 MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
856 // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way, the merge will invalidate the iterator,
858 // and we don't need to scan the new instruction, as it's a pairwise
859 // instruction, which we're not considering for further action anyway.
860 if (NextI == MergeMI)
861 NextI = next_nodbg(It: NextI, End: E);
862
863 unsigned Opc = I->getOpcode();
864 unsigned MergeMIOpc = MergeMI->getOpcode();
865 bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
866 bool IsMergedMIScaled = !TII->hasUnscaledLdStOffset(Opc: MergeMIOpc);
867 int OffsetStride = IsScaled ? TII->getMemScale(MI: *I) : 1;
868 int MergeMIOffsetStride = IsMergedMIScaled ? TII->getMemScale(MI: *MergeMI) : 1;
869
870 bool MergeForward = Flags.getMergeForward();
871 // Insert our new paired instruction after whichever of the paired
872 // instructions MergeForward indicates.
873 MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
  // MergeForward also determines which instruction we copy the base register
  // operand from, so that its flags (e.g. kill) stay compatible with the input
  // code.
876 const MachineOperand &BaseRegOp =
877 MergeForward ? AArch64InstrInfo::getLdStBaseOp(MI: *MergeMI)
878 : AArch64InstrInfo::getLdStBaseOp(MI: *I);
879
  // The merged wide store covers both narrow stores, so compute both offsets
  // in bytes and use the lower of the two.
881 int64_t IOffsetInBytes =
882 AArch64InstrInfo::getLdStOffsetOp(MI: *I).getImm() * OffsetStride;
883 int64_t MIOffsetInBytes =
884 AArch64InstrInfo::getLdStOffsetOp(MI: *MergeMI).getImm() *
885 MergeMIOffsetStride;
886 // Select final offset based on the offset order.
887 int64_t OffsetImm;
888 if (IOffsetInBytes > MIOffsetInBytes)
889 OffsetImm = MIOffsetInBytes;
890 else
891 OffsetImm = IOffsetInBytes;
892
893 int NewOpcode = getMatchingWideOpcode(Opc);
894 // Adjust final offset on scaled stores because the new instruction
895 // has a different scale.
896 if (!TII->hasUnscaledLdStOffset(Opc: NewOpcode)) {
897 int NewOffsetStride = TII->getMemScale(Opc: NewOpcode);
898 assert(((OffsetImm % NewOffsetStride) == 0) &&
899 "Offset should be a multiple of the store memory scale");
900 OffsetImm = OffsetImm / NewOffsetStride;
901 }
902
903 // Construct the new instruction.
904 DebugLoc DL = I->getDebugLoc();
905 MachineBasicBlock *MBB = I->getParent();
906 MachineInstrBuilder MIB;
907 MIB = BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: NewOpcode))
908 .addReg(RegNo: isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
909 .add(MO: BaseRegOp)
910 .addImm(Val: OffsetImm)
911 .cloneMergedMemRefs(OtherMIs: {&*I, &*MergeMI})
912 .setMIFlags(I->mergeFlagsWith(Other: *MergeMI));
913 (void)MIB;
914
915 LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
916 LLVM_DEBUG(I->print(dbgs()));
917 LLVM_DEBUG(dbgs() << " ");
918 LLVM_DEBUG(MergeMI->print(dbgs()));
919 LLVM_DEBUG(dbgs() << " with instruction:\n ");
920 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
921 LLVM_DEBUG(dbgs() << "\n");
922
923 // Erase the old instructions.
924 I->eraseFromParent();
925 MergeMI->eraseFromParent();
926 return NextI;
927}
928
// Apply Fn to all instructions between MI and the beginning of the block, until
// a def for DefReg is reached. Returns true iff Fn returns true for all
// visited instructions. Stops after visiting Limit instructions.
932static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
933 const TargetRegisterInfo *TRI, unsigned Limit,
934 std::function<bool(MachineInstr &, bool)> &Fn) {
935 auto MBB = MI.getParent();
936 for (MachineInstr &I :
937 instructionsWithoutDebug(It: MI.getReverseIterator(), End: MBB->instr_rend())) {
938 if (!Limit)
939 return false;
940 --Limit;
941
942 bool isDef = any_of(Range: I.operands(), P: [DefReg, TRI](MachineOperand &MOP) {
943 return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
944 TRI->regsOverlap(RegA: MOP.getReg(), RegB: DefReg);
945 });
946 if (!Fn(I, isDef))
947 return false;
948 if (isDef)
949 break;
950 }
951 return true;
952}
953
954static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
955 const TargetRegisterInfo *TRI) {
956
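  // Remove the units of registers killed by MI first, then add the units of
  // all of MI's other register operands (defs and non-killed uses).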
957 for (const MachineOperand &MOP : phys_regs_and_masks(MI))
958 if (MOP.isReg() && MOP.isKill())
959 Units.removeReg(Reg: MOP.getReg());
960
961 for (const MachineOperand &MOP : phys_regs_and_masks(MI))
962 if (MOP.isReg() && !MOP.isKill())
963 Units.addReg(Reg: MOP.getReg());
964}
965
/// This function will add a new entry into the debugValueSubstitutions table
/// when two instructions have been merged into a new one represented by \p
/// MergedInstr.
969static void addDebugSubstitutionsToTable(MachineFunction *MF,
970 unsigned InstrNumToSet,
971 MachineInstr &OriginalInstr,
972 MachineInstr &MergedInstr) {
973
974 // Figure out the Operand Index of the destination register of the
975 // OriginalInstr in the new MergedInstr.
976 auto Reg = OriginalInstr.getOperand(i: 0).getReg();
977 unsigned OperandNo = 0;
978 bool RegFound = false;
979 for (const auto Op : MergedInstr.operands()) {
980 if (Op.getReg() == Reg) {
981 RegFound = true;
982 break;
983 }
984 OperandNo++;
985 }
986
987 if (RegFound)
988 MF->makeDebugValueSubstitution({OriginalInstr.peekDebugInstrNum(), 0},
989 {InstrNumToSet, OperandNo});
990}
991
992MachineBasicBlock::iterator
993AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
994 MachineBasicBlock::iterator Paired,
995 const LdStPairFlags &Flags) {
996 MachineBasicBlock::iterator E = I->getParent()->end();
997 MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
998 // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way, the merge will invalidate the iterator,
1000 // and we don't need to scan the new instruction, as it's a pairwise
1001 // instruction, which we're not considering for further action anyway.
1002 if (NextI == Paired)
1003 NextI = next_nodbg(It: NextI, End: E);
1004
1005 int SExtIdx = Flags.getSExtIdx();
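  // When one of the two instructions is a sign-extending load (SExtIdx != -1),
  // form the pair using the non-extending opcode and re-create the sign
  // extension with the SBFMXri emitted further below.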
1006 unsigned Opc =
1007 SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(Opc: I->getOpcode());
1008 bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
1009 int OffsetStride = IsUnscaled ? TII->getMemScale(MI: *I) : 1;
1010
1011 bool MergeForward = Flags.getMergeForward();
1012
1013 std::optional<MCPhysReg> RenameReg = Flags.getRenameReg();
1014 if (RenameReg) {
1015 MCRegister RegToRename = getLdStRegOp(MI&: *I).getReg();
1016 DefinedInBB.addReg(Reg: *RenameReg);
1017
1018 // Return the sub/super register for RenameReg, matching the size of
1019 // OriginalReg.
1020 auto GetMatchingSubReg =
1021 [this, RenameReg](const TargetRegisterClass *C) -> MCPhysReg {
1022 for (MCPhysReg SubOrSuper :
1023 TRI->sub_and_superregs_inclusive(Reg: *RenameReg)) {
1024 if (C->contains(Reg: SubOrSuper))
1025 return SubOrSuper;
1026 }
1027 llvm_unreachable("Should have found matching sub or super register!");
1028 };
1029
1030 std::function<bool(MachineInstr &, bool)> UpdateMIs =
1031 [this, RegToRename, GetMatchingSubReg, MergeForward](MachineInstr &MI,
1032 bool IsDef) {
1033 if (IsDef) {
1034 bool SeenDef = false;
1035 for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
1036 MachineOperand &MOP = MI.getOperand(i: OpIdx);
1037 // Rename the first explicit definition and all implicit
1038 // definitions matching RegToRename.
1039 if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1040 (!MergeForward || !SeenDef ||
1041 (MOP.isDef() && MOP.isImplicit())) &&
1042 TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename)) {
1043 assert((MOP.isImplicit() ||
1044 (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
1045 "Need renamable operands");
1046 Register MatchingReg;
1047 if (const TargetRegisterClass *RC =
1048 MI.getRegClassConstraint(OpIdx, TII, TRI))
1049 MatchingReg = GetMatchingSubReg(RC);
1050 else {
1051 if (!isRewritableImplicitDef(Opc: MI.getOpcode()))
1052 continue;
1053 MatchingReg = GetMatchingSubReg(
1054 TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1055 }
1056 MOP.setReg(MatchingReg);
1057 SeenDef = true;
1058 }
1059 }
1060 } else {
1061 for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
1062 MachineOperand &MOP = MI.getOperand(i: OpIdx);
1063 if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1064 TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename)) {
1065 assert((MOP.isImplicit() ||
1066 (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
1067 "Need renamable operands");
1068 Register MatchingReg;
1069 if (const TargetRegisterClass *RC =
1070 MI.getRegClassConstraint(OpIdx, TII, TRI))
1071 MatchingReg = GetMatchingSubReg(RC);
1072 else
1073 MatchingReg = GetMatchingSubReg(
1074 TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1075 assert(MatchingReg != AArch64::NoRegister &&
1076 "Cannot find matching regs for renaming");
1077 MOP.setReg(MatchingReg);
1078 }
1079 }
1080 }
1081 LLVM_DEBUG(dbgs() << "Renamed " << MI);
1082 return true;
1083 };
1084 forAllMIsUntilDef(MI&: MergeForward ? *I : *Paired->getPrevNode(), DefReg: RegToRename,
1085 TRI, UINT32_MAX, Fn&: UpdateMIs);
1086
1087#if !defined(NDEBUG)
1088 // For forward merging store:
1089 // Make sure the register used for renaming is not used between the
1090 // paired instructions. That would trash the content before the new
1091 // paired instruction.
1092 MCPhysReg RegToCheck = *RenameReg;
1093 // For backward merging load:
1094 // Make sure the register being renamed is not used between the
1095 // paired instructions. That would trash the content after the new
1096 // paired instruction.
1097 if (!MergeForward)
1098 RegToCheck = RegToRename;
1099 for (auto &MI :
1100 iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
1101 MergeForward ? std::next(I) : I,
1102 MergeForward ? std::next(Paired) : Paired))
1103 assert(all_of(MI.operands(),
1104 [this, RegToCheck](const MachineOperand &MOP) {
1105 return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1106 MOP.isUndef() ||
1107 !TRI->regsOverlap(MOP.getReg(), RegToCheck);
1108 }) &&
1109 "Rename register used between paired instruction, trashing the "
1110 "content");
1111#endif
1112 }
1113
1114 // Insert our new paired instruction after whichever of the paired
1115 // instructions MergeForward indicates.
1116 MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // MergeForward also determines which instruction we copy the base register
  // operand from, so that its flags (e.g. kill) stay compatible with the input
  // code.
1119 const MachineOperand &BaseRegOp =
1120 MergeForward ? AArch64InstrInfo::getLdStBaseOp(MI: *Paired)
1121 : AArch64InstrInfo::getLdStBaseOp(MI: *I);
1122
1123 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: *I).getImm();
1124 int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(MI: *Paired).getImm();
1125 bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Opc: Paired->getOpcode());
1126 if (IsUnscaled != PairedIsUnscaled) {
1127 // We're trying to pair instructions that differ in how they are scaled. If
1128 // I is scaled then scale the offset of Paired accordingly. Otherwise, do
1129 // the opposite (i.e., make Paired's offset unscaled).
1130 int MemSize = TII->getMemScale(MI: *Paired);
1131 if (PairedIsUnscaled) {
1132 // If the unscaled offset isn't a multiple of the MemSize, we can't
1133 // pair the operations together.
1134 assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
1135 "Offset should be a multiple of the stride!");
1136 PairedOffset /= MemSize;
1137 } else {
1138 PairedOffset *= MemSize;
1139 }
1140 }
1141
1142 // Which register is Rt and which is Rt2 depends on the offset order.
1143 // However, for pre load/stores the Rt should be the one of the pre
1144 // load/store.
1145 MachineInstr *RtMI, *Rt2MI;
1146 if (Offset == PairedOffset + OffsetStride &&
1147 !AArch64InstrInfo::isPreLdSt(MI: *I)) {
1148 RtMI = &*Paired;
1149 Rt2MI = &*I;
1150 // Here we swapped the assumption made for SExtIdx.
1151 // I.e., we turn ldp I, Paired into ldp Paired, I.
1152 // Update the index accordingly.
1153 if (SExtIdx != -1)
1154 SExtIdx = (SExtIdx + 1) % 2;
1155 } else {
1156 RtMI = &*I;
1157 Rt2MI = &*Paired;
1158 }
1159 int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(MI: *RtMI).getImm();
1160 // Scale the immediate offset, if necessary.
1161 if (TII->hasUnscaledLdStOffset(Opc: RtMI->getOpcode())) {
1162 assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
1163 "Unscaled offset cannot be scaled.");
1164 OffsetImm /= TII->getMemScale(MI: *RtMI);
1165 }
1166
1167 // Construct the new instruction.
1168 MachineInstrBuilder MIB;
1169 DebugLoc DL = I->getDebugLoc();
1170 MachineBasicBlock *MBB = I->getParent();
1171 MachineOperand RegOp0 = getLdStRegOp(MI&: *RtMI);
1172 MachineOperand RegOp1 = getLdStRegOp(MI&: *Rt2MI);
1173 MachineOperand &PairedRegOp = RtMI == &*Paired ? RegOp0 : RegOp1;
1174 // Kill flags may become invalid when moving stores for pairing.
1175 if (RegOp0.isUse()) {
1176 if (!MergeForward) {
1177 // Clear kill flags on store if moving upwards. Example:
1178 // STRWui kill %w0, ...
1179 // USE %w1
1180 // STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards
1181 // We are about to move the store of w1, so its kill flag may become
1182 // invalid; not the case for w0.
1183 // Since w1 is used between the stores, the kill flag on w1 is cleared
1184 // after merging.
1185 // STPWi kill %w0, %w1, ...
1186 // USE %w1
1187 for (auto It = std::next(x: I); It != Paired && PairedRegOp.isKill(); ++It)
1188 if (It->readsRegister(Reg: PairedRegOp.getReg(), TRI))
1189 PairedRegOp.setIsKill(false);
1190 } else {
1191 // Clear kill flags of the first stores register. Example:
1192 // STRWui %w1, ...
1193 // USE kill %w1 ; need to clear kill flag when moving STRWui downwards
1194 // STRW %w0
1195 Register Reg = getLdStRegOp(MI&: *I).getReg();
1196 for (MachineInstr &MI : make_range(x: std::next(x: I), y: Paired))
1197 MI.clearRegisterKills(Reg, RegInfo: TRI);
1198 }
1199 }
1200
1201 unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);
1202 MIB = BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: MatchPairOpcode));
1203
1204 // Adds the pre-index operand for pre-indexed ld/st pairs.
1205 if (AArch64InstrInfo::isPreLdSt(MI: *RtMI))
1206 MIB.addReg(RegNo: BaseRegOp.getReg(), flags: RegState::Define);
1207
1208 MIB.add(MO: RegOp0)
1209 .add(MO: RegOp1)
1210 .add(MO: BaseRegOp)
1211 .addImm(Val: OffsetImm)
1212 .cloneMergedMemRefs(OtherMIs: {&*I, &*Paired})
1213 .setMIFlags(I->mergeFlagsWith(Other: *Paired));
1214
1215 (void)MIB;
1216
1217 LLVM_DEBUG(
1218 dbgs() << "Creating pair load/store. Replacing instructions:\n ");
1219 LLVM_DEBUG(I->print(dbgs()));
1220 LLVM_DEBUG(dbgs() << " ");
1221 LLVM_DEBUG(Paired->print(dbgs()));
1222 LLVM_DEBUG(dbgs() << " with instruction:\n ");
1223 if (SExtIdx != -1) {
1224 // Generate the sign extension for the proper result of the ldp.
1225 // I.e., with X1, that would be:
1226 // %w1 = KILL %w1, implicit-def %x1
1227 // %x1 = SBFMXri killed %x1, 0, 31
1228 MachineOperand &DstMO = MIB->getOperand(i: SExtIdx);
1229 // Right now, DstMO has the extended register, since it comes from an
1230 // extended opcode.
1231 Register DstRegX = DstMO.getReg();
1232 // Get the W variant of that register.
1233 Register DstRegW = TRI->getSubReg(Reg: DstRegX, Idx: AArch64::sub_32);
1234 // Update the result of LDP to use the W instead of the X variant.
1235 DstMO.setReg(DstRegW);
1236 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1237 LLVM_DEBUG(dbgs() << "\n");
1238 // Make the machine verifier happy by providing a definition for
1239 // the X register.
1240 // Insert this definition right after the generated LDP, i.e., before
1241 // InsertionPoint.
1242 MachineInstrBuilder MIBKill =
1243 BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::KILL), DestReg: DstRegW)
1244 .addReg(RegNo: DstRegW)
1245 .addReg(RegNo: DstRegX, flags: RegState::Define);
1246 MIBKill->getOperand(i: 2).setImplicit();
1247 // Create the sign extension.
1248 MachineInstrBuilder MIBSXTW =
1249 BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: AArch64::SBFMXri), DestReg: DstRegX)
1250 .addReg(RegNo: DstRegX)
1251 .addImm(Val: 0)
1252 .addImm(Val: 31);
1253 (void)MIBSXTW;
1254
1255 // In the case of a sign-extend, where we have something like:
1256 // debugValueSubstitutions:[]
1257 // $w1 = LDRWui $x0, 1, debug-instr-number 1
1258 // DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
1259 // $x0 = LDRSWui $x0, 0, debug-instr-number 2
1260 // DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9
1261
1262 // It will be converted to:
1263 // debugValueSubstitutions:[]
1264 // $w0, $w1 = LDPWi $x0, 0
1265 // $w0 = KILL $w0, implicit-def $x0
1266 // $x0 = SBFMXri $x0, 0, 31
1267 // DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
1268 // DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9
1269
1270 // We want the final result to look like:
1271 // debugValueSubstitutions:
1272 // - { srcinst: 1, srcop: 0, dstinst: 4, dstop: 1, subreg: 0 }
1273 // - { srcinst: 2, srcop: 0, dstinst: 3, dstop: 0, subreg: 0 }
1274 // $w0, $w1 = LDPWi $x0, 0, debug-instr-number 4
1275 // $w0 = KILL $w0, implicit-def $x0
1276 // $x0 = SBFMXri $x0, 0, 31, debug-instr-number 3
1277 // DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
1278 // DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9
1279
    // $x0 is where the final value is stored, so the sign extend (SBFMXri)
    // instruction contains the final value we care about, and we give it a new
    // debug-instr-number 3. $w1 also contains a final value we care about, so
    // the LDP instruction is likewise given a new debug-instr-number 4. We have
    // to add these substitutions to the debugValueSubstitutions table. We also
    // have to ensure that the OpIndex that pointed to debug-instr-number 1 gets
    // updated to 1, because $w1 is the second operand of the LDP instruction.
1288
1289 if (I->peekDebugInstrNum()) {
1290 // If I is the instruction which got sign extended and has a
1291 // debug-instr-number, give the SBFMXri instruction a new
1292 // debug-instr-number, and update the debugValueSubstitutions table with
1293 // the new debug-instr-number and OpIndex pair. Otherwise, give the Merged
1294 // instruction a new debug-instr-number, and update the
1295 // debugValueSubstitutions table with the new debug-instr-number and
1296 // OpIndex pair.
1297 unsigned NewInstrNum;
1298 if (DstRegX == I->getOperand(i: 0).getReg()) {
1299 NewInstrNum = MIBSXTW->getDebugInstrNum();
1300 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *I,
1301 MergedInstr&: *MIBSXTW);
1302 } else {
1303 NewInstrNum = MIB->getDebugInstrNum();
1304 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *I, MergedInstr&: *MIB);
1305 }
1306 }
1307 if (Paired->peekDebugInstrNum()) {
1308 // If Paired is the instruction which got sign extended and has a
1309 // debug-instr-number, give the SBFMXri instruction a new
1310 // debug-instr-number, and update the debugValueSubstitutions table with
1311 // the new debug-instr-number and OpIndex pair. Otherwise, give the Merged
1312 // instruction a new debug-instr-number, and update the
1313 // debugValueSubstitutions table with the new debug-instr-number and
1314 // OpIndex pair.
1315 unsigned NewInstrNum;
1316 if (DstRegX == Paired->getOperand(i: 0).getReg()) {
1317 NewInstrNum = MIBSXTW->getDebugInstrNum();
1318 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *Paired,
1319 MergedInstr&: *MIBSXTW);
1320 } else {
1321 NewInstrNum = MIB->getDebugInstrNum();
1322 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *Paired,
1323 MergedInstr&: *MIB);
1324 }
1325 }
1326
1327 LLVM_DEBUG(dbgs() << " Extend operand:\n ");
1328 LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
1329 } else if (Opc == AArch64::LDR_ZXI || Opc == AArch64::STR_ZXI) {
1330 // We are combining SVE fill/spill to LDP/STP, so we need to use the Q
1331 // variant of the registers.
1332 MachineOperand &MOp0 = MIB->getOperand(i: 0);
1333 MachineOperand &MOp1 = MIB->getOperand(i: 1);
1334 assert(AArch64::ZPRRegClass.contains(MOp0.getReg()) &&
1335 AArch64::ZPRRegClass.contains(MOp1.getReg()) && "Invalid register.");
1336 MOp0.setReg(AArch64::Q0 + (MOp0.getReg() - AArch64::Z0));
1337 MOp1.setReg(AArch64::Q0 + (MOp1.getReg() - AArch64::Z0));
1338 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1339 } else {
1340
1341 // In the case that the merge doesn't result in a sign-extend, if we have
1342 // something like:
1343 // debugValueSubstitutions:[]
1344 // $x1 = LDRXui $x0, 1, debug-instr-number 1
1345 // DBG_INSTR_REF !13, dbg-instr-ref(1, 0), debug-location !11
1346 // $x0 = LDRXui killed $x0, 0, debug-instr-number 2
1347 // DBG_INSTR_REF !14, dbg-instr-ref(2, 0), debug-location !11
1348
1349 // It will be converted to:
1350 // debugValueSubstitutions: []
1351 // $x0, $x1 = LDPXi $x0, 0
1352 // DBG_INSTR_REF !12, dbg-instr-ref(1, 0), debug-location !14
1353 // DBG_INSTR_REF !13, dbg-instr-ref(2, 0), debug-location !14
1354
1355 // We want the final result to look like:
1356 // debugValueSubstitutions:
1357 // - { srcinst: 1, srcop: 0, dstinst: 3, dstop: 1, subreg: 0 }
1358 // - { srcinst: 2, srcop: 0, dstinst: 3, dstop: 0, subreg: 0 }
1359 // $x0, $x1 = LDPXi $x0, 0, debug-instr-number 3
1360 // DBG_INSTR_REF !12, dbg-instr-ref(1, 0), debug-location !14
1361 // DBG_INSTR_REF !12, dbg-instr-ref(2, 0), debug-location !14
1362
1363 // Here all that needs to be done is, that the LDP instruction needs to be
1364 // updated with a new debug-instr-number, we then need to add entries into
1365 // the debugSubstitutions table to map the old instr-refs to the new ones.
1366
1367 // Assign new DebugInstrNum to the Paired instruction.
1368 if (I->peekDebugInstrNum()) {
1369 unsigned NewDebugInstrNum = MIB->getDebugInstrNum();
1370 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewDebugInstrNum, OriginalInstr&: *I,
1371 MergedInstr&: *MIB);
1372 }
1373 if (Paired->peekDebugInstrNum()) {
1374 unsigned NewDebugInstrNum = MIB->getDebugInstrNum();
1375 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewDebugInstrNum, OriginalInstr&: *Paired,
1376 MergedInstr&: *MIB);
1377 }
1378
1379 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1380 }
1381 LLVM_DEBUG(dbgs() << "\n");
1382
1383 if (MergeForward)
1384 for (const MachineOperand &MOP : phys_regs_and_masks(MI: *I))
1385 if (MOP.isReg() && MOP.isKill())
1386 DefinedInBB.addReg(Reg: MOP.getReg());
1387
1388 // Erase the old instructions.
1389 I->eraseFromParent();
1390 Paired->eraseFromParent();
1391
1392 return NextI;
1393}
1394
1395MachineBasicBlock::iterator
1396AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
1397 MachineBasicBlock::iterator StoreI) {
1398 MachineBasicBlock::iterator NextI =
1399 next_nodbg(It: LoadI, End: LoadI->getParent()->end());
1400
1401 int LoadSize = TII->getMemScale(MI: *LoadI);
1402 int StoreSize = TII->getMemScale(MI: *StoreI);
1403 Register LdRt = getLdStRegOp(MI&: *LoadI).getReg();
1404 const MachineOperand &StMO = getLdStRegOp(MI&: *StoreI);
1405 Register StRt = getLdStRegOp(MI&: *StoreI).getReg();
1406 bool IsStoreXReg = TRI->getRegClass(i: AArch64::GPR64RegClassID)->contains(Reg: StRt);
1407
1408 assert((IsStoreXReg ||
1409 TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
1410 "Unexpected RegClass");
1411
1412 MachineInstr *BitExtMI;
1413 if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
1414 // Remove the load, if the destination register of the loads is the same
1415 // register for stored value.
1416 if (StRt == LdRt && LoadSize == 8) {
1417 for (MachineInstr &MI : make_range(x: StoreI->getIterator(),
1418 y: LoadI->getIterator())) {
1419 if (MI.killsRegister(Reg: StRt, TRI)) {
1420 MI.clearRegisterKills(Reg: StRt, RegInfo: TRI);
1421 break;
1422 }
1423 }
1424 LLVM_DEBUG(dbgs() << "Remove load instruction:\n ");
1425 LLVM_DEBUG(LoadI->print(dbgs()));
1426 LLVM_DEBUG(dbgs() << "\n");
1427 LoadI->eraseFromParent();
1428 return NextI;
1429 }
1430 // Replace the load with a mov if the load and store are in the same size.
1431 BitExtMI =
1432 BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
1433 MCID: TII->get(Opcode: IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), DestReg: LdRt)
1434 .addReg(RegNo: IsStoreXReg ? AArch64::XZR : AArch64::WZR)
1435 .add(MO: StMO)
1436 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0))
1437 .setMIFlags(LoadI->getFlags());
1438 } else {
1439 // FIXME: Currently we disable this transformation in big-endian targets as
1440 // performance and correctness are verified only in little-endian.
1441 if (!Subtarget->isLittleEndian())
1442 return NextI;
1443 bool IsUnscaled = TII->hasUnscaledLdStOffset(MI&: *LoadI);
1444 assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
1445 "Unsupported ld/st match");
1446 assert(LoadSize <= StoreSize && "Invalid load size");
1447 int UnscaledLdOffset =
1448 IsUnscaled
1449 ? AArch64InstrInfo::getLdStOffsetOp(MI: *LoadI).getImm()
1450 : AArch64InstrInfo::getLdStOffsetOp(MI: *LoadI).getImm() * LoadSize;
1451 int UnscaledStOffset =
1452 IsUnscaled
1453 ? AArch64InstrInfo::getLdStOffsetOp(MI: *StoreI).getImm()
1454 : AArch64InstrInfo::getLdStOffsetOp(MI: *StoreI).getImm() * StoreSize;
1455 int Width = LoadSize * 8;
1456 Register DestReg =
1457 IsStoreXReg ? Register(TRI->getMatchingSuperReg(
1458 Reg: LdRt, SubIdx: AArch64::sub_32, RC: &AArch64::GPR64RegClass))
1459 : LdRt;
1460
1461 assert((UnscaledLdOffset >= UnscaledStOffset &&
1462 (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
1463 "Invalid offset");
1464
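    // The load reads Width bits of the stored value starting at bit
    // 8 * (UnscaledLdOffset - UnscaledStOffset), so extract that bit range from
    // the stored register: an AND when the range starts at bit 0, a UBFM
    // otherwise.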
1465 int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
1466 int Imms = Immr + Width - 1;
1467 if (UnscaledLdOffset == UnscaledStOffset) {
1468 uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
1469 | ((Immr) << 6) // immr
1470 | ((Imms) << 0) // imms
1471 ;
1472
1473 BitExtMI =
1474 BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
1475 MCID: TII->get(Opcode: IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
1476 DestReg)
1477 .add(MO: StMO)
1478 .addImm(Val: AndMaskEncoded)
1479 .setMIFlags(LoadI->getFlags());
1480 } else if (IsStoreXReg && Imms == 31) {
1481 // Use the 32 bit variant of UBFM if it's the LSR alias of the
1482 // instruction.
1483 assert(Immr <= Imms && "Expected LSR alias of UBFM");
1484 BitExtMI = BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
1485 MCID: TII->get(Opcode: AArch64::UBFMWri),
1486 DestReg: TRI->getSubReg(Reg: DestReg, Idx: AArch64::sub_32))
1487 .addReg(RegNo: TRI->getSubReg(Reg: StRt, Idx: AArch64::sub_32))
1488 .addImm(Val: Immr)
1489 .addImm(Val: Imms)
1490 .setMIFlags(LoadI->getFlags());
1491 } else {
1492 BitExtMI =
1493 BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
1494 MCID: TII->get(Opcode: IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
1495 DestReg)
1496 .add(MO: StMO)
1497 .addImm(Val: Immr)
1498 .addImm(Val: Imms)
1499 .setMIFlags(LoadI->getFlags());
1500 }
1501 }
1502
1503 // Clear kill flags between store and load.
1504 for (MachineInstr &MI : make_range(x: StoreI->getIterator(),
1505 y: BitExtMI->getIterator()))
1506 if (MI.killsRegister(Reg: StRt, TRI)) {
1507 MI.clearRegisterKills(Reg: StRt, RegInfo: TRI);
1508 break;
1509 }
1510
1511 LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n ");
1512 LLVM_DEBUG(StoreI->print(dbgs()));
1513 LLVM_DEBUG(dbgs() << " ");
1514 LLVM_DEBUG(LoadI->print(dbgs()));
1515 LLVM_DEBUG(dbgs() << " with instructions:\n ");
1516 LLVM_DEBUG(StoreI->print(dbgs()));
1517 LLVM_DEBUG(dbgs() << " ");
1518 LLVM_DEBUG((BitExtMI)->print(dbgs()));
1519 LLVM_DEBUG(dbgs() << "\n");
1520
1521 // Erase the old instructions.
1522 LoadI->eraseFromParent();
1523 return NextI;
1524}
1525
1526static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
1527 // Convert the byte-offset used by unscaled into an "element" offset used
1528 // by the scaled pair load/store instructions.
1529 if (IsUnscaled) {
1530 // If the byte-offset isn't a multiple of the stride, there's no point
1531 // trying to match it.
1532 if (Offset % OffsetStride)
1533 return false;
1534 Offset /= OffsetStride;
1535 }
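  // Paired instructions have a 7-bit signed, scaled immediate, so the element
  // offset must fit in [-64, 63] (e.g. byte offsets -512..504 for a 64-bit
  // pair).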
1536 return Offset <= 63 && Offset >= -64;
1537}
1538
1539// Do alignment, specialized to power of 2 and for signed ints,
1540 // avoiding having to do a C-style cast from uint64_t to int when
1541// using alignTo from include/llvm/Support/MathExtras.h.
1542// FIXME: Move this function to include/MathExtras.h?
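// For example, alignTo(3, 4) == 4 and alignTo(10, 8) == 16.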
1543static int alignTo(int Num, int PowOf2) {
1544 return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
1545}
1546
1547static bool mayAlias(MachineInstr &MIa,
1548 SmallVectorImpl<MachineInstr *> &MemInsns,
1549 AliasAnalysis *AA) {
1550 for (MachineInstr *MIb : MemInsns) {
1551 if (MIa.mayAlias(AA, Other: *MIb, /*UseTBAA*/ false)) {
1552 LLVM_DEBUG(dbgs() << "Aliasing with: "; MIb->dump());
1553 return true;
1554 }
1555 }
1556
1557 LLVM_DEBUG(dbgs() << "No aliases found\n");
1558 return false;
1559}
1560
1561bool AArch64LoadStoreOpt::findMatchingStore(
1562 MachineBasicBlock::iterator I, unsigned Limit,
1563 MachineBasicBlock::iterator &StoreI) {
1564 MachineBasicBlock::iterator B = I->getParent()->begin();
1565 MachineBasicBlock::iterator MBBI = I;
1566 MachineInstr &LoadMI = *I;
1567 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: LoadMI).getReg();
1568
1569 // If the load is the first instruction in the block, there's obviously
1570 // not any matching store.
1571 if (MBBI == B)
1572 return false;
1573
1574 // Track which register units have been modified and used between the first
1575 // insn and the second insn.
1576 ModifiedRegUnits.clear();
1577 UsedRegUnits.clear();
1578
1579 unsigned Count = 0;
1580 do {
1581 MBBI = prev_nodbg(It: MBBI, Begin: B);
1582 MachineInstr &MI = *MBBI;
1583
1584 // Don't count transient instructions towards the search limit since there
1585 // may be different numbers of them if e.g. debug information is present.
1586 if (!MI.isTransient())
1587 ++Count;
1588
1589 // If the load instruction reads directly from the address to which the
1590 // store instruction writes and the stored value is not modified, we can
1591 // promote the load. Since we do not handle stores with pre-/post-index,
1592 // it's unnecessary to check if BaseReg is modified by the store itself.
1593 // Also we can't handle stores without an immediate offset operand,
1594     // since the operand might instead be the address of a global variable.
1595 if (MI.mayStore() && isMatchingStore(LoadInst&: LoadMI, StoreInst&: MI) &&
1596 BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
1597 AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
1598 isLdOffsetInRangeOfSt(LoadInst&: LoadMI, StoreInst&: MI, TII) &&
1599 ModifiedRegUnits.available(Reg: getLdStRegOp(MI).getReg())) {
1600 StoreI = MBBI;
1601 return true;
1602 }
1603
1604 if (MI.isCall())
1605 return false;
1606
1607     // Update modified / used register units.
1608 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1609
1610 // Otherwise, if the base register is modified, we have no match, so
1611 // return early.
1612 if (!ModifiedRegUnits.available(Reg: BaseReg))
1613 return false;
1614
1615 // If we encounter a store aliased with the load, return early.
1616 if (MI.mayStore() && LoadMI.mayAlias(AA, Other: MI, /*UseTBAA*/ false))
1617 return false;
1618 } while (MBBI != B && Count < Limit);
1619 return false;
1620}
1621
1622static bool needsWinCFI(const MachineFunction *MF) {
1623 return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
1624 MF->getFunction().needsUnwindTableEntry();
1625}
1626
1627// Returns true if FirstMI and MI are candidates for merging or pairing.
1628// Otherwise, returns false.
1629static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
1630 LdStPairFlags &Flags,
1631 const AArch64InstrInfo *TII) {
1632 // If this is volatile or if pairing is suppressed, not a candidate.
1633 if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
1634 return false;
1635
1636 // We should have already checked FirstMI for pair suppression and volatility.
1637 assert(!FirstMI.hasOrderedMemoryRef() &&
1638 !TII->isLdStPairSuppressed(FirstMI) &&
1639          "FirstMI shouldn't get here if either of these checks is true.");
1640
1641 if (needsWinCFI(MF: MI.getMF()) && (MI.getFlag(Flag: MachineInstr::FrameSetup) ||
1642 MI.getFlag(Flag: MachineInstr::FrameDestroy)))
1643 return false;
1644
1645 unsigned OpcA = FirstMI.getOpcode();
1646 unsigned OpcB = MI.getOpcode();
1647
1648 // Opcodes match: If the opcodes are pre ld/st there is nothing more to check.
1649 if (OpcA == OpcB)
1650 return !AArch64InstrInfo::isPreLdSt(MI: FirstMI);
1651
1652 // Bail out if one of the opcodes is SVE fill/spill, as we currently don't
1653 // allow pairing them with other instructions.
1654 if (OpcA == AArch64::LDR_ZXI || OpcA == AArch64::STR_ZXI ||
1655 OpcB == AArch64::LDR_ZXI || OpcB == AArch64::STR_ZXI)
1656 return false;
1657
1658 // Two pre ld/st of different opcodes cannot be merged either
1659 if (AArch64InstrInfo::isPreLdSt(MI: FirstMI) && AArch64InstrInfo::isPreLdSt(MI))
1660 return false;
1661
1662 // Try to match a sign-extended load/store with a zero-extended load/store.
1663 bool IsValidLdStrOpc, PairIsValidLdStrOpc;
1664 unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc: OpcA, IsValidLdStrOpc: &IsValidLdStrOpc);
1665 assert(IsValidLdStrOpc &&
1666 "Given Opc should be a Load or Store with an immediate");
1667 // OpcA will be the first instruction in the pair.
1668 if (NonSExtOpc == getMatchingNonSExtOpcode(Opc: OpcB, IsValidLdStrOpc: &PairIsValidLdStrOpc)) {
1669 Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
1670 return true;
1671 }
1672
1673 // If the second instruction isn't even a mergable/pairable load/store, bail
1674 // out.
1675 if (!PairIsValidLdStrOpc)
1676 return false;
1677
1678   // Narrow stores do not have matching pair opcodes, so constrain their
1679 // merging to zero stores.
1680 if (isNarrowStore(Opc: OpcA) || isNarrowStore(Opc: OpcB))
1681 return getLdStRegOp(MI&: FirstMI).getReg() == AArch64::WZR &&
1682 getLdStRegOp(MI).getReg() == AArch64::WZR &&
1683 TII->getMemScale(MI: FirstMI) == TII->getMemScale(MI);
1684
1685 // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
1686 // LDR<S,D,Q,W,X,SW>pre-LDR<S,D,Q,W,X,SW>ui
1687 // are candidate pairs that can be merged.
1688 if (isPreLdStPairCandidate(FirstMI, MI))
1689 return true;
1690
1691 // Try to match an unscaled load/store with a scaled load/store.
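  // For example (illustrative), STRXui with scaled offset #1 (byte offset 8)
  // and STURXi with byte offset #16 access adjacent doublewords and map to
  // the same pair opcode (STP).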
1692 return TII->hasUnscaledLdStOffset(Opc: OpcA) != TII->hasUnscaledLdStOffset(Opc: OpcB) &&
1693 getMatchingPairOpcode(Opc: OpcA) == getMatchingPairOpcode(Opc: OpcB);
1694
1695 // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
1696}
1697
1698static bool canRenameMOP(const MachineOperand &MOP,
1699 const TargetRegisterInfo *TRI) {
1700 if (MOP.isReg()) {
1701 auto *RegClass = TRI->getMinimalPhysRegClass(Reg: MOP.getReg());
1702 // Renaming registers with multiple disjunct sub-registers (e.g. the
1703 // result of a LD3) means that all sub-registers are renamed, potentially
1704 // impacting other instructions we did not check. Bail out.
1705 // Note that this relies on the structure of the AArch64 register file. In
1706 // particular, a subregister cannot be written without overwriting the
1707 // whole register.
1708 if (RegClass->HasDisjunctSubRegs && RegClass->CoveredBySubRegs &&
1709 (TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::dsub0) ||
1710 TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::qsub0) ||
1711 TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::zsub0))) {
1712 LLVM_DEBUG(
1713 dbgs()
1714 << " Cannot rename operands with multiple disjunct subregisters ("
1715 << MOP << ")\n");
1716 return false;
1717 }
1718
1719     // We cannot rename arbitrary implicit-defs; the specific rule to rewrite
1720 // them must be known. For example, in ORRWrs the implicit-def
1721 // corresponds to the result register.
1722 if (MOP.isImplicit() && MOP.isDef()) {
1723 if (!isRewritableImplicitDef(Opc: MOP.getParent()->getOpcode()))
1724 return false;
1725 return TRI->isSuperOrSubRegisterEq(
1726 RegA: MOP.getParent()->getOperand(i: 0).getReg(), RegB: MOP.getReg());
1727 }
1728 }
1729 return MOP.isImplicit() ||
1730 (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
1731}
1732
1733static bool
1734canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
1735 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1736 const TargetRegisterInfo *TRI) {
1737 if (!FirstMI.mayStore())
1738 return false;
1739
1740 // Check if we can find an unused register which we can use to rename
1741 // the register used by the first load/store.
1742
1743 auto RegToRename = getLdStRegOp(MI&: FirstMI).getReg();
1744 // For now, we only rename if the store operand gets killed at the store.
1745 if (!getLdStRegOp(MI&: FirstMI).isKill() &&
1746 !any_of(Range: FirstMI.operands(),
1747 P: [TRI, RegToRename](const MachineOperand &MOP) {
1748 return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1749 MOP.isImplicit() && MOP.isKill() &&
1750 TRI->regsOverlap(RegA: RegToRename, RegB: MOP.getReg());
1751 })) {
1752 LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI);
1753 return false;
1754 }
1755
1756 bool FoundDef = false;
1757
1758   // For each instruction between FirstMI and the previous def for
1759   // RegToRename, we:
1760 // * check if we can rename RegToRename in this instruction
1761 // * collect the registers used and required register classes for RegToRename.
1762 std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
1763 bool IsDef) {
1764 LLVM_DEBUG(dbgs() << "Checking " << MI);
1765 // Currently we do not try to rename across frame-setup instructions.
1766 if (MI.getFlag(Flag: MachineInstr::FrameSetup)) {
1767 LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1768 << "currently\n");
1769 return false;
1770 }
1771
1772 UsedInBetween.accumulate(MI);
1773
1774 // For a definition, check that we can rename the definition and exit the
1775 // loop.
1776 FoundDef = IsDef;
1777
1778 // For defs, check if we can rename the first def of RegToRename.
1779 if (FoundDef) {
1780 // For some pseudo instructions, we might not generate code in the end
1781 // (e.g. KILL) and we would end up without a correct def for the rename
1782 // register.
1783 // TODO: This might be overly conservative and we could handle those cases
1784 // in multiple ways:
1785 // 1. Insert an extra copy, to materialize the def.
1786       //    2. Skip pseudo-defs until we find a non-pseudo def.
1787 if (MI.isPseudo()) {
1788 LLVM_DEBUG(dbgs() << " Cannot rename pseudo/bundle instruction\n");
1789 return false;
1790 }
1791
1792 for (auto &MOP : MI.operands()) {
1793 if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
1794 !TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
1795 continue;
1796 if (!canRenameMOP(MOP, TRI)) {
1797 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1798 return false;
1799 }
1800 RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1801 }
1802 return true;
1803 } else {
1804 for (auto &MOP : MI.operands()) {
1805 if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1806 !TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
1807 continue;
1808
1809 if (!canRenameMOP(MOP, TRI)) {
1810 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1811 return false;
1812 }
1813 RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1814 }
1815 }
1816 return true;
1817 };
1818
1819 if (!forAllMIsUntilDef(MI&: FirstMI, DefReg: RegToRename, TRI, Limit: LdStLimit, Fn&: CheckMIs))
1820 return false;
1821
1822 if (!FoundDef) {
1823 LLVM_DEBUG(dbgs() << " Did not find definition for register in BB\n");
1824 return false;
1825 }
1826 return true;
1827}
1828
1829// We want to merge the second load into the first by rewriting the usages of
1830// the same reg between first (incl.) and second (excl.). We don't need to care
1831// about any insns before FirstLoad or after SecondLoad.
1832 // 1. The second load writes a new value into the same reg.
1833 //    - The renaming cannot affect any later use of the reg.
1834 //    - The second load always trashes the value written by the first load, which
1835// means the reg must be killed before the second load.
1836// 2. The first load must be a def for the same reg so we don't need to look
1837// into anything before it.
1838static bool canRenameUntilSecondLoad(
1839 MachineInstr &FirstLoad, MachineInstr &SecondLoad,
1840 LiveRegUnits &UsedInBetween,
1841 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1842 const TargetRegisterInfo *TRI) {
1843 if (FirstLoad.isPseudo())
1844 return false;
1845
1846 UsedInBetween.accumulate(MI: FirstLoad);
1847 auto RegToRename = getLdStRegOp(MI&: FirstLoad).getReg();
1848 bool Success = std::all_of(
1849 first: FirstLoad.getIterator(), last: SecondLoad.getIterator(),
1850 pred: [&](MachineInstr &MI) {
1851 LLVM_DEBUG(dbgs() << "Checking " << MI);
1852 // Currently we do not try to rename across frame-setup instructions.
1853 if (MI.getFlag(Flag: MachineInstr::FrameSetup)) {
1854 LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1855 << "currently\n");
1856 return false;
1857 }
1858
1859 for (auto &MOP : MI.operands()) {
1860 if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1861 !TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
1862 continue;
1863 if (!canRenameMOP(MOP, TRI)) {
1864 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1865 return false;
1866 }
1867 RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1868 }
1869
1870 return true;
1871 });
1872 return Success;
1873}
1874
1875// Check if we can find a physical register for renaming \p Reg. This register
1876// must:
1877// * not be defined already in \p DefinedInBB; DefinedInBB must contain all
1878// defined registers up to the point where the renamed register will be used,
1879 // * not be used in \p UsedInBetween; UsedInBetween must contain all accessed
1880 //   registers in the range over which the rename register will be used,
1881 // * be available in all used register classes (checked using RequiredClasses).
1882static std::optional<MCPhysReg> tryToFindRegisterToRename(
1883 const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
1884 LiveRegUnits &UsedInBetween,
1885 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1886 const TargetRegisterInfo *TRI) {
1887 const MachineRegisterInfo &RegInfo = MF.getRegInfo();
1888
1889 // Checks if any sub- or super-register of PR is callee saved.
1890 auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
1891 return any_of(Range: TRI->sub_and_superregs_inclusive(Reg: PR),
1892 P: [&MF, TRI](MCPhysReg SubOrSuper) {
1893 return TRI->isCalleeSavedPhysReg(PhysReg: SubOrSuper, MF);
1894 });
1895 };
1896
1897 // Check if PR or one of its sub- or super-registers can be used for all
1898 // required register classes.
1899 auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
1900 return all_of(Range&: RequiredClasses, P: [PR, TRI](const TargetRegisterClass *C) {
1901 return any_of(
1902 Range: TRI->sub_and_superregs_inclusive(Reg: PR),
1903 P: [C](MCPhysReg SubOrSuper) { return C->contains(Reg: SubOrSuper); });
1904 });
1905 };
1906
1907 auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
1908 for (const MCPhysReg &PR : *RegClass) {
1909 if (DefinedInBB.available(Reg: PR) && UsedInBetween.available(Reg: PR) &&
1910 !RegInfo.isReserved(PhysReg: PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
1911 CanBeUsedForAllClasses(PR)) {
1912 DefinedInBB.addReg(Reg: PR);
1913 LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
1914 << "\n");
1915 return {PR};
1916 }
1917 }
1918 LLVM_DEBUG(dbgs() << "No rename register found from "
1919 << TRI->getRegClassName(RegClass) << "\n");
1920 return std::nullopt;
1921}
1922
1923 // For store pairs: returns a register that can be renamed in the range from
1924 // FirstMI back to the beginning of the block.
1925 // For load pairs: returns a register that can be renamed between FirstMI and MI.
1926static std::optional<MCPhysReg> findRenameRegForSameLdStRegPair(
1927 std::optional<bool> MaybeCanRename, MachineInstr &FirstMI, MachineInstr &MI,
1928 Register Reg, LiveRegUnits &DefinedInBB, LiveRegUnits &UsedInBetween,
1929 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1930 const TargetRegisterInfo *TRI) {
1931 std::optional<MCPhysReg> RenameReg;
1932 if (!DebugCounter::shouldExecute(CounterName: RegRenamingCounter))
1933 return RenameReg;
1934
1935 auto *RegClass = TRI->getMinimalPhysRegClass(Reg: getLdStRegOp(MI&: FirstMI).getReg());
1936 MachineFunction &MF = *FirstMI.getParent()->getParent();
1937 if (!RegClass || !MF.getRegInfo().tracksLiveness())
1938 return RenameReg;
1939
1940 const bool IsLoad = FirstMI.mayLoad();
1941
1942 if (!MaybeCanRename) {
1943 if (IsLoad)
1944 MaybeCanRename = {canRenameUntilSecondLoad(FirstLoad&: FirstMI, SecondLoad&: MI, UsedInBetween,
1945 RequiredClasses, TRI)};
1946 else
1947 MaybeCanRename = {
1948 canRenameUpToDef(FirstMI, UsedInBetween, RequiredClasses, TRI)};
1949 }
1950
1951 if (*MaybeCanRename) {
1952 RenameReg = tryToFindRegisterToRename(MF, Reg, DefinedInBB, UsedInBetween,
1953 RequiredClasses, TRI);
1954 }
1955 return RenameReg;
1956}
1957
1958/// Scan the instructions looking for a load/store that can be combined with the
1959/// current instruction into a wider equivalent or a load/store pair.
1960MachineBasicBlock::iterator
1961AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
1962 LdStPairFlags &Flags, unsigned Limit,
1963 bool FindNarrowMerge) {
1964 MachineBasicBlock::iterator E = I->getParent()->end();
1965 MachineBasicBlock::iterator MBBI = I;
1966 MachineBasicBlock::iterator MBBIWithRenameReg;
1967 MachineInstr &FirstMI = *I;
1968 MBBI = next_nodbg(It: MBBI, End: E);
1969
1970 bool MayLoad = FirstMI.mayLoad();
1971 bool IsUnscaled = TII->hasUnscaledLdStOffset(MI&: FirstMI);
1972 Register Reg = getLdStRegOp(MI&: FirstMI).getReg();
1973 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: FirstMI).getReg();
1974 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: FirstMI).getImm();
1975 int OffsetStride = IsUnscaled ? TII->getMemScale(MI: FirstMI) : 1;
1976 bool IsPromotableZeroStore = isPromotableZeroStoreInst(MI&: FirstMI);
1977
1978 std::optional<bool> MaybeCanRename;
1979 if (!EnableRenaming)
1980 MaybeCanRename = {false};
1981
1982 SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
1983 LiveRegUnits UsedInBetween;
1984 UsedInBetween.init(TRI: *TRI);
1985
1986 Flags.clearRenameReg();
1987
1988 // Track which register units have been modified and used between the first
1989 // insn (inclusive) and the second insn.
1990 ModifiedRegUnits.clear();
1991 UsedRegUnits.clear();
1992
1993 // Remember any instructions that read/write memory between FirstMI and MI.
1994 SmallVector<MachineInstr *, 4> MemInsns;
1995
1996 LLVM_DEBUG(dbgs() << "Find match for: "; FirstMI.dump());
1997 for (unsigned Count = 0; MBBI != E && Count < Limit;
1998 MBBI = next_nodbg(It: MBBI, End: E)) {
1999 MachineInstr &MI = *MBBI;
2000 LLVM_DEBUG(dbgs() << "Analysing 2nd insn: "; MI.dump());
2001
2002 UsedInBetween.accumulate(MI);
2003
2004 // Don't count transient instructions towards the search limit since there
2005 // may be different numbers of them if e.g. debug information is present.
2006 if (!MI.isTransient())
2007 ++Count;
2008
2009 Flags.setSExtIdx(-1);
2010 if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
2011 AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
2012 assert(MI.mayLoadOrStore() && "Expected memory operation.");
2013 // If we've found another instruction with the same opcode, check to see
2014 // if the base and offset are compatible with our starting instruction.
2015 // These instructions all have scaled immediate operands, so we just
2016 // check for +1/-1. Make sure to check the new instruction offset is
2017 // actually an immediate and not a symbolic reference destined for
2018 // a relocation.
2019 Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
2020 int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
2021 bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
2022 if (IsUnscaled != MIIsUnscaled) {
2023 // We're trying to pair instructions that differ in how they are scaled.
2024 // If FirstMI is scaled then scale the offset of MI accordingly.
2025 // Otherwise, do the opposite (i.e., make MI's offset unscaled).
2026 int MemSize = TII->getMemScale(MI);
2027 if (MIIsUnscaled) {
2028 // If the unscaled offset isn't a multiple of the MemSize, we can't
2029 // pair the operations together: bail and keep looking.
2030 if (MIOffset % MemSize) {
2031 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2032 UsedRegUnits, TRI);
2033 MemInsns.push_back(Elt: &MI);
2034 continue;
2035 }
2036 MIOffset /= MemSize;
2037 } else {
2038 MIOffset *= MemSize;
2039 }
2040 }
2041
2042 bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);
2043
2044 if (BaseReg == MIBaseReg) {
2045 // If the offset of the second ld/st is not equal to the size of the
2046         // destination register, it can't be paired with a pre-index ld/st
2047 // pair. Additionally if the base reg is used or modified the operations
2048 // can't be paired: bail and keep looking.
2049 if (IsPreLdSt) {
2050 bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
2051 bool IsBaseRegUsed = !UsedRegUnits.available(
2052 Reg: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
2053 bool IsBaseRegModified = !ModifiedRegUnits.available(
2054 Reg: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
2055           // If the stored value and the address of the second instruction are
2056 // the same, it needs to be using the updated register and therefore
2057 // it must not be folded.
2058 bool IsMIRegTheSame =
2059 TRI->regsOverlap(RegA: getLdStRegOp(MI).getReg(),
2060 RegB: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
2061 if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
2062 IsMIRegTheSame) {
2063 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2064 UsedRegUnits, TRI);
2065 MemInsns.push_back(Elt: &MI);
2066 continue;
2067 }
2068 } else {
2069 if ((Offset != MIOffset + OffsetStride) &&
2070 (Offset + OffsetStride != MIOffset)) {
2071 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2072 UsedRegUnits, TRI);
2073 MemInsns.push_back(Elt: &MI);
2074 continue;
2075 }
2076 }
2077
2078 int MinOffset = Offset < MIOffset ? Offset : MIOffset;
2079 if (FindNarrowMerge) {
2080 // If the alignment requirements of the scaled wide load/store
2081 // instruction can't express the offset of the scaled narrow input,
2082 // bail and keep looking. For promotable zero stores, allow only when
2083 // the stored value is the same (i.e., WZR).
2084 if ((!IsUnscaled && alignTo(Num: MinOffset, PowOf2: 2) != MinOffset) ||
2085 (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
2086 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2087 UsedRegUnits, TRI);
2088 MemInsns.push_back(Elt: &MI);
2089 continue;
2090 }
2091 } else {
2092 // Pairwise instructions have a 7-bit signed offset field. Single
2093 // insns have a 12-bit unsigned offset field. If the resultant
2094 // immediate offset of merging these instructions is out of range for
2095 // a pairwise instruction, bail and keep looking.
2096 if (!inBoundsForPair(IsUnscaled, Offset: MinOffset, OffsetStride)) {
2097 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2098 UsedRegUnits, TRI);
2099 MemInsns.push_back(Elt: &MI);
2100 LLVM_DEBUG(dbgs() << "Offset doesn't fit in immediate, "
2101 << "keep looking.\n");
2102 continue;
2103 }
2104 // If the alignment requirements of the paired (scaled) instruction
2105 // can't express the offset of the unscaled input, bail and keep
2106 // looking.
2107 if (IsUnscaled && (alignTo(Num: MinOffset, PowOf2: OffsetStride) != MinOffset)) {
2108 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2109 UsedRegUnits, TRI);
2110 MemInsns.push_back(Elt: &MI);
2111 LLVM_DEBUG(dbgs()
2112 << "Offset doesn't fit due to alignment requirements, "
2113 << "keep looking.\n");
2114 continue;
2115 }
2116 }
2117
2118 // If the BaseReg has been modified, then we cannot do the optimization.
2119 // For example, in the following pattern
2120 // ldr x1 [x2]
2121 // ldr x2 [x3]
2122 // ldr x4 [x2, #8],
2123 // the first and third ldr cannot be converted to ldp x1, x4, [x2]
2124 if (!ModifiedRegUnits.available(Reg: BaseReg))
2125 return E;
2126
2127 const bool SameLoadReg = MayLoad && TRI->isSuperOrSubRegisterEq(
2128 RegA: Reg, RegB: getLdStRegOp(MI).getReg());
2129
2130 // If the Rt of the second instruction (destination register of the
2131 // load) was not modified or used between the two instructions and none
2132 // of the instructions between the second and first alias with the
2133 // second, we can combine the second into the first.
2134 bool RtNotModified =
2135 ModifiedRegUnits.available(Reg: getLdStRegOp(MI).getReg());
2136 bool RtNotUsed = !(MI.mayLoad() && !SameLoadReg &&
2137 !UsedRegUnits.available(Reg: getLdStRegOp(MI).getReg()));
2138
2139 LLVM_DEBUG(dbgs() << "Checking, can combine 2nd into 1st insn:\n"
2140 << "Reg '" << getLdStRegOp(MI) << "' not modified: "
2141 << (RtNotModified ? "true" : "false") << "\n"
2142 << "Reg '" << getLdStRegOp(MI) << "' not used: "
2143 << (RtNotUsed ? "true" : "false") << "\n");
2144
2145 if (RtNotModified && RtNotUsed && !mayAlias(MIa&: MI, MemInsns, AA)) {
2146 // For pairs loading into the same reg, try to find a renaming
2147 // opportunity to allow the renaming of Reg between FirstMI and MI
2148 // and combine MI into FirstMI; otherwise bail and keep looking.
2149 if (SameLoadReg) {
2150 std::optional<MCPhysReg> RenameReg =
2151 findRenameRegForSameLdStRegPair(MaybeCanRename, FirstMI, MI,
2152 Reg, DefinedInBB, UsedInBetween,
2153 RequiredClasses, TRI);
2154 if (!RenameReg) {
2155 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2156 UsedRegUnits, TRI);
2157 MemInsns.push_back(Elt: &MI);
2158 LLVM_DEBUG(dbgs() << "Can't find reg for renaming, "
2159 << "keep looking.\n");
2160 continue;
2161 }
2162 Flags.setRenameReg(*RenameReg);
2163 }
2164
2165 Flags.setMergeForward(false);
2166 if (!SameLoadReg)
2167 Flags.clearRenameReg();
2168 return MBBI;
2169 }
2170
2171 // Likewise, if the Rt of the first instruction is not modified or used
2172 // between the two instructions and none of the instructions between the
2173 // first and the second alias with the first, we can combine the first
2174 // into the second.
2175 RtNotModified = !(
2176 MayLoad && !UsedRegUnits.available(Reg: getLdStRegOp(MI&: FirstMI).getReg()));
2177
2178 LLVM_DEBUG(dbgs() << "Checking, can combine 1st into 2nd insn:\n"
2179 << "Reg '" << getLdStRegOp(FirstMI)
2180 << "' not modified: "
2181 << (RtNotModified ? "true" : "false") << "\n");
2182
2183 if (RtNotModified && !mayAlias(MIa&: FirstMI, MemInsns, AA)) {
2184 if (ModifiedRegUnits.available(Reg: getLdStRegOp(MI&: FirstMI).getReg())) {
2185 Flags.setMergeForward(true);
2186 Flags.clearRenameReg();
2187 return MBBI;
2188 }
2189
2190 std::optional<MCPhysReg> RenameReg = findRenameRegForSameLdStRegPair(
2191 MaybeCanRename, FirstMI, MI, Reg, DefinedInBB, UsedInBetween,
2192 RequiredClasses, TRI);
2193 if (RenameReg) {
2194 Flags.setMergeForward(true);
2195 Flags.setRenameReg(*RenameReg);
2196 MBBIWithRenameReg = MBBI;
2197 }
2198 }
2199 LLVM_DEBUG(dbgs() << "Unable to combine these instructions due to "
2200 << "interference in between, keep looking.\n");
2201 }
2202 }
2203
2204 if (Flags.getRenameReg())
2205 return MBBIWithRenameReg;
2206
2207     // The instruction wasn't a matching load or store. Stop searching if we
2208 // encounter a call instruction that might modify memory.
2209 if (MI.isCall()) {
2210 LLVM_DEBUG(dbgs() << "Found a call, stop looking.\n");
2211 return E;
2212 }
2213
2214     // Update modified / used register units.
2215 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2216
2217 // Otherwise, if the base register is modified, we have no match, so
2218 // return early.
2219 if (!ModifiedRegUnits.available(Reg: BaseReg)) {
2220 LLVM_DEBUG(dbgs() << "Base reg is modified, stop looking.\n");
2221 return E;
2222 }
2223
2224 // Update list of instructions that read/write memory.
2225 if (MI.mayLoadOrStore())
2226 MemInsns.push_back(Elt: &MI);
2227 }
2228 return E;
2229}
2230
2231static MachineBasicBlock::iterator
2232maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
2233 assert((MI.getOpcode() == AArch64::SUBXri ||
2234 MI.getOpcode() == AArch64::ADDXri) &&
2235 "Expected a register update instruction");
2236 auto End = MI.getParent()->end();
2237 if (MaybeCFI == End ||
2238 MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
2239 !(MI.getFlag(Flag: MachineInstr::FrameSetup) ||
2240 MI.getFlag(Flag: MachineInstr::FrameDestroy)) ||
2241 MI.getOperand(i: 0).getReg() != AArch64::SP)
2242 return End;
2243
2244 const MachineFunction &MF = *MI.getParent()->getParent();
2245 unsigned CFIIndex = MaybeCFI->getOperand(i: 0).getCFIIndex();
2246 const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
2247 switch (CFI.getOperation()) {
2248 case MCCFIInstruction::OpDefCfa:
2249 case MCCFIInstruction::OpDefCfaOffset:
2250 return MaybeCFI;
2251 default:
2252 return End;
2253 }
2254}
2255
2256std::optional<MachineBasicBlock::iterator> AArch64LoadStoreOpt::mergeUpdateInsn(
2257 MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update,
2258 bool IsForward, bool IsPreIdx, bool MergeEither) {
2259 assert((Update->getOpcode() == AArch64::ADDXri ||
2260 Update->getOpcode() == AArch64::SUBXri) &&
2261 "Unexpected base register update instruction to merge!");
2262 MachineBasicBlock::iterator E = I->getParent()->end();
2263 MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
2264
2265   // If updating the SP and the following instruction is a CFA-offset-related
2266   // CFI, make sure the CFI follows the SP update, either by merging at the
2267   // location of the update or by moving the CFI after the merged instruction.
2268   // If unable to do so, bail.
2269 MachineBasicBlock::iterator InsertPt = I;
2270 if (IsForward) {
2271 assert(IsPreIdx);
2272 if (auto CFI = maybeMoveCFI(MI&: *Update, MaybeCFI: next_nodbg(It: Update, End: E)); CFI != E) {
2273 if (MergeEither) {
2274 InsertPt = Update;
2275 } else {
2276 // Take care not to reorder CFIs.
2277 if (std::any_of(first: std::next(x: CFI), last: I, pred: [](const auto &Insn) {
2278 return Insn.getOpcode() == TargetOpcode::CFI_INSTRUCTION;
2279 }))
2280 return std::nullopt;
2281
2282 MachineBasicBlock *MBB = InsertPt->getParent();
2283 MBB->splice(Where: std::next(x: InsertPt), Other: MBB, From: CFI);
2284 }
2285 }
2286 }
2287
2288 // Return the instruction following the merged instruction, which is
2289 // the instruction following our unmerged load. Unless that's the add/sub
2290 // instruction we're merging, in which case it's the one after that.
2291 if (NextI == Update)
2292 NextI = next_nodbg(It: NextI, End: E);
2293
2294 int Value = Update->getOperand(i: 2).getImm();
2295 assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
2296 "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
2297 if (Update->getOpcode() == AArch64::SUBXri)
2298 Value = -Value;
2299
2300 unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(Opc: I->getOpcode())
2301 : getPostIndexedOpcode(Opc: I->getOpcode());
2302 MachineInstrBuilder MIB;
2303 int Scale, MinOffset, MaxOffset;
2304 getPrePostIndexedMemOpInfo(MI: *I, Scale, MinOffset, MaxOffset);
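  // Note: the pre/post-index writeback immediate is unscaled (Scale == 1) for
  // ordinary unpaired accesses, while paired (and tag-store) forms keep their
  // element scale; e.g. merging "add x20, x20, #32" into a 64-bit ldp encodes
  // Value / Scale == 4 here.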
2305 if (!AArch64InstrInfo::isPairedLdSt(MI: *I)) {
2306 // Non-paired instruction.
2307 MIB = BuildMI(BB&: *InsertPt->getParent(), I: InsertPt, MIMD: InsertPt->getDebugLoc(),
2308 MCID: TII->get(Opcode: NewOpc))
2309 .add(MO: Update->getOperand(i: 0))
2310 .add(MO: getLdStRegOp(MI&: *I))
2311 .add(MO: AArch64InstrInfo::getLdStBaseOp(MI: *I))
2312 .addImm(Val: Value / Scale)
2313 .setMemRefs(I->memoperands())
2314 .setMIFlags(I->mergeFlagsWith(Other: *Update));
2315 } else {
2316 // Paired instruction.
2317 MIB = BuildMI(BB&: *InsertPt->getParent(), I: InsertPt, MIMD: InsertPt->getDebugLoc(),
2318 MCID: TII->get(Opcode: NewOpc))
2319 .add(MO: Update->getOperand(i: 0))
2320 .add(MO: getLdStRegOp(MI&: *I, PairedRegOp: 0))
2321 .add(MO: getLdStRegOp(MI&: *I, PairedRegOp: 1))
2322 .add(MO: AArch64InstrInfo::getLdStBaseOp(MI: *I))
2323 .addImm(Val: Value / Scale)
2324 .setMemRefs(I->memoperands())
2325 .setMIFlags(I->mergeFlagsWith(Other: *Update));
2326 }
2327
2328 if (IsPreIdx) {
2329 ++NumPreFolded;
2330 LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
2331 } else {
2332 ++NumPostFolded;
2333 LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
2334 }
2335 LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
2336 LLVM_DEBUG(I->print(dbgs()));
2337 LLVM_DEBUG(dbgs() << " ");
2338 LLVM_DEBUG(Update->print(dbgs()));
2339 LLVM_DEBUG(dbgs() << " with instruction:\n ");
2340 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
2341 LLVM_DEBUG(dbgs() << "\n");
2342
2343 // Erase the old instructions for the block.
2344 I->eraseFromParent();
2345 Update->eraseFromParent();
2346
2347 return NextI;
2348}
2349
2350MachineBasicBlock::iterator
2351AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
2352 MachineBasicBlock::iterator Update,
2353 unsigned Offset, int Scale) {
2354 assert((Update->getOpcode() == AArch64::MOVKWi) &&
2355 "Unexpected const mov instruction to merge!");
2356 MachineBasicBlock::iterator E = I->getParent()->end();
2357 MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
2358 MachineBasicBlock::iterator PrevI = prev_nodbg(It: Update, Begin: E);
2359 MachineInstr &MemMI = *I;
2360 unsigned Mask = (1 << 12) * Scale - 1;
2361 unsigned Low = Offset & Mask;
2362 unsigned High = Offset - Low;
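  // For example, with Scale == 1 and Offset == 0x12345: Low == 0x345 and
  // High == 0x12000, i.e. "add IndexReg, BaseReg, #0x12, lsl #12" followed by
  // a load/store with immediate #0x345.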
2363 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
2364 Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getReg();
2365 MachineInstrBuilder AddMIB, MemMIB;
2366
2367 // Add IndexReg, BaseReg, High (the BaseReg may be SP)
2368 AddMIB =
2369 BuildMI(BB&: *I->getParent(), I, MIMD: I->getDebugLoc(), MCID: TII->get(Opcode: AArch64::ADDXri))
2370 .addDef(RegNo: IndexReg)
2371 .addUse(RegNo: BaseReg)
2372 .addImm(Val: High >> 12) // shifted value
2373 .addImm(Val: 12); // shift 12
2374 (void)AddMIB;
2375 // Ld/St DestReg, IndexReg, Imm12
2376 unsigned NewOpc = getBaseAddressOpcode(Opc: I->getOpcode());
2377 MemMIB = BuildMI(BB&: *I->getParent(), I, MIMD: I->getDebugLoc(), MCID: TII->get(Opcode: NewOpc))
2378 .add(MO: getLdStRegOp(MI&: MemMI))
2379 .add(MO: AArch64InstrInfo::getLdStOffsetOp(MI: MemMI))
2380 .addImm(Val: Low / Scale)
2381 .setMemRefs(I->memoperands())
2382 .setMIFlags(I->mergeFlagsWith(Other: *Update));
2383 (void)MemMIB;
2384
2385 ++NumConstOffsetFolded;
2386 LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
2387 LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
2388 LLVM_DEBUG(PrevI->print(dbgs()));
2389 LLVM_DEBUG(dbgs() << " ");
2390 LLVM_DEBUG(Update->print(dbgs()));
2391 LLVM_DEBUG(dbgs() << " ");
2392 LLVM_DEBUG(I->print(dbgs()));
2393 LLVM_DEBUG(dbgs() << " with instruction:\n ");
2394 LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs()));
2395 LLVM_DEBUG(dbgs() << " ");
2396 LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs()));
2397 LLVM_DEBUG(dbgs() << "\n");
2398
2399 // Erase the old instructions for the block.
2400 I->eraseFromParent();
2401 PrevI->eraseFromParent();
2402 Update->eraseFromParent();
2403
2404 return NextI;
2405}
2406
2407bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
2408 MachineInstr &MI,
2409 unsigned BaseReg, int Offset) {
2410 switch (MI.getOpcode()) {
2411 default:
2412 break;
2413 case AArch64::SUBXri:
2414 case AArch64::ADDXri:
2415 // Make sure it's a vanilla immediate operand, not a relocation or
2416 // anything else we can't handle.
2417 if (!MI.getOperand(i: 2).isImm())
2418 break;
2419 // Watch out for 1 << 12 shifted value.
2420 if (AArch64_AM::getShiftValue(Imm: MI.getOperand(i: 3).getImm()))
2421 break;
2422
2423 // The update instruction source and destination register must be the
2424 // same as the load/store base register.
2425 if (MI.getOperand(i: 0).getReg() != BaseReg ||
2426 MI.getOperand(i: 1).getReg() != BaseReg)
2427 break;
2428
2429 int UpdateOffset = MI.getOperand(i: 2).getImm();
2430 if (MI.getOpcode() == AArch64::SUBXri)
2431 UpdateOffset = -UpdateOffset;
2432
2433 // The immediate must be a multiple of the scaling factor of the pre/post
2434 // indexed instruction.
2435 int Scale, MinOffset, MaxOffset;
2436 getPrePostIndexedMemOpInfo(MI: MemMI, Scale, MinOffset, MaxOffset);
2437 if (UpdateOffset % Scale != 0)
2438 break;
2439
2440 // Scaled offset must fit in the instruction immediate.
2441 int ScaledOffset = UpdateOffset / Scale;
2442 if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
2443 break;
2444
2445 // If we have a non-zero Offset, we check that it matches the amount
2446 // we're adding to the register.
2447 if (!Offset || Offset == UpdateOffset)
2448 return true;
2449 break;
2450 }
2451 return false;
2452}
2453
2454bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
2455 MachineInstr &MI,
2456 unsigned IndexReg,
2457 unsigned &Offset) {
2458 // The update instruction source and destination register must be the
2459 // same as the load/store index register.
2460 if (MI.getOpcode() == AArch64::MOVKWi &&
2461 TRI->isSuperOrSubRegisterEq(RegA: IndexReg, RegB: MI.getOperand(i: 1).getReg())) {
2462
2463     // A movz + movk pair holds a large offset for a Ld/St instruction.
2464 MachineBasicBlock::iterator B = MI.getParent()->begin();
2465 MachineBasicBlock::iterator MBBI = &MI;
2466     // Bail out if MI is the first instruction of the block.
2467 if (MBBI == B)
2468 return false;
2469 MBBI = prev_nodbg(It: MBBI, Begin: B);
2470 MachineInstr &MovzMI = *MBBI;
2471 // Make sure the MOVKWi and MOVZWi set the same register.
2472 if (MovzMI.getOpcode() == AArch64::MOVZWi &&
2473 MovzMI.getOperand(i: 0).getReg() == MI.getOperand(i: 0).getReg()) {
2474 unsigned Low = MovzMI.getOperand(i: 1).getImm();
2475 unsigned High = MI.getOperand(i: 2).getImm() << MI.getOperand(i: 3).getImm();
2476 Offset = High + Low;
2477 // 12-bit optionally shifted immediates are legal for adds.
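      // For example, "movz w8, #0x345" + "movk w8, #0x12, lsl #16" yields
      // Offset == 0x120345, which passes the 24-bit check below and can later
      // be split into an add of #0x120, lsl #12 plus a 12-bit ld/st immediate.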
2478 return Offset >> 24 == 0;
2479 }
2480 }
2481 return false;
2482}
2483
2484MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
2485 MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
2486 MachineBasicBlock::iterator E = I->getParent()->end();
2487 MachineInstr &MemMI = *I;
2488 MachineBasicBlock::iterator MBBI = I;
2489
2490 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
2491 int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getImm() *
2492 TII->getMemScale(MI: MemMI);
2493
2494 // Scan forward looking for post-index opportunities. Updating instructions
2495 // can't be formed if the memory instruction doesn't have the offset we're
2496 // looking for.
2497 if (MIUnscaledOffset != UnscaledOffset)
2498 return E;
2499
2500 // If the base register overlaps a source/destination register, we can't
2501 // merge the update. This does not apply to tag store instructions which
2502 // ignore the address part of the source register.
2503   // Nor does this apply to STGPi, which, unlike normal stores, has no
2504   // unpredictable behavior in this case and always performs writeback
2505   // after reading the source register value.
2506 if (!isTagStore(MI: MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
2507 bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MI: MemMI);
2508 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
2509 Register DestReg = getLdStRegOp(MI&: MemMI, PairedRegOp: i).getReg();
2510 if (DestReg == BaseReg || TRI->isSubRegister(RegA: BaseReg, RegB: DestReg))
2511 return E;
2512 }
2513 }
2514
2515 // Track which register units have been modified and used between the first
2516 // insn (inclusive) and the second insn.
2517 ModifiedRegUnits.clear();
2518 UsedRegUnits.clear();
2519 MBBI = next_nodbg(It: MBBI, End: E);
2520
2521 // We can't post-increment the stack pointer if any instruction between
2522 // the memory access (I) and the increment (MBBI) can access the memory
2523 // region defined by [SP, MBBI].
2524 const bool BaseRegSP = BaseReg == AArch64::SP;
2525 if (BaseRegSP && needsWinCFI(MF: I->getMF())) {
2526     // FIXME: For now, we always block the optimization over SP on Windows
2527     // targets, as it would require adjusting the unwind/debug info; messing
2528     // up the unwind info can actually cause a miscompile.
2529 return E;
2530 }
2531
2532 for (unsigned Count = 0; MBBI != E && Count < Limit;
2533 MBBI = next_nodbg(It: MBBI, End: E)) {
2534 MachineInstr &MI = *MBBI;
2535
2536 // Don't count transient instructions towards the search limit since there
2537 // may be different numbers of them if e.g. debug information is present.
2538 if (!MI.isTransient())
2539 ++Count;
2540
2541 // If we found a match, return it.
2542 if (isMatchingUpdateInsn(MemMI&: *I, MI, BaseReg, Offset: UnscaledOffset))
2543 return MBBI;
2544
2545 // Update the status of what the instruction clobbered and used.
2546 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2547
2548 // Otherwise, if the base register is used or modified, we have no match, so
2549 // return early.
2550 // If we are optimizing SP, do not allow instructions that may load or store
2551 // in between the load and the optimized value update.
2552 if (!ModifiedRegUnits.available(Reg: BaseReg) ||
2553 !UsedRegUnits.available(Reg: BaseReg) ||
2554 (BaseRegSP && MBBI->mayLoadOrStore()))
2555 return E;
2556 }
2557 return E;
2558}
2559
2560MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
2561 MachineBasicBlock::iterator I, unsigned Limit, bool &MergeEither) {
2562 MachineBasicBlock::iterator B = I->getParent()->begin();
2563 MachineBasicBlock::iterator E = I->getParent()->end();
2564 MachineInstr &MemMI = *I;
2565 MachineBasicBlock::iterator MBBI = I;
2566 MachineFunction &MF = *MemMI.getMF();
2567
2568 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
2569 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getImm();
2570
2571 bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MI: MemMI);
2572 Register DestReg[] = {getLdStRegOp(MI&: MemMI, PairedRegOp: 0).getReg(),
2573 IsPairedInsn ? getLdStRegOp(MI&: MemMI, PairedRegOp: 1).getReg()
2574 : AArch64::NoRegister};
2575
2576 // If the load/store is the first instruction in the block, there's obviously
2577 // not any matching update. Ditto if the memory offset isn't zero.
2578 if (MBBI == B || Offset != 0)
2579 return E;
2580 // If the base register overlaps a destination register, we can't
2581 // merge the update.
2582 if (!isTagStore(MI: MemMI)) {
2583 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i)
2584 if (DestReg[i] == BaseReg || TRI->isSubRegister(RegA: BaseReg, RegB: DestReg[i]))
2585 return E;
2586 }
2587
2588 const bool BaseRegSP = BaseReg == AArch64::SP;
2589 if (BaseRegSP && needsWinCFI(MF: I->getMF())) {
2590     // FIXME: For now, we always block the optimization over SP on Windows
2591     // targets, as it would require adjusting the unwind/debug info; messing
2592     // up the unwind info can actually cause a miscompile.
2593 return E;
2594 }
2595
2596 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2597 unsigned RedZoneSize =
2598 Subtarget.getTargetLowering()->getRedZoneSize(F: MF.getFunction());
2599
2600 // Track which register units have been modified and used between the first
2601 // insn (inclusive) and the second insn.
2602 ModifiedRegUnits.clear();
2603 UsedRegUnits.clear();
2604 unsigned Count = 0;
2605 bool MemAccessBeforeSPPreInc = false;
2606 MergeEither = true;
2607 do {
2608 MBBI = prev_nodbg(It: MBBI, Begin: B);
2609 MachineInstr &MI = *MBBI;
2610
2611 // Don't count transient instructions towards the search limit since there
2612 // may be different numbers of them if e.g. debug information is present.
2613 if (!MI.isTransient())
2614 ++Count;
2615
2616 // If we found a match, return it.
2617 if (isMatchingUpdateInsn(MemMI&: *I, MI, BaseReg, Offset)) {
2618 // Check that the update value is within our red zone limit (which may be
2619 // zero).
2620 if (MemAccessBeforeSPPreInc && MBBI->getOperand(i: 2).getImm() > RedZoneSize)
2621 return E;
2622 return MBBI;
2623 }
2624
2625 // Update the status of what the instruction clobbered and used.
2626 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2627
2628 // Otherwise, if the base register is used or modified, we have no match, so
2629 // return early.
2630 if (!ModifiedRegUnits.available(Reg: BaseReg) ||
2631 !UsedRegUnits.available(Reg: BaseReg))
2632 return E;
2633
2634 // If we have a destination register (i.e. a load instruction) and a
2635 // destination register is used or modified, then we can only merge forward,
2636 // i.e. the combined instruction is put in the place of the memory
2637     // instruction. The same applies if we see a memory access or side effects.
2638 if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects() ||
2639 (DestReg[0] != AArch64::NoRegister &&
2640 !(ModifiedRegUnits.available(Reg: DestReg[0]) &&
2641 UsedRegUnits.available(Reg: DestReg[0]))) ||
2642 (DestReg[1] != AArch64::NoRegister &&
2643 !(ModifiedRegUnits.available(Reg: DestReg[1]) &&
2644 UsedRegUnits.available(Reg: DestReg[1]))))
2645 MergeEither = false;
2646
2647     // Keep track of whether we have a memory access before an SP pre-increment;
2648     // in that case we need to validate later that the update amount respects
2649     // the red zone.
2650 if (BaseRegSP && MBBI->mayLoadOrStore())
2651 MemAccessBeforeSPPreInc = true;
2652 } while (MBBI != B && Count < Limit);
2653 return E;
2654}
2655
2656MachineBasicBlock::iterator
2657AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
2658 MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
2659 MachineBasicBlock::iterator B = I->getParent()->begin();
2660 MachineBasicBlock::iterator E = I->getParent()->end();
2661 MachineInstr &MemMI = *I;
2662 MachineBasicBlock::iterator MBBI = I;
2663
2664 // If the load is the first instruction in the block, there's obviously
2665 // not any matching load or store.
2666 if (MBBI == B)
2667 return E;
2668
2669 // Make sure the IndexReg is killed and the shift amount is zero.
2670   // TODO: Relax this restriction to also handle extends; kept simple for now.
2671 if (!AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).isKill() ||
2672 !AArch64InstrInfo::getLdStAmountOp(MI: MemMI).isImm() ||
2673 (AArch64InstrInfo::getLdStAmountOp(MI: MemMI).getImm() != 0))
2674 return E;
2675
2676 Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getReg();
2677
2678 // Track which register units have been modified and used between the first
2679 // insn (inclusive) and the second insn.
2680 ModifiedRegUnits.clear();
2681 UsedRegUnits.clear();
2682 unsigned Count = 0;
2683 do {
2684 MBBI = prev_nodbg(It: MBBI, Begin: B);
2685 MachineInstr &MI = *MBBI;
2686
2687 // Don't count transient instructions towards the search limit since there
2688 // may be different numbers of them if e.g. debug information is present.
2689 if (!MI.isTransient())
2690 ++Count;
2691
2692 // If we found a match, return it.
2693 if (isMatchingMovConstInsn(MemMI&: *I, MI, IndexReg, Offset)) {
2694 return MBBI;
2695 }
2696
2697 // Update the status of what the instruction clobbered and used.
2698 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2699
2700 // Otherwise, if the index register is used or modified, we have no match,
2701 // so return early.
2702 if (!ModifiedRegUnits.available(Reg: IndexReg) ||
2703 !UsedRegUnits.available(Reg: IndexReg))
2704 return E;
2705
2706 } while (MBBI != B && Count < Limit);
2707 return E;
2708}
2709
2710bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
2711 MachineBasicBlock::iterator &MBBI) {
2712 MachineInstr &MI = *MBBI;
2713 // If this is a volatile load, don't mess with it.
2714 if (MI.hasOrderedMemoryRef())
2715 return false;
2716
2717 if (needsWinCFI(MF: MI.getMF()) && MI.getFlag(Flag: MachineInstr::FrameDestroy))
2718 return false;
2719
2720 // Make sure this is a reg+imm.
2721 // FIXME: It is possible to extend it to handle reg+reg cases.
2722 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
2723 return false;
2724
2725 // Look backward up to LdStLimit instructions.
2726 MachineBasicBlock::iterator StoreI;
2727 if (findMatchingStore(I: MBBI, Limit: LdStLimit, StoreI)) {
2728 ++NumLoadsFromStoresPromoted;
2729 // Promote the load. Keeping the iterator straight is a
2730 // pain, so we let the merge routine tell us what the next instruction
2731 // is after it's done mucking about.
2732 MBBI = promoteLoadFromStore(LoadI: MBBI, StoreI);
2733 return true;
2734 }
2735 return false;
2736}
2737
2738// Merge adjacent zero stores into a wider store.
2739bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
2740 MachineBasicBlock::iterator &MBBI) {
2741 assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
2742 MachineInstr &MI = *MBBI;
2743 MachineBasicBlock::iterator E = MI.getParent()->end();
2744
2745 if (!TII->isCandidateToMergeOrPair(MI))
2746 return false;
2747
2748 // Look ahead up to LdStLimit instructions for a mergeable instruction.
2749 LdStPairFlags Flags;
2750 MachineBasicBlock::iterator MergeMI =
2751 findMatchingInsn(I: MBBI, Flags, Limit: LdStLimit, /* FindNarrowMerge = */ true);
2752 if (MergeMI != E) {
2753 ++NumZeroStoresPromoted;
2754
2755 // Keeping the iterator straight is a pain, so we let the merge routine tell
2756 // us what the next instruction is after it's done mucking about.
2757 MBBI = mergeNarrowZeroStores(I: MBBI, MergeMI, Flags);
2758 return true;
2759 }
2760 return false;
2761}
2762
2763// Find loads and stores that can be merged into a single load or store pair
2764// instruction.
2765bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
2766 MachineInstr &MI = *MBBI;
2767 MachineBasicBlock::iterator E = MI.getParent()->end();
2768
2769 if (!TII->isCandidateToMergeOrPair(MI))
2770 return false;
2771
2772   // If the disable-ldp feature is set, do not emit ldp.
2773 if (MI.mayLoad() && Subtarget->hasDisableLdp())
2774 return false;
2775
2776   // If the disable-stp feature is set, do not emit stp.
2777 if (MI.mayStore() && Subtarget->hasDisableStp())
2778 return false;
2779
2780 // Early exit if the offset is not possible to match. (6 bits of positive
2781 // range, plus allow an extra one in case we find a later insn that matches
2782 // with Offset-1)
2783 bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
2784 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
2785 int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
2786 // Allow one more for offset.
2787 if (Offset > 0)
2788 Offset -= OffsetStride;
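  // E.g. a scaled access at offset #64 cannot be paired at that offset, but it
  // can still pair with a neighboring access at offset #63, so only reject
  // offsets for which even Offset - OffsetStride is out of pairing range.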
2789 if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
2790 return false;
2791
2792 // Look ahead up to LdStLimit instructions for a pairable instruction.
2793 LdStPairFlags Flags;
2794 MachineBasicBlock::iterator Paired =
2795 findMatchingInsn(I: MBBI, Flags, Limit: LdStLimit, /* FindNarrowMerge = */ false);
2796 if (Paired != E) {
2797 // Keeping the iterator straight is a pain, so we let the merge routine tell
2798 // us what the next instruction is after it's done mucking about.
2799 auto Prev = std::prev(x: MBBI);
2800
2801 // Fetch the memoperand of the load/store that is a candidate for
2802 // combination.
2803 MachineMemOperand *MemOp =
2804 MI.memoperands_empty() ? nullptr : MI.memoperands().front();
2805
2806     // If a load/store arrives and the ldp/stp-aligned-only feature is set, check
2807 // that the alignment of the source pointer is at least double the alignment
2808 // of the type.
2809 if ((MI.mayLoad() && Subtarget->hasLdpAlignedOnly()) ||
2810 (MI.mayStore() && Subtarget->hasStpAlignedOnly())) {
2811 // If there is no size/align information, cancel the transformation.
2812 if (!MemOp || !MemOp->getMemoryType().isValid()) {
2813 NumFailedAlignmentCheck++;
2814 return false;
2815 }
2816
2817 // Get the needed alignments to check them if
2818       // ldp-aligned-only/stp-aligned-only features are set.
2819 uint64_t MemAlignment = MemOp->getAlign().value();
2820 uint64_t TypeAlignment =
2821 Align(MemOp->getSize().getValue().getKnownMinValue()).value();
2822
2823 if (MemAlignment < 2 * TypeAlignment) {
2824 NumFailedAlignmentCheck++;
2825 return false;
2826 }
2827 }
2828
2829 ++NumPairCreated;
2830 if (TII->hasUnscaledLdStOffset(MI))
2831 ++NumUnscaledPairCreated;
2832
2833 MBBI = mergePairedInsns(I: MBBI, Paired, Flags);
2834 // Collect liveness info for instructions between Prev and the new position
2835 // MBBI.
2836 for (auto I = std::next(x: Prev); I != MBBI; I++)
2837 updateDefinedRegisters(MI&: *I, Units&: DefinedInBB, TRI);
2838
2839 return true;
2840 }
2841 return false;
2842}
2843
2844bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
2845 (MachineBasicBlock::iterator &MBBI) {
2846 MachineInstr &MI = *MBBI;
2847 MachineBasicBlock::iterator E = MI.getParent()->end();
2848 MachineBasicBlock::iterator Update;
2849
2850 // Look forward to try to form a post-index instruction. For example,
2851 // ldr x0, [x20]
2852 // add x20, x20, #32
2853 // merged into:
2854 // ldr x0, [x20], #32
2855 Update = findMatchingUpdateInsnForward(I: MBBI, UnscaledOffset: 0, Limit: UpdateLimit);
2856 if (Update != E) {
2857 // Merge the update into the ld/st.
2858 if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /*IsForward=*/false,
2859 /*IsPreIdx=*/false,
2860 /*MergeEither=*/false)) {
2861 MBBI = *NextI;
2862 return true;
2863 }
2864 }
2865
2866 // Don't know how to handle unscaled pre/post-index versions below, so bail.
2867 if (TII->hasUnscaledLdStOffset(Opc: MI.getOpcode()))
2868 return false;
2869
2870 // Look back to try to find a pre-index instruction. For example,
2871 // add x0, x0, #8
2872 // ldr x1, [x0]
2873 // merged into:
2874 // ldr x1, [x0, #8]!
2875 bool MergeEither;
2876 Update = findMatchingUpdateInsnBackward(I: MBBI, Limit: UpdateLimit, MergeEither);
2877 if (Update != E) {
2878 // Merge the update into the ld/st.
2879 if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /*IsForward=*/true,
2880 /*IsPreIdx=*/true, MergeEither)) {
2881 MBBI = *NextI;
2882 return true;
2883 }
2884 }
2885
2886 // The immediate in the load/store is scaled by the size of the memory
2887 // operation. The immediate in the add we're looking for,
2888 // however, is not, so adjust here.
2889 int UnscaledOffset =
2890 AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
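  // For example, "ldr x1, [x0, #64]" stores the scaled immediate 8, so
  // UnscaledOffset == 64 here, matching the byte amount in the add below.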
2891
2892 // Look forward to try to find a pre-index instruction. For example,
2893 // ldr x1, [x0, #64]
2894 // add x0, x0, #64
2895 // merged into:
2896 // ldr x1, [x0, #64]!
2897 Update = findMatchingUpdateInsnForward(I: MBBI, UnscaledOffset, Limit: UpdateLimit);
2898 if (Update != E) {
2899 // Merge the update into the ld/st.
2900 if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /*IsForward=*/false,
2901 /*IsPreIdx=*/true,
2902 /*MergeEither=*/false)) {
2903 MBBI = *NextI;
2904 return true;
2905 }
2906 }
2907
2908 return false;
2909}
2910
2911bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
2912 int Scale) {
2913 MachineInstr &MI = *MBBI;
2914 MachineBasicBlock::iterator E = MI.getParent()->end();
2915 MachineBasicBlock::iterator Update;
2916
2917 // Don't know how to handle unscaled pre/post-index versions below, so bail.
2918 if (TII->hasUnscaledLdStOffset(Opc: MI.getOpcode()))
2919 return false;
2920
2921 // Look back to try to find a const offset for index LdSt instruction. For
2922 // example,
2923 // mov x8, #LargeImm ; = a * (1<<12) + imm12
2924 // ldr x1, [x0, x8]
2925 // merged into:
2926 // add x8, x0, a * (1<<12)
2927 // ldr x1, [x8, imm12]
2928 unsigned Offset;
2929 Update = findMatchingConstOffsetBackward(I: MBBI, Limit: LdStConstLimit, Offset);
2930 if (Update != E && (Offset & (Scale - 1)) == 0) {
2931 // Merge the imm12 into the ld/st.
2932 MBBI = mergeConstOffsetInsn(I: MBBI, Update, Offset, Scale);
2933 return true;
2934 }
2935
2936 return false;
2937}
2938
2939bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
2940 bool EnableNarrowZeroStOpt) {
2941 AArch64FunctionInfo &AFI = *MBB.getParent()->getInfo<AArch64FunctionInfo>();
2942
2943 bool Modified = false;
2944   // Five transformations to do here:
2945 // 1) Find loads that directly read from stores and promote them by
2946 // replacing with mov instructions. If the store is wider than the load,
2947 // the load will be replaced with a bitfield extract.
2948 // e.g.,
2949 // str w1, [x0, #4]
2950 // ldrh w2, [x0, #6]
2951 // ; becomes
2952 // str w1, [x0, #4]
2953 // lsr w2, w1, #16
2954 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2955 MBBI != E;) {
2956 if (isPromotableLoadFromStore(MI&: *MBBI) && tryToPromoteLoadFromStore(MBBI))
2957 Modified = true;
2958 else
2959 ++MBBI;
2960 }
2961 // 2) Merge adjacent zero stores into a wider store.
2962 // e.g.,
2963 // strh wzr, [x0]
2964 // strh wzr, [x0, #2]
2965 // ; becomes
2966 // str wzr, [x0]
2967 // e.g.,
2968 // str wzr, [x0]
2969 // str wzr, [x0, #4]
2970 // ; becomes
2971 // str xzr, [x0]
2972 if (EnableNarrowZeroStOpt)
2973 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2974 MBBI != E;) {
2975 if (isPromotableZeroStoreInst(MI&: *MBBI) && tryToMergeZeroStInst(MBBI))
2976 Modified = true;
2977 else
2978 ++MBBI;
2979 }
2980 // 3) Find loads and stores that can be merged into a single load or store
2981 // pair instruction.
2982 // When compiling for SVE 128, also try to combine SVE fill/spill
2983 // instructions into LDP/STP.
2984 // e.g.,
2985 // ldr x0, [x2]
2986 // ldr x1, [x2, #8]
2987 // ; becomes
2988 // ldp x0, x1, [x2]
2989 // e.g.,
2990 // ldr z0, [x2]
2991 // ldr z1, [x2, #1, mul vl]
2992 // ; becomes
2993 // ldp q0, q1, [x2]
2994
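  // Seed the per-block register tracker with this block's live-ins so that the
  // rename-register search below starts from an accurate picture of which
  // registers are already live.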
  if (MBB.getParent()->getRegInfo().tracksLiveness()) {
    DefinedInBB.clear();
    DefinedInBB.addLiveIns(MBB);
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    // Track currently live registers up to this point, to help with
    // searching for a rename register on demand.
    updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
    if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 4) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isMergeableLdStUpdate(*MBBI, AFI) && tryToMergeLdStUpdate(MBBI))
      Modified = true;
    else
      ++MBBI;
  }

  // 5) Find a register assigned with a const value that can be folded into
  //    the load or store.
  //      e.g.,
  //        mov x8, #LargeImm ; = a * (1<<12) + imm12
  //        ldr x1, [x0, x8]
  //        ; becomes
  //        add x8, x0, a * (1<<12)
  //        ldr x1, [x8, imm12]
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    int Scale;
    if (isMergeableIndexLdSt(*MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale))
      Modified = true;
    else
      ++MBBI;
  }

  return Modified;
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  if (skipFunction(Fn.getFunction()))
    return false;

  Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
  TRI = Subtarget->getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  // Resize the modified and used register unit trackers. We do this once per
  // function and then clear the register units each time we optimize a load
  // or store.
  ModifiedRegUnits.init(*TRI);
  UsedRegUnits.init(*TRI);
  DefinedInBB.init(*TRI);

  bool Modified = false;
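  // Widening narrow zero stores may produce accesses that are no longer
  // naturally aligned, so only enable that transformation when the target
  // does not require strict alignment.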
  bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
  for (auto &MBB : Fn) {
    auto M = optimizeBlock(MBB, enableNarrowZeroStOpt);
    Modified |= M;
  }

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads
// and stores near one another? Note: The pre-RA instruction scheduler already
// has hooks to try and schedule pairable loads/stores together to improve
// pairing opportunities. Thus, a pre-RA pairing pass may not be worth the
// effort.

// FIXME: When pairing store instructions it's very possible for this pass to
// hoist a store with a KILL marker above another use (without a KILL marker).
// The resulting IR is invalid, but nothing uses the KILL markers after this
// pass, so it's never caused a problem in practice.

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}