1//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains a pass that performs load / store related peephole
10// optimizations. This pass should be run after register allocation.
11//
12// The pass runs after the PrologEpilogInserter where we emit the CFI
13// instructions. In order to preserve the correctness of the unwind information,
14// the pass should not change the order of any two instructions, one of which
15// has the FrameSetup/FrameDestroy flag or, alternatively, apply an add-hoc fix
16// to unwind information.
17//
18//===----------------------------------------------------------------------===//
19
20#include "AArch64InstrInfo.h"
21#include "AArch64MachineFunctionInfo.h"
22#include "AArch64Subtarget.h"
23#include "MCTargetDesc/AArch64AddressingModes.h"
24#include "llvm/ADT/SetVector.h"
25#include "llvm/ADT/SmallVector.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringRef.h"
28#include "llvm/ADT/iterator_range.h"
29#include "llvm/Analysis/AliasAnalysis.h"
30#include "llvm/CodeGen/MachineBasicBlock.h"
31#include "llvm/CodeGen/MachineFunction.h"
32#include "llvm/CodeGen/MachineFunctionPass.h"
33#include "llvm/CodeGen/MachineInstr.h"
34#include "llvm/CodeGen/MachineInstrBuilder.h"
35#include "llvm/CodeGen/MachineOperand.h"
36#include "llvm/CodeGen/MachineRegisterInfo.h"
37#include "llvm/CodeGen/TargetRegisterInfo.h"
38#include "llvm/IR/DebugLoc.h"
39#include "llvm/MC/MCAsmInfo.h"
40#include "llvm/MC/MCDwarf.h"
41#include "llvm/Pass.h"
42#include "llvm/Support/CommandLine.h"
43#include "llvm/Support/Debug.h"
44#include "llvm/Support/DebugCounter.h"
45#include "llvm/Support/ErrorHandling.h"
46#include <cassert>
47#include <cstdint>
48#include <functional>
49#include <iterator>
50#include <limits>
51#include <optional>
52
53using namespace llvm;
54
55#define DEBUG_TYPE "aarch64-ldst-opt"
56
57STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
58STATISTIC(NumPostFolded, "Number of post-index updates folded");
59STATISTIC(NumPreFolded, "Number of pre-index updates folded");
60STATISTIC(NumUnscaledPairCreated,
61 "Number of load/store from unscaled generated");
62STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
63STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
64STATISTIC(NumFailedAlignmentCheck, "Number of load/store pair transformation "
65 "not passed the alignment check");
66STATISTIC(NumConstOffsetFolded,
67 "Number of const offset of index address folded");
68STATISTIC(NumUMOVFoldedToFPRStore,
69 "Number of UMOV + GPR stores folded to FPR stores");
70
71DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
72 "Controls which pairs are considered for renaming");
73
74// The LdStLimit limits how far we search for load/store pairs.
75static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
76 cl::init(Val: 20), cl::Hidden);
77
78// The UpdateLimit limits how far we search for update instructions when we form
79// pre-/post-index instructions.
80static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(Val: 100),
81 cl::Hidden);
82
83// The LdStConstLimit limits how far we search for const offset instructions
84// when we form index address load/store instructions.
85static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
86 cl::init(Val: 10), cl::Hidden);
87
88// The UMOVFoldLimit limits how far back we scan from a GPR store to find a
89// UMOV that can be folded into a direct FPR store.
90static cl::opt<unsigned> UMOVFoldLimit("aarch64-umov-fold-scan-limit",
91 cl::init(Val: 16), cl::Hidden);
92
93// Enable register renaming to find additional store pairing opportunities.
94static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
95 cl::init(Val: true), cl::Hidden);
96
97#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
98
99namespace {
100
101using LdStPairFlags = struct LdStPairFlags {
102 // If a matching instruction is found, MergeForward is set to true if the
103 // merge is to remove the first instruction and replace the second with
104 // a pair-wise insn, and false if the reverse is true.
105 bool MergeForward = false;
106
107 // SExtIdx gives the index of the result of the load pair that must be
108 // extended. The value of SExtIdx assumes that the paired load produces the
109 // value in this order: (I, returned iterator), i.e., -1 means no value has
110 // to be extended, 0 means I, and 1 means the returned iterator.
111 int SExtIdx = -1;
112
113 // If not none, RenameReg can be used to rename the result register of the
114 // first store in a pair. Currently this only works when merging stores
115 // forward.
116 std::optional<MCPhysReg> RenameReg;
117
118 LdStPairFlags() = default;
119
120 void setMergeForward(bool V = true) { MergeForward = V; }
121 bool getMergeForward() const { return MergeForward; }
122
123 void setSExtIdx(int V) { SExtIdx = V; }
124 int getSExtIdx() const { return SExtIdx; }
125
126 void setRenameReg(MCPhysReg R) { RenameReg = R; }
127 void clearRenameReg() { RenameReg = std::nullopt; }
128 std::optional<MCPhysReg> getRenameReg() const { return RenameReg; }
129};
130
131struct AArch64LoadStoreOpt {
132 AliasAnalysis *AA;
133 const AArch64InstrInfo *TII;
134 const TargetRegisterInfo *TRI;
135 const AArch64Subtarget *Subtarget;
136
137 // Track which register units have been modified and used.
138 LiveRegUnits ModifiedRegUnits, UsedRegUnits;
139 LiveRegUnits DefinedInBB;
140
141 // Scan the instructions looking for a load/store that can be combined
142 // with the current instruction into a load/store pair.
143 // Return the matching instruction if one is found, else MBB->end().
144 MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
145 LdStPairFlags &Flags,
146 unsigned Limit,
147 bool FindNarrowMerge);
148
149 // Scan the instructions looking for a store that writes to the address from
150 // which the current load instruction reads. Return true if one is found.
151 bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
152 MachineBasicBlock::iterator &StoreI);
153
154 // Merge the two instructions indicated into a wider narrow store instruction.
155 MachineBasicBlock::iterator
156 mergeNarrowZeroStores(MachineBasicBlock::iterator I,
157 MachineBasicBlock::iterator MergeMI,
158 const LdStPairFlags &Flags);
159
160 // Merge the two instructions indicated into a single pair-wise instruction.
161 MachineBasicBlock::iterator
162 mergePairedInsns(MachineBasicBlock::iterator I,
163 MachineBasicBlock::iterator Paired,
164 const LdStPairFlags &Flags);
165
166 // Promote the load that reads directly from the address stored to.
167 MachineBasicBlock::iterator
168 promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
169 MachineBasicBlock::iterator StoreI);
170
171 // Scan the instruction list to find a base register update that can
172 // be combined with the current instruction (a load or store) using
173 // pre or post indexed addressing with writeback. Scan forwards.
174 MachineBasicBlock::iterator
175 findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
176 int UnscaledOffset, unsigned Limit);
177
178 // Scan the instruction list to find a register assigned with a const
179 // value that can be combined with the current instruction (a load or store)
180 // using base addressing with writeback. Scan backwards.
181 MachineBasicBlock::iterator
182 findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
183 unsigned &Offset);
184
185 // Scan the instruction list to find a base register update that can
186 // be combined with the current instruction (a load or store) using
187 // pre or post indexed addressing with writeback. Scan backwards.
188 // `MergeEither` is set to true if the combined instruction may be placed
189 // either at the location of the load/store instruction or at the location of
190 // the update instruction.
191 MachineBasicBlock::iterator
192 findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit,
193 bool &MergeEither);
194
195 // Find an instruction that updates the base register of the ld/st
196 // instruction.
197 bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
198 unsigned BaseReg, int Offset);
199
200 bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
201 unsigned IndexReg, unsigned &Offset);
202
203 // Merge a pre- or post-index base register update into a ld/st instruction.
204 std::optional<MachineBasicBlock::iterator>
205 mergeUpdateInsn(MachineBasicBlock::iterator I,
206 MachineBasicBlock::iterator Update, bool IsForward,
207 bool IsPreIdx, bool MergeEither);
208
209 MachineBasicBlock::iterator
210 mergeConstOffsetInsn(MachineBasicBlock::iterator I,
211 MachineBasicBlock::iterator Update, unsigned Offset,
212 int Scale);
213
214 // Find and merge zero store instructions.
215 bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
216
217 // Find and pair ldr/str instructions.
218 bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
219
220 // Find and promote load instructions which read directly from store.
221 bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
222
223 // Find and merge a base register updates before or after a ld/st instruction.
224 bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
225
226 // Find and merge an index ldr/st instruction into a base ld/st instruction.
227 bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);
228
229 // Replace a UMOV (lane 0) + GPR store with a direct FPR sub-register store.
230 bool tryToReplaceUMOVStore(MachineBasicBlock::iterator &MBBI);
231
232 bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
233
234 bool runOnMachineFunction(MachineFunction &MF);
235};
236
237struct AArch64LoadStoreOptLegacy : public MachineFunctionPass {
238 static char ID;
239
240 AArch64LoadStoreOptLegacy() : MachineFunctionPass(ID) {}
241
242 bool runOnMachineFunction(MachineFunction &Fn) override;
243
244 void getAnalysisUsage(AnalysisUsage &AU) const override {
245 AU.addRequired<AAResultsWrapperPass>();
246 MachineFunctionPass::getAnalysisUsage(AU);
247 }
248
249 MachineFunctionProperties getRequiredProperties() const override {
250 return MachineFunctionProperties().setNoVRegs();
251 }
252
253 StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
254};
255
256char AArch64LoadStoreOptLegacy::ID = 0;
257
258} // end anonymous namespace
259
260INITIALIZE_PASS(AArch64LoadStoreOptLegacy, "aarch64-ldst-opt",
261 AARCH64_LOAD_STORE_OPT_NAME, false, false)
262
263static bool isNarrowStore(unsigned Opc) {
264 switch (Opc) {
265 default:
266 return false;
267 case AArch64::STRBBui:
268 case AArch64::STURBBi:
269 case AArch64::STRHHui:
270 case AArch64::STURHHi:
271 return true;
272 }
273}
274
275// These instruction set memory tag and either keep memory contents unchanged or
276// set it to zero, ignoring the address part of the source register.
277static bool isTagStore(const MachineInstr &MI) {
278 switch (MI.getOpcode()) {
279 default:
280 return false;
281 case AArch64::STGi:
282 case AArch64::STZGi:
283 case AArch64::ST2Gi:
284 case AArch64::STZ2Gi:
285 return true;
286 }
287}
288
289static unsigned getMatchingNonSExtOpcode(unsigned Opc,
290 bool *IsValidLdStrOpc = nullptr) {
291 if (IsValidLdStrOpc)
292 *IsValidLdStrOpc = true;
293 switch (Opc) {
294 default:
295 if (IsValidLdStrOpc)
296 *IsValidLdStrOpc = false;
297 return std::numeric_limits<unsigned>::max();
298 case AArch64::STRDui:
299 case AArch64::STURDi:
300 case AArch64::STRDpre:
301 case AArch64::STRQui:
302 case AArch64::STURQi:
303 case AArch64::STRQpre:
304 case AArch64::STRBBui:
305 case AArch64::STURBBi:
306 case AArch64::STRHHui:
307 case AArch64::STURHHi:
308 case AArch64::STRWui:
309 case AArch64::STRWpre:
310 case AArch64::STURWi:
311 case AArch64::STRXui:
312 case AArch64::STRXpre:
313 case AArch64::STURXi:
314 case AArch64::STR_ZXI:
315 case AArch64::LDRDui:
316 case AArch64::LDURDi:
317 case AArch64::LDRDpre:
318 case AArch64::LDRQui:
319 case AArch64::LDURQi:
320 case AArch64::LDRQpre:
321 case AArch64::LDRWui:
322 case AArch64::LDURWi:
323 case AArch64::LDRWpre:
324 case AArch64::LDRXui:
325 case AArch64::LDURXi:
326 case AArch64::LDRXpre:
327 case AArch64::STRSui:
328 case AArch64::STURSi:
329 case AArch64::STRSpre:
330 case AArch64::LDRSui:
331 case AArch64::LDURSi:
332 case AArch64::LDRSpre:
333 case AArch64::LDR_ZXI:
334 return Opc;
335 case AArch64::LDRSWui:
336 return AArch64::LDRWui;
337 case AArch64::LDURSWi:
338 return AArch64::LDURWi;
339 case AArch64::LDRSWpre:
340 return AArch64::LDRWpre;
341 }
342}
343
344static unsigned getMatchingWideOpcode(unsigned Opc) {
345 switch (Opc) {
346 default:
347 llvm_unreachable("Opcode has no wide equivalent!");
348 case AArch64::STRBBui:
349 return AArch64::STRHHui;
350 case AArch64::STRHHui:
351 return AArch64::STRWui;
352 case AArch64::STURBBi:
353 return AArch64::STURHHi;
354 case AArch64::STURHHi:
355 return AArch64::STURWi;
356 case AArch64::STURWi:
357 return AArch64::STURXi;
358 case AArch64::STRWui:
359 return AArch64::STRXui;
360 }
361}
362
363static unsigned getMatchingPairOpcode(unsigned Opc) {
364 switch (Opc) {
365 default:
366 llvm_unreachable("Opcode has no pairwise equivalent!");
367 case AArch64::STRSui:
368 case AArch64::STURSi:
369 return AArch64::STPSi;
370 case AArch64::STRSpre:
371 return AArch64::STPSpre;
372 case AArch64::STRDui:
373 case AArch64::STURDi:
374 return AArch64::STPDi;
375 case AArch64::STRDpre:
376 return AArch64::STPDpre;
377 case AArch64::STRQui:
378 case AArch64::STURQi:
379 case AArch64::STR_ZXI:
380 return AArch64::STPQi;
381 case AArch64::STRQpre:
382 return AArch64::STPQpre;
383 case AArch64::STRWui:
384 case AArch64::STURWi:
385 return AArch64::STPWi;
386 case AArch64::STRWpre:
387 return AArch64::STPWpre;
388 case AArch64::STRXui:
389 case AArch64::STURXi:
390 return AArch64::STPXi;
391 case AArch64::STRXpre:
392 return AArch64::STPXpre;
393 case AArch64::LDRSui:
394 case AArch64::LDURSi:
395 return AArch64::LDPSi;
396 case AArch64::LDRSpre:
397 return AArch64::LDPSpre;
398 case AArch64::LDRDui:
399 case AArch64::LDURDi:
400 return AArch64::LDPDi;
401 case AArch64::LDRDpre:
402 return AArch64::LDPDpre;
403 case AArch64::LDRQui:
404 case AArch64::LDURQi:
405 case AArch64::LDR_ZXI:
406 return AArch64::LDPQi;
407 case AArch64::LDRQpre:
408 return AArch64::LDPQpre;
409 case AArch64::LDRWui:
410 case AArch64::LDURWi:
411 return AArch64::LDPWi;
412 case AArch64::LDRWpre:
413 return AArch64::LDPWpre;
414 case AArch64::LDRXui:
415 case AArch64::LDURXi:
416 return AArch64::LDPXi;
417 case AArch64::LDRXpre:
418 return AArch64::LDPXpre;
419 case AArch64::LDRSWui:
420 case AArch64::LDURSWi:
421 return AArch64::LDPSWi;
422 case AArch64::LDRSWpre:
423 return AArch64::LDPSWpre;
424 }
425}
426
427static unsigned isMatchingStore(MachineInstr &LoadInst,
428 MachineInstr &StoreInst) {
429 unsigned LdOpc = LoadInst.getOpcode();
430 unsigned StOpc = StoreInst.getOpcode();
431 switch (LdOpc) {
432 default:
433 llvm_unreachable("Unsupported load instruction!");
434 case AArch64::LDRBBui:
435 return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
436 StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
437 case AArch64::LDURBBi:
438 return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
439 StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
440 case AArch64::LDRHHui:
441 return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
442 StOpc == AArch64::STRXui;
443 case AArch64::LDURHHi:
444 return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
445 StOpc == AArch64::STURXi;
446 case AArch64::LDRWui:
447 return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
448 case AArch64::LDURWi:
449 return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
450 case AArch64::LDRXui:
451 return StOpc == AArch64::STRXui;
452 case AArch64::LDURXi:
453 return StOpc == AArch64::STURXi;
454 }
455}
456
457static unsigned getPreIndexedOpcode(unsigned Opc) {
458 // FIXME: We don't currently support creating pre-indexed loads/stores when
459 // the load or store is the unscaled version. If we decide to perform such an
460 // optimization in the future the cases for the unscaled loads/stores will
461 // need to be added here.
462 switch (Opc) {
463 default:
464 llvm_unreachable("Opcode has no pre-indexed equivalent!");
465 case AArch64::STRBui:
466 return AArch64::STRBpre;
467 case AArch64::STRHui:
468 return AArch64::STRHpre;
469 case AArch64::STRSui:
470 return AArch64::STRSpre;
471 case AArch64::STRDui:
472 return AArch64::STRDpre;
473 case AArch64::STRQui:
474 return AArch64::STRQpre;
475 case AArch64::STRBBui:
476 return AArch64::STRBBpre;
477 case AArch64::STRHHui:
478 return AArch64::STRHHpre;
479 case AArch64::STRWui:
480 return AArch64::STRWpre;
481 case AArch64::STRXui:
482 return AArch64::STRXpre;
483 case AArch64::LDRBui:
484 return AArch64::LDRBpre;
485 case AArch64::LDRHui:
486 return AArch64::LDRHpre;
487 case AArch64::LDRSui:
488 return AArch64::LDRSpre;
489 case AArch64::LDRDui:
490 return AArch64::LDRDpre;
491 case AArch64::LDRQui:
492 return AArch64::LDRQpre;
493 case AArch64::LDRBBui:
494 return AArch64::LDRBBpre;
495 case AArch64::LDRHHui:
496 return AArch64::LDRHHpre;
497 case AArch64::LDRWui:
498 return AArch64::LDRWpre;
499 case AArch64::LDRXui:
500 return AArch64::LDRXpre;
501 case AArch64::LDRSWui:
502 return AArch64::LDRSWpre;
503 case AArch64::LDPSi:
504 return AArch64::LDPSpre;
505 case AArch64::LDPSWi:
506 return AArch64::LDPSWpre;
507 case AArch64::LDPDi:
508 return AArch64::LDPDpre;
509 case AArch64::LDPQi:
510 return AArch64::LDPQpre;
511 case AArch64::LDPWi:
512 return AArch64::LDPWpre;
513 case AArch64::LDPXi:
514 return AArch64::LDPXpre;
515 case AArch64::STPSi:
516 return AArch64::STPSpre;
517 case AArch64::STPDi:
518 return AArch64::STPDpre;
519 case AArch64::STPQi:
520 return AArch64::STPQpre;
521 case AArch64::STPWi:
522 return AArch64::STPWpre;
523 case AArch64::STPXi:
524 return AArch64::STPXpre;
525 case AArch64::STGi:
526 return AArch64::STGPreIndex;
527 case AArch64::STZGi:
528 return AArch64::STZGPreIndex;
529 case AArch64::ST2Gi:
530 return AArch64::ST2GPreIndex;
531 case AArch64::STZ2Gi:
532 return AArch64::STZ2GPreIndex;
533 case AArch64::STGPi:
534 return AArch64::STGPpre;
535 }
536}
537
538static unsigned getBaseAddressOpcode(unsigned Opc) {
539 // TODO: Add more index address stores.
540 switch (Opc) {
541 default:
542 llvm_unreachable("Opcode has no base address equivalent!");
543 case AArch64::LDRBroX:
544 return AArch64::LDRBui;
545 case AArch64::LDRBBroX:
546 return AArch64::LDRBBui;
547 case AArch64::LDRSBXroX:
548 return AArch64::LDRSBXui;
549 case AArch64::LDRSBWroX:
550 return AArch64::LDRSBWui;
551 case AArch64::LDRHroX:
552 return AArch64::LDRHui;
553 case AArch64::LDRHHroX:
554 return AArch64::LDRHHui;
555 case AArch64::LDRSHXroX:
556 return AArch64::LDRSHXui;
557 case AArch64::LDRSHWroX:
558 return AArch64::LDRSHWui;
559 case AArch64::LDRWroX:
560 return AArch64::LDRWui;
561 case AArch64::LDRSroX:
562 return AArch64::LDRSui;
563 case AArch64::LDRSWroX:
564 return AArch64::LDRSWui;
565 case AArch64::LDRDroX:
566 return AArch64::LDRDui;
567 case AArch64::LDRXroX:
568 return AArch64::LDRXui;
569 case AArch64::LDRQroX:
570 return AArch64::LDRQui;
571 }
572}
573
574static unsigned getPostIndexedOpcode(unsigned Opc) {
575 switch (Opc) {
576 default:
577 llvm_unreachable("Opcode has no post-indexed wise equivalent!");
578 case AArch64::STRBui:
579 return AArch64::STRBpost;
580 case AArch64::STRHui:
581 return AArch64::STRHpost;
582 case AArch64::STRSui:
583 case AArch64::STURSi:
584 return AArch64::STRSpost;
585 case AArch64::STRDui:
586 case AArch64::STURDi:
587 return AArch64::STRDpost;
588 case AArch64::STRQui:
589 case AArch64::STURQi:
590 return AArch64::STRQpost;
591 case AArch64::STRBBui:
592 return AArch64::STRBBpost;
593 case AArch64::STRHHui:
594 return AArch64::STRHHpost;
595 case AArch64::STRWui:
596 case AArch64::STURWi:
597 return AArch64::STRWpost;
598 case AArch64::STRXui:
599 case AArch64::STURXi:
600 return AArch64::STRXpost;
601 case AArch64::LDRBui:
602 return AArch64::LDRBpost;
603 case AArch64::LDRHui:
604 return AArch64::LDRHpost;
605 case AArch64::LDRSui:
606 case AArch64::LDURSi:
607 return AArch64::LDRSpost;
608 case AArch64::LDRDui:
609 case AArch64::LDURDi:
610 return AArch64::LDRDpost;
611 case AArch64::LDRQui:
612 case AArch64::LDURQi:
613 return AArch64::LDRQpost;
614 case AArch64::LDRBBui:
615 return AArch64::LDRBBpost;
616 case AArch64::LDRHHui:
617 return AArch64::LDRHHpost;
618 case AArch64::LDRWui:
619 case AArch64::LDURWi:
620 return AArch64::LDRWpost;
621 case AArch64::LDRXui:
622 case AArch64::LDURXi:
623 return AArch64::LDRXpost;
624 case AArch64::LDRSWui:
625 return AArch64::LDRSWpost;
626 case AArch64::LDPSi:
627 return AArch64::LDPSpost;
628 case AArch64::LDPSWi:
629 return AArch64::LDPSWpost;
630 case AArch64::LDPDi:
631 return AArch64::LDPDpost;
632 case AArch64::LDPQi:
633 return AArch64::LDPQpost;
634 case AArch64::LDPWi:
635 return AArch64::LDPWpost;
636 case AArch64::LDPXi:
637 return AArch64::LDPXpost;
638 case AArch64::STPSi:
639 return AArch64::STPSpost;
640 case AArch64::STPDi:
641 return AArch64::STPDpost;
642 case AArch64::STPQi:
643 return AArch64::STPQpost;
644 case AArch64::STPWi:
645 return AArch64::STPWpost;
646 case AArch64::STPXi:
647 return AArch64::STPXpost;
648 case AArch64::STGi:
649 return AArch64::STGPostIndex;
650 case AArch64::STZGi:
651 return AArch64::STZGPostIndex;
652 case AArch64::ST2Gi:
653 return AArch64::ST2GPostIndex;
654 case AArch64::STZ2Gi:
655 return AArch64::STZ2GPostIndex;
656 case AArch64::STGPi:
657 return AArch64::STGPpost;
658 }
659}
660
661static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
662
663 unsigned OpcA = FirstMI.getOpcode();
664 unsigned OpcB = MI.getOpcode();
665
666 switch (OpcA) {
667 default:
668 return false;
669 case AArch64::STRSpre:
670 return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
671 case AArch64::STRDpre:
672 return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
673 case AArch64::STRQpre:
674 return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
675 case AArch64::STRWpre:
676 return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
677 case AArch64::STRXpre:
678 return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
679 case AArch64::LDRSpre:
680 return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
681 case AArch64::LDRDpre:
682 return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
683 case AArch64::LDRQpre:
684 return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
685 case AArch64::LDRWpre:
686 return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
687 case AArch64::LDRXpre:
688 return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
689 case AArch64::LDRSWpre:
690 return (OpcB == AArch64::LDRSWui) || (OpcB == AArch64::LDURSWi);
691 }
692}
693
694// Returns the scale and offset range of pre/post indexed variants of MI.
695static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
696 int &MinOffset, int &MaxOffset) {
697 bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
698 bool IsTagStore = isTagStore(MI);
699 // ST*G and all paired ldst have the same scale in pre/post-indexed variants
700 // as in the "unsigned offset" variant.
701 // All other pre/post indexed ldst instructions are unscaled.
702 Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
703
704 if (IsPaired) {
705 MinOffset = -64;
706 MaxOffset = 63;
707 } else {
708 MinOffset = -256;
709 MaxOffset = 255;
710 }
711}
712
713static MachineOperand &getLdStRegOp(MachineInstr &MI,
714 unsigned PairedRegOp = 0) {
715 assert(PairedRegOp < 2 && "Unexpected register operand idx.");
716 bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
717 if (IsPreLdSt)
718 PairedRegOp += 1;
719 unsigned Idx =
720 AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
721 return MI.getOperand(i: Idx);
722}
723
724static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
725 MachineInstr &StoreInst,
726 const AArch64InstrInfo *TII) {
727 assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
728 int LoadSize = TII->getMemScale(MI: LoadInst);
729 int StoreSize = TII->getMemScale(MI: StoreInst);
730 int UnscaledStOffset =
731 TII->hasUnscaledLdStOffset(MI&: StoreInst)
732 ? AArch64InstrInfo::getLdStOffsetOp(MI: StoreInst).getImm()
733 : AArch64InstrInfo::getLdStOffsetOp(MI: StoreInst).getImm() * StoreSize;
734 int UnscaledLdOffset =
735 TII->hasUnscaledLdStOffset(MI&: LoadInst)
736 ? AArch64InstrInfo::getLdStOffsetOp(MI: LoadInst).getImm()
737 : AArch64InstrInfo::getLdStOffsetOp(MI: LoadInst).getImm() * LoadSize;
738 return (UnscaledStOffset <= UnscaledLdOffset) &&
739 (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
740}
741
742static bool isPromotableZeroStoreInst(MachineInstr &MI) {
743 unsigned Opc = MI.getOpcode();
744 return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
745 isNarrowStore(Opc)) &&
746 getLdStRegOp(MI).getReg() == AArch64::WZR;
747}
748
749static bool isPromotableLoadFromStore(MachineInstr &MI) {
750 switch (MI.getOpcode()) {
751 default:
752 return false;
753 // Scaled instructions.
754 case AArch64::LDRBBui:
755 case AArch64::LDRHHui:
756 case AArch64::LDRWui:
757 case AArch64::LDRXui:
758 // Unscaled instructions.
759 case AArch64::LDURBBi:
760 case AArch64::LDURHHi:
761 case AArch64::LDURWi:
762 case AArch64::LDURXi:
763 return true;
764 }
765}
766
767static bool isMergeableLdStUpdate(MachineInstr &MI, AArch64FunctionInfo &AFI) {
768 unsigned Opc = MI.getOpcode();
769 switch (Opc) {
770 default:
771 return false;
772 // Scaled instructions.
773 case AArch64::STRBui:
774 case AArch64::STRHui:
775 case AArch64::STRSui:
776 case AArch64::STRDui:
777 case AArch64::STRQui:
778 case AArch64::STRXui:
779 case AArch64::STRWui:
780 case AArch64::STRHHui:
781 case AArch64::STRBBui:
782 case AArch64::LDRBui:
783 case AArch64::LDRHui:
784 case AArch64::LDRSui:
785 case AArch64::LDRDui:
786 case AArch64::LDRQui:
787 case AArch64::LDRXui:
788 case AArch64::LDRWui:
789 case AArch64::LDRHHui:
790 case AArch64::LDRBBui:
791 case AArch64::STGi:
792 case AArch64::STZGi:
793 case AArch64::ST2Gi:
794 case AArch64::STZ2Gi:
795 case AArch64::STGPi:
796 // Unscaled instructions.
797 case AArch64::STURSi:
798 case AArch64::STURDi:
799 case AArch64::STURQi:
800 case AArch64::STURWi:
801 case AArch64::STURXi:
802 case AArch64::LDURSi:
803 case AArch64::LDURDi:
804 case AArch64::LDURQi:
805 case AArch64::LDURWi:
806 case AArch64::LDURXi:
807 // Paired instructions.
808 case AArch64::LDPSi:
809 case AArch64::LDPSWi:
810 case AArch64::LDPDi:
811 case AArch64::LDPQi:
812 case AArch64::LDPWi:
813 case AArch64::LDPXi:
814 case AArch64::STPSi:
815 case AArch64::STPDi:
816 case AArch64::STPQi:
817 case AArch64::STPWi:
818 case AArch64::STPXi:
819 // Make sure this is a reg+imm (as opposed to an address reloc).
820 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
821 return false;
822
823 // When using stack tagging, simple sp+imm loads and stores are not
824 // tag-checked, but pre- and post-indexed versions of them are, so we can't
825 // replace the former with the latter. This transformation would be valid
826 // if the load/store accesses an untagged stack slot, but we don't have
827 // that information available after frame indices have been eliminated.
828 if (AFI.isMTETagged() &&
829 AArch64InstrInfo::getLdStBaseOp(MI).getReg() == AArch64::SP)
830 return false;
831
832 return true;
833 }
834}
835
836// Make sure this is a reg+reg Ld/St
837static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
838 unsigned Opc = MI.getOpcode();
839 switch (Opc) {
840 default:
841 return false;
842 // Scaled instructions.
843 // TODO: Add more index address stores.
844 case AArch64::LDRBroX:
845 case AArch64::LDRBBroX:
846 case AArch64::LDRSBXroX:
847 case AArch64::LDRSBWroX:
848 Scale = 1;
849 return true;
850 case AArch64::LDRHroX:
851 case AArch64::LDRHHroX:
852 case AArch64::LDRSHXroX:
853 case AArch64::LDRSHWroX:
854 Scale = 2;
855 return true;
856 case AArch64::LDRWroX:
857 case AArch64::LDRSroX:
858 case AArch64::LDRSWroX:
859 Scale = 4;
860 return true;
861 case AArch64::LDRDroX:
862 case AArch64::LDRXroX:
863 Scale = 8;
864 return true;
865 case AArch64::LDRQroX:
866 Scale = 16;
867 return true;
868 }
869}
870
871static bool isRewritableImplicitDef(const MachineOperand &MO) {
872 switch (MO.getParent()->getOpcode()) {
873 default:
874 return MO.isRenamable();
875 case AArch64::ORRWrs:
876 case AArch64::ADDWri:
877 return true;
878 }
879}
880
881MachineBasicBlock::iterator
882AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
883 MachineBasicBlock::iterator MergeMI,
884 const LdStPairFlags &Flags) {
885 assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
886 "Expected promotable zero stores.");
887
888 MachineBasicBlock::iterator E = I->getParent()->end();
889 MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
890 // If NextI is the second of the two instructions to be merged, we need
891 // to skip one further. Either way we merge will invalidate the iterator,
892 // and we don't need to scan the new instruction, as it's a pairwise
893 // instruction, which we're not considering for further action anyway.
894 if (NextI == MergeMI)
895 NextI = next_nodbg(It: NextI, End: E);
896
897 unsigned Opc = I->getOpcode();
898 unsigned MergeMIOpc = MergeMI->getOpcode();
899 bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
900 bool IsMergedMIScaled = !TII->hasUnscaledLdStOffset(Opc: MergeMIOpc);
901 int OffsetStride = IsScaled ? TII->getMemScale(MI: *I) : 1;
902 int MergeMIOffsetStride = IsMergedMIScaled ? TII->getMemScale(MI: *MergeMI) : 1;
903
904 bool MergeForward = Flags.getMergeForward();
905 // Insert our new paired instruction after whichever of the paired
906 // instructions MergeForward indicates.
907 MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
908 // Also based on MergeForward is from where we copy the base register operand
909 // so we get the flags compatible with the input code.
910 const MachineOperand &BaseRegOp =
911 MergeForward ? AArch64InstrInfo::getLdStBaseOp(MI: *MergeMI)
912 : AArch64InstrInfo::getLdStBaseOp(MI: *I);
913
914 // Which register is Rt and which is Rt2 depends on the offset order.
915 int64_t IOffsetInBytes =
916 AArch64InstrInfo::getLdStOffsetOp(MI: *I).getImm() * OffsetStride;
917 int64_t MIOffsetInBytes =
918 AArch64InstrInfo::getLdStOffsetOp(MI: *MergeMI).getImm() *
919 MergeMIOffsetStride;
920 // Select final offset based on the offset order.
921 int64_t OffsetImm;
922 if (IOffsetInBytes > MIOffsetInBytes)
923 OffsetImm = MIOffsetInBytes;
924 else
925 OffsetImm = IOffsetInBytes;
926
927 int NewOpcode = getMatchingWideOpcode(Opc);
928 // Adjust final offset on scaled stores because the new instruction
929 // has a different scale.
930 if (!TII->hasUnscaledLdStOffset(Opc: NewOpcode)) {
931 int NewOffsetStride = TII->getMemScale(Opc: NewOpcode);
932 assert(((OffsetImm % NewOffsetStride) == 0) &&
933 "Offset should be a multiple of the store memory scale");
934 OffsetImm = OffsetImm / NewOffsetStride;
935 }
936
937 // Construct the new instruction.
938 DebugLoc DL = I->getDebugLoc();
939 MachineBasicBlock *MBB = I->getParent();
940 MachineInstrBuilder MIB;
941 MIB = BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: NewOpcode))
942 .addReg(RegNo: isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
943 .add(MO: BaseRegOp)
944 .addImm(Val: OffsetImm)
945 .cloneMergedMemRefs(OtherMIs: {&*I, &*MergeMI})
946 .setMIFlags(I->mergeFlagsWith(Other: *MergeMI));
947 (void)MIB;
948
949 LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
950 LLVM_DEBUG(I->print(dbgs()));
951 LLVM_DEBUG(dbgs() << " ");
952 LLVM_DEBUG(MergeMI->print(dbgs()));
953 LLVM_DEBUG(dbgs() << " with instruction:\n ");
954 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
955 LLVM_DEBUG(dbgs() << "\n");
956
957 // Erase the old instructions.
958 I->eraseFromParent();
959 MergeMI->eraseFromParent();
960 return NextI;
961}
962
963// Apply Fn to all instructions between MI and the beginning of the block, until
964// a def for DefReg is reached. Returns true, iff Fn returns true for all
965// visited instructions. Stop after visiting Limit iterations.
966static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
967 const TargetRegisterInfo *TRI, unsigned Limit,
968 std::function<bool(MachineInstr &, bool)> &Fn) {
969 auto MBB = MI.getParent();
970 for (MachineInstr &I :
971 instructionsWithoutDebug(It: MI.getReverseIterator(), End: MBB->instr_rend())) {
972 if (!Limit)
973 return false;
974 --Limit;
975
976 bool isDef = any_of(Range: I.operands(), P: [DefReg, TRI](MachineOperand &MOP) {
977 return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
978 TRI->regsOverlap(RegA: MOP.getReg(), RegB: DefReg);
979 });
980 if (!Fn(I, isDef))
981 return false;
982 if (isDef)
983 break;
984 }
985 return true;
986}
987
988static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
989 const TargetRegisterInfo *TRI) {
990
991 for (const MachineOperand &MOP : phys_regs_and_masks(MI))
992 if (MOP.isReg() && MOP.isKill())
993 Units.removeReg(Reg: MOP.getReg());
994
995 for (const MachineOperand &MOP : phys_regs_and_masks(MI))
996 if (MOP.isReg() && !MOP.isKill())
997 Units.addReg(Reg: MOP.getReg());
998}
999
1000/// This function will add a new entry into the debugValueSubstitutions table
1001/// when two instruction have been merged into a new one represented by \p
1002/// MergedInstr.
1003static void addDebugSubstitutionsToTable(MachineFunction *MF,
1004 unsigned InstrNumToSet,
1005 MachineInstr &OriginalInstr,
1006 MachineInstr &MergedInstr) {
1007
1008 // Figure out the Operand Index of the destination register of the
1009 // OriginalInstr in the new MergedInstr.
1010 auto Reg = OriginalInstr.getOperand(i: 0).getReg();
1011 unsigned OperandNo = 0;
1012 bool RegFound = false;
1013 for (const auto Op : MergedInstr.operands()) {
1014 if (Op.getReg() == Reg) {
1015 RegFound = true;
1016 break;
1017 }
1018 OperandNo++;
1019 }
1020
1021 if (RegFound)
1022 MF->makeDebugValueSubstitution({OriginalInstr.peekDebugInstrNum(), 0},
1023 {InstrNumToSet, OperandNo});
1024}
1025
1026MachineBasicBlock::iterator
1027AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
1028 MachineBasicBlock::iterator Paired,
1029 const LdStPairFlags &Flags) {
1030 MachineBasicBlock::iterator E = I->getParent()->end();
1031 MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
1032 // If NextI is the second of the two instructions to be merged, we need
1033 // to skip one further. Either way we merge will invalidate the iterator,
1034 // and we don't need to scan the new instruction, as it's a pairwise
1035 // instruction, which we're not considering for further action anyway.
1036 if (NextI == Paired)
1037 NextI = next_nodbg(It: NextI, End: E);
1038
1039 int SExtIdx = Flags.getSExtIdx();
1040 unsigned Opc =
1041 SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(Opc: I->getOpcode());
1042 bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
1043 int OffsetStride = IsUnscaled ? TII->getMemScale(MI: *I) : 1;
1044
1045 bool MergeForward = Flags.getMergeForward();
1046
1047 std::optional<MCPhysReg> RenameReg = Flags.getRenameReg();
1048 if (RenameReg) {
1049 MCRegister RegToRename = getLdStRegOp(MI&: *I).getReg();
1050 DefinedInBB.addReg(Reg: *RenameReg);
1051
1052 // Return the sub/super register for RenameReg, matching the size of
1053 // OriginalReg.
1054 auto GetMatchingSubReg =
1055 [this, RenameReg](const TargetRegisterClass *C) -> MCPhysReg {
1056 for (MCPhysReg SubOrSuper :
1057 TRI->sub_and_superregs_inclusive(Reg: *RenameReg)) {
1058 if (C->contains(Reg: SubOrSuper))
1059 return SubOrSuper;
1060 }
1061 llvm_unreachable("Should have found matching sub or super register!");
1062 };
1063
1064 std::function<bool(MachineInstr &, bool)> UpdateMIs =
1065 [this, RegToRename, GetMatchingSubReg, MergeForward](MachineInstr &MI,
1066 bool IsDef) {
1067 if (IsDef) {
1068 bool SeenDef = false;
1069 for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
1070 MachineOperand &MOP = MI.getOperand(i: OpIdx);
1071 // Rename the first explicit definition and all implicit
1072 // definitions matching RegToRename.
1073 if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1074 (!MergeForward || !SeenDef ||
1075 (MOP.isDef() && MOP.isImplicit())) &&
1076 TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename)) {
1077 assert((MOP.isImplicit() ||
1078 (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
1079 "Need renamable operands");
1080 Register MatchingReg;
1081 if (const TargetRegisterClass *RC =
1082 MI.getRegClassConstraint(OpIdx, TII, TRI))
1083 MatchingReg = GetMatchingSubReg(RC);
1084 else {
1085 if (!isRewritableImplicitDef(MO: MOP))
1086 continue;
1087 MatchingReg = GetMatchingSubReg(
1088 TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1089 }
1090 MOP.setReg(MatchingReg);
1091 SeenDef = true;
1092 }
1093 }
1094 } else {
1095 for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
1096 MachineOperand &MOP = MI.getOperand(i: OpIdx);
1097 if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1098 TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename)) {
1099 assert((MOP.isImplicit() ||
1100 (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
1101 "Need renamable operands");
1102 Register MatchingReg;
1103 if (const TargetRegisterClass *RC =
1104 MI.getRegClassConstraint(OpIdx, TII, TRI))
1105 MatchingReg = GetMatchingSubReg(RC);
1106 else
1107 MatchingReg = GetMatchingSubReg(
1108 TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1109 assert(MatchingReg != AArch64::NoRegister &&
1110 "Cannot find matching regs for renaming");
1111 MOP.setReg(MatchingReg);
1112 }
1113 }
1114 }
1115 LLVM_DEBUG(dbgs() << "Renamed " << MI);
1116 return true;
1117 };
1118 forAllMIsUntilDef(MI&: MergeForward ? *I : *Paired->getPrevNode(), DefReg: RegToRename,
1119 TRI, UINT32_MAX, Fn&: UpdateMIs);
1120
1121#if !defined(NDEBUG)
1122 // For forward merging store:
1123 // Make sure the register used for renaming is not used between the
1124 // paired instructions. That would trash the content before the new
1125 // paired instruction.
1126 MCPhysReg RegToCheck = *RenameReg;
1127 // For backward merging load:
1128 // Make sure the register being renamed is not used between the
1129 // paired instructions. That would trash the content after the new
1130 // paired instruction.
1131 if (!MergeForward)
1132 RegToCheck = RegToRename;
1133 for (auto &MI :
1134 iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
1135 MergeForward ? std::next(I) : I,
1136 MergeForward ? std::next(Paired) : Paired))
1137 assert(all_of(MI.operands(),
1138 [this, RegToCheck](const MachineOperand &MOP) {
1139 return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1140 MOP.isUndef() ||
1141 !TRI->regsOverlap(MOP.getReg(), RegToCheck);
1142 }) &&
1143 "Rename register used between paired instruction, trashing the "
1144 "content");
1145#endif
1146 }
1147
1148 // Insert our new paired instruction after whichever of the paired
1149 // instructions MergeForward indicates.
1150 MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
1151 // Also based on MergeForward is from where we copy the base register operand
1152 // so we get the flags compatible with the input code.
1153 const MachineOperand &BaseRegOp =
1154 MergeForward ? AArch64InstrInfo::getLdStBaseOp(MI: *Paired)
1155 : AArch64InstrInfo::getLdStBaseOp(MI: *I);
1156
1157 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: *I).getImm();
1158 int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(MI: *Paired).getImm();
1159 bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Opc: Paired->getOpcode());
1160 if (IsUnscaled != PairedIsUnscaled) {
1161 // We're trying to pair instructions that differ in how they are scaled. If
1162 // I is scaled then scale the offset of Paired accordingly. Otherwise, do
1163 // the opposite (i.e., make Paired's offset unscaled).
1164 int MemSize = TII->getMemScale(MI: *Paired);
1165 if (PairedIsUnscaled) {
1166 // If the unscaled offset isn't a multiple of the MemSize, we can't
1167 // pair the operations together.
1168 assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
1169 "Offset should be a multiple of the stride!");
1170 PairedOffset /= MemSize;
1171 } else {
1172 PairedOffset *= MemSize;
1173 }
1174 }
1175
1176 // Which register is Rt and which is Rt2 depends on the offset order.
1177 // However, for pre load/stores the Rt should be the one of the pre
1178 // load/store.
1179 MachineInstr *RtMI, *Rt2MI;
1180 if (Offset == PairedOffset + OffsetStride &&
1181 !AArch64InstrInfo::isPreLdSt(MI: *I)) {
1182 RtMI = &*Paired;
1183 Rt2MI = &*I;
1184 // Here we swapped the assumption made for SExtIdx.
1185 // I.e., we turn ldp I, Paired into ldp Paired, I.
1186 // Update the index accordingly.
1187 if (SExtIdx != -1)
1188 SExtIdx = (SExtIdx + 1) % 2;
1189 } else {
1190 RtMI = &*I;
1191 Rt2MI = &*Paired;
1192 }
1193 int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(MI: *RtMI).getImm();
1194 // Scale the immediate offset, if necessary.
1195 if (TII->hasUnscaledLdStOffset(Opc: RtMI->getOpcode())) {
1196 assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
1197 "Unscaled offset cannot be scaled.");
1198 OffsetImm /= TII->getMemScale(MI: *RtMI);
1199 }
1200
1201 // Construct the new instruction.
1202 MachineInstrBuilder MIB;
1203 DebugLoc DL = I->getDebugLoc();
1204 MachineBasicBlock *MBB = I->getParent();
1205 MachineOperand RegOp0 = getLdStRegOp(MI&: *RtMI);
1206 MachineOperand RegOp1 = getLdStRegOp(MI&: *Rt2MI);
1207 MachineOperand &PairedRegOp = RtMI == &*Paired ? RegOp0 : RegOp1;
1208 // Kill flags may become invalid when moving stores for pairing.
1209 if (RegOp0.isUse()) {
1210 if (!MergeForward) {
1211 // Clear kill flags on store if moving upwards. Example:
1212 // STRWui kill %w0, ...
1213 // USE %w1
1214 // STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards
1215 // We are about to move the store of w1, so its kill flag may become
1216 // invalid; not the case for w0.
1217 // Since w1 is used between the stores, the kill flag on w1 is cleared
1218 // after merging.
1219 // STPWi kill %w0, %w1, ...
1220 // USE %w1
1221 for (auto It = std::next(x: I); It != Paired && PairedRegOp.isKill(); ++It)
1222 if (It->readsRegister(Reg: PairedRegOp.getReg(), TRI))
1223 PairedRegOp.setIsKill(false);
1224 } else {
1225 // Clear kill flags of the first stores register. Example:
1226 // STRWui %w1, ...
1227 // USE kill %w1 ; need to clear kill flag when moving STRWui downwards
1228 // STRW %w0
1229 Register Reg = getLdStRegOp(MI&: *I).getReg();
1230 for (MachineInstr &MI :
1231 make_range(x: std::next(x: I->getIterator()), y: Paired->getIterator()))
1232 MI.clearRegisterKills(Reg, RegInfo: TRI);
1233 }
1234 }
1235
1236 unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);
1237 MIB = BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: MatchPairOpcode));
1238
1239 // Adds the pre-index operand for pre-indexed ld/st pairs.
1240 if (AArch64InstrInfo::isPreLdSt(MI: *RtMI))
1241 MIB.addReg(RegNo: BaseRegOp.getReg(), Flags: RegState::Define);
1242
1243 MIB.add(MO: RegOp0)
1244 .add(MO: RegOp1)
1245 .add(MO: BaseRegOp)
1246 .addImm(Val: OffsetImm)
1247 .cloneMergedMemRefs(OtherMIs: {&*I, &*Paired})
1248 .setMIFlags(I->mergeFlagsWith(Other: *Paired));
1249
1250 (void)MIB;
1251
1252 LLVM_DEBUG(
1253 dbgs() << "Creating pair load/store. Replacing instructions:\n ");
1254 LLVM_DEBUG(I->print(dbgs()));
1255 LLVM_DEBUG(dbgs() << " ");
1256 LLVM_DEBUG(Paired->print(dbgs()));
1257 LLVM_DEBUG(dbgs() << " with instruction:\n ");
1258 if (SExtIdx != -1) {
1259 // Generate the sign extension for the proper result of the ldp.
1260 // I.e., with X1, that would be:
1261 // %w1 = KILL %w1, implicit-def %x1
1262 // %x1 = SBFMXri killed %x1, 0, 31
1263 MachineOperand &DstMO = MIB->getOperand(i: SExtIdx);
1264 // Right now, DstMO has the extended register, since it comes from an
1265 // extended opcode.
1266 Register DstRegX = DstMO.getReg();
1267 // Get the W variant of that register.
1268 Register DstRegW = TRI->getSubReg(Reg: DstRegX, Idx: AArch64::sub_32);
1269 // Update the result of LDP to use the W instead of the X variant.
1270 DstMO.setReg(DstRegW);
1271 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1272 LLVM_DEBUG(dbgs() << "\n");
1273 // Make the machine verifier happy by providing a definition for
1274 // the X register.
1275 // Insert this definition right after the generated LDP, i.e., before
1276 // InsertionPoint.
1277 MachineInstrBuilder MIBKill =
1278 BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::KILL), DestReg: DstRegW)
1279 .addReg(RegNo: DstRegW)
1280 .addReg(RegNo: DstRegX, Flags: RegState::Define);
1281 MIBKill->getOperand(i: 2).setImplicit();
1282 // Create the sign extension.
1283 MachineInstrBuilder MIBSXTW =
1284 BuildMI(BB&: *MBB, I: InsertionPoint, MIMD: DL, MCID: TII->get(Opcode: AArch64::SBFMXri), DestReg: DstRegX)
1285 .addReg(RegNo: DstRegX)
1286 .addImm(Val: 0)
1287 .addImm(Val: 31);
1288 (void)MIBSXTW;
1289
1290 // In the case of a sign-extend, where we have something like:
1291 // debugValueSubstitutions:[]
1292 // $w1 = LDRWui $x0, 1, debug-instr-number 1
1293 // DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
1294 // $x0 = LDRSWui $x0, 0, debug-instr-number 2
1295 // DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9
1296
1297 // It will be converted to:
1298 // debugValueSubstitutions:[]
1299 // $w0, $w1 = LDPWi $x0, 0
1300 // $w0 = KILL $w0, implicit-def $x0
1301 // $x0 = SBFMXri $x0, 0, 31
1302 // DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
1303 // DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9
1304
1305 // We want the final result to look like:
1306 // debugValueSubstitutions:
1307 // - { srcinst: 1, srcop: 0, dstinst: 4, dstop: 1, subreg: 0 }
1308 // - { srcinst: 2, srcop: 0, dstinst: 3, dstop: 0, subreg: 0 }
1309 // $w0, $w1 = LDPWi $x0, 0, debug-instr-number 4
1310 // $w0 = KILL $w0, implicit-def $x0
1311 // $x0 = SBFMXri $x0, 0, 31, debug-instr-number 3
1312 // DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
1313 // DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9
1314
1315 // $x0 is where the final value is stored, so the sign extend (SBFMXri)
1316 // instruction contains the final value we care about we give it a new
1317 // debug-instr-number 3. Whereas, $w1 contains the final value that we care
1318 // about, therefore the LDP instruction is also given a new
1319 // debug-instr-number 4. We have to add these substitutions to the
1320 // debugValueSubstitutions table. However, we also have to ensure that the
1321 // OpIndex that pointed to debug-instr-number 1 gets updated to 1, because
1322 // $w1 is the second operand of the LDP instruction.
1323
1324 if (I->peekDebugInstrNum()) {
1325 // If I is the instruction which got sign extended and has a
1326 // debug-instr-number, give the SBFMXri instruction a new
1327 // debug-instr-number, and update the debugValueSubstitutions table with
1328 // the new debug-instr-number and OpIndex pair. Otherwise, give the Merged
1329 // instruction a new debug-instr-number, and update the
1330 // debugValueSubstitutions table with the new debug-instr-number and
1331 // OpIndex pair.
1332 unsigned NewInstrNum;
1333 if (DstRegX == I->getOperand(i: 0).getReg()) {
1334 NewInstrNum = MIBSXTW->getDebugInstrNum();
1335 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *I,
1336 MergedInstr&: *MIBSXTW);
1337 } else {
1338 NewInstrNum = MIB->getDebugInstrNum();
1339 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *I, MergedInstr&: *MIB);
1340 }
1341 }
1342 if (Paired->peekDebugInstrNum()) {
1343 // If Paired is the instruction which got sign extended and has a
1344 // debug-instr-number, give the SBFMXri instruction a new
1345 // debug-instr-number, and update the debugValueSubstitutions table with
1346 // the new debug-instr-number and OpIndex pair. Otherwise, give the Merged
1347 // instruction a new debug-instr-number, and update the
1348 // debugValueSubstitutions table with the new debug-instr-number and
1349 // OpIndex pair.
1350 unsigned NewInstrNum;
1351 if (DstRegX == Paired->getOperand(i: 0).getReg()) {
1352 NewInstrNum = MIBSXTW->getDebugInstrNum();
1353 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *Paired,
1354 MergedInstr&: *MIBSXTW);
1355 } else {
1356 NewInstrNum = MIB->getDebugInstrNum();
1357 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewInstrNum, OriginalInstr&: *Paired,
1358 MergedInstr&: *MIB);
1359 }
1360 }
1361
1362 LLVM_DEBUG(dbgs() << " Extend operand:\n ");
1363 LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
1364 } else if (Opc == AArch64::LDR_ZXI || Opc == AArch64::STR_ZXI) {
1365 // We are combining SVE fill/spill to LDP/STP, so we need to use the Q
1366 // variant of the registers.
1367 MachineOperand &MOp0 = MIB->getOperand(i: 0);
1368 MachineOperand &MOp1 = MIB->getOperand(i: 1);
1369 assert(AArch64::ZPRRegClass.contains(MOp0.getReg()) &&
1370 AArch64::ZPRRegClass.contains(MOp1.getReg()) && "Invalid register.");
1371 MOp0.setReg(AArch64::Q0 + (MOp0.getReg() - AArch64::Z0));
1372 MOp1.setReg(AArch64::Q0 + (MOp1.getReg() - AArch64::Z0));
1373 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1374 } else {
1375
1376 // In the case that the merge doesn't result in a sign-extend, if we have
1377 // something like:
1378 // debugValueSubstitutions:[]
1379 // $x1 = LDRXui $x0, 1, debug-instr-number 1
1380 // DBG_INSTR_REF !13, dbg-instr-ref(1, 0), debug-location !11
1381 // $x0 = LDRXui killed $x0, 0, debug-instr-number 2
1382 // DBG_INSTR_REF !14, dbg-instr-ref(2, 0), debug-location !11
1383
1384 // It will be converted to:
1385 // debugValueSubstitutions: []
1386 // $x0, $x1 = LDPXi $x0, 0
1387 // DBG_INSTR_REF !12, dbg-instr-ref(1, 0), debug-location !14
1388 // DBG_INSTR_REF !13, dbg-instr-ref(2, 0), debug-location !14
1389
1390 // We want the final result to look like:
1391 // debugValueSubstitutions:
1392 // - { srcinst: 1, srcop: 0, dstinst: 3, dstop: 1, subreg: 0 }
1393 // - { srcinst: 2, srcop: 0, dstinst: 3, dstop: 0, subreg: 0 }
1394 // $x0, $x1 = LDPXi $x0, 0, debug-instr-number 3
1395 // DBG_INSTR_REF !12, dbg-instr-ref(1, 0), debug-location !14
1396 // DBG_INSTR_REF !12, dbg-instr-ref(2, 0), debug-location !14
1397
1398 // Here all that needs to be done is, that the LDP instruction needs to be
1399 // updated with a new debug-instr-number, we then need to add entries into
1400 // the debugSubstitutions table to map the old instr-refs to the new ones.
1401
1402 // Assign new DebugInstrNum to the Paired instruction.
1403 if (I->peekDebugInstrNum()) {
1404 unsigned NewDebugInstrNum = MIB->getDebugInstrNum();
1405 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewDebugInstrNum, OriginalInstr&: *I,
1406 MergedInstr&: *MIB);
1407 }
1408 if (Paired->peekDebugInstrNum()) {
1409 unsigned NewDebugInstrNum = MIB->getDebugInstrNum();
1410 addDebugSubstitutionsToTable(MF: MBB->getParent(), InstrNumToSet: NewDebugInstrNum, OriginalInstr&: *Paired,
1411 MergedInstr&: *MIB);
1412 }
1413
1414 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1415 }
1416 LLVM_DEBUG(dbgs() << "\n");
1417
1418 if (MergeForward)
1419 for (const MachineOperand &MOP : phys_regs_and_masks(MI: *I))
1420 if (MOP.isReg() && MOP.isKill())
1421 DefinedInBB.addReg(Reg: MOP.getReg());
1422
1423 // Copy over any implicit-def operands. This is like MI.copyImplicitOps, but
1424 // only copies implicit defs and makes sure that each operand is only added
1425 // once in case of duplicates.
1426 auto CopyImplicitOps = [&](MachineBasicBlock::iterator MI1,
1427 MachineBasicBlock::iterator MI2) {
1428 SmallSetVector<Register, 4> Ops;
1429 for (const MachineOperand &MO :
1430 llvm::drop_begin(RangeOrContainer: MI1->operands(), N: MI1->getDesc().getNumOperands()))
1431 if (MO.isReg() && MO.isImplicit() && MO.isDef())
1432 Ops.insert(X: MO.getReg());
1433 for (const MachineOperand &MO :
1434 llvm::drop_begin(RangeOrContainer: MI2->operands(), N: MI2->getDesc().getNumOperands()))
1435 if (MO.isReg() && MO.isImplicit() && MO.isDef())
1436 Ops.insert(X: MO.getReg());
1437 for (auto Op : Ops)
1438 MIB.addDef(RegNo: Op, Flags: RegState::Implicit);
1439 };
1440 CopyImplicitOps(I, Paired);
1441
1442 // Erase the old instructions.
1443 I->eraseFromParent();
1444 Paired->eraseFromParent();
1445
1446 return NextI;
1447}
1448
1449MachineBasicBlock::iterator
1450AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
1451 MachineBasicBlock::iterator StoreI) {
1452 MachineBasicBlock::iterator NextI =
1453 next_nodbg(It: LoadI, End: LoadI->getParent()->end());
1454
1455 int LoadSize = TII->getMemScale(MI: *LoadI);
1456 int StoreSize = TII->getMemScale(MI: *StoreI);
1457 Register LdRt = getLdStRegOp(MI&: *LoadI).getReg();
1458 const MachineOperand &StMO = getLdStRegOp(MI&: *StoreI);
1459 Register StRt = getLdStRegOp(MI&: *StoreI).getReg();
1460 bool IsStoreXReg = TRI->getRegClass(i: AArch64::GPR64RegClassID)->contains(Reg: StRt);
1461
1462 assert((IsStoreXReg ||
1463 TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
1464 "Unexpected RegClass");
1465
1466 MachineInstr *BitExtMI;
1467 if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
1468 // Remove the load, if the destination register of the loads is the same
1469 // register for stored value.
1470 if (StRt == LdRt && LoadSize == 8) {
1471 for (MachineInstr &MI : make_range(x: StoreI->getIterator(),
1472 y: LoadI->getIterator())) {
1473 if (MI.killsRegister(Reg: StRt, TRI)) {
1474 MI.clearRegisterKills(Reg: StRt, RegInfo: TRI);
1475 break;
1476 }
1477 }
1478 LLVM_DEBUG(dbgs() << "Remove load instruction:\n ");
1479 LLVM_DEBUG(LoadI->print(dbgs()));
1480 LLVM_DEBUG(dbgs() << "\n");
1481 LoadI->eraseFromParent();
1482 return NextI;
1483 }
1484 // Replace the load with a mov if the load and store are in the same size.
1485 BitExtMI =
1486 BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
1487 MCID: TII->get(Opcode: IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), DestReg: LdRt)
1488 .addReg(RegNo: IsStoreXReg ? AArch64::XZR : AArch64::WZR)
1489 .add(MO: StMO)
1490 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0))
1491 .setMIFlags(LoadI->getFlags());
1492 } else {
1493 // FIXME: Currently we disable this transformation in big-endian targets as
1494 // performance and correctness are verified only in little-endian.
1495 if (!Subtarget->isLittleEndian())
1496 return NextI;
1497 bool IsUnscaled = TII->hasUnscaledLdStOffset(MI&: *LoadI);
1498 assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
1499 "Unsupported ld/st match");
1500 assert(LoadSize <= StoreSize && "Invalid load size");
1501 int UnscaledLdOffset =
1502 IsUnscaled
1503 ? AArch64InstrInfo::getLdStOffsetOp(MI: *LoadI).getImm()
1504 : AArch64InstrInfo::getLdStOffsetOp(MI: *LoadI).getImm() * LoadSize;
1505 int UnscaledStOffset =
1506 IsUnscaled
1507 ? AArch64InstrInfo::getLdStOffsetOp(MI: *StoreI).getImm()
1508 : AArch64InstrInfo::getLdStOffsetOp(MI: *StoreI).getImm() * StoreSize;
1509 int Width = LoadSize * 8;
1510 Register DestReg =
1511 IsStoreXReg ? Register(TRI->getMatchingSuperReg(
1512 Reg: LdRt, SubIdx: AArch64::sub_32, RC: &AArch64::GPR64RegClass))
1513 : LdRt;
1514
1515 assert((UnscaledLdOffset >= UnscaledStOffset &&
1516 (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
1517 "Invalid offset");
1518
1519 int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
1520 int Imms = Immr + Width - 1;
1521 if (UnscaledLdOffset == UnscaledStOffset) {
1522 uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
1523 | ((Immr) << 6) // immr
1524 | ((Imms) << 0) // imms
1525 ;
1526
1527 BitExtMI =
1528 BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
1529 MCID: TII->get(Opcode: IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
1530 DestReg)
1531 .add(MO: StMO)
1532 .addImm(Val: AndMaskEncoded)
1533 .setMIFlags(LoadI->getFlags());
1534 } else if (IsStoreXReg && Imms == 31) {
1535 // Use the 32 bit variant of UBFM if it's the LSR alias of the
1536 // instruction.
1537 assert(Immr <= Imms && "Expected LSR alias of UBFM");
1538 BitExtMI = BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
1539 MCID: TII->get(Opcode: AArch64::UBFMWri),
1540 DestReg: TRI->getSubReg(Reg: DestReg, Idx: AArch64::sub_32))
1541 .addReg(RegNo: TRI->getSubReg(Reg: StRt, Idx: AArch64::sub_32))
1542 .addImm(Val: Immr)
1543 .addImm(Val: Imms)
1544 .setMIFlags(LoadI->getFlags());
1545 } else {
1546 BitExtMI =
1547 BuildMI(BB&: *LoadI->getParent(), I: LoadI, MIMD: LoadI->getDebugLoc(),
1548 MCID: TII->get(Opcode: IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
1549 DestReg)
1550 .add(MO: StMO)
1551 .addImm(Val: Immr)
1552 .addImm(Val: Imms)
1553 .setMIFlags(LoadI->getFlags());
1554 }
1555 }
1556
1557 // Clear kill flags between store and load.
1558 for (MachineInstr &MI : make_range(x: StoreI->getIterator(),
1559 y: BitExtMI->getIterator()))
1560 if (MI.killsRegister(Reg: StRt, TRI)) {
1561 MI.clearRegisterKills(Reg: StRt, RegInfo: TRI);
1562 break;
1563 }
1564
1565 LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n ");
1566 LLVM_DEBUG(StoreI->print(dbgs()));
1567 LLVM_DEBUG(dbgs() << " ");
1568 LLVM_DEBUG(LoadI->print(dbgs()));
1569 LLVM_DEBUG(dbgs() << " with instructions:\n ");
1570 LLVM_DEBUG(StoreI->print(dbgs()));
1571 LLVM_DEBUG(dbgs() << " ");
1572 LLVM_DEBUG((BitExtMI)->print(dbgs()));
1573 LLVM_DEBUG(dbgs() << "\n");
1574
1575 // Erase the old instructions.
1576 LoadI->eraseFromParent();
1577 return NextI;
1578}
1579
1580static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
1581 // Convert the byte-offset used by unscaled into an "element" offset used
1582 // by the scaled pair load/store instructions.
1583 if (IsUnscaled) {
1584 // If the byte-offset isn't a multiple of the stride, there's no point
1585 // trying to match it.
1586 if (Offset % OffsetStride)
1587 return false;
1588 Offset /= OffsetStride;
1589 }
1590 return Offset <= 63 && Offset >= -64;
1591}
1592
1593// Do alignment, specialized to power of 2 and for signed ints,
1594// avoiding having to do a C-style cast from uint_64t to int when
1595// using alignTo from include/llvm/Support/MathExtras.h.
1596// FIXME: Move this function to include/MathExtras.h?
1597static int alignTo(int Num, int PowOf2) {
1598 return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
1599}
1600
1601static bool mayAlias(MachineInstr &MIa,
1602 SmallVectorImpl<MachineInstr *> &MemInsns,
1603 AliasAnalysis *AA) {
1604 for (MachineInstr *MIb : MemInsns) {
1605 if (MIa.mayAlias(AA, Other: *MIb, /*UseTBAA*/ false)) {
1606 LLVM_DEBUG(dbgs() << "Aliasing with: "; MIb->dump());
1607 return true;
1608 }
1609 }
1610
1611 LLVM_DEBUG(dbgs() << "No aliases found\n");
1612 return false;
1613}
1614
1615bool AArch64LoadStoreOpt::findMatchingStore(
1616 MachineBasicBlock::iterator I, unsigned Limit,
1617 MachineBasicBlock::iterator &StoreI) {
1618 MachineBasicBlock::iterator B = I->getParent()->begin();
1619 MachineBasicBlock::iterator MBBI = I;
1620 MachineInstr &LoadMI = *I;
1621 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: LoadMI).getReg();
1622
1623 // If the load is the first instruction in the block, there's obviously
1624 // not any matching store.
1625 if (MBBI == B)
1626 return false;
1627
1628 // Track which register units have been modified and used between the first
1629 // insn and the second insn.
1630 ModifiedRegUnits.clear();
1631 UsedRegUnits.clear();
1632
1633 unsigned Count = 0;
1634 do {
1635 MBBI = prev_nodbg(It: MBBI, Begin: B);
1636 MachineInstr &MI = *MBBI;
1637
1638 // Don't count transient instructions towards the search limit since there
1639 // may be different numbers of them if e.g. debug information is present.
1640 if (!MI.isTransient())
1641 ++Count;
1642
1643 // If the load instruction reads directly from the address to which the
1644 // store instruction writes and the stored value is not modified, we can
1645 // promote the load. Since we do not handle stores with pre-/post-index,
1646 // it's unnecessary to check if BaseReg is modified by the store itself.
1647 // Also we can't handle stores without an immediate offset operand,
1648 // while the operand might be the address for a global variable.
1649 if (MI.mayStore() && isMatchingStore(LoadInst&: LoadMI, StoreInst&: MI) &&
1650 BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
1651 AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
1652 isLdOffsetInRangeOfSt(LoadInst&: LoadMI, StoreInst&: MI, TII) &&
1653 ModifiedRegUnits.available(Reg: getLdStRegOp(MI).getReg())) {
1654 StoreI = MBBI;
1655 return true;
1656 }
1657
1658 if (MI.isCall())
1659 return false;
1660
1661 // Update modified / uses register units.
1662 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1663
1664 // Otherwise, if the base register is modified, we have no match, so
1665 // return early.
1666 if (!ModifiedRegUnits.available(Reg: BaseReg))
1667 return false;
1668
1669 // If we encounter a store aliased with the load, return early.
1670 if (MI.mayStore() && LoadMI.mayAlias(AA, Other: MI, /*UseTBAA*/ false))
1671 return false;
1672 } while (MBBI != B && Count < Limit);
1673 return false;
1674}
1675
1676static bool needsWinCFI(const MachineFunction *MF) {
1677 return MF->getTarget().getMCAsmInfo().usesWindowsCFI() &&
1678 MF->getFunction().needsUnwindTableEntry();
1679}
1680
1681// Returns true if FirstMI and MI are candidates for merging or pairing.
1682// Otherwise, returns false.
1683static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
1684 LdStPairFlags &Flags,
1685 const AArch64InstrInfo *TII) {
1686 // If this is volatile or if pairing is suppressed, not a candidate.
1687 if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
1688 return false;
1689
1690 // We should have already checked FirstMI for pair suppression and volatility.
1691 assert(!FirstMI.hasOrderedMemoryRef() &&
1692 !TII->isLdStPairSuppressed(FirstMI) &&
1693 "FirstMI shouldn't get here if either of these checks are true.");
1694
1695 if (needsWinCFI(MF: MI.getMF()) && (MI.getFlag(Flag: MachineInstr::FrameSetup) ||
1696 MI.getFlag(Flag: MachineInstr::FrameDestroy)))
1697 return false;
1698
1699 unsigned OpcA = FirstMI.getOpcode();
1700 unsigned OpcB = MI.getOpcode();
1701
1702 // Opcodes match: If the opcodes are pre ld/st there is nothing more to check.
1703 if (OpcA == OpcB)
1704 return !AArch64InstrInfo::isPreLdSt(MI: FirstMI);
1705
1706 // Bail out if one of the opcodes is SVE fill/spill, as we currently don't
1707 // allow pairing them with other instructions.
1708 if (OpcA == AArch64::LDR_ZXI || OpcA == AArch64::STR_ZXI ||
1709 OpcB == AArch64::LDR_ZXI || OpcB == AArch64::STR_ZXI)
1710 return false;
1711
1712 // Two pre ld/st of different opcodes cannot be merged either
1713 if (AArch64InstrInfo::isPreLdSt(MI: FirstMI) && AArch64InstrInfo::isPreLdSt(MI))
1714 return false;
1715
1716 // Try to match a sign-extended load/store with a zero-extended load/store.
1717 bool IsValidLdStrOpc, PairIsValidLdStrOpc;
1718 unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc: OpcA, IsValidLdStrOpc: &IsValidLdStrOpc);
1719 assert(IsValidLdStrOpc &&
1720 "Given Opc should be a Load or Store with an immediate");
1721 // OpcA will be the first instruction in the pair.
1722 if (NonSExtOpc == getMatchingNonSExtOpcode(Opc: OpcB, IsValidLdStrOpc: &PairIsValidLdStrOpc)) {
1723 Flags.setSExtIdx(NonSExtOpc == OpcA ? 1 : 0);
1724 return true;
1725 }
1726
1727 // If the second instruction isn't even a mergable/pairable load/store, bail
1728 // out.
1729 if (!PairIsValidLdStrOpc)
1730 return false;
1731
1732 // Narrow stores do not have a matching pair opcodes, so constrain their
1733 // merging to zero stores.
1734 if (isNarrowStore(Opc: OpcA) || isNarrowStore(Opc: OpcB))
1735 return getLdStRegOp(MI&: FirstMI).getReg() == AArch64::WZR &&
1736 getLdStRegOp(MI).getReg() == AArch64::WZR &&
1737 TII->getMemScale(MI: FirstMI) == TII->getMemScale(MI);
1738
1739 // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
1740 // LDR<S,D,Q,W,X,SW>pre-LDR<S,D,Q,W,X,SW>ui
1741 // are candidate pairs that can be merged.
1742 if (isPreLdStPairCandidate(FirstMI, MI))
1743 return true;
1744
1745 // Try to match an unscaled load/store with a scaled load/store.
1746 return TII->hasUnscaledLdStOffset(Opc: OpcA) != TII->hasUnscaledLdStOffset(Opc: OpcB) &&
1747 getMatchingPairOpcode(Opc: OpcA) == getMatchingPairOpcode(Opc: OpcB);
1748
1749 // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
1750}
1751
1752static bool canRenameMOP(const MachineOperand &MOP,
1753 const TargetRegisterInfo *TRI) {
1754 if (MOP.isReg()) {
1755 auto *RegClass = TRI->getMinimalPhysRegClass(Reg: MOP.getReg());
1756 // Renaming registers with multiple disjunct sub-registers (e.g. the
1757 // result of a LD3) means that all sub-registers are renamed, potentially
1758 // impacting other instructions we did not check. Bail out.
1759 // Note that this relies on the structure of the AArch64 register file. In
1760 // particular, a subregister cannot be written without overwriting the
1761 // whole register.
1762 if (RegClass->HasDisjunctSubRegs && RegClass->CoveredBySubRegs &&
1763 (TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::dsub0) ||
1764 TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::qsub0) ||
1765 TRI->getSubRegisterClass(SuperRC: RegClass, SubRegIdx: AArch64::zsub0))) {
1766 LLVM_DEBUG(
1767 dbgs()
1768 << " Cannot rename operands with multiple disjunct subregisters ("
1769 << MOP << ")\n");
1770 return false;
1771 }
1772
1773 // We cannot rename arbitrary implicit-defs, the specific rule to rewrite
1774 // them must be known. For example, in ORRWrs the implicit-def
1775 // corresponds to the result register.
1776 if (MOP.isImplicit() && MOP.isDef()) {
1777 if (!isRewritableImplicitDef(MO: MOP))
1778 return false;
1779 return TRI->isSuperOrSubRegisterEq(
1780 RegA: MOP.getParent()->getOperand(i: 0).getReg(), RegB: MOP.getReg());
1781 }
1782 }
1783 return MOP.isImplicit() ||
1784 (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
1785}
1786
1787static bool
1788canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
1789 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1790 const TargetRegisterInfo *TRI) {
1791 if (!FirstMI.mayStore())
1792 return false;
1793
1794 // Check if we can find an unused register which we can use to rename
1795 // the register used by the first load/store.
1796
1797 auto RegToRename = getLdStRegOp(MI&: FirstMI).getReg();
1798 // For now, we only rename if the store operand gets killed at the store.
1799 if (!getLdStRegOp(MI&: FirstMI).isKill() &&
1800 !any_of(Range: FirstMI.operands(),
1801 P: [TRI, RegToRename](const MachineOperand &MOP) {
1802 return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1803 MOP.isImplicit() && MOP.isKill() &&
1804 TRI->regsOverlap(RegA: RegToRename, RegB: MOP.getReg());
1805 })) {
1806 LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI);
1807 return false;
1808 }
1809
1810 bool FoundDef = false;
1811
1812 // For each instruction between FirstMI and the previous def for RegToRename,
1813 // we
1814 // * check if we can rename RegToRename in this instruction
1815 // * collect the registers used and required register classes for RegToRename.
1816 std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
1817 bool IsDef) {
1818 LLVM_DEBUG(dbgs() << "Checking " << MI);
1819 // Currently we do not try to rename across frame-setup instructions.
1820 if (MI.getFlag(Flag: MachineInstr::FrameSetup)) {
1821 LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1822 << "currently\n");
1823 return false;
1824 }
1825
1826 UsedInBetween.accumulate(MI);
1827
1828 // For a definition, check that we can rename the definition and exit the
1829 // loop.
1830 FoundDef = IsDef;
1831
1832 // For defs, check if we can rename the first def of RegToRename.
1833 if (FoundDef) {
1834 // For some pseudo instructions, we might not generate code in the end
1835 // (e.g. KILL) and we would end up without a correct def for the rename
1836 // register.
1837 // TODO: This might be overly conservative and we could handle those cases
1838 // in multiple ways:
1839 // 1. Insert an extra copy, to materialize the def.
1840 // 2. Skip pseudo-defs until we find an non-pseudo def.
1841 if (MI.isPseudo()) {
1842 LLVM_DEBUG(dbgs() << " Cannot rename pseudo/bundle instruction\n");
1843 return false;
1844 }
1845
1846 for (auto &MOP : MI.operands()) {
1847 if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
1848 !TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
1849 continue;
1850 if (!canRenameMOP(MOP, TRI)) {
1851 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1852 return false;
1853 }
1854 RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1855 }
1856 return true;
1857 } else {
1858 for (auto &MOP : MI.operands()) {
1859 if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1860 !TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
1861 continue;
1862
1863 if (!canRenameMOP(MOP, TRI)) {
1864 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1865 return false;
1866 }
1867 RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1868 }
1869 }
1870 return true;
1871 };
1872
1873 if (!forAllMIsUntilDef(MI&: FirstMI, DefReg: RegToRename, TRI, Limit: LdStLimit, Fn&: CheckMIs))
1874 return false;
1875
1876 if (!FoundDef) {
1877 LLVM_DEBUG(dbgs() << " Did not find definition for register in BB\n");
1878 return false;
1879 }
1880 return true;
1881}
1882
1883// We want to merge the second load into the first by rewriting the usages of
1884// the same reg between first (incl.) and second (excl.). We don't need to care
1885// about any insns before FirstLoad or after SecondLoad.
1886// 1. The second load writes new value into the same reg.
1887// - The renaming is impossible to impact later use of the reg.
1888// - The second load always trash the value written by the first load which
1889// means the reg must be killed before the second load.
1890// 2. The first load must be a def for the same reg so we don't need to look
1891// into anything before it.
1892static bool canRenameUntilSecondLoad(
1893 MachineInstr &FirstLoad, MachineInstr &SecondLoad,
1894 LiveRegUnits &UsedInBetween,
1895 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1896 const TargetRegisterInfo *TRI) {
1897 if (FirstLoad.isPseudo())
1898 return false;
1899
1900 UsedInBetween.accumulate(MI: FirstLoad);
1901 auto RegToRename = getLdStRegOp(MI&: FirstLoad).getReg();
1902 bool Success = std::all_of(
1903 first: FirstLoad.getIterator(), last: SecondLoad.getIterator(),
1904 pred: [&](MachineInstr &MI) {
1905 LLVM_DEBUG(dbgs() << "Checking " << MI);
1906 // Currently we do not try to rename across frame-setup instructions.
1907 if (MI.getFlag(Flag: MachineInstr::FrameSetup)) {
1908 LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1909 << "currently\n");
1910 return false;
1911 }
1912
1913 for (auto &MOP : MI.operands()) {
1914 if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1915 !TRI->regsOverlap(RegA: MOP.getReg(), RegB: RegToRename))
1916 continue;
1917 if (!canRenameMOP(MOP, TRI)) {
1918 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1919 return false;
1920 }
1921 RequiredClasses.insert(Ptr: TRI->getMinimalPhysRegClass(Reg: MOP.getReg()));
1922 }
1923
1924 return true;
1925 });
1926 return Success;
1927}
1928
1929// Check if we can find a physical register for renaming \p Reg. This register
1930// must:
1931// * not be defined already in \p DefinedInBB; DefinedInBB must contain all
1932// defined registers up to the point where the renamed register will be used,
1933// * not used in \p UsedInBetween; UsedInBetween must contain all accessed
1934// registers in the range the rename register will be used,
1935// * is available in all used register classes (checked using RequiredClasses).
1936static std::optional<MCPhysReg> tryToFindRegisterToRename(
1937 const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
1938 LiveRegUnits &UsedInBetween,
1939 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1940 const TargetRegisterInfo *TRI) {
1941 const MachineRegisterInfo &RegInfo = MF.getRegInfo();
1942
1943 // Checks if any sub- or super-register of PR is callee saved.
1944 auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
1945 return any_of(Range: TRI->sub_and_superregs_inclusive(Reg: PR),
1946 P: [&MF, TRI](MCPhysReg SubOrSuper) {
1947 return TRI->isCalleeSavedPhysReg(PhysReg: SubOrSuper, MF);
1948 });
1949 };
1950
1951 // Check if PR or one of its sub- or super-registers can be used for all
1952 // required register classes.
1953 auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
1954 return all_of(Range&: RequiredClasses, P: [PR, TRI](const TargetRegisterClass *C) {
1955 return any_of(
1956 Range: TRI->sub_and_superregs_inclusive(Reg: PR),
1957 P: [C](MCPhysReg SubOrSuper) { return C->contains(Reg: SubOrSuper); });
1958 });
1959 };
1960
1961 auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
1962 for (const MCPhysReg &PR : *RegClass) {
1963 if (DefinedInBB.available(Reg: PR) && UsedInBetween.available(Reg: PR) &&
1964 !RegInfo.isReserved(PhysReg: PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
1965 CanBeUsedForAllClasses(PR)) {
1966 DefinedInBB.addReg(Reg: PR);
1967 LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
1968 << "\n");
1969 return {PR};
1970 }
1971 }
1972 LLVM_DEBUG(dbgs() << "No rename register found from "
1973 << TRI->getRegClassName(RegClass) << "\n");
1974 return std::nullopt;
1975}
1976
1977// For store pairs: returns a register from FirstMI to the beginning of the
1978// block that can be renamed.
1979// For load pairs: returns a register from FirstMI to MI that can be renamed.
1980static std::optional<MCPhysReg> findRenameRegForSameLdStRegPair(
1981 std::optional<bool> MaybeCanRename, MachineInstr &FirstMI, MachineInstr &MI,
1982 Register Reg, LiveRegUnits &DefinedInBB, LiveRegUnits &UsedInBetween,
1983 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1984 const TargetRegisterInfo *TRI) {
1985 std::optional<MCPhysReg> RenameReg;
1986 if (!DebugCounter::shouldExecute(Counter&: RegRenamingCounter))
1987 return RenameReg;
1988
1989 auto *RegClass = TRI->getMinimalPhysRegClass(Reg: getLdStRegOp(MI&: FirstMI).getReg());
1990 MachineFunction &MF = *FirstMI.getParent()->getParent();
1991 if (!RegClass || !MF.getRegInfo().tracksLiveness())
1992 return RenameReg;
1993
1994 const bool IsLoad = FirstMI.mayLoad();
1995
1996 if (!MaybeCanRename) {
1997 if (IsLoad)
1998 MaybeCanRename = {canRenameUntilSecondLoad(FirstLoad&: FirstMI, SecondLoad&: MI, UsedInBetween,
1999 RequiredClasses, TRI)};
2000 else
2001 MaybeCanRename = {
2002 canRenameUpToDef(FirstMI, UsedInBetween, RequiredClasses, TRI)};
2003 }
2004
2005 if (*MaybeCanRename) {
2006 RenameReg = tryToFindRegisterToRename(MF, Reg, DefinedInBB, UsedInBetween,
2007 RequiredClasses, TRI);
2008 }
2009 return RenameReg;
2010}
2011
2012/// Scan the instructions looking for a load/store that can be combined with the
2013/// current instruction into a wider equivalent or a load/store pair.
2014MachineBasicBlock::iterator
2015AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
2016 LdStPairFlags &Flags, unsigned Limit,
2017 bool FindNarrowMerge) {
2018 MachineBasicBlock::iterator E = I->getParent()->end();
2019 MachineBasicBlock::iterator MBBI = I;
2020 MachineBasicBlock::iterator MBBIWithRenameReg;
2021 MachineInstr &FirstMI = *I;
2022 MBBI = next_nodbg(It: MBBI, End: E);
2023
2024 bool MayLoad = FirstMI.mayLoad();
2025 bool IsUnscaled = TII->hasUnscaledLdStOffset(MI&: FirstMI);
2026 Register Reg = getLdStRegOp(MI&: FirstMI).getReg();
2027 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: FirstMI).getReg();
2028 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: FirstMI).getImm();
2029 int OffsetStride = IsUnscaled ? TII->getMemScale(MI: FirstMI) : 1;
2030 bool IsPromotableZeroStore = isPromotableZeroStoreInst(MI&: FirstMI);
2031
2032 std::optional<bool> MaybeCanRename;
2033 if (!EnableRenaming)
2034 MaybeCanRename = {false};
2035
2036 SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
2037 LiveRegUnits UsedInBetween;
2038 UsedInBetween.init(TRI: *TRI);
2039
2040 Flags.clearRenameReg();
2041
2042 // Track which register units have been modified and used between the first
2043 // insn (inclusive) and the second insn.
2044 ModifiedRegUnits.clear();
2045 UsedRegUnits.clear();
2046
2047 // Remember any instructions that read/write memory between FirstMI and MI.
2048 SmallVector<MachineInstr *, 4> MemInsns;
2049
2050 LLVM_DEBUG(dbgs() << "Find match for: "; FirstMI.dump());
2051 for (unsigned Count = 0; MBBI != E && Count < Limit;
2052 MBBI = next_nodbg(It: MBBI, End: E)) {
2053 MachineInstr &MI = *MBBI;
2054 LLVM_DEBUG(dbgs() << "Analysing 2nd insn: "; MI.dump());
2055
2056 UsedInBetween.accumulate(MI);
2057
2058 // Don't count transient instructions towards the search limit since there
2059 // may be different numbers of them if e.g. debug information is present.
2060 if (!MI.isTransient())
2061 ++Count;
2062
2063 Flags.setSExtIdx(-1);
2064 if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
2065 AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
2066 assert(MI.mayLoadOrStore() && "Expected memory operation.");
2067 // If we've found another instruction with the same opcode, check to see
2068 // if the base and offset are compatible with our starting instruction.
2069 // These instructions all have scaled immediate operands, so we just
2070 // check for +1/-1. Make sure to check the new instruction offset is
2071 // actually an immediate and not a symbolic reference destined for
2072 // a relocation.
2073 Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
2074 int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
2075 bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
2076 if (IsUnscaled != MIIsUnscaled) {
2077 // We're trying to pair instructions that differ in how they are scaled.
2078 // If FirstMI is scaled then scale the offset of MI accordingly.
2079 // Otherwise, do the opposite (i.e., make MI's offset unscaled).
2080 int MemSize = TII->getMemScale(MI);
2081 if (MIIsUnscaled) {
2082 // If the unscaled offset isn't a multiple of the MemSize, we can't
2083 // pair the operations together: bail and keep looking.
2084 if (MIOffset % MemSize) {
2085 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2086 UsedRegUnits, TRI);
2087 MemInsns.push_back(Elt: &MI);
2088 continue;
2089 }
2090 MIOffset /= MemSize;
2091 } else {
2092 MIOffset *= MemSize;
2093 }
2094 }
2095
2096 bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);
2097
2098 if (BaseReg == MIBaseReg) {
2099 // If the offset of the second ld/st is not equal to the size of the
2100 // destination register it can’t be paired with a pre-index ld/st
2101 // pair. Additionally if the base reg is used or modified the operations
2102 // can't be paired: bail and keep looking.
2103 if (IsPreLdSt) {
2104 bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
2105 bool IsBaseRegUsed = !UsedRegUnits.available(
2106 Reg: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
2107 bool IsBaseRegModified = !ModifiedRegUnits.available(
2108 Reg: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
2109 // If the stored value and the address of the second instruction is
2110 // the same, it needs to be using the updated register and therefore
2111 // it must not be folded.
2112 bool IsMIRegTheSame =
2113 TRI->regsOverlap(RegA: getLdStRegOp(MI).getReg(),
2114 RegB: AArch64InstrInfo::getLdStBaseOp(MI).getReg());
2115 if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
2116 IsMIRegTheSame) {
2117 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2118 UsedRegUnits, TRI);
2119 MemInsns.push_back(Elt: &MI);
2120 continue;
2121 }
2122 } else {
2123 if ((Offset != MIOffset + OffsetStride) &&
2124 (Offset + OffsetStride != MIOffset)) {
2125 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2126 UsedRegUnits, TRI);
2127 MemInsns.push_back(Elt: &MI);
2128 continue;
2129 }
2130 }
2131
2132 int MinOffset = Offset < MIOffset ? Offset : MIOffset;
2133 if (FindNarrowMerge) {
2134 // If the alignment requirements of the scaled wide load/store
2135 // instruction can't express the offset of the scaled narrow input,
2136 // bail and keep looking. For promotable zero stores, allow only when
2137 // the stored value is the same (i.e., WZR).
2138 if ((!IsUnscaled && alignTo(Num: MinOffset, PowOf2: 2) != MinOffset) ||
2139 (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
2140 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2141 UsedRegUnits, TRI);
2142 MemInsns.push_back(Elt: &MI);
2143 continue;
2144 }
2145 } else {
2146 // Pairwise instructions have a 7-bit signed offset field. Single
2147 // insns have a 12-bit unsigned offset field. If the resultant
2148 // immediate offset of merging these instructions is out of range for
2149 // a pairwise instruction, bail and keep looking.
2150 if (!inBoundsForPair(IsUnscaled, Offset: MinOffset, OffsetStride)) {
2151 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2152 UsedRegUnits, TRI);
2153 MemInsns.push_back(Elt: &MI);
2154 LLVM_DEBUG(dbgs() << "Offset doesn't fit in immediate, "
2155 << "keep looking.\n");
2156 continue;
2157 }
2158 // If the alignment requirements of the paired (scaled) instruction
2159 // can't express the offset of the unscaled input, bail and keep
2160 // looking.
2161 if (IsUnscaled && (alignTo(Num: MinOffset, PowOf2: OffsetStride) != MinOffset)) {
2162 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2163 UsedRegUnits, TRI);
2164 MemInsns.push_back(Elt: &MI);
2165 LLVM_DEBUG(dbgs()
2166 << "Offset doesn't fit due to alignment requirements, "
2167 << "keep looking.\n");
2168 continue;
2169 }
2170 }
2171
2172 // If the BaseReg has been modified, then we cannot do the optimization.
2173 // For example, in the following pattern
2174 // ldr x1 [x2]
2175 // ldr x2 [x3]
2176 // ldr x4 [x2, #8],
2177 // the first and third ldr cannot be converted to ldp x1, x4, [x2]
2178 if (!ModifiedRegUnits.available(Reg: BaseReg))
2179 return E;
2180
2181 const bool SameLoadReg = MayLoad && TRI->isSuperOrSubRegisterEq(
2182 RegA: Reg, RegB: getLdStRegOp(MI).getReg());
2183
2184 // If the Rt of the second instruction (destination register of the
2185 // load) was not modified or used between the two instructions and none
2186 // of the instructions between the second and first alias with the
2187 // second, we can combine the second into the first.
2188 bool RtNotModified =
2189 ModifiedRegUnits.available(Reg: getLdStRegOp(MI).getReg());
2190 bool RtNotUsed = !(MI.mayLoad() && !SameLoadReg &&
2191 !UsedRegUnits.available(Reg: getLdStRegOp(MI).getReg()));
2192
2193 LLVM_DEBUG(dbgs() << "Checking, can combine 2nd into 1st insn:\n"
2194 << "Reg '" << getLdStRegOp(MI) << "' not modified: "
2195 << (RtNotModified ? "true" : "false") << "\n"
2196 << "Reg '" << getLdStRegOp(MI) << "' not used: "
2197 << (RtNotUsed ? "true" : "false") << "\n");
2198
2199 if (RtNotModified && RtNotUsed && !mayAlias(MIa&: MI, MemInsns, AA)) {
2200 // For pairs loading into the same reg, try to find a renaming
2201 // opportunity to allow the renaming of Reg between FirstMI and MI
2202 // and combine MI into FirstMI; otherwise bail and keep looking.
2203 if (SameLoadReg) {
2204 std::optional<MCPhysReg> RenameReg =
2205 findRenameRegForSameLdStRegPair(MaybeCanRename, FirstMI, MI,
2206 Reg, DefinedInBB, UsedInBetween,
2207 RequiredClasses, TRI);
2208 if (!RenameReg) {
2209 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
2210 UsedRegUnits, TRI);
2211 MemInsns.push_back(Elt: &MI);
2212 LLVM_DEBUG(dbgs() << "Can't find reg for renaming, "
2213 << "keep looking.\n");
2214 continue;
2215 }
2216 Flags.setRenameReg(*RenameReg);
2217 }
2218
2219 Flags.setMergeForward(false);
2220 if (!SameLoadReg)
2221 Flags.clearRenameReg();
2222 return MBBI;
2223 }
2224
2225 // Likewise, if the Rt of the first instruction is not modified or used
2226 // between the two instructions and none of the instructions between the
2227 // first and the second alias with the first, we can combine the first
2228 // into the second.
2229 RtNotModified = !(
2230 MayLoad && !UsedRegUnits.available(Reg: getLdStRegOp(MI&: FirstMI).getReg()));
2231
2232 LLVM_DEBUG(dbgs() << "Checking, can combine 1st into 2nd insn:\n"
2233 << "Reg '" << getLdStRegOp(FirstMI)
2234 << "' not modified: "
2235 << (RtNotModified ? "true" : "false") << "\n");
2236
2237 if (RtNotModified && !mayAlias(MIa&: FirstMI, MemInsns, AA)) {
2238 if (ModifiedRegUnits.available(Reg: getLdStRegOp(MI&: FirstMI).getReg())) {
2239 Flags.setMergeForward(true);
2240 Flags.clearRenameReg();
2241 return MBBI;
2242 }
2243
2244 std::optional<MCPhysReg> RenameReg = findRenameRegForSameLdStRegPair(
2245 MaybeCanRename, FirstMI, MI, Reg, DefinedInBB, UsedInBetween,
2246 RequiredClasses, TRI);
2247 if (RenameReg) {
2248 Flags.setMergeForward(true);
2249 Flags.setRenameReg(*RenameReg);
2250 MBBIWithRenameReg = MBBI;
2251 }
2252 }
2253 LLVM_DEBUG(dbgs() << "Unable to combine these instructions due to "
2254 << "interference in between, keep looking.\n");
2255 }
2256 }
2257
2258 if (Flags.getRenameReg())
2259 return MBBIWithRenameReg;
2260
2261 // If the instruction wasn't a matching load or store. Stop searching if we
2262 // encounter a call instruction that might modify memory.
2263 if (MI.isCall()) {
2264 LLVM_DEBUG(dbgs() << "Found a call, stop looking.\n");
2265 return E;
2266 }
2267
2268 // Update modified / uses register units.
2269 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2270
2271 // Otherwise, if the base register is modified, we have no match, so
2272 // return early.
2273 if (!ModifiedRegUnits.available(Reg: BaseReg)) {
2274 LLVM_DEBUG(dbgs() << "Base reg is modified, stop looking.\n");
2275 return E;
2276 }
2277
2278 // Update list of instructions that read/write memory.
2279 if (MI.mayLoadOrStore())
2280 MemInsns.push_back(Elt: &MI);
2281 }
2282 return E;
2283}
2284
2285static MachineBasicBlock::iterator
2286maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
2287 assert((MI.getOpcode() == AArch64::SUBXri ||
2288 MI.getOpcode() == AArch64::ADDXri) &&
2289 "Expected a register update instruction");
2290 auto End = MI.getParent()->end();
2291 if (MaybeCFI == End ||
2292 MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
2293 !(MI.getFlag(Flag: MachineInstr::FrameSetup) ||
2294 MI.getFlag(Flag: MachineInstr::FrameDestroy)) ||
2295 MI.getOperand(i: 0).getReg() != AArch64::SP)
2296 return End;
2297
2298 const MachineFunction &MF = *MI.getParent()->getParent();
2299 unsigned CFIIndex = MaybeCFI->getOperand(i: 0).getCFIIndex();
2300 const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
2301 switch (CFI.getOperation()) {
2302 case MCCFIInstruction::OpDefCfa:
2303 case MCCFIInstruction::OpDefCfaOffset:
2304 return MaybeCFI;
2305 default:
2306 return End;
2307 }
2308}
2309
2310std::optional<MachineBasicBlock::iterator> AArch64LoadStoreOpt::mergeUpdateInsn(
2311 MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update,
2312 bool IsForward, bool IsPreIdx, bool MergeEither) {
2313 assert((Update->getOpcode() == AArch64::ADDXri ||
2314 Update->getOpcode() == AArch64::SUBXri) &&
2315 "Unexpected base register update instruction to merge!");
2316 MachineBasicBlock::iterator E = I->getParent()->end();
2317 MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
2318
2319 // If updating the SP and the following instruction is CFA offset related CFI,
2320 // make sure the CFI follows the SP update either by merging at the location
2321 // of the update or by moving the CFI after the merged instruction. If unable
2322 // to do so, bail.
2323 MachineBasicBlock::iterator InsertPt = I;
2324 if (IsForward) {
2325 assert(IsPreIdx);
2326 if (auto CFI = maybeMoveCFI(MI&: *Update, MaybeCFI: next_nodbg(It: Update, End: E)); CFI != E) {
2327 if (MergeEither) {
2328 InsertPt = Update;
2329 } else {
2330 // Take care not to reorder CFIs.
2331 if (std::any_of(first: std::next(x: CFI), last: I, pred: [](const auto &Insn) {
2332 return Insn.getOpcode() == TargetOpcode::CFI_INSTRUCTION;
2333 }))
2334 return std::nullopt;
2335
2336 MachineBasicBlock *MBB = InsertPt->getParent();
2337 MBB->splice(Where: std::next(x: InsertPt), Other: MBB, From: CFI);
2338 }
2339 }
2340 }
2341
2342 // Return the instruction following the merged instruction, which is
2343 // the instruction following our unmerged load. Unless that's the add/sub
2344 // instruction we're merging, in which case it's the one after that.
2345 if (NextI == Update)
2346 NextI = next_nodbg(It: NextI, End: E);
2347
2348 int Value = Update->getOperand(i: 2).getImm();
2349 assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
2350 "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
2351 if (Update->getOpcode() == AArch64::SUBXri)
2352 Value = -Value;
2353
2354 unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(Opc: I->getOpcode())
2355 : getPostIndexedOpcode(Opc: I->getOpcode());
2356 MachineInstrBuilder MIB;
2357 int Scale, MinOffset, MaxOffset;
2358 getPrePostIndexedMemOpInfo(MI: *I, Scale, MinOffset, MaxOffset);
2359 if (!AArch64InstrInfo::isPairedLdSt(MI: *I)) {
2360 // Non-paired instruction.
2361 MIB = BuildMI(BB&: *InsertPt->getParent(), I: InsertPt, MIMD: InsertPt->getDebugLoc(),
2362 MCID: TII->get(Opcode: NewOpc))
2363 .add(MO: Update->getOperand(i: 0))
2364 .add(MO: getLdStRegOp(MI&: *I))
2365 .add(MO: AArch64InstrInfo::getLdStBaseOp(MI: *I))
2366 .addImm(Val: Value / Scale)
2367 .setMemRefs(I->memoperands())
2368 .setMIFlags(I->mergeFlagsWith(Other: *Update));
2369 } else {
2370 // Paired instruction.
2371 MIB = BuildMI(BB&: *InsertPt->getParent(), I: InsertPt, MIMD: InsertPt->getDebugLoc(),
2372 MCID: TII->get(Opcode: NewOpc))
2373 .add(MO: Update->getOperand(i: 0))
2374 .add(MO: getLdStRegOp(MI&: *I, PairedRegOp: 0))
2375 .add(MO: getLdStRegOp(MI&: *I, PairedRegOp: 1))
2376 .add(MO: AArch64InstrInfo::getLdStBaseOp(MI: *I))
2377 .addImm(Val: Value / Scale)
2378 .setMemRefs(I->memoperands())
2379 .setMIFlags(I->mergeFlagsWith(Other: *Update));
2380 }
2381
2382 if (IsPreIdx) {
2383 ++NumPreFolded;
2384 LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
2385 } else {
2386 ++NumPostFolded;
2387 LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
2388 }
2389 LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
2390 LLVM_DEBUG(I->print(dbgs()));
2391 LLVM_DEBUG(dbgs() << " ");
2392 LLVM_DEBUG(Update->print(dbgs()));
2393 LLVM_DEBUG(dbgs() << " with instruction:\n ");
2394 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
2395 LLVM_DEBUG(dbgs() << "\n");
2396
2397 // Erase the old instructions for the block.
2398 I->eraseFromParent();
2399 Update->eraseFromParent();
2400
2401 return NextI;
2402}
2403
2404MachineBasicBlock::iterator
2405AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
2406 MachineBasicBlock::iterator Update,
2407 unsigned Offset, int Scale) {
2408 assert((Update->getOpcode() == AArch64::MOVKWi) &&
2409 "Unexpected const mov instruction to merge!");
2410 MachineBasicBlock::iterator E = I->getParent()->end();
2411 MachineBasicBlock::iterator NextI = next_nodbg(It: I, End: E);
2412 MachineBasicBlock::iterator PrevI = prev_nodbg(It: Update, Begin: E);
2413 MachineInstr &MemMI = *I;
2414 unsigned Mask = (1 << 12) * Scale - 1;
2415 unsigned Low = Offset & Mask;
2416 unsigned High = Offset - Low;
2417 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
2418 Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getReg();
2419 MachineInstrBuilder AddMIB, MemMIB;
2420
2421 // Add IndexReg, BaseReg, High (the BaseReg may be SP)
2422 AddMIB =
2423 BuildMI(BB&: *I->getParent(), I, MIMD: I->getDebugLoc(), MCID: TII->get(Opcode: AArch64::ADDXri))
2424 .addDef(RegNo: IndexReg)
2425 .addUse(RegNo: BaseReg)
2426 .addImm(Val: High >> 12) // shifted value
2427 .addImm(Val: 12); // shift 12
2428 (void)AddMIB;
2429 // Ld/St DestReg, IndexReg, Imm12
2430 unsigned NewOpc = getBaseAddressOpcode(Opc: I->getOpcode());
2431 MemMIB = BuildMI(BB&: *I->getParent(), I, MIMD: I->getDebugLoc(), MCID: TII->get(Opcode: NewOpc))
2432 .add(MO: getLdStRegOp(MI&: MemMI))
2433 .add(MO: AArch64InstrInfo::getLdStOffsetOp(MI: MemMI))
2434 .addImm(Val: Low / Scale)
2435 .setMemRefs(I->memoperands())
2436 .setMIFlags(I->mergeFlagsWith(Other: *Update));
2437 (void)MemMIB;
2438
2439 ++NumConstOffsetFolded;
2440 LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
2441 LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
2442 LLVM_DEBUG(PrevI->print(dbgs()));
2443 LLVM_DEBUG(dbgs() << " ");
2444 LLVM_DEBUG(Update->print(dbgs()));
2445 LLVM_DEBUG(dbgs() << " ");
2446 LLVM_DEBUG(I->print(dbgs()));
2447 LLVM_DEBUG(dbgs() << " with instruction:\n ");
2448 LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs()));
2449 LLVM_DEBUG(dbgs() << " ");
2450 LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs()));
2451 LLVM_DEBUG(dbgs() << "\n");
2452
2453 // Erase the old instructions for the block.
2454 I->eraseFromParent();
2455 PrevI->eraseFromParent();
2456 Update->eraseFromParent();
2457
2458 return NextI;
2459}
2460
2461bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
2462 MachineInstr &MI,
2463 unsigned BaseReg, int Offset) {
2464 switch (MI.getOpcode()) {
2465 default:
2466 break;
2467 case AArch64::SUBXri:
2468 case AArch64::ADDXri:
2469 // Make sure it's a vanilla immediate operand, not a relocation or
2470 // anything else we can't handle.
2471 if (!MI.getOperand(i: 2).isImm())
2472 break;
2473 // Watch out for 1 << 12 shifted value.
2474 if (AArch64_AM::getShiftValue(Imm: MI.getOperand(i: 3).getImm()))
2475 break;
2476
2477 // The update instruction source and destination register must be the
2478 // same as the load/store base register.
2479 if (MI.getOperand(i: 0).getReg() != BaseReg ||
2480 MI.getOperand(i: 1).getReg() != BaseReg)
2481 break;
2482
2483 int UpdateOffset = MI.getOperand(i: 2).getImm();
2484 if (MI.getOpcode() == AArch64::SUBXri)
2485 UpdateOffset = -UpdateOffset;
2486
2487 // The immediate must be a multiple of the scaling factor of the pre/post
2488 // indexed instruction.
2489 int Scale, MinOffset, MaxOffset;
2490 getPrePostIndexedMemOpInfo(MI: MemMI, Scale, MinOffset, MaxOffset);
2491 if (UpdateOffset % Scale != 0)
2492 break;
2493
2494 // Scaled offset must fit in the instruction immediate.
2495 int ScaledOffset = UpdateOffset / Scale;
2496 if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
2497 break;
2498
2499 // If we have a non-zero Offset, we check that it matches the amount
2500 // we're adding to the register.
2501 if (!Offset || Offset == UpdateOffset)
2502 return true;
2503 break;
2504 }
2505 return false;
2506}
2507
2508bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
2509 MachineInstr &MI,
2510 unsigned IndexReg,
2511 unsigned &Offset) {
2512 // The update instruction source and destination register must be the
2513 // same as the load/store index register.
2514 if (MI.getOpcode() == AArch64::MOVKWi &&
2515 TRI->isSuperOrSubRegisterEq(RegA: IndexReg, RegB: MI.getOperand(i: 1).getReg())) {
2516
2517 // movz + movk hold a large offset of a Ld/St instruction.
2518 MachineBasicBlock::iterator B = MI.getParent()->begin();
2519 MachineBasicBlock::iterator MBBI = &MI;
2520 // Skip the scene when the MI is the first instruction of a block.
2521 if (MBBI == B)
2522 return false;
2523 MBBI = prev_nodbg(It: MBBI, Begin: B);
2524 MachineInstr &MovzMI = *MBBI;
2525 // Make sure the MOVKWi and MOVZWi set the same register.
2526 if (MovzMI.getOpcode() == AArch64::MOVZWi &&
2527 MovzMI.getOperand(i: 0).getReg() == MI.getOperand(i: 0).getReg()) {
2528 unsigned Low = MovzMI.getOperand(i: 1).getImm();
2529 unsigned High = MI.getOperand(i: 2).getImm() << MI.getOperand(i: 3).getImm();
2530 Offset = High + Low;
2531 // 12-bit optionally shifted immediates are legal for adds.
2532 return Offset >> 24 == 0;
2533 }
2534 }
2535 return false;
2536}
2537
2538MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
2539 MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
2540 MachineBasicBlock::iterator E = I->getParent()->end();
2541 MachineInstr &MemMI = *I;
2542 MachineBasicBlock::iterator MBBI = I;
2543
2544 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
2545 int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getImm() *
2546 TII->getMemScale(MI: MemMI);
2547
2548 // Scan forward looking for post-index opportunities. Updating instructions
2549 // can't be formed if the memory instruction doesn't have the offset we're
2550 // looking for.
2551 if (MIUnscaledOffset != UnscaledOffset)
2552 return E;
2553
2554 // If the base register overlaps a source/destination register, we can't
2555 // merge the update. This does not apply to tag store instructions which
2556 // ignore the address part of the source register.
2557 // This does not apply to STGPi as well, which does not have unpredictable
2558 // behavior in this case unlike normal stores, and always performs writeback
2559 // after reading the source register value.
2560 if (!isTagStore(MI: MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
2561 bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MI: MemMI);
2562 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
2563 Register DestReg = getLdStRegOp(MI&: MemMI, PairedRegOp: i).getReg();
2564 if (DestReg == BaseReg || TRI->isSubRegister(RegA: BaseReg, RegB: DestReg))
2565 return E;
2566 }
2567 }
2568
2569 // Track which register units have been modified and used between the first
2570 // insn (inclusive) and the second insn.
2571 ModifiedRegUnits.clear();
2572 UsedRegUnits.clear();
2573 MBBI = next_nodbg(It: MBBI, End: E);
2574
2575 // We can't post-increment the stack pointer if any instruction between
2576 // the memory access (I) and the increment (MBBI) can access the memory
2577 // region defined by [SP, MBBI].
2578 const bool BaseRegSP = BaseReg == AArch64::SP;
2579 if (BaseRegSP && needsWinCFI(MF: I->getMF())) {
2580 // FIXME: For now, we always block the optimization over SP in windows
2581 // targets as it requires to adjust the unwind/debug info, messing up
2582 // the unwind info can actually cause a miscompile.
2583 return E;
2584 }
2585
2586 unsigned Count = 0;
2587 MachineBasicBlock *CurMBB = I->getParent();
2588 // choice of next block to visit is liveins-based
2589 bool VisitSucc = CurMBB->getParent()->getRegInfo().tracksLiveness();
2590
2591 while (true) {
2592 for (MachineBasicBlock::iterator CurEnd = CurMBB->end();
2593 MBBI != CurEnd && Count < Limit; MBBI = next_nodbg(It: MBBI, End: CurEnd)) {
2594 MachineInstr &MI = *MBBI;
2595
2596 // Don't count transient instructions towards the search limit since there
2597 // may be different numbers of them if e.g. debug information is present.
2598 if (!MI.isTransient())
2599 ++Count;
2600
2601 // If we found a match, return it.
2602 if (isMatchingUpdateInsn(MemMI&: *I, MI, BaseReg, Offset: UnscaledOffset))
2603 return MBBI;
2604
2605 // Update the status of what the instruction clobbered and used.
2606 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
2607 TRI);
2608
2609 // Otherwise, if the base register is used or modified, we have no match,
2610 // so return early. If we are optimizing SP, do not allow instructions
2611 // that may load or store in between the load and the optimized value
2612 // update.
2613 if (!ModifiedRegUnits.available(Reg: BaseReg) ||
2614 !UsedRegUnits.available(Reg: BaseReg) ||
2615 (BaseRegSP && MBBI->mayLoadOrStore()))
2616 return E;
2617 }
2618
2619 if (!VisitSucc || Limit <= Count)
2620 break;
2621
2622 // Try to go downward to successors along a CF path w/o side enters
2623 // such that BaseReg is alive along it but not at its exits
2624 MachineBasicBlock *SuccToVisit = nullptr;
2625 unsigned LiveSuccCount = 0;
2626 for (MachineBasicBlock *Succ : CurMBB->successors()) {
2627 for (MCRegAliasIterator AI(BaseReg, TRI, true); AI.isValid(); ++AI) {
2628 if (Succ->isLiveIn(Reg: *AI)) {
2629 if (LiveSuccCount++)
2630 return E;
2631 if (Succ->pred_size() == 1)
2632 SuccToVisit = Succ;
2633 break;
2634 }
2635 }
2636 }
2637 if (!SuccToVisit)
2638 break;
2639 CurMBB = SuccToVisit;
2640 MBBI = CurMBB->begin();
2641 }
2642
2643 return E;
2644}
2645
2646MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
2647 MachineBasicBlock::iterator I, unsigned Limit, bool &MergeEither) {
2648 MachineBasicBlock::iterator B = I->getParent()->begin();
2649 MachineBasicBlock::iterator E = I->getParent()->end();
2650 MachineInstr &MemMI = *I;
2651 MachineBasicBlock::iterator MBBI = I;
2652 MachineFunction &MF = *MemMI.getMF();
2653
2654 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MI: MemMI).getReg();
2655 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getImm();
2656
2657 bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MI: MemMI);
2658 Register DestReg[] = {getLdStRegOp(MI&: MemMI, PairedRegOp: 0).getReg(),
2659 IsPairedInsn ? getLdStRegOp(MI&: MemMI, PairedRegOp: 1).getReg()
2660 : AArch64::NoRegister};
2661
2662 // If the load/store is the first instruction in the block, there's obviously
2663 // not any matching update. Ditto if the memory offset isn't zero.
2664 if (MBBI == B || Offset != 0)
2665 return E;
2666 // If the base register overlaps a destination register, we can't
2667 // merge the update.
2668 if (!isTagStore(MI: MemMI)) {
2669 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i)
2670 if (DestReg[i] == BaseReg || TRI->isSubRegister(RegA: BaseReg, RegB: DestReg[i]))
2671 return E;
2672 }
2673
2674 const bool BaseRegSP = BaseReg == AArch64::SP;
2675 if (BaseRegSP && needsWinCFI(MF: I->getMF())) {
2676 // FIXME: For now, we always block the optimization over SP in windows
2677 // targets as it requires to adjust the unwind/debug info, messing up
2678 // the unwind info can actually cause a miscompile.
2679 return E;
2680 }
2681
2682 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2683 unsigned RedZoneSize =
2684 Subtarget.getTargetLowering()->getRedZoneSize(F: MF.getFunction());
2685
2686 // Track which register units have been modified and used between the first
2687 // insn (inclusive) and the second insn.
2688 ModifiedRegUnits.clear();
2689 UsedRegUnits.clear();
2690 unsigned Count = 0;
2691 bool MemAccessBeforeSPPreInc = false;
2692 MergeEither = true;
2693 do {
2694 MBBI = prev_nodbg(It: MBBI, Begin: B);
2695 MachineInstr &MI = *MBBI;
2696
2697 // Don't count transient instructions towards the search limit since there
2698 // may be different numbers of them if e.g. debug information is present.
2699 if (!MI.isTransient())
2700 ++Count;
2701
2702 // If we found a match, return it.
2703 if (isMatchingUpdateInsn(MemMI&: *I, MI, BaseReg, Offset)) {
2704 // Check that the update value is within our red zone limit (which may be
2705 // zero).
2706 if (MemAccessBeforeSPPreInc && MBBI->getOperand(i: 2).getImm() > RedZoneSize)
2707 return E;
2708 return MBBI;
2709 }
2710
2711 // Update the status of what the instruction clobbered and used.
2712 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2713
2714 // Otherwise, if the base register is used or modified, we have no match, so
2715 // return early.
2716 if (!ModifiedRegUnits.available(Reg: BaseReg) ||
2717 !UsedRegUnits.available(Reg: BaseReg))
2718 return E;
2719
2720 // If we have a destination register (i.e. a load instruction) and a
2721 // destination register is used or modified, then we can only merge forward,
2722 // i.e. the combined instruction is put in the place of the memory
2723 // instruction. Same applies if we see a memory access or side effects.
2724 if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects() ||
2725 (DestReg[0] != AArch64::NoRegister &&
2726 !(ModifiedRegUnits.available(Reg: DestReg[0]) &&
2727 UsedRegUnits.available(Reg: DestReg[0]))) ||
2728 (DestReg[1] != AArch64::NoRegister &&
2729 !(ModifiedRegUnits.available(Reg: DestReg[1]) &&
2730 UsedRegUnits.available(Reg: DestReg[1]))))
2731 MergeEither = false;
2732
2733 // Keep track if we have a memory access before an SP pre-increment, in this
2734 // case we need to validate later that the update amount respects the red
2735 // zone.
2736 if (BaseRegSP && MBBI->mayLoadOrStore())
2737 MemAccessBeforeSPPreInc = true;
2738 } while (MBBI != B && Count < Limit);
2739 return E;
2740}
2741
2742MachineBasicBlock::iterator
2743AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
2744 MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
2745 MachineBasicBlock::iterator B = I->getParent()->begin();
2746 MachineBasicBlock::iterator E = I->getParent()->end();
2747 MachineInstr &MemMI = *I;
2748 MachineBasicBlock::iterator MBBI = I;
2749
2750 // If the load is the first instruction in the block, there's obviously
2751 // not any matching load or store.
2752 if (MBBI == B)
2753 return E;
2754
2755 // Make sure the IndexReg is killed and the shift amount is zero.
2756 // TODO: Relex this restriction to extend, simplify processing now.
2757 if (!AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).isKill() ||
2758 !AArch64InstrInfo::getLdStAmountOp(MI: MemMI).isImm() ||
2759 (AArch64InstrInfo::getLdStAmountOp(MI: MemMI).getImm() != 0))
2760 return E;
2761
2762 Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MI: MemMI).getReg();
2763
2764 // Track which register units have been modified and used between the first
2765 // insn (inclusive) and the second insn.
2766 ModifiedRegUnits.clear();
2767 UsedRegUnits.clear();
2768 unsigned Count = 0;
2769 do {
2770 MBBI = prev_nodbg(It: MBBI, Begin: B);
2771 MachineInstr &MI = *MBBI;
2772
2773 // Don't count transient instructions towards the search limit since there
2774 // may be different numbers of them if e.g. debug information is present.
2775 if (!MI.isTransient())
2776 ++Count;
2777
2778 // If we found a match, return it.
2779 if (isMatchingMovConstInsn(MemMI&: *I, MI, IndexReg, Offset)) {
2780 return MBBI;
2781 }
2782
2783 // Update the status of what the instruction clobbered and used.
2784 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2785
2786 // Otherwise, if the index register is used or modified, we have no match,
2787 // so return early.
2788 if (!ModifiedRegUnits.available(Reg: IndexReg) ||
2789 !UsedRegUnits.available(Reg: IndexReg))
2790 return E;
2791
2792 } while (MBBI != B && Count < Limit);
2793 return E;
2794}
2795
2796bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
2797 MachineBasicBlock::iterator &MBBI) {
2798 MachineInstr &MI = *MBBI;
2799 // If this is a volatile load, don't mess with it.
2800 if (MI.hasOrderedMemoryRef())
2801 return false;
2802
2803 if (needsWinCFI(MF: MI.getMF()) && MI.getFlag(Flag: MachineInstr::FrameDestroy))
2804 return false;
2805
2806 // Make sure this is a reg+imm.
2807 // FIXME: It is possible to extend it to handle reg+reg cases.
2808 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
2809 return false;
2810
2811 // Look backward up to LdStLimit instructions.
2812 MachineBasicBlock::iterator StoreI;
2813 if (findMatchingStore(I: MBBI, Limit: LdStLimit, StoreI)) {
2814 ++NumLoadsFromStoresPromoted;
2815 // Promote the load. Keeping the iterator straight is a
2816 // pain, so we let the merge routine tell us what the next instruction
2817 // is after it's done mucking about.
2818 MBBI = promoteLoadFromStore(LoadI: MBBI, StoreI);
2819 return true;
2820 }
2821 return false;
2822}
2823
2824// Merge adjacent zero stores into a wider store.
2825bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
2826 MachineBasicBlock::iterator &MBBI) {
2827 assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
2828 MachineInstr &MI = *MBBI;
2829 MachineBasicBlock::iterator E = MI.getParent()->end();
2830
2831 if (!TII->isCandidateToMergeOrPair(MI))
2832 return false;
2833
2834 // Look ahead up to LdStLimit instructions for a mergeable instruction.
2835 LdStPairFlags Flags;
2836 MachineBasicBlock::iterator MergeMI =
2837 findMatchingInsn(I: MBBI, Flags, Limit: LdStLimit, /* FindNarrowMerge = */ true);
2838 if (MergeMI != E) {
2839 ++NumZeroStoresPromoted;
2840
2841 // Keeping the iterator straight is a pain, so we let the merge routine tell
2842 // us what the next instruction is after it's done mucking about.
2843 MBBI = mergeNarrowZeroStores(I: MBBI, MergeMI, Flags);
2844 return true;
2845 }
2846 return false;
2847}
2848
2849// Find loads and stores that can be merged into a single load or store pair
2850// instruction.
2851bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
2852 MachineInstr &MI = *MBBI;
2853 MachineBasicBlock::iterator E = MI.getParent()->end();
2854
2855 if (!TII->isCandidateToMergeOrPair(MI))
2856 return false;
2857
2858 // If disable-ldp feature is opted, do not emit ldp.
2859 if (MI.mayLoad() && Subtarget->hasDisableLdp())
2860 return false;
2861
2862 // If disable-stp feature is opted, do not emit stp.
2863 if (MI.mayStore() && Subtarget->hasDisableStp())
2864 return false;
2865
2866 // Early exit if the offset is not possible to match. (6 bits of positive
2867 // range, plus allow an extra one in case we find a later insn that matches
2868 // with Offset-1)
2869 bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
2870 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
2871 int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
2872 // Allow one more for offset.
2873 if (Offset > 0)
2874 Offset -= OffsetStride;
2875 if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
2876 return false;
2877
2878 // Look ahead up to LdStLimit instructions for a pairable instruction.
2879 LdStPairFlags Flags;
2880 MachineBasicBlock::iterator Paired =
2881 findMatchingInsn(I: MBBI, Flags, Limit: LdStLimit, /* FindNarrowMerge = */ false);
2882
2883 if (Paired == E)
2884 return false;
2885
2886 // Keeping the iterator straight is a pain, so we let the merge routine tell
2887 // us what the next instruction is after it's done mucking about.
2888 auto Prev = std::prev(x: MBBI);
2889
2890 // Fetch the memoperand of the load/store that is a candidate for combination.
2891 MachineMemOperand *MemOp =
2892 MI.memoperands_empty() ? nullptr : MI.memoperands().front();
2893
2894 // If a load/store arrives and ldp/stp-aligned-only feature is opted, check
2895 // that the alignment of the source pointer is at least double the alignment
2896 // of the type.
2897 if ((MI.mayLoad() && Subtarget->hasLdpAlignedOnly()) ||
2898 (MI.mayStore() && Subtarget->hasStpAlignedOnly())) {
2899 // If there is no size/align information, cancel the transformation.
2900 if (!MemOp || !MemOp->getMemoryType().isValid()) {
2901 NumFailedAlignmentCheck++;
2902 return false;
2903 }
2904
2905 // Get the needed alignments to check them if
2906 // ldp-aligned-only/stp-aligned-only features are opted.
2907 uint64_t MemAlignment = MemOp->getAlign().value();
2908 uint64_t TypeAlignment =
2909 Align(MemOp->getSize().getValue().getKnownMinValue()).value();
2910
2911 if (MemAlignment < 2 * TypeAlignment) {
2912 NumFailedAlignmentCheck++;
2913 return false;
2914 }
2915 }
2916
2917 ++NumPairCreated;
2918 if (TII->hasUnscaledLdStOffset(MI))
2919 ++NumUnscaledPairCreated;
2920
2921 MBBI = mergePairedInsns(I: MBBI, Paired, Flags);
2922 // Collect liveness info for instructions between Prev and the new position
2923 // MBBI.
2924 for (auto I = std::next(x: Prev); I != MBBI; I++)
2925 updateDefinedRegisters(MI&: *I, Units&: DefinedInBB, TRI);
2926
2927 return true;
2928}
2929
2930bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
2931 (MachineBasicBlock::iterator &MBBI) {
2932 MachineInstr &MI = *MBBI;
2933 MachineBasicBlock::iterator E = MI.getParent()->end();
2934 MachineBasicBlock::iterator Update;
2935
2936 // Do not form post-inc addressing mode for volatile accesses. Instructions
2937 // performing register writeback do not set a valid instruction syndrome,
2938 // making it impossible to handle MMIO in protected hypervisors.
2939 // Exclude accesses based on the stack pointer, as these can't be MMIO.
2940 // Also exclude MTE tag store instructions.
2941 if (MBBI->hasOrderedMemoryRef() &&
2942 AArch64InstrInfo::getLdStBaseOp(MI).getReg() != AArch64::SP &&
2943 !isTagStore(MI) && MI.getOpcode() != AArch64::STGPi)
2944 return false;
2945
2946 // Look forward to try to form a post-index instruction. For example,
2947 // ldr x0, [x20]
2948 // add x20, x20, #32
2949 // merged into:
2950 // ldr x0, [x20], #32
2951 Update = findMatchingUpdateInsnForward(I: MBBI, UnscaledOffset: 0, Limit: UpdateLimit);
2952 if (Update != E) {
2953 // Merge the update into the ld/st.
2954 if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /*IsForward=*/false,
2955 /*IsPreIdx=*/false,
2956 /*MergeEither=*/false)) {
2957 MBBI = *NextI;
2958 return true;
2959 }
2960 }
2961
2962 // Don't know how to handle unscaled pre/post-index versions below, so bail.
2963 if (TII->hasUnscaledLdStOffset(Opc: MI.getOpcode()))
2964 return false;
2965
2966 // Look back to try to find a pre-index instruction. For example,
2967 // add x0, x0, #8
2968 // ldr x1, [x0]
2969 // merged into:
2970 // ldr x1, [x0, #8]!
2971 bool MergeEither;
2972 Update = findMatchingUpdateInsnBackward(I: MBBI, Limit: UpdateLimit, MergeEither);
2973 if (Update != E) {
2974 // Merge the update into the ld/st.
2975 if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /*IsForward=*/true,
2976 /*IsPreIdx=*/true, MergeEither)) {
2977 MBBI = *NextI;
2978 return true;
2979 }
2980 }
2981
2982 // The immediate in the load/store is scaled by the size of the memory
2983 // operation. The immediate in the add we're looking for,
2984 // however, is not, so adjust here.
2985 int UnscaledOffset =
2986 AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
2987
2988 // Look forward to try to find a pre-index instruction. For example,
2989 // ldr x1, [x0, #64]
2990 // add x0, x0, #64
2991 // merged into:
2992 // ldr x1, [x0, #64]!
2993 Update = findMatchingUpdateInsnForward(I: MBBI, UnscaledOffset, Limit: UpdateLimit);
2994 if (Update != E) {
2995 // Merge the update into the ld/st.
2996 if (auto NextI = mergeUpdateInsn(I: MBBI, Update, /*IsForward=*/false,
2997 /*IsPreIdx=*/true,
2998 /*MergeEither=*/false)) {
2999 MBBI = *NextI;
3000 return true;
3001 }
3002 }
3003
3004 return false;
3005}
3006
3007bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
3008 int Scale) {
3009 MachineInstr &MI = *MBBI;
3010 MachineBasicBlock::iterator E = MI.getParent()->end();
3011 MachineBasicBlock::iterator Update;
3012
3013 // Don't know how to handle unscaled pre/post-index versions below, so bail.
3014 if (TII->hasUnscaledLdStOffset(Opc: MI.getOpcode()))
3015 return false;
3016
3017 // Look back to try to find a const offset for index LdSt instruction. For
3018 // example,
3019 // mov x8, #LargeImm ; = a * (1<<12) + imm12
3020 // ldr x1, [x0, x8]
3021 // merged into:
3022 // add x8, x0, a * (1<<12)
3023 // ldr x1, [x8, imm12]
3024 unsigned Offset;
3025 Update = findMatchingConstOffsetBackward(I: MBBI, Limit: LdStConstLimit, Offset);
3026 if (Update != E && (Offset & (Scale - 1)) == 0) {
3027 // Merge the imm12 into the ld/st.
3028 MBBI = mergeConstOffsetInsn(I: MBBI, Update, Offset, Scale);
3029 return true;
3030 }
3031
3032 return false;
3033}
3034
3035// Map a GPR store opcode to its FPR equivalent at the same data width.
3036// Returns 0 if no mapping exists.
3037static unsigned getGPRToFPRStoreOpcode(unsigned GPRStoreOpc) {
3038 switch (GPRStoreOpc) {
3039 // Unsigned immediate.
3040 case AArch64::STRBBui:
3041 return AArch64::STRBui;
3042 case AArch64::STRHHui:
3043 return AArch64::STRHui;
3044 case AArch64::STRWui:
3045 return AArch64::STRSui;
3046 case AArch64::STRXui:
3047 return AArch64::STRDui;
3048 // Unscaled immediate.
3049 case AArch64::STURBBi:
3050 return AArch64::STURBi;
3051 case AArch64::STURHHi:
3052 return AArch64::STURHi;
3053 case AArch64::STURWi:
3054 return AArch64::STURSi;
3055 case AArch64::STURXi:
3056 return AArch64::STURDi;
3057 // Register offset.
3058 case AArch64::STRBBroW:
3059 return AArch64::STRBroW;
3060 case AArch64::STRBBroX:
3061 return AArch64::STRBroX;
3062 case AArch64::STRHHroW:
3063 return AArch64::STRHroW;
3064 case AArch64::STRHHroX:
3065 return AArch64::STRHroX;
3066 case AArch64::STRWroW:
3067 return AArch64::STRSroW;
3068 case AArch64::STRWroX:
3069 return AArch64::STRSroX;
3070 case AArch64::STRXroW:
3071 return AArch64::STRDroW;
3072 case AArch64::STRXroX:
3073 return AArch64::STRDroX;
3074 default:
3075 return 0;
3076 }
3077}
3078
3079// Given a UMOV-lane-0 opcode, return the sub-register index to extract from
3080// the vector register, or 0 if the opcode is not a supported UMOV.
3081static unsigned getUMOVSubRegIdx(unsigned UMOVOpc) {
3082 switch (UMOVOpc) {
3083 case AArch64::UMOVvi8_idx0:
3084 return AArch64::bsub;
3085 case AArch64::UMOVvi16_idx0:
3086 return AArch64::hsub;
3087 case AArch64::UMOVvi32_idx0:
3088 return AArch64::ssub;
3089 case AArch64::UMOVvi64_idx0:
3090 return AArch64::dsub;
3091 default:
3092 return 0;
3093 }
3094}
3095
3096bool AArch64LoadStoreOpt::tryToReplaceUMOVStore(
3097 MachineBasicBlock::iterator &MBBI) {
3098 MachineInstr &StoreMI = *MBBI;
3099
3100 unsigned FPRStoreOpc = getGPRToFPRStoreOpcode(GPRStoreOpc: StoreMI.getOpcode());
3101 if (!FPRStoreOpc)
3102 return false;
3103
3104 if (StoreMI.hasOrderedMemoryRef() || StoreMI.memoperands().size() != 1)
3105 return false;
3106
3107 MachineBasicBlock *MBB = StoreMI.getParent();
3108 MCPhysReg StoreValReg = StoreMI.getOperand(i: 0).getReg();
3109
3110 if (!StoreMI.getOperand(i: 0).isKill())
3111 return false;
3112
3113 // Bail out if the store uses the value register elsewhere (e.g., as the base
3114 // address in `str w8, [x8, #0]`).
3115 for (unsigned I = 1, E = StoreMI.getNumExplicitOperands(); I < E; ++I)
3116 if (StoreMI.getOperand(i: I).isReg() &&
3117 TRI->regsOverlap(RegA: StoreMI.getOperand(i: I).getReg(), RegB: StoreValReg))
3118 return false;
3119
3120 // Scan backward to find the UMOV that defines the store's value register.
3121 MachineInstr *UMOVMI = nullptr;
3122 MachineBasicBlock::iterator B = MBB->begin();
3123 unsigned SubRegIdx = 0;
3124 unsigned Count = 0;
3125 for (auto It = MBBI; It != B;) {
3126 MachineInstr &MI = *--It;
3127 if (MI.isDebugInstr())
3128 continue;
3129 if (++Count > UMOVFoldLimit)
3130 return false;
3131 if (MI.readsRegister(Reg: StoreValReg, TRI))
3132 return false;
3133 if (MI.modifiesRegister(Reg: StoreValReg, TRI)) {
3134 SubRegIdx = getUMOVSubRegIdx(UMOVOpc: MI.getOpcode());
3135 if (!SubRegIdx)
3136 return false;
3137 UMOVMI = &MI;
3138 break;
3139 }
3140 }
3141 if (!UMOVMI)
3142 return false;
3143 MCPhysReg VecReg = UMOVMI->getOperand(i: 1).getReg();
3144 MCPhysReg FPRReg = TRI->getSubReg(Reg: VecReg, Idx: SubRegIdx);
3145 if ((*StoreMI.memoperands_begin())->getSizeInBits() !=
3146 TRI->getRegSizeInBits(RC: *TRI->getMinimalPhysRegClass(Reg: FPRReg)))
3147 return false;
3148
3149 // Check that no instruction between UMOV and store clobbers the vector
3150 // register. Also track whether VecReg is killed anywhere from the UMOV
3151 // (inclusive) through the intervening instructions -- we need this to decide
3152 // whether the FPR sub-register can be marked killed on the new store.
3153 bool VecRegKilled = UMOVMI->killsRegister(Reg: VecReg, TRI);
3154 for (auto It = std::next(x: UMOVMI->getIterator()); It != MBBI; ++It) {
3155 if (It->modifiesRegister(Reg: VecReg, TRI))
3156 return false;
3157 if (!VecRegKilled && It->killsRegister(Reg: VecReg, TRI))
3158 VecRegKilled = true;
3159 }
3160
3161 // Safe to proceed. Clear kill flags on the vector register between UMOV and
3162 // the new store so the FPR sub-register stays live.
3163 UMOVMI->clearRegisterKills(Reg: VecReg, RegInfo: TRI);
3164 for (auto It = std::next(x: UMOVMI->getIterator()); It != MBBI; ++It)
3165 It->clearRegisterKills(Reg: VecReg, RegInfo: TRI);
3166
3167 LLVM_DEBUG(dbgs() << "Folding UMOV + store: " << *UMOVMI << " + "
3168 << StoreMI);
3169
3170 auto MIB = BuildMI(BB&: *MBB, I: MBBI, MIMD: StoreMI.getDebugLoc(), MCID: TII->get(Opcode: FPRStoreOpc))
3171 .addReg(RegNo: FPRReg, Flags: getKillRegState(B: VecRegKilled));
3172 for (unsigned I = 1, E = StoreMI.getNumExplicitOperands(); I < E; ++I)
3173 MIB.add(MO: StoreMI.getOperand(i: I));
3174 MIB.setMemRefs(StoreMI.memoperands());
3175
3176 MBBI = MBB->erase(I: MBBI);
3177 UMOVMI->eraseFromParent();
3178
3179 ++NumUMOVFoldedToFPRStore;
3180 return true;
3181}
3182
3183bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
3184 bool EnableNarrowZeroStOpt) {
3185 AArch64FunctionInfo &AFI = *MBB.getParent()->getInfo<AArch64FunctionInfo>();
3186
3187 bool Modified = false;
3188 // Six transformations to do here:
3189 // 1) Find loads that directly read from stores and promote them by
3190 // replacing with mov instructions. If the store is wider than the load,
3191 // the load will be replaced with a bitfield extract.
3192 // e.g.,
3193 // str w1, [x0, #4]
3194 // ldrh w2, [x0, #6]
3195 // ; becomes
3196 // str w1, [x0, #4]
3197 // lsr w2, w1, #16
3198 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
3199 MBBI != E;) {
3200 if (isPromotableLoadFromStore(MI&: *MBBI) && tryToPromoteLoadFromStore(MBBI))
3201 Modified = true;
3202 else
3203 ++MBBI;
3204 }
3205 // 2) Merge adjacent zero stores into a wider store.
3206 // e.g.,
3207 // strh wzr, [x0]
3208 // strh wzr, [x0, #2]
3209 // ; becomes
3210 // str wzr, [x0]
3211 // e.g.,
3212 // str wzr, [x0]
3213 // str wzr, [x0, #4]
3214 // ; becomes
3215 // str xzr, [x0]
3216 if (EnableNarrowZeroStOpt)
3217 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
3218 MBBI != E;) {
3219 if (isPromotableZeroStoreInst(MI&: *MBBI) && tryToMergeZeroStInst(MBBI))
3220 Modified = true;
3221 else
3222 ++MBBI;
3223 }
3224 // 3) Find loads and stores that can be merged into a single load or store
3225 // pair instruction.
3226 // When compiling for SVE 128, also try to combine SVE fill/spill
3227 // instructions into LDP/STP.
3228 // e.g.,
3229 // ldr x0, [x2]
3230 // ldr x1, [x2, #8]
3231 // ; becomes
3232 // ldp x0, x1, [x2]
3233 // e.g.,
3234 // ldr z0, [x2]
3235 // ldr z1, [x2, #1, mul vl]
3236 // ; becomes
3237 // ldp q0, q1, [x2]
3238
3239 if (MBB.getParent()->getRegInfo().tracksLiveness()) {
3240 DefinedInBB.clear();
3241 DefinedInBB.addLiveIns(MBB);
3242 }
3243
3244 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
3245 MBBI != E;) {
3246 // Track currently live registers up to this point, to help with
3247 // searching for a rename register on demand.
3248 updateDefinedRegisters(MI&: *MBBI, Units&: DefinedInBB, TRI);
3249 if (TII->isPairableLdStInst(MI: *MBBI) && tryToPairLdStInst(MBBI))
3250 Modified = true;
3251 else
3252 ++MBBI;
3253 }
3254 // 4) Find base register updates that can be merged into the load or store
3255 // as a base-reg writeback.
3256 // e.g.,
3257 // ldr x0, [x2]
3258 // add x2, x2, #4
3259 // ; becomes
3260 // ldr x0, [x2], #4
3261 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
3262 MBBI != E;) {
3263 if (isMergeableLdStUpdate(MI&: *MBBI, AFI) && tryToMergeLdStUpdate(MBBI))
3264 Modified = true;
3265 else
3266 ++MBBI;
3267 }
3268
3269 // 5) Find a register assigned with a const value that can be combined with
3270 // into the load or store. e.g.,
3271 // mov x8, #LargeImm ; = a * (1<<12) + imm12
3272 // ldr x1, [x0, x8]
3273 // ; becomes
3274 // add x8, x0, a * (1<<12)
3275 // ldr x1, [x8, imm12]
3276 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
3277 MBBI != E;) {
3278 int Scale;
3279 if (isMergeableIndexLdSt(MI&: *MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale))
3280 Modified = true;
3281 else
3282 ++MBBI;
3283 }
3284
3285 // 6) Replace UMOV (lane 0) + GPR store with a direct FPR sub-register store.
3286 // e.g.,
3287 // umov w8, v0.h[0]
3288 // strh w8, [x0]
3289 // ; becomes
3290 // str h0, [x0]
3291 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
3292 MBBI != E;) {
3293 if (tryToReplaceUMOVStore(MBBI))
3294 Modified = true;
3295 else
3296 ++MBBI;
3297 }
3298
3299 return Modified;
3300}
3301
3302bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
3303 Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
3304 TII = Subtarget->getInstrInfo();
3305 TRI = Subtarget->getRegisterInfo();
3306
3307 // Resize the modified and used register unit trackers. We do this once
3308 // per function and then clear the register units each time we optimize a load
3309 // or store.
3310 ModifiedRegUnits.init(TRI: *TRI);
3311 UsedRegUnits.init(TRI: *TRI);
3312 DefinedInBB.init(TRI: *TRI);
3313
3314 bool Modified = false;
3315 bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
3316 for (auto &MBB : Fn) {
3317 auto M = optimizeBlock(MBB, EnableNarrowZeroStOpt: enableNarrowZeroStOpt);
3318 Modified |= M;
3319 }
3320
3321 return Modified;
3322}
3323
3324// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
3325// stores near one another? Note: The pre-RA instruction scheduler already has
3326// hooks to try and schedule pairable loads/stores together to improve pairing
3327// opportunities. Thus, pre-RA pairing pass may not be worth the effort.
3328
3329// FIXME: When pairing store instructions it's very possible for this pass to
3330// hoist a store with a KILL marker above another use (without a KILL marker).
3331// The resulting IR is invalid, but nothing uses the KILL markers after this
3332// pass, so it's never caused a problem in practice.
3333
3334bool AArch64LoadStoreOptLegacy::runOnMachineFunction(MachineFunction &MF) {
3335 if (skipFunction(F: MF.getFunction()))
3336 return false;
3337 AArch64LoadStoreOpt Impl;
3338 Impl.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
3339 return Impl.runOnMachineFunction(Fn&: MF);
3340}
3341
3342/// createAArch64LoadStoreOptimizationPass - returns an instance of the
3343/// load / store optimization pass.
3344FunctionPass *llvm::createAArch64LoadStoreOptLegacyPass() {
3345 return new AArch64LoadStoreOptLegacy();
3346}
3347
3348PreservedAnalyses
3349AArch64LoadStoreOptPass::run(MachineFunction &MF,
3350 MachineFunctionAnalysisManager &MFAM) {
3351 AArch64LoadStoreOpt Impl;
3352 Impl.AA = &MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(IR&: MF)
3353 .getManager()
3354 .getResult<AAManager>(IR&: MF.getFunction());
3355 bool Changed = Impl.runOnMachineFunction(Fn&: MF);
3356 if (!Changed)
3357 return PreservedAnalyses::all();
3358 PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
3359 PA.preserveSet<CFGAnalyses>();
3360 return PA;
3361}
3362