1//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the AArch64 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64InstrInfo.h"
14#include "AArch64ExpandImm.h"
15#include "AArch64FrameLowering.h"
16#include "AArch64MachineFunctionInfo.h"
17#include "AArch64PointerAuth.h"
18#include "AArch64Subtarget.h"
19#include "MCTargetDesc/AArch64AddressingModes.h"
20#include "MCTargetDesc/AArch64MCTargetDesc.h"
21#include "Utils/AArch64BaseInfo.h"
22#include "llvm/ADT/ArrayRef.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/SmallVector.h"
25#include "llvm/CodeGen/LivePhysRegs.h"
26#include "llvm/CodeGen/MachineBasicBlock.h"
27#include "llvm/CodeGen/MachineCombinerPattern.h"
28#include "llvm/CodeGen/MachineFrameInfo.h"
29#include "llvm/CodeGen/MachineFunction.h"
30#include "llvm/CodeGen/MachineInstr.h"
31#include "llvm/CodeGen/MachineInstrBuilder.h"
32#include "llvm/CodeGen/MachineMemOperand.h"
33#include "llvm/CodeGen/MachineModuleInfo.h"
34#include "llvm/CodeGen/MachineOperand.h"
35#include "llvm/CodeGen/MachineRegisterInfo.h"
36#include "llvm/CodeGen/RegisterScavenging.h"
37#include "llvm/CodeGen/StackMaps.h"
38#include "llvm/CodeGen/TargetRegisterInfo.h"
39#include "llvm/CodeGen/TargetSubtargetInfo.h"
40#include "llvm/IR/DebugInfoMetadata.h"
41#include "llvm/IR/DebugLoc.h"
42#include "llvm/IR/GlobalValue.h"
43#include "llvm/IR/Module.h"
44#include "llvm/MC/MCAsmInfo.h"
45#include "llvm/MC/MCInst.h"
46#include "llvm/MC/MCInstBuilder.h"
47#include "llvm/MC/MCInstrDesc.h"
48#include "llvm/Support/Casting.h"
49#include "llvm/Support/CodeGen.h"
50#include "llvm/Support/CommandLine.h"
51#include "llvm/Support/ErrorHandling.h"
52#include "llvm/Support/LEB128.h"
53#include "llvm/Support/MathExtras.h"
54#include "llvm/Target/TargetMachine.h"
55#include "llvm/Target/TargetOptions.h"
56#include <cassert>
57#include <cstdint>
58#include <iterator>
59#include <utility>
60
61using namespace llvm;
62
63#define GET_INSTRINFO_CTOR_DTOR
64#include "AArch64GenInstrInfo.inc"
65
66static cl::opt<unsigned> TBZDisplacementBits(
67 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(Val: 14),
68 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
69
70static cl::opt<unsigned> CBZDisplacementBits(
71 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(Val: 19),
72 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
73
74static cl::opt<unsigned>
75 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(Val: 19),
76 cl::desc("Restrict range of Bcc instructions (DEBUG)"));
77
78static cl::opt<unsigned>
79 BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(Val: 26),
80 cl::desc("Restrict range of B instructions (DEBUG)"));
81
82AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
83 : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
84 AArch64::CATCHRET),
85 RI(STI.getTargetTriple()), Subtarget(STI) {}
86
/// getInstSizeInBytes - Return the number of bytes of code the specified
/// instruction may be. This returns the maximum number of bytes.
89unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
90 const MachineBasicBlock &MBB = *MI.getParent();
91 const MachineFunction *MF = MBB.getParent();
92 const Function &F = MF->getFunction();
93 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
94
95 {
96 auto Op = MI.getOpcode();
97 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
98 return getInlineAsmLength(Str: MI.getOperand(i: 0).getSymbolName(), MAI: *MAI);
99 }
100
101 // Meta-instructions emit no code.
102 if (MI.isMetaInstruction())
103 return 0;
104
105 // FIXME: We currently only handle pseudoinstructions that don't get expanded
106 // before the assembly printer.
107 unsigned NumBytes = 0;
108 const MCInstrDesc &Desc = MI.getDesc();
109
  // The size should preferably be set in
  // llvm/lib/Target/AArch64/AArch64InstrInfo.td (the default case below).
  // The specific cases here handle instructions of variable size.
113 switch (Desc.getOpcode()) {
114 default:
115 if (Desc.getSize())
116 return Desc.getSize();
117
118 // Anything not explicitly designated otherwise (i.e. pseudo-instructions
119 // with fixed constant size but not specified in .td file) is a normal
120 // 4-byte insn.
121 NumBytes = 4;
122 break;
123 case TargetOpcode::STACKMAP:
124 // The upper bound for a stackmap intrinsic is the full length of its shadow
125 NumBytes = StackMapOpers(&MI).getNumPatchBytes();
126 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
127 break;
128 case TargetOpcode::PATCHPOINT:
129 // The size of the patchpoint intrinsic is the number of bytes requested
130 NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
131 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
132 break;
133 case TargetOpcode::STATEPOINT:
134 NumBytes = StatepointOpers(&MI).getNumPatchBytes();
135 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
136 // No patch bytes means a normal call inst is emitted
137 if (NumBytes == 0)
138 NumBytes = 4;
139 break;
140 case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
141 // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
142 // instructions are expanded to the specified number of NOPs. Otherwise,
143 // they are expanded to 36-byte XRay sleds.
144 NumBytes =
145 F.getFnAttributeAsParsedInteger(Kind: "patchable-function-entry", Default: 9) * 4;
146 break;
147 case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
148 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
149 // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
150 NumBytes = 36;
151 break;
152 case TargetOpcode::PATCHABLE_EVENT_CALL:
153 // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
154 NumBytes = 24;
155 break;
156
157 case AArch64::SPACE:
158 NumBytes = MI.getOperand(i: 1).getImm();
159 break;
160 case TargetOpcode::BUNDLE:
161 NumBytes = getInstBundleLength(MI);
162 break;
163 }
164
165 return NumBytes;
166}
167
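// Return the total size in bytes of the instructions bundled with MI, by
// summing getInstSizeInBytes over every instruction inside the bundle.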
168unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
169 unsigned Size = 0;
170 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
171 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
172 while (++I != E && I->isInsideBundle()) {
173 assert(!I->isBundle() && "No nested bundle!");
174 Size += getInstSizeInBytes(MI: *I);
175 }
176 return Size;
177}
178
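// Decompose a conditional branch terminator into a target block and a Cond
// vector understood by insertBranch/reverseBranchCondition:
//   Bcc:          Cond = { condition code }
//   CB(N)Z[WX]:   Cond = { -1, opcode, tested register }
//   TB(N)Z[WX]:   Cond = { -1, opcode, tested register, bit number }
// The leading -1 marks the folded compare-and-branch forms.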
179static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
180 SmallVectorImpl<MachineOperand> &Cond) {
181 // Block ends with fall-through condbranch.
182 switch (LastInst->getOpcode()) {
183 default:
184 llvm_unreachable("Unknown branch instruction?");
185 case AArch64::Bcc:
186 Target = LastInst->getOperand(i: 1).getMBB();
187 Cond.push_back(Elt: LastInst->getOperand(i: 0));
188 break;
189 case AArch64::CBZW:
190 case AArch64::CBZX:
191 case AArch64::CBNZW:
192 case AArch64::CBNZX:
193 Target = LastInst->getOperand(i: 1).getMBB();
194 Cond.push_back(Elt: MachineOperand::CreateImm(Val: -1));
195 Cond.push_back(Elt: MachineOperand::CreateImm(Val: LastInst->getOpcode()));
196 Cond.push_back(Elt: LastInst->getOperand(i: 0));
197 break;
198 case AArch64::TBZW:
199 case AArch64::TBZX:
200 case AArch64::TBNZW:
201 case AArch64::TBNZX:
202 Target = LastInst->getOperand(i: 2).getMBB();
203 Cond.push_back(Elt: MachineOperand::CreateImm(Val: -1));
204 Cond.push_back(Elt: MachineOperand::CreateImm(Val: LastInst->getOpcode()));
205 Cond.push_back(Elt: LastInst->getOperand(i: 0));
206 Cond.push_back(Elt: LastInst->getOperand(i: 1));
207 }
208}
209
210static unsigned getBranchDisplacementBits(unsigned Opc) {
211 switch (Opc) {
212 default:
213 llvm_unreachable("unexpected opcode!");
214 case AArch64::B:
215 return BDisplacementBits;
216 case AArch64::TBNZW:
217 case AArch64::TBZW:
218 case AArch64::TBNZX:
219 case AArch64::TBZX:
220 return TBZDisplacementBits;
221 case AArch64::CBNZW:
222 case AArch64::CBZW:
223 case AArch64::CBNZX:
224 case AArch64::CBZX:
225 return CBZDisplacementBits;
226 case AArch64::Bcc:
227 return BCCDisplacementBits;
228 }
229}
230
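// Branch displacements are encoded in units of 4-byte instructions, so the
// 14-bit TB(N)Z field reaches roughly +/-32KiB, the 19-bit CB(N)Z/Bcc field
// roughly +/-1MiB, and the 26-bit B field roughly +/-128MiB.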
231bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
232 int64_t BrOffset) const {
233 unsigned Bits = getBranchDisplacementBits(Opc: BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump "
                      "over conditional branch expansion");
236 return isIntN(N: Bits, x: BrOffset / 4);
237}
238
239MachineBasicBlock *
240AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
241 switch (MI.getOpcode()) {
242 default:
243 llvm_unreachable("unexpected opcode!");
244 case AArch64::B:
245 return MI.getOperand(i: 0).getMBB();
246 case AArch64::TBZW:
247 case AArch64::TBNZW:
248 case AArch64::TBZX:
249 case AArch64::TBNZX:
250 return MI.getOperand(i: 2).getMBB();
251 case AArch64::CBZW:
252 case AArch64::CBNZW:
253 case AArch64::CBZX:
254 case AArch64::CBNZX:
255 case AArch64::Bcc:
256 return MI.getOperand(i: 1).getMBB();
257 }
258}
259
260void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
261 MachineBasicBlock &NewDestBB,
262 MachineBasicBlock &RestoreBB,
263 const DebugLoc &DL,
264 int64_t BrOffset,
265 RegScavenger *RS) const {
266 assert(RS && "RegScavenger required for long branching");
267 assert(MBB.empty() &&
268 "new block should be inserted for expanding unconditional branch");
269 assert(MBB.pred_size() == 1);
270 assert(RestoreBB.empty() &&
271 "restore block should be inserted for restoring clobbered registers");
272
273 auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
274 // Offsets outside of the signed 33-bit range are not supported for ADRP +
275 // ADD.
276 if (!isInt<33>(x: BrOffset))
277 report_fatal_error(
278 reason: "Branch offsets outside of the signed 33-bit range not supported");
279
280 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::ADRP), DestReg: Reg)
281 .addSym(Sym: DestBB.getSymbol(), TargetFlags: AArch64II::MO_PAGE);
282 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::ADDXri), DestReg: Reg)
283 .addReg(RegNo: Reg)
284 .addSym(Sym: DestBB.getSymbol(), TargetFlags: AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
285 .addImm(Val: 0);
286 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::BR)).addReg(RegNo: Reg);
287 };
288
289 RS->enterBasicBlockEnd(MBB);
290 // If X16 is unused, we can rely on the linker to insert a range extension
291 // thunk if NewDestBB is out of range of a single B instruction.
292 constexpr Register Reg = AArch64::X16;
293 if (!RS->isRegUsed(Reg)) {
294 insertUnconditionalBranch(MBB, DestBB: &NewDestBB, DL);
295 RS->setRegUsed(Reg);
296 return;
297 }
298
299 // If there's a free register and it's worth inflating the code size,
300 // manually insert the indirect branch.
301 Register Scavenged = RS->FindUnusedReg(RC: &AArch64::GPR64RegClass);
302 if (Scavenged != AArch64::NoRegister &&
303 MBB.getSectionID() == MBBSectionID::ColdSectionID) {
304 buildIndirectBranch(Scavenged, NewDestBB);
305 RS->setRegUsed(Reg: Scavenged);
306 return;
307 }
308
309 // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
310 // with red zones.
311 AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
312 if (!AFI || AFI->hasRedZone().value_or(u: true))
313 report_fatal_error(
314 reason: "Unable to insert indirect branch inside function that has red zone");
315
316 // Otherwise, spill X16 and defer range extension to the linker.
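  // The emitted sequence is roughly:
  //     str  x16, [sp, #-16]!   // free X16 so a linker thunk may use it
  //     b    RestoreBB          // linker may route this through a thunk
  //   RestoreBB:
  //     ldr  x16, [sp], #16     // reload X16
  // with RestoreBB expected to reach NewDestBB afterwards (e.g. by falling
  // through), which is arranged by the caller.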
317 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::STRXpre))
318 .addReg(RegNo: AArch64::SP, flags: RegState::Define)
319 .addReg(RegNo: Reg)
320 .addReg(RegNo: AArch64::SP)
321 .addImm(Val: -16);
322
323 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::B)).addMBB(MBB: &RestoreBB);
324
325 BuildMI(BB&: RestoreBB, I: RestoreBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::LDRXpost))
326 .addReg(RegNo: AArch64::SP, flags: RegState::Define)
327 .addReg(RegNo: Reg, flags: RegState::Define)
328 .addReg(RegNo: AArch64::SP)
329 .addImm(Val: 16);
330}
331
332// Branch analysis.
333bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
334 MachineBasicBlock *&TBB,
335 MachineBasicBlock *&FBB,
336 SmallVectorImpl<MachineOperand> &Cond,
337 bool AllowModify) const {
338 // If the block has no terminators, it just falls into the block after it.
339 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
340 if (I == MBB.end())
341 return false;
342
343 // Skip over SpeculationBarrierEndBB terminators
344 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
345 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
346 --I;
347 }
348
349 if (!isUnpredicatedTerminator(MI: *I))
350 return false;
351
352 // Get the last instruction in the block.
353 MachineInstr *LastInst = &*I;
354
355 // If there is only one terminator instruction, process it.
356 unsigned LastOpc = LastInst->getOpcode();
357 if (I == MBB.begin() || !isUnpredicatedTerminator(MI: *--I)) {
358 if (isUncondBranchOpcode(Opc: LastOpc)) {
359 TBB = LastInst->getOperand(i: 0).getMBB();
360 return false;
361 }
362 if (isCondBranchOpcode(Opc: LastOpc)) {
363 // Block ends with fall-through condbranch.
364 parseCondBranch(LastInst, Target&: TBB, Cond);
365 return false;
366 }
367 return true; // Can't handle indirect branch.
368 }
369
370 // Get the instruction before it if it is a terminator.
371 MachineInstr *SecondLastInst = &*I;
372 unsigned SecondLastOpc = SecondLastInst->getOpcode();
373
374 // If AllowModify is true and the block ends with two or more unconditional
375 // branches, delete all but the first unconditional branch.
376 if (AllowModify && isUncondBranchOpcode(Opc: LastOpc)) {
377 while (isUncondBranchOpcode(Opc: SecondLastOpc)) {
378 LastInst->eraseFromParent();
379 LastInst = SecondLastInst;
380 LastOpc = LastInst->getOpcode();
381 if (I == MBB.begin() || !isUnpredicatedTerminator(MI: *--I)) {
        // Return now; the only terminator is an unconditional branch.
383 TBB = LastInst->getOperand(i: 0).getMBB();
384 return false;
385 }
386 SecondLastInst = &*I;
387 SecondLastOpc = SecondLastInst->getOpcode();
388 }
389 }
390
  // If we're allowed to modify and the block ends in an unconditional branch
  // that could simply fall through, remove the branch. (Note: This case only
393 // matters when we can't understand the whole sequence, otherwise it's also
394 // handled by BranchFolding.cpp.)
395 if (AllowModify && isUncondBranchOpcode(Opc: LastOpc) &&
396 MBB.isLayoutSuccessor(MBB: getBranchDestBlock(MI: *LastInst))) {
397 LastInst->eraseFromParent();
398 LastInst = SecondLastInst;
399 LastOpc = LastInst->getOpcode();
400 if (I == MBB.begin() || !isUnpredicatedTerminator(MI: *--I)) {
401 assert(!isUncondBranchOpcode(LastOpc) &&
402 "unreachable unconditional branches removed above");
403
404 if (isCondBranchOpcode(Opc: LastOpc)) {
405 // Block ends with fall-through condbranch.
406 parseCondBranch(LastInst, Target&: TBB, Cond);
407 return false;
408 }
409 return true; // Can't handle indirect branch.
410 }
411 SecondLastInst = &*I;
412 SecondLastOpc = SecondLastInst->getOpcode();
413 }
414
415 // If there are three terminators, we don't know what sort of block this is.
416 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(MI: *--I))
417 return true;
418
419 // If the block ends with a B and a Bcc, handle it.
420 if (isCondBranchOpcode(Opc: SecondLastOpc) && isUncondBranchOpcode(Opc: LastOpc)) {
421 parseCondBranch(LastInst: SecondLastInst, Target&: TBB, Cond);
422 FBB = LastInst->getOperand(i: 0).getMBB();
423 return false;
424 }
425
426 // If the block ends with two unconditional branches, handle it. The second
427 // one is not executed, so remove it.
428 if (isUncondBranchOpcode(Opc: SecondLastOpc) && isUncondBranchOpcode(Opc: LastOpc)) {
429 TBB = SecondLastInst->getOperand(i: 0).getMBB();
430 I = LastInst;
431 if (AllowModify)
432 I->eraseFromParent();
433 return false;
434 }
435
436 // ...likewise if it ends with an indirect branch followed by an unconditional
437 // branch.
438 if (isIndirectBranchOpcode(Opc: SecondLastOpc) && isUncondBranchOpcode(Opc: LastOpc)) {
439 I = LastInst;
440 if (AllowModify)
441 I->eraseFromParent();
442 return true;
443 }
444
445 // Otherwise, can't handle this.
446 return true;
447}
448
449bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
450 MachineBranchPredicate &MBP,
451 bool AllowModify) const {
  // For the moment, handle only a block that ends with a cb(n)z followed by
  // a fallthrough, since that is a common form.
  // TODO: Should we handle b.cc?
455
456 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
457 if (I == MBB.end())
458 return true;
459
460 // Skip over SpeculationBarrierEndBB terminators
461 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
462 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
463 --I;
464 }
465
466 if (!isUnpredicatedTerminator(MI: *I))
467 return true;
468
469 // Get the last instruction in the block.
470 MachineInstr *LastInst = &*I;
471 unsigned LastOpc = LastInst->getOpcode();
472 if (!isCondBranchOpcode(Opc: LastOpc))
473 return true;
474
475 switch (LastOpc) {
476 default:
477 return true;
478 case AArch64::CBZW:
479 case AArch64::CBZX:
480 case AArch64::CBNZW:
481 case AArch64::CBNZX:
482 break;
483 };
484
485 MBP.TrueDest = LastInst->getOperand(i: 1).getMBB();
486 assert(MBP.TrueDest && "expected!");
487 MBP.FalseDest = MBB.getNextNode();
488
489 MBP.ConditionDef = nullptr;
490 MBP.SingleUseCondition = false;
491
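  // Model the compare-and-branch as "branch if (LHS pred RHS)", where LHS is
  // the tested register and RHS is the constant zero.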
492 MBP.LHS = LastInst->getOperand(i: 0);
493 MBP.RHS = MachineOperand::CreateImm(Val: 0);
494 MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
495 : MachineBranchPredicate::PRED_EQ;
496 return false;
497}
498
499bool AArch64InstrInfo::reverseBranchCondition(
500 SmallVectorImpl<MachineOperand> &Cond) const {
501 if (Cond[0].getImm() != -1) {
502 // Regular Bcc
503 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
504 Cond[0].setImm(AArch64CC::getInvertedCondCode(Code: CC));
505 } else {
506 // Folded compare-and-branch
507 switch (Cond[1].getImm()) {
508 default:
509 llvm_unreachable("Unknown conditional branch!");
510 case AArch64::CBZW:
511 Cond[1].setImm(AArch64::CBNZW);
512 break;
513 case AArch64::CBNZW:
514 Cond[1].setImm(AArch64::CBZW);
515 break;
516 case AArch64::CBZX:
517 Cond[1].setImm(AArch64::CBNZX);
518 break;
519 case AArch64::CBNZX:
520 Cond[1].setImm(AArch64::CBZX);
521 break;
522 case AArch64::TBZW:
523 Cond[1].setImm(AArch64::TBNZW);
524 break;
525 case AArch64::TBNZW:
526 Cond[1].setImm(AArch64::TBZW);
527 break;
528 case AArch64::TBZX:
529 Cond[1].setImm(AArch64::TBNZX);
530 break;
531 case AArch64::TBNZX:
532 Cond[1].setImm(AArch64::TBZX);
533 break;
534 }
535 }
536
537 return false;
538}
539
540unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
541 int *BytesRemoved) const {
542 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
543 if (I == MBB.end())
544 return 0;
545
546 if (!isUncondBranchOpcode(Opc: I->getOpcode()) &&
547 !isCondBranchOpcode(Opc: I->getOpcode()))
548 return 0;
549
550 // Remove the branch.
551 I->eraseFromParent();
552
553 I = MBB.end();
554
555 if (I == MBB.begin()) {
556 if (BytesRemoved)
557 *BytesRemoved = 4;
558 return 1;
559 }
560 --I;
561 if (!isCondBranchOpcode(Opc: I->getOpcode())) {
562 if (BytesRemoved)
563 *BytesRemoved = 4;
564 return 1;
565 }
566
567 // Remove the branch.
568 I->eraseFromParent();
569 if (BytesRemoved)
570 *BytesRemoved = 8;
571
572 return 2;
573}
574
575void AArch64InstrInfo::instantiateCondBranch(
576 MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
577 ArrayRef<MachineOperand> Cond) const {
578 if (Cond[0].getImm() != -1) {
579 // Regular Bcc
580 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AArch64::Bcc)).addImm(Val: Cond[0].getImm()).addMBB(MBB: TBB);
581 } else {
582 // Folded compare-and-branch
    // Note that we use add() instead of addReg() so that the operand's
    // register flags are preserved.
584 const MachineInstrBuilder MIB =
585 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: Cond[1].getImm())).add(MO: Cond[2]);
586 if (Cond.size() > 3)
587 MIB.addImm(Val: Cond[3].getImm());
588 MIB.addMBB(MBB: TBB);
589 }
590}
591
592unsigned AArch64InstrInfo::insertBranch(
593 MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
594 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
595 // Shouldn't be a fall through.
596 assert(TBB && "insertBranch must not be told to insert a fallthrough");
597
598 if (!FBB) {
599 if (Cond.empty()) // Unconditional branch?
600 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AArch64::B)).addMBB(MBB: TBB);
601 else
602 instantiateCondBranch(MBB, DL, TBB, Cond);
603
604 if (BytesAdded)
605 *BytesAdded = 4;
606
607 return 1;
608 }
609
610 // Two-way conditional branch.
611 instantiateCondBranch(MBB, DL, TBB, Cond);
612 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AArch64::B)).addMBB(MBB: FBB);
613
614 if (BytesAdded)
615 *BytesAdded = 8;
616
617 return 2;
618}
619
620// Find the original register that VReg is copied from.
621static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
622 while (Register::isVirtualRegister(Reg: VReg)) {
623 const MachineInstr *DefMI = MRI.getVRegDef(Reg: VReg);
624 if (!DefMI->isFullCopy())
625 return VReg;
626 VReg = DefMI->getOperand(i: 1).getReg();
627 }
628 return VReg;
629}
630
631// Determine if VReg is defined by an instruction that can be folded into a
632// csel instruction. If so, return the folded opcode, and the replacement
633// register.
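//
// For example (sketch of the GPR32 case):
//   %x = ADDWri %a, 1, 0          ; %x = %a + 1
//   %d = CSELWr %t, %x, cc        ; select between %t and %x
// can instead select with CSINCWr on %a directly:
//   %d = CSINCWr %t, %a, cc       ; %d = cc ? %t : (%a + 1)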
634static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
635 unsigned *NewVReg = nullptr) {
636 VReg = removeCopies(MRI, VReg);
637 if (!Register::isVirtualRegister(Reg: VReg))
638 return 0;
639
640 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(RC: MRI.getRegClass(Reg: VReg));
641 const MachineInstr *DefMI = MRI.getVRegDef(Reg: VReg);
642 unsigned Opc = 0;
643 unsigned SrcOpNum = 0;
644 switch (DefMI->getOpcode()) {
645 case AArch64::ADDSXri:
646 case AArch64::ADDSWri:
647 // if NZCV is used, do not fold.
648 if (DefMI->findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr,
649 isDead: true) == -1)
650 return 0;
651 // fall-through to ADDXri and ADDWri.
652 [[fallthrough]];
653 case AArch64::ADDXri:
654 case AArch64::ADDWri:
655 // add x, 1 -> csinc.
656 if (!DefMI->getOperand(i: 2).isImm() || DefMI->getOperand(i: 2).getImm() != 1 ||
657 DefMI->getOperand(i: 3).getImm() != 0)
658 return 0;
659 SrcOpNum = 1;
660 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
661 break;
662
663 case AArch64::ORNXrr:
664 case AArch64::ORNWrr: {
665 // not x -> csinv, represented as orn dst, xzr, src.
666 unsigned ZReg = removeCopies(MRI, VReg: DefMI->getOperand(i: 1).getReg());
667 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
668 return 0;
669 SrcOpNum = 2;
670 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
671 break;
672 }
673
674 case AArch64::SUBSXrr:
675 case AArch64::SUBSWrr:
676 // if NZCV is used, do not fold.
677 if (DefMI->findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr,
678 isDead: true) == -1)
679 return 0;
680 // fall-through to SUBXrr and SUBWrr.
681 [[fallthrough]];
682 case AArch64::SUBXrr:
683 case AArch64::SUBWrr: {
684 // neg x -> csneg, represented as sub dst, xzr, src.
685 unsigned ZReg = removeCopies(MRI, VReg: DefMI->getOperand(i: 1).getReg());
686 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
687 return 0;
688 SrcOpNum = 2;
689 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
690 break;
691 }
692 default:
693 return 0;
694 }
695 assert(Opc && SrcOpNum && "Missing parameters");
696
697 if (NewVReg)
698 *NewVReg = DefMI->getOperand(i: SrcOpNum).getReg();
699 return Opc;
700}
701
702bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
703 ArrayRef<MachineOperand> Cond,
704 Register DstReg, Register TrueReg,
705 Register FalseReg, int &CondCycles,
706 int &TrueCycles,
707 int &FalseCycles) const {
708 // Check register classes.
709 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
710 const TargetRegisterClass *RC =
711 RI.getCommonSubClass(A: MRI.getRegClass(Reg: TrueReg), B: MRI.getRegClass(Reg: FalseReg));
712 if (!RC)
713 return false;
714
715 // Also need to check the dest regclass, in case we're trying to optimize
716 // something like:
717 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
718 if (!RI.getCommonSubClass(A: RC, B: MRI.getRegClass(Reg: DstReg)))
719 return false;
720
721 // Expanding cbz/tbz requires an extra cycle of latency on the condition.
722 unsigned ExtraCondLat = Cond.size() != 1;
723
724 // GPRs are handled by csel.
725 // FIXME: Fold in x+1, -x, and ~x when applicable.
726 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
727 AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
728 // Single-cycle csel, csinc, csinv, and csneg.
729 CondCycles = 1 + ExtraCondLat;
730 TrueCycles = FalseCycles = 1;
731 if (canFoldIntoCSel(MRI, VReg: TrueReg))
732 TrueCycles = 0;
733 else if (canFoldIntoCSel(MRI, VReg: FalseReg))
734 FalseCycles = 0;
735 return true;
736 }
737
738 // Scalar floating point is handled by fcsel.
739 // FIXME: Form fabs, fmin, and fmax when applicable.
740 if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
741 AArch64::FPR32RegClass.hasSubClassEq(RC)) {
742 CondCycles = 5 + ExtraCondLat;
743 TrueCycles = FalseCycles = 2;
744 return true;
745 }
746
747 // Can't do vectors.
748 return false;
749}
750
751void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
752 MachineBasicBlock::iterator I,
753 const DebugLoc &DL, Register DstReg,
754 ArrayRef<MachineOperand> Cond,
755 Register TrueReg, Register FalseReg) const {
756 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
757
758 // Parse the condition code, see parseCondBranch() above.
759 AArch64CC::CondCode CC;
760 switch (Cond.size()) {
761 default:
762 llvm_unreachable("Unknown condition opcode in Cond");
763 case 1: // b.cc
764 CC = AArch64CC::CondCode(Cond[0].getImm());
765 break;
766 case 3: { // cbz/cbnz
767 // We must insert a compare against 0.
768 bool Is64Bit;
769 switch (Cond[1].getImm()) {
770 default:
771 llvm_unreachable("Unknown branch opcode in Cond");
772 case AArch64::CBZW:
773 Is64Bit = false;
774 CC = AArch64CC::EQ;
775 break;
776 case AArch64::CBZX:
777 Is64Bit = true;
778 CC = AArch64CC::EQ;
779 break;
780 case AArch64::CBNZW:
781 Is64Bit = false;
782 CC = AArch64CC::NE;
783 break;
784 case AArch64::CBNZX:
785 Is64Bit = true;
786 CC = AArch64CC::NE;
787 break;
788 }
789 Register SrcReg = Cond[2].getReg();
790 if (Is64Bit) {
791 // cmp reg, #0 is actually subs xzr, reg, #0.
792 MRI.constrainRegClass(Reg: SrcReg, RC: &AArch64::GPR64spRegClass);
793 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::SUBSXri), DestReg: AArch64::XZR)
794 .addReg(RegNo: SrcReg)
795 .addImm(Val: 0)
796 .addImm(Val: 0);
797 } else {
798 MRI.constrainRegClass(Reg: SrcReg, RC: &AArch64::GPR32spRegClass);
799 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::SUBSWri), DestReg: AArch64::WZR)
800 .addReg(RegNo: SrcReg)
801 .addImm(Val: 0)
802 .addImm(Val: 0);
803 }
804 break;
805 }
806 case 4: { // tbz/tbnz
807 // We must insert a tst instruction.
808 switch (Cond[1].getImm()) {
809 default:
810 llvm_unreachable("Unknown branch opcode in Cond");
811 case AArch64::TBZW:
812 case AArch64::TBZX:
813 CC = AArch64CC::EQ;
814 break;
815 case AArch64::TBNZW:
816 case AArch64::TBNZX:
817 CC = AArch64CC::NE;
818 break;
819 }
820 // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
821 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
822 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ANDSWri), DestReg: AArch64::WZR)
823 .addReg(RegNo: Cond[2].getReg())
824 .addImm(
825 Val: AArch64_AM::encodeLogicalImmediate(imm: 1ull << Cond[3].getImm(), regSize: 32));
826 else
827 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ANDSXri), DestReg: AArch64::XZR)
828 .addReg(RegNo: Cond[2].getReg())
829 .addImm(
830 Val: AArch64_AM::encodeLogicalImmediate(imm: 1ull << Cond[3].getImm(), regSize: 64));
831 break;
832 }
833 }
834
835 unsigned Opc = 0;
836 const TargetRegisterClass *RC = nullptr;
837 bool TryFold = false;
838 if (MRI.constrainRegClass(Reg: DstReg, RC: &AArch64::GPR64RegClass)) {
839 RC = &AArch64::GPR64RegClass;
840 Opc = AArch64::CSELXr;
841 TryFold = true;
842 } else if (MRI.constrainRegClass(Reg: DstReg, RC: &AArch64::GPR32RegClass)) {
843 RC = &AArch64::GPR32RegClass;
844 Opc = AArch64::CSELWr;
845 TryFold = true;
846 } else if (MRI.constrainRegClass(Reg: DstReg, RC: &AArch64::FPR64RegClass)) {
847 RC = &AArch64::FPR64RegClass;
848 Opc = AArch64::FCSELDrrr;
849 } else if (MRI.constrainRegClass(Reg: DstReg, RC: &AArch64::FPR32RegClass)) {
850 RC = &AArch64::FPR32RegClass;
851 Opc = AArch64::FCSELSrrr;
852 }
853 assert(RC && "Unsupported regclass");
854
855 // Try folding simple instructions into the csel.
856 if (TryFold) {
857 unsigned NewVReg = 0;
858 unsigned FoldedOpc = canFoldIntoCSel(MRI, VReg: TrueReg, NewVReg: &NewVReg);
859 if (FoldedOpc) {
      // The folded opcodes csinc, csinv, and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
862 CC = AArch64CC::getInvertedCondCode(Code: CC);
863 TrueReg = FalseReg;
864 } else
865 FoldedOpc = canFoldIntoCSel(MRI, VReg: FalseReg, NewVReg: &NewVReg);
866
867 // Fold the operation. Leave any dead instructions for DCE to clean up.
868 if (FoldedOpc) {
869 FalseReg = NewVReg;
870 Opc = FoldedOpc;
      // This extends the live range of NewVReg.
872 MRI.clearKillFlags(Reg: NewVReg);
873 }
874 }
875
  // Pull all virtual registers into the appropriate class.
877 MRI.constrainRegClass(Reg: TrueReg, RC);
878 MRI.constrainRegClass(Reg: FalseReg, RC);
879
880 // Insert the csel.
881 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: Opc), DestReg: DstReg)
882 .addReg(RegNo: TrueReg)
883 .addReg(RegNo: FalseReg)
884 .addImm(Val: CC);
885}
886
887// Return true if Imm can be loaded into a register by a "cheap" sequence of
888// instructions. For now, "cheap" means at most two instructions.
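// For example, 0x0001000000000004 expands to a MOVZ plus one MOVK (cheap),
// while 0x0001000200030004 needs a MOVZ plus three MOVKs and is not cheap.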
889static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
890 if (BitSize == 32)
891 return true;
892
893 assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
894 uint64_t Imm = static_cast<uint64_t>(MI.getOperand(i: 1).getImm());
895 SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
896 AArch64_IMM::expandMOVImm(Imm, BitSize, Insn&: Is);
897
898 return Is.size() <= 2;
899}
900
901// FIXME: this implementation should be micro-architecture dependent, so a
902// micro-architecture target hook should be introduced here in future.
903bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
904 if (Subtarget.hasExynosCheapAsMoveHandling()) {
905 if (isExynosCheapAsMove(MI))
906 return true;
907 return MI.isAsCheapAsAMove();
908 }
909
910 switch (MI.getOpcode()) {
911 default:
912 return MI.isAsCheapAsAMove();
913
914 case AArch64::ADDWrs:
915 case AArch64::ADDXrs:
916 case AArch64::SUBWrs:
917 case AArch64::SUBXrs:
918 return Subtarget.hasALULSLFast() && MI.getOperand(i: 3).getImm() <= 4;
919
920 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
921 // ORRXri, it is as cheap as MOV.
922 // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
923 case AArch64::MOVi32imm:
924 return isCheapImmediate(MI, BitSize: 32);
925 case AArch64::MOVi64imm:
926 return isCheapImmediate(MI, BitSize: 64);
927 }
928}
929
930bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
931 switch (MI.getOpcode()) {
932 default:
933 return false;
934
935 case AArch64::ADDWrs:
936 case AArch64::ADDXrs:
937 case AArch64::ADDSWrs:
938 case AArch64::ADDSXrs: {
939 unsigned Imm = MI.getOperand(i: 3).getImm();
940 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
941 if (ShiftVal == 0)
942 return true;
943 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
944 }
945
946 case AArch64::ADDWrx:
947 case AArch64::ADDXrx:
948 case AArch64::ADDXrx64:
949 case AArch64::ADDSWrx:
950 case AArch64::ADDSXrx:
951 case AArch64::ADDSXrx64: {
952 unsigned Imm = MI.getOperand(i: 3).getImm();
953 switch (AArch64_AM::getArithExtendType(Imm)) {
954 default:
955 return false;
956 case AArch64_AM::UXTB:
957 case AArch64_AM::UXTH:
958 case AArch64_AM::UXTW:
959 case AArch64_AM::UXTX:
960 return AArch64_AM::getArithShiftValue(Imm) <= 4;
961 }
962 }
963
964 case AArch64::SUBWrs:
965 case AArch64::SUBSWrs: {
966 unsigned Imm = MI.getOperand(i: 3).getImm();
967 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
968 return ShiftVal == 0 ||
969 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
970 }
971
972 case AArch64::SUBXrs:
973 case AArch64::SUBSXrs: {
974 unsigned Imm = MI.getOperand(i: 3).getImm();
975 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
976 return ShiftVal == 0 ||
977 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
978 }
979
980 case AArch64::SUBWrx:
981 case AArch64::SUBXrx:
982 case AArch64::SUBXrx64:
983 case AArch64::SUBSWrx:
984 case AArch64::SUBSXrx:
985 case AArch64::SUBSXrx64: {
986 unsigned Imm = MI.getOperand(i: 3).getImm();
987 switch (AArch64_AM::getArithExtendType(Imm)) {
988 default:
989 return false;
990 case AArch64_AM::UXTB:
991 case AArch64_AM::UXTH:
992 case AArch64_AM::UXTW:
993 case AArch64_AM::UXTX:
994 return AArch64_AM::getArithShiftValue(Imm) == 0;
995 }
996 }
997
998 case AArch64::LDRBBroW:
999 case AArch64::LDRBBroX:
1000 case AArch64::LDRBroW:
1001 case AArch64::LDRBroX:
1002 case AArch64::LDRDroW:
1003 case AArch64::LDRDroX:
1004 case AArch64::LDRHHroW:
1005 case AArch64::LDRHHroX:
1006 case AArch64::LDRHroW:
1007 case AArch64::LDRHroX:
1008 case AArch64::LDRQroW:
1009 case AArch64::LDRQroX:
1010 case AArch64::LDRSBWroW:
1011 case AArch64::LDRSBWroX:
1012 case AArch64::LDRSBXroW:
1013 case AArch64::LDRSBXroX:
1014 case AArch64::LDRSHWroW:
1015 case AArch64::LDRSHWroX:
1016 case AArch64::LDRSHXroW:
1017 case AArch64::LDRSHXroX:
1018 case AArch64::LDRSWroW:
1019 case AArch64::LDRSWroX:
1020 case AArch64::LDRSroW:
1021 case AArch64::LDRSroX:
1022 case AArch64::LDRWroW:
1023 case AArch64::LDRWroX:
1024 case AArch64::LDRXroW:
1025 case AArch64::LDRXroX:
1026 case AArch64::PRFMroW:
1027 case AArch64::PRFMroX:
1028 case AArch64::STRBBroW:
1029 case AArch64::STRBBroX:
1030 case AArch64::STRBroW:
1031 case AArch64::STRBroX:
1032 case AArch64::STRDroW:
1033 case AArch64::STRDroX:
1034 case AArch64::STRHHroW:
1035 case AArch64::STRHHroX:
1036 case AArch64::STRHroW:
1037 case AArch64::STRHroX:
1038 case AArch64::STRQroW:
1039 case AArch64::STRQroX:
1040 case AArch64::STRSroW:
1041 case AArch64::STRSroX:
1042 case AArch64::STRWroW:
1043 case AArch64::STRWroX:
1044 case AArch64::STRXroW:
1045 case AArch64::STRXroX: {
1046 unsigned IsSigned = MI.getOperand(i: 3).getImm();
1047 return !IsSigned;
1048 }
1049 }
1050}
1051
1052bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1053 unsigned Opc = MI.getOpcode();
1054 switch (Opc) {
1055 default:
1056 return false;
1057 case AArch64::SEH_StackAlloc:
1058 case AArch64::SEH_SaveFPLR:
1059 case AArch64::SEH_SaveFPLR_X:
1060 case AArch64::SEH_SaveReg:
1061 case AArch64::SEH_SaveReg_X:
1062 case AArch64::SEH_SaveRegP:
1063 case AArch64::SEH_SaveRegP_X:
1064 case AArch64::SEH_SaveFReg:
1065 case AArch64::SEH_SaveFReg_X:
1066 case AArch64::SEH_SaveFRegP:
1067 case AArch64::SEH_SaveFRegP_X:
1068 case AArch64::SEH_SetFP:
1069 case AArch64::SEH_AddFP:
1070 case AArch64::SEH_Nop:
1071 case AArch64::SEH_PrologEnd:
1072 case AArch64::SEH_EpilogStart:
1073 case AArch64::SEH_EpilogEnd:
1074 case AArch64::SEH_PACSignLR:
1075 case AArch64::SEH_SaveAnyRegQP:
1076 case AArch64::SEH_SaveAnyRegQPX:
1077 return true;
1078 }
1079}
1080
1081bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
1082 Register &SrcReg, Register &DstReg,
1083 unsigned &SubIdx) const {
1084 switch (MI.getOpcode()) {
1085 default:
1086 return false;
1087 case AArch64::SBFMXri: // aka sxtw
1088 case AArch64::UBFMXri: // aka uxtw
1089 // Check for the 32 -> 64 bit extension case, these instructions can do
1090 // much more.
1091 if (MI.getOperand(i: 2).getImm() != 0 || MI.getOperand(i: 3).getImm() != 31)
1092 return false;
1093 // This is a signed or unsigned 32 -> 64 bit extension.
1094 SrcReg = MI.getOperand(i: 1).getReg();
1095 DstReg = MI.getOperand(i: 0).getReg();
1096 SubIdx = AArch64::sub_32;
1097 return true;
1098 }
1099}
1100
1101bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
1102 const MachineInstr &MIa, const MachineInstr &MIb) const {
1103 const TargetRegisterInfo *TRI = &getRegisterInfo();
1104 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1105 int64_t OffsetA = 0, OffsetB = 0;
1106 TypeSize WidthA(0, false), WidthB(0, false);
1107 bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1108
1109 assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1110 assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1111
1112 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
1113 MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1114 return false;
1115
  // Retrieve the base, the offset from the base, and the width. Width is the
  // size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If the
  // bases are identical, and the offset of the lower memory access plus its
  // width does not overlap the offset of the higher memory access, then the
  // memory accesses are disjoint.
1121 // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1122 // are assumed to have the same scale (vscale).
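  // For example, "str x0, [x8]" and "str x1, [x8, #8]" share the base x8 and
  // satisfy 0 + 8 <= 8, so the two stores are provably disjoint.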
1123 if (getMemOperandWithOffsetWidth(MI: MIa, BaseOp&: BaseOpA, Offset&: OffsetA, OffsetIsScalable&: OffsetAIsScalable,
1124 Width&: WidthA, TRI) &&
1125 getMemOperandWithOffsetWidth(MI: MIb, BaseOp&: BaseOpB, Offset&: OffsetB, OffsetIsScalable&: OffsetBIsScalable,
1126 Width&: WidthB, TRI)) {
1127 if (BaseOpA->isIdenticalTo(Other: *BaseOpB) &&
1128 OffsetAIsScalable == OffsetBIsScalable) {
1129 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1130 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1131 TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1132 if (LowWidth.isScalable() == OffsetAIsScalable &&
1133 LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
1134 return true;
1135 }
1136 }
1137 return false;
1138}
1139
1140bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1141 const MachineBasicBlock *MBB,
1142 const MachineFunction &MF) const {
1143 if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1144 return true;
1145
1146 // Do not move an instruction that can be recognized as a branch target.
1147 if (hasBTISemantics(MI))
1148 return true;
1149
1150 switch (MI.getOpcode()) {
1151 case AArch64::HINT:
1152 // CSDB hints are scheduling barriers.
1153 if (MI.getOperand(i: 0).getImm() == 0x14)
1154 return true;
1155 break;
1156 case AArch64::DSB:
1157 case AArch64::ISB:
1158 // DSB and ISB also are scheduling barriers.
1159 return true;
1160 case AArch64::MSRpstatesvcrImm1:
1161 // SMSTART and SMSTOP are also scheduling barriers.
1162 return true;
1163 default:;
1164 }
1165 if (isSEHInstruction(MI))
1166 return true;
1167 auto Next = std::next(x: MI.getIterator());
1168 return Next != MBB->end() && Next->isCFIInstruction();
1169}
1170
1171/// analyzeCompare - For a comparison instruction, return the source registers
1172/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1173/// Return true if the comparison instruction can be analyzed.
1174bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
1175 Register &SrcReg2, int64_t &CmpMask,
1176 int64_t &CmpValue) const {
1177 // The first operand can be a frame index where we'd normally expect a
1178 // register.
1179 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1180 if (!MI.getOperand(i: 1).isReg())
1181 return false;
1182
1183 switch (MI.getOpcode()) {
1184 default:
1185 break;
1186 case AArch64::PTEST_PP:
1187 case AArch64::PTEST_PP_ANY:
1188 SrcReg = MI.getOperand(i: 0).getReg();
1189 SrcReg2 = MI.getOperand(i: 1).getReg();
1190 // Not sure about the mask and value for now...
1191 CmpMask = ~0;
1192 CmpValue = 0;
1193 return true;
1194 case AArch64::SUBSWrr:
1195 case AArch64::SUBSWrs:
1196 case AArch64::SUBSWrx:
1197 case AArch64::SUBSXrr:
1198 case AArch64::SUBSXrs:
1199 case AArch64::SUBSXrx:
1200 case AArch64::ADDSWrr:
1201 case AArch64::ADDSWrs:
1202 case AArch64::ADDSWrx:
1203 case AArch64::ADDSXrr:
1204 case AArch64::ADDSXrs:
1205 case AArch64::ADDSXrx:
1206 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1207 SrcReg = MI.getOperand(i: 1).getReg();
1208 SrcReg2 = MI.getOperand(i: 2).getReg();
1209 CmpMask = ~0;
1210 CmpValue = 0;
1211 return true;
1212 case AArch64::SUBSWri:
1213 case AArch64::ADDSWri:
1214 case AArch64::SUBSXri:
1215 case AArch64::ADDSXri:
1216 SrcReg = MI.getOperand(i: 1).getReg();
1217 SrcReg2 = 0;
1218 CmpMask = ~0;
1219 CmpValue = MI.getOperand(i: 2).getImm();
1220 return true;
1221 case AArch64::ANDSWri:
1222 case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the other xxxS
    // instructions.
1225 SrcReg = MI.getOperand(i: 1).getReg();
1226 SrcReg2 = 0;
1227 CmpMask = ~0;
1228 CmpValue = AArch64_AM::decodeLogicalImmediate(
1229 val: MI.getOperand(i: 2).getImm(),
1230 regSize: MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1231 return true;
1232 }
1233
1234 return false;
1235}
1236
1237static bool UpdateOperandRegClass(MachineInstr &Instr) {
1238 MachineBasicBlock *MBB = Instr.getParent();
1239 assert(MBB && "Can't get MachineBasicBlock here");
1240 MachineFunction *MF = MBB->getParent();
1241 assert(MF && "Can't get MachineFunction here");
1242 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1243 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1244 MachineRegisterInfo *MRI = &MF->getRegInfo();
1245
1246 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1247 ++OpIdx) {
1248 MachineOperand &MO = Instr.getOperand(i: OpIdx);
1249 const TargetRegisterClass *OpRegCstraints =
1250 Instr.getRegClassConstraint(OpIdx, TII, TRI);
1251
1252 // If there's no constraint, there's nothing to do.
1253 if (!OpRegCstraints)
1254 continue;
1255 // If the operand is a frame index, there's nothing to do here.
1256 // A frame index operand will resolve correctly during PEI.
1257 if (MO.isFI())
1258 continue;
1259
1260 assert(MO.isReg() &&
1261 "Operand has register constraints without being a register!");
1262
1263 Register Reg = MO.getReg();
1264 if (Reg.isPhysical()) {
1265 if (!OpRegCstraints->contains(Reg))
1266 return false;
1267 } else if (!OpRegCstraints->hasSubClassEq(RC: MRI->getRegClass(Reg)) &&
1268 !MRI->constrainRegClass(Reg, RC: OpRegCstraints))
1269 return false;
1270 }
1271
1272 return true;
1273}
1274
1275/// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible for doing the actual
/// substitution and legality checking.
1278static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
  // Don't convert all compare instructions: for some of them the zero
  // register destination would be encoded as the SP register in the
  // non-flag-setting form.
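  // For example, "cmp w1, #1" is SUBSWri with WZR as the destination; in the
  // non-flag-setting SUBWri encoding, destination register 31 means WSP, so
  // dropping the S would turn the compare into a write to the stack pointer.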
1281 bool MIDefinesZeroReg = false;
1282 if (MI.definesRegister(Reg: AArch64::WZR, /*TRI=*/nullptr) ||
1283 MI.definesRegister(Reg: AArch64::XZR, /*TRI=*/nullptr))
1284 MIDefinesZeroReg = true;
1285
1286 switch (MI.getOpcode()) {
1287 default:
1288 return MI.getOpcode();
1289 case AArch64::ADDSWrr:
1290 return AArch64::ADDWrr;
1291 case AArch64::ADDSWri:
1292 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1293 case AArch64::ADDSWrs:
1294 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1295 case AArch64::ADDSWrx:
1296 return AArch64::ADDWrx;
1297 case AArch64::ADDSXrr:
1298 return AArch64::ADDXrr;
1299 case AArch64::ADDSXri:
1300 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1301 case AArch64::ADDSXrs:
1302 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1303 case AArch64::ADDSXrx:
1304 return AArch64::ADDXrx;
1305 case AArch64::SUBSWrr:
1306 return AArch64::SUBWrr;
1307 case AArch64::SUBSWri:
1308 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1309 case AArch64::SUBSWrs:
1310 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1311 case AArch64::SUBSWrx:
1312 return AArch64::SUBWrx;
1313 case AArch64::SUBSXrr:
1314 return AArch64::SUBXrr;
1315 case AArch64::SUBSXri:
1316 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1317 case AArch64::SUBSXrs:
1318 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1319 case AArch64::SUBSXrx:
1320 return AArch64::SUBXrx;
1321 }
1322}
1323
1324enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1325
1326/// True when condition flags are accessed (either by writing or reading)
1327/// on the instruction trace starting at From and ending at To.
1328///
/// Note: If From and To are in different blocks, the condition flags are
/// conservatively assumed to be accessed on the path.
1331static bool areCFlagsAccessedBetweenInstrs(
1332 MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1333 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1334 // Early exit if To is at the beginning of the BB.
1335 if (To == To->getParent()->begin())
1336 return true;
1337
1338 // Check whether the instructions are in the same basic block
1339 // If not, assume the condition flags might get modified somewhere.
1340 if (To->getParent() != From->getParent())
1341 return true;
1342
1343 // From must be above To.
1344 assert(std::any_of(
1345 ++To.getReverse(), To->getParent()->rend(),
1346 [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1347
1348 // We iterate backward starting at \p To until we hit \p From.
1349 for (const MachineInstr &Instr :
1350 instructionsWithoutDebug(It: ++To.getReverse(), End: From.getReverse())) {
1351 if (((AccessToCheck & AK_Write) &&
1352 Instr.modifiesRegister(Reg: AArch64::NZCV, TRI)) ||
1353 ((AccessToCheck & AK_Read) && Instr.readsRegister(Reg: AArch64::NZCV, TRI)))
1354 return true;
1355 }
1356 return false;
1357}
1358
1359std::optional<unsigned>
1360AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
1361 MachineInstr *Pred,
1362 const MachineRegisterInfo *MRI) const {
1363 unsigned MaskOpcode = Mask->getOpcode();
1364 unsigned PredOpcode = Pred->getOpcode();
1365 bool PredIsPTestLike = isPTestLikeOpcode(Opc: PredOpcode);
1366 bool PredIsWhileLike = isWhileOpcode(Opc: PredOpcode);
1367
1368 if (PredIsWhileLike) {
1369 // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
    // instruction and the condition is "any" since WHILEcc does an implicit
1371 // PTEST(ALL, PG) check and PG is always a subset of ALL.
1372 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1373 return PredOpcode;
1374
1375 // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1376 // redundant since WHILE performs an implicit PTEST with an all active
1377 // mask.
1378 if (isPTrueOpcode(Opc: MaskOpcode) && Mask->getOperand(i: 1).getImm() == 31 &&
1379 getElementSizeForOpcode(Opc: MaskOpcode) ==
1380 getElementSizeForOpcode(Opc: PredOpcode))
1381 return PredOpcode;
1382
1383 return {};
1384 }
1385
1386 if (PredIsPTestLike) {
1387 // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1388 // instruction that sets the flags as PTEST would and the condition is
1389 // "any" since PG is always a subset of the governing predicate of the
1390 // ptest-like instruction.
1391 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1392 return PredOpcode;
1393
    // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
    // element size matches and either the PTEST_LIKE instruction uses
1396 // the same all active mask or the condition is "any".
1397 if (isPTrueOpcode(Opc: MaskOpcode) && Mask->getOperand(i: 1).getImm() == 31 &&
1398 getElementSizeForOpcode(Opc: MaskOpcode) ==
1399 getElementSizeForOpcode(Opc: PredOpcode)) {
1400 auto PTestLikeMask = MRI->getUniqueVRegDef(Reg: Pred->getOperand(i: 1).getReg());
1401 if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1402 return PredOpcode;
1403 }
1404
1405 // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1406 // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1407 // on 8-bit predicates like the PTEST. Otherwise, for instructions like
1408 // compare that also support 16/32/64-bit predicates, the implicit PTEST
1409 // performed by the compare could consider fewer lanes for these element
1410 // sizes.
1411 //
1412 // For example, consider
1413 //
1414 // ptrue p0.b ; P0=1111-1111-1111-1111
1415 // index z0.s, #0, #1 ; Z0=<0,1,2,3>
1416 // index z1.s, #1, #1 ; Z1=<1,2,3,4>
1417 // cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001
1418 // ; ^ last active
1419 // ptest p0, p1.b ; P1=0001-0001-0001-0001
1420 // ; ^ last active
1421 //
1422 // where the compare generates a canonical all active 32-bit predicate
1423 // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1424 // active flag, whereas the PTEST instruction with the same mask doesn't.
1425 // For PTEST_ANY this doesn't apply as the flags in this case would be
1426 // identical regardless of element size.
1427 auto PTestLikeMask = MRI->getUniqueVRegDef(Reg: Pred->getOperand(i: 1).getReg());
1428 uint64_t PredElementSize = getElementSizeForOpcode(Opc: PredOpcode);
1429 if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
1430 PTest->getOpcode() == AArch64::PTEST_PP_ANY))
1431 return PredOpcode;
1432
1433 return {};
1434 }
1435
1436 // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1437 // opcode so the PTEST becomes redundant.
1438 switch (PredOpcode) {
1439 case AArch64::AND_PPzPP:
1440 case AArch64::BIC_PPzPP:
1441 case AArch64::EOR_PPzPP:
1442 case AArch64::NAND_PPzPP:
1443 case AArch64::NOR_PPzPP:
1444 case AArch64::ORN_PPzPP:
1445 case AArch64::ORR_PPzPP:
1446 case AArch64::BRKA_PPzP:
1447 case AArch64::BRKPA_PPzPP:
1448 case AArch64::BRKB_PPzP:
1449 case AArch64::BRKPB_PPzPP:
1450 case AArch64::RDFFR_PPz: {
1451 // Check to see if our mask is the same. If not the resulting flag bits
1452 // may be different and we can't remove the ptest.
1453 auto *PredMask = MRI->getUniqueVRegDef(Reg: Pred->getOperand(i: 1).getReg());
1454 if (Mask != PredMask)
1455 return {};
1456 break;
1457 }
1458 case AArch64::BRKN_PPzP: {
1459 // BRKN uses an all active implicit mask to set flags unlike the other
1460 // flag-setting instructions.
1461 // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1462 if ((MaskOpcode != AArch64::PTRUE_B) ||
1463 (Mask->getOperand(i: 1).getImm() != 31))
1464 return {};
1465 break;
1466 }
1467 case AArch64::PTRUE_B:
1468 // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1469 break;
1470 default:
1471 // Bail out if we don't recognize the input
1472 return {};
1473 }
1474
1475 return convertToFlagSettingOpc(Opc: PredOpcode);
1476}
1477
/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
/// operation that could set the flags in an identical manner.
1480bool AArch64InstrInfo::optimizePTestInstr(
1481 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1482 const MachineRegisterInfo *MRI) const {
1483 auto *Mask = MRI->getUniqueVRegDef(Reg: MaskReg);
1484 auto *Pred = MRI->getUniqueVRegDef(Reg: PredReg);
1485 unsigned PredOpcode = Pred->getOpcode();
1486 auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1487 if (!NewOp)
1488 return false;
1489
1490 const TargetRegisterInfo *TRI = &getRegisterInfo();
1491
1492 // If another instruction between Pred and PTest accesses flags, don't remove
1493 // the ptest or update the earlier instruction to modify them.
1494 if (areCFlagsAccessedBetweenInstrs(From: Pred, To: PTest, TRI))
1495 return false;
1496
1497 // If we pass all the checks, it's safe to remove the PTEST and use the flags
1498 // as they are prior to PTEST. Sometimes this requires the tested PTEST
1499 // operand to be replaced with an equivalent instruction that also sets the
1500 // flags.
1501 PTest->eraseFromParent();
1502 if (*NewOp != PredOpcode) {
1503 Pred->setDesc(get(Opcode: *NewOp));
1504 bool succeeded = UpdateOperandRegClass(Instr&: *Pred);
1505 (void)succeeded;
1506 assert(succeeded && "Operands have incompatible register classes!");
1507 Pred->addRegisterDefined(Reg: AArch64::NZCV, RegInfo: TRI);
1508 }
1509
1510 // Ensure that the flags def is live.
1511 if (Pred->registerDefIsDead(Reg: AArch64::NZCV, TRI)) {
1512 unsigned i = 0, e = Pred->getNumOperands();
1513 for (; i != e; ++i) {
1514 MachineOperand &MO = Pred->getOperand(i);
1515 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1516 MO.setIsDead(false);
1517 break;
1518 }
1519 }
1520 }
1521 return true;
1522}
1523
/// Try to optimize a compare instruction. A compare instruction is an
/// instruction that produces AArch64::NZCV. It is a true compare instruction
/// only when there are no uses of its destination register.
///
/// The following steps are tried in order:
/// 1. Convert CmpInstr into a non-flag-setting version when NZCV is not used.
/// 2. Remove CmpInstr if there is an earlier instruction that produces the
///    needed condition code, or one that can be converted into such an
///    instruction.
///    Only comparison with zero is supported.
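///
/// A typical case for step 2 (a sketch; other patterns are handled too):
///   and  w8, w9, #mask
///   cmp  w8, #0            ; i.e. subs wzr, w8, #0
/// can drop the compare by converting the AND to its flag-setting form:
///   ands w8, w9, #mask
/// provided only the N and Z flags are consumed afterwards.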
1535bool AArch64InstrInfo::optimizeCompareInstr(
1536 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1537 int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1538 assert(CmpInstr.getParent());
1539 assert(MRI);
1540
1541 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1542 int DeadNZCVIdx =
1543 CmpInstr.findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true);
1544 if (DeadNZCVIdx != -1) {
1545 if (CmpInstr.definesRegister(Reg: AArch64::WZR, /*TRI=*/nullptr) ||
1546 CmpInstr.definesRegister(Reg: AArch64::XZR, /*TRI=*/nullptr)) {
1547 CmpInstr.eraseFromParent();
1548 return true;
1549 }
1550 unsigned Opc = CmpInstr.getOpcode();
1551 unsigned NewOpc = convertToNonFlagSettingOpc(MI: CmpInstr);
1552 if (NewOpc == Opc)
1553 return false;
1554 const MCInstrDesc &MCID = get(Opcode: NewOpc);
1555 CmpInstr.setDesc(MCID);
1556 CmpInstr.removeOperand(OpNo: DeadNZCVIdx);
1557 bool succeeded = UpdateOperandRegClass(Instr&: CmpInstr);
1558 (void)succeeded;
1559 assert(succeeded && "Some operands reg class are incompatible!");
1560 return true;
1561 }
1562
1563 if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
1564 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY)
1565 return optimizePTestInstr(PTest: &CmpInstr, MaskReg: SrcReg, PredReg: SrcReg2, MRI);
1566
1567 if (SrcReg2 != 0)
1568 return false;
1569
1570 // CmpInstr is a Compare instruction if destination register is not used.
1571 if (!MRI->use_nodbg_empty(RegNo: CmpInstr.getOperand(i: 0).getReg()))
1572 return false;
1573
1574 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, MRI: *MRI))
1575 return true;
1576 return (CmpValue == 0 || CmpValue == 1) &&
1577 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, MRI: *MRI);
1578}
1579
/// Get the opcode of the S (flag-setting) version of Instr.
/// If Instr is already the S version, its opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
/// version or we are not interested in it.
1584static unsigned sForm(MachineInstr &Instr) {
1585 switch (Instr.getOpcode()) {
1586 default:
1587 return AArch64::INSTRUCTION_LIST_END;
1588
1589 case AArch64::ADDSWrr:
1590 case AArch64::ADDSWri:
1591 case AArch64::ADDSXrr:
1592 case AArch64::ADDSXri:
1593 case AArch64::SUBSWrr:
1594 case AArch64::SUBSWri:
1595 case AArch64::SUBSXrr:
1596 case AArch64::SUBSXri:
1597 return Instr.getOpcode();
1598
1599 case AArch64::ADDWrr:
1600 return AArch64::ADDSWrr;
1601 case AArch64::ADDWri:
1602 return AArch64::ADDSWri;
1603 case AArch64::ADDXrr:
1604 return AArch64::ADDSXrr;
1605 case AArch64::ADDXri:
1606 return AArch64::ADDSXri;
1607 case AArch64::ADCWr:
1608 return AArch64::ADCSWr;
1609 case AArch64::ADCXr:
1610 return AArch64::ADCSXr;
1611 case AArch64::SUBWrr:
1612 return AArch64::SUBSWrr;
1613 case AArch64::SUBWri:
1614 return AArch64::SUBSWri;
1615 case AArch64::SUBXrr:
1616 return AArch64::SUBSXrr;
1617 case AArch64::SUBXri:
1618 return AArch64::SUBSXri;
1619 case AArch64::SBCWr:
1620 return AArch64::SBCSWr;
1621 case AArch64::SBCXr:
1622 return AArch64::SBCSXr;
1623 case AArch64::ANDWri:
1624 return AArch64::ANDSWri;
1625 case AArch64::ANDXri:
1626 return AArch64::ANDSXri;
1627 }
1628}
1629
1630/// Check if AArch64::NZCV should be alive in successors of MBB.
1631static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
1632 for (auto *BB : MBB->successors())
1633 if (BB->isLiveIn(Reg: AArch64::NZCV))
1634 return true;
1635 return false;
1636}
1637
1638/// \returns The condition code operand index for \p Instr if it is a branch
1639/// or select and -1 otherwise.
1640static int
1641findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
1642 switch (Instr.getOpcode()) {
1643 default:
1644 return -1;
1645
1646 case AArch64::Bcc: {
1647 int Idx = Instr.findRegisterUseOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr);
1648 assert(Idx >= 2);
1649 return Idx - 2;
1650 }
1651
1652 case AArch64::CSINVWr:
1653 case AArch64::CSINVXr:
1654 case AArch64::CSINCWr:
1655 case AArch64::CSINCXr:
1656 case AArch64::CSELWr:
1657 case AArch64::CSELXr:
1658 case AArch64::CSNEGWr:
1659 case AArch64::CSNEGXr:
1660 case AArch64::FCSELSrrr:
1661 case AArch64::FCSELDrrr: {
1662 int Idx = Instr.findRegisterUseOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr);
1663 assert(Idx >= 1);
1664 return Idx - 1;
1665 }
1666 }
1667}
1668
1669/// Find a condition code used by the instruction.
1670/// Returns AArch64CC::Invalid if either the instruction does not use condition
1671/// codes or we don't optimize CmpInstr in the presence of such instructions.
1672static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1673 int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
1674 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
1675 Instr.getOperand(i: CCIdx).getImm())
1676 : AArch64CC::Invalid;
1677}
1678
1679static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1680 assert(CC != AArch64CC::Invalid);
1681 UsedNZCV UsedFlags;
1682 switch (CC) {
1683 default:
1684 break;
1685
1686 case AArch64CC::EQ: // Z set
1687 case AArch64CC::NE: // Z clear
1688 UsedFlags.Z = true;
1689 break;
1690
1691 case AArch64CC::HI: // Z clear and C set
1692 case AArch64CC::LS: // Z set or C clear
1693 UsedFlags.Z = true;
1694 [[fallthrough]];
1695 case AArch64CC::HS: // C set
1696 case AArch64CC::LO: // C clear
1697 UsedFlags.C = true;
1698 break;
1699
1700 case AArch64CC::MI: // N set
1701 case AArch64CC::PL: // N clear
1702 UsedFlags.N = true;
1703 break;
1704
1705 case AArch64CC::VS: // V set
1706 case AArch64CC::VC: // V clear
1707 UsedFlags.V = true;
1708 break;
1709
1710 case AArch64CC::GT: // Z clear, N and V the same
1711 case AArch64CC::LE: // Z set, N and V differ
1712 UsedFlags.Z = true;
1713 [[fallthrough]];
1714 case AArch64CC::GE: // N and V the same
1715 case AArch64CC::LT: // N and V differ
1716 UsedFlags.N = true;
1717 UsedFlags.V = true;
1718 break;
1719 }
1720 return UsedFlags;
1721}
1722
/// \returns The condition flags used after \p CmpInstr in its MachineBB if
/// \p MI and \p CmpInstr are in the same basic block and the NZCV flags are
/// not alive in that block's successors.
/// \returns std::nullopt otherwise.
///
/// If \p CCUseInstrs is provided, collect the instructions using those flags
/// into it.
1728std::optional<UsedNZCV>
1729llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
1730 const TargetRegisterInfo &TRI,
1731 SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
1732 MachineBasicBlock *CmpParent = CmpInstr.getParent();
1733 if (MI.getParent() != CmpParent)
1734 return std::nullopt;
1735
1736 if (areCFlagsAliveInSuccessors(MBB: CmpParent))
1737 return std::nullopt;
1738
1739 UsedNZCV NZCVUsedAfterCmp;
1740 for (MachineInstr &Instr : instructionsWithoutDebug(
1741 It: std::next(x: CmpInstr.getIterator()), End: CmpParent->instr_end())) {
1742 if (Instr.readsRegister(Reg: AArch64::NZCV, TRI: &TRI)) {
1743 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1744 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1745 return std::nullopt;
1746 NZCVUsedAfterCmp |= getUsedNZCV(CC);
1747 if (CCUseInstrs)
1748 CCUseInstrs->push_back(Elt: &Instr);
1749 }
1750 if (Instr.modifiesRegister(Reg: AArch64::NZCV, TRI: &TRI))
1751 break;
1752 }
1753 return NZCVUsedAfterCmp;
1754}
1755
1756static bool isADDSRegImm(unsigned Opcode) {
1757 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1758}
1759
1760static bool isSUBSRegImm(unsigned Opcode) {
1761 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1762}
1763
/// Check if CmpInstr can be substituted by MI.
///
/// CmpInstr can be substituted when:
/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
/// - and, MI and CmpInstr are in the same MachineBB
/// - and, condition flags are not alive in successors of the CmpInstr parent
/// - and, if MI opcode is the S form, there are no defs of flags between
///        MI and CmpInstr,
///        or, if MI opcode is not the S form, there are neither defs nor uses
///        of flags between MI and CmpInstr
/// - and, the C flag is not used after CmpInstr, and the V flag is either not
///        used after CmpInstr or MI produces a poison value on signed overflow
///        (i.e. it has the no-signed-wrap flag).
1777static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
1778 const TargetRegisterInfo &TRI) {
  // NOTE: this assertion guarantees that MI.getOpcode() is an add or subtract
  // that may or may not set flags.
1781 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
1782
1783 const unsigned CmpOpcode = CmpInstr.getOpcode();
1784 if (!isADDSRegImm(Opcode: CmpOpcode) && !isSUBSRegImm(Opcode: CmpOpcode))
1785 return false;
1786
1787 assert((CmpInstr.getOperand(2).isImm() &&
1788 CmpInstr.getOperand(2).getImm() == 0) &&
1789 "Caller guarantees that CmpInstr compares with constant 0");
1790
1791 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
1792 if (!NZVCUsed || NZVCUsed->C)
1793 return false;
1794
1795 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
1796 // '%vreg = add ...' or '%vreg = sub ...'.
1797 // Condition flag V is used to indicate signed overflow.
1798 // 1) MI and CmpInstr set N and V to the same value.
1799 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
1800 // signed overflow occurs, so CmpInstr could still be simplified away.
1801 if (NZVCUsed->V && !MI.getFlag(Flag: MachineInstr::NoSWrap))
1802 return false;
1803
1804 AccessKind AccessToCheck = AK_Write;
1805 if (sForm(Instr&: MI) != MI.getOpcode())
1806 AccessToCheck = AK_All;
1807 return !areCFlagsAccessedBetweenInstrs(From: &MI, To: &CmpInstr, TRI: &TRI, AccessToCheck);
1808}
1809
1810/// Substitute an instruction comparing to zero with another instruction
1811/// which produces needed condition flags.
1812///
1813/// Return true on success.
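///
/// For example (illustrative, register names are arbitrary):
/// \code
///   sub w8, w1, w2
///   cmp w8, #0       ; i.e. subs wzr, w8, #0
///   b.eq target
/// \endcode
/// becomes
/// \code
///   subs w8, w1, w2
///   b.eq target
/// \endcode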
1814bool AArch64InstrInfo::substituteCmpToZero(
1815 MachineInstr &CmpInstr, unsigned SrcReg,
1816 const MachineRegisterInfo &MRI) const {
1817 // Get the unique definition of SrcReg.
1818 MachineInstr *MI = MRI.getUniqueVRegDef(Reg: SrcReg);
1819 if (!MI)
1820 return false;
1821
1822 const TargetRegisterInfo &TRI = getRegisterInfo();
1823
1824 unsigned NewOpc = sForm(Instr&: *MI);
1825 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1826 return false;
1827
1828 if (!canInstrSubstituteCmpInstr(MI&: *MI, CmpInstr, TRI))
1829 return false;
1830
1831 // Update the instruction to set NZCV.
1832 MI->setDesc(get(Opcode: NewOpc));
1833 CmpInstr.eraseFromParent();
1834 bool succeeded = UpdateOperandRegClass(Instr&: *MI);
1835 (void)succeeded;
1836 assert(succeeded && "Some operands reg class are incompatible!");
1837 MI->addRegisterDefined(Reg: AArch64::NZCV, RegInfo: &TRI);
1838 return true;
1839}
1840
1841/// \returns True if \p CmpInstr can be removed.
1842///
1843/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
1844/// codes used in \p CCUseInstrs must be inverted.
1845static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
1846 int CmpValue, const TargetRegisterInfo &TRI,
1847 SmallVectorImpl<MachineInstr *> &CCUseInstrs,
1848 bool &IsInvertCC) {
1849 assert((CmpValue == 0 || CmpValue == 1) &&
1850 "Only comparisons to 0 or 1 considered for removal!");
1851
1852 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
1853 unsigned MIOpc = MI.getOpcode();
1854 if (MIOpc == AArch64::CSINCWr) {
1855 if (MI.getOperand(i: 1).getReg() != AArch64::WZR ||
1856 MI.getOperand(i: 2).getReg() != AArch64::WZR)
1857 return false;
1858 } else if (MIOpc == AArch64::CSINCXr) {
1859 if (MI.getOperand(i: 1).getReg() != AArch64::XZR ||
1860 MI.getOperand(i: 2).getReg() != AArch64::XZR)
1861 return false;
1862 } else {
1863 return false;
1864 }
1865 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(Instr: MI);
1866 if (MICC == AArch64CC::Invalid)
1867 return false;
1868
  // MI must not have a dead def of NZCV.
1870 if (MI.findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true) != -1)
1871 return false;
1872
1873 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
1874 const unsigned CmpOpcode = CmpInstr.getOpcode();
1875 bool IsSubsRegImm = isSUBSRegImm(Opcode: CmpOpcode);
1876 if (CmpValue && !IsSubsRegImm)
1877 return false;
1878 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(Opcode: CmpOpcode))
1879 return false;
1880
1881 // MI conditions allowed: eq, ne, mi, pl
1882 UsedNZCV MIUsedNZCV = getUsedNZCV(CC: MICC);
1883 if (MIUsedNZCV.C || MIUsedNZCV.V)
1884 return false;
1885
1886 std::optional<UsedNZCV> NZCVUsedAfterCmp =
1887 examineCFlagsUse(MI, CmpInstr, TRI, CCUseInstrs: &CCUseInstrs);
  // Condition flags must not be used in the successors of CmpInstr's basic
  // block, and only the Z or N flags may be used after CmpInstr within its
  // basic block.
1890 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
1891 return false;
1892 // Z or N flag used after CmpInstr must correspond to the flag used in MI
1893 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
1894 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
1895 return false;
  // If CmpInstr is a comparison to zero, MI conditions are limited to eq, ne.
1897 if (MIUsedNZCV.N && !CmpValue)
1898 return false;
1899
1900 // There must be no defs of flags between MI and CmpInstr
1901 if (areCFlagsAccessedBetweenInstrs(From: &MI, To: &CmpInstr, TRI: &TRI, AccessToCheck: AK_Write))
1902 return false;
1903
1904 // Condition code is inverted in the following cases:
1905 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1906 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
1907 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
1908 (!CmpValue && MICC == AArch64CC::NE);
1909 return true;
1910}
1911
1912/// Remove comparison in csinc-cmp sequence
1913///
1914/// Examples:
1915/// 1. \code
1916/// csinc w9, wzr, wzr, ne
1917/// cmp w9, #0
1918/// b.eq
1919/// \endcode
1920/// to
1921/// \code
1922/// csinc w9, wzr, wzr, ne
1923/// b.ne
1924/// \endcode
1925///
1926/// 2. \code
1927/// csinc x2, xzr, xzr, mi
1928/// cmp x2, #1
1929/// b.pl
1930/// \endcode
1931/// to
1932/// \code
1933/// csinc x2, xzr, xzr, mi
1934/// b.pl
1935/// \endcode
1936///
1937/// \param CmpInstr comparison instruction
1938/// \return True when comparison removed
1939bool AArch64InstrInfo::removeCmpToZeroOrOne(
1940 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
1941 const MachineRegisterInfo &MRI) const {
1942 MachineInstr *MI = MRI.getUniqueVRegDef(Reg: SrcReg);
1943 if (!MI)
1944 return false;
1945 const TargetRegisterInfo &TRI = getRegisterInfo();
1946 SmallVector<MachineInstr *, 4> CCUseInstrs;
1947 bool IsInvertCC = false;
1948 if (!canCmpInstrBeRemoved(MI&: *MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
1949 IsInvertCC))
1950 return false;
1951 // Make transformation
1952 CmpInstr.eraseFromParent();
1953 if (IsInvertCC) {
1954 // Invert condition codes in CmpInstr CC users
1955 for (MachineInstr *CCUseInstr : CCUseInstrs) {
1956 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(Instr: *CCUseInstr);
1957 assert(Idx >= 0 && "Unexpected instruction using CC.");
1958 MachineOperand &CCOperand = CCUseInstr->getOperand(i: Idx);
1959 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
1960 Code: static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
1961 CCOperand.setImm(CCUse);
1962 }
1963 }
1964 return true;
1965}
1966
1967bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1968 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1969 MI.getOpcode() != AArch64::CATCHRET)
1970 return false;
1971
1972 MachineBasicBlock &MBB = *MI.getParent();
1973 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
1974 auto TRI = Subtarget.getRegisterInfo();
1975 DebugLoc DL = MI.getDebugLoc();
1976
1977 if (MI.getOpcode() == AArch64::CATCHRET) {
1978 // Skip to the first instruction before the epilog.
1979 const TargetInstrInfo *TII =
1980 MBB.getParent()->getSubtarget().getInstrInfo();
1981 MachineBasicBlock *TargetMBB = MI.getOperand(i: 0).getMBB();
1982 auto MBBI = MachineBasicBlock::iterator(MI);
1983 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(x: MBBI);
1984 while (FirstEpilogSEH->getFlag(Flag: MachineInstr::FrameDestroy) &&
1985 FirstEpilogSEH != MBB.begin())
1986 FirstEpilogSEH = std::prev(x: FirstEpilogSEH);
1987 if (FirstEpilogSEH != MBB.begin())
1988 FirstEpilogSEH = std::next(x: FirstEpilogSEH);
1989 BuildMI(BB&: MBB, I: FirstEpilogSEH, MIMD: DL, MCID: TII->get(Opcode: AArch64::ADRP))
1990 .addReg(RegNo: AArch64::X0, flags: RegState::Define)
1991 .addMBB(MBB: TargetMBB);
1992 BuildMI(BB&: MBB, I: FirstEpilogSEH, MIMD: DL, MCID: TII->get(Opcode: AArch64::ADDXri))
1993 .addReg(RegNo: AArch64::X0, flags: RegState::Define)
1994 .addReg(RegNo: AArch64::X0)
1995 .addMBB(MBB: TargetMBB)
1996 .addImm(Val: 0);
1997 return true;
1998 }
1999
2000 Register Reg = MI.getOperand(i: 0).getReg();
2001 Module &M = *MBB.getParent()->getFunction().getParent();
2002 if (M.getStackProtectorGuard() == "sysreg") {
2003 const AArch64SysReg::SysReg *SrcReg =
2004 AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
2005 if (!SrcReg)
2006 report_fatal_error(reason: "Unknown SysReg for Stack Protector Guard Register");
2007
2008 // mrs xN, sysreg
2009 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MRS))
2010 .addDef(RegNo: Reg, Flags: RegState::Renamable)
2011 .addImm(Val: SrcReg->Encoding);
2012 int Offset = M.getStackProtectorGuardOffset();
2013 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2014 // ldr xN, [xN, #offset]
2015 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui))
2016 .addDef(RegNo: Reg)
2017 .addUse(RegNo: Reg, Flags: RegState::Kill)
2018 .addImm(Val: Offset / 8);
2019 } else if (Offset >= -256 && Offset <= 255) {
2020 // ldur xN, [xN, #offset]
2021 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDURXi))
2022 .addDef(RegNo: Reg)
2023 .addUse(RegNo: Reg, Flags: RegState::Kill)
2024 .addImm(Val: Offset);
2025 } else if (Offset >= -4095 && Offset <= 4095) {
2026 if (Offset > 0) {
2027 // add xN, xN, #offset
2028 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::ADDXri))
2029 .addDef(RegNo: Reg)
2030 .addUse(RegNo: Reg, Flags: RegState::Kill)
2031 .addImm(Val: Offset)
2032 .addImm(Val: 0);
2033 } else {
2034 // sub xN, xN, #offset
2035 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::SUBXri))
2036 .addDef(RegNo: Reg)
2037 .addUse(RegNo: Reg, Flags: RegState::Kill)
2038 .addImm(Val: -Offset)
2039 .addImm(Val: 0);
2040 }
2041 // ldr xN, [xN]
2042 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui))
2043 .addDef(RegNo: Reg)
2044 .addUse(RegNo: Reg, Flags: RegState::Kill)
2045 .addImm(Val: 0);
2046 } else {
      // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
      // than 32760.
2049 // It might be nice to use AArch64::MOVi32imm here, which would get
2050 // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2051 // contains the MRS result. findScratchNonCalleeSaveRegister() in
2052 // AArch64FrameLowering might help us find such a scratch register
2053 // though. If we failed to find a scratch register, we could emit a
2054 // stream of add instructions to build up the immediate. Or, we could try
2055 // to insert a AArch64::MOVi32imm before register allocation so that we
2056 // didn't need to scavenge for a scratch register.
2057 report_fatal_error(reason: "Unable to encode Stack Protector Guard Offset");
2058 }
2059 MBB.erase(I: MI);
2060 return true;
2061 }
2062
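  // Otherwise MI is a LOAD_STACK_GUARD whose guard value lives in a global:
  // materialize the global's address and load the guard value from it.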
2063 const GlobalValue *GV =
2064 cast<GlobalValue>(Val: (*MI.memoperands_begin())->getValue());
2065 const TargetMachine &TM = MBB.getParent()->getTarget();
2066 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2067 const unsigned char MO_NC = AArch64II::MO_NC;
2068
2069 if ((OpFlags & AArch64II::MO_GOT) != 0) {
2070 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LOADgot), DestReg: Reg)
2071 .addGlobalAddress(GV, Offset: 0, TargetFlags: OpFlags);
2072 if (Subtarget.isTargetILP32()) {
2073 unsigned Reg32 = TRI->getSubReg(Reg, Idx: AArch64::sub_32);
2074 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRWui))
2075 .addDef(RegNo: Reg32, Flags: RegState::Dead)
2076 .addUse(RegNo: Reg, Flags: RegState::Kill)
2077 .addImm(Val: 0)
2078 .addMemOperand(MMO: *MI.memoperands_begin())
2079 .addDef(RegNo: Reg, Flags: RegState::Implicit);
2080 } else {
2081 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui), DestReg: Reg)
2082 .addReg(RegNo: Reg, flags: RegState::Kill)
2083 .addImm(Val: 0)
2084 .addMemOperand(MMO: *MI.memoperands_begin());
2085 }
2086 } else if (TM.getCodeModel() == CodeModel::Large) {
2087 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
2088 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MOVZXi), DestReg: Reg)
2089 .addGlobalAddress(GV, Offset: 0, TargetFlags: AArch64II::MO_G0 | MO_NC)
2090 .addImm(Val: 0);
2091 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MOVKXi), DestReg: Reg)
2092 .addReg(RegNo: Reg, flags: RegState::Kill)
2093 .addGlobalAddress(GV, Offset: 0, TargetFlags: AArch64II::MO_G1 | MO_NC)
2094 .addImm(Val: 16);
2095 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MOVKXi), DestReg: Reg)
2096 .addReg(RegNo: Reg, flags: RegState::Kill)
2097 .addGlobalAddress(GV, Offset: 0, TargetFlags: AArch64II::MO_G2 | MO_NC)
2098 .addImm(Val: 32);
2099 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MOVKXi), DestReg: Reg)
2100 .addReg(RegNo: Reg, flags: RegState::Kill)
2101 .addGlobalAddress(GV, Offset: 0, TargetFlags: AArch64II::MO_G3)
2102 .addImm(Val: 48);
2103 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui), DestReg: Reg)
2104 .addReg(RegNo: Reg, flags: RegState::Kill)
2105 .addImm(Val: 0)
2106 .addMemOperand(MMO: *MI.memoperands_begin());
2107 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2108 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::ADR), DestReg: Reg)
2109 .addGlobalAddress(GV, Offset: 0, TargetFlags: OpFlags);
2110 } else {
2111 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::ADRP), DestReg: Reg)
2112 .addGlobalAddress(GV, Offset: 0, TargetFlags: OpFlags | AArch64II::MO_PAGE);
2113 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2114 if (Subtarget.isTargetILP32()) {
2115 unsigned Reg32 = TRI->getSubReg(Reg, Idx: AArch64::sub_32);
2116 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRWui))
2117 .addDef(RegNo: Reg32, Flags: RegState::Dead)
2118 .addUse(RegNo: Reg, Flags: RegState::Kill)
2119 .addGlobalAddress(GV, Offset: 0, TargetFlags: LoFlags)
2120 .addMemOperand(MMO: *MI.memoperands_begin())
2121 .addDef(RegNo: Reg, Flags: RegState::Implicit);
2122 } else {
2123 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui), DestReg: Reg)
2124 .addReg(RegNo: Reg, flags: RegState::Kill)
2125 .addGlobalAddress(GV, Offset: 0, TargetFlags: LoFlags)
2126 .addMemOperand(MMO: *MI.memoperands_begin());
2127 }
2128 }
2129
2130 MBB.erase(I: MI);
2131
2132 return true;
2133}
2134
2135// Return true if this instruction simply sets its single destination register
2136// to zero. This is equivalent to a register rename of the zero-register.
2137bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
2138 switch (MI.getOpcode()) {
2139 default:
2140 break;
2141 case AArch64::MOVZWi:
2142 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2143 if (MI.getOperand(i: 1).isImm() && MI.getOperand(i: 1).getImm() == 0) {
2144 assert(MI.getDesc().getNumOperands() == 3 &&
2145 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2146 return true;
2147 }
2148 break;
2149 case AArch64::ANDWri: // and Rd, Rzr, #imm
2150 return MI.getOperand(i: 1).getReg() == AArch64::WZR;
2151 case AArch64::ANDXri:
2152 return MI.getOperand(i: 1).getReg() == AArch64::XZR;
2153 case TargetOpcode::COPY:
2154 return MI.getOperand(i: 1).getReg() == AArch64::WZR;
2155 }
2156 return false;
2157}
2158
2159// Return true if this instruction simply renames a general register without
2160// modifying bits.
2161bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
2162 switch (MI.getOpcode()) {
2163 default:
2164 break;
2165 case TargetOpcode::COPY: {
    // GPR32 copies will be lowered to ORRXrs
2167 Register DstReg = MI.getOperand(i: 0).getReg();
2168 return (AArch64::GPR32RegClass.contains(Reg: DstReg) ||
2169 AArch64::GPR64RegClass.contains(Reg: DstReg));
2170 }
2171 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2172 if (MI.getOperand(i: 1).getReg() == AArch64::XZR) {
2173 assert(MI.getDesc().getNumOperands() == 4 &&
2174 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2175 return true;
2176 }
2177 break;
2178 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2179 if (MI.getOperand(i: 2).getImm() == 0) {
2180 assert(MI.getDesc().getNumOperands() == 4 &&
2181 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2182 return true;
2183 }
2184 break;
2185 }
2186 return false;
2187}
2188
// Return true if this instruction simply renames an FP register without
// modifying bits.
2191bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
2192 switch (MI.getOpcode()) {
2193 default:
2194 break;
2195 case TargetOpcode::COPY: {
2196 Register DstReg = MI.getOperand(i: 0).getReg();
2197 return AArch64::FPR128RegClass.contains(Reg: DstReg);
2198 }
2199 case AArch64::ORRv16i8:
2200 if (MI.getOperand(i: 1).getReg() == MI.getOperand(i: 2).getReg()) {
2201 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2202 "invalid ORRv16i8 operands");
2203 return true;
2204 }
2205 break;
2206 }
2207 return false;
2208}
2209
2210Register AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
2211 int &FrameIndex) const {
2212 switch (MI.getOpcode()) {
2213 default:
2214 break;
2215 case AArch64::LDRWui:
2216 case AArch64::LDRXui:
2217 case AArch64::LDRBui:
2218 case AArch64::LDRHui:
2219 case AArch64::LDRSui:
2220 case AArch64::LDRDui:
2221 case AArch64::LDRQui:
2222 case AArch64::LDR_PXI:
2223 if (MI.getOperand(i: 0).getSubReg() == 0 && MI.getOperand(i: 1).isFI() &&
2224 MI.getOperand(i: 2).isImm() && MI.getOperand(i: 2).getImm() == 0) {
2225 FrameIndex = MI.getOperand(i: 1).getIndex();
2226 return MI.getOperand(i: 0).getReg();
2227 }
2228 break;
2229 }
2230
2231 return 0;
2232}
2233
2234Register AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
2235 int &FrameIndex) const {
2236 switch (MI.getOpcode()) {
2237 default:
2238 break;
2239 case AArch64::STRWui:
2240 case AArch64::STRXui:
2241 case AArch64::STRBui:
2242 case AArch64::STRHui:
2243 case AArch64::STRSui:
2244 case AArch64::STRDui:
2245 case AArch64::STRQui:
2246 case AArch64::STR_PXI:
2247 if (MI.getOperand(i: 0).getSubReg() == 0 && MI.getOperand(i: 1).isFI() &&
2248 MI.getOperand(i: 2).isImm() && MI.getOperand(i: 2).getImm() == 0) {
2249 FrameIndex = MI.getOperand(i: 1).getIndex();
2250 return MI.getOperand(i: 0).getReg();
2251 }
2252 break;
2253 }
2254 return 0;
2255}
2256
2257/// Check all MachineMemOperands for a hint to suppress pairing.
2258bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2259 return llvm::any_of(Range: MI.memoperands(), P: [](MachineMemOperand *MMO) {
2260 return MMO->getFlags() & MOSuppressPair;
2261 });
2262}
2263
2264/// Set a flag on the first MachineMemOperand to suppress pairing.
2265void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2266 if (MI.memoperands_empty())
2267 return;
2268 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2269}
2270
2271/// Check all MachineMemOperands for a hint that the load/store is strided.
2272bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
2273 return llvm::any_of(Range: MI.memoperands(), P: [](MachineMemOperand *MMO) {
2274 return MMO->getFlags() & MOStridedAccess;
2275 });
2276}
2277
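// Return true if the opcode takes an unscaled signed immediate offset; this
// covers the LDUR/STUR family as well as the pre-indexed forms listed below.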
2278bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
2279 switch (Opc) {
2280 default:
2281 return false;
2282 case AArch64::STURSi:
2283 case AArch64::STRSpre:
2284 case AArch64::STURDi:
2285 case AArch64::STRDpre:
2286 case AArch64::STURQi:
2287 case AArch64::STRQpre:
2288 case AArch64::STURBBi:
2289 case AArch64::STURHHi:
2290 case AArch64::STURWi:
2291 case AArch64::STRWpre:
2292 case AArch64::STURXi:
2293 case AArch64::STRXpre:
2294 case AArch64::LDURSi:
2295 case AArch64::LDRSpre:
2296 case AArch64::LDURDi:
2297 case AArch64::LDRDpre:
2298 case AArch64::LDURQi:
2299 case AArch64::LDRQpre:
2300 case AArch64::LDURWi:
2301 case AArch64::LDRWpre:
2302 case AArch64::LDURXi:
2303 case AArch64::LDRXpre:
2304 case AArch64::LDRSWpre:
2305 case AArch64::LDURSWi:
2306 case AArch64::LDURHHi:
2307 case AArch64::LDURBBi:
2308 case AArch64::LDURSBWi:
2309 case AArch64::LDURSHWi:
2310 return true;
2311 }
2312}
2313
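// Map a scaled load/store (or prefetch) opcode to its unscaled counterpart,
// if one exists.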
2314std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2315 switch (Opc) {
2316 default: return {};
2317 case AArch64::PRFMui: return AArch64::PRFUMi;
2318 case AArch64::LDRXui: return AArch64::LDURXi;
2319 case AArch64::LDRWui: return AArch64::LDURWi;
2320 case AArch64::LDRBui: return AArch64::LDURBi;
2321 case AArch64::LDRHui: return AArch64::LDURHi;
2322 case AArch64::LDRSui: return AArch64::LDURSi;
2323 case AArch64::LDRDui: return AArch64::LDURDi;
2324 case AArch64::LDRQui: return AArch64::LDURQi;
2325 case AArch64::LDRBBui: return AArch64::LDURBBi;
2326 case AArch64::LDRHHui: return AArch64::LDURHHi;
2327 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2328 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2329 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2330 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2331 case AArch64::LDRSWui: return AArch64::LDURSWi;
2332 case AArch64::STRXui: return AArch64::STURXi;
2333 case AArch64::STRWui: return AArch64::STURWi;
2334 case AArch64::STRBui: return AArch64::STURBi;
2335 case AArch64::STRHui: return AArch64::STURHi;
2336 case AArch64::STRSui: return AArch64::STURSi;
2337 case AArch64::STRDui: return AArch64::STURDi;
2338 case AArch64::STRQui: return AArch64::STURQi;
2339 case AArch64::STRBBui: return AArch64::STURBBi;
2340 case AArch64::STRHHui: return AArch64::STURHHi;
2341 }
2342}
2343
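// Return the index of the operand that holds the immediate offset for the
// given opcode (2 by default, 3 for the pair and SVE forms listed below).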
2344unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
2345 switch (Opc) {
2346 default:
2347 return 2;
2348 case AArch64::LDPXi:
2349 case AArch64::LDPDi:
2350 case AArch64::STPXi:
2351 case AArch64::STPDi:
2352 case AArch64::LDNPXi:
2353 case AArch64::LDNPDi:
2354 case AArch64::STNPXi:
2355 case AArch64::STNPDi:
2356 case AArch64::LDPQi:
2357 case AArch64::STPQi:
2358 case AArch64::LDNPQi:
2359 case AArch64::STNPQi:
2360 case AArch64::LDPWi:
2361 case AArch64::LDPSi:
2362 case AArch64::STPWi:
2363 case AArch64::STPSi:
2364 case AArch64::LDNPWi:
2365 case AArch64::LDNPSi:
2366 case AArch64::STNPWi:
2367 case AArch64::STNPSi:
2368 case AArch64::LDG:
2369 case AArch64::STGPi:
2370
2371 case AArch64::LD1B_IMM:
2372 case AArch64::LD1B_H_IMM:
2373 case AArch64::LD1B_S_IMM:
2374 case AArch64::LD1B_D_IMM:
2375 case AArch64::LD1SB_H_IMM:
2376 case AArch64::LD1SB_S_IMM:
2377 case AArch64::LD1SB_D_IMM:
2378 case AArch64::LD1H_IMM:
2379 case AArch64::LD1H_S_IMM:
2380 case AArch64::LD1H_D_IMM:
2381 case AArch64::LD1SH_S_IMM:
2382 case AArch64::LD1SH_D_IMM:
2383 case AArch64::LD1W_IMM:
2384 case AArch64::LD1W_D_IMM:
2385 case AArch64::LD1SW_D_IMM:
2386 case AArch64::LD1D_IMM:
2387
2388 case AArch64::LD2B_IMM:
2389 case AArch64::LD2H_IMM:
2390 case AArch64::LD2W_IMM:
2391 case AArch64::LD2D_IMM:
2392 case AArch64::LD3B_IMM:
2393 case AArch64::LD3H_IMM:
2394 case AArch64::LD3W_IMM:
2395 case AArch64::LD3D_IMM:
2396 case AArch64::LD4B_IMM:
2397 case AArch64::LD4H_IMM:
2398 case AArch64::LD4W_IMM:
2399 case AArch64::LD4D_IMM:
2400
2401 case AArch64::ST1B_IMM:
2402 case AArch64::ST1B_H_IMM:
2403 case AArch64::ST1B_S_IMM:
2404 case AArch64::ST1B_D_IMM:
2405 case AArch64::ST1H_IMM:
2406 case AArch64::ST1H_S_IMM:
2407 case AArch64::ST1H_D_IMM:
2408 case AArch64::ST1W_IMM:
2409 case AArch64::ST1W_D_IMM:
2410 case AArch64::ST1D_IMM:
2411
2412 case AArch64::ST2B_IMM:
2413 case AArch64::ST2H_IMM:
2414 case AArch64::ST2W_IMM:
2415 case AArch64::ST2D_IMM:
2416 case AArch64::ST3B_IMM:
2417 case AArch64::ST3H_IMM:
2418 case AArch64::ST3W_IMM:
2419 case AArch64::ST3D_IMM:
2420 case AArch64::ST4B_IMM:
2421 case AArch64::ST4H_IMM:
2422 case AArch64::ST4W_IMM:
2423 case AArch64::ST4D_IMM:
2424
2425 case AArch64::LD1RB_IMM:
2426 case AArch64::LD1RB_H_IMM:
2427 case AArch64::LD1RB_S_IMM:
2428 case AArch64::LD1RB_D_IMM:
2429 case AArch64::LD1RSB_H_IMM:
2430 case AArch64::LD1RSB_S_IMM:
2431 case AArch64::LD1RSB_D_IMM:
2432 case AArch64::LD1RH_IMM:
2433 case AArch64::LD1RH_S_IMM:
2434 case AArch64::LD1RH_D_IMM:
2435 case AArch64::LD1RSH_S_IMM:
2436 case AArch64::LD1RSH_D_IMM:
2437 case AArch64::LD1RW_IMM:
2438 case AArch64::LD1RW_D_IMM:
2439 case AArch64::LD1RSW_IMM:
2440 case AArch64::LD1RD_IMM:
2441
2442 case AArch64::LDNT1B_ZRI:
2443 case AArch64::LDNT1H_ZRI:
2444 case AArch64::LDNT1W_ZRI:
2445 case AArch64::LDNT1D_ZRI:
2446 case AArch64::STNT1B_ZRI:
2447 case AArch64::STNT1H_ZRI:
2448 case AArch64::STNT1W_ZRI:
2449 case AArch64::STNT1D_ZRI:
2450
2451 case AArch64::LDNF1B_IMM:
2452 case AArch64::LDNF1B_H_IMM:
2453 case AArch64::LDNF1B_S_IMM:
2454 case AArch64::LDNF1B_D_IMM:
2455 case AArch64::LDNF1SB_H_IMM:
2456 case AArch64::LDNF1SB_S_IMM:
2457 case AArch64::LDNF1SB_D_IMM:
2458 case AArch64::LDNF1H_IMM:
2459 case AArch64::LDNF1H_S_IMM:
2460 case AArch64::LDNF1H_D_IMM:
2461 case AArch64::LDNF1SH_S_IMM:
2462 case AArch64::LDNF1SH_D_IMM:
2463 case AArch64::LDNF1W_IMM:
2464 case AArch64::LDNF1W_D_IMM:
2465 case AArch64::LDNF1SW_D_IMM:
2466 case AArch64::LDNF1D_IMM:
2467 return 3;
2468 case AArch64::ADDG:
2469 case AArch64::STGi:
2470 case AArch64::LDR_PXI:
2471 case AArch64::STR_PXI:
2472 return 2;
2473 }
2474}
2475
2476bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
2477 switch (MI.getOpcode()) {
2478 default:
2479 return false;
2480 // Scaled instructions.
2481 case AArch64::STRSui:
2482 case AArch64::STRDui:
2483 case AArch64::STRQui:
2484 case AArch64::STRXui:
2485 case AArch64::STRWui:
2486 case AArch64::LDRSui:
2487 case AArch64::LDRDui:
2488 case AArch64::LDRQui:
2489 case AArch64::LDRXui:
2490 case AArch64::LDRWui:
2491 case AArch64::LDRSWui:
2492 // Unscaled instructions.
2493 case AArch64::STURSi:
2494 case AArch64::STRSpre:
2495 case AArch64::STURDi:
2496 case AArch64::STRDpre:
2497 case AArch64::STURQi:
2498 case AArch64::STRQpre:
2499 case AArch64::STURWi:
2500 case AArch64::STRWpre:
2501 case AArch64::STURXi:
2502 case AArch64::STRXpre:
2503 case AArch64::LDURSi:
2504 case AArch64::LDRSpre:
2505 case AArch64::LDURDi:
2506 case AArch64::LDRDpre:
2507 case AArch64::LDURQi:
2508 case AArch64::LDRQpre:
2509 case AArch64::LDURWi:
2510 case AArch64::LDRWpre:
2511 case AArch64::LDURXi:
2512 case AArch64::LDRXpre:
2513 case AArch64::LDURSWi:
2514 case AArch64::LDRSWpre:
2515 return true;
2516 }
2517}
2518
2519bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
2520 switch (MI.getOpcode()) {
2521 default:
2522 assert((!MI.isCall() || !MI.isReturn()) &&
2523 "Unexpected instruction - was a new tail call opcode introduced?");
2524 return false;
2525 case AArch64::TCRETURNdi:
2526 case AArch64::TCRETURNri:
2527 case AArch64::TCRETURNrix16x17:
2528 case AArch64::TCRETURNrix17:
2529 case AArch64::TCRETURNrinotx16:
2530 case AArch64::TCRETURNriALL:
2531 case AArch64::AUTH_TCRETURN:
2532 case AArch64::AUTH_TCRETURN_BTI:
2533 return true;
2534 }
2535}
2536
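// Return the flag-setting (S) variant of the given opcode. Unlike sForm()
// above, this expects such a variant to exist and reaches llvm_unreachable
// if it does not.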
2537unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
2538 switch (Opc) {
2539 default:
2540 llvm_unreachable("Opcode has no flag setting equivalent!");
2541 // 32-bit cases:
2542 case AArch64::ADDWri:
2543 return AArch64::ADDSWri;
2544 case AArch64::ADDWrr:
2545 return AArch64::ADDSWrr;
2546 case AArch64::ADDWrs:
2547 return AArch64::ADDSWrs;
2548 case AArch64::ADDWrx:
2549 return AArch64::ADDSWrx;
2550 case AArch64::ANDWri:
2551 return AArch64::ANDSWri;
2552 case AArch64::ANDWrr:
2553 return AArch64::ANDSWrr;
2554 case AArch64::ANDWrs:
2555 return AArch64::ANDSWrs;
2556 case AArch64::BICWrr:
2557 return AArch64::BICSWrr;
2558 case AArch64::BICWrs:
2559 return AArch64::BICSWrs;
2560 case AArch64::SUBWri:
2561 return AArch64::SUBSWri;
2562 case AArch64::SUBWrr:
2563 return AArch64::SUBSWrr;
2564 case AArch64::SUBWrs:
2565 return AArch64::SUBSWrs;
2566 case AArch64::SUBWrx:
2567 return AArch64::SUBSWrx;
2568 // 64-bit cases:
2569 case AArch64::ADDXri:
2570 return AArch64::ADDSXri;
2571 case AArch64::ADDXrr:
2572 return AArch64::ADDSXrr;
2573 case AArch64::ADDXrs:
2574 return AArch64::ADDSXrs;
2575 case AArch64::ADDXrx:
2576 return AArch64::ADDSXrx;
2577 case AArch64::ANDXri:
2578 return AArch64::ANDSXri;
2579 case AArch64::ANDXrr:
2580 return AArch64::ANDSXrr;
2581 case AArch64::ANDXrs:
2582 return AArch64::ANDSXrs;
2583 case AArch64::BICXrr:
2584 return AArch64::BICSXrr;
2585 case AArch64::BICXrs:
2586 return AArch64::BICSXrs;
2587 case AArch64::SUBXri:
2588 return AArch64::SUBSXri;
2589 case AArch64::SUBXrr:
2590 return AArch64::SUBSXrr;
2591 case AArch64::SUBXrs:
2592 return AArch64::SUBSXrs;
2593 case AArch64::SUBXrx:
2594 return AArch64::SUBSXrx;
2595 // SVE instructions:
2596 case AArch64::AND_PPzPP:
2597 return AArch64::ANDS_PPzPP;
2598 case AArch64::BIC_PPzPP:
2599 return AArch64::BICS_PPzPP;
2600 case AArch64::EOR_PPzPP:
2601 return AArch64::EORS_PPzPP;
2602 case AArch64::NAND_PPzPP:
2603 return AArch64::NANDS_PPzPP;
2604 case AArch64::NOR_PPzPP:
2605 return AArch64::NORS_PPzPP;
2606 case AArch64::ORN_PPzPP:
2607 return AArch64::ORNS_PPzPP;
2608 case AArch64::ORR_PPzPP:
2609 return AArch64::ORRS_PPzPP;
2610 case AArch64::BRKA_PPzP:
2611 return AArch64::BRKAS_PPzP;
2612 case AArch64::BRKPA_PPzPP:
2613 return AArch64::BRKPAS_PPzPP;
2614 case AArch64::BRKB_PPzP:
2615 return AArch64::BRKBS_PPzP;
2616 case AArch64::BRKPB_PPzPP:
2617 return AArch64::BRKPBS_PPzPP;
2618 case AArch64::BRKN_PPzP:
2619 return AArch64::BRKNS_PPzP;
2620 case AArch64::RDFFR_PPz:
2621 return AArch64::RDFFRS_PPz;
2622 case AArch64::PTRUE_B:
2623 return AArch64::PTRUES_B;
2624 }
2625}
2626
2627// Is this a candidate for ld/st merging or pairing? For example, we don't
2628// touch volatiles or load/stores that have a hint to avoid pair formation.
2629bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
2630
2631 bool IsPreLdSt = isPreLdSt(MI);
2632
2633 // If this is a volatile load/store, don't mess with it.
2634 if (MI.hasOrderedMemoryRef())
2635 return false;
2636
2637 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
2638 // For Pre-inc LD/ST, the operand is shifted by one.
2639 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
2640 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
2641 "Expected a reg or frame index operand.");
2642
2643 // For Pre-indexed addressing quadword instructions, the third operand is the
2644 // immediate value.
2645 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(i: 3).isImm();
2646
2647 if (!MI.getOperand(i: 2).isImm() && !IsImmPreLdSt)
2648 return false;
2649
2650 // Can't merge/pair if the instruction modifies the base register.
2651 // e.g., ldr x0, [x0]
2652 // This case will never occur with an FI base.
2653 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
2654 // STR<S,D,Q,W,X>pre, it can be merged.
2655 // For example:
2656 // ldr q0, [x11, #32]!
2657 // ldr q1, [x11, #16]
2658 // to
2659 // ldp q0, q1, [x11, #32]!
2660 if (MI.getOperand(i: 1).isReg() && !IsPreLdSt) {
2661 Register BaseReg = MI.getOperand(i: 1).getReg();
2662 const TargetRegisterInfo *TRI = &getRegisterInfo();
2663 if (MI.modifiesRegister(Reg: BaseReg, TRI))
2664 return false;
2665 }
2666
2667 // Check if this load/store has a hint to avoid pair formation.
2668 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
2669 if (isLdStPairSuppressed(MI))
2670 return false;
2671
2672 // Do not pair any callee-save store/reload instructions in the
2673 // prologue/epilogue if the CFI information encoded the operations as separate
2674 // instructions, as that will cause the size of the actual prologue to mismatch
2675 // with the prologue size recorded in the Windows CFI.
2676 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
2677 bool NeedsWinCFI = MAI->usesWindowsCFI() &&
2678 MI.getMF()->getFunction().needsUnwindTableEntry();
2679 if (NeedsWinCFI && (MI.getFlag(Flag: MachineInstr::FrameSetup) ||
2680 MI.getFlag(Flag: MachineInstr::FrameDestroy)))
2681 return false;
2682
2683 // On some CPUs quad load/store pairs are slower than two single load/stores.
2684 if (Subtarget.isPaired128Slow()) {
2685 switch (MI.getOpcode()) {
2686 default:
2687 break;
2688 case AArch64::LDURQi:
2689 case AArch64::STURQi:
2690 case AArch64::LDRQui:
2691 case AArch64::STRQui:
2692 return false;
2693 }
2694 }
2695
2696 return true;
2697}
2698
2699bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
2700 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
2701 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
2702 const TargetRegisterInfo *TRI) const {
2703 if (!LdSt.mayLoadOrStore())
2704 return false;
2705
2706 const MachineOperand *BaseOp;
2707 TypeSize WidthN(0, false);
2708 if (!getMemOperandWithOffsetWidth(MI: LdSt, BaseOp, Offset, OffsetIsScalable,
2709 Width&: WidthN, TRI))
2710 return false;
2711 // The maximum vscale is 16 under AArch64, return the maximal extent for the
2712 // vector.
2713 Width = LocationSize::precise(Value: WidthN);
2714 BaseOps.push_back(Elt: BaseOp);
2715 return true;
2716}
2717
2718std::optional<ExtAddrMode>
2719AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
2720 const TargetRegisterInfo *TRI) const {
2721 const MachineOperand *Base; // Filled with the base operand of MI.
2722 int64_t Offset; // Filled with the offset of MI.
2723 bool OffsetIsScalable;
2724 if (!getMemOperandWithOffset(MI: MemI, BaseOp&: Base, Offset, OffsetIsScalable, TRI))
2725 return std::nullopt;
2726
2727 if (!Base->isReg())
2728 return std::nullopt;
2729 ExtAddrMode AM;
2730 AM.BaseReg = Base->getReg();
2731 AM.Displacement = Offset;
2732 AM.ScaledReg = 0;
2733 AM.Scale = 0;
2734 return AM;
2735}
2736
2737bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
2738 Register Reg,
2739 const MachineInstr &AddrI,
2740 ExtAddrMode &AM) const {
2741 // Filter out instructions into which we cannot fold.
2742 unsigned NumBytes;
2743 int64_t OffsetScale = 1;
2744 switch (MemI.getOpcode()) {
2745 default:
2746 return false;
2747
2748 case AArch64::LDURQi:
2749 case AArch64::STURQi:
2750 NumBytes = 16;
2751 break;
2752
2753 case AArch64::LDURDi:
2754 case AArch64::STURDi:
2755 case AArch64::LDURXi:
2756 case AArch64::STURXi:
2757 NumBytes = 8;
2758 break;
2759
2760 case AArch64::LDURWi:
2761 case AArch64::LDURSWi:
2762 case AArch64::STURWi:
2763 NumBytes = 4;
2764 break;
2765
2766 case AArch64::LDURHi:
2767 case AArch64::STURHi:
2768 case AArch64::LDURHHi:
2769 case AArch64::STURHHi:
2770 case AArch64::LDURSHXi:
2771 case AArch64::LDURSHWi:
2772 NumBytes = 2;
2773 break;
2774
2775 case AArch64::LDRBroX:
2776 case AArch64::LDRBBroX:
2777 case AArch64::LDRSBXroX:
2778 case AArch64::LDRSBWroX:
2779 case AArch64::STRBroX:
2780 case AArch64::STRBBroX:
2781 case AArch64::LDURBi:
2782 case AArch64::LDURBBi:
2783 case AArch64::LDURSBXi:
2784 case AArch64::LDURSBWi:
2785 case AArch64::STURBi:
2786 case AArch64::STURBBi:
2787 case AArch64::LDRBui:
2788 case AArch64::LDRBBui:
2789 case AArch64::LDRSBXui:
2790 case AArch64::LDRSBWui:
2791 case AArch64::STRBui:
2792 case AArch64::STRBBui:
2793 NumBytes = 1;
2794 break;
2795
2796 case AArch64::LDRQroX:
2797 case AArch64::STRQroX:
2798 case AArch64::LDRQui:
2799 case AArch64::STRQui:
2800 NumBytes = 16;
2801 OffsetScale = 16;
2802 break;
2803
2804 case AArch64::LDRDroX:
2805 case AArch64::STRDroX:
2806 case AArch64::LDRXroX:
2807 case AArch64::STRXroX:
2808 case AArch64::LDRDui:
2809 case AArch64::STRDui:
2810 case AArch64::LDRXui:
2811 case AArch64::STRXui:
2812 NumBytes = 8;
2813 OffsetScale = 8;
2814 break;
2815
2816 case AArch64::LDRWroX:
2817 case AArch64::LDRSWroX:
2818 case AArch64::STRWroX:
2819 case AArch64::LDRWui:
2820 case AArch64::LDRSWui:
2821 case AArch64::STRWui:
2822 NumBytes = 4;
2823 OffsetScale = 4;
2824 break;
2825
2826 case AArch64::LDRHroX:
2827 case AArch64::STRHroX:
2828 case AArch64::LDRHHroX:
2829 case AArch64::STRHHroX:
2830 case AArch64::LDRSHXroX:
2831 case AArch64::LDRSHWroX:
2832 case AArch64::LDRHui:
2833 case AArch64::STRHui:
2834 case AArch64::LDRHHui:
2835 case AArch64::STRHHui:
2836 case AArch64::LDRSHXui:
2837 case AArch64::LDRSHWui:
2838 NumBytes = 2;
2839 OffsetScale = 2;
2840 break;
2841 }
2842
2843 // Check the fold operand is not the loaded/stored value.
2844 const MachineOperand &BaseRegOp = MemI.getOperand(i: 0);
2845 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
2846 return false;
2847
2848 // Handle memory instructions with a [Reg, Reg] addressing mode.
2849 if (MemI.getOperand(i: 2).isReg()) {
2850 // Bail if the addressing mode already includes extension of the offset
2851 // register.
2852 if (MemI.getOperand(i: 3).getImm())
2853 return false;
2854
2855 // Check if we actually have a scaled offset.
2856 if (MemI.getOperand(i: 4).getImm() == 0)
2857 OffsetScale = 1;
2858
    // If the address instruction is folded into the base register, then the
2860 // addressing mode must not have a scale. Then we can swap the base and the
2861 // scaled registers.
2862 if (MemI.getOperand(i: 1).getReg() == Reg && OffsetScale != 1)
2863 return false;
2864
2865 switch (AddrI.getOpcode()) {
2866 default:
2867 return false;
2868
2869 case AArch64::SBFMXri:
2870 // sxtw Xa, Wm
2871 // ldr Xd, [Xn, Xa, lsl #N]
2872 // ->
2873 // ldr Xd, [Xn, Wm, sxtw #N]
2874 if (AddrI.getOperand(i: 2).getImm() != 0 ||
2875 AddrI.getOperand(i: 3).getImm() != 31)
2876 return false;
2877
2878 AM.BaseReg = MemI.getOperand(i: 1).getReg();
2879 if (AM.BaseReg == Reg)
2880 AM.BaseReg = MemI.getOperand(i: 2).getReg();
2881 AM.ScaledReg = AddrI.getOperand(i: 1).getReg();
2882 AM.Scale = OffsetScale;
2883 AM.Displacement = 0;
2884 AM.Form = ExtAddrMode::Formula::SExtScaledReg;
2885 return true;
2886
2887 case TargetOpcode::SUBREG_TO_REG: {
2888 // mov Wa, Wm
2889 // ldr Xd, [Xn, Xa, lsl #N]
2890 // ->
2891 // ldr Xd, [Xn, Wm, uxtw #N]
2892
2893 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
2894 if (AddrI.getOperand(i: 1).getImm() != 0 ||
2895 AddrI.getOperand(i: 3).getImm() != AArch64::sub_32)
2896 return false;
2897
2898 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
2899 Register OffsetReg = AddrI.getOperand(i: 2).getReg();
2900 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(RegNo: OffsetReg))
2901 return false;
2902
2903 const MachineInstr &DefMI = *MRI.getVRegDef(Reg: OffsetReg);
2904 if (DefMI.getOpcode() != AArch64::ORRWrs ||
2905 DefMI.getOperand(i: 1).getReg() != AArch64::WZR ||
2906 DefMI.getOperand(i: 3).getImm() != 0)
2907 return false;
2908
2909 AM.BaseReg = MemI.getOperand(i: 1).getReg();
2910 if (AM.BaseReg == Reg)
2911 AM.BaseReg = MemI.getOperand(i: 2).getReg();
2912 AM.ScaledReg = DefMI.getOperand(i: 2).getReg();
2913 AM.Scale = OffsetScale;
2914 AM.Displacement = 0;
2915 AM.Form = ExtAddrMode::Formula::ZExtScaledReg;
2916 return true;
2917 }
2918 }
2919 }
2920
2921 // Handle memory instructions with a [Reg, #Imm] addressing mode.
2922
2923 // Check we are not breaking a potential conversion to an LDP.
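  // LDP/STP immediates are 7-bit signed values scaled by the access size, so
  // the reachable offsets are [-256, 252] for 4-byte, [-512, 504] for 8-byte
  // and [-1024, 1008] for 16-byte accesses.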
2924 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
2925 int64_t NewOffset) -> bool {
2926 int64_t MinOffset, MaxOffset;
2927 switch (NumBytes) {
2928 default:
2929 return true;
2930 case 4:
2931 MinOffset = -256;
2932 MaxOffset = 252;
2933 break;
2934 case 8:
2935 MinOffset = -512;
2936 MaxOffset = 504;
2937 break;
2938 case 16:
2939 MinOffset = -1024;
2940 MaxOffset = 1008;
2941 break;
2942 }
2943 return OldOffset < MinOffset || OldOffset > MaxOffset ||
2944 (NewOffset >= MinOffset && NewOffset <= MaxOffset);
2945 };
2946 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
2947 int64_t OldOffset = MemI.getOperand(i: 2).getImm() * OffsetScale;
2948 int64_t NewOffset = OldOffset + Disp;
2949 if (!isLegalAddressingMode(NumBytes, Offset: NewOffset, /* Scale */ 0))
2950 return false;
2951 // If the old offset would fit into an LDP, but the new offset wouldn't,
2952 // bail out.
2953 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
2954 return false;
2955 AM.BaseReg = AddrI.getOperand(i: 1).getReg();
2956 AM.ScaledReg = 0;
2957 AM.Scale = 0;
2958 AM.Displacement = NewOffset;
2959 AM.Form = ExtAddrMode::Formula::Basic;
2960 return true;
2961 };
2962
2963 auto canFoldAddRegIntoAddrMode =
2964 [&](int64_t Scale,
2965 ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool {
2966 if (MemI.getOperand(i: 2).getImm() != 0)
2967 return false;
2968 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
2969 return false;
2970 AM.BaseReg = AddrI.getOperand(i: 1).getReg();
2971 AM.ScaledReg = AddrI.getOperand(i: 2).getReg();
2972 AM.Scale = Scale;
2973 AM.Displacement = 0;
2974 AM.Form = Form;
2975 return true;
2976 };
2977
2978 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
2979 unsigned Opcode = MemI.getOpcode();
2980 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
2981 Subtarget.isSTRQroSlow();
2982 };
2983
2984 int64_t Disp = 0;
2985 const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
2986 switch (AddrI.getOpcode()) {
2987 default:
2988 return false;
2989
2990 case AArch64::ADDXri:
2991 // add Xa, Xn, #N
2992 // ldr Xd, [Xa, #M]
2993 // ->
2994 // ldr Xd, [Xn, #N'+M]
2995 Disp = AddrI.getOperand(i: 2).getImm() << AddrI.getOperand(i: 3).getImm();
2996 return canFoldAddSubImmIntoAddrMode(Disp);
2997
2998 case AArch64::SUBXri:
2999 // sub Xa, Xn, #N
3000 // ldr Xd, [Xa, #M]
3001 // ->
3002 // ldr Xd, [Xn, #N'+M]
3003 Disp = AddrI.getOperand(i: 2).getImm() << AddrI.getOperand(i: 3).getImm();
3004 return canFoldAddSubImmIntoAddrMode(-Disp);
3005
3006 case AArch64::ADDXrs: {
3007 // add Xa, Xn, Xm, lsl #N
3008 // ldr Xd, [Xa]
3009 // ->
3010 // ldr Xd, [Xn, Xm, lsl #N]
3011
3012 // Don't fold the add if the result would be slower, unless optimising for
3013 // size.
3014 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(i: 3).getImm());
3015 if (AArch64_AM::getShiftType(Imm: Shift) != AArch64_AM::ShiftExtendType::LSL)
3016 return false;
3017 Shift = AArch64_AM::getShiftValue(Imm: Shift);
3018 if (!OptSize) {
3019 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3020 return false;
3021 if (avoidSlowSTRQ(MemI))
3022 return false;
3023 }
3024 return canFoldAddRegIntoAddrMode(1ULL << Shift);
3025 }
3026
3027 case AArch64::ADDXrr:
3028 // add Xa, Xn, Xm
3029 // ldr Xd, [Xa]
3030 // ->
3031 // ldr Xd, [Xn, Xm, lsl #0]
3032
3033 // Don't fold the add if the result would be slower, unless optimising for
3034 // size.
3035 if (!OptSize && avoidSlowSTRQ(MemI))
3036 return false;
3037 return canFoldAddRegIntoAddrMode(1);
3038
3039 case AArch64::ADDXrx:
3040 // add Xa, Xn, Wm, {s,u}xtw #N
3041 // ldr Xd, [Xa]
3042 // ->
3043 // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3044
3045 // Don't fold the add if the result would be slower, unless optimising for
3046 // size.
3047 if (!OptSize && avoidSlowSTRQ(MemI))
3048 return false;
3049
3050 // Can fold only sign-/zero-extend of a word.
3051 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(i: 3).getImm());
3052 AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);
3053 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3054 return false;
3055
3056 return canFoldAddRegIntoAddrMode(
3057 1ULL << AArch64_AM::getArithShiftValue(Imm),
3058 (Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg
3059 : ExtAddrMode::Formula::ZExtScaledReg);
3060 }
3061}
3062
3063// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3064// return the opcode of an instruction performing the same operation, but using
3065// the [Reg, Reg] addressing mode.
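// For example, both LDRXui and LDURXi map to LDRXroX.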
3066static unsigned regOffsetOpcode(unsigned Opcode) {
3067 switch (Opcode) {
3068 default:
3069 llvm_unreachable("Address folding not implemented for instruction");
3070
3071 case AArch64::LDURQi:
3072 case AArch64::LDRQui:
3073 return AArch64::LDRQroX;
3074 case AArch64::STURQi:
3075 case AArch64::STRQui:
3076 return AArch64::STRQroX;
3077 case AArch64::LDURDi:
3078 case AArch64::LDRDui:
3079 return AArch64::LDRDroX;
3080 case AArch64::STURDi:
3081 case AArch64::STRDui:
3082 return AArch64::STRDroX;
3083 case AArch64::LDURXi:
3084 case AArch64::LDRXui:
3085 return AArch64::LDRXroX;
3086 case AArch64::STURXi:
3087 case AArch64::STRXui:
3088 return AArch64::STRXroX;
3089 case AArch64::LDURWi:
3090 case AArch64::LDRWui:
3091 return AArch64::LDRWroX;
3092 case AArch64::LDURSWi:
3093 case AArch64::LDRSWui:
3094 return AArch64::LDRSWroX;
3095 case AArch64::STURWi:
3096 case AArch64::STRWui:
3097 return AArch64::STRWroX;
3098 case AArch64::LDURHi:
3099 case AArch64::LDRHui:
3100 return AArch64::LDRHroX;
3101 case AArch64::STURHi:
3102 case AArch64::STRHui:
3103 return AArch64::STRHroX;
3104 case AArch64::LDURHHi:
3105 case AArch64::LDRHHui:
3106 return AArch64::LDRHHroX;
3107 case AArch64::STURHHi:
3108 case AArch64::STRHHui:
3109 return AArch64::STRHHroX;
3110 case AArch64::LDURSHXi:
3111 case AArch64::LDRSHXui:
3112 return AArch64::LDRSHXroX;
3113 case AArch64::LDURSHWi:
3114 case AArch64::LDRSHWui:
3115 return AArch64::LDRSHWroX;
3116 case AArch64::LDURBi:
3117 case AArch64::LDRBui:
3118 return AArch64::LDRBroX;
3119 case AArch64::LDURBBi:
3120 case AArch64::LDRBBui:
3121 return AArch64::LDRBBroX;
3122 case AArch64::LDURSBXi:
3123 case AArch64::LDRSBXui:
3124 return AArch64::LDRSBXroX;
3125 case AArch64::LDURSBWi:
3126 case AArch64::LDRSBWui:
3127 return AArch64::LDRSBWroX;
3128 case AArch64::STURBi:
3129 case AArch64::STRBui:
3130 return AArch64::STRBroX;
3131 case AArch64::STURBBi:
3132 case AArch64::STRBBui:
3133 return AArch64::STRBBroX;
3134 }
3135}
3136
3137// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3138// the opcode of an instruction performing the same operation, but using the
3139// [Reg, #Imm] addressing mode with scaled offset.
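// For example, LDURXi maps to LDRXui with Scale = 8.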
static unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3141 switch (Opcode) {
3142 default:
3143 llvm_unreachable("Address folding not implemented for instruction");
3144
3145 case AArch64::LDURQi:
3146 Scale = 16;
3147 return AArch64::LDRQui;
3148 case AArch64::STURQi:
3149 Scale = 16;
3150 return AArch64::STRQui;
3151 case AArch64::LDURDi:
3152 Scale = 8;
3153 return AArch64::LDRDui;
3154 case AArch64::STURDi:
3155 Scale = 8;
3156 return AArch64::STRDui;
3157 case AArch64::LDURXi:
3158 Scale = 8;
3159 return AArch64::LDRXui;
3160 case AArch64::STURXi:
3161 Scale = 8;
3162 return AArch64::STRXui;
3163 case AArch64::LDURWi:
3164 Scale = 4;
3165 return AArch64::LDRWui;
3166 case AArch64::LDURSWi:
3167 Scale = 4;
3168 return AArch64::LDRSWui;
3169 case AArch64::STURWi:
3170 Scale = 4;
3171 return AArch64::STRWui;
3172 case AArch64::LDURHi:
3173 Scale = 2;
3174 return AArch64::LDRHui;
3175 case AArch64::STURHi:
3176 Scale = 2;
3177 return AArch64::STRHui;
3178 case AArch64::LDURHHi:
3179 Scale = 2;
3180 return AArch64::LDRHHui;
3181 case AArch64::STURHHi:
3182 Scale = 2;
3183 return AArch64::STRHHui;
3184 case AArch64::LDURSHXi:
3185 Scale = 2;
3186 return AArch64::LDRSHXui;
3187 case AArch64::LDURSHWi:
3188 Scale = 2;
3189 return AArch64::LDRSHWui;
3190 case AArch64::LDURBi:
3191 Scale = 1;
3192 return AArch64::LDRBui;
3193 case AArch64::LDURBBi:
3194 Scale = 1;
3195 return AArch64::LDRBBui;
3196 case AArch64::LDURSBXi:
3197 Scale = 1;
3198 return AArch64::LDRSBXui;
3199 case AArch64::LDURSBWi:
3200 Scale = 1;
3201 return AArch64::LDRSBWui;
3202 case AArch64::STURBi:
3203 Scale = 1;
3204 return AArch64::STRBui;
3205 case AArch64::STURBBi:
3206 Scale = 1;
3207 return AArch64::STRBBui;
3208 case AArch64::LDRQui:
3209 case AArch64::STRQui:
3210 Scale = 16;
3211 return Opcode;
3212 case AArch64::LDRDui:
3213 case AArch64::STRDui:
3214 case AArch64::LDRXui:
3215 case AArch64::STRXui:
3216 Scale = 8;
3217 return Opcode;
3218 case AArch64::LDRWui:
3219 case AArch64::LDRSWui:
3220 case AArch64::STRWui:
3221 Scale = 4;
3222 return Opcode;
3223 case AArch64::LDRHui:
3224 case AArch64::STRHui:
3225 case AArch64::LDRHHui:
3226 case AArch64::STRHHui:
3227 case AArch64::LDRSHXui:
3228 case AArch64::LDRSHWui:
3229 Scale = 2;
3230 return Opcode;
3231 case AArch64::LDRBui:
3232 case AArch64::LDRBBui:
3233 case AArch64::LDRSBXui:
3234 case AArch64::LDRSBWui:
3235 case AArch64::STRBui:
3236 case AArch64::STRBBui:
3237 Scale = 1;
3238 return Opcode;
3239 }
3240}
3241
3242// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3243// the opcode of an instruction performing the same operation, but using the
3244// [Reg, #Imm] addressing mode with unscaled offset.
static unsigned unscaledOffsetOpcode(unsigned Opcode) {
3246 switch (Opcode) {
3247 default:
3248 llvm_unreachable("Address folding not implemented for instruction");
3249
3250 case AArch64::LDURQi:
3251 case AArch64::STURQi:
3252 case AArch64::LDURDi:
3253 case AArch64::STURDi:
3254 case AArch64::LDURXi:
3255 case AArch64::STURXi:
3256 case AArch64::LDURWi:
3257 case AArch64::LDURSWi:
3258 case AArch64::STURWi:
3259 case AArch64::LDURHi:
3260 case AArch64::STURHi:
3261 case AArch64::LDURHHi:
3262 case AArch64::STURHHi:
3263 case AArch64::LDURSHXi:
3264 case AArch64::LDURSHWi:
3265 case AArch64::LDURBi:
3266 case AArch64::STURBi:
3267 case AArch64::LDURBBi:
3268 case AArch64::STURBBi:
3269 case AArch64::LDURSBWi:
3270 case AArch64::LDURSBXi:
3271 return Opcode;
3272 case AArch64::LDRQui:
3273 return AArch64::LDURQi;
3274 case AArch64::STRQui:
3275 return AArch64::STURQi;
3276 case AArch64::LDRDui:
3277 return AArch64::LDURDi;
3278 case AArch64::STRDui:
3279 return AArch64::STURDi;
3280 case AArch64::LDRXui:
3281 return AArch64::LDURXi;
3282 case AArch64::STRXui:
3283 return AArch64::STURXi;
3284 case AArch64::LDRWui:
3285 return AArch64::LDURWi;
3286 case AArch64::LDRSWui:
3287 return AArch64::LDURSWi;
3288 case AArch64::STRWui:
3289 return AArch64::STURWi;
3290 case AArch64::LDRHui:
3291 return AArch64::LDURHi;
3292 case AArch64::STRHui:
3293 return AArch64::STURHi;
3294 case AArch64::LDRHHui:
3295 return AArch64::LDURHHi;
3296 case AArch64::STRHHui:
3297 return AArch64::STURHHi;
3298 case AArch64::LDRSHXui:
3299 return AArch64::LDURSHXi;
3300 case AArch64::LDRSHWui:
3301 return AArch64::LDURSHWi;
3302 case AArch64::LDRBBui:
3303 return AArch64::LDURBBi;
3304 case AArch64::LDRBui:
3305 return AArch64::LDURBi;
3306 case AArch64::STRBBui:
3307 return AArch64::STURBBi;
3308 case AArch64::STRBui:
3309 return AArch64::STURBi;
3310 case AArch64::LDRSBWui:
3311 return AArch64::LDURSBWi;
3312 case AArch64::LDRSBXui:
3313 return AArch64::LDURSBXi;
3314 }
3315}
3316
3317// Given the opcode of a memory load/store instruction, return the opcode of an
3318// instruction performing the same operation, but using the
3319// [Reg, Reg, {s,u}xtw #N] addressing mode, i.e. with a sign- or zero-extended
3320// offset register.
3321static unsigned offsetExtendOpcode(unsigned Opcode) {
3322 switch (Opcode) {
3323 default:
3324 llvm_unreachable("Address folding not implemented for instruction");
3325
3326 case AArch64::LDRQroX:
3327 case AArch64::LDURQi:
3328 case AArch64::LDRQui:
3329 return AArch64::LDRQroW;
3330 case AArch64::STRQroX:
3331 case AArch64::STURQi:
3332 case AArch64::STRQui:
3333 return AArch64::STRQroW;
3334 case AArch64::LDRDroX:
3335 case AArch64::LDURDi:
3336 case AArch64::LDRDui:
3337 return AArch64::LDRDroW;
3338 case AArch64::STRDroX:
3339 case AArch64::STURDi:
3340 case AArch64::STRDui:
3341 return AArch64::STRDroW;
3342 case AArch64::LDRXroX:
3343 case AArch64::LDURXi:
3344 case AArch64::LDRXui:
3345 return AArch64::LDRXroW;
3346 case AArch64::STRXroX:
3347 case AArch64::STURXi:
3348 case AArch64::STRXui:
3349 return AArch64::STRXroW;
3350 case AArch64::LDRWroX:
3351 case AArch64::LDURWi:
3352 case AArch64::LDRWui:
3353 return AArch64::LDRWroW;
3354 case AArch64::LDRSWroX:
3355 case AArch64::LDURSWi:
3356 case AArch64::LDRSWui:
3357 return AArch64::LDRSWroW;
3358 case AArch64::STRWroX:
3359 case AArch64::STURWi:
3360 case AArch64::STRWui:
3361 return AArch64::STRWroW;
3362 case AArch64::LDRHroX:
3363 case AArch64::LDURHi:
3364 case AArch64::LDRHui:
3365 return AArch64::LDRHroW;
3366 case AArch64::STRHroX:
3367 case AArch64::STURHi:
3368 case AArch64::STRHui:
3369 return AArch64::STRHroW;
3370 case AArch64::LDRHHroX:
3371 case AArch64::LDURHHi:
3372 case AArch64::LDRHHui:
3373 return AArch64::LDRHHroW;
3374 case AArch64::STRHHroX:
3375 case AArch64::STURHHi:
3376 case AArch64::STRHHui:
3377 return AArch64::STRHHroW;
3378 case AArch64::LDRSHXroX:
3379 case AArch64::LDURSHXi:
3380 case AArch64::LDRSHXui:
3381 return AArch64::LDRSHXroW;
3382 case AArch64::LDRSHWroX:
3383 case AArch64::LDURSHWi:
3384 case AArch64::LDRSHWui:
3385 return AArch64::LDRSHWroW;
3386 case AArch64::LDRBroX:
3387 case AArch64::LDURBi:
3388 case AArch64::LDRBui:
3389 return AArch64::LDRBroW;
3390 case AArch64::LDRBBroX:
3391 case AArch64::LDURBBi:
3392 case AArch64::LDRBBui:
3393 return AArch64::LDRBBroW;
3394 case AArch64::LDRSBXroX:
3395 case AArch64::LDURSBXi:
3396 case AArch64::LDRSBXui:
3397 return AArch64::LDRSBXroW;
3398 case AArch64::LDRSBWroX:
3399 case AArch64::LDURSBWi:
3400 case AArch64::LDRSBWui:
3401 return AArch64::LDRSBWroW;
3402 case AArch64::STRBroX:
3403 case AArch64::STURBi:
3404 case AArch64::STRBui:
3405 return AArch64::STRBroW;
3406 case AArch64::STRBBroX:
3407 case AArch64::STURBBi:
3408 case AArch64::STRBBui:
3409 return AArch64::STRBBroW;
3410 }
3411}
3412
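/// Emit, immediately before MemI, an equivalent memory access that uses the
/// addressing mode described by AM, and return the new instruction. Depending
/// on AM.Form this selects a register-offset, an immediate-offset (scaled or
/// unscaled), or an extended-register ([Xn, Wm, {s,u}xtw]) variant of MemI's
/// opcode; e.g. folding a scaled register offset may turn `ldr x0, [x1]` into
/// `ldr x0, [x1, x2, lsl #3]`.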
3413MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
3414 const ExtAddrMode &AM) const {
3415
3416 const DebugLoc &DL = MemI.getDebugLoc();
3417 MachineBasicBlock &MBB = *MemI.getParent();
3418 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
3419
3420 if (AM.Form == ExtAddrMode::Formula::Basic) {
3421 if (AM.ScaledReg) {
3422 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
3423 unsigned Opcode = regOffsetOpcode(Opcode: MemI.getOpcode());
3424 MRI.constrainRegClass(Reg: AM.BaseReg, RC: &AArch64::GPR64spRegClass);
3425 auto B = BuildMI(BB&: MBB, I&: MemI, MIMD: DL, MCID: get(Opcode))
3426 .addReg(RegNo: MemI.getOperand(i: 0).getReg(),
3427 flags: MemI.mayLoad() ? RegState::Define : 0)
3428 .addReg(RegNo: AM.BaseReg)
3429 .addReg(RegNo: AM.ScaledReg)
3430 .addImm(Val: 0)
3431 .addImm(Val: AM.Scale > 1)
3432 .setMemRefs(MemI.memoperands())
3433 .setMIFlags(MemI.getFlags());
3434 return B.getInstr();
3435 }
3436
3437 assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
3438 "Addressing mode not supported for folding");
3439
3440 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
3441 unsigned Scale = 1;
3442 unsigned Opcode = MemI.getOpcode();
3443 if (isInt<9>(x: AM.Displacement))
3444 Opcode = unscaledOffsetOpcode(Opcode);
3445 else
3446 Opcode = scaledOffsetOpcode(Opcode, Scale);
3447
3448 auto B = BuildMI(BB&: MBB, I&: MemI, MIMD: DL, MCID: get(Opcode))
3449 .addReg(RegNo: MemI.getOperand(i: 0).getReg(),
3450 flags: MemI.mayLoad() ? RegState::Define : 0)
3451 .addReg(RegNo: AM.BaseReg)
3452 .addImm(Val: AM.Displacement / Scale)
3453 .setMemRefs(MemI.memoperands())
3454 .setMIFlags(MemI.getFlags());
3455 return B.getInstr();
3456 }
3457
3458 if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
3459 AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
3460 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
3461 assert(AM.ScaledReg && !AM.Displacement &&
3462 "Address offset can be a register or an immediate, but not both");
3463 unsigned Opcode = offsetExtendOpcode(Opcode: MemI.getOpcode());
3464 MRI.constrainRegClass(Reg: AM.BaseReg, RC: &AArch64::GPR64spRegClass);
3465 // Make sure the offset register is in the correct register class.
3466 Register OffsetReg = AM.ScaledReg;
3467 const TargetRegisterClass *RC = MRI.getRegClass(Reg: OffsetReg);
3468 if (RC->hasSuperClassEq(RC: &AArch64::GPR64RegClass)) {
3469 OffsetReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
3470 BuildMI(BB&: MBB, I&: MemI, MIMD: DL, MCID: get(Opcode: TargetOpcode::COPY), DestReg: OffsetReg)
3471 .addReg(RegNo: AM.ScaledReg, flags: 0, SubReg: AArch64::sub_32);
3472 }
3473 auto B = BuildMI(BB&: MBB, I&: MemI, MIMD: DL, MCID: get(Opcode))
3474 .addReg(RegNo: MemI.getOperand(i: 0).getReg(),
3475 flags: MemI.mayLoad() ? RegState::Define : 0)
3476 .addReg(RegNo: AM.BaseReg)
3477 .addReg(RegNo: OffsetReg)
3478 .addImm(Val: AM.Form == ExtAddrMode::Formula::SExtScaledReg)
3479 .addImm(Val: AM.Scale != 1)
3480 .setMemRefs(MemI.memoperands())
3481 .setMIFlags(MemI.getFlags());
3482
3483 return B.getInstr();
3484 }
3485
3486 llvm_unreachable(
3487 "Function must not be called with an addressing mode it can't handle");
3488}
3489
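/// Decompose a load/store into its base operand, its immediate byte offset,
/// whether that offset is scalable, and the width of the access. Only
/// base-register (or frame-index) plus immediate forms are handled; anything
/// else returns false.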
3490bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
3491 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
3492 bool &OffsetIsScalable, TypeSize &Width,
3493 const TargetRegisterInfo *TRI) const {
3494 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
3495 // Handle only loads/stores with base register followed by immediate offset.
3496 if (LdSt.getNumExplicitOperands() == 3) {
3497 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
3498 if ((!LdSt.getOperand(i: 1).isReg() && !LdSt.getOperand(i: 1).isFI()) ||
3499 !LdSt.getOperand(i: 2).isImm())
3500 return false;
3501 } else if (LdSt.getNumExplicitOperands() == 4) {
3502 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
3503 if (!LdSt.getOperand(i: 1).isReg() ||
3504 (!LdSt.getOperand(i: 2).isReg() && !LdSt.getOperand(i: 2).isFI()) ||
3505 !LdSt.getOperand(i: 3).isImm())
3506 return false;
3507 } else
3508 return false;
3509
3510 // Get the scaling factor and the access width for this instruction.
3512 TypeSize Scale(0U, false);
3513 int64_t Dummy1, Dummy2;
3514
3515 // If this returns false, then it's an instruction we don't want to handle.
3516 if (!getMemOpInfo(Opcode: LdSt.getOpcode(), Scale, Width, MinOffset&: Dummy1, MaxOffset&: Dummy2))
3517 return false;
3518
3519 // Compute the offset: the immediate operand multiplied by the scaling factor.
3520 // Unscaled instructions have a scaling factor of 1.
3522 if (LdSt.getNumExplicitOperands() == 3) {
3523 BaseOp = &LdSt.getOperand(i: 1);
3524 Offset = LdSt.getOperand(i: 2).getImm() * Scale.getKnownMinValue();
3525 } else {
3526 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
3527 BaseOp = &LdSt.getOperand(i: 2);
3528 Offset = LdSt.getOperand(i: 3).getImm() * Scale.getKnownMinValue();
3529 }
3530 OffsetIsScalable = Scale.isScalable();
3531
3532 if (!BaseOp->isReg() && !BaseOp->isFI())
3533 return false;
3534
3535 return true;
3536}
3537
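/// Return the immediate offset operand of a base + immediate load/store, which
/// is always its last explicit operand.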
3538MachineOperand &
3539AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
3540 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
3541 MachineOperand &OfsOp = LdSt.getOperand(i: LdSt.getNumExplicitOperands() - 1);
3542 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
3543 return OfsOp;
3544}
3545
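/// For the given load/store opcode, report the scale applied to the immediate
/// operand, the width of the memory access, and the minimum and maximum legal
/// values of that immediate. Returns false for opcodes that are not handled.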
3546bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
3547 TypeSize &Width, int64_t &MinOffset,
3548 int64_t &MaxOffset) {
3549 switch (Opcode) {
3550 // Not a memory operation or something we want to handle.
3551 default:
3552 Scale = TypeSize::getFixed(ExactSize: 0);
3553 Width = TypeSize::getFixed(ExactSize: 0);
3554 MinOffset = MaxOffset = 0;
3555 return false;
3556 // LDR / STR
3557 case AArch64::LDRQui:
3558 case AArch64::STRQui:
3559 Scale = TypeSize::getFixed(ExactSize: 16);
3560 Width = TypeSize::getFixed(ExactSize: 16);
3561 MinOffset = 0;
3562 MaxOffset = 4095;
3563 break;
3564 case AArch64::LDRXui:
3565 case AArch64::LDRDui:
3566 case AArch64::STRXui:
3567 case AArch64::STRDui:
3568 case AArch64::PRFMui:
3569 Scale = TypeSize::getFixed(ExactSize: 8);
3570 Width = TypeSize::getFixed(ExactSize: 8);
3571 MinOffset = 0;
3572 MaxOffset = 4095;
3573 break;
3574 case AArch64::LDRWui:
3575 case AArch64::LDRSui:
3576 case AArch64::LDRSWui:
3577 case AArch64::STRWui:
3578 case AArch64::STRSui:
3579 Scale = TypeSize::getFixed(ExactSize: 4);
3580 Width = TypeSize::getFixed(ExactSize: 4);
3581 MinOffset = 0;
3582 MaxOffset = 4095;
3583 break;
3584 case AArch64::LDRHui:
3585 case AArch64::LDRHHui:
3586 case AArch64::LDRSHWui:
3587 case AArch64::LDRSHXui:
3588 case AArch64::STRHui:
3589 case AArch64::STRHHui:
3590 Scale = TypeSize::getFixed(ExactSize: 2);
3591 Width = TypeSize::getFixed(ExactSize: 2);
3592 MinOffset = 0;
3593 MaxOffset = 4095;
3594 break;
3595 case AArch64::LDRBui:
3596 case AArch64::LDRBBui:
3597 case AArch64::LDRSBWui:
3598 case AArch64::LDRSBXui:
3599 case AArch64::STRBui:
3600 case AArch64::STRBBui:
3601 Scale = TypeSize::getFixed(ExactSize: 1);
3602 Width = TypeSize::getFixed(ExactSize: 1);
3603 MinOffset = 0;
3604 MaxOffset = 4095;
3605 break;
3606 // post/pre inc
3607 case AArch64::STRQpre:
3608 case AArch64::LDRQpost:
3609 Scale = TypeSize::getFixed(ExactSize: 1);
3610 Width = TypeSize::getFixed(ExactSize: 16);
3611 MinOffset = -256;
3612 MaxOffset = 255;
3613 break;
3614 case AArch64::STRXpre:
3615 case AArch64::STRDpre:
3616 case AArch64::LDRXpost:
3617 case AArch64::LDRDpost:
3618 Scale = TypeSize::getFixed(ExactSize: 1);
3619 Width = TypeSize::getFixed(ExactSize: 8);
3620 MinOffset = -256;
3621 MaxOffset = 255;
3622 break;
3623 case AArch64::STRWpost:
3624 case AArch64::LDRWpost:
3625 Scale = TypeSize::getFixed(ExactSize: 4);
3626 Width = TypeSize::getFixed(ExactSize: 32);
3627 MinOffset = -256;
3628 MaxOffset = 255;
3629 break;
3630 // Unscaled
3631 case AArch64::LDURQi:
3632 case AArch64::STURQi:
3633 Scale = TypeSize::getFixed(ExactSize: 1);
3634 Width = TypeSize::getFixed(ExactSize: 16);
3635 MinOffset = -256;
3636 MaxOffset = 255;
3637 break;
3638 case AArch64::LDURXi:
3639 case AArch64::LDURDi:
3640 case AArch64::LDAPURXi:
3641 case AArch64::STURXi:
3642 case AArch64::STURDi:
3643 case AArch64::STLURXi:
3644 case AArch64::PRFUMi:
3645 Scale = TypeSize::getFixed(ExactSize: 1);
3646 Width = TypeSize::getFixed(ExactSize: 8);
3647 MinOffset = -256;
3648 MaxOffset = 255;
3649 break;
3650 case AArch64::LDURWi:
3651 case AArch64::LDURSi:
3652 case AArch64::LDURSWi:
3653 case AArch64::LDAPURi:
3654 case AArch64::LDAPURSWi:
3655 case AArch64::STURWi:
3656 case AArch64::STURSi:
3657 case AArch64::STLURWi:
3658 Scale = TypeSize::getFixed(ExactSize: 1);
3659 Width = TypeSize::getFixed(ExactSize: 4);
3660 MinOffset = -256;
3661 MaxOffset = 255;
3662 break;
3663 case AArch64::LDURHi:
3664 case AArch64::LDURHHi:
3665 case AArch64::LDURSHXi:
3666 case AArch64::LDURSHWi:
3667 case AArch64::LDAPURHi:
3668 case AArch64::LDAPURSHWi:
3669 case AArch64::LDAPURSHXi:
3670 case AArch64::STURHi:
3671 case AArch64::STURHHi:
3672 case AArch64::STLURHi:
3673 Scale = TypeSize::getFixed(ExactSize: 1);
3674 Width = TypeSize::getFixed(ExactSize: 2);
3675 MinOffset = -256;
3676 MaxOffset = 255;
3677 break;
3678 case AArch64::LDURBi:
3679 case AArch64::LDURBBi:
3680 case AArch64::LDURSBXi:
3681 case AArch64::LDURSBWi:
3682 case AArch64::LDAPURBi:
3683 case AArch64::LDAPURSBWi:
3684 case AArch64::LDAPURSBXi:
3685 case AArch64::STURBi:
3686 case AArch64::STURBBi:
3687 case AArch64::STLURBi:
3688 Scale = TypeSize::getFixed(ExactSize: 1);
3689 Width = TypeSize::getFixed(ExactSize: 1);
3690 MinOffset = -256;
3691 MaxOffset = 255;
3692 break;
3693 // LDP / STP
3694 case AArch64::LDPQi:
3695 case AArch64::LDNPQi:
3696 case AArch64::STPQi:
3697 case AArch64::STNPQi:
3698 Scale = TypeSize::getFixed(ExactSize: 16);
3699 Width = TypeSize::getFixed(ExactSize: 32);
3700 MinOffset = -64;
3701 MaxOffset = 63;
3702 break;
3703 case AArch64::LDPXi:
3704 case AArch64::LDPDi:
3705 case AArch64::LDNPXi:
3706 case AArch64::LDNPDi:
3707 case AArch64::STPXi:
3708 case AArch64::STPDi:
3709 case AArch64::STNPXi:
3710 case AArch64::STNPDi:
3711 Scale = TypeSize::getFixed(ExactSize: 8);
3712 Width = TypeSize::getFixed(ExactSize: 16);
3713 MinOffset = -64;
3714 MaxOffset = 63;
3715 break;
3716 case AArch64::LDPWi:
3717 case AArch64::LDPSi:
3718 case AArch64::LDNPWi:
3719 case AArch64::LDNPSi:
3720 case AArch64::STPWi:
3721 case AArch64::STPSi:
3722 case AArch64::STNPWi:
3723 case AArch64::STNPSi:
3724 Scale = TypeSize::getFixed(ExactSize: 4);
3725 Width = TypeSize::getFixed(ExactSize: 8);
3726 MinOffset = -64;
3727 MaxOffset = 63;
3728 break;
3729 // pre/post inc
3730 case AArch64::STPQpre:
3731 case AArch64::LDPQpost:
3732 Scale = TypeSize::getFixed(ExactSize: 16);
3733 Width = TypeSize::getFixed(ExactSize: 16);
3734 MinOffset = -1024;
3735 MaxOffset = 1008;
3736 break;
3737 case AArch64::STPXpre:
3738 case AArch64::LDPXpost:
3739 case AArch64::STPDpre:
3740 case AArch64::LDPDpost:
3741 Scale = TypeSize::getFixed(ExactSize: 8);
3742 Width = TypeSize::getFixed(ExactSize: 8);
3743 MinOffset = -512;
3744 MaxOffset = 504;
3745 break;
3746 case AArch64::StoreSwiftAsyncContext:
3747 // Store is an STRXui, but there might be an ADDXri in the expansion too.
3748 Scale = TypeSize::getFixed(ExactSize: 1);
3749 Width = TypeSize::getFixed(ExactSize: 8);
3750 MinOffset = 0;
3751 MaxOffset = 4095;
3752 break;
3753 case AArch64::ADDG:
3754 Scale = TypeSize::getFixed(ExactSize: 16);
3755 Width = TypeSize::getFixed(ExactSize: 0);
3756 MinOffset = 0;
3757 MaxOffset = 63;
3758 break;
3759 case AArch64::TAGPstack:
3760 Scale = TypeSize::getFixed(ExactSize: 16);
3761 Width = TypeSize::getFixed(ExactSize: 0);
3762 // TAGP with a negative offset turns into SUBP, which has a maximum offset
3763 // of 63 (not 64!).
3764 MinOffset = -63;
3765 MaxOffset = 63;
3766 break;
3767 case AArch64::LDG:
3768 case AArch64::STGi:
3769 case AArch64::STZGi:
3770 Scale = TypeSize::getFixed(ExactSize: 16);
3771 Width = TypeSize::getFixed(ExactSize: 16);
3772 MinOffset = -256;
3773 MaxOffset = 255;
3774 break;
3775 // SVE
3776 case AArch64::STR_ZZZZXI:
3777 case AArch64::LDR_ZZZZXI:
3778 Scale = TypeSize::getScalable(MinimumSize: 16);
3779 Width = TypeSize::getScalable(MinimumSize: 16 * 4);
3780 MinOffset = -256;
3781 MaxOffset = 252;
3782 break;
3783 case AArch64::STR_ZZZXI:
3784 case AArch64::LDR_ZZZXI:
3785 Scale = TypeSize::getScalable(MinimumSize: 16);
3786 Width = TypeSize::getScalable(MinimumSize: 16 * 3);
3787 MinOffset = -256;
3788 MaxOffset = 253;
3789 break;
3790 case AArch64::STR_ZZXI:
3791 case AArch64::LDR_ZZXI:
3792 Scale = TypeSize::getScalable(MinimumSize: 16);
3793 Width = TypeSize::getScalable(MinimumSize: 16 * 2);
3794 MinOffset = -256;
3795 MaxOffset = 254;
3796 break;
3797 case AArch64::LDR_PXI:
3798 case AArch64::STR_PXI:
3799 Scale = TypeSize::getScalable(MinimumSize: 2);
3800 Width = TypeSize::getScalable(MinimumSize: 2);
3801 MinOffset = -256;
3802 MaxOffset = 255;
3803 break;
3804 case AArch64::LDR_PPXI:
3805 case AArch64::STR_PPXI:
3806 Scale = TypeSize::getScalable(MinimumSize: 2);
3807 Width = TypeSize::getScalable(MinimumSize: 2 * 2);
3808 MinOffset = -256;
3809 MaxOffset = 254;
3810 break;
3811 case AArch64::LDR_ZXI:
3812 case AArch64::STR_ZXI:
3813 Scale = TypeSize::getScalable(MinimumSize: 16);
3814 Width = TypeSize::getScalable(MinimumSize: 16);
3815 MinOffset = -256;
3816 MaxOffset = 255;
3817 break;
3818 case AArch64::LD1B_IMM:
3819 case AArch64::LD1H_IMM:
3820 case AArch64::LD1W_IMM:
3821 case AArch64::LD1D_IMM:
3822 case AArch64::LDNT1B_ZRI:
3823 case AArch64::LDNT1H_ZRI:
3824 case AArch64::LDNT1W_ZRI:
3825 case AArch64::LDNT1D_ZRI:
3826 case AArch64::ST1B_IMM:
3827 case AArch64::ST1H_IMM:
3828 case AArch64::ST1W_IMM:
3829 case AArch64::ST1D_IMM:
3830 case AArch64::STNT1B_ZRI:
3831 case AArch64::STNT1H_ZRI:
3832 case AArch64::STNT1W_ZRI:
3833 case AArch64::STNT1D_ZRI:
3834 case AArch64::LDNF1B_IMM:
3835 case AArch64::LDNF1H_IMM:
3836 case AArch64::LDNF1W_IMM:
3837 case AArch64::LDNF1D_IMM:
3838 // A full vector's worth of data
3839 // Width = mbytes * elements
3840 Scale = TypeSize::getScalable(MinimumSize: 16);
3841 Width = TypeSize::getScalable(MinimumSize: 16);
3842 MinOffset = -8;
3843 MaxOffset = 7;
3844 break;
3845 case AArch64::LD2B_IMM:
3846 case AArch64::LD2H_IMM:
3847 case AArch64::LD2W_IMM:
3848 case AArch64::LD2D_IMM:
3849 case AArch64::ST2B_IMM:
3850 case AArch64::ST2H_IMM:
3851 case AArch64::ST2W_IMM:
3852 case AArch64::ST2D_IMM:
3853 Scale = TypeSize::getScalable(MinimumSize: 32);
3854 Width = TypeSize::getScalable(MinimumSize: 16 * 2);
3855 MinOffset = -8;
3856 MaxOffset = 7;
3857 break;
3858 case AArch64::LD3B_IMM:
3859 case AArch64::LD3H_IMM:
3860 case AArch64::LD3W_IMM:
3861 case AArch64::LD3D_IMM:
3862 case AArch64::ST3B_IMM:
3863 case AArch64::ST3H_IMM:
3864 case AArch64::ST3W_IMM:
3865 case AArch64::ST3D_IMM:
3866 Scale = TypeSize::getScalable(MinimumSize: 48);
3867 Width = TypeSize::getScalable(MinimumSize: 16 * 3);
3868 MinOffset = -8;
3869 MaxOffset = 7;
3870 break;
3871 case AArch64::LD4B_IMM:
3872 case AArch64::LD4H_IMM:
3873 case AArch64::LD4W_IMM:
3874 case AArch64::LD4D_IMM:
3875 case AArch64::ST4B_IMM:
3876 case AArch64::ST4H_IMM:
3877 case AArch64::ST4W_IMM:
3878 case AArch64::ST4D_IMM:
3879 Scale = TypeSize::getScalable(MinimumSize: 64);
3880 Width = TypeSize::getScalable(MinimumSize: 16 * 4);
3881 MinOffset = -8;
3882 MaxOffset = 7;
3883 break;
3884 case AArch64::LD1B_H_IMM:
3885 case AArch64::LD1SB_H_IMM:
3886 case AArch64::LD1H_S_IMM:
3887 case AArch64::LD1SH_S_IMM:
3888 case AArch64::LD1W_D_IMM:
3889 case AArch64::LD1SW_D_IMM:
3890 case AArch64::ST1B_H_IMM:
3891 case AArch64::ST1H_S_IMM:
3892 case AArch64::ST1W_D_IMM:
3893 case AArch64::LDNF1B_H_IMM:
3894 case AArch64::LDNF1SB_H_IMM:
3895 case AArch64::LDNF1H_S_IMM:
3896 case AArch64::LDNF1SH_S_IMM:
3897 case AArch64::LDNF1W_D_IMM:
3898 case AArch64::LDNF1SW_D_IMM:
3899 // Half a vector's worth of data
3900 // Width = mbytes * elements
3901 Scale = TypeSize::getScalable(MinimumSize: 8);
3902 Width = TypeSize::getScalable(MinimumSize: 8);
3903 MinOffset = -8;
3904 MaxOffset = 7;
3905 break;
3906 case AArch64::LD1B_S_IMM:
3907 case AArch64::LD1SB_S_IMM:
3908 case AArch64::LD1H_D_IMM:
3909 case AArch64::LD1SH_D_IMM:
3910 case AArch64::ST1B_S_IMM:
3911 case AArch64::ST1H_D_IMM:
3912 case AArch64::LDNF1B_S_IMM:
3913 case AArch64::LDNF1SB_S_IMM:
3914 case AArch64::LDNF1H_D_IMM:
3915 case AArch64::LDNF1SH_D_IMM:
3916 // A quarter of a vector's worth of data
3917 // Width = mbytes * elements
3918 Scale = TypeSize::getScalable(MinimumSize: 4);
3919 Width = TypeSize::getScalable(MinimumSize: 4);
3920 MinOffset = -8;
3921 MaxOffset = 7;
3922 break;
3923 case AArch64::LD1B_D_IMM:
3924 case AArch64::LD1SB_D_IMM:
3925 case AArch64::ST1B_D_IMM:
3926 case AArch64::LDNF1B_D_IMM:
3927 case AArch64::LDNF1SB_D_IMM:
3928 // An eighth of a vector's worth of data
3929 // Width = mbytes * elements
3930 Scale = TypeSize::getScalable(MinimumSize: 2);
3931 Width = TypeSize::getScalable(MinimumSize: 2);
3932 MinOffset = -8;
3933 MaxOffset = 7;
3934 break;
3935 case AArch64::ST2Gi:
3936 case AArch64::STZ2Gi:
3937 Scale = TypeSize::getFixed(ExactSize: 16);
3938 Width = TypeSize::getFixed(ExactSize: 32);
3939 MinOffset = -256;
3940 MaxOffset = 255;
3941 break;
3942 case AArch64::STGPi:
3943 Scale = TypeSize::getFixed(ExactSize: 16);
3944 Width = TypeSize::getFixed(ExactSize: 16);
3945 MinOffset = -64;
3946 MaxOffset = 63;
3947 break;
3948 case AArch64::LD1RB_IMM:
3949 case AArch64::LD1RB_H_IMM:
3950 case AArch64::LD1RB_S_IMM:
3951 case AArch64::LD1RB_D_IMM:
3952 case AArch64::LD1RSB_H_IMM:
3953 case AArch64::LD1RSB_S_IMM:
3954 case AArch64::LD1RSB_D_IMM:
3955 Scale = TypeSize::getFixed(ExactSize: 1);
3956 Width = TypeSize::getFixed(ExactSize: 1);
3957 MinOffset = 0;
3958 MaxOffset = 63;
3959 break;
3960 case AArch64::LD1RH_IMM:
3961 case AArch64::LD1RH_S_IMM:
3962 case AArch64::LD1RH_D_IMM:
3963 case AArch64::LD1RSH_S_IMM:
3964 case AArch64::LD1RSH_D_IMM:
3965 Scale = TypeSize::getFixed(ExactSize: 2);
3966 Width = TypeSize::getFixed(ExactSize: 2);
3967 MinOffset = 0;
3968 MaxOffset = 63;
3969 break;
3970 case AArch64::LD1RW_IMM:
3971 case AArch64::LD1RW_D_IMM:
3972 case AArch64::LD1RSW_IMM:
3973 Scale = TypeSize::getFixed(ExactSize: 4);
3974 Width = TypeSize::getFixed(ExactSize: 4);
3975 MinOffset = 0;
3976 MaxOffset = 63;
3977 break;
3978 case AArch64::LD1RD_IMM:
3979 Scale = TypeSize::getFixed(ExactSize: 8);
3980 Width = TypeSize::getFixed(ExactSize: 8);
3981 MinOffset = 0;
3982 MaxOffset = 63;
3983 break;
3984 }
3985
3986 return true;
3987}
3988
3989// Return the scaling factor for a scaled or unscaled load/store opcode, i.e.
// the number of bytes by which the corresponding scaled immediate offset is
// multiplied.
3990int AArch64InstrInfo::getMemScale(unsigned Opc) {
3991 switch (Opc) {
3992 default:
3993 llvm_unreachable("Opcode has unknown scale!");
3994 case AArch64::LDRBBui:
3995 case AArch64::LDURBBi:
3996 case AArch64::LDRSBWui:
3997 case AArch64::LDURSBWi:
3998 case AArch64::STRBBui:
3999 case AArch64::STURBBi:
4000 return 1;
4001 case AArch64::LDRHHui:
4002 case AArch64::LDURHHi:
4003 case AArch64::LDRSHWui:
4004 case AArch64::LDURSHWi:
4005 case AArch64::STRHHui:
4006 case AArch64::STURHHi:
4007 return 2;
4008 case AArch64::LDRSui:
4009 case AArch64::LDURSi:
4010 case AArch64::LDRSpre:
4011 case AArch64::LDRSWui:
4012 case AArch64::LDURSWi:
4013 case AArch64::LDRSWpre:
4014 case AArch64::LDRWpre:
4015 case AArch64::LDRWui:
4016 case AArch64::LDURWi:
4017 case AArch64::STRSui:
4018 case AArch64::STURSi:
4019 case AArch64::STRSpre:
4020 case AArch64::STRWui:
4021 case AArch64::STURWi:
4022 case AArch64::STRWpre:
4023 case AArch64::LDPSi:
4024 case AArch64::LDPSWi:
4025 case AArch64::LDPWi:
4026 case AArch64::STPSi:
4027 case AArch64::STPWi:
4028 return 4;
4029 case AArch64::LDRDui:
4030 case AArch64::LDURDi:
4031 case AArch64::LDRDpre:
4032 case AArch64::LDRXui:
4033 case AArch64::LDURXi:
4034 case AArch64::LDRXpre:
4035 case AArch64::STRDui:
4036 case AArch64::STURDi:
4037 case AArch64::STRDpre:
4038 case AArch64::STRXui:
4039 case AArch64::STURXi:
4040 case AArch64::STRXpre:
4041 case AArch64::LDPDi:
4042 case AArch64::LDPXi:
4043 case AArch64::STPDi:
4044 case AArch64::STPXi:
4045 return 8;
4046 case AArch64::LDRQui:
4047 case AArch64::LDURQi:
4048 case AArch64::STRQui:
4049 case AArch64::STURQi:
4050 case AArch64::STRQpre:
4051 case AArch64::LDPQi:
4052 case AArch64::LDRQpre:
4053 case AArch64::STPQi:
4054 case AArch64::STGi:
4055 case AArch64::STZGi:
4056 case AArch64::ST2Gi:
4057 case AArch64::STZ2Gi:
4058 case AArch64::STGPi:
4059 return 16;
4060 }
4061}
4062
4063bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
4064 switch (MI.getOpcode()) {
4065 default:
4066 return false;
4067 case AArch64::LDRWpre:
4068 case AArch64::LDRXpre:
4069 case AArch64::LDRSWpre:
4070 case AArch64::LDRSpre:
4071 case AArch64::LDRDpre:
4072 case AArch64::LDRQpre:
4073 return true;
4074 }
4075}
4076
4077bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
4078 switch (MI.getOpcode()) {
4079 default:
4080 return false;
4081 case AArch64::STRWpre:
4082 case AArch64::STRXpre:
4083 case AArch64::STRSpre:
4084 case AArch64::STRDpre:
4085 case AArch64::STRQpre:
4086 return true;
4087 }
4088}
4089
4090bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
4091 return isPreLd(MI) || isPreSt(MI);
4092}
4093
4094bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
4095 switch (MI.getOpcode()) {
4096 default:
4097 return false;
4098 case AArch64::LDPSi:
4099 case AArch64::LDPSWi:
4100 case AArch64::LDPDi:
4101 case AArch64::LDPQi:
4102 case AArch64::LDPWi:
4103 case AArch64::LDPXi:
4104 case AArch64::STPSi:
4105 case AArch64::STPDi:
4106 case AArch64::STPQi:
4107 case AArch64::STPWi:
4108 case AArch64::STPXi:
4109 case AArch64::STGPi:
4110 return true;
4111 }
4112}
4113
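/// Return the base register operand of a load/store. Paired instructions carry
/// two data registers and pre-indexed instructions define the written-back
/// base first, so in both cases the base is operand 2 rather than operand 1.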
4114const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
4115 unsigned Idx =
4116 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
4117 : 1;
4118 return MI.getOperand(i: Idx);
4119}
4120
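/// Return the immediate offset operand of a load/store; like the base operand,
/// it sits one position later for paired and pre-indexed instructions.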
4121const MachineOperand &
4122AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
4123 unsigned Idx =
4124 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
4125 : 2;
4126 return MI.getOperand(i: Idx);
4127}
4128
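// Return the register class assigned to the virtual register Reg in the
// function containing MI, or nullptr if that information is unavailable.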
4129static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
4130 Register Reg) {
4131 if (MI.getParent() == nullptr)
4132 return nullptr;
4133 const MachineFunction *MF = MI.getParent()->getParent();
4134 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
4135}
4136
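/// Return true if any operand of MI is a 16-bit (H-form) floating-point
/// register.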
4137bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
4138 auto IsHFPR = [&](const MachineOperand &Op) {
4139 if (!Op.isReg())
4140 return false;
4141 auto Reg = Op.getReg();
4142 if (Reg.isPhysical())
4143 return AArch64::FPR16RegClass.contains(Reg);
4144 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4145 return TRC == &AArch64::FPR16RegClass ||
4146 TRC == &AArch64::FPR16_loRegClass;
4147 };
4148 return llvm::any_of(Range: MI.operands(), P: IsHFPR);
4149}
4150
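/// Return true if any operand of MI is a 128-bit (Q-form) FP/vector register.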
4151bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
4152 auto IsQFPR = [&](const MachineOperand &Op) {
4153 if (!Op.isReg())
4154 return false;
4155 auto Reg = Op.getReg();
4156 if (Reg.isPhysical())
4157 return AArch64::FPR128RegClass.contains(Reg);
4158 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4159 return TRC == &AArch64::FPR128RegClass ||
4160 TRC == &AArch64::FPR128_loRegClass;
4161 };
4162 return llvm::any_of(Range: MI.operands(), P: IsQFPR);
4163}
4164
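/// Return true if MI provides BTI semantics, i.e. it may be the target of an
/// indirect branch without requiring a separate BTI instruction in front of
/// it.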
4165bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) {
4166 switch (MI.getOpcode()) {
4167 case AArch64::BRK:
4168 case AArch64::HLT:
4169 case AArch64::PACIASP:
4170 case AArch64::PACIBSP:
4171 // Implicit BTI behavior.
4172 return true;
4173 case AArch64::PAUTH_PROLOGUE:
4174 // PAUTH_PROLOGUE expands to PACI(A|B)SP.
4175 return true;
4176 case AArch64::HINT: {
4177 unsigned Imm = MI.getOperand(i: 0).getImm();
4178 // Explicit BTI instruction.
4179 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
4180 return true;
4181 // PACI(A|B)SP instructions.
4182 if (Imm == 25 || Imm == 27)
4183 return true;
4184 return false;
4185 }
4186 default:
4187 return false;
4188 }
4189}
4190
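/// Return true if the given physical register is an FP/SIMD register of any
/// width (B, H, S, D or Q).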
4191bool AArch64InstrInfo::isFpOrNEON(Register Reg) {
4192 if (Reg == 0)
4193 return false;
4194 assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
4195 return AArch64::FPR128RegClass.contains(Reg) ||
4196 AArch64::FPR64RegClass.contains(Reg) ||
4197 AArch64::FPR32RegClass.contains(Reg) ||
4198 AArch64::FPR16RegClass.contains(Reg) ||
4199 AArch64::FPR8RegClass.contains(Reg);
4200}
4201
4202bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
4203 auto IsFPR = [&](const MachineOperand &Op) {
4204 if (!Op.isReg())
4205 return false;
4206 auto Reg = Op.getReg();
4207 if (Reg.isPhysical())
4208 return isFpOrNEON(Reg);
4209
4210 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4211 return TRC == &AArch64::FPR128RegClass ||
4212 TRC == &AArch64::FPR128_loRegClass ||
4213 TRC == &AArch64::FPR64RegClass ||
4214 TRC == &AArch64::FPR64_loRegClass ||
4215 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
4216 TRC == &AArch64::FPR8RegClass;
4217 };
4218 return llvm::any_of(Range: MI.operands(), P: IsFPR);
4219}
4220
4221// Scale the byte offset of an unscaled load/store. Returns false if the offset
4222// is not a multiple of the access size and therefore can't be scaled.
4223static bool scaleOffset(unsigned Opc, int64_t &Offset) {
4224 int Scale = AArch64InstrInfo::getMemScale(Opc);
4225
4226 // If the byte-offset isn't a multiple of the stride, we can't scale this
4227 // offset.
4228 if (Offset % Scale != 0)
4229 return false;
4230
4231 // Convert the byte-offset used by unscaled into an "element" offset used
4232 // by the scaled pair load/store instructions.
4233 Offset /= Scale;
4234 return true;
4235}
4236
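// Return true if load/store instructions with the given opcodes may be
// combined into a pair: identical opcodes always qualify, scaled and unscaled
// variants of the same access may pair, and 32-bit loads may pair with their
// sign-extending counterparts.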
4237static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
4238 if (FirstOpc == SecondOpc)
4239 return true;
4240 // We can also pair sign-ext and zero-ext instructions.
4241 switch (FirstOpc) {
4242 default:
4243 return false;
4244 case AArch64::STRSui:
4245 case AArch64::STURSi:
4246 return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
4247 case AArch64::STRDui:
4248 case AArch64::STURDi:
4249 return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
4250 case AArch64::STRQui:
4251 case AArch64::STURQi:
4252 return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
4253 case AArch64::STRWui:
4254 case AArch64::STURWi:
4255 return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
4256 case AArch64::STRXui:
4257 case AArch64::STURXi:
4258 return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
4259 case AArch64::LDRSui:
4260 case AArch64::LDURSi:
4261 return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
4262 case AArch64::LDRDui:
4263 case AArch64::LDURDi:
4264 return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
4265 case AArch64::LDRQui:
4266 case AArch64::LDURQi:
4267 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
4268 case AArch64::LDRWui:
4269 case AArch64::LDURWi:
4270 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
4271 case AArch64::LDRSWui:
4272 case AArch64::LDURSWi:
4273 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
4274 case AArch64::LDRXui:
4275 case AArch64::LDURXi:
4276 return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
4277 }
4278 // These instructions can't be paired based on their opcodes.
4279 return false;
4280}
4281
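// Decide whether two frame-index based accesses should be clustered. For fixed
// stack objects the object and instruction offsets are combined (in units of
// the access size) and the accesses must be adjacent; otherwise the frame
// indices must simply match.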
4282static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
4283 int64_t Offset1, unsigned Opcode1, int FI2,
4284 int64_t Offset2, unsigned Opcode2) {
4285 // Accesses through fixed stack object frame indices may address different
4286 // fixed slots. Check that the combined object and instruction offsets are adjacent.
4287 if (MFI.isFixedObjectIndex(ObjectIdx: FI1) && MFI.isFixedObjectIndex(ObjectIdx: FI2)) {
4288 int64_t ObjectOffset1 = MFI.getObjectOffset(ObjectIdx: FI1);
4289 int64_t ObjectOffset2 = MFI.getObjectOffset(ObjectIdx: FI2);
4290 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
4291 // Convert to scaled object offsets.
4292 int Scale1 = AArch64InstrInfo::getMemScale(Opc: Opcode1);
4293 if (ObjectOffset1 % Scale1 != 0)
4294 return false;
4295 ObjectOffset1 /= Scale1;
4296 int Scale2 = AArch64InstrInfo::getMemScale(Opc: Opcode2);
4297 if (ObjectOffset2 % Scale2 != 0)
4298 return false;
4299 ObjectOffset2 /= Scale2;
4300 ObjectOffset1 += Offset1;
4301 ObjectOffset2 += Offset2;
4302 return ObjectOffset1 + 1 == ObjectOffset2;
4303 }
4304
4305 return FI1 == FI2;
4306}
4307
4308/// Detect opportunities for ldp/stp formation.
4309///
4310/// Only called for LdSt for which getMemOperandWithOffset returns true.
4311bool AArch64InstrInfo::shouldClusterMemOps(
4312 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
4313 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
4314 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
4315 unsigned NumBytes) const {
4316 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
4317 const MachineOperand &BaseOp1 = *BaseOps1.front();
4318 const MachineOperand &BaseOp2 = *BaseOps2.front();
4319 const MachineInstr &FirstLdSt = *BaseOp1.getParent();
4320 const MachineInstr &SecondLdSt = *BaseOp2.getParent();
4321 if (BaseOp1.getType() != BaseOp2.getType())
4322 return false;
4323
4324 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
4325 "Only base registers and frame indices are supported.");
4326
4327 // Check for both base regs and base FI.
4328 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
4329 return false;
4330
4331 // Only cluster up to a single pair.
4332 if (ClusterSize > 2)
4333 return false;
4334
4335 if (!isPairableLdStInst(MI: FirstLdSt) || !isPairableLdStInst(MI: SecondLdSt))
4336 return false;
4337
4338 // Can we pair these instructions based on their opcodes?
4339 unsigned FirstOpc = FirstLdSt.getOpcode();
4340 unsigned SecondOpc = SecondLdSt.getOpcode();
4341 if (!canPairLdStOpc(FirstOpc, SecondOpc))
4342 return false;
4343
4344 // Can't merge volatiles or load/stores that have a hint to avoid pair
4345 // formation, for example.
4346 if (!isCandidateToMergeOrPair(MI: FirstLdSt) ||
4347 !isCandidateToMergeOrPair(MI: SecondLdSt))
4348 return false;
4349
4350 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
4351 int64_t Offset1 = FirstLdSt.getOperand(i: 2).getImm();
4352 if (hasUnscaledLdStOffset(Opc: FirstOpc) && !scaleOffset(Opc: FirstOpc, Offset&: Offset1))
4353 return false;
4354
4355 int64_t Offset2 = SecondLdSt.getOperand(i: 2).getImm();
4356 if (hasUnscaledLdStOffset(Opc: SecondOpc) && !scaleOffset(Opc: SecondOpc, Offset&: Offset2))
4357 return false;
4358
4359 // Pairwise instructions have a 7-bit signed offset field.
4360 if (Offset1 > 63 || Offset1 < -64)
4361 return false;
4362
4363 // The caller should already have ordered FirstLdSt/SecondLdSt by offset; the
4364 // one exception is accesses whose frame-index bases differ.
4365 if (BaseOp1.isFI()) {
4366 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
4367 "Caller should have ordered offsets.");
4368
4369 const MachineFrameInfo &MFI =
4370 FirstLdSt.getParent()->getParent()->getFrameInfo();
4371 return shouldClusterFI(MFI, FI1: BaseOp1.getIndex(), Offset1, Opcode1: FirstOpc,
4372 FI2: BaseOp2.getIndex(), Offset2, Opcode2: SecondOpc);
4373 }
4374
4375 assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
4376
4377 return Offset1 + 1 == Offset2;
4378}
4379
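// Add sub-register SubIdx of Reg to MIB with the given register state. For a
// physical register the sub-register is resolved to a concrete register; for a
// virtual register the sub-register index is attached to the operand instead.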
4380static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
4381 unsigned Reg, unsigned SubIdx,
4382 unsigned State,
4383 const TargetRegisterInfo *TRI) {
4384 if (!SubIdx)
4385 return MIB.addReg(RegNo: Reg, flags: State);
4386
4387 if (Register::isPhysicalRegister(Reg))
4388 return MIB.addReg(RegNo: TRI->getSubReg(Reg, Idx: SubIdx), flags: State);
4389 return MIB.addReg(RegNo: Reg, flags: State, SubReg: SubIdx);
4390}
4391
4392static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
4393 unsigned NumRegs) {
4394 // We really want the positive remainder mod 32 here; that happens to be
4395 // easily obtainable with a mask.
4396 return ((DestReg - SrcReg) & 0x1f) < NumRegs;
4397}
4398
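/// Copy a register tuple (e.g. a DD/QQ/ZZ sequence) by copying each
/// sub-register in turn with the given ORR-style opcode, iterating in reverse
/// when a forward copy would clobber source sub-registers that have not been
/// copied yet.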
4399void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
4400 MachineBasicBlock::iterator I,
4401 const DebugLoc &DL, MCRegister DestReg,
4402 MCRegister SrcReg, bool KillSrc,
4403 unsigned Opcode,
4404 ArrayRef<unsigned> Indices) const {
4405 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
4406 const TargetRegisterInfo *TRI = &getRegisterInfo();
4407 uint16_t DestEncoding = TRI->getEncodingValue(RegNo: DestReg);
4408 uint16_t SrcEncoding = TRI->getEncodingValue(RegNo: SrcReg);
4409 unsigned NumRegs = Indices.size();
4410
4411 int SubReg = 0, End = NumRegs, Incr = 1;
4412 if (forwardCopyWillClobberTuple(DestReg: DestEncoding, SrcReg: SrcEncoding, NumRegs)) {
4413 SubReg = NumRegs - 1;
4414 End = -1;
4415 Incr = -1;
4416 }
4417
4418 for (; SubReg != End; SubReg += Incr) {
4419 const MachineInstrBuilder MIB = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode));
4420 AddSubReg(MIB, Reg: DestReg, SubIdx: Indices[SubReg], State: RegState::Define, TRI);
4421 AddSubReg(MIB, Reg: SrcReg, SubIdx: Indices[SubReg], State: 0, TRI);
4422 AddSubReg(MIB, Reg: SrcReg, SubIdx: Indices[SubReg], State: getKillRegState(B: KillSrc), TRI);
4423 }
4424}
4425
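/// Copy a GPR register sequence (an X or W register pair) sub-register by
/// sub-register, using an ORR of the source sub-register with the zero
/// register.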
4426void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
4427 MachineBasicBlock::iterator I,
4428 DebugLoc DL, unsigned DestReg,
4429 unsigned SrcReg, bool KillSrc,
4430 unsigned Opcode, unsigned ZeroReg,
4431 llvm::ArrayRef<unsigned> Indices) const {
4432 const TargetRegisterInfo *TRI = &getRegisterInfo();
4433 unsigned NumRegs = Indices.size();
4434
4435#ifndef NDEBUG
4436 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
4437 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
4438 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
4439 "GPR reg sequences should not be able to overlap");
4440#endif
4441
4442 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
4443 const MachineInstrBuilder MIB = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode));
4444 AddSubReg(MIB, Reg: DestReg, SubIdx: Indices[SubReg], State: RegState::Define, TRI);
4445 MIB.addReg(RegNo: ZeroReg);
4446 AddSubReg(MIB, Reg: SrcReg, SubIdx: Indices[SubReg], State: getKillRegState(B: KillSrc), TRI);
4447 MIB.addImm(Val: 0);
4448 }
4449}
4450
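/// Emit a register-to-register copy, selecting an appropriate instruction for
/// the register classes involved: ORR/ADD based moves for GPRs, FMOV for FP
/// registers, predicated ORRs for SVE registers, sub-register-wise copies for
/// register tuples, and a stack spill/reload as a last resort for Q-register
/// copies when neither NEON nor SVE is available.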
4451void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
4452 MachineBasicBlock::iterator I,
4453 const DebugLoc &DL, MCRegister DestReg,
4454 MCRegister SrcReg, bool KillSrc) const {
4455 if (AArch64::GPR32spRegClass.contains(Reg: DestReg) &&
4456 (AArch64::GPR32spRegClass.contains(Reg: SrcReg) || SrcReg == AArch64::WZR)) {
4457 const TargetRegisterInfo *TRI = &getRegisterInfo();
4458
4459 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
4460 // If either operand is WSP, expand to ADD #0.
4461 if (Subtarget.hasZeroCycleRegMove()) {
4462 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
4463 MCRegister DestRegX = TRI->getMatchingSuperReg(
4464 Reg: DestReg, SubIdx: AArch64::sub_32, RC: &AArch64::GPR64spRegClass);
4465 MCRegister SrcRegX = TRI->getMatchingSuperReg(
4466 Reg: SrcReg, SubIdx: AArch64::sub_32, RC: &AArch64::GPR64spRegClass);
4467 // This instruction is reading and writing X registers. This may upset
4468 // the register scavenger and machine verifier, so we need to indicate
4469 // that we are reading an undefined value from SrcRegX, but a proper
4470 // value from SrcReg.
4471 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ADDXri), DestReg: DestRegX)
4472 .addReg(RegNo: SrcRegX, flags: RegState::Undef)
4473 .addImm(Val: 0)
4474 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0))
4475 .addReg(RegNo: SrcReg, flags: RegState::Implicit | getKillRegState(B: KillSrc));
4476 } else {
4477 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ADDWri), DestReg)
4478 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc))
4479 .addImm(Val: 0)
4480 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0));
4481 }
4482 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
4483 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::MOVZWi), DestReg)
4484 .addImm(Val: 0)
4485 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0));
4486 } else {
4487 if (Subtarget.hasZeroCycleRegMove()) {
4488 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
4489 MCRegister DestRegX = TRI->getMatchingSuperReg(
4490 Reg: DestReg, SubIdx: AArch64::sub_32, RC: &AArch64::GPR64spRegClass);
4491 MCRegister SrcRegX = TRI->getMatchingSuperReg(
4492 Reg: SrcReg, SubIdx: AArch64::sub_32, RC: &AArch64::GPR64spRegClass);
4493 // This instruction is reading and writing X registers. This may upset
4494 // the register scavenger and machine verifier, so we need to indicate
4495 // that we are reading an undefined value from SrcRegX, but a proper
4496 // value from SrcReg.
4497 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRXrr), DestReg: DestRegX)
4498 .addReg(RegNo: AArch64::XZR)
4499 .addReg(RegNo: SrcRegX, flags: RegState::Undef)
4500 .addReg(RegNo: SrcReg, flags: RegState::Implicit | getKillRegState(B: KillSrc));
4501 } else {
4502 // Otherwise, expand to ORR WZR.
4503 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRWrr), DestReg)
4504 .addReg(RegNo: AArch64::WZR)
4505 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
4506 }
4507 }
4508 return;
4509 }
4510
4511 // Copy a Predicate register by ORRing with itself.
4512 if (AArch64::PPRRegClass.contains(Reg: DestReg) &&
4513 AArch64::PPRRegClass.contains(Reg: SrcReg)) {
4514 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4515 "Unexpected SVE register.");
4516 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORR_PPzPP), DestReg)
4517 .addReg(RegNo: SrcReg) // Pg
4518 .addReg(RegNo: SrcReg)
4519 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
4520 return;
4521 }
4522
4523 // Copy a predicate-as-counter register by ORRing with itself as if it
4524 // were a regular predicate (mask) register.
4525 bool DestIsPNR = AArch64::PNRRegClass.contains(Reg: DestReg);
4526 bool SrcIsPNR = AArch64::PNRRegClass.contains(Reg: SrcReg);
4527 if (DestIsPNR || SrcIsPNR) {
4528 auto ToPPR = [](MCRegister R) -> MCRegister {
4529 return (R - AArch64::PN0) + AArch64::P0;
4530 };
4531 MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg;
4532 MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg;
4533
4534 if (PPRSrcReg != PPRDestReg) {
4535 auto NewMI = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORR_PPzPP), DestReg: PPRDestReg)
4536 .addReg(RegNo: PPRSrcReg) // Pg
4537 .addReg(RegNo: PPRSrcReg)
4538 .addReg(RegNo: PPRSrcReg, flags: getKillRegState(B: KillSrc));
4539 if (DestIsPNR)
4540 NewMI.addDef(RegNo: DestReg, Flags: RegState::Implicit);
4541 }
4542 return;
4543 }
4544
4545 // Copy a Z register by ORRing with itself.
4546 if (AArch64::ZPRRegClass.contains(Reg: DestReg) &&
4547 AArch64::ZPRRegClass.contains(Reg: SrcReg)) {
4548 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4549 "Unexpected SVE register.");
4550 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORR_ZZZ), DestReg)
4551 .addReg(RegNo: SrcReg)
4552 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
4553 return;
4554 }
4555
4556 // Copy a Z register pair by copying the individual sub-registers.
4557 if ((AArch64::ZPR2RegClass.contains(Reg: DestReg) ||
4558 AArch64::ZPR2StridedOrContiguousRegClass.contains(Reg: DestReg)) &&
4559 (AArch64::ZPR2RegClass.contains(Reg: SrcReg) ||
4560 AArch64::ZPR2StridedOrContiguousRegClass.contains(Reg: SrcReg))) {
4561 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4562 "Unexpected SVE register.");
4563 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
4564 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORR_ZZZ,
4565 Indices);
4566 return;
4567 }
4568
4569 // Copy a Z register triple by copying the individual sub-registers.
4570 if (AArch64::ZPR3RegClass.contains(Reg: DestReg) &&
4571 AArch64::ZPR3RegClass.contains(Reg: SrcReg)) {
4572 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4573 "Unexpected SVE register.");
4574 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
4575 AArch64::zsub2};
4576 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORR_ZZZ,
4577 Indices);
4578 return;
4579 }
4580
4581 // Copy a Z register quad by copying the individual sub-registers.
4582 if ((AArch64::ZPR4RegClass.contains(Reg: DestReg) ||
4583 AArch64::ZPR4StridedOrContiguousRegClass.contains(Reg: DestReg)) &&
4584 (AArch64::ZPR4RegClass.contains(Reg: SrcReg) ||
4585 AArch64::ZPR4StridedOrContiguousRegClass.contains(Reg: SrcReg))) {
4586 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4587 "Unexpected SVE register.");
4588 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
4589 AArch64::zsub2, AArch64::zsub3};
4590 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORR_ZZZ,
4591 Indices);
4592 return;
4593 }
4594
4595 if (AArch64::GPR64spRegClass.contains(Reg: DestReg) &&
4596 (AArch64::GPR64spRegClass.contains(Reg: SrcReg) || SrcReg == AArch64::XZR)) {
4597 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
4598 // If either operand is SP, expand to ADD #0.
4599 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ADDXri), DestReg)
4600 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc))
4601 .addImm(Val: 0)
4602 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0));
4603 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
4604 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::MOVZXi), DestReg)
4605 .addImm(Val: 0)
4606 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0));
4607 } else {
4608 // Otherwise, expand to ORR XZR.
4609 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRXrr), DestReg)
4610 .addReg(RegNo: AArch64::XZR)
4611 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
4612 }
4613 return;
4614 }
4615
4616 // Copy a DDDD register quad by copying the individual sub-registers.
4617 if (AArch64::DDDDRegClass.contains(Reg: DestReg) &&
4618 AArch64::DDDDRegClass.contains(Reg: SrcReg)) {
4619 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
4620 AArch64::dsub2, AArch64::dsub3};
4621 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv8i8,
4622 Indices);
4623 return;
4624 }
4625
4626 // Copy a DDD register triple by copying the individual sub-registers.
4627 if (AArch64::DDDRegClass.contains(Reg: DestReg) &&
4628 AArch64::DDDRegClass.contains(Reg: SrcReg)) {
4629 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
4630 AArch64::dsub2};
4631 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv8i8,
4632 Indices);
4633 return;
4634 }
4635
4636 // Copy a DD register pair by copying the individual sub-registers.
4637 if (AArch64::DDRegClass.contains(Reg: DestReg) &&
4638 AArch64::DDRegClass.contains(Reg: SrcReg)) {
4639 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
4640 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv8i8,
4641 Indices);
4642 return;
4643 }
4644
4645 // Copy a QQQQ register quad by copying the individual sub-registers.
4646 if (AArch64::QQQQRegClass.contains(Reg: DestReg) &&
4647 AArch64::QQQQRegClass.contains(Reg: SrcReg)) {
4648 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
4649 AArch64::qsub2, AArch64::qsub3};
4650 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv16i8,
4651 Indices);
4652 return;
4653 }
4654
4655 // Copy a QQQ register triple by copying the individual sub-registers.
4656 if (AArch64::QQQRegClass.contains(Reg: DestReg) &&
4657 AArch64::QQQRegClass.contains(Reg: SrcReg)) {
4658 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
4659 AArch64::qsub2};
4660 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv16i8,
4661 Indices);
4662 return;
4663 }
4664
4665 // Copy a QQ register pair by copying the individual sub-registers.
4666 if (AArch64::QQRegClass.contains(Reg: DestReg) &&
4667 AArch64::QQRegClass.contains(Reg: SrcReg)) {
4668 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
4669 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv16i8,
4670 Indices);
4671 return;
4672 }
4673
4674 if (AArch64::XSeqPairsClassRegClass.contains(Reg: DestReg) &&
4675 AArch64::XSeqPairsClassRegClass.contains(Reg: SrcReg)) {
4676 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
4677 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRXrs,
4678 ZeroReg: AArch64::XZR, Indices);
4679 return;
4680 }
4681
4682 if (AArch64::WSeqPairsClassRegClass.contains(Reg: DestReg) &&
4683 AArch64::WSeqPairsClassRegClass.contains(Reg: SrcReg)) {
4684 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
4685 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRWrs,
4686 ZeroReg: AArch64::WZR, Indices);
4687 return;
4688 }
4689
4690 if (AArch64::FPR128RegClass.contains(Reg: DestReg) &&
4691 AArch64::FPR128RegClass.contains(Reg: SrcReg)) {
4692 if (Subtarget.isSVEorStreamingSVEAvailable() &&
4693 !Subtarget.isNeonAvailable())
4694 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORR_ZZZ))
4695 .addReg(RegNo: AArch64::Z0 + (DestReg - AArch64::Q0), flags: RegState::Define)
4696 .addReg(RegNo: AArch64::Z0 + (SrcReg - AArch64::Q0))
4697 .addReg(RegNo: AArch64::Z0 + (SrcReg - AArch64::Q0));
4698 else if (Subtarget.isNeonAvailable())
4699 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRv16i8), DestReg)
4700 .addReg(RegNo: SrcReg)
4701 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
4702 else {
4703 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::STRQpre))
4704 .addReg(RegNo: AArch64::SP, flags: RegState::Define)
4705 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc))
4706 .addReg(RegNo: AArch64::SP)
4707 .addImm(Val: -16);
4708 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::LDRQpost))
4709 .addReg(RegNo: AArch64::SP, flags: RegState::Define)
4710 .addReg(RegNo: DestReg, flags: RegState::Define)
4711 .addReg(RegNo: AArch64::SP)
4712 .addImm(Val: 16);
4713 }
4714 return;
4715 }
4716
4717 if (AArch64::FPR64RegClass.contains(Reg: DestReg) &&
4718 AArch64::FPR64RegClass.contains(Reg: SrcReg)) {
4719 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVDr), DestReg)
4720 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
4721 return;
4722 }
4723
4724 if (AArch64::FPR32RegClass.contains(Reg: DestReg) &&
4725 AArch64::FPR32RegClass.contains(Reg: SrcReg)) {
4726 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVSr), DestReg)
4727 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
4728 return;
4729 }
4730
4731 if (AArch64::FPR16RegClass.contains(Reg: DestReg) &&
4732 AArch64::FPR16RegClass.contains(Reg: SrcReg)) {
4733 DestReg =
4734 RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::hsub, RC: &AArch64::FPR32RegClass);
4735 SrcReg =
4736 RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::hsub, RC: &AArch64::FPR32RegClass);
4737 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVSr), DestReg)
4738 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
4739 return;
4740 }
4741
4742 if (AArch64::FPR8RegClass.contains(Reg: DestReg) &&
4743 AArch64::FPR8RegClass.contains(Reg: SrcReg)) {
4744 DestReg =
4745 RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::bsub, RC: &AArch64::FPR32RegClass);
4746 SrcReg =
4747 RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::bsub, RC: &AArch64::FPR32RegClass);
4748 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVSr), DestReg)
4749 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
4750 return;
4751 }
4752
4753 // Copies between GPR64 and FPR64.
4754 if (AArch64::FPR64RegClass.contains(Reg: DestReg) &&
4755 AArch64::GPR64RegClass.contains(Reg: SrcReg)) {
4756 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVXDr), DestReg)
4757 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
4758 return;
4759 }
4760 if (AArch64::GPR64RegClass.contains(Reg: DestReg) &&
4761 AArch64::FPR64RegClass.contains(Reg: SrcReg)) {
4762 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVDXr), DestReg)
4763 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
4764 return;
4765 }
4766 // Copies between GPR32 and FPR32.
4767 if (AArch64::FPR32RegClass.contains(Reg: DestReg) &&
4768 AArch64::GPR32RegClass.contains(Reg: SrcReg)) {
4769 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVWSr), DestReg)
4770 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
4771 return;
4772 }
4773 if (AArch64::GPR32RegClass.contains(Reg: DestReg) &&
4774 AArch64::FPR32RegClass.contains(Reg: SrcReg)) {
4775 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVSWr), DestReg)
4776 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc));
4777 return;
4778 }
4779
4780 if (DestReg == AArch64::NZCV) {
4781 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
4782 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::MSR))
4783 .addImm(Val: AArch64SysReg::NZCV)
4784 .addReg(RegNo: SrcReg, flags: getKillRegState(B: KillSrc))
4785 .addReg(RegNo: AArch64::NZCV, flags: RegState::Implicit | RegState::Define);
4786 return;
4787 }
4788
4789 if (SrcReg == AArch64::NZCV) {
4790 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
4791 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::MRS), DestReg)
4792 .addImm(Val: AArch64SysReg::NZCV)
4793 .addReg(RegNo: AArch64::NZCV, flags: RegState::Implicit | getKillRegState(B: KillSrc));
4794 return;
4795 }
4796
4797#ifndef NDEBUG
4798 const TargetRegisterInfo &TRI = getRegisterInfo();
4799 errs() << TRI.getRegAsmName(DestReg) << " = COPY "
4800 << TRI.getRegAsmName(SrcReg) << "\n";
4801#endif
4802 llvm_unreachable("unimplemented reg-to-reg copy");
4803}
4804
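// Store a register that consists of two sub-registers (SubIdx0 and SubIdx1)
// to stack slot FI using the paired store described by MCID.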
4805static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
4806 MachineBasicBlock &MBB,
4807 MachineBasicBlock::iterator InsertBefore,
4808 const MCInstrDesc &MCID,
4809 Register SrcReg, bool IsKill,
4810 unsigned SubIdx0, unsigned SubIdx1, int FI,
4811 MachineMemOperand *MMO) {
4812 Register SrcReg0 = SrcReg;
4813 Register SrcReg1 = SrcReg;
4814 if (SrcReg.isPhysical()) {
4815 SrcReg0 = TRI.getSubReg(Reg: SrcReg, Idx: SubIdx0);
4816 SubIdx0 = 0;
4817 SrcReg1 = TRI.getSubReg(Reg: SrcReg, Idx: SubIdx1);
4818 SubIdx1 = 0;
4819 }
4820 BuildMI(BB&: MBB, I: InsertBefore, MIMD: DebugLoc(), MCID)
4821 .addReg(RegNo: SrcReg0, flags: getKillRegState(B: IsKill), SubReg: SubIdx0)
4822 .addReg(RegNo: SrcReg1, flags: getKillRegState(B: IsKill), SubReg: SubIdx1)
4823 .addFrameIndex(Idx: FI)
4824 .addImm(Val: 0)
4825 .addMemOperand(MMO);
4826}
4827
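/// Spill SrcReg to stack slot FI. The store opcode is chosen from the spill
/// size and register class; SVE spills additionally mark the slot as a
/// scalable-vector stack object.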
4828void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
4829 MachineBasicBlock::iterator MBBI,
4830 Register SrcReg, bool isKill, int FI,
4831 const TargetRegisterClass *RC,
4832 const TargetRegisterInfo *TRI,
4833 Register VReg) const {
4834 MachineFunction &MF = *MBB.getParent();
4835 MachineFrameInfo &MFI = MF.getFrameInfo();
4836
4837 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
4838 MachineMemOperand *MMO =
4839 MF.getMachineMemOperand(PtrInfo, F: MachineMemOperand::MOStore,
4840 Size: MFI.getObjectSize(ObjectIdx: FI), BaseAlignment: MFI.getObjectAlign(ObjectIdx: FI));
4841 unsigned Opc = 0;
4842 bool Offset = true;
4843 MCRegister PNRReg = MCRegister::NoRegister;
4844 unsigned StackID = TargetStackID::Default;
4845 switch (TRI->getSpillSize(RC: *RC)) {
4846 case 1:
4847 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
4848 Opc = AArch64::STRBui;
4849 break;
4850 case 2: {
4851 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
4852 Opc = AArch64::STRHui;
4853 else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
4854 AArch64::PPRRegClass.hasSubClassEq(RC)) {
4855 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4856 "Unexpected register store without SVE store instructions");
4857 Opc = AArch64::STR_PXI;
4858 StackID = TargetStackID::ScalableVector;
4859 }
4860 break;
4861 }
4862 case 4:
4863 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
4864 Opc = AArch64::STRWui;
4865 if (SrcReg.isVirtual())
4866 MF.getRegInfo().constrainRegClass(Reg: SrcReg, RC: &AArch64::GPR32RegClass);
4867 else
4868 assert(SrcReg != AArch64::WSP);
4869 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
4870 Opc = AArch64::STRSui;
4871 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
4872 Opc = AArch64::STR_PPXI;
4873 StackID = TargetStackID::ScalableVector;
4874 }
4875 break;
4876 case 8:
4877 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
4878 Opc = AArch64::STRXui;
4879 if (SrcReg.isVirtual())
4880 MF.getRegInfo().constrainRegClass(Reg: SrcReg, RC: &AArch64::GPR64RegClass);
4881 else
4882 assert(SrcReg != AArch64::SP);
4883 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
4884 Opc = AArch64::STRDui;
4885 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
4886 storeRegPairToStackSlot(TRI: getRegisterInfo(), MBB, InsertBefore: MBBI,
4887 MCID: get(Opcode: AArch64::STPWi), SrcReg, IsKill: isKill,
4888 SubIdx0: AArch64::sube32, SubIdx1: AArch64::subo32, FI, MMO);
4889 return;
4890 }
4891 break;
4892 case 16:
4893 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
4894 Opc = AArch64::STRQui;
4895 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
4896 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4897 Opc = AArch64::ST1Twov1d;
4898 Offset = false;
4899 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
4900 storeRegPairToStackSlot(TRI: getRegisterInfo(), MBB, InsertBefore: MBBI,
4901 MCID: get(Opcode: AArch64::STPXi), SrcReg, IsKill: isKill,
4902 SubIdx0: AArch64::sube64, SubIdx1: AArch64::subo64, FI, MMO);
4903 return;
4904 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
4905 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4906 "Unexpected register store without SVE store instructions");
4907 Opc = AArch64::STR_ZXI;
4908 StackID = TargetStackID::ScalableVector;
4909 }
4910 break;
4911 case 24:
4912 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
4913 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4914 Opc = AArch64::ST1Threev1d;
4915 Offset = false;
4916 }
4917 break;
4918 case 32:
4919 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
4920 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4921 Opc = AArch64::ST1Fourv1d;
4922 Offset = false;
4923 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
4924 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4925 Opc = AArch64::ST1Twov2d;
4926 Offset = false;
4927 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
4928 AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
4929 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4930 "Unexpected register store without SVE store instructions");
4931 Opc = AArch64::STR_ZZXI;
4932 StackID = TargetStackID::ScalableVector;
4933 }
4934 break;
4935 case 48:
4936 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
4937 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4938 Opc = AArch64::ST1Threev2d;
4939 Offset = false;
4940 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
4941 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4942 "Unexpected register store without SVE store instructions");
4943 Opc = AArch64::STR_ZZZXI;
4944 StackID = TargetStackID::ScalableVector;
4945 }
4946 break;
4947 case 64:
4948 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
4949 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4950 Opc = AArch64::ST1Fourv2d;
4951 Offset = false;
4952 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
4953 AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
4954 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4955 "Unexpected register store without SVE store instructions");
4956 Opc = AArch64::STR_ZZZZXI;
4957 StackID = TargetStackID::ScalableVector;
4958 }
4959 break;
4960 }
4961 assert(Opc && "Unknown register class");
4962 MFI.setStackID(ObjectIdx: FI, ID: StackID);
4963
4964 const MachineInstrBuilder MI = BuildMI(BB&: MBB, I: MBBI, MIMD: DebugLoc(), MCID: get(Opcode: Opc))
4965 .addReg(RegNo: SrcReg, flags: getKillRegState(B: isKill))
4966 .addFrameIndex(Idx: FI);
4967
4968 if (Offset)
4969 MI.addImm(Val: 0);
4970 if (PNRReg.isValid())
4971 MI.addDef(RegNo: PNRReg, Flags: RegState::Implicit);
4972 MI.addMemOperand(MMO);
4973}
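// Illustrative sketch (not from the original file): for a plain GPR64 spill
// the switch above selects STRXui, so the emitted instruction is roughly
//   STRXui killed $x19, %stack.0, 0 :: (store (s64) into %stack.0)
// whereas a ZPR spill selects STR_ZXI and additionally tags the frame index
// as a ScalableVector stack object, so its size and offset are resolved
// against the runtime vector length during frame lowering.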
4974
4975static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
4976 MachineBasicBlock &MBB,
4977 MachineBasicBlock::iterator InsertBefore,
4978 const MCInstrDesc &MCID,
4979 Register DestReg, unsigned SubIdx0,
4980 unsigned SubIdx1, int FI,
4981 MachineMemOperand *MMO) {
4982 Register DestReg0 = DestReg;
4983 Register DestReg1 = DestReg;
4984 bool IsUndef = true;
4985 if (DestReg.isPhysical()) {
4986 DestReg0 = TRI.getSubReg(Reg: DestReg, Idx: SubIdx0);
4987 SubIdx0 = 0;
4988 DestReg1 = TRI.getSubReg(Reg: DestReg, Idx: SubIdx1);
4989 SubIdx1 = 0;
4990 IsUndef = false;
4991 }
4992 BuildMI(BB&: MBB, I: InsertBefore, MIMD: DebugLoc(), MCID)
4993 .addReg(RegNo: DestReg0, flags: RegState::Define | getUndefRegState(B: IsUndef), SubReg: SubIdx0)
4994 .addReg(RegNo: DestReg1, flags: RegState::Define | getUndefRegState(B: IsUndef), SubReg: SubIdx1)
4995 .addFrameIndex(Idx: FI)
4996 .addImm(Val: 0)
4997 .addMemOperand(MMO);
4998}
4999
5000void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
5001 MachineBasicBlock::iterator MBBI,
5002 Register DestReg, int FI,
5003 const TargetRegisterClass *RC,
5004 const TargetRegisterInfo *TRI,
5005 Register VReg) const {
5006 MachineFunction &MF = *MBB.getParent();
5007 MachineFrameInfo &MFI = MF.getFrameInfo();
5008 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
5009 MachineMemOperand *MMO =
5010 MF.getMachineMemOperand(PtrInfo, F: MachineMemOperand::MOLoad,
5011 Size: MFI.getObjectSize(ObjectIdx: FI), BaseAlignment: MFI.getObjectAlign(ObjectIdx: FI));
5012
5013 unsigned Opc = 0;
5014 bool Offset = true;
5015 unsigned StackID = TargetStackID::Default;
5016 Register PNRReg = MCRegister::NoRegister;
5017 switch (TRI->getSpillSize(RC: *RC)) {
5018 case 1:
5019 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
5020 Opc = AArch64::LDRBui;
5021 break;
5022 case 2: {
5023 bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
5024 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
5025 Opc = AArch64::LDRHui;
5026 else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
5027 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5028 "Unexpected register load without SVE load instructions");
5029 if (IsPNR)
5030 PNRReg = DestReg;
5031 Opc = AArch64::LDR_PXI;
5032 StackID = TargetStackID::ScalableVector;
5033 }
5034 break;
5035 }
5036 case 4:
5037 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
5038 Opc = AArch64::LDRWui;
5039 if (DestReg.isVirtual())
5040 MF.getRegInfo().constrainRegClass(Reg: DestReg, RC: &AArch64::GPR32RegClass);
5041 else
5042 assert(DestReg != AArch64::WSP);
5043 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
5044 Opc = AArch64::LDRSui;
5045 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
5046 Opc = AArch64::LDR_PPXI;
5047 StackID = TargetStackID::ScalableVector;
5048 }
5049 break;
5050 case 8:
5051 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
5052 Opc = AArch64::LDRXui;
5053 if (DestReg.isVirtual())
5054 MF.getRegInfo().constrainRegClass(Reg: DestReg, RC: &AArch64::GPR64RegClass);
5055 else
5056 assert(DestReg != AArch64::SP);
5057 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
5058 Opc = AArch64::LDRDui;
5059 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
5060 loadRegPairFromStackSlot(TRI: getRegisterInfo(), MBB, InsertBefore: MBBI,
5061 MCID: get(Opcode: AArch64::LDPWi), DestReg, SubIdx0: AArch64::sube32,
5062 SubIdx1: AArch64::subo32, FI, MMO);
5063 return;
5064 }
5065 break;
5066 case 16:
5067 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
5068 Opc = AArch64::LDRQui;
5069 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
5070 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5071 Opc = AArch64::LD1Twov1d;
5072 Offset = false;
5073 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
5074 loadRegPairFromStackSlot(TRI: getRegisterInfo(), MBB, InsertBefore: MBBI,
5075 MCID: get(Opcode: AArch64::LDPXi), DestReg, SubIdx0: AArch64::sube64,
5076 SubIdx1: AArch64::subo64, FI, MMO);
5077 return;
5078 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
5079 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5080 "Unexpected register load without SVE load instructions");
5081 Opc = AArch64::LDR_ZXI;
5082 StackID = TargetStackID::ScalableVector;
5083 }
5084 break;
5085 case 24:
5086 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
5087 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5088 Opc = AArch64::LD1Threev1d;
5089 Offset = false;
5090 }
5091 break;
5092 case 32:
5093 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
5094 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5095 Opc = AArch64::LD1Fourv1d;
5096 Offset = false;
5097 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
5098 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5099 Opc = AArch64::LD1Twov2d;
5100 Offset = false;
5101 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
5102 AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5103 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5104 "Unexpected register load without SVE load instructions");
5105 Opc = AArch64::LDR_ZZXI;
5106 StackID = TargetStackID::ScalableVector;
5107 }
5108 break;
5109 case 48:
5110 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
5111 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5112 Opc = AArch64::LD1Threev2d;
5113 Offset = false;
5114 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
5115 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5116 "Unexpected register load without SVE load instructions");
5117 Opc = AArch64::LDR_ZZZXI;
5118 StackID = TargetStackID::ScalableVector;
5119 }
5120 break;
5121 case 64:
5122 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
5123 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5124 Opc = AArch64::LD1Fourv2d;
5125 Offset = false;
5126 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
5127 AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5128 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5129 "Unexpected register load without SVE load instructions");
5130 Opc = AArch64::LDR_ZZZZXI;
5131 StackID = TargetStackID::ScalableVector;
5132 }
5133 break;
5134 }
5135
5136 assert(Opc && "Unknown register class");
5137 MFI.setStackID(ObjectIdx: FI, ID: StackID);
5138
5139 const MachineInstrBuilder MI = BuildMI(BB&: MBB, I: MBBI, MIMD: DebugLoc(), MCID: get(Opcode: Opc))
5140 .addReg(RegNo: DestReg, flags: getDefRegState(B: true))
5141 .addFrameIndex(Idx: FI);
5142 if (Offset)
5143 MI.addImm(Val: 0);
5144 if (PNRReg.isValid() && !PNRReg.isVirtual())
5145 MI.addDef(RegNo: PNRReg, Flags: RegState::Implicit);
5146 MI.addMemOperand(MMO);
5147}
5148
5149bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
5150 const MachineInstr &UseMI,
5151 const TargetRegisterInfo *TRI) {
5152 return any_of(Range: instructionsWithoutDebug(It: std::next(x: DefMI.getIterator()),
5153 End: UseMI.getIterator()),
5154 P: [TRI](const MachineInstr &I) {
5155 return I.modifiesRegister(Reg: AArch64::NZCV, TRI) ||
5156 I.readsRegister(Reg: AArch64::NZCV, TRI);
5157 });
5158}
5159
5160void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5161 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
5162 // The smallest scalable elements supported by scaled SVE addressing
5163 // modes are predicates, which are 2 scalable bytes in size. So the scalable
5164 // byte offset must always be a multiple of 2.
5165 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
5166
5167 // VGSized offsets are divided by '2', because the VG register is the
5168 // number of 64-bit granules as opposed to 128-bit vector chunks,
5169 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
5170 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
5171 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
5172 ByteSized = Offset.getFixed();
5173 VGSized = Offset.getScalable() / 2;
5174}
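// Worked example (illustrative): a scalable offset of one full SVE vector,
// StackOffset::get(0, 16), has getScalable() == 16 and therefore
// VGSized == 8, matching the rule above that a full vector is VG * 8 bytes;
// a single predicate, StackOffset::get(0, 2), yields VGSized == 1.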
5175
5176/// Decomposes this frame offset into the parts needed to materialise it:
5177/// a fixed number of bytes plus a number of predicate and data vectors.
5178/// For non-scalable offsets only the byte part is non-zero.
5179void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
5180 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
5181 int64_t &NumDataVectors) {
5182 // The smallest scalable elements supported by scaled SVE addressing
5183 // modes are predicates, which are 2 scalable bytes in size. So the scalable
5184 // byte offset must always be a multiple of 2.
5185 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
5186
5187 NumBytes = Offset.getFixed();
5188 NumDataVectors = 0;
5189 NumPredicateVectors = Offset.getScalable() / 2;
5190 // This method is used to get the offsets to adjust the frame offset.
5191 // If the function requires ADDPL to be used and needs more than two ADDPL
5192 // instructions, part of the offset is folded into NumDataVectors so that it
5193 // uses ADDVL for part of it, reducing the number of ADDPL instructions.
5194 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
5195 NumPredicateVectors > 62) {
5196 NumDataVectors = NumPredicateVectors / 8;
5197 NumPredicateVectors -= NumDataVectors * 8;
5198 }
5199}
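// Worked example (illustrative): StackOffset::get(16, 18) yields
// NumBytes == 16 and NumPredicateVectors == 9; 9 is neither a multiple of 8
// nor outside [-64, 62], so nothing is folded into NumDataVectors.
// StackOffset::get(0, 130) yields 65 predicate vectors, which exceeds 62,
// so it is split into NumDataVectors == 8 and NumPredicateVectors == 1
// (one ADDVL plus one ADDPL instead of a long chain of ADDPLs).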
5200
5201// Convenience function to create a DWARF expression for
5202// Expr + NumBytes + NumVGScaledBytes * AArch64::VG
5203static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes,
5204 int NumVGScaledBytes, unsigned VG,
5205 llvm::raw_string_ostream &Comment) {
5206 uint8_t buffer[16];
5207
5208 if (NumBytes) {
5209 Expr.push_back(Elt: dwarf::DW_OP_consts);
5210 Expr.append(in_start: buffer, in_end: buffer + encodeSLEB128(Value: NumBytes, p: buffer));
5211 Expr.push_back(Elt: (uint8_t)dwarf::DW_OP_plus);
5212 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(x: NumBytes);
5213 }
5214
5215 if (NumVGScaledBytes) {
5216 Expr.push_back(Elt: (uint8_t)dwarf::DW_OP_consts);
5217 Expr.append(in_start: buffer, in_end: buffer + encodeSLEB128(Value: NumVGScaledBytes, p: buffer));
5218
5219 Expr.push_back(Elt: (uint8_t)dwarf::DW_OP_bregx);
5220 Expr.append(in_start: buffer, in_end: buffer + encodeULEB128(Value: VG, p: buffer));
5221 Expr.push_back(Elt: 0);
5222
5223 Expr.push_back(Elt: (uint8_t)dwarf::DW_OP_mul);
5224 Expr.push_back(Elt: (uint8_t)dwarf::DW_OP_plus);
5225
5226 Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
5227 << std::abs(x: NumVGScaledBytes) << " * VG";
5228 }
5229}
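// Illustrative sketch of the bytes appended above, assuming NumBytes == -16
// and NumVGScaledBytes == -8 (the DWARF number used for VG is whatever
// TRI.getDwarfRegNum returns for AArch64::VG):
//   DW_OP_consts -16, DW_OP_plus,
//   DW_OP_consts -8, DW_OP_bregx <VG> 0, DW_OP_mul, DW_OP_plus
// with the generated comment reading " - 16 - 8 * VG".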
5230
5231// Creates an MCCFIInstruction:
5232// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
5233static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
5234 unsigned Reg,
5235 const StackOffset &Offset) {
5236 int64_t NumBytes, NumVGScaledBytes;
5237 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, ByteSized&: NumBytes,
5238 VGSized&: NumVGScaledBytes);
5239 std::string CommentBuffer;
5240 llvm::raw_string_ostream Comment(CommentBuffer);
5241
5242 if (Reg == AArch64::SP)
5243 Comment << "sp";
5244 else if (Reg == AArch64::FP)
5245 Comment << "fp";
5246 else
5247 Comment << printReg(Reg, TRI: &TRI);
5248
5249 // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG)
5250 SmallString<64> Expr;
5251 unsigned DwarfReg = TRI.getDwarfRegNum(RegNum: Reg, isEH: true);
5252 Expr.push_back(Elt: (uint8_t)(dwarf::DW_OP_breg0 + DwarfReg));
5253 Expr.push_back(Elt: 0);
5254 appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
5255 VG: TRI.getDwarfRegNum(RegNum: AArch64::VG, isEH: true), Comment);
5256
5257 // Wrap this into DW_CFA_def_cfa.
5258 SmallString<64> DefCfaExpr;
5259 DefCfaExpr.push_back(Elt: dwarf::DW_CFA_def_cfa_expression);
5260 uint8_t buffer[16];
5261 DefCfaExpr.append(in_start: buffer, in_end: buffer + encodeULEB128(Value: Expr.size(), p: buffer));
5262 DefCfaExpr.append(RHS: Expr.str());
5263 return MCCFIInstruction::createEscape(L: nullptr, Vals: DefCfaExpr.str(), Loc: SMLoc(),
5264 Comment: Comment.str());
5265}
5266
5267MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
5268 unsigned FrameReg, unsigned Reg,
5269 const StackOffset &Offset,
5270 bool LastAdjustmentWasScalable) {
5271 if (Offset.getScalable())
5272 return createDefCFAExpression(TRI, Reg, Offset);
5273
5274 if (FrameReg == Reg && !LastAdjustmentWasScalable)
5275 return MCCFIInstruction::cfiDefCfaOffset(L: nullptr, Offset: int(Offset.getFixed()));
5276
5277 unsigned DwarfReg = TRI.getDwarfRegNum(RegNum: Reg, isEH: true);
5278 return MCCFIInstruction::cfiDefCfa(L: nullptr, Register: DwarfReg, Offset: (int)Offset.getFixed());
5279}
5280
5281MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
5282 unsigned Reg,
5283 const StackOffset &OffsetFromDefCFA) {
5284 int64_t NumBytes, NumVGScaledBytes;
5285 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5286 Offset: OffsetFromDefCFA, ByteSized&: NumBytes, VGSized&: NumVGScaledBytes);
5287
5288 unsigned DwarfReg = TRI.getDwarfRegNum(RegNum: Reg, isEH: true);
5289
5290 // Non-scalable offsets can use DW_CFA_offset directly.
5291 if (!NumVGScaledBytes)
5292 return MCCFIInstruction::createOffset(L: nullptr, Register: DwarfReg, Offset: NumBytes);
5293
5294 std::string CommentBuffer;
5295 llvm::raw_string_ostream Comment(CommentBuffer);
5296 Comment << printReg(Reg, TRI: &TRI) << " @ cfa";
5297
5298 // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
5299 SmallString<64> OffsetExpr;
5300 appendVGScaledOffsetExpr(Expr&: OffsetExpr, NumBytes, NumVGScaledBytes,
5301 VG: TRI.getDwarfRegNum(RegNum: AArch64::VG, isEH: true), Comment);
5302
5303 // Wrap this into DW_CFA_expression
5304 SmallString<64> CfaExpr;
5305 CfaExpr.push_back(Elt: dwarf::DW_CFA_expression);
5306 uint8_t buffer[16];
5307 CfaExpr.append(in_start: buffer, in_end: buffer + encodeULEB128(Value: DwarfReg, p: buffer));
5308 CfaExpr.append(in_start: buffer, in_end: buffer + encodeULEB128(Value: OffsetExpr.size(), p: buffer));
5309 CfaExpr.append(RHS: OffsetExpr.str());
5310
5311 return MCCFIInstruction::createEscape(L: nullptr, Vals: CfaExpr.str(), Loc: SMLoc(),
5312 Comment: Comment.str());
5313}
5314
5315// Helper function to emit a frame offset adjustment from a given
5316// pointer (SrcReg), stored into DestReg. This function is explicit
5317// in that it requires the opcode.
5318static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
5319 MachineBasicBlock::iterator MBBI,
5320 const DebugLoc &DL, unsigned DestReg,
5321 unsigned SrcReg, int64_t Offset, unsigned Opc,
5322 const TargetInstrInfo *TII,
5323 MachineInstr::MIFlag Flag, bool NeedsWinCFI,
5324 bool *HasWinCFI, bool EmitCFAOffset,
5325 StackOffset CFAOffset, unsigned FrameReg) {
5326 int Sign = 1;
5327 unsigned MaxEncoding, ShiftSize;
5328 switch (Opc) {
5329 case AArch64::ADDXri:
5330 case AArch64::ADDSXri:
5331 case AArch64::SUBXri:
5332 case AArch64::SUBSXri:
5333 MaxEncoding = 0xfff;
5334 ShiftSize = 12;
5335 break;
5336 case AArch64::ADDVL_XXI:
5337 case AArch64::ADDPL_XXI:
5338 case AArch64::ADDSVL_XXI:
5339 case AArch64::ADDSPL_XXI:
5340 MaxEncoding = 31;
5341 ShiftSize = 0;
5342 if (Offset < 0) {
5343 MaxEncoding = 32;
5344 Sign = -1;
5345 Offset = -Offset;
5346 }
5347 break;
5348 default:
5349 llvm_unreachable("Unsupported opcode");
5350 }
5351
5352 // `Offset` can be in bytes or in "scalable bytes".
5353 int VScale = 1;
5354 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
5355 VScale = 16;
5356 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
5357 VScale = 2;
5358
5359 // FIXME: If the offset won't fit in 24-bits, compute the offset into a
5360 // scratch register. If DestReg is a virtual register, use it as the
5361 // scratch register; otherwise, create a new virtual register (to be
5362 // replaced by the scavenger at the end of PEI). That case can be optimized
5363 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
5364 // register can be loaded with offset%8 and the add/sub can use an extending
5365 // instruction with LSL#3.
5366 // Currently the function handles any offsets but generates a poor sequence
5367 // of code.
5368 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
5369
5370 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
5371 Register TmpReg = DestReg;
5372 if (TmpReg == AArch64::XZR)
5373 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
5374 RegClass: &AArch64::GPR64RegClass);
5375 do {
5376 uint64_t ThisVal = std::min<uint64_t>(a: Offset, b: MaxEncodableValue);
5377 unsigned LocalShiftSize = 0;
5378 if (ThisVal > MaxEncoding) {
5379 ThisVal = ThisVal >> ShiftSize;
5380 LocalShiftSize = ShiftSize;
5381 }
5382 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
5383 "Encoding cannot handle value that big");
5384
5385 Offset -= ThisVal << LocalShiftSize;
5386 if (Offset == 0)
5387 TmpReg = DestReg;
5388 auto MBI = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: TmpReg)
5389 .addReg(RegNo: SrcReg)
5390 .addImm(Val: Sign * (int)ThisVal);
5391 if (ShiftSize)
5392 MBI = MBI.addImm(
5393 Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: LocalShiftSize));
5394 MBI = MBI.setMIFlag(Flag);
5395
5396 auto Change =
5397 VScale == 1
5398 ? StackOffset::getFixed(Fixed: ThisVal << LocalShiftSize)
5399 : StackOffset::getScalable(Scalable: VScale * (ThisVal << LocalShiftSize));
5400 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
5401 CFAOffset += Change;
5402 else
5403 CFAOffset -= Change;
5404 if (EmitCFAOffset && DestReg == TmpReg) {
5405 MachineFunction &MF = *MBB.getParent();
5406 const TargetSubtargetInfo &STI = MF.getSubtarget();
5407 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
5408
5409 unsigned CFIIndex = MF.addFrameInst(
5410 Inst: createDefCFA(TRI, FrameReg, Reg: DestReg, Offset: CFAOffset, LastAdjustmentWasScalable: VScale != 1));
5411 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::CFI_INSTRUCTION))
5412 .addCFIIndex(CFIIndex)
5413 .setMIFlags(Flag);
5414 }
5415
5416 if (NeedsWinCFI) {
5417 assert(Sign == 1 && "SEH directives should always have a positive sign");
5418 int Imm = (int)(ThisVal << LocalShiftSize);
5419 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
5420 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
5421 if (HasWinCFI)
5422 *HasWinCFI = true;
5423 if (Imm == 0)
5424 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_SetFP)).setMIFlag(Flag);
5425 else
5426 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_AddFP))
5427 .addImm(Val: Imm)
5428 .setMIFlag(Flag);
5429 assert(Offset == 0 && "Expected remaining offset to be zero to "
5430 "emit a single SEH directive");
5431 } else if (DestReg == AArch64::SP) {
5432 if (HasWinCFI)
5433 *HasWinCFI = true;
5434 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
5435 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_StackAlloc))
5436 .addImm(Val: Imm)
5437 .setMIFlag(Flag);
5438 }
5439 }
5440
5441 SrcReg = TmpReg;
5442 } while (Offset);
5443}
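// Worked example (illustrative): with Opc == ADDXri (MaxEncoding == 0xfff,
// ShiftSize == 12) an offset of 0x112233 does not fit one instruction, so
// the loop above emits two:
//   add Dst, Src, #0x112, lsl #12    // covers 0x112000
//   add Dst, Dst, #0x233             // covers the remaining 0x233
// For ADDVL/ADDPL the immediate is a plain vector/predicate count
// (MaxEncoding == 31, or 32 for negative offsets), so large scalable
// offsets are likewise split into repeated adds.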
5444
5445void llvm::emitFrameOffset(MachineBasicBlock &MBB,
5446 MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
5447 unsigned DestReg, unsigned SrcReg,
5448 StackOffset Offset, const TargetInstrInfo *TII,
5449 MachineInstr::MIFlag Flag, bool SetNZCV,
5450 bool NeedsWinCFI, bool *HasWinCFI,
5451 bool EmitCFAOffset, StackOffset CFAOffset,
5452 unsigned FrameReg) {
5453 // If a function is marked as arm_locally_streaming, then the runtime value of
5454 // vscale in the prologue/epilogue differs from the runtime value of vscale
5455 // in the function's body. To avoid having to consider multiple vscales,
5456 // we can use `addsvl` to allocate any scalable stack-slots, which under
5457 // most circumstances will be only locals, not callee-save slots.
5458 const Function &F = MBB.getParent()->getFunction();
5459 bool UseSVL = F.hasFnAttribute(Kind: "aarch64_pstate_sm_body");
5460
5461 int64_t Bytes, NumPredicateVectors, NumDataVectors;
5462 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
5463 Offset, NumBytes&: Bytes, NumPredicateVectors, NumDataVectors);
5464
5465 // First emit non-scalable frame offsets, or a simple 'mov'.
5466 if (Bytes || (!Offset && SrcReg != DestReg)) {
5467 assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
5468 "SP increment/decrement not 8-byte aligned");
5469 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
5470 if (Bytes < 0) {
5471 Bytes = -Bytes;
5472 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
5473 }
5474 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Offset: Bytes, Opc, TII, Flag,
5475 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
5476 FrameReg);
5477 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
5478 ? StackOffset::getFixed(Fixed: -Bytes)
5479 : StackOffset::getFixed(Fixed: Bytes);
5480 SrcReg = DestReg;
5481 FrameReg = DestReg;
5482 }
5483
5484 assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
5485 "SetNZCV not supported with SVE vectors");
5486 assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
5487 "WinCFI not supported with SVE vectors");
5488
5489 if (NumDataVectors) {
5490 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Offset: NumDataVectors,
5491 Opc: UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI,
5492 TII, Flag, NeedsWinCFI, HasWinCFI: nullptr, EmitCFAOffset,
5493 CFAOffset, FrameReg);
5494 CFAOffset += StackOffset::getScalable(Scalable: -NumDataVectors * 16);
5495 SrcReg = DestReg;
5496 }
5497
5498 if (NumPredicateVectors) {
5499 assert(DestReg != AArch64::SP && "Unaligned access to SP");
5500 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Offset: NumPredicateVectors,
5501 Opc: UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI,
5502 TII, Flag, NeedsWinCFI, HasWinCFI: nullptr, EmitCFAOffset,
5503 CFAOffset, FrameReg);
5504 }
5505}
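// Illustrative sketch: a call such as
//   emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
//                   StackOffset::get(-16, -32), TII, ...);
// first emits the fixed part as 'sub sp, sp, #16' and then, because the
// -32 scalable bytes decompose into NumDataVectors == -2, a single
// 'addvl sp, sp, #-2' (or 'addsvl' in an aarch64_pstate_sm_body function).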
5506
5507MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
5508 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
5509 MachineBasicBlock::iterator InsertPt, int FrameIndex,
5510 LiveIntervals *LIS, VirtRegMap *VRM) const {
5511 // This is a bit of a hack. Consider this instruction:
5512 //
5513 // %0 = COPY %sp; GPR64all:%0
5514 //
5515 // We explicitly chose GPR64all for the virtual register so such a copy might
5516 // be eliminated by RegisterCoalescer. However, that may not be possible, and
5517 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
5518 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
5519 //
5520 // To prevent that, we are going to constrain the %0 register class here.
5521 if (MI.isFullCopy()) {
5522 Register DstReg = MI.getOperand(i: 0).getReg();
5523 Register SrcReg = MI.getOperand(i: 1).getReg();
5524 if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
5525 MF.getRegInfo().constrainRegClass(Reg: DstReg, RC: &AArch64::GPR64RegClass);
5526 return nullptr;
5527 }
5528 if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
5529 MF.getRegInfo().constrainRegClass(Reg: SrcReg, RC: &AArch64::GPR64RegClass);
5530 return nullptr;
5531 }
5532 // Nothing can be folded with a copy from/to NZCV.
5533 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
5534 return nullptr;
5535 }
5536
5537 // Handle the case where a copy is being spilled or filled but the source
5538 // and destination register class don't match. For example:
5539 //
5540 // %0 = COPY %xzr; GPR64common:%0
5541 //
5542 // In this case we can still safely fold away the COPY and generate the
5543 // following spill code:
5544 //
5545 // STRXui %xzr, %stack.0
5546 //
5547 // This also eliminates spilled cross register class COPYs (e.g. between x and
5548 // d regs) of the same size. For example:
5549 //
5550 // %0 = COPY %1; GPR64:%0, FPR64:%1
5551 //
5552 // will be filled as
5553 //
5554 // LDRDui %0, fi<#0>
5555 //
5556 // instead of
5557 //
5558 // LDRXui %Temp, fi<#0>
5559 // %0 = FMOV %Temp
5560 //
5561 if (MI.isCopy() && Ops.size() == 1 &&
5562 // Make sure we're only folding the explicit COPY defs/uses.
5563 (Ops[0] == 0 || Ops[0] == 1)) {
5564 bool IsSpill = Ops[0] == 0;
5565 bool IsFill = !IsSpill;
5566 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
5567 const MachineRegisterInfo &MRI = MF.getRegInfo();
5568 MachineBasicBlock &MBB = *MI.getParent();
5569 const MachineOperand &DstMO = MI.getOperand(i: 0);
5570 const MachineOperand &SrcMO = MI.getOperand(i: 1);
5571 Register DstReg = DstMO.getReg();
5572 Register SrcReg = SrcMO.getReg();
5573 // This is slightly expensive to compute for physical regs since
5574 // getMinimalPhysRegClass is slow.
5575 auto getRegClass = [&](unsigned Reg) {
5576 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
5577 : TRI.getMinimalPhysRegClass(Reg);
5578 };
5579
5580 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
5581 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
5582 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
5583 "Mismatched register size in non subreg COPY");
5584 if (IsSpill)
5585 storeRegToStackSlot(MBB, MBBI: InsertPt, SrcReg, isKill: SrcMO.isKill(), FI: FrameIndex,
5586 RC: getRegClass(SrcReg), TRI: &TRI, VReg: Register());
5587 else
5588 loadRegFromStackSlot(MBB, MBBI: InsertPt, DestReg: DstReg, FI: FrameIndex,
5589 RC: getRegClass(DstReg), TRI: &TRI, VReg: Register());
5590 return &*--InsertPt;
5591 }
5592
5593 // Handle cases like spilling def of:
5594 //
5595 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
5596 //
5597 // where the physical register source can be widened and stored to the full
5598 // virtual reg destination stack slot, in this case producing:
5599 //
5600 // STRXui %xzr, %stack.0
5601 //
5602 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
5603 TRI.getRegSizeInBits(RC: *getRegClass(DstReg)) == 64) {
5604 assert(SrcMO.getSubReg() == 0 &&
5605 "Unexpected subreg on physical register");
5606 storeRegToStackSlot(MBB, MBBI: InsertPt, SrcReg: AArch64::XZR, isKill: SrcMO.isKill(),
5607 FI: FrameIndex, RC: &AArch64::GPR64RegClass, TRI: &TRI,
5608 VReg: Register());
5609 return &*--InsertPt;
5610 }
5611
5612 // Handle cases like filling use of:
5613 //
5614 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
5615 //
5616 // where we can load the full virtual reg source stack slot, into the subreg
5617 // destination, in this case producing:
5618 //
5619 // LDRWui %0:sub_32<def,read-undef>, %stack.0
5620 //
5621 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
5622 const TargetRegisterClass *FillRC;
5623 switch (DstMO.getSubReg()) {
5624 default:
5625 FillRC = nullptr;
5626 break;
5627 case AArch64::sub_32:
5628 FillRC = &AArch64::GPR32RegClass;
5629 break;
5630 case AArch64::ssub:
5631 FillRC = &AArch64::FPR32RegClass;
5632 break;
5633 case AArch64::dsub:
5634 FillRC = &AArch64::FPR64RegClass;
5635 break;
5636 }
5637
5638 if (FillRC) {
5639 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
5640 TRI.getRegSizeInBits(*FillRC) &&
5641 "Mismatched regclass size on folded subreg COPY");
5642 loadRegFromStackSlot(MBB, MBBI: InsertPt, DestReg: DstReg, FI: FrameIndex, RC: FillRC, TRI: &TRI,
5643 VReg: Register());
5644 MachineInstr &LoadMI = *--InsertPt;
5645 MachineOperand &LoadDst = LoadMI.getOperand(i: 0);
5646 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
5647 LoadDst.setSubReg(DstMO.getSubReg());
5648 LoadDst.setIsUndef();
5649 return &LoadMI;
5650 }
5651 }
5652 }
5653
5654 // Cannot fold.
5655 return nullptr;
5656}
5657
5658int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
5659 StackOffset &SOffset,
5660 bool *OutUseUnscaledOp,
5661 unsigned *OutUnscaledOp,
5662 int64_t *EmittableOffset) {
5663 // Set output values in case of early exit.
5664 if (EmittableOffset)
5665 *EmittableOffset = 0;
5666 if (OutUseUnscaledOp)
5667 *OutUseUnscaledOp = false;
5668 if (OutUnscaledOp)
5669 *OutUnscaledOp = 0;
5670
5671 // Exit early for structured vector spills/fills as they can't take an
5672 // immediate offset.
5673 switch (MI.getOpcode()) {
5674 default:
5675 break;
5676 case AArch64::LD1Rv1d:
5677 case AArch64::LD1Rv2s:
5678 case AArch64::LD1Rv2d:
5679 case AArch64::LD1Rv4h:
5680 case AArch64::LD1Rv4s:
5681 case AArch64::LD1Rv8b:
5682 case AArch64::LD1Rv8h:
5683 case AArch64::LD1Rv16b:
5684 case AArch64::LD1Twov2d:
5685 case AArch64::LD1Threev2d:
5686 case AArch64::LD1Fourv2d:
5687 case AArch64::LD1Twov1d:
5688 case AArch64::LD1Threev1d:
5689 case AArch64::LD1Fourv1d:
5690 case AArch64::ST1Twov2d:
5691 case AArch64::ST1Threev2d:
5692 case AArch64::ST1Fourv2d:
5693 case AArch64::ST1Twov1d:
5694 case AArch64::ST1Threev1d:
5695 case AArch64::ST1Fourv1d:
5696 case AArch64::ST1i8:
5697 case AArch64::ST1i16:
5698 case AArch64::ST1i32:
5699 case AArch64::ST1i64:
5700 case AArch64::IRG:
5701 case AArch64::IRGstack:
5702 case AArch64::STGloop:
5703 case AArch64::STZGloop:
5704 return AArch64FrameOffsetCannotUpdate;
5705 }
5706
5707 // Get the min/max offset and the scale.
5708 TypeSize ScaleValue(0U, false), Width(0U, false);
5709 int64_t MinOff, MaxOff;
5710 if (!AArch64InstrInfo::getMemOpInfo(Opcode: MI.getOpcode(), Scale&: ScaleValue, Width, MinOffset&: MinOff,
5711 MaxOffset&: MaxOff))
5712 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
5713
5714 // Construct the complete offset.
5715 bool IsMulVL = ScaleValue.isScalable();
5716 unsigned Scale = ScaleValue.getKnownMinValue();
5717 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
5718
5719 const MachineOperand &ImmOpnd =
5720 MI.getOperand(i: AArch64InstrInfo::getLoadStoreImmIdx(Opc: MI.getOpcode()));
5721 Offset += ImmOpnd.getImm() * Scale;
5722
5723 // If the offset doesn't match the scale, we rewrite the instruction to
5724 // use the unscaled instruction instead. Likewise, if we have a negative
5725 // offset and there is an unscaled op to use.
5726 std::optional<unsigned> UnscaledOp =
5727 AArch64InstrInfo::getUnscaledLdSt(Opc: MI.getOpcode());
5728 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
5729 if (useUnscaledOp &&
5730 !AArch64InstrInfo::getMemOpInfo(Opcode: *UnscaledOp, Scale&: ScaleValue, Width, MinOffset&: MinOff,
5731 MaxOffset&: MaxOff))
5732 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
5733
5734 Scale = ScaleValue.getKnownMinValue();
5735 assert(IsMulVL == ScaleValue.isScalable() &&
5736 "Unscaled opcode has different value for scalable");
5737
5738 int64_t Remainder = Offset % Scale;
5739 assert(!(Remainder && useUnscaledOp) &&
5740 "Cannot have remainder when using unscaled op");
5741
5742 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
5743 int64_t NewOffset = Offset / Scale;
5744 if (MinOff <= NewOffset && NewOffset <= MaxOff)
5745 Offset = Remainder;
5746 else {
5747 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
5748 Offset = Offset - (NewOffset * Scale);
5749 }
5750
5751 if (EmittableOffset)
5752 *EmittableOffset = NewOffset;
5753 if (OutUseUnscaledOp)
5754 *OutUseUnscaledOp = useUnscaledOp;
5755 if (OutUnscaledOp && UnscaledOp)
5756 *OutUnscaledOp = *UnscaledOp;
5757
5758 if (IsMulVL)
5759 SOffset = StackOffset::get(Fixed: SOffset.getFixed(), Scalable: Offset);
5760 else
5761 SOffset = StackOffset::get(Fixed: Offset, Scalable: SOffset.getScalable());
5762 return AArch64FrameOffsetCanUpdate |
5763 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
5764}
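// Worked example (illustrative): for an LDRXui (Scale == 8, scaled range
// [0, 4095]) a byte offset of 40 is directly emittable as NewOffset == 5
// with no remainder, whereas a byte offset of 4 is not a multiple of the
// scale, so the routine switches to the unscaled LDURXi form (Scale == 1,
// range [-256, 255]) and reports NewOffset == 4 with *OutUseUnscaledOp set.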
5765
5766bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
5767 unsigned FrameReg, StackOffset &Offset,
5768 const AArch64InstrInfo *TII) {
5769 unsigned Opcode = MI.getOpcode();
5770 unsigned ImmIdx = FrameRegIdx + 1;
5771
5772 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
5773 Offset += StackOffset::getFixed(Fixed: MI.getOperand(i: ImmIdx).getImm());
5774 emitFrameOffset(MBB&: *MI.getParent(), MBBI: MI, DL: MI.getDebugLoc(),
5775 DestReg: MI.getOperand(i: 0).getReg(), SrcReg: FrameReg, Offset, TII,
5776 Flag: MachineInstr::NoFlags, SetNZCV: (Opcode == AArch64::ADDSXri));
5777 MI.eraseFromParent();
5778 Offset = StackOffset();
5779 return true;
5780 }
5781
5782 int64_t NewOffset;
5783 unsigned UnscaledOp;
5784 bool UseUnscaledOp;
5785 int Status = isAArch64FrameOffsetLegal(MI, SOffset&: Offset, OutUseUnscaledOp: &UseUnscaledOp,
5786 OutUnscaledOp: &UnscaledOp, EmittableOffset: &NewOffset);
5787 if (Status & AArch64FrameOffsetCanUpdate) {
5788 if (Status & AArch64FrameOffsetIsLegal)
5789 // Replace the FrameIndex with FrameReg.
5790 MI.getOperand(i: FrameRegIdx).ChangeToRegister(Reg: FrameReg, isDef: false);
5791 if (UseUnscaledOp)
5792 MI.setDesc(TII->get(Opcode: UnscaledOp));
5793
5794 MI.getOperand(i: ImmIdx).ChangeToImmediate(ImmVal: NewOffset);
5795 return !Offset;
5796 }
5797
5798 return false;
5799}
5800
5801void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB,
5802 MachineBasicBlock::iterator MI) const {
5803 DebugLoc DL;
5804 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AArch64::HINT)).addImm(Val: 0);
5805}
5806
5807MCInst AArch64InstrInfo::getNop() const {
5808 return MCInstBuilder(AArch64::HINT).addImm(Val: 0);
5809}
5810
5811// AArch64 supports MachineCombiner.
5812bool AArch64InstrInfo::useMachineCombiner() const { return true; }
5813
5814// True when Opc sets flag
5815static bool isCombineInstrSettingFlag(unsigned Opc) {
5816 switch (Opc) {
5817 case AArch64::ADDSWrr:
5818 case AArch64::ADDSWri:
5819 case AArch64::ADDSXrr:
5820 case AArch64::ADDSXri:
5821 case AArch64::SUBSWrr:
5822 case AArch64::SUBSXrr:
5823 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5824 case AArch64::SUBSWri:
5825 case AArch64::SUBSXri:
5826 return true;
5827 default:
5828 break;
5829 }
5830 return false;
5831}
5832
5833// 32b Opcodes that can be combined with a MUL
5834static bool isCombineInstrCandidate32(unsigned Opc) {
5835 switch (Opc) {
5836 case AArch64::ADDWrr:
5837 case AArch64::ADDWri:
5838 case AArch64::SUBWrr:
5839 case AArch64::ADDSWrr:
5840 case AArch64::ADDSWri:
5841 case AArch64::SUBSWrr:
5842 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5843 case AArch64::SUBWri:
5844 case AArch64::SUBSWri:
5845 return true;
5846 default:
5847 break;
5848 }
5849 return false;
5850}
5851
5852// 64b Opcodes that can be combined with a MUL
5853static bool isCombineInstrCandidate64(unsigned Opc) {
5854 switch (Opc) {
5855 case AArch64::ADDXrr:
5856 case AArch64::ADDXri:
5857 case AArch64::SUBXrr:
5858 case AArch64::ADDSXrr:
5859 case AArch64::ADDSXri:
5860 case AArch64::SUBSXrr:
5861 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5862 case AArch64::SUBXri:
5863 case AArch64::SUBSXri:
5864 case AArch64::ADDv8i8:
5865 case AArch64::ADDv16i8:
5866 case AArch64::ADDv4i16:
5867 case AArch64::ADDv8i16:
5868 case AArch64::ADDv2i32:
5869 case AArch64::ADDv4i32:
5870 case AArch64::SUBv8i8:
5871 case AArch64::SUBv16i8:
5872 case AArch64::SUBv4i16:
5873 case AArch64::SUBv8i16:
5874 case AArch64::SUBv2i32:
5875 case AArch64::SUBv4i32:
5876 return true;
5877 default:
5878 break;
5879 }
5880 return false;
5881}
5882
5883// FP Opcodes that can be combined with a FMUL.
5884static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
5885 switch (Inst.getOpcode()) {
5886 default:
5887 break;
5888 case AArch64::FADDHrr:
5889 case AArch64::FADDSrr:
5890 case AArch64::FADDDrr:
5891 case AArch64::FADDv4f16:
5892 case AArch64::FADDv8f16:
5893 case AArch64::FADDv2f32:
5894 case AArch64::FADDv2f64:
5895 case AArch64::FADDv4f32:
5896 case AArch64::FSUBHrr:
5897 case AArch64::FSUBSrr:
5898 case AArch64::FSUBDrr:
5899 case AArch64::FSUBv4f16:
5900 case AArch64::FSUBv8f16:
5901 case AArch64::FSUBv2f32:
5902 case AArch64::FSUBv2f64:
5903 case AArch64::FSUBv4f32:
5904 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
5905 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
5906 // the target options or if FADD/FSUB has the contract fast-math flag.
5907 return Options.UnsafeFPMath ||
5908 Options.AllowFPOpFusion == FPOpFusion::Fast ||
5909 Inst.getFlag(Flag: MachineInstr::FmContract);
5911 }
5912 return false;
5913}
5914
5915// Opcodes that can be combined with a MUL
5916static bool isCombineInstrCandidate(unsigned Opc) {
5917 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
5918}
5919
5920//
5921// Utility routine that checks if \param MO is defined by an
5922// \param CombineOpc instruction in the basic block \param MBB
5923static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
5924 unsigned CombineOpc, unsigned ZeroReg = 0,
5925 bool CheckZeroReg = false) {
5926 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
5927 MachineInstr *MI = nullptr;
5928
5929 if (MO.isReg() && MO.getReg().isVirtual())
5930 MI = MRI.getUniqueVRegDef(Reg: MO.getReg());
5931 // And it needs to be in the trace (otherwise, it won't have a depth).
5932 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
5933 return false;
5934 // Must only be used by the user we combine with.
5935 if (!MRI.hasOneNonDBGUse(RegNo: MI->getOperand(i: 0).getReg()))
5936 return false;
5937
5938 if (CheckZeroReg) {
5939 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
5940 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
5941 MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
5942 // The third input reg must be zero.
5943 if (MI->getOperand(i: 3).getReg() != ZeroReg)
5944 return false;
5945 }
5946
5947 if (isCombineInstrSettingFlag(Opc: CombineOpc) &&
5948 MI->findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true) == -1)
5949 return false;
5950
5951 return true;
5952}
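// Illustrative example: since 'mul w8, w0, w1' is encoded as
// 'MADDWrrr w8, w0, w1, wzr', a caller using CheckZeroReg verifies that
// operand 3 of the candidate def really is the zero register, that the def
// lives in the same block, and that its result has a single non-debug use
// (the add/sub it is about to be fused into).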
5953
5954//
5955// Is \param MO defined by an integer multiply that can be combined?
5956static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
5957 unsigned MulOpc, unsigned ZeroReg) {
5958 return canCombine(MBB, MO, CombineOpc: MulOpc, ZeroReg, CheckZeroReg: true);
5959}
5960
5961//
5962// Is \param MO defined by a floating-point multiply that can be combined?
5963static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
5964 unsigned MulOpc) {
5965 return canCombine(MBB, MO, CombineOpc: MulOpc);
5966}
5967
5968// TODO: There are many more machine instruction opcodes to match:
5969// 1. Other data types (integer, vectors)
5970// 2. Other math / logic operations (xor, or)
5971// 3. Other forms of the same operation (intrinsics and other variants)
5972bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
5973 bool Invert) const {
5974 if (Invert)
5975 return false;
5976 switch (Inst.getOpcode()) {
5977 // == Floating-point types ==
5978 // -- Floating-point instructions --
5979 case AArch64::FADDHrr:
5980 case AArch64::FADDSrr:
5981 case AArch64::FADDDrr:
5982 case AArch64::FMULHrr:
5983 case AArch64::FMULSrr:
5984 case AArch64::FMULDrr:
5985 case AArch64::FMULX16:
5986 case AArch64::FMULX32:
5987 case AArch64::FMULX64:
5988 // -- Advanced SIMD instructions --
5989 case AArch64::FADDv4f16:
5990 case AArch64::FADDv8f16:
5991 case AArch64::FADDv2f32:
5992 case AArch64::FADDv4f32:
5993 case AArch64::FADDv2f64:
5994 case AArch64::FMULv4f16:
5995 case AArch64::FMULv8f16:
5996 case AArch64::FMULv2f32:
5997 case AArch64::FMULv4f32:
5998 case AArch64::FMULv2f64:
5999 case AArch64::FMULXv4f16:
6000 case AArch64::FMULXv8f16:
6001 case AArch64::FMULXv2f32:
6002 case AArch64::FMULXv4f32:
6003 case AArch64::FMULXv2f64:
6004 // -- SVE instructions --
6005 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
6006 // in the SVE instruction set (though there are predicated ones).
6007 case AArch64::FADD_ZZZ_H:
6008 case AArch64::FADD_ZZZ_S:
6009 case AArch64::FADD_ZZZ_D:
6010 case AArch64::FMUL_ZZZ_H:
6011 case AArch64::FMUL_ZZZ_S:
6012 case AArch64::FMUL_ZZZ_D:
6013 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath ||
6014 (Inst.getFlag(Flag: MachineInstr::MIFlag::FmReassoc) &&
6015 Inst.getFlag(Flag: MachineInstr::MIFlag::FmNsz));
6016
6017 // == Integer types ==
6018 // -- Base instructions --
6019 // Opcodes MULWrr and MULXrr don't exist because
6020 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
6021 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
6022 // The machine combiner does not support three-source-operand machine
6023 // instructions, so we cannot reassociate MULs.
6024 case AArch64::ADDWrr:
6025 case AArch64::ADDXrr:
6026 case AArch64::ANDWrr:
6027 case AArch64::ANDXrr:
6028 case AArch64::ORRWrr:
6029 case AArch64::ORRXrr:
6030 case AArch64::EORWrr:
6031 case AArch64::EORXrr:
6032 case AArch64::EONWrr:
6033 case AArch64::EONXrr:
6034 // -- Advanced SIMD instructions --
6035 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
6036 // in the Advanced SIMD instruction set.
6037 case AArch64::ADDv8i8:
6038 case AArch64::ADDv16i8:
6039 case AArch64::ADDv4i16:
6040 case AArch64::ADDv8i16:
6041 case AArch64::ADDv2i32:
6042 case AArch64::ADDv4i32:
6043 case AArch64::ADDv1i64:
6044 case AArch64::ADDv2i64:
6045 case AArch64::MULv8i8:
6046 case AArch64::MULv16i8:
6047 case AArch64::MULv4i16:
6048 case AArch64::MULv8i16:
6049 case AArch64::MULv2i32:
6050 case AArch64::MULv4i32:
6051 case AArch64::ANDv8i8:
6052 case AArch64::ANDv16i8:
6053 case AArch64::ORRv8i8:
6054 case AArch64::ORRv16i8:
6055 case AArch64::EORv8i8:
6056 case AArch64::EORv16i8:
6057 // -- SVE instructions --
6058 case AArch64::ADD_ZZZ_B:
6059 case AArch64::ADD_ZZZ_H:
6060 case AArch64::ADD_ZZZ_S:
6061 case AArch64::ADD_ZZZ_D:
6062 case AArch64::MUL_ZZZ_B:
6063 case AArch64::MUL_ZZZ_H:
6064 case AArch64::MUL_ZZZ_S:
6065 case AArch64::MUL_ZZZ_D:
6066 case AArch64::AND_ZZZ:
6067 case AArch64::ORR_ZZZ:
6068 case AArch64::EOR_ZZZ:
6069 return true;
6070
6071 default:
6072 return false;
6073 }
6074}
6075
6076/// Find instructions that can be turned into madd.
6077static bool getMaddPatterns(MachineInstr &Root,
6078 SmallVectorImpl<unsigned> &Patterns) {
6079 unsigned Opc = Root.getOpcode();
6080 MachineBasicBlock &MBB = *Root.getParent();
6081 bool Found = false;
6082
6083 if (!isCombineInstrCandidate(Opc))
6084 return false;
6085 if (isCombineInstrSettingFlag(Opc)) {
6086 int Cmp_NZCV =
6087 Root.findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true);
6088 // When NZCV is live bail out.
6089 if (Cmp_NZCV == -1)
6090 return false;
6091 unsigned NewOpc = convertToNonFlagSettingOpc(MI: Root);
6092 // When opcode can't change bail out.
6093 // CHECKME: do we miss any cases for opcode conversion?
6094 if (NewOpc == Opc)
6095 return false;
6096 Opc = NewOpc;
6097 }
6098
6099 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
6100 unsigned Pattern) {
6101 if (canCombineWithMUL(MBB, MO&: Root.getOperand(i: Operand), MulOpc: Opcode, ZeroReg)) {
6102 Patterns.push_back(Elt: Pattern);
6103 Found = true;
6104 }
6105 };
6106
6107 auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
6108 if (canCombine(MBB, MO&: Root.getOperand(i: Operand), CombineOpc: Opcode)) {
6109 Patterns.push_back(Elt: Pattern);
6110 Found = true;
6111 }
6112 };
6113
6114 typedef AArch64MachineCombinerPattern MCP;
6115
6116 switch (Opc) {
6117 default:
6118 break;
6119 case AArch64::ADDWrr:
6120 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6121 "ADDWrr does not have register operands");
6122 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
6123 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
6124 break;
6125 case AArch64::ADDXrr:
6126 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
6127 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
6128 break;
6129 case AArch64::SUBWrr:
6130 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
6131 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
6132 break;
6133 case AArch64::SUBXrr:
6134 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
6135 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
6136 break;
6137 case AArch64::ADDWri:
6138 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
6139 break;
6140 case AArch64::ADDXri:
6141 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
6142 break;
6143 case AArch64::SUBWri:
6144 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
6145 break;
6146 case AArch64::SUBXri:
6147 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
6148 break;
6149 case AArch64::ADDv8i8:
6150 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
6151 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
6152 break;
6153 case AArch64::ADDv16i8:
6154 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
6155 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
6156 break;
6157 case AArch64::ADDv4i16:
6158 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
6159 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
6160 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
6161 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
6162 break;
6163 case AArch64::ADDv8i16:
6164 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
6165 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
6166 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
6167 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
6168 break;
6169 case AArch64::ADDv2i32:
6170 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
6171 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
6172 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
6173 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
6174 break;
6175 case AArch64::ADDv4i32:
6176 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
6177 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
6178 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
6179 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
6180 break;
6181 case AArch64::SUBv8i8:
6182 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
6183 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
6184 break;
6185 case AArch64::SUBv16i8:
6186 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
6187 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
6188 break;
6189 case AArch64::SUBv4i16:
6190 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
6191 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
6192 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
6193 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
6194 break;
6195 case AArch64::SUBv8i16:
6196 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
6197 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
6198 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
6199 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
6200 break;
6201 case AArch64::SUBv2i32:
6202 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
6203 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
6204 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
6205 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
6206 break;
6207 case AArch64::SUBv4i32:
6208 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
6209 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
6210 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
6211 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
6212 break;
6213 }
6214 return Found;
6215}
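// Illustrative example of the patterns collected above: for
//   %3:gpr32 = MADDWrrr %0, %1, $wzr   ; a plain 'mul'
//   %4:gpr32 = ADDWrr %3, %2
// getMaddPatterns records MULADDW_OP1, so the pair can later be rewritten
// as a single MADDWrrr %0, %1, %2.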
6216/// Floating-Point Support
6217
6218/// Find floating-point instructions that can be turned into a fused multiply-add.
6219static bool getFMAPatterns(MachineInstr &Root,
6220 SmallVectorImpl<unsigned> &Patterns) {
6221
6222 if (!isCombineInstrCandidateFP(Inst: Root))
6223 return false;
6224
6225 MachineBasicBlock &MBB = *Root.getParent();
6226 bool Found = false;
6227
6228 auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
6229 if (canCombineWithFMUL(MBB, MO&: Root.getOperand(i: Operand), MulOpc: Opcode)) {
6230 Patterns.push_back(Elt: Pattern);
6231 return true;
6232 }
6233 return false;
6234 };
6235
6236 typedef AArch64MachineCombinerPattern MCP;
6237
6238 switch (Root.getOpcode()) {
6239 default:
6240 assert(false && "Unsupported FP instruction in combiner\n");
6241 break;
6242 case AArch64::FADDHrr:
6243 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6244 "FADDHrr does not have register operands");
6245
6246 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
6247 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
6248 break;
6249 case AArch64::FADDSrr:
6250 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6251 "FADDSrr does not have register operands");
6252
6253 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
6254 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
6255
6256 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
6257 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
6258 break;
6259 case AArch64::FADDDrr:
6260 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
6261 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
6262
6263 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
6264 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
6265 break;
6266 case AArch64::FADDv4f16:
6267 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
6268 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
6269
6270 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
6271 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
6272 break;
6273 case AArch64::FADDv8f16:
6274 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
6275 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
6276
6277 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
6278 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
6279 break;
6280 case AArch64::FADDv2f32:
6281 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
6282 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
6283
6284 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
6285 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
6286 break;
6287 case AArch64::FADDv2f64:
6288 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
6289 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
6290
6291 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
6292 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
6293 break;
6294 case AArch64::FADDv4f32:
6295 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
6296 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
6297
6298 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
6299 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
6300 break;
6301 case AArch64::FSUBHrr:
6302 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
6303 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
6304 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
6305 break;
6306 case AArch64::FSUBSrr:
6307 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
6308
6309 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
6310 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
6311
6312 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
6313 break;
6314 case AArch64::FSUBDrr:
6315 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
6316
6317 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
6318 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
6319
6320 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
6321 break;
6322 case AArch64::FSUBv4f16:
6323 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
6324 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
6325
6326 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
6327 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
6328 break;
6329 case AArch64::FSUBv8f16:
6330 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
6331 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
6332
6333 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
6334 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
6335 break;
6336 case AArch64::FSUBv2f32:
6337 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
6338 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
6339
6340 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
6341 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
6342 break;
6343 case AArch64::FSUBv2f64:
6344 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
6345 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
6346
6347 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
6348 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
6349 break;
6350 case AArch64::FSUBv4f32:
6351 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
6352 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
6353
6354 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
6355 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
6356 break;
6357 }
6358 return Found;
6359}
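// Illustrative example: an FADDSrr whose second operand is defined by an
// FMULSrr (with the fusion conditions from isCombineInstrCandidateFP
// satisfied) records FMULADDS_OP2, which is later rewritten to a single
// FMADDSrrr; the vector and indexed cases map onto FMLA/FMLS analogously.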
6360
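/// Find FMUL instructions where one source operand is produced by a DUP from
/// a vector lane; such a pair can later be rewritten by genIndexedMultiply
/// into a lane-indexed FMUL, e.g.
///   DUP  Vd.2s, Vn.s[lane]
///   FMUL Vr.2s, Vm.2s, Vd.2s
///   ==> FMUL Vr.2s, Vm.2s, Vn.s[lane]
/// A no-op COPY between the FMUL and the DUP is looked through.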
6361static bool getFMULPatterns(MachineInstr &Root,
6362 SmallVectorImpl<unsigned> &Patterns) {
6363 MachineBasicBlock &MBB = *Root.getParent();
6364 bool Found = false;
6365
6366 auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
6367 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6368 MachineOperand &MO = Root.getOperand(i: Operand);
6369 MachineInstr *MI = nullptr;
6370 if (MO.isReg() && MO.getReg().isVirtual())
6371 MI = MRI.getUniqueVRegDef(Reg: MO.getReg());
6372    // Look through no-op COPYs in FMUL(COPY(DUP(..)))
6373 if (MI && MI->getOpcode() == TargetOpcode::COPY &&
6374 MI->getOperand(i: 1).getReg().isVirtual())
6375 MI = MRI.getUniqueVRegDef(Reg: MI->getOperand(i: 1).getReg());
6376 if (MI && MI->getOpcode() == Opcode) {
6377 Patterns.push_back(Elt: Pattern);
6378 return true;
6379 }
6380 return false;
6381 };
6382
6383 typedef AArch64MachineCombinerPattern MCP;
6384
6385 switch (Root.getOpcode()) {
6386 default:
6387 return false;
6388 case AArch64::FMULv2f32:
6389 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
6390 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
6391 break;
6392 case AArch64::FMULv2f64:
6393 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
6394 Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
6395 break;
6396 case AArch64::FMULv4f16:
6397 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
6398 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
6399 break;
6400 case AArch64::FMULv4f32:
6401 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
6402 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
6403 break;
6404 case AArch64::FMULv8f16:
6405 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
6406 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
6407 break;
6408 }
6409
6410 return Found;
6411}
6412
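/// Match an FNEG applied to the result of a scalar FMADD so that the pair can
/// be combined into a single FNMADD:
///   FMADD I = A*B + C
///   FNEG  R = -I
///   ==> FNMADD R = -(A*B) - C
/// This requires the contract and nsz fast-math flags on both instructions and
/// that the FMADD result has no other non-debug uses.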
6413static bool getFNEGPatterns(MachineInstr &Root,
6414 SmallVectorImpl<unsigned> &Patterns) {
6415 unsigned Opc = Root.getOpcode();
6416 MachineBasicBlock &MBB = *Root.getParent();
6417 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6418
6419 auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
6420 MachineOperand &MO = Root.getOperand(i: 1);
6421 MachineInstr *MI = MRI.getUniqueVRegDef(Reg: MO.getReg());
6422 if (MI != nullptr && (MI->getOpcode() == Opcode) &&
6423 MRI.hasOneNonDBGUse(RegNo: MI->getOperand(i: 0).getReg()) &&
6424 Root.getFlag(Flag: MachineInstr::MIFlag::FmContract) &&
6425 Root.getFlag(Flag: MachineInstr::MIFlag::FmNsz) &&
6426 MI->getFlag(Flag: MachineInstr::MIFlag::FmContract) &&
6427 MI->getFlag(Flag: MachineInstr::MIFlag::FmNsz)) {
6428 Patterns.push_back(Elt: Pattern);
6429 return true;
6430 }
6431 return false;
6432 };
6433
6434 switch (Opc) {
6435 default:
6436 break;
6437 case AArch64::FNEGDr:
6438 return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
6439 case AArch64::FNEGSr:
6440 return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
6441 }
6442
6443 return false;
6444}
6445
6446/// Return true when a code sequence can improve throughput. It
6447/// should be called only for instructions in loops.
6448/// \param Pattern - combiner pattern
6449bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
6450 switch (Pattern) {
6451 default:
6452 break;
6453 case AArch64MachineCombinerPattern::FMULADDH_OP1:
6454 case AArch64MachineCombinerPattern::FMULADDH_OP2:
6455 case AArch64MachineCombinerPattern::FMULSUBH_OP1:
6456 case AArch64MachineCombinerPattern::FMULSUBH_OP2:
6457 case AArch64MachineCombinerPattern::FMULADDS_OP1:
6458 case AArch64MachineCombinerPattern::FMULADDS_OP2:
6459 case AArch64MachineCombinerPattern::FMULSUBS_OP1:
6460 case AArch64MachineCombinerPattern::FMULSUBS_OP2:
6461 case AArch64MachineCombinerPattern::FMULADDD_OP1:
6462 case AArch64MachineCombinerPattern::FMULADDD_OP2:
6463 case AArch64MachineCombinerPattern::FMULSUBD_OP1:
6464 case AArch64MachineCombinerPattern::FMULSUBD_OP2:
6465 case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
6466 case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
6467 case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
6468 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
6469 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
6470 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
6471 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
6472 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
6473 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
6474 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
6475 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
6476 case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
6477 case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
6478 case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
6479 case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
6480 case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
6481 case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
6482 case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
6483 case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
6484 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
6485 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
6486 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
6487 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
6488 case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
6489 case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
6490 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
6491 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
6492 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1:
6493 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
6494 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1:
6495 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
6496 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
6497 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
6498 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
6499 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
6500 case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
6501 case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
6502 case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
6503 case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
6504 case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
6505 case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
6506 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
6507 case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
6508 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
6509 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2:
6510 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
6511 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2:
6512 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
6513 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2:
6514 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
6515 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2:
6516 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
6517 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2:
6518 case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
6519 case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
6520 case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
6521 case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
6522 case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
6523 case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
6524 case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
6525 case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
6526 case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
6527 case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
6528 case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
6529 case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
6530 case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
6531 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
6532 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
6533 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
6534 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
6535 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
6536 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
6537 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
6538 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
6539 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
6540 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
6541 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
6542 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
6543 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
6544 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
6545 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
6546 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
6547 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
6548 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
6549 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
6550 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
6551 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
6552 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
6553 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
6554 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
6555 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
6556 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
6557 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
6558 return true;
6559 } // end switch (Pattern)
6560 return false;
6561}
6562
6563/// Find other MI combine patterns.
6564static bool getMiscPatterns(MachineInstr &Root,
6565 SmallVectorImpl<unsigned> &Patterns) {
6566 // A - (B + C) ==> (A - B) - C or (A - C) - B
6567 unsigned Opc = Root.getOpcode();
6568 MachineBasicBlock &MBB = *Root.getParent();
6569
6570 switch (Opc) {
6571 case AArch64::SUBWrr:
6572 case AArch64::SUBSWrr:
6573 case AArch64::SUBXrr:
6574 case AArch64::SUBSXrr:
6575 // Found candidate root.
6576 break;
6577 default:
6578 return false;
6579 }
6580
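  // The flag-setting SUBS forms may only be reassociated when NZCV is dead,
  // since the replacement sequence uses non-flag-setting SUBs.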
6581 if (isCombineInstrSettingFlag(Opc) &&
6582 Root.findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true) ==
6583 -1)
6584 return false;
6585
6586 if (canCombine(MBB, MO&: Root.getOperand(i: 2), CombineOpc: AArch64::ADDWrr) ||
6587 canCombine(MBB, MO&: Root.getOperand(i: 2), CombineOpc: AArch64::ADDSWrr) ||
6588 canCombine(MBB, MO&: Root.getOperand(i: 2), CombineOpc: AArch64::ADDXrr) ||
6589 canCombine(MBB, MO&: Root.getOperand(i: 2), CombineOpc: AArch64::ADDSXrr)) {
6590 Patterns.push_back(Elt: AArch64MachineCombinerPattern::SUBADD_OP1);
6591 Patterns.push_back(Elt: AArch64MachineCombinerPattern::SUBADD_OP2);
6592 return true;
6593 }
6594
6595 return false;
6596}
6597
6598CombinerObjective
6599AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
6600 switch (Pattern) {
6601 case AArch64MachineCombinerPattern::SUBADD_OP1:
6602 case AArch64MachineCombinerPattern::SUBADD_OP2:
6603 return CombinerObjective::MustReduceDepth;
6604 default:
6605 return TargetInstrInfo::getCombinerObjective(Pattern);
6606 }
6607}
6608
6609/// Return true when there is potentially a faster code sequence for an
6610/// instruction chain ending in \p Root. All potential patterns are added to
6611/// the \p Patterns vector. Patterns should be sorted in priority order since the
6612/// pattern evaluator stops checking as soon as it finds a faster sequence.
6613
6614bool AArch64InstrInfo::getMachineCombinerPatterns(
6615 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
6616 bool DoRegPressureReduce) const {
6617 // Integer patterns
6618 if (getMaddPatterns(Root, Patterns))
6619 return true;
6620 // Floating point patterns
6621 if (getFMULPatterns(Root, Patterns))
6622 return true;
6623 if (getFMAPatterns(Root, Patterns))
6624 return true;
6625 if (getFNEGPatterns(Root, Patterns))
6626 return true;
6627
6628 // Other patterns
6629 if (getMiscPatterns(Root, Patterns))
6630 return true;
6631
6632 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
6633 DoRegPressureReduce);
6634}
6635
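/// The kind of fused multiply instruction to emit: the scalar three-register
/// form (Default), the lane-indexed vector form (Indexed), or the vector
/// accumulator form (Accumulator).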
6636enum class FMAInstKind { Default, Indexed, Accumulator };
6637/// genFusedMultiply - Generate fused multiply instructions.
6638/// This function supports both integer and floating point instructions.
6639/// A typical example:
6640/// F|MUL I=A,B,0
6641/// F|ADD R,I,C
6642/// ==> F|MADD R,A,B,C
6643/// \param MF Containing MachineFunction
6644/// \param MRI Register information
6645/// \param TII Target information
6646/// \param Root is the F|ADD instruction
6647/// \param [out] InsInstrs is a vector of machine instructions and will
6648/// contain the generated madd instruction
6649/// \param IdxMulOpd is the index of the operand in Root that is the result of
6650/// the F|MUL. In the example above IdxMulOpd is 1.
6651/// \param MaddOpc the opcode of the f|madd instruction
6652/// \param RC Register class of operands
6653/// \param kind The kind of FMA instruction (addressing mode) to be generated
6654/// \param ReplacedAddend is the result register from the instruction
6655/// replacing the non-combined operand, if any.
6656static MachineInstr *
6657genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
6658 const TargetInstrInfo *TII, MachineInstr &Root,
6659 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
6660 unsigned MaddOpc, const TargetRegisterClass *RC,
6661 FMAInstKind kind = FMAInstKind::Default,
6662 const Register *ReplacedAddend = nullptr) {
6663 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
6664
6665 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
6666 MachineInstr *MUL = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: IdxMulOpd).getReg());
6667 Register ResultReg = Root.getOperand(i: 0).getReg();
6668 Register SrcReg0 = MUL->getOperand(i: 1).getReg();
6669 bool Src0IsKill = MUL->getOperand(i: 1).isKill();
6670 Register SrcReg1 = MUL->getOperand(i: 2).getReg();
6671 bool Src1IsKill = MUL->getOperand(i: 2).isKill();
6672
6673 Register SrcReg2;
6674 bool Src2IsKill;
6675 if (ReplacedAddend) {
6676    // If we just generated a new addend, we must be its only use.
6677 SrcReg2 = *ReplacedAddend;
6678 Src2IsKill = true;
6679 } else {
6680 SrcReg2 = Root.getOperand(i: IdxOtherOpd).getReg();
6681 Src2IsKill = Root.getOperand(i: IdxOtherOpd).isKill();
6682 }
6683
6684 if (ResultReg.isVirtual())
6685 MRI.constrainRegClass(Reg: ResultReg, RC);
6686 if (SrcReg0.isVirtual())
6687 MRI.constrainRegClass(Reg: SrcReg0, RC);
6688 if (SrcReg1.isVirtual())
6689 MRI.constrainRegClass(Reg: SrcReg1, RC);
6690 if (SrcReg2.isVirtual())
6691 MRI.constrainRegClass(Reg: SrcReg2, RC);
6692
6693 MachineInstrBuilder MIB;
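  // The scalar MADD/FMADD forms take the addend as the last source operand;
  // the vector accumulator and indexed forms take it first (tied to the
  // destination), and the indexed form also carries the lane immediate from
  // the original multiply.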
6694 if (kind == FMAInstKind::Default)
6695 MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
6696 .addReg(RegNo: SrcReg0, flags: getKillRegState(B: Src0IsKill))
6697 .addReg(RegNo: SrcReg1, flags: getKillRegState(B: Src1IsKill))
6698 .addReg(RegNo: SrcReg2, flags: getKillRegState(B: Src2IsKill));
6699 else if (kind == FMAInstKind::Indexed)
6700 MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
6701 .addReg(RegNo: SrcReg2, flags: getKillRegState(B: Src2IsKill))
6702 .addReg(RegNo: SrcReg0, flags: getKillRegState(B: Src0IsKill))
6703 .addReg(RegNo: SrcReg1, flags: getKillRegState(B: Src1IsKill))
6704 .addImm(Val: MUL->getOperand(i: 3).getImm());
6705 else if (kind == FMAInstKind::Accumulator)
6706 MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
6707 .addReg(RegNo: SrcReg2, flags: getKillRegState(B: Src2IsKill))
6708 .addReg(RegNo: SrcReg0, flags: getKillRegState(B: Src0IsKill))
6709 .addReg(RegNo: SrcReg1, flags: getKillRegState(B: Src1IsKill));
6710 else
6711    assert(false && "Invalid FMA instruction kind");
6712  // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
6713 InsInstrs.push_back(Elt: MIB);
6714 return MUL;
6715}
6716
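/// Combine FNEG(FMADD(A, B, C)) into FNMADD(A, B, C), i.e. -(A*B) - C.
/// Only the scalar single- and double-precision forms are handled; for any
/// other register class nothing is generated and nullptr is returned.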
6717static MachineInstr *
6718genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
6719 const TargetInstrInfo *TII, MachineInstr &Root,
6720 SmallVectorImpl<MachineInstr *> &InsInstrs) {
6721 MachineInstr *MAD = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: 1).getReg());
6722
6723 unsigned Opc = 0;
6724 const TargetRegisterClass *RC = MRI.getRegClass(Reg: MAD->getOperand(i: 0).getReg());
6725 if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6726 Opc = AArch64::FNMADDSrrr;
6727 else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
6728 Opc = AArch64::FNMADDDrrr;
6729 else
6730 return nullptr;
6731
6732 Register ResultReg = Root.getOperand(i: 0).getReg();
6733 Register SrcReg0 = MAD->getOperand(i: 1).getReg();
6734 Register SrcReg1 = MAD->getOperand(i: 2).getReg();
6735 Register SrcReg2 = MAD->getOperand(i: 3).getReg();
6736 bool Src0IsKill = MAD->getOperand(i: 1).isKill();
6737 bool Src1IsKill = MAD->getOperand(i: 2).isKill();
6738 bool Src2IsKill = MAD->getOperand(i: 3).isKill();
6739 if (ResultReg.isVirtual())
6740 MRI.constrainRegClass(Reg: ResultReg, RC);
6741 if (SrcReg0.isVirtual())
6742 MRI.constrainRegClass(Reg: SrcReg0, RC);
6743 if (SrcReg1.isVirtual())
6744 MRI.constrainRegClass(Reg: SrcReg1, RC);
6745 if (SrcReg2.isVirtual())
6746 MRI.constrainRegClass(Reg: SrcReg2, RC);
6747
6748 MachineInstrBuilder MIB =
6749 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: Opc), DestReg: ResultReg)
6750 .addReg(RegNo: SrcReg0, flags: getKillRegState(B: Src0IsKill))
6751 .addReg(RegNo: SrcReg1, flags: getKillRegState(B: Src1IsKill))
6752 .addReg(RegNo: SrcReg2, flags: getKillRegState(B: Src2IsKill));
6753 InsInstrs.push_back(Elt: MIB);
6754
6755 return MAD;
6756}
6757
6758/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
6759static MachineInstr *
6760genIndexedMultiply(MachineInstr &Root,
6761 SmallVectorImpl<MachineInstr *> &InsInstrs,
6762 unsigned IdxDupOp, unsigned MulOpc,
6763 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
6764 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
6765 "Invalid index of FMUL operand");
6766
6767 MachineFunction &MF = *Root.getMF();
6768 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
6769
6770 MachineInstr *Dup =
6771 MF.getRegInfo().getUniqueVRegDef(Reg: Root.getOperand(i: IdxDupOp).getReg());
6772
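  // getFMULPatterns may have matched through a no-op COPY; look through it
  // here as well so we reach the DUP feeding the multiply.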
6773 if (Dup->getOpcode() == TargetOpcode::COPY)
6774 Dup = MRI.getUniqueVRegDef(Reg: Dup->getOperand(i: 1).getReg());
6775
6776 Register DupSrcReg = Dup->getOperand(i: 1).getReg();
6777 MRI.clearKillFlags(Reg: DupSrcReg);
6778 MRI.constrainRegClass(Reg: DupSrcReg, RC);
6779
6780 unsigned DupSrcLane = Dup->getOperand(i: 2).getImm();
6781
6782 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
6783 MachineOperand &MulOp = Root.getOperand(i: IdxMulOp);
6784
6785 Register ResultReg = Root.getOperand(i: 0).getReg();
6786
6787 MachineInstrBuilder MIB;
6788 MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MulOpc), DestReg: ResultReg)
6789 .add(MO: MulOp)
6790 .addReg(RegNo: DupSrcReg)
6791 .addImm(Val: DupSrcLane);
6792
6793 InsInstrs.push_back(Elt: MIB);
6794 return &Root;
6795}
6796
6797/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
6798/// instructions.
6799///
6800/// \see genFusedMultiply
6801static MachineInstr *genFusedMultiplyAcc(
6802 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6803 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6804 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
6805 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6806 kind: FMAInstKind::Accumulator);
6807}
6808
6809/// genNeg - Helper to generate an intermediate negation of the second operand
6810/// of Root
6811static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
6812 const TargetInstrInfo *TII, MachineInstr &Root,
6813 SmallVectorImpl<MachineInstr *> &InsInstrs,
6814 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
6815 unsigned MnegOpc, const TargetRegisterClass *RC) {
6816 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
6817 MachineInstrBuilder MIB =
6818 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MnegOpc), DestReg: NewVR)
6819 .add(MO: Root.getOperand(i: 2));
6820 InsInstrs.push_back(Elt: MIB);
6821
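  // Map NewVR to the index of its defining instruction in InsInstrs (the one
  // just added, at index 0) so the MachineCombiner knows where the new
  // virtual register is produced.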
6822 assert(InstrIdxForVirtReg.empty());
6823 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
6824
6825 return NewVR;
6826}
6827
6828/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
6829/// instructions with an additional negation of the accumulator
6830static MachineInstr *genFusedMultiplyAccNeg(
6831 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6832 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6833 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
6834 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
6835 assert(IdxMulOpd == 1);
6836
6837 Register NewVR =
6838 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
6839 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6840 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
6841}
6842
6843/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
6844/// instructions using the lane-indexed multiply form.
6845///
6846/// \see genFusedMultiply
6847static MachineInstr *genFusedMultiplyIdx(
6848 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6849 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6850 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
6851 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6852 kind: FMAInstKind::Indexed);
6853}
6854
6855/// genFusedMultiplyIdxNeg - Helper to generate lane-indexed fused multiply
6856/// accumulate instructions with an additional negation of the accumulator
6857static MachineInstr *genFusedMultiplyIdxNeg(
6858 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6859 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6860 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
6861 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
6862 assert(IdxMulOpd == 1);
6863
6864 Register NewVR =
6865 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
6866
6867 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6868 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
6869}
6870
6871/// genMaddR - Generate madd instruction and combine mul and add using
6872/// an extra virtual register
6873/// Example - the ADD's immediate operand needs to be materialized in a register:
6874/// MUL I=A,B,0
6875/// ADD R,I,Imm
6876/// ==> ORR V, ZR, Imm
6877/// ==> MADD R,A,B,V
6878/// \param MF Containing MachineFunction
6879/// \param MRI Register information
6880/// \param TII Target information
6881/// \param Root is the ADD instruction
6882/// \param [out] InsInstrs is a vector of machine instructions and will
6883/// contain the generated madd instruction
6884/// \param IdxMulOpd is the index of the operand in Root that is the result of
6885/// the MUL. In the example above IdxMulOpd is 1.
6886/// \param MaddOpc the opcode of the madd instruction
6887/// \param VR is a virtual register that holds the value of an ADD operand
6888/// (V in the example above).
6889/// \param RC Register class of operands
6890static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
6891 const TargetInstrInfo *TII, MachineInstr &Root,
6892 SmallVectorImpl<MachineInstr *> &InsInstrs,
6893 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
6894 const TargetRegisterClass *RC) {
6895 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
6896
6897 MachineInstr *MUL = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: IdxMulOpd).getReg());
6898 Register ResultReg = Root.getOperand(i: 0).getReg();
6899 Register SrcReg0 = MUL->getOperand(i: 1).getReg();
6900 bool Src0IsKill = MUL->getOperand(i: 1).isKill();
6901 Register SrcReg1 = MUL->getOperand(i: 2).getReg();
6902 bool Src1IsKill = MUL->getOperand(i: 2).isKill();
6903
6904 if (ResultReg.isVirtual())
6905 MRI.constrainRegClass(Reg: ResultReg, RC);
6906 if (SrcReg0.isVirtual())
6907 MRI.constrainRegClass(Reg: SrcReg0, RC);
6908 if (SrcReg1.isVirtual())
6909 MRI.constrainRegClass(Reg: SrcReg1, RC);
6910 if (Register::isVirtualRegister(Reg: VR))
6911 MRI.constrainRegClass(Reg: VR, RC);
6912
6913 MachineInstrBuilder MIB =
6914 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
6915 .addReg(RegNo: SrcReg0, flags: getKillRegState(B: Src0IsKill))
6916 .addReg(RegNo: SrcReg1, flags: getKillRegState(B: Src1IsKill))
6917 .addReg(RegNo: VR);
6918 // Insert the MADD
6919 InsInstrs.push_back(Elt: MIB);
6920 return MUL;
6921}
6922
6923/// Do the following transformation
6924/// A - (B + C) ==> (A - B) - C
6925/// A - (B + C) ==> (A - C) - B
6926static void
6927genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
6928 const TargetInstrInfo *TII, MachineInstr &Root,
6929 SmallVectorImpl<MachineInstr *> &InsInstrs,
6930 SmallVectorImpl<MachineInstr *> &DelInstrs,
6931 unsigned IdxOpd1,
6932 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
6933 assert(IdxOpd1 == 1 || IdxOpd1 == 2);
6934 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
6935 MachineInstr *AddMI = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: 2).getReg());
6936
6937 Register ResultReg = Root.getOperand(i: 0).getReg();
6938 Register RegA = Root.getOperand(i: 1).getReg();
6939 bool RegAIsKill = Root.getOperand(i: 1).isKill();
6940 Register RegB = AddMI->getOperand(i: IdxOpd1).getReg();
6941 bool RegBIsKill = AddMI->getOperand(i: IdxOpd1).isKill();
6942 Register RegC = AddMI->getOperand(i: IdxOtherOpd).getReg();
6943 bool RegCIsKill = AddMI->getOperand(i: IdxOtherOpd).isKill();
6944 Register NewVR = MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: RegA));
6945
6946 unsigned Opcode = Root.getOpcode();
6947 if (Opcode == AArch64::SUBSWrr)
6948 Opcode = AArch64::SUBWrr;
6949 else if (Opcode == AArch64::SUBSXrr)
6950 Opcode = AArch64::SUBXrr;
6951 else
6952 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
6953 "Unexpected instruction opcode.");
6954
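  // The reassociated sequence can wrap in its intermediate result even when
  // the original did not, so the nsw/nuw flags must be dropped.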
6955 uint32_t Flags = Root.mergeFlagsWith(Other: *AddMI);
6956 Flags &= ~MachineInstr::NoSWrap;
6957 Flags &= ~MachineInstr::NoUWrap;
6958
6959 MachineInstrBuilder MIB1 =
6960 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode), DestReg: NewVR)
6961 .addReg(RegNo: RegA, flags: getKillRegState(B: RegAIsKill))
6962 .addReg(RegNo: RegB, flags: getKillRegState(B: RegBIsKill))
6963 .setMIFlags(Flags);
6964 MachineInstrBuilder MIB2 =
6965 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode), DestReg: ResultReg)
6966 .addReg(RegNo: NewVR, flags: getKillRegState(B: true))
6967 .addReg(RegNo: RegC, flags: getKillRegState(B: RegCIsKill))
6968 .setMIFlags(Flags);
6969
6970 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
6971 InsInstrs.push_back(Elt: MIB1);
6972 InsInstrs.push_back(Elt: MIB2);
6973 DelInstrs.push_back(Elt: AddMI);
6974 DelInstrs.push_back(Elt: &Root);
6975}
6976
6977/// When getMachineCombinerPatterns() finds potential patterns,
6978/// this function generates the instructions that could replace the
6979/// original code sequence.
6980void AArch64InstrInfo::genAlternativeCodeSequence(
6981 MachineInstr &Root, unsigned Pattern,
6982 SmallVectorImpl<MachineInstr *> &InsInstrs,
6983 SmallVectorImpl<MachineInstr *> &DelInstrs,
6984 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
6985 MachineBasicBlock &MBB = *Root.getParent();
6986 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6987 MachineFunction &MF = *MBB.getParent();
6988 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
6989
6990 MachineInstr *MUL = nullptr;
6991 const TargetRegisterClass *RC;
6992 unsigned Opc;
6993 switch (Pattern) {
6994 default:
6995 // Reassociate instructions.
6996 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
6997 DelInstrs, InstIdxForVirtReg&: InstrIdxForVirtReg);
6998 return;
6999 case AArch64MachineCombinerPattern::SUBADD_OP1:
7000 // A - (B + C)
7001 // ==> (A - B) - C
7002 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, IdxOpd1: 1,
7003 InstrIdxForVirtReg);
7004 return;
7005 case AArch64MachineCombinerPattern::SUBADD_OP2:
7006 // A - (B + C)
7007 // ==> (A - C) - B
7008 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, IdxOpd1: 2,
7009 InstrIdxForVirtReg);
7010 return;
7011 case AArch64MachineCombinerPattern::MULADDW_OP1:
7012 case AArch64MachineCombinerPattern::MULADDX_OP1:
7013 // MUL I=A,B,0
7014 // ADD R,I,C
7015 // ==> MADD R,A,B,C
7016 // --- Create(MADD);
7017 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP1) {
7018 Opc = AArch64::MADDWrrr;
7019 RC = &AArch64::GPR32RegClass;
7020 } else {
7021 Opc = AArch64::MADDXrrr;
7022 RC = &AArch64::GPR64RegClass;
7023 }
7024 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7025 break;
7026 case AArch64MachineCombinerPattern::MULADDW_OP2:
7027 case AArch64MachineCombinerPattern::MULADDX_OP2:
7028 // MUL I=A,B,0
7029 // ADD R,C,I
7030 // ==> MADD R,A,B,C
7031 // --- Create(MADD);
7032 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP2) {
7033 Opc = AArch64::MADDWrrr;
7034 RC = &AArch64::GPR32RegClass;
7035 } else {
7036 Opc = AArch64::MADDXrrr;
7037 RC = &AArch64::GPR64RegClass;
7038 }
7039 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7040 break;
7041 case AArch64MachineCombinerPattern::MULADDWI_OP1:
7042 case AArch64MachineCombinerPattern::MULADDXI_OP1: {
7043 // MUL I=A,B,0
7044 // ADD R,I,Imm
7045 // ==> MOV V, Imm
7046 // ==> MADD R,A,B,V
7047 // --- Create(MADD);
7048 const TargetRegisterClass *OrrRC;
7049 unsigned BitSize, OrrOpc, ZeroReg;
7050 if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1) {
7051 OrrOpc = AArch64::ORRWri;
7052 OrrRC = &AArch64::GPR32spRegClass;
7053 BitSize = 32;
7054 ZeroReg = AArch64::WZR;
7055 Opc = AArch64::MADDWrrr;
7056 RC = &AArch64::GPR32RegClass;
7057 } else {
7058 OrrOpc = AArch64::ORRXri;
7059 OrrRC = &AArch64::GPR64spRegClass;
7060 BitSize = 64;
7061 ZeroReg = AArch64::XZR;
7062 Opc = AArch64::MADDXrrr;
7063 RC = &AArch64::GPR64RegClass;
7064 }
7065 Register NewVR = MRI.createVirtualRegister(RegClass: OrrRC);
7066 uint64_t Imm = Root.getOperand(i: 2).getImm();
7067
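    // The ADD-immediate form may carry an optional left shift of the
    // immediate; fold it in before materializing the value.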
7068 if (Root.getOperand(i: 3).isImm()) {
7069 unsigned Val = Root.getOperand(i: 3).getImm();
7070 Imm = Imm << Val;
7071 }
7072 uint64_t UImm = SignExtend64(X: Imm, B: BitSize);
7073    // Bail out unless the immediate can be composed via a single instruction.
7074 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7075 AArch64_IMM::expandMOVImm(Imm: UImm, BitSize, Insn);
7076 if (Insn.size() != 1)
7077 return;
7078 auto MovI = Insn.begin();
7079 MachineInstrBuilder MIB1;
7080 // MOV is an alias for one of three instructions: movz, movn, and orr.
7081 if (MovI->Opcode == OrrOpc)
7082 MIB1 = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: OrrOpc), DestReg: NewVR)
7083 .addReg(RegNo: ZeroReg)
7084 .addImm(Val: MovI->Op2);
7085 else {
7086 if (BitSize == 32)
7087 assert((MovI->Opcode == AArch64::MOVNWi ||
7088 MovI->Opcode == AArch64::MOVZWi) &&
7089 "Expected opcode");
7090 else
7091 assert((MovI->Opcode == AArch64::MOVNXi ||
7092 MovI->Opcode == AArch64::MOVZXi) &&
7093 "Expected opcode");
7094 MIB1 = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MovI->Opcode), DestReg: NewVR)
7095 .addImm(Val: MovI->Op1)
7096 .addImm(Val: MovI->Op2);
7097 }
7098 InsInstrs.push_back(Elt: MIB1);
7099 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
7100 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, VR: NewVR, RC);
7101 break;
7102 }
7103 case AArch64MachineCombinerPattern::MULSUBW_OP1:
7104 case AArch64MachineCombinerPattern::MULSUBX_OP1: {
7105 // MUL I=A,B,0
7106 // SUB R,I, C
7107 // ==> SUB V, 0, C
7108 // ==> MADD R,A,B,V // = -C + A*B
7109 // --- Create(MADD);
7110 const TargetRegisterClass *SubRC;
7111 unsigned SubOpc, ZeroReg;
7112 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP1) {
7113 SubOpc = AArch64::SUBWrr;
7114 SubRC = &AArch64::GPR32spRegClass;
7115 ZeroReg = AArch64::WZR;
7116 Opc = AArch64::MADDWrrr;
7117 RC = &AArch64::GPR32RegClass;
7118 } else {
7119 SubOpc = AArch64::SUBXrr;
7120 SubRC = &AArch64::GPR64spRegClass;
7121 ZeroReg = AArch64::XZR;
7122 Opc = AArch64::MADDXrrr;
7123 RC = &AArch64::GPR64RegClass;
7124 }
7125 Register NewVR = MRI.createVirtualRegister(RegClass: SubRC);
7126 // SUB NewVR, 0, C
7127 MachineInstrBuilder MIB1 =
7128 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: SubOpc), DestReg: NewVR)
7129 .addReg(RegNo: ZeroReg)
7130 .add(MO: Root.getOperand(i: 2));
7131 InsInstrs.push_back(Elt: MIB1);
7132 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
7133 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, VR: NewVR, RC);
7134 break;
7135 }
7136 case AArch64MachineCombinerPattern::MULSUBW_OP2:
7137 case AArch64MachineCombinerPattern::MULSUBX_OP2:
7138 // MUL I=A,B,0
7139 // SUB R,C,I
7140 // ==> MSUB R,A,B,C (computes C - A*B)
7141 // --- Create(MSUB);
7142 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP2) {
7143 Opc = AArch64::MSUBWrrr;
7144 RC = &AArch64::GPR32RegClass;
7145 } else {
7146 Opc = AArch64::MSUBXrrr;
7147 RC = &AArch64::GPR64RegClass;
7148 }
7149 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7150 break;
7151 case AArch64MachineCombinerPattern::MULSUBWI_OP1:
7152 case AArch64MachineCombinerPattern::MULSUBXI_OP1: {
7153 // MUL I=A,B,0
7154 // SUB R,I, Imm
7155 // ==> MOV V, -Imm
7156 // ==> MADD R,A,B,V // = -Imm + A*B
7157 // --- Create(MADD);
7158 const TargetRegisterClass *OrrRC;
7159 unsigned BitSize, OrrOpc, ZeroReg;
7160 if (Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) {
7161 OrrOpc = AArch64::ORRWri;
7162 OrrRC = &AArch64::GPR32spRegClass;
7163 BitSize = 32;
7164 ZeroReg = AArch64::WZR;
7165 Opc = AArch64::MADDWrrr;
7166 RC = &AArch64::GPR32RegClass;
7167 } else {
7168 OrrOpc = AArch64::ORRXri;
7169 OrrRC = &AArch64::GPR64spRegClass;
7170 BitSize = 64;
7171 ZeroReg = AArch64::XZR;
7172 Opc = AArch64::MADDXrrr;
7173 RC = &AArch64::GPR64RegClass;
7174 }
7175 Register NewVR = MRI.createVirtualRegister(RegClass: OrrRC);
7176 uint64_t Imm = Root.getOperand(i: 2).getImm();
7177 if (Root.getOperand(i: 3).isImm()) {
7178 unsigned Val = Root.getOperand(i: 3).getImm();
7179 Imm = Imm << Val;
7180 }
7181 uint64_t UImm = SignExtend64(X: -Imm, B: BitSize);
7182    // Bail out unless the immediate can be composed via a single instruction.
7183 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7184 AArch64_IMM::expandMOVImm(Imm: UImm, BitSize, Insn);
7185 if (Insn.size() != 1)
7186 return;
7187 auto MovI = Insn.begin();
7188 MachineInstrBuilder MIB1;
7189 // MOV is an alias for one of three instructions: movz, movn, and orr.
7190 if (MovI->Opcode == OrrOpc)
7191 MIB1 = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: OrrOpc), DestReg: NewVR)
7192 .addReg(RegNo: ZeroReg)
7193 .addImm(Val: MovI->Op2);
7194 else {
7195 if (BitSize == 32)
7196 assert((MovI->Opcode == AArch64::MOVNWi ||
7197 MovI->Opcode == AArch64::MOVZWi) &&
7198 "Expected opcode");
7199 else
7200 assert((MovI->Opcode == AArch64::MOVNXi ||
7201 MovI->Opcode == AArch64::MOVZXi) &&
7202 "Expected opcode");
7203 MIB1 = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MovI->Opcode), DestReg: NewVR)
7204 .addImm(Val: MovI->Op1)
7205 .addImm(Val: MovI->Op2);
7206 }
7207 InsInstrs.push_back(Elt: MIB1);
7208 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
7209 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, VR: NewVR, RC);
7210 break;
7211 }
7212
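  // Vector integer multiply-accumulate: ADD(MUL(A, B), C) and ADD(C, MUL(A, B))
  // both map onto MLA, with the non-multiply operand becoming the accumulator.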
7213 case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
7214 Opc = AArch64::MLAv8i8;
7215 RC = &AArch64::FPR64RegClass;
7216 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7217 break;
7218 case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
7219 Opc = AArch64::MLAv8i8;
7220 RC = &AArch64::FPR64RegClass;
7221 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7222 break;
7223 case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
7224 Opc = AArch64::MLAv16i8;
7225 RC = &AArch64::FPR128RegClass;
7226 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7227 break;
7228 case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
7229 Opc = AArch64::MLAv16i8;
7230 RC = &AArch64::FPR128RegClass;
7231 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7232 break;
7233 case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
7234 Opc = AArch64::MLAv4i16;
7235 RC = &AArch64::FPR64RegClass;
7236 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7237 break;
7238 case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
7239 Opc = AArch64::MLAv4i16;
7240 RC = &AArch64::FPR64RegClass;
7241 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7242 break;
7243 case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
7244 Opc = AArch64::MLAv8i16;
7245 RC = &AArch64::FPR128RegClass;
7246 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7247 break;
7248 case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
7249 Opc = AArch64::MLAv8i16;
7250 RC = &AArch64::FPR128RegClass;
7251 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7252 break;
7253 case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
7254 Opc = AArch64::MLAv2i32;
7255 RC = &AArch64::FPR64RegClass;
7256 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7257 break;
7258 case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
7259 Opc = AArch64::MLAv2i32;
7260 RC = &AArch64::FPR64RegClass;
7261 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7262 break;
7263 case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
7264 Opc = AArch64::MLAv4i32;
7265 RC = &AArch64::FPR128RegClass;
7266 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7267 break;
7268 case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
7269 Opc = AArch64::MLAv4i32;
7270 RC = &AArch64::FPR128RegClass;
7271 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7272 break;
7273
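  // Vector integer multiply-subtract: SUB(MUL(A, B), C) is rewritten as
  // MLA(NEG(C), A, B), i.e. A*B - C, while SUB(C, MUL(A, B)) maps directly
  // onto MLS, which computes C - A*B.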
7274 case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
7275 Opc = AArch64::MLAv8i8;
7276 RC = &AArch64::FPR64RegClass;
7277 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7278 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv8i8,
7279 RC);
7280 break;
7281 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
7282 Opc = AArch64::MLSv8i8;
7283 RC = &AArch64::FPR64RegClass;
7284 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7285 break;
7286 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
7287 Opc = AArch64::MLAv16i8;
7288 RC = &AArch64::FPR128RegClass;
7289 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7290 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv16i8,
7291 RC);
7292 break;
7293 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
7294 Opc = AArch64::MLSv16i8;
7295 RC = &AArch64::FPR128RegClass;
7296 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7297 break;
7298 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
7299 Opc = AArch64::MLAv4i16;
7300 RC = &AArch64::FPR64RegClass;
7301 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7302 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv4i16,
7303 RC);
7304 break;
7305 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
7306 Opc = AArch64::MLSv4i16;
7307 RC = &AArch64::FPR64RegClass;
7308 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7309 break;
7310 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
7311 Opc = AArch64::MLAv8i16;
7312 RC = &AArch64::FPR128RegClass;
7313 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7314 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv8i16,
7315 RC);
7316 break;
7317 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
7318 Opc = AArch64::MLSv8i16;
7319 RC = &AArch64::FPR128RegClass;
7320 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7321 break;
7322 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
7323 Opc = AArch64::MLAv2i32;
7324 RC = &AArch64::FPR64RegClass;
7325 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7326 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv2i32,
7327 RC);
7328 break;
7329 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
7330 Opc = AArch64::MLSv2i32;
7331 RC = &AArch64::FPR64RegClass;
7332 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7333 break;
7334 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
7335 Opc = AArch64::MLAv4i32;
7336 RC = &AArch64::FPR128RegClass;
7337 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7338 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv4i32,
7339 RC);
7340 break;
7341 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
7342 Opc = AArch64::MLSv4i32;
7343 RC = &AArch64::FPR128RegClass;
7344 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7345 break;
7346
7347 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
7348 Opc = AArch64::MLAv4i16_indexed;
7349 RC = &AArch64::FPR64RegClass;
7350 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7351 break;
7352 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
7353 Opc = AArch64::MLAv4i16_indexed;
7354 RC = &AArch64::FPR64RegClass;
7355 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7356 break;
7357 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
7358 Opc = AArch64::MLAv8i16_indexed;
7359 RC = &AArch64::FPR128RegClass;
7360 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7361 break;
7362 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
7363 Opc = AArch64::MLAv8i16_indexed;
7364 RC = &AArch64::FPR128RegClass;
7365 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7366 break;
7367 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
7368 Opc = AArch64::MLAv2i32_indexed;
7369 RC = &AArch64::FPR64RegClass;
7370 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7371 break;
7372 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
7373 Opc = AArch64::MLAv2i32_indexed;
7374 RC = &AArch64::FPR64RegClass;
7375 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7376 break;
7377 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
7378 Opc = AArch64::MLAv4i32_indexed;
7379 RC = &AArch64::FPR128RegClass;
7380 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7381 break;
7382 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
7383 Opc = AArch64::MLAv4i32_indexed;
7384 RC = &AArch64::FPR128RegClass;
7385 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7386 break;
7387
7388 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
7389 Opc = AArch64::MLAv4i16_indexed;
7390 RC = &AArch64::FPR64RegClass;
7391 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7392 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv4i16,
7393 RC);
7394 break;
7395 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
7396 Opc = AArch64::MLSv4i16_indexed;
7397 RC = &AArch64::FPR64RegClass;
7398 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7399 break;
7400 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
7401 Opc = AArch64::MLAv8i16_indexed;
7402 RC = &AArch64::FPR128RegClass;
7403 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7404 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv8i16,
7405 RC);
7406 break;
7407 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
7408 Opc = AArch64::MLSv8i16_indexed;
7409 RC = &AArch64::FPR128RegClass;
7410 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7411 break;
7412 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
7413 Opc = AArch64::MLAv2i32_indexed;
7414 RC = &AArch64::FPR64RegClass;
7415 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7416 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv2i32,
7417 RC);
7418 break;
7419 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
7420 Opc = AArch64::MLSv2i32_indexed;
7421 RC = &AArch64::FPR64RegClass;
7422 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7423 break;
7424 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
7425 Opc = AArch64::MLAv4i32_indexed;
7426 RC = &AArch64::FPR128RegClass;
7427 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7428 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv4i32,
7429 RC);
7430 break;
7431 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
7432 Opc = AArch64::MLSv4i32_indexed;
7433 RC = &AArch64::FPR128RegClass;
7434 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7435 break;
7436
7437 // Floating Point Support
7438 case AArch64MachineCombinerPattern::FMULADDH_OP1:
7439 Opc = AArch64::FMADDHrrr;
7440 RC = &AArch64::FPR16RegClass;
7441 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7442 break;
7443 case AArch64MachineCombinerPattern::FMULADDS_OP1:
7444 Opc = AArch64::FMADDSrrr;
7445 RC = &AArch64::FPR32RegClass;
7446 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7447 break;
7448 case AArch64MachineCombinerPattern::FMULADDD_OP1:
7449 Opc = AArch64::FMADDDrrr;
7450 RC = &AArch64::FPR64RegClass;
7451 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7452 break;
7453
7454 case AArch64MachineCombinerPattern::FMULADDH_OP2:
7455 Opc = AArch64::FMADDHrrr;
7456 RC = &AArch64::FPR16RegClass;
7457 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7458 break;
7459 case AArch64MachineCombinerPattern::FMULADDS_OP2:
7460 Opc = AArch64::FMADDSrrr;
7461 RC = &AArch64::FPR32RegClass;
7462 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7463 break;
7464 case AArch64MachineCombinerPattern::FMULADDD_OP2:
7465 Opc = AArch64::FMADDDrrr;
7466 RC = &AArch64::FPR64RegClass;
7467 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7468 break;
7469
7470 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
7471 Opc = AArch64::FMLAv1i32_indexed;
7472 RC = &AArch64::FPR32RegClass;
7473 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7474 kind: FMAInstKind::Indexed);
7475 break;
7476 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
7477 Opc = AArch64::FMLAv1i32_indexed;
7478 RC = &AArch64::FPR32RegClass;
7479 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7480 kind: FMAInstKind::Indexed);
7481 break;
7482
7483 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
7484 Opc = AArch64::FMLAv1i64_indexed;
7485 RC = &AArch64::FPR64RegClass;
7486 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7487 kind: FMAInstKind::Indexed);
7488 break;
7489 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
7490 Opc = AArch64::FMLAv1i64_indexed;
7491 RC = &AArch64::FPR64RegClass;
7492 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7493 kind: FMAInstKind::Indexed);
7494 break;
7495
7496 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
7497 RC = &AArch64::FPR64RegClass;
7498 Opc = AArch64::FMLAv4i16_indexed;
7499 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7500 kind: FMAInstKind::Indexed);
7501 break;
7502 case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
7503 RC = &AArch64::FPR64RegClass;
7504 Opc = AArch64::FMLAv4f16;
7505 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7506 kind: FMAInstKind::Accumulator);
7507 break;
7508 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
7509 RC = &AArch64::FPR64RegClass;
7510 Opc = AArch64::FMLAv4i16_indexed;
7511 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7512 kind: FMAInstKind::Indexed);
7513 break;
7514 case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
7515 RC = &AArch64::FPR64RegClass;
7516 Opc = AArch64::FMLAv4f16;
7517 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7518 kind: FMAInstKind::Accumulator);
7519 break;
7520
7521 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
7522 case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
7523 RC = &AArch64::FPR64RegClass;
7524 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
7525 Opc = AArch64::FMLAv2i32_indexed;
7526 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7527 kind: FMAInstKind::Indexed);
7528 } else {
7529 Opc = AArch64::FMLAv2f32;
7530 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7531 kind: FMAInstKind::Accumulator);
7532 }
7533 break;
7534 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
7535 case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
7536 RC = &AArch64::FPR64RegClass;
7537 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
7538 Opc = AArch64::FMLAv2i32_indexed;
7539 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7540 kind: FMAInstKind::Indexed);
7541 } else {
7542 Opc = AArch64::FMLAv2f32;
7543 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7544 kind: FMAInstKind::Accumulator);
7545 }
7546 break;
7547
7548 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
7549 RC = &AArch64::FPR128RegClass;
7550 Opc = AArch64::FMLAv8i16_indexed;
7551 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7552 kind: FMAInstKind::Indexed);
7553 break;
7554 case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
7555 RC = &AArch64::FPR128RegClass;
7556 Opc = AArch64::FMLAv8f16;
7557 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7558 kind: FMAInstKind::Accumulator);
7559 break;
7560 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
7561 RC = &AArch64::FPR128RegClass;
7562 Opc = AArch64::FMLAv8i16_indexed;
7563 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7564 kind: FMAInstKind::Indexed);
7565 break;
7566 case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
7567 RC = &AArch64::FPR128RegClass;
7568 Opc = AArch64::FMLAv8f16;
7569 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7570 kind: FMAInstKind::Accumulator);
7571 break;
7572
7573 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
7574 case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
7575 RC = &AArch64::FPR128RegClass;
7576 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
7577 Opc = AArch64::FMLAv2i64_indexed;
7578 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7579 kind: FMAInstKind::Indexed);
7580 } else {
7581 Opc = AArch64::FMLAv2f64;
7582 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7583 kind: FMAInstKind::Accumulator);
7584 }
7585 break;
7586 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
7587 case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
7588 RC = &AArch64::FPR128RegClass;
7589 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
7590 Opc = AArch64::FMLAv2i64_indexed;
7591 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7592 kind: FMAInstKind::Indexed);
7593 } else {
7594 Opc = AArch64::FMLAv2f64;
7595 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7596 kind: FMAInstKind::Accumulator);
7597 }
7598 break;
7599
7600 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
7601 case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
7602 RC = &AArch64::FPR128RegClass;
7603 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
7604 Opc = AArch64::FMLAv4i32_indexed;
7605 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7606 kind: FMAInstKind::Indexed);
7607 } else {
7608 Opc = AArch64::FMLAv4f32;
7609 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7610 kind: FMAInstKind::Accumulator);
7611 }
7612 break;
7613
7614 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
7615 case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
7616 RC = &AArch64::FPR128RegClass;
7617 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
7618 Opc = AArch64::FMLAv4i32_indexed;
7619 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7620 kind: FMAInstKind::Indexed);
7621 } else {
7622 Opc = AArch64::FMLAv4f32;
7623 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7624 kind: FMAInstKind::Accumulator);
7625 }
7626 break;
7627
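  // Scalar floating-point multiply-subtract: A*B - C maps onto FNMSUB (which
  // computes Rn*Rm - Ra), C - A*B onto FMSUB, and -(A*B) - C (an FNMUL feeding
  // the FSUB) onto FNMADD.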
7628 case AArch64MachineCombinerPattern::FMULSUBH_OP1:
7629 Opc = AArch64::FNMSUBHrrr;
7630 RC = &AArch64::FPR16RegClass;
7631 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7632 break;
7633 case AArch64MachineCombinerPattern::FMULSUBS_OP1:
7634 Opc = AArch64::FNMSUBSrrr;
7635 RC = &AArch64::FPR32RegClass;
7636 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7637 break;
7638 case AArch64MachineCombinerPattern::FMULSUBD_OP1:
7639 Opc = AArch64::FNMSUBDrrr;
7640 RC = &AArch64::FPR64RegClass;
7641 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7642 break;
7643
7644 case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
7645 Opc = AArch64::FNMADDHrrr;
7646 RC = &AArch64::FPR16RegClass;
7647 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7648 break;
7649 case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
7650 Opc = AArch64::FNMADDSrrr;
7651 RC = &AArch64::FPR32RegClass;
7652 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7653 break;
7654 case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
7655 Opc = AArch64::FNMADDDrrr;
7656 RC = &AArch64::FPR64RegClass;
7657 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7658 break;
7659
7660 case AArch64MachineCombinerPattern::FMULSUBH_OP2:
7661 Opc = AArch64::FMSUBHrrr;
7662 RC = &AArch64::FPR16RegClass;
7663 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7664 break;
7665 case AArch64MachineCombinerPattern::FMULSUBS_OP2:
7666 Opc = AArch64::FMSUBSrrr;
7667 RC = &AArch64::FPR32RegClass;
7668 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7669 break;
7670 case AArch64MachineCombinerPattern::FMULSUBD_OP2:
7671 Opc = AArch64::FMSUBDrrr;
7672 RC = &AArch64::FPR64RegClass;
7673 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7674 break;
7675
7676 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
7677 Opc = AArch64::FMLSv1i32_indexed;
7678 RC = &AArch64::FPR32RegClass;
7679 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7680 kind: FMAInstKind::Indexed);
7681 break;
7682
7683 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
7684 Opc = AArch64::FMLSv1i64_indexed;
7685 RC = &AArch64::FPR64RegClass;
7686 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7687 kind: FMAInstKind::Indexed);
7688 break;
7689
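  // The vector FMLS *_OP1 patterns match (fsub (fmul vn, vm), va). There is
  // no single accumulating instruction with that operand order, so negate
  // the addend first and then fuse the multiply into an FMLA (sketch):
  //   vtmp = fneg va
  //   vd   = fmla vtmp, vn, vm // = vn*vm - va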
7690 case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
7691 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
7692 RC = &AArch64::FPR64RegClass;
7693 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
7694 MachineInstrBuilder MIB1 =
7695 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv4f16), DestReg: NewVR)
7696 .add(MO: Root.getOperand(i: 2));
7697 InsInstrs.push_back(Elt: MIB1);
7698 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
7699 if (Pattern == AArch64MachineCombinerPattern::FMLSv4f16_OP1) {
7700 Opc = AArch64::FMLAv4f16;
7701 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7702 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
7703 } else {
7704 Opc = AArch64::FMLAv4i16_indexed;
7705 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7706 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
7707 }
7708 break;
7709 }
7710 case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
7711 RC = &AArch64::FPR64RegClass;
7712 Opc = AArch64::FMLSv4f16;
7713 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7714 kind: FMAInstKind::Accumulator);
7715 break;
7716 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
7717 RC = &AArch64::FPR64RegClass;
7718 Opc = AArch64::FMLSv4i16_indexed;
7719 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7720 kind: FMAInstKind::Indexed);
7721 break;
7722
7723 case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
7724 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
7725 RC = &AArch64::FPR64RegClass;
7726 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
7727 Opc = AArch64::FMLSv2i32_indexed;
7728 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7729 kind: FMAInstKind::Indexed);
7730 } else {
7731 Opc = AArch64::FMLSv2f32;
7732 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7733 kind: FMAInstKind::Accumulator);
7734 }
7735 break;
7736
7737 case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
7738 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
7739 RC = &AArch64::FPR128RegClass;
7740 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
7741 MachineInstrBuilder MIB1 =
7742 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv8f16), DestReg: NewVR)
7743 .add(MO: Root.getOperand(i: 2));
7744 InsInstrs.push_back(Elt: MIB1);
7745 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
7746 if (Pattern == AArch64MachineCombinerPattern::FMLSv8f16_OP1) {
7747 Opc = AArch64::FMLAv8f16;
7748 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7749 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
7750 } else {
7751 Opc = AArch64::FMLAv8i16_indexed;
7752 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7753 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
7754 }
7755 break;
7756 }
7757 case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
7758 RC = &AArch64::FPR128RegClass;
7759 Opc = AArch64::FMLSv8f16;
7760 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7761 kind: FMAInstKind::Accumulator);
7762 break;
7763 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
7764 RC = &AArch64::FPR128RegClass;
7765 Opc = AArch64::FMLSv8i16_indexed;
7766 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7767 kind: FMAInstKind::Indexed);
7768 break;
7769
7770 case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
7771 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
7772 RC = &AArch64::FPR128RegClass;
7773 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
7774 Opc = AArch64::FMLSv2i64_indexed;
7775 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7776 kind: FMAInstKind::Indexed);
7777 } else {
7778 Opc = AArch64::FMLSv2f64;
7779 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7780 kind: FMAInstKind::Accumulator);
7781 }
7782 break;
7783
7784 case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
7785 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
7786 RC = &AArch64::FPR128RegClass;
7787 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
7788 Opc = AArch64::FMLSv4i32_indexed;
7789 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7790 kind: FMAInstKind::Indexed);
7791 } else {
7792 Opc = AArch64::FMLSv4f32;
7793 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7794 kind: FMAInstKind::Accumulator);
7795 }
7796 break;
7797 case AArch64MachineCombinerPattern::FMLSv2f32_OP1:
7798 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
7799 RC = &AArch64::FPR64RegClass;
7800 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
7801 MachineInstrBuilder MIB1 =
7802 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv2f32), DestReg: NewVR)
7803 .add(MO: Root.getOperand(i: 2));
7804 InsInstrs.push_back(Elt: MIB1);
7805 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
7806 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
7807 Opc = AArch64::FMLAv2i32_indexed;
7808 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7809 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
7810 } else {
7811 Opc = AArch64::FMLAv2f32;
7812 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7813 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
7814 }
7815 break;
7816 }
7817 case AArch64MachineCombinerPattern::FMLSv4f32_OP1:
7818 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
7819 RC = &AArch64::FPR128RegClass;
7820 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
7821 MachineInstrBuilder MIB1 =
7822 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv4f32), DestReg: NewVR)
7823 .add(MO: Root.getOperand(i: 2));
7824 InsInstrs.push_back(Elt: MIB1);
7825 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
7826 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
7827 Opc = AArch64::FMLAv4i32_indexed;
7828 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7829 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
7830 } else {
7831 Opc = AArch64::FMLAv4f32;
7832 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7833 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
7834 }
7835 break;
7836 }
7837 case AArch64MachineCombinerPattern::FMLSv2f64_OP1:
7838 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
7839 RC = &AArch64::FPR128RegClass;
7840 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
7841 MachineInstrBuilder MIB1 =
7842 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv2f64), DestReg: NewVR)
7843 .add(MO: Root.getOperand(i: 2));
7844 InsInstrs.push_back(Elt: MIB1);
7845 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
7846 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
7847 Opc = AArch64::FMLAv2i64_indexed;
7848 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7849 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
7850 } else {
7851 Opc = AArch64::FMLAv2f64;
7852 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7853 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
7854 }
7855 break;
7856 }
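  // The FMULv*_indexed_OP1/_OP2 patterns match a vector FMUL whose first or
  // second operand is a DUP of a single vector lane; genIndexedMultiply
  // rewrites it to the lane-indexed form of the multiply (sketch):
  //   vdup = dup vsrc[lane]
  //   vr   = fmul va, vdup     ==>   vr = fmul va, vsrc[lane]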
7857 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
7858 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2: {
7859 unsigned IdxDupOp =
7860 (Pattern == AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1
7861 : 2;
7862 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv2i32_indexed,
7863 RC: &AArch64::FPR128RegClass, MRI);
7864 break;
7865 }
7866 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
7867 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2: {
7868 unsigned IdxDupOp =
7869 (Pattern == AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1
7870 : 2;
7871 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv2i64_indexed,
7872 RC: &AArch64::FPR128RegClass, MRI);
7873 break;
7874 }
7875 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
7876 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2: {
7877 unsigned IdxDupOp =
7878 (Pattern == AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1
7879 : 2;
7880 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv4i16_indexed,
7881 RC: &AArch64::FPR128_loRegClass, MRI);
7882 break;
7883 }
7884 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
7885 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2: {
7886 unsigned IdxDupOp =
7887 (Pattern == AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1
7888 : 2;
7889 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv4i32_indexed,
7890 RC: &AArch64::FPR128RegClass, MRI);
7891 break;
7892 }
7893 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
7894 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2: {
7895 unsigned IdxDupOp =
7896 (Pattern == AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1
7897 : 2;
7898 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv8i16_indexed,
7899 RC: &AArch64::FPR128_loRegClass, MRI);
7900 break;
7901 }
7902 case AArch64MachineCombinerPattern::FNMADD: {
7903 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
7904 break;
7905 }
7906
7907 } // end switch (Pattern)
7908 // Record MUL and ADD/SUB for deletion
7909 if (MUL)
7910 DelInstrs.push_back(Elt: MUL);
7911 DelInstrs.push_back(Elt: &Root);
7912
7913 // Set the flags on the inserted instructions to be the merged flags of the
7914 // instructions that we have combined.
7915 uint32_t Flags = Root.getFlags();
7916 if (MUL)
7917 Flags = Root.mergeFlagsWith(Other: *MUL);
7918 for (auto *MI : InsInstrs)
7919 MI->setFlags(Flags);
7920}
7921
7922/// Replace csincr-branch sequence by simple conditional branch
7923///
7924/// Examples:
7925/// 1. \code
7926/// csinc w9, wzr, wzr, <condition code>
7927/// tbnz w9, #0, 0x44
7928/// \endcode
7929/// to
7930/// \code
7931/// b.<inverted condition code>
7932/// \endcode
7933///
7934/// 2. \code
7935/// csinc w9, wzr, wzr, <condition code>
7936/// tbz w9, #0, 0x44
7937/// \endcode
7938/// to
7939/// \code
7940/// b.<condition code>
7941/// \endcode
7942///
7943/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
7944/// compare's constant operand is power of 2.
7945///
7946/// Examples:
7947/// \code
7948/// and w8, w8, #0x400
7949/// cbnz w8, L1
7950/// \endcode
7951/// to
7952/// \code
7953/// tbnz w8, #10, L1
7954/// \endcode
7955///
7956/// \param MI Conditional Branch
7957/// \return True when the simple conditional branch is generated
7958///
7959bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
7960 bool IsNegativeBranch = false;
7961 bool IsTestAndBranch = false;
7962 unsigned TargetBBInMI = 0;
7963 switch (MI.getOpcode()) {
7964 default:
7965 llvm_unreachable("Unknown branch instruction?");
7966 case AArch64::Bcc:
7967 return false;
7968 case AArch64::CBZW:
7969 case AArch64::CBZX:
7970 TargetBBInMI = 1;
7971 break;
7972 case AArch64::CBNZW:
7973 case AArch64::CBNZX:
7974 TargetBBInMI = 1;
7975 IsNegativeBranch = true;
7976 break;
7977 case AArch64::TBZW:
7978 case AArch64::TBZX:
7979 TargetBBInMI = 2;
7980 IsTestAndBranch = true;
7981 break;
7982 case AArch64::TBNZW:
7983 case AArch64::TBNZX:
7984 TargetBBInMI = 2;
7985 IsNegativeBranch = true;
7986 IsTestAndBranch = true;
7987 break;
7988 }
7989 // So we increment a zero register and test for bits other
7990 // than bit 0? Conservatively bail out in case the verifier
7991 // missed this case.
7992 if (IsTestAndBranch && MI.getOperand(i: 1).getImm())
7993 return false;
7994
7995 // Find Definition.
7996 assert(MI.getParent() && "Incomplete machine instruction\n");
7997 MachineBasicBlock *MBB = MI.getParent();
7998 MachineFunction *MF = MBB->getParent();
7999 MachineRegisterInfo *MRI = &MF->getRegInfo();
8000 Register VReg = MI.getOperand(i: 0).getReg();
8001 if (!VReg.isVirtual())
8002 return false;
8003
8004 MachineInstr *DefMI = MRI->getVRegDef(Reg: VReg);
8005
8006 // Look through COPY instructions to find definition.
8007 while (DefMI->isCopy()) {
8008 Register CopyVReg = DefMI->getOperand(i: 1).getReg();
8009 if (!MRI->hasOneNonDBGUse(RegNo: CopyVReg))
8010 return false;
8011 if (!MRI->hasOneDef(RegNo: CopyVReg))
8012 return false;
8013 DefMI = MRI->getVRegDef(Reg: CopyVReg);
8014 }
8015
8016 switch (DefMI->getOpcode()) {
8017 default:
8018 return false;
8019 // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
8020 case AArch64::ANDWri:
8021 case AArch64::ANDXri: {
8022 if (IsTestAndBranch)
8023 return false;
8024 if (DefMI->getParent() != MBB)
8025 return false;
8026 if (!MRI->hasOneNonDBGUse(RegNo: VReg))
8027 return false;
8028
8029 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
8030 uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
8031 val: DefMI->getOperand(i: 2).getImm(), regSize: Is32Bit ? 32 : 64);
8032 if (!isPowerOf2_64(Value: Mask))
8033 return false;
8034
8035 MachineOperand &MO = DefMI->getOperand(i: 1);
8036 Register NewReg = MO.getReg();
8037 if (!NewReg.isVirtual())
8038 return false;
8039
8040 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
8041
8042 MachineBasicBlock &RefToMBB = *MBB;
8043 MachineBasicBlock *TBB = MI.getOperand(i: 1).getMBB();
8044 DebugLoc DL = MI.getDebugLoc();
8045 unsigned Imm = Log2_64(Value: Mask);
8046 unsigned Opc = (Imm < 32)
8047 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
8048 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
8049 MachineInstr *NewMI = BuildMI(BB&: RefToMBB, I&: MI, MIMD: DL, MCID: get(Opcode: Opc))
8050 .addReg(RegNo: NewReg)
8051 .addImm(Val: Imm)
8052 .addMBB(MBB: TBB);
8053 // Register lives on to the new TB(N)Z now.
8054 MO.setIsKill(false);
8055
8056 // For bit positions smaller than 32, we need to use the 32-bit
8057 // variant (W) in all cases, because the 64-bit variant cannot
8058 // encode them.
8059 // Therefore, if the input register is 64-bit, we need to take its
8060 // 32-bit sub-register.
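    // e.g. testing bit 10 of a 64-bit register is emitted as
    //   TB(N)ZW %x.sub_32, #10, %bb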
8061 if (!Is32Bit && Imm < 32)
8062 NewMI->getOperand(i: 0).setSubReg(AArch64::sub_32);
8063 MI.eraseFromParent();
8064 return true;
8065 }
8066 // Look for CSINC
8067 case AArch64::CSINCWr:
8068 case AArch64::CSINCXr: {
8069 if (!(DefMI->getOperand(i: 1).getReg() == AArch64::WZR &&
8070 DefMI->getOperand(i: 2).getReg() == AArch64::WZR) &&
8071 !(DefMI->getOperand(i: 1).getReg() == AArch64::XZR &&
8072 DefMI->getOperand(i: 2).getReg() == AArch64::XZR))
8073 return false;
8074
8075 if (DefMI->findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr,
8076 isDead: true) != -1)
8077 return false;
8078
8079 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(i: 3).getImm();
8080 // Convert only when the condition code is not modified between
8081 // the CSINC and the branch. The CC may be used by other
8082 // instructions in between.
8083 if (areCFlagsAccessedBetweenInstrs(From: DefMI, To: MI, TRI: &getRegisterInfo(), AccessToCheck: AK_Write))
8084 return false;
8085 MachineBasicBlock &RefToMBB = *MBB;
8086 MachineBasicBlock *TBB = MI.getOperand(i: TargetBBInMI).getMBB();
8087 DebugLoc DL = MI.getDebugLoc();
8088 if (IsNegativeBranch)
8089 CC = AArch64CC::getInvertedCondCode(Code: CC);
8090 BuildMI(BB&: RefToMBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::Bcc)).addImm(Val: CC).addMBB(MBB: TBB);
8091 MI.eraseFromParent();
8092 return true;
8093 }
8094 }
8095}
8096
8097std::pair<unsigned, unsigned>
8098AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
8099 const unsigned Mask = AArch64II::MO_FRAGMENT;
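  // For example (illustrative), a flag word of MO_PAGEOFF | MO_NC decomposes
  // into the direct flag MO_PAGEOFF and the bitmask flag MO_NC.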
8100 return std::make_pair(x: TF & Mask, y: TF & ~Mask);
8101}
8102
8103ArrayRef<std::pair<unsigned, const char *>>
8104AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
8105 using namespace AArch64II;
8106
8107 static const std::pair<unsigned, const char *> TargetFlags[] = {
8108 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
8109 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
8110 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
8111 {MO_HI12, "aarch64-hi12"}};
8112 return ArrayRef(TargetFlags);
8113}
8114
8115ArrayRef<std::pair<unsigned, const char *>>
8116AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
8117 using namespace AArch64II;
8118
8119 static const std::pair<unsigned, const char *> TargetFlags[] = {
8120 {MO_COFFSTUB, "aarch64-coffstub"},
8121 {MO_GOT, "aarch64-got"},
8122 {MO_NC, "aarch64-nc"},
8123 {MO_S, "aarch64-s"},
8124 {MO_TLS, "aarch64-tls"},
8125 {MO_DLLIMPORT, "aarch64-dllimport"},
8126 {MO_PREL, "aarch64-prel"},
8127 {MO_TAGGED, "aarch64-tagged"},
8128 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
8129 };
8130 return ArrayRef(TargetFlags);
8131}
8132
8133ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
8134AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
8135 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8136 {{MOSuppressPair, "aarch64-suppress-pair"},
8137 {MOStridedAccess, "aarch64-strided-access"}};
8138 return ArrayRef(TargetFlags);
8139}
8140
8141/// Constants defining how certain sequences should be outlined.
8142/// This encompasses how an outlined function should be called, and what kind of
8143/// frame should be emitted for that outlined function.
8144///
8145/// \p MachineOutlinerDefault implies that the function should be called with
8146/// a save and restore of LR to the stack.
8147///
8148/// That is,
8149///
8150/// I1 Save LR OUTLINED_FUNCTION:
8151/// I2 --> BL OUTLINED_FUNCTION I1
8152/// I3 Restore LR I2
8153/// I3
8154/// RET
8155///
8156/// * Call construction overhead: 3 (save + BL + restore)
8157/// * Frame construction overhead: 1 (ret)
8158/// * Requires stack fixups? Yes
8159///
8160/// \p MachineOutlinerTailCall implies that the function is being created from
8161/// a sequence of instructions ending in a return.
8162///
8163/// That is,
8164///
8165/// I1 OUTLINED_FUNCTION:
8166/// I2 --> B OUTLINED_FUNCTION I1
8167/// RET I2
8168/// RET
8169///
8170/// * Call construction overhead: 1 (B)
8171/// * Frame construction overhead: 0 (Return included in sequence)
8172/// * Requires stack fixups? No
8173///
8174/// \p MachineOutlinerNoLRSave implies that the function should be called using
8175/// a BL instruction, but doesn't require LR to be saved and restored. This
8176/// happens when LR is known to be dead.
8177///
8178/// That is,
8179///
8180/// I1 OUTLINED_FUNCTION:
8181/// I2 --> BL OUTLINED_FUNCTION I1
8182/// I3 I2
8183/// I3
8184/// RET
8185///
8186/// * Call construction overhead: 1 (BL)
8187/// * Frame construction overhead: 1 (RET)
8188/// * Requires stack fixups? No
8189///
8190/// \p MachineOutlinerThunk implies that the function is being created from
8191/// a sequence of instructions ending in a call. The outlined function is
8192/// called with a BL instruction, and the outlined function tail-calls the
8193/// original call destination.
8194///
8195/// That is,
8196///
8197/// I1 OUTLINED_FUNCTION:
8198/// I2 --> BL OUTLINED_FUNCTION I1
8199/// BL f I2
8200/// B f
8201/// * Call construction overhead: 1 (BL)
8202/// * Frame construction overhead: 0
8203/// * Requires stack fixups? No
8204///
8205/// \p MachineOutlinerRegSave implies that the function should be called with a
8206/// save and restore of LR to an available register. This allows us to avoid
8207/// stack fixups. Note that this outlining variant is compatible with the
8208/// NoLRSave case.
8209///
8210/// That is,
8211///
8212/// I1 Save LR OUTLINED_FUNCTION:
8213/// I2 --> BL OUTLINED_FUNCTION I1
8214/// I3 Restore LR I2
8215/// I3
8216/// RET
8217///
8218/// * Call construction overhead: 3 (save + BL + restore)
8219/// * Frame construction overhead: 1 (ret)
8220/// * Requires stack fixups? No
8221enum MachineOutlinerClass {
8222 MachineOutlinerDefault, /// Emit a save, restore, call, and return.
8223 MachineOutlinerTailCall, /// Only emit a branch.
8224 MachineOutlinerNoLRSave, /// Emit a call and return.
8225 MachineOutlinerThunk, /// Emit a call and tail-call.
8226 MachineOutlinerRegSave /// Same as default, but save to a register.
8227};
8228
8229enum MachineOutlinerMBBFlags {
8230 LRUnavailableSomewhere = 0x2,
8231 HasCalls = 0x4,
8232 UnsafeRegsDead = 0x8
8233};
8234
8235Register
8236AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
8237 MachineFunction *MF = C.getMF();
8238 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
8239 const AArch64RegisterInfo *ARI =
8240 static_cast<const AArch64RegisterInfo *>(&TRI);
8241 // Check if there is an available register across the sequence that we can
8242 // use.
8243 for (unsigned Reg : AArch64::GPR64RegClass) {
8244 if (!ARI->isReservedReg(MF: *MF, Reg) &&
8245 Reg != AArch64::LR && // LR is not reserved, but don't use it.
8246 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
8247 Reg != AArch64::X17 && // Ditto for X17.
8248 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
8249 C.isAvailableInsideSeq(Reg, TRI))
8250 return Reg;
8251 }
8252 return Register();
8253}
8254
8255static bool
8256outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
8257 const outliner::Candidate &b) {
8258 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8259 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8260
8261 return MFIa->shouldSignReturnAddress(SpillsLR: false) == MFIb->shouldSignReturnAddress(SpillsLR: false) &&
8262 MFIa->shouldSignReturnAddress(SpillsLR: true) == MFIb->shouldSignReturnAddress(SpillsLR: true);
8263}
8264
8265static bool
8266outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
8267 const outliner::Candidate &b) {
8268 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8269 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8270
8271 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
8272}
8273
8274static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
8275 const outliner::Candidate &b) {
8276 const AArch64Subtarget &SubtargetA =
8277 a.getMF()->getSubtarget<AArch64Subtarget>();
8278 const AArch64Subtarget &SubtargetB =
8279 b.getMF()->getSubtarget<AArch64Subtarget>();
8280 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
8281}
8282
8283std::optional<outliner::OutlinedFunction>
8284AArch64InstrInfo::getOutliningCandidateInfo(
8285 std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
8286 unsigned SequenceSize = 0;
8287 for (auto &MI : RepeatedSequenceLocs[0])
8288 SequenceSize += getInstSizeInBytes(MI);
8289
8290 unsigned NumBytesToCreateFrame = 0;
8291
8292 // We only allow outlining for functions having exactly matching return
8293 // address signing attributes, i.e., all share the same value for the
8294 // attribute "sign-return-address" and all share the same type of key they
8295 // are signed with.
8296 // Additionally we require all functions to simultaneously either support
8297 // v8.3a features or not. Otherwise an outlined function could get signed
8298 // using dedicated v8.3 instructions, and a call from a function that doesn't
8299 // support v8.3 instructions would then be invalid.
8300 if (std::adjacent_find(
8301 first: RepeatedSequenceLocs.begin(), last: RepeatedSequenceLocs.end(),
8302 binary_pred: [](const outliner::Candidate &a, const outliner::Candidate &b) {
8303 // Return true if a and b are non-equal w.r.t. return address
8304 // signing or support of v8.3a features
8305 if (outliningCandidatesSigningScopeConsensus(a, b) &&
8306 outliningCandidatesSigningKeyConsensus(a, b) &&
8307 outliningCandidatesV8_3OpsConsensus(a, b)) {
8308 return false;
8309 }
8310 return true;
8311 }) != RepeatedSequenceLocs.end()) {
8312 return std::nullopt;
8313 }
8314
8315 // Since at this point all candidates agree on their return address signing,
8316 // picking just one is fine. If the candidate functions potentially sign their
8317 // return addresses, the outlined function should do the same. Note that in
8318 // the case of "sign-return-address"="non-leaf" this is an assumption: It is
8319 // not certainly true that the outlined function will have to sign its return
8320 // address but this decision is made later, when the decision to outline
8321 // has already been made.
8322 // The same holds for the number of additional instructions we need: On
8323 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
8324 // necessary. However, at this point we don't know if the outlined function
8325 // will have a RET instruction so we assume the worst.
8326 const TargetRegisterInfo &TRI = getRegisterInfo();
8327 // Performing a tail call may require extra checks when PAuth is enabled.
8328 // If PAuth is disabled, set it to zero for uniformity.
8329 unsigned NumBytesToCheckLRInTCEpilogue = 0;
8330 if (RepeatedSequenceLocs[0]
8331 .getMF()
8332 ->getInfo<AArch64FunctionInfo>()
8333 ->shouldSignReturnAddress(SpillsLR: true)) {
8334 // One PAC and one AUT instruction (4 bytes each)
8335 NumBytesToCreateFrame += 8;
8336
8337 // PAuth is enabled - set extra tail call cost, if any.
8338 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
8339 MF: *RepeatedSequenceLocs[0].getMF());
8340 NumBytesToCheckLRInTCEpilogue =
8341 AArch64PAuth::getCheckerSizeInBytes(Method: LRCheckMethod);
8342 // Checking the authenticated LR value may significantly impact
8343 // SequenceSize, so account for it for more precise results.
8344 if (isTailCallReturnInst(MI: RepeatedSequenceLocs[0].back()))
8345 SequenceSize += NumBytesToCheckLRInTCEpilogue;
8346
8347 // We have to check if SP-modifying instructions would get outlined.
8348 // If so, we only allow outlining if SP is unchanged overall, so matching
8349 // sub and add instructions are okay to outline; all other SP modifications
8350 // are not.
8351 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
8352 int SPValue = 0;
8353 for (auto &MI : C) {
8354 if (MI.modifiesRegister(Reg: AArch64::SP, TRI: &TRI)) {
8355 switch (MI.getOpcode()) {
8356 case AArch64::ADDXri:
8357 case AArch64::ADDWri:
8358 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
8359 assert(MI.getOperand(2).isImm() &&
8360 "Expected operand to be immediate");
8361 assert(MI.getOperand(1).isReg() &&
8362 "Expected operand to be a register");
8363 // Check if the add just increments sp. If so, we search for
8364 // matching sub instructions that decrement sp. If not, the
8365 // modification is illegal
8366 if (MI.getOperand(i: 1).getReg() == AArch64::SP)
8367 SPValue += MI.getOperand(i: 2).getImm();
8368 else
8369 return true;
8370 break;
8371 case AArch64::SUBXri:
8372 case AArch64::SUBWri:
8373 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
8374 assert(MI.getOperand(2).isImm() &&
8375 "Expected operand to be immediate");
8376 assert(MI.getOperand(1).isReg() &&
8377 "Expected operand to be a register");
8378 // Check if the sub just decrements sp. If so, we search for
8379 // matching add instructions that increment sp. If not, the
8380 // modification is illegal
8381 if (MI.getOperand(i: 1).getReg() == AArch64::SP)
8382 SPValue -= MI.getOperand(i: 2).getImm();
8383 else
8384 return true;
8385 break;
8386 default:
8387 return true;
8388 }
8389 }
8390 }
8391 if (SPValue)
8392 return true;
8393 return false;
8394 };
8395 // Remove candidates with illegal stack modifying instructions
8396 llvm::erase_if(C&: RepeatedSequenceLocs, P: hasIllegalSPModification);
8397
8398 // If the sequence doesn't have enough candidates left, then we're done.
8399 if (RepeatedSequenceLocs.size() < 2)
8400 return std::nullopt;
8401 }
8402
8403 // Properties about candidate MBBs that hold for all of them.
8404 unsigned FlagsSetInAll = 0xF;
8405
8406 // Compute liveness information for each candidate, and set FlagsSetInAll.
8407 for (outliner::Candidate &C : RepeatedSequenceLocs)
8408 FlagsSetInAll &= C.Flags;
8409
8410 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
8411
8412 // Helper lambda which sets call information for every candidate.
8413 auto SetCandidateCallInfo =
8414 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
8415 for (outliner::Candidate &C : RepeatedSequenceLocs)
8416 C.setCallInfo(CID: CallID, CO: NumBytesForCall);
8417 };
8418
8419 unsigned FrameID = MachineOutlinerDefault;
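  // The default outlined frame is terminated by a RET, so account for its
  // 4 bytes up front.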
8420 NumBytesToCreateFrame += 4;
8421
8422 bool HasBTI = any_of(Range&: RepeatedSequenceLocs, P: [](outliner::Candidate &C) {
8423 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
8424 });
8425
8426 // We check to see if CFI Instructions are present, and if they are
8427 // we find the number of CFI Instructions in the candidates.
8428 unsigned CFICount = 0;
8429 for (auto &I : RepeatedSequenceLocs[0]) {
8430 if (I.isCFIInstruction())
8431 CFICount++;
8432 }
8433
8434 // We compare the number of found CFI Instructions to the number of CFI
8435 // instructions in the parent function for each candidate. We must check this
8436 // since if we outline one of the CFI instructions in a function, we have to
8437 // outline them all for correctness. If we do not, the address offsets will be
8438 // incorrect between the two sections of the program.
8439 for (outliner::Candidate &C : RepeatedSequenceLocs) {
8440 std::vector<MCCFIInstruction> CFIInstructions =
8441 C.getMF()->getFrameInstructions();
8442
8443 if (CFICount > 0 && CFICount != CFIInstructions.size())
8444 return std::nullopt;
8445 }
8446
8447 // Returns true if an instruction is safe to fix up, false otherwise.
8448 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
8449 if (MI.isCall())
8450 return true;
8451
8452 if (!MI.modifiesRegister(Reg: AArch64::SP, TRI: &TRI) &&
8453 !MI.readsRegister(Reg: AArch64::SP, TRI: &TRI))
8454 return true;
8455
8456 // Any modification of SP will break our code to save/restore LR.
8457 // FIXME: We could handle some instructions which add a constant
8458 // offset to SP, with a bit more work.
8459 if (MI.modifiesRegister(Reg: AArch64::SP, TRI: &TRI))
8460 return false;
8461
8462 // At this point, we have a stack instruction that we might need to
8463 // fix up. We'll handle it if it's a load or store.
8464 if (MI.mayLoadOrStore()) {
8465 const MachineOperand *Base; // Filled with the base operand of MI.
8466 int64_t Offset; // Filled with the offset of MI.
8467 bool OffsetIsScalable;
8468
8469 // Does it allow us to offset the base operand and is the base the
8470 // register SP?
8471 if (!getMemOperandWithOffset(MI, BaseOp&: Base, Offset, OffsetIsScalable, TRI: &TRI) ||
8472 !Base->isReg() || Base->getReg() != AArch64::SP)
8473 return false;
8474
8475 // Fix-up code below assumes byte offsets.
8476 if (OffsetIsScalable)
8477 return false;
8478
8479 // Find the minimum/maximum offset for this instruction and check
8480 // if fixing it up would be in range.
8481 int64_t MinOffset,
8482 MaxOffset; // Unscaled offsets for the instruction.
8483 // The scale to multiply the offsets by.
8484 TypeSize Scale(0U, false), DummyWidth(0U, false);
8485 getMemOpInfo(Opcode: MI.getOpcode(), Scale, Width&: DummyWidth, MinOffset, MaxOffset);
8486
8487 Offset += 16; // Update the offset to what it would be if we outlined.
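      // e.g. an "ldr x0, [sp, #8]" in the candidate will become
      // "ldr x0, [sp, #24]" once LR has been spilled, so that is the offset
      // which must still be encodable for this opcode.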
8488 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
8489 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
8490 return false;
8491
8492 // It's in range, so we can outline it.
8493 return true;
8494 }
8495
8496 // FIXME: Add handling for instructions like "add x0, sp, #8".
8497
8498 // We can't fix it up, so don't outline it.
8499 return false;
8500 };
8501
8502 // True if it's possible to fix up each stack instruction in this sequence.
8503 // Important for frames/call variants that modify the stack.
8504 bool AllStackInstrsSafe =
8505 llvm::all_of(Range&: RepeatedSequenceLocs[0], P: IsSafeToFixup);
8506
8507 // If the last instruction in any candidate is a terminator, then we should
8508 // tail call all of the candidates.
8509 if (RepeatedSequenceLocs[0].back().isTerminator()) {
8510 FrameID = MachineOutlinerTailCall;
8511 NumBytesToCreateFrame = 0;
8512 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
8513 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
8514 }
8515
8516 else if (LastInstrOpcode == AArch64::BL ||
8517 ((LastInstrOpcode == AArch64::BLR ||
8518 LastInstrOpcode == AArch64::BLRNoIP) &&
8519 !HasBTI)) {
8520 // FIXME: Do we need to check if the code after this uses the value of LR?
8521 FrameID = MachineOutlinerThunk;
8522 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
8523 SetCandidateCallInfo(MachineOutlinerThunk, 4);
8524 }
8525
8526 else {
8527 // We need to decide how to emit calls + frames. We can always emit the same
8528 // frame if we don't need to save to the stack. If we have to save to the
8529 // stack, then we need a different frame.
8530 unsigned NumBytesNoStackCalls = 0;
8531 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
8532
8533 // Check if we have to save LR.
8534 for (outliner::Candidate &C : RepeatedSequenceLocs) {
8535 bool LRAvailable =
8536 (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere)
8537 ? C.isAvailableAcrossAndOutOfSeq(Reg: AArch64::LR, TRI)
8538 : true;
8539 // If we have a noreturn caller, then we're going to be conservative and
8540 // say that we have to save LR. If we don't have a ret at the end of the
8541 // block, then we can't reason about liveness accurately.
8542 //
8543 // FIXME: We can probably do better than always disabling this in
8544 // noreturn functions by fixing up the liveness info.
8545 bool IsNoReturn =
8546 C.getMF()->getFunction().hasFnAttribute(Kind: Attribute::NoReturn);
8547
8548 // Is LR available? If so, we don't need a save.
8549 if (LRAvailable && !IsNoReturn) {
8550 NumBytesNoStackCalls += 4;
8551 C.setCallInfo(CID: MachineOutlinerNoLRSave, CO: 4);
8552 CandidatesWithoutStackFixups.push_back(x: C);
8553 }
8554
8555 // Is an unused register available? If so, we won't modify the stack, so
8556 // we can outline with the same frame type as those that don't save LR.
8557 else if (findRegisterToSaveLRTo(C)) {
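      // Sketch of the resulting call site (the register is only chosen when
      // the call is actually inserted; x20 here is purely illustrative):
      //   mov x20, lr
      //   bl  OUTLINED_FUNCTION_N
      //   mov lr, x20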
8558 NumBytesNoStackCalls += 12;
8559 C.setCallInfo(CID: MachineOutlinerRegSave, CO: 12);
8560 CandidatesWithoutStackFixups.push_back(x: C);
8561 }
8562
8563 // Is SP used in the sequence at all? If not, we don't have to modify
8564 // the stack, so we are guaranteed to get the same frame.
8565 else if (C.isAvailableInsideSeq(Reg: AArch64::SP, TRI)) {
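      // Sketch of the default call site, spilling LR around the call:
      //   str x30, [sp, #-16]!
      //   bl  OUTLINED_FUNCTION_N
      //   ldr x30, [sp], #16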
8566 NumBytesNoStackCalls += 12;
8567 C.setCallInfo(CID: MachineOutlinerDefault, CO: 12);
8568 CandidatesWithoutStackFixups.push_back(x: C);
8569 }
8570
8571 // If we outline this, we need to modify the stack. Pretend we don't
8572 // outline this by saving all of its bytes.
8573 else {
8574 NumBytesNoStackCalls += SequenceSize;
8575 }
8576 }
8577
8578 // If there are no places where we have to save LR, then note that we
8579 // don't have to update the stack. Otherwise, give every candidate the
8580 // default call type, as long as it's safe to do so.
8581 if (!AllStackInstrsSafe ||
8582 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
8583 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
8584 FrameID = MachineOutlinerNoLRSave;
8585 if (RepeatedSequenceLocs.size() < 2)
8586 return std::nullopt;
8587 } else {
8588 SetCandidateCallInfo(MachineOutlinerDefault, 12);
8589
8590 // Bugzilla ID: 46767
8591 // TODO: Check if fixing up the stack more than once is safe so we can
8592 // outline these.
8593 //
8594 // An outline resulting in a caller that requires stack fixups at the
8595 // callsite to a callee that also requires stack fixups can happen when
8596 // there are no available registers at the candidate callsite for a
8597 // candidate that itself also has calls.
8598 //
8599 // In other words if function_containing_sequence in the following pseudo
8600 // assembly requires that we save LR at the point of the call, but there
8601 // are no available registers: in this case we save using SP and as a
8602 // result the SP offsets require stack fixups by multiples of 16.
8603 //
8604 // function_containing_sequence:
8605 // ...
8606 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
8607 // call OUTLINED_FUNCTION_N
8608 // restore LR from SP
8609 // ...
8610 //
8611 // OUTLINED_FUNCTION_N:
8612 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
8613 // ...
8614 // bl foo
8615 // restore LR from SP
8616 // ret
8617 //
8618 // Because the code to handle more than one stack fixup does not
8619 // currently have the proper checks for legality, these cases will assert
8620 // in the AArch64 MachineOutliner. This is because the code to do this
8621 // needs more hardening, testing, better checks that generated code is
8622 // legal, etc and because it is only verified to handle a single pass of
8623 // stack fixup.
8624 //
8625 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
8626 // these cases until they are known to be handled. Bugzilla 46767 is
8627 // referenced in comments at the assert site.
8628 //
8629 // To avoid asserting (or generating non-legal code on noassert builds)
8630 // we remove all candidates which would need more than one stack fixup by
8631 // pruning the cases where the candidate has calls while also having no
8632 // available LR and having no available general purpose registers to copy
8633 // LR to (ie one extra stack save/restore).
8634 //
8635 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
8636 erase_if(C&: RepeatedSequenceLocs, P: [this, &TRI](outliner::Candidate &C) {
8637 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
8638 return (llvm::any_of(Range&: C, P: IsCall)) &&
8639 (!C.isAvailableAcrossAndOutOfSeq(Reg: AArch64::LR, TRI) ||
8640 !findRegisterToSaveLRTo(C));
8641 });
8642 }
8643 }
8644
8645 // If we dropped all of the candidates, bail out here.
8646 if (RepeatedSequenceLocs.size() < 2) {
8647 RepeatedSequenceLocs.clear();
8648 return std::nullopt;
8649 }
8650 }
8651
8652 // Does every candidate's MBB contain a call? If so, then we might have a call
8653 // in the range.
8654 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
8655 // Check if the range contains a call. These require a save + restore of the
8656 // link register.
8657 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
8658 bool ModStackToSaveLR = false;
8659 if (std::any_of(first: FirstCand.begin(), last: std::prev(x: FirstCand.end()),
8660 pred: [](const MachineInstr &MI) { return MI.isCall(); }))
8661 ModStackToSaveLR = true;
8662
8663 // Handle the last instruction separately. If this is a tail call, then the
8664 // last instruction is a call. We don't want to save + restore in this case.
8665 // However, it could be possible that the last instruction is a call without
8666 // it being valid to tail call this sequence. We should consider this as
8667 // well.
8668 else if (FrameID != MachineOutlinerThunk &&
8669 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
8670 ModStackToSaveLR = true;
8671
8672 if (ModStackToSaveLR) {
8673 // We can't fix up the stack. Bail out.
8674 if (!AllStackInstrsSafe) {
8675 RepeatedSequenceLocs.clear();
8676 return std::nullopt;
8677 }
8678
8679 // Save + restore LR.
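      // (one store to spill LR and one load to restore it, 4 bytes each)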
8680 NumBytesToCreateFrame += 8;
8681 }
8682 }
8683
8684 // If we have CFI instructions, we can only outline if the outlined section
8685 // can be a tail call
8686 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
8687 return std::nullopt;
8688
8689 return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
8690 NumBytesToCreateFrame, FrameID);
8691}
8692
8693void AArch64InstrInfo::mergeOutliningCandidateAttributes(
8694 Function &F, std::vector<outliner::Candidate> &Candidates) const {
8695 // If a bunch of candidates reach this point they must agree on their return
8696 // address signing. It is therefore enough to just consider the signing
8697 // behaviour of one of them
8698 const auto &CFn = Candidates.front().getMF()->getFunction();
8699
8700 if (CFn.hasFnAttribute(Kind: "ptrauth-returns"))
8701 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "ptrauth-returns"));
8702 if (CFn.hasFnAttribute(Kind: "ptrauth-auth-traps"))
8703 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "ptrauth-auth-traps"));
8704 // Since all candidates belong to the same module, just copy the
8705 // function-level attributes of an arbitrary function.
8706 if (CFn.hasFnAttribute(Kind: "sign-return-address"))
8707 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "sign-return-address"));
8708 if (CFn.hasFnAttribute(Kind: "sign-return-address-key"))
8709 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "sign-return-address-key"));
8710
8711 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
8712}
8713
8714bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
8715 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
8716 const Function &F = MF.getFunction();
8717
8718 // Can F be deduplicated by the linker? If it can, don't outline from it.
8719 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
8720 return false;
8721
8722 // Don't outline from functions with section markings; the program could
8723 // expect that all the code is in the named section.
8724 // FIXME: Allow outlining from multiple functions with the same section
8725 // marking.
8726 if (F.hasSection())
8727 return false;
8728
8729 // Outlining from functions with redzones is unsafe since the outliner may
8730 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
8731 // outline from it.
8732 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
8733 if (!AFI || AFI->hasRedZone().value_or(u: true))
8734 return false;
8735
8736 // FIXME: Determine whether it is safe to outline from functions which contain
8737 // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
8738 // outlined together and ensure it is safe to outline with async unwind info,
8739 // required for saving & restoring VG around calls.
8740 if (AFI->hasStreamingModeChanges())
8741 return false;
8742
8743 // FIXME: Teach the outliner to generate/handle Windows unwind info.
8744 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
8745 return false;
8746
8747 // It's safe to outline from MF.
8748 return true;
8749}
8750
8751SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
8752AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
8753 unsigned &Flags) const {
8754 assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
8755 "Must track liveness!");
8756 SmallVector<
8757 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
8758 Ranges;
8759 // According to the AArch64 Procedure Call Standard, the following are
8760 // undefined on entry/exit from a function call:
8761 //
8762 // * Registers x16, x17, (and thus w16, w17)
8763 // * Condition codes (and thus the NZCV register)
8764 //
8765 // If any of these registers are used inside or live across an outlined
8766 // function, then they may be modified later, either by the compiler or
8767 // some other tool (like the linker).
8768 //
8769 // To avoid outlining in these situations, partition each block into ranges
8770 // where these registers are dead. We will only outline from those ranges.
8771 LiveRegUnits LRU(getRegisterInfo());
8772 auto AreAllUnsafeRegsDead = [&LRU]() {
8773 return LRU.available(Reg: AArch64::W16) && LRU.available(Reg: AArch64::W17) &&
8774 LRU.available(Reg: AArch64::NZCV);
8775 };
8776
8777 // We need to know if LR is live across an outlining boundary later on in
8778 // order to decide how we'll create the outlined call, frame, etc.
8779 //
8780 // It's pretty expensive to check this for *every candidate* within a block.
8781 // That's some potentially n^2 behaviour, since in the worst case, we'd need
8782 // to compute liveness from the end of the block for O(n) candidates within
8783 // the block.
8784 //
8785 // So, to improve the average case, let's keep track of liveness from the end
8786 // of the block to the beginning of *every outlinable range*. If we know that
8787 // LR is available in every range we could outline from, then we know that
8788 // we don't need to check liveness for any candidate within that range.
8789 bool LRAvailableEverywhere = true;
8790 // Compute liveness bottom-up.
8791 LRU.addLiveOuts(MBB);
8792 // Update flags that require info about the entire MBB.
8793 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
8794 if (MI.isCall() && !MI.isTerminator())
8795 Flags |= MachineOutlinerMBBFlags::HasCalls;
8796 };
8797 // Range: [RangeBegin, RangeEnd)
8798 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
8799 unsigned RangeLen;
8800 auto CreateNewRangeStartingAt =
8801 [&RangeBegin, &RangeEnd,
8802 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
8803 RangeBegin = NewBegin;
8804 RangeEnd = std::next(x: RangeBegin);
8805 RangeLen = 0;
8806 };
8807 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
8808 // At least one unsafe register is not dead. We do not want to outline at
8809 // this point. If it is long enough to outline from, save the range
8810 // [RangeBegin, RangeEnd).
8811 if (RangeLen > 1)
8812 Ranges.push_back(Elt: std::make_pair(x&: RangeBegin, y&: RangeEnd));
8813 };
8814 // Find the first point where all unsafe registers are dead.
8815 // FIND: <safe instr> <-- end of first potential range
8816 // SKIP: <unsafe def>
8817 // SKIP: ... everything between ...
8818 // SKIP: <unsafe use>
8819 auto FirstPossibleEndPt = MBB.instr_rbegin();
8820 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
8821 LRU.stepBackward(MI: *FirstPossibleEndPt);
8822 // Update flags that impact how we outline across the entire block,
8823 // regardless of safety.
8824 UpdateWholeMBBFlags(*FirstPossibleEndPt);
8825 if (AreAllUnsafeRegsDead())
8826 break;
8827 }
8828 // If we exhausted the entire block, we have no safe ranges to outline.
8829 if (FirstPossibleEndPt == MBB.instr_rend())
8830 return Ranges;
8831 // Current range.
8832 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
8833 // FirstPossibleEndPt points to the first place where all unsafe registers
8834 // are dead (if there is any such point). Begin partitioning the MBB into
8835 // ranges.
8836 for (auto &MI : make_range(x: FirstPossibleEndPt, y: MBB.instr_rend())) {
8837 LRU.stepBackward(MI);
8838 UpdateWholeMBBFlags(MI);
8839 if (!AreAllUnsafeRegsDead()) {
8840 SaveRangeIfNonEmpty();
8841 CreateNewRangeStartingAt(MI.getIterator());
8842 continue;
8843 }
8844 LRAvailableEverywhere &= LRU.available(Reg: AArch64::LR);
8845 RangeBegin = MI.getIterator();
8846 ++RangeLen;
8847 }
8848 // The above loop misses the last (or only) range. If we are still safe,
8849 // then save that range as well.
8850 if (AreAllUnsafeRegsDead())
8851 SaveRangeIfNonEmpty();
8852 if (Ranges.empty())
8853 return Ranges;
8854 // We found the ranges bottom-up, but the mapping expects them top-down.
8855 // Reverse the order.
8856 std::reverse(first: Ranges.begin(), last: Ranges.end());
8857 // If there is at least one outlinable range where LR is unavailable
8858 // somewhere, remember that.
8859 if (!LRAvailableEverywhere)
8860 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
8861 return Ranges;
8862}
8863
8864outliner::InstrType
8865AArch64InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,
8866 unsigned Flags) const {
8867 MachineInstr &MI = *MIT;
8868 MachineBasicBlock *MBB = MI.getParent();
8869 MachineFunction *MF = MBB->getParent();
8870 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
8871
8872 // Don't outline anything used for return address signing. The outlined
8873 // function will get signed later if needed
8874 switch (MI.getOpcode()) {
8875 case AArch64::PACM:
8876 case AArch64::PACIASP:
8877 case AArch64::PACIBSP:
8878 case AArch64::PACIASPPC:
8879 case AArch64::PACIBSPPC:
8880 case AArch64::AUTIASP:
8881 case AArch64::AUTIBSP:
8882 case AArch64::AUTIASPPCi:
8883 case AArch64::AUTIASPPCr:
8884 case AArch64::AUTIBSPPCi:
8885 case AArch64::AUTIBSPPCr:
8886 case AArch64::RETAA:
8887 case AArch64::RETAB:
8888 case AArch64::RETAASPPCi:
8889 case AArch64::RETAASPPCr:
8890 case AArch64::RETABSPPCi:
8891 case AArch64::RETABSPPCr:
8892 case AArch64::EMITBKEY:
8893 case AArch64::PAUTH_PROLOGUE:
8894 case AArch64::PAUTH_EPILOGUE:
8895 return outliner::InstrType::Illegal;
8896 }
8897
8898 // Don't outline LOHs.
8899 if (FuncInfo->getLOHRelated().count(Ptr: &MI))
8900 return outliner::InstrType::Illegal;
8901
8902 // We can only outline these if we will tail call the outlined function, or
8903 // fix up the CFI offsets. Currently, CFI instructions are outlined only
8904 // when they occur in a tail call.
8905 //
8906 // FIXME: If the proper fixups for the offset are implemented, this should be
8907 // possible.
8908 if (MI.isCFIInstruction())
8909 return outliner::InstrType::Legal;
8910
8911 // Is this a terminator for a basic block?
8912 if (MI.isTerminator())
8913 // TargetInstrInfo::getOutliningType has already filtered out anything
8914 // that would break this, so we can allow it here.
8915 return outliner::InstrType::Legal;
8916
8917 // Make sure none of the operands are un-outlinable.
8918 for (const MachineOperand &MOP : MI.operands()) {
8919 // A check preventing CFI indices was here before, but only CFI
8920 // instructions should have those.
8921 assert(!MOP.isCFIIndex());
8922
8923 // If it uses LR or W30 explicitly, then don't touch it.
8924 if (MOP.isReg() && !MOP.isImplicit() &&
8925 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
8926 return outliner::InstrType::Illegal;
8927 }
8928
8929 // Special cases for instructions that can always be outlined, but will fail
8930 // the later tests. e.g. ADRPs, which are PC-relative use LR, but can always
8931 // be outlined because they don't require a *specific* value to be in LR.
8932 if (MI.getOpcode() == AArch64::ADRP)
8933 return outliner::InstrType::Legal;
8934
8935 // If MI is a call we might be able to outline it. We don't want to outline
8936 // any calls that rely on the position of items on the stack. When we outline
8937 // something containing a call, we have to emit a save and restore of LR in
8938 // the outlined function. Currently, this always happens by saving LR to the
8939 // stack. Thus, if we outline, say, half the parameters for a function call
8940 // plus the call, then we'll break the callee's expectations for the layout
8941 // of the stack.
8942 //
8943 // FIXME: Allow calls to functions which construct a stack frame, as long
8944 // as they don't access arguments on the stack.
8945 // FIXME: Figure out some way to analyze functions defined in other modules.
8946 // We should be able to compute the memory usage based on the IR calling
8947 // convention, even if we can't see the definition.
8948 if (MI.isCall()) {
8949 // Get the function associated with the call. Look at each operand and find
8950 // the one that represents the callee and get its name.
8951 const Function *Callee = nullptr;
8952 for (const MachineOperand &MOP : MI.operands()) {
8953 if (MOP.isGlobal()) {
8954 Callee = dyn_cast<Function>(Val: MOP.getGlobal());
8955 break;
8956 }
8957 }
8958
8959 // Never outline calls to mcount. There isn't any rule that would require
8960 // this, but the Linux kernel's "ftrace" feature depends on it.
8961 if (Callee && Callee->getName() == "\01_mcount")
8962 return outliner::InstrType::Illegal;
8963
8964 // If we don't know anything about the callee, assume it depends on the
8965 // stack layout of the caller. In that case, it's only legal to outline
8966 // as a tail-call. Explicitly list the call instructions we know about so we
8967 // don't get unexpected results with call pseudo-instructions.
8968 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
8969 if (MI.getOpcode() == AArch64::BLR ||
8970 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
8971 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
8972
8973 if (!Callee)
8974 return UnknownCallOutlineType;
8975
8976 // We have a function we have information about. Check if it's something
8977 // we can safely outline.
8978 MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(F: *Callee);
8979
8980 // We don't know what's going on with the callee at all. Don't touch it.
8981 if (!CalleeMF)
8982 return UnknownCallOutlineType;
8983
8984 // Check if we know anything about the callee saves on the function. If we
8985 // don't, then don't touch it, since that implies that we haven't
8986 // computed anything about its stack frame yet.
8987 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
8988 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
8989 MFI.getNumObjects() > 0)
8990 return UnknownCallOutlineType;
8991
8992 // At this point, we can say that CalleeMF ought to not pass anything on the
8993 // stack. Therefore, we can outline it.
8994 return outliner::InstrType::Legal;
8995 }
8996
8997 // Don't touch the link register or W30.
8998 if (MI.readsRegister(Reg: AArch64::W30, TRI: &getRegisterInfo()) ||
8999 MI.modifiesRegister(Reg: AArch64::W30, TRI: &getRegisterInfo()))
9000 return outliner::InstrType::Illegal;
9001
9002 // Don't outline BTI instructions, because that will prevent the outlining
9003 // site from being indirectly callable.
9004 if (hasBTISemantics(MI))
9005 return outliner::InstrType::Illegal;
9006
9007 return outliner::InstrType::Legal;
9008}
9009
9010void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
9011 for (MachineInstr &MI : MBB) {
9012 const MachineOperand *Base;
9013 TypeSize Width(0, false);
9014 int64_t Offset;
9015 bool OffsetIsScalable;
9016
9017 // Is this a load or store with an immediate offset with SP as the base?
9018 if (!MI.mayLoadOrStore() ||
9019 !getMemOperandWithOffsetWidth(LdSt: MI, BaseOp&: Base, Offset, OffsetIsScalable, Width,
9020 TRI: &RI) ||
9021 (Base->isReg() && Base->getReg() != AArch64::SP))
9022 continue;
9023
9024 // It is, so we have to fix it up.
9025 TypeSize Scale(0U, false);
9026 int64_t Dummy1, Dummy2;
9027
9028 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(LdSt&: MI);
9029 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
9030 getMemOpInfo(Opcode: MI.getOpcode(), Scale, Width, MinOffset&: Dummy1, MaxOffset&: Dummy2);
9031 assert(Scale != 0 && "Unexpected opcode!");
9032 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
9033
9034 // We've pushed the return address to the stack, so add 16 to the offset.
9035 // This is safe, since we already checked if it would overflow when we
9036 // checked if this instruction was legal to outline.
9037 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
9038 StackOffsetOperand.setImm(NewImm);
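    // e.g. for an LDRXui (Scale == 8) with an original byte offset of 8
    // (encoded imm 1), the rewritten byte offset is 24, i.e. encoded imm 3.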
9039 }
9040}
9041
9042static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
9043 const AArch64InstrInfo *TII,
9044 bool ShouldSignReturnAddr) {
9045 if (!ShouldSignReturnAddr)
9046 return;
9047
9048 BuildMI(BB&: MBB, I: MBB.begin(), MIMD: DebugLoc(), MCID: TII->get(Opcode: AArch64::PAUTH_PROLOGUE))
9049 .setMIFlag(MachineInstr::FrameSetup);
9050 BuildMI(BB&: MBB, I: MBB.getFirstInstrTerminator(), MIMD: DebugLoc(),
9051 MCID: TII->get(Opcode: AArch64::PAUTH_EPILOGUE))
9052 .setMIFlag(MachineInstr::FrameDestroy);
9053}
9054
9055void AArch64InstrInfo::buildOutlinedFrame(
9056 MachineBasicBlock &MBB, MachineFunction &MF,
9057 const outliner::OutlinedFunction &OF) const {
9058
9059 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
9060
9061 if (OF.FrameConstructionID == MachineOutlinerTailCall)
9062 FI->setOutliningStyle("Tail Call");
9063 else if (OF.FrameConstructionID == MachineOutlinerThunk) {
9064 // For thunk outlining, rewrite the last instruction from a call to a
9065 // tail-call.
9066 MachineInstr *Call = &*--MBB.instr_end();
9067 unsigned TailOpcode;
9068 if (Call->getOpcode() == AArch64::BL) {
9069 TailOpcode = AArch64::TCRETURNdi;
9070 } else {
9071 assert(Call->getOpcode() == AArch64::BLR ||
9072 Call->getOpcode() == AArch64::BLRNoIP);
9073 TailOpcode = AArch64::TCRETURNriALL;
9074 }
9075 MachineInstr *TC = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: TailOpcode))
9076 .add(MO: Call->getOperand(i: 0))
9077 .addImm(Val: 0);
9078 MBB.insert(I: MBB.end(), MI: TC);
9079 Call->eraseFromParent();
9080
9081 FI->setOutliningStyle("Thunk");
9082 }
9083
9084 bool IsLeafFunction = true;
9085
9086 // Is there a call in the outlined range?
9087 auto IsNonTailCall = [](const MachineInstr &MI) {
9088 return MI.isCall() && !MI.isReturn();
9089 };
9090
9091 if (llvm::any_of(Range: MBB.instrs(), P: IsNonTailCall)) {
9092 // Fix up the instructions in the range, since we're going to modify the
9093 // stack.
9094
9095 // Bugzilla ID: 46767
9096 // TODO: Check if fixing up twice is safe so we can outline these.
9097 assert(OF.FrameConstructionID != MachineOutlinerDefault &&
9098 "Can only fix up stack references once");
9099 fixupPostOutline(MBB);
9100
9101 IsLeafFunction = false;
9102
9103 // LR has to be a live in so that we can save it.
9104 if (!MBB.isLiveIn(Reg: AArch64::LR))
9105 MBB.addLiveIn(PhysReg: AArch64::LR);
9106
9107 MachineBasicBlock::iterator It = MBB.begin();
9108 MachineBasicBlock::iterator Et = MBB.end();
9109
9110 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9111 OF.FrameConstructionID == MachineOutlinerThunk)
9112 Et = std::prev(x: MBB.end());
9113
9114 // Insert a save before the outlined region
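// The pre-indexed store below is emitted as 'str x30, [sp, #-16]!'.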
9115 MachineInstr *STRXpre = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::STRXpre))
9116 .addReg(RegNo: AArch64::SP, flags: RegState::Define)
9117 .addReg(RegNo: AArch64::LR)
9118 .addReg(RegNo: AArch64::SP)
9119 .addImm(Val: -16);
9120 It = MBB.insert(I: It, MI: STRXpre);
9121
9122 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
9123 const TargetSubtargetInfo &STI = MF.getSubtarget();
9124 const MCRegisterInfo *MRI = STI.getRegisterInfo();
9125 unsigned DwarfReg = MRI->getDwarfRegNum(RegNum: AArch64::LR, isEH: true);
9126
9127 // Add a CFI saying the stack was moved 16 B down.
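// (this shows up in the assembly as '.cfi_def_cfa_offset 16')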
9128 int64_t StackPosEntry =
9129 MF.addFrameInst(Inst: MCCFIInstruction::cfiDefCfaOffset(L: nullptr, Offset: 16));
9130 BuildMI(BB&: MBB, I: It, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::CFI_INSTRUCTION))
9131 .addCFIIndex(CFIIndex: StackPosEntry)
9132 .setMIFlags(MachineInstr::FrameSetup);
9133
9134 // Add a CFI saying that the LR that we want to find is now 16 B higher
9135 // than before.
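// (this shows up as a '.cfi_offset' for the link register, e.g.
// '.cfi_offset w30, -16')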
9136 int64_t LRPosEntry = MF.addFrameInst(
9137 Inst: MCCFIInstruction::createOffset(L: nullptr, Register: DwarfReg, Offset: -16));
9138 BuildMI(BB&: MBB, I: It, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::CFI_INSTRUCTION))
9139 .addCFIIndex(CFIIndex: LRPosEntry)
9140 .setMIFlags(MachineInstr::FrameSetup);
9141 }
9142
9143 // Insert a restore before the terminator for the function.
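// The post-indexed load below is emitted as 'ldr x30, [sp], #16'.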
9144 MachineInstr *LDRXpost = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::LDRXpost))
9145 .addReg(RegNo: AArch64::SP, flags: RegState::Define)
9146 .addReg(RegNo: AArch64::LR, flags: RegState::Define)
9147 .addReg(RegNo: AArch64::SP)
9148 .addImm(Val: 16);
9149 Et = MBB.insert(I: Et, MI: LDRXpost);
9150 }
9151
9152 bool ShouldSignReturnAddr = FI->shouldSignReturnAddress(SpillsLR: !IsLeafFunction);
9153
9154 // If this is a tail call outlined function, then there's already a return.
9155 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9156 OF.FrameConstructionID == MachineOutlinerThunk) {
9157 signOutlinedFunction(MF, MBB, TII: this, ShouldSignReturnAddr);
9158 return;
9159 }
9160
9161 // It's not a tail call, so we have to insert the return ourselves.
9162
9163 // LR has to be a live in so that we can return to it.
9164 if (!MBB.isLiveIn(Reg: AArch64::LR))
9165 MBB.addLiveIn(PhysReg: AArch64::LR);
9166
9167 MachineInstr *ret = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::RET))
9168 .addReg(RegNo: AArch64::LR);
9169 MBB.insert(I: MBB.end(), MI: ret);
9170
9171 signOutlinedFunction(MF, MBB, TII: this, ShouldSignReturnAddr);
9172
9173 FI->setOutliningStyle("Function");
9174
9175 // Did we have to modify the stack by saving the link register?
9176 if (OF.FrameConstructionID != MachineOutlinerDefault)
9177 return;
9178
9179 // We modified the stack.
9180 // Walk over the basic block and fix up all the stack accesses.
9181 fixupPostOutline(MBB);
9182}
9183
9184MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
9185 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
9186 MachineFunction &MF, outliner::Candidate &C) const {
9187
9188 // Are we tail calling?
9189 if (C.CallConstructionID == MachineOutlinerTailCall) {
9190 // If yes, then we can just branch to the label.
9191 It = MBB.insert(I: It, MI: BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::TCRETURNdi))
9192 .addGlobalAddress(GV: M.getNamedValue(Name: MF.getName()))
9193 .addImm(Val: 0));
9194 return It;
9195 }
9196
9197 // Are we saving the link register?
9198 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
9199 C.CallConstructionID == MachineOutlinerThunk) {
9200 // No, so just insert the call.
9201 It = MBB.insert(I: It, MI: BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::BL))
9202 .addGlobalAddress(GV: M.getNamedValue(Name: MF.getName())));
9203 return It;
9204 }
9205
9206 // We want to return the spot where we inserted the call.
9207 MachineBasicBlock::iterator CallPt;
9208
9209 // Instructions for saving and restoring LR around the call instruction we're
9210 // going to insert.
9211 MachineInstr *Save;
9212 MachineInstr *Restore;
9213 // Can we save to a register?
9214 if (C.CallConstructionID == MachineOutlinerRegSave) {
9215 // FIXME: This logic should be sunk into a target-specific interface so that
9216 // we don't have to recompute the register.
9217 Register Reg = findRegisterToSaveLRTo(C);
9218 assert(Reg && "No callee-saved register available?");
9219
9220 // LR has to be a live in so that we can save it.
9221 if (!MBB.isLiveIn(Reg: AArch64::LR))
9222 MBB.addLiveIn(PhysReg: AArch64::LR);
9223
9224 // Save and restore LR from Reg.
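// ORRXrs with XZR and a zero shift is the canonical register-to-register
// move, so these are emitted as 'mov <reg>, x30' and 'mov x30, <reg>'.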
9225 Save = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::ORRXrs), DestReg: Reg)
9226 .addReg(RegNo: AArch64::XZR)
9227 .addReg(RegNo: AArch64::LR)
9228 .addImm(Val: 0);
9229 Restore = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::ORRXrs), DestReg: AArch64::LR)
9230 .addReg(RegNo: AArch64::XZR)
9231 .addReg(RegNo: Reg)
9232 .addImm(Val: 0);
9233 } else {
9234 // We have the default case. Save and restore from SP.
9235 Save = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::STRXpre))
9236 .addReg(RegNo: AArch64::SP, flags: RegState::Define)
9237 .addReg(RegNo: AArch64::LR)
9238 .addReg(RegNo: AArch64::SP)
9239 .addImm(Val: -16);
9240 Restore = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::LDRXpost))
9241 .addReg(RegNo: AArch64::SP, flags: RegState::Define)
9242 .addReg(RegNo: AArch64::LR, flags: RegState::Define)
9243 .addReg(RegNo: AArch64::SP)
9244 .addImm(Val: 16);
9245 }
9246
9247 It = MBB.insert(I: It, MI: Save);
9248 It++;
9249
9250 // Insert the call.
9251 It = MBB.insert(I: It, MI: BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::BL))
9252 .addGlobalAddress(GV: M.getNamedValue(Name: MF.getName())));
9253 CallPt = It;
9254 It++;
9255
9256 It = MBB.insert(I: It, MI: Restore);
9257 return CallPt;
9258}
9259
9260bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
9261 MachineFunction &MF) const {
9262 return MF.getFunction().hasMinSize();
9263}
9264
9265void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
9266 MachineBasicBlock::iterator Iter,
9267 DebugLoc &DL,
9268 bool AllowSideEffects) const {
9269 const MachineFunction &MF = *MBB.getParent();
9270 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
9271 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
9272
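// Roughly: GPRs are cleared with a 'movz', SVE registers with
// 'dup zN.d, #0' (alias 'mov zN.d, #0'), and NEON registers with
// 'movi vN.2d, #0'.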
9273 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
9274 BuildMI(BB&: MBB, I: Iter, MIMD: DL, MCID: get(Opcode: AArch64::MOVZXi), DestReg: Reg).addImm(Val: 0).addImm(Val: 0);
9275 } else if (STI.hasSVE()) {
9276 BuildMI(BB&: MBB, I: Iter, MIMD: DL, MCID: get(Opcode: AArch64::DUP_ZI_D), DestReg: Reg)
9277 .addImm(Val: 0)
9278 .addImm(Val: 0);
9279 } else {
9280 BuildMI(BB&: MBB, I: Iter, MIMD: DL, MCID: get(Opcode: AArch64::MOVIv2d_ns), DestReg: Reg)
9281 .addImm(Val: 0);
9282 }
9283}
9284
9285std::optional<DestSourcePair>
9286AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
9287
9288 // AArch64::ORRWrs and AArch64::ORRXrs with a WZR/XZR source register and a
9289 // zero shift immediate are used as aliases for the mov instruction.
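// e.g. 'orr w0, wzr, w1' is the alias form of 'mov w0, w1'.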
9290 if (MI.getOpcode() == AArch64::ORRWrs &&
9291 MI.getOperand(i: 1).getReg() == AArch64::WZR &&
9292 MI.getOperand(i: 3).getImm() == 0x0 &&
9293 // Check that the w->w move is not a zero-extending w->x mov.
9294 (!MI.getOperand(i: 0).getReg().isVirtual() ||
9295 MI.getOperand(i: 0).getSubReg() == 0) &&
9296 (!MI.getOperand(i: 0).getReg().isPhysical() ||
9297 MI.findRegisterDefOperandIdx(Reg: MI.getOperand(i: 0).getReg() - AArch64::W0 +
9298 AArch64::X0,
9299 /*TRI=*/nullptr) == -1))
9300 return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 2)};
9301
9302 if (MI.getOpcode() == AArch64::ORRXrs &&
9303 MI.getOperand(i: 1).getReg() == AArch64::XZR &&
9304 MI.getOperand(i: 3).getImm() == 0x0)
9305 return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 2)};
9306
9307 return std::nullopt;
9308}
9309
9310std::optional<DestSourcePair>
9311AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {
9312 if (MI.getOpcode() == AArch64::ORRWrs &&
9313 MI.getOperand(i: 1).getReg() == AArch64::WZR &&
9314 MI.getOperand(i: 3).getImm() == 0x0)
9315 return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 2)};
9316 return std::nullopt;
9317}
9318
9319std::optional<RegImmPair>
9320AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
9321 int Sign = 1;
9322 int64_t Offset = 0;
9323
9324 // TODO: Handle cases where Reg is a super- or sub-register of the
9325 // destination register.
9326 const MachineOperand &Op0 = MI.getOperand(i: 0);
9327 if (!Op0.isReg() || Reg != Op0.getReg())
9328 return std::nullopt;
9329
9330 switch (MI.getOpcode()) {
9331 default:
9332 return std::nullopt;
9333 case AArch64::SUBWri:
9334 case AArch64::SUBXri:
9335 case AArch64::SUBSWri:
9336 case AArch64::SUBSXri:
9337 Sign *= -1;
9338 [[fallthrough]];
9339 case AArch64::ADDSWri:
9340 case AArch64::ADDSXri:
9341 case AArch64::ADDWri:
9342 case AArch64::ADDXri: {
9343 // TODO: Third operand can be global address (usually some string).
9344 if (!MI.getOperand(i: 0).isReg() || !MI.getOperand(i: 1).isReg() ||
9345 !MI.getOperand(i: 2).isImm())
9346 return std::nullopt;
9347 int Shift = MI.getOperand(i: 3).getImm();
9348 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
9349 Offset = Sign * (MI.getOperand(i: 2).getImm() << Shift);
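// e.g. 'ADDXri x0, x1, 3, 12' (add x0, x1, #3, lsl #12) yields {x1, +12288},
// while 'SUBWri w0, w1, 16, 0' yields {w1, -16}.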
9350 }
9351 }
9352 return RegImmPair{MI.getOperand(i: 1).getReg(), Offset};
9353}
9354
9355/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
9356/// the destination register then, if possible, describe the value in terms of
9357/// the source register.
9358static std::optional<ParamLoadedValue>
9359describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
9360 const TargetInstrInfo *TII,
9361 const TargetRegisterInfo *TRI) {
9362 auto DestSrc = TII->isCopyLikeInstr(MI);
9363 if (!DestSrc)
9364 return std::nullopt;
9365
9366 Register DestReg = DestSrc->Destination->getReg();
9367 Register SrcReg = DestSrc->Source->getReg();
9368
9369 auto Expr = DIExpression::get(Context&: MI.getMF()->getFunction().getContext(), Elements: {});
9370
9371 // If the described register is the destination, just return the source.
9372 if (DestReg == DescribedReg)
9373 return ParamLoadedValue(MachineOperand::CreateReg(Reg: SrcReg, isDef: false), Expr);
9374
9375 // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
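// e.g. 'orr w1, wzr, w0' also clears the top 32 bits of x1, so a request to
// describe x1 can still be answered in terms of the 32-bit source register.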
9376 if (MI.getOpcode() == AArch64::ORRWrs &&
9377 TRI->isSuperRegister(RegA: DestReg, RegB: DescribedReg))
9378 return ParamLoadedValue(MachineOperand::CreateReg(Reg: SrcReg, isDef: false), Expr);
9379
9380 // We may need to describe the lower part of a ORRXrs move.
9381 if (MI.getOpcode() == AArch64::ORRXrs &&
9382 TRI->isSubRegister(RegA: DestReg, RegB: DescribedReg)) {
9383 Register SrcSubReg = TRI->getSubReg(Reg: SrcReg, Idx: AArch64::sub_32);
9384 return ParamLoadedValue(MachineOperand::CreateReg(Reg: SrcSubReg, isDef: false), Expr);
9385 }
9386
9387 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
9388 "Unhandled ORR[XW]rs copy case");
9389
9390 return std::nullopt;
9391}
9392
9393bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
9394 // Functions cannot be split to different sections on AArch64 if they have
9395 // a red zone. This is because relaxing a cross-section branch may require
9396 // incrementing the stack pointer to spill a register, which would overwrite
9397 // the red zone.
9398 if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(u: true))
9399 return false;
9400
9401 return TargetInstrInfo::isFunctionSafeToSplit(MF);
9402}
9403
9404bool AArch64InstrInfo::isMBBSafeToSplitToCold(
9405 const MachineBasicBlock &MBB) const {
9406 // Asm Goto blocks can contain conditional branches to goto labels, which can
9407 // get moved out of range of the branch instruction.
9408 auto isAsmGoto = [](const MachineInstr &MI) {
9409 return MI.getOpcode() == AArch64::INLINEASM_BR;
9410 };
9411 if (llvm::any_of(Range: MBB, P: isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
9412 return false;
9413
9414 // Because jump tables are label-relative instead of table-relative, a jump
9415 // table and all of its targets must be in the same section, or relocation fixup handling will fail.
9416
9417 // Check if MBB is a jump table target
9418 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
9419 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
9420 return llvm::is_contained(Range: JTE.MBBs, Element: &MBB);
9421 };
9422 if (MJTI != nullptr && llvm::any_of(Range: MJTI->getJumpTables(), P: containsMBB))
9423 return false;
9424
9425 // Check if MBB contains a jump table lookup
9426 for (const MachineInstr &MI : MBB) {
9427 switch (MI.getOpcode()) {
9428 case TargetOpcode::G_BRJT:
9429 case AArch64::JumpTableDest32:
9430 case AArch64::JumpTableDest16:
9431 case AArch64::JumpTableDest8:
9432 return false;
9433 default:
9434 continue;
9435 }
9436 }
9437
9438 // MBB isn't a special case, so it's safe to be split to the cold section.
9439 return true;
9440}
9441
9442std::optional<ParamLoadedValue>
9443AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
9444 Register Reg) const {
9445 const MachineFunction *MF = MI.getMF();
9446 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
9447 switch (MI.getOpcode()) {
9448 case AArch64::MOVZWi:
9449 case AArch64::MOVZXi: {
9450 // MOVZWi may be used for producing zero-extended 32-bit immediates in
9451 // 64-bit parameters, so we need to consider super-registers.
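// e.g. 'movz w0, #1, lsl #16' produces 0x10000; since movz clears the rest
// of the register, the same value holds when x0 is queried.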
9452 if (!TRI->isSuperRegisterEq(RegA: MI.getOperand(i: 0).getReg(), RegB: Reg))
9453 return std::nullopt;
9454
9455 if (!MI.getOperand(i: 1).isImm())
9456 return std::nullopt;
9457 int64_t Immediate = MI.getOperand(i: 1).getImm();
9458 int Shift = MI.getOperand(i: 2).getImm();
9459 return ParamLoadedValue(MachineOperand::CreateImm(Val: Immediate << Shift),
9460 nullptr);
9461 }
9462 case AArch64::ORRWrs:
9463 case AArch64::ORRXrs:
9464 return describeORRLoadedValue(MI, DescribedReg: Reg, TII: this, TRI);
9465 }
9466
9467 return TargetInstrInfo::describeLoadedValue(MI, Reg);
9468}
9469
9470bool AArch64InstrInfo::isExtendLikelyToBeFolded(
9471 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
9472 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
9473 ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
9474 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
9475
9476 // Anyexts are nops.
9477 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
9478 return true;
9479
9480 Register DefReg = ExtMI.getOperand(i: 0).getReg();
9481 if (!MRI.hasOneNonDBGUse(RegNo: DefReg))
9482 return false;
9483
9484 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
9485 // addressing mode.
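// e.g. a G_SEXT that only feeds the offset of a G_PTR_ADD is likely to be
// selected together with the memory access, as in 'ldr x0, [x1, w2, sxtw]'.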
9486 auto *UserMI = &*MRI.use_instr_nodbg_begin(RegNo: DefReg);
9487 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
9488}
9489
9490uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
9491 return get(Opcode: Opc).TSFlags & AArch64::ElementSizeMask;
9492}
9493
9494bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
9495 return get(Opcode: Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
9496}
9497
9498bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
9499 return get(Opcode: Opc).TSFlags & AArch64::InstrFlagIsWhile;
9500}
9501
9502unsigned int
9503AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
9504 return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
9505}
9506
9507bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
9508 unsigned Scale) const {
9509 if (Offset && Scale)
9510 return false;
9511
9512 // Check Reg + Imm
9513 if (!Scale) {
9514 // 9-bit signed offset
9515 if (isInt<9>(x: Offset))
9516 return true;
9517
9518 // 12-bit unsigned offset
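// e.g. for an 8-byte access this accepts positive offsets that are
// multiples of 8, up to 8 * 4095 = 32760.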
9519 unsigned Shift = Log2_64(Value: NumBytes);
9520 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
9521 // Must be a multiple of NumBytes (NumBytes is a power of 2)
9522 (Offset >> Shift) << Shift == Offset)
9523 return true;
9524 return false;
9525 }
9526
9527 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
9528 return Scale == 1 || (Scale > 0 && Scale == NumBytes);
9529}
9530
9531unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
9532 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
9533 return AArch64::BLRNoIP;
9534 else
9535 return AArch64::BLR;
9536}
9537
9538MachineBasicBlock::iterator
9539AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
9540 Register TargetReg, bool FrameSetup) const {
9541 assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
9542
9543 MachineBasicBlock &MBB = *MBBI->getParent();
9544 MachineFunction &MF = *MBB.getParent();
9545 const AArch64InstrInfo *TII =
9546 MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
9547 int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
9548 DebugLoc DL = MBB.findDebugLoc(MBBI);
9549
9550 MachineFunction::iterator MBBInsertPoint = std::next(x: MBB.getIterator());
9551 MachineBasicBlock *LoopTestMBB =
9552 MF.CreateMachineBasicBlock(BB: MBB.getBasicBlock());
9553 MF.insert(MBBI: MBBInsertPoint, MBB: LoopTestMBB);
9554 MachineBasicBlock *LoopBodyMBB =
9555 MF.CreateMachineBasicBlock(BB: MBB.getBasicBlock());
9556 MF.insert(MBBI: MBBInsertPoint, MBB: LoopBodyMBB);
9557 MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(BB: MBB.getBasicBlock());
9558 MF.insert(MBBI: MBBInsertPoint, MBB: ExitMBB);
9559 MachineInstr::MIFlag Flags =
9560 FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;
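// The probe loop built below lowers SP in ProbeSize steps until it reaches
// TargetReg, touching each new page with a store:
//   LoopTest:  sub  sp, sp, #ProbeSize
//              cmp  sp, TargetReg
//              b.le LoopExit
//   LoopBody:  str  xzr, [sp]
//              b    LoopTest
//   LoopExit:  mov  sp, TargetReg
//              ldr  xzr, [sp]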
9561
9562 // LoopTest:
9563 // SUB SP, SP, #ProbeSize
9564 emitFrameOffset(MBB&: *LoopTestMBB, MBBI: LoopTestMBB->end(), DL, DestReg: AArch64::SP,
9565 SrcReg: AArch64::SP, Offset: StackOffset::getFixed(Fixed: -ProbeSize), TII, Flag: Flags);
9566
9567 // CMP SP, TargetReg
9568 BuildMI(BB&: *LoopTestMBB, I: LoopTestMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::SUBSXrx64),
9569 DestReg: AArch64::XZR)
9570 .addReg(RegNo: AArch64::SP)
9571 .addReg(RegNo: TargetReg)
9572 .addImm(Val: AArch64_AM::getArithExtendImm(ET: AArch64_AM::UXTX, Imm: 0))
9573 .setMIFlags(Flags);
9574
9575 // B.<Cond> LoopExit
9576 BuildMI(BB&: *LoopTestMBB, I: LoopTestMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::Bcc))
9577 .addImm(Val: AArch64CC::LE)
9578 .addMBB(MBB: ExitMBB)
9579 .setMIFlags(Flags);
9580
9581 // STR XZR, [SP]
9582 BuildMI(BB&: *LoopBodyMBB, I: LoopBodyMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::STRXui))
9583 .addReg(RegNo: AArch64::XZR)
9584 .addReg(RegNo: AArch64::SP)
9585 .addImm(Val: 0)
9586 .setMIFlags(Flags);
9587
9588 // B loop
9589 BuildMI(BB&: *LoopBodyMBB, I: LoopBodyMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::B))
9590 .addMBB(MBB: LoopTestMBB)
9591 .setMIFlags(Flags);
9592
9593 // LoopExit:
9594 // MOV SP, TargetReg
9595 BuildMI(BB&: *ExitMBB, I: ExitMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::ADDXri), DestReg: AArch64::SP)
9596 .addReg(RegNo: TargetReg)
9597 .addImm(Val: 0)
9598 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0))
9599 .setMIFlags(Flags);
9600
9601 // LDR XZR, [SP]
9602 BuildMI(BB&: *ExitMBB, I: ExitMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::LDRXui))
9603 .addReg(RegNo: AArch64::XZR, flags: RegState::Define)
9604 .addReg(RegNo: AArch64::SP)
9605 .addImm(Val: 0)
9606 .setMIFlags(Flags);
9607
9608 ExitMBB->splice(Where: ExitMBB->end(), Other: &MBB, From: std::next(x: MBBI), To: MBB.end());
9609 ExitMBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
9610
9611 LoopTestMBB->addSuccessor(Succ: ExitMBB);
9612 LoopTestMBB->addSuccessor(Succ: LoopBodyMBB);
9613 LoopBodyMBB->addSuccessor(Succ: LoopTestMBB);
9614 MBB.addSuccessor(Succ: LoopTestMBB);
9615
9616 // Update liveins.
9617 if (MF.getRegInfo().reservedRegsFrozen())
9618 fullyRecomputeLiveIns(MBBs: {ExitMBB, LoopBodyMBB, LoopTestMBB});
9619
9620 return ExitMBB->begin();
9621}
9622
9623namespace {
9624class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
9625 MachineFunction *MF;
9626 const TargetInstrInfo *TII;
9627 const TargetRegisterInfo *TRI;
9628 MachineRegisterInfo &MRI;
9629
9630 /// The block of the loop
9631 MachineBasicBlock *LoopBB;
9632 /// The conditional branch of the loop
9633 MachineInstr *CondBranch;
9634 /// The compare instruction for loop control
9635 MachineInstr *Comp;
9636 /// The operand index of the loop counter value in Comp
9637 unsigned CompCounterOprNum;
9638 /// The instruction that updates the loop counter value
9639 MachineInstr *Update;
9640 /// The operand index of the loop counter value in Update
9641 unsigned UpdateCounterOprNum;
9642 /// The initial value of the loop counter
9643 Register Init;
9644 /// True iff Update is a predecessor of Comp
9645 bool IsUpdatePriorComp;
9646
9647 /// The normalized condition used by createTripCountGreaterCondition()
9648 SmallVector<MachineOperand, 4> Cond;
9649
9650public:
9651 AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
9652 MachineInstr *Comp, unsigned CompCounterOprNum,
9653 MachineInstr *Update, unsigned UpdateCounterOprNum,
9654 Register Init, bool IsUpdatePriorComp,
9655 const SmallVectorImpl<MachineOperand> &Cond)
9656 : MF(Comp->getParent()->getParent()),
9657 TII(MF->getSubtarget().getInstrInfo()),
9658 TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
9659 LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
9660 CompCounterOprNum(CompCounterOprNum), Update(Update),
9661 UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
9662 IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
9663
9664 bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
9665 // Make sure the loop-control instructions are placed in stage 0.
9666 // The predecessors of Comp are considered by the caller.
9667 return MI == Comp;
9668 }
9669
9670 std::optional<bool> createTripCountGreaterCondition(
9671 int TC, MachineBasicBlock &MBB,
9672 SmallVectorImpl<MachineOperand> &CondParam) override {
9673 // A branch instruction will be inserted as "if (Cond) goto epilogue".
9674 // Cond is normalized for such use.
9675 // The predecessors of the branch are assumed to have already been inserted.
9676 CondParam = Cond;
9677 return {};
9678 }
9679
9680 void createRemainingIterationsGreaterCondition(
9681 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
9682 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
9683
9684 void setPreheader(MachineBasicBlock *NewPreheader) override {}
9685
9686 void adjustTripCount(int TripCountAdjust) override {}
9687
9688 void disposed() override {}
9689 bool isMVEExpanderSupported() override { return true; }
9690};
9691} // namespace
9692
9693/// Clone an instruction from MI. The register of the ReplaceOprNum-th operand
9694/// is replaced by ReplaceReg, and the output register is newly created.
9695/// All other operands are unchanged from MI.
9696static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
9697 Register ReplaceReg, MachineBasicBlock &MBB,
9698 MachineBasicBlock::iterator InsertTo) {
9699 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9700 const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
9701 const TargetRegisterInfo *TRI =
9702 MBB.getParent()->getSubtarget().getRegisterInfo();
9703 MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(Orig: MI);
9704 Register Result = 0;
9705 for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
9706 if (I == 0 && NewMI->getOperand(i: 0).getReg().isVirtual()) {
9707 Result = MRI.createVirtualRegister(
9708 RegClass: MRI.getRegClass(Reg: NewMI->getOperand(i: 0).getReg()));
9709 NewMI->getOperand(i: I).setReg(Result);
9710 } else if (I == ReplaceOprNum) {
9711 MRI.constrainRegClass(
9712 Reg: ReplaceReg,
9713 RC: TII->getRegClass(MCID: NewMI->getDesc(), OpNum: I, TRI, MF: *MBB.getParent()));
9714 NewMI->getOperand(i: I).setReg(ReplaceReg);
9715 }
9716 }
9717 MBB.insert(I: InsertTo, MI: NewMI);
9718 return Result;
9719}
9720
9721void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
9722 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
9723 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {
9724 // Create and accumulate conditions for the next TC iterations.
9725 // Example:
9726 // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
9727 // # iteration of the kernel
9728 //
9729 // # insert the following instructions
9730 // cond = CSINCXr 0, 0, C, implicit $nzcv
9731 // counter = ADDXri counter, 1 # clone from this->Update
9732 // SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
9733 // cond = CSINCXr cond, cond, C, implicit $nzcv
9734 // ... (repeat TC times)
9735 // SUBSXri cond, 0, implicit-def $nzcv
9736
9737 assert(CondBranch->getOpcode() == AArch64::Bcc);
9738 // CondCode to exit the loop
9739 AArch64CC::CondCode CC =
9740 (AArch64CC::CondCode)CondBranch->getOperand(i: 0).getImm();
9741 if (CondBranch->getOperand(i: 1).getMBB() == LoopBB)
9742 CC = AArch64CC::getInvertedCondCode(Code: CC);
9743
9744 // Accumulate conditions to exit the loop
9745 Register AccCond = AArch64::XZR;
9746
9747 // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
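// This relies on CSINC: 'csinc xd, xn, xm, cond' sets xd = xn if cond holds
// and xm + 1 otherwise. Both sources are CurCond and the condition is
// inverted, so the count is bumped exactly when CC (the loop-exit condition)
// holds.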
9748 auto AccumulateCond = [&](Register CurCond,
9749 AArch64CC::CondCode CC) -> Register {
9750 Register NewCond = MRI.createVirtualRegister(RegClass: &AArch64::GPR64commonRegClass);
9751 BuildMI(BB&: MBB, I: MBB.end(), MIMD: Comp->getDebugLoc(), MCID: TII->get(Opcode: AArch64::CSINCXr))
9752 .addReg(RegNo: NewCond, flags: RegState::Define)
9753 .addReg(RegNo: CurCond)
9754 .addReg(RegNo: CurCond)
9755 .addImm(Val: AArch64CC::getInvertedCondCode(Code: CC));
9756 return NewCond;
9757 };
9758
9759 if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
9760 // Update and Comp for I == 0 already exist in MBB
9761 // (MBB is an unrolled kernel)
9762 Register Counter;
9763 for (int I = 0; I <= TC; ++I) {
9764 Register NextCounter;
9765 if (I != 0)
9766 NextCounter =
9767 cloneInstr(MI: Comp, ReplaceOprNum: CompCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
9768
9769 AccCond = AccumulateCond(AccCond, CC);
9770
9771 if (I != TC) {
9772 if (I == 0) {
9773 if (Update != Comp && IsUpdatePriorComp) {
9774 Counter =
9775 LastStage0Insts[Comp]->getOperand(i: CompCounterOprNum).getReg();
9776 NextCounter = cloneInstr(MI: Update, ReplaceOprNum: UpdateCounterOprNum, ReplaceReg: Counter, MBB,
9777 InsertTo: MBB.end());
9778 } else {
9779 // We can use the already-calculated value.
9780 NextCounter = LastStage0Insts[Update]->getOperand(i: 0).getReg();
9781 }
9782 } else if (Update != Comp) {
9783 NextCounter =
9784 cloneInstr(MI: Update, ReplaceOprNum: UpdateCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
9785 }
9786 }
9787 Counter = NextCounter;
9788 }
9789 } else {
9790 Register Counter;
9791 if (LastStage0Insts.empty()) {
9792 // Use the initial counter value (testing whether the trip count is
9793 // sufficient for the pipelined code to be executed).
9794 Counter = Init;
9795 if (IsUpdatePriorComp)
9796 Counter =
9797 cloneInstr(MI: Update, ReplaceOprNum: UpdateCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
9798 } else {
9799 // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
9800 Counter = LastStage0Insts[Comp]->getOperand(i: CompCounterOprNum).getReg();
9801 }
9802
9803 for (int I = 0; I <= TC; ++I) {
9804 Register NextCounter;
9805 NextCounter =
9806 cloneInstr(MI: Comp, ReplaceOprNum: CompCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
9807 AccCond = AccumulateCond(AccCond, CC);
9808 if (I != TC && Update != Comp)
9809 NextCounter =
9810 cloneInstr(MI: Update, ReplaceOprNum: UpdateCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
9811 Counter = NextCounter;
9812 }
9813 }
9814
9815 // If AccCond == 0, the remainder is greater than TC.
9816 BuildMI(BB&: MBB, I: MBB.end(), MIMD: Comp->getDebugLoc(), MCID: TII->get(Opcode: AArch64::SUBSXri))
9817 .addReg(RegNo: AArch64::XZR, flags: RegState::Define | RegState::Dead)
9818 .addReg(RegNo: AccCond)
9819 .addImm(Val: 0)
9820 .addImm(Val: 0);
9821 Cond.clear();
9822 Cond.push_back(Elt: MachineOperand::CreateImm(Val: AArch64CC::EQ));
9823}
9824
9825static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
9826 Register &RegMBB, Register &RegOther) {
9827 assert(Phi.getNumOperands() == 5);
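// A two-predecessor machine PHI has the form
//   %dst = PHI %val1, %bb1, %val2, %bb2
// where operand 0 is the def and operands (1,2) and (3,4) are
// (value, predecessor-block) pairs.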
9828 if (Phi.getOperand(i: 2).getMBB() == MBB) {
9829 RegMBB = Phi.getOperand(i: 1).getReg();
9830 RegOther = Phi.getOperand(i: 3).getReg();
9831 } else {
9832 assert(Phi.getOperand(4).getMBB() == MBB);
9833 RegMBB = Phi.getOperand(i: 3).getReg();
9834 RegOther = Phi.getOperand(i: 1).getReg();
9835 }
9836}
9837
9838static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {
9839 if (!Reg.isVirtual())
9840 return false;
9841 const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
9842 return MRI.getVRegDef(Reg)->getParent() != BB;
9843}
9844
9845/// If Reg is an induction variable, return true and fill in the parameters below.
9846static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
9847 MachineInstr *&UpdateInst,
9848 unsigned &UpdateCounterOprNum, Register &InitReg,
9849 bool &IsUpdatePriorComp) {
9850 // Example:
9851 //
9852 // Preheader:
9853 // InitReg = ...
9854 // LoopBB:
9855 // Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
9856 // Reg = COPY Reg0 ; COPY is ignored.
9857 // Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
9858 // ; Reg is the value calculated in the previous
9859 // ; iteration, so IsUpdatePriorComp == false.
9860
9861 if (LoopBB->pred_size() != 2)
9862 return false;
9863 if (!Reg.isVirtual())
9864 return false;
9865 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
9866 UpdateInst = nullptr;
9867 UpdateCounterOprNum = 0;
9868 InitReg = 0;
9869 IsUpdatePriorComp = true;
9870 Register CurReg = Reg;
9871 while (true) {
9872 MachineInstr *Def = MRI.getVRegDef(Reg: CurReg);
9873 if (Def->getParent() != LoopBB)
9874 return false;
9875 if (Def->isCopy()) {
9876 // Ignore copy instructions unless they contain subregisters
9877 if (Def->getOperand(i: 0).getSubReg() || Def->getOperand(i: 1).getSubReg())
9878 return false;
9879 CurReg = Def->getOperand(i: 1).getReg();
9880 } else if (Def->isPHI()) {
9881 if (InitReg != 0)
9882 return false;
9883 if (!UpdateInst)
9884 IsUpdatePriorComp = false;
9885 extractPhiReg(Phi: *Def, MBB: LoopBB, RegMBB&: CurReg, RegOther&: InitReg);
9886 } else {
9887 if (UpdateInst)
9888 return false;
9889 switch (Def->getOpcode()) {
9890 case AArch64::ADDSXri:
9891 case AArch64::ADDSWri:
9892 case AArch64::SUBSXri:
9893 case AArch64::SUBSWri:
9894 case AArch64::ADDXri:
9895 case AArch64::ADDWri:
9896 case AArch64::SUBXri:
9897 case AArch64::SUBWri:
9898 UpdateInst = Def;
9899 UpdateCounterOprNum = 1;
9900 break;
9901 case AArch64::ADDSXrr:
9902 case AArch64::ADDSWrr:
9903 case AArch64::SUBSXrr:
9904 case AArch64::SUBSWrr:
9905 case AArch64::ADDXrr:
9906 case AArch64::ADDWrr:
9907 case AArch64::SUBXrr:
9908 case AArch64::SUBWrr:
9909 UpdateInst = Def;
9910 if (isDefinedOutside(Reg: Def->getOperand(i: 2).getReg(), BB: LoopBB))
9911 UpdateCounterOprNum = 1;
9912 else if (isDefinedOutside(Reg: Def->getOperand(i: 1).getReg(), BB: LoopBB))
9913 UpdateCounterOprNum = 2;
9914 else
9915 return false;
9916 break;
9917 default:
9918 return false;
9919 }
9920 CurReg = Def->getOperand(i: UpdateCounterOprNum).getReg();
9921 }
9922
9923 if (!CurReg.isVirtual())
9924 return false;
9925 if (Reg == CurReg)
9926 break;
9927 }
9928
9929 if (!UpdateInst)
9930 return false;
9931
9932 return true;
9933}
9934
9935std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
9936AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
9937 // Accept loops that meet the following conditions:
9938 // * The conditional branch is BCC
9939 // * The compare instruction is ADDS/SUBS/WHILEXX
9940 // * One operand of the compare is an induction variable and the other is a
9941 // loop-invariant value
9942 // * The induction variable is incremented/decremented by a single instruction
9943 // * The loop does not contain calls or instructions with unmodeled side effects
9944
9945 for (MachineInstr &MI : *LoopBB)
9946 if (MI.isCall() || MI.hasUnmodeledSideEffects())
9947 // This instruction may use NZCV, which interferes with the instruction to
9948 // be inserted for loop control.
9949 return nullptr;
9950
9951 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
9952 SmallVector<MachineOperand, 4> Cond;
9953 if (analyzeBranch(MBB&: *LoopBB, TBB, FBB, Cond))
9954 return nullptr;
9955
9956 // Infinite loops are not supported
9957 if (TBB == LoopBB && FBB == LoopBB)
9958 return nullptr;
9959
9960 // Must be conditional branch
9961 if (TBB != LoopBB && FBB == nullptr)
9962 return nullptr;
9963
9964 assert((TBB == LoopBB || FBB == LoopBB) &&
9965 "The Loop must be a single-basic-block loop");
9966
9967 MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
9968 const TargetRegisterInfo &TRI = getRegisterInfo();
9969
9970 if (CondBranch->getOpcode() != AArch64::Bcc)
9971 return nullptr;
9972
9973 // Normalization for createTripCountGreaterCondition()
9974 if (TBB == LoopBB)
9975 reverseBranchCondition(Cond);
9976
9977 MachineInstr *Comp = nullptr;
9978 unsigned CompCounterOprNum = 0;
9979 for (MachineInstr &MI : reverse(C&: *LoopBB)) {
9980 if (MI.modifiesRegister(Reg: AArch64::NZCV, TRI: &TRI)) {
9981 // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
9982 // operands is a loop invariant value
9983
9984 switch (MI.getOpcode()) {
9985 case AArch64::SUBSXri:
9986 case AArch64::SUBSWri:
9987 case AArch64::ADDSXri:
9988 case AArch64::ADDSWri:
9989 Comp = &MI;
9990 CompCounterOprNum = 1;
9991 break;
9992 case AArch64::ADDSWrr:
9993 case AArch64::ADDSXrr:
9994 case AArch64::SUBSWrr:
9995 case AArch64::SUBSXrr:
9996 Comp = &MI;
9997 break;
9998 default:
9999 if (isWhileOpcode(Opc: MI.getOpcode())) {
10000 Comp = &MI;
10001 break;
10002 }
10003 return nullptr;
10004 }
10005
10006 if (CompCounterOprNum == 0) {
10007 if (isDefinedOutside(Reg: Comp->getOperand(i: 1).getReg(), BB: LoopBB))
10008 CompCounterOprNum = 2;
10009 else if (isDefinedOutside(Reg: Comp->getOperand(i: 2).getReg(), BB: LoopBB))
10010 CompCounterOprNum = 1;
10011 else
10012 return nullptr;
10013 }
10014 break;
10015 }
10016 }
10017 if (!Comp)
10018 return nullptr;
10019
10020 MachineInstr *Update = nullptr;
10021 Register Init;
10022 bool IsUpdatePriorComp;
10023 unsigned UpdateCounterOprNum;
10024 if (!getIndVarInfo(Reg: Comp->getOperand(i: CompCounterOprNum).getReg(), LoopBB,
10025 UpdateInst&: Update, UpdateCounterOprNum, InitReg&: Init, IsUpdatePriorComp))
10026 return nullptr;
10027
10028 return std::make_unique<AArch64PipelinerLoopInfo>(
10029 args&: LoopBB, args&: CondBranch, args&: Comp, args&: CompCounterOprNum, args&: Update, args&: UpdateCounterOprNum,
10030 args&: Init, args&: IsUpdatePriorComp, args&: Cond);
10031}
10032
10033#define GET_INSTRINFO_HELPERS
10034#define GET_INSTRMAP_INFO
10035#include "AArch64GenInstrInfo.inc"
10036