1//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the AArch64 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64InstrInfo.h"
14#include "AArch64ExpandImm.h"
15#include "AArch64MachineFunctionInfo.h"
16#include "AArch64PointerAuth.h"
17#include "AArch64Subtarget.h"
18#include "MCTargetDesc/AArch64AddressingModes.h"
19#include "MCTargetDesc/AArch64MCTargetDesc.h"
20#include "Utils/AArch64BaseInfo.h"
21#include "llvm/ADT/ArrayRef.h"
22#include "llvm/ADT/STLExtras.h"
23#include "llvm/ADT/SmallSet.h"
24#include "llvm/ADT/SmallVector.h"
25#include "llvm/ADT/Statistic.h"
26#include "llvm/Analysis/AliasAnalysis.h"
27#include "llvm/CodeGen/CFIInstBuilder.h"
28#include "llvm/CodeGen/LivePhysRegs.h"
29#include "llvm/CodeGen/MachineBasicBlock.h"
30#include "llvm/CodeGen/MachineCombinerPattern.h"
31#include "llvm/CodeGen/MachineFrameInfo.h"
32#include "llvm/CodeGen/MachineFunction.h"
33#include "llvm/CodeGen/MachineInstr.h"
34#include "llvm/CodeGen/MachineInstrBuilder.h"
35#include "llvm/CodeGen/MachineMemOperand.h"
36#include "llvm/CodeGen/MachineModuleInfo.h"
37#include "llvm/CodeGen/MachineOperand.h"
38#include "llvm/CodeGen/MachineRegisterInfo.h"
39#include "llvm/CodeGen/RegisterScavenging.h"
40#include "llvm/CodeGen/StackMaps.h"
41#include "llvm/CodeGen/TargetRegisterInfo.h"
42#include "llvm/CodeGen/TargetSubtargetInfo.h"
43#include "llvm/IR/DebugInfoMetadata.h"
44#include "llvm/IR/DebugLoc.h"
45#include "llvm/IR/GlobalValue.h"
46#include "llvm/IR/Module.h"
47#include "llvm/MC/MCAsmInfo.h"
48#include "llvm/MC/MCInst.h"
49#include "llvm/MC/MCInstBuilder.h"
50#include "llvm/MC/MCInstrDesc.h"
51#include "llvm/Support/Casting.h"
52#include "llvm/Support/CodeGen.h"
53#include "llvm/Support/CommandLine.h"
54#include "llvm/Support/ErrorHandling.h"
55#include "llvm/Support/LEB128.h"
56#include "llvm/Support/MathExtras.h"
57#include "llvm/Target/TargetMachine.h"
58#include "llvm/Target/TargetOptions.h"
59#include <cassert>
60#include <cstdint>
61#include <iterator>
62#include <utility>
63
64using namespace llvm;
65
66#define GET_INSTRINFO_CTOR_DTOR
67#include "AArch64GenInstrInfo.inc"
68
69#define DEBUG_TYPE "AArch64InstrInfo"
70
71STATISTIC(NumCopyInstrs, "Number of COPY instructions expanded");
72STATISTIC(NumZCRegMoveInstrsGPR, "Number of zero-cycle GPR register move "
73 "instructions expanded from canonical COPY");
74STATISTIC(NumZCRegMoveInstrsFPR, "Number of zero-cycle FPR register move "
75 "instructions expanded from canonical COPY");
76STATISTIC(NumZCZeroingInstrsGPR, "Number of zero-cycle GPR zeroing "
77 "instructions expanded from canonical COPY");
78// NumZCZeroingInstrsFPR is counted at AArch64AsmPrinter
79
80static cl::opt<unsigned>
81 CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(Val: 9),
82 cl::desc("Restrict range of CB instructions (DEBUG)"));
83
84static cl::opt<unsigned> TBZDisplacementBits(
85 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(Val: 14),
86 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
87
88static cl::opt<unsigned> CBZDisplacementBits(
89 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(Val: 19),
90 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
91
92static cl::opt<unsigned>
93 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(Val: 19),
94 cl::desc("Restrict range of Bcc instructions (DEBUG)"));
95
96static cl::opt<unsigned>
97 BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(Val: 26),
98 cl::desc("Restrict range of B instructions (DEBUG)"));
99
100static cl::opt<unsigned> GatherOptSearchLimit(
101 "aarch64-search-limit", cl::Hidden, cl::init(Val: 2048),
102 cl::desc("Restrict range of instructions to search for the "
103 "machine-combiner gather pattern optimization"));
104
105AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
106 : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN,
107 AArch64::ADJCALLSTACKUP, AArch64::CATCHRET),
108 RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}
109
/// Return the number of bytes of code the specified instruction may occupy.
/// This returns the maximum number of bytes.
112unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
113 const MachineBasicBlock &MBB = *MI.getParent();
114 const MachineFunction *MF = MBB.getParent();
115 const Function &F = MF->getFunction();
116 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
117
118 {
119 auto Op = MI.getOpcode();
120 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
121 return getInlineAsmLength(Str: MI.getOperand(i: 0).getSymbolName(), MAI: *MAI);
122 }
123
124 // Meta-instructions emit no code.
125 if (MI.isMetaInstruction())
126 return 0;
127
128 // FIXME: We currently only handle pseudoinstructions that don't get expanded
129 // before the assembly printer.
130 unsigned NumBytes = 0;
131 const MCInstrDesc &Desc = MI.getDesc();
132
133 if (!MI.isBundle() && isTailCallReturnInst(MI)) {
134 NumBytes = Desc.getSize() ? Desc.getSize() : 4;
135
136 const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
137 if (!MFI->shouldSignReturnAddress(MF: *MF))
138 return NumBytes;
139
140 const auto &STI = MF->getSubtarget<AArch64Subtarget>();
141 auto Method = STI.getAuthenticatedLRCheckMethod(MF: *MF);
142 NumBytes += AArch64PAuth::getCheckerSizeInBytes(Method);
143 return NumBytes;
144 }
145
  // The size should preferably be set in
  // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
  // The specific cases below handle instructions of variable size.
149 switch (Desc.getOpcode()) {
150 default:
151 if (Desc.getSize())
152 return Desc.getSize();
153
154 // Anything not explicitly designated otherwise (i.e. pseudo-instructions
155 // with fixed constant size but not specified in .td file) is a normal
156 // 4-byte insn.
157 NumBytes = 4;
158 break;
159 case TargetOpcode::STACKMAP:
160 // The upper bound for a stackmap intrinsic is the full length of its shadow
161 NumBytes = StackMapOpers(&MI).getNumPatchBytes();
162 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
163 break;
164 case TargetOpcode::PATCHPOINT:
165 // The size of the patchpoint intrinsic is the number of bytes requested
166 NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
167 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
168 break;
169 case TargetOpcode::STATEPOINT:
170 NumBytes = StatepointOpers(&MI).getNumPatchBytes();
171 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
172 // No patch bytes means a normal call inst is emitted
173 if (NumBytes == 0)
174 NumBytes = 4;
175 break;
176 case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
177 // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
178 // instructions are expanded to the specified number of NOPs. Otherwise,
179 // they are expanded to 36-byte XRay sleds.
180 NumBytes =
181 F.getFnAttributeAsParsedInteger(Kind: "patchable-function-entry", Default: 9) * 4;
182 break;
183 case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
184 case TargetOpcode::PATCHABLE_TAIL_CALL:
185 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
186 // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
187 NumBytes = 36;
188 break;
189 case TargetOpcode::PATCHABLE_EVENT_CALL:
190 // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
191 NumBytes = 24;
192 break;
193
194 case AArch64::SPACE:
195 NumBytes = MI.getOperand(i: 1).getImm();
196 break;
197 case TargetOpcode::BUNDLE:
198 NumBytes = getInstBundleLength(MI);
199 break;
200 }
201
202 return NumBytes;
203}
204
205unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
206 unsigned Size = 0;
207 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
208 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
209 while (++I != E && I->isInsideBundle()) {
210 assert(!I->isBundle() && "No nested bundle!");
211 Size += getInstSizeInBytes(MI: *I);
212 }
213 return Size;
214}
215
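// Decode a conditional terminator into the Cond vector used throughout this
// file. The layout depends on the branch form:
//   Bcc:                 { CC }
//   CB[N]Z[W|X]:         { -1, Opcode, Reg }
//   TB[N]Z[W|X]:         { -1, Opcode, Reg, BitNo }
//   CB[W|X]P(ri|rr):     { -1, Opcode, CC, Op0, Op1 }
//   CB[B|H]AssertExt:    { -1, Opcode, CC, Op0, Op1, Ext0, Ext1 }
// reverseBranchCondition(), instantiateCondBranch() and insertSelect() rely
// on this layout.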
216static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
217 SmallVectorImpl<MachineOperand> &Cond) {
218 // Block ends with fall-through condbranch.
219 switch (LastInst->getOpcode()) {
220 default:
221 llvm_unreachable("Unknown branch instruction?");
222 case AArch64::Bcc:
223 Target = LastInst->getOperand(i: 1).getMBB();
224 Cond.push_back(Elt: LastInst->getOperand(i: 0));
225 break;
226 case AArch64::CBZW:
227 case AArch64::CBZX:
228 case AArch64::CBNZW:
229 case AArch64::CBNZX:
230 Target = LastInst->getOperand(i: 1).getMBB();
231 Cond.push_back(Elt: MachineOperand::CreateImm(Val: -1));
232 Cond.push_back(Elt: MachineOperand::CreateImm(Val: LastInst->getOpcode()));
233 Cond.push_back(Elt: LastInst->getOperand(i: 0));
234 break;
235 case AArch64::TBZW:
236 case AArch64::TBZX:
237 case AArch64::TBNZW:
238 case AArch64::TBNZX:
239 Target = LastInst->getOperand(i: 2).getMBB();
240 Cond.push_back(Elt: MachineOperand::CreateImm(Val: -1));
241 Cond.push_back(Elt: MachineOperand::CreateImm(Val: LastInst->getOpcode()));
242 Cond.push_back(Elt: LastInst->getOperand(i: 0));
243 Cond.push_back(Elt: LastInst->getOperand(i: 1));
244 break;
245 case AArch64::CBWPri:
246 case AArch64::CBXPri:
247 case AArch64::CBWPrr:
248 case AArch64::CBXPrr:
249 Target = LastInst->getOperand(i: 3).getMBB();
250 Cond.push_back(Elt: MachineOperand::CreateImm(Val: -1));
251 Cond.push_back(Elt: MachineOperand::CreateImm(Val: LastInst->getOpcode()));
252 Cond.push_back(Elt: LastInst->getOperand(i: 0));
253 Cond.push_back(Elt: LastInst->getOperand(i: 1));
254 Cond.push_back(Elt: LastInst->getOperand(i: 2));
255 break;
256 case AArch64::CBBAssertExt:
257 case AArch64::CBHAssertExt:
258 Target = LastInst->getOperand(i: 3).getMBB();
259 Cond.push_back(Elt: MachineOperand::CreateImm(Val: -1)); // -1
260 Cond.push_back(Elt: MachineOperand::CreateImm(Val: LastInst->getOpcode())); // Opc
261 Cond.push_back(Elt: LastInst->getOperand(i: 0)); // Cond
262 Cond.push_back(Elt: LastInst->getOperand(i: 1)); // Op0
263 Cond.push_back(Elt: LastInst->getOperand(i: 2)); // Op1
264 Cond.push_back(Elt: LastInst->getOperand(i: 4)); // Ext0
265 Cond.push_back(Elt: LastInst->getOperand(i: 5)); // Ext1
266 break;
267 }
268}
269
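// Width in bits of the scaled (instruction-count) PC-relative displacement
// field of each branch opcode. The cl::opts above allow these ranges to be
// artificially restricted for testing branch relaxation.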
270static unsigned getBranchDisplacementBits(unsigned Opc) {
271 switch (Opc) {
272 default:
273 llvm_unreachable("unexpected opcode!");
274 case AArch64::B:
275 return BDisplacementBits;
276 case AArch64::TBNZW:
277 case AArch64::TBZW:
278 case AArch64::TBNZX:
279 case AArch64::TBZX:
280 return TBZDisplacementBits;
281 case AArch64::CBNZW:
282 case AArch64::CBZW:
283 case AArch64::CBNZX:
284 case AArch64::CBZX:
285 return CBZDisplacementBits;
286 case AArch64::Bcc:
287 return BCCDisplacementBits;
288 case AArch64::CBWPri:
289 case AArch64::CBXPri:
290 case AArch64::CBBAssertExt:
291 case AArch64::CBHAssertExt:
292 case AArch64::CBWPrr:
293 case AArch64::CBXPrr:
294 return CBDisplacementBits;
295 }
296}
297
298bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
299 int64_t BrOffset) const {
300 unsigned Bits = getBranchDisplacementBits(Opc: BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump "
                      "over conditional branch expansion");
303 return isIntN(N: Bits, x: BrOffset / 4);
304}
305
306MachineBasicBlock *
307AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
308 switch (MI.getOpcode()) {
309 default:
310 llvm_unreachable("unexpected opcode!");
311 case AArch64::B:
312 return MI.getOperand(i: 0).getMBB();
313 case AArch64::TBZW:
314 case AArch64::TBNZW:
315 case AArch64::TBZX:
316 case AArch64::TBNZX:
317 return MI.getOperand(i: 2).getMBB();
318 case AArch64::CBZW:
319 case AArch64::CBNZW:
320 case AArch64::CBZX:
321 case AArch64::CBNZX:
322 case AArch64::Bcc:
323 return MI.getOperand(i: 1).getMBB();
324 case AArch64::CBWPri:
325 case AArch64::CBXPri:
326 case AArch64::CBBAssertExt:
327 case AArch64::CBHAssertExt:
328 case AArch64::CBWPrr:
329 case AArch64::CBXPrr:
330 return MI.getOperand(i: 3).getMBB();
331 }
332}
333
334void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
335 MachineBasicBlock &NewDestBB,
336 MachineBasicBlock &RestoreBB,
337 const DebugLoc &DL,
338 int64_t BrOffset,
339 RegScavenger *RS) const {
340 assert(RS && "RegScavenger required for long branching");
341 assert(MBB.empty() &&
342 "new block should be inserted for expanding unconditional branch");
343 assert(MBB.pred_size() == 1);
344 assert(RestoreBB.empty() &&
345 "restore block should be inserted for restoring clobbered registers");
346
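  // Strategy: if X16 is free, emit a plain B and let the linker insert a
  // range-extension thunk (which may clobber X16). Failing that, if a GPR can
  // be scavenged and this is a cold block, expand to ADRP+ADD+BR. Otherwise
  // spill X16 around the branch and reload it in RestoreBB.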
347 auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
348 // Offsets outside of the signed 33-bit range are not supported for ADRP +
349 // ADD.
350 if (!isInt<33>(x: BrOffset))
351 report_fatal_error(
352 reason: "Branch offsets outside of the signed 33-bit range not supported");
353
354 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::ADRP), DestReg: Reg)
355 .addSym(Sym: DestBB.getSymbol(), TargetFlags: AArch64II::MO_PAGE);
356 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::ADDXri), DestReg: Reg)
357 .addReg(RegNo: Reg)
358 .addSym(Sym: DestBB.getSymbol(), TargetFlags: AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
359 .addImm(Val: 0);
360 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::BR)).addReg(RegNo: Reg);
361 };
362
363 RS->enterBasicBlockEnd(MBB);
364 // If X16 is unused, we can rely on the linker to insert a range extension
365 // thunk if NewDestBB is out of range of a single B instruction.
366 constexpr Register Reg = AArch64::X16;
367 if (!RS->isRegUsed(Reg)) {
368 insertUnconditionalBranch(MBB, DestBB: &NewDestBB, DL);
369 RS->setRegUsed(Reg);
370 return;
371 }
372
373 // If there's a free register and it's worth inflating the code size,
374 // manually insert the indirect branch.
375 Register Scavenged = RS->FindUnusedReg(RC: &AArch64::GPR64RegClass);
376 if (Scavenged != AArch64::NoRegister &&
377 MBB.getSectionID() == MBBSectionID::ColdSectionID) {
378 buildIndirectBranch(Scavenged, NewDestBB);
379 RS->setRegUsed(Reg: Scavenged);
380 return;
381 }
382
383 // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
384 // with red zones.
385 AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
386 if (!AFI || AFI->hasRedZone().value_or(u: true))
387 report_fatal_error(
388 reason: "Unable to insert indirect branch inside function that has red zone");
389
390 // Otherwise, spill X16 and defer range extension to the linker.
391 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::STRXpre))
392 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
393 .addReg(RegNo: Reg)
394 .addReg(RegNo: AArch64::SP)
395 .addImm(Val: -16);
396
397 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::B)).addMBB(MBB: &RestoreBB);
398
399 BuildMI(BB&: RestoreBB, I: RestoreBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::LDRXpost))
400 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
401 .addReg(RegNo: Reg, Flags: RegState::Define)
402 .addReg(RegNo: AArch64::SP)
403 .addImm(Val: 16);
404}
405
406// Branch analysis.
407bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
408 MachineBasicBlock *&TBB,
409 MachineBasicBlock *&FBB,
410 SmallVectorImpl<MachineOperand> &Cond,
411 bool AllowModify) const {
412 // If the block has no terminators, it just falls into the block after it.
413 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
414 if (I == MBB.end())
415 return false;
416
417 // Skip over SpeculationBarrierEndBB terminators
418 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
419 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
420 --I;
421 }
422
423 if (!isUnpredicatedTerminator(MI: *I))
424 return false;
425
426 // Get the last instruction in the block.
427 MachineInstr *LastInst = &*I;
428
429 // If there is only one terminator instruction, process it.
430 unsigned LastOpc = LastInst->getOpcode();
431 if (I == MBB.begin() || !isUnpredicatedTerminator(MI: *--I)) {
432 if (isUncondBranchOpcode(Opc: LastOpc)) {
433 TBB = LastInst->getOperand(i: 0).getMBB();
434 return false;
435 }
436 if (isCondBranchOpcode(Opc: LastOpc)) {
437 // Block ends with fall-through condbranch.
438 parseCondBranch(LastInst, Target&: TBB, Cond);
439 return false;
440 }
441 return true; // Can't handle indirect branch.
442 }
443
444 // Get the instruction before it if it is a terminator.
445 MachineInstr *SecondLastInst = &*I;
446 unsigned SecondLastOpc = SecondLastInst->getOpcode();
447
448 // If AllowModify is true and the block ends with two or more unconditional
449 // branches, delete all but the first unconditional branch.
450 if (AllowModify && isUncondBranchOpcode(Opc: LastOpc)) {
451 while (isUncondBranchOpcode(Opc: SecondLastOpc)) {
452 LastInst->eraseFromParent();
453 LastInst = SecondLastInst;
454 LastOpc = LastInst->getOpcode();
455 if (I == MBB.begin() || !isUnpredicatedTerminator(MI: *--I)) {
        // Return now; the only terminator is an unconditional branch.
457 TBB = LastInst->getOperand(i: 0).getMBB();
458 return false;
459 }
460 SecondLastInst = &*I;
461 SecondLastOpc = SecondLastInst->getOpcode();
462 }
463 }
464
  // If we're allowed to modify and the block ends in an unconditional branch
  // which could simply fall through, remove the branch. (Note: This case only
467 // matters when we can't understand the whole sequence, otherwise it's also
468 // handled by BranchFolding.cpp.)
469 if (AllowModify && isUncondBranchOpcode(Opc: LastOpc) &&
470 MBB.isLayoutSuccessor(MBB: getBranchDestBlock(MI: *LastInst))) {
471 LastInst->eraseFromParent();
472 LastInst = SecondLastInst;
473 LastOpc = LastInst->getOpcode();
474 if (I == MBB.begin() || !isUnpredicatedTerminator(MI: *--I)) {
475 assert(!isUncondBranchOpcode(LastOpc) &&
476 "unreachable unconditional branches removed above");
477
478 if (isCondBranchOpcode(Opc: LastOpc)) {
479 // Block ends with fall-through condbranch.
480 parseCondBranch(LastInst, Target&: TBB, Cond);
481 return false;
482 }
483 return true; // Can't handle indirect branch.
484 }
485 SecondLastInst = &*I;
486 SecondLastOpc = SecondLastInst->getOpcode();
487 }
488
489 // If there are three terminators, we don't know what sort of block this is.
490 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(MI: *--I))
491 return true;
492
493 // If the block ends with a B and a Bcc, handle it.
494 if (isCondBranchOpcode(Opc: SecondLastOpc) && isUncondBranchOpcode(Opc: LastOpc)) {
495 parseCondBranch(LastInst: SecondLastInst, Target&: TBB, Cond);
496 FBB = LastInst->getOperand(i: 0).getMBB();
497 return false;
498 }
499
500 // If the block ends with two unconditional branches, handle it. The second
501 // one is not executed, so remove it.
502 if (isUncondBranchOpcode(Opc: SecondLastOpc) && isUncondBranchOpcode(Opc: LastOpc)) {
503 TBB = SecondLastInst->getOperand(i: 0).getMBB();
504 I = LastInst;
505 if (AllowModify)
506 I->eraseFromParent();
507 return false;
508 }
509
510 // ...likewise if it ends with an indirect branch followed by an unconditional
511 // branch.
512 if (isIndirectBranchOpcode(Opc: SecondLastOpc) && isUncondBranchOpcode(Opc: LastOpc)) {
513 I = LastInst;
514 if (AllowModify)
515 I->eraseFromParent();
516 return true;
517 }
518
519 // Otherwise, can't handle this.
520 return true;
521}
522
523bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
524 MachineBranchPredicate &MBP,
525 bool AllowModify) const {
526 // Use analyzeBranch to validate the branch pattern.
527 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
528 SmallVector<MachineOperand, 4> Cond;
529 if (analyzeBranch(MBB, TBB, FBB, Cond, AllowModify))
530 return true;
531
  // analyzeBranch leaves Cond empty for an unconditional branch, which has no
  // predicate to extract, so bail out.
533 if (Cond.empty())
534 return true;
535
536 MBP.TrueDest = TBB;
537 assert(MBP.TrueDest && "expected!");
538 MBP.FalseDest = FBB ? FBB : MBB.getNextNode();
539
540 MBP.ConditionDef = nullptr;
541 MBP.SingleUseCondition = false;
542
543 // Find the conditional branch. After analyzeBranch succeeds with non-empty
544 // Cond, there's exactly one conditional branch - either last (fallthrough)
545 // or second-to-last (followed by unconditional B).
546 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
547 if (I == MBB.end())
548 return true;
549
550 if (isUncondBranchOpcode(Opc: I->getOpcode())) {
551 if (I == MBB.begin())
552 return true;
553 --I;
554 }
555
556 MachineInstr *CondBranch = &*I;
557 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
558
559 switch (CondBranch->getOpcode()) {
560 default:
561 return true;
562
563 case AArch64::Bcc:
564 // Bcc takes the NZCV flag as the operand to branch on, walk up the
565 // instruction stream to find the last instruction to define NZCV.
566 for (MachineInstr &MI : llvm::drop_begin(RangeOrContainer: llvm::reverse(C&: MBB))) {
567 if (MI.modifiesRegister(Reg: AArch64::NZCV, /*TRI=*/nullptr)) {
568 MBP.ConditionDef = &MI;
569 break;
570 }
571 }
572 return false;
573
574 case AArch64::CBZW:
575 case AArch64::CBZX:
576 case AArch64::CBNZW:
577 case AArch64::CBNZX: {
578 MBP.LHS = CondBranch->getOperand(i: 0);
579 MBP.RHS = MachineOperand::CreateImm(Val: 0);
580 unsigned Opc = CondBranch->getOpcode();
581 MBP.Predicate = (Opc == AArch64::CBNZX || Opc == AArch64::CBNZW)
582 ? MachineBranchPredicate::PRED_NE
583 : MachineBranchPredicate::PRED_EQ;
584 Register CondReg = MBP.LHS.getReg();
585 if (CondReg.isVirtual())
586 MBP.ConditionDef = MRI.getVRegDef(Reg: CondReg);
587 return false;
588 }
589
590 case AArch64::TBZW:
591 case AArch64::TBZX:
592 case AArch64::TBNZW:
593 case AArch64::TBNZX: {
594 Register CondReg = CondBranch->getOperand(i: 0).getReg();
595 if (CondReg.isVirtual())
596 MBP.ConditionDef = MRI.getVRegDef(Reg: CondReg);
597 return false;
598 }
599 }
600}
601
602bool AArch64InstrInfo::reverseBranchCondition(
603 SmallVectorImpl<MachineOperand> &Cond) const {
604 if (Cond[0].getImm() != -1) {
605 // Regular Bcc
606 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
607 Cond[0].setImm(AArch64CC::getInvertedCondCode(Code: CC));
608 } else {
609 // Folded compare-and-branch
610 switch (Cond[1].getImm()) {
611 default:
612 llvm_unreachable("Unknown conditional branch!");
613 case AArch64::CBZW:
614 Cond[1].setImm(AArch64::CBNZW);
615 break;
616 case AArch64::CBNZW:
617 Cond[1].setImm(AArch64::CBZW);
618 break;
619 case AArch64::CBZX:
620 Cond[1].setImm(AArch64::CBNZX);
621 break;
622 case AArch64::CBNZX:
623 Cond[1].setImm(AArch64::CBZX);
624 break;
625 case AArch64::TBZW:
626 Cond[1].setImm(AArch64::TBNZW);
627 break;
628 case AArch64::TBNZW:
629 Cond[1].setImm(AArch64::TBZW);
630 break;
631 case AArch64::TBZX:
632 Cond[1].setImm(AArch64::TBNZX);
633 break;
634 case AArch64::TBNZX:
635 Cond[1].setImm(AArch64::TBZX);
636 break;
637
638 // Cond is { -1, Opcode, CC, Op0, Op1, ... }
639 case AArch64::CBWPri:
640 case AArch64::CBXPri:
641 case AArch64::CBBAssertExt:
642 case AArch64::CBHAssertExt:
643 case AArch64::CBWPrr:
644 case AArch64::CBXPrr: {
      // Pseudos using the standard 4-bit Arm condition codes
646 AArch64CC::CondCode CC =
647 static_cast<AArch64CC::CondCode>(Cond[2].getImm());
648 Cond[2].setImm(AArch64CC::getInvertedCondCode(Code: CC));
649 }
650 }
651 }
652
653 return false;
654}
655
656unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
657 int *BytesRemoved) const {
658 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
659 if (I == MBB.end())
660 return 0;
661
662 if (!isUncondBranchOpcode(Opc: I->getOpcode()) &&
663 !isCondBranchOpcode(Opc: I->getOpcode()))
664 return 0;
665
666 // Remove the branch.
667 I->eraseFromParent();
668
669 I = MBB.end();
670
671 if (I == MBB.begin()) {
672 if (BytesRemoved)
673 *BytesRemoved = 4;
674 return 1;
675 }
676 --I;
677 if (!isCondBranchOpcode(Opc: I->getOpcode())) {
678 if (BytesRemoved)
679 *BytesRemoved = 4;
680 return 1;
681 }
682
683 // Remove the branch.
684 I->eraseFromParent();
685 if (BytesRemoved)
686 *BytesRemoved = 8;
687
688 return 2;
689}
690
691void AArch64InstrInfo::instantiateCondBranch(
692 MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
693 ArrayRef<MachineOperand> Cond) const {
694 if (Cond[0].getImm() != -1) {
695 // Regular Bcc
696 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AArch64::Bcc)).addImm(Val: Cond[0].getImm()).addMBB(MBB: TBB);
697 } else {
698 // Folded compare-and-branch
    // Note that we use add(MachineOperand) instead of addReg to preserve the
    // register flags.
700
701 // cbz, cbnz
702 const MachineInstrBuilder MIB =
703 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: Cond[1].getImm())).add(MO: Cond[2]);
704
705 // tbz/tbnz
706 if (Cond.size() > 3)
707 MIB.add(MO: Cond[3]);
708
709 // cb
710 if (Cond.size() > 4)
711 MIB.add(MO: Cond[4]);
712
713 MIB.addMBB(MBB: TBB);
714
715 // cb[b,h]
716 if (Cond.size() > 5) {
717 MIB.addImm(Val: Cond[5].getImm());
718 MIB.addImm(Val: Cond[6].getImm());
719 }
720 }
721}
722
723unsigned AArch64InstrInfo::insertBranch(
724 MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
725 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
726 // Shouldn't be a fall through.
727 assert(TBB && "insertBranch must not be told to insert a fallthrough");
728
729 if (!FBB) {
730 if (Cond.empty()) // Unconditional branch?
731 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AArch64::B)).addMBB(MBB: TBB);
732 else
733 instantiateCondBranch(MBB, DL, TBB, Cond);
734
735 if (BytesAdded)
736 *BytesAdded = 4;
737
738 return 1;
739 }
740
741 // Two-way conditional branch.
742 instantiateCondBranch(MBB, DL, TBB, Cond);
743 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AArch64::B)).addMBB(MBB: FBB);
744
745 if (BytesAdded)
746 *BytesAdded = 8;
747
748 return 2;
749}
750
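// Simplify terminators whose outcome is statically known: CBZ/TBZ of WZR/XZR
// is always taken, so it is rewritten to an unconditional B and the dead
// successors and trailing terminators are removed; CBNZ/TBNZ of WZR/XZR is
// never taken, so the branch and its successor edge are removed. Returns true
// if the block was changed.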
751bool llvm::optimizeTerminators(MachineBasicBlock *MBB,
752 const TargetInstrInfo &TII) {
753 for (MachineInstr &MI : MBB->terminators()) {
754 unsigned Opc = MI.getOpcode();
755 switch (Opc) {
756 case AArch64::CBZW:
757 case AArch64::CBZX:
758 case AArch64::TBZW:
759 case AArch64::TBZX:
760 // CBZ/TBZ with WZR/XZR -> unconditional B
761 if (MI.getOperand(i: 0).getReg() == AArch64::WZR ||
762 MI.getOperand(i: 0).getReg() == AArch64::XZR) {
763 DEBUG_WITH_TYPE("optimizeTerminators",
764 dbgs() << "Removing always taken branch: " << MI);
765 MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
766 SmallVector<MachineBasicBlock *> Succs(MBB->successors());
767 for (auto *S : Succs)
768 if (S != Target)
769 MBB->removeSuccessor(Succ: S);
770 DebugLoc DL = MI.getDebugLoc();
771 while (MBB->rbegin() != &MI)
772 MBB->rbegin()->eraseFromParent();
773 MI.eraseFromParent();
774 BuildMI(BB: MBB, MIMD: DL, MCID: TII.get(Opcode: AArch64::B)).addMBB(MBB: Target);
775 return true;
776 }
777 break;
778 case AArch64::CBNZW:
779 case AArch64::CBNZX:
780 case AArch64::TBNZW:
781 case AArch64::TBNZX:
782 // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
783 if (MI.getOperand(i: 0).getReg() == AArch64::WZR ||
784 MI.getOperand(i: 0).getReg() == AArch64::XZR) {
785 DEBUG_WITH_TYPE("optimizeTerminators",
786 dbgs() << "Removing never taken branch: " << MI);
787 MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
788 MI.getParent()->removeSuccessor(Succ: Target);
789 MI.eraseFromParent();
790 return true;
791 }
792 break;
793 }
794 }
795 return false;
796}
797
798// Find the original register that VReg is copied from.
799static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
800 while (Register::isVirtualRegister(Reg: VReg)) {
801 const MachineInstr *DefMI = MRI.getVRegDef(Reg: VReg);
802 if (!DefMI->isFullCopy())
803 return VReg;
804 VReg = DefMI->getOperand(i: 1).getReg();
805 }
806 return VReg;
807}
808
809// Determine if VReg is defined by an instruction that can be folded into a
810// csel instruction. If so, return the folded opcode, and the replacement
811// register.
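// Recognized definitions and the csel variant they fold into:
//   mov  #1 (MOVi32imm/MOVi64imm, also via SUBREG_TO_REG)  -> csinc zr
//   add  src, #1                                           -> csinc src
//   orn  zr, src  (i.e. ~src)                              -> csinv src
//   sub  zr, src  (i.e. -src)                              -> csneg src
// The flag-setting ADDS/SUBS forms are only folded when NZCV is dead.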
812static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
813 unsigned *NewReg = nullptr) {
814 VReg = removeCopies(MRI, VReg);
815 if (!Register::isVirtualRegister(Reg: VReg))
816 return 0;
817
818 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(RC: MRI.getRegClass(Reg: VReg));
819 const MachineInstr *DefMI = MRI.getVRegDef(Reg: VReg);
820 unsigned Opc = 0;
821 unsigned SrcReg = 0;
822 switch (DefMI->getOpcode()) {
823 case AArch64::SUBREG_TO_REG:
    // Check for the following way to define a 64-bit immediate:
825 // %0:gpr32 = MOVi32imm 1
826 // %1:gpr64 = SUBREG_TO_REG 0, %0:gpr32, %subreg.sub_32
827 if (!DefMI->getOperand(i: 1).isImm() || DefMI->getOperand(i: 1).getImm() != 0)
828 return 0;
829 if (!DefMI->getOperand(i: 2).isReg())
830 return 0;
831 if (!DefMI->getOperand(i: 3).isImm() ||
832 DefMI->getOperand(i: 3).getImm() != AArch64::sub_32)
833 return 0;
834 DefMI = MRI.getVRegDef(Reg: DefMI->getOperand(i: 2).getReg());
835 if (DefMI->getOpcode() != AArch64::MOVi32imm)
836 return 0;
837 if (!DefMI->getOperand(i: 1).isImm() || DefMI->getOperand(i: 1).getImm() != 1)
838 return 0;
839 assert(Is64Bit);
840 SrcReg = AArch64::XZR;
841 Opc = AArch64::CSINCXr;
842 break;
843
844 case AArch64::MOVi32imm:
845 case AArch64::MOVi64imm:
846 if (!DefMI->getOperand(i: 1).isImm() || DefMI->getOperand(i: 1).getImm() != 1)
847 return 0;
848 SrcReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
849 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
850 break;
851
852 case AArch64::ADDSXri:
853 case AArch64::ADDSWri:
854 // if NZCV is used, do not fold.
855 if (DefMI->findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr,
856 isDead: true) == -1)
857 return 0;
858 // fall-through to ADDXri and ADDWri.
859 [[fallthrough]];
860 case AArch64::ADDXri:
861 case AArch64::ADDWri:
862 // add x, 1 -> csinc.
863 if (!DefMI->getOperand(i: 2).isImm() || DefMI->getOperand(i: 2).getImm() != 1 ||
864 DefMI->getOperand(i: 3).getImm() != 0)
865 return 0;
866 SrcReg = DefMI->getOperand(i: 1).getReg();
867 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
868 break;
869
870 case AArch64::ORNXrr:
871 case AArch64::ORNWrr: {
872 // not x -> csinv, represented as orn dst, xzr, src.
873 unsigned ZReg = removeCopies(MRI, VReg: DefMI->getOperand(i: 1).getReg());
874 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
875 return 0;
876 SrcReg = DefMI->getOperand(i: 2).getReg();
877 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
878 break;
879 }
880
881 case AArch64::SUBSXrr:
882 case AArch64::SUBSWrr:
883 // if NZCV is used, do not fold.
884 if (DefMI->findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr,
885 isDead: true) == -1)
886 return 0;
887 // fall-through to SUBXrr and SUBWrr.
888 [[fallthrough]];
889 case AArch64::SUBXrr:
890 case AArch64::SUBWrr: {
891 // neg x -> csneg, represented as sub dst, xzr, src.
892 unsigned ZReg = removeCopies(MRI, VReg: DefMI->getOperand(i: 1).getReg());
893 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
894 return 0;
895 SrcReg = DefMI->getOperand(i: 2).getReg();
896 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
897 break;
898 }
899 default:
900 return 0;
901 }
902 assert(Opc && SrcReg && "Missing parameters");
903
904 if (NewReg)
905 *NewReg = SrcReg;
906 return Opc;
907}
908
909bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
910 ArrayRef<MachineOperand> Cond,
911 Register DstReg, Register TrueReg,
912 Register FalseReg, int &CondCycles,
913 int &TrueCycles,
914 int &FalseCycles) const {
915 // Check register classes.
916 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
917 const TargetRegisterClass *RC =
918 RI.getCommonSubClass(A: MRI.getRegClass(Reg: TrueReg), B: MRI.getRegClass(Reg: FalseReg));
919 if (!RC)
920 return false;
921
922 // Also need to check the dest regclass, in case we're trying to optimize
923 // something like:
924 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
925 if (!RI.getCommonSubClass(A: RC, B: MRI.getRegClass(Reg: DstReg)))
926 return false;
927
928 // Expanding cbz/tbz requires an extra cycle of latency on the condition.
929 unsigned ExtraCondLat = Cond.size() != 1;
930
931 // GPRs are handled by csel.
932 // FIXME: Fold in x+1, -x, and ~x when applicable.
933 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
934 AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
935 // Single-cycle csel, csinc, csinv, and csneg.
936 CondCycles = 1 + ExtraCondLat;
937 TrueCycles = FalseCycles = 1;
938 if (canFoldIntoCSel(MRI, VReg: TrueReg))
939 TrueCycles = 0;
940 else if (canFoldIntoCSel(MRI, VReg: FalseReg))
941 FalseCycles = 0;
942 return true;
943 }
944
945 // Scalar floating point is handled by fcsel.
946 // FIXME: Form fabs, fmin, and fmax when applicable.
947 if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
948 AArch64::FPR32RegClass.hasSubClassEq(RC)) {
949 CondCycles = 5 + ExtraCondLat;
950 TrueCycles = FalseCycles = 2;
951 return true;
952 }
953
954 // Can't do vectors.
955 return false;
956}
957
958void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
959 MachineBasicBlock::iterator I,
960 const DebugLoc &DL, Register DstReg,
961 ArrayRef<MachineOperand> Cond,
962 Register TrueReg, Register FalseReg) const {
963 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
964
965 // Parse the condition code, see parseCondBranch() above.
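  // Cond.size() identifies the branch form encoded by parseCondBranch():
  //   1 -> b.cc, 3 -> cbz/cbnz, 4 -> tbz/tbnz, 5 -> cb, 7 -> cb[b,h].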
966 AArch64CC::CondCode CC;
967 switch (Cond.size()) {
968 default:
969 llvm_unreachable("Unknown condition opcode in Cond");
970 case 1: // b.cc
971 CC = AArch64CC::CondCode(Cond[0].getImm());
972 break;
973 case 3: { // cbz/cbnz
974 // We must insert a compare against 0.
975 bool Is64Bit;
976 switch (Cond[1].getImm()) {
977 default:
978 llvm_unreachable("Unknown branch opcode in Cond");
979 case AArch64::CBZW:
980 Is64Bit = false;
981 CC = AArch64CC::EQ;
982 break;
983 case AArch64::CBZX:
984 Is64Bit = true;
985 CC = AArch64CC::EQ;
986 break;
987 case AArch64::CBNZW:
988 Is64Bit = false;
989 CC = AArch64CC::NE;
990 break;
991 case AArch64::CBNZX:
992 Is64Bit = true;
993 CC = AArch64CC::NE;
994 break;
995 }
996 Register SrcReg = Cond[2].getReg();
997 if (Is64Bit) {
998 // cmp reg, #0 is actually subs xzr, reg, #0.
999 MRI.constrainRegClass(Reg: SrcReg, RC: &AArch64::GPR64spRegClass);
1000 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::SUBSXri), DestReg: AArch64::XZR)
1001 .addReg(RegNo: SrcReg)
1002 .addImm(Val: 0)
1003 .addImm(Val: 0);
1004 } else {
1005 MRI.constrainRegClass(Reg: SrcReg, RC: &AArch64::GPR32spRegClass);
1006 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::SUBSWri), DestReg: AArch64::WZR)
1007 .addReg(RegNo: SrcReg)
1008 .addImm(Val: 0)
1009 .addImm(Val: 0);
1010 }
1011 break;
1012 }
1013 case 4: { // tbz/tbnz
1014 // We must insert a tst instruction.
1015 switch (Cond[1].getImm()) {
1016 default:
1017 llvm_unreachable("Unknown branch opcode in Cond");
1018 case AArch64::TBZW:
1019 case AArch64::TBZX:
1020 CC = AArch64CC::EQ;
1021 break;
1022 case AArch64::TBNZW:
1023 case AArch64::TBNZX:
1024 CC = AArch64CC::NE;
1025 break;
1026 }
1027 // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
1028 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
1029 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ANDSWri), DestReg: AArch64::WZR)
1030 .addReg(RegNo: Cond[2].getReg())
1031 .addImm(
1032 Val: AArch64_AM::encodeLogicalImmediate(imm: 1ull << Cond[3].getImm(), regSize: 32));
1033 else
1034 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ANDSXri), DestReg: AArch64::XZR)
1035 .addReg(RegNo: Cond[2].getReg())
1036 .addImm(
1037 Val: AArch64_AM::encodeLogicalImmediate(imm: 1ull << Cond[3].getImm(), regSize: 64));
1038 break;
1039 }
1040 case 5: { // cb
    // We must insert a cmp, i.e. a subs.
1042 // 0 1 2 3 4
1043 // Cond is { -1, Opcode, CC, Op0, Op1 }
1044
1045 unsigned SubsOpc, SubsDestReg;
1046 bool IsImm = false;
1047 CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
1048 switch (Cond[1].getImm()) {
1049 default:
1050 llvm_unreachable("Unknown branch opcode in Cond");
1051 case AArch64::CBWPri:
1052 SubsOpc = AArch64::SUBSWri;
1053 SubsDestReg = AArch64::WZR;
1054 IsImm = true;
1055 break;
1056 case AArch64::CBXPri:
1057 SubsOpc = AArch64::SUBSXri;
1058 SubsDestReg = AArch64::XZR;
1059 IsImm = true;
1060 break;
1061 case AArch64::CBWPrr:
1062 SubsOpc = AArch64::SUBSWrr;
1063 SubsDestReg = AArch64::WZR;
1064 IsImm = false;
1065 break;
1066 case AArch64::CBXPrr:
1067 SubsOpc = AArch64::SUBSXrr;
1068 SubsDestReg = AArch64::XZR;
1069 IsImm = false;
1070 break;
1071 }
1072
1073 if (IsImm)
1074 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: SubsOpc), DestReg: SubsDestReg)
1075 .addReg(RegNo: Cond[3].getReg())
1076 .addImm(Val: Cond[4].getImm())
1077 .addImm(Val: 0);
1078 else
1079 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: SubsOpc), DestReg: SubsDestReg)
1080 .addReg(RegNo: Cond[3].getReg())
1081 .addReg(RegNo: Cond[4].getReg());
1082 } break;
1083 case 7: { // cb[b,h]
    // We must insert a cmp (i.e. a subs), and also materialize the zero- or
    // sign-extensions that were folded into the branch. The first operand gets
    // an explicit extension; for the second operand the extension is folded
    // into the cmp itself.
1087 // 0 1 2 3 4 5 6
1088 // Cond is { -1, Opcode, CC, Op0, Op1, Ext0, Ext1 }
1089
1090 // We need a new register for the now explicitly extended register
1091 Register Reg = Cond[4].getReg();
1092 if (Cond[5].getImm() != AArch64_AM::InvalidShiftExtend) {
1093 unsigned ExtOpc;
1094 unsigned ExtBits;
1095 AArch64_AM::ShiftExtendType ExtendType =
1096 AArch64_AM::getExtendType(Imm: Cond[5].getImm());
1097 switch (ExtendType) {
1098 default:
1099 llvm_unreachable("Unknown shift-extend for CB instruction");
1100 case AArch64_AM::SXTB:
1101 assert(
1102 Cond[1].getImm() == AArch64::CBBAssertExt &&
1103 "Unexpected compare-and-branch instruction for SXTB shift-extend");
1104 ExtOpc = AArch64::SBFMWri;
1105 ExtBits = AArch64_AM::encodeLogicalImmediate(imm: 0xff, regSize: 32);
1106 break;
1107 case AArch64_AM::SXTH:
1108 assert(
1109 Cond[1].getImm() == AArch64::CBHAssertExt &&
1110 "Unexpected compare-and-branch instruction for SXTH shift-extend");
1111 ExtOpc = AArch64::SBFMWri;
1112 ExtBits = AArch64_AM::encodeLogicalImmediate(imm: 0xffff, regSize: 32);
1113 break;
1114 case AArch64_AM::UXTB:
1115 assert(
1116 Cond[1].getImm() == AArch64::CBBAssertExt &&
1117 "Unexpected compare-and-branch instruction for UXTB shift-extend");
1118 ExtOpc = AArch64::ANDWri;
1119 ExtBits = AArch64_AM::encodeLogicalImmediate(imm: 0xff, regSize: 32);
1120 break;
1121 case AArch64_AM::UXTH:
1122 assert(
1123 Cond[1].getImm() == AArch64::CBHAssertExt &&
1124 "Unexpected compare-and-branch instruction for UXTH shift-extend");
1125 ExtOpc = AArch64::ANDWri;
1126 ExtBits = AArch64_AM::encodeLogicalImmediate(imm: 0xffff, regSize: 32);
1127 break;
1128 }
1129
1130 // Build the explicit extension of the first operand
1131 Reg = MRI.createVirtualRegister(RegClass: &AArch64::GPR32spRegClass);
1132 MachineInstrBuilder MBBI =
1133 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: ExtOpc), DestReg: Reg).addReg(RegNo: Cond[4].getReg());
1134 if (ExtOpc != AArch64::ANDWri)
1135 MBBI.addImm(Val: 0);
1136 MBBI.addImm(Val: ExtBits);
1137 }
1138
1139 // Now, subs with an extended second operand
1140 if (Cond[6].getImm() != AArch64_AM::InvalidShiftExtend) {
1141 AArch64_AM::ShiftExtendType ExtendType =
1142 AArch64_AM::getExtendType(Imm: Cond[6].getImm());
1143 MRI.constrainRegClass(Reg, RC: MRI.getRegClass(Reg: Cond[3].getReg()));
1144 MRI.constrainRegClass(Reg: Cond[3].getReg(), RC: &AArch64::GPR32spRegClass);
1145 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::SUBSWrx), DestReg: AArch64::WZR)
1146 .addReg(RegNo: Cond[3].getReg())
1147 .addReg(RegNo: Reg)
1148 .addImm(Val: AArch64_AM::getArithExtendImm(ET: ExtendType, Imm: 0));
1149 } // If no extension is needed, just a regular subs
1150 else {
1151 MRI.constrainRegClass(Reg, RC: MRI.getRegClass(Reg: Cond[3].getReg()));
1152 MRI.constrainRegClass(Reg: Cond[3].getReg(), RC: &AArch64::GPR32spRegClass);
1153 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::SUBSWrr), DestReg: AArch64::WZR)
1154 .addReg(RegNo: Cond[3].getReg())
1155 .addReg(RegNo: Reg);
1156 }
1157
1158 CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
1159 } break;
1160 }
1161
1162 unsigned Opc = 0;
1163 const TargetRegisterClass *RC = nullptr;
1164 bool TryFold = false;
1165 if (MRI.constrainRegClass(Reg: DstReg, RC: &AArch64::GPR64RegClass)) {
1166 RC = &AArch64::GPR64RegClass;
1167 Opc = AArch64::CSELXr;
1168 TryFold = true;
1169 } else if (MRI.constrainRegClass(Reg: DstReg, RC: &AArch64::GPR32RegClass)) {
1170 RC = &AArch64::GPR32RegClass;
1171 Opc = AArch64::CSELWr;
1172 TryFold = true;
1173 } else if (MRI.constrainRegClass(Reg: DstReg, RC: &AArch64::FPR64RegClass)) {
1174 RC = &AArch64::FPR64RegClass;
1175 Opc = AArch64::FCSELDrrr;
1176 } else if (MRI.constrainRegClass(Reg: DstReg, RC: &AArch64::FPR32RegClass)) {
1177 RC = &AArch64::FPR32RegClass;
1178 Opc = AArch64::FCSELSrrr;
1179 }
1180 assert(RC && "Unsupported regclass");
1181
1182 // Try folding simple instructions into the csel.
1183 if (TryFold) {
1184 unsigned NewReg = 0;
1185 unsigned FoldedOpc = canFoldIntoCSel(MRI, VReg: TrueReg, NewReg: &NewReg);
1186 if (FoldedOpc) {
      // The folded opcodes csinc, csinv and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
1189 CC = AArch64CC::getInvertedCondCode(Code: CC);
1190 TrueReg = FalseReg;
1191 } else
1192 FoldedOpc = canFoldIntoCSel(MRI, VReg: FalseReg, NewReg: &NewReg);
1193
1194 // Fold the operation. Leave any dead instructions for DCE to clean up.
1195 if (FoldedOpc) {
1196 FalseReg = NewReg;
1197 Opc = FoldedOpc;
1198 // Extend the live range of NewReg.
1199 MRI.clearKillFlags(Reg: NewReg);
1200 }
1201 }
1202
  // Pull all virtual registers into the appropriate class.
1204 MRI.constrainRegClass(Reg: TrueReg, RC);
1205 // FalseReg might be WZR or XZR if the folded operand is a literal 1.
1206 assert(
1207 (FalseReg.isVirtual() || FalseReg == AArch64::WZR ||
1208 FalseReg == AArch64::XZR) &&
1209 "FalseReg was folded into a non-virtual register other than WZR or XZR");
1210 if (FalseReg.isVirtual())
1211 MRI.constrainRegClass(Reg: FalseReg, RC);
1212
1213 // Insert the csel.
1214 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: Opc), DestReg: DstReg)
1215 .addReg(RegNo: TrueReg)
1216 .addReg(RegNo: FalseReg)
1217 .addImm(Val: CC);
1218}
1219
1220// Return true if Imm can be loaded into a register by a "cheap" sequence of
1221// instructions. For now, "cheap" means at most two instructions.
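// For example, 0x0000ffff00000000 is a single shifted MOVZ and
// 0x00ff00ff00ff00ff is a single logical-immediate ORR, so both are cheap,
// whereas an arbitrary 64-bit constant may need up to four MOVZ/MOVK
// instructions.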
1222static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
1223 if (BitSize == 32)
1224 return true;
1225
1226 assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
1227 uint64_t Imm = static_cast<uint64_t>(MI.getOperand(i: 1).getImm());
1228 SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
1229 AArch64_IMM::expandMOVImm(Imm, BitSize, Insn&: Is);
1230
1231 return Is.size() <= 2;
1232}
1233
1234// Check if a COPY instruction is cheap.
1235static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) {
1236 assert(MI.isCopy() && "Expected COPY instruction");
1237 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
1238
1239 // Cross-bank copies (e.g., between GPR and FPR) are expensive on AArch64,
1240 // typically requiring an FMOV instruction with a 2-6 cycle latency.
1241 auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * {
1242 if (Reg.isVirtual())
1243 return MRI.getRegClass(Reg);
1244 if (Reg.isPhysical())
1245 return RI.getMinimalPhysRegClass(Reg);
1246 return nullptr;
1247 };
1248 const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(i: 0).getReg());
1249 const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(i: 1).getReg());
1250 if (DstRC && SrcRC && !RI.getCommonSubClass(A: DstRC, B: SrcRC))
1251 return false;
1252
1253 return MI.isAsCheapAsAMove();
1254}
1255
1256// FIXME: this implementation should be micro-architecture dependent, so a
1257// micro-architecture target hook should be introduced here in future.
1258bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
1259 if (Subtarget.hasExynosCheapAsMoveHandling()) {
1260 if (isExynosCheapAsMove(MI))
1261 return true;
1262 return MI.isAsCheapAsAMove();
1263 }
1264
1265 switch (MI.getOpcode()) {
1266 default:
1267 return MI.isAsCheapAsAMove();
1268
1269 case TargetOpcode::COPY:
1270 return isCheapCopy(MI, RI);
1271
1272 case AArch64::ADDWrs:
1273 case AArch64::ADDXrs:
1274 case AArch64::SUBWrs:
1275 case AArch64::SUBXrs:
1276 return Subtarget.hasALULSLFast() && MI.getOperand(i: 3).getImm() <= 4;
1277
1278 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
1279 // ORRXri, it is as cheap as MOV.
1280 // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
1281 case AArch64::MOVi32imm:
1282 return isCheapImmediate(MI, BitSize: 32);
1283 case AArch64::MOVi64imm:
1284 return isCheapImmediate(MI, BitSize: 64);
1285 }
1286}
1287
1288bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
1289 switch (MI.getOpcode()) {
1290 default:
1291 return false;
1292
1293 case AArch64::ADDWrs:
1294 case AArch64::ADDXrs:
1295 case AArch64::ADDSWrs:
1296 case AArch64::ADDSXrs: {
1297 unsigned Imm = MI.getOperand(i: 3).getImm();
1298 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
1299 if (ShiftVal == 0)
1300 return true;
1301 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
1302 }
1303
1304 case AArch64::ADDWrx:
1305 case AArch64::ADDXrx:
1306 case AArch64::ADDXrx64:
1307 case AArch64::ADDSWrx:
1308 case AArch64::ADDSXrx:
1309 case AArch64::ADDSXrx64: {
1310 unsigned Imm = MI.getOperand(i: 3).getImm();
1311 switch (AArch64_AM::getArithExtendType(Imm)) {
1312 default:
1313 return false;
1314 case AArch64_AM::UXTB:
1315 case AArch64_AM::UXTH:
1316 case AArch64_AM::UXTW:
1317 case AArch64_AM::UXTX:
1318 return AArch64_AM::getArithShiftValue(Imm) <= 4;
1319 }
1320 }
1321
1322 case AArch64::SUBWrs:
1323 case AArch64::SUBSWrs: {
1324 unsigned Imm = MI.getOperand(i: 3).getImm();
1325 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
1326 return ShiftVal == 0 ||
1327 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
1328 }
1329
1330 case AArch64::SUBXrs:
1331 case AArch64::SUBSXrs: {
1332 unsigned Imm = MI.getOperand(i: 3).getImm();
1333 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
1334 return ShiftVal == 0 ||
1335 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
1336 }
1337
1338 case AArch64::SUBWrx:
1339 case AArch64::SUBXrx:
1340 case AArch64::SUBXrx64:
1341 case AArch64::SUBSWrx:
1342 case AArch64::SUBSXrx:
1343 case AArch64::SUBSXrx64: {
1344 unsigned Imm = MI.getOperand(i: 3).getImm();
1345 switch (AArch64_AM::getArithExtendType(Imm)) {
1346 default:
1347 return false;
1348 case AArch64_AM::UXTB:
1349 case AArch64_AM::UXTH:
1350 case AArch64_AM::UXTW:
1351 case AArch64_AM::UXTX:
1352 return AArch64_AM::getArithShiftValue(Imm) == 0;
1353 }
1354 }
1355
1356 case AArch64::LDRBBroW:
1357 case AArch64::LDRBBroX:
1358 case AArch64::LDRBroW:
1359 case AArch64::LDRBroX:
1360 case AArch64::LDRDroW:
1361 case AArch64::LDRDroX:
1362 case AArch64::LDRHHroW:
1363 case AArch64::LDRHHroX:
1364 case AArch64::LDRHroW:
1365 case AArch64::LDRHroX:
1366 case AArch64::LDRQroW:
1367 case AArch64::LDRQroX:
1368 case AArch64::LDRSBWroW:
1369 case AArch64::LDRSBWroX:
1370 case AArch64::LDRSBXroW:
1371 case AArch64::LDRSBXroX:
1372 case AArch64::LDRSHWroW:
1373 case AArch64::LDRSHWroX:
1374 case AArch64::LDRSHXroW:
1375 case AArch64::LDRSHXroX:
1376 case AArch64::LDRSWroW:
1377 case AArch64::LDRSWroX:
1378 case AArch64::LDRSroW:
1379 case AArch64::LDRSroX:
1380 case AArch64::LDRWroW:
1381 case AArch64::LDRWroX:
1382 case AArch64::LDRXroW:
1383 case AArch64::LDRXroX:
1384 case AArch64::PRFMroW:
1385 case AArch64::PRFMroX:
1386 case AArch64::STRBBroW:
1387 case AArch64::STRBBroX:
1388 case AArch64::STRBroW:
1389 case AArch64::STRBroX:
1390 case AArch64::STRDroW:
1391 case AArch64::STRDroX:
1392 case AArch64::STRHHroW:
1393 case AArch64::STRHHroX:
1394 case AArch64::STRHroW:
1395 case AArch64::STRHroX:
1396 case AArch64::STRQroW:
1397 case AArch64::STRQroX:
1398 case AArch64::STRSroW:
1399 case AArch64::STRSroX:
1400 case AArch64::STRWroW:
1401 case AArch64::STRWroX:
1402 case AArch64::STRXroW:
1403 case AArch64::STRXroX: {
1404 unsigned IsSigned = MI.getOperand(i: 3).getImm();
1405 return !IsSigned;
1406 }
1407 }
1408}
1409
1410bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1411 unsigned Opc = MI.getOpcode();
1412 switch (Opc) {
1413 default:
1414 return false;
1415 case AArch64::SEH_StackAlloc:
1416 case AArch64::SEH_SaveFPLR:
1417 case AArch64::SEH_SaveFPLR_X:
1418 case AArch64::SEH_SaveReg:
1419 case AArch64::SEH_SaveReg_X:
1420 case AArch64::SEH_SaveRegP:
1421 case AArch64::SEH_SaveRegP_X:
1422 case AArch64::SEH_SaveFReg:
1423 case AArch64::SEH_SaveFReg_X:
1424 case AArch64::SEH_SaveFRegP:
1425 case AArch64::SEH_SaveFRegP_X:
1426 case AArch64::SEH_SetFP:
1427 case AArch64::SEH_AddFP:
1428 case AArch64::SEH_Nop:
1429 case AArch64::SEH_PrologEnd:
1430 case AArch64::SEH_EpilogStart:
1431 case AArch64::SEH_EpilogEnd:
1432 case AArch64::SEH_PACSignLR:
1433 case AArch64::SEH_SaveAnyRegI:
1434 case AArch64::SEH_SaveAnyRegIP:
1435 case AArch64::SEH_SaveAnyRegQP:
1436 case AArch64::SEH_SaveAnyRegQPX:
1437 case AArch64::SEH_AllocZ:
1438 case AArch64::SEH_SaveZReg:
1439 case AArch64::SEH_SavePReg:
1440 return true;
1441 }
1442}
1443
1444bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
1445 Register &SrcReg, Register &DstReg,
1446 unsigned &SubIdx) const {
1447 switch (MI.getOpcode()) {
1448 default:
1449 return false;
1450 case AArch64::SBFMXri: // aka sxtw
1451 case AArch64::UBFMXri: // aka uxtw
1452 // Check for the 32 -> 64 bit extension case, these instructions can do
1453 // much more.
1454 if (MI.getOperand(i: 2).getImm() != 0 || MI.getOperand(i: 3).getImm() != 31)
1455 return false;
1456 // This is a signed or unsigned 32 -> 64 bit extension.
1457 SrcReg = MI.getOperand(i: 1).getReg();
1458 DstReg = MI.getOperand(i: 0).getReg();
1459 SubIdx = AArch64::sub_32;
1460 return true;
1461 }
1462}
1463
1464bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
1465 const MachineInstr &MIa, const MachineInstr &MIb) const {
1466 const TargetRegisterInfo *TRI = &getRegisterInfo();
1467 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1468 int64_t OffsetA = 0, OffsetB = 0;
1469 TypeSize WidthA(0, false), WidthB(0, false);
1470 bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1471
1472 assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1473 assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1474
1475 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
1476 MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1477 return false;
1478
  // Retrieve the base operand, the offset from the base, and the width. The
  // width is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8).
  // If the bases are identical and the offset of the lower memory access plus
  // its width does not reach the offset of the higher memory access, then the
  // two accesses do not overlap.
1484 // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1485 // are assumed to have the same scale (vscale).
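  // For example, an 8-byte store at [x0, #0] and a load from [x0, #8] share a
  // base and satisfy 0 + 8 <= 8, so they cannot overlap.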
1486 if (getMemOperandWithOffsetWidth(MI: MIa, BaseOp&: BaseOpA, Offset&: OffsetA, OffsetIsScalable&: OffsetAIsScalable,
1487 Width&: WidthA, TRI) &&
1488 getMemOperandWithOffsetWidth(MI: MIb, BaseOp&: BaseOpB, Offset&: OffsetB, OffsetIsScalable&: OffsetBIsScalable,
1489 Width&: WidthB, TRI)) {
1490 if (BaseOpA->isIdenticalTo(Other: *BaseOpB) &&
1491 OffsetAIsScalable == OffsetBIsScalable) {
1492 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1493 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1494 TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1495 if (LowWidth.isScalable() == OffsetAIsScalable &&
1496 LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
1497 return true;
1498 }
1499 }
1500 return false;
1501}
1502
1503bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1504 const MachineBasicBlock *MBB,
1505 const MachineFunction &MF) const {
1506 if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1507 return true;
1508
1509 // Do not move an instruction that can be recognized as a branch target.
1510 if (hasBTISemantics(MI))
1511 return true;
1512
1513 switch (MI.getOpcode()) {
1514 case AArch64::HINT:
1515 // CSDB hints are scheduling barriers.
1516 if (MI.getOperand(i: 0).getImm() == 0x14)
1517 return true;
1518 break;
1519 case AArch64::DSB:
1520 case AArch64::ISB:
1521 // DSB and ISB also are scheduling barriers.
1522 return true;
1523 case AArch64::MSRpstatesvcrImm1:
1524 // SMSTART and SMSTOP are also scheduling barriers.
1525 return true;
1526 default:;
1527 }
1528 if (isSEHInstruction(MI))
1529 return true;
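  // An instruction immediately followed by a CFI instruction is also treated
  // as a boundary so that scheduling does not separate the two.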
1530 auto Next = std::next(x: MI.getIterator());
1531 return Next != MBB->end() && Next->isCFIInstruction();
1532}
1533
1534/// analyzeCompare - For a comparison instruction, return the source registers
1535/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1536/// Return true if the comparison instruction can be analyzed.
1537bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
1538 Register &SrcReg2, int64_t &CmpMask,
1539 int64_t &CmpValue) const {
1540 // The first operand can be a frame index where we'd normally expect a
1541 // register.
1542 // FIXME: Pass subregisters out of analyzeCompare
1543 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1544 if (!MI.getOperand(i: 1).isReg() || MI.getOperand(i: 1).getSubReg())
1545 return false;
1546
1547 switch (MI.getOpcode()) {
1548 default:
1549 break;
1550 case AArch64::PTEST_PP:
1551 case AArch64::PTEST_PP_ANY:
1552 case AArch64::PTEST_PP_FIRST:
1553 SrcReg = MI.getOperand(i: 0).getReg();
1554 SrcReg2 = MI.getOperand(i: 1).getReg();
1555 if (MI.getOperand(i: 2).getSubReg())
1556 return false;
1557
1558 // Not sure about the mask and value for now...
1559 CmpMask = ~0;
1560 CmpValue = 0;
1561 return true;
1562 case AArch64::SUBSWrr:
1563 case AArch64::SUBSWrs:
1564 case AArch64::SUBSWrx:
1565 case AArch64::SUBSXrr:
1566 case AArch64::SUBSXrs:
1567 case AArch64::SUBSXrx:
1568 case AArch64::ADDSWrr:
1569 case AArch64::ADDSWrs:
1570 case AArch64::ADDSWrx:
1571 case AArch64::ADDSXrr:
1572 case AArch64::ADDSXrs:
1573 case AArch64::ADDSXrx:
1574 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1575 SrcReg = MI.getOperand(i: 1).getReg();
1576 SrcReg2 = MI.getOperand(i: 2).getReg();
1577
1578 // FIXME: Pass subregisters out of analyzeCompare
1579 if (MI.getOperand(i: 2).getSubReg())
1580 return false;
1581
1582 CmpMask = ~0;
1583 CmpValue = 0;
1584 return true;
1585 case AArch64::SUBSWri:
1586 case AArch64::ADDSWri:
1587 case AArch64::SUBSXri:
1588 case AArch64::ADDSXri:
1589 SrcReg = MI.getOperand(i: 1).getReg();
1590 SrcReg2 = 0;
1591 CmpMask = ~0;
1592 CmpValue = MI.getOperand(i: 2).getImm();
1593 return true;
1594 case AArch64::ANDSWri:
1595 case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the other xxxS
    // instructions.
1598 SrcReg = MI.getOperand(i: 1).getReg();
1599 SrcReg2 = 0;
1600 CmpMask = ~0;
1601 CmpValue = AArch64_AM::decodeLogicalImmediate(
1602 val: MI.getOperand(i: 2).getImm(),
1603 regSize: MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1604 return true;
1605 }
1606
1607 return false;
1608}
1609
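// Constrain every register operand of Instr to the register class its operand
// constraint requires, if any. Returns false if a physical operand lies
// outside the required class or a virtual operand cannot be constrained.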
1610static bool UpdateOperandRegClass(MachineInstr &Instr) {
1611 MachineBasicBlock *MBB = Instr.getParent();
1612 assert(MBB && "Can't get MachineBasicBlock here");
1613 MachineFunction *MF = MBB->getParent();
1614 assert(MF && "Can't get MachineFunction here");
1615 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1616 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1617 MachineRegisterInfo *MRI = &MF->getRegInfo();
1618
1619 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1620 ++OpIdx) {
1621 MachineOperand &MO = Instr.getOperand(i: OpIdx);
1622 const TargetRegisterClass *OpRegCstraints =
1623 Instr.getRegClassConstraint(OpIdx, TII, TRI);
1624
1625 // If there's no constraint, there's nothing to do.
1626 if (!OpRegCstraints)
1627 continue;
1628 // If the operand is a frame index, there's nothing to do here.
1629 // A frame index operand will resolve correctly during PEI.
1630 if (MO.isFI())
1631 continue;
1632
1633 assert(MO.isReg() &&
1634 "Operand has register constraints without being a register!");
1635
1636 Register Reg = MO.getReg();
1637 if (Reg.isPhysical()) {
1638 if (!OpRegCstraints->contains(Reg))
1639 return false;
1640 } else if (!OpRegCstraints->hasSubClassEq(RC: MRI->getRegClass(Reg)) &&
1641 !MRI->constrainRegClass(Reg, RC: OpRegCstraints))
1642 return false;
1643 }
1644
1645 return true;
1646}
1647
/// Return the opcode that does not set flags when possible; otherwise return
/// the original opcode. The caller is responsible for doing the actual
/// substitution and legality checking.
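///
/// For example (illustrative), 'cmn w1, #4', i.e. ADDSWri with WZR as the
/// destination, is deliberately left as ADDSWri: in the non-flag-setting
/// ADDWri encoding, destination register 31 names WSP rather than WZR.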
1651static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
  // Don't convert all compare instructions: for some of them, dropping the
  // flag-setting form would turn the zero-register destination encoding into
  // the SP register.
1654 bool MIDefinesZeroReg = false;
1655 if (MI.definesRegister(Reg: AArch64::WZR, /*TRI=*/nullptr) ||
1656 MI.definesRegister(Reg: AArch64::XZR, /*TRI=*/nullptr))
1657 MIDefinesZeroReg = true;
1658
1659 switch (MI.getOpcode()) {
1660 default:
1661 return MI.getOpcode();
1662 case AArch64::ADDSWrr:
1663 return AArch64::ADDWrr;
1664 case AArch64::ADDSWri:
1665 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1666 case AArch64::ADDSWrs:
1667 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1668 case AArch64::ADDSWrx:
1669 return AArch64::ADDWrx;
1670 case AArch64::ADDSXrr:
1671 return AArch64::ADDXrr;
1672 case AArch64::ADDSXri:
1673 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1674 case AArch64::ADDSXrs:
1675 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1676 case AArch64::ADDSXrx:
1677 return AArch64::ADDXrx;
1678 case AArch64::SUBSWrr:
1679 return AArch64::SUBWrr;
1680 case AArch64::SUBSWri:
1681 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1682 case AArch64::SUBSWrs:
1683 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1684 case AArch64::SUBSWrx:
1685 return AArch64::SUBWrx;
1686 case AArch64::SUBSXrr:
1687 return AArch64::SUBXrr;
1688 case AArch64::SUBSXri:
1689 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1690 case AArch64::SUBSXrs:
1691 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1692 case AArch64::SUBSXrx:
1693 return AArch64::SUBXrx;
1694 }
1695}
1696
1697enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1698
1699/// True when condition flags are accessed (either by writing or reading)
1700/// on the instruction trace starting at From and ending at To.
1701///
/// Note: If From and To are in different blocks, the condition flags are
/// conservatively assumed to be accessed on the path.
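///
/// \p AccessToCheck selects which kinds of accesses count: writes only
/// (AK_Write), reads only (AK_Read), or both (AK_All, the default).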
1704static bool areCFlagsAccessedBetweenInstrs(
1705 MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1706 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1707 // Early exit if To is at the beginning of the BB.
1708 if (To == To->getParent()->begin())
1709 return true;
1710
1711 // Check whether the instructions are in the same basic block
1712 // If not, assume the condition flags might get modified somewhere.
1713 if (To->getParent() != From->getParent())
1714 return true;
1715
1716 // From must be above To.
1717 assert(std::any_of(
1718 ++To.getReverse(), To->getParent()->rend(),
1719 [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1720
1721 // We iterate backward starting at \p To until we hit \p From.
1722 for (const MachineInstr &Instr :
1723 instructionsWithoutDebug(It: ++To.getReverse(), End: From.getReverse())) {
1724 if (((AccessToCheck & AK_Write) &&
1725 Instr.modifiesRegister(Reg: AArch64::NZCV, TRI)) ||
1726 ((AccessToCheck & AK_Read) && Instr.readsRegister(Reg: AArch64::NZCV, TRI)))
1727 return true;
1728 }
1729 return false;
1730}
1731
1732std::optional<unsigned>
1733AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
1734 MachineInstr *Pred,
1735 const MachineRegisterInfo *MRI) const {
1736 unsigned MaskOpcode = Mask->getOpcode();
1737 unsigned PredOpcode = Pred->getOpcode();
1738 bool PredIsPTestLike = isPTestLikeOpcode(Opc: PredOpcode);
1739 bool PredIsWhileLike = isWhileOpcode(Opc: PredOpcode);
1740
1741 if (PredIsWhileLike) {
1742 // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
    // instruction and the condition is "any" since WHILEcc does an implicit
1744 // PTEST(ALL, PG) check and PG is always a subset of ALL.
1745 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1746 return PredOpcode;
1747
1748 // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1749 // redundant since WHILE performs an implicit PTEST with an all active
1750 // mask.
1751 if (isPTrueOpcode(Opc: MaskOpcode) && Mask->getOperand(i: 1).getImm() == 31 &&
1752 getElementSizeForOpcode(Opc: MaskOpcode) ==
1753 getElementSizeForOpcode(Opc: PredOpcode))
1754 return PredOpcode;
1755
1756 // For PTEST_FIRST(PTRUE_ALL, WHILE), the PTEST_FIRST is redundant since
1757 // WHILEcc performs an implicit PTEST with an all active mask, setting
1758 // the N flag as the PTEST_FIRST would.
1759 if (PTest->getOpcode() == AArch64::PTEST_PP_FIRST &&
1760 isPTrueOpcode(Opc: MaskOpcode) && Mask->getOperand(i: 1).getImm() == 31)
1761 return PredOpcode;
1762
1763 return {};
1764 }
1765
1766 if (PredIsPTestLike) {
1767 // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1768 // instruction that sets the flags as PTEST would and the condition is
1769 // "any" since PG is always a subset of the governing predicate of the
1770 // ptest-like instruction.
1771 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1772 return PredOpcode;
1773
1774 auto PTestLikeMask = MRI->getUniqueVRegDef(Reg: Pred->getOperand(i: 1).getReg());
1775
1776 // If the PTEST like instruction's general predicate is not `Mask`, attempt
1777 // to look through a copy and try again. This is because some instructions
1778 // take a predicate whose register class is a subset of its result class.
1779 if (Mask != PTestLikeMask && PTestLikeMask->isFullCopy() &&
1780 PTestLikeMask->getOperand(i: 1).getReg().isVirtual())
1781 PTestLikeMask =
1782 MRI->getUniqueVRegDef(Reg: PTestLikeMask->getOperand(i: 1).getReg());
1783
    // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
    // element size matches and either the PTEST_LIKE instruction uses
1786 // the same all active mask or the condition is "any".
1787 if (isPTrueOpcode(Opc: MaskOpcode) && Mask->getOperand(i: 1).getImm() == 31 &&
1788 getElementSizeForOpcode(Opc: MaskOpcode) ==
1789 getElementSizeForOpcode(Opc: PredOpcode)) {
1790 if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1791 return PredOpcode;
1792 }
1793
1794 // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1795 // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1796 // on 8-bit predicates like the PTEST. Otherwise, for instructions like
1797 // compare that also support 16/32/64-bit predicates, the implicit PTEST
1798 // performed by the compare could consider fewer lanes for these element
1799 // sizes.
1800 //
1801 // For example, consider
1802 //
1803 // ptrue p0.b ; P0=1111-1111-1111-1111
1804 // index z0.s, #0, #1 ; Z0=<0,1,2,3>
1805 // index z1.s, #1, #1 ; Z1=<1,2,3,4>
1806 // cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001
1807 // ; ^ last active
1808 // ptest p0, p1.b ; P1=0001-0001-0001-0001
1809 // ; ^ last active
1810 //
1811 // where the compare generates a canonical all active 32-bit predicate
1812 // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1813 // active flag, whereas the PTEST instruction with the same mask doesn't.
1814 // For PTEST_ANY this doesn't apply as the flags in this case would be
1815 // identical regardless of element size.
1816 uint64_t PredElementSize = getElementSizeForOpcode(Opc: PredOpcode);
1817 if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
1818 PTest->getOpcode() == AArch64::PTEST_PP_ANY))
1819 return PredOpcode;
1820
1821 return {};
1822 }
1823
1824 // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1825 // opcode so the PTEST becomes redundant.
1826 switch (PredOpcode) {
1827 case AArch64::AND_PPzPP:
1828 case AArch64::BIC_PPzPP:
1829 case AArch64::EOR_PPzPP:
1830 case AArch64::NAND_PPzPP:
1831 case AArch64::NOR_PPzPP:
1832 case AArch64::ORN_PPzPP:
1833 case AArch64::ORR_PPzPP:
1834 case AArch64::BRKA_PPzP:
1835 case AArch64::BRKPA_PPzPP:
1836 case AArch64::BRKB_PPzP:
1837 case AArch64::BRKPB_PPzPP:
1838 case AArch64::RDFFR_PPz: {
    // Check to see if our mask is the same. If not, the resulting flag bits
    // may be different and we can't remove the ptest.
1841 auto *PredMask = MRI->getUniqueVRegDef(Reg: Pred->getOperand(i: 1).getReg());
1842 if (Mask != PredMask)
1843 return {};
1844 break;
1845 }
1846 case AArch64::BRKN_PPzP: {
1847 // BRKN uses an all active implicit mask to set flags unlike the other
1848 // flag-setting instructions.
1849 // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1850 if ((MaskOpcode != AArch64::PTRUE_B) ||
1851 (Mask->getOperand(i: 1).getImm() != 31))
1852 return {};
1853 break;
1854 }
1855 case AArch64::PTRUE_B:
1856 // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1857 break;
1858 default:
1859 // Bail out if we don't recognize the input
1860 return {};
1861 }
1862
1863 return convertToFlagSettingOpc(Opc: PredOpcode);
1864}
1865
/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
/// operation that could set the flags in an identical manner.
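///
/// A simplified illustration (register choices are arbitrary): if the ptest's
/// mask is also the governing predicate of the AND, a sequence such as
/// \code
///   and   p1.b, p0/z, p2.b, p3.b
///   ptest p0, p1.b
///   b.ne  <target>
/// \endcode
/// can be rewritten as
/// \code
///   ands  p1.b, p0/z, p2.b, p3.b
///   b.ne  <target>
/// \endcode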
1868bool AArch64InstrInfo::optimizePTestInstr(
1869 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1870 const MachineRegisterInfo *MRI) const {
1871 auto *Mask = MRI->getUniqueVRegDef(Reg: MaskReg);
1872 auto *Pred = MRI->getUniqueVRegDef(Reg: PredReg);
1873
1874 if (Pred->isCopy() && PTest->getOpcode() == AArch64::PTEST_PP_FIRST) {
1875 // Instructions which return a multi-vector (e.g. WHILECC_x2) require copies
1876 // before the branch to extract each subregister.
1877 auto Op = Pred->getOperand(i: 1);
1878 if (Op.isReg() && Op.getReg().isVirtual() &&
1879 Op.getSubReg() == AArch64::psub0)
1880 Pred = MRI->getUniqueVRegDef(Reg: Op.getReg());
1881 }
1882
1883 unsigned PredOpcode = Pred->getOpcode();
1884 auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1885 if (!NewOp)
1886 return false;
1887
1888 const TargetRegisterInfo *TRI = &getRegisterInfo();
1889
1890 // If another instruction between Pred and PTest accesses flags, don't remove
1891 // the ptest or update the earlier instruction to modify them.
1892 if (areCFlagsAccessedBetweenInstrs(From: Pred, To: PTest, TRI))
1893 return false;
1894
1895 // If we pass all the checks, it's safe to remove the PTEST and use the flags
1896 // as they are prior to PTEST. Sometimes this requires the tested PTEST
1897 // operand to be replaced with an equivalent instruction that also sets the
1898 // flags.
1899 PTest->eraseFromParent();
1900 if (*NewOp != PredOpcode) {
1901 Pred->setDesc(get(Opcode: *NewOp));
1902 bool succeeded = UpdateOperandRegClass(Instr&: *Pred);
1903 (void)succeeded;
1904 assert(succeeded && "Operands have incompatible register classes!");
1905 Pred->addRegisterDefined(Reg: AArch64::NZCV, RegInfo: TRI);
1906 }
1907
1908 // Ensure that the flags def is live.
1909 if (Pred->registerDefIsDead(Reg: AArch64::NZCV, TRI)) {
1910 unsigned i = 0, e = Pred->getNumOperands();
1911 for (; i != e; ++i) {
1912 MachineOperand &MO = Pred->getOperand(i);
1913 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1914 MO.setIsDead(false);
1915 break;
1916 }
1917 }
1918 }
1919 return true;
1920}
1921
/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It is only truly a compare
/// instruction when there are no uses of its destination register.
1926///
1927/// The following steps are tried in order:
/// 1. Convert CmpInstr into a non-flag-setting version if NZCV is unused.
/// 2. Remove CmpInstr if there is an earlier instruction that produces the
///    needed condition code, or an instruction that can be converted into
///    such an instruction.
1932/// Only comparison with zero is supported.
1933bool AArch64InstrInfo::optimizeCompareInstr(
1934 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1935 int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1936 assert(CmpInstr.getParent());
1937 assert(MRI);
1938
1939 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1940 int DeadNZCVIdx =
1941 CmpInstr.findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true);
1942 if (DeadNZCVIdx != -1) {
1943 if (CmpInstr.definesRegister(Reg: AArch64::WZR, /*TRI=*/nullptr) ||
1944 CmpInstr.definesRegister(Reg: AArch64::XZR, /*TRI=*/nullptr)) {
1945 CmpInstr.eraseFromParent();
1946 return true;
1947 }
1948 unsigned Opc = CmpInstr.getOpcode();
1949 unsigned NewOpc = convertToNonFlagSettingOpc(MI: CmpInstr);
1950 if (NewOpc == Opc)
1951 return false;
1952 const MCInstrDesc &MCID = get(Opcode: NewOpc);
1953 CmpInstr.setDesc(MCID);
1954 CmpInstr.removeOperand(OpNo: DeadNZCVIdx);
1955 bool succeeded = UpdateOperandRegClass(Instr&: CmpInstr);
1956 (void)succeeded;
1957 assert(succeeded && "Some operands reg class are incompatible!");
1958 return true;
1959 }
1960
1961 if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
1962 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY ||
1963 CmpInstr.getOpcode() == AArch64::PTEST_PP_FIRST)
1964 return optimizePTestInstr(PTest: &CmpInstr, MaskReg: SrcReg, PredReg: SrcReg2, MRI);
1965
1966 if (SrcReg2 != 0)
1967 return false;
1968
1969 // CmpInstr is a Compare instruction if destination register is not used.
1970 if (!MRI->use_nodbg_empty(RegNo: CmpInstr.getOperand(i: 0).getReg()))
1971 return false;
1972
1973 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, MRI: *MRI))
1974 return true;
1975 return (CmpValue == 0 || CmpValue == 1) &&
1976 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, MRI: *MRI);
1977}
1978
/// Get the opcode of the S (flag-setting) version of Instr.
/// If Instr is already the S version, its opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
/// version or we are not interested in it.
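///
/// For example, sForm maps ADDWrr to ADDSWrr, and returns ADDSWrr unchanged.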
1983static unsigned sForm(MachineInstr &Instr) {
1984 switch (Instr.getOpcode()) {
1985 default:
1986 return AArch64::INSTRUCTION_LIST_END;
1987
1988 case AArch64::ADDSWrr:
1989 case AArch64::ADDSWri:
1990 case AArch64::ADDSXrr:
1991 case AArch64::ADDSXri:
1992 case AArch64::ADDSWrx:
1993 case AArch64::ADDSXrx:
1994 case AArch64::SUBSWrr:
1995 case AArch64::SUBSWri:
1996 case AArch64::SUBSWrx:
1997 case AArch64::SUBSXrr:
1998 case AArch64::SUBSXri:
1999 case AArch64::SUBSXrx:
2000 case AArch64::ANDSWri:
2001 case AArch64::ANDSWrr:
2002 case AArch64::ANDSWrs:
2003 case AArch64::ANDSXri:
2004 case AArch64::ANDSXrr:
2005 case AArch64::ANDSXrs:
2006 case AArch64::BICSWrr:
2007 case AArch64::BICSXrr:
2008 case AArch64::BICSWrs:
2009 case AArch64::BICSXrs:
2010 return Instr.getOpcode();
2011
2012 case AArch64::ADDWrr:
2013 return AArch64::ADDSWrr;
2014 case AArch64::ADDWri:
2015 return AArch64::ADDSWri;
2016 case AArch64::ADDXrr:
2017 return AArch64::ADDSXrr;
2018 case AArch64::ADDXri:
2019 return AArch64::ADDSXri;
2020 case AArch64::ADDWrx:
2021 return AArch64::ADDSWrx;
2022 case AArch64::ADDXrx:
2023 return AArch64::ADDSXrx;
2024 case AArch64::ADCWr:
2025 return AArch64::ADCSWr;
2026 case AArch64::ADCXr:
2027 return AArch64::ADCSXr;
2028 case AArch64::SUBWrr:
2029 return AArch64::SUBSWrr;
2030 case AArch64::SUBWri:
2031 return AArch64::SUBSWri;
2032 case AArch64::SUBXrr:
2033 return AArch64::SUBSXrr;
2034 case AArch64::SUBXri:
2035 return AArch64::SUBSXri;
2036 case AArch64::SUBWrx:
2037 return AArch64::SUBSWrx;
2038 case AArch64::SUBXrx:
2039 return AArch64::SUBSXrx;
2040 case AArch64::SBCWr:
2041 return AArch64::SBCSWr;
2042 case AArch64::SBCXr:
2043 return AArch64::SBCSXr;
2044 case AArch64::ANDWri:
2045 return AArch64::ANDSWri;
2046 case AArch64::ANDXri:
2047 return AArch64::ANDSXri;
2048 case AArch64::ANDWrr:
2049 return AArch64::ANDSWrr;
2050 case AArch64::ANDWrs:
2051 return AArch64::ANDSWrs;
2052 case AArch64::ANDXrr:
2053 return AArch64::ANDSXrr;
2054 case AArch64::ANDXrs:
2055 return AArch64::ANDSXrs;
2056 case AArch64::BICWrr:
2057 return AArch64::BICSWrr;
2058 case AArch64::BICXrr:
2059 return AArch64::BICSXrr;
2060 case AArch64::BICWrs:
2061 return AArch64::BICSWrs;
2062 case AArch64::BICXrs:
2063 return AArch64::BICSXrs;
2064 }
2065}
2066
2067/// Check if AArch64::NZCV should be alive in successors of MBB.
2068static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
2069 for (auto *BB : MBB->successors())
2070 if (BB->isLiveIn(Reg: AArch64::NZCV))
2071 return true;
2072 return false;
2073}
2074
2075/// \returns The condition code operand index for \p Instr if it is a branch
2076/// or select and -1 otherwise.
2077static int
2078findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
2079 switch (Instr.getOpcode()) {
2080 default:
2081 return -1;
2082
2083 case AArch64::Bcc: {
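    // Bcc's explicit operands are (cond, target); the implicit NZCV use
    // follows them, so the condition code sits two operands before that use.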
2084 int Idx = Instr.findRegisterUseOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr);
2085 assert(Idx >= 2);
2086 return Idx - 2;
2087 }
2088
2089 case AArch64::CSINVWr:
2090 case AArch64::CSINVXr:
2091 case AArch64::CSINCWr:
2092 case AArch64::CSINCXr:
2093 case AArch64::CSELWr:
2094 case AArch64::CSELXr:
2095 case AArch64::CSNEGWr:
2096 case AArch64::CSNEGXr:
2097 case AArch64::FCSELSrrr:
2098 case AArch64::FCSELDrrr: {
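    // For the select forms the condition code is the explicit operand
    // immediately before the implicit NZCV use.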
2099 int Idx = Instr.findRegisterUseOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr);
2100 assert(Idx >= 1);
2101 return Idx - 1;
2102 }
2103 }
2104}
2105
2106/// Find a condition code used by the instruction.
2107/// Returns AArch64CC::Invalid if either the instruction does not use condition
2108/// codes or we don't optimize CmpInstr in the presence of such instructions.
2109static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
2110 int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
2111 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
2112 Instr.getOperand(i: CCIdx).getImm())
2113 : AArch64CC::Invalid;
2114}
2115
2116static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
2117 assert(CC != AArch64CC::Invalid);
2118 UsedNZCV UsedFlags;
2119 switch (CC) {
2120 default:
2121 break;
2122
2123 case AArch64CC::EQ: // Z set
2124 case AArch64CC::NE: // Z clear
2125 UsedFlags.Z = true;
2126 break;
2127
2128 case AArch64CC::HI: // Z clear and C set
2129 case AArch64CC::LS: // Z set or C clear
2130 UsedFlags.Z = true;
2131 [[fallthrough]];
2132 case AArch64CC::HS: // C set
2133 case AArch64CC::LO: // C clear
2134 UsedFlags.C = true;
2135 break;
2136
2137 case AArch64CC::MI: // N set
2138 case AArch64CC::PL: // N clear
2139 UsedFlags.N = true;
2140 break;
2141
2142 case AArch64CC::VS: // V set
2143 case AArch64CC::VC: // V clear
2144 UsedFlags.V = true;
2145 break;
2146
2147 case AArch64CC::GT: // Z clear, N and V the same
2148 case AArch64CC::LE: // Z set, N and V differ
2149 UsedFlags.Z = true;
2150 [[fallthrough]];
2151 case AArch64CC::GE: // N and V the same
2152 case AArch64CC::LT: // N and V differ
2153 UsedFlags.N = true;
2154 UsedFlags.V = true;
2155 break;
2156 }
2157 return UsedFlags;
2158}
2159
/// \returns Condition flags used after \p CmpInstr in its MachineBB if NZCV
/// flags are not alive in the successors of the block containing \p CmpInstr
/// and \p MI. \returns std::nullopt otherwise.
///
/// If \p CCUseInstrs is provided, the instructions using those flags are
/// collected into it.
2165std::optional<UsedNZCV>
2166llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
2167 const TargetRegisterInfo &TRI,
2168 SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
2169 MachineBasicBlock *CmpParent = CmpInstr.getParent();
2170 if (MI.getParent() != CmpParent)
2171 return std::nullopt;
2172
2173 if (areCFlagsAliveInSuccessors(MBB: CmpParent))
2174 return std::nullopt;
2175
2176 UsedNZCV NZCVUsedAfterCmp;
2177 for (MachineInstr &Instr : instructionsWithoutDebug(
2178 It: std::next(x: CmpInstr.getIterator()), End: CmpParent->instr_end())) {
2179 if (Instr.readsRegister(Reg: AArch64::NZCV, TRI: &TRI)) {
2180 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
2181 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
2182 return std::nullopt;
2183 NZCVUsedAfterCmp |= getUsedNZCV(CC);
2184 if (CCUseInstrs)
2185 CCUseInstrs->push_back(Elt: &Instr);
2186 }
2187 if (Instr.modifiesRegister(Reg: AArch64::NZCV, TRI: &TRI))
2188 break;
2189 }
2190 return NZCVUsedAfterCmp;
2191}
2192
2193static bool isADDSRegImm(unsigned Opcode) {
2194 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
2195}
2196
2197static bool isSUBSRegImm(unsigned Opcode) {
2198 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
2199}
2200
2201static bool isANDOpcode(MachineInstr &MI) {
2202 unsigned Opc = sForm(Instr&: MI);
2203 switch (Opc) {
2204 case AArch64::ANDSWri:
2205 case AArch64::ANDSWrr:
2206 case AArch64::ANDSWrs:
2207 case AArch64::ANDSXri:
2208 case AArch64::ANDSXrr:
2209 case AArch64::ANDSXrs:
2210 case AArch64::BICSWrr:
2211 case AArch64::BICSXrr:
2212 case AArch64::BICSWrs:
2213 case AArch64::BICSXrs:
2214 return true;
2215 default:
2216 return false;
2217 }
2218}
2219
2220/// Check if CmpInstr can be substituted by MI.
2221///
2222/// CmpInstr can be substituted:
2223/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2224/// - and, MI and CmpInstr are from the same MachineBB
2225/// - and, condition flags are not alive in successors of the CmpInstr parent
2226/// - and, if MI opcode is the S form there must be no defs of flags between
2227/// MI and CmpInstr
2228/// or if MI opcode is not the S form there must be neither defs of flags
2229/// nor uses of flags between MI and CmpInstr.
/// - and, the C flag is not used after CmpInstr
/// - and, the V flag is not used after CmpInstr, or MI produces a poison
///   value on signed overflow (no-signed-wrap), or MI is an AND/BIC, which
///   always clears V.
2233static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
2234 const TargetRegisterInfo &TRI) {
  // NOTE: this assertion guarantees that MI.getOpcode() has a flag-setting S
  // form (an add, sub, and or bic variant) that may or may not set flags.
2237 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
2238
2239 const unsigned CmpOpcode = CmpInstr.getOpcode();
2240 if (!isADDSRegImm(Opcode: CmpOpcode) && !isSUBSRegImm(Opcode: CmpOpcode))
2241 return false;
2242
2243 assert((CmpInstr.getOperand(2).isImm() &&
2244 CmpInstr.getOperand(2).getImm() == 0) &&
2245 "Caller guarantees that CmpInstr compares with constant 0");
2246
  std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(MI, CmpInstr, TRI);
  if (!NZCVUsed || NZCVUsed->C)
2249 return false;
2250
2251 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
2252 // '%vreg = add ...' or '%vreg = sub ...'.
2253 // Condition flag V is used to indicate signed overflow.
2254 // 1) MI and CmpInstr set N and V to the same value.
2255 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
2256 // signed overflow occurs, so CmpInstr could still be simplified away.
2257 // Note that Ands and Bics instructions always clear the V flag.
  if (NZCVUsed->V && !MI.getFlag(MachineInstr::NoSWrap) && !isANDOpcode(MI))
2259 return false;
2260
2261 AccessKind AccessToCheck = AK_Write;
2262 if (sForm(Instr&: MI) != MI.getOpcode())
2263 AccessToCheck = AK_All;
2264 return !areCFlagsAccessedBetweenInstrs(From: &MI, To: &CmpInstr, TRI: &TRI, AccessToCheck);
2265}
2266
2267/// Substitute an instruction comparing to zero with another instruction
2268/// which produces needed condition flags.
2269///
2270/// Return true on success.
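///
/// The net effect on the generated code is roughly (illustrative registers):
/// \code
///   sub  w8, w0, w1
///   cmp  w8, #0
///   b.eq <target>
/// \endcode
/// to
/// \code
///   subs w8, w0, w1
///   b.eq <target>
/// \endcode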
2271bool AArch64InstrInfo::substituteCmpToZero(
2272 MachineInstr &CmpInstr, unsigned SrcReg,
2273 const MachineRegisterInfo &MRI) const {
2274 // Get the unique definition of SrcReg.
2275 MachineInstr *MI = MRI.getUniqueVRegDef(Reg: SrcReg);
2276 if (!MI)
2277 return false;
2278
2279 const TargetRegisterInfo &TRI = getRegisterInfo();
2280
2281 unsigned NewOpc = sForm(Instr&: *MI);
2282 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
2283 return false;
2284
2285 if (!canInstrSubstituteCmpInstr(MI&: *MI, CmpInstr, TRI))
2286 return false;
2287
2288 // Update the instruction to set NZCV.
2289 MI->setDesc(get(Opcode: NewOpc));
2290 CmpInstr.eraseFromParent();
2291 bool succeeded = UpdateOperandRegClass(Instr&: *MI);
2292 (void)succeeded;
2293 assert(succeeded && "Some operands reg class are incompatible!");
2294 MI->addRegisterDefined(Reg: AArch64::NZCV, RegInfo: &TRI);
2295 return true;
2296}
2297
2298/// \returns True if \p CmpInstr can be removed.
2299///
2300/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
2301/// codes used in \p CCUseInstrs must be inverted.
2302static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
2303 int CmpValue, const TargetRegisterInfo &TRI,
2304 SmallVectorImpl<MachineInstr *> &CCUseInstrs,
2305 bool &IsInvertCC) {
2306 assert((CmpValue == 0 || CmpValue == 1) &&
2307 "Only comparisons to 0 or 1 considered for removal!");
2308
2309 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
2310 unsigned MIOpc = MI.getOpcode();
2311 if (MIOpc == AArch64::CSINCWr) {
2312 if (MI.getOperand(i: 1).getReg() != AArch64::WZR ||
2313 MI.getOperand(i: 2).getReg() != AArch64::WZR)
2314 return false;
2315 } else if (MIOpc == AArch64::CSINCXr) {
2316 if (MI.getOperand(i: 1).getReg() != AArch64::XZR ||
2317 MI.getOperand(i: 2).getReg() != AArch64::XZR)
2318 return false;
2319 } else {
2320 return false;
2321 }
2322 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(Instr: MI);
2323 if (MICC == AArch64CC::Invalid)
2324 return false;
2325
2326 // NZCV needs to be defined
2327 if (MI.findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true) != -1)
2328 return false;
2329
2330 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
2331 const unsigned CmpOpcode = CmpInstr.getOpcode();
2332 bool IsSubsRegImm = isSUBSRegImm(Opcode: CmpOpcode);
2333 if (CmpValue && !IsSubsRegImm)
2334 return false;
2335 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(Opcode: CmpOpcode))
2336 return false;
2337
2338 // MI conditions allowed: eq, ne, mi, pl
2339 UsedNZCV MIUsedNZCV = getUsedNZCV(CC: MICC);
2340 if (MIUsedNZCV.C || MIUsedNZCV.V)
2341 return false;
2342
2343 std::optional<UsedNZCV> NZCVUsedAfterCmp =
2344 examineCFlagsUse(MI, CmpInstr, TRI, CCUseInstrs: &CCUseInstrs);
  // Condition flags are not used in CmpInstr's basic block successors, and
  // only the Z or N flags are allowed to be used after CmpInstr within its
  // basic block.
2347 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
2348 return false;
2349 // Z or N flag used after CmpInstr must correspond to the flag used in MI
2350 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
2351 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
2352 return false;
  // If CmpInstr is a comparison to zero, MI conditions are limited to eq, ne.
2354 if (MIUsedNZCV.N && !CmpValue)
2355 return false;
2356
2357 // There must be no defs of flags between MI and CmpInstr
2358 if (areCFlagsAccessedBetweenInstrs(From: &MI, To: &CmpInstr, TRI: &TRI, AccessToCheck: AK_Write))
2359 return false;
2360
2361 // Condition code is inverted in the following cases:
2362 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2363 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
2364 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
2365 (!CmpValue && MICC == AArch64CC::NE);
2366 return true;
2367}
2368
2369/// Remove comparison in csinc-cmp sequence
2370///
2371/// Examples:
2372/// 1. \code
2373/// csinc w9, wzr, wzr, ne
2374/// cmp w9, #0
2375/// b.eq
2376/// \endcode
2377/// to
2378/// \code
2379/// csinc w9, wzr, wzr, ne
2380/// b.ne
2381/// \endcode
2382///
2383/// 2. \code
2384/// csinc x2, xzr, xzr, mi
2385/// cmp x2, #1
2386/// b.pl
2387/// \endcode
2388/// to
2389/// \code
2390/// csinc x2, xzr, xzr, mi
2391/// b.pl
2392/// \endcode
2393///
2394/// \param CmpInstr comparison instruction
2395/// \return True when comparison removed
2396bool AArch64InstrInfo::removeCmpToZeroOrOne(
2397 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
2398 const MachineRegisterInfo &MRI) const {
2399 MachineInstr *MI = MRI.getUniqueVRegDef(Reg: SrcReg);
2400 if (!MI)
2401 return false;
2402 const TargetRegisterInfo &TRI = getRegisterInfo();
2403 SmallVector<MachineInstr *, 4> CCUseInstrs;
2404 bool IsInvertCC = false;
2405 if (!canCmpInstrBeRemoved(MI&: *MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
2406 IsInvertCC))
2407 return false;
2408 // Make transformation
2409 CmpInstr.eraseFromParent();
2410 if (IsInvertCC) {
2411 // Invert condition codes in CmpInstr CC users
2412 for (MachineInstr *CCUseInstr : CCUseInstrs) {
2413 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(Instr: *CCUseInstr);
2414 assert(Idx >= 0 && "Unexpected instruction using CC.");
2415 MachineOperand &CCOperand = CCUseInstr->getOperand(i: Idx);
2416 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
2417 Code: static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
2418 CCOperand.setImm(CCUse);
2419 }
2420 }
2421 return true;
2422}
2423
2424bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2425 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
2426 MI.getOpcode() != AArch64::CATCHRET)
2427 return false;
2428
2429 MachineBasicBlock &MBB = *MI.getParent();
2430 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
2431 auto TRI = Subtarget.getRegisterInfo();
2432 DebugLoc DL = MI.getDebugLoc();
2433
2434 if (MI.getOpcode() == AArch64::CATCHRET) {
2435 // Skip to the first instruction before the epilog.
2436 const TargetInstrInfo *TII =
2437 MBB.getParent()->getSubtarget().getInstrInfo();
2438 MachineBasicBlock *TargetMBB = MI.getOperand(i: 0).getMBB();
2439 auto MBBI = MachineBasicBlock::iterator(MI);
2440 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(x: MBBI);
2441 while (FirstEpilogSEH->getFlag(Flag: MachineInstr::FrameDestroy) &&
2442 FirstEpilogSEH != MBB.begin())
2443 FirstEpilogSEH = std::prev(x: FirstEpilogSEH);
2444 if (FirstEpilogSEH != MBB.begin())
2445 FirstEpilogSEH = std::next(x: FirstEpilogSEH);
2446 BuildMI(BB&: MBB, I: FirstEpilogSEH, MIMD: DL, MCID: TII->get(Opcode: AArch64::ADRP))
2447 .addReg(RegNo: AArch64::X0, Flags: RegState::Define)
2448 .addMBB(MBB: TargetMBB);
2449 BuildMI(BB&: MBB, I: FirstEpilogSEH, MIMD: DL, MCID: TII->get(Opcode: AArch64::ADDXri))
2450 .addReg(RegNo: AArch64::X0, Flags: RegState::Define)
2451 .addReg(RegNo: AArch64::X0)
2452 .addMBB(MBB: TargetMBB)
2453 .addImm(Val: 0);
2454 TargetMBB->setMachineBlockAddressTaken();
2455 return true;
2456 }
2457
2458 Register Reg = MI.getOperand(i: 0).getReg();
2459 Module &M = *MBB.getParent()->getFunction().getParent();
2460 if (M.getStackProtectorGuard() == "sysreg") {
2461 const AArch64SysReg::SysReg *SrcReg =
2462 AArch64SysReg::lookupSysRegByName(Name: M.getStackProtectorGuardReg());
2463 if (!SrcReg)
2464 report_fatal_error(reason: "Unknown SysReg for Stack Protector Guard Register");
2465
2466 // mrs xN, sysreg
2467 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MRS))
2468 .addDef(RegNo: Reg, Flags: RegState::Renamable)
2469 .addImm(Val: SrcReg->Encoding);
2470 int Offset = M.getStackProtectorGuardOffset();
2471 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2472 // ldr xN, [xN, #offset]
2473 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui))
2474 .addDef(RegNo: Reg)
2475 .addUse(RegNo: Reg, Flags: RegState::Kill)
2476 .addImm(Val: Offset / 8);
2477 } else if (Offset >= -256 && Offset <= 255) {
2478 // ldur xN, [xN, #offset]
2479 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDURXi))
2480 .addDef(RegNo: Reg)
2481 .addUse(RegNo: Reg, Flags: RegState::Kill)
2482 .addImm(Val: Offset);
2483 } else if (Offset >= -4095 && Offset <= 4095) {
2484 if (Offset > 0) {
2485 // add xN, xN, #offset
2486 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::ADDXri))
2487 .addDef(RegNo: Reg)
2488 .addUse(RegNo: Reg, Flags: RegState::Kill)
2489 .addImm(Val: Offset)
2490 .addImm(Val: 0);
2491 } else {
2492 // sub xN, xN, #offset
2493 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::SUBXri))
2494 .addDef(RegNo: Reg)
2495 .addUse(RegNo: Reg, Flags: RegState::Kill)
2496 .addImm(Val: -Offset)
2497 .addImm(Val: 0);
2498 }
2499 // ldr xN, [xN]
2500 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui))
2501 .addDef(RegNo: Reg)
2502 .addUse(RegNo: Reg, Flags: RegState::Kill)
2503 .addImm(Val: 0);
2504 } else {
      // Cases that are larger than +/- 4095 and not a multiple of 8, or
      // larger than 32760.
2507 // It might be nice to use AArch64::MOVi32imm here, which would get
2508 // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2509 // contains the MRS result. findScratchNonCalleeSaveRegister() in
2510 // AArch64FrameLowering might help us find such a scratch register
2511 // though. If we failed to find a scratch register, we could emit a
2512 // stream of add instructions to build up the immediate. Or, we could try
2513 // to insert a AArch64::MOVi32imm before register allocation so that we
2514 // didn't need to scavenge for a scratch register.
2515 report_fatal_error(reason: "Unable to encode Stack Protector Guard Offset");
2516 }
2517 MBB.erase(I: MI);
2518 return true;
2519 }
2520
2521 const GlobalValue *GV =
2522 cast<GlobalValue>(Val: (*MI.memoperands_begin())->getValue());
2523 const TargetMachine &TM = MBB.getParent()->getTarget();
2524 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2525 const unsigned char MO_NC = AArch64II::MO_NC;
2526
2527 if ((OpFlags & AArch64II::MO_GOT) != 0) {
2528 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LOADgot), DestReg: Reg)
2529 .addGlobalAddress(GV, Offset: 0, TargetFlags: OpFlags);
2530 if (Subtarget.isTargetILP32()) {
2531 unsigned Reg32 = TRI->getSubReg(Reg, Idx: AArch64::sub_32);
2532 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRWui))
2533 .addDef(RegNo: Reg32, Flags: RegState::Dead)
2534 .addUse(RegNo: Reg, Flags: RegState::Kill)
2535 .addImm(Val: 0)
2536 .addMemOperand(MMO: *MI.memoperands_begin())
2537 .addDef(RegNo: Reg, Flags: RegState::Implicit);
2538 } else {
2539 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui), DestReg: Reg)
2540 .addReg(RegNo: Reg, Flags: RegState::Kill)
2541 .addImm(Val: 0)
2542 .addMemOperand(MMO: *MI.memoperands_begin());
2543 }
2544 } else if (TM.getCodeModel() == CodeModel::Large) {
2545 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
2546 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MOVZXi), DestReg: Reg)
2547 .addGlobalAddress(GV, Offset: 0, TargetFlags: AArch64II::MO_G0 | MO_NC)
2548 .addImm(Val: 0);
2549 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MOVKXi), DestReg: Reg)
2550 .addReg(RegNo: Reg, Flags: RegState::Kill)
2551 .addGlobalAddress(GV, Offset: 0, TargetFlags: AArch64II::MO_G1 | MO_NC)
2552 .addImm(Val: 16);
2553 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MOVKXi), DestReg: Reg)
2554 .addReg(RegNo: Reg, Flags: RegState::Kill)
2555 .addGlobalAddress(GV, Offset: 0, TargetFlags: AArch64II::MO_G2 | MO_NC)
2556 .addImm(Val: 32);
2557 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MOVKXi), DestReg: Reg)
2558 .addReg(RegNo: Reg, Flags: RegState::Kill)
2559 .addGlobalAddress(GV, Offset: 0, TargetFlags: AArch64II::MO_G3)
2560 .addImm(Val: 48);
2561 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui), DestReg: Reg)
2562 .addReg(RegNo: Reg, Flags: RegState::Kill)
2563 .addImm(Val: 0)
2564 .addMemOperand(MMO: *MI.memoperands_begin());
2565 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2566 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::ADR), DestReg: Reg)
2567 .addGlobalAddress(GV, Offset: 0, TargetFlags: OpFlags);
2568 } else {
2569 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::ADRP), DestReg: Reg)
2570 .addGlobalAddress(GV, Offset: 0, TargetFlags: OpFlags | AArch64II::MO_PAGE);
2571 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2572 if (Subtarget.isTargetILP32()) {
2573 unsigned Reg32 = TRI->getSubReg(Reg, Idx: AArch64::sub_32);
2574 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRWui))
2575 .addDef(RegNo: Reg32, Flags: RegState::Dead)
2576 .addUse(RegNo: Reg, Flags: RegState::Kill)
2577 .addGlobalAddress(GV, Offset: 0, TargetFlags: LoFlags)
2578 .addMemOperand(MMO: *MI.memoperands_begin())
2579 .addDef(RegNo: Reg, Flags: RegState::Implicit);
2580 } else {
2581 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui), DestReg: Reg)
2582 .addReg(RegNo: Reg, Flags: RegState::Kill)
2583 .addGlobalAddress(GV, Offset: 0, TargetFlags: LoFlags)
2584 .addMemOperand(MMO: *MI.memoperands_begin());
2585 }
2586 }
2587
2588 MBB.erase(I: MI);
2589
2590 return true;
2591}
2592
2593// Return true if this instruction simply sets its single destination register
2594// to zero. This is equivalent to a register rename of the zero-register.
2595bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
2596 switch (MI.getOpcode()) {
2597 default:
2598 break;
2599 case AArch64::MOVZWi:
2600 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2601 if (MI.getOperand(i: 1).isImm() && MI.getOperand(i: 1).getImm() == 0) {
2602 assert(MI.getDesc().getNumOperands() == 3 &&
2603 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2604 return true;
2605 }
2606 break;
2607 case AArch64::ANDWri: // and Rd, Rzr, #imm
2608 return MI.getOperand(i: 1).getReg() == AArch64::WZR;
2609 case AArch64::ANDXri:
2610 return MI.getOperand(i: 1).getReg() == AArch64::XZR;
2611 case TargetOpcode::COPY:
2612 return MI.getOperand(i: 1).getReg() == AArch64::WZR;
2613 }
2614 return false;
2615}
2616
2617// Return true if this instruction simply renames a general register without
2618// modifying bits.
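// For example, 'mov x0, x1' (ORRXrs with XZR and LSL #0) or 'add x0, x1, #0'
// qualifies.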
2619bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
2620 switch (MI.getOpcode()) {
2621 default:
2622 break;
2623 case TargetOpcode::COPY: {
    // GPR32 and GPR64 copies will be lowered to ORR register moves.
2625 Register DstReg = MI.getOperand(i: 0).getReg();
2626 return (AArch64::GPR32RegClass.contains(Reg: DstReg) ||
2627 AArch64::GPR64RegClass.contains(Reg: DstReg));
2628 }
2629 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2630 if (MI.getOperand(i: 1).getReg() == AArch64::XZR) {
2631 assert(MI.getDesc().getNumOperands() == 4 &&
2632 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2633 return true;
2634 }
2635 break;
2636 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2637 if (MI.getOperand(i: 2).getImm() == 0) {
2638 assert(MI.getDesc().getNumOperands() == 4 &&
2639 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2640 return true;
2641 }
2642 break;
2643 }
2644 return false;
2645}
2646
// Return true if this instruction simply renames a floating-point/vector
// register without modifying bits.
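// For example, 'mov v0.16b, v1.16b' (ORRv16i8 with identical source
// registers) qualifies.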
2649bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
2650 switch (MI.getOpcode()) {
2651 default:
2652 break;
2653 case TargetOpcode::COPY: {
2654 Register DstReg = MI.getOperand(i: 0).getReg();
2655 return AArch64::FPR128RegClass.contains(Reg: DstReg);
2656 }
2657 case AArch64::ORRv16i8:
2658 if (MI.getOperand(i: 1).getReg() == MI.getOperand(i: 2).getReg()) {
2659 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2660 "invalid ORRv16i8 operands");
2661 return true;
2662 }
2663 break;
2664 }
2665 return false;
2666}
2667
2668static bool isFrameLoadOpcode(int Opcode) {
2669 switch (Opcode) {
2670 default:
2671 return false;
2672 case AArch64::LDRWui:
2673 case AArch64::LDRXui:
2674 case AArch64::LDRBui:
2675 case AArch64::LDRHui:
2676 case AArch64::LDRSui:
2677 case AArch64::LDRDui:
2678 case AArch64::LDRQui:
2679 case AArch64::LDR_PXI:
2680 return true;
2681 }
2682}
2683
2684Register AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
2685 int &FrameIndex) const {
2686 if (!isFrameLoadOpcode(Opcode: MI.getOpcode()))
2687 return Register();
2688
2689 if (MI.getOperand(i: 0).getSubReg() == 0 && MI.getOperand(i: 1).isFI() &&
2690 MI.getOperand(i: 2).isImm() && MI.getOperand(i: 2).getImm() == 0) {
2691 FrameIndex = MI.getOperand(i: 1).getIndex();
2692 return MI.getOperand(i: 0).getReg();
2693 }
2694 return Register();
2695}
2696
2697static bool isFrameStoreOpcode(int Opcode) {
2698 switch (Opcode) {
2699 default:
2700 return false;
2701 case AArch64::STRWui:
2702 case AArch64::STRXui:
2703 case AArch64::STRBui:
2704 case AArch64::STRHui:
2705 case AArch64::STRSui:
2706 case AArch64::STRDui:
2707 case AArch64::STRQui:
2708 case AArch64::STR_PXI:
2709 return true;
2710 }
2711}
2712
2713Register AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
2714 int &FrameIndex) const {
2715 if (!isFrameStoreOpcode(Opcode: MI.getOpcode()))
2716 return Register();
2717
2718 if (MI.getOperand(i: 0).getSubReg() == 0 && MI.getOperand(i: 1).isFI() &&
2719 MI.getOperand(i: 2).isImm() && MI.getOperand(i: 2).getImm() == 0) {
2720 FrameIndex = MI.getOperand(i: 1).getIndex();
2721 return MI.getOperand(i: 0).getReg();
2722 }
2723 return Register();
2724}
2725
2726Register AArch64InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
2727 int &FrameIndex) const {
2728 if (!isFrameStoreOpcode(Opcode: MI.getOpcode()))
2729 return Register();
2730
2731 if (Register Reg = isStoreToStackSlot(MI, FrameIndex))
2732 return Reg;
2733
2734 SmallVector<const MachineMemOperand *, 1> Accesses;
2735 if (hasStoreToStackSlot(MI, Accesses)) {
2736 if (Accesses.size() > 1)
2737 return Register();
2738
2739 FrameIndex =
2740 cast<FixedStackPseudoSourceValue>(Val: Accesses.front()->getPseudoValue())
2741 ->getFrameIndex();
2742 return MI.getOperand(i: 0).getReg();
2743 }
2744 return Register();
2745}
2746
2747Register AArch64InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
2748 int &FrameIndex) const {
2749 if (!isFrameLoadOpcode(Opcode: MI.getOpcode()))
2750 return Register();
2751
2752 if (Register Reg = isLoadFromStackSlot(MI, FrameIndex))
2753 return Reg;
2754
2755 SmallVector<const MachineMemOperand *, 1> Accesses;
2756 if (hasLoadFromStackSlot(MI, Accesses)) {
2757 if (Accesses.size() > 1)
2758 return Register();
2759
2760 FrameIndex =
2761 cast<FixedStackPseudoSourceValue>(Val: Accesses.front()->getPseudoValue())
2762 ->getFrameIndex();
2763 return MI.getOperand(i: 0).getReg();
2764 }
2765 return Register();
2766}
2767
2768/// Check all MachineMemOperands for a hint to suppress pairing.
2769bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2770 return llvm::any_of(Range: MI.memoperands(), P: [](MachineMemOperand *MMO) {
2771 return MMO->getFlags() & MOSuppressPair;
2772 });
2773}
2774
2775/// Set a flag on the first MachineMemOperand to suppress pairing.
2776void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2777 if (MI.memoperands_empty())
2778 return;
2779 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2780}
2781
2782/// Check all MachineMemOperands for a hint that the load/store is strided.
2783bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
2784 return llvm::any_of(Range: MI.memoperands(), P: [](MachineMemOperand *MMO) {
2785 return MMO->getFlags() & MOStridedAccess;
2786 });
2787}
2788
2789bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
2790 switch (Opc) {
2791 default:
2792 return false;
2793 case AArch64::STURSi:
2794 case AArch64::STRSpre:
2795 case AArch64::STURDi:
2796 case AArch64::STRDpre:
2797 case AArch64::STURQi:
2798 case AArch64::STRQpre:
2799 case AArch64::STURBBi:
2800 case AArch64::STURHHi:
2801 case AArch64::STURWi:
2802 case AArch64::STRWpre:
2803 case AArch64::STURXi:
2804 case AArch64::STRXpre:
2805 case AArch64::LDURSi:
2806 case AArch64::LDRSpre:
2807 case AArch64::LDURDi:
2808 case AArch64::LDRDpre:
2809 case AArch64::LDURQi:
2810 case AArch64::LDRQpre:
2811 case AArch64::LDURWi:
2812 case AArch64::LDRWpre:
2813 case AArch64::LDURXi:
2814 case AArch64::LDRXpre:
2815 case AArch64::LDRSWpre:
2816 case AArch64::LDURSWi:
2817 case AArch64::LDURHHi:
2818 case AArch64::LDURBBi:
2819 case AArch64::LDURSBWi:
2820 case AArch64::LDURSHWi:
2821 return true;
2822 }
2823}
2824
2825std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2826 switch (Opc) {
2827 default: return {};
2828 case AArch64::PRFMui: return AArch64::PRFUMi;
2829 case AArch64::LDRXui: return AArch64::LDURXi;
2830 case AArch64::LDRWui: return AArch64::LDURWi;
2831 case AArch64::LDRBui: return AArch64::LDURBi;
2832 case AArch64::LDRHui: return AArch64::LDURHi;
2833 case AArch64::LDRSui: return AArch64::LDURSi;
2834 case AArch64::LDRDui: return AArch64::LDURDi;
2835 case AArch64::LDRQui: return AArch64::LDURQi;
2836 case AArch64::LDRBBui: return AArch64::LDURBBi;
2837 case AArch64::LDRHHui: return AArch64::LDURHHi;
2838 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2839 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2840 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2841 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2842 case AArch64::LDRSWui: return AArch64::LDURSWi;
2843 case AArch64::STRXui: return AArch64::STURXi;
2844 case AArch64::STRWui: return AArch64::STURWi;
2845 case AArch64::STRBui: return AArch64::STURBi;
2846 case AArch64::STRHui: return AArch64::STURHi;
2847 case AArch64::STRSui: return AArch64::STURSi;
2848 case AArch64::STRDui: return AArch64::STURDi;
2849 case AArch64::STRQui: return AArch64::STURQi;
2850 case AArch64::STRBBui: return AArch64::STURBBi;
2851 case AArch64::STRHHui: return AArch64::STURHHi;
2852 }
2853}
2854
2855unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
2856 switch (Opc) {
2857 default:
2858 llvm_unreachable("Unhandled Opcode in getLoadStoreImmIdx");
2859 case AArch64::ADDG:
2860 case AArch64::LDAPURBi:
2861 case AArch64::LDAPURHi:
2862 case AArch64::LDAPURi:
2863 case AArch64::LDAPURSBWi:
2864 case AArch64::LDAPURSBXi:
2865 case AArch64::LDAPURSHWi:
2866 case AArch64::LDAPURSHXi:
2867 case AArch64::LDAPURSWi:
2868 case AArch64::LDAPURXi:
2869 case AArch64::LDR_PPXI:
2870 case AArch64::LDR_PXI:
2871 case AArch64::LDR_ZXI:
2872 case AArch64::LDR_ZZXI:
2873 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
2874 case AArch64::LDR_ZZZXI:
2875 case AArch64::LDR_ZZZZXI:
2876 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
2877 case AArch64::LDRBBui:
2878 case AArch64::LDRBui:
2879 case AArch64::LDRDui:
2880 case AArch64::LDRHHui:
2881 case AArch64::LDRHui:
2882 case AArch64::LDRQui:
2883 case AArch64::LDRSBWui:
2884 case AArch64::LDRSBXui:
2885 case AArch64::LDRSHWui:
2886 case AArch64::LDRSHXui:
2887 case AArch64::LDRSui:
2888 case AArch64::LDRSWui:
2889 case AArch64::LDRWui:
2890 case AArch64::LDRXui:
2891 case AArch64::LDURBBi:
2892 case AArch64::LDURBi:
2893 case AArch64::LDURDi:
2894 case AArch64::LDURHHi:
2895 case AArch64::LDURHi:
2896 case AArch64::LDURQi:
2897 case AArch64::LDURSBWi:
2898 case AArch64::LDURSBXi:
2899 case AArch64::LDURSHWi:
2900 case AArch64::LDURSHXi:
2901 case AArch64::LDURSi:
2902 case AArch64::LDURSWi:
2903 case AArch64::LDURWi:
2904 case AArch64::LDURXi:
2905 case AArch64::PRFMui:
2906 case AArch64::PRFUMi:
2907 case AArch64::ST2Gi:
2908 case AArch64::STGi:
2909 case AArch64::STLURBi:
2910 case AArch64::STLURHi:
2911 case AArch64::STLURWi:
2912 case AArch64::STLURXi:
2913 case AArch64::StoreSwiftAsyncContext:
2914 case AArch64::STR_PPXI:
2915 case AArch64::STR_PXI:
2916 case AArch64::STR_ZXI:
2917 case AArch64::STR_ZZXI:
2918 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
2919 case AArch64::STR_ZZZXI:
2920 case AArch64::STR_ZZZZXI:
2921 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
2922 case AArch64::STRBBui:
2923 case AArch64::STRBui:
2924 case AArch64::STRDui:
2925 case AArch64::STRHHui:
2926 case AArch64::STRHui:
2927 case AArch64::STRQui:
2928 case AArch64::STRSui:
2929 case AArch64::STRWui:
2930 case AArch64::STRXui:
2931 case AArch64::STURBBi:
2932 case AArch64::STURBi:
2933 case AArch64::STURDi:
2934 case AArch64::STURHHi:
2935 case AArch64::STURHi:
2936 case AArch64::STURQi:
2937 case AArch64::STURSi:
2938 case AArch64::STURWi:
2939 case AArch64::STURXi:
2940 case AArch64::STZ2Gi:
2941 case AArch64::STZGi:
2942 case AArch64::TAGPstack:
2943 return 2;
2944 case AArch64::LD1B_D_IMM:
2945 case AArch64::LD1B_H_IMM:
2946 case AArch64::LD1B_IMM:
2947 case AArch64::LD1B_S_IMM:
2948 case AArch64::LD1D_IMM:
2949 case AArch64::LD1H_D_IMM:
2950 case AArch64::LD1H_IMM:
2951 case AArch64::LD1H_S_IMM:
2952 case AArch64::LD1RB_D_IMM:
2953 case AArch64::LD1RB_H_IMM:
2954 case AArch64::LD1RB_IMM:
2955 case AArch64::LD1RB_S_IMM:
2956 case AArch64::LD1RD_IMM:
2957 case AArch64::LD1RH_D_IMM:
2958 case AArch64::LD1RH_IMM:
2959 case AArch64::LD1RH_S_IMM:
2960 case AArch64::LD1RSB_D_IMM:
2961 case AArch64::LD1RSB_H_IMM:
2962 case AArch64::LD1RSB_S_IMM:
2963 case AArch64::LD1RSH_D_IMM:
2964 case AArch64::LD1RSH_S_IMM:
2965 case AArch64::LD1RSW_IMM:
2966 case AArch64::LD1RW_D_IMM:
2967 case AArch64::LD1RW_IMM:
2968 case AArch64::LD1SB_D_IMM:
2969 case AArch64::LD1SB_H_IMM:
2970 case AArch64::LD1SB_S_IMM:
2971 case AArch64::LD1SH_D_IMM:
2972 case AArch64::LD1SH_S_IMM:
2973 case AArch64::LD1SW_D_IMM:
2974 case AArch64::LD1W_D_IMM:
2975 case AArch64::LD1W_IMM:
2976 case AArch64::LD2B_IMM:
2977 case AArch64::LD2D_IMM:
2978 case AArch64::LD2H_IMM:
2979 case AArch64::LD2W_IMM:
2980 case AArch64::LD3B_IMM:
2981 case AArch64::LD3D_IMM:
2982 case AArch64::LD3H_IMM:
2983 case AArch64::LD3W_IMM:
2984 case AArch64::LD4B_IMM:
2985 case AArch64::LD4D_IMM:
2986 case AArch64::LD4H_IMM:
2987 case AArch64::LD4W_IMM:
2988 case AArch64::LDG:
2989 case AArch64::LDNF1B_D_IMM:
2990 case AArch64::LDNF1B_H_IMM:
2991 case AArch64::LDNF1B_IMM:
2992 case AArch64::LDNF1B_S_IMM:
2993 case AArch64::LDNF1D_IMM:
2994 case AArch64::LDNF1H_D_IMM:
2995 case AArch64::LDNF1H_IMM:
2996 case AArch64::LDNF1H_S_IMM:
2997 case AArch64::LDNF1SB_D_IMM:
2998 case AArch64::LDNF1SB_H_IMM:
2999 case AArch64::LDNF1SB_S_IMM:
3000 case AArch64::LDNF1SH_D_IMM:
3001 case AArch64::LDNF1SH_S_IMM:
3002 case AArch64::LDNF1SW_D_IMM:
3003 case AArch64::LDNF1W_D_IMM:
3004 case AArch64::LDNF1W_IMM:
3005 case AArch64::LDNPDi:
3006 case AArch64::LDNPQi:
3007 case AArch64::LDNPSi:
3008 case AArch64::LDNPWi:
3009 case AArch64::LDNPXi:
3010 case AArch64::LDNT1B_ZRI:
3011 case AArch64::LDNT1D_ZRI:
3012 case AArch64::LDNT1H_ZRI:
3013 case AArch64::LDNT1W_ZRI:
3014 case AArch64::LDPDi:
3015 case AArch64::LDPQi:
3016 case AArch64::LDPSi:
3017 case AArch64::LDPWi:
3018 case AArch64::LDPXi:
3019 case AArch64::LDRBBpost:
3020 case AArch64::LDRBBpre:
3021 case AArch64::LDRBpost:
3022 case AArch64::LDRBpre:
3023 case AArch64::LDRDpost:
3024 case AArch64::LDRDpre:
3025 case AArch64::LDRHHpost:
3026 case AArch64::LDRHHpre:
3027 case AArch64::LDRHpost:
3028 case AArch64::LDRHpre:
3029 case AArch64::LDRQpost:
3030 case AArch64::LDRQpre:
3031 case AArch64::LDRSpost:
3032 case AArch64::LDRSpre:
3033 case AArch64::LDRWpost:
3034 case AArch64::LDRWpre:
3035 case AArch64::LDRXpost:
3036 case AArch64::LDRXpre:
3037 case AArch64::ST1B_D_IMM:
3038 case AArch64::ST1B_H_IMM:
3039 case AArch64::ST1B_IMM:
3040 case AArch64::ST1B_S_IMM:
3041 case AArch64::ST1D_IMM:
3042 case AArch64::ST1H_D_IMM:
3043 case AArch64::ST1H_IMM:
3044 case AArch64::ST1H_S_IMM:
3045 case AArch64::ST1W_D_IMM:
3046 case AArch64::ST1W_IMM:
3047 case AArch64::ST2B_IMM:
3048 case AArch64::ST2D_IMM:
3049 case AArch64::ST2H_IMM:
3050 case AArch64::ST2W_IMM:
3051 case AArch64::ST3B_IMM:
3052 case AArch64::ST3D_IMM:
3053 case AArch64::ST3H_IMM:
3054 case AArch64::ST3W_IMM:
3055 case AArch64::ST4B_IMM:
3056 case AArch64::ST4D_IMM:
3057 case AArch64::ST4H_IMM:
3058 case AArch64::ST4W_IMM:
3059 case AArch64::STGPi:
3060 case AArch64::STGPreIndex:
3061 case AArch64::STZGPreIndex:
3062 case AArch64::ST2GPreIndex:
3063 case AArch64::STZ2GPreIndex:
3064 case AArch64::STGPostIndex:
3065 case AArch64::STZGPostIndex:
3066 case AArch64::ST2GPostIndex:
3067 case AArch64::STZ2GPostIndex:
3068 case AArch64::STNPDi:
3069 case AArch64::STNPQi:
3070 case AArch64::STNPSi:
3071 case AArch64::STNPWi:
3072 case AArch64::STNPXi:
3073 case AArch64::STNT1B_ZRI:
3074 case AArch64::STNT1D_ZRI:
3075 case AArch64::STNT1H_ZRI:
3076 case AArch64::STNT1W_ZRI:
3077 case AArch64::STPDi:
3078 case AArch64::STPQi:
3079 case AArch64::STPSi:
3080 case AArch64::STPWi:
3081 case AArch64::STPXi:
3082 case AArch64::STRBBpost:
3083 case AArch64::STRBBpre:
3084 case AArch64::STRBpost:
3085 case AArch64::STRBpre:
3086 case AArch64::STRDpost:
3087 case AArch64::STRDpre:
3088 case AArch64::STRHHpost:
3089 case AArch64::STRHHpre:
3090 case AArch64::STRHpost:
3091 case AArch64::STRHpre:
3092 case AArch64::STRQpost:
3093 case AArch64::STRQpre:
3094 case AArch64::STRSpost:
3095 case AArch64::STRSpre:
3096 case AArch64::STRWpost:
3097 case AArch64::STRWpre:
3098 case AArch64::STRXpost:
3099 case AArch64::STRXpre:
3100 return 3;
3101 case AArch64::LDPDpost:
3102 case AArch64::LDPDpre:
3103 case AArch64::LDPQpost:
3104 case AArch64::LDPQpre:
3105 case AArch64::LDPSpost:
3106 case AArch64::LDPSpre:
3107 case AArch64::LDPWpost:
3108 case AArch64::LDPWpre:
3109 case AArch64::LDPXpost:
3110 case AArch64::LDPXpre:
3111 case AArch64::STGPpre:
3112 case AArch64::STGPpost:
3113 case AArch64::STPDpost:
3114 case AArch64::STPDpre:
3115 case AArch64::STPQpost:
3116 case AArch64::STPQpre:
3117 case AArch64::STPSpost:
3118 case AArch64::STPSpre:
3119 case AArch64::STPWpost:
3120 case AArch64::STPWpre:
3121 case AArch64::STPXpost:
3122 case AArch64::STPXpre:
3123 return 4;
3124 }
3125}
3126
3127bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
3128 switch (MI.getOpcode()) {
3129 default:
3130 return false;
3131 // Scaled instructions.
3132 case AArch64::STRSui:
3133 case AArch64::STRDui:
3134 case AArch64::STRQui:
3135 case AArch64::STRXui:
3136 case AArch64::STRWui:
3137 case AArch64::LDRSui:
3138 case AArch64::LDRDui:
3139 case AArch64::LDRQui:
3140 case AArch64::LDRXui:
3141 case AArch64::LDRWui:
3142 case AArch64::LDRSWui:
3143 // Unscaled instructions.
3144 case AArch64::STURSi:
3145 case AArch64::STRSpre:
3146 case AArch64::STURDi:
3147 case AArch64::STRDpre:
3148 case AArch64::STURQi:
3149 case AArch64::STRQpre:
3150 case AArch64::STURWi:
3151 case AArch64::STRWpre:
3152 case AArch64::STURXi:
3153 case AArch64::STRXpre:
3154 case AArch64::LDURSi:
3155 case AArch64::LDRSpre:
3156 case AArch64::LDURDi:
3157 case AArch64::LDRDpre:
3158 case AArch64::LDURQi:
3159 case AArch64::LDRQpre:
3160 case AArch64::LDURWi:
3161 case AArch64::LDRWpre:
3162 case AArch64::LDURXi:
3163 case AArch64::LDRXpre:
3164 case AArch64::LDURSWi:
3165 case AArch64::LDRSWpre:
3166 // SVE instructions.
3167 case AArch64::LDR_ZXI:
3168 case AArch64::STR_ZXI:
3169 return true;
3170 }
3171}
3172
3173bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
3174 switch (MI.getOpcode()) {
3175 default:
3176 assert((!MI.isCall() || !MI.isReturn()) &&
3177 "Unexpected instruction - was a new tail call opcode introduced?");
3178 return false;
3179 case AArch64::TCRETURNdi:
3180 case AArch64::TCRETURNri:
3181 case AArch64::TCRETURNrix16x17:
3182 case AArch64::TCRETURNrix17:
3183 case AArch64::TCRETURNrinotx16:
3184 case AArch64::TCRETURNriALL:
3185 case AArch64::AUTH_TCRETURN:
3186 case AArch64::AUTH_TCRETURN_BTI:
3187 return true;
3188 }
3189}
3190
3191unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
3192 switch (Opc) {
3193 default:
3194 llvm_unreachable("Opcode has no flag setting equivalent!");
3195 // 32-bit cases:
3196 case AArch64::ADDWri:
3197 return AArch64::ADDSWri;
3198 case AArch64::ADDWrr:
3199 return AArch64::ADDSWrr;
3200 case AArch64::ADDWrs:
3201 return AArch64::ADDSWrs;
3202 case AArch64::ADDWrx:
3203 return AArch64::ADDSWrx;
3204 case AArch64::ANDWri:
3205 return AArch64::ANDSWri;
3206 case AArch64::ANDWrr:
3207 return AArch64::ANDSWrr;
3208 case AArch64::ANDWrs:
3209 return AArch64::ANDSWrs;
3210 case AArch64::BICWrr:
3211 return AArch64::BICSWrr;
3212 case AArch64::BICWrs:
3213 return AArch64::BICSWrs;
3214 case AArch64::SUBWri:
3215 return AArch64::SUBSWri;
3216 case AArch64::SUBWrr:
3217 return AArch64::SUBSWrr;
3218 case AArch64::SUBWrs:
3219 return AArch64::SUBSWrs;
3220 case AArch64::SUBWrx:
3221 return AArch64::SUBSWrx;
3222 // 64-bit cases:
3223 case AArch64::ADDXri:
3224 return AArch64::ADDSXri;
3225 case AArch64::ADDXrr:
3226 return AArch64::ADDSXrr;
3227 case AArch64::ADDXrs:
3228 return AArch64::ADDSXrs;
3229 case AArch64::ADDXrx:
3230 return AArch64::ADDSXrx;
3231 case AArch64::ANDXri:
3232 return AArch64::ANDSXri;
3233 case AArch64::ANDXrr:
3234 return AArch64::ANDSXrr;
3235 case AArch64::ANDXrs:
3236 return AArch64::ANDSXrs;
3237 case AArch64::BICXrr:
3238 return AArch64::BICSXrr;
3239 case AArch64::BICXrs:
3240 return AArch64::BICSXrs;
3241 case AArch64::SUBXri:
3242 return AArch64::SUBSXri;
3243 case AArch64::SUBXrr:
3244 return AArch64::SUBSXrr;
3245 case AArch64::SUBXrs:
3246 return AArch64::SUBSXrs;
3247 case AArch64::SUBXrx:
3248 return AArch64::SUBSXrx;
3249 // SVE instructions:
3250 case AArch64::AND_PPzPP:
3251 return AArch64::ANDS_PPzPP;
3252 case AArch64::BIC_PPzPP:
3253 return AArch64::BICS_PPzPP;
3254 case AArch64::EOR_PPzPP:
3255 return AArch64::EORS_PPzPP;
3256 case AArch64::NAND_PPzPP:
3257 return AArch64::NANDS_PPzPP;
3258 case AArch64::NOR_PPzPP:
3259 return AArch64::NORS_PPzPP;
3260 case AArch64::ORN_PPzPP:
3261 return AArch64::ORNS_PPzPP;
3262 case AArch64::ORR_PPzPP:
3263 return AArch64::ORRS_PPzPP;
3264 case AArch64::BRKA_PPzP:
3265 return AArch64::BRKAS_PPzP;
3266 case AArch64::BRKPA_PPzPP:
3267 return AArch64::BRKPAS_PPzPP;
3268 case AArch64::BRKB_PPzP:
3269 return AArch64::BRKBS_PPzP;
3270 case AArch64::BRKPB_PPzPP:
3271 return AArch64::BRKPBS_PPzPP;
3272 case AArch64::BRKN_PPzP:
3273 return AArch64::BRKNS_PPzP;
3274 case AArch64::RDFFR_PPz:
3275 return AArch64::RDFFRS_PPz;
3276 case AArch64::PTRUE_B:
3277 return AArch64::PTRUES_B;
3278 }
3279}
3280
3281// Is this a candidate for ld/st merging or pairing? For example, we don't
3282// touch volatiles or load/stores that have a hint to avoid pair formation.
3283bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
3284
3285 bool IsPreLdSt = isPreLdSt(MI);
3286
3287 // If this is a volatile load/store, don't mess with it.
3288 if (MI.hasOrderedMemoryRef())
3289 return false;
3290
3291 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
3292 // For Pre-inc LD/ST, the operand is shifted by one.
3293 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
3294 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
3295 "Expected a reg or frame index operand.");
3296
  // For pre-indexed load/store instructions, the immediate offset is at operand
  // index 3 (the write-back def shifts the other operands up by one).
3299 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(i: 3).isImm();
3300
3301 if (!MI.getOperand(i: 2).isImm() && !IsImmPreLdSt)
3302 return false;
3303
3304 // Can't merge/pair if the instruction modifies the base register.
3305 // e.g., ldr x0, [x0]
3306 // This case will never occur with an FI base.
3307 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
3308 // STR<S,D,Q,W,X>pre, it can be merged.
3309 // For example:
3310 // ldr q0, [x11, #32]!
3311 // ldr q1, [x11, #16]
3312 // to
3313 // ldp q0, q1, [x11, #32]!
3314 if (MI.getOperand(i: 1).isReg() && !IsPreLdSt) {
3315 Register BaseReg = MI.getOperand(i: 1).getReg();
3316 const TargetRegisterInfo *TRI = &getRegisterInfo();
3317 if (MI.modifiesRegister(Reg: BaseReg, TRI))
3318 return false;
3319 }
3320
3321 // Pairing SVE fills/spills is only valid for little-endian targets that
3322 // implement VLS 128.
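  // (With a 128-bit vector length on a little-endian target, an SVE fill/spill
  // accesses exactly 16 bytes with the same in-memory layout as a 128-bit FPR
  // load/store, so such accesses can be paired like ordinary Q-register
  // accesses.)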
3323 switch (MI.getOpcode()) {
3324 default:
3325 break;
3326 case AArch64::LDR_ZXI:
3327 case AArch64::STR_ZXI:
3328 if (!Subtarget.isLittleEndian() ||
3329 Subtarget.getSVEVectorSizeInBits() != 128)
3330 return false;
3331 }
3332
3333 // Check if this load/store has a hint to avoid pair formation.
3334 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
3335 if (isLdStPairSuppressed(MI))
3336 return false;
3337
3338 // Do not pair any callee-save store/reload instructions in the
3339 // prologue/epilogue if the CFI information encoded the operations as separate
  // instructions, as that would cause the size of the actual prologue to differ
  // from the prologue size recorded in the Windows CFI.
3342 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
3343 bool NeedsWinCFI = MAI->usesWindowsCFI() &&
3344 MI.getMF()->getFunction().needsUnwindTableEntry();
3345 if (NeedsWinCFI && (MI.getFlag(Flag: MachineInstr::FrameSetup) ||
3346 MI.getFlag(Flag: MachineInstr::FrameDestroy)))
3347 return false;
3348
3349 // On some CPUs quad load/store pairs are slower than two single load/stores.
3350 if (Subtarget.isPaired128Slow()) {
3351 switch (MI.getOpcode()) {
3352 default:
3353 break;
3354 case AArch64::LDURQi:
3355 case AArch64::STURQi:
3356 case AArch64::LDRQui:
3357 case AArch64::STRQui:
3358 return false;
3359 }
3360 }
3361
3362 return true;
3363}
3364
3365bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
3366 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
3367 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
3368 const TargetRegisterInfo *TRI) const {
3369 if (!LdSt.mayLoadOrStore())
3370 return false;
3371
3372 const MachineOperand *BaseOp;
3373 TypeSize WidthN(0, false);
3374 if (!getMemOperandWithOffsetWidth(MI: LdSt, BaseOp, Offset, OffsetIsScalable,
3375 Width&: WidthN, TRI))
3376 return false;
  // The maximum vscale is 16 under AArch64; return the maximal extent for the
  // vector.
3379 Width = LocationSize::precise(Value: WidthN);
3380 BaseOps.push_back(Elt: BaseOp);
3381 return true;
3382}
3383
3384std::optional<ExtAddrMode>
3385AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
3386 const TargetRegisterInfo *TRI) const {
3387 const MachineOperand *Base; // Filled with the base operand of MI.
3388 int64_t Offset; // Filled with the offset of MI.
3389 bool OffsetIsScalable;
3390 if (!getMemOperandWithOffset(MI: MemI, BaseOp&: Base, Offset, OffsetIsScalable, TRI))
3391 return std::nullopt;
3392
3393 if (!Base->isReg())
3394 return std::nullopt;
3395 ExtAddrMode AM;
3396 AM.BaseReg = Base->getReg();
3397 AM.Displacement = Offset;
3398 AM.ScaledReg = 0;
3399 AM.Scale = 0;
3400 return AM;
3401}
3402
3403bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
3404 Register Reg,
3405 const MachineInstr &AddrI,
3406 ExtAddrMode &AM) const {
3407 // Filter out instructions into which we cannot fold.
3408 unsigned NumBytes;
3409 int64_t OffsetScale = 1;
3410 switch (MemI.getOpcode()) {
3411 default:
3412 return false;
3413
3414 case AArch64::LDURQi:
3415 case AArch64::STURQi:
3416 NumBytes = 16;
3417 break;
3418
3419 case AArch64::LDURDi:
3420 case AArch64::STURDi:
3421 case AArch64::LDURXi:
3422 case AArch64::STURXi:
3423 NumBytes = 8;
3424 break;
3425
3426 case AArch64::LDURWi:
3427 case AArch64::LDURSWi:
3428 case AArch64::STURWi:
3429 NumBytes = 4;
3430 break;
3431
3432 case AArch64::LDURHi:
3433 case AArch64::STURHi:
3434 case AArch64::LDURHHi:
3435 case AArch64::STURHHi:
3436 case AArch64::LDURSHXi:
3437 case AArch64::LDURSHWi:
3438 NumBytes = 2;
3439 break;
3440
3441 case AArch64::LDRBroX:
3442 case AArch64::LDRBBroX:
3443 case AArch64::LDRSBXroX:
3444 case AArch64::LDRSBWroX:
3445 case AArch64::STRBroX:
3446 case AArch64::STRBBroX:
3447 case AArch64::LDURBi:
3448 case AArch64::LDURBBi:
3449 case AArch64::LDURSBXi:
3450 case AArch64::LDURSBWi:
3451 case AArch64::STURBi:
3452 case AArch64::STURBBi:
3453 case AArch64::LDRBui:
3454 case AArch64::LDRBBui:
3455 case AArch64::LDRSBXui:
3456 case AArch64::LDRSBWui:
3457 case AArch64::STRBui:
3458 case AArch64::STRBBui:
3459 NumBytes = 1;
3460 break;
3461
3462 case AArch64::LDRQroX:
3463 case AArch64::STRQroX:
3464 case AArch64::LDRQui:
3465 case AArch64::STRQui:
3466 NumBytes = 16;
3467 OffsetScale = 16;
3468 break;
3469
3470 case AArch64::LDRDroX:
3471 case AArch64::STRDroX:
3472 case AArch64::LDRXroX:
3473 case AArch64::STRXroX:
3474 case AArch64::LDRDui:
3475 case AArch64::STRDui:
3476 case AArch64::LDRXui:
3477 case AArch64::STRXui:
3478 NumBytes = 8;
3479 OffsetScale = 8;
3480 break;
3481
3482 case AArch64::LDRWroX:
3483 case AArch64::LDRSWroX:
3484 case AArch64::STRWroX:
3485 case AArch64::LDRWui:
3486 case AArch64::LDRSWui:
3487 case AArch64::STRWui:
3488 NumBytes = 4;
3489 OffsetScale = 4;
3490 break;
3491
3492 case AArch64::LDRHroX:
3493 case AArch64::STRHroX:
3494 case AArch64::LDRHHroX:
3495 case AArch64::STRHHroX:
3496 case AArch64::LDRSHXroX:
3497 case AArch64::LDRSHWroX:
3498 case AArch64::LDRHui:
3499 case AArch64::STRHui:
3500 case AArch64::LDRHHui:
3501 case AArch64::STRHHui:
3502 case AArch64::LDRSHXui:
3503 case AArch64::LDRSHWui:
3504 NumBytes = 2;
3505 OffsetScale = 2;
3506 break;
3507 }
3508
3509 // Check the fold operand is not the loaded/stored value.
3510 const MachineOperand &BaseRegOp = MemI.getOperand(i: 0);
3511 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
3512 return false;
3513
3514 // Handle memory instructions with a [Reg, Reg] addressing mode.
3515 if (MemI.getOperand(i: 2).isReg()) {
3516 // Bail if the addressing mode already includes extension of the offset
3517 // register.
3518 if (MemI.getOperand(i: 3).getImm())
3519 return false;
3520
3521 // Check if we actually have a scaled offset.
3522 if (MemI.getOperand(i: 4).getImm() == 0)
3523 OffsetScale = 1;
3524
    // If the address instruction is folded into the base register, the
    // addressing mode must not have a scale, so that the base and the scaled
    // registers can be swapped.
3528 if (MemI.getOperand(i: 1).getReg() == Reg && OffsetScale != 1)
3529 return false;
3530
3531 switch (AddrI.getOpcode()) {
3532 default:
3533 return false;
3534
3535 case AArch64::SBFMXri:
3536 // sxtw Xa, Wm
3537 // ldr Xd, [Xn, Xa, lsl #N]
3538 // ->
3539 // ldr Xd, [Xn, Wm, sxtw #N]
3540 if (AddrI.getOperand(i: 2).getImm() != 0 ||
3541 AddrI.getOperand(i: 3).getImm() != 31)
3542 return false;
3543
3544 AM.BaseReg = MemI.getOperand(i: 1).getReg();
3545 if (AM.BaseReg == Reg)
3546 AM.BaseReg = MemI.getOperand(i: 2).getReg();
3547 AM.ScaledReg = AddrI.getOperand(i: 1).getReg();
3548 AM.Scale = OffsetScale;
3549 AM.Displacement = 0;
3550 AM.Form = ExtAddrMode::Formula::SExtScaledReg;
3551 return true;
3552
3553 case TargetOpcode::SUBREG_TO_REG: {
3554 // mov Wa, Wm
3555 // ldr Xd, [Xn, Xa, lsl #N]
3556 // ->
3557 // ldr Xd, [Xn, Wm, uxtw #N]
3558
3559 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
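      // In MIR the pattern being matched is roughly:
      //   %w  = ORRWrs $wzr, %src, 0
      //   %xa = SUBREG_TO_REG 0, %w, %subreg.sub_32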
3560 if (AddrI.getOperand(i: 1).getImm() != 0 ||
3561 AddrI.getOperand(i: 3).getImm() != AArch64::sub_32)
3562 return false;
3563
3564 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
3565 Register OffsetReg = AddrI.getOperand(i: 2).getReg();
3566 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(RegNo: OffsetReg))
3567 return false;
3568
3569 const MachineInstr &DefMI = *MRI.getVRegDef(Reg: OffsetReg);
3570 if (DefMI.getOpcode() != AArch64::ORRWrs ||
3571 DefMI.getOperand(i: 1).getReg() != AArch64::WZR ||
3572 DefMI.getOperand(i: 3).getImm() != 0)
3573 return false;
3574
3575 AM.BaseReg = MemI.getOperand(i: 1).getReg();
3576 if (AM.BaseReg == Reg)
3577 AM.BaseReg = MemI.getOperand(i: 2).getReg();
3578 AM.ScaledReg = DefMI.getOperand(i: 2).getReg();
3579 AM.Scale = OffsetScale;
3580 AM.Displacement = 0;
3581 AM.Form = ExtAddrMode::Formula::ZExtScaledReg;
3582 return true;
3583 }
3584 }
3585 }
3586
3587 // Handle memory instructions with a [Reg, #Imm] addressing mode.
3588
3589 // Check we are not breaking a potential conversion to an LDP.
3590 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
3591 int64_t NewOffset) -> bool {
3592 int64_t MinOffset, MaxOffset;
3593 switch (NumBytes) {
3594 default:
3595 return true;
3596 case 4:
3597 MinOffset = -256;
3598 MaxOffset = 252;
3599 break;
3600 case 8:
3601 MinOffset = -512;
3602 MaxOffset = 504;
3603 break;
3604 case 16:
3605 MinOffset = -1024;
3606 MaxOffset = 1008;
3607 break;
3608 }
3609 return OldOffset < MinOffset || OldOffset > MaxOffset ||
3610 (NewOffset >= MinOffset && NewOffset <= MaxOffset);
3611 };
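  // For example, for an 8-byte access LDP accepts byte offsets in [-512, 504]
  // (a signed 7-bit immediate scaled by 8); if the current offset is 40 and the
  // fold would change it to 520, the lambda returns false and the fold is
  // rejected, preserving the LDP opportunity.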
3612 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
3613 int64_t OldOffset = MemI.getOperand(i: 2).getImm() * OffsetScale;
3614 int64_t NewOffset = OldOffset + Disp;
3615 if (!isLegalAddressingMode(NumBytes, Offset: NewOffset, /* Scale */ 0))
3616 return false;
3617 // If the old offset would fit into an LDP, but the new offset wouldn't,
3618 // bail out.
3619 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
3620 return false;
3621 AM.BaseReg = AddrI.getOperand(i: 1).getReg();
3622 AM.ScaledReg = 0;
3623 AM.Scale = 0;
3624 AM.Displacement = NewOffset;
3625 AM.Form = ExtAddrMode::Formula::Basic;
3626 return true;
3627 };
3628
3629 auto canFoldAddRegIntoAddrMode =
3630 [&](int64_t Scale,
3631 ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool {
3632 if (MemI.getOperand(i: 2).getImm() != 0)
3633 return false;
3634 if ((unsigned)Scale != Scale)
3635 return false;
3636 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
3637 return false;
3638 AM.BaseReg = AddrI.getOperand(i: 1).getReg();
3639 AM.ScaledReg = AddrI.getOperand(i: 2).getReg();
3640 AM.Scale = Scale;
3641 AM.Displacement = 0;
3642 AM.Form = Form;
3643 return true;
3644 };
3645
3646 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
3647 unsigned Opcode = MemI.getOpcode();
3648 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
3649 Subtarget.isSTRQroSlow();
3650 };
3651
3652 int64_t Disp = 0;
3653 const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
3654 switch (AddrI.getOpcode()) {
3655 default:
3656 return false;
3657
3658 case AArch64::ADDXri:
3659 // add Xa, Xn, #N
3660 // ldr Xd, [Xa, #M]
3661 // ->
3662 // ldr Xd, [Xn, #N'+M]
3663 Disp = AddrI.getOperand(i: 2).getImm() << AddrI.getOperand(i: 3).getImm();
3664 return canFoldAddSubImmIntoAddrMode(Disp);
3665
3666 case AArch64::SUBXri:
3667 // sub Xa, Xn, #N
3668 // ldr Xd, [Xa, #M]
3669 // ->
3670 // ldr Xd, [Xn, #N'+M]
3671 Disp = AddrI.getOperand(i: 2).getImm() << AddrI.getOperand(i: 3).getImm();
3672 return canFoldAddSubImmIntoAddrMode(-Disp);
3673
3674 case AArch64::ADDXrs: {
3675 // add Xa, Xn, Xm, lsl #N
3676 // ldr Xd, [Xa]
3677 // ->
3678 // ldr Xd, [Xn, Xm, lsl #N]
3679
3680 // Don't fold the add if the result would be slower, unless optimising for
3681 // size.
3682 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(i: 3).getImm());
3683 if (AArch64_AM::getShiftType(Imm: Shift) != AArch64_AM::ShiftExtendType::LSL)
3684 return false;
3685 Shift = AArch64_AM::getShiftValue(Imm: Shift);
3686 if (!OptSize) {
3687 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3688 return false;
3689 if (avoidSlowSTRQ(MemI))
3690 return false;
3691 }
3692 return canFoldAddRegIntoAddrMode(1ULL << Shift);
3693 }
3694
3695 case AArch64::ADDXrr:
3696 // add Xa, Xn, Xm
3697 // ldr Xd, [Xa]
3698 // ->
3699 // ldr Xd, [Xn, Xm, lsl #0]
3700
3701 // Don't fold the add if the result would be slower, unless optimising for
3702 // size.
3703 if (!OptSize && avoidSlowSTRQ(MemI))
3704 return false;
3705 return canFoldAddRegIntoAddrMode(1);
3706
3707 case AArch64::ADDXrx:
3708 // add Xa, Xn, Wm, {s,u}xtw #N
3709 // ldr Xd, [Xa]
3710 // ->
3711 // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3712
3713 // Don't fold the add if the result would be slower, unless optimising for
3714 // size.
3715 if (!OptSize && avoidSlowSTRQ(MemI))
3716 return false;
3717
3718 // Can fold only sign-/zero-extend of a word.
3719 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(i: 3).getImm());
3720 AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);
3721 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3722 return false;
3723
3724 return canFoldAddRegIntoAddrMode(
3725 1ULL << AArch64_AM::getArithShiftValue(Imm),
3726 (Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg
3727 : ExtAddrMode::Formula::ZExtScaledReg);
3728 }
3729}
3730
3731// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3732// return the opcode of an instruction performing the same operation, but using
3733// the [Reg, Reg] addressing mode.
3734static unsigned regOffsetOpcode(unsigned Opcode) {
3735 switch (Opcode) {
3736 default:
3737 llvm_unreachable("Address folding not implemented for instruction");
3738
3739 case AArch64::LDURQi:
3740 case AArch64::LDRQui:
3741 return AArch64::LDRQroX;
3742 case AArch64::STURQi:
3743 case AArch64::STRQui:
3744 return AArch64::STRQroX;
3745 case AArch64::LDURDi:
3746 case AArch64::LDRDui:
3747 return AArch64::LDRDroX;
3748 case AArch64::STURDi:
3749 case AArch64::STRDui:
3750 return AArch64::STRDroX;
3751 case AArch64::LDURXi:
3752 case AArch64::LDRXui:
3753 return AArch64::LDRXroX;
3754 case AArch64::STURXi:
3755 case AArch64::STRXui:
3756 return AArch64::STRXroX;
3757 case AArch64::LDURWi:
3758 case AArch64::LDRWui:
3759 return AArch64::LDRWroX;
3760 case AArch64::LDURSWi:
3761 case AArch64::LDRSWui:
3762 return AArch64::LDRSWroX;
3763 case AArch64::STURWi:
3764 case AArch64::STRWui:
3765 return AArch64::STRWroX;
3766 case AArch64::LDURHi:
3767 case AArch64::LDRHui:
3768 return AArch64::LDRHroX;
3769 case AArch64::STURHi:
3770 case AArch64::STRHui:
3771 return AArch64::STRHroX;
3772 case AArch64::LDURHHi:
3773 case AArch64::LDRHHui:
3774 return AArch64::LDRHHroX;
3775 case AArch64::STURHHi:
3776 case AArch64::STRHHui:
3777 return AArch64::STRHHroX;
3778 case AArch64::LDURSHXi:
3779 case AArch64::LDRSHXui:
3780 return AArch64::LDRSHXroX;
3781 case AArch64::LDURSHWi:
3782 case AArch64::LDRSHWui:
3783 return AArch64::LDRSHWroX;
3784 case AArch64::LDURBi:
3785 case AArch64::LDRBui:
3786 return AArch64::LDRBroX;
3787 case AArch64::LDURBBi:
3788 case AArch64::LDRBBui:
3789 return AArch64::LDRBBroX;
3790 case AArch64::LDURSBXi:
3791 case AArch64::LDRSBXui:
3792 return AArch64::LDRSBXroX;
3793 case AArch64::LDURSBWi:
3794 case AArch64::LDRSBWui:
3795 return AArch64::LDRSBWroX;
3796 case AArch64::STURBi:
3797 case AArch64::STRBui:
3798 return AArch64::STRBroX;
3799 case AArch64::STURBBi:
3800 case AArch64::STRBBui:
3801 return AArch64::STRBBroX;
3802 }
3803}
3804
3805// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3806// the opcode of an instruction performing the same operation, but using the
3807// [Reg, #Imm] addressing mode with scaled offset.
static unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3809 switch (Opcode) {
3810 default:
3811 llvm_unreachable("Address folding not implemented for instruction");
3812
3813 case AArch64::LDURQi:
3814 Scale = 16;
3815 return AArch64::LDRQui;
3816 case AArch64::STURQi:
3817 Scale = 16;
3818 return AArch64::STRQui;
3819 case AArch64::LDURDi:
3820 Scale = 8;
3821 return AArch64::LDRDui;
3822 case AArch64::STURDi:
3823 Scale = 8;
3824 return AArch64::STRDui;
3825 case AArch64::LDURXi:
3826 Scale = 8;
3827 return AArch64::LDRXui;
3828 case AArch64::STURXi:
3829 Scale = 8;
3830 return AArch64::STRXui;
3831 case AArch64::LDURWi:
3832 Scale = 4;
3833 return AArch64::LDRWui;
3834 case AArch64::LDURSWi:
3835 Scale = 4;
3836 return AArch64::LDRSWui;
3837 case AArch64::STURWi:
3838 Scale = 4;
3839 return AArch64::STRWui;
3840 case AArch64::LDURHi:
3841 Scale = 2;
3842 return AArch64::LDRHui;
3843 case AArch64::STURHi:
3844 Scale = 2;
3845 return AArch64::STRHui;
3846 case AArch64::LDURHHi:
3847 Scale = 2;
3848 return AArch64::LDRHHui;
3849 case AArch64::STURHHi:
3850 Scale = 2;
3851 return AArch64::STRHHui;
3852 case AArch64::LDURSHXi:
3853 Scale = 2;
3854 return AArch64::LDRSHXui;
3855 case AArch64::LDURSHWi:
3856 Scale = 2;
3857 return AArch64::LDRSHWui;
3858 case AArch64::LDURBi:
3859 Scale = 1;
3860 return AArch64::LDRBui;
3861 case AArch64::LDURBBi:
3862 Scale = 1;
3863 return AArch64::LDRBBui;
3864 case AArch64::LDURSBXi:
3865 Scale = 1;
3866 return AArch64::LDRSBXui;
3867 case AArch64::LDURSBWi:
3868 Scale = 1;
3869 return AArch64::LDRSBWui;
3870 case AArch64::STURBi:
3871 Scale = 1;
3872 return AArch64::STRBui;
3873 case AArch64::STURBBi:
3874 Scale = 1;
3875 return AArch64::STRBBui;
3876 case AArch64::LDRQui:
3877 case AArch64::STRQui:
3878 Scale = 16;
3879 return Opcode;
3880 case AArch64::LDRDui:
3881 case AArch64::STRDui:
3882 case AArch64::LDRXui:
3883 case AArch64::STRXui:
3884 Scale = 8;
3885 return Opcode;
3886 case AArch64::LDRWui:
3887 case AArch64::LDRSWui:
3888 case AArch64::STRWui:
3889 Scale = 4;
3890 return Opcode;
3891 case AArch64::LDRHui:
3892 case AArch64::STRHui:
3893 case AArch64::LDRHHui:
3894 case AArch64::STRHHui:
3895 case AArch64::LDRSHXui:
3896 case AArch64::LDRSHWui:
3897 Scale = 2;
3898 return Opcode;
3899 case AArch64::LDRBui:
3900 case AArch64::LDRBBui:
3901 case AArch64::LDRSBXui:
3902 case AArch64::LDRSBWui:
3903 case AArch64::STRBui:
3904 case AArch64::STRBBui:
3905 Scale = 1;
3906 return Opcode;
3907 }
3908}
3909
3910// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3911// the opcode of an instruction performing the same operation, but using the
3912// [Reg, #Imm] addressing mode with unscaled offset.
static unsigned unscaledOffsetOpcode(unsigned Opcode) {
3914 switch (Opcode) {
3915 default:
3916 llvm_unreachable("Address folding not implemented for instruction");
3917
3918 case AArch64::LDURQi:
3919 case AArch64::STURQi:
3920 case AArch64::LDURDi:
3921 case AArch64::STURDi:
3922 case AArch64::LDURXi:
3923 case AArch64::STURXi:
3924 case AArch64::LDURWi:
3925 case AArch64::LDURSWi:
3926 case AArch64::STURWi:
3927 case AArch64::LDURHi:
3928 case AArch64::STURHi:
3929 case AArch64::LDURHHi:
3930 case AArch64::STURHHi:
3931 case AArch64::LDURSHXi:
3932 case AArch64::LDURSHWi:
3933 case AArch64::LDURBi:
3934 case AArch64::STURBi:
3935 case AArch64::LDURBBi:
3936 case AArch64::STURBBi:
3937 case AArch64::LDURSBWi:
3938 case AArch64::LDURSBXi:
3939 return Opcode;
3940 case AArch64::LDRQui:
3941 return AArch64::LDURQi;
3942 case AArch64::STRQui:
3943 return AArch64::STURQi;
3944 case AArch64::LDRDui:
3945 return AArch64::LDURDi;
3946 case AArch64::STRDui:
3947 return AArch64::STURDi;
3948 case AArch64::LDRXui:
3949 return AArch64::LDURXi;
3950 case AArch64::STRXui:
3951 return AArch64::STURXi;
3952 case AArch64::LDRWui:
3953 return AArch64::LDURWi;
3954 case AArch64::LDRSWui:
3955 return AArch64::LDURSWi;
3956 case AArch64::STRWui:
3957 return AArch64::STURWi;
3958 case AArch64::LDRHui:
3959 return AArch64::LDURHi;
3960 case AArch64::STRHui:
3961 return AArch64::STURHi;
3962 case AArch64::LDRHHui:
3963 return AArch64::LDURHHi;
3964 case AArch64::STRHHui:
3965 return AArch64::STURHHi;
3966 case AArch64::LDRSHXui:
3967 return AArch64::LDURSHXi;
3968 case AArch64::LDRSHWui:
3969 return AArch64::LDURSHWi;
3970 case AArch64::LDRBBui:
3971 return AArch64::LDURBBi;
3972 case AArch64::LDRBui:
3973 return AArch64::LDURBi;
3974 case AArch64::STRBBui:
3975 return AArch64::STURBBi;
3976 case AArch64::STRBui:
3977 return AArch64::STURBi;
3978 case AArch64::LDRSBWui:
3979 return AArch64::LDURSBWi;
3980 case AArch64::LDRSBXui:
3981 return AArch64::LDURSBXi;
3982 }
3983}
3984
3985// Given the opcode of a memory load/store instruction, return the opcode of an
3986// instruction performing the same operation, but using
3987// the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
3988// offset register.
3989static unsigned offsetExtendOpcode(unsigned Opcode) {
3990 switch (Opcode) {
3991 default:
3992 llvm_unreachable("Address folding not implemented for instruction");
3993
3994 case AArch64::LDRQroX:
3995 case AArch64::LDURQi:
3996 case AArch64::LDRQui:
3997 return AArch64::LDRQroW;
3998 case AArch64::STRQroX:
3999 case AArch64::STURQi:
4000 case AArch64::STRQui:
4001 return AArch64::STRQroW;
4002 case AArch64::LDRDroX:
4003 case AArch64::LDURDi:
4004 case AArch64::LDRDui:
4005 return AArch64::LDRDroW;
4006 case AArch64::STRDroX:
4007 case AArch64::STURDi:
4008 case AArch64::STRDui:
4009 return AArch64::STRDroW;
4010 case AArch64::LDRXroX:
4011 case AArch64::LDURXi:
4012 case AArch64::LDRXui:
4013 return AArch64::LDRXroW;
4014 case AArch64::STRXroX:
4015 case AArch64::STURXi:
4016 case AArch64::STRXui:
4017 return AArch64::STRXroW;
4018 case AArch64::LDRWroX:
4019 case AArch64::LDURWi:
4020 case AArch64::LDRWui:
4021 return AArch64::LDRWroW;
4022 case AArch64::LDRSWroX:
4023 case AArch64::LDURSWi:
4024 case AArch64::LDRSWui:
4025 return AArch64::LDRSWroW;
4026 case AArch64::STRWroX:
4027 case AArch64::STURWi:
4028 case AArch64::STRWui:
4029 return AArch64::STRWroW;
4030 case AArch64::LDRHroX:
4031 case AArch64::LDURHi:
4032 case AArch64::LDRHui:
4033 return AArch64::LDRHroW;
4034 case AArch64::STRHroX:
4035 case AArch64::STURHi:
4036 case AArch64::STRHui:
4037 return AArch64::STRHroW;
4038 case AArch64::LDRHHroX:
4039 case AArch64::LDURHHi:
4040 case AArch64::LDRHHui:
4041 return AArch64::LDRHHroW;
4042 case AArch64::STRHHroX:
4043 case AArch64::STURHHi:
4044 case AArch64::STRHHui:
4045 return AArch64::STRHHroW;
4046 case AArch64::LDRSHXroX:
4047 case AArch64::LDURSHXi:
4048 case AArch64::LDRSHXui:
4049 return AArch64::LDRSHXroW;
4050 case AArch64::LDRSHWroX:
4051 case AArch64::LDURSHWi:
4052 case AArch64::LDRSHWui:
4053 return AArch64::LDRSHWroW;
4054 case AArch64::LDRBroX:
4055 case AArch64::LDURBi:
4056 case AArch64::LDRBui:
4057 return AArch64::LDRBroW;
4058 case AArch64::LDRBBroX:
4059 case AArch64::LDURBBi:
4060 case AArch64::LDRBBui:
4061 return AArch64::LDRBBroW;
4062 case AArch64::LDRSBXroX:
4063 case AArch64::LDURSBXi:
4064 case AArch64::LDRSBXui:
4065 return AArch64::LDRSBXroW;
4066 case AArch64::LDRSBWroX:
4067 case AArch64::LDURSBWi:
4068 case AArch64::LDRSBWui:
4069 return AArch64::LDRSBWroW;
4070 case AArch64::STRBroX:
4071 case AArch64::STURBi:
4072 case AArch64::STRBui:
4073 return AArch64::STRBroW;
4074 case AArch64::STRBBroX:
4075 case AArch64::STURBBi:
4076 case AArch64::STRBBui:
4077 return AArch64::STRBBroW;
4078 }
4079}
4080
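// Rewrite the memory instruction MemI to use the addressing mode AM (as
// computed by canFoldIntoAddrMode above), for example folding
//   add x8, x0, #32
//   ldr x1, [x8]
// into
//   ldr x1, [x0, #32]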
4081MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
4082 const ExtAddrMode &AM) const {
4083
4084 const DebugLoc &DL = MemI.getDebugLoc();
4085 MachineBasicBlock &MBB = *MemI.getParent();
4086 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
4087
4088 if (AM.Form == ExtAddrMode::Formula::Basic) {
4089 if (AM.ScaledReg) {
4090 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
4091 unsigned Opcode = regOffsetOpcode(Opcode: MemI.getOpcode());
4092 MRI.constrainRegClass(Reg: AM.BaseReg, RC: &AArch64::GPR64spRegClass);
4093 auto B = BuildMI(BB&: MBB, I&: MemI, MIMD: DL, MCID: get(Opcode))
4094 .addReg(RegNo: MemI.getOperand(i: 0).getReg(),
4095 Flags: getDefRegState(B: MemI.mayLoad()))
4096 .addReg(RegNo: AM.BaseReg)
4097 .addReg(RegNo: AM.ScaledReg)
4098 .addImm(Val: 0)
4099 .addImm(Val: AM.Scale > 1)
4100 .setMemRefs(MemI.memoperands())
4101 .setMIFlags(MemI.getFlags());
4102 return B.getInstr();
4103 }
4104
4105 assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
4106 "Addressing mode not supported for folding");
4107
4108 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
4109 unsigned Scale = 1;
4110 unsigned Opcode = MemI.getOpcode();
4111 if (isInt<9>(x: AM.Displacement))
4112 Opcode = unscaledOffsetOpcode(Opcode);
4113 else
4114 Opcode = scaledOffsetOpcode(Opcode, Scale);
4115
4116 auto B =
4117 BuildMI(BB&: MBB, I&: MemI, MIMD: DL, MCID: get(Opcode))
4118 .addReg(RegNo: MemI.getOperand(i: 0).getReg(), Flags: getDefRegState(B: MemI.mayLoad()))
4119 .addReg(RegNo: AM.BaseReg)
4120 .addImm(Val: AM.Displacement / Scale)
4121 .setMemRefs(MemI.memoperands())
4122 .setMIFlags(MemI.getFlags());
4123 return B.getInstr();
4124 }
4125
4126 if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
4127 AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
4128 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
4129 assert(AM.ScaledReg && !AM.Displacement &&
4130 "Address offset can be a register or an immediate, but not both");
4131 unsigned Opcode = offsetExtendOpcode(Opcode: MemI.getOpcode());
4132 MRI.constrainRegClass(Reg: AM.BaseReg, RC: &AArch64::GPR64spRegClass);
4133 // Make sure the offset register is in the correct register class.
4134 Register OffsetReg = AM.ScaledReg;
4135 const TargetRegisterClass *RC = MRI.getRegClass(Reg: OffsetReg);
4136 if (RC->hasSuperClassEq(RC: &AArch64::GPR64RegClass)) {
4137 OffsetReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
4138 BuildMI(BB&: MBB, I&: MemI, MIMD: DL, MCID: get(Opcode: TargetOpcode::COPY), DestReg: OffsetReg)
4139 .addReg(RegNo: AM.ScaledReg, Flags: {}, SubReg: AArch64::sub_32);
4140 }
4141 auto B =
4142 BuildMI(BB&: MBB, I&: MemI, MIMD: DL, MCID: get(Opcode))
4143 .addReg(RegNo: MemI.getOperand(i: 0).getReg(), Flags: getDefRegState(B: MemI.mayLoad()))
4144 .addReg(RegNo: AM.BaseReg)
4145 .addReg(RegNo: OffsetReg)
4146 .addImm(Val: AM.Form == ExtAddrMode::Formula::SExtScaledReg)
4147 .addImm(Val: AM.Scale != 1)
4148 .setMemRefs(MemI.memoperands())
4149 .setMIFlags(MemI.getFlags());
4150
4151 return B.getInstr();
4152 }
4153
4154 llvm_unreachable(
4155 "Function must not be called with an addressing mode it can't handle");
4156}
4157
/// Return true if the opcode is a post-index ld/st instruction, which really
/// accesses its memory operand at base+0 (the offset is written back to the
/// base register only after the access).
4160static bool isPostIndexLdStOpcode(unsigned Opcode) {
4161 switch (Opcode) {
4162 default:
4163 return false;
4164 case AArch64::LD1Fourv16b_POST:
4165 case AArch64::LD1Fourv1d_POST:
4166 case AArch64::LD1Fourv2d_POST:
4167 case AArch64::LD1Fourv2s_POST:
4168 case AArch64::LD1Fourv4h_POST:
4169 case AArch64::LD1Fourv4s_POST:
4170 case AArch64::LD1Fourv8b_POST:
4171 case AArch64::LD1Fourv8h_POST:
4172 case AArch64::LD1Onev16b_POST:
4173 case AArch64::LD1Onev1d_POST:
4174 case AArch64::LD1Onev2d_POST:
4175 case AArch64::LD1Onev2s_POST:
4176 case AArch64::LD1Onev4h_POST:
4177 case AArch64::LD1Onev4s_POST:
4178 case AArch64::LD1Onev8b_POST:
4179 case AArch64::LD1Onev8h_POST:
4180 case AArch64::LD1Rv16b_POST:
4181 case AArch64::LD1Rv1d_POST:
4182 case AArch64::LD1Rv2d_POST:
4183 case AArch64::LD1Rv2s_POST:
4184 case AArch64::LD1Rv4h_POST:
4185 case AArch64::LD1Rv4s_POST:
4186 case AArch64::LD1Rv8b_POST:
4187 case AArch64::LD1Rv8h_POST:
4188 case AArch64::LD1Threev16b_POST:
4189 case AArch64::LD1Threev1d_POST:
4190 case AArch64::LD1Threev2d_POST:
4191 case AArch64::LD1Threev2s_POST:
4192 case AArch64::LD1Threev4h_POST:
4193 case AArch64::LD1Threev4s_POST:
4194 case AArch64::LD1Threev8b_POST:
4195 case AArch64::LD1Threev8h_POST:
4196 case AArch64::LD1Twov16b_POST:
4197 case AArch64::LD1Twov1d_POST:
4198 case AArch64::LD1Twov2d_POST:
4199 case AArch64::LD1Twov2s_POST:
4200 case AArch64::LD1Twov4h_POST:
4201 case AArch64::LD1Twov4s_POST:
4202 case AArch64::LD1Twov8b_POST:
4203 case AArch64::LD1Twov8h_POST:
4204 case AArch64::LD1i16_POST:
4205 case AArch64::LD1i32_POST:
4206 case AArch64::LD1i64_POST:
4207 case AArch64::LD1i8_POST:
4208 case AArch64::LD2Rv16b_POST:
4209 case AArch64::LD2Rv1d_POST:
4210 case AArch64::LD2Rv2d_POST:
4211 case AArch64::LD2Rv2s_POST:
4212 case AArch64::LD2Rv4h_POST:
4213 case AArch64::LD2Rv4s_POST:
4214 case AArch64::LD2Rv8b_POST:
4215 case AArch64::LD2Rv8h_POST:
4216 case AArch64::LD2Twov16b_POST:
4217 case AArch64::LD2Twov2d_POST:
4218 case AArch64::LD2Twov2s_POST:
4219 case AArch64::LD2Twov4h_POST:
4220 case AArch64::LD2Twov4s_POST:
4221 case AArch64::LD2Twov8b_POST:
4222 case AArch64::LD2Twov8h_POST:
4223 case AArch64::LD2i16_POST:
4224 case AArch64::LD2i32_POST:
4225 case AArch64::LD2i64_POST:
4226 case AArch64::LD2i8_POST:
4227 case AArch64::LD3Rv16b_POST:
4228 case AArch64::LD3Rv1d_POST:
4229 case AArch64::LD3Rv2d_POST:
4230 case AArch64::LD3Rv2s_POST:
4231 case AArch64::LD3Rv4h_POST:
4232 case AArch64::LD3Rv4s_POST:
4233 case AArch64::LD3Rv8b_POST:
4234 case AArch64::LD3Rv8h_POST:
4235 case AArch64::LD3Threev16b_POST:
4236 case AArch64::LD3Threev2d_POST:
4237 case AArch64::LD3Threev2s_POST:
4238 case AArch64::LD3Threev4h_POST:
4239 case AArch64::LD3Threev4s_POST:
4240 case AArch64::LD3Threev8b_POST:
4241 case AArch64::LD3Threev8h_POST:
4242 case AArch64::LD3i16_POST:
4243 case AArch64::LD3i32_POST:
4244 case AArch64::LD3i64_POST:
4245 case AArch64::LD3i8_POST:
4246 case AArch64::LD4Fourv16b_POST:
4247 case AArch64::LD4Fourv2d_POST:
4248 case AArch64::LD4Fourv2s_POST:
4249 case AArch64::LD4Fourv4h_POST:
4250 case AArch64::LD4Fourv4s_POST:
4251 case AArch64::LD4Fourv8b_POST:
4252 case AArch64::LD4Fourv8h_POST:
4253 case AArch64::LD4Rv16b_POST:
4254 case AArch64::LD4Rv1d_POST:
4255 case AArch64::LD4Rv2d_POST:
4256 case AArch64::LD4Rv2s_POST:
4257 case AArch64::LD4Rv4h_POST:
4258 case AArch64::LD4Rv4s_POST:
4259 case AArch64::LD4Rv8b_POST:
4260 case AArch64::LD4Rv8h_POST:
4261 case AArch64::LD4i16_POST:
4262 case AArch64::LD4i32_POST:
4263 case AArch64::LD4i64_POST:
4264 case AArch64::LD4i8_POST:
4265 case AArch64::LDAPRWpost:
4266 case AArch64::LDAPRXpost:
4267 case AArch64::LDIAPPWpost:
4268 case AArch64::LDIAPPXpost:
4269 case AArch64::LDPDpost:
4270 case AArch64::LDPQpost:
4271 case AArch64::LDPSWpost:
4272 case AArch64::LDPSpost:
4273 case AArch64::LDPWpost:
4274 case AArch64::LDPXpost:
4275 case AArch64::LDRBBpost:
4276 case AArch64::LDRBpost:
4277 case AArch64::LDRDpost:
4278 case AArch64::LDRHHpost:
4279 case AArch64::LDRHpost:
4280 case AArch64::LDRQpost:
4281 case AArch64::LDRSBWpost:
4282 case AArch64::LDRSBXpost:
4283 case AArch64::LDRSHWpost:
4284 case AArch64::LDRSHXpost:
4285 case AArch64::LDRSWpost:
4286 case AArch64::LDRSpost:
4287 case AArch64::LDRWpost:
4288 case AArch64::LDRXpost:
4289 case AArch64::ST1Fourv16b_POST:
4290 case AArch64::ST1Fourv1d_POST:
4291 case AArch64::ST1Fourv2d_POST:
4292 case AArch64::ST1Fourv2s_POST:
4293 case AArch64::ST1Fourv4h_POST:
4294 case AArch64::ST1Fourv4s_POST:
4295 case AArch64::ST1Fourv8b_POST:
4296 case AArch64::ST1Fourv8h_POST:
4297 case AArch64::ST1Onev16b_POST:
4298 case AArch64::ST1Onev1d_POST:
4299 case AArch64::ST1Onev2d_POST:
4300 case AArch64::ST1Onev2s_POST:
4301 case AArch64::ST1Onev4h_POST:
4302 case AArch64::ST1Onev4s_POST:
4303 case AArch64::ST1Onev8b_POST:
4304 case AArch64::ST1Onev8h_POST:
4305 case AArch64::ST1Threev16b_POST:
4306 case AArch64::ST1Threev1d_POST:
4307 case AArch64::ST1Threev2d_POST:
4308 case AArch64::ST1Threev2s_POST:
4309 case AArch64::ST1Threev4h_POST:
4310 case AArch64::ST1Threev4s_POST:
4311 case AArch64::ST1Threev8b_POST:
4312 case AArch64::ST1Threev8h_POST:
4313 case AArch64::ST1Twov16b_POST:
4314 case AArch64::ST1Twov1d_POST:
4315 case AArch64::ST1Twov2d_POST:
4316 case AArch64::ST1Twov2s_POST:
4317 case AArch64::ST1Twov4h_POST:
4318 case AArch64::ST1Twov4s_POST:
4319 case AArch64::ST1Twov8b_POST:
4320 case AArch64::ST1Twov8h_POST:
4321 case AArch64::ST1i16_POST:
4322 case AArch64::ST1i32_POST:
4323 case AArch64::ST1i64_POST:
4324 case AArch64::ST1i8_POST:
4325 case AArch64::ST2GPostIndex:
4326 case AArch64::ST2Twov16b_POST:
4327 case AArch64::ST2Twov2d_POST:
4328 case AArch64::ST2Twov2s_POST:
4329 case AArch64::ST2Twov4h_POST:
4330 case AArch64::ST2Twov4s_POST:
4331 case AArch64::ST2Twov8b_POST:
4332 case AArch64::ST2Twov8h_POST:
4333 case AArch64::ST2i16_POST:
4334 case AArch64::ST2i32_POST:
4335 case AArch64::ST2i64_POST:
4336 case AArch64::ST2i8_POST:
4337 case AArch64::ST3Threev16b_POST:
4338 case AArch64::ST3Threev2d_POST:
4339 case AArch64::ST3Threev2s_POST:
4340 case AArch64::ST3Threev4h_POST:
4341 case AArch64::ST3Threev4s_POST:
4342 case AArch64::ST3Threev8b_POST:
4343 case AArch64::ST3Threev8h_POST:
4344 case AArch64::ST3i16_POST:
4345 case AArch64::ST3i32_POST:
4346 case AArch64::ST3i64_POST:
4347 case AArch64::ST3i8_POST:
4348 case AArch64::ST4Fourv16b_POST:
4349 case AArch64::ST4Fourv2d_POST:
4350 case AArch64::ST4Fourv2s_POST:
4351 case AArch64::ST4Fourv4h_POST:
4352 case AArch64::ST4Fourv4s_POST:
4353 case AArch64::ST4Fourv8b_POST:
4354 case AArch64::ST4Fourv8h_POST:
4355 case AArch64::ST4i16_POST:
4356 case AArch64::ST4i32_POST:
4357 case AArch64::ST4i64_POST:
4358 case AArch64::ST4i8_POST:
4359 case AArch64::STGPostIndex:
4360 case AArch64::STGPpost:
4361 case AArch64::STPDpost:
4362 case AArch64::STPQpost:
4363 case AArch64::STPSpost:
4364 case AArch64::STPWpost:
4365 case AArch64::STPXpost:
4366 case AArch64::STRBBpost:
4367 case AArch64::STRBpost:
4368 case AArch64::STRDpost:
4369 case AArch64::STRHHpost:
4370 case AArch64::STRHpost:
4371 case AArch64::STRQpost:
4372 case AArch64::STRSpost:
4373 case AArch64::STRWpost:
4374 case AArch64::STRXpost:
4375 case AArch64::STZ2GPostIndex:
4376 case AArch64::STZGPostIndex:
4377 return true;
4378 }
4379}
4380
4381bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
4382 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
4383 bool &OffsetIsScalable, TypeSize &Width,
4384 const TargetRegisterInfo *TRI) const {
4385 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4386 // Handle only loads/stores with base register followed by immediate offset.
4387 if (LdSt.getNumExplicitOperands() == 3) {
4388 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
4389 if ((!LdSt.getOperand(i: 1).isReg() && !LdSt.getOperand(i: 1).isFI()) ||
4390 !LdSt.getOperand(i: 2).isImm())
4391 return false;
4392 } else if (LdSt.getNumExplicitOperands() == 4) {
4393 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
4394 if (!LdSt.getOperand(i: 1).isReg() ||
4395 (!LdSt.getOperand(i: 2).isReg() && !LdSt.getOperand(i: 2).isFI()) ||
4396 !LdSt.getOperand(i: 3).isImm())
4397 return false;
4398 } else
4399 return false;
4400
  // Get the scaling factor for the instruction and set the width of the memory
  // access.
4403 TypeSize Scale(0U, false);
4404 int64_t Dummy1, Dummy2;
4405
4406 // If this returns false, then it's an instruction we don't want to handle.
4407 if (!getMemOpInfo(Opcode: LdSt.getOpcode(), Scale, Width, MinOffset&: Dummy1, MaxOffset&: Dummy2))
4408 return false;
4409
  // Compute the offset. The offset is the immediate operand multiplied by the
  // scaling factor; unscaled instructions have a scaling factor of 1.
  // Post-indexed instructions are a special case and report an offset of 0.
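  // For example, "ldr x1, [x0, #16]" (LDRXui) encodes an immediate of 2 with a
  // scale of 8, so the returned Offset is 16; "ldp x0, x1, [sp, #32]" (LDPXi)
  // encodes an immediate of 4 with the same scale, giving an Offset of 32.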
4413 if (isPostIndexLdStOpcode(Opcode: LdSt.getOpcode())) {
4414 BaseOp = &LdSt.getOperand(i: 2);
4415 Offset = 0;
4416 } else if (LdSt.getNumExplicitOperands() == 3) {
4417 BaseOp = &LdSt.getOperand(i: 1);
4418 Offset = LdSt.getOperand(i: 2).getImm() * Scale.getKnownMinValue();
4419 } else {
4420 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
4421 BaseOp = &LdSt.getOperand(i: 2);
4422 Offset = LdSt.getOperand(i: 3).getImm() * Scale.getKnownMinValue();
4423 }
4424 OffsetIsScalable = Scale.isScalable();
4425
4426 return BaseOp->isReg() || BaseOp->isFI();
4427}
4428
4429MachineOperand &
4430AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
4431 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4432 MachineOperand &OfsOp = LdSt.getOperand(i: LdSt.getNumExplicitOperands() - 1);
4433 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
4434 return OfsOp;
4435}
4436
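// Populate the immediate scale, the memory access width, and the legal range
// of the immediate operand for a load/store opcode. MinOffset and MaxOffset
// are in immediate units, not bytes: e.g. LDRXui scales its immediate by 8 and
// accepts 0..4095, covering byte offsets 0..32760.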
4437bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
4438 TypeSize &Width, int64_t &MinOffset,
4439 int64_t &MaxOffset) {
4440 switch (Opcode) {
4441 // Not a memory operation or something we want to handle.
4442 default:
4443 Scale = TypeSize::getFixed(ExactSize: 0);
4444 Width = TypeSize::getFixed(ExactSize: 0);
4445 MinOffset = MaxOffset = 0;
4446 return false;
4447 // LDR / STR
4448 case AArch64::LDRQui:
4449 case AArch64::STRQui:
4450 Scale = TypeSize::getFixed(ExactSize: 16);
4451 Width = TypeSize::getFixed(ExactSize: 16);
4452 MinOffset = 0;
4453 MaxOffset = 4095;
4454 break;
4455 case AArch64::LDRXui:
4456 case AArch64::LDRDui:
4457 case AArch64::STRXui:
4458 case AArch64::STRDui:
4459 case AArch64::PRFMui:
4460 Scale = TypeSize::getFixed(ExactSize: 8);
4461 Width = TypeSize::getFixed(ExactSize: 8);
4462 MinOffset = 0;
4463 MaxOffset = 4095;
4464 break;
4465 case AArch64::LDRWui:
4466 case AArch64::LDRSui:
4467 case AArch64::LDRSWui:
4468 case AArch64::STRWui:
4469 case AArch64::STRSui:
4470 Scale = TypeSize::getFixed(ExactSize: 4);
4471 Width = TypeSize::getFixed(ExactSize: 4);
4472 MinOffset = 0;
4473 MaxOffset = 4095;
4474 break;
4475 case AArch64::LDRHui:
4476 case AArch64::LDRHHui:
4477 case AArch64::LDRSHWui:
4478 case AArch64::LDRSHXui:
4479 case AArch64::STRHui:
4480 case AArch64::STRHHui:
4481 Scale = TypeSize::getFixed(ExactSize: 2);
4482 Width = TypeSize::getFixed(ExactSize: 2);
4483 MinOffset = 0;
4484 MaxOffset = 4095;
4485 break;
4486 case AArch64::LDRBui:
4487 case AArch64::LDRBBui:
4488 case AArch64::LDRSBWui:
4489 case AArch64::LDRSBXui:
4490 case AArch64::STRBui:
4491 case AArch64::STRBBui:
4492 Scale = TypeSize::getFixed(ExactSize: 1);
4493 Width = TypeSize::getFixed(ExactSize: 1);
4494 MinOffset = 0;
4495 MaxOffset = 4095;
4496 break;
4497 // post/pre inc
4498 case AArch64::STRQpre:
4499 case AArch64::LDRQpost:
4500 Scale = TypeSize::getFixed(ExactSize: 1);
4501 Width = TypeSize::getFixed(ExactSize: 16);
4502 MinOffset = -256;
4503 MaxOffset = 255;
4504 break;
4505 case AArch64::LDRDpost:
4506 case AArch64::LDRDpre:
4507 case AArch64::LDRXpost:
4508 case AArch64::LDRXpre:
4509 case AArch64::STRDpost:
4510 case AArch64::STRDpre:
4511 case AArch64::STRXpost:
4512 case AArch64::STRXpre:
4513 Scale = TypeSize::getFixed(ExactSize: 1);
4514 Width = TypeSize::getFixed(ExactSize: 8);
4515 MinOffset = -256;
4516 MaxOffset = 255;
4517 break;
4518 case AArch64::STRWpost:
4519 case AArch64::STRWpre:
4520 case AArch64::LDRWpost:
4521 case AArch64::LDRWpre:
4522 case AArch64::STRSpost:
4523 case AArch64::STRSpre:
4524 case AArch64::LDRSpost:
4525 case AArch64::LDRSpre:
4526 Scale = TypeSize::getFixed(ExactSize: 1);
4527 Width = TypeSize::getFixed(ExactSize: 4);
4528 MinOffset = -256;
4529 MaxOffset = 255;
4530 break;
4531 case AArch64::LDRHpost:
4532 case AArch64::LDRHpre:
4533 case AArch64::STRHpost:
4534 case AArch64::STRHpre:
4535 case AArch64::LDRHHpost:
4536 case AArch64::LDRHHpre:
4537 case AArch64::STRHHpost:
4538 case AArch64::STRHHpre:
4539 Scale = TypeSize::getFixed(ExactSize: 1);
4540 Width = TypeSize::getFixed(ExactSize: 2);
4541 MinOffset = -256;
4542 MaxOffset = 255;
4543 break;
4544 case AArch64::LDRBpost:
4545 case AArch64::LDRBpre:
4546 case AArch64::STRBpost:
4547 case AArch64::STRBpre:
4548 case AArch64::LDRBBpost:
4549 case AArch64::LDRBBpre:
4550 case AArch64::STRBBpost:
4551 case AArch64::STRBBpre:
4552 Scale = TypeSize::getFixed(ExactSize: 1);
4553 Width = TypeSize::getFixed(ExactSize: 1);
4554 MinOffset = -256;
4555 MaxOffset = 255;
4556 break;
4557 // Unscaled
4558 case AArch64::LDURQi:
4559 case AArch64::STURQi:
4560 Scale = TypeSize::getFixed(ExactSize: 1);
4561 Width = TypeSize::getFixed(ExactSize: 16);
4562 MinOffset = -256;
4563 MaxOffset = 255;
4564 break;
4565 case AArch64::LDURXi:
4566 case AArch64::LDURDi:
4567 case AArch64::LDAPURXi:
4568 case AArch64::STURXi:
4569 case AArch64::STURDi:
4570 case AArch64::STLURXi:
4571 case AArch64::PRFUMi:
4572 Scale = TypeSize::getFixed(ExactSize: 1);
4573 Width = TypeSize::getFixed(ExactSize: 8);
4574 MinOffset = -256;
4575 MaxOffset = 255;
4576 break;
4577 case AArch64::LDURWi:
4578 case AArch64::LDURSi:
4579 case AArch64::LDURSWi:
4580 case AArch64::LDAPURi:
4581 case AArch64::LDAPURSWi:
4582 case AArch64::STURWi:
4583 case AArch64::STURSi:
4584 case AArch64::STLURWi:
4585 Scale = TypeSize::getFixed(ExactSize: 1);
4586 Width = TypeSize::getFixed(ExactSize: 4);
4587 MinOffset = -256;
4588 MaxOffset = 255;
4589 break;
4590 case AArch64::LDURHi:
4591 case AArch64::LDURHHi:
4592 case AArch64::LDURSHXi:
4593 case AArch64::LDURSHWi:
4594 case AArch64::LDAPURHi:
4595 case AArch64::LDAPURSHWi:
4596 case AArch64::LDAPURSHXi:
4597 case AArch64::STURHi:
4598 case AArch64::STURHHi:
4599 case AArch64::STLURHi:
4600 Scale = TypeSize::getFixed(ExactSize: 1);
4601 Width = TypeSize::getFixed(ExactSize: 2);
4602 MinOffset = -256;
4603 MaxOffset = 255;
4604 break;
4605 case AArch64::LDURBi:
4606 case AArch64::LDURBBi:
4607 case AArch64::LDURSBXi:
4608 case AArch64::LDURSBWi:
4609 case AArch64::LDAPURBi:
4610 case AArch64::LDAPURSBWi:
4611 case AArch64::LDAPURSBXi:
4612 case AArch64::STURBi:
4613 case AArch64::STURBBi:
4614 case AArch64::STLURBi:
4615 Scale = TypeSize::getFixed(ExactSize: 1);
4616 Width = TypeSize::getFixed(ExactSize: 1);
4617 MinOffset = -256;
4618 MaxOffset = 255;
4619 break;
4620 // LDP / STP (including pre/post inc)
4621 case AArch64::LDPQi:
4622 case AArch64::LDNPQi:
4623 case AArch64::STPQi:
4624 case AArch64::STNPQi:
4625 case AArch64::LDPQpost:
4626 case AArch64::LDPQpre:
4627 case AArch64::STPQpost:
4628 case AArch64::STPQpre:
4629 Scale = TypeSize::getFixed(ExactSize: 16);
4630 Width = TypeSize::getFixed(ExactSize: 16 * 2);
4631 MinOffset = -64;
4632 MaxOffset = 63;
4633 break;
4634 case AArch64::LDPXi:
4635 case AArch64::LDPDi:
4636 case AArch64::LDNPXi:
4637 case AArch64::LDNPDi:
4638 case AArch64::STPXi:
4639 case AArch64::STPDi:
4640 case AArch64::STNPXi:
4641 case AArch64::STNPDi:
4642 case AArch64::LDPDpost:
4643 case AArch64::LDPDpre:
4644 case AArch64::LDPXpost:
4645 case AArch64::LDPXpre:
4646 case AArch64::STPDpost:
4647 case AArch64::STPDpre:
4648 case AArch64::STPXpost:
4649 case AArch64::STPXpre:
4650 Scale = TypeSize::getFixed(ExactSize: 8);
4651 Width = TypeSize::getFixed(ExactSize: 8 * 2);
4652 MinOffset = -64;
4653 MaxOffset = 63;
4654 break;
4655 case AArch64::LDPWi:
4656 case AArch64::LDPSi:
4657 case AArch64::LDNPWi:
4658 case AArch64::LDNPSi:
4659 case AArch64::STPWi:
4660 case AArch64::STPSi:
4661 case AArch64::STNPWi:
4662 case AArch64::STNPSi:
4663 case AArch64::LDPSpost:
4664 case AArch64::LDPSpre:
4665 case AArch64::LDPWpost:
4666 case AArch64::LDPWpre:
4667 case AArch64::STPSpost:
4668 case AArch64::STPSpre:
4669 case AArch64::STPWpost:
4670 case AArch64::STPWpre:
4671 Scale = TypeSize::getFixed(ExactSize: 4);
4672 Width = TypeSize::getFixed(ExactSize: 4 * 2);
4673 MinOffset = -64;
4674 MaxOffset = 63;
4675 break;
4676 case AArch64::StoreSwiftAsyncContext:
4677 // Store is an STRXui, but there might be an ADDXri in the expansion too.
4678 Scale = TypeSize::getFixed(ExactSize: 1);
4679 Width = TypeSize::getFixed(ExactSize: 8);
4680 MinOffset = 0;
4681 MaxOffset = 4095;
4682 break;
4683 case AArch64::ADDG:
4684 Scale = TypeSize::getFixed(ExactSize: 16);
4685 Width = TypeSize::getFixed(ExactSize: 0);
4686 MinOffset = 0;
4687 MaxOffset = 63;
4688 break;
4689 case AArch64::TAGPstack:
4690 Scale = TypeSize::getFixed(ExactSize: 16);
4691 Width = TypeSize::getFixed(ExactSize: 0);
    // TAGPstack with a negative offset turns into SUBG, which has a maximum
    // offset of 63 (not 64!).
4694 MinOffset = -63;
4695 MaxOffset = 63;
4696 break;
4697 case AArch64::LDG:
4698 case AArch64::STGi:
4699 case AArch64::STGPreIndex:
4700 case AArch64::STGPostIndex:
4701 case AArch64::STZGi:
4702 case AArch64::STZGPreIndex:
4703 case AArch64::STZGPostIndex:
4704 Scale = TypeSize::getFixed(ExactSize: 16);
4705 Width = TypeSize::getFixed(ExactSize: 16);
4706 MinOffset = -256;
4707 MaxOffset = 255;
4708 break;
4709 // SVE
4710 case AArch64::STR_ZZZZXI:
4711 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
4712 case AArch64::LDR_ZZZZXI:
4713 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
4714 Scale = TypeSize::getScalable(MinimumSize: 16);
4715 Width = TypeSize::getScalable(MinimumSize: 16 * 4);
4716 MinOffset = -256;
4717 MaxOffset = 252;
4718 break;
4719 case AArch64::STR_ZZZXI:
4720 case AArch64::LDR_ZZZXI:
4721 Scale = TypeSize::getScalable(MinimumSize: 16);
4722 Width = TypeSize::getScalable(MinimumSize: 16 * 3);
4723 MinOffset = -256;
4724 MaxOffset = 253;
4725 break;
4726 case AArch64::STR_ZZXI:
4727 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
4728 case AArch64::LDR_ZZXI:
4729 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
4730 Scale = TypeSize::getScalable(MinimumSize: 16);
4731 Width = TypeSize::getScalable(MinimumSize: 16 * 2);
4732 MinOffset = -256;
4733 MaxOffset = 254;
4734 break;
4735 case AArch64::LDR_PXI:
4736 case AArch64::STR_PXI:
4737 Scale = TypeSize::getScalable(MinimumSize: 2);
4738 Width = TypeSize::getScalable(MinimumSize: 2);
4739 MinOffset = -256;
4740 MaxOffset = 255;
4741 break;
4742 case AArch64::LDR_PPXI:
4743 case AArch64::STR_PPXI:
4744 Scale = TypeSize::getScalable(MinimumSize: 2);
4745 Width = TypeSize::getScalable(MinimumSize: 2 * 2);
4746 MinOffset = -256;
4747 MaxOffset = 254;
4748 break;
4749 case AArch64::LDR_ZXI:
4750 case AArch64::STR_ZXI:
4751 Scale = TypeSize::getScalable(MinimumSize: 16);
4752 Width = TypeSize::getScalable(MinimumSize: 16);
4753 MinOffset = -256;
4754 MaxOffset = 255;
4755 break;
4756 case AArch64::LD1B_IMM:
4757 case AArch64::LD1H_IMM:
4758 case AArch64::LD1W_IMM:
4759 case AArch64::LD1D_IMM:
4760 case AArch64::LDNT1B_ZRI:
4761 case AArch64::LDNT1H_ZRI:
4762 case AArch64::LDNT1W_ZRI:
4763 case AArch64::LDNT1D_ZRI:
4764 case AArch64::ST1B_IMM:
4765 case AArch64::ST1H_IMM:
4766 case AArch64::ST1W_IMM:
4767 case AArch64::ST1D_IMM:
4768 case AArch64::STNT1B_ZRI:
4769 case AArch64::STNT1H_ZRI:
4770 case AArch64::STNT1W_ZRI:
4771 case AArch64::STNT1D_ZRI:
4772 case AArch64::LDNF1B_IMM:
4773 case AArch64::LDNF1H_IMM:
4774 case AArch64::LDNF1W_IMM:
4775 case AArch64::LDNF1D_IMM:
    // A full vector's worth of data
    // Width = mbytes * elements
4778 Scale = TypeSize::getScalable(MinimumSize: 16);
4779 Width = TypeSize::getScalable(MinimumSize: 16);
4780 MinOffset = -8;
4781 MaxOffset = 7;
4782 break;
4783 case AArch64::LD2B_IMM:
4784 case AArch64::LD2H_IMM:
4785 case AArch64::LD2W_IMM:
4786 case AArch64::LD2D_IMM:
4787 case AArch64::ST2B_IMM:
4788 case AArch64::ST2H_IMM:
4789 case AArch64::ST2W_IMM:
4790 case AArch64::ST2D_IMM:
4791 Scale = TypeSize::getScalable(MinimumSize: 32);
4792 Width = TypeSize::getScalable(MinimumSize: 16 * 2);
4793 MinOffset = -8;
4794 MaxOffset = 7;
4795 break;
4796 case AArch64::LD3B_IMM:
4797 case AArch64::LD3H_IMM:
4798 case AArch64::LD3W_IMM:
4799 case AArch64::LD3D_IMM:
4800 case AArch64::ST3B_IMM:
4801 case AArch64::ST3H_IMM:
4802 case AArch64::ST3W_IMM:
4803 case AArch64::ST3D_IMM:
4804 Scale = TypeSize::getScalable(MinimumSize: 48);
4805 Width = TypeSize::getScalable(MinimumSize: 16 * 3);
4806 MinOffset = -8;
4807 MaxOffset = 7;
4808 break;
4809 case AArch64::LD4B_IMM:
4810 case AArch64::LD4H_IMM:
4811 case AArch64::LD4W_IMM:
4812 case AArch64::LD4D_IMM:
4813 case AArch64::ST4B_IMM:
4814 case AArch64::ST4H_IMM:
4815 case AArch64::ST4W_IMM:
4816 case AArch64::ST4D_IMM:
4817 Scale = TypeSize::getScalable(MinimumSize: 64);
4818 Width = TypeSize::getScalable(MinimumSize: 16 * 4);
4819 MinOffset = -8;
4820 MaxOffset = 7;
4821 break;
4822 case AArch64::LD1B_H_IMM:
4823 case AArch64::LD1SB_H_IMM:
4824 case AArch64::LD1H_S_IMM:
4825 case AArch64::LD1SH_S_IMM:
4826 case AArch64::LD1W_D_IMM:
4827 case AArch64::LD1SW_D_IMM:
4828 case AArch64::ST1B_H_IMM:
4829 case AArch64::ST1H_S_IMM:
4830 case AArch64::ST1W_D_IMM:
4831 case AArch64::LDNF1B_H_IMM:
4832 case AArch64::LDNF1SB_H_IMM:
4833 case AArch64::LDNF1H_S_IMM:
4834 case AArch64::LDNF1SH_S_IMM:
4835 case AArch64::LDNF1W_D_IMM:
4836 case AArch64::LDNF1SW_D_IMM:
    // A half vector's worth of data
    // Width = mbytes * elements
4839 Scale = TypeSize::getScalable(MinimumSize: 8);
4840 Width = TypeSize::getScalable(MinimumSize: 8);
4841 MinOffset = -8;
4842 MaxOffset = 7;
4843 break;
4844 case AArch64::LD1B_S_IMM:
4845 case AArch64::LD1SB_S_IMM:
4846 case AArch64::LD1H_D_IMM:
4847 case AArch64::LD1SH_D_IMM:
4848 case AArch64::ST1B_S_IMM:
4849 case AArch64::ST1H_D_IMM:
4850 case AArch64::LDNF1B_S_IMM:
4851 case AArch64::LDNF1SB_S_IMM:
4852 case AArch64::LDNF1H_D_IMM:
4853 case AArch64::LDNF1SH_D_IMM:
    // A quarter vector's worth of data
    // Width = mbytes * elements
4856 Scale = TypeSize::getScalable(MinimumSize: 4);
4857 Width = TypeSize::getScalable(MinimumSize: 4);
4858 MinOffset = -8;
4859 MaxOffset = 7;
4860 break;
4861 case AArch64::LD1B_D_IMM:
4862 case AArch64::LD1SB_D_IMM:
4863 case AArch64::ST1B_D_IMM:
4864 case AArch64::LDNF1B_D_IMM:
4865 case AArch64::LDNF1SB_D_IMM:
    // An eighth vector's worth of data
    // Width = mbytes * elements
4868 Scale = TypeSize::getScalable(MinimumSize: 2);
4869 Width = TypeSize::getScalable(MinimumSize: 2);
4870 MinOffset = -8;
4871 MaxOffset = 7;
4872 break;
4873 case AArch64::ST2Gi:
4874 case AArch64::ST2GPreIndex:
4875 case AArch64::ST2GPostIndex:
4876 case AArch64::STZ2Gi:
4877 case AArch64::STZ2GPreIndex:
4878 case AArch64::STZ2GPostIndex:
4879 Scale = TypeSize::getFixed(ExactSize: 16);
4880 Width = TypeSize::getFixed(ExactSize: 32);
4881 MinOffset = -256;
4882 MaxOffset = 255;
4883 break;
4884 case AArch64::STGPi:
4885 case AArch64::STGPpost:
4886 case AArch64::STGPpre:
4887 Scale = TypeSize::getFixed(ExactSize: 16);
4888 Width = TypeSize::getFixed(ExactSize: 16);
4889 MinOffset = -64;
4890 MaxOffset = 63;
4891 break;
4892 case AArch64::LD1RB_IMM:
4893 case AArch64::LD1RB_H_IMM:
4894 case AArch64::LD1RB_S_IMM:
4895 case AArch64::LD1RB_D_IMM:
4896 case AArch64::LD1RSB_H_IMM:
4897 case AArch64::LD1RSB_S_IMM:
4898 case AArch64::LD1RSB_D_IMM:
4899 Scale = TypeSize::getFixed(ExactSize: 1);
4900 Width = TypeSize::getFixed(ExactSize: 1);
4901 MinOffset = 0;
4902 MaxOffset = 63;
4903 break;
4904 case AArch64::LD1RH_IMM:
4905 case AArch64::LD1RH_S_IMM:
4906 case AArch64::LD1RH_D_IMM:
4907 case AArch64::LD1RSH_S_IMM:
4908 case AArch64::LD1RSH_D_IMM:
4909 Scale = TypeSize::getFixed(ExactSize: 2);
4910 Width = TypeSize::getFixed(ExactSize: 2);
4911 MinOffset = 0;
4912 MaxOffset = 63;
4913 break;
4914 case AArch64::LD1RW_IMM:
4915 case AArch64::LD1RW_D_IMM:
4916 case AArch64::LD1RSW_IMM:
4917 Scale = TypeSize::getFixed(ExactSize: 4);
4918 Width = TypeSize::getFixed(ExactSize: 4);
4919 MinOffset = 0;
4920 MaxOffset = 63;
4921 break;
4922 case AArch64::LD1RD_IMM:
4923 Scale = TypeSize::getFixed(ExactSize: 8);
4924 Width = TypeSize::getFixed(ExactSize: 8);
4925 MinOffset = 0;
4926 MaxOffset = 63;
4927 break;
4928 }
4929
4930 return true;
4931}
4932
// Return the memory access size in bytes, which is also the scaling factor for
// an unscaled load or store.
4934int AArch64InstrInfo::getMemScale(unsigned Opc) {
4935 switch (Opc) {
4936 default:
4937 llvm_unreachable("Opcode has unknown scale!");
4938 case AArch64::LDRBBui:
4939 case AArch64::LDURBBi:
4940 case AArch64::LDRSBWui:
4941 case AArch64::LDURSBWi:
4942 case AArch64::STRBBui:
4943 case AArch64::STURBBi:
4944 return 1;
4945 case AArch64::LDRHHui:
4946 case AArch64::LDURHHi:
4947 case AArch64::LDRSHWui:
4948 case AArch64::LDURSHWi:
4949 case AArch64::STRHHui:
4950 case AArch64::STURHHi:
4951 return 2;
4952 case AArch64::LDRSui:
4953 case AArch64::LDURSi:
4954 case AArch64::LDRSpre:
4955 case AArch64::LDRSWui:
4956 case AArch64::LDURSWi:
4957 case AArch64::LDRSWpre:
4958 case AArch64::LDRWpre:
4959 case AArch64::LDRWui:
4960 case AArch64::LDURWi:
4961 case AArch64::STRSui:
4962 case AArch64::STURSi:
4963 case AArch64::STRSpre:
4964 case AArch64::STRWui:
4965 case AArch64::STURWi:
4966 case AArch64::STRWpre:
4967 case AArch64::LDPSi:
4968 case AArch64::LDPSWi:
4969 case AArch64::LDPWi:
4970 case AArch64::STPSi:
4971 case AArch64::STPWi:
4972 return 4;
4973 case AArch64::LDRDui:
4974 case AArch64::LDURDi:
4975 case AArch64::LDRDpre:
4976 case AArch64::LDRXui:
4977 case AArch64::LDURXi:
4978 case AArch64::LDRXpre:
4979 case AArch64::STRDui:
4980 case AArch64::STURDi:
4981 case AArch64::STRDpre:
4982 case AArch64::STRXui:
4983 case AArch64::STURXi:
4984 case AArch64::STRXpre:
4985 case AArch64::LDPDi:
4986 case AArch64::LDPXi:
4987 case AArch64::STPDi:
4988 case AArch64::STPXi:
4989 return 8;
4990 case AArch64::LDRQui:
4991 case AArch64::LDURQi:
4992 case AArch64::STRQui:
4993 case AArch64::STURQi:
4994 case AArch64::STRQpre:
4995 case AArch64::LDPQi:
4996 case AArch64::LDRQpre:
4997 case AArch64::STPQi:
4998 case AArch64::STGi:
4999 case AArch64::STZGi:
5000 case AArch64::ST2Gi:
5001 case AArch64::STZ2Gi:
5002 case AArch64::STGPi:
5003 return 16;
5004 }
5005}
5006
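// Return true if this is a pre-indexed (write-back) load such as
//   ldr x0, [x1, #16]!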
5007bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
5008 switch (MI.getOpcode()) {
5009 default:
5010 return false;
5011 case AArch64::LDRWpre:
5012 case AArch64::LDRXpre:
5013 case AArch64::LDRSWpre:
5014 case AArch64::LDRSpre:
5015 case AArch64::LDRDpre:
5016 case AArch64::LDRQpre:
5017 return true;
5018 }
5019}
5020
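// Return true if this is a pre-indexed (write-back) store such as
//   str x0, [x1, #-16]!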
5021bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
5022 switch (MI.getOpcode()) {
5023 default:
5024 return false;
5025 case AArch64::STRWpre:
5026 case AArch64::STRXpre:
5027 case AArch64::STRSpre:
5028 case AArch64::STRDpre:
5029 case AArch64::STRQpre:
5030 return true;
5031 }
5032}
5033
5034bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
5035 return isPreLd(MI) || isPreSt(MI);
5036}
5037
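// Return true for paired load/store instructions such as
//   ldp x0, x1, [sp, #16]
// including STGP, which stores a register pair together with allocation tags.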
5038bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
5039 switch (MI.getOpcode()) {
5040 default:
5041 return false;
5042 case AArch64::LDPSi:
5043 case AArch64::LDPSWi:
5044 case AArch64::LDPDi:
5045 case AArch64::LDPQi:
5046 case AArch64::LDPWi:
5047 case AArch64::LDPXi:
5048 case AArch64::STPSi:
5049 case AArch64::STPDi:
5050 case AArch64::STPQi:
5051 case AArch64::STPWi:
5052 case AArch64::STPXi:
5053 case AArch64::STGPi:
5054 return true;
5055 }
5056}
5057
5058const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
5059 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
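  // Paired forms carry a second data register and pre-indexed forms a
  // write-back def ahead of the base register, so the base moves from
  // operand 1 to operand 2 (and the offset from operand 2 to 3).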
5060 unsigned Idx =
5061 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
5062 : 1;
5063 return MI.getOperand(i: Idx);
5064}
5065
5066const MachineOperand &
5067AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
5068 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5069 unsigned Idx =
5070 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
5071 : 2;
5072 return MI.getOperand(i: Idx);
5073}
5074
5075const MachineOperand &
5076AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
5077 switch (MI.getOpcode()) {
5078 default:
5079 llvm_unreachable("Unexpected opcode");
5080 case AArch64::LDRBroX:
5081 case AArch64::LDRBBroX:
5082 case AArch64::LDRSBXroX:
5083 case AArch64::LDRSBWroX:
5084 case AArch64::LDRHroX:
5085 case AArch64::LDRHHroX:
5086 case AArch64::LDRSHXroX:
5087 case AArch64::LDRSHWroX:
5088 case AArch64::LDRWroX:
5089 case AArch64::LDRSroX:
5090 case AArch64::LDRSWroX:
5091 case AArch64::LDRDroX:
5092 case AArch64::LDRXroX:
5093 case AArch64::LDRQroX:
5094 return MI.getOperand(i: 4);
5095 }
5096}
5097
5098static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
5099 Register Reg) {
5100 if (MI.getParent() == nullptr)
5101 return nullptr;
5102 const MachineFunction *MF = MI.getParent()->getParent();
5103 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
5104}
5105
5106bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
5107 auto IsHFPR = [&](const MachineOperand &Op) {
5108 if (!Op.isReg())
5109 return false;
5110 auto Reg = Op.getReg();
5111 if (Reg.isPhysical())
5112 return AArch64::FPR16RegClass.contains(Reg);
5113 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5114 return TRC == &AArch64::FPR16RegClass ||
5115 TRC == &AArch64::FPR16_loRegClass;
5116 };
5117 return llvm::any_of(Range: MI.operands(), P: IsHFPR);
5118}
5119
5120bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
5121 auto IsQFPR = [&](const MachineOperand &Op) {
5122 if (!Op.isReg())
5123 return false;
5124 auto Reg = Op.getReg();
5125 if (Reg.isPhysical())
5126 return AArch64::FPR128RegClass.contains(Reg);
5127 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5128 return TRC == &AArch64::FPR128RegClass ||
5129 TRC == &AArch64::FPR128_loRegClass;
5130 };
5131 return llvm::any_of(Range: MI.operands(), P: IsQFPR);
5132}
5133
5134bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) {
5135 switch (MI.getOpcode()) {
5136 case AArch64::BRK:
5137 case AArch64::HLT:
5138 case AArch64::PACIASP:
5139 case AArch64::PACIBSP:
5140 // Implicit BTI behavior.
5141 return true;
5142 case AArch64::PAUTH_PROLOGUE:
5143 // PAUTH_PROLOGUE expands to PACI(A|B)SP.
5144 return true;
5145 case AArch64::HINT: {
5146 unsigned Imm = MI.getOperand(i: 0).getImm();
5147 // Explicit BTI instruction.
5148 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
5149 return true;
5150 // PACI(A|B)SP instructions.
5151 if (Imm == 25 || Imm == 27)
5152 return true;
5153 return false;
5154 }
5155 default:
5156 return false;
5157 }
5158}
5159
5160bool AArch64InstrInfo::isFpOrNEON(Register Reg) {
5161 if (Reg == 0)
5162 return false;
5163 assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
5164 return AArch64::FPR128RegClass.contains(Reg) ||
5165 AArch64::FPR64RegClass.contains(Reg) ||
5166 AArch64::FPR32RegClass.contains(Reg) ||
5167 AArch64::FPR16RegClass.contains(Reg) ||
5168 AArch64::FPR8RegClass.contains(Reg);
5169}
5170
5171bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
5172 auto IsFPR = [&](const MachineOperand &Op) {
5173 if (!Op.isReg())
5174 return false;
5175 auto Reg = Op.getReg();
5176 if (Reg.isPhysical())
5177 return isFpOrNEON(Reg);
5178
5179 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5180 return TRC == &AArch64::FPR128RegClass ||
5181 TRC == &AArch64::FPR128_loRegClass ||
5182 TRC == &AArch64::FPR64RegClass ||
5183 TRC == &AArch64::FPR64_loRegClass ||
5184 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
5185 TRC == &AArch64::FPR8RegClass;
5186 };
5187 return llvm::any_of(Range: MI.operands(), P: IsFPR);
5188}
5189
5190// Scale the unscaled offset. Returns false if the unscaled offset can't be
5191// scaled.
5192static bool scaleOffset(unsigned Opc, int64_t &Offset) {
5193 int Scale = AArch64InstrInfo::getMemScale(Opc);
5194
5195 // If the byte-offset isn't a multiple of the stride, we can't scale this
5196 // offset.
5197 if (Offset % Scale != 0)
5198 return false;
5199
5200 // Convert the byte-offset used by unscaled into an "element" offset used
5201 // by the scaled pair load/store instructions.
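  // For example, with a scale of 8 (LDURXi/STURXi) a byte offset of 24 becomes
  // element offset 3, while a byte offset of 20 was already rejected above.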
5202 Offset /= Scale;
5203 return true;
5204}
5205
5206static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
5207 if (FirstOpc == SecondOpc)
5208 return true;
5209 // We can also pair sign-ext and zero-ext instructions.
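  // Mixed-extension 32-bit loads (LDRWui/LDRSWui and their unscaled forms) are
  // still clustered so that a later pairing pass may combine them.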
5210 switch (FirstOpc) {
5211 default:
5212 return false;
5213 case AArch64::STRSui:
5214 case AArch64::STURSi:
5215 return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
5216 case AArch64::STRDui:
5217 case AArch64::STURDi:
5218 return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
5219 case AArch64::STRQui:
5220 case AArch64::STURQi:
5221 return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
5222 case AArch64::STRWui:
5223 case AArch64::STURWi:
5224 return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
5225 case AArch64::STRXui:
5226 case AArch64::STURXi:
5227 return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
5228 case AArch64::LDRSui:
5229 case AArch64::LDURSi:
5230 return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
5231 case AArch64::LDRDui:
5232 case AArch64::LDURDi:
5233 return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
5234 case AArch64::LDRQui:
5235 case AArch64::LDURQi:
5236 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
5237 case AArch64::LDRWui:
5238 case AArch64::LDURWi:
5239 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
5240 case AArch64::LDRSWui:
5241 case AArch64::LDURSWi:
5242 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
5243 case AArch64::LDRXui:
5244 case AArch64::LDURXi:
5245 return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
5246 }
5247 // These instructions can't be paired based on their opcodes.
5248 return false;
5249}
5250
5251static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
5252 int64_t Offset1, unsigned Opcode1, int FI2,
5253 int64_t Offset2, unsigned Opcode2) {
5254 // Accesses through fixed stack object frame indices may access a different
5255 // fixed stack slot. Check that the object offsets + offsets match.
5256 if (MFI.isFixedObjectIndex(ObjectIdx: FI1) && MFI.isFixedObjectIndex(ObjectIdx: FI2)) {
5257 int64_t ObjectOffset1 = MFI.getObjectOffset(ObjectIdx: FI1);
5258 int64_t ObjectOffset2 = MFI.getObjectOffset(ObjectIdx: FI2);
5259 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
5260 // Convert to scaled object offsets.
5261 int Scale1 = AArch64InstrInfo::getMemScale(Opc: Opcode1);
5262 if (ObjectOffset1 % Scale1 != 0)
5263 return false;
5264 ObjectOffset1 /= Scale1;
5265 int Scale2 = AArch64InstrInfo::getMemScale(Opc: Opcode2);
5266 if (ObjectOffset2 % Scale2 != 0)
5267 return false;
5268 ObjectOffset2 /= Scale2;
5269 ObjectOffset1 += Offset1;
5270 ObjectOffset2 += Offset2;
5271 return ObjectOffset1 + 1 == ObjectOffset2;
5272 }
5273
5274 return FI1 == FI2;
5275}
5276
5277/// Detect opportunities for ldp/stp formation.
5278///
5279/// Only called for LdSt for which getMemOperandWithOffset returns true.
5280bool AArch64InstrInfo::shouldClusterMemOps(
5281 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
5282 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
5283 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
5284 unsigned NumBytes) const {
5285 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
5286 const MachineOperand &BaseOp1 = *BaseOps1.front();
5287 const MachineOperand &BaseOp2 = *BaseOps2.front();
5288 const MachineInstr &FirstLdSt = *BaseOp1.getParent();
5289 const MachineInstr &SecondLdSt = *BaseOp2.getParent();
5290 if (BaseOp1.getType() != BaseOp2.getType())
5291 return false;
5292
5293 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
5294 "Only base registers and frame indices are supported.");
5295
5296  // Register bases must match; frame-index bases are checked further below.
5297 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
5298 return false;
5299
5300 // Only cluster up to a single pair.
5301 if (ClusterSize > 2)
5302 return false;
5303
5304 if (!isPairableLdStInst(MI: FirstLdSt) || !isPairableLdStInst(MI: SecondLdSt))
5305 return false;
5306
5307 // Can we pair these instructions based on their opcodes?
5308 unsigned FirstOpc = FirstLdSt.getOpcode();
5309 unsigned SecondOpc = SecondLdSt.getOpcode();
5310 if (!canPairLdStOpc(FirstOpc, SecondOpc))
5311 return false;
5312
5313 // Can't merge volatiles or load/stores that have a hint to avoid pair
5314 // formation, for example.
5315 if (!isCandidateToMergeOrPair(MI: FirstLdSt) ||
5316 !isCandidateToMergeOrPair(MI: SecondLdSt))
5317 return false;
5318
5319 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
5320 int64_t Offset1 = FirstLdSt.getOperand(i: 2).getImm();
5321 if (hasUnscaledLdStOffset(Opc: FirstOpc) && !scaleOffset(Opc: FirstOpc, Offset&: Offset1))
5322 return false;
5323
5324 int64_t Offset2 = SecondLdSt.getOperand(i: 2).getImm();
5325 if (hasUnscaledLdStOffset(Opc: SecondOpc) && !scaleOffset(Opc: SecondOpc, Offset&: Offset2))
5326 return false;
5327
5328 // Pairwise instructions have a 7-bit signed offset field.
5329 if (Offset1 > 63 || Offset1 < -64)
5330 return false;
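  // The pair encodes only the first element's offset; the second access is
  // implicitly the next element, so Offset2 needs no separate range check.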
5331
5332  // The caller should already have ordered First/SecondLdSt by offset.
5333  // Note: this does not hold when the bases are different frame indices.
5334 if (BaseOp1.isFI()) {
5335 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
5336 "Caller should have ordered offsets.");
5337
5338 const MachineFrameInfo &MFI =
5339 FirstLdSt.getParent()->getParent()->getFrameInfo();
5340 return shouldClusterFI(MFI, FI1: BaseOp1.getIndex(), Offset1, Opcode1: FirstOpc,
5341 FI2: BaseOp2.getIndex(), Offset2, Opcode2: SecondOpc);
5342 }
5343
5344 assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
5345
5346 return Offset1 + 1 == Offset2;
5347}
5348
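// Helper that adds (a sub-register of) Reg to MIB. Physical registers are
// resolved to the concrete sub-register here; virtual registers keep the
// sub-register index on the operand instead.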
5349static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
5350 MCRegister Reg, unsigned SubIdx,
5351 RegState State,
5352 const TargetRegisterInfo *TRI) {
5353 if (!SubIdx)
5354 return MIB.addReg(RegNo: Reg, Flags: State);
5355
5356 if (Reg.isPhysical())
5357 return MIB.addReg(RegNo: TRI->getSubReg(Reg, Idx: SubIdx), Flags: State);
5358 return MIB.addReg(RegNo: Reg, Flags: State, SubReg: SubIdx);
5359}
5360
5361static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
5362 unsigned NumRegs) {
5363  // We really want the positive remainder mod 32 here; that happens to be
5364  // easily obtainable with a mask.
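  // For example, copying {D1,D2,D3} to {D2,D3,D4} gives (2 - 1) & 0x1f == 1,
  // which is < 3, so a forward sub-register copy would overwrite D2 and D3
  // before they are read and the copy must be emitted in reverse.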
5365 return ((DestReg - SrcReg) & 0x1f) < NumRegs;
5366}
5367
5368void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
5369 MachineBasicBlock::iterator I,
5370 const DebugLoc &DL, MCRegister DestReg,
5371 MCRegister SrcReg, bool KillSrc,
5372 unsigned Opcode,
5373 ArrayRef<unsigned> Indices) const {
5374 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
5375 const TargetRegisterInfo *TRI = &getRegisterInfo();
5376 uint16_t DestEncoding = TRI->getEncodingValue(Reg: DestReg);
5377 uint16_t SrcEncoding = TRI->getEncodingValue(Reg: SrcReg);
5378 unsigned NumRegs = Indices.size();
5379
5380 int SubReg = 0, End = NumRegs, Incr = 1;
5381 if (forwardCopyWillClobberTuple(DestReg: DestEncoding, SrcReg: SrcEncoding, NumRegs)) {
5382 SubReg = NumRegs - 1;
5383 End = -1;
5384 Incr = -1;
5385 }
5386
5387 for (; SubReg != End; SubReg += Incr) {
5388 const MachineInstrBuilder MIB = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode));
5389 AddSubReg(MIB, Reg: DestReg, SubIdx: Indices[SubReg], State: RegState::Define, TRI);
5390 AddSubReg(MIB, Reg: SrcReg, SubIdx: Indices[SubReg], State: {}, TRI);
5391 AddSubReg(MIB, Reg: SrcReg, SubIdx: Indices[SubReg], State: getKillRegState(B: KillSrc), TRI);
5392 }
5393}
5394
5395void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
5396 MachineBasicBlock::iterator I,
5397 const DebugLoc &DL, MCRegister DestReg,
5398 MCRegister SrcReg, bool KillSrc,
5399 unsigned Opcode, unsigned ZeroReg,
5400 llvm::ArrayRef<unsigned> Indices) const {
5401 const TargetRegisterInfo *TRI = &getRegisterInfo();
5402 unsigned NumRegs = Indices.size();
5403
5404#ifndef NDEBUG
5405 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
5406 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
5407 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
5408 "GPR reg sequences should not be able to overlap");
5409#endif
5410
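  // Emit one "ORR dst, zero, src, lsl #0" per sub-register. The assertion
  // above guarantees the source and destination tuples are either identical
  // or disjoint, so a simple forward walk cannot clobber unread sources.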
5411 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
5412 const MachineInstrBuilder MIB = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode));
5413 AddSubReg(MIB, Reg: DestReg, SubIdx: Indices[SubReg], State: RegState::Define, TRI);
5414 MIB.addReg(RegNo: ZeroReg);
5415 AddSubReg(MIB, Reg: SrcReg, SubIdx: Indices[SubReg], State: getKillRegState(B: KillSrc), TRI);
5416 MIB.addImm(Val: 0);
5417 }
5418}
5419
5420/// Returns true if the instruction at I is within a call-site streaming
5421/// region, searching only within a single basic block.
5422/// A "call-site streaming region" starts after an smstart and ends at the
5423/// smstop around a call to a streaming function. This walks backwards from I.
5424static bool isInStreamingCallSiteRegion(MachineBasicBlock &MBB,
5425 MachineBasicBlock::iterator I) {
5426 MachineFunction &MF = *MBB.getParent();
5427 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
5428 if (!AFI->hasStreamingModeChanges())
5429 return false;
5430 // Walk backwards to find smstart/smstop
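  // The first SM-affecting MSR found while walking backwards decides the
  // answer: an smstart means I is inside the call-site region, an smstop (or
  // finding none in this block) means it is not.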
5431 for (MachineInstr &MI : reverse(C: make_range(x: MBB.begin(), y: I))) {
5432 unsigned Opc = MI.getOpcode();
5433 if (Opc == AArch64::MSRpstatesvcrImm1 || Opc == AArch64::MSRpstatePseudo) {
5434 // Check if this is SM change (not ZA)
5435 int64_t PState = MI.getOperand(i: 0).getImm();
5436 if (PState == AArch64SVCR::SVCRSM || PState == AArch64SVCR::SVCRSMZA) {
5437 // Operand 1 is 1 for start, 0 for stop
5438 return MI.getOperand(i: 1).getImm() == 1;
5439 }
5440 }
5441 }
5442 return false;
5443}
5444
5445/// Returns true if in a streaming call site region without SME-FA64.
5446static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget,
5447 MachineBasicBlock &MBB,
5448 MachineBasicBlock::iterator I) {
5449 return !Subtarget.hasSMEFA64() && isInStreamingCallSiteRegion(MBB, I);
5450}
5451
5452void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
5453 MachineBasicBlock::iterator I,
5454 const DebugLoc &DL, Register DestReg,
5455 Register SrcReg, bool KillSrc,
5456 bool RenamableDest,
5457 bool RenamableSrc) const {
5458 ++NumCopyInstrs;
5459 if (AArch64::GPR32spRegClass.contains(Reg: DestReg) &&
5460 AArch64::GPR32spRegClass.contains(Reg: SrcReg)) {
5461 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
5462 // If either operand is WSP, expand to ADD #0.
5463 if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5464 !Subtarget.hasZeroCycleRegMoveGPR32()) {
5465 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
5466 MCRegister DestRegX = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::sub_32,
5467 RC: &AArch64::GPR64spRegClass);
5468 MCRegister SrcRegX = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::sub_32,
5469 RC: &AArch64::GPR64spRegClass);
5470 // This instruction is reading and writing X registers. This may upset
5471 // the register scavenger and machine verifier, so we need to indicate
5472 // that we are reading an undefined value from SrcRegX, but a proper
5473 // value from SrcReg.
5474 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ADDXri), DestReg: DestRegX)
5475 .addReg(RegNo: SrcRegX, Flags: RegState::Undef)
5476 .addImm(Val: 0)
5477 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0))
5478 .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
5479 ++NumZCRegMoveInstrsGPR;
5480 } else {
5481 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ADDWri), DestReg)
5482 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc))
5483 .addImm(Val: 0)
5484 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0));
5485 if (Subtarget.hasZeroCycleRegMoveGPR32())
5486 ++NumZCRegMoveInstrsGPR;
5487 }
5488 } else if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5489 !Subtarget.hasZeroCycleRegMoveGPR32()) {
5490 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
5491 MCRegister DestRegX = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::sub_32,
5492 RC: &AArch64::GPR64spRegClass);
5493 assert(DestRegX.isValid() && "Destination super-reg not valid");
5494 MCRegister SrcRegX = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::sub_32,
5495 RC: &AArch64::GPR64spRegClass);
5496 assert(SrcRegX.isValid() && "Source super-reg not valid");
5497 // This instruction is reading and writing X registers. This may upset
5498 // the register scavenger and machine verifier, so we need to indicate
5499 // that we are reading an undefined value from SrcRegX, but a proper
5500 // value from SrcReg.
5501 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRXrr), DestReg: DestRegX)
5502 .addReg(RegNo: AArch64::XZR)
5503 .addReg(RegNo: SrcRegX, Flags: RegState::Undef)
5504 .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
5505 ++NumZCRegMoveInstrsGPR;
5506 } else {
5507 // Otherwise, expand to ORR WZR.
5508 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRWrr), DestReg)
5509 .addReg(RegNo: AArch64::WZR)
5510 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5511 if (Subtarget.hasZeroCycleRegMoveGPR32())
5512 ++NumZCRegMoveInstrsGPR;
5513 }
5514 return;
5515 }
5516
5517 // GPR32 zeroing
5518 if (AArch64::GPR32spRegClass.contains(Reg: DestReg) && SrcReg == AArch64::WZR) {
5519 if (Subtarget.hasZeroCycleZeroingGPR64() &&
5520 !Subtarget.hasZeroCycleZeroingGPR32()) {
5521 MCRegister DestRegX = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::sub_32,
5522 RC: &AArch64::GPR64spRegClass);
5523 assert(DestRegX.isValid() && "Destination super-reg not valid");
5524 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::MOVZXi), DestReg: DestRegX)
5525 .addImm(Val: 0)
5526 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0));
5527 ++NumZCZeroingInstrsGPR;
5528 } else if (Subtarget.hasZeroCycleZeroingGPR32()) {
5529 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::MOVZWi), DestReg)
5530 .addImm(Val: 0)
5531 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0));
5532 ++NumZCZeroingInstrsGPR;
5533 } else {
5534 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRWrr), DestReg)
5535 .addReg(RegNo: AArch64::WZR)
5536 .addReg(RegNo: AArch64::WZR);
5537 }
5538 return;
5539 }
5540
5541 if (AArch64::GPR64spRegClass.contains(Reg: DestReg) &&
5542 AArch64::GPR64spRegClass.contains(Reg: SrcReg)) {
5543 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
5544 // If either operand is SP, expand to ADD #0.
5545 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ADDXri), DestReg)
5546 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc))
5547 .addImm(Val: 0)
5548 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0));
5549 if (Subtarget.hasZeroCycleRegMoveGPR64())
5550 ++NumZCRegMoveInstrsGPR;
5551 } else {
5552 // Otherwise, expand to ORR XZR.
5553 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRXrr), DestReg)
5554 .addReg(RegNo: AArch64::XZR)
5555 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5556 if (Subtarget.hasZeroCycleRegMoveGPR64())
5557 ++NumZCRegMoveInstrsGPR;
5558 }
5559 return;
5560 }
5561
5562 // GPR64 zeroing
5563 if (AArch64::GPR64spRegClass.contains(Reg: DestReg) && SrcReg == AArch64::XZR) {
5564 if (Subtarget.hasZeroCycleZeroingGPR64()) {
5565 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::MOVZXi), DestReg)
5566 .addImm(Val: 0)
5567 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0));
5568 ++NumZCZeroingInstrsGPR;
5569 } else {
5570 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRXrr), DestReg)
5571 .addReg(RegNo: AArch64::XZR)
5572 .addReg(RegNo: AArch64::XZR);
5573 }
5574 return;
5575 }
5576
5577 // Copy a Predicate register by ORRing with itself.
5578 if (AArch64::PPRRegClass.contains(Reg: DestReg) &&
5579 AArch64::PPRRegClass.contains(Reg: SrcReg)) {
5580 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5581 "Unexpected SVE register.");
5582 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORR_PPzPP), DestReg)
5583 .addReg(RegNo: SrcReg) // Pg
5584 .addReg(RegNo: SrcReg)
5585 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5586 return;
5587 }
5588
5589 // Copy a predicate-as-counter register by ORRing with itself as if it
5590 // were a regular predicate (mask) register.
5591 bool DestIsPNR = AArch64::PNRRegClass.contains(Reg: DestReg);
5592 bool SrcIsPNR = AArch64::PNRRegClass.contains(Reg: SrcReg);
5593 if (DestIsPNR || SrcIsPNR) {
5594 auto ToPPR = [](MCRegister R) -> MCRegister {
5595 return (R - AArch64::PN0) + AArch64::P0;
5596 };
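    // PN0..PN15 are the predicate-as-counter views of P0..P15, so the copy is
    // performed on the underlying P registers; an implicit-def of the PNR
    // destination is added below so liveness of the counter view is preserved.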
5597 MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg.asMCReg();
5598 MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg.asMCReg();
5599
5600 if (PPRSrcReg != PPRDestReg) {
5601 auto NewMI = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORR_PPzPP), DestReg: PPRDestReg)
5602 .addReg(RegNo: PPRSrcReg) // Pg
5603 .addReg(RegNo: PPRSrcReg)
5604 .addReg(RegNo: PPRSrcReg, Flags: getKillRegState(B: KillSrc));
5605 if (DestIsPNR)
5606 NewMI.addDef(RegNo: DestReg, Flags: RegState::Implicit);
5607 }
5608 return;
5609 }
5610
5611 // Copy a Z register by ORRing with itself.
5612 if (AArch64::ZPRRegClass.contains(Reg: DestReg) &&
5613 AArch64::ZPRRegClass.contains(Reg: SrcReg)) {
5614 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5615 "Unexpected SVE register.");
5616 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORR_ZZZ), DestReg)
5617 .addReg(RegNo: SrcReg)
5618 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5619 return;
5620 }
5621
5622 // Copy a Z register pair by copying the individual sub-registers.
5623 if ((AArch64::ZPR2RegClass.contains(Reg: DestReg) ||
5624 AArch64::ZPR2StridedOrContiguousRegClass.contains(Reg: DestReg)) &&
5625 (AArch64::ZPR2RegClass.contains(Reg: SrcReg) ||
5626 AArch64::ZPR2StridedOrContiguousRegClass.contains(Reg: SrcReg))) {
5627 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5628 "Unexpected SVE register.");
5629 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
5630 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORR_ZZZ,
5631 Indices);
5632 return;
5633 }
5634
5635 // Copy a Z register triple by copying the individual sub-registers.
5636 if (AArch64::ZPR3RegClass.contains(Reg: DestReg) &&
5637 AArch64::ZPR3RegClass.contains(Reg: SrcReg)) {
5638 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5639 "Unexpected SVE register.");
5640 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5641 AArch64::zsub2};
5642 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORR_ZZZ,
5643 Indices);
5644 return;
5645 }
5646
5647 // Copy a Z register quad by copying the individual sub-registers.
5648 if ((AArch64::ZPR4RegClass.contains(Reg: DestReg) ||
5649 AArch64::ZPR4StridedOrContiguousRegClass.contains(Reg: DestReg)) &&
5650 (AArch64::ZPR4RegClass.contains(Reg: SrcReg) ||
5651 AArch64::ZPR4StridedOrContiguousRegClass.contains(Reg: SrcReg))) {
5652 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5653 "Unexpected SVE register.");
5654 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5655 AArch64::zsub2, AArch64::zsub3};
5656 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORR_ZZZ,
5657 Indices);
5658 return;
5659 }
5660
5661 // Copy a DDDD register quad by copying the individual sub-registers.
5662 if (AArch64::DDDDRegClass.contains(Reg: DestReg) &&
5663 AArch64::DDDDRegClass.contains(Reg: SrcReg)) {
5664 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5665 AArch64::dsub2, AArch64::dsub3};
5666 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv8i8,
5667 Indices);
5668 return;
5669 }
5670
5671 // Copy a DDD register triple by copying the individual sub-registers.
5672 if (AArch64::DDDRegClass.contains(Reg: DestReg) &&
5673 AArch64::DDDRegClass.contains(Reg: SrcReg)) {
5674 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5675 AArch64::dsub2};
5676 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv8i8,
5677 Indices);
5678 return;
5679 }
5680
5681 // Copy a DD register pair by copying the individual sub-registers.
5682 if (AArch64::DDRegClass.contains(Reg: DestReg) &&
5683 AArch64::DDRegClass.contains(Reg: SrcReg)) {
5684 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
5685 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv8i8,
5686 Indices);
5687 return;
5688 }
5689
5690 // Copy a QQQQ register quad by copying the individual sub-registers.
5691 if (AArch64::QQQQRegClass.contains(Reg: DestReg) &&
5692 AArch64::QQQQRegClass.contains(Reg: SrcReg)) {
5693 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5694 AArch64::qsub2, AArch64::qsub3};
5695 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv16i8,
5696 Indices);
5697 return;
5698 }
5699
5700 // Copy a QQQ register triple by copying the individual sub-registers.
5701 if (AArch64::QQQRegClass.contains(Reg: DestReg) &&
5702 AArch64::QQQRegClass.contains(Reg: SrcReg)) {
5703 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5704 AArch64::qsub2};
5705 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv16i8,
5706 Indices);
5707 return;
5708 }
5709
5710 // Copy a QQ register pair by copying the individual sub-registers.
5711 if (AArch64::QQRegClass.contains(Reg: DestReg) &&
5712 AArch64::QQRegClass.contains(Reg: SrcReg)) {
5713 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
5714 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv16i8,
5715 Indices);
5716 return;
5717 }
5718
5719 if (AArch64::XSeqPairsClassRegClass.contains(Reg: DestReg) &&
5720 AArch64::XSeqPairsClassRegClass.contains(Reg: SrcReg)) {
5721 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
5722 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRXrs,
5723 ZeroReg: AArch64::XZR, Indices);
5724 return;
5725 }
5726
5727 if (AArch64::WSeqPairsClassRegClass.contains(Reg: DestReg) &&
5728 AArch64::WSeqPairsClassRegClass.contains(Reg: SrcReg)) {
5729 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
5730 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRWrs,
5731 ZeroReg: AArch64::WZR, Indices);
5732 return;
5733 }
5734
5735 if (AArch64::FPR128RegClass.contains(Reg: DestReg) &&
5736 AArch64::FPR128RegClass.contains(Reg: SrcReg)) {
5737 // In streaming regions, NEON is illegal but streaming-SVE is available.
5738 // Use SVE for copies if we're in a streaming region and SME is available.
5739 // With +sme-fa64, NEON is legal in streaming mode so we can use it.
5740 if ((Subtarget.isSVEorStreamingSVEAvailable() &&
5741 !Subtarget.isNeonAvailable()) ||
5742 mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5743 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORR_ZZZ))
5744 .addReg(RegNo: AArch64::Z0 + (DestReg - AArch64::Q0), Flags: RegState::Define)
5745 .addReg(RegNo: AArch64::Z0 + (SrcReg - AArch64::Q0))
5746 .addReg(RegNo: AArch64::Z0 + (SrcReg - AArch64::Q0));
5747 } else if (Subtarget.isNeonAvailable()) {
5748 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRv16i8), DestReg)
5749 .addReg(RegNo: SrcReg)
5750 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5751 if (Subtarget.hasZeroCycleRegMoveFPR128())
5752 ++NumZCRegMoveInstrsFPR;
5753 } else {
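      // Neither NEON nor SVE is usable for this copy, so bounce the value
      // through the stack: a 16-byte pre-decrement store followed by a
      // post-increment reload.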
5754 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::STRQpre))
5755 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
5756 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc))
5757 .addReg(RegNo: AArch64::SP)
5758 .addImm(Val: -16);
5759 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::LDRQpost))
5760 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
5761 .addReg(RegNo: DestReg, Flags: RegState::Define)
5762 .addReg(RegNo: AArch64::SP)
5763 .addImm(Val: 16);
5764 }
5765 return;
5766 }
5767
5768 if (AArch64::FPR64RegClass.contains(Reg: DestReg) &&
5769 AArch64::FPR64RegClass.contains(Reg: SrcReg)) {
5770 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5771 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5772 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
5773 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5774 MCRegister DestRegQ = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::dsub,
5775 RC: &AArch64::FPR128RegClass);
5776 MCRegister SrcRegQ = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::dsub,
5777 RC: &AArch64::FPR128RegClass);
5778 // This instruction is reading and writing Q registers. This may upset
5779 // the register scavenger and machine verifier, so we need to indicate
5780 // that we are reading an undefined value from SrcRegQ, but a proper
5781 // value from SrcReg.
5782 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRv16i8), DestReg: DestRegQ)
5783 .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
5784 .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
5785 .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
5786 ++NumZCRegMoveInstrsFPR;
5787 } else {
5788 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVDr), DestReg)
5789 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5790 if (Subtarget.hasZeroCycleRegMoveFPR64())
5791 ++NumZCRegMoveInstrsFPR;
5792 }
5793 return;
5794 }
5795
5796 if (AArch64::FPR32RegClass.contains(Reg: DestReg) &&
5797 AArch64::FPR32RegClass.contains(Reg: SrcReg)) {
5798 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5799 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5800 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
5801 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5802 MCRegister DestRegQ = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::ssub,
5803 RC: &AArch64::FPR128RegClass);
5804 MCRegister SrcRegQ = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::ssub,
5805 RC: &AArch64::FPR128RegClass);
5806 // This instruction is reading and writing Q registers. This may upset
5807 // the register scavenger and machine verifier, so we need to indicate
5808 // that we are reading an undefined value from SrcRegQ, but a proper
5809 // value from SrcReg.
5810 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRv16i8), DestReg: DestRegQ)
5811 .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
5812 .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
5813 .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
5814 ++NumZCRegMoveInstrsFPR;
5815 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5816 !Subtarget.hasZeroCycleRegMoveFPR32()) {
5817 MCRegister DestRegD = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::ssub,
5818 RC: &AArch64::FPR64RegClass);
5819 MCRegister SrcRegD = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::ssub,
5820 RC: &AArch64::FPR64RegClass);
5821 // This instruction is reading and writing D registers. This may upset
5822 // the register scavenger and machine verifier, so we need to indicate
5823 // that we are reading an undefined value from SrcRegD, but a proper
5824 // value from SrcReg.
5825 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVDr), DestReg: DestRegD)
5826 .addReg(RegNo: SrcRegD, Flags: RegState::Undef)
5827 .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
5828 ++NumZCRegMoveInstrsFPR;
5829 } else {
5830 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVSr), DestReg)
5831 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5832 if (Subtarget.hasZeroCycleRegMoveFPR32())
5833 ++NumZCRegMoveInstrsFPR;
5834 }
5835 return;
5836 }
5837
5838 if (AArch64::FPR16RegClass.contains(Reg: DestReg) &&
5839 AArch64::FPR16RegClass.contains(Reg: SrcReg)) {
5840 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5841 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5842 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
5843 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5844 MCRegister DestRegQ = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::hsub,
5845 RC: &AArch64::FPR128RegClass);
5846 MCRegister SrcRegQ = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::hsub,
5847 RC: &AArch64::FPR128RegClass);
5848 // This instruction is reading and writing Q registers. This may upset
5849 // the register scavenger and machine verifier, so we need to indicate
5850 // that we are reading an undefined value from SrcRegQ, but a proper
5851 // value from SrcReg.
5852 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRv16i8), DestReg: DestRegQ)
5853 .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
5854 .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
5855 .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
5856 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5857 !Subtarget.hasZeroCycleRegMoveFPR32()) {
5858 MCRegister DestRegD = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::hsub,
5859 RC: &AArch64::FPR64RegClass);
5860 MCRegister SrcRegD = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::hsub,
5861 RC: &AArch64::FPR64RegClass);
5862 // This instruction is reading and writing D registers. This may upset
5863 // the register scavenger and machine verifier, so we need to indicate
5864 // that we are reading an undefined value from SrcRegD, but a proper
5865 // value from SrcReg.
5866 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVDr), DestReg: DestRegD)
5867 .addReg(RegNo: SrcRegD, Flags: RegState::Undef)
5868 .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
5869 } else {
5870 DestReg = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::hsub,
5871 RC: &AArch64::FPR32RegClass);
5872 SrcReg = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::hsub,
5873 RC: &AArch64::FPR32RegClass);
5874 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVSr), DestReg)
5875 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5876 }
5877 return;
5878 }
5879
5880 if (AArch64::FPR8RegClass.contains(Reg: DestReg) &&
5881 AArch64::FPR8RegClass.contains(Reg: SrcReg)) {
5882 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5883 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5884 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
5885 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5886 MCRegister DestRegQ = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::bsub,
5887 RC: &AArch64::FPR128RegClass);
5888 MCRegister SrcRegQ = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::bsub,
5889 RC: &AArch64::FPR128RegClass);
5890 // This instruction is reading and writing Q registers. This may upset
5891 // the register scavenger and machine verifier, so we need to indicate
5892 // that we are reading an undefined value from SrcRegQ, but a proper
5893 // value from SrcReg.
5894 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRv16i8), DestReg: DestRegQ)
5895 .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
5896 .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
5897 .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
5898 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5899 !Subtarget.hasZeroCycleRegMoveFPR32()) {
5900 MCRegister DestRegD = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::bsub,
5901 RC: &AArch64::FPR64RegClass);
5902 MCRegister SrcRegD = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::bsub,
5903 RC: &AArch64::FPR64RegClass);
5904 // This instruction is reading and writing D registers. This may upset
5905 // the register scavenger and machine verifier, so we need to indicate
5906 // that we are reading an undefined value from SrcRegD, but a proper
5907 // value from SrcReg.
5908 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVDr), DestReg: DestRegD)
5909 .addReg(RegNo: SrcRegD, Flags: RegState::Undef)
5910 .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
5911 } else {
5912 DestReg = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::bsub,
5913 RC: &AArch64::FPR32RegClass);
5914 SrcReg = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::bsub,
5915 RC: &AArch64::FPR32RegClass);
5916 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVSr), DestReg)
5917 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5918 }
5919 return;
5920 }
5921
5922 // Copies between GPR64 and FPR64.
5923 if (AArch64::FPR64RegClass.contains(Reg: DestReg) &&
5924 AArch64::GPR64RegClass.contains(Reg: SrcReg)) {
5925 if (AArch64::XZR == SrcReg) {
5926 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVD0), DestReg);
5927 } else {
5928 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVXDr), DestReg)
5929 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5930 }
5931 return;
5932 }
5933 if (AArch64::GPR64RegClass.contains(Reg: DestReg) &&
5934 AArch64::FPR64RegClass.contains(Reg: SrcReg)) {
5935 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVDXr), DestReg)
5936 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5937 return;
5938 }
5939 // Copies between GPR32 and FPR32.
5940 if (AArch64::FPR32RegClass.contains(Reg: DestReg) &&
5941 AArch64::GPR32RegClass.contains(Reg: SrcReg)) {
5942 if (AArch64::WZR == SrcReg) {
5943 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVS0), DestReg);
5944 } else {
5945 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVWSr), DestReg)
5946 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5947 }
5948 return;
5949 }
5950 if (AArch64::GPR32RegClass.contains(Reg: DestReg) &&
5951 AArch64::FPR32RegClass.contains(Reg: SrcReg)) {
5952 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVSWr), DestReg)
5953 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5954 return;
5955 }
5956
5957 if (DestReg == AArch64::NZCV) {
5958 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
5959 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::MSR))
5960 .addImm(Val: AArch64SysReg::NZCV)
5961 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc))
5962 .addReg(RegNo: AArch64::NZCV, Flags: RegState::Implicit | RegState::Define);
5963 return;
5964 }
5965
5966 if (SrcReg == AArch64::NZCV) {
5967 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
5968 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::MRS), DestReg)
5969 .addImm(Val: AArch64SysReg::NZCV)
5970 .addReg(RegNo: AArch64::NZCV, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
5971 return;
5972 }
5973
5974#ifndef NDEBUG
5975 errs() << RI.getRegAsmName(DestReg) << " = COPY " << RI.getRegAsmName(SrcReg)
5976 << "\n";
5977#endif
5978 llvm_unreachable("unimplemented reg-to-reg copy");
5979}
5980
5981static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
5982 MachineBasicBlock &MBB,
5983 MachineBasicBlock::iterator InsertBefore,
5984 const MCInstrDesc &MCID,
5985 Register SrcReg, bool IsKill,
5986 unsigned SubIdx0, unsigned SubIdx1, int FI,
5987 MachineMemOperand *MMO) {
5988 Register SrcReg0 = SrcReg;
5989 Register SrcReg1 = SrcReg;
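  // For physical registers, resolve the two sub-registers now and clear the
  // sub-register indices; virtual registers keep the indices on the operands.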
5990 if (SrcReg.isPhysical()) {
5991 SrcReg0 = TRI.getSubReg(Reg: SrcReg, Idx: SubIdx0);
5992 SubIdx0 = 0;
5993 SrcReg1 = TRI.getSubReg(Reg: SrcReg, Idx: SubIdx1);
5994 SubIdx1 = 0;
5995 }
5996 BuildMI(BB&: MBB, I: InsertBefore, MIMD: DebugLoc(), MCID)
5997 .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: IsKill), SubReg: SubIdx0)
5998 .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: IsKill), SubReg: SubIdx1)
5999 .addFrameIndex(Idx: FI)
6000 .addImm(Val: 0)
6001 .addMemOperand(MMO);
6002}
6003
6004void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
6005 MachineBasicBlock::iterator MBBI,
6006 Register SrcReg, bool isKill, int FI,
6007 const TargetRegisterClass *RC,
6008 Register VReg,
6009 MachineInstr::MIFlag Flags) const {
6010 MachineFunction &MF = *MBB.getParent();
6011 MachineFrameInfo &MFI = MF.getFrameInfo();
6012
6013 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
6014 MachineMemOperand *MMO =
6015 MF.getMachineMemOperand(PtrInfo, F: MachineMemOperand::MOStore,
6016 Size: MFI.getObjectSize(ObjectIdx: FI), BaseAlignment: MFI.getObjectAlign(ObjectIdx: FI));
6017 unsigned Opc = 0;
6018 bool Offset = true;
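  // Offset stays true for opcodes that take an immediate offset operand; the
  // ST1 multi-vector forms selected below do not, so they clear it.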
6019 MCRegister PNRReg = MCRegister::NoRegister;
6020 unsigned StackID = TargetStackID::Default;
6021 switch (RI.getSpillSize(RC: *RC)) {
6022 case 1:
6023 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6024 Opc = AArch64::STRBui;
6025 break;
6026 case 2: {
6027 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6028 Opc = AArch64::STRHui;
6029 else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
6030 AArch64::PPRRegClass.hasSubClassEq(RC)) {
6031 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6032 "Unexpected register store without SVE store instructions");
6033 Opc = AArch64::STR_PXI;
6034 StackID = TargetStackID::ScalablePredicateVector;
6035 }
6036 break;
6037 }
6038 case 4:
6039 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6040 Opc = AArch64::STRWui;
6041 if (SrcReg.isVirtual())
6042 MF.getRegInfo().constrainRegClass(Reg: SrcReg, RC: &AArch64::GPR32RegClass);
6043 else
6044 assert(SrcReg != AArch64::WSP);
6045 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6046 Opc = AArch64::STRSui;
6047 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6048 Opc = AArch64::STR_PPXI;
6049 StackID = TargetStackID::ScalablePredicateVector;
6050 }
6051 break;
6052 case 8:
6053 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6054 Opc = AArch64::STRXui;
6055 if (SrcReg.isVirtual())
6056 MF.getRegInfo().constrainRegClass(Reg: SrcReg, RC: &AArch64::GPR64RegClass);
6057 else
6058 assert(SrcReg != AArch64::SP);
6059 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6060 Opc = AArch64::STRDui;
6061 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
6062 storeRegPairToStackSlot(TRI: getRegisterInfo(), MBB, InsertBefore: MBBI,
6063 MCID: get(Opcode: AArch64::STPWi), SrcReg, IsKill: isKill,
6064 SubIdx0: AArch64::sube32, SubIdx1: AArch64::subo32, FI, MMO);
6065 return;
6066 }
6067 break;
6068 case 16:
6069 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6070 Opc = AArch64::STRQui;
6071 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6072 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6073 Opc = AArch64::ST1Twov1d;
6074 Offset = false;
6075 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
6076 storeRegPairToStackSlot(TRI: getRegisterInfo(), MBB, InsertBefore: MBBI,
6077 MCID: get(Opcode: AArch64::STPXi), SrcReg, IsKill: isKill,
6078 SubIdx0: AArch64::sube64, SubIdx1: AArch64::subo64, FI, MMO);
6079 return;
6080 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6081 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6082 "Unexpected register store without SVE store instructions");
6083 Opc = AArch64::STR_ZXI;
6084 StackID = TargetStackID::ScalableVector;
6085 }
6086 break;
6087 case 24:
6088 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6089 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6090 Opc = AArch64::ST1Threev1d;
6091 Offset = false;
6092 }
6093 break;
6094 case 32:
6095 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6096 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6097 Opc = AArch64::ST1Fourv1d;
6098 Offset = false;
6099 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6100 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6101 Opc = AArch64::ST1Twov2d;
6102 Offset = false;
6103 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6104 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6105 "Unexpected register store without SVE store instructions");
6106 Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
6107 StackID = TargetStackID::ScalableVector;
6108 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6109 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6110 "Unexpected register store without SVE store instructions");
6111 Opc = AArch64::STR_ZZXI;
6112 StackID = TargetStackID::ScalableVector;
6113 }
6114 break;
6115 case 48:
6116 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6117 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6118 Opc = AArch64::ST1Threev2d;
6119 Offset = false;
6120 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6121 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6122 "Unexpected register store without SVE store instructions");
6123 Opc = AArch64::STR_ZZZXI;
6124 StackID = TargetStackID::ScalableVector;
6125 }
6126 break;
6127 case 64:
6128 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6129 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6130 Opc = AArch64::ST1Fourv2d;
6131 Offset = false;
6132 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6133 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6134 "Unexpected register store without SVE store instructions");
6135 Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
6136 StackID = TargetStackID::ScalableVector;
6137 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6138 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6139 "Unexpected register store without SVE store instructions");
6140 Opc = AArch64::STR_ZZZZXI;
6141 StackID = TargetStackID::ScalableVector;
6142 }
6143 break;
6144 }
6145 assert(Opc && "Unknown register class");
6146 MFI.setStackID(ObjectIdx: FI, ID: StackID);
6147
6148 const MachineInstrBuilder MI = BuildMI(BB&: MBB, I: MBBI, MIMD: DebugLoc(), MCID: get(Opcode: Opc))
6149 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: isKill))
6150 .addFrameIndex(Idx: FI);
6151
6152 if (Offset)
6153 MI.addImm(Val: 0);
6154 if (PNRReg.isValid())
6155 MI.addDef(RegNo: PNRReg, Flags: RegState::Implicit);
6156 MI.addMemOperand(MMO);
6157}
6158
6159static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
6160 MachineBasicBlock &MBB,
6161 MachineBasicBlock::iterator InsertBefore,
6162 const MCInstrDesc &MCID,
6163 Register DestReg, unsigned SubIdx0,
6164 unsigned SubIdx1, int FI,
6165 MachineMemOperand *MMO) {
6166 Register DestReg0 = DestReg;
6167 Register DestReg1 = DestReg;
6168 bool IsUndef = true;
6169 if (DestReg.isPhysical()) {
6170 DestReg0 = TRI.getSubReg(Reg: DestReg, Idx: SubIdx0);
6171 SubIdx0 = 0;
6172 DestReg1 = TRI.getSubReg(Reg: DestReg, Idx: SubIdx1);
6173 SubIdx1 = 0;
6174 IsUndef = false;
6175 }
6176 BuildMI(BB&: MBB, I: InsertBefore, MIMD: DebugLoc(), MCID)
6177 .addReg(RegNo: DestReg0, Flags: RegState::Define | getUndefRegState(B: IsUndef), SubReg: SubIdx0)
6178 .addReg(RegNo: DestReg1, Flags: RegState::Define | getUndefRegState(B: IsUndef), SubReg: SubIdx1)
6179 .addFrameIndex(Idx: FI)
6180 .addImm(Val: 0)
6181 .addMemOperand(MMO);
6182}
6183
6184void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
6185 MachineBasicBlock::iterator MBBI,
6186 Register DestReg, int FI,
6187 const TargetRegisterClass *RC,
6188 Register VReg, unsigned SubReg,
6189 MachineInstr::MIFlag Flags) const {
6190 MachineFunction &MF = *MBB.getParent();
6191 MachineFrameInfo &MFI = MF.getFrameInfo();
6192 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
6193 MachineMemOperand *MMO =
6194 MF.getMachineMemOperand(PtrInfo, F: MachineMemOperand::MOLoad,
6195 Size: MFI.getObjectSize(ObjectIdx: FI), BaseAlignment: MFI.getObjectAlign(ObjectIdx: FI));
6196
6197 unsigned Opc = 0;
6198 bool Offset = true;
6199 unsigned StackID = TargetStackID::Default;
6200 Register PNRReg = MCRegister::NoRegister;
6201 switch (TRI.getSpillSize(RC: *RC)) {
6202 case 1:
6203 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6204 Opc = AArch64::LDRBui;
6205 break;
6206 case 2: {
6207 bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
6208 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6209 Opc = AArch64::LDRHui;
6210 else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
6211 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6212 "Unexpected register load without SVE load instructions");
6213 if (IsPNR)
6214 PNRReg = DestReg;
6215 Opc = AArch64::LDR_PXI;
6216 StackID = TargetStackID::ScalablePredicateVector;
6217 }
6218 break;
6219 }
6220 case 4:
6221 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6222 Opc = AArch64::LDRWui;
6223 if (DestReg.isVirtual())
6224 MF.getRegInfo().constrainRegClass(Reg: DestReg, RC: &AArch64::GPR32RegClass);
6225 else
6226 assert(DestReg != AArch64::WSP);
6227 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6228 Opc = AArch64::LDRSui;
6229 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6230 Opc = AArch64::LDR_PPXI;
6231 StackID = TargetStackID::ScalablePredicateVector;
6232 }
6233 break;
6234 case 8:
6235 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6236 Opc = AArch64::LDRXui;
6237 if (DestReg.isVirtual())
6238 MF.getRegInfo().constrainRegClass(Reg: DestReg, RC: &AArch64::GPR64RegClass);
6239 else
6240 assert(DestReg != AArch64::SP);
6241 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6242 Opc = AArch64::LDRDui;
6243 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
6244 loadRegPairFromStackSlot(TRI: getRegisterInfo(), MBB, InsertBefore: MBBI,
6245 MCID: get(Opcode: AArch64::LDPWi), DestReg, SubIdx0: AArch64::sube32,
6246 SubIdx1: AArch64::subo32, FI, MMO);
6247 return;
6248 }
6249 break;
6250 case 16:
6251 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6252 Opc = AArch64::LDRQui;
6253 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6254 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6255 Opc = AArch64::LD1Twov1d;
6256 Offset = false;
6257 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
6258 loadRegPairFromStackSlot(TRI: getRegisterInfo(), MBB, InsertBefore: MBBI,
6259 MCID: get(Opcode: AArch64::LDPXi), DestReg, SubIdx0: AArch64::sube64,
6260 SubIdx1: AArch64::subo64, FI, MMO);
6261 return;
6262 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6263 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6264 "Unexpected register load without SVE load instructions");
6265 Opc = AArch64::LDR_ZXI;
6266 StackID = TargetStackID::ScalableVector;
6267 }
6268 break;
6269 case 24:
6270 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6271 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6272 Opc = AArch64::LD1Threev1d;
6273 Offset = false;
6274 }
6275 break;
6276 case 32:
6277 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6278 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6279 Opc = AArch64::LD1Fourv1d;
6280 Offset = false;
6281 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6282 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6283 Opc = AArch64::LD1Twov2d;
6284 Offset = false;
6285 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6286 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6287 "Unexpected register load without SVE load instructions");
6288 Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
6289 StackID = TargetStackID::ScalableVector;
6290 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6291 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6292 "Unexpected register load without SVE load instructions");
6293 Opc = AArch64::LDR_ZZXI;
6294 StackID = TargetStackID::ScalableVector;
6295 }
6296 break;
6297 case 48:
6298 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6299 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6300 Opc = AArch64::LD1Threev2d;
6301 Offset = false;
6302 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6303 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6304 "Unexpected register load without SVE load instructions");
6305 Opc = AArch64::LDR_ZZZXI;
6306 StackID = TargetStackID::ScalableVector;
6307 }
6308 break;
6309 case 64:
6310 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6311 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6312 Opc = AArch64::LD1Fourv2d;
6313 Offset = false;
6314 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6315 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6316 "Unexpected register load without SVE load instructions");
6317 Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
6318 StackID = TargetStackID::ScalableVector;
6319 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6320 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6321 "Unexpected register load without SVE load instructions");
6322 Opc = AArch64::LDR_ZZZZXI;
6323 StackID = TargetStackID::ScalableVector;
6324 }
6325 break;
6326 }
6327
6328 assert(Opc && "Unknown register class");
6329 MFI.setStackID(ObjectIdx: FI, ID: StackID);
6330
6331 const MachineInstrBuilder MI = BuildMI(BB&: MBB, I: MBBI, MIMD: DebugLoc(), MCID: get(Opcode: Opc))
6332 .addReg(RegNo: DestReg, Flags: getDefRegState(B: true))
6333 .addFrameIndex(Idx: FI);
6334 if (Offset)
6335 MI.addImm(Val: 0);
6336 if (PNRReg.isValid() && !PNRReg.isVirtual())
6337 MI.addDef(RegNo: PNRReg, Flags: RegState::Implicit);
6338 MI.addMemOperand(MMO);
6339}
6340
6341bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
6342 const MachineInstr &UseMI,
6343 const TargetRegisterInfo *TRI) {
6344 return any_of(Range: instructionsWithoutDebug(It: std::next(x: DefMI.getIterator()),
6345 End: UseMI.getIterator()),
6346 P: [TRI](const MachineInstr &I) {
6347 return I.modifiesRegister(Reg: AArch64::NZCV, TRI) ||
6348 I.readsRegister(Reg: AArch64::NZCV, TRI);
6349 });
6350}
6351
6352void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6353 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
6354  // The smallest scalable elements supported by scaled SVE addressing modes
6355  // are predicates, which are 2 scalable bytes in size, so the scalable byte
6356  // offset must always be a multiple of 2.
6357 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6358
6359  // VGSized offsets are divided by '2', because the VG register is the
6360  // number of 64bit granules as opposed to 128bit vector chunks,
6361 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
6362 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
6363 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
6364 ByteSized = Offset.getFixed();
6365 VGSized = Offset.getScalable() / 2;
6366}
6367
6368/// Returns the parts into which this frame offset can be decomposed for the
6369/// purpose of describing or materializing a frame offset.
6370/// For non-scalable offsets this is simply its byte size.
6371void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6372 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
6373 int64_t &NumDataVectors) {
6374  // The smallest scalable elements supported by scaled SVE addressing modes
6375  // are predicates, which are 2 scalable bytes in size, so the scalable byte
6376  // offset must always be a multiple of 2.
6377 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6378
6379 NumBytes = Offset.getFixed();
6380 NumDataVectors = 0;
6381 NumPredicateVectors = Offset.getScalable() / 2;
6382  // These values are used to adjust the frame offset. If materializing the
6383  // scalable part would take more than two ADDPL instructions, part of the
6384  // offset is folded into NumDataVectors so that ADDVL covers it, reducing
6385  // the number of ADDPL instructions needed.
6386 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
6387 NumPredicateVectors > 62) {
6388 NumDataVectors = NumPredicateVectors / 8;
6389 NumPredicateVectors -= NumDataVectors * 8;
6390 }
6391}
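
// Worked example (illustrative): a StackOffset of 16 fixed bytes plus 32
// scalable bytes decomposes into NumBytes = 16 and NumPredicateVectors = 16;
// because 16 % 8 == 0 the predicate count is folded into NumDataVectors = 2
// with NumPredicateVectors = 0, so the offset can be materialised with a
// single ADDVL instead of a chain of ADDPLs.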
6392
6393// Convenience function to create a DWARF expression for: Constant `Operation`.
6394// This helper emits compact sequences for common cases. For example, for
6395// `-15 DW_OP_plus`, this helper would create DW_OP_lit15 DW_OP_minus.
6396static void appendConstantExpr(SmallVectorImpl<char> &Expr, int64_t Constant,
6397 dwarf::LocationAtom Operation) {
6398 if (Operation == dwarf::DW_OP_plus && Constant < 0 && -Constant <= 31) {
6399 // -Constant (1 to 31)
6400 Expr.push_back(Elt: dwarf::DW_OP_lit0 - Constant);
6401 Operation = dwarf::DW_OP_minus;
6402 } else if (Constant >= 0 && Constant <= 31) {
6403 // Literal value 0 to 31
6404 Expr.push_back(Elt: dwarf::DW_OP_lit0 + Constant);
6405 } else {
6406 // Signed constant
6407 Expr.push_back(Elt: dwarf::DW_OP_consts);
6408 appendLEB128<LEB128Sign::Signed>(Buffer&: Expr, Value: Constant);
6409 }
6410  Expr.push_back(Elt: Operation);
6411}
6412
6413// Convenience function to create a DWARF expression for a register.
6414static void appendReadRegExpr(SmallVectorImpl<char> &Expr, unsigned RegNum) {
6415 Expr.push_back(Elt: (char)dwarf::DW_OP_bregx);
6416 appendLEB128<LEB128Sign::Unsigned>(Buffer&: Expr, Value: RegNum);
6417 Expr.push_back(Elt: 0);
6418}
6419
6420// Convenience function to create a DWARF expression for loading a register from
6421// a CFA offset.
6422static void appendLoadRegExpr(SmallVectorImpl<char> &Expr,
6423 int64_t OffsetFromDefCFA) {
6424 // This assumes the top of the DWARF stack contains the CFA.
6425 Expr.push_back(Elt: dwarf::DW_OP_dup);
6426 // Add the offset to the register.
6427 appendConstantExpr(Expr, Constant: OffsetFromDefCFA, Operation: dwarf::DW_OP_plus);
6428  // Dereference the address (loads a 64-bit value).
6429 Expr.push_back(Elt: dwarf::DW_OP_deref);
6430}
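
// Worked example (illustrative): appendLoadRegExpr(Expr, -8) appends
//   DW_OP_dup, DW_OP_lit8, DW_OP_minus, DW_OP_deref
// i.e. "load the 64-bit value stored at CFA - 8", leaving the CFA itself on
// the DWARF stack underneath the loaded value.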
6431
6432// Convenience function to create a comment for
6433// (+/-) NumBytes (* RegScale)?
6434static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment,
6435 StringRef RegScale = {}) {
6436 if (NumBytes) {
6437 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(x: NumBytes);
6438 if (!RegScale.empty())
6439 Comment << ' ' << RegScale;
6440 }
6441}
6442
6443// Creates an MCCFIInstruction:
6444// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
6445static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
6446 unsigned Reg,
6447 const StackOffset &Offset) {
6448 int64_t NumBytes, NumVGScaledBytes;
6449 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, ByteSized&: NumBytes,
6450 VGSized&: NumVGScaledBytes);
6451 std::string CommentBuffer;
6452 llvm::raw_string_ostream Comment(CommentBuffer);
6453
6454 if (Reg == AArch64::SP)
6455 Comment << "sp";
6456 else if (Reg == AArch64::FP)
6457 Comment << "fp";
6458 else
6459 Comment << printReg(Reg, TRI: &TRI);
6460
6461 // Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes)
6462 SmallString<64> Expr;
6463 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, isEH: true);
6464 assert(DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
6465 // Reg + NumBytes
6466 Expr.push_back(Elt: dwarf::DW_OP_breg0 + DwarfReg);
6467 appendLEB128<LEB128Sign::Signed>(Buffer&: Expr, Value: NumBytes);
6468 appendOffsetComment(NumBytes, Comment);
6469 if (NumVGScaledBytes) {
6470 // + VG * NumVGScaledBytes
6471 appendOffsetComment(NumBytes: NumVGScaledBytes, Comment, RegScale: "* VG");
6472 appendReadRegExpr(Expr, RegNum: TRI.getDwarfRegNum(Reg: AArch64::VG, isEH: true));
6473 appendConstantExpr(Expr, Constant: NumVGScaledBytes, Operation: dwarf::DW_OP_mul);
6474 Expr.push_back(Elt: dwarf::DW_OP_plus);
6475 }
6476
6477 // Wrap this into DW_CFA_def_cfa.
6478 SmallString<64> DefCfaExpr;
6479 DefCfaExpr.push_back(Elt: dwarf::DW_CFA_def_cfa_expression);
6480 appendLEB128<LEB128Sign::Unsigned>(Buffer&: DefCfaExpr, Value: Expr.size());
6481 DefCfaExpr.append(RHS: Expr.str());
6482 return MCCFIInstruction::createEscape(L: nullptr, Vals: DefCfaExpr.str(), Loc: SMLoc(),
6483 Comment: Comment.str());
6484}
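
// Worked example (illustrative): for Reg == SP and an Offset of 16 fixed plus
// 16 scalable bytes, NumBytes == 16 and NumVGScaledBytes == 8, so (with the
// usual AArch64 DWARF numbering, SP == 31 and VG == 46) the escaped expression
// is roughly:
//   DW_OP_breg31 +16, DW_OP_bregx 46 0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
// and the attached comment reads "sp + 16 + 8 * VG".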
6485
6486MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
6487 unsigned FrameReg, unsigned Reg,
6488 const StackOffset &Offset,
6489 bool LastAdjustmentWasScalable) {
6490 if (Offset.getScalable())
6491 return createDefCFAExpression(TRI, Reg, Offset);
6492
6493 if (FrameReg == Reg && !LastAdjustmentWasScalable)
6494 return MCCFIInstruction::cfiDefCfaOffset(L: nullptr, Offset: int(Offset.getFixed()));
6495
6496 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, isEH: true);
6497 return MCCFIInstruction::cfiDefCfa(L: nullptr, Register: DwarfReg, Offset: (int)Offset.getFixed());
6498}
6499
6500MCCFIInstruction
6501llvm::createCFAOffset(const TargetRegisterInfo &TRI, unsigned Reg,
6502 const StackOffset &OffsetFromDefCFA,
6503 std::optional<int64_t> IncomingVGOffsetFromDefCFA) {
6504 int64_t NumBytes, NumVGScaledBytes;
6505 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6506 Offset: OffsetFromDefCFA, ByteSized&: NumBytes, VGSized&: NumVGScaledBytes);
6507
6508 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, isEH: true);
6509
6510 // Non-scalable offsets can use DW_CFA_offset directly.
6511 if (!NumVGScaledBytes)
6512 return MCCFIInstruction::createOffset(L: nullptr, Register: DwarfReg, Offset: NumBytes);
6513
6514 std::string CommentBuffer;
6515 llvm::raw_string_ostream Comment(CommentBuffer);
6516 Comment << printReg(Reg, TRI: &TRI) << " @ cfa";
6517
6518 // Build up expression (CFA + VG * NumVGScaledBytes + NumBytes)
6519 assert(NumVGScaledBytes && "Expected scalable offset");
6520 SmallString<64> OffsetExpr;
6521 // + VG * NumVGScaledBytes
6522 StringRef VGRegScale;
6523 if (IncomingVGOffsetFromDefCFA) {
6524 appendLoadRegExpr(Expr&: OffsetExpr, OffsetFromDefCFA: *IncomingVGOffsetFromDefCFA);
6525 VGRegScale = "* IncomingVG";
6526 } else {
6527 appendReadRegExpr(Expr&: OffsetExpr, RegNum: TRI.getDwarfRegNum(Reg: AArch64::VG, isEH: true));
6528 VGRegScale = "* VG";
6529 }
6530 appendConstantExpr(Expr&: OffsetExpr, Constant: NumVGScaledBytes, Operation: dwarf::DW_OP_mul);
6531 appendOffsetComment(NumBytes: NumVGScaledBytes, Comment, RegScale: VGRegScale);
6532 OffsetExpr.push_back(Elt: dwarf::DW_OP_plus);
6533 if (NumBytes) {
6534 // + NumBytes
6535 appendOffsetComment(NumBytes, Comment);
6536 appendConstantExpr(Expr&: OffsetExpr, Constant: NumBytes, Operation: dwarf::DW_OP_plus);
6537 }
6538
6539 // Wrap this into DW_CFA_expression
6540 SmallString<64> CfaExpr;
6541 CfaExpr.push_back(Elt: dwarf::DW_CFA_expression);
6542 appendLEB128<LEB128Sign::Unsigned>(Buffer&: CfaExpr, Value: DwarfReg);
6543 appendLEB128<LEB128Sign::Unsigned>(Buffer&: CfaExpr, Value: OffsetExpr.size());
6544 CfaExpr.append(RHS: OffsetExpr.str());
6545
6546 return MCCFIInstruction::createEscape(L: nullptr, Vals: CfaExpr.str(), Loc: SMLoc(),
6547 Comment: Comment.str());
6548}
6549
6550// Helper function to emit a frame offset adjustment from a given
6551// pointer (SrcReg), stored into DestReg. This function is explicit
6552// in that it requires the opcode.
6553static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
6554 MachineBasicBlock::iterator MBBI,
6555 const DebugLoc &DL, unsigned DestReg,
6556 unsigned SrcReg, int64_t Offset, unsigned Opc,
6557 const TargetInstrInfo *TII,
6558 MachineInstr::MIFlag Flag, bool NeedsWinCFI,
6559 bool *HasWinCFI, bool EmitCFAOffset,
6560 StackOffset CFAOffset, unsigned FrameReg) {
6561 int Sign = 1;
6562 unsigned MaxEncoding, ShiftSize;
6563 switch (Opc) {
6564 case AArch64::ADDXri:
6565 case AArch64::ADDSXri:
6566 case AArch64::SUBXri:
6567 case AArch64::SUBSXri:
6568 MaxEncoding = 0xfff;
6569 ShiftSize = 12;
6570 break;
6571 case AArch64::ADDVL_XXI:
6572 case AArch64::ADDPL_XXI:
6573 case AArch64::ADDSVL_XXI:
6574 case AArch64::ADDSPL_XXI:
6575 MaxEncoding = 31;
6576 ShiftSize = 0;
6577 if (Offset < 0) {
6578 MaxEncoding = 32;
6579 Sign = -1;
6580 Offset = -Offset;
6581 }
6582 break;
6583 default:
6584 llvm_unreachable("Unsupported opcode");
6585 }
6586
6587 // `Offset` can be in bytes or in "scalable bytes".
6588 int VScale = 1;
6589 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
6590 VScale = 16;
6591 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
6592 VScale = 2;
6593
6594 // FIXME: If the offset won't fit in 24-bits, compute the offset into a
6595 // scratch register. If DestReg is a virtual register, use it as the
6596 // scratch register; otherwise, create a new virtual register (to be
6597 // replaced by the scavenger at the end of PEI). That case can be optimized
6598 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
6599 // register can be loaded with offset%8 and the add/sub can use an extending
6600 // instruction with LSL#3.
6601 // Currently the function handles any offsets but generates a poor sequence
6602 // of code.
6603 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
6604
6605 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
6606 Register TmpReg = DestReg;
6607 if (TmpReg == AArch64::XZR)
6608 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
6609 RegClass: &AArch64::GPR64RegClass);
6610 do {
6611 uint64_t ThisVal = std::min<uint64_t>(a: Offset, b: MaxEncodableValue);
6612 unsigned LocalShiftSize = 0;
6613 if (ThisVal > MaxEncoding) {
6614 ThisVal = ThisVal >> ShiftSize;
6615 LocalShiftSize = ShiftSize;
6616 }
6617 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
6618 "Encoding cannot handle value that big");
6619
6620 Offset -= ThisVal << LocalShiftSize;
6621 if (Offset == 0)
6622 TmpReg = DestReg;
6623 auto MBI = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: TmpReg)
6624 .addReg(RegNo: SrcReg)
6625 .addImm(Val: Sign * (int)ThisVal);
6626 if (ShiftSize)
6627 MBI = MBI.addImm(
6628 Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: LocalShiftSize));
6629 MBI = MBI.setMIFlag(Flag);
6630
6631 auto Change =
6632 VScale == 1
6633 ? StackOffset::getFixed(Fixed: ThisVal << LocalShiftSize)
6634 : StackOffset::getScalable(Scalable: VScale * (ThisVal << LocalShiftSize));
6635 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
6636 CFAOffset += Change;
6637 else
6638 CFAOffset -= Change;
6639 if (EmitCFAOffset && DestReg == TmpReg) {
6640 MachineFunction &MF = *MBB.getParent();
6641 const TargetSubtargetInfo &STI = MF.getSubtarget();
6642 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
6643
6644 unsigned CFIIndex = MF.addFrameInst(
6645 Inst: createDefCFA(TRI, FrameReg, Reg: DestReg, Offset: CFAOffset, LastAdjustmentWasScalable: VScale != 1));
6646 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::CFI_INSTRUCTION))
6647 .addCFIIndex(CFIIndex)
6648 .setMIFlags(Flag);
6649 }
6650
6651 if (NeedsWinCFI) {
6652 int Imm = (int)(ThisVal << LocalShiftSize);
6653 if (VScale != 1 && DestReg == AArch64::SP) {
6654 if (HasWinCFI)
6655 *HasWinCFI = true;
6656 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_AllocZ))
6657 .addImm(Val: ThisVal)
6658 .setMIFlag(Flag);
6659 } else if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
6660 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
6661 assert(VScale == 1 && "Expected non-scalable operation");
6662 if (HasWinCFI)
6663 *HasWinCFI = true;
6664 if (Imm == 0)
6665 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_SetFP)).setMIFlag(Flag);
6666 else
6667 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_AddFP))
6668 .addImm(Val: Imm)
6669 .setMIFlag(Flag);
6670 assert(Offset == 0 && "Expected remaining offset to be zero to "
6671 "emit a single SEH directive");
6672 } else if (DestReg == AArch64::SP) {
6673 assert(VScale == 1 && "Expected non-scalable operation");
6674 if (HasWinCFI)
6675 *HasWinCFI = true;
6676 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
6677 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_StackAlloc))
6678 .addImm(Val: Imm)
6679 .setMIFlag(Flag);
6680 }
6681 }
6682
6683 SrcReg = TmpReg;
6684 } while (Offset);
6685}
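
// Worked example (illustrative): with Opc == ADDXri, SrcReg == DestReg == SP
// and Offset == 0x3456, the value does not fit a single 12-bit immediate, so
// the loop above emits two instructions:
//   add sp, sp, #3, lsl #12   // adds 0x3000
//   add sp, sp, #0x456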
6686
6687void llvm::emitFrameOffset(MachineBasicBlock &MBB,
6688 MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
6689 unsigned DestReg, unsigned SrcReg,
6690 StackOffset Offset, const TargetInstrInfo *TII,
6691 MachineInstr::MIFlag Flag, bool SetNZCV,
6692 bool NeedsWinCFI, bool *HasWinCFI,
6693 bool EmitCFAOffset, StackOffset CFAOffset,
6694 unsigned FrameReg) {
6695  // If a function is marked as arm_locally_streaming, then the runtime value
6696  // of vscale in the prologue/epilogue is different from the runtime value of
6697  // vscale in the function's body. To avoid having to consider multiple
6698  // vscales, we can use `addsvl` to allocate any scalable stack-slots, which
6699  // under most circumstances will be only locals, not callee-save slots.
6700 const Function &F = MBB.getParent()->getFunction();
6701 bool UseSVL = F.hasFnAttribute(Kind: "aarch64_pstate_sm_body");
6702
6703 int64_t Bytes, NumPredicateVectors, NumDataVectors;
6704 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6705 Offset, NumBytes&: Bytes, NumPredicateVectors, NumDataVectors);
6706
6707 // Insert ADDSXri for scalable offset at the end.
6708 bool NeedsFinalDefNZCV = SetNZCV && (NumPredicateVectors || NumDataVectors);
6709 if (NeedsFinalDefNZCV)
6710 SetNZCV = false;
6711
6712 // First emit non-scalable frame offsets, or a simple 'mov'.
6713 if (Bytes || (!Offset && SrcReg != DestReg)) {
6714 assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
6715 "SP increment/decrement not 8-byte aligned");
6716 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
6717 if (Bytes < 0) {
6718 Bytes = -Bytes;
6719 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
6720 }
6721 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Offset: Bytes, Opc, TII, Flag,
6722 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6723 FrameReg);
6724 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
6725 ? StackOffset::getFixed(Fixed: -Bytes)
6726 : StackOffset::getFixed(Fixed: Bytes);
6727 SrcReg = DestReg;
6728 FrameReg = DestReg;
6729 }
6730
6731 assert(!(NeedsWinCFI && NumPredicateVectors) &&
6732 "WinCFI can't allocate fractions of an SVE data vector");
6733
6734 if (NumDataVectors) {
6735 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Offset: NumDataVectors,
6736 Opc: UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, TII,
6737 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6738 FrameReg);
6739 CFAOffset += StackOffset::getScalable(Scalable: -NumDataVectors * 16);
6740 SrcReg = DestReg;
6741 }
6742
6743 if (NumPredicateVectors) {
6744 assert(DestReg != AArch64::SP && "Unaligned access to SP");
6745 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Offset: NumPredicateVectors,
6746 Opc: UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, TII,
6747 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6748 FrameReg);
6749 }
6750
6751 if (NeedsFinalDefNZCV)
6752 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::ADDSXri), DestReg)
6753 .addReg(RegNo: DestReg)
6754 .addImm(Val: 0)
6755 .addImm(Val: 0);
6756}
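
// Worked example (illustrative): a StackOffset of 16 fixed plus 48 scalable
// bytes decomposes into Bytes == 16 and NumPredicateVectors == 24, which is
// folded into NumDataVectors == 3, so (register names arbitrary) this emits:
//   add   x9, x8, #16
//   addvl x9, x9, #3
// with addsvl used instead of addvl in aarch64_pstate_sm_body functions.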
6757
6758MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
6759 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
6760 MachineBasicBlock::iterator InsertPt, int FrameIndex,
6761 LiveIntervals *LIS, VirtRegMap *VRM) const {
6762 // This is a bit of a hack. Consider this instruction:
6763 //
6764 // %0 = COPY %sp; GPR64all:%0
6765 //
6766 // We explicitly chose GPR64all for the virtual register so such a copy might
6767 // be eliminated by RegisterCoalescer. However, that may not be possible, and
6768 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
6769 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
6770 //
6771 // To prevent that, we are going to constrain the %0 register class here.
6772 if (MI.isFullCopy()) {
6773 Register DstReg = MI.getOperand(i: 0).getReg();
6774 Register SrcReg = MI.getOperand(i: 1).getReg();
6775 if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
6776 MF.getRegInfo().constrainRegClass(Reg: DstReg, RC: &AArch64::GPR64RegClass);
6777 return nullptr;
6778 }
6779 if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
6780 MF.getRegInfo().constrainRegClass(Reg: SrcReg, RC: &AArch64::GPR64RegClass);
6781 return nullptr;
6782 }
6783    // Nothing can be folded with a copy from/to NZCV.
6784 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
6785 return nullptr;
6786 }
6787
6788  // Handle the case where a copy is being spilled or filled but the source
6789  // and destination register classes don't match. For example:
6790 //
6791 // %0 = COPY %xzr; GPR64common:%0
6792 //
6793 // In this case we can still safely fold away the COPY and generate the
6794 // following spill code:
6795 //
6796 // STRXui %xzr, %stack.0
6797 //
6798 // This also eliminates spilled cross register class COPYs (e.g. between x and
6799 // d regs) of the same size. For example:
6800 //
6801 // %0 = COPY %1; GPR64:%0, FPR64:%1
6802 //
6803 // will be filled as
6804 //
6805 // LDRDui %0, fi<#0>
6806 //
6807 // instead of
6808 //
6809 // LDRXui %Temp, fi<#0>
6810 // %0 = FMOV %Temp
6811 //
6812 if (MI.isCopy() && Ops.size() == 1 &&
6813 // Make sure we're only folding the explicit COPY defs/uses.
6814 (Ops[0] == 0 || Ops[0] == 1)) {
6815 bool IsSpill = Ops[0] == 0;
6816 bool IsFill = !IsSpill;
6817 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
6818 const MachineRegisterInfo &MRI = MF.getRegInfo();
6819 MachineBasicBlock &MBB = *MI.getParent();
6820 const MachineOperand &DstMO = MI.getOperand(i: 0);
6821 const MachineOperand &SrcMO = MI.getOperand(i: 1);
6822 Register DstReg = DstMO.getReg();
6823 Register SrcReg = SrcMO.getReg();
6824 // This is slightly expensive to compute for physical regs since
6825 // getMinimalPhysRegClass is slow.
6826 auto getRegClass = [&](unsigned Reg) {
6827 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
6828 : TRI.getMinimalPhysRegClass(Reg);
6829 };
6830
6831 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
6832 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
6833 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
6834 "Mismatched register size in non subreg COPY");
6835 if (IsSpill)
6836 storeRegToStackSlot(MBB, MBBI: InsertPt, SrcReg, isKill: SrcMO.isKill(), FI: FrameIndex,
6837 RC: getRegClass(SrcReg), VReg: Register());
6838 else
6839 loadRegFromStackSlot(MBB, MBBI: InsertPt, DestReg: DstReg, FI: FrameIndex,
6840 RC: getRegClass(DstReg), VReg: Register());
6841 return &*--InsertPt;
6842 }
6843
6844 // Handle cases like spilling def of:
6845 //
6846 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
6847 //
6848 // where the physical register source can be widened and stored to the full
6849 // virtual reg destination stack slot, in this case producing:
6850 //
6851 // STRXui %xzr, %stack.0
6852 //
6853 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
6854 TRI.getRegSizeInBits(RC: *getRegClass(DstReg)) == 64) {
6855 assert(SrcMO.getSubReg() == 0 &&
6856 "Unexpected subreg on physical register");
6857 storeRegToStackSlot(MBB, MBBI: InsertPt, SrcReg: AArch64::XZR, isKill: SrcMO.isKill(),
6858 FI: FrameIndex, RC: &AArch64::GPR64RegClass, VReg: Register());
6859 return &*--InsertPt;
6860 }
6861
6862 // Handle cases like filling use of:
6863 //
6864 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
6865 //
6866 // where we can load the full virtual reg source stack slot, into the subreg
6867 // destination, in this case producing:
6868 //
6869 // LDRWui %0:sub_32<def,read-undef>, %stack.0
6870 //
6871 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
6872 const TargetRegisterClass *FillRC = nullptr;
6873 switch (DstMO.getSubReg()) {
6874 default:
6875 break;
6876 case AArch64::sub_32:
6877 if (AArch64::GPR64RegClass.hasSubClassEq(RC: getRegClass(DstReg)))
6878 FillRC = &AArch64::GPR32RegClass;
6879 break;
6880 case AArch64::ssub:
6881 FillRC = &AArch64::FPR32RegClass;
6882 break;
6883 case AArch64::dsub:
6884 FillRC = &AArch64::FPR64RegClass;
6885 break;
6886 }
6887
6888 if (FillRC) {
6889 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
6890 TRI.getRegSizeInBits(*FillRC) &&
6891 "Mismatched regclass size on folded subreg COPY");
6892 loadRegFromStackSlot(MBB, MBBI: InsertPt, DestReg: DstReg, FI: FrameIndex, RC: FillRC,
6893 VReg: Register());
6894 MachineInstr &LoadMI = *--InsertPt;
6895 MachineOperand &LoadDst = LoadMI.getOperand(i: 0);
6896 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
6897 LoadDst.setSubReg(DstMO.getSubReg());
6898 LoadDst.setIsUndef();
6899 return &LoadMI;
6900 }
6901 }
6902 }
6903
6904 // Cannot fold.
6905 return nullptr;
6906}
6907
6908int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
6909 StackOffset &SOffset,
6910 bool *OutUseUnscaledOp,
6911 unsigned *OutUnscaledOp,
6912 int64_t *EmittableOffset) {
6913 // Set output values in case of early exit.
6914 if (EmittableOffset)
6915 *EmittableOffset = 0;
6916 if (OutUseUnscaledOp)
6917 *OutUseUnscaledOp = false;
6918 if (OutUnscaledOp)
6919 *OutUnscaledOp = 0;
6920
6921 // Exit early for structured vector spills/fills as they can't take an
6922 // immediate offset.
6923 switch (MI.getOpcode()) {
6924 default:
6925 break;
6926 case AArch64::LD1Rv1d:
6927 case AArch64::LD1Rv2s:
6928 case AArch64::LD1Rv2d:
6929 case AArch64::LD1Rv4h:
6930 case AArch64::LD1Rv4s:
6931 case AArch64::LD1Rv8b:
6932 case AArch64::LD1Rv8h:
6933 case AArch64::LD1Rv16b:
6934 case AArch64::LD1Twov2d:
6935 case AArch64::LD1Threev2d:
6936 case AArch64::LD1Fourv2d:
6937 case AArch64::LD1Twov1d:
6938 case AArch64::LD1Threev1d:
6939 case AArch64::LD1Fourv1d:
6940 case AArch64::ST1Twov2d:
6941 case AArch64::ST1Threev2d:
6942 case AArch64::ST1Fourv2d:
6943 case AArch64::ST1Twov1d:
6944 case AArch64::ST1Threev1d:
6945 case AArch64::ST1Fourv1d:
6946 case AArch64::ST1i8:
6947 case AArch64::ST1i16:
6948 case AArch64::ST1i32:
6949 case AArch64::ST1i64:
6950 case AArch64::IRG:
6951 case AArch64::IRGstack:
6952 case AArch64::STGloop:
6953 case AArch64::STZGloop:
6954 return AArch64FrameOffsetCannotUpdate;
6955 }
6956
6957 // Get the min/max offset and the scale.
6958 TypeSize ScaleValue(0U, false), Width(0U, false);
6959 int64_t MinOff, MaxOff;
6960 if (!AArch64InstrInfo::getMemOpInfo(Opcode: MI.getOpcode(), Scale&: ScaleValue, Width, MinOffset&: MinOff,
6961 MaxOffset&: MaxOff))
6962 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
6963
6964 // Construct the complete offset.
6965 bool IsMulVL = ScaleValue.isScalable();
6966 unsigned Scale = ScaleValue.getKnownMinValue();
6967 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
6968
6969 const MachineOperand &ImmOpnd =
6970 MI.getOperand(i: AArch64InstrInfo::getLoadStoreImmIdx(Opc: MI.getOpcode()));
6971 Offset += ImmOpnd.getImm() * Scale;
6972
6973  // If the offset doesn't match the scale, or if the offset is negative and
6974  // there is an unscaled op to use, rewrite the instruction to use the
6975  // unscaled instruction instead.
6976 std::optional<unsigned> UnscaledOp =
6977 AArch64InstrInfo::getUnscaledLdSt(Opc: MI.getOpcode());
6978 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
6979 if (useUnscaledOp &&
6980 !AArch64InstrInfo::getMemOpInfo(Opcode: *UnscaledOp, Scale&: ScaleValue, Width, MinOffset&: MinOff,
6981 MaxOffset&: MaxOff))
6982 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
6983
6984 Scale = ScaleValue.getKnownMinValue();
6985 assert(IsMulVL == ScaleValue.isScalable() &&
6986 "Unscaled opcode has different value for scalable");
6987
6988 int64_t Remainder = Offset % Scale;
6989 assert(!(Remainder && useUnscaledOp) &&
6990 "Cannot have remainder when using unscaled op");
6991
6992 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
6993 int64_t NewOffset = Offset / Scale;
6994 if (MinOff <= NewOffset && NewOffset <= MaxOff)
6995 Offset = Remainder;
6996 else {
6997 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
6998 Offset = Offset - (NewOffset * Scale);
6999 }
7000
7001 if (EmittableOffset)
7002 *EmittableOffset = NewOffset;
7003 if (OutUseUnscaledOp)
7004 *OutUseUnscaledOp = useUnscaledOp;
7005 if (OutUnscaledOp && UnscaledOp)
7006 *OutUnscaledOp = *UnscaledOp;
7007
7008 if (IsMulVL)
7009 SOffset = StackOffset::get(Fixed: SOffset.getFixed(), Scalable: Offset);
7010 else
7011 SOffset = StackOffset::get(Fixed: Offset, Scalable: SOffset.getScalable());
7012 return AArch64FrameOffsetCanUpdate |
7013 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
7014}
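
// Worked example (illustrative): for an LDRXui (scale 8, immediate range
// 0..4095) whose scaled immediate plus the incoming fixed offset totals 24
// bytes, 24 is a multiple of 8 and 24 / 8 == 3 is in range, so
// *EmittableOffset is set to 3, the fixed part of SOffset becomes zero, and
// (with no scalable part) the result is
// AArch64FrameOffsetCanUpdate | AArch64FrameOffsetIsLegal.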
7015
7016bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
7017 unsigned FrameReg, StackOffset &Offset,
7018 const AArch64InstrInfo *TII) {
7019 unsigned Opcode = MI.getOpcode();
7020 unsigned ImmIdx = FrameRegIdx + 1;
7021
7022 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
7023 Offset += StackOffset::getFixed(Fixed: MI.getOperand(i: ImmIdx).getImm());
7024 emitFrameOffset(MBB&: *MI.getParent(), MBBI: MI, DL: MI.getDebugLoc(),
7025 DestReg: MI.getOperand(i: 0).getReg(), SrcReg: FrameReg, Offset, TII,
7026 Flag: MachineInstr::NoFlags, SetNZCV: (Opcode == AArch64::ADDSXri));
7027 MI.eraseFromParent();
7028 Offset = StackOffset();
7029 return true;
7030 }
7031
7032 int64_t NewOffset;
7033 unsigned UnscaledOp;
7034 bool UseUnscaledOp;
7035 int Status = isAArch64FrameOffsetLegal(MI, SOffset&: Offset, OutUseUnscaledOp: &UseUnscaledOp,
7036 OutUnscaledOp: &UnscaledOp, EmittableOffset: &NewOffset);
7037 if (Status & AArch64FrameOffsetCanUpdate) {
7038 if (Status & AArch64FrameOffsetIsLegal)
7039 // Replace the FrameIndex with FrameReg.
7040 MI.getOperand(i: FrameRegIdx).ChangeToRegister(Reg: FrameReg, isDef: false);
7041 if (UseUnscaledOp)
7042 MI.setDesc(TII->get(Opcode: UnscaledOp));
7043
7044 MI.getOperand(i: ImmIdx).ChangeToImmediate(ImmVal: NewOffset);
7045 return !Offset;
7046 }
7047
7048 return false;
7049}
7050
7051void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB,
7052 MachineBasicBlock::iterator MI) const {
7053 DebugLoc DL;
7054 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AArch64::NOP));
7055}
7056
7057MCInst AArch64InstrInfo::getNop() const { return MCInstBuilder(AArch64::NOP); }
7058
7059// AArch64 supports MachineCombiner.
7060bool AArch64InstrInfo::useMachineCombiner() const { return true; }
7061
7062// True when Opc sets the NZCV flags.
7063static bool isCombineInstrSettingFlag(unsigned Opc) {
7064 switch (Opc) {
7065 case AArch64::ADDSWrr:
7066 case AArch64::ADDSWri:
7067 case AArch64::ADDSXrr:
7068 case AArch64::ADDSXri:
7069 case AArch64::SUBSWrr:
7070 case AArch64::SUBSXrr:
7071 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7072 case AArch64::SUBSWri:
7073 case AArch64::SUBSXri:
7074 return true;
7075 default:
7076 break;
7077 }
7078 return false;
7079}
7080
7081// 32b Opcodes that can be combined with a MUL
7082static bool isCombineInstrCandidate32(unsigned Opc) {
7083 switch (Opc) {
7084 case AArch64::ADDWrr:
7085 case AArch64::ADDWri:
7086 case AArch64::SUBWrr:
7087 case AArch64::ADDSWrr:
7088 case AArch64::ADDSWri:
7089 case AArch64::SUBSWrr:
7090 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7091 case AArch64::SUBWri:
7092 case AArch64::SUBSWri:
7093 return true;
7094 default:
7095 break;
7096 }
7097 return false;
7098}
7099
7100// 64b Opcodes that can be combined with a MUL
7101static bool isCombineInstrCandidate64(unsigned Opc) {
7102 switch (Opc) {
7103 case AArch64::ADDXrr:
7104 case AArch64::ADDXri:
7105 case AArch64::SUBXrr:
7106 case AArch64::ADDSXrr:
7107 case AArch64::ADDSXri:
7108 case AArch64::SUBSXrr:
7109 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7110 case AArch64::SUBXri:
7111 case AArch64::SUBSXri:
7112 case AArch64::ADDv8i8:
7113 case AArch64::ADDv16i8:
7114 case AArch64::ADDv4i16:
7115 case AArch64::ADDv8i16:
7116 case AArch64::ADDv2i32:
7117 case AArch64::ADDv4i32:
7118 case AArch64::SUBv8i8:
7119 case AArch64::SUBv16i8:
7120 case AArch64::SUBv4i16:
7121 case AArch64::SUBv8i16:
7122 case AArch64::SUBv2i32:
7123 case AArch64::SUBv4i32:
7124 return true;
7125 default:
7126 break;
7127 }
7128 return false;
7129}
7130
7131// FP Opcodes that can be combined with a FMUL.
7132static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
7133 switch (Inst.getOpcode()) {
7134 default:
7135 break;
7136 case AArch64::FADDHrr:
7137 case AArch64::FADDSrr:
7138 case AArch64::FADDDrr:
7139 case AArch64::FADDv4f16:
7140 case AArch64::FADDv8f16:
7141 case AArch64::FADDv2f32:
7142 case AArch64::FADDv2f64:
7143 case AArch64::FADDv4f32:
7144 case AArch64::FSUBHrr:
7145 case AArch64::FSUBSrr:
7146 case AArch64::FSUBDrr:
7147 case AArch64::FSUBv4f16:
7148 case AArch64::FSUBv8f16:
7149 case AArch64::FSUBv2f32:
7150 case AArch64::FSUBv2f64:
7151 case AArch64::FSUBv4f32:
7152 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
7153    // We can fuse FADD/FSUB with FMUL if fusion is allowed globally by the
7154    // target options or if the FADD/FSUB has the contract fast-math flag.
7155 return Options.AllowFPOpFusion == FPOpFusion::Fast ||
7156 Inst.getFlag(Flag: MachineInstr::FmContract);
7157 }
7158 return false;
7159}
7160
7161// Opcodes that can be combined with a MUL
7162static bool isCombineInstrCandidate(unsigned Opc) {
7163 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
7164}
7165
7166//
7167// Utility routine that checks if \param MO is defined by an
7168// \param CombineOpc instruction in the basic block \param MBB
7169static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
7170 unsigned CombineOpc, unsigned ZeroReg = 0,
7171 bool CheckZeroReg = false) {
7172 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7173 MachineInstr *MI = nullptr;
7174
7175 if (MO.isReg() && MO.getReg().isVirtual())
7176 MI = MRI.getUniqueVRegDef(Reg: MO.getReg());
7177 // And it needs to be in the trace (otherwise, it won't have a depth).
7178 if (!MI || MI->getParent() != &MBB || MI->getOpcode() != CombineOpc)
7179 return false;
7180  // It must only be used by the user we combine with.
7181 if (!MRI.hasOneNonDBGUse(RegNo: MI->getOperand(i: 0).getReg()))
7182 return false;
7183
7184 if (CheckZeroReg) {
7185    assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
7186           MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
7187           MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
7188 // The third input reg must be zero.
7189 if (MI->getOperand(i: 3).getReg() != ZeroReg)
7190 return false;
7191 }
7192
7193 if (isCombineInstrSettingFlag(Opc: CombineOpc) &&
7194 MI->findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true) == -1)
7195 return false;
7196
7197 return true;
7198}
7199
7200//
7201// Is \param MO defined by an integer multiply and can be combined?
7202static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
7203 unsigned MulOpc, unsigned ZeroReg) {
7204 return canCombine(MBB, MO, CombineOpc: MulOpc, ZeroReg, CheckZeroReg: true);
7205}
7206
7207//
7208// Is \param MO defined by a floating-point multiply and can be combined?
7209static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
7210 unsigned MulOpc) {
7211 return canCombine(MBB, MO, CombineOpc: MulOpc);
7212}
7213
7214// TODO: There are many more machine instruction opcodes to match:
7215// 1. Other data types (integer, vectors)
7216// 2. Other math / logic operations (xor, or)
7217// 3. Other forms of the same operation (intrinsics and other variants)
7218bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
7219 bool Invert) const {
7220 if (Invert)
7221 return false;
7222 switch (Inst.getOpcode()) {
7223 // == Floating-point types ==
7224 // -- Floating-point instructions --
7225 case AArch64::FADDHrr:
7226 case AArch64::FADDSrr:
7227 case AArch64::FADDDrr:
7228 case AArch64::FMULHrr:
7229 case AArch64::FMULSrr:
7230 case AArch64::FMULDrr:
7231 case AArch64::FMULX16:
7232 case AArch64::FMULX32:
7233 case AArch64::FMULX64:
7234 // -- Advanced SIMD instructions --
7235 case AArch64::FADDv4f16:
7236 case AArch64::FADDv8f16:
7237 case AArch64::FADDv2f32:
7238 case AArch64::FADDv4f32:
7239 case AArch64::FADDv2f64:
7240 case AArch64::FMULv4f16:
7241 case AArch64::FMULv8f16:
7242 case AArch64::FMULv2f32:
7243 case AArch64::FMULv4f32:
7244 case AArch64::FMULv2f64:
7245 case AArch64::FMULXv4f16:
7246 case AArch64::FMULXv8f16:
7247 case AArch64::FMULXv2f32:
7248 case AArch64::FMULXv4f32:
7249 case AArch64::FMULXv2f64:
7250 // -- SVE instructions --
7251 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
7252 // in the SVE instruction set (though there are predicated ones).
7253 case AArch64::FADD_ZZZ_H:
7254 case AArch64::FADD_ZZZ_S:
7255 case AArch64::FADD_ZZZ_D:
7256 case AArch64::FMUL_ZZZ_H:
7257 case AArch64::FMUL_ZZZ_S:
7258 case AArch64::FMUL_ZZZ_D:
7259 return Inst.getFlag(Flag: MachineInstr::MIFlag::FmReassoc) &&
7260 Inst.getFlag(Flag: MachineInstr::MIFlag::FmNsz);
7261
7262 // == Integer types ==
7263 // -- Base instructions --
7264 // Opcodes MULWrr and MULXrr don't exist because
7265 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
7266 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
7267  // The machine-combiner does not support three-source-operand machine
7268  // instructions, so we cannot reassociate MULs.
7269 case AArch64::ADDWrr:
7270 case AArch64::ADDXrr:
7271 case AArch64::ANDWrr:
7272 case AArch64::ANDXrr:
7273 case AArch64::ORRWrr:
7274 case AArch64::ORRXrr:
7275 case AArch64::EORWrr:
7276 case AArch64::EORXrr:
7277 case AArch64::EONWrr:
7278 case AArch64::EONXrr:
7279 // -- Advanced SIMD instructions --
7280 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
7281 // in the Advanced SIMD instruction set.
7282 case AArch64::ADDv8i8:
7283 case AArch64::ADDv16i8:
7284 case AArch64::ADDv4i16:
7285 case AArch64::ADDv8i16:
7286 case AArch64::ADDv2i32:
7287 case AArch64::ADDv4i32:
7288 case AArch64::ADDv1i64:
7289 case AArch64::ADDv2i64:
7290 case AArch64::MULv8i8:
7291 case AArch64::MULv16i8:
7292 case AArch64::MULv4i16:
7293 case AArch64::MULv8i16:
7294 case AArch64::MULv2i32:
7295 case AArch64::MULv4i32:
7296 case AArch64::ANDv8i8:
7297 case AArch64::ANDv16i8:
7298 case AArch64::ORRv8i8:
7299 case AArch64::ORRv16i8:
7300 case AArch64::EORv8i8:
7301 case AArch64::EORv16i8:
7302 // -- SVE instructions --
7303 case AArch64::ADD_ZZZ_B:
7304 case AArch64::ADD_ZZZ_H:
7305 case AArch64::ADD_ZZZ_S:
7306 case AArch64::ADD_ZZZ_D:
7307 case AArch64::MUL_ZZZ_B:
7308 case AArch64::MUL_ZZZ_H:
7309 case AArch64::MUL_ZZZ_S:
7310 case AArch64::MUL_ZZZ_D:
7311 case AArch64::AND_ZZZ:
7312 case AArch64::ORR_ZZZ:
7313 case AArch64::EOR_ZZZ:
7314 return true;
7315
7316 default:
7317 return false;
7318 }
7319}
7320
7321/// Find instructions that can be turned into madd.
7322static bool getMaddPatterns(MachineInstr &Root,
7323 SmallVectorImpl<unsigned> &Patterns) {
7324 unsigned Opc = Root.getOpcode();
7325 MachineBasicBlock &MBB = *Root.getParent();
7326 bool Found = false;
7327
7328 if (!isCombineInstrCandidate(Opc))
7329 return false;
7330 if (isCombineInstrSettingFlag(Opc)) {
7331 int Cmp_NZCV =
7332 Root.findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true);
7333    // When NZCV is live, bail out.
7334 if (Cmp_NZCV == -1)
7335 return false;
7336 unsigned NewOpc = convertToNonFlagSettingOpc(MI: Root);
7337    // When the opcode can't be changed, bail out.
7338 // CHECKME: do we miss any cases for opcode conversion?
7339 if (NewOpc == Opc)
7340 return false;
7341 Opc = NewOpc;
7342 }
7343
7344 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
7345 unsigned Pattern) {
7346 if (canCombineWithMUL(MBB, MO&: Root.getOperand(i: Operand), MulOpc: Opcode, ZeroReg)) {
7347 Patterns.push_back(Elt: Pattern);
7348 Found = true;
7349 }
7350 };
7351
7352 auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
7353 if (canCombine(MBB, MO&: Root.getOperand(i: Operand), CombineOpc: Opcode)) {
7354 Patterns.push_back(Elt: Pattern);
7355 Found = true;
7356 }
7357 };
7358
7359 typedef AArch64MachineCombinerPattern MCP;
7360
7361 switch (Opc) {
7362 default:
7363 break;
7364 case AArch64::ADDWrr:
7365 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7366 "ADDWrr does not have register operands");
7367 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
7368 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
7369 break;
7370 case AArch64::ADDXrr:
7371 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
7372 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
7373 break;
7374 case AArch64::SUBWrr:
7375 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
7376 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
7377 break;
7378 case AArch64::SUBXrr:
7379 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
7380 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
7381 break;
7382 case AArch64::ADDWri:
7383 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
7384 break;
7385 case AArch64::ADDXri:
7386 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
7387 break;
7388 case AArch64::SUBWri:
7389 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
7390 break;
7391 case AArch64::SUBXri:
7392 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
7393 break;
7394 case AArch64::ADDv8i8:
7395 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
7396 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
7397 break;
7398 case AArch64::ADDv16i8:
7399 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
7400 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
7401 break;
7402 case AArch64::ADDv4i16:
7403 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
7404 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
7405 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
7406 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
7407 break;
7408 case AArch64::ADDv8i16:
7409 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
7410 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
7411 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
7412 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
7413 break;
7414 case AArch64::ADDv2i32:
7415 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
7416 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
7417 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
7418 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
7419 break;
7420 case AArch64::ADDv4i32:
7421 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
7422 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
7423 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
7424 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
7425 break;
7426 case AArch64::SUBv8i8:
7427 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
7428 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
7429 break;
7430 case AArch64::SUBv16i8:
7431 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
7432 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
7433 break;
7434 case AArch64::SUBv4i16:
7435 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
7436 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
7437 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
7438 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
7439 break;
7440 case AArch64::SUBv8i16:
7441 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
7442 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
7443 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
7444 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
7445 break;
7446 case AArch64::SUBv2i32:
7447 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
7448 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
7449 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
7450 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
7451 break;
7452 case AArch64::SUBv4i32:
7453 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
7454 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
7455 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
7456 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
7457 break;
7458 }
7459 return Found;
7460}
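
// Worked example (illustrative): for Root `%3 = ADDWrr %1, %2` where %2 is
// produced by `%2 = MADDWrrr %a, %b, $wzr` (the MUL alias) and has a single
// use, MULADDW_OP2 is recorded so the combiner can later rewrite the pair as
// `%3 = MADDWrrr %a, %b, %1`.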
7461
7462bool AArch64InstrInfo::isAccumulationOpcode(unsigned Opcode) const {
7463 switch (Opcode) {
7464 default:
7465 break;
7466 case AArch64::UABALB_ZZZ_D:
7467 case AArch64::UABALB_ZZZ_H:
7468 case AArch64::UABALB_ZZZ_S:
7469 case AArch64::UABALT_ZZZ_D:
7470 case AArch64::UABALT_ZZZ_H:
7471 case AArch64::UABALT_ZZZ_S:
7472 case AArch64::SABALB_ZZZ_D:
7473 case AArch64::SABALB_ZZZ_S:
7474 case AArch64::SABALB_ZZZ_H:
7475 case AArch64::SABALT_ZZZ_D:
7476 case AArch64::SABALT_ZZZ_S:
7477 case AArch64::SABALT_ZZZ_H:
7478 case AArch64::UABALv16i8_v8i16:
7479 case AArch64::UABALv2i32_v2i64:
7480 case AArch64::UABALv4i16_v4i32:
7481 case AArch64::UABALv4i32_v2i64:
7482 case AArch64::UABALv8i16_v4i32:
7483 case AArch64::UABALv8i8_v8i16:
7484 case AArch64::UABAv16i8:
7485 case AArch64::UABAv2i32:
7486 case AArch64::UABAv4i16:
7487 case AArch64::UABAv4i32:
7488 case AArch64::UABAv8i16:
7489 case AArch64::UABAv8i8:
7490 case AArch64::SABALv16i8_v8i16:
7491 case AArch64::SABALv2i32_v2i64:
7492 case AArch64::SABALv4i16_v4i32:
7493 case AArch64::SABALv4i32_v2i64:
7494 case AArch64::SABALv8i16_v4i32:
7495 case AArch64::SABALv8i8_v8i16:
7496 case AArch64::SABAv16i8:
7497 case AArch64::SABAv2i32:
7498 case AArch64::SABAv4i16:
7499 case AArch64::SABAv4i32:
7500 case AArch64::SABAv8i16:
7501 case AArch64::SABAv8i8:
7502 return true;
7503 }
7504
7505 return false;
7506}
7507
7508unsigned AArch64InstrInfo::getAccumulationStartOpcode(
7509 unsigned AccumulationOpcode) const {
7510 switch (AccumulationOpcode) {
7511 default:
7512 llvm_unreachable("Unsupported accumulation Opcode!");
7513 case AArch64::UABALB_ZZZ_D:
7514 return AArch64::UABDLB_ZZZ_D;
7515 case AArch64::UABALB_ZZZ_H:
7516 return AArch64::UABDLB_ZZZ_H;
7517 case AArch64::UABALB_ZZZ_S:
7518 return AArch64::UABDLB_ZZZ_S;
7519 case AArch64::UABALT_ZZZ_D:
7520 return AArch64::UABDLT_ZZZ_D;
7521 case AArch64::UABALT_ZZZ_H:
7522 return AArch64::UABDLT_ZZZ_H;
7523 case AArch64::UABALT_ZZZ_S:
7524 return AArch64::UABDLT_ZZZ_S;
7525 case AArch64::UABALv16i8_v8i16:
7526 return AArch64::UABDLv16i8_v8i16;
7527 case AArch64::UABALv2i32_v2i64:
7528 return AArch64::UABDLv2i32_v2i64;
7529 case AArch64::UABALv4i16_v4i32:
7530 return AArch64::UABDLv4i16_v4i32;
7531 case AArch64::UABALv4i32_v2i64:
7532 return AArch64::UABDLv4i32_v2i64;
7533 case AArch64::UABALv8i16_v4i32:
7534 return AArch64::UABDLv8i16_v4i32;
7535 case AArch64::UABALv8i8_v8i16:
7536 return AArch64::UABDLv8i8_v8i16;
7537 case AArch64::UABAv16i8:
7538 return AArch64::UABDv16i8;
7539 case AArch64::UABAv2i32:
7540 return AArch64::UABDv2i32;
7541 case AArch64::UABAv4i16:
7542 return AArch64::UABDv4i16;
7543 case AArch64::UABAv4i32:
7544 return AArch64::UABDv4i32;
7545 case AArch64::UABAv8i16:
7546 return AArch64::UABDv8i16;
7547 case AArch64::UABAv8i8:
7548 return AArch64::UABDv8i8;
7549 case AArch64::SABALB_ZZZ_D:
7550 return AArch64::SABDLB_ZZZ_D;
7551 case AArch64::SABALB_ZZZ_S:
7552 return AArch64::SABDLB_ZZZ_S;
7553 case AArch64::SABALB_ZZZ_H:
7554 return AArch64::SABDLB_ZZZ_H;
7555 case AArch64::SABALT_ZZZ_D:
7556 return AArch64::SABDLT_ZZZ_D;
7557 case AArch64::SABALT_ZZZ_S:
7558 return AArch64::SABDLT_ZZZ_S;
7559 case AArch64::SABALT_ZZZ_H:
7560 return AArch64::SABDLT_ZZZ_H;
7561 case AArch64::SABALv16i8_v8i16:
7562 return AArch64::SABDLv16i8_v8i16;
7563 case AArch64::SABALv2i32_v2i64:
7564 return AArch64::SABDLv2i32_v2i64;
7565 case AArch64::SABALv4i16_v4i32:
7566 return AArch64::SABDLv4i16_v4i32;
7567 case AArch64::SABALv4i32_v2i64:
7568 return AArch64::SABDLv4i32_v2i64;
7569 case AArch64::SABALv8i16_v4i32:
7570 return AArch64::SABDLv8i16_v4i32;
7571 case AArch64::SABALv8i8_v8i16:
7572 return AArch64::SABDLv8i8_v8i16;
7573 case AArch64::SABAv16i8:
7574 return AArch64::SABDv16i8;
7575 case AArch64::SABAv2i32:
7576    return AArch64::SABDv2i32;
7577 case AArch64::SABAv4i16:
7578 return AArch64::SABDv4i16;
7579 case AArch64::SABAv4i32:
7580 return AArch64::SABDv4i32;
7581 case AArch64::SABAv8i16:
7582 return AArch64::SABDv8i16;
7583 case AArch64::SABAv8i8:
7584 return AArch64::SABDv8i8;
7585 }
7586}
7587
7588/// Floating-Point Support
7589
7590/// Find floating-point instructions that can be combined into fmadd/fmsub.
7591static bool getFMAPatterns(MachineInstr &Root,
7592 SmallVectorImpl<unsigned> &Patterns) {
7593
7594 if (!isCombineInstrCandidateFP(Inst: Root))
7595 return false;
7596
7597 MachineBasicBlock &MBB = *Root.getParent();
7598 bool Found = false;
7599
7600 auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
7601 if (canCombineWithFMUL(MBB, MO&: Root.getOperand(i: Operand), MulOpc: Opcode)) {
7602 Patterns.push_back(Elt: Pattern);
7603 return true;
7604 }
7605 return false;
7606 };
7607
7608 typedef AArch64MachineCombinerPattern MCP;
7609
7610 switch (Root.getOpcode()) {
7611 default:
7612 assert(false && "Unsupported FP instruction in combiner\n");
7613 break;
7614 case AArch64::FADDHrr:
7615 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7616 "FADDHrr does not have register operands");
7617
7618 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
7619 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
7620 break;
7621 case AArch64::FADDSrr:
7622 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7623 "FADDSrr does not have register operands");
7624
7625 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
7626 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
7627
7628 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
7629 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
7630 break;
7631 case AArch64::FADDDrr:
7632 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
7633 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
7634
7635 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
7636 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
7637 break;
7638 case AArch64::FADDv4f16:
7639 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
7640 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
7641
7642 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
7643 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
7644 break;
7645 case AArch64::FADDv8f16:
7646 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
7647 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
7648
7649 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
7650 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
7651 break;
7652 case AArch64::FADDv2f32:
7653 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
7654 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
7655
7656 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
7657 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
7658 break;
7659 case AArch64::FADDv2f64:
7660 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
7661 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
7662
7663 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
7664 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
7665 break;
7666 case AArch64::FADDv4f32:
7667 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
7668 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
7669
7670 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
7671 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
7672 break;
7673 case AArch64::FSUBHrr:
7674 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
7675 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
7676 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
7677 break;
7678 case AArch64::FSUBSrr:
7679 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
7680
7681 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
7682 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
7683
7684 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
7685 break;
7686 case AArch64::FSUBDrr:
7687 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
7688
7689 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
7690 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
7691
7692 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
7693 break;
7694 case AArch64::FSUBv4f16:
7695 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
7696 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
7697
7698 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
7699 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
7700 break;
7701 case AArch64::FSUBv8f16:
7702 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
7703 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
7704
7705 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
7706 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
7707 break;
7708 case AArch64::FSUBv2f32:
7709 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
7710 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
7711
7712 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
7713 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
7714 break;
7715 case AArch64::FSUBv2f64:
7716 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
7717 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
7718
7719 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
7720 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
7721 break;
7722 case AArch64::FSUBv4f32:
7723 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
7724 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
7725
7726 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
7727 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
7728 break;
7729 }
7730 return Found;
7731}
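
// Worked example (illustrative): for Root FADDSrr %1, %2 where %2 is the
// single-use result of an FMULSrr, FMULADDS_OP2 is recorded so the combiner
// can fuse the pair into a single FMADDSrrr. isCombineInstrCandidateFP has
// already checked that fusion is permitted (FPOpFusion::Fast or the contract
// fast-math flag on the FADD/FSUB).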
7732
7733static bool getFMULPatterns(MachineInstr &Root,
7734 SmallVectorImpl<unsigned> &Patterns) {
7735 MachineBasicBlock &MBB = *Root.getParent();
7736 bool Found = false;
7737
7738 auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
7739 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7740 MachineOperand &MO = Root.getOperand(i: Operand);
7741 MachineInstr *MI = nullptr;
7742 if (MO.isReg() && MO.getReg().isVirtual())
7743 MI = MRI.getUniqueVRegDef(Reg: MO.getReg());
7744 // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
7745 if (MI && MI->getOpcode() == TargetOpcode::COPY &&
7746 MI->getOperand(i: 1).getReg().isVirtual())
7747 MI = MRI.getUniqueVRegDef(Reg: MI->getOperand(i: 1).getReg());
7748 if (MI && MI->getOpcode() == Opcode) {
7749 Patterns.push_back(Elt: Pattern);
7750 return true;
7751 }
7752 return false;
7753 };
7754
7755 typedef AArch64MachineCombinerPattern MCP;
7756
7757 switch (Root.getOpcode()) {
7758 default:
7759 return false;
7760 case AArch64::FMULv2f32:
7761 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
7762 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
7763 break;
7764 case AArch64::FMULv2f64:
7765 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
7766 Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
7767 break;
7768 case AArch64::FMULv4f16:
7769 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
7770 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
7771 break;
7772 case AArch64::FMULv4f32:
7773 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
7774 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
7775 break;
7776 case AArch64::FMULv8f16:
7777 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
7778 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
7779 break;
7780 }
7781
7782 return Found;
7783}
7784
7785static bool getFNEGPatterns(MachineInstr &Root,
7786 SmallVectorImpl<unsigned> &Patterns) {
7787 unsigned Opc = Root.getOpcode();
7788 MachineBasicBlock &MBB = *Root.getParent();
7789 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7790
7791 auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
7792 MachineOperand &MO = Root.getOperand(i: 1);
7793 MachineInstr *MI = MRI.getUniqueVRegDef(Reg: MO.getReg());
7794 if (MI != nullptr && (MI->getOpcode() == Opcode) &&
7795 MRI.hasOneNonDBGUse(RegNo: MI->getOperand(i: 0).getReg()) &&
7796 Root.getFlag(Flag: MachineInstr::MIFlag::FmContract) &&
7797 Root.getFlag(Flag: MachineInstr::MIFlag::FmNsz) &&
7798 MI->getFlag(Flag: MachineInstr::MIFlag::FmContract) &&
7799 MI->getFlag(Flag: MachineInstr::MIFlag::FmNsz)) {
7800 Patterns.push_back(Elt: Pattern);
7801 return true;
7802 }
7803 return false;
7804 };
7805
7806 switch (Opc) {
7807 default:
7808 break;
7809 case AArch64::FNEGDr:
7810 return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
7811 case AArch64::FNEGSr:
7812 return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
7813 }
7814
7815 return false;
7816}
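
// Worked example (illustrative): for `%2 = FNEGDr %1` where %1 is the
// single-use result of an FMADDDrrr and both instructions carry the contract
// and nsz fast-math flags, the FNMADD pattern is recorded so the pair can
// later be rewritten as a single fnmadd.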
7817
7818/// Return true when a code sequence can improve throughput. It
7819/// should be called only for instructions in loops.
7820/// \param Pattern - combiner pattern
7821bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
7822 switch (Pattern) {
7823 default:
7824 break;
7825 case AArch64MachineCombinerPattern::FMULADDH_OP1:
7826 case AArch64MachineCombinerPattern::FMULADDH_OP2:
7827 case AArch64MachineCombinerPattern::FMULSUBH_OP1:
7828 case AArch64MachineCombinerPattern::FMULSUBH_OP2:
7829 case AArch64MachineCombinerPattern::FMULADDS_OP1:
7830 case AArch64MachineCombinerPattern::FMULADDS_OP2:
7831 case AArch64MachineCombinerPattern::FMULSUBS_OP1:
7832 case AArch64MachineCombinerPattern::FMULSUBS_OP2:
7833 case AArch64MachineCombinerPattern::FMULADDD_OP1:
7834 case AArch64MachineCombinerPattern::FMULADDD_OP2:
7835 case AArch64MachineCombinerPattern::FMULSUBD_OP1:
7836 case AArch64MachineCombinerPattern::FMULSUBD_OP2:
7837 case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
7838 case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
7839 case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
7840 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
7841 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
7842 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
7843 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
7844 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
7845 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
7846 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
7847 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
7848 case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
7849 case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
7850 case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
7851 case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
7852 case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
7853 case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
7854 case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
7855 case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
7856 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
7857 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
7858 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
7859 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
7860 case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
7861 case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
7862 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
7863 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
7864 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1:
7865 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
7866 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1:
7867 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
7868 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
7869 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
7870 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
7871 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
7872 case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
7873 case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
7874 case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
7875 case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
7876 case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
7877 case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
7878 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
7879 case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
7880 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
7881 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2:
7882 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
7883 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2:
7884 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
7885 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2:
7886 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
7887 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2:
7888 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
7889 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2:
7890 case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
7891 case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
7892 case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
7893 case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
7894 case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
7895 case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
7896 case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
7897 case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
7898 case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
7899 case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
7900 case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
7901 case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
7902 case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
7903 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
7904 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
7905 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
7906 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
7907 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
7908 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
7909 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
7910 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
7911 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
7912 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
7913 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
7914 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
7915 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
7916 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
7917 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
7918 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
7919 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
7920 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
7921 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
7922 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
7923 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
7924 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
7925 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
7926 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
7927 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
7928 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
7929 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
7930 return true;
7931 } // end switch (Pattern)
7932 return false;
7933}
7934
7935/// Find other MI combine patterns.
7936static bool getMiscPatterns(MachineInstr &Root,
7937 SmallVectorImpl<unsigned> &Patterns) {
7938 // A - (B + C) ==> (A - B) - C or (A - C) - B
7939 unsigned Opc = Root.getOpcode();
7940 MachineBasicBlock &MBB = *Root.getParent();
7941
7942 switch (Opc) {
7943 case AArch64::SUBWrr:
7944 case AArch64::SUBSWrr:
7945 case AArch64::SUBXrr:
7946 case AArch64::SUBSXrr:
7947 // Found candidate root.
7948 break;
7949 default:
7950 return false;
7951 }
7952
7953 if (isCombineInstrSettingFlag(Opc) &&
7954 Root.findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true) ==
7955 -1)
7956 return false;
7957
7958 if (canCombine(MBB, MO&: Root.getOperand(i: 2), CombineOpc: AArch64::ADDWrr) ||
7959 canCombine(MBB, MO&: Root.getOperand(i: 2), CombineOpc: AArch64::ADDSWrr) ||
7960 canCombine(MBB, MO&: Root.getOperand(i: 2), CombineOpc: AArch64::ADDXrr) ||
7961 canCombine(MBB, MO&: Root.getOperand(i: 2), CombineOpc: AArch64::ADDSXrr)) {
7962 Patterns.push_back(Elt: AArch64MachineCombinerPattern::SUBADD_OP1);
7963 Patterns.push_back(Elt: AArch64MachineCombinerPattern::SUBADD_OP2);
7964 return true;
7965 }
7966
7967 return false;
7968}
7969
7970/// Check if the given instruction forms a gather load pattern that can be
7971/// optimized for better Memory-Level Parallelism (MLP). This function
7972/// identifies chains of NEON lane load instructions that load data from
7973/// different memory addresses into individual lanes of a 128-bit vector
7974/// register, then attempts to split the pattern into parallel loads to break
7975/// the serial dependency between instructions.
7976///
7977/// Pattern Matched:
7978/// Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) ->
7979/// LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root)
7980///
7981/// Transformed Into:
7982/// Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64
7983/// to combine the results, enabling better memory-level parallelism.
7984///
7985/// Supported Element Types:
7986/// - 32-bit elements (LD1i32, 4 lanes total)
7987/// - 16-bit elements (LD1i16, 8 lanes total)
7988/// - 8-bit elements (LD1i8, 16 lanes total)
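///
/// Illustrative sketch of the matched chain for the 32-bit case, in rough
/// MIR-like notation. The virtual registers and pointer operands are
/// hypothetical, and the initial scalar load is shown as LDRSui only for
/// concreteness (the pattern only requires a lane-sized scalar load):
///   %s0 = LDRSui %p0, 0
///   %v0 = SUBREG_TO_REG 0, %s0, %subreg.ssub   ; lane 0
///   %v1 = LD1i32 %v0, 1, %p1                   ; lane 1
///   %v2 = LD1i32 %v1, 2, %p2                   ; lane 2
///   %v3 = LD1i32 %v2, 3, %p3                   ; lane 3 (Root)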
7989static bool getGatherLanePattern(MachineInstr &Root,
7990 SmallVectorImpl<unsigned> &Patterns,
7991 unsigned LoadLaneOpCode, unsigned NumLanes) {
7992 const MachineFunction *MF = Root.getMF();
7993
7994 // Early exit if optimizing for size.
7995 if (MF->getFunction().hasMinSize())
7996 return false;
7997
7998 const MachineRegisterInfo &MRI = MF->getRegInfo();
7999 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
8000
8001 // The root of the pattern must load into the last lane of the vector.
8002 if (Root.getOperand(i: 2).getImm() != NumLanes - 1)
8003 return false;
8004
8005  // Check that we have loads into all lanes except lane 0.
8006  // For each load we also want to check that:
8007  // 1. It has a single non-debug use (since we will be replacing the virtual
8008  //    register).
8009  // 2. Its addressing mode uses only a single pointer operand.
8010 auto *CurrInstr = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: 1).getReg());
8011 auto Range = llvm::seq<unsigned>(Begin: 1, End: NumLanes - 1);
8012 SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end());
8013 SmallVector<const MachineInstr *, 16> LoadInstrs;
8014 while (!RemainingLanes.empty() && CurrInstr &&
8015 CurrInstr->getOpcode() == LoadLaneOpCode &&
8016 MRI.hasOneNonDBGUse(RegNo: CurrInstr->getOperand(i: 0).getReg()) &&
8017 CurrInstr->getNumOperands() == 4) {
8018 RemainingLanes.erase(V: CurrInstr->getOperand(i: 2).getImm());
8019 LoadInstrs.push_back(Elt: CurrInstr);
8020 CurrInstr = MRI.getUniqueVRegDef(Reg: CurrInstr->getOperand(i: 1).getReg());
8021 }
8022
8023  // Check that we have found a match for lanes N-1 down to 1.
8024 if (!RemainingLanes.empty())
8025 return false;
8026
8027 // Match the SUBREG_TO_REG sequence.
8028 if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
8029 return false;
8030
8031  // Verify that the SUBREG_TO_REG loads an integer into the first lane.
8032 auto Lane0LoadReg = CurrInstr->getOperand(i: 2).getReg();
8033 unsigned SingleLaneSizeInBits = 128 / NumLanes;
8034 if (TRI->getRegSizeInBits(Reg: Lane0LoadReg, MRI) != SingleLaneSizeInBits)
8035 return false;
8036
8037  // Verify that it also has a single non-debug use.
8038 if (!MRI.hasOneNonDBGUse(RegNo: Lane0LoadReg))
8039 return false;
8040
8041 LoadInstrs.push_back(Elt: MRI.getUniqueVRegDef(Reg: Lane0LoadReg));
8042
8043 // If there is any chance of aliasing, do not apply the pattern.
8044 // Walk backward through the MBB starting from Root.
8045 // Exit early if we've encountered all load instructions or hit the search
8046 // limit.
8047 auto MBBItr = Root.getIterator();
8048 unsigned RemainingSteps = GatherOptSearchLimit;
8049 SmallPtrSet<const MachineInstr *, 16> RemainingLoadInstrs;
8050 RemainingLoadInstrs.insert(I: LoadInstrs.begin(), E: LoadInstrs.end());
8051 const MachineBasicBlock *MBB = Root.getParent();
8052
8053 for (; MBBItr != MBB->begin() && RemainingSteps > 0 &&
8054 !RemainingLoadInstrs.empty();
8055 --MBBItr, --RemainingSteps) {
8056 const MachineInstr &CurrInstr = *MBBItr;
8057
8058 // Remove this instruction from remaining loads if it's one we're tracking.
8059 RemainingLoadInstrs.erase(Ptr: &CurrInstr);
8060
8061 // Check for potential aliasing with any of the load instructions to
8062 // optimize.
8063 if (CurrInstr.isLoadFoldBarrier())
8064 return false;
8065 }
8066
8067 // If we hit the search limit without finding all load instructions,
8068 // don't match the pattern.
8069 if (RemainingSteps == 0 && !RemainingLoadInstrs.empty())
8070 return false;
8071
8072 switch (NumLanes) {
8073 case 4:
8074 Patterns.push_back(Elt: AArch64MachineCombinerPattern::GATHER_LANE_i32);
8075 break;
8076 case 8:
8077 Patterns.push_back(Elt: AArch64MachineCombinerPattern::GATHER_LANE_i16);
8078 break;
8079 case 16:
8080 Patterns.push_back(Elt: AArch64MachineCombinerPattern::GATHER_LANE_i8);
8081 break;
8082 default:
8083 llvm_unreachable("Got bad number of lanes for gather pattern.");
8084 }
8085
8086 return true;
8087}
8088
8089/// Search for patterns of LD instructions we can optimize.
8090static bool getLoadPatterns(MachineInstr &Root,
8091 SmallVectorImpl<unsigned> &Patterns) {
8092
8093 // The pattern searches for loads into single lanes.
8094 switch (Root.getOpcode()) {
8095 case AArch64::LD1i32:
8096 return getGatherLanePattern(Root, Patterns, LoadLaneOpCode: Root.getOpcode(), NumLanes: 4);
8097 case AArch64::LD1i16:
8098 return getGatherLanePattern(Root, Patterns, LoadLaneOpCode: Root.getOpcode(), NumLanes: 8);
8099 case AArch64::LD1i8:
8100 return getGatherLanePattern(Root, Patterns, LoadLaneOpCode: Root.getOpcode(), NumLanes: 16);
8101 default:
8102 return false;
8103 }
8104}
8105
8106/// Generate optimized instruction sequence for gather load patterns to improve
8107/// Memory-Level Parallelism (MLP). This function transforms a chain of
8108/// sequential NEON lane loads into parallel vector loads that can execute
8109/// concurrently.
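///
/// A minimal sketch of the emitted sequence for the 32-bit case, continuing
/// the hypothetical registers used in getGatherLanePattern's documentation
/// (the pointer operands are reused from the original lane loads):
///   %lo  = LD1i32 %v0, 1, %p1                  ; lanes 0-1 of the result
///   %s1  = LDRSui %p2, 0                       ; scalar load zeroes upper lanes
///   %hi0 = SUBREG_TO_REG 0, %s1, %subreg.ssub
///   %hi  = LD1i32 %hi0, 1, %p3                 ; lanes 2-3 of the result
///   %v3  = ZIP1v2i64 %lo, %hi                  ; interleave the two halves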
8110static void
8111generateGatherLanePattern(MachineInstr &Root,
8112 SmallVectorImpl<MachineInstr *> &InsInstrs,
8113 SmallVectorImpl<MachineInstr *> &DelInstrs,
8114 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8115 unsigned Pattern, unsigned NumLanes) {
8116 MachineFunction &MF = *Root.getParent()->getParent();
8117 MachineRegisterInfo &MRI = MF.getRegInfo();
8118 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8119
8120 // Gather the initial load instructions to build the pattern.
8121 SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
8122 MachineInstr *CurrInstr = &Root;
8123 for (unsigned i = 0; i < NumLanes - 1; ++i) {
8124 LoadToLaneInstrs.push_back(Elt: CurrInstr);
8125 CurrInstr = MRI.getUniqueVRegDef(Reg: CurrInstr->getOperand(i: 1).getReg());
8126 }
8127
8128 // Sort the load instructions according to the lane.
8129 llvm::sort(C&: LoadToLaneInstrs,
8130 Comp: [](const MachineInstr *A, const MachineInstr *B) {
8131 return A->getOperand(i: 2).getImm() > B->getOperand(i: 2).getImm();
8132 });
8133
8134 MachineInstr *SubregToReg = CurrInstr;
8135 LoadToLaneInstrs.push_back(
8136 Elt: MRI.getUniqueVRegDef(Reg: SubregToReg->getOperand(i: 2).getReg()));
8137 auto LoadToLaneInstrsAscending = llvm::reverse(C&: LoadToLaneInstrs);
8138
8139 const TargetRegisterClass *FPR128RegClass =
8140 MRI.getRegClass(Reg: Root.getOperand(i: 0).getReg());
8141
8142 // Helper lambda to create a LD1 instruction.
8143 auto CreateLD1Instruction = [&](MachineInstr *OriginalInstr,
8144 Register SrcRegister, unsigned Lane,
8145 Register OffsetRegister,
8146 bool OffsetRegisterKillState) {
8147 auto NewRegister = MRI.createVirtualRegister(RegClass: FPR128RegClass);
8148 MachineInstrBuilder LoadIndexIntoRegister =
8149 BuildMI(MF, MIMD: MIMetadata(*OriginalInstr), MCID: TII->get(Opcode: Root.getOpcode()),
8150 DestReg: NewRegister)
8151 .addReg(RegNo: SrcRegister)
8152 .addImm(Val: Lane)
8153 .addReg(RegNo: OffsetRegister, Flags: getKillRegState(B: OffsetRegisterKillState));
8154 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewRegister, y: InsInstrs.size()));
8155 InsInstrs.push_back(Elt: LoadIndexIntoRegister);
8156 return NewRegister;
8157 };
8158
8159  // Helper to create the scalar load instruction, chosen by the number of
8160  // lanes (NumLanes) in the NEON register we are rewriting.
8161 auto CreateLDRInstruction = [&](unsigned NumLanes, Register DestReg,
8162 Register OffsetReg,
8163 bool KillState) -> MachineInstrBuilder {
8164 unsigned Opcode;
8165 switch (NumLanes) {
8166 case 4:
8167 Opcode = AArch64::LDRSui;
8168 break;
8169 case 8:
8170 Opcode = AArch64::LDRHui;
8171 break;
8172 case 16:
8173 Opcode = AArch64::LDRBui;
8174 break;
8175 default:
8176 llvm_unreachable(
8177 "Got unsupported number of lanes in machine-combiner gather pattern");
8178 }
8179 // Immediate offset load
8180 return BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode), DestReg)
8181 .addReg(RegNo: OffsetReg)
8182 .addImm(Val: 0);
8183 };
8184
8185 // Load the remaining lanes into register 0.
8186 auto LanesToLoadToReg0 =
8187 llvm::make_range(x: LoadToLaneInstrsAscending.begin() + 1,
8188 y: LoadToLaneInstrsAscending.begin() + NumLanes / 2);
8189 Register PrevReg = SubregToReg->getOperand(i: 0).getReg();
8190 for (auto [Index, LoadInstr] : llvm::enumerate(First&: LanesToLoadToReg0)) {
8191 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(i: 3);
8192 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8193 OffsetRegOperand.getReg(),
8194 OffsetRegOperand.isKill());
8195 DelInstrs.push_back(Elt: LoadInstr);
8196 }
8197 Register LastLoadReg0 = PrevReg;
8198
8199 // First load into register 1. Perform an integer load to zero out the upper
8200 // lanes in a single instruction.
8201 MachineInstr *Lane0Load = *LoadToLaneInstrsAscending.begin();
8202 MachineInstr *OriginalSplitLoad =
8203 *std::next(x: LoadToLaneInstrsAscending.begin(), n: NumLanes / 2);
8204 Register DestRegForMiddleIndex = MRI.createVirtualRegister(
8205 RegClass: MRI.getRegClass(Reg: Lane0Load->getOperand(i: 0).getReg()));
8206
8207 const MachineOperand &OriginalSplitToLoadOffsetOperand =
8208 OriginalSplitLoad->getOperand(i: 3);
8209 MachineInstrBuilder MiddleIndexLoadInstr =
8210 CreateLDRInstruction(NumLanes, DestRegForMiddleIndex,
8211 OriginalSplitToLoadOffsetOperand.getReg(),
8212 OriginalSplitToLoadOffsetOperand.isKill());
8213
8214 InstrIdxForVirtReg.insert(
8215 KV: std::make_pair(x&: DestRegForMiddleIndex, y: InsInstrs.size()));
8216 InsInstrs.push_back(Elt: MiddleIndexLoadInstr);
8217 DelInstrs.push_back(Elt: OriginalSplitLoad);
8218
8219 // Subreg To Reg instruction for register 1.
8220 Register DestRegForSubregToReg = MRI.createVirtualRegister(RegClass: FPR128RegClass);
8221 unsigned SubregType;
8222 switch (NumLanes) {
8223 case 4:
8224 SubregType = AArch64::ssub;
8225 break;
8226 case 8:
8227 SubregType = AArch64::hsub;
8228 break;
8229 case 16:
8230 SubregType = AArch64::bsub;
8231 break;
8232 default:
8233 llvm_unreachable(
8234 "Got invalid NumLanes for machine-combiner gather pattern");
8235 }
8236
8237 auto SubRegToRegInstr =
8238 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: SubregToReg->getOpcode()),
8239 DestReg: DestRegForSubregToReg)
8240 .addImm(Val: 0)
8241 .addReg(RegNo: DestRegForMiddleIndex, Flags: getKillRegState(B: true))
8242 .addImm(Val: SubregType);
8243 InstrIdxForVirtReg.insert(
8244 KV: std::make_pair(x&: DestRegForSubregToReg, y: InsInstrs.size()));
8245 InsInstrs.push_back(Elt: SubRegToRegInstr);
8246
8247 // Load remaining lanes into register 1.
8248 auto LanesToLoadToReg1 =
8249 llvm::make_range(x: LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
8250 y: LoadToLaneInstrsAscending.end());
8251 PrevReg = SubRegToRegInstr->getOperand(i: 0).getReg();
8252 for (auto [Index, LoadInstr] : llvm::enumerate(First&: LanesToLoadToReg1)) {
8253 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(i: 3);
8254 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8255 OffsetRegOperand.getReg(),
8256 OffsetRegOperand.isKill());
8257
8258 // Do not add the last reg to DelInstrs - it will be removed later.
8259 if (Index == NumLanes / 2 - 2) {
8260 break;
8261 }
8262 DelInstrs.push_back(Elt: LoadInstr);
8263 }
8264 Register LastLoadReg1 = PrevReg;
8265
8266 // Create the final zip instruction to combine the results.
8267 MachineInstrBuilder ZipInstr =
8268 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::ZIP1v2i64),
8269 DestReg: Root.getOperand(i: 0).getReg())
8270 .addReg(RegNo: LastLoadReg0)
8271 .addReg(RegNo: LastLoadReg1);
8272 InsInstrs.push_back(Elt: ZipInstr);
8273}
8274
8275CombinerObjective
8276AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
8277 switch (Pattern) {
8278 case AArch64MachineCombinerPattern::SUBADD_OP1:
8279 case AArch64MachineCombinerPattern::SUBADD_OP2:
8280 case AArch64MachineCombinerPattern::GATHER_LANE_i32:
8281 case AArch64MachineCombinerPattern::GATHER_LANE_i16:
8282 case AArch64MachineCombinerPattern::GATHER_LANE_i8:
8283 return CombinerObjective::MustReduceDepth;
8284 default:
8285 return TargetInstrInfo::getCombinerObjective(Pattern);
8286 }
8287}
8288
8289/// Return true when there is potentially a faster code sequence for an
8290/// instruction chain ending in \p Root. All potential patterns are listed in
8291/// the \p Patterns vector. Patterns should be sorted in priority order since the
8292/// pattern evaluator stops checking as soon as it finds a faster sequence.
8293
8294bool AArch64InstrInfo::getMachineCombinerPatterns(
8295 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
8296 bool DoRegPressureReduce) const {
8297 // Integer patterns
8298 if (getMaddPatterns(Root, Patterns))
8299 return true;
8300 // Floating point patterns
8301 if (getFMULPatterns(Root, Patterns))
8302 return true;
8303 if (getFMAPatterns(Root, Patterns))
8304 return true;
8305 if (getFNEGPatterns(Root, Patterns))
8306 return true;
8307
8308 // Other patterns
8309 if (getMiscPatterns(Root, Patterns))
8310 return true;
8311
8312 // Load patterns
8313 if (getLoadPatterns(Root, Patterns))
8314 return true;
8315
8316 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
8317 DoRegPressureReduce);
8318}
8319
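// FMAInstKind selects the operand layout genFusedMultiply uses when building
// the fused instruction: Default is the scalar (F)MADD form (multiplicands
// first, then the addend), while Indexed and Accumulator are the vector
// FMLA/MLA-style forms, which take the accumulator first, followed by the
// multiplicands (plus a lane immediate for Indexed).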
8320enum class FMAInstKind { Default, Indexed, Accumulator };
8321/// genFusedMultiply - Generate fused multiply instructions.
8322/// This function supports both integer and floating point instructions.
8323/// A typical example:
8324/// F|MUL I=A,B,0
8325/// F|ADD R,I,C
8326/// ==> F|MADD R,A,B,C
8327/// \param MF Containing MachineFunction
8328/// \param MRI Register information
8329/// \param TII Target information
8330/// \param Root is the F|ADD instruction
8331/// \param [out] InsInstrs is a vector of machine instructions and will
8332/// contain the generated madd instruction
8333/// \param IdxMulOpd is index of operand in Root that is the result of
8334/// the F|MUL. In the example above IdxMulOpd is 1.
8335/// \param MaddOpc the opcode of the f|madd instruction
8336/// \param RC Register class of operands
8337/// \param kind the kind of FMA instruction (addressing mode) to be generated
8338/// \param ReplacedAddend is the result register from the instruction
8339/// replacing the non-combined operand, if any.
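///
/// The commuted form is selected via \p IdxMulOpd; for example, with
/// IdxMulOpd == 2 (the F|MUL feeding the second operand of Root):
///   F|MUL I=A,B,0
///   F|ADD R,C,I
///   ==> F|MADD R,A,B,C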
8340static MachineInstr *
8341genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
8342 const TargetInstrInfo *TII, MachineInstr &Root,
8343 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
8344 unsigned MaddOpc, const TargetRegisterClass *RC,
8345 FMAInstKind kind = FMAInstKind::Default,
8346 const Register *ReplacedAddend = nullptr) {
8347 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8348
8349 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
8350 MachineInstr *MUL = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: IdxMulOpd).getReg());
8351 Register ResultReg = Root.getOperand(i: 0).getReg();
8352 Register SrcReg0 = MUL->getOperand(i: 1).getReg();
8353 bool Src0IsKill = MUL->getOperand(i: 1).isKill();
8354 Register SrcReg1 = MUL->getOperand(i: 2).getReg();
8355 bool Src1IsKill = MUL->getOperand(i: 2).isKill();
8356
8357 Register SrcReg2;
8358 bool Src2IsKill;
8359 if (ReplacedAddend) {
8360    // If we just generated a new addend, we must be its only use.
8361 SrcReg2 = *ReplacedAddend;
8362 Src2IsKill = true;
8363 } else {
8364 SrcReg2 = Root.getOperand(i: IdxOtherOpd).getReg();
8365 Src2IsKill = Root.getOperand(i: IdxOtherOpd).isKill();
8366 }
8367
8368 if (ResultReg.isVirtual())
8369 MRI.constrainRegClass(Reg: ResultReg, RC);
8370 if (SrcReg0.isVirtual())
8371 MRI.constrainRegClass(Reg: SrcReg0, RC);
8372 if (SrcReg1.isVirtual())
8373 MRI.constrainRegClass(Reg: SrcReg1, RC);
8374 if (SrcReg2.isVirtual())
8375 MRI.constrainRegClass(Reg: SrcReg2, RC);
8376
8377 MachineInstrBuilder MIB;
8378 if (kind == FMAInstKind::Default)
8379 MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
8380 .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: Src0IsKill))
8381 .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: Src1IsKill))
8382 .addReg(RegNo: SrcReg2, Flags: getKillRegState(B: Src2IsKill));
8383 else if (kind == FMAInstKind::Indexed)
8384 MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
8385 .addReg(RegNo: SrcReg2, Flags: getKillRegState(B: Src2IsKill))
8386 .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: Src0IsKill))
8387 .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: Src1IsKill))
8388 .addImm(Val: MUL->getOperand(i: 3).getImm());
8389 else if (kind == FMAInstKind::Accumulator)
8390 MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
8391 .addReg(RegNo: SrcReg2, Flags: getKillRegState(B: Src2IsKill))
8392 .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: Src0IsKill))
8393 .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: Src1IsKill));
8394 else
8395 assert(false && "Invalid FMA instruction kind \n");
8396  // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
8397 InsInstrs.push_back(Elt: MIB);
8398 return MUL;
8399}
8400
8401static MachineInstr *
8402genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
8403 const TargetInstrInfo *TII, MachineInstr &Root,
8404 SmallVectorImpl<MachineInstr *> &InsInstrs) {
8405 MachineInstr *MAD = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: 1).getReg());
8406
8407 unsigned Opc = 0;
8408 const TargetRegisterClass *RC = MRI.getRegClass(Reg: MAD->getOperand(i: 0).getReg());
8409 if (AArch64::FPR32RegClass.hasSubClassEq(RC))
8410 Opc = AArch64::FNMADDSrrr;
8411 else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
8412 Opc = AArch64::FNMADDDrrr;
8413 else
8414 return nullptr;
8415
8416 Register ResultReg = Root.getOperand(i: 0).getReg();
8417 Register SrcReg0 = MAD->getOperand(i: 1).getReg();
8418 Register SrcReg1 = MAD->getOperand(i: 2).getReg();
8419 Register SrcReg2 = MAD->getOperand(i: 3).getReg();
8420 bool Src0IsKill = MAD->getOperand(i: 1).isKill();
8421 bool Src1IsKill = MAD->getOperand(i: 2).isKill();
8422 bool Src2IsKill = MAD->getOperand(i: 3).isKill();
8423 if (ResultReg.isVirtual())
8424 MRI.constrainRegClass(Reg: ResultReg, RC);
8425 if (SrcReg0.isVirtual())
8426 MRI.constrainRegClass(Reg: SrcReg0, RC);
8427 if (SrcReg1.isVirtual())
8428 MRI.constrainRegClass(Reg: SrcReg1, RC);
8429 if (SrcReg2.isVirtual())
8430 MRI.constrainRegClass(Reg: SrcReg2, RC);
8431
8432 MachineInstrBuilder MIB =
8433 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: Opc), DestReg: ResultReg)
8434 .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: Src0IsKill))
8435 .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: Src1IsKill))
8436 .addReg(RegNo: SrcReg2, Flags: getKillRegState(B: Src2IsKill));
8437 InsInstrs.push_back(Elt: MIB);
8438
8439 return MAD;
8440}
8441
8442/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
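///
/// Roughly, for the 4 x i32 case (hypothetical virtual registers):
///   %d = DUPv4i32lane %q, 1
///   %r = FMULv4i32 %x, %d
///   ==> %r = FMULv4i32_indexed %x, %q, 1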
8443static MachineInstr *
8444genIndexedMultiply(MachineInstr &Root,
8445 SmallVectorImpl<MachineInstr *> &InsInstrs,
8446 unsigned IdxDupOp, unsigned MulOpc,
8447 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
8448 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
8449 "Invalid index of FMUL operand");
8450
8451 MachineFunction &MF = *Root.getMF();
8452 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8453
8454 MachineInstr *Dup =
8455 MF.getRegInfo().getUniqueVRegDef(Reg: Root.getOperand(i: IdxDupOp).getReg());
8456
8457 if (Dup->getOpcode() == TargetOpcode::COPY)
8458 Dup = MRI.getUniqueVRegDef(Reg: Dup->getOperand(i: 1).getReg());
8459
8460 Register DupSrcReg = Dup->getOperand(i: 1).getReg();
8461 MRI.clearKillFlags(Reg: DupSrcReg);
8462 MRI.constrainRegClass(Reg: DupSrcReg, RC);
8463
8464 unsigned DupSrcLane = Dup->getOperand(i: 2).getImm();
8465
8466 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
8467 MachineOperand &MulOp = Root.getOperand(i: IdxMulOp);
8468
8469 Register ResultReg = Root.getOperand(i: 0).getReg();
8470
8471 MachineInstrBuilder MIB;
8472 MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MulOpc), DestReg: ResultReg)
8473 .add(MO: MulOp)
8474 .addReg(RegNo: DupSrcReg)
8475 .addImm(Val: DupSrcLane);
8476
8477 InsInstrs.push_back(Elt: MIB);
8478 return &Root;
8479}
8480
8481/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
8482/// instructions.
8483///
8484/// \see genFusedMultiply
8485static MachineInstr *genFusedMultiplyAcc(
8486 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8487 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8488 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8489 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8490 kind: FMAInstKind::Accumulator);
8491}
8492
8493/// genNeg - Helper to generate an intermediate negation of the second operand
8494/// of Root
8495static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
8496 const TargetInstrInfo *TII, MachineInstr &Root,
8497 SmallVectorImpl<MachineInstr *> &InsInstrs,
8498 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8499 unsigned MnegOpc, const TargetRegisterClass *RC) {
8500 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
8501 MachineInstrBuilder MIB =
8502 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MnegOpc), DestReg: NewVR)
8503 .add(MO: Root.getOperand(i: 2));
8504 InsInstrs.push_back(Elt: MIB);
8505
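  // NewVR is mapped to index 0 because the negation is assumed to be the first
  // instruction added to InsInstrs for this pattern; the assert below checks
  // that no other virtual register has been recorded yet.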
8506 assert(InstrIdxForVirtReg.empty());
8507 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
8508
8509 return NewVR;
8510}
8511
8512/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
8513/// instructions with an additional negation of the accumulator
8514static MachineInstr *genFusedMultiplyAccNeg(
8515 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8516 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8517 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8518 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8519 assert(IdxMulOpd == 1);
8520
8521 Register NewVR =
8522 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8523 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8524 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
8525}
8526
8527/// genFusedMultiplyIdx - Helper to generate indexed fused multiply accumulate
8528/// instructions.
8529///
8530/// \see genFusedMultiply
8531static MachineInstr *genFusedMultiplyIdx(
8532 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8533 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8534 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8535 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8536 kind: FMAInstKind::Indexed);
8537}
8538
8539/// genFusedMultiplyIdxNeg - Helper to generate indexed fused multiply
8540/// accumulate instructions with an additional negation of the accumulator
8541static MachineInstr *genFusedMultiplyIdxNeg(
8542 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8543 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8544 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8545 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8546 assert(IdxMulOpd == 1);
8547
8548 Register NewVR =
8549 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8550
8551 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8552 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
8553}
8554
8555/// genMaddR - Generate madd instruction and combine mul and add using
8556/// an extra virtual register
8557/// Example - the ADD's immediate operand needs to be materialized in a register:
8558/// MUL I=A,B,0
8559/// ADD R,I,Imm
8560/// ==> ORR V, ZR, Imm
8561/// ==> MADD R,A,B,V
8562/// \param MF Containing MachineFunction
8563/// \param MRI Register information
8564/// \param TII Target information
8565/// \param Root is the ADD instruction
8566/// \param [out] InsInstrs is a vector of machine instructions and will
8567/// contain the generated madd instruction
8568/// \param IdxMulOpd is index of operand in Root that is the result of
8569/// the MUL. In the example above IdxMulOpd is 1.
8570/// \param MaddOpc the opcode of the madd instruction
8571/// \param VR is a virtual register that holds the value of an ADD operand
8572/// (V in the example above).
8573/// \param RC Register class of operands
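///
/// The same helper also serves the MULSUB*_OP1 patterns: the caller first
/// materializes V = 0 - C (or V = -Imm), so MADD R,A,B,V computes A*B - C.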
8574static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
8575 const TargetInstrInfo *TII, MachineInstr &Root,
8576 SmallVectorImpl<MachineInstr *> &InsInstrs,
8577 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
8578 const TargetRegisterClass *RC) {
8579 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8580
8581 MachineInstr *MUL = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: IdxMulOpd).getReg());
8582 Register ResultReg = Root.getOperand(i: 0).getReg();
8583 Register SrcReg0 = MUL->getOperand(i: 1).getReg();
8584 bool Src0IsKill = MUL->getOperand(i: 1).isKill();
8585 Register SrcReg1 = MUL->getOperand(i: 2).getReg();
8586 bool Src1IsKill = MUL->getOperand(i: 2).isKill();
8587
8588 if (ResultReg.isVirtual())
8589 MRI.constrainRegClass(Reg: ResultReg, RC);
8590 if (SrcReg0.isVirtual())
8591 MRI.constrainRegClass(Reg: SrcReg0, RC);
8592 if (SrcReg1.isVirtual())
8593 MRI.constrainRegClass(Reg: SrcReg1, RC);
8594 if (Register::isVirtualRegister(Reg: VR))
8595 MRI.constrainRegClass(Reg: VR, RC);
8596
8597 MachineInstrBuilder MIB =
8598 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
8599 .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: Src0IsKill))
8600 .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: Src1IsKill))
8601 .addReg(RegNo: VR);
8602 // Insert the MADD
8603 InsInstrs.push_back(Elt: MIB);
8604 return MUL;
8605}
8606
8607/// Do the following transformation
8608/// A - (B + C) ==> (A - B) - C
8609/// A - (B + C) ==> (A - C) - B
8610static void genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
8611 const TargetInstrInfo *TII, MachineInstr &Root,
8612 SmallVectorImpl<MachineInstr *> &InsInstrs,
8613 SmallVectorImpl<MachineInstr *> &DelInstrs,
8614 unsigned IdxOpd1,
8615 DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
8616 assert(IdxOpd1 == 1 || IdxOpd1 == 2);
8617 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
8618 MachineInstr *AddMI = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: 2).getReg());
8619
8620 Register ResultReg = Root.getOperand(i: 0).getReg();
8621 Register RegA = Root.getOperand(i: 1).getReg();
8622 bool RegAIsKill = Root.getOperand(i: 1).isKill();
8623 Register RegB = AddMI->getOperand(i: IdxOpd1).getReg();
8624 bool RegBIsKill = AddMI->getOperand(i: IdxOpd1).isKill();
8625 Register RegC = AddMI->getOperand(i: IdxOtherOpd).getReg();
8626 bool RegCIsKill = AddMI->getOperand(i: IdxOtherOpd).isKill();
8627 Register NewVR =
8628 MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: Root.getOperand(i: 2).getReg()));
8629
8630 unsigned Opcode = Root.getOpcode();
8631 if (Opcode == AArch64::SUBSWrr)
8632 Opcode = AArch64::SUBWrr;
8633 else if (Opcode == AArch64::SUBSXrr)
8634 Opcode = AArch64::SUBXrr;
8635 else
8636 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
8637 "Unexpected instruction opcode.");
8638
8639 uint32_t Flags = Root.mergeFlagsWith(Other: *AddMI);
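  // The reassociated form computes a different intermediate value (A - B or
  // A - C), which may wrap even when the original expression does not, so the
  // no-signed-wrap/no-unsigned-wrap flags cannot be preserved.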
8640 Flags &= ~MachineInstr::NoSWrap;
8641 Flags &= ~MachineInstr::NoUWrap;
8642
8643 MachineInstrBuilder MIB1 =
8644 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode), DestReg: NewVR)
8645 .addReg(RegNo: RegA, Flags: getKillRegState(B: RegAIsKill))
8646 .addReg(RegNo: RegB, Flags: getKillRegState(B: RegBIsKill))
8647 .setMIFlags(Flags);
8648 MachineInstrBuilder MIB2 =
8649 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode), DestReg: ResultReg)
8650 .addReg(RegNo: NewVR, Flags: getKillRegState(B: true))
8651 .addReg(RegNo: RegC, Flags: getKillRegState(B: RegCIsKill))
8652 .setMIFlags(Flags);
8653
8654 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
8655 InsInstrs.push_back(Elt: MIB1);
8656 InsInstrs.push_back(Elt: MIB2);
8657 DelInstrs.push_back(Elt: AddMI);
8658 DelInstrs.push_back(Elt: &Root);
8659}
8660
8661unsigned AArch64InstrInfo::getReduceOpcodeForAccumulator(
8662 unsigned int AccumulatorOpCode) const {
8663 switch (AccumulatorOpCode) {
8664 case AArch64::UABALB_ZZZ_D:
8665 case AArch64::SABALB_ZZZ_D:
8666 case AArch64::UABALT_ZZZ_D:
8667 case AArch64::SABALT_ZZZ_D:
8668 return AArch64::ADD_ZZZ_D;
8669 case AArch64::UABALB_ZZZ_H:
8670 case AArch64::SABALB_ZZZ_H:
8671 case AArch64::UABALT_ZZZ_H:
8672 case AArch64::SABALT_ZZZ_H:
8673 return AArch64::ADD_ZZZ_H;
8674 case AArch64::UABALB_ZZZ_S:
8675 case AArch64::SABALB_ZZZ_S:
8676 case AArch64::UABALT_ZZZ_S:
8677 case AArch64::SABALT_ZZZ_S:
8678 return AArch64::ADD_ZZZ_S;
8679 case AArch64::UABALv16i8_v8i16:
8680 case AArch64::SABALv8i8_v8i16:
8681 case AArch64::SABAv8i16:
8682 case AArch64::UABAv8i16:
8683 return AArch64::ADDv8i16;
8684 case AArch64::SABALv2i32_v2i64:
8685 case AArch64::UABALv2i32_v2i64:
8686 case AArch64::SABALv4i32_v2i64:
8687 return AArch64::ADDv2i64;
8688 case AArch64::UABALv4i16_v4i32:
8689 case AArch64::SABALv4i16_v4i32:
8690 case AArch64::SABALv8i16_v4i32:
8691 case AArch64::SABAv4i32:
8692 case AArch64::UABAv4i32:
8693 return AArch64::ADDv4i32;
8694 case AArch64::UABALv4i32_v2i64:
8695 return AArch64::ADDv2i64;
8696 case AArch64::UABALv8i16_v4i32:
8697 return AArch64::ADDv4i32;
8698 case AArch64::UABALv8i8_v8i16:
8699 case AArch64::SABALv16i8_v8i16:
8700 return AArch64::ADDv8i16;
8701 case AArch64::UABAv16i8:
8702 case AArch64::SABAv16i8:
8703 return AArch64::ADDv16i8;
8704 case AArch64::UABAv4i16:
8705 case AArch64::SABAv4i16:
8706 return AArch64::ADDv4i16;
8707 case AArch64::UABAv2i32:
8708 case AArch64::SABAv2i32:
8709 return AArch64::ADDv2i32;
8710 case AArch64::UABAv8i8:
8711 case AArch64::SABAv8i8:
8712 return AArch64::ADDv8i8;
8713 default:
8714 llvm_unreachable("Unknown accumulator opcode");
8715 }
8716}
8717
8718/// When getMachineCombinerPatterns() finds potential patterns,
8719/// this function generates the instructions that could replace the
8720/// original code sequence.
8721void AArch64InstrInfo::genAlternativeCodeSequence(
8722 MachineInstr &Root, unsigned Pattern,
8723 SmallVectorImpl<MachineInstr *> &InsInstrs,
8724 SmallVectorImpl<MachineInstr *> &DelInstrs,
8725 DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
8726 MachineBasicBlock &MBB = *Root.getParent();
8727 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8728 MachineFunction &MF = *MBB.getParent();
8729 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8730
8731 MachineInstr *MUL = nullptr;
8732 const TargetRegisterClass *RC;
8733 unsigned Opc;
8734 switch (Pattern) {
8735 default:
8736 // Reassociate instructions.
8737 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
8738 DelInstrs, InstIdxForVirtReg&: InstrIdxForVirtReg);
8739 return;
8740 case AArch64MachineCombinerPattern::SUBADD_OP1:
8741 // A - (B + C)
8742 // ==> (A - B) - C
8743 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, IdxOpd1: 1,
8744 InstrIdxForVirtReg);
8745 return;
8746 case AArch64MachineCombinerPattern::SUBADD_OP2:
8747 // A - (B + C)
8748 // ==> (A - C) - B
8749 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, IdxOpd1: 2,
8750 InstrIdxForVirtReg);
8751 return;
8752 case AArch64MachineCombinerPattern::MULADDW_OP1:
8753 case AArch64MachineCombinerPattern::MULADDX_OP1:
8754 // MUL I=A,B,0
8755 // ADD R,I,C
8756 // ==> MADD R,A,B,C
8757 // --- Create(MADD);
8758 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP1) {
8759 Opc = AArch64::MADDWrrr;
8760 RC = &AArch64::GPR32RegClass;
8761 } else {
8762 Opc = AArch64::MADDXrrr;
8763 RC = &AArch64::GPR64RegClass;
8764 }
8765 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
8766 break;
8767 case AArch64MachineCombinerPattern::MULADDW_OP2:
8768 case AArch64MachineCombinerPattern::MULADDX_OP2:
8769 // MUL I=A,B,0
8770 // ADD R,C,I
8771 // ==> MADD R,A,B,C
8772 // --- Create(MADD);
8773 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP2) {
8774 Opc = AArch64::MADDWrrr;
8775 RC = &AArch64::GPR32RegClass;
8776 } else {
8777 Opc = AArch64::MADDXrrr;
8778 RC = &AArch64::GPR64RegClass;
8779 }
8780 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8781 break;
8782 case AArch64MachineCombinerPattern::MULADDWI_OP1:
8783 case AArch64MachineCombinerPattern::MULADDXI_OP1:
8784 case AArch64MachineCombinerPattern::MULSUBWI_OP1:
8785 case AArch64MachineCombinerPattern::MULSUBXI_OP1: {
8786 // MUL I=A,B,0
8787 // ADD/SUB R,I,Imm
8788 // ==> MOV V, Imm/-Imm
8789 // ==> MADD R,A,B,V
8790 // --- Create(MADD);
8791 const TargetRegisterClass *RC;
8792 unsigned BitSize, MovImm;
8793 if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1 ||
8794 Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) {
8795 MovImm = AArch64::MOVi32imm;
8796 RC = &AArch64::GPR32spRegClass;
8797 BitSize = 32;
8798 Opc = AArch64::MADDWrrr;
8799 RC = &AArch64::GPR32RegClass;
8800 } else {
8801 MovImm = AArch64::MOVi64imm;
8802 RC = &AArch64::GPR64spRegClass;
8803 BitSize = 64;
8804 Opc = AArch64::MADDXrrr;
8805 RC = &AArch64::GPR64RegClass;
8806 }
8807 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
8808 uint64_t Imm = Root.getOperand(i: 2).getImm();
8809
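    // The ADD/SUB immediate may carry an optional left shift (e.g. LSL #12) in
    // operand 3; fold it into Imm before it is negated/materialized below.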
8810 if (Root.getOperand(i: 3).isImm()) {
8811 unsigned Val = Root.getOperand(i: 3).getImm();
8812 Imm = Imm << Val;
8813 }
8814 bool IsSub = Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1 ||
8815 Pattern == AArch64MachineCombinerPattern::MULSUBXI_OP1;
8816 uint64_t UImm = SignExtend64(X: IsSub ? -Imm : Imm, B: BitSize);
8817 // Check that the immediate can be composed via a single instruction.
8818 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
8819 AArch64_IMM::expandMOVImm(Imm: UImm, BitSize, Insn);
8820 if (Insn.size() != 1)
8821 return;
8822 MachineInstrBuilder MIB1 =
8823 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MovImm), DestReg: NewVR)
8824 .addImm(Val: IsSub ? -Imm : Imm);
8825 InsInstrs.push_back(Elt: MIB1);
8826 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
8827 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, VR: NewVR, RC);
8828 break;
8829 }
8830 case AArch64MachineCombinerPattern::MULSUBW_OP1:
8831 case AArch64MachineCombinerPattern::MULSUBX_OP1: {
8832 // MUL I=A,B,0
8833 // SUB R,I, C
8834 // ==> SUB V, 0, C
8835 // ==> MADD R,A,B,V // = -C + A*B
8836 // --- Create(MADD);
8837 const TargetRegisterClass *SubRC;
8838 unsigned SubOpc, ZeroReg;
8839 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP1) {
8840 SubOpc = AArch64::SUBWrr;
8841 SubRC = &AArch64::GPR32spRegClass;
8842 ZeroReg = AArch64::WZR;
8843 Opc = AArch64::MADDWrrr;
8844 RC = &AArch64::GPR32RegClass;
8845 } else {
8846 SubOpc = AArch64::SUBXrr;
8847 SubRC = &AArch64::GPR64spRegClass;
8848 ZeroReg = AArch64::XZR;
8849 Opc = AArch64::MADDXrrr;
8850 RC = &AArch64::GPR64RegClass;
8851 }
8852 Register NewVR = MRI.createVirtualRegister(RegClass: SubRC);
8853 // SUB NewVR, 0, C
8854 MachineInstrBuilder MIB1 =
8855 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: SubOpc), DestReg: NewVR)
8856 .addReg(RegNo: ZeroReg)
8857 .add(MO: Root.getOperand(i: 2));
8858 InsInstrs.push_back(Elt: MIB1);
8859 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
8860 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, VR: NewVR, RC);
8861 break;
8862 }
8863 case AArch64MachineCombinerPattern::MULSUBW_OP2:
8864 case AArch64MachineCombinerPattern::MULSUBX_OP2:
8865 // MUL I=A,B,0
8866 // SUB R,C,I
8867 // ==> MSUB R,A,B,C (computes C - A*B)
8868 // --- Create(MSUB);
8869 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP2) {
8870 Opc = AArch64::MSUBWrrr;
8871 RC = &AArch64::GPR32RegClass;
8872 } else {
8873 Opc = AArch64::MSUBXrrr;
8874 RC = &AArch64::GPR64RegClass;
8875 }
8876 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8877 break;
8878 case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
8879 Opc = AArch64::MLAv8i8;
8880 RC = &AArch64::FPR64RegClass;
8881 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
8882 break;
8883 case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
8884 Opc = AArch64::MLAv8i8;
8885 RC = &AArch64::FPR64RegClass;
8886 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8887 break;
8888 case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
8889 Opc = AArch64::MLAv16i8;
8890 RC = &AArch64::FPR128RegClass;
8891 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
8892 break;
8893 case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
8894 Opc = AArch64::MLAv16i8;
8895 RC = &AArch64::FPR128RegClass;
8896 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8897 break;
8898 case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
8899 Opc = AArch64::MLAv4i16;
8900 RC = &AArch64::FPR64RegClass;
8901 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
8902 break;
8903 case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
8904 Opc = AArch64::MLAv4i16;
8905 RC = &AArch64::FPR64RegClass;
8906 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8907 break;
8908 case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
8909 Opc = AArch64::MLAv8i16;
8910 RC = &AArch64::FPR128RegClass;
8911 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
8912 break;
8913 case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
8914 Opc = AArch64::MLAv8i16;
8915 RC = &AArch64::FPR128RegClass;
8916 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8917 break;
8918 case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
8919 Opc = AArch64::MLAv2i32;
8920 RC = &AArch64::FPR64RegClass;
8921 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
8922 break;
8923 case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
8924 Opc = AArch64::MLAv2i32;
8925 RC = &AArch64::FPR64RegClass;
8926 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8927 break;
8928 case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
8929 Opc = AArch64::MLAv4i32;
8930 RC = &AArch64::FPR128RegClass;
8931 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
8932 break;
8933 case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
8934 Opc = AArch64::MLAv4i32;
8935 RC = &AArch64::FPR128RegClass;
8936 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8937 break;
8938
8939 case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
8940 Opc = AArch64::MLAv8i8;
8941 RC = &AArch64::FPR64RegClass;
8942 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8943 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv8i8,
8944 RC);
8945 break;
8946 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
8947 Opc = AArch64::MLSv8i8;
8948 RC = &AArch64::FPR64RegClass;
8949 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8950 break;
8951 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
8952 Opc = AArch64::MLAv16i8;
8953 RC = &AArch64::FPR128RegClass;
8954 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8955 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv16i8,
8956 RC);
8957 break;
8958 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
8959 Opc = AArch64::MLSv16i8;
8960 RC = &AArch64::FPR128RegClass;
8961 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8962 break;
8963 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
8964 Opc = AArch64::MLAv4i16;
8965 RC = &AArch64::FPR64RegClass;
8966 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8967 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv4i16,
8968 RC);
8969 break;
8970 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
8971 Opc = AArch64::MLSv4i16;
8972 RC = &AArch64::FPR64RegClass;
8973 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8974 break;
8975 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
8976 Opc = AArch64::MLAv8i16;
8977 RC = &AArch64::FPR128RegClass;
8978 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8979 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv8i16,
8980 RC);
8981 break;
8982 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
8983 Opc = AArch64::MLSv8i16;
8984 RC = &AArch64::FPR128RegClass;
8985 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8986 break;
8987 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
8988 Opc = AArch64::MLAv2i32;
8989 RC = &AArch64::FPR64RegClass;
8990 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8991 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv2i32,
8992 RC);
8993 break;
8994 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
8995 Opc = AArch64::MLSv2i32;
8996 RC = &AArch64::FPR64RegClass;
8997 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8998 break;
8999 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
9000 Opc = AArch64::MLAv4i32;
9001 RC = &AArch64::FPR128RegClass;
9002 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9003 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv4i32,
9004 RC);
9005 break;
9006 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
9007 Opc = AArch64::MLSv4i32;
9008 RC = &AArch64::FPR128RegClass;
9009 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9010 break;
9011
9012 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
9013 Opc = AArch64::MLAv4i16_indexed;
9014 RC = &AArch64::FPR64RegClass;
9015 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9016 break;
9017 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
9018 Opc = AArch64::MLAv4i16_indexed;
9019 RC = &AArch64::FPR64RegClass;
9020 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9021 break;
9022 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
9023 Opc = AArch64::MLAv8i16_indexed;
9024 RC = &AArch64::FPR128RegClass;
9025 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9026 break;
9027 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
9028 Opc = AArch64::MLAv8i16_indexed;
9029 RC = &AArch64::FPR128RegClass;
9030 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9031 break;
9032 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
9033 Opc = AArch64::MLAv2i32_indexed;
9034 RC = &AArch64::FPR64RegClass;
9035 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9036 break;
9037 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
9038 Opc = AArch64::MLAv2i32_indexed;
9039 RC = &AArch64::FPR64RegClass;
9040 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9041 break;
9042 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
9043 Opc = AArch64::MLAv4i32_indexed;
9044 RC = &AArch64::FPR128RegClass;
9045 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9046 break;
9047 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
9048 Opc = AArch64::MLAv4i32_indexed;
9049 RC = &AArch64::FPR128RegClass;
9050 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9051 break;
9052
9053 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
9054 Opc = AArch64::MLAv4i16_indexed;
9055 RC = &AArch64::FPR64RegClass;
9056 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9057 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv4i16,
9058 RC);
9059 break;
9060 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
9061 Opc = AArch64::MLSv4i16_indexed;
9062 RC = &AArch64::FPR64RegClass;
9063 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9064 break;
9065 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
9066 Opc = AArch64::MLAv8i16_indexed;
9067 RC = &AArch64::FPR128RegClass;
9068 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9069 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv8i16,
9070 RC);
9071 break;
9072 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
9073 Opc = AArch64::MLSv8i16_indexed;
9074 RC = &AArch64::FPR128RegClass;
9075 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9076 break;
9077 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
9078 Opc = AArch64::MLAv2i32_indexed;
9079 RC = &AArch64::FPR64RegClass;
9080 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9081 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv2i32,
9082 RC);
9083 break;
9084 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
9085 Opc = AArch64::MLSv2i32_indexed;
9086 RC = &AArch64::FPR64RegClass;
9087 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9088 break;
9089 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
9090 Opc = AArch64::MLAv4i32_indexed;
9091 RC = &AArch64::FPR128RegClass;
9092 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9093 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv4i32,
9094 RC);
9095 break;
9096 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
9097 Opc = AArch64::MLSv4i32_indexed;
9098 RC = &AArch64::FPR128RegClass;
9099 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9100 break;
9101
9102 // Floating Point Support
9103 case AArch64MachineCombinerPattern::FMULADDH_OP1:
9104 Opc = AArch64::FMADDHrrr;
9105 RC = &AArch64::FPR16RegClass;
9106 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9107 break;
9108 case AArch64MachineCombinerPattern::FMULADDS_OP1:
9109 Opc = AArch64::FMADDSrrr;
9110 RC = &AArch64::FPR32RegClass;
9111 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9112 break;
9113 case AArch64MachineCombinerPattern::FMULADDD_OP1:
9114 Opc = AArch64::FMADDDrrr;
9115 RC = &AArch64::FPR64RegClass;
9116 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9117 break;
9118
9119 case AArch64MachineCombinerPattern::FMULADDH_OP2:
9120 Opc = AArch64::FMADDHrrr;
9121 RC = &AArch64::FPR16RegClass;
9122 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9123 break;
9124 case AArch64MachineCombinerPattern::FMULADDS_OP2:
9125 Opc = AArch64::FMADDSrrr;
9126 RC = &AArch64::FPR32RegClass;
9127 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9128 break;
9129 case AArch64MachineCombinerPattern::FMULADDD_OP2:
9130 Opc = AArch64::FMADDDrrr;
9131 RC = &AArch64::FPR64RegClass;
9132 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9133 break;
9134
9135 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
9136 Opc = AArch64::FMLAv1i32_indexed;
9137 RC = &AArch64::FPR32RegClass;
9138 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9139 kind: FMAInstKind::Indexed);
9140 break;
9141 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
9142 Opc = AArch64::FMLAv1i32_indexed;
9143 RC = &AArch64::FPR32RegClass;
9144 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9145 kind: FMAInstKind::Indexed);
9146 break;
9147
9148 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
9149 Opc = AArch64::FMLAv1i64_indexed;
9150 RC = &AArch64::FPR64RegClass;
9151 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9152 kind: FMAInstKind::Indexed);
9153 break;
9154 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
9155 Opc = AArch64::FMLAv1i64_indexed;
9156 RC = &AArch64::FPR64RegClass;
9157 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9158 kind: FMAInstKind::Indexed);
9159 break;
9160
9161 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
9162 RC = &AArch64::FPR64RegClass;
9163 Opc = AArch64::FMLAv4i16_indexed;
9164 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9165 kind: FMAInstKind::Indexed);
9166 break;
9167 case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
9168 RC = &AArch64::FPR64RegClass;
9169 Opc = AArch64::FMLAv4f16;
9170 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9171 kind: FMAInstKind::Accumulator);
9172 break;
9173 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
9174 RC = &AArch64::FPR64RegClass;
9175 Opc = AArch64::FMLAv4i16_indexed;
9176 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9177 kind: FMAInstKind::Indexed);
9178 break;
9179 case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
9180 RC = &AArch64::FPR64RegClass;
9181 Opc = AArch64::FMLAv4f16;
9182 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9183 kind: FMAInstKind::Accumulator);
9184 break;
9185
9186 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
9187 case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
9188 RC = &AArch64::FPR64RegClass;
9189 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
9190 Opc = AArch64::FMLAv2i32_indexed;
9191 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9192 kind: FMAInstKind::Indexed);
9193 } else {
9194 Opc = AArch64::FMLAv2f32;
9195 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9196 kind: FMAInstKind::Accumulator);
9197 }
9198 break;
9199 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
9200 case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
9201 RC = &AArch64::FPR64RegClass;
9202 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
9203 Opc = AArch64::FMLAv2i32_indexed;
9204 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9205 kind: FMAInstKind::Indexed);
9206 } else {
9207 Opc = AArch64::FMLAv2f32;
9208 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9209 kind: FMAInstKind::Accumulator);
9210 }
9211 break;
9212
9213 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
9214 RC = &AArch64::FPR128RegClass;
9215 Opc = AArch64::FMLAv8i16_indexed;
9216 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9217 kind: FMAInstKind::Indexed);
9218 break;
9219 case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
9220 RC = &AArch64::FPR128RegClass;
9221 Opc = AArch64::FMLAv8f16;
9222 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9223 kind: FMAInstKind::Accumulator);
9224 break;
9225 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
9226 RC = &AArch64::FPR128RegClass;
9227 Opc = AArch64::FMLAv8i16_indexed;
9228 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9229 kind: FMAInstKind::Indexed);
9230 break;
9231 case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
9232 RC = &AArch64::FPR128RegClass;
9233 Opc = AArch64::FMLAv8f16;
9234 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9235 kind: FMAInstKind::Accumulator);
9236 break;
9237
9238 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
9239 case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
9240 RC = &AArch64::FPR128RegClass;
9241 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
9242 Opc = AArch64::FMLAv2i64_indexed;
9243 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9244 kind: FMAInstKind::Indexed);
9245 } else {
9246 Opc = AArch64::FMLAv2f64;
9247 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9248 kind: FMAInstKind::Accumulator);
9249 }
9250 break;
9251 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
9252 case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
9253 RC = &AArch64::FPR128RegClass;
9254 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
9255 Opc = AArch64::FMLAv2i64_indexed;
9256 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9257 kind: FMAInstKind::Indexed);
9258 } else {
9259 Opc = AArch64::FMLAv2f64;
9260 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9261 kind: FMAInstKind::Accumulator);
9262 }
9263 break;
9264
9265 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
9266 case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
9267 RC = &AArch64::FPR128RegClass;
9268 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
9269 Opc = AArch64::FMLAv4i32_indexed;
9270 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9271 kind: FMAInstKind::Indexed);
9272 } else {
9273 Opc = AArch64::FMLAv4f32;
9274 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9275 kind: FMAInstKind::Accumulator);
9276 }
9277 break;
9278
9279 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
9280 case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
9281 RC = &AArch64::FPR128RegClass;
9282 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
9283 Opc = AArch64::FMLAv4i32_indexed;
9284 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9285 kind: FMAInstKind::Indexed);
9286 } else {
9287 Opc = AArch64::FMLAv4f32;
9288 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9289 kind: FMAInstKind::Accumulator);
9290 }
9291 break;
9292
9293 case AArch64MachineCombinerPattern::FMULSUBH_OP1:
9294 Opc = AArch64::FNMSUBHrrr;
9295 RC = &AArch64::FPR16RegClass;
9296 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9297 break;
9298 case AArch64MachineCombinerPattern::FMULSUBS_OP1:
9299 Opc = AArch64::FNMSUBSrrr;
9300 RC = &AArch64::FPR32RegClass;
9301 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9302 break;
9303 case AArch64MachineCombinerPattern::FMULSUBD_OP1:
9304 Opc = AArch64::FNMSUBDrrr;
9305 RC = &AArch64::FPR64RegClass;
9306 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9307 break;
9308
9309 case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
9310 Opc = AArch64::FNMADDHrrr;
9311 RC = &AArch64::FPR16RegClass;
9312 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9313 break;
9314 case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
9315 Opc = AArch64::FNMADDSrrr;
9316 RC = &AArch64::FPR32RegClass;
9317 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9318 break;
9319 case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
9320 Opc = AArch64::FNMADDDrrr;
9321 RC = &AArch64::FPR64RegClass;
9322 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9323 break;
9324
9325 case AArch64MachineCombinerPattern::FMULSUBH_OP2:
9326 Opc = AArch64::FMSUBHrrr;
9327 RC = &AArch64::FPR16RegClass;
9328 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9329 break;
9330 case AArch64MachineCombinerPattern::FMULSUBS_OP2:
9331 Opc = AArch64::FMSUBSrrr;
9332 RC = &AArch64::FPR32RegClass;
9333 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9334 break;
9335 case AArch64MachineCombinerPattern::FMULSUBD_OP2:
9336 Opc = AArch64::FMSUBDrrr;
9337 RC = &AArch64::FPR64RegClass;
9338 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9339 break;
9340
9341 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
9342 Opc = AArch64::FMLSv1i32_indexed;
9343 RC = &AArch64::FPR32RegClass;
9344 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9345 kind: FMAInstKind::Indexed);
9346 break;
9347
9348 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
9349 Opc = AArch64::FMLSv1i64_indexed;
9350 RC = &AArch64::FPR64RegClass;
9351 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9352 kind: FMAInstKind::Indexed);
9353 break;
9354
9355 case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
9356 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
9357 RC = &AArch64::FPR64RegClass;
9358 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
9359 MachineInstrBuilder MIB1 =
9360 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv4f16), DestReg: NewVR)
9361 .add(MO: Root.getOperand(i: 2));
9362 InsInstrs.push_back(Elt: MIB1);
9363 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9364 if (Pattern == AArch64MachineCombinerPattern::FMLSv4f16_OP1) {
9365 Opc = AArch64::FMLAv4f16;
9366 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9367 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
9368 } else {
9369 Opc = AArch64::FMLAv4i16_indexed;
9370 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9371 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
9372 }
9373 break;
9374 }
9375 case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
9376 RC = &AArch64::FPR64RegClass;
9377 Opc = AArch64::FMLSv4f16;
9378 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9379 kind: FMAInstKind::Accumulator);
9380 break;
9381 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
9382 RC = &AArch64::FPR64RegClass;
9383 Opc = AArch64::FMLSv4i16_indexed;
9384 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9385 kind: FMAInstKind::Indexed);
9386 break;
9387
9388 case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
9389 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
9390 RC = &AArch64::FPR64RegClass;
9391 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
9392 Opc = AArch64::FMLSv2i32_indexed;
9393 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9394 kind: FMAInstKind::Indexed);
9395 } else {
9396 Opc = AArch64::FMLSv2f32;
9397 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9398 kind: FMAInstKind::Accumulator);
9399 }
9400 break;
9401
9402 case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
9403 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
9404 RC = &AArch64::FPR128RegClass;
9405 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
9406 MachineInstrBuilder MIB1 =
9407 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv8f16), DestReg: NewVR)
9408 .add(MO: Root.getOperand(i: 2));
9409 InsInstrs.push_back(Elt: MIB1);
9410 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9411 if (Pattern == AArch64MachineCombinerPattern::FMLSv8f16_OP1) {
9412 Opc = AArch64::FMLAv8f16;
9413 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9414 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
9415 } else {
9416 Opc = AArch64::FMLAv8i16_indexed;
9417 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9418 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
9419 }
9420 break;
9421 }
9422 case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
9423 RC = &AArch64::FPR128RegClass;
9424 Opc = AArch64::FMLSv8f16;
9425 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9426 kind: FMAInstKind::Accumulator);
9427 break;
9428 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
9429 RC = &AArch64::FPR128RegClass;
9430 Opc = AArch64::FMLSv8i16_indexed;
9431 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9432 kind: FMAInstKind::Indexed);
9433 break;
9434
9435 case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
9436 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
9437 RC = &AArch64::FPR128RegClass;
9438 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
9439 Opc = AArch64::FMLSv2i64_indexed;
9440 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9441 kind: FMAInstKind::Indexed);
9442 } else {
9443 Opc = AArch64::FMLSv2f64;
9444 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9445 kind: FMAInstKind::Accumulator);
9446 }
9447 break;
9448
9449 case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
9450 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
9451 RC = &AArch64::FPR128RegClass;
9452 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
9453 Opc = AArch64::FMLSv4i32_indexed;
9454 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9455 kind: FMAInstKind::Indexed);
9456 } else {
9457 Opc = AArch64::FMLSv4f32;
9458 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9459 kind: FMAInstKind::Accumulator);
9460 }
9461 break;
9462 case AArch64MachineCombinerPattern::FMLSv2f32_OP1:
9463 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
9464 RC = &AArch64::FPR64RegClass;
9465 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
9466 MachineInstrBuilder MIB1 =
9467 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv2f32), DestReg: NewVR)
9468 .add(MO: Root.getOperand(i: 2));
9469 InsInstrs.push_back(Elt: MIB1);
9470 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9471 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
9472 Opc = AArch64::FMLAv2i32_indexed;
9473 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9474 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
9475 } else {
9476 Opc = AArch64::FMLAv2f32;
9477 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9478 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
9479 }
9480 break;
9481 }
9482 case AArch64MachineCombinerPattern::FMLSv4f32_OP1:
9483 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
9484 RC = &AArch64::FPR128RegClass;
9485 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
9486 MachineInstrBuilder MIB1 =
9487 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv4f32), DestReg: NewVR)
9488 .add(MO: Root.getOperand(i: 2));
9489 InsInstrs.push_back(Elt: MIB1);
9490 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9491 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
9492 Opc = AArch64::FMLAv4i32_indexed;
9493 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9494 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
9495 } else {
9496 Opc = AArch64::FMLAv4f32;
9497 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9498 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
9499 }
9500 break;
9501 }
9502 case AArch64MachineCombinerPattern::FMLSv2f64_OP1:
9503 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
9504 RC = &AArch64::FPR128RegClass;
9505 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
9506 MachineInstrBuilder MIB1 =
9507 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv2f64), DestReg: NewVR)
9508 .add(MO: Root.getOperand(i: 2));
9509 InsInstrs.push_back(Elt: MIB1);
9510 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9511 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
9512 Opc = AArch64::FMLAv2i64_indexed;
9513 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9514 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
9515 } else {
9516 Opc = AArch64::FMLAv2f64;
9517 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9518 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
9519 }
9520 break;
9521 }
9522 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
9523 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2: {
9524 unsigned IdxDupOp =
9525 (Pattern == AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1
9526 : 2;
9527 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv2i32_indexed,
9528 RC: &AArch64::FPR128RegClass, MRI);
9529 break;
9530 }
9531 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
9532 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2: {
9533 unsigned IdxDupOp =
9534 (Pattern == AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1
9535 : 2;
9536 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv2i64_indexed,
9537 RC: &AArch64::FPR128RegClass, MRI);
9538 break;
9539 }
9540 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
9541 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2: {
9542 unsigned IdxDupOp =
9543 (Pattern == AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1
9544 : 2;
9545 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv4i16_indexed,
9546 RC: &AArch64::FPR128_loRegClass, MRI);
9547 break;
9548 }
9549 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
9550 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2: {
9551 unsigned IdxDupOp =
9552 (Pattern == AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1
9553 : 2;
9554 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv4i32_indexed,
9555 RC: &AArch64::FPR128RegClass, MRI);
9556 break;
9557 }
9558 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
9559 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2: {
9560 unsigned IdxDupOp =
9561 (Pattern == AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1
9562 : 2;
9563 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv8i16_indexed,
9564 RC: &AArch64::FPR128_loRegClass, MRI);
9565 break;
9566 }
9567 case AArch64MachineCombinerPattern::FNMADD: {
9568 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
9569 break;
9570 }
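  // The GATHER_LANE patterns below operate on a 128-bit vector assembled from
  // individual lane loads: 4 x i32, 8 x i16, or 16 x i8 lanes respectively.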
9571 case AArch64MachineCombinerPattern::GATHER_LANE_i32: {
9572 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9573 Pattern, NumLanes: 4);
9574 break;
9575 }
9576 case AArch64MachineCombinerPattern::GATHER_LANE_i16: {
9577 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9578 Pattern, NumLanes: 8);
9579 break;
9580 }
9581 case AArch64MachineCombinerPattern::GATHER_LANE_i8: {
9582 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9583 Pattern, NumLanes: 16);
9584 break;
9585 }
9586
9587 } // end switch (Pattern)
9588 // Record MUL and ADD/SUB for deletion
9589 if (MUL)
9590 DelInstrs.push_back(Elt: MUL);
9591 DelInstrs.push_back(Elt: &Root);
9592
9593 // Set the flags on the inserted instructions to be the merged flags of the
9594 // instructions that we have combined.
9595 uint32_t Flags = Root.getFlags();
9596 if (MUL)
9597 Flags = Root.mergeFlagsWith(Other: *MUL);
9598 for (auto *MI : InsInstrs)
9599 MI->setFlags(Flags);
9600}
9601
/// Replace a csinc-branch sequence with a simple conditional branch.
///
/// Examples:
/// 1. \code
///   csinc w9, wzr, wzr, <condition code>
///   tbnz  w9, #0, 0x44
///    \endcode
/// to
///    \code
///   b.<inverted condition code>
///    \endcode
///
/// 2. \code
///   csinc w9, wzr, wzr, <condition code>
///   tbz   w9, #0, 0x44
///    \endcode
/// to
///    \code
///   b.<condition code>
///    \endcode
///
/// Replace a compare-and-branch sequence with a TBZ/TBNZ instruction when the
/// compare's constant operand is a power of 2.
///
/// Examples:
///    \code
///   and  w8, w8, #0x400
///   cbnz w8, L1
///    \endcode
/// to
///    \code
///   tbnz w8, #10, L1
///    \endcode
///
/// \param MI Conditional Branch
/// \return True when the simple conditional branch is generated
///
9639bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
9640 bool IsNegativeBranch = false;
9641 bool IsTestAndBranch = false;
9642 unsigned TargetBBInMI = 0;
9643 switch (MI.getOpcode()) {
9644 default:
9645 llvm_unreachable("Unknown branch instruction?");
9646 case AArch64::Bcc:
9647 case AArch64::CBWPri:
9648 case AArch64::CBXPri:
9649 case AArch64::CBBAssertExt:
9650 case AArch64::CBHAssertExt:
9651 case AArch64::CBWPrr:
9652 case AArch64::CBXPrr:
9653 return false;
9654 case AArch64::CBZW:
9655 case AArch64::CBZX:
9656 TargetBBInMI = 1;
9657 break;
9658 case AArch64::CBNZW:
9659 case AArch64::CBNZX:
9660 TargetBBInMI = 1;
9661 IsNegativeBranch = true;
9662 break;
9663 case AArch64::TBZW:
9664 case AArch64::TBZX:
9665 TargetBBInMI = 2;
9666 IsTestAndBranch = true;
9667 break;
9668 case AArch64::TBNZW:
9669 case AArch64::TBNZX:
9670 TargetBBInMI = 2;
9671 IsNegativeBranch = true;
9672 IsTestAndBranch = true;
9673 break;
9674 }
9675 // So we increment a zero register and test for bits other
9676 // than bit 0? Conservatively bail out in case the verifier
9677 // missed this case.
9678 if (IsTestAndBranch && MI.getOperand(i: 1).getImm())
9679 return false;
9680
9681 // Find Definition.
9682 assert(MI.getParent() && "Incomplete machine instruction\n");
9683 MachineBasicBlock *MBB = MI.getParent();
9684 MachineFunction *MF = MBB->getParent();
9685 MachineRegisterInfo *MRI = &MF->getRegInfo();
9686 Register VReg = MI.getOperand(i: 0).getReg();
9687 if (!VReg.isVirtual())
9688 return false;
9689
9690 MachineInstr *DefMI = MRI->getVRegDef(Reg: VReg);
9691
9692 // Look through COPY instructions to find definition.
9693 while (DefMI->isCopy()) {
9694 Register CopyVReg = DefMI->getOperand(i: 1).getReg();
9695 if (!MRI->hasOneNonDBGUse(RegNo: CopyVReg))
9696 return false;
9697 if (!MRI->hasOneDef(RegNo: CopyVReg))
9698 return false;
9699 DefMI = MRI->getVRegDef(Reg: CopyVReg);
9700 }
9701
9702 switch (DefMI->getOpcode()) {
9703 default:
9704 return false;
  // Fold AND into a TBZ/TBNZ if the constant operand is a power of 2.
9706 case AArch64::ANDWri:
9707 case AArch64::ANDXri: {
9708 if (IsTestAndBranch)
9709 return false;
9710 if (DefMI->getParent() != MBB)
9711 return false;
9712 if (!MRI->hasOneNonDBGUse(RegNo: VReg))
9713 return false;
9714
9715 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
9716 uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
9717 val: DefMI->getOperand(i: 2).getImm(), regSize: Is32Bit ? 32 : 64);
9718 if (!isPowerOf2_64(Value: Mask))
9719 return false;
9720
9721 MachineOperand &MO = DefMI->getOperand(i: 1);
9722 Register NewReg = MO.getReg();
9723 if (!NewReg.isVirtual())
9724 return false;
9725
9726 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
9727
9728 MachineBasicBlock &RefToMBB = *MBB;
9729 MachineBasicBlock *TBB = MI.getOperand(i: 1).getMBB();
9730 DebugLoc DL = MI.getDebugLoc();
9731 unsigned Imm = Log2_64(Value: Mask);
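    // The tested bit is the index of the mask's single set bit, e.g. a mask of
    // 0x400 becomes a TB(N)Z of bit 10.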
9732 unsigned Opc = (Imm < 32)
9733 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
9734 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
9735 MachineInstr *NewMI = BuildMI(BB&: RefToMBB, I&: MI, MIMD: DL, MCID: get(Opcode: Opc))
9736 .addReg(RegNo: NewReg)
9737 .addImm(Val: Imm)
9738 .addMBB(MBB: TBB);
    // The register now lives on into the new TB(N)Z, so clear any kill flag.
9740 MO.setIsKill(false);
9741
    // For immediates smaller than 32, we must use the 32-bit variant (W) in
    // all cases; the 64-bit variant cannot encode them. Therefore, if the
    // input register is 64-bit, we need to use its 32-bit sub-register.
9747 if (!Is32Bit && Imm < 32)
9748 NewMI->getOperand(i: 0).setSubReg(AArch64::sub_32);
9749 MI.eraseFromParent();
9750 return true;
9751 }
9752 // Look for CSINC
9753 case AArch64::CSINCWr:
9754 case AArch64::CSINCXr: {
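    // CSINC Wd, WZR, WZR, cond materializes (cond ? 0 : 1), so bit 0 of the
    // result is set exactly when the condition is false. That is why a TBNZ or
    // CBNZ of this value maps to B.cc with the inverted condition below.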
9755 if (!(DefMI->getOperand(i: 1).getReg() == AArch64::WZR &&
9756 DefMI->getOperand(i: 2).getReg() == AArch64::WZR) &&
9757 !(DefMI->getOperand(i: 1).getReg() == AArch64::XZR &&
9758 DefMI->getOperand(i: 2).getReg() == AArch64::XZR))
9759 return false;
9760
9761 if (DefMI->findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr,
9762 isDead: true) != -1)
9763 return false;
9764
9765 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(i: 3).getImm();
9766 // Convert only when the condition code is not modified between
9767 // the CSINC and the branch. The CC may be used by other
9768 // instructions in between.
9769 if (areCFlagsAccessedBetweenInstrs(From: DefMI, To: MI, TRI: &getRegisterInfo(), AccessToCheck: AK_Write))
9770 return false;
9771 MachineBasicBlock &RefToMBB = *MBB;
9772 MachineBasicBlock *TBB = MI.getOperand(i: TargetBBInMI).getMBB();
9773 DebugLoc DL = MI.getDebugLoc();
9774 if (IsNegativeBranch)
9775 CC = AArch64CC::getInvertedCondCode(Code: CC);
9776 BuildMI(BB&: RefToMBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::Bcc)).addImm(Val: CC).addMBB(MBB: TBB);
9777 MI.eraseFromParent();
9778 return true;
9779 }
9780 }
9781}
9782
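// MO_FRAGMENT masks the mutually exclusive "which piece of the address" flags
// (page, page-offset, g0-g3, hi12) listed in the direct table below; flags
// outside the mask, such as MO_GOT or MO_TLS, are independent bitmask flags
// and are serialized separately.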
9783std::pair<unsigned, unsigned>
9784AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9785 const unsigned Mask = AArch64II::MO_FRAGMENT;
9786 return std::make_pair(x: TF & Mask, y: TF & ~Mask);
9787}
9788
9789ArrayRef<std::pair<unsigned, const char *>>
9790AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9791 using namespace AArch64II;
9792
9793 static const std::pair<unsigned, const char *> TargetFlags[] = {
9794 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
9795 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
9796 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
9797 {MO_HI12, "aarch64-hi12"}};
9798 return ArrayRef(TargetFlags);
9799}
9800
9801ArrayRef<std::pair<unsigned, const char *>>
9802AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
9803 using namespace AArch64II;
9804
9805 static const std::pair<unsigned, const char *> TargetFlags[] = {
9806 {MO_COFFSTUB, "aarch64-coffstub"},
9807 {MO_GOT, "aarch64-got"},
9808 {MO_NC, "aarch64-nc"},
9809 {MO_S, "aarch64-s"},
9810 {MO_TLS, "aarch64-tls"},
9811 {MO_DLLIMPORT, "aarch64-dllimport"},
9812 {MO_PREL, "aarch64-prel"},
9813 {MO_TAGGED, "aarch64-tagged"},
9814 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
9815 };
9816 return ArrayRef(TargetFlags);
9817}
9818
9819ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9820AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9821 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9822 {{MOSuppressPair, "aarch64-suppress-pair"},
9823 {MOStridedAccess, "aarch64-strided-access"}};
9824 return ArrayRef(TargetFlags);
9825}
9826
9827/// Constants defining how certain sequences should be outlined.
9828/// This encompasses how an outlined function should be called, and what kind of
9829/// frame should be emitted for that outlined function.
9830///
9831/// \p MachineOutlinerDefault implies that the function should be called with
9832/// a save and restore of LR to the stack.
9833///
9834/// That is,
9835///
9836/// I1 Save LR OUTLINED_FUNCTION:
9837/// I2 --> BL OUTLINED_FUNCTION I1
9838/// I3 Restore LR I2
9839/// I3
9840/// RET
9841///
9842/// * Call construction overhead: 3 (save + BL + restore)
9843/// * Frame construction overhead: 1 (ret)
9844/// * Requires stack fixups? Yes
9845///
9846/// \p MachineOutlinerTailCall implies that the function is being created from
9847/// a sequence of instructions ending in a return.
9848///
9849/// That is,
9850///
9851/// I1 OUTLINED_FUNCTION:
9852/// I2 --> B OUTLINED_FUNCTION I1
9853/// RET I2
9854/// RET
9855///
9856/// * Call construction overhead: 1 (B)
9857/// * Frame construction overhead: 0 (Return included in sequence)
9858/// * Requires stack fixups? No
9859///
9860/// \p MachineOutlinerNoLRSave implies that the function should be called using
9861/// a BL instruction, but doesn't require LR to be saved and restored. This
9862/// happens when LR is known to be dead.
9863///
9864/// That is,
9865///
9866/// I1 OUTLINED_FUNCTION:
9867/// I2 --> BL OUTLINED_FUNCTION I1
9868/// I3 I2
9869/// I3
9870/// RET
9871///
9872/// * Call construction overhead: 1 (BL)
9873/// * Frame construction overhead: 1 (RET)
9874/// * Requires stack fixups? No
9875///
9876/// \p MachineOutlinerThunk implies that the function is being created from
9877/// a sequence of instructions ending in a call. The outlined function is
9878/// called with a BL instruction, and the outlined function tail-calls the
9879/// original call destination.
9880///
9881/// That is,
9882///
9883/// I1 OUTLINED_FUNCTION:
9884/// I2 --> BL OUTLINED_FUNCTION I1
9885/// BL f I2
/// B f
///
/// * Call construction overhead: 1 (BL)
9888/// * Frame construction overhead: 0
9889/// * Requires stack fixups? No
9890///
9891/// \p MachineOutlinerRegSave implies that the function should be called with a
9892/// save and restore of LR to an available register. This allows us to avoid
9893/// stack fixups. Note that this outlining variant is compatible with the
9894/// NoLRSave case.
9895///
9896/// That is,
9897///
9898/// I1 Save LR OUTLINED_FUNCTION:
9899/// I2 --> BL OUTLINED_FUNCTION I1
9900/// I3 Restore LR I2
9901/// I3
9902/// RET
9903///
9904/// * Call construction overhead: 3 (save + BL + restore)
9905/// * Frame construction overhead: 1 (ret)
9906/// * Requires stack fixups? No
9907enum MachineOutlinerClass {
9908 MachineOutlinerDefault, /// Emit a save, restore, call, and return.
9909 MachineOutlinerTailCall, /// Only emit a branch.
9910 MachineOutlinerNoLRSave, /// Emit a call and return.
9911 MachineOutlinerThunk, /// Emit a call and tail-call.
9912 MachineOutlinerRegSave /// Same as default, but save to a register.
9913};
9914
9915enum MachineOutlinerMBBFlags {
9916 LRUnavailableSomewhere = 0x2,
9917 HasCalls = 0x4,
9918 UnsafeRegsDead = 0x8
9919};
9920
9921Register
9922AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
9923 MachineFunction *MF = C.getMF();
9924 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
9925 const AArch64RegisterInfo *ARI =
9926 static_cast<const AArch64RegisterInfo *>(&TRI);
9927 // Check if there is an available register across the sequence that we can
9928 // use.
9929 for (unsigned Reg : AArch64::GPR64RegClass) {
9930 if (!ARI->isReservedReg(MF: *MF, Reg) &&
9931 Reg != AArch64::LR && // LR is not reserved, but don't use it.
9932 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
9933 Reg != AArch64::X17 && // Ditto for X17.
9934 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
9935 C.isAvailableInsideSeq(Reg, TRI))
9936 return Reg;
9937 }
9938 return Register();
9939}
9940
9941static bool
9942outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
9943 const outliner::Candidate &b) {
9944 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
9945 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
9946
9947 return MFIa->getSignReturnAddressCondition() ==
9948 MFIb->getSignReturnAddressCondition();
9949}
9950
9951static bool
9952outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
9953 const outliner::Candidate &b) {
9954 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
9955 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
9956
9957 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
9958}
9959
9960static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
9961 const outliner::Candidate &b) {
9962 const AArch64Subtarget &SubtargetA =
9963 a.getMF()->getSubtarget<AArch64Subtarget>();
9964 const AArch64Subtarget &SubtargetB =
9965 b.getMF()->getSubtarget<AArch64Subtarget>();
9966 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
9967}
9968
9969std::optional<std::unique_ptr<outliner::OutlinedFunction>>
9970AArch64InstrInfo::getOutliningCandidateInfo(
9971 const MachineModuleInfo &MMI,
9972 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
9973 unsigned MinRepeats) const {
9974 unsigned SequenceSize = 0;
9975 for (auto &MI : RepeatedSequenceLocs[0])
9976 SequenceSize += getInstSizeInBytes(MI);
9977
9978 unsigned NumBytesToCreateFrame = 0;
9979
  // Avoid splitting an ADRP + ADD/LDR pair across an outlining boundary.
  // These instructions are fused together by the scheduler. Any candidate
  // where ADRP is the last instruction must be rejected, as that would split
  // the pair.
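  // For example (illustrative GOT access that must stay together):
  //   adrp x8, :got:sym
  //   ldr  x8, [x8, :got_lo12:sym]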
9984 MachineInstr &LastMI = RepeatedSequenceLocs[0].back();
9985 MachineInstr &FirstMI = RepeatedSequenceLocs[0].front();
9986 if (LastMI.getOpcode() == AArch64::ADRP &&
9987 (LastMI.getOperand(i: 1).getTargetFlags() & AArch64II::MO_PAGE) != 0 &&
9988 (LastMI.getOperand(i: 1).getTargetFlags() & AArch64II::MO_GOT) != 0) {
9989 return std::nullopt;
9990 }
9991
9992 // Similarly any candidate where the first instruction is ADD/LDR with a
9993 // page offset should be rejected to avoid ADRP splitting.
9994 if ((FirstMI.getOpcode() == AArch64::ADDXri ||
9995 FirstMI.getOpcode() == AArch64::LDRXui) &&
9996 (FirstMI.getOperand(i: 2).getTargetFlags() & AArch64II::MO_PAGEOFF) != 0 &&
9997 (FirstMI.getOperand(i: 2).getTargetFlags() & AArch64II::MO_GOT) != 0) {
9998 return std::nullopt;
9999 }
10000
10001 // We only allow outlining for functions having exactly matching return
10002 // address signing attributes, i.e., all share the same value for the
10003 // attribute "sign-return-address" and all share the same type of key they
10004 // are signed with.
10005 // Additionally we require all functions to simultaneously either support
10006 // v8.3a features or not. Otherwise an outlined function could get signed
10007 // using dedicated v8.3 instructions and a call from a function that doesn't
10008 // support v8.3 instructions would therefore be invalid.
10009 if (std::adjacent_find(
10010 first: RepeatedSequenceLocs.begin(), last: RepeatedSequenceLocs.end(),
10011 binary_pred: [](const outliner::Candidate &a, const outliner::Candidate &b) {
10012 // Return true if a and b are non-equal w.r.t. return address
10013 // signing or support of v8.3a features
10014 if (outliningCandidatesSigningScopeConsensus(a, b) &&
10015 outliningCandidatesSigningKeyConsensus(a, b) &&
10016 outliningCandidatesV8_3OpsConsensus(a, b)) {
10017 return false;
10018 }
10019 return true;
10020 }) != RepeatedSequenceLocs.end()) {
10021 return std::nullopt;
10022 }
10023
10024 // Since at this point all candidates agree on their return address signing
10025 // picking just one is fine. If the candidate functions potentially sign their
10026 // return addresses, the outlined function should do the same. Note that in
  // the case of "sign-return-address"="non-leaf" this is an assumption: it is
  // not certain that the outlined function will have to sign its return
  // address, but that decision is made later, when the decision to outline
  // has already been made.
10031 // The same holds for the number of additional instructions we need: On
10032 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
10033 // necessary. However, at this point we don't know if the outlined function
10034 // will have a RET instruction so we assume the worst.
10035 const TargetRegisterInfo &TRI = getRegisterInfo();
10036 // Performing a tail call may require extra checks when PAuth is enabled.
10037 // If PAuth is disabled, set it to zero for uniformity.
10038 unsigned NumBytesToCheckLRInTCEpilogue = 0;
10039 const auto RASignCondition = RepeatedSequenceLocs[0]
10040 .getMF()
10041 ->getInfo<AArch64FunctionInfo>()
10042 ->getSignReturnAddressCondition();
10043 if (RASignCondition != SignReturnAddress::None) {
    // One PAC and one AUT instruction.
10045 NumBytesToCreateFrame += 8;
10046
10047 // PAuth is enabled - set extra tail call cost, if any.
10048 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
10049 MF: *RepeatedSequenceLocs[0].getMF());
10050 NumBytesToCheckLRInTCEpilogue =
10051 AArch64PAuth::getCheckerSizeInBytes(Method: LRCheckMethod);
10052 // Checking the authenticated LR value may significantly impact
10053 // SequenceSize, so account for it for more precise results.
10054 if (isTailCallReturnInst(MI: RepeatedSequenceLocs[0].back()))
10055 SequenceSize += NumBytesToCheckLRInTCEpilogue;
10056
    // We have to check whether SP-modifying instructions would get outlined.
    // If so, we only allow outlining if SP is unchanged overall: matching SUB
    // and ADD instructions are okay to outline; all other SP modifications
    // are not.
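    // For instance, a candidate containing
    //   sub sp, sp, #16
    //   ...
    //   add sp, sp, #16
    // nets out to zero and may be outlined; an unmatched SP adjustment may not.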
10061 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
10062 int SPValue = 0;
10063 for (auto &MI : C) {
10064 if (MI.modifiesRegister(Reg: AArch64::SP, TRI: &TRI)) {
10065 switch (MI.getOpcode()) {
10066 case AArch64::ADDXri:
10067 case AArch64::ADDWri:
10068 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10069 assert(MI.getOperand(2).isImm() &&
10070 "Expected operand to be immediate");
10071 assert(MI.getOperand(1).isReg() &&
10072 "Expected operand to be a register");
10073 // Check if the add just increments sp. If so, we search for
10074 // matching sub instructions that decrement sp. If not, the
10075 // modification is illegal
10076 if (MI.getOperand(i: 1).getReg() == AArch64::SP)
10077 SPValue += MI.getOperand(i: 2).getImm();
10078 else
10079 return true;
10080 break;
10081 case AArch64::SUBXri:
10082 case AArch64::SUBWri:
10083 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10084 assert(MI.getOperand(2).isImm() &&
10085 "Expected operand to be immediate");
10086 assert(MI.getOperand(1).isReg() &&
10087 "Expected operand to be a register");
10088 // Check if the sub just decrements sp. If so, we search for
10089 // matching add instructions that increment sp. If not, the
10090 // modification is illegal
10091 if (MI.getOperand(i: 1).getReg() == AArch64::SP)
10092 SPValue -= MI.getOperand(i: 2).getImm();
10093 else
10094 return true;
10095 break;
10096 default:
10097 return true;
10098 }
10099 }
10100 }
10101 if (SPValue)
10102 return true;
10103 return false;
10104 };
10105 // Remove candidates with illegal stack modifying instructions
10106 llvm::erase_if(C&: RepeatedSequenceLocs, P: hasIllegalSPModification);
10107
10108 // If the sequence doesn't have enough candidates left, then we're done.
10109 if (RepeatedSequenceLocs.size() < MinRepeats)
10110 return std::nullopt;
10111 }
10112
10113 // Properties about candidate MBBs that hold for all of them.
10114 unsigned FlagsSetInAll = 0xF;
10115
10116 // Compute liveness information for each candidate, and set FlagsSetInAll.
10117 for (outliner::Candidate &C : RepeatedSequenceLocs)
10118 FlagsSetInAll &= C.Flags;
10119
10120 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
10121
10122 // Helper lambda which sets call information for every candidate.
10123 auto SetCandidateCallInfo =
10124 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
10125 for (outliner::Candidate &C : RepeatedSequenceLocs)
10126 C.setCallInfo(CID: CallID, CO: NumBytesForCall);
10127 };
10128
10129 unsigned FrameID = MachineOutlinerDefault;
10130 NumBytesToCreateFrame += 4;
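  // The 4 bytes above account for the RET that terminates a default outlined
  // frame (see the MachineOutlinerDefault description above).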
10131
10132 bool HasBTI = any_of(Range&: RepeatedSequenceLocs, P: [](outliner::Candidate &C) {
10133 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
10134 });
10135
  // We check whether CFI instructions are present; if they are, we count the
  // CFI instructions in the candidate sequence.
10138 unsigned CFICount = 0;
10139 for (auto &I : RepeatedSequenceLocs[0]) {
10140 if (I.isCFIInstruction())
10141 CFICount++;
10142 }
10143
  // We compare the number of CFI instructions found to the number of CFI
  // instructions in the parent function for each candidate. We must check this
  // because if we outline one of the CFI instructions in a function, we have
  // to outline them all for correctness. If we do not, the address offsets
  // will be incorrect between the two sections of the program.
10149 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10150 std::vector<MCCFIInstruction> CFIInstructions =
10151 C.getMF()->getFrameInstructions();
10152
10153 if (CFICount > 0 && CFICount != CFIInstructions.size())
10154 return std::nullopt;
10155 }
10156
  // Returns true if an instruction is safe to fix up, false otherwise.
10158 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
10159 if (MI.isCall())
10160 return true;
10161
10162 if (!MI.modifiesRegister(Reg: AArch64::SP, TRI: &TRI) &&
10163 !MI.readsRegister(Reg: AArch64::SP, TRI: &TRI))
10164 return true;
10165
10166 // Any modification of SP will break our code to save/restore LR.
10167 // FIXME: We could handle some instructions which add a constant
10168 // offset to SP, with a bit more work.
10169 if (MI.modifiesRegister(Reg: AArch64::SP, TRI: &TRI))
10170 return false;
10171
10172 // At this point, we have a stack instruction that we might need to
10173 // fix up. We'll handle it if it's a load or store.
10174 if (MI.mayLoadOrStore()) {
10175 const MachineOperand *Base; // Filled with the base operand of MI.
10176 int64_t Offset; // Filled with the offset of MI.
10177 bool OffsetIsScalable;
10178
10179 // Does it allow us to offset the base operand and is the base the
10180 // register SP?
10181 if (!getMemOperandWithOffset(MI, BaseOp&: Base, Offset, OffsetIsScalable, TRI: &TRI) ||
10182 !Base->isReg() || Base->getReg() != AArch64::SP)
10183 return false;
10184
      // Fix-up code below assumes byte offsets.
10186 if (OffsetIsScalable)
10187 return false;
10188
10189 // Find the minimum/maximum offset for this instruction and check
10190 // if fixing it up would be in range.
10191 int64_t MinOffset,
10192 MaxOffset; // Unscaled offsets for the instruction.
10193 // The scale to multiply the offsets by.
10194 TypeSize Scale(0U, false), DummyWidth(0U, false);
10195 getMemOpInfo(Opcode: MI.getOpcode(), Scale, Width&: DummyWidth, MinOffset, MaxOffset);
10196
10197 Offset += 16; // Update the offset to what it would be if we outlined.
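      // The default outlined frame saves LR to the stack with a 16-byte SP
      // adjustment, so SP-relative offsets inside the outlined body are 16
      // bytes larger than in the original function.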
10198 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
10199 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
10200 return false;
10201
10202 // It's in range, so we can outline it.
10203 return true;
10204 }
10205
10206 // FIXME: Add handling for instructions like "add x0, sp, #8".
10207
10208 // We can't fix it up, so don't outline it.
10209 return false;
10210 };
10211
10212 // True if it's possible to fix up each stack instruction in this sequence.
10213 // Important for frames/call variants that modify the stack.
10214 bool AllStackInstrsSafe =
10215 llvm::all_of(Range&: RepeatedSequenceLocs[0], P: IsSafeToFixup);
10216
10217 // If the last instruction in any candidate is a terminator, then we should
10218 // tail call all of the candidates.
10219 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10220 FrameID = MachineOutlinerTailCall;
10221 NumBytesToCreateFrame = 0;
10222 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
10223 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
10224 }
10225
10226 else if (LastInstrOpcode == AArch64::BL ||
10227 ((LastInstrOpcode == AArch64::BLR ||
10228 LastInstrOpcode == AArch64::BLRNoIP) &&
10229 !HasBTI)) {
10230 // FIXME: Do we need to check if the code after this uses the value of LR?
10231 FrameID = MachineOutlinerThunk;
10232 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
10233 SetCandidateCallInfo(MachineOutlinerThunk, 4);
10234 }
10235
10236 else {
10237 // We need to decide how to emit calls + frames. We can always emit the same
10238 // frame if we don't need to save to the stack. If we have to save to the
10239 // stack, then we need a different frame.
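    // The byte counts used below mirror the overheads documented above: a bare
    // BL is 4 bytes, while saving and restoring LR around the BL (to a spare
    // register or to the stack) is three instructions, i.e. 12 bytes.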
10240 unsigned NumBytesNoStackCalls = 0;
10241 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
10242
10243 // Check if we have to save LR.
10244 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10245 bool LRAvailable =
10246 (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere)
10247 ? C.isAvailableAcrossAndOutOfSeq(Reg: AArch64::LR, TRI)
10248 : true;
10249 // If we have a noreturn caller, then we're going to be conservative and
10250 // say that we have to save LR. If we don't have a ret at the end of the
10251 // block, then we can't reason about liveness accurately.
10252 //
10253 // FIXME: We can probably do better than always disabling this in
10254 // noreturn functions by fixing up the liveness info.
10255 bool IsNoReturn =
10256 C.getMF()->getFunction().hasFnAttribute(Kind: Attribute::NoReturn);
10257
10258 // Is LR available? If so, we don't need a save.
10259 if (LRAvailable && !IsNoReturn) {
10260 NumBytesNoStackCalls += 4;
10261 C.setCallInfo(CID: MachineOutlinerNoLRSave, CO: 4);
10262 CandidatesWithoutStackFixups.push_back(x: C);
10263 }
10264
10265 // Is an unused register available? If so, we won't modify the stack, so
10266 // we can outline with the same frame type as those that don't save LR.
10267 else if (findRegisterToSaveLRTo(C)) {
10268 NumBytesNoStackCalls += 12;
10269 C.setCallInfo(CID: MachineOutlinerRegSave, CO: 12);
10270 CandidatesWithoutStackFixups.push_back(x: C);
10271 }
10272
10273 // Is SP used in the sequence at all? If not, we don't have to modify
10274 // the stack, so we are guaranteed to get the same frame.
10275 else if (C.isAvailableInsideSeq(Reg: AArch64::SP, TRI)) {
10276 NumBytesNoStackCalls += 12;
10277 C.setCallInfo(CID: MachineOutlinerDefault, CO: 12);
10278 CandidatesWithoutStackFixups.push_back(x: C);
10279 }
10280
10281 // If we outline this, we need to modify the stack. Pretend we don't
10282 // outline this by saving all of its bytes.
10283 else {
10284 NumBytesNoStackCalls += SequenceSize;
10285 }
10286 }
10287
10288 // If there are no places where we have to save LR, then note that we
10289 // don't have to update the stack. Otherwise, give every candidate the
10290 // default call type, as long as it's safe to do so.
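    // NumBytesNoStackCalls is compared against the cost of giving every
    // candidate the 12-byte default (save LR to the stack) call sequence.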
10291 if (!AllStackInstrsSafe ||
10292 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
10293 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
10294 FrameID = MachineOutlinerNoLRSave;
10295 if (RepeatedSequenceLocs.size() < MinRepeats)
10296 return std::nullopt;
10297 } else {
10298 SetCandidateCallInfo(MachineOutlinerDefault, 12);
10299
10300 // Bugzilla ID: 46767
10301 // TODO: Check if fixing up the stack more than once is safe so we can
10302 // outline these.
10303 //
10304 // An outline resulting in a caller that requires stack fixups at the
10305 // callsite to a callee that also requires stack fixups can happen when
10306 // there are no available registers at the candidate callsite for a
10307 // candidate that itself also has calls.
10308 //
    // In other words, if function_containing_sequence in the following pseudo
    // assembly requires that we save LR at the point of the call, but there
    // are no available registers, then we save using SP. As a result, the SP
    // offsets require stack fixups by multiples of 16.
10313 //
10314 // function_containing_sequence:
10315 // ...
10316 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10317 // call OUTLINED_FUNCTION_N
10318 // restore LR from SP
10319 // ...
10320 //
10321 // OUTLINED_FUNCTION_N:
10322 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10323 // ...
10324 // bl foo
10325 // restore LR from SP
10326 // ret
10327 //
    // Because the code to handle more than one stack fixup does not currently
    // have the proper checks for legality, these cases will assert in the
    // AArch64 MachineOutliner. That code needs more hardening, testing, and
    // better checks that the generated code is legal, and it has only been
    // verified to handle a single pass of stack fixup.
10334 //
10335 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
10336 // these cases until they are known to be handled. Bugzilla 46767 is
10337 // referenced in comments at the assert site.
10338 //
    // To avoid asserting (or generating invalid code in builds without
    // assertions), we remove all candidates which would need more than one
    // stack fixup by pruning the cases where the candidate has calls while
    // also having no available LR and no available general-purpose registers
    // to copy LR to (i.e., one extra stack save/restore).
10344 //
10345 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10346 erase_if(C&: RepeatedSequenceLocs, P: [this, &TRI](outliner::Candidate &C) {
10347 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
10348 return (llvm::any_of(Range&: C, P: IsCall)) &&
10349 (!C.isAvailableAcrossAndOutOfSeq(Reg: AArch64::LR, TRI) ||
10350 !findRegisterToSaveLRTo(C));
10351 });
10352 }
10353 }
10354
10355 // If we dropped all of the candidates, bail out here.
10356 if (RepeatedSequenceLocs.size() < MinRepeats)
10357 return std::nullopt;
10358 }
10359
10360 // Does every candidate's MBB contain a call? If so, then we might have a call
10361 // in the range.
10362 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10363 // Check if the range contains a call. These require a save + restore of the
10364 // link register.
10365 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
10366 bool ModStackToSaveLR = false;
10367 if (any_of(Range: drop_end(RangeOrContainer&: FirstCand),
10368 P: [](const MachineInstr &MI) { return MI.isCall(); }))
10369 ModStackToSaveLR = true;
10370
10371 // Handle the last instruction separately. If this is a tail call, then the
10372 // last instruction is a call. We don't want to save + restore in this case.
10373 // However, it could be possible that the last instruction is a call without
10374 // it being valid to tail call this sequence. We should consider this as
10375 // well.
10376 else if (FrameID != MachineOutlinerThunk &&
10377 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
10378 ModStackToSaveLR = true;
10379
10380 if (ModStackToSaveLR) {
10381 // We can't fix up the stack. Bail out.
10382 if (!AllStackInstrsSafe)
10383 return std::nullopt;
10384
10385 // Save + restore LR.
10386 NumBytesToCreateFrame += 8;
10387 }
10388 }
10389
10390 // If we have CFI instructions, we can only outline if the outlined section
10391 // can be a tail call
10392 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
10393 return std::nullopt;
10394
10395 return std::make_unique<outliner::OutlinedFunction>(
10396 args&: RepeatedSequenceLocs, args&: SequenceSize, args&: NumBytesToCreateFrame, args&: FrameID);
10397}
10398
10399void AArch64InstrInfo::mergeOutliningCandidateAttributes(
10400 Function &F, std::vector<outliner::Candidate> &Candidates) const {
10401 // If a bunch of candidates reach this point they must agree on their return
10402 // address signing. It is therefore enough to just consider the signing
10403 // behaviour of one of them
10404 const auto &CFn = Candidates.front().getMF()->getFunction();
10405
10406 if (CFn.hasFnAttribute(Kind: "ptrauth-returns"))
10407 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "ptrauth-returns"));
10408 if (CFn.hasFnAttribute(Kind: "ptrauth-auth-traps"))
10409 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "ptrauth-auth-traps"));
10410 // Since all candidates belong to the same module, just copy the
10411 // function-level attributes of an arbitrary function.
10412 if (CFn.hasFnAttribute(Kind: "sign-return-address"))
10413 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "sign-return-address"));
10414 if (CFn.hasFnAttribute(Kind: "sign-return-address-key"))
10415 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "sign-return-address-key"));
10416
10417 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
10418}
10419
10420bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
10421 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
10422 const Function &F = MF.getFunction();
10423
10424 // Can F be deduplicated by the linker? If it can, don't outline from it.
10425 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10426 return false;
10427
10428 // Don't outline from functions with section markings; the program could
10429 // expect that all the code is in the named section.
10430 // FIXME: Allow outlining from multiple functions with the same section
10431 // marking.
10432 if (F.hasSection())
10433 return false;
10434
10435 // Outlining from functions with redzones is unsafe since the outliner may
10436 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
10437 // outline from it.
10438 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
10439 if (!AFI || AFI->hasRedZone().value_or(u: true))
10440 return false;
10441
10442 // FIXME: Determine whether it is safe to outline from functions which contain
10443 // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
10444 // outlined together and ensure it is safe to outline with async unwind info,
10445 // required for saving & restoring VG around calls.
10446 if (AFI->hasStreamingModeChanges())
10447 return false;
10448
10449 // FIXME: Teach the outliner to generate/handle Windows unwind info.
10450 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
10451 return false;
10452
10453 // It's safe to outline from MF.
10454 return true;
10455}
10456
10457SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
10458AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
10459 unsigned &Flags) const {
10460 assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
10461 "Must track liveness!");
10462 SmallVector<
10463 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
10464 Ranges;
10465 // According to the AArch64 Procedure Call Standard, the following are
10466 // undefined on entry/exit from a function call:
10467 //
10468 // * Registers x16, x17, (and thus w16, w17)
10469 // * Condition codes (and thus the NZCV register)
10470 //
10471 // If any of these registers are used inside or live across an outlined
10472 // function, then they may be modified later, either by the compiler or
10473 // some other tool (like the linker).
10474 //
10475 // To avoid outlining in these situations, partition each block into ranges
10476 // where these registers are dead. We will only outline from those ranges.
10477 LiveRegUnits LRU(getRegisterInfo());
10478 auto AreAllUnsafeRegsDead = [&LRU]() {
10479 return LRU.available(Reg: AArch64::W16) && LRU.available(Reg: AArch64::W17) &&
10480 LRU.available(Reg: AArch64::NZCV);
10481 };
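  // Checking the 32-bit views W16/W17 is sufficient here: LiveRegUnits tracks
  // register units, which W16/W17 share with X16/X17.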
10482
10483 // We need to know if LR is live across an outlining boundary later on in
10484 // order to decide how we'll create the outlined call, frame, etc.
10485 //
10486 // It's pretty expensive to check this for *every candidate* within a block.
10487 // That's some potentially n^2 behaviour, since in the worst case, we'd need
10488 // to compute liveness from the end of the block for O(n) candidates within
10489 // the block.
10490 //
10491 // So, to improve the average case, let's keep track of liveness from the end
10492 // of the block to the beginning of *every outlinable range*. If we know that
10493 // LR is available in every range we could outline from, then we know that
10494 // we don't need to check liveness for any candidate within that range.
10495 bool LRAvailableEverywhere = true;
10496 // Compute liveness bottom-up.
10497 LRU.addLiveOuts(MBB);
10498 // Update flags that require info about the entire MBB.
10499 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
10500 if (MI.isCall() && !MI.isTerminator())
10501 Flags |= MachineOutlinerMBBFlags::HasCalls;
10502 };
10503 // Range: [RangeBegin, RangeEnd)
10504 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
10505 unsigned RangeLen;
10506 auto CreateNewRangeStartingAt =
10507 [&RangeBegin, &RangeEnd,
10508 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
10509 RangeBegin = NewBegin;
10510 RangeEnd = std::next(x: RangeBegin);
10511 RangeLen = 0;
10512 };
10513 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
10514 // At least one unsafe register is not dead. We do not want to outline at
10515 // this point. If it is long enough to outline from and does not cross a
10516 // bundle boundary, save the range [RangeBegin, RangeEnd).
10517 if (RangeLen <= 1)
10518 return;
10519 if (!RangeBegin.isEnd() && RangeBegin->isBundledWithPred())
10520 return;
10521 if (!RangeEnd.isEnd() && RangeEnd->isBundledWithPred())
10522 return;
10523 Ranges.emplace_back(Args&: RangeBegin, Args&: RangeEnd);
10524 };
10525 // Find the first point where all unsafe registers are dead.
10526 // FIND: <safe instr> <-- end of first potential range
10527 // SKIP: <unsafe def>
10528 // SKIP: ... everything between ...
10529 // SKIP: <unsafe use>
10530 auto FirstPossibleEndPt = MBB.instr_rbegin();
10531 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
10532 LRU.stepBackward(MI: *FirstPossibleEndPt);
10533 // Update flags that impact how we outline across the entire block,
10534 // regardless of safety.
10535 UpdateWholeMBBFlags(*FirstPossibleEndPt);
10536 if (AreAllUnsafeRegsDead())
10537 break;
10538 }
10539 // If we exhausted the entire block, we have no safe ranges to outline.
10540 if (FirstPossibleEndPt == MBB.instr_rend())
10541 return Ranges;
10542 // Current range.
10543 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
  // FirstPossibleEndPt points to the first place (walking backward from the
  // end of the block) where all unsafe registers are dead, if there is any
  // such point. Begin partitioning the MBB into ranges.
10547 for (auto &MI : make_range(x: FirstPossibleEndPt, y: MBB.instr_rend())) {
10548 LRU.stepBackward(MI);
10549 UpdateWholeMBBFlags(MI);
10550 if (!AreAllUnsafeRegsDead()) {
10551 SaveRangeIfNonEmpty();
10552 CreateNewRangeStartingAt(MI.getIterator());
10553 continue;
10554 }
10555 LRAvailableEverywhere &= LRU.available(Reg: AArch64::LR);
10556 RangeBegin = MI.getIterator();
10557 ++RangeLen;
10558 }
  // The above loop misses the last (or only) range. If we are still safe,
  // save that range as well.
10561 if (AreAllUnsafeRegsDead())
10562 SaveRangeIfNonEmpty();
10563 if (Ranges.empty())
10564 return Ranges;
10565 // We found the ranges bottom-up. Mapping expects the top-down. Reverse
10566 // the order.
10567 std::reverse(first: Ranges.begin(), last: Ranges.end());
10568 // If there is at least one outlinable range where LR is unavailable
10569 // somewhere, remember that.
10570 if (!LRAvailableEverywhere)
10571 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
10572 return Ranges;
10573}
10574
10575outliner::InstrType
10576AArch64InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
10577 MachineBasicBlock::iterator &MIT,
10578 unsigned Flags) const {
10579 MachineInstr &MI = *MIT;
10580
10581 // Don't outline anything used for return address signing. The outlined
10582 // function will get signed later if needed.
10583 switch (MI.getOpcode()) {
10584 case AArch64::PACM:
10585 case AArch64::PACIASP:
10586 case AArch64::PACIBSP:
10587 case AArch64::PACIASPPC:
10588 case AArch64::PACIBSPPC:
10589 case AArch64::AUTIASP:
10590 case AArch64::AUTIBSP:
10591 case AArch64::AUTIASPPCi:
10592 case AArch64::AUTIASPPCr:
10593 case AArch64::AUTIBSPPCi:
10594 case AArch64::AUTIBSPPCr:
10595 case AArch64::RETAA:
10596 case AArch64::RETAB:
10597 case AArch64::RETAASPPCi:
10598 case AArch64::RETAASPPCr:
10599 case AArch64::RETABSPPCi:
10600 case AArch64::RETABSPPCr:
10601 case AArch64::EMITBKEY:
10602 case AArch64::PAUTH_PROLOGUE:
10603 case AArch64::PAUTH_EPILOGUE:
10604 return outliner::InstrType::Illegal;
10605 }
10606
10607 // We can only outline these if we will tail call the outlined function, or
10608 // fix up the CFI offsets. Currently, CFI instructions are outlined only if
10609 // in a tail call.
10610 //
10611 // FIXME: If the proper fixups for the offset are implemented, this should be
10612 // possible.
10613 if (MI.isCFIInstruction())
10614 return outliner::InstrType::Legal;
10615
10616 // Is this a terminator for a basic block?
10617 if (MI.isTerminator())
10618 // TargetInstrInfo::getOutliningType has already filtered out anything
10619 // that would break this, so we can allow it here.
10620 return outliner::InstrType::Legal;
10621
10622 // Make sure none of the operands are un-outlinable.
10623 for (const MachineOperand &MOP : MI.operands()) {
10624 // A check preventing CFI indices was here before, but only CFI
10625 // instructions should have those.
10626 assert(!MOP.isCFIIndex());
10627
10628 // If it uses LR or W30 explicitly, then don't touch it.
10629 if (MOP.isReg() && !MOP.isImplicit() &&
10630 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
10631 return outliner::InstrType::Illegal;
10632 }
10633
10634 // Special cases for instructions that can always be outlined, but will fail
10635 // the later tests. E.g. ADRPs, which are PC-relative and may use LR, but can
10636 // always be outlined because they don't require a *specific* value to be in LR.
10637 if (MI.getOpcode() == AArch64::ADRP)
10638 return outliner::InstrType::Legal;
10639
10640 // If MI is a call we might be able to outline it. We don't want to outline
10641 // any calls that rely on the position of items on the stack. When we outline
10642 // something containing a call, we have to emit a save and restore of LR in
10643 // the outlined function. Currently, this always happens by saving LR to the
10644 // stack. Thus, if we outline, say, half the parameters for a function call
10645 // plus the call, then we'll break the callee's expectations for the layout
10646 // of the stack.
10647 //
10648 // FIXME: Allow calls to functions which construct a stack frame, as long
10649 // as they don't access arguments on the stack.
10650 // FIXME: Figure out some way to analyze functions defined in other modules.
10651 // We should be able to compute the memory usage based on the IR calling
10652 // convention, even if we can't see the definition.
10653 if (MI.isCall()) {
10654 // Get the function associated with the call. Look at each operand and find
10655 // the one that represents the callee and get its name.
10656 const Function *Callee = nullptr;
10657 for (const MachineOperand &MOP : MI.operands()) {
10658 if (MOP.isGlobal()) {
10659 Callee = dyn_cast<Function>(Val: MOP.getGlobal());
10660 break;
10661 }
10662 }
10663
10664 // Never outline calls to mcount. There isn't any rule that would require
10665 // this, but the Linux kernel's "ftrace" feature depends on it.
10666 if (Callee && Callee->getName() == "\01_mcount")
10667 return outliner::InstrType::Illegal;
10668
10669 // If we don't know anything about the callee, assume it depends on the
10670 // stack layout of the caller. In that case, it's only legal to outline
10671 // as a tail-call. Explicitly list the call instructions we know about so we
10672 // don't get unexpected results with call pseudo-instructions.
10673 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
10674 if (MI.getOpcode() == AArch64::BLR ||
10675 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
10676 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
10677
10678 if (!Callee)
10679 return UnknownCallOutlineType;
10680
10681 // We have a function we have information about. Check if it's something we
10682 // can safely outline.
10683 MachineFunction *CalleeMF = MMI.getMachineFunction(F: *Callee);
10684
10685 // We don't know what's going on with the callee at all. Don't touch it.
10686 if (!CalleeMF)
10687 return UnknownCallOutlineType;
10688
10689 // Check if we know anything about the callee saves on the function. If we
10690 // don't, then don't touch it, since that implies that we haven't
10691 // computed anything about its stack frame yet.
10692 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
10693 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
10694 MFI.getNumObjects() > 0)
10695 return UnknownCallOutlineType;
10696
10697 // At this point, we can say that CalleeMF ought to not pass anything on the
10698 // stack. Therefore, we can outline it.
10699 return outliner::InstrType::Legal;
10700 }
10701
10702 // Don't touch the link register or W30.
10703 if (MI.readsRegister(Reg: AArch64::W30, TRI: &getRegisterInfo()) ||
10704 MI.modifiesRegister(Reg: AArch64::W30, TRI: &getRegisterInfo()))
10705 return outliner::InstrType::Illegal;
10706
10707 // Don't outline BTI instructions, because that will prevent the outlining
10708 // site from being indirectly callable.
10709 if (hasBTISemantics(MI))
10710 return outliner::InstrType::Illegal;
10711
10712 return outliner::InstrType::Legal;
10713}
10714
10715void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
10716 for (MachineInstr &MI : MBB) {
10717 const MachineOperand *Base;
10718 TypeSize Width(0, false);
10719 int64_t Offset;
10720 bool OffsetIsScalable;
10721
10722 // Is this a load or store with an immediate offset with SP as the base?
10723 if (!MI.mayLoadOrStore() ||
10724 !getMemOperandWithOffsetWidth(LdSt: MI, BaseOp&: Base, Offset, OffsetIsScalable, Width,
10725 TRI: &RI) ||
10726 (Base->isReg() && Base->getReg() != AArch64::SP))
10727 continue;
10728
10729 // It is, so we have to fix it up.
10730 TypeSize Scale(0U, false);
10731 int64_t Dummy1, Dummy2;
10732
10733 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(LdSt&: MI);
10734 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
10735 getMemOpInfo(Opcode: MI.getOpcode(), Scale, Width, MinOffset&: Dummy1, MaxOffset&: Dummy2);
10736 assert(Scale != 0 && "Unexpected opcode!");
10737 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
10738
10739 // We've pushed the return address to the stack, so add 16 to the offset.
10740 // This is safe, since we already checked if it would overflow when we
10741 // checked if this instruction was legal to outline.
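// For example, an 8-byte scaled load such as `ldr x0, [sp, #8]` has
// Offset == 8 and Scale == 8, so the new immediate below becomes
// (8 + 16) / 8 == 3, i.e. the access is rewritten to `ldr x0, [sp, #24]`.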
10742 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
10743 StackOffsetOperand.setImm(NewImm);
10744 }
10745}
10746
10747static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
10748 const AArch64InstrInfo *TII,
10749 bool ShouldSignReturnAddr) {
10750 if (!ShouldSignReturnAddr)
10751 return;
10752
10753 BuildMI(BB&: MBB, I: MBB.begin(), MIMD: DebugLoc(), MCID: TII->get(Opcode: AArch64::PAUTH_PROLOGUE))
10754 .setMIFlag(MachineInstr::FrameSetup);
10755 BuildMI(BB&: MBB, I: MBB.getFirstInstrTerminator(), MIMD: DebugLoc(),
10756 MCID: TII->get(Opcode: AArch64::PAUTH_EPILOGUE))
10757 .setMIFlag(MachineInstr::FrameDestroy);
10758}
10759
10760void AArch64InstrInfo::buildOutlinedFrame(
10761 MachineBasicBlock &MBB, MachineFunction &MF,
10762 const outliner::OutlinedFunction &OF) const {
10763
10764 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
10765
10766 if (OF.FrameConstructionID == MachineOutlinerTailCall)
10767 FI->setOutliningStyle("Tail Call");
10768 else if (OF.FrameConstructionID == MachineOutlinerThunk) {
10769 // For thunk outlining, rewrite the last instruction from a call to a
10770 // tail-call.
10771 MachineInstr *Call = &*--MBB.instr_end();
10772 unsigned TailOpcode;
10773 if (Call->getOpcode() == AArch64::BL) {
10774 TailOpcode = AArch64::TCRETURNdi;
10775 } else {
10776 assert(Call->getOpcode() == AArch64::BLR ||
10777 Call->getOpcode() == AArch64::BLRNoIP);
10778 TailOpcode = AArch64::TCRETURNriALL;
10779 }
10780 MachineInstr *TC = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: TailOpcode))
10781 .add(MO: Call->getOperand(i: 0))
10782 .addImm(Val: 0);
10783 MBB.insert(I: MBB.end(), MI: TC);
10784 Call->eraseFromParent();
10785
10786 FI->setOutliningStyle("Thunk");
10787 }
10788
10789 bool IsLeafFunction = true;
10790
10791 // Is there a call in the outlined range?
10792 auto IsNonTailCall = [](const MachineInstr &MI) {
10793 return MI.isCall() && !MI.isReturn();
10794 };
10795
10796 if (llvm::any_of(Range: MBB.instrs(), P: IsNonTailCall)) {
10797 // Fix up the instructions in the range, since we're going to modify the
10798 // stack.
10799
10800 // Bugzilla ID: 46767
10801 // TODO: Check if fixing up twice is safe so we can outline these.
10802 assert(OF.FrameConstructionID != MachineOutlinerDefault &&
10803 "Can only fix up stack references once");
10804 fixupPostOutline(MBB);
10805
10806 IsLeafFunction = false;
10807
10808 // LR has to be a live in so that we can save it.
10809 if (!MBB.isLiveIn(Reg: AArch64::LR))
10810 MBB.addLiveIn(PhysReg: AArch64::LR);
10811
10812 MachineBasicBlock::iterator It = MBB.begin();
10813 MachineBasicBlock::iterator Et = MBB.end();
10814
10815 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
10816 OF.FrameConstructionID == MachineOutlinerThunk)
10817 Et = std::prev(x: MBB.end());
10818
10819 // Insert a save before the outlined region
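// The save below is the pre-indexed store `str x30, [sp, #-16]!`, which
// both spills LR and allocates 16 bytes of stack in one instruction.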
10820 MachineInstr *STRXpre = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::STRXpre))
10821 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
10822 .addReg(RegNo: AArch64::LR)
10823 .addReg(RegNo: AArch64::SP)
10824 .addImm(Val: -16);
10825 It = MBB.insert(I: It, MI: STRXpre);
10826
10827 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
10828 CFIInstBuilder CFIBuilder(MBB, It, MachineInstr::FrameSetup);
10829
10830 // Add a CFI saying the stack was moved 16 B down.
10831 CFIBuilder.buildDefCFAOffset(Offset: 16);
10832
10833 // Add a CFI saying that the LR that we want to find is now 16 B higher
10834 // than before.
10835 CFIBuilder.buildOffset(Reg: AArch64::LR, Offset: -16);
10836 }
10837
10838 // Insert a restore before the terminator for the function.
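// This is the post-indexed load `ldr x30, [sp], #16`, which reloads LR and
// deallocates the 16 bytes from the save above.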
10839 MachineInstr *LDRXpost = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::LDRXpost))
10840 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
10841 .addReg(RegNo: AArch64::LR, Flags: RegState::Define)
10842 .addReg(RegNo: AArch64::SP)
10843 .addImm(Val: 16);
10844 Et = MBB.insert(I: Et, MI: LDRXpost);
10845 }
10846
10847 auto RASignCondition = FI->getSignReturnAddressCondition();
10848 bool ShouldSignReturnAddr = AArch64FunctionInfo::shouldSignReturnAddress(
10849 Condition: RASignCondition, IsLRSpilled: !IsLeafFunction);
10850
10851 // If this is a tail call outlined function, then there's already a return.
10852 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
10853 OF.FrameConstructionID == MachineOutlinerThunk) {
10854 signOutlinedFunction(MF, MBB, TII: this, ShouldSignReturnAddr);
10855 return;
10856 }
10857
10858 // It's not a tail call, so we have to insert the return ourselves.
10859
10860 // LR has to be a live in so that we can return to it.
10861 if (!MBB.isLiveIn(Reg: AArch64::LR))
10862 MBB.addLiveIn(PhysReg: AArch64::LR);
10863
10864 MachineInstr *ret = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::RET))
10865 .addReg(RegNo: AArch64::LR);
10866 MBB.insert(I: MBB.end(), MI: ret);
10867
10868 signOutlinedFunction(MF, MBB, TII: this, ShouldSignReturnAddr);
10869
10870 FI->setOutliningStyle("Function");
10871
10872 // Did we have to modify the stack by saving the link register?
10873 if (OF.FrameConstructionID != MachineOutlinerDefault)
10874 return;
10875
10876 // We modified the stack.
10877 // Walk over the basic block and fix up all the stack accesses.
10878 fixupPostOutline(MBB);
10879}
10880
10881MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
10882 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
10883 MachineFunction &MF, outliner::Candidate &C) const {
10884
10885 // Are we tail calling?
10886 if (C.CallConstructionID == MachineOutlinerTailCall) {
10887 // If yes, then we can just branch to the label.
10888 It = MBB.insert(I: It, MI: BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::TCRETURNdi))
10889 .addGlobalAddress(GV: M.getNamedValue(Name: MF.getName()))
10890 .addImm(Val: 0));
10891 return It;
10892 }
10893
10894 // Are we saving the link register?
10895 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
10896 C.CallConstructionID == MachineOutlinerThunk) {
10897 // No, so just insert the call.
10898 It = MBB.insert(I: It, MI: BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::BL))
10899 .addGlobalAddress(GV: M.getNamedValue(Name: MF.getName())));
10900 return It;
10901 }
10902
10903 // We want to return the spot where we inserted the call.
10904 MachineBasicBlock::iterator CallPt;
10905
10906 // Instructions for saving and restoring LR around the call instruction we're
10907 // going to insert.
10908 MachineInstr *Save;
10909 MachineInstr *Restore;
10910 // Can we save to a register?
10911 if (C.CallConstructionID == MachineOutlinerRegSave) {
10912 // FIXME: This logic should be sunk into a target-specific interface so that
10913 // we don't have to recompute the register.
10914 Register Reg = findRegisterToSaveLRTo(C);
10915 assert(Reg && "No callee-saved register available?");
10916
10917 // LR has to be a live in so that we can save it.
10918 if (!MBB.isLiveIn(Reg: AArch64::LR))
10919 MBB.addLiveIn(PhysReg: AArch64::LR);
10920
10921 // Save and restore LR from Reg.
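// `ORRXrs Reg, XZR, LR, #0` is the canonical `mov Reg, x30` alias; the
// second ORR restores LR from Reg once the call has returned.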
10922 Save = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::ORRXrs), DestReg: Reg)
10923 .addReg(RegNo: AArch64::XZR)
10924 .addReg(RegNo: AArch64::LR)
10925 .addImm(Val: 0);
10926 Restore = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::ORRXrs), DestReg: AArch64::LR)
10927 .addReg(RegNo: AArch64::XZR)
10928 .addReg(RegNo: Reg)
10929 .addImm(Val: 0);
10930 } else {
10931 // We have the default case. Save and restore from SP.
10932 Save = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::STRXpre))
10933 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
10934 .addReg(RegNo: AArch64::LR)
10935 .addReg(RegNo: AArch64::SP)
10936 .addImm(Val: -16);
10937 Restore = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::LDRXpost))
10938 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
10939 .addReg(RegNo: AArch64::LR, Flags: RegState::Define)
10940 .addReg(RegNo: AArch64::SP)
10941 .addImm(Val: 16);
10942 }
10943
10944 It = MBB.insert(I: It, MI: Save);
10945 It++;
10946
10947 // Insert the call.
10948 It = MBB.insert(I: It, MI: BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::BL))
10949 .addGlobalAddress(GV: M.getNamedValue(Name: MF.getName())));
10950 CallPt = It;
10951 It++;
10952
10953 It = MBB.insert(I: It, MI: Restore);
10954 return CallPt;
10955}
10956
10957bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
10958 MachineFunction &MF) const {
10959 return MF.getFunction().hasMinSize();
10960}
10961
10962void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
10963 MachineBasicBlock::iterator Iter,
10964 DebugLoc &DL,
10965 bool AllowSideEffects) const {
10966 const MachineFunction &MF = *MBB.getParent();
10967 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
10968 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
10969
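// A rough sketch of the idioms chosen below: `mov xN, #0` for GPRs,
// `mov zN.d, #0` when SVE or streaming SVE is available, `movi vN.2d, #0`
// with full NEON, and otherwise an FP zeroing move of the 64-bit
// sub-register for streaming-compatible code without SVE.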
10970 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
10971 BuildMI(BB&: MBB, I: Iter, MIMD: DL, MCID: get(Opcode: AArch64::MOVZXi), DestReg: Reg).addImm(Val: 0).addImm(Val: 0);
10972 } else if (STI.isSVEorStreamingSVEAvailable()) {
10973 BuildMI(BB&: MBB, I: Iter, MIMD: DL, MCID: get(Opcode: AArch64::DUP_ZI_D), DestReg: Reg)
10974 .addImm(Val: 0)
10975 .addImm(Val: 0);
10976 } else if (STI.isNeonAvailable()) {
10977 BuildMI(BB&: MBB, I: Iter, MIMD: DL, MCID: get(Opcode: AArch64::MOVIv2d_ns), DestReg: Reg)
10978 .addImm(Val: 0);
10979 } else {
10980 // This is a streaming-compatible function without SVE. We don't have full
10981 // Neon (just FPRs), so we can at most use the first 64-bit sub-register.
10982 // Since `movi v..` would be illegal here, use `fmov d..` instead.
10983 assert(STI.hasNEON() && "Expected to have NEON.");
10984 Register Reg64 = TRI.getSubReg(Reg, Idx: AArch64::dsub);
10985 BuildMI(BB&: MBB, I: Iter, MIMD: DL, MCID: get(Opcode: AArch64::FMOVD0), DestReg: Reg64);
10986 }
10987}
10988
10989std::optional<DestSourcePair>
10990AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
10991
10992 // AArch64::ORRWrs and AArch64::ORRXrs with a WZR/XZR source register and a
10993 // zero shift immediate are used as aliases for the mov instruction.
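// For example, `orr x0, xzr, x1` (ORRXrs with a zero shift) is `mov x0, x1`
// and is reported here as a copy with destination x0 and source x1.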
10994 if (((MI.getOpcode() == AArch64::ORRWrs &&
10995 MI.getOperand(i: 1).getReg() == AArch64::WZR &&
10996 MI.getOperand(i: 3).getImm() == 0x0) ||
10997 (MI.getOpcode() == AArch64::ORRWrr &&
10998 MI.getOperand(i: 1).getReg() == AArch64::WZR)) &&
10999 // Check that the w->w move is not a zero-extending w->x mov.
11000 (!MI.getOperand(i: 0).getReg().isVirtual() ||
11001 MI.getOperand(i: 0).getSubReg() == 0) &&
11002 (!MI.getOperand(i: 0).getReg().isPhysical() ||
11003 MI.findRegisterDefOperandIdx(Reg: getXRegFromWReg(Reg: MI.getOperand(i: 0).getReg()),
11004 /*TRI=*/nullptr) == -1))
11005 return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 2)};
11006
11007 if (MI.getOpcode() == AArch64::ORRXrs &&
11008 MI.getOperand(i: 1).getReg() == AArch64::XZR &&
11009 MI.getOperand(i: 3).getImm() == 0x0)
11010 return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 2)};
11011
11012 return std::nullopt;
11013}
11014
11015std::optional<DestSourcePair>
11016AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {
11017 if ((MI.getOpcode() == AArch64::ORRWrs &&
11018 MI.getOperand(i: 1).getReg() == AArch64::WZR &&
11019 MI.getOperand(i: 3).getImm() == 0x0) ||
11020 (MI.getOpcode() == AArch64::ORRWrr &&
11021 MI.getOperand(i: 1).getReg() == AArch64::WZR))
11022 return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 2)};
11023 return std::nullopt;
11024}
11025
11026std::optional<RegImmPair>
11027AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
11028 int Sign = 1;
11029 int64_t Offset = 0;
11030
11031 // TODO: Handle cases where Reg is a super- or sub-register of the
11032 // destination register.
11033 const MachineOperand &Op0 = MI.getOperand(i: 0);
11034 if (!Op0.isReg() || Reg != Op0.getReg())
11035 return std::nullopt;
11036
11037 switch (MI.getOpcode()) {
11038 default:
11039 return std::nullopt;
11040 case AArch64::SUBWri:
11041 case AArch64::SUBXri:
11042 case AArch64::SUBSWri:
11043 case AArch64::SUBSXri:
11044 Sign *= -1;
11045 [[fallthrough]];
11046 case AArch64::ADDSWri:
11047 case AArch64::ADDSXri:
11048 case AArch64::ADDWri:
11049 case AArch64::ADDXri: {
11050 // TODO: The third operand can be a global address (usually some string).
11051 if (!MI.getOperand(i: 0).isReg() || !MI.getOperand(i: 1).isReg() ||
11052 !MI.getOperand(i: 2).isImm())
11053 return std::nullopt;
11054 int Shift = MI.getOperand(i: 3).getImm();
11055 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
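// For example, `SUBXri x0, x1, #4, lsl #12` gives Sign == -1 and
// Shift == 12, so Offset below is -(4 << 12) == -16384 and the function
// returns {x1, -16384}.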
11056 Offset = Sign * (MI.getOperand(i: 2).getImm() << Shift);
11057 }
11058 }
11059 return RegImmPair{MI.getOperand(i: 1).getReg(), Offset};
11060}
11061
11062/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
11063/// the destination register then, if possible, describe the value in terms of
11064/// the source register.
11065static std::optional<ParamLoadedValue>
11066describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
11067 const TargetInstrInfo *TII,
11068 const TargetRegisterInfo *TRI) {
11069 auto DestSrc = TII->isCopyLikeInstr(MI);
11070 if (!DestSrc)
11071 return std::nullopt;
11072
11073 Register DestReg = DestSrc->Destination->getReg();
11074 Register SrcReg = DestSrc->Source->getReg();
11075
11076 if (!DestReg.isValid() || !SrcReg.isValid())
11077 return std::nullopt;
11078
11079 auto Expr = DIExpression::get(Context&: MI.getMF()->getFunction().getContext(), Elements: {});
11080
11081 // If the described register is the destination, just return the source.
11082 if (DestReg == DescribedReg)
11083 return ParamLoadedValue(MachineOperand::CreateReg(Reg: SrcReg, isDef: false), Expr);
11084
11085 // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
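// For example, if MI is `orr w0, wzr, w1` and we are asked to describe x0,
// the upper 32 bits are known to be zero, so the value can be given as w1.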
11086 if (MI.getOpcode() == AArch64::ORRWrs &&
11087 TRI->isSuperRegister(RegA: DestReg, RegB: DescribedReg))
11088 return ParamLoadedValue(MachineOperand::CreateReg(Reg: SrcReg, isDef: false), Expr);
11089
11090 // We may need to describe the lower part of an ORRXrs move.
11091 if (MI.getOpcode() == AArch64::ORRXrs &&
11092 TRI->isSubRegister(RegA: DestReg, RegB: DescribedReg)) {
11093 Register SrcSubReg = TRI->getSubReg(Reg: SrcReg, Idx: AArch64::sub_32);
11094 return ParamLoadedValue(MachineOperand::CreateReg(Reg: SrcSubReg, isDef: false), Expr);
11095 }
11096
11097 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
11098 "Unhandled ORR[XW]rs copy case");
11099
11100 return std::nullopt;
11101}
11102
11103bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
11104 // Functions cannot be split to different sections on AArch64 if they have
11105 // a red zone. This is because relaxing a cross-section branch may require
11106 // incrementing the stack pointer to spill a register, which would overwrite
11107 // the red zone.
11108 if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(u: true))
11109 return false;
11110
11111 return TargetInstrInfo::isFunctionSafeToSplit(MF);
11112}
11113
11114bool AArch64InstrInfo::isMBBSafeToSplitToCold(
11115 const MachineBasicBlock &MBB) const {
11116 // Asm Goto blocks can contain conditional branches to goto labels, which can
11117 // get moved out of range of the branch instruction.
11118 auto isAsmGoto = [](const MachineInstr &MI) {
11119 return MI.getOpcode() == AArch64::INLINEASM_BR;
11120 };
11121 if (llvm::any_of(Range: MBB, P: isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
11122 return false;
11123
11124 // Because jump tables are label-relative instead of table-relative, they all
11125 // must be in the same section or relocation fixup handling will fail.
11126
11127 // Check if MBB is a jump table target
11128 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
11129 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
11130 return llvm::is_contained(Range: JTE.MBBs, Element: &MBB);
11131 };
11132 if (MJTI != nullptr && llvm::any_of(Range: MJTI->getJumpTables(), P: containsMBB))
11133 return false;
11134
11135 // Check if MBB contains a jump table lookup
11136 for (const MachineInstr &MI : MBB) {
11137 switch (MI.getOpcode()) {
11138 case TargetOpcode::G_BRJT:
11139 case AArch64::JumpTableDest32:
11140 case AArch64::JumpTableDest16:
11141 case AArch64::JumpTableDest8:
11142 return false;
11143 default:
11144 continue;
11145 }
11146 }
11147
11148 // MBB isn't a special case, so it's safe to be split to the cold section.
11149 return true;
11150}
11151
11152std::optional<ParamLoadedValue>
11153AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
11154 Register Reg) const {
11155 const MachineFunction *MF = MI.getMF();
11156 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
11157 switch (MI.getOpcode()) {
11158 case AArch64::MOVZWi:
11159 case AArch64::MOVZXi: {
11160 // MOVZWi may be used for producing zero-extended 32-bit immediates in
11161 // 64-bit parameters, so we need to consider super-registers.
11162 if (!TRI->isSuperRegisterEq(RegA: MI.getOperand(i: 0).getReg(), RegB: Reg))
11163 return std::nullopt;
11164
11165 if (!MI.getOperand(i: 1).isImm())
11166 return std::nullopt;
11167 int64_t Immediate = MI.getOperand(i: 1).getImm();
11168 int Shift = MI.getOperand(i: 2).getImm();
11169 return ParamLoadedValue(MachineOperand::CreateImm(Val: Immediate << Shift),
11170 nullptr);
11171 }
11172 case AArch64::ORRWrs:
11173 case AArch64::ORRXrs:
11174 return describeORRLoadedValue(MI, DescribedReg: Reg, TII: this, TRI);
11175 }
11176
11177 return TargetInstrInfo::describeLoadedValue(MI, Reg);
11178}
11179
11180bool AArch64InstrInfo::isExtendLikelyToBeFolded(
11181 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
11182 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
11183 ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
11184 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
11185
11186 // Anyexts are nops.
11187 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
11188 return true;
11189
11190 Register DefReg = ExtMI.getOperand(i: 0).getReg();
11191 if (!MRI.hasOneNonDBGUse(RegNo: DefReg))
11192 return false;
11193
11194 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
11195 // addressing mode.
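// E.g. a sext/zext feeding a G_PTR_ADD that addresses a load or store
// typically folds into a register-offset addressing mode such as
// `ldr x0, [x1, w2, sxtw]` (illustrative).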
11196 auto *UserMI = &*MRI.use_instr_nodbg_begin(RegNo: DefReg);
11197 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
11198}
11199
11200uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
11201 return get(Opcode: Opc).TSFlags & AArch64::ElementSizeMask;
11202}
11203
11204bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
11205 return get(Opcode: Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
11206}
11207
11208bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
11209 return get(Opcode: Opc).TSFlags & AArch64::InstrFlagIsWhile;
11210}
11211
11212unsigned int
11213AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
11214 return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
11215}
11216
11217bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
11218 unsigned Scale) const {
11219 if (Offset && Scale)
11220 return false;
11221
11222 // Check Reg + Imm
11223 if (!Scale) {
11224 // 9-bit signed offset
11225 if (isInt<9>(x: Offset))
11226 return true;
11227
11228 // 12-bit unsigned offset
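// For example, with NumBytes == 8 an offset of 32760 is accepted here
// (32760 / 8 == 4095, the largest scaled immediate), while 32768 is not.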
11229 unsigned Shift = Log2_64(Value: NumBytes);
11230 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
11231 // Must be a multiple of NumBytes (NumBytes is a power of 2)
11232 (Offset >> Shift) << Shift == Offset)
11233 return true;
11234 return false;
11235 }
11236
11237 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
11238 return Scale == 1 || (Scale > 0 && Scale == NumBytes);
11239}
11240
11241unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
11242 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
11243 return AArch64::BLRNoIP;
11244 else
11245 return AArch64::BLR;
11246}
11247
11248MachineBasicBlock::iterator
11249AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
11250 Register TargetReg, bool FrameSetup) const {
11251 assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
11252
11253 MachineBasicBlock &MBB = *MBBI->getParent();
11254 MachineFunction &MF = *MBB.getParent();
11255 const AArch64InstrInfo *TII =
11256 MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
11257 int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
11258 DebugLoc DL = MBB.findDebugLoc(MBBI);
11259
11260 MachineFunction::iterator MBBInsertPoint = std::next(x: MBB.getIterator());
11261 MachineBasicBlock *LoopTestMBB =
11262 MF.CreateMachineBasicBlock(BB: MBB.getBasicBlock());
11263 MF.insert(MBBI: MBBInsertPoint, MBB: LoopTestMBB);
11264 MachineBasicBlock *LoopBodyMBB =
11265 MF.CreateMachineBasicBlock(BB: MBB.getBasicBlock());
11266 MF.insert(MBBI: MBBInsertPoint, MBB: LoopBodyMBB);
11267 MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(BB: MBB.getBasicBlock());
11268 MF.insert(MBBI: MBBInsertPoint, MBB: ExitMBB);
11269 MachineInstr::MIFlag Flags =
11270 FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;
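// The emitted structure is roughly:
//   LoopTest: sub  sp, sp, #ProbeSize
//             cmp  sp, TargetReg
//             b.le Exit
//   LoopBody: ldr  xzr, [sp]        ; probe the newly exposed stack
//             b    LoopTest
//   Exit:     mov  sp, TargetReg
//             ldr  xzr, [sp]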
11271
11272 // LoopTest:
11273 // SUB SP, SP, #ProbeSize
11274 emitFrameOffset(MBB&: *LoopTestMBB, MBBI: LoopTestMBB->end(), DL, DestReg: AArch64::SP,
11275 SrcReg: AArch64::SP, Offset: StackOffset::getFixed(Fixed: -ProbeSize), TII, Flag: Flags);
11276
11277 // CMP SP, TargetReg
11278 BuildMI(BB&: *LoopTestMBB, I: LoopTestMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::SUBSXrx64),
11279 DestReg: AArch64::XZR)
11280 .addReg(RegNo: AArch64::SP)
11281 .addReg(RegNo: TargetReg)
11282 .addImm(Val: AArch64_AM::getArithExtendImm(ET: AArch64_AM::UXTX, Imm: 0))
11283 .setMIFlags(Flags);
11284
11285 // B.<Cond> LoopExit
11286 BuildMI(BB&: *LoopTestMBB, I: LoopTestMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::Bcc))
11287 .addImm(Val: AArch64CC::LE)
11288 .addMBB(MBB: ExitMBB)
11289 .setMIFlags(Flags);
11290
11291 // LDR XZR, [SP]
11292 BuildMI(BB&: *LoopBodyMBB, I: LoopBodyMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::LDRXui))
11293 .addDef(RegNo: AArch64::XZR)
11294 .addReg(RegNo: AArch64::SP)
11295 .addImm(Val: 0)
11296 .addMemOperand(MMO: MF.getMachineMemOperand(
11297 PtrInfo: MachinePointerInfo::getUnknownStack(MF),
11298 F: MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, Size: 8,
11299 BaseAlignment: Align(8)))
11300 .setMIFlags(Flags);
11301
11302 // B loop
11303 BuildMI(BB&: *LoopBodyMBB, I: LoopBodyMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::B))
11304 .addMBB(MBB: LoopTestMBB)
11305 .setMIFlags(Flags);
11306
11307 // LoopExit:
11308 // MOV SP, TargetReg
11309 BuildMI(BB&: *ExitMBB, I: ExitMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::ADDXri), DestReg: AArch64::SP)
11310 .addReg(RegNo: TargetReg)
11311 .addImm(Val: 0)
11312 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0))
11313 .setMIFlags(Flags);
11314
11315 // LDR XZR, [SP]
11316 BuildMI(BB&: *ExitMBB, I: ExitMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::LDRXui))
11317 .addReg(RegNo: AArch64::XZR, Flags: RegState::Define)
11318 .addReg(RegNo: AArch64::SP)
11319 .addImm(Val: 0)
11320 .setMIFlags(Flags);
11321
11322 ExitMBB->splice(Where: ExitMBB->end(), Other: &MBB, From: std::next(x: MBBI), To: MBB.end());
11323 ExitMBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
11324
11325 LoopTestMBB->addSuccessor(Succ: ExitMBB);
11326 LoopTestMBB->addSuccessor(Succ: LoopBodyMBB);
11327 LoopBodyMBB->addSuccessor(Succ: LoopTestMBB);
11328 MBB.addSuccessor(Succ: LoopTestMBB);
11329
11330 // Update liveins.
11331 if (MF.getRegInfo().reservedRegsFrozen())
11332 fullyRecomputeLiveIns(MBBs: {ExitMBB, LoopBodyMBB, LoopTestMBB});
11333
11334 return ExitMBB->begin();
11335}
11336
11337namespace {
11338class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
11339 MachineFunction *MF;
11340 const TargetInstrInfo *TII;
11341 const TargetRegisterInfo *TRI;
11342 MachineRegisterInfo &MRI;
11343
11344 /// The block of the loop
11345 MachineBasicBlock *LoopBB;
11346 /// The conditional branch of the loop
11347 MachineInstr *CondBranch;
11348 /// The compare instruction for loop control
11349 MachineInstr *Comp;
11350 /// The number of the operand of the loop counter value in Comp
11351 unsigned CompCounterOprNum;
11352 /// The instruction that updates the loop counter value
11353 MachineInstr *Update;
11354 /// The number of the operand of the loop counter value in Update
11355 unsigned UpdateCounterOprNum;
11356 /// The initial value of the loop counter
11357 Register Init;
11358 /// True iff Update is a predecessor of Comp
11359 bool IsUpdatePriorComp;
11360
11361 /// The normalized condition used by createTripCountGreaterCondition()
11362 SmallVector<MachineOperand, 4> Cond;
11363
11364public:
11365 AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
11366 MachineInstr *Comp, unsigned CompCounterOprNum,
11367 MachineInstr *Update, unsigned UpdateCounterOprNum,
11368 Register Init, bool IsUpdatePriorComp,
11369 const SmallVectorImpl<MachineOperand> &Cond)
11370 : MF(Comp->getParent()->getParent()),
11371 TII(MF->getSubtarget().getInstrInfo()),
11372 TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
11373 LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
11374 CompCounterOprNum(CompCounterOprNum), Update(Update),
11375 UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
11376 IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
11377
11378 bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
11379 // Make the instructions for loop control be placed in stage 0.
11380 // The predecessors of Comp are considered by the caller.
11381 return MI == Comp;
11382 }
11383
11384 std::optional<bool> createTripCountGreaterCondition(
11385 int TC, MachineBasicBlock &MBB,
11386 SmallVectorImpl<MachineOperand> &CondParam) override {
11387 // A branch instruction will be inserted as "if (Cond) goto epilogue".
11388 // Cond is normalized for such use.
11389 // The predecessors of the branch are assumed to have already been inserted.
11390 CondParam = Cond;
11391 return {};
11392 }
11393
11394 void createRemainingIterationsGreaterCondition(
11395 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
11396 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
11397
11398 void setPreheader(MachineBasicBlock *NewPreheader) override {}
11399
11400 void adjustTripCount(int TripCountAdjust) override {}
11401
11402 bool isMVEExpanderSupported() override { return true; }
11403};
11404} // namespace
11405
11406 /// Clone an instruction from MI. The register of the ReplaceOprNum-th operand
11407 /// is replaced by ReplaceReg. The output register is newly created.
11408 /// The other operands are unchanged from MI.
11409static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
11410 Register ReplaceReg, MachineBasicBlock &MBB,
11411 MachineBasicBlock::iterator InsertTo) {
11412 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
11413 const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
11414 MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(Orig: MI);
11415 Register Result = 0;
11416 for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
11417 if (I == 0 && NewMI->getOperand(i: 0).getReg().isVirtual()) {
11418 Result = MRI.createVirtualRegister(
11419 RegClass: MRI.getRegClass(Reg: NewMI->getOperand(i: 0).getReg()));
11420 NewMI->getOperand(i: I).setReg(Result);
11421 } else if (I == ReplaceOprNum) {
11422 MRI.constrainRegClass(Reg: ReplaceReg, RC: TII->getRegClass(MCID: NewMI->getDesc(), OpNum: I));
11423 NewMI->getOperand(i: I).setReg(ReplaceReg);
11424 }
11425 }
11426 MBB.insert(I: InsertTo, MI: NewMI);
11427 return Result;
11428}
11429
11430void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
11431 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
11432 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {
11433 // Create and accumulate conditions for next TC iterations.
11434 // Example:
11435 // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
11436 // # iteration of the kernel
11437 //
11438 // # insert the following instructions
11439 // cond = CSINCXr 0, 0, C, implicit $nzcv
11440 // counter = ADDXri counter, 1 # clone from this->Update
11441 // SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
11442 // cond = CSINCXr cond, cond, C, implicit $nzcv
11443 // ... (repeat TC times)
11444 // SUBSXri cond, 0, implicit-def $nzcv
11445
11446 assert(CondBranch->getOpcode() == AArch64::Bcc);
11447 // CondCode to exit the loop
11448 AArch64CC::CondCode CC =
11449 (AArch64CC::CondCode)CondBranch->getOperand(i: 0).getImm();
11450 if (CondBranch->getOperand(i: 1).getMBB() == LoopBB)
11451 CC = AArch64CC::getInvertedCondCode(Code: CC);
11452
11453 // Accumulate conditions to exit the loop
11454 Register AccCond = AArch64::XZR;
11455
11456 // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
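// This uses the fact that `csinc Xd, Xn, Xn, invert(CC)` is the
// `cinc Xd, Xn, CC` alias, i.e. a conditional increment.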
11457 auto AccumulateCond = [&](Register CurCond,
11458 AArch64CC::CondCode CC) -> Register {
11459 Register NewCond = MRI.createVirtualRegister(RegClass: &AArch64::GPR64commonRegClass);
11460 BuildMI(BB&: MBB, I: MBB.end(), MIMD: Comp->getDebugLoc(), MCID: TII->get(Opcode: AArch64::CSINCXr))
11461 .addReg(RegNo: NewCond, Flags: RegState::Define)
11462 .addReg(RegNo: CurCond)
11463 .addReg(RegNo: CurCond)
11464 .addImm(Val: AArch64CC::getInvertedCondCode(Code: CC));
11465 return NewCond;
11466 };
11467
11468 if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
11469 // Update and Comp for I==0 already exist in MBB
11470 // (MBB is an unrolled kernel).
11471 Register Counter;
11472 for (int I = 0; I <= TC; ++I) {
11473 Register NextCounter;
11474 if (I != 0)
11475 NextCounter =
11476 cloneInstr(MI: Comp, ReplaceOprNum: CompCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
11477
11478 AccCond = AccumulateCond(AccCond, CC);
11479
11480 if (I != TC) {
11481 if (I == 0) {
11482 if (Update != Comp && IsUpdatePriorComp) {
11483 Counter =
11484 LastStage0Insts[Comp]->getOperand(i: CompCounterOprNum).getReg();
11485 NextCounter = cloneInstr(MI: Update, ReplaceOprNum: UpdateCounterOprNum, ReplaceReg: Counter, MBB,
11486 InsertTo: MBB.end());
11487 } else {
11488 // We can use the already-calculated value.
11489 NextCounter = LastStage0Insts[Update]->getOperand(i: 0).getReg();
11490 }
11491 } else if (Update != Comp) {
11492 NextCounter =
11493 cloneInstr(MI: Update, ReplaceOprNum: UpdateCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
11494 }
11495 }
11496 Counter = NextCounter;
11497 }
11498 } else {
11499 Register Counter;
11500 if (LastStage0Insts.empty()) {
11501 // Use the initial counter value (testing whether the trip count is
11502 // sufficient for the pipelined code to be executed).
11503 Counter = Init;
11504 if (IsUpdatePriorComp)
11505 Counter =
11506 cloneInstr(MI: Update, ReplaceOprNum: UpdateCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
11507 } else {
11508 // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
11509 Counter = LastStage0Insts[Comp]->getOperand(i: CompCounterOprNum).getReg();
11510 }
11511
11512 for (int I = 0; I <= TC; ++I) {
11513 Register NextCounter;
11514 NextCounter =
11515 cloneInstr(MI: Comp, ReplaceOprNum: CompCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
11516 AccCond = AccumulateCond(AccCond, CC);
11517 if (I != TC && Update != Comp)
11518 NextCounter =
11519 cloneInstr(MI: Update, ReplaceOprNum: UpdateCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
11520 Counter = NextCounter;
11521 }
11522 }
11523
11524 // If AccCond == 0, the remainder is greater than TC.
11525 BuildMI(BB&: MBB, I: MBB.end(), MIMD: Comp->getDebugLoc(), MCID: TII->get(Opcode: AArch64::SUBSXri))
11526 .addReg(RegNo: AArch64::XZR, Flags: RegState::Define | RegState::Dead)
11527 .addReg(RegNo: AccCond)
11528 .addImm(Val: 0)
11529 .addImm(Val: 0);
11530 Cond.clear();
11531 Cond.push_back(Elt: MachineOperand::CreateImm(Val: AArch64CC::EQ));
11532}
11533
11534static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
11535 Register &RegMBB, Register &RegOther) {
11536 assert(Phi.getNumOperands() == 5);
11537 if (Phi.getOperand(i: 2).getMBB() == MBB) {
11538 RegMBB = Phi.getOperand(i: 1).getReg();
11539 RegOther = Phi.getOperand(i: 3).getReg();
11540 } else {
11541 assert(Phi.getOperand(4).getMBB() == MBB);
11542 RegMBB = Phi.getOperand(i: 3).getReg();
11543 RegOther = Phi.getOperand(i: 1).getReg();
11544 }
11545}
11546
11547static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {
11548 if (!Reg.isVirtual())
11549 return false;
11550 const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11551 return MRI.getVRegDef(Reg)->getParent() != BB;
11552}
11553
11554 /// If Reg is an induction variable, return true and set the output parameters.
11555static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
11556 MachineInstr *&UpdateInst,
11557 unsigned &UpdateCounterOprNum, Register &InitReg,
11558 bool &IsUpdatePriorComp) {
11559 // Example:
11560 //
11561 // Preheader:
11562 // InitReg = ...
11563 // LoopBB:
11564 // Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
11565 // Reg = COPY Reg0 ; COPY is ignored.
11566 // Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
11567 // ; Reg is the value calculated in the previous
11568 // ; iteration, so IsUpdatePriorComp == false.
11569
11570 if (LoopBB->pred_size() != 2)
11571 return false;
11572 if (!Reg.isVirtual())
11573 return false;
11574 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
11575 UpdateInst = nullptr;
11576 UpdateCounterOprNum = 0;
11577 InitReg = 0;
11578 IsUpdatePriorComp = true;
11579 Register CurReg = Reg;
11580 while (true) {
11581 MachineInstr *Def = MRI.getVRegDef(Reg: CurReg);
11582 if (Def->getParent() != LoopBB)
11583 return false;
11584 if (Def->isCopy()) {
11585 // Ignore copy instructions unless they contain subregisters
11586 if (Def->getOperand(i: 0).getSubReg() || Def->getOperand(i: 1).getSubReg())
11587 return false;
11588 CurReg = Def->getOperand(i: 1).getReg();
11589 } else if (Def->isPHI()) {
11590 if (InitReg != 0)
11591 return false;
11592 if (!UpdateInst)
11593 IsUpdatePriorComp = false;
11594 extractPhiReg(Phi: *Def, MBB: LoopBB, RegMBB&: CurReg, RegOther&: InitReg);
11595 } else {
11596 if (UpdateInst)
11597 return false;
11598 switch (Def->getOpcode()) {
11599 case AArch64::ADDSXri:
11600 case AArch64::ADDSWri:
11601 case AArch64::SUBSXri:
11602 case AArch64::SUBSWri:
11603 case AArch64::ADDXri:
11604 case AArch64::ADDWri:
11605 case AArch64::SUBXri:
11606 case AArch64::SUBWri:
11607 UpdateInst = Def;
11608 UpdateCounterOprNum = 1;
11609 break;
11610 case AArch64::ADDSXrr:
11611 case AArch64::ADDSWrr:
11612 case AArch64::SUBSXrr:
11613 case AArch64::SUBSWrr:
11614 case AArch64::ADDXrr:
11615 case AArch64::ADDWrr:
11616 case AArch64::SUBXrr:
11617 case AArch64::SUBWrr:
11618 UpdateInst = Def;
11619 if (isDefinedOutside(Reg: Def->getOperand(i: 2).getReg(), BB: LoopBB))
11620 UpdateCounterOprNum = 1;
11621 else if (isDefinedOutside(Reg: Def->getOperand(i: 1).getReg(), BB: LoopBB))
11622 UpdateCounterOprNum = 2;
11623 else
11624 return false;
11625 break;
11626 default:
11627 return false;
11628 }
11629 CurReg = Def->getOperand(i: UpdateCounterOprNum).getReg();
11630 }
11631
11632 if (!CurReg.isVirtual())
11633 return false;
11634 if (Reg == CurReg)
11635 break;
11636 }
11637
11638 if (!UpdateInst)
11639 return false;
11640
11641 return true;
11642}
11643
11644std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
11645AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
11646 // Accept loops that meet the following conditions
11647 // * The conditional branch is BCC
11648 // * The compare instruction is ADDS/SUBS/WHILEXX
11649 // * One operand of the compare is an induction variable and the other is a
11650 // loop invariant value
11651 // * The induction variable is incremented/decremented by a single instruction
11652 // * Does not contain CALL or instructions which have unmodeled side effects
11653
11654 for (MachineInstr &MI : *LoopBB)
11655 if (MI.isCall() || MI.hasUnmodeledSideEffects())
11656 // This instruction may use NZCV, which interferes with the instruction to
11657 // be inserted for loop control.
11658 return nullptr;
11659
11660 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
11661 SmallVector<MachineOperand, 4> Cond;
11662 if (analyzeBranch(MBB&: *LoopBB, TBB, FBB, Cond))
11663 return nullptr;
11664
11665 // Infinite loops are not supported
11666 if (TBB == LoopBB && FBB == LoopBB)
11667 return nullptr;
11668
11669 // Must be conditional branch
11670 if (TBB != LoopBB && FBB == nullptr)
11671 return nullptr;
11672
11673 assert((TBB == LoopBB || FBB == LoopBB) &&
11674 "The Loop must be a single-basic-block loop");
11675
11676 MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
11677 const TargetRegisterInfo &TRI = getRegisterInfo();
11678
11679 if (CondBranch->getOpcode() != AArch64::Bcc)
11680 return nullptr;
11681
11682 // Normalization for createTripCountGreaterCondition()
11683 if (TBB == LoopBB)
11684 reverseBranchCondition(Cond);
11685
11686 MachineInstr *Comp = nullptr;
11687 unsigned CompCounterOprNum = 0;
11688 for (MachineInstr &MI : reverse(C&: *LoopBB)) {
11689 if (MI.modifiesRegister(Reg: AArch64::NZCV, TRI: &TRI)) {
11690 // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
11691 // operands is a loop-invariant value.
11692
11693 switch (MI.getOpcode()) {
11694 case AArch64::SUBSXri:
11695 case AArch64::SUBSWri:
11696 case AArch64::ADDSXri:
11697 case AArch64::ADDSWri:
11698 Comp = &MI;
11699 CompCounterOprNum = 1;
11700 break;
11701 case AArch64::ADDSWrr:
11702 case AArch64::ADDSXrr:
11703 case AArch64::SUBSWrr:
11704 case AArch64::SUBSXrr:
11705 Comp = &MI;
11706 break;
11707 default:
11708 if (isWhileOpcode(Opc: MI.getOpcode())) {
11709 Comp = &MI;
11710 break;
11711 }
11712 return nullptr;
11713 }
11714
11715 if (CompCounterOprNum == 0) {
11716 if (isDefinedOutside(Reg: Comp->getOperand(i: 1).getReg(), BB: LoopBB))
11717 CompCounterOprNum = 2;
11718 else if (isDefinedOutside(Reg: Comp->getOperand(i: 2).getReg(), BB: LoopBB))
11719 CompCounterOprNum = 1;
11720 else
11721 return nullptr;
11722 }
11723 break;
11724 }
11725 }
11726 if (!Comp)
11727 return nullptr;
11728
11729 MachineInstr *Update = nullptr;
11730 Register Init;
11731 bool IsUpdatePriorComp;
11732 unsigned UpdateCounterOprNum;
11733 if (!getIndVarInfo(Reg: Comp->getOperand(i: CompCounterOprNum).getReg(), LoopBB,
11734 UpdateInst&: Update, UpdateCounterOprNum, InitReg&: Init, IsUpdatePriorComp))
11735 return nullptr;
11736
11737 return std::make_unique<AArch64PipelinerLoopInfo>(
11738 args&: LoopBB, args&: CondBranch, args&: Comp, args&: CompCounterOprNum, args&: Update, args&: UpdateCounterOprNum,
11739 args&: Init, args&: IsUpdatePriorComp, args&: Cond);
11740}
11741
11742 /// verifyInstruction - Perform target-specific instruction verification.
11743bool AArch64InstrInfo::verifyInstruction(const MachineInstr &MI,
11744 StringRef &ErrInfo) const {
11745 // Verify that immediate offsets on load/store instructions are within range.
11746 // Stack objects with an FI operand are excluded as they can be fixed up
11747 // during PEI.
11748 TypeSize Scale(0U, false), Width(0U, false);
11749 int64_t MinOffset, MaxOffset;
11750 if (getMemOpInfo(Opcode: MI.getOpcode(), Scale, Width, MinOffset, MaxOffset)) {
11751 unsigned ImmIdx = getLoadStoreImmIdx(Opc: MI.getOpcode());
11752 if (MI.getOperand(i: ImmIdx).isImm() && !MI.getOperand(i: ImmIdx - 1).isFI()) {
11753 int64_t Imm = MI.getOperand(i: ImmIdx).getImm();
11754 if (Imm < MinOffset || Imm > MaxOffset) {
11755 ErrInfo = "Unexpected immediate on load/store instruction";
11756 return false;
11757 }
11758 }
11759 }
11760
11761 const MCInstrDesc &MCID = MI.getDesc();
11762 for (unsigned Op = 0; Op < MCID.getNumOperands(); Op++) {
11763 const MachineOperand &MO = MI.getOperand(i: Op);
11764 switch (MCID.operands()[Op].OperandType) {
11765 case AArch64::OPERAND_IMPLICIT_IMM_0:
11766 if (!MO.isImm() || MO.getImm() != 0) {
11767 ErrInfo = "OPERAND_IMPLICIT_IMM_0 should be 0";
11768 return false;
11769 }
11770 break;
11771 case AArch64::OPERAND_SHIFT_MSL:
11772 if (!MO.isImm() ||
11773 AArch64_AM::getShiftType(Imm: MO.getImm()) != AArch64_AM::MSL ||
11774 (AArch64_AM::getShiftValue(Imm: MO.getImm()) != 8 &&
11775 AArch64_AM::getShiftValue(Imm: MO.getImm()) != 16)) {
11776 ErrInfo = "OPERAND_SHIFT_MSL should be msl shift of 8 or 16";
11777 return false;
11778 }
11779 break;
11780 default:
11781 break;
11782 }
11783 }
11784 return true;
11785}
11786
11787#define GET_INSTRINFO_HELPERS
11788#define GET_INSTRMAP_INFO
11789#include "AArch64GenInstrInfo.inc"
11790