1//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the AArch64 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64InstrInfo.h"
14#include "AArch64ExpandImm.h"
15#include "AArch64MachineFunctionInfo.h"
16#include "AArch64PointerAuth.h"
17#include "AArch64Subtarget.h"
18#include "MCTargetDesc/AArch64AddressingModes.h"
19#include "MCTargetDesc/AArch64MCTargetDesc.h"
20#include "Utils/AArch64BaseInfo.h"
21#include "llvm/ADT/ArrayRef.h"
22#include "llvm/ADT/STLExtras.h"
23#include "llvm/ADT/SmallSet.h"
24#include "llvm/ADT/SmallVector.h"
25#include "llvm/ADT/Statistic.h"
26#include "llvm/Analysis/AliasAnalysis.h"
27#include "llvm/CodeGen/CFIInstBuilder.h"
28#include "llvm/CodeGen/LivePhysRegs.h"
29#include "llvm/CodeGen/MachineBasicBlock.h"
30#include "llvm/CodeGen/MachineCombinerPattern.h"
31#include "llvm/CodeGen/MachineFrameInfo.h"
32#include "llvm/CodeGen/MachineFunction.h"
33#include "llvm/CodeGen/MachineInstr.h"
34#include "llvm/CodeGen/MachineInstrBuilder.h"
35#include "llvm/CodeGen/MachineMemOperand.h"
36#include "llvm/CodeGen/MachineModuleInfo.h"
37#include "llvm/CodeGen/MachineOperand.h"
38#include "llvm/CodeGen/MachineRegisterInfo.h"
39#include "llvm/CodeGen/RegisterScavenging.h"
40#include "llvm/CodeGen/StackMaps.h"
41#include "llvm/CodeGen/TargetRegisterInfo.h"
42#include "llvm/CodeGen/TargetSubtargetInfo.h"
43#include "llvm/IR/DebugInfoMetadata.h"
44#include "llvm/IR/DebugLoc.h"
45#include "llvm/IR/GlobalValue.h"
46#include "llvm/IR/Module.h"
47#include "llvm/MC/MCAsmInfo.h"
48#include "llvm/MC/MCInst.h"
49#include "llvm/MC/MCInstBuilder.h"
50#include "llvm/MC/MCInstrDesc.h"
51#include "llvm/Support/Casting.h"
52#include "llvm/Support/CodeGen.h"
53#include "llvm/Support/CommandLine.h"
54#include "llvm/Support/ErrorHandling.h"
55#include "llvm/Support/LEB128.h"
56#include "llvm/Support/MathExtras.h"
57#include "llvm/Target/TargetMachine.h"
58#include "llvm/Target/TargetOptions.h"
59#include <cassert>
60#include <cstdint>
61#include <iterator>
62#include <utility>
63
64using namespace llvm;
65
66#define GET_INSTRINFO_CTOR_DTOR
67#include "AArch64GenInstrInfo.inc"
68
69#define DEBUG_TYPE "AArch64InstrInfo"
70
// Expansion statistics for canonical COPY lowering. The FPR zeroing
// counterpart (NumZCZeroingInstrsFPR) is counted at AArch64AsmPrinter.
STATISTIC(NumCopyInstrs, "Number of COPY instructions expanded");
STATISTIC(NumZCRegMoveInstrsGPR, "Number of zero-cycle GPR register move "
                                 "instructions expanded from canonical COPY");
STATISTIC(NumZCRegMoveInstrsFPR, "Number of zero-cycle FPR register move "
                                 "instructions expanded from canonical COPY");
STATISTIC(NumZCZeroingInstrsGPR, "Number of zero-cycle GPR zeroing "
                                 "instructions expanded from canonical COPY");
// NumZCZeroingInstrsFPR is counted at AArch64AsmPrinter

// Debug-only knobs that artificially restrict the displacement range assumed
// for each branch flavor; consumed by getBranchDisplacementBits() /
// isBranchOffsetInRange() below (e.g. to exercise branch relaxation).
static cl::opt<unsigned>
    CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(Val: 9),
                       cl::desc("Restrict range of CB instructions (DEBUG)"));

static cl::opt<unsigned> TBZDisplacementBits(
    "aarch64-tbz-offset-bits", cl::Hidden, cl::init(Val: 14),
    cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned> CBZDisplacementBits(
    "aarch64-cbz-offset-bits", cl::Hidden, cl::init(Val: 19),
    cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned>
    BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(Val: 19),
                        cl::desc("Restrict range of Bcc instructions (DEBUG)"));

static cl::opt<unsigned>
    BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(Val: 26),
                      cl::desc("Restrict range of B instructions (DEBUG)"));

// Window (in instructions) searched by the machine-combiner gather pattern.
static cl::opt<unsigned> GatherOptSearchLimit(
    "aarch64-search-limit", cl::Hidden, cl::init(Val: 2048),
    cl::desc("Restrict range of instructions to search for the "
             "machine-combiner gather pattern optimization"));
104
/// Constructor: wires the call-frame setup/destroy pseudo opcodes and
/// CATCHRET into the TableGen'erated base class, and initializes the
/// register info from the subtarget's triple and hardware mode.
AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
    : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN,
                          AArch64::ADJCALLSTACKUP, AArch64::CATCHRET),
      RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}
109
110/// GetInstSize - Return the number of bytes of code the specified
111/// instruction may be. This returns the maximum number of bytes.
unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  const MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction *MF = MBB.getParent();
  const Function &F = MF->getFunction();
  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();

  {
    // Inline assembly has no fixed size; conservatively estimate an upper
    // bound by parsing the asm string.
    auto Op = MI.getOpcode();
    if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
      return getInlineAsmLength(Str: MI.getOperand(i: 0).getSymbolName(), MAI: *MAI);
  }

  // Meta-instructions emit no code.
  if (MI.isMetaInstruction())
    return 0;

  // FIXME: We currently only handle pseudoinstructions that don't get expanded
  // before the assembly printer.
  unsigned NumBytes = 0;
  const MCInstrDesc &Desc = MI.getDesc();

  if (!MI.isBundle() && isTailCallReturnInst(MI)) {
    NumBytes = Desc.getSize() ? Desc.getSize() : 4;

    // If the return address is signed, the tail-call return also expands an
    // authenticated-LR checker sequence; add its size to the estimate.
    const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
    if (!MFI->shouldSignReturnAddress(MF: *MF))
      return NumBytes;

    const auto &STI = MF->getSubtarget<AArch64Subtarget>();
    auto Method = STI.getAuthenticatedLRCheckMethod(MF: *MF);
    NumBytes += AArch64PAuth::getCheckerSizeInBytes(Method);
    return NumBytes;
  }

  // Size should be preferably set in
  // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
  // Specific cases handle instructions of variable sizes
  switch (Desc.getOpcode()) {
  default:
    if (Desc.getSize())
      return Desc.getSize();

    // Anything not explicitly designated otherwise (i.e. pseudo-instructions
    // with fixed constant size but not specified in .td file) is a normal
    // 4-byte insn.
    NumBytes = 4;
    break;
  case TargetOpcode::STACKMAP:
    // The upper bound for a stackmap intrinsic is the full length of its shadow
    NumBytes = StackMapOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::PATCHPOINT:
    // The size of the patchpoint intrinsic is the number of bytes requested
    NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::STATEPOINT:
    NumBytes = StatepointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    // No patch bytes means a normal call inst is emitted
    if (NumBytes == 0)
      NumBytes = 4;
    break;
  case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
    // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
    // instructions are expanded to the specified number of NOPs. Otherwise,
    // they are expanded to 36-byte XRay sleds.
    NumBytes =
        F.getFnAttributeAsParsedInteger(Kind: "patchable-function-entry", Default: 9) * 4;
    break;
  case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
  case TargetOpcode::PATCHABLE_TAIL_CALL:
  case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
    // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
    NumBytes = 36;
    break;
  case TargetOpcode::PATCHABLE_EVENT_CALL:
    // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
    NumBytes = 24;
    break;

  case AArch64::SPACE:
    // SPACE reserves exactly the number of bytes given by its immediate.
    NumBytes = MI.getOperand(i: 1).getImm();
    break;
  case TargetOpcode::BUNDLE:
    NumBytes = getInstBundleLength(MI);
    break;
  }

  return NumBytes;
}
204
205unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
206 unsigned Size = 0;
207 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
208 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
209 while (++I != E && I->isInsideBundle()) {
210 assert(!I->isBundle() && "No nested bundle!");
211 Size += getInstSizeInBytes(MI: *I);
212 }
213 return Size;
214}
215
/// Decode a conditional branch terminator into its destination block and a
/// Cond vector. Two encodings are produced (consumed by
/// instantiateCondBranch and reverseBranchCondition):
///   - Bcc:                 Cond = { CC }
///   - folded cmp&branch:   Cond = { -1, Opcode, operands... }, where -1
///                          marks the folded form.
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
                            SmallVectorImpl<MachineOperand> &Cond) {
  // Block ends with fall-through condbranch.
  switch (LastInst->getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    Target = LastInst->getOperand(i: 1).getMBB();
    Cond.push_back(Elt: LastInst->getOperand(i: 0));
    break;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    // Cond = { -1, Opc, Reg }
    Target = LastInst->getOperand(i: 1).getMBB();
    Cond.push_back(Elt: MachineOperand::CreateImm(Val: -1));
    Cond.push_back(Elt: MachineOperand::CreateImm(Val: LastInst->getOpcode()));
    Cond.push_back(Elt: LastInst->getOperand(i: 0));
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    // Cond = { -1, Opc, Reg, BitImm }
    Target = LastInst->getOperand(i: 2).getMBB();
    Cond.push_back(Elt: MachineOperand::CreateImm(Val: -1));
    Cond.push_back(Elt: MachineOperand::CreateImm(Val: LastInst->getOpcode()));
    Cond.push_back(Elt: LastInst->getOperand(i: 0));
    Cond.push_back(Elt: LastInst->getOperand(i: 1));
    break;
  case AArch64::CBWPri:
  case AArch64::CBXPri:
  case AArch64::CBWPrr:
  case AArch64::CBXPrr:
    // Cond = { -1, Opc, CC, Op0, Op1 }
    Target = LastInst->getOperand(i: 3).getMBB();
    Cond.push_back(Elt: MachineOperand::CreateImm(Val: -1));
    Cond.push_back(Elt: MachineOperand::CreateImm(Val: LastInst->getOpcode()));
    Cond.push_back(Elt: LastInst->getOperand(i: 0));
    Cond.push_back(Elt: LastInst->getOperand(i: 1));
    Cond.push_back(Elt: LastInst->getOperand(i: 2));
    break;
  case AArch64::CBBAssertExt:
  case AArch64::CBHAssertExt:
    Target = LastInst->getOperand(i: 3).getMBB();
    Cond.push_back(Elt: MachineOperand::CreateImm(Val: -1)); // -1
    Cond.push_back(Elt: MachineOperand::CreateImm(Val: LastInst->getOpcode())); // Opc
    Cond.push_back(Elt: LastInst->getOperand(i: 0)); // Cond
    Cond.push_back(Elt: LastInst->getOperand(i: 1)); // Op0
    Cond.push_back(Elt: LastInst->getOperand(i: 2)); // Op1
    Cond.push_back(Elt: LastInst->getOperand(i: 4)); // Ext0
    Cond.push_back(Elt: LastInst->getOperand(i: 5)); // Ext1
    break;
  }
}
269
270static unsigned getBranchDisplacementBits(unsigned Opc) {
271 switch (Opc) {
272 default:
273 llvm_unreachable("unexpected opcode!");
274 case AArch64::B:
275 return BDisplacementBits;
276 case AArch64::TBNZW:
277 case AArch64::TBZW:
278 case AArch64::TBNZX:
279 case AArch64::TBZX:
280 return TBZDisplacementBits;
281 case AArch64::CBNZW:
282 case AArch64::CBZW:
283 case AArch64::CBNZX:
284 case AArch64::CBZX:
285 return CBZDisplacementBits;
286 case AArch64::Bcc:
287 return BCCDisplacementBits;
288 case AArch64::CBWPri:
289 case AArch64::CBXPri:
290 case AArch64::CBBAssertExt:
291 case AArch64::CBHAssertExt:
292 case AArch64::CBWPrr:
293 case AArch64::CBXPrr:
294 return CBDisplacementBits;
295 }
296}
297
298bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
299 int64_t BrOffset) const {
300 unsigned Bits = getBranchDisplacementBits(Opc: BranchOp);
301 assert(Bits >= 3 && "max branch displacement must be enough to jump"
302 "over conditional branch expansion");
303 return isIntN(N: Bits, x: BrOffset / 4);
304}
305
306MachineBasicBlock *
307AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
308 switch (MI.getOpcode()) {
309 default:
310 llvm_unreachable("unexpected opcode!");
311 case AArch64::B:
312 return MI.getOperand(i: 0).getMBB();
313 case AArch64::TBZW:
314 case AArch64::TBNZW:
315 case AArch64::TBZX:
316 case AArch64::TBNZX:
317 return MI.getOperand(i: 2).getMBB();
318 case AArch64::CBZW:
319 case AArch64::CBNZW:
320 case AArch64::CBZX:
321 case AArch64::CBNZX:
322 case AArch64::Bcc:
323 return MI.getOperand(i: 1).getMBB();
324 case AArch64::CBWPri:
325 case AArch64::CBXPri:
326 case AArch64::CBBAssertExt:
327 case AArch64::CBHAssertExt:
328 case AArch64::CBWPrr:
329 case AArch64::CBXPrr:
330 return MI.getOperand(i: 3).getMBB();
331 }
332}
333
/// Insert an unconditional branch from \p MBB to \p NewDestBB that can reach
/// beyond the range of a single B instruction. Strategy, in order of
/// preference: (1) if X16 is free, emit a plain B and let the linker insert a
/// range-extension thunk; (2) if a register can be scavenged and the block is
/// cold, emit an ADRP+ADD+BR sequence; (3) otherwise spill X16 around the
/// branch, with \p RestoreBB holding the reload.
void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
                                            MachineBasicBlock &NewDestBB,
                                            MachineBasicBlock &RestoreBB,
                                            const DebugLoc &DL,
                                            int64_t BrOffset,
                                            RegScavenger *RS) const {
  assert(RS && "RegScavenger required for long branching");
  assert(MBB.empty() &&
         "new block should be inserted for expanding unconditional branch");
  assert(MBB.pred_size() == 1);
  assert(RestoreBB.empty() &&
         "restore block should be inserted for restoring clobbered registers");

  // Materialize the destination address into Reg and branch through it.
  auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
    // Offsets outside of the signed 33-bit range are not supported for ADRP +
    // ADD.
    if (!isInt<33>(x: BrOffset))
      report_fatal_error(
          reason: "Branch offsets outside of the signed 33-bit range not supported");

    BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::ADRP), DestReg: Reg)
        .addSym(Sym: DestBB.getSymbol(), TargetFlags: AArch64II::MO_PAGE);
    BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::ADDXri), DestReg: Reg)
        .addReg(RegNo: Reg)
        .addSym(Sym: DestBB.getSymbol(), TargetFlags: AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
        .addImm(Val: 0);
    BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::BR)).addReg(RegNo: Reg);
  };

  RS->enterBasicBlockEnd(MBB);
  // If X16 is unused, we can rely on the linker to insert a range extension
  // thunk if NewDestBB is out of range of a single B instruction.
  constexpr Register Reg = AArch64::X16;
  if (!RS->isRegUsed(Reg)) {
    insertUnconditionalBranch(MBB, DestBB: &NewDestBB, DL);
    // Mark X16 used so a later thunk can safely clobber it.
    RS->setRegUsed(Reg);
    return;
  }

  // If there's a free register and it's worth inflating the code size,
  // manually insert the indirect branch.
  Register Scavenged = RS->FindUnusedReg(RC: &AArch64::GPR64RegClass);
  if (Scavenged != AArch64::NoRegister &&
      MBB.getSectionID() == MBBSectionID::ColdSectionID) {
    buildIndirectBranch(Scavenged, NewDestBB);
    RS->setRegUsed(Reg: Scavenged);
    return;
  }

  // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
  // with red zones.
  AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
  if (!AFI || AFI->hasRedZone().value_or(u: true))
    report_fatal_error(
        reason: "Unable to insert indirect branch inside function that has red zone");

  // Otherwise, spill X16 and defer range extension to the linker.
  // Pre-indexed store: [SP, #-16]! keeps SP 16-byte aligned.
  BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::STRXpre))
      .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
      .addReg(RegNo: Reg)
      .addReg(RegNo: AArch64::SP)
      .addImm(Val: -16);

  BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::B)).addMBB(MBB: &RestoreBB);

  // Post-indexed reload of X16 in the restore block undoes the spill.
  BuildMI(BB&: RestoreBB, I: RestoreBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::LDRXpost))
      .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
      .addReg(RegNo: Reg, Flags: RegState::Define)
      .addReg(RegNo: AArch64::SP)
      .addImm(Val: 16);
}
405
406// Branch analysis.
/// Analyze the terminators of \p MBB per the TargetInstrInfo contract:
/// returns false on success, filling TBB/FBB/Cond; returns true when the
/// block's control flow cannot be understood. With AllowModify, dead
/// trailing branches may be deleted as a side effect.
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *&TBB,
                                     MachineBasicBlock *&FBB,
                                     SmallVectorImpl<MachineOperand> &Cond,
                                     bool AllowModify) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return false;

  // Skip over SpeculationBarrierEndBB terminators
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(MI: *I))
    return false;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;

  // If there is only one terminator instruction, process it.
  unsigned LastOpc = LastInst->getOpcode();
  if (I == MBB.begin() || !isUnpredicatedTerminator(MI: *--I)) {
    if (isUncondBranchOpcode(Opc: LastOpc)) {
      TBB = LastInst->getOperand(i: 0).getMBB();
      return false;
    }
    if (isCondBranchOpcode(Opc: LastOpc)) {
      // Block ends with fall-through condbranch.
      parseCondBranch(LastInst, Target&: TBB, Cond);
      return false;
    }
    return true; // Can't handle indirect branch.
  }

  // Get the instruction before it if it is a terminator.
  MachineInstr *SecondLastInst = &*I;
  unsigned SecondLastOpc = SecondLastInst->getOpcode();

  // If AllowModify is true and the block ends with two or more unconditional
  // branches, delete all but the first unconditional branch.
  if (AllowModify && isUncondBranchOpcode(Opc: LastOpc)) {
    while (isUncondBranchOpcode(Opc: SecondLastOpc)) {
      LastInst->eraseFromParent();
      LastInst = SecondLastInst;
      LastOpc = LastInst->getOpcode();
      if (I == MBB.begin() || !isUnpredicatedTerminator(MI: *--I)) {
        // Return now; the only remaining terminator is an unconditional
        // branch.
        TBB = LastInst->getOperand(i: 0).getMBB();
        return false;
      }
      SecondLastInst = &*I;
      SecondLastOpc = SecondLastInst->getOpcode();
    }
  }

  // If we're allowed to modify and the block ends in an unconditional branch
  // which could simply fallthrough, remove the branch. (Note: This case only
  // matters when we can't understand the whole sequence, otherwise it's also
  // handled by BranchFolding.cpp.)
  if (AllowModify && isUncondBranchOpcode(Opc: LastOpc) &&
      MBB.isLayoutSuccessor(MBB: getBranchDestBlock(MI: *LastInst))) {
    LastInst->eraseFromParent();
    LastInst = SecondLastInst;
    LastOpc = LastInst->getOpcode();
    if (I == MBB.begin() || !isUnpredicatedTerminator(MI: *--I)) {
      assert(!isUncondBranchOpcode(LastOpc) &&
             "unreachable unconditional branches removed above");

      if (isCondBranchOpcode(Opc: LastOpc)) {
        // Block ends with fall-through condbranch.
        parseCondBranch(LastInst, Target&: TBB, Cond);
        return false;
      }
      return true; // Can't handle indirect branch.
    }
    SecondLastInst = &*I;
    SecondLastOpc = SecondLastInst->getOpcode();
  }

  // If there are three terminators, we don't know what sort of block this is.
  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(MI: *--I))
    return true;

  // If the block ends with a B and a Bcc, handle it.
  if (isCondBranchOpcode(Opc: SecondLastOpc) && isUncondBranchOpcode(Opc: LastOpc)) {
    parseCondBranch(LastInst: SecondLastInst, Target&: TBB, Cond);
    FBB = LastInst->getOperand(i: 0).getMBB();
    return false;
  }

  // If the block ends with two unconditional branches, handle it. The second
  // one is not executed, so remove it.
  if (isUncondBranchOpcode(Opc: SecondLastOpc) && isUncondBranchOpcode(Opc: LastOpc)) {
    TBB = SecondLastInst->getOperand(i: 0).getMBB();
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return false;
  }

  // ...likewise if it ends with an indirect branch followed by an unconditional
  // branch.
  if (isIndirectBranchOpcode(Opc: SecondLastOpc) && isUncondBranchOpcode(Opc: LastOpc)) {
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return true;
  }

  // Otherwise, can't handle this.
  return true;
}
522
/// Fill in a MachineBranchPredicate for the conditional branch terminating
/// \p MBB. Returns true on failure (cannot analyze), false on success.
bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
                                              MachineBranchPredicate &MBP,
                                              bool AllowModify) const {
  // Use analyzeBranch to validate the branch pattern.
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  if (analyzeBranch(MBB, TBB, FBB, Cond, AllowModify))
    return true;

  // analyzeBranch returns success with empty Cond for unconditional branches.
  if (Cond.empty())
    return true;

  MBP.TrueDest = TBB;
  assert(MBP.TrueDest && "expected!");
  // With no explicit false block, the branch falls through to the layout
  // successor.
  MBP.FalseDest = FBB ? FBB : MBB.getNextNode();

  MBP.ConditionDef = nullptr;
  MBP.SingleUseCondition = false;

  // Find the conditional branch. After analyzeBranch succeeds with non-empty
  // Cond, there's exactly one conditional branch - either last (fallthrough)
  // or second-to-last (followed by unconditional B).
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return true;

  if (isUncondBranchOpcode(Opc: I->getOpcode())) {
    if (I == MBB.begin())
      return true;
    --I;
  }

  MachineInstr *CondBranch = &*I;
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  switch (CondBranch->getOpcode()) {
  default:
    return true;

  case AArch64::Bcc:
    // Bcc takes the NZCV flag as the operand to branch on, walk up the
    // instruction stream to find the last instruction to define NZCV.
    for (MachineInstr &MI : llvm::drop_begin(RangeOrContainer: llvm::reverse(C&: MBB))) {
      if (MI.modifiesRegister(Reg: AArch64::NZCV, /*TRI=*/nullptr)) {
        MBP.ConditionDef = &MI;
        break;
      }
    }
    return false;

  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX: {
    // Compare-with-zero branches map directly to an EQ/NE predicate.
    MBP.LHS = CondBranch->getOperand(i: 0);
    MBP.RHS = MachineOperand::CreateImm(Val: 0);
    unsigned Opc = CondBranch->getOpcode();
    MBP.Predicate = (Opc == AArch64::CBNZX || Opc == AArch64::CBNZW)
                        ? MachineBranchPredicate::PRED_NE
                        : MachineBranchPredicate::PRED_EQ;
    Register CondReg = MBP.LHS.getReg();
    if (CondReg.isVirtual())
      MBP.ConditionDef = MRI.getVRegDef(Reg: CondReg);
    return false;
  }

  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX: {
    // Test-bit branches: only the defining instruction is recorded; no
    // LHS/RHS predicate is expressible.
    Register CondReg = CondBranch->getOperand(i: 0).getReg();
    if (CondReg.isVirtual())
      MBP.ConditionDef = MRI.getVRegDef(Reg: CondReg);
    return false;
  }
  }
}
601
/// Invert the condition encoded in \p Cond (see parseCondBranch for the two
/// layouts). Returns false, meaning the condition is always reversible.
bool AArch64InstrInfo::reverseBranchCondition(
    SmallVectorImpl<MachineOperand> &Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc: Cond = { CC }; invert the condition code in place.
    AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
    Cond[0].setImm(AArch64CC::getInvertedCondCode(Code: CC));
  } else {
    // Folded compare-and-branch: Cond = { -1, Opcode, ... }; swap the opcode
    // with its logical complement (CBZ <-> CBNZ, TBZ <-> TBNZ).
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown conditional branch!");
    case AArch64::CBZW:
      Cond[1].setImm(AArch64::CBNZW);
      break;
    case AArch64::CBNZW:
      Cond[1].setImm(AArch64::CBZW);
      break;
    case AArch64::CBZX:
      Cond[1].setImm(AArch64::CBNZX);
      break;
    case AArch64::CBNZX:
      Cond[1].setImm(AArch64::CBZX);
      break;
    case AArch64::TBZW:
      Cond[1].setImm(AArch64::TBNZW);
      break;
    case AArch64::TBNZW:
      Cond[1].setImm(AArch64::TBZW);
      break;
    case AArch64::TBZX:
      Cond[1].setImm(AArch64::TBNZX);
      break;
    case AArch64::TBNZX:
      Cond[1].setImm(AArch64::TBZX);
      break;

    // Cond is { -1, Opcode, CC, Op0, Op1, ... }
    case AArch64::CBWPri:
    case AArch64::CBXPri:
    case AArch64::CBBAssertExt:
    case AArch64::CBHAssertExt:
    case AArch64::CBWPrr:
    case AArch64::CBXPrr: {
      // Pseudos using standard 4bit Arm condition codes: keep the opcode and
      // invert the embedded condition code instead.
      AArch64CC::CondCode CC =
          static_cast<AArch64CC::CondCode>(Cond[2].getImm());
      Cond[2].setImm(AArch64CC::getInvertedCondCode(Code: CC));
    }
    }
  }

  return false;
}
655
656unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
657 int *BytesRemoved) const {
658 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
659 if (I == MBB.end())
660 return 0;
661
662 if (!isUncondBranchOpcode(Opc: I->getOpcode()) &&
663 !isCondBranchOpcode(Opc: I->getOpcode()))
664 return 0;
665
666 // Remove the branch.
667 I->eraseFromParent();
668
669 I = MBB.end();
670
671 if (I == MBB.begin()) {
672 if (BytesRemoved)
673 *BytesRemoved = 4;
674 return 1;
675 }
676 --I;
677 if (!isCondBranchOpcode(Opc: I->getOpcode())) {
678 if (BytesRemoved)
679 *BytesRemoved = 4;
680 return 1;
681 }
682
683 // Remove the branch.
684 I->eraseFromParent();
685 if (BytesRemoved)
686 *BytesRemoved = 8;
687
688 return 2;
689}
690
/// Emit a conditional branch to \p TBB at the end of \p MBB from the Cond
/// encoding produced by parseCondBranch() (see there for the layouts).
void AArch64InstrInfo::instantiateCondBranch(
    MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
    ArrayRef<MachineOperand> Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AArch64::Bcc)).addImm(Val: Cond[0].getImm()).addMBB(MBB: TBB);
  } else {
    // Folded compare-and-branch
    // Note that we use addOperand instead of addReg to keep the flags.

    // cbz, cbnz
    const MachineInstrBuilder MIB =
        BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: Cond[1].getImm())).add(MO: Cond[2]);

    // tbz/tbnz
    if (Cond.size() > 3)
      MIB.add(MO: Cond[3]);

    // cb
    if (Cond.size() > 4)
      MIB.add(MO: Cond[4]);

    // The destination block always follows the source operands.
    MIB.addMBB(MBB: TBB);

    // cb[b,h]: trailing extend-kind immediates come after the target block.
    if (Cond.size() > 5) {
      MIB.addImm(Val: Cond[5].getImm());
      MIB.addImm(Val: Cond[6].getImm());
    }
  }
}
722
/// Insert branch code at the end of \p MBB per the TargetInstrInfo contract:
/// an unconditional B, a lone conditional branch, or a conditional branch to
/// \p TBB followed by a B to \p FBB. Returns the number of instructions
/// inserted; \p BytesAdded, if non-null, receives 4 bytes per instruction.
unsigned AArch64InstrInfo::insertBranch(
    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
    ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
  // Shouldn't be a fall through.
  assert(TBB && "insertBranch must not be told to insert a fallthrough");

  if (!FBB) {
    if (Cond.empty()) // Unconditional branch?
      BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AArch64::B)).addMBB(MBB: TBB);
    else
      instantiateCondBranch(MBB, DL, TBB, Cond);

    if (BytesAdded)
      *BytesAdded = 4;

    return 1;
  }

  // Two-way conditional branch.
  instantiateCondBranch(MBB, DL, TBB, Cond);
  BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AArch64::B)).addMBB(MBB: FBB);

  if (BytesAdded)
    *BytesAdded = 8;

  return 2;
}
750
/// Fold compare/test-and-branch terminators whose source is a zero register
/// (WZR/XZR), so the outcome is known statically: always-taken branches
/// become an unconditional B, never-taken ones are deleted. Successor lists
/// are updated accordingly. Returns true if a terminator was rewritten.
bool llvm::optimizeTerminators(MachineBasicBlock *MBB,
                               const TargetInstrInfo &TII) {
  for (MachineInstr &MI : MBB->terminators()) {
    unsigned Opc = MI.getOpcode();
    switch (Opc) {
    case AArch64::CBZW:
    case AArch64::CBZX:
    case AArch64::TBZW:
    case AArch64::TBZX:
      // CBZ/TBZ with WZR/XZR -> unconditional B
      if (MI.getOperand(i: 0).getReg() == AArch64::WZR ||
          MI.getOperand(i: 0).getReg() == AArch64::XZR) {
        DEBUG_WITH_TYPE("optimizeTerminators",
                        dbgs() << "Removing always taken branch: " << MI);
        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
        // Every successor other than Target becomes unreachable once the
        // branch is unconditional. Copy the list first: removeSuccessor
        // mutates it while we iterate.
        SmallVector<MachineBasicBlock *> Succs(MBB->successors());
        for (auto *S : Succs)
          if (S != Target)
            MBB->removeSuccessor(Succ: S);
        DebugLoc DL = MI.getDebugLoc();
        // Erase everything after the branch point, then the branch itself,
        // and emit a plain B in its place.
        while (MBB->rbegin() != &MI)
          MBB->rbegin()->eraseFromParent();
        MI.eraseFromParent();
        BuildMI(BB: MBB, MIMD: DL, MCID: TII.get(Opcode: AArch64::B)).addMBB(MBB: Target);
        return true;
      }
      break;
    case AArch64::CBNZW:
    case AArch64::CBNZX:
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
      if (MI.getOperand(i: 0).getReg() == AArch64::WZR ||
          MI.getOperand(i: 0).getReg() == AArch64::XZR) {
        DEBUG_WITH_TYPE("optimizeTerminators",
                        dbgs() << "Removing never taken branch: " << MI);
        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
        MI.getParent()->removeSuccessor(Succ: Target);
        MI.eraseFromParent();
        return true;
      }
      break;
    }
  }
  return false;
}
797
798// Find the original register that VReg is copied from.
799static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
800 while (Register::isVirtualRegister(Reg: VReg)) {
801 const MachineInstr *DefMI = MRI.getVRegDef(Reg: VReg);
802 if (!DefMI->isFullCopy())
803 return VReg;
804 VReg = DefMI->getOperand(i: 1).getReg();
805 }
806 return VReg;
807}
808
// Determine if VReg is defined by an instruction that can be folded into a
// csel instruction. If so, return the folded opcode (CSINC/CSINV/CSNEG
// variant), and optionally the replacement source register via *NewReg.
// Returns 0 when no fold is possible.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
                                unsigned *NewReg = nullptr) {
  // Look through COPY chains first so the patterns below match the real def.
  VReg = removeCopies(MRI, VReg);
  if (!Register::isVirtualRegister(Reg: VReg))
    return 0;

  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(RC: MRI.getRegClass(Reg: VReg));
  const MachineInstr *DefMI = MRI.getVRegDef(Reg: VReg);
  unsigned Opc = 0;
  unsigned SrcReg = 0;
  switch (DefMI->getOpcode()) {
  case AArch64::SUBREG_TO_REG:
    // Check for the following way to define an 64-bit immediate:
    //   %0:gpr32 = MOVi32imm 1
    //   %1:gpr64 = SUBREG_TO_REG %0:gpr32, %subreg.sub_32
    if (!DefMI->getOperand(i: 1).isReg())
      return 0;
    if (!DefMI->getOperand(i: 2).isImm() ||
        DefMI->getOperand(i: 2).getImm() != AArch64::sub_32)
      return 0;
    DefMI = MRI.getVRegDef(Reg: DefMI->getOperand(i: 1).getReg());
    if (DefMI->getOpcode() != AArch64::MOVi32imm)
      return 0;
    if (!DefMI->getOperand(i: 1).isImm() || DefMI->getOperand(i: 1).getImm() != 1)
      return 0;
    assert(Is64Bit);
    // Constant 1 -> csinc of XZR (XZR + 1).
    SrcReg = AArch64::XZR;
    Opc = AArch64::CSINCXr;
    break;

  case AArch64::MOVi32imm:
  case AArch64::MOVi64imm:
    // mov #1 -> csinc of the zero register.
    if (!DefMI->getOperand(i: 1).isImm() || DefMI->getOperand(i: 1).getImm() != 1)
      return 0;
    SrcReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
    break;

  case AArch64::ADDSXri:
  case AArch64::ADDSWri:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr,
                                         isDead: true) == -1)
      return 0;
    // fall-through to ADDXri and ADDWri.
    [[fallthrough]];
  case AArch64::ADDXri:
  case AArch64::ADDWri:
    // add x, 1 -> csinc.
    if (!DefMI->getOperand(i: 2).isImm() || DefMI->getOperand(i: 2).getImm() != 1 ||
        DefMI->getOperand(i: 3).getImm() != 0)
      return 0;
    SrcReg = DefMI->getOperand(i: 1).getReg();
    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
    break;

  case AArch64::ORNXrr:
  case AArch64::ORNWrr: {
    // not x -> csinv, represented as orn dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, VReg: DefMI->getOperand(i: 1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcReg = DefMI->getOperand(i: 2).getReg();
    Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
    break;
  }

  case AArch64::SUBSXrr:
  case AArch64::SUBSWrr:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr,
                                         isDead: true) == -1)
      return 0;
    // fall-through to SUBXrr and SUBWrr.
    [[fallthrough]];
  case AArch64::SUBXrr:
  case AArch64::SUBWrr: {
    // neg x -> csneg, represented as sub dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, VReg: DefMI->getOperand(i: 1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcReg = DefMI->getOperand(i: 2).getReg();
    Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
    break;
  }
  default:
    return 0;
  }
  assert(Opc && SrcReg && "Missing parameters");

  if (NewReg)
    *NewReg = SrcReg;
  return Opc;
}
906
/// Return true if a select of TrueReg/FalseReg under Cond can be emitted for
/// this register class, reporting the latency of the condition and of each
/// operand path through the out-parameters (a 0 operand cycle count means
/// the operand's defining instruction can be folded into the csel).
bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
                                       ArrayRef<MachineOperand> Cond,
                                       Register DstReg, Register TrueReg,
                                       Register FalseReg, int &CondCycles,
                                       int &TrueCycles,
                                       int &FalseCycles) const {
  // Check register classes.
  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RC =
      RI.getCommonSubClass(A: MRI.getRegClass(Reg: TrueReg), B: MRI.getRegClass(Reg: FalseReg));
  if (!RC)
    return false;

  // Also need to check the dest regclass, in case we're trying to optimize
  // something like:
  // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
  if (!RI.getCommonSubClass(A: RC, B: MRI.getRegClass(Reg: DstReg)))
    return false;

  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
  unsigned ExtraCondLat = Cond.size() != 1;

  // GPRs are handled by csel.
  // FIXME: Fold in x+1, -x, and ~x when applicable.
  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
      AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
    // Single-cycle csel, csinc, csinv, and csneg.
    CondCycles = 1 + ExtraCondLat;
    TrueCycles = FalseCycles = 1;
    if (canFoldIntoCSel(MRI, VReg: TrueReg))
      TrueCycles = 0;
    else if (canFoldIntoCSel(MRI, VReg: FalseReg))
      FalseCycles = 0;
    return true;
  }

  // Scalar floating point is handled by fcsel.
  // FIXME: Form fabs, fmin, and fmax when applicable.
  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
      AArch64::FPR32RegClass.hasSubClassEq(RC)) {
    CondCycles = 5 + ExtraCondLat;
    TrueCycles = FalseCycles = 2;
    return true;
  }

  // Can't do vectors.
  return false;
}
955
/// Insert a conditional select computing `DstReg = Cond ? TrueReg : FalseReg`
/// before iterator I. Cond uses the encoding produced by parseCondBranch():
/// size 1 is a bare condition code (b.cc); sizes 3 (cbz/cbnz), 4 (tbz/tbnz),
/// 5 (cb) and 7 (cb[b,h]) require materializing an explicit NZCV-setting
/// compare before the csel can be emitted.
void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const DebugLoc &DL, Register DstReg,
                                    ArrayRef<MachineOperand> Cond,
                                    Register TrueReg, Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Parse the condition code, see parseCondBranch() above.
  AArch64CC::CondCode CC;
  switch (Cond.size()) {
  default:
    llvm_unreachable("Unknown condition opcode in Cond");
  case 1: // b.cc
    CC = AArch64CC::CondCode(Cond[0].getImm());
    break;
  case 3: { // cbz/cbnz
    // We must insert a compare against 0.
    bool Is64Bit;
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBZW:
      Is64Bit = false;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBZX:
      Is64Bit = true;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBNZW:
      Is64Bit = false;
      CC = AArch64CC::NE;
      break;
    case AArch64::CBNZX:
      Is64Bit = true;
      CC = AArch64CC::NE;
      break;
    }
    Register SrcReg = Cond[2].getReg();
    if (Is64Bit) {
      // cmp reg, #0 is actually subs xzr, reg, #0.
      MRI.constrainRegClass(Reg: SrcReg, RC: &AArch64::GPR64spRegClass);
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::SUBSXri), DestReg: AArch64::XZR)
          .addReg(RegNo: SrcReg)
          .addImm(Val: 0)
          .addImm(Val: 0);
    } else {
      MRI.constrainRegClass(Reg: SrcReg, RC: &AArch64::GPR32spRegClass);
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::SUBSWri), DestReg: AArch64::WZR)
          .addReg(RegNo: SrcReg)
          .addImm(Val: 0)
          .addImm(Val: 0);
    }
    break;
  }
  case 4: { // tbz/tbnz
    // We must insert a tst instruction.
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::TBZW:
    case AArch64::TBZX:
      CC = AArch64CC::EQ;
      break;
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      CC = AArch64CC::NE;
      break;
    }
    // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
    // Cond[3] holds the bit number being tested.
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ANDSWri), DestReg: AArch64::WZR)
          .addReg(RegNo: Cond[2].getReg())
          .addImm(
              Val: AArch64_AM::encodeLogicalImmediate(imm: 1ull << Cond[3].getImm(), regSize: 32));
    else
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ANDSXri), DestReg: AArch64::XZR)
          .addReg(RegNo: Cond[2].getReg())
          .addImm(
              Val: AArch64_AM::encodeLogicalImmediate(imm: 1ull << Cond[3].getImm(), regSize: 64));
    break;
  }
  case 5: { // cb
    // We must insert a cmp, that is a subs
    // 0 1 2 3 4
    // Cond is { -1, Opcode, CC, Op0, Op1 }

    unsigned SubsOpc, SubsDestReg;
    bool IsImm = false;
    CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBWPri:
      SubsOpc = AArch64::SUBSWri;
      SubsDestReg = AArch64::WZR;
      IsImm = true;
      break;
    case AArch64::CBXPri:
      SubsOpc = AArch64::SUBSXri;
      SubsDestReg = AArch64::XZR;
      IsImm = true;
      break;
    case AArch64::CBWPrr:
      SubsOpc = AArch64::SUBSWrr;
      SubsDestReg = AArch64::WZR;
      IsImm = false;
      break;
    case AArch64::CBXPrr:
      SubsOpc = AArch64::SUBSXrr;
      SubsDestReg = AArch64::XZR;
      IsImm = false;
      break;
    }

    // Immediate forms carry a trailing shift-amount operand of 0.
    if (IsImm)
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: SubsOpc), DestReg: SubsDestReg)
          .addReg(RegNo: Cond[3].getReg())
          .addImm(Val: Cond[4].getImm())
          .addImm(Val: 0);
    else
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: SubsOpc), DestReg: SubsDestReg)
          .addReg(RegNo: Cond[3].getReg())
          .addReg(RegNo: Cond[4].getReg());
  } break;
  case 7: { // cb[b,h]
    // We must insert a cmp, that is a subs, but also zero- or sign-extensions
    // that have been folded. For the first operand we codegen an explicit
    // extension, for the second operand we fold the extension into cmp.
    // 0 1 2 3 4 5 6
    // Cond is { -1, Opcode, CC, Op0, Op1, Ext0, Ext1 }

    // We need a new register for the now explicitly extended register
    Register Reg = Cond[4].getReg();
    if (Cond[5].getImm() != AArch64_AM::InvalidShiftExtend) {
      unsigned ExtOpc;
      unsigned ExtBits;
      AArch64_AM::ShiftExtendType ExtendType =
          AArch64_AM::getExtendType(Imm: Cond[5].getImm());
      // Signed extensions become SBFM, unsigned ones become AND with a mask
      // covering the source width (0xff for byte, 0xffff for halfword).
      switch (ExtendType) {
      default:
        llvm_unreachable("Unknown shift-extend for CB instruction");
      case AArch64_AM::SXTB:
        assert(
            Cond[1].getImm() == AArch64::CBBAssertExt &&
            "Unexpected compare-and-branch instruction for SXTB shift-extend");
        ExtOpc = AArch64::SBFMWri;
        ExtBits = AArch64_AM::encodeLogicalImmediate(imm: 0xff, regSize: 32);
        break;
      case AArch64_AM::SXTH:
        assert(
            Cond[1].getImm() == AArch64::CBHAssertExt &&
            "Unexpected compare-and-branch instruction for SXTH shift-extend");
        ExtOpc = AArch64::SBFMWri;
        ExtBits = AArch64_AM::encodeLogicalImmediate(imm: 0xffff, regSize: 32);
        break;
      case AArch64_AM::UXTB:
        assert(
            Cond[1].getImm() == AArch64::CBBAssertExt &&
            "Unexpected compare-and-branch instruction for UXTB shift-extend");
        ExtOpc = AArch64::ANDWri;
        ExtBits = AArch64_AM::encodeLogicalImmediate(imm: 0xff, regSize: 32);
        break;
      case AArch64_AM::UXTH:
        assert(
            Cond[1].getImm() == AArch64::CBHAssertExt &&
            "Unexpected compare-and-branch instruction for UXTH shift-extend");
        ExtOpc = AArch64::ANDWri;
        ExtBits = AArch64_AM::encodeLogicalImmediate(imm: 0xffff, regSize: 32);
        break;
      }

      // Build the explicit extension of the first operand
      Reg = MRI.createVirtualRegister(RegClass: &AArch64::GPR32spRegClass);
      MachineInstrBuilder MBBI =
          BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: ExtOpc), DestReg: Reg).addReg(RegNo: Cond[4].getReg());
      // SBFM takes an extra immr operand of 0 before the imms operand.
      if (ExtOpc != AArch64::ANDWri)
        MBBI.addImm(Val: 0);
      MBBI.addImm(Val: ExtBits);
    }

    // Now, subs with an extended second operand
    if (Cond[6].getImm() != AArch64_AM::InvalidShiftExtend) {
      AArch64_AM::ShiftExtendType ExtendType =
          AArch64_AM::getExtendType(Imm: Cond[6].getImm());
      MRI.constrainRegClass(Reg, RC: MRI.getRegClass(Reg: Cond[3].getReg()));
      MRI.constrainRegClass(Reg: Cond[3].getReg(), RC: &AArch64::GPR32spRegClass);
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::SUBSWrx), DestReg: AArch64::WZR)
          .addReg(RegNo: Cond[3].getReg())
          .addReg(RegNo: Reg)
          .addImm(Val: AArch64_AM::getArithExtendImm(ET: ExtendType, Imm: 0));
    } // If no extension is needed, just a regular subs
    else {
      MRI.constrainRegClass(Reg, RC: MRI.getRegClass(Reg: Cond[3].getReg()));
      MRI.constrainRegClass(Reg: Cond[3].getReg(), RC: &AArch64::GPR32spRegClass);
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::SUBSWrr), DestReg: AArch64::WZR)
          .addReg(RegNo: Cond[3].getReg())
          .addReg(RegNo: Reg);
    }

    CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
  } break;
  }

  // Pick the basic select opcode from the destination register class. Only
  // the GPR forms can absorb a simple def of one operand (csinc/csinv/csneg).
  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(Reg: DstReg, RC: &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(Reg: DstReg, RC: &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(Reg: DstReg, RC: &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(Reg: DstReg, RC: &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, VReg: TrueReg, NewReg: &NewReg);
    if (FoldedOpc) {
      // The folded opcodes csinc, csinv and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      CC = AArch64CC::getInvertedCondCode(Code: CC);
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, VReg: FalseReg, NewReg: &NewReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewReg;
      Opc = FoldedOpc;
      // Extend the live range of NewReg.
      MRI.clearKillFlags(Reg: NewReg);
    }
  }

  // Pull all virtual register into the appropriate class.
  MRI.constrainRegClass(Reg: TrueReg, RC);
  // FalseReg might be WZR or XZR if the folded operand is a literal 1.
  assert(
      (FalseReg.isVirtual() || FalseReg == AArch64::WZR ||
       FalseReg == AArch64::XZR) &&
      "FalseReg was folded into a non-virtual register other than WZR or XZR");
  if (FalseReg.isVirtual())
    MRI.constrainRegClass(Reg: FalseReg, RC);

  // Insert the csel.
  BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: Opc), DestReg: DstReg)
      .addReg(RegNo: TrueReg)
      .addReg(RegNo: FalseReg)
      .addImm(Val: CC);
}
1217
1218// Return true if Imm can be loaded into a register by a "cheap" sequence of
1219// instructions. For now, "cheap" means at most two instructions.
1220static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
1221 if (BitSize == 32)
1222 return true;
1223
1224 assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
1225 uint64_t Imm = static_cast<uint64_t>(MI.getOperand(i: 1).getImm());
1226 SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
1227 AArch64_IMM::expandMOVImm(Imm, BitSize, Insn&: Is);
1228
1229 return Is.size() <= 2;
1230}
1231
1232// Check if a COPY instruction is cheap.
1233static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) {
1234 assert(MI.isCopy() && "Expected COPY instruction");
1235 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
1236
1237 // Cross-bank copies (e.g., between GPR and FPR) are expensive on AArch64,
1238 // typically requiring an FMOV instruction with a 2-6 cycle latency.
1239 auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * {
1240 if (Reg.isVirtual())
1241 return MRI.getRegClass(Reg);
1242 if (Reg.isPhysical())
1243 return RI.getMinimalPhysRegClass(Reg);
1244 return nullptr;
1245 };
1246 const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(i: 0).getReg());
1247 const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(i: 1).getReg());
1248 if (DstRC && SrcRC && !RI.getCommonSubClass(A: DstRC, B: SrcRC))
1249 return false;
1250
1251 return MI.isAsCheapAsAMove();
1252}
1253
1254// FIXME: this implementation should be micro-architecture dependent, so a
1255// micro-architecture target hook should be introduced here in future.
1256bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
1257 if (Subtarget.hasExynosCheapAsMoveHandling()) {
1258 if (isExynosCheapAsMove(MI))
1259 return true;
1260 return MI.isAsCheapAsAMove();
1261 }
1262
1263 switch (MI.getOpcode()) {
1264 default:
1265 return MI.isAsCheapAsAMove();
1266
1267 case TargetOpcode::COPY:
1268 return isCheapCopy(MI, RI);
1269
1270 case AArch64::ADDWrs:
1271 case AArch64::ADDXrs:
1272 case AArch64::SUBWrs:
1273 case AArch64::SUBXrs:
1274 return Subtarget.hasALULSLFast() && MI.getOperand(i: 3).getImm() <= 4;
1275
1276 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
1277 // ORRXri, it is as cheap as MOV.
1278 // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
1279 case AArch64::MOVi32imm:
1280 return isCheapImmediate(MI, BitSize: 32);
1281 case AArch64::MOVi64imm:
1282 return isCheapImmediate(MI, BitSize: 64);
1283 }
1284}
1285
/// Return true when MI's shifted/extended operand form is fast on Falkor,
/// i.e. incurs no extra cost over the unshifted/unextended form of the
/// instruction.
bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;

  // Shifted add: free for no shift, or LSL by at most 5.
  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::ADDSWrs:
  case AArch64::ADDSXrs: {
    unsigned Imm = MI.getOperand(i: 3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    if (ShiftVal == 0)
      return true;
    return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
  }

  // Extended add: free for zero-extends with a shift of at most 4.
  case AArch64::ADDWrx:
  case AArch64::ADDXrx:
  case AArch64::ADDXrx64:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::ADDSXrx64: {
    unsigned Imm = MI.getOperand(i: 3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) <= 4;
    }
  }

  // Shifted 32-bit sub: free for no shift, or ASR by 31 (sign splat).
  case AArch64::SUBWrs:
  case AArch64::SUBSWrs: {
    unsigned Imm = MI.getOperand(i: 3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
  }

  // Shifted 64-bit sub: free for no shift, or ASR by 63 (sign splat).
  case AArch64::SUBXrs:
  case AArch64::SUBSXrs: {
    unsigned Imm = MI.getOperand(i: 3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
  }

  // Extended sub: free only for zero-extends with no shift.
  case AArch64::SUBWrx:
  case AArch64::SUBXrx:
  case AArch64::SUBXrx64:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrx:
  case AArch64::SUBSXrx64: {
    unsigned Imm = MI.getOperand(i: 3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) == 0;
    }
  }

  // Register-offset loads/stores/prefetches: the extended-register form is
  // free as long as the offset register is not sign-extended (operand 3 is
  // the sign-extension flag in these addressing modes).
  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::PRFMroW:
  case AArch64::PRFMroX:
  case AArch64::STRBBroW:
  case AArch64::STRBBroX:
  case AArch64::STRBroW:
  case AArch64::STRBroX:
  case AArch64::STRDroW:
  case AArch64::STRDroX:
  case AArch64::STRHHroW:
  case AArch64::STRHHroX:
  case AArch64::STRHroW:
  case AArch64::STRHroX:
  case AArch64::STRQroW:
  case AArch64::STRQroX:
  case AArch64::STRSroW:
  case AArch64::STRSroX:
  case AArch64::STRWroW:
  case AArch64::STRWroX:
  case AArch64::STRXroW:
  case AArch64::STRXroX: {
    unsigned IsSigned = MI.getOperand(i: 3).getImm();
    return !IsSigned;
  }
  }
}
1407
/// Return true if MI is one of the SEH_* pseudo-instructions, which describe
/// Windows SEH unwind information for the prologue/epilogue.
bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  case AArch64::SEH_StackAlloc:
  case AArch64::SEH_SaveFPLR:
  case AArch64::SEH_SaveFPLR_X:
  case AArch64::SEH_SaveReg:
  case AArch64::SEH_SaveReg_X:
  case AArch64::SEH_SaveRegP:
  case AArch64::SEH_SaveRegP_X:
  case AArch64::SEH_SaveFReg:
  case AArch64::SEH_SaveFReg_X:
  case AArch64::SEH_SaveFRegP:
  case AArch64::SEH_SaveFRegP_X:
  case AArch64::SEH_SetFP:
  case AArch64::SEH_AddFP:
  case AArch64::SEH_Nop:
  case AArch64::SEH_PrologEnd:
  case AArch64::SEH_EpilogStart:
  case AArch64::SEH_EpilogEnd:
  case AArch64::SEH_PACSignLR:
  case AArch64::SEH_SaveAnyRegI:
  case AArch64::SEH_SaveAnyRegIP:
  case AArch64::SEH_SaveAnyRegQP:
  case AArch64::SEH_SaveAnyRegQPX:
  case AArch64::SEH_AllocZ:
  case AArch64::SEH_SaveZReg:
  case AArch64::SEH_SavePReg:
    return true;
  }
}
1441
1442bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
1443 Register &SrcReg, Register &DstReg,
1444 unsigned &SubIdx) const {
1445 switch (MI.getOpcode()) {
1446 default:
1447 return false;
1448 case AArch64::SBFMXri: // aka sxtw
1449 case AArch64::UBFMXri: // aka uxtw
1450 // Check for the 32 -> 64 bit extension case, these instructions can do
1451 // much more.
1452 if (MI.getOperand(i: 2).getImm() != 0 || MI.getOperand(i: 3).getImm() != 31)
1453 return false;
1454 // This is a signed or unsigned 32 -> 64 bit extension.
1455 SrcReg = MI.getOperand(i: 1).getReg();
1456 DstReg = MI.getOperand(i: 0).getReg();
1457 SubIdx = AArch64::sub_32;
1458 return true;
1459 }
1460}
1461
1462bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
1463 const MachineInstr &MIa, const MachineInstr &MIb) const {
1464 const TargetRegisterInfo *TRI = &getRegisterInfo();
1465 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1466 int64_t OffsetA = 0, OffsetB = 0;
1467 TypeSize WidthA(0, false), WidthB(0, false);
1468 bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1469
1470 assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1471 assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1472
1473 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
1474 MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1475 return false;
1476
1477 // Retrieve the base, offset from the base and width. Width
1478 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
1479 // base are identical, and the offset of a lower memory access +
1480 // the width doesn't overlap the offset of a higher memory access,
1481 // then the memory accesses are different.
1482 // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1483 // are assumed to have the same scale (vscale).
1484 if (getMemOperandWithOffsetWidth(MI: MIa, BaseOp&: BaseOpA, Offset&: OffsetA, OffsetIsScalable&: OffsetAIsScalable,
1485 Width&: WidthA, TRI) &&
1486 getMemOperandWithOffsetWidth(MI: MIb, BaseOp&: BaseOpB, Offset&: OffsetB, OffsetIsScalable&: OffsetBIsScalable,
1487 Width&: WidthB, TRI)) {
1488 if (BaseOpA->isIdenticalTo(Other: *BaseOpB) &&
1489 OffsetAIsScalable == OffsetBIsScalable) {
1490 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1491 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1492 TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1493 if (LowWidth.isScalable() == OffsetAIsScalable &&
1494 LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
1495 return true;
1496 }
1497 }
1498 return false;
1499}
1500
1501bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1502 const MachineBasicBlock *MBB,
1503 const MachineFunction &MF) const {
1504 if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1505 return true;
1506
1507 // Do not move an instruction that can be recognized as a branch target.
1508 if (hasBTISemantics(MI))
1509 return true;
1510
1511 switch (MI.getOpcode()) {
1512 case AArch64::HINT:
1513 // CSDB hints are scheduling barriers.
1514 if (MI.getOperand(i: 0).getImm() == 0x14)
1515 return true;
1516 break;
1517 case AArch64::DSB:
1518 case AArch64::ISB:
1519 // DSB and ISB also are scheduling barriers.
1520 return true;
1521 case AArch64::MSRpstatesvcrImm1:
1522 // SMSTART and SMSTOP are also scheduling barriers.
1523 return true;
1524 default:;
1525 }
1526 if (isSEHInstruction(MI))
1527 return true;
1528 auto Next = std::next(x: MI.getIterator());
1529 return Next != MBB->end() && Next->isCFIInstruction();
1530}
1531
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
                                      Register &SrcReg2, int64_t &CmpMask,
                                      int64_t &CmpValue) const {
  // The first operand can be a frame index where we'd normally expect a
  // register.
  // FIXME: Pass subregisters out of analyzeCompare
  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
  if (!MI.getOperand(i: 1).isReg() || MI.getOperand(i: 1).getSubReg())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  // Predicate tests: both sources are registers, operand 0 is the governing
  // predicate.
  case AArch64::PTEST_PP:
  case AArch64::PTEST_PP_ANY:
  case AArch64::PTEST_PP_FIRST:
    SrcReg = MI.getOperand(i: 0).getReg();
    SrcReg2 = MI.getOperand(i: 1).getReg();
    if (MI.getOperand(i: 2).getSubReg())
      return false;

    // Not sure about the mask and value for now...
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  // Register-register compares (possibly shifted/extended): report both
  // source registers and a zero compare value.
  case AArch64::SUBSWrr:
  case AArch64::SUBSWrs:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXrs:
  case AArch64::SUBSXrx:
  case AArch64::ADDSWrr:
  case AArch64::ADDSWrs:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXrs:
  case AArch64::ADDSXrx:
    // Replace SUBSWrr with SUBWrr if NZCV is not used.
    SrcReg = MI.getOperand(i: 1).getReg();
    SrcReg2 = MI.getOperand(i: 2).getReg();

    // FIXME: Pass subregisters out of analyzeCompare
    if (MI.getOperand(i: 2).getSubReg())
      return false;

    CmpMask = ~0;
    CmpValue = 0;
    return true;
  // Register-immediate compares: the immediate is reported verbatim.
  case AArch64::SUBSWri:
  case AArch64::ADDSWri:
  case AArch64::SUBSXri:
  case AArch64::ADDSXri:
    SrcReg = MI.getOperand(i: 1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = MI.getOperand(i: 2).getImm();
    return true;
  case AArch64::ANDSWri:
  case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the others xxxS
    // instructions, so decode the logical-immediate form first.
    SrcReg = MI.getOperand(i: 1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = AArch64_AM::decodeLogicalImmediate(
        val: MI.getOperand(i: 2).getImm(),
        regSize: MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
    return true;
  }

  return false;
}
1607
1608static bool UpdateOperandRegClass(MachineInstr &Instr) {
1609 MachineBasicBlock *MBB = Instr.getParent();
1610 assert(MBB && "Can't get MachineBasicBlock here");
1611 MachineFunction *MF = MBB->getParent();
1612 assert(MF && "Can't get MachineFunction here");
1613 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1614 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1615 MachineRegisterInfo *MRI = &MF->getRegInfo();
1616
1617 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1618 ++OpIdx) {
1619 MachineOperand &MO = Instr.getOperand(i: OpIdx);
1620 const TargetRegisterClass *OpRegCstraints =
1621 Instr.getRegClassConstraint(OpIdx, TII, TRI);
1622
1623 // If there's no constraint, there's nothing to do.
1624 if (!OpRegCstraints)
1625 continue;
1626 // If the operand is a frame index, there's nothing to do here.
1627 // A frame index operand will resolve correctly during PEI.
1628 if (MO.isFI())
1629 continue;
1630
1631 assert(MO.isReg() &&
1632 "Operand has register constraints without being a register!");
1633
1634 Register Reg = MO.getReg();
1635 if (Reg.isPhysical()) {
1636 if (!OpRegCstraints->contains(Reg))
1637 return false;
1638 } else if (!OpRegCstraints->hasSubClassEq(RC: MRI->getRegClass(Reg)) &&
1639 !MRI->constrainRegClass(Reg, RC: OpRegCstraints))
1640 return false;
1641 }
1642
1643 return true;
1644}
1645
/// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible to do the actual
/// substitution and legality checking.
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
  // Don't convert all compare instructions, because for some the zero register
  // encoding becomes the sp register.
  bool MIDefinesZeroReg = false;
  if (MI.definesRegister(Reg: AArch64::WZR, /*TRI=*/nullptr) ||
      MI.definesRegister(Reg: AArch64::XZR, /*TRI=*/nullptr))
    MIDefinesZeroReg = true;

  switch (MI.getOpcode()) {
  default:
    return MI.getOpcode();
  // For the ri/rs forms, register 31 in the destination of the non-S variant
  // means sp rather than wzr/xzr, so keep the S form when MI defines the
  // zero register. The rr/rx forms have no such ambiguity.
  case AArch64::ADDSWrr:
    return AArch64::ADDWrr;
  case AArch64::ADDSWri:
    return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
  case AArch64::ADDSWrs:
    return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
  case AArch64::ADDSWrx:
    return AArch64::ADDWrx;
  case AArch64::ADDSXrr:
    return AArch64::ADDXrr;
  case AArch64::ADDSXri:
    return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
  case AArch64::ADDSXrs:
    return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
  case AArch64::ADDSXrx:
    return AArch64::ADDXrx;
  case AArch64::SUBSWrr:
    return AArch64::SUBWrr;
  case AArch64::SUBSWri:
    return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
  case AArch64::SUBSWrs:
    return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
  case AArch64::SUBSWrx:
    return AArch64::SUBWrx;
  case AArch64::SUBSXrr:
    return AArch64::SUBXrr;
  case AArch64::SUBSXri:
    return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
  case AArch64::SUBSXrs:
    return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
  case AArch64::SUBSXrx:
    return AArch64::SUBXrx;
  }
}
1694
// Bitmask selecting which kinds of condition-flag accesses to look for.
enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1696
/// True when condition flags are accessed (either by writing or reading)
/// on the instruction trace starting at From and ending at To.
///
/// \param AccessToCheck restricts the search to writes, reads, or both.
///
/// Note: If From and To are from different blocks it's assumed CC are accessed
/// on the path.
static bool areCFlagsAccessedBetweenInstrs(
    MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
    const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
  // Early exit if To is at the beginning of the BB.
  if (To == To->getParent()->begin())
    return true;

  // Check whether the instructions are in the same basic block
  // If not, assume the condition flags might get modified somewhere.
  if (To->getParent() != From->getParent())
    return true;

  // From must be above To.
  assert(std::any_of(
      ++To.getReverse(), To->getParent()->rend(),
      [From](MachineInstr &MI) { return MI.getIterator() == From; }));

  // We iterate backward starting at \p To until we hit \p From, checking each
  // instruction for an NZCV access of the requested kind(s).
  for (const MachineInstr &Instr :
       instructionsWithoutDebug(It: ++To.getReverse(), End: From.getReverse())) {
    if (((AccessToCheck & AK_Write) &&
         Instr.modifiesRegister(Reg: AArch64::NZCV, TRI)) ||
        ((AccessToCheck & AK_Read) && Instr.readsRegister(Reg: AArch64::NZCV, TRI)))
      return true;
  }
  return false;
}
1729
1730std::optional<unsigned>
1731AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
1732 MachineInstr *Pred,
1733 const MachineRegisterInfo *MRI) const {
1734 unsigned MaskOpcode = Mask->getOpcode();
1735 unsigned PredOpcode = Pred->getOpcode();
1736 bool PredIsPTestLike = isPTestLikeOpcode(Opc: PredOpcode);
1737 bool PredIsWhileLike = isWhileOpcode(Opc: PredOpcode);
1738
1739 if (PredIsWhileLike) {
1740 // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
1741 // instruction and the condition is "any" since WHILcc does an implicit
1742 // PTEST(ALL, PG) check and PG is always a subset of ALL.
1743 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1744 return PredOpcode;
1745
1746 // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1747 // redundant since WHILE performs an implicit PTEST with an all active
1748 // mask.
1749 if (isPTrueOpcode(Opc: MaskOpcode) && Mask->getOperand(i: 1).getImm() == 31 &&
1750 getElementSizeForOpcode(Opc: MaskOpcode) ==
1751 getElementSizeForOpcode(Opc: PredOpcode))
1752 return PredOpcode;
1753
1754 // For PTEST_FIRST(PTRUE_ALL, WHILE), the PTEST_FIRST is redundant since
1755 // WHILEcc performs an implicit PTEST with an all active mask, setting
1756 // the N flag as the PTEST_FIRST would.
1757 if (PTest->getOpcode() == AArch64::PTEST_PP_FIRST &&
1758 isPTrueOpcode(Opc: MaskOpcode) && Mask->getOperand(i: 1).getImm() == 31)
1759 return PredOpcode;
1760
1761 return {};
1762 }
1763
1764 if (PredIsPTestLike) {
1765 // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1766 // instruction that sets the flags as PTEST would and the condition is
1767 // "any" since PG is always a subset of the governing predicate of the
1768 // ptest-like instruction.
1769 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1770 return PredOpcode;
1771
1772 auto PTestLikeMask = MRI->getUniqueVRegDef(Reg: Pred->getOperand(i: 1).getReg());
1773
1774 // If the PTEST like instruction's general predicate is not `Mask`, attempt
1775 // to look through a copy and try again. This is because some instructions
1776 // take a predicate whose register class is a subset of its result class.
1777 if (Mask != PTestLikeMask && PTestLikeMask->isFullCopy() &&
1778 PTestLikeMask->getOperand(i: 1).getReg().isVirtual())
1779 PTestLikeMask =
1780 MRI->getUniqueVRegDef(Reg: PTestLikeMask->getOperand(i: 1).getReg());
1781
1782 // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
1783 // the element size matches and either the PTEST_LIKE instruction uses
1784 // the same all active mask or the condition is "any".
1785 if (isPTrueOpcode(Opc: MaskOpcode) && Mask->getOperand(i: 1).getImm() == 31 &&
1786 getElementSizeForOpcode(Opc: MaskOpcode) ==
1787 getElementSizeForOpcode(Opc: PredOpcode)) {
1788 if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1789 return PredOpcode;
1790 }
1791
1792 // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1793 // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1794 // on 8-bit predicates like the PTEST. Otherwise, for instructions like
1795 // compare that also support 16/32/64-bit predicates, the implicit PTEST
1796 // performed by the compare could consider fewer lanes for these element
1797 // sizes.
1798 //
1799 // For example, consider
1800 //
1801 // ptrue p0.b ; P0=1111-1111-1111-1111
1802 // index z0.s, #0, #1 ; Z0=<0,1,2,3>
1803 // index z1.s, #1, #1 ; Z1=<1,2,3,4>
1804 // cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001
1805 // ; ^ last active
1806 // ptest p0, p1.b ; P1=0001-0001-0001-0001
1807 // ; ^ last active
1808 //
1809 // where the compare generates a canonical all active 32-bit predicate
1810 // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1811 // active flag, whereas the PTEST instruction with the same mask doesn't.
1812 // For PTEST_ANY this doesn't apply as the flags in this case would be
1813 // identical regardless of element size.
1814 uint64_t PredElementSize = getElementSizeForOpcode(Opc: PredOpcode);
1815 if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
1816 PTest->getOpcode() == AArch64::PTEST_PP_ANY))
1817 return PredOpcode;
1818
1819 return {};
1820 }
1821
1822 // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1823 // opcode so the PTEST becomes redundant.
1824 switch (PredOpcode) {
1825 case AArch64::AND_PPzPP:
1826 case AArch64::BIC_PPzPP:
1827 case AArch64::EOR_PPzPP:
1828 case AArch64::NAND_PPzPP:
1829 case AArch64::NOR_PPzPP:
1830 case AArch64::ORN_PPzPP:
1831 case AArch64::ORR_PPzPP:
1832 case AArch64::BRKA_PPzP:
1833 case AArch64::BRKPA_PPzPP:
1834 case AArch64::BRKB_PPzP:
1835 case AArch64::BRKPB_PPzPP:
1836 case AArch64::RDFFR_PPz: {
1837 // Check to see if our mask is the same. If not the resulting flag bits
1838 // may be different and we can't remove the ptest.
1839 auto *PredMask = MRI->getUniqueVRegDef(Reg: Pred->getOperand(i: 1).getReg());
1840 if (Mask != PredMask)
1841 return {};
1842 break;
1843 }
1844 case AArch64::BRKN_PPzP: {
1845 // BRKN uses an all active implicit mask to set flags unlike the other
1846 // flag-setting instructions.
1847 // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1848 if ((MaskOpcode != AArch64::PTRUE_B) ||
1849 (Mask->getOperand(i: 1).getImm() != 31))
1850 return {};
1851 break;
1852 }
1853 case AArch64::PTRUE_B:
1854 // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1855 break;
1856 default:
1857 // Bail out if we don't recognize the input
1858 return {};
1859 }
1860
1861 return convertToFlagSettingOpc(Opc: PredOpcode);
1862}
1863
/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
/// operation which could set the flags in an identical manner
///
/// \param PTest    the PTEST_PP / PTEST_PP_ANY / PTEST_PP_FIRST instruction.
/// \param MaskReg  virtual register holding the PTEST's governing predicate.
/// \param PredReg  virtual register holding the predicate being tested.
/// \returns true if the PTEST was erased; the producer of \p PredReg may have
/// been rewritten to an equivalent flag-setting opcode in the process.
bool AArch64InstrInfo::optimizePTestInstr(
    MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
    const MachineRegisterInfo *MRI) const {
  auto *Mask = MRI->getUniqueVRegDef(Reg: MaskReg);
  auto *Pred = MRI->getUniqueVRegDef(Reg: PredReg);

  if (Pred->isCopy() && PTest->getOpcode() == AArch64::PTEST_PP_FIRST) {
    // Instructions which return a multi-vector (e.g. WHILECC_x2) require copies
    // before the branch to extract each subregister.
    auto Op = Pred->getOperand(i: 1);
    if (Op.isReg() && Op.getReg().isVirtual() &&
        Op.getSubReg() == AArch64::psub0)
      Pred = MRI->getUniqueVRegDef(Reg: Op.getReg());
  }

  // Remember the opcode before querying, so we can tell below whether the
  // producer must be swapped for its flag-setting variant.
  unsigned PredOpcode = Pred->getOpcode();
  auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
  if (!NewOp)
    return false;

  const TargetRegisterInfo *TRI = &getRegisterInfo();

  // If another instruction between Pred and PTest accesses flags, don't remove
  // the ptest or update the earlier instruction to modify them.
  if (areCFlagsAccessedBetweenInstrs(From: Pred, To: PTest, TRI))
    return false;

  // If we pass all the checks, it's safe to remove the PTEST and use the flags
  // as they are prior to PTEST. Sometimes this requires the tested PTEST
  // operand to be replaced with an equivalent instruction that also sets the
  // flags.
  PTest->eraseFromParent();
  if (*NewOp != PredOpcode) {
    Pred->setDesc(get(Opcode: *NewOp));
    bool succeeded = UpdateOperandRegClass(Instr&: *Pred);
    (void)succeeded;
    assert(succeeded && "Operands have incompatible register classes!");
    // The new opcode sets NZCV; record the implicit def on the instruction.
    Pred->addRegisterDefined(Reg: AArch64::NZCV, RegInfo: TRI);
  }

  // Ensure that the flags def is live.
  if (Pred->registerDefIsDead(Reg: AArch64::NZCV, TRI)) {
    unsigned i = 0, e = Pred->getNumOperands();
    for (; i != e; ++i) {
      MachineOperand &MO = Pred->getOperand(i);
      if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
        // The PTEST's users now read this def, so it is no longer dead.
        MO.setIsDead(false);
        break;
      }
    }
  }
  return true;
}
1919
/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It can be a true compare
/// instruction when there are no uses of its destination register.
///
/// The following steps are tried in order:
/// 1. Convert CmpInstr into an unconditional version.
/// 2. Remove CmpInstr if an earlier instruction already produces the needed
///    condition codes, or can be converted into an instruction that does.
/// Only comparison with zero is supported.
bool AArch64InstrInfo::optimizeCompareInstr(
    MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
    int64_t CmpValue, const MachineRegisterInfo *MRI) const {
  assert(CmpInstr.getParent());
  assert(MRI);

  // Replace SUBSWrr with SUBWrr if NZCV is not used.
  int DeadNZCVIdx =
      CmpInstr.findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true);
  if (DeadNZCVIdx != -1) {
    // If the GPR result is also unused (written to WZR/XZR), the instruction
    // has no observable effect at all and can simply be deleted.
    if (CmpInstr.definesRegister(Reg: AArch64::WZR, /*TRI=*/nullptr) ||
        CmpInstr.definesRegister(Reg: AArch64::XZR, /*TRI=*/nullptr)) {
      CmpInstr.eraseFromParent();
      return true;
    }
    unsigned Opc = CmpInstr.getOpcode();
    unsigned NewOpc = convertToNonFlagSettingOpc(MI: CmpInstr);
    if (NewOpc == Opc)
      return false;
    const MCInstrDesc &MCID = get(Opcode: NewOpc);
    CmpInstr.setDesc(MCID);
    // Drop the now-meaningless dead NZCV def operand.
    CmpInstr.removeOperand(OpNo: DeadNZCVIdx);
    bool succeeded = UpdateOperandRegClass(Instr&: CmpInstr);
    (void)succeeded;
    assert(succeeded && "Some operands reg class are incompatible!");
    return true;
  }

  // SVE PTEST instructions get their own removal logic.
  if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
      CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY ||
      CmpInstr.getOpcode() == AArch64::PTEST_PP_FIRST)
    return optimizePTestInstr(PTest: &CmpInstr, MaskReg: SrcReg, PredReg: SrcReg2, MRI);

  // Only register-vs-immediate compares are handled below.
  if (SrcReg2 != 0)
    return false;

  // CmpInstr is a Compare instruction if destination register is not used.
  if (!MRI->use_nodbg_empty(RegNo: CmpInstr.getOperand(i: 0).getReg()))
    return false;

  if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, MRI: *MRI))
    return true;
  return (CmpValue == 0 || CmpValue == 1) &&
         removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, MRI: *MRI);
}
1976
/// Get opcode of S version of Instr.
/// If Instr is S version its opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
/// or we are not interested in it.
static unsigned sForm(MachineInstr &Instr) {
  switch (Instr.getOpcode()) {
  default:
    return AArch64::INSTRUCTION_LIST_END;

  // Already flag-setting: the opcode is its own S form.
  case AArch64::ADDSWrr:
  case AArch64::ADDSWri:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXri:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::SUBSWrr:
  case AArch64::SUBSWri:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXri:
  case AArch64::SUBSXrx:
  case AArch64::ANDSWri:
  case AArch64::ANDSWrr:
  case AArch64::ANDSWrs:
  case AArch64::ANDSXri:
  case AArch64::ANDSXrr:
  case AArch64::ANDSXrs:
  case AArch64::BICSWrr:
  case AArch64::BICSXrr:
  case AArch64::BICSWrs:
  case AArch64::BICSXrs:
    return Instr.getOpcode();

  // Map each supported non-flag-setting opcode to its S counterpart.
  case AArch64::ADDWrr:
    return AArch64::ADDSWrr;
  case AArch64::ADDWri:
    return AArch64::ADDSWri;
  case AArch64::ADDXrr:
    return AArch64::ADDSXrr;
  case AArch64::ADDXri:
    return AArch64::ADDSXri;
  case AArch64::ADDWrx:
    return AArch64::ADDSWrx;
  case AArch64::ADDXrx:
    return AArch64::ADDSXrx;
  case AArch64::ADCWr:
    return AArch64::ADCSWr;
  case AArch64::ADCXr:
    return AArch64::ADCSXr;
  case AArch64::SUBWrr:
    return AArch64::SUBSWrr;
  case AArch64::SUBWri:
    return AArch64::SUBSWri;
  case AArch64::SUBXrr:
    return AArch64::SUBSXrr;
  case AArch64::SUBXri:
    return AArch64::SUBSXri;
  case AArch64::SUBWrx:
    return AArch64::SUBSWrx;
  case AArch64::SUBXrx:
    return AArch64::SUBSXrx;
  case AArch64::SBCWr:
    return AArch64::SBCSWr;
  case AArch64::SBCXr:
    return AArch64::SBCSXr;
  case AArch64::ANDWri:
    return AArch64::ANDSWri;
  case AArch64::ANDXri:
    return AArch64::ANDSXri;
  case AArch64::ANDWrr:
    return AArch64::ANDSWrr;
  case AArch64::ANDWrs:
    return AArch64::ANDSWrs;
  case AArch64::ANDXrr:
    return AArch64::ANDSXrr;
  case AArch64::ANDXrs:
    return AArch64::ANDSXrs;
  case AArch64::BICWrr:
    return AArch64::BICSWrr;
  case AArch64::BICXrr:
    return AArch64::BICSXrr;
  case AArch64::BICWrs:
    return AArch64::BICSWrs;
  case AArch64::BICXrs:
    return AArch64::BICSXrs;
  }
}
2064
2065/// Check if AArch64::NZCV should be alive in successors of MBB.
2066static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
2067 for (auto *BB : MBB->successors())
2068 if (BB->isLiveIn(Reg: AArch64::NZCV))
2069 return true;
2070 return false;
2071}
2072
2073/// \returns The condition code operand index for \p Instr if it is a branch
2074/// or select and -1 otherwise.
2075static int
2076findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
2077 switch (Instr.getOpcode()) {
2078 default:
2079 return -1;
2080
2081 case AArch64::Bcc: {
2082 int Idx = Instr.findRegisterUseOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr);
2083 assert(Idx >= 2);
2084 return Idx - 2;
2085 }
2086
2087 case AArch64::CSINVWr:
2088 case AArch64::CSINVXr:
2089 case AArch64::CSINCWr:
2090 case AArch64::CSINCXr:
2091 case AArch64::CSELWr:
2092 case AArch64::CSELXr:
2093 case AArch64::CSNEGWr:
2094 case AArch64::CSNEGXr:
2095 case AArch64::FCSELSrrr:
2096 case AArch64::FCSELDrrr: {
2097 int Idx = Instr.findRegisterUseOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr);
2098 assert(Idx >= 1);
2099 return Idx - 1;
2100 }
2101 }
2102}
2103
2104/// Find a condition code used by the instruction.
2105/// Returns AArch64CC::Invalid if either the instruction does not use condition
2106/// codes or we don't optimize CmpInstr in the presence of such instructions.
2107static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
2108 int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
2109 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
2110 Instr.getOperand(i: CCIdx).getImm())
2111 : AArch64CC::Invalid;
2112}
2113
2114static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
2115 assert(CC != AArch64CC::Invalid);
2116 UsedNZCV UsedFlags;
2117 switch (CC) {
2118 default:
2119 break;
2120
2121 case AArch64CC::EQ: // Z set
2122 case AArch64CC::NE: // Z clear
2123 UsedFlags.Z = true;
2124 break;
2125
2126 case AArch64CC::HI: // Z clear and C set
2127 case AArch64CC::LS: // Z set or C clear
2128 UsedFlags.Z = true;
2129 [[fallthrough]];
2130 case AArch64CC::HS: // C set
2131 case AArch64CC::LO: // C clear
2132 UsedFlags.C = true;
2133 break;
2134
2135 case AArch64CC::MI: // N set
2136 case AArch64CC::PL: // N clear
2137 UsedFlags.N = true;
2138 break;
2139
2140 case AArch64CC::VS: // V set
2141 case AArch64CC::VC: // V clear
2142 UsedFlags.V = true;
2143 break;
2144
2145 case AArch64CC::GT: // Z clear, N and V the same
2146 case AArch64CC::LE: // Z set, N and V differ
2147 UsedFlags.Z = true;
2148 [[fallthrough]];
2149 case AArch64CC::GE: // N and V the same
2150 case AArch64CC::LT: // N and V differ
2151 UsedFlags.N = true;
2152 UsedFlags.V = true;
2153 break;
2154 }
2155 return UsedFlags;
2156}
2157
/// \returns Conditions flags used after \p CmpInstr in its MachineBB if NZCV
/// flags are not alive in successors of the same \p CmpInstr and \p MI parent.
/// \returns std::nullopt otherwise.
///
/// Collect instructions using that flags in \p CCUseInstrs if provided.
std::optional<UsedNZCV>
llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
                       const TargetRegisterInfo &TRI,
                       SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
  // Both instructions must sit in the same block for the linear scan below to
  // see every NZCV reader between CmpInstr and the next NZCV writer.
  MachineBasicBlock *CmpParent = CmpInstr.getParent();
  if (MI.getParent() != CmpParent)
    return std::nullopt;

  // If a successor expects NZCV live-in, flag uses escape this block and we
  // cannot reason about them locally.
  if (areCFlagsAliveInSuccessors(MBB: CmpParent))
    return std::nullopt;

  // Accumulate the union of flags read by all NZCV users after CmpInstr,
  // stopping at the first instruction that redefines NZCV.
  UsedNZCV NZCVUsedAfterCmp;
  for (MachineInstr &Instr : instructionsWithoutDebug(
           It: std::next(x: CmpInstr.getIterator()), End: CmpParent->instr_end())) {
    if (Instr.readsRegister(Reg: AArch64::NZCV, TRI: &TRI)) {
      AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
      if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
        return std::nullopt;
      NZCVUsedAfterCmp |= getUsedNZCV(CC);
      if (CCUseInstrs)
        CCUseInstrs->push_back(Elt: &Instr);
    }
    // A redefinition of NZCV ends the region of interest.
    if (Instr.modifiesRegister(Reg: AArch64::NZCV, TRI: &TRI))
      break;
  }
  return NZCVUsedAfterCmp;
}
2190
2191static bool isADDSRegImm(unsigned Opcode) {
2192 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
2193}
2194
2195static bool isSUBSRegImm(unsigned Opcode) {
2196 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
2197}
2198
2199static bool isANDOpcode(MachineInstr &MI) {
2200 unsigned Opc = sForm(Instr&: MI);
2201 switch (Opc) {
2202 case AArch64::ANDSWri:
2203 case AArch64::ANDSWrr:
2204 case AArch64::ANDSWrs:
2205 case AArch64::ANDSXri:
2206 case AArch64::ANDSXrr:
2207 case AArch64::ANDSXrs:
2208 case AArch64::BICSWrr:
2209 case AArch64::BICSXrr:
2210 case AArch64::BICSWrs:
2211 case AArch64::BICSXrs:
2212 return true;
2213 default:
2214 return false;
2215 }
2216}
2217
/// Check if CmpInstr can be substituted by MI.
///
/// CmpInstr can be substituted:
/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
/// - and, MI and CmpInstr are from the same MachineBB
/// - and, condition flags are not alive in successors of the CmpInstr parent
/// - and, if MI opcode is the S form there must be no defs of flags between
///        MI and CmpInstr
///        or if MI opcode is not the S form there must be neither defs of flags
///        nor uses of flags between MI and CmpInstr.
/// - and, if C/V flags are not used after CmpInstr
///        or if N flag is used but MI produces poison value if signed overflow
///        occurs.
static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
                                       const TargetRegisterInfo &TRI) {
  // NOTE this assertion guarantees that MI.getOpcode() is add or subtraction
  // that may or may not set flags.
  assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);

  const unsigned CmpOpcode = CmpInstr.getOpcode();
  if (!isADDSRegImm(Opcode: CmpOpcode) && !isSUBSRegImm(Opcode: CmpOpcode))
    return false;

  assert((CmpInstr.getOperand(2).isImm() &&
          CmpInstr.getOperand(2).getImm() == 0) &&
         "Caller guarantees that CmpInstr compares with constant 0");

  // C is computed differently by MI and the compare, so any use of C after
  // CmpInstr rules out the substitution.
  std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
  if (!NZVCUsed || NZVCUsed->C)
    return false;

  // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
  // '%vreg = add ...' or '%vreg = sub ...'.
  // Condition flag V is used to indicate signed overflow.
  // 1) MI and CmpInstr set N and V to the same value.
  // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
  //    signed overflow occurs, so CmpInstr could still be simplified away.
  // Note that Ands and Bics instructions always clear the V flag.
  if (NZVCUsed->V && !MI.getFlag(Flag: MachineInstr::NoSWrap) && !isANDOpcode(MI))
    return false;

  // If MI already sets flags, only intervening writes matter; otherwise any
  // flag access in between would observe/clobber the wrong NZCV.
  AccessKind AccessToCheck = AK_Write;
  if (sForm(Instr&: MI) != MI.getOpcode())
    AccessToCheck = AK_All;
  return !areCFlagsAccessedBetweenInstrs(From: &MI, To: &CmpInstr, TRI: &TRI, AccessToCheck);
}
2264
/// Substitute an instruction comparing to zero with another instruction
/// which produces needed condition flags.
///
/// Return true on success.
bool AArch64InstrInfo::substituteCmpToZero(
    MachineInstr &CmpInstr, unsigned SrcReg,
    const MachineRegisterInfo &MRI) const {
  // Get the unique definition of SrcReg.
  MachineInstr *MI = MRI.getUniqueVRegDef(Reg: SrcReg);
  if (!MI)
    return false;

  const TargetRegisterInfo &TRI = getRegisterInfo();

  // The defining instruction must have a flag-setting (S) variant.
  unsigned NewOpc = sForm(Instr&: *MI);
  if (NewOpc == AArch64::INSTRUCTION_LIST_END)
    return false;

  if (!canInstrSubstituteCmpInstr(MI&: *MI, CmpInstr, TRI))
    return false;

  // Update the instruction to set NZCV.
  MI->setDesc(get(Opcode: NewOpc));
  CmpInstr.eraseFromParent();
  bool succeeded = UpdateOperandRegClass(Instr&: *MI);
  (void)succeeded;
  assert(succeeded && "Some operands reg class are incompatible!");
  // Record the implicit NZCV def introduced by the S-form opcode.
  MI->addRegisterDefined(Reg: AArch64::NZCV, RegInfo: &TRI);
  return true;
}
2295
/// \returns True if \p CmpInstr can be removed.
///
/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
/// codes used in \p CCUseInstrs must be inverted.
static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
                                 int CmpValue, const TargetRegisterInfo &TRI,
                                 SmallVectorImpl<MachineInstr *> &CCUseInstrs,
                                 bool &IsInvertCC) {
  assert((CmpValue == 0 || CmpValue == 1) &&
         "Only comparisons to 0 or 1 considered for removal!");

  // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
  unsigned MIOpc = MI.getOpcode();
  if (MIOpc == AArch64::CSINCWr) {
    if (MI.getOperand(i: 1).getReg() != AArch64::WZR ||
        MI.getOperand(i: 2).getReg() != AArch64::WZR)
      return false;
  } else if (MIOpc == AArch64::CSINCXr) {
    if (MI.getOperand(i: 1).getReg() != AArch64::XZR ||
        MI.getOperand(i: 2).getReg() != AArch64::XZR)
      return false;
  } else {
    return false;
  }
  AArch64CC::CondCode MICC = findCondCodeUsedByInstr(Instr: MI);
  if (MICC == AArch64CC::Invalid)
    return false;

  // NZCV needs to be defined
  if (MI.findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true) != -1)
    return false;

  // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
  const unsigned CmpOpcode = CmpInstr.getOpcode();
  bool IsSubsRegImm = isSUBSRegImm(Opcode: CmpOpcode);
  if (CmpValue && !IsSubsRegImm)
    return false;
  if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(Opcode: CmpOpcode))
    return false;

  // MI conditions allowed: eq, ne, mi, pl
  UsedNZCV MIUsedNZCV = getUsedNZCV(CC: MICC);
  if (MIUsedNZCV.C || MIUsedNZCV.V)
    return false;

  std::optional<UsedNZCV> NZCVUsedAfterCmp =
      examineCFlagsUse(MI, CmpInstr, TRI, CCUseInstrs: &CCUseInstrs);
  // Condition flags are not used in CmpInstr basic block successors and only
  // Z or N flags allowed to be used after CmpInstr within its basic block
  if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
    return false;
  // Z or N flag used after CmpInstr must correspond to the flag used in MI
  if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
      (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
    return false;
  // If CmpInstr is comparison to zero MI conditions are limited to eq, ne
  if (MIUsedNZCV.N && !CmpValue)
    return false;

  // There must be no defs of flags between MI and CmpInstr
  if (areCFlagsAccessedBetweenInstrs(From: &MI, To: &CmpInstr, TRI: &TRI, AccessToCheck: AK_Write))
    return false;

  // Condition code is inverted in the following cases:
  // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
  // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
  IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
               (!CmpValue && MICC == AArch64CC::NE);
  return true;
}
2366
/// Remove comparison in csinc-cmp sequence
///
/// Examples:
/// 1. \code
///   csinc w9, wzr, wzr, ne
///   cmp   w9, #0
///   b.eq
///    \endcode
/// to
///    \code
///   csinc w9, wzr, wzr, ne
///   b.ne
///    \endcode
///
/// 2. \code
///   csinc x2, xzr, xzr, mi
///   cmp   x2, #1
///   b.pl
///    \endcode
/// to
///    \code
///   csinc x2, xzr, xzr, mi
///   b.pl
///    \endcode
///
/// \param CmpInstr comparison instruction
/// \return True when comparison removed
bool AArch64InstrInfo::removeCmpToZeroOrOne(
    MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
    const MachineRegisterInfo &MRI) const {
  // SrcReg must have a single defining CSINC for the transform to be valid.
  MachineInstr *MI = MRI.getUniqueVRegDef(Reg: SrcReg);
  if (!MI)
    return false;
  const TargetRegisterInfo &TRI = getRegisterInfo();
  SmallVector<MachineInstr *, 4> CCUseInstrs;
  bool IsInvertCC = false;
  if (!canCmpInstrBeRemoved(MI&: *MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
                            IsInvertCC))
    return false;
  // Make transformation
  CmpInstr.eraseFromParent();
  if (IsInvertCC) {
    // Invert condition codes in CmpInstr CC users
    for (MachineInstr *CCUseInstr : CCUseInstrs) {
      int Idx = findCondCodeUseOperandIdxForBranchOrSelect(Instr: *CCUseInstr);
      assert(Idx >= 0 && "Unexpected instruction using CC.");
      MachineOperand &CCOperand = CCUseInstr->getOperand(i: Idx);
      AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
          Code: static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
      CCOperand.setImm(CCUse);
    }
  }
  return true;
}
2421
/// Expand the LOAD_STACK_GUARD and CATCHRET pseudo instructions into real
/// AArch64 instruction sequences after register allocation.
///
/// \returns true if \p MI was one of the two handled pseudos and was expanded
/// (and erased); false otherwise so generic handling can proceed.
bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
      MI.getOpcode() != AArch64::CATCHRET)
    return false;

  MachineBasicBlock &MBB = *MI.getParent();
  auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
  auto TRI = Subtarget.getRegisterInfo();
  DebugLoc DL = MI.getDebugLoc();

  if (MI.getOpcode() == AArch64::CATCHRET) {
    // Skip to the first instruction before the epilog.
    const TargetInstrInfo *TII =
        MBB.getParent()->getSubtarget().getInstrInfo();
    MachineBasicBlock *TargetMBB = MI.getOperand(i: 0).getMBB();
    auto MBBI = MachineBasicBlock::iterator(MI);
    // Walk backwards over FrameDestroy instructions so the address
    // materialization below is inserted before the epilog sequence.
    MachineBasicBlock::iterator FirstEpilogSEH = std::prev(x: MBBI);
    while (FirstEpilogSEH->getFlag(Flag: MachineInstr::FrameDestroy) &&
           FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::prev(x: FirstEpilogSEH);
    if (FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::next(x: FirstEpilogSEH);
    // Materialize the address of TargetMBB into X0 via ADRP + ADD.
    BuildMI(BB&: MBB, I: FirstEpilogSEH, MIMD: DL, MCID: TII->get(Opcode: AArch64::ADRP))
        .addReg(RegNo: AArch64::X0, Flags: RegState::Define)
        .addMBB(MBB: TargetMBB);
    BuildMI(BB&: MBB, I: FirstEpilogSEH, MIMD: DL, MCID: TII->get(Opcode: AArch64::ADDXri))
        .addReg(RegNo: AArch64::X0, Flags: RegState::Define)
        .addReg(RegNo: AArch64::X0)
        .addMBB(MBB: TargetMBB)
        .addImm(Val: 0);
    TargetMBB->setMachineBlockAddressTaken();
    return true;
  }

  Register Reg = MI.getOperand(i: 0).getReg();
  Module &M = *MBB.getParent()->getFunction().getParent();
  if (M.getStackProtectorGuard() == "sysreg") {
    const AArch64SysReg::SysReg *SrcReg =
        AArch64SysReg::lookupSysRegByName(Name: M.getStackProtectorGuardReg());
    if (!SrcReg)
      report_fatal_error(reason: "Unknown SysReg for Stack Protector Guard Register");

    // mrs xN, sysreg
    BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MRS))
        .addDef(RegNo: Reg, Flags: RegState::Renamable)
        .addImm(Val: SrcReg->Encoding);
    int Offset = M.getStackProtectorGuardOffset();
    if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
      // ldr xN, [xN, #offset]
      BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui))
          .addDef(RegNo: Reg)
          .addUse(RegNo: Reg, Flags: RegState::Kill)
          .addImm(Val: Offset / 8);
    } else if (Offset >= -256 && Offset <= 255) {
      // ldur xN, [xN, #offset]
      BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDURXi))
          .addDef(RegNo: Reg)
          .addUse(RegNo: Reg, Flags: RegState::Kill)
          .addImm(Val: Offset);
    } else if (Offset >= -4095 && Offset <= 4095) {
      if (Offset > 0) {
        // add xN, xN, #offset
        BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::ADDXri))
            .addDef(RegNo: Reg)
            .addUse(RegNo: Reg, Flags: RegState::Kill)
            .addImm(Val: Offset)
            .addImm(Val: 0);
      } else {
        // sub xN, xN, #offset
        BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::SUBXri))
            .addDef(RegNo: Reg)
            .addUse(RegNo: Reg, Flags: RegState::Kill)
            .addImm(Val: -Offset)
            .addImm(Val: 0);
      }
      // ldr xN, [xN]
      BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui))
          .addDef(RegNo: Reg)
          .addUse(RegNo: Reg, Flags: RegState::Kill)
          .addImm(Val: 0);
    } else {
      // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
      // than 32760 (the LDRXui limit above).
      // It might be nice to use AArch64::MOVi32imm here, which would get
      // expanded in PreSched2 after PostRA, but our lone scratch Reg already
      // contains the MRS result. findScratchNonCalleeSaveRegister() in
      // AArch64FrameLowering might help us find such a scratch register
      // though. If we failed to find a scratch register, we could emit a
      // stream of add instructions to build up the immediate. Or, we could try
      // to insert a AArch64::MOVi32imm before register allocation so that we
      // didn't need to scavenge for a scratch register.
      report_fatal_error(reason: "Unable to encode Stack Protector Guard Offset");
    }
    MBB.erase(I: MI);
    return true;
  }

  // Global-variable stack guard: load the guard value via the appropriate
  // addressing sequence for the code model and global classification.
  const GlobalValue *GV =
      cast<GlobalValue>(Val: (*MI.memoperands_begin())->getValue());
  const TargetMachine &TM = MBB.getParent()->getTarget();
  unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
  const unsigned char MO_NC = AArch64II::MO_NC;

  if ((OpFlags & AArch64II::MO_GOT) != 0) {
    // GOT-indirect: load the address from the GOT, then load the value.
    BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LOADgot), DestReg: Reg)
        .addGlobalAddress(GV, Offset: 0, TargetFlags: OpFlags);
    if (Subtarget.isTargetILP32()) {
      // ILP32: pointers are 32-bit, so load through the 32-bit sub-register.
      unsigned Reg32 = TRI->getSubReg(Reg, Idx: AArch64::sub_32);
      BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRWui))
          .addDef(RegNo: Reg32, Flags: RegState::Dead)
          .addUse(RegNo: Reg, Flags: RegState::Kill)
          .addImm(Val: 0)
          .addMemOperand(MMO: *MI.memoperands_begin())
          .addDef(RegNo: Reg, Flags: RegState::Implicit);
    } else {
      BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui), DestReg: Reg)
          .addReg(RegNo: Reg, Flags: RegState::Kill)
          .addImm(Val: 0)
          .addMemOperand(MMO: *MI.memoperands_begin());
    }
  } else if (TM.getCodeModel() == CodeModel::Large) {
    // Large code model: build the full 64-bit address with MOVZ/MOVK.
    assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
    BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MOVZXi), DestReg: Reg)
        .addGlobalAddress(GV, Offset: 0, TargetFlags: AArch64II::MO_G0 | MO_NC)
        .addImm(Val: 0);
    BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MOVKXi), DestReg: Reg)
        .addReg(RegNo: Reg, Flags: RegState::Kill)
        .addGlobalAddress(GV, Offset: 0, TargetFlags: AArch64II::MO_G1 | MO_NC)
        .addImm(Val: 16);
    BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MOVKXi), DestReg: Reg)
        .addReg(RegNo: Reg, Flags: RegState::Kill)
        .addGlobalAddress(GV, Offset: 0, TargetFlags: AArch64II::MO_G2 | MO_NC)
        .addImm(Val: 32);
    BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MOVKXi), DestReg: Reg)
        .addReg(RegNo: Reg, Flags: RegState::Kill)
        .addGlobalAddress(GV, Offset: 0, TargetFlags: AArch64II::MO_G3)
        .addImm(Val: 48);
    BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui), DestReg: Reg)
        .addReg(RegNo: Reg, Flags: RegState::Kill)
        .addImm(Val: 0)
        .addMemOperand(MMO: *MI.memoperands_begin());
  } else if (TM.getCodeModel() == CodeModel::Tiny) {
    // Tiny code model: a single ADR reaches the global directly.
    BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::ADR), DestReg: Reg)
        .addGlobalAddress(GV, Offset: 0, TargetFlags: OpFlags);
  } else {
    // Small code model: ADRP page + page-offset load.
    BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::ADRP), DestReg: Reg)
        .addGlobalAddress(GV, Offset: 0, TargetFlags: OpFlags | AArch64II::MO_PAGE);
    unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
    if (Subtarget.isTargetILP32()) {
      unsigned Reg32 = TRI->getSubReg(Reg, Idx: AArch64::sub_32);
      BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRWui))
          .addDef(RegNo: Reg32, Flags: RegState::Dead)
          .addUse(RegNo: Reg, Flags: RegState::Kill)
          .addGlobalAddress(GV, Offset: 0, TargetFlags: LoFlags)
          .addMemOperand(MMO: *MI.memoperands_begin())
          .addDef(RegNo: Reg, Flags: RegState::Implicit);
    } else {
      BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui), DestReg: Reg)
          .addReg(RegNo: Reg, Flags: RegState::Kill)
          .addGlobalAddress(GV, Offset: 0, TargetFlags: LoFlags)
          .addMemOperand(MMO: *MI.memoperands_begin());
    }
  }

  MBB.erase(I: MI);

  return true;
}
2590
// Return true if this instruction simply sets its single destination register
// to zero. This is equivalent to a register rename of the zero-register.
bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
    if (MI.getOperand(i: 1).isImm() && MI.getOperand(i: 1).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 3 &&
             MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
      return true;
    }
    break;
  case AArch64::ANDWri: // and Rd, Rzr, #imm
    return MI.getOperand(i: 1).getReg() == AArch64::WZR;
  case AArch64::ANDXri:
    return MI.getOperand(i: 1).getReg() == AArch64::XZR;
  case TargetOpcode::COPY:
    // Only a copy from WZR is recognized here.
    return MI.getOperand(i: 1).getReg() == AArch64::WZR;
  }
  return false;
}
2614
// Return true if this instruction simply renames a general register without
// modifying bits.
bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case TargetOpcode::COPY: {
    // GPR32 copies will by lowered to ORRXrs
    Register DstReg = MI.getOperand(i: 0).getReg();
    return (AArch64::GPR32RegClass.contains(Reg: DstReg) ||
            AArch64::GPR64RegClass.contains(Reg: DstReg));
  }
  case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
    if (MI.getOperand(i: 1).getReg() == AArch64::XZR) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
      return true;
    }
    break;
  case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
    if (MI.getOperand(i: 2).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
      return true;
    }
    break;
  }
  return false;
}
2644
// Return true if this instruction simply renames a floating-point/vector
// register without modifying bits.
bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case TargetOpcode::COPY: {
    Register DstReg = MI.getOperand(i: 0).getReg();
    return AArch64::FPR128RegClass.contains(Reg: DstReg);
  }
  case AArch64::ORRv16i8:
    // A vector ORR of a register with itself is a register move.
    if (MI.getOperand(i: 1).getReg() == MI.getOperand(i: 2).getReg()) {
      assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
             "invalid ORRv16i8 operands");
      return true;
    }
    break;
  }
  return false;
}
2665
2666static bool isFrameLoadOpcode(int Opcode) {
2667 switch (Opcode) {
2668 default:
2669 return false;
2670 case AArch64::LDRWui:
2671 case AArch64::LDRXui:
2672 case AArch64::LDRBui:
2673 case AArch64::LDRHui:
2674 case AArch64::LDRSui:
2675 case AArch64::LDRDui:
2676 case AArch64::LDRQui:
2677 case AArch64::LDR_PXI:
2678 return true;
2679 }
2680}
2681
/// If \p MI is a direct load from a stack slot, set \p FrameIndex to the
/// slot's index and return the destination register; otherwise return an
/// invalid Register().
Register AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                               int &FrameIndex) const {
  if (!isFrameLoadOpcode(Opcode: MI.getOpcode()))
    return Register();

  // Only a full-register load of the form [FI, #0] counts as a direct
  // stack-slot load.
  if (MI.getOperand(i: 0).getSubReg() == 0 && MI.getOperand(i: 1).isFI() &&
      MI.getOperand(i: 2).isImm() && MI.getOperand(i: 2).getImm() == 0) {
    FrameIndex = MI.getOperand(i: 1).getIndex();
    return MI.getOperand(i: 0).getReg();
  }
  return Register();
}
2694
2695static bool isFrameStoreOpcode(int Opcode) {
2696 switch (Opcode) {
2697 default:
2698 return false;
2699 case AArch64::STRWui:
2700 case AArch64::STRXui:
2701 case AArch64::STRBui:
2702 case AArch64::STRHui:
2703 case AArch64::STRSui:
2704 case AArch64::STRDui:
2705 case AArch64::STRQui:
2706 case AArch64::STR_PXI:
2707 return true;
2708 }
2709}
2710
/// If \p MI is a direct store to a stack slot, set \p FrameIndex to the
/// slot's index and return the source register; otherwise return an invalid
/// Register().
Register AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                              int &FrameIndex) const {
  if (!isFrameStoreOpcode(Opcode: MI.getOpcode()))
    return Register();

  // Only a full-register store of the form [FI, #0] counts as a direct
  // stack-slot store.
  if (MI.getOperand(i: 0).getSubReg() == 0 && MI.getOperand(i: 1).isFI() &&
      MI.getOperand(i: 2).isImm() && MI.getOperand(i: 2).getImm() == 0) {
    FrameIndex = MI.getOperand(i: 1).getIndex();
    return MI.getOperand(i: 0).getReg();
  }
  return Register();
}
2723
/// Post-frame-elimination variant of isStoreToStackSlot: also recognizes
/// stores identified only through their memory operands (fixed stack pseudo
/// source values).
Register AArch64InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
                                                    int &FrameIndex) const {
  if (!isFrameStoreOpcode(Opcode: MI.getOpcode()))
    return Register();

  // First try the cheap operand-based check.
  if (Register Reg = isStoreToStackSlot(MI, FrameIndex))
    return Reg;

  // Fall back to inspecting memory operands; bail if the instruction touches
  // more than one stack location.
  SmallVector<const MachineMemOperand *, 1> Accesses;
  if (hasStoreToStackSlot(MI, Accesses)) {
    if (Accesses.size() > 1)
      return Register();

    FrameIndex =
        cast<FixedStackPseudoSourceValue>(Val: Accesses.front()->getPseudoValue())
            ->getFrameIndex();
    return MI.getOperand(i: 0).getReg();
  }
  return Register();
}
2744
/// Post-frame-elimination variant of isLoadFromStackSlot: also recognizes
/// loads identified only through their memory operands (fixed stack pseudo
/// source values).
Register AArch64InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
                                                     int &FrameIndex) const {
  if (!isFrameLoadOpcode(Opcode: MI.getOpcode()))
    return Register();

  // First try the cheap operand-based check.
  if (Register Reg = isLoadFromStackSlot(MI, FrameIndex))
    return Reg;

  // Fall back to inspecting memory operands; bail if the instruction touches
  // more than one stack location.
  SmallVector<const MachineMemOperand *, 1> Accesses;
  if (hasLoadFromStackSlot(MI, Accesses)) {
    if (Accesses.size() > 1)
      return Register();

    FrameIndex =
        cast<FixedStackPseudoSourceValue>(Val: Accesses.front()->getPseudoValue())
            ->getFrameIndex();
    return MI.getOperand(i: 0).getReg();
  }
  return Register();
}
2765
/// Check all MachineMemOperands for a hint to suppress pairing.
/// The hint is a target flag set by suppressLdStPair() below (typically via
/// the AArch64StorePairSuppress pass).
bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
  return llvm::any_of(Range: MI.memoperands(), P: [](MachineMemOperand *MMO) {
    return MMO->getFlags() & MOSuppressPair;
  });
}
2772
2773/// Set a flag on the first MachineMemOperand to suppress pairing.
2774void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2775 if (MI.memoperands_empty())
2776 return;
2777 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2778}
2779
/// Check all MachineMemOperands for a hint that the load/store is strided.
bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
  return llvm::any_of(Range: MI.memoperands(), P: [](MachineMemOperand *MMO) {
    return MMO->getFlags() & MOStridedAccess;
  });
}
2786
/// Return true if \p Opc takes an unscaled (byte-granular) immediate offset.
/// Covers the LDUR/STUR family; pre-indexed forms are included as well, since
/// their writeback immediates are also byte-granular.
bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STURSi:
  case AArch64::STRSpre:
  case AArch64::STURDi:
  case AArch64::STRDpre:
  case AArch64::STURQi:
  case AArch64::STRQpre:
  case AArch64::STURBBi:
  case AArch64::STURHHi:
  case AArch64::STURWi:
  case AArch64::STRWpre:
  case AArch64::STURXi:
  case AArch64::STRXpre:
  case AArch64::LDURSi:
  case AArch64::LDRSpre:
  case AArch64::LDURDi:
  case AArch64::LDRDpre:
  case AArch64::LDURQi:
  case AArch64::LDRQpre:
  case AArch64::LDURWi:
  case AArch64::LDRWpre:
  case AArch64::LDURXi:
  case AArch64::LDRXpre:
  case AArch64::LDRSWpre:
  case AArch64::LDURSWi:
  case AArch64::LDURHHi:
  case AArch64::LDURBBi:
  case AArch64::LDURSBWi:
  case AArch64::LDURSHWi:
    return true;
  }
}
2822
/// Map a scaled unsigned-immediate load/store opcode to its unscaled
/// (LDUR/STUR-style) equivalent, or std::nullopt if there is none.
std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
  switch (Opc) {
  default: return {};
  case AArch64::PRFMui: return AArch64::PRFUMi;
  case AArch64::LDRXui: return AArch64::LDURXi;
  case AArch64::LDRWui: return AArch64::LDURWi;
  case AArch64::LDRBui: return AArch64::LDURBi;
  case AArch64::LDRHui: return AArch64::LDURHi;
  case AArch64::LDRSui: return AArch64::LDURSi;
  case AArch64::LDRDui: return AArch64::LDURDi;
  case AArch64::LDRQui: return AArch64::LDURQi;
  case AArch64::LDRBBui: return AArch64::LDURBBi;
  case AArch64::LDRHHui: return AArch64::LDURHHi;
  case AArch64::LDRSBXui: return AArch64::LDURSBXi;
  case AArch64::LDRSBWui: return AArch64::LDURSBWi;
  case AArch64::LDRSHXui: return AArch64::LDURSHXi;
  case AArch64::LDRSHWui: return AArch64::LDURSHWi;
  case AArch64::LDRSWui: return AArch64::LDURSWi;
  case AArch64::STRXui: return AArch64::STURXi;
  case AArch64::STRWui: return AArch64::STURWi;
  case AArch64::STRBui: return AArch64::STURBi;
  case AArch64::STRHui: return AArch64::STURHi;
  case AArch64::STRSui: return AArch64::STURSi;
  case AArch64::STRDui: return AArch64::STURDi;
  case AArch64::STRQui: return AArch64::STURQi;
  case AArch64::STRBBui: return AArch64::STURBBi;
  case AArch64::STRHHui: return AArch64::STURHHi;
  }
}
2852
/// Return the operand index of the immediate offset for load/store opcode
/// \p Opc. The index depends on the addressing form: simple [Reg, #imm]
/// forms keep it at operand 2, forms with an extra def or register operand
/// (writeback, pairs) at operand 3, and writeback pairs at operand 4.
/// Asserts on opcodes without an immediate offset operand.
unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Unhandled Opcode in getLoadStoreImmIdx");
  // Single-register [Reg, #imm] forms (and immediate-carrying tag/SVE fill
  // and spill pseudos): the immediate is operand 2.
  case AArch64::ADDG:
  case AArch64::LDAPURBi:
  case AArch64::LDAPURHi:
  case AArch64::LDAPURi:
  case AArch64::LDAPURSBWi:
  case AArch64::LDAPURSBXi:
  case AArch64::LDAPURSHWi:
  case AArch64::LDAPURSHXi:
  case AArch64::LDAPURSWi:
  case AArch64::LDAPURXi:
  case AArch64::LDR_PPXI:
  case AArch64::LDR_PXI:
  case AArch64::LDR_ZXI:
  case AArch64::LDR_ZZXI:
  case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
  case AArch64::LDR_ZZZXI:
  case AArch64::LDR_ZZZZXI:
  case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
  case AArch64::LDRBBui:
  case AArch64::LDRBui:
  case AArch64::LDRDui:
  case AArch64::LDRHHui:
  case AArch64::LDRHui:
  case AArch64::LDRQui:
  case AArch64::LDRSBWui:
  case AArch64::LDRSBXui:
  case AArch64::LDRSHWui:
  case AArch64::LDRSHXui:
  case AArch64::LDRSui:
  case AArch64::LDRSWui:
  case AArch64::LDRWui:
  case AArch64::LDRXui:
  case AArch64::LDURBBi:
  case AArch64::LDURBi:
  case AArch64::LDURDi:
  case AArch64::LDURHHi:
  case AArch64::LDURHi:
  case AArch64::LDURQi:
  case AArch64::LDURSBWi:
  case AArch64::LDURSBXi:
  case AArch64::LDURSHWi:
  case AArch64::LDURSHXi:
  case AArch64::LDURSi:
  case AArch64::LDURSWi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  case AArch64::PRFMui:
  case AArch64::PRFUMi:
  case AArch64::ST2Gi:
  case AArch64::STGi:
  case AArch64::STLURBi:
  case AArch64::STLURHi:
  case AArch64::STLURWi:
  case AArch64::STLURXi:
  case AArch64::StoreSwiftAsyncContext:
  case AArch64::STR_PPXI:
  case AArch64::STR_PXI:
  case AArch64::STR_ZXI:
  case AArch64::STR_ZZXI:
  case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
  case AArch64::STR_ZZZXI:
  case AArch64::STR_ZZZZXI:
  case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
  case AArch64::STRBBui:
  case AArch64::STRBui:
  case AArch64::STRDui:
  case AArch64::STRHHui:
  case AArch64::STRHui:
  case AArch64::STRQui:
  case AArch64::STRSui:
  case AArch64::STRWui:
  case AArch64::STRXui:
  case AArch64::STURBBi:
  case AArch64::STURBi:
  case AArch64::STURDi:
  case AArch64::STURHHi:
  case AArch64::STURHi:
  case AArch64::STURQi:
  case AArch64::STURSi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::STZ2Gi:
  case AArch64::STZGi:
  case AArch64::TAGPstack:
    return 2;
  // Forms carrying one extra register operand before the immediate —
  // writeback (pre/post-indexed), register pairs, and predicated SVE
  // loads/stores: the immediate is operand 3.
  case AArch64::LD1B_D_IMM:
  case AArch64::LD1B_H_IMM:
  case AArch64::LD1B_IMM:
  case AArch64::LD1B_S_IMM:
  case AArch64::LD1D_IMM:
  case AArch64::LD1H_D_IMM:
  case AArch64::LD1H_IMM:
  case AArch64::LD1H_S_IMM:
  case AArch64::LD1RB_D_IMM:
  case AArch64::LD1RB_H_IMM:
  case AArch64::LD1RB_IMM:
  case AArch64::LD1RB_S_IMM:
  case AArch64::LD1RD_IMM:
  case AArch64::LD1RH_D_IMM:
  case AArch64::LD1RH_IMM:
  case AArch64::LD1RH_S_IMM:
  case AArch64::LD1RSB_D_IMM:
  case AArch64::LD1RSB_H_IMM:
  case AArch64::LD1RSB_S_IMM:
  case AArch64::LD1RSH_D_IMM:
  case AArch64::LD1RSH_S_IMM:
  case AArch64::LD1RSW_IMM:
  case AArch64::LD1RW_D_IMM:
  case AArch64::LD1RW_IMM:
  case AArch64::LD1SB_D_IMM:
  case AArch64::LD1SB_H_IMM:
  case AArch64::LD1SB_S_IMM:
  case AArch64::LD1SH_D_IMM:
  case AArch64::LD1SH_S_IMM:
  case AArch64::LD1SW_D_IMM:
  case AArch64::LD1W_D_IMM:
  case AArch64::LD1W_IMM:
  case AArch64::LD2B_IMM:
  case AArch64::LD2D_IMM:
  case AArch64::LD2H_IMM:
  case AArch64::LD2W_IMM:
  case AArch64::LD3B_IMM:
  case AArch64::LD3D_IMM:
  case AArch64::LD3H_IMM:
  case AArch64::LD3W_IMM:
  case AArch64::LD4B_IMM:
  case AArch64::LD4D_IMM:
  case AArch64::LD4H_IMM:
  case AArch64::LD4W_IMM:
  case AArch64::LDG:
  case AArch64::LDNF1B_D_IMM:
  case AArch64::LDNF1B_H_IMM:
  case AArch64::LDNF1B_IMM:
  case AArch64::LDNF1B_S_IMM:
  case AArch64::LDNF1D_IMM:
  case AArch64::LDNF1H_D_IMM:
  case AArch64::LDNF1H_IMM:
  case AArch64::LDNF1H_S_IMM:
  case AArch64::LDNF1SB_D_IMM:
  case AArch64::LDNF1SB_H_IMM:
  case AArch64::LDNF1SB_S_IMM:
  case AArch64::LDNF1SH_D_IMM:
  case AArch64::LDNF1SH_S_IMM:
  case AArch64::LDNF1SW_D_IMM:
  case AArch64::LDNF1W_D_IMM:
  case AArch64::LDNF1W_IMM:
  case AArch64::LDNPDi:
  case AArch64::LDNPQi:
  case AArch64::LDNPSi:
  case AArch64::LDNPWi:
  case AArch64::LDNPXi:
  case AArch64::LDNT1B_ZRI:
  case AArch64::LDNT1D_ZRI:
  case AArch64::LDNT1H_ZRI:
  case AArch64::LDNT1W_ZRI:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPSi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::LDRBBpost:
  case AArch64::LDRBBpre:
  case AArch64::LDRBpost:
  case AArch64::LDRBpre:
  case AArch64::LDRDpost:
  case AArch64::LDRDpre:
  case AArch64::LDRHHpost:
  case AArch64::LDRHHpre:
  case AArch64::LDRHpost:
  case AArch64::LDRHpre:
  case AArch64::LDRQpost:
  case AArch64::LDRQpre:
  case AArch64::LDRSpost:
  case AArch64::LDRSpre:
  case AArch64::LDRWpost:
  case AArch64::LDRWpre:
  case AArch64::LDRXpost:
  case AArch64::LDRXpre:
  case AArch64::ST1B_D_IMM:
  case AArch64::ST1B_H_IMM:
  case AArch64::ST1B_IMM:
  case AArch64::ST1B_S_IMM:
  case AArch64::ST1D_IMM:
  case AArch64::ST1H_D_IMM:
  case AArch64::ST1H_IMM:
  case AArch64::ST1H_S_IMM:
  case AArch64::ST1W_D_IMM:
  case AArch64::ST1W_IMM:
  case AArch64::ST2B_IMM:
  case AArch64::ST2D_IMM:
  case AArch64::ST2H_IMM:
  case AArch64::ST2W_IMM:
  case AArch64::ST3B_IMM:
  case AArch64::ST3D_IMM:
  case AArch64::ST3H_IMM:
  case AArch64::ST3W_IMM:
  case AArch64::ST4B_IMM:
  case AArch64::ST4D_IMM:
  case AArch64::ST4H_IMM:
  case AArch64::ST4W_IMM:
  case AArch64::STGPi:
  case AArch64::STGPreIndex:
  case AArch64::STZGPreIndex:
  case AArch64::ST2GPreIndex:
  case AArch64::STZ2GPreIndex:
  case AArch64::STGPostIndex:
  case AArch64::STZGPostIndex:
  case AArch64::ST2GPostIndex:
  case AArch64::STZ2GPostIndex:
  case AArch64::STNPDi:
  case AArch64::STNPQi:
  case AArch64::STNPSi:
  case AArch64::STNPWi:
  case AArch64::STNPXi:
  case AArch64::STNT1B_ZRI:
  case AArch64::STNT1D_ZRI:
  case AArch64::STNT1H_ZRI:
  case AArch64::STNT1W_ZRI:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPSi:
  case AArch64::STPWi:
  case AArch64::STPXi:
  case AArch64::STRBBpost:
  case AArch64::STRBBpre:
  case AArch64::STRBpost:
  case AArch64::STRBpre:
  case AArch64::STRDpost:
  case AArch64::STRDpre:
  case AArch64::STRHHpost:
  case AArch64::STRHHpre:
  case AArch64::STRHpost:
  case AArch64::STRHpre:
  case AArch64::STRQpost:
  case AArch64::STRQpre:
  case AArch64::STRSpost:
  case AArch64::STRSpre:
  case AArch64::STRWpost:
  case AArch64::STRWpre:
  case AArch64::STRXpost:
  case AArch64::STRXpre:
    return 3;
  // Writeback pairs carry both the writeback def and two transfer registers
  // ahead of the immediate: the immediate is operand 4.
  case AArch64::LDPDpost:
  case AArch64::LDPDpre:
  case AArch64::LDPQpost:
  case AArch64::LDPQpre:
  case AArch64::LDPSpost:
  case AArch64::LDPSpre:
  case AArch64::LDPWpost:
  case AArch64::LDPWpre:
  case AArch64::LDPXpost:
  case AArch64::LDPXpre:
  case AArch64::STGPpre:
  case AArch64::STGPpost:
  case AArch64::STPDpost:
  case AArch64::STPDpre:
  case AArch64::STPQpost:
  case AArch64::STPQpre:
  case AArch64::STPSpost:
  case AArch64::STPSpre:
  case AArch64::STPWpost:
  case AArch64::STPWpre:
  case AArch64::STPXpost:
  case AArch64::STPXpre:
    return 4;
  }
}
3124
/// Return true if \p MI has an opcode the load/store optimizer knows how to
/// combine into a paired (LDP/STP) instruction.
bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  // Scaled instructions.
  case AArch64::STRSui:
  case AArch64::STRDui:
  case AArch64::STRQui:
  case AArch64::STRXui:
  case AArch64::STRWui:
  case AArch64::LDRSui:
  case AArch64::LDRDui:
  case AArch64::LDRQui:
  case AArch64::LDRXui:
  case AArch64::LDRWui:
  case AArch64::LDRSWui:
  // Unscaled instructions.
  case AArch64::STURSi:
  case AArch64::STRSpre:
  case AArch64::STURDi:
  case AArch64::STRDpre:
  case AArch64::STURQi:
  case AArch64::STRQpre:
  case AArch64::STURWi:
  case AArch64::STRWpre:
  case AArch64::STURXi:
  case AArch64::STRXpre:
  case AArch64::LDURSi:
  case AArch64::LDRSpre:
  case AArch64::LDURDi:
  case AArch64::LDRDpre:
  case AArch64::LDURQi:
  case AArch64::LDRQpre:
  case AArch64::LDURWi:
  case AArch64::LDRWpre:
  case AArch64::LDURXi:
  case AArch64::LDRXpre:
  case AArch64::LDURSWi:
  case AArch64::LDRSWpre:
  // SVE instructions.
  case AArch64::LDR_ZXI:
  case AArch64::STR_ZXI:
    return true;
  }
}
3170
/// Return true if \p MI is one of the pseudo tail-call return opcodes.
/// The assert catches a new combined call+return opcode being added without
/// updating this list.
bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    assert((!MI.isCall() || !MI.isReturn()) &&
           "Unexpected instruction - was a new tail call opcode introduced?");
    return false;
  case AArch64::TCRETURNdi:
  case AArch64::TCRETURNri:
  case AArch64::TCRETURNrix16x17:
  case AArch64::TCRETURNrix17:
  case AArch64::TCRETURNrinotx16:
  case AArch64::TCRETURNriALL:
  case AArch64::AUTH_TCRETURN:
  case AArch64::AUTH_TCRETURN_BTI:
    return true;
  }
}
3188
/// Return the NZCV/flag-setting variant of \p Opc (e.g. ADD -> ADDS).
/// Aborts if no flag-setting equivalent exists for the opcode.
unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no flag setting equivalent!");
  // 32-bit cases:
  case AArch64::ADDWri:
    return AArch64::ADDSWri;
  case AArch64::ADDWrr:
    return AArch64::ADDSWrr;
  case AArch64::ADDWrs:
    return AArch64::ADDSWrs;
  case AArch64::ADDWrx:
    return AArch64::ADDSWrx;
  case AArch64::ANDWri:
    return AArch64::ANDSWri;
  case AArch64::ANDWrr:
    return AArch64::ANDSWrr;
  case AArch64::ANDWrs:
    return AArch64::ANDSWrs;
  case AArch64::BICWrr:
    return AArch64::BICSWrr;
  case AArch64::BICWrs:
    return AArch64::BICSWrs;
  case AArch64::SUBWri:
    return AArch64::SUBSWri;
  case AArch64::SUBWrr:
    return AArch64::SUBSWrr;
  case AArch64::SUBWrs:
    return AArch64::SUBSWrs;
  case AArch64::SUBWrx:
    return AArch64::SUBSWrx;
  // 64-bit cases:
  case AArch64::ADDXri:
    return AArch64::ADDSXri;
  case AArch64::ADDXrr:
    return AArch64::ADDSXrr;
  case AArch64::ADDXrs:
    return AArch64::ADDSXrs;
  case AArch64::ADDXrx:
    return AArch64::ADDSXrx;
  case AArch64::ANDXri:
    return AArch64::ANDSXri;
  case AArch64::ANDXrr:
    return AArch64::ANDSXrr;
  case AArch64::ANDXrs:
    return AArch64::ANDSXrs;
  case AArch64::BICXrr:
    return AArch64::BICSXrr;
  case AArch64::BICXrs:
    return AArch64::BICSXrs;
  case AArch64::SUBXri:
    return AArch64::SUBSXri;
  case AArch64::SUBXrr:
    return AArch64::SUBSXrr;
  case AArch64::SUBXrs:
    return AArch64::SUBSXrs;
  case AArch64::SUBXrx:
    return AArch64::SUBSXrx;
  // SVE instructions:
  case AArch64::AND_PPzPP:
    return AArch64::ANDS_PPzPP;
  case AArch64::BIC_PPzPP:
    return AArch64::BICS_PPzPP;
  case AArch64::EOR_PPzPP:
    return AArch64::EORS_PPzPP;
  case AArch64::NAND_PPzPP:
    return AArch64::NANDS_PPzPP;
  case AArch64::NOR_PPzPP:
    return AArch64::NORS_PPzPP;
  case AArch64::ORN_PPzPP:
    return AArch64::ORNS_PPzPP;
  case AArch64::ORR_PPzPP:
    return AArch64::ORRS_PPzPP;
  case AArch64::BRKA_PPzP:
    return AArch64::BRKAS_PPzP;
  case AArch64::BRKPA_PPzPP:
    return AArch64::BRKPAS_PPzPP;
  case AArch64::BRKB_PPzP:
    return AArch64::BRKBS_PPzP;
  case AArch64::BRKPB_PPzPP:
    return AArch64::BRKPBS_PPzPP;
  case AArch64::BRKN_PPzP:
    return AArch64::BRKNS_PPzP;
  case AArch64::RDFFR_PPz:
    return AArch64::RDFFRS_PPz;
  case AArch64::PTRUE_B:
    return AArch64::PTRUES_B;
  }
}
3278
// Is this a candidate for ld/st merging or pairing? For example, we don't
// touch volatiles or load/stores that have a hint to avoid pair formation.
bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {

  bool IsPreLdSt = isPreLdSt(MI);

  // If this is a volatile load/store, don't mess with it.
  if (MI.hasOrderedMemoryRef())
    return false;

  // Make sure this is a reg/fi+imm (as opposed to an address reloc).
  // For Pre-inc LD/ST, the operand is shifted by one.
  assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
          MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
         "Expected a reg or frame index operand.");

  // For Pre-indexed addressing quadword instructions, the third operand is the
  // immediate value.
  bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(i: 3).isImm();

  if (!MI.getOperand(i: 2).isImm() && !IsImmPreLdSt)
    return false;

  // Can't merge/pair if the instruction modifies the base register.
  // e.g., ldr x0, [x0]
  // This case will never occur with an FI base.
  // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
  // STR<S,D,Q,W,X>pre, it can be merged.
  // For example:
  //   ldr q0, [x11, #32]!
  //   ldr q1, [x11, #16]
  //   to
  //   ldp q0, q1, [x11, #32]!
  if (MI.getOperand(i: 1).isReg() && !IsPreLdSt) {
    Register BaseReg = MI.getOperand(i: 1).getReg();
    const TargetRegisterInfo *TRI = &getRegisterInfo();
    if (MI.modifiesRegister(Reg: BaseReg, TRI))
      return false;
  }

  // Pairing SVE fills/spills is only valid for little-endian targets that
  // implement VLS 128.
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::LDR_ZXI:
  case AArch64::STR_ZXI:
    if (!Subtarget.isLittleEndian() ||
        Subtarget.getSVEVectorSizeInBits() != 128)
      return false;
  }

  // Check if this load/store has a hint to avoid pair formation.
  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
  if (isLdStPairSuppressed(MI))
    return false;

  // Do not pair any callee-save store/reload instructions in the
  // prologue/epilogue if the CFI information encoded the operations as separate
  // instructions, as that will cause the size of the actual prologue to mismatch
  // with the prologue size recorded in the Windows CFI.
  const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
  bool NeedsWinCFI = MAI->usesWindowsCFI() &&
                     MI.getMF()->getFunction().needsUnwindTableEntry();
  if (NeedsWinCFI && (MI.getFlag(Flag: MachineInstr::FrameSetup) ||
                      MI.getFlag(Flag: MachineInstr::FrameDestroy)))
    return false;

  // On some CPUs quad load/store pairs are slower than two single load/stores.
  if (Subtarget.isPaired128Slow()) {
    switch (MI.getOpcode()) {
    default:
      break;
    case AArch64::LDURQi:
    case AArch64::STURQi:
    case AArch64::LDRQui:
    case AArch64::STRQui:
      return false;
    }
  }

  return true;
}
3362
/// TargetInstrInfo hook: report the single base operand, immediate offset,
/// scalability of the offset, and access width for load/store \p LdSt.
/// Returns false if the instruction's addressing cannot be decomposed.
bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
    int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
    const TargetRegisterInfo *TRI) const {
  if (!LdSt.mayLoadOrStore())
    return false;

  const MachineOperand *BaseOp;
  TypeSize WidthN(0, false);
  if (!getMemOperandWithOffsetWidth(MI: LdSt, BaseOp, Offset, OffsetIsScalable,
                                    Width&: WidthN, TRI))
    return false;
  // Report the access size precisely; WidthN may carry the scalable flag for
  // SVE accesses.
  Width = LocationSize::precise(Value: WidthN);
  BaseOps.push_back(Elt: BaseOp);
  return true;
}
3381
/// Describe \p MemI's addressing as an ExtAddrMode (base register plus
/// displacement, no scaled register), or std::nullopt if the base is not a
/// plain register or the addressing cannot be decomposed.
std::optional<ExtAddrMode>
AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
                                          const TargetRegisterInfo *TRI) const {
  const MachineOperand *Base; // Filled with the base operand of MI.
  int64_t Offset;             // Filled with the offset of MI.
  bool OffsetIsScalable;
  if (!getMemOperandWithOffset(MI: MemI, BaseOp&: Base, Offset, OffsetIsScalable, TRI))
    return std::nullopt;

  // Only register-based addressing is representable here (not frame indices).
  if (!Base->isReg())
    return std::nullopt;
  ExtAddrMode AM;
  AM.BaseReg = Base->getReg();
  AM.Displacement = Offset;
  AM.ScaledReg = 0;
  AM.Scale = 0;
  return AM;
}
3400
/// Determine whether the address-computation instruction \p AddrI, whose
/// result \p Reg is used as an address component of the memory instruction
/// \p MemI, can be folded into \p MemI's addressing mode. On success, fill
/// \p AM with the combined addressing mode and return true.
bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
                                           Register Reg,
                                           const MachineInstr &AddrI,
                                           ExtAddrMode &AM) const {
  // Filter out instructions into which we cannot fold.
  unsigned NumBytes;
  int64_t OffsetScale = 1;
  switch (MemI.getOpcode()) {
  default:
    return false;

  case AArch64::LDURQi:
  case AArch64::STURQi:
    NumBytes = 16;
    break;

  case AArch64::LDURDi:
  case AArch64::STURDi:
  case AArch64::LDURXi:
  case AArch64::STURXi:
    NumBytes = 8;
    break;

  case AArch64::LDURWi:
  case AArch64::LDURSWi:
  case AArch64::STURWi:
    NumBytes = 4;
    break;

  case AArch64::LDURHi:
  case AArch64::STURHi:
  case AArch64::LDURHHi:
  case AArch64::STURHHi:
  case AArch64::LDURSHXi:
  case AArch64::LDURSHWi:
    NumBytes = 2;
    break;

  case AArch64::LDRBroX:
  case AArch64::LDRBBroX:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSBWroX:
  case AArch64::STRBroX:
  case AArch64::STRBBroX:
  case AArch64::LDURBi:
  case AArch64::LDURBBi:
  case AArch64::LDURSBXi:
  case AArch64::LDURSBWi:
  case AArch64::STURBi:
  case AArch64::STURBBi:
  case AArch64::LDRBui:
  case AArch64::LDRBBui:
  case AArch64::LDRSBXui:
  case AArch64::LDRSBWui:
  case AArch64::STRBui:
  case AArch64::STRBBui:
    NumBytes = 1;
    break;

  case AArch64::LDRQroX:
  case AArch64::STRQroX:
  case AArch64::LDRQui:
  case AArch64::STRQui:
    NumBytes = 16;
    OffsetScale = 16;
    break;

  case AArch64::LDRDroX:
  case AArch64::STRDroX:
  case AArch64::LDRXroX:
  case AArch64::STRXroX:
  case AArch64::LDRDui:
  case AArch64::STRDui:
  case AArch64::LDRXui:
  case AArch64::STRXui:
    NumBytes = 8;
    OffsetScale = 8;
    break;

  case AArch64::LDRWroX:
  case AArch64::LDRSWroX:
  case AArch64::STRWroX:
  case AArch64::LDRWui:
  case AArch64::LDRSWui:
  case AArch64::STRWui:
    NumBytes = 4;
    OffsetScale = 4;
    break;

  case AArch64::LDRHroX:
  case AArch64::STRHroX:
  case AArch64::LDRHHroX:
  case AArch64::STRHHroX:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSHWroX:
  case AArch64::LDRHui:
  case AArch64::STRHui:
  case AArch64::LDRHHui:
  case AArch64::STRHHui:
  case AArch64::LDRSHXui:
  case AArch64::LDRSHWui:
    NumBytes = 2;
    OffsetScale = 2;
    break;
  }

  // Check the fold operand is not the loaded/stored value.
  // (Operand 0 is the transfer register, despite the variable name.)
  const MachineOperand &BaseRegOp = MemI.getOperand(i: 0);
  if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
    return false;

  // Handle memory instructions with a [Reg, Reg] addressing mode.
  if (MemI.getOperand(i: 2).isReg()) {
    // Bail if the addressing mode already includes extension of the offset
    // register.
    if (MemI.getOperand(i: 3).getImm())
      return false;

    // Check if we actually have a scaled offset.
    if (MemI.getOperand(i: 4).getImm() == 0)
      OffsetScale = 1;

    // If the address instructions is folded into the base register, then the
    // addressing mode must not have a scale. Then we can swap the base and the
    // scaled registers.
    if (MemI.getOperand(i: 1).getReg() == Reg && OffsetScale != 1)
      return false;

    switch (AddrI.getOpcode()) {
    default:
      return false;

    case AArch64::SBFMXri:
      // sxtw Xa, Wm
      // ldr Xd, [Xn, Xa, lsl #N]
      // ->
      // ldr Xd, [Xn, Wm, sxtw #N]
      // SBFMXri with imms 0..31 is the canonical form of sxtw.
      if (AddrI.getOperand(i: 2).getImm() != 0 ||
          AddrI.getOperand(i: 3).getImm() != 31)
        return false;

      AM.BaseReg = MemI.getOperand(i: 1).getReg();
      if (AM.BaseReg == Reg)
        AM.BaseReg = MemI.getOperand(i: 2).getReg();
      AM.ScaledReg = AddrI.getOperand(i: 1).getReg();
      AM.Scale = OffsetScale;
      AM.Displacement = 0;
      AM.Form = ExtAddrMode::Formula::SExtScaledReg;
      return true;

    case TargetOpcode::SUBREG_TO_REG: {
      // mov Wa, Wm
      // ldr Xd, [Xn, Xa, lsl #N]
      // ->
      // ldr Xd, [Xn, Wm, uxtw #N]

      // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
      if (AddrI.getOperand(i: 2).getImm() != AArch64::sub_32)
        return false;

      const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
      Register OffsetReg = AddrI.getOperand(i: 1).getReg();
      if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(RegNo: OffsetReg))
        return false;

      // The defining instruction must be a plain 32-bit move (orr Wa, wzr,
      // Wm with no shift), which implicitly zeroes the upper 32 bits.
      const MachineInstr &DefMI = *MRI.getVRegDef(Reg: OffsetReg);
      if (DefMI.getOpcode() != AArch64::ORRWrs ||
          DefMI.getOperand(i: 1).getReg() != AArch64::WZR ||
          DefMI.getOperand(i: 3).getImm() != 0)
        return false;

      AM.BaseReg = MemI.getOperand(i: 1).getReg();
      if (AM.BaseReg == Reg)
        AM.BaseReg = MemI.getOperand(i: 2).getReg();
      AM.ScaledReg = DefMI.getOperand(i: 2).getReg();
      AM.Scale = OffsetScale;
      AM.Displacement = 0;
      AM.Form = ExtAddrMode::Formula::ZExtScaledReg;
      return true;
    }
    }
  }

  // Handle memory instructions with a [Reg, #Imm] addressing mode.

  // Check we are not breaking a potential conversion to an LDP.
  auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
                                 int64_t NewOffset) -> bool {
    int64_t MinOffset, MaxOffset;
    switch (NumBytes) {
    default:
      return true;
    case 4:
      MinOffset = -256;
      MaxOffset = 252;
      break;
    case 8:
      MinOffset = -512;
      MaxOffset = 504;
      break;
    case 16:
      MinOffset = -1024;
      MaxOffset = 1008;
      break;
    }
    // OK if the old offset was already outside LDP range, or the new offset
    // stays inside it.
    return OldOffset < MinOffset || OldOffset > MaxOffset ||
           (NewOffset >= MinOffset && NewOffset <= MaxOffset);
  };
  auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
    int64_t OldOffset = MemI.getOperand(i: 2).getImm() * OffsetScale;
    int64_t NewOffset = OldOffset + Disp;
    if (!isLegalAddressingMode(NumBytes, Offset: NewOffset, /* Scale */ 0))
      return false;
    // If the old offset would fit into an LDP, but the new offset wouldn't,
    // bail out.
    if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
      return false;
    AM.BaseReg = AddrI.getOperand(i: 1).getReg();
    AM.ScaledReg = 0;
    AM.Scale = 0;
    AM.Displacement = NewOffset;
    AM.Form = ExtAddrMode::Formula::Basic;
    return true;
  };

  auto canFoldAddRegIntoAddrMode =
      [&](int64_t Scale,
          ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool {
    if (MemI.getOperand(i: 2).getImm() != 0)
      return false;
    if ((unsigned)Scale != Scale)
      return false;
    if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
      return false;
    AM.BaseReg = AddrI.getOperand(i: 1).getReg();
    AM.ScaledReg = AddrI.getOperand(i: 2).getReg();
    AM.Scale = Scale;
    AM.Displacement = 0;
    AM.Form = Form;
    return true;
  };

  auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
    unsigned Opcode = MemI.getOpcode();
    return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
           Subtarget.isSTRQroSlow();
  };

  int64_t Disp = 0;
  const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
  switch (AddrI.getOpcode()) {
  default:
    return false;

  case AArch64::ADDXri:
    // add Xa, Xn, #N
    // ldr Xd, [Xa, #M]
    // ->
    // ldr Xd, [Xn, #N'+M]
    Disp = AddrI.getOperand(i: 2).getImm() << AddrI.getOperand(i: 3).getImm();
    return canFoldAddSubImmIntoAddrMode(Disp);

  case AArch64::SUBXri:
    // sub Xa, Xn, #N
    // ldr Xd, [Xa, #M]
    // ->
    // ldr Xd, [Xn, #N'+M]
    Disp = AddrI.getOperand(i: 2).getImm() << AddrI.getOperand(i: 3).getImm();
    return canFoldAddSubImmIntoAddrMode(-Disp);

  case AArch64::ADDXrs: {
    // add Xa, Xn, Xm, lsl #N
    // ldr Xd, [Xa]
    // ->
    // ldr Xd, [Xn, Xm, lsl #N]

    // Don't fold the add if the result would be slower, unless optimising for
    // size.
    unsigned Shift = static_cast<unsigned>(AddrI.getOperand(i: 3).getImm());
    if (AArch64_AM::getShiftType(Imm: Shift) != AArch64_AM::ShiftExtendType::LSL)
      return false;
    Shift = AArch64_AM::getShiftValue(Imm: Shift);
    if (!OptSize) {
      if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
        return false;
      if (avoidSlowSTRQ(MemI))
        return false;
    }
    return canFoldAddRegIntoAddrMode(1ULL << Shift);
  }

  case AArch64::ADDXrr:
    // add Xa, Xn, Xm
    // ldr Xd, [Xa]
    // ->
    // ldr Xd, [Xn, Xm, lsl #0]

    // Don't fold the add if the result would be slower, unless optimising for
    // size.
    if (!OptSize && avoidSlowSTRQ(MemI))
      return false;
    return canFoldAddRegIntoAddrMode(1);

  case AArch64::ADDXrx:
    // add Xa, Xn, Wm, {s,u}xtw #N
    // ldr Xd, [Xa]
    // ->
    // ldr Xd, [Xn, Wm, {s,u}xtw #N]

    // Don't fold the add if the result would be slower, unless optimising for
    // size.
    if (!OptSize && avoidSlowSTRQ(MemI))
      return false;

    // Can fold only sign-/zero-extend of a word.
    unsigned Imm = static_cast<unsigned>(AddrI.getOperand(i: 3).getImm());
    AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);
    if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
      return false;

    return canFoldAddRegIntoAddrMode(
        1ULL << AArch64_AM::getArithShiftValue(Imm),
        (Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg
                                     : ExtAddrMode::Formula::ZExtScaledReg);
  }
}
3727
// Given an opcode for an instruction with a [Reg, #Imm] addressing mode
// (scaled or unscaled), return the opcode of an instruction performing the
// same operation, but using the [Reg, Reg] (register-offset, `roX`) addressing
// mode. Aborts on opcodes the folding has not been implemented for.
static unsigned regOffsetOpcode(unsigned Opcode) {
  switch (Opcode) {
  default:
    llvm_unreachable("Address folding not implemented for instruction");

  // FP/vector 128-bit and 64-bit accesses.
  case AArch64::LDURQi:
  case AArch64::LDRQui:
    return AArch64::LDRQroX;
  case AArch64::STURQi:
  case AArch64::STRQui:
    return AArch64::STRQroX;
  case AArch64::LDURDi:
  case AArch64::LDRDui:
    return AArch64::LDRDroX;
  case AArch64::STURDi:
  case AArch64::STRDui:
    return AArch64::STRDroX;
  // 64-bit and 32-bit integer accesses.
  case AArch64::LDURXi:
  case AArch64::LDRXui:
    return AArch64::LDRXroX;
  case AArch64::STURXi:
  case AArch64::STRXui:
    return AArch64::STRXroX;
  case AArch64::LDURWi:
  case AArch64::LDRWui:
    return AArch64::LDRWroX;
  case AArch64::LDURSWi:
  case AArch64::LDRSWui:
    return AArch64::LDRSWroX;
  case AArch64::STURWi:
  case AArch64::STRWui:
    return AArch64::STRWroX;
  // Half-word accesses (FP and integer, plus sign-extending forms).
  case AArch64::LDURHi:
  case AArch64::LDRHui:
    return AArch64::LDRHroX;
  case AArch64::STURHi:
  case AArch64::STRHui:
    return AArch64::STRHroX;
  case AArch64::LDURHHi:
  case AArch64::LDRHHui:
    return AArch64::LDRHHroX;
  case AArch64::STURHHi:
  case AArch64::STRHHui:
    return AArch64::STRHHroX;
  case AArch64::LDURSHXi:
  case AArch64::LDRSHXui:
    return AArch64::LDRSHXroX;
  case AArch64::LDURSHWi:
  case AArch64::LDRSHWui:
    return AArch64::LDRSHWroX;
  // Byte accesses (FP and integer, plus sign-extending forms).
  case AArch64::LDURBi:
  case AArch64::LDRBui:
    return AArch64::LDRBroX;
  case AArch64::LDURBBi:
  case AArch64::LDRBBui:
    return AArch64::LDRBBroX;
  case AArch64::LDURSBXi:
  case AArch64::LDRSBXui:
    return AArch64::LDRSBXroX;
  case AArch64::LDURSBWi:
  case AArch64::LDRSBWui:
    return AArch64::LDRSBWroX;
  case AArch64::STURBi:
  case AArch64::STRBui:
    return AArch64::STRBroX;
  case AArch64::STURBBi:
  case AArch64::STRBBui:
    return AArch64::STRBBroX;
  }
}
3801
3802// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3803// the opcode of an instruction performing the same operation, but using the
3804// [Reg, #Imm] addressing mode with scaled offset.
3805unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3806 switch (Opcode) {
3807 default:
3808 llvm_unreachable("Address folding not implemented for instruction");
3809
3810 case AArch64::LDURQi:
3811 Scale = 16;
3812 return AArch64::LDRQui;
3813 case AArch64::STURQi:
3814 Scale = 16;
3815 return AArch64::STRQui;
3816 case AArch64::LDURDi:
3817 Scale = 8;
3818 return AArch64::LDRDui;
3819 case AArch64::STURDi:
3820 Scale = 8;
3821 return AArch64::STRDui;
3822 case AArch64::LDURXi:
3823 Scale = 8;
3824 return AArch64::LDRXui;
3825 case AArch64::STURXi:
3826 Scale = 8;
3827 return AArch64::STRXui;
3828 case AArch64::LDURWi:
3829 Scale = 4;
3830 return AArch64::LDRWui;
3831 case AArch64::LDURSWi:
3832 Scale = 4;
3833 return AArch64::LDRSWui;
3834 case AArch64::STURWi:
3835 Scale = 4;
3836 return AArch64::STRWui;
3837 case AArch64::LDURHi:
3838 Scale = 2;
3839 return AArch64::LDRHui;
3840 case AArch64::STURHi:
3841 Scale = 2;
3842 return AArch64::STRHui;
3843 case AArch64::LDURHHi:
3844 Scale = 2;
3845 return AArch64::LDRHHui;
3846 case AArch64::STURHHi:
3847 Scale = 2;
3848 return AArch64::STRHHui;
3849 case AArch64::LDURSHXi:
3850 Scale = 2;
3851 return AArch64::LDRSHXui;
3852 case AArch64::LDURSHWi:
3853 Scale = 2;
3854 return AArch64::LDRSHWui;
3855 case AArch64::LDURBi:
3856 Scale = 1;
3857 return AArch64::LDRBui;
3858 case AArch64::LDURBBi:
3859 Scale = 1;
3860 return AArch64::LDRBBui;
3861 case AArch64::LDURSBXi:
3862 Scale = 1;
3863 return AArch64::LDRSBXui;
3864 case AArch64::LDURSBWi:
3865 Scale = 1;
3866 return AArch64::LDRSBWui;
3867 case AArch64::STURBi:
3868 Scale = 1;
3869 return AArch64::STRBui;
3870 case AArch64::STURBBi:
3871 Scale = 1;
3872 return AArch64::STRBBui;
3873 case AArch64::LDRQui:
3874 case AArch64::STRQui:
3875 Scale = 16;
3876 return Opcode;
3877 case AArch64::LDRDui:
3878 case AArch64::STRDui:
3879 case AArch64::LDRXui:
3880 case AArch64::STRXui:
3881 Scale = 8;
3882 return Opcode;
3883 case AArch64::LDRWui:
3884 case AArch64::LDRSWui:
3885 case AArch64::STRWui:
3886 Scale = 4;
3887 return Opcode;
3888 case AArch64::LDRHui:
3889 case AArch64::STRHui:
3890 case AArch64::LDRHHui:
3891 case AArch64::STRHHui:
3892 case AArch64::LDRSHXui:
3893 case AArch64::LDRSHWui:
3894 Scale = 2;
3895 return Opcode;
3896 case AArch64::LDRBui:
3897 case AArch64::LDRBBui:
3898 case AArch64::LDRSBXui:
3899 case AArch64::LDRSBWui:
3900 case AArch64::STRBui:
3901 case AArch64::STRBBui:
3902 Scale = 1;
3903 return Opcode;
3904 }
3905}
3906
3907// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3908// the opcode of an instruction performing the same operation, but using the
3909// [Reg, #Imm] addressing mode with unscaled offset.
3910unsigned unscaledOffsetOpcode(unsigned Opcode) {
3911 switch (Opcode) {
3912 default:
3913 llvm_unreachable("Address folding not implemented for instruction");
3914
3915 case AArch64::LDURQi:
3916 case AArch64::STURQi:
3917 case AArch64::LDURDi:
3918 case AArch64::STURDi:
3919 case AArch64::LDURXi:
3920 case AArch64::STURXi:
3921 case AArch64::LDURWi:
3922 case AArch64::LDURSWi:
3923 case AArch64::STURWi:
3924 case AArch64::LDURHi:
3925 case AArch64::STURHi:
3926 case AArch64::LDURHHi:
3927 case AArch64::STURHHi:
3928 case AArch64::LDURSHXi:
3929 case AArch64::LDURSHWi:
3930 case AArch64::LDURBi:
3931 case AArch64::STURBi:
3932 case AArch64::LDURBBi:
3933 case AArch64::STURBBi:
3934 case AArch64::LDURSBWi:
3935 case AArch64::LDURSBXi:
3936 return Opcode;
3937 case AArch64::LDRQui:
3938 return AArch64::LDURQi;
3939 case AArch64::STRQui:
3940 return AArch64::STURQi;
3941 case AArch64::LDRDui:
3942 return AArch64::LDURDi;
3943 case AArch64::STRDui:
3944 return AArch64::STURDi;
3945 case AArch64::LDRXui:
3946 return AArch64::LDURXi;
3947 case AArch64::STRXui:
3948 return AArch64::STURXi;
3949 case AArch64::LDRWui:
3950 return AArch64::LDURWi;
3951 case AArch64::LDRSWui:
3952 return AArch64::LDURSWi;
3953 case AArch64::STRWui:
3954 return AArch64::STURWi;
3955 case AArch64::LDRHui:
3956 return AArch64::LDURHi;
3957 case AArch64::STRHui:
3958 return AArch64::STURHi;
3959 case AArch64::LDRHHui:
3960 return AArch64::LDURHHi;
3961 case AArch64::STRHHui:
3962 return AArch64::STURHHi;
3963 case AArch64::LDRSHXui:
3964 return AArch64::LDURSHXi;
3965 case AArch64::LDRSHWui:
3966 return AArch64::LDURSHWi;
3967 case AArch64::LDRBBui:
3968 return AArch64::LDURBBi;
3969 case AArch64::LDRBui:
3970 return AArch64::LDURBi;
3971 case AArch64::STRBBui:
3972 return AArch64::STURBBi;
3973 case AArch64::STRBui:
3974 return AArch64::STURBi;
3975 case AArch64::LDRSBWui:
3976 return AArch64::LDURSBWi;
3977 case AArch64::LDRSBXui:
3978 return AArch64::LDURSBXi;
3979 }
3980}
3981
// Given the opcode of a memory load/store instruction (immediate-offset form,
// scaled or unscaled, or the X-register-offset `roX` form), return the opcode
// of an instruction performing the same operation, but using the
// [Reg, Reg, {s,u}xtw #N] addressing mode (`roW` form) with sign-/zero-extend
// of the 32-bit offset register. Aborts on opcodes the folding has not been
// implemented for.
static unsigned offsetExtendOpcode(unsigned Opcode) {
  switch (Opcode) {
  default:
    llvm_unreachable("Address folding not implemented for instruction");

  case AArch64::LDRQroX:
  case AArch64::LDURQi:
  case AArch64::LDRQui:
    return AArch64::LDRQroW;
  case AArch64::STRQroX:
  case AArch64::STURQi:
  case AArch64::STRQui:
    return AArch64::STRQroW;
  case AArch64::LDRDroX:
  case AArch64::LDURDi:
  case AArch64::LDRDui:
    return AArch64::LDRDroW;
  case AArch64::STRDroX:
  case AArch64::STURDi:
  case AArch64::STRDui:
    return AArch64::STRDroW;
  case AArch64::LDRXroX:
  case AArch64::LDURXi:
  case AArch64::LDRXui:
    return AArch64::LDRXroW;
  case AArch64::STRXroX:
  case AArch64::STURXi:
  case AArch64::STRXui:
    return AArch64::STRXroW;
  case AArch64::LDRWroX:
  case AArch64::LDURWi:
  case AArch64::LDRWui:
    return AArch64::LDRWroW;
  case AArch64::LDRSWroX:
  case AArch64::LDURSWi:
  case AArch64::LDRSWui:
    return AArch64::LDRSWroW;
  case AArch64::STRWroX:
  case AArch64::STURWi:
  case AArch64::STRWui:
    return AArch64::STRWroW;
  case AArch64::LDRHroX:
  case AArch64::LDURHi:
  case AArch64::LDRHui:
    return AArch64::LDRHroW;
  case AArch64::STRHroX:
  case AArch64::STURHi:
  case AArch64::STRHui:
    return AArch64::STRHroW;
  case AArch64::LDRHHroX:
  case AArch64::LDURHHi:
  case AArch64::LDRHHui:
    return AArch64::LDRHHroW;
  case AArch64::STRHHroX:
  case AArch64::STURHHi:
  case AArch64::STRHHui:
    return AArch64::STRHHroW;
  case AArch64::LDRSHXroX:
  case AArch64::LDURSHXi:
  case AArch64::LDRSHXui:
    return AArch64::LDRSHXroW;
  case AArch64::LDRSHWroX:
  case AArch64::LDURSHWi:
  case AArch64::LDRSHWui:
    return AArch64::LDRSHWroW;
  case AArch64::LDRBroX:
  case AArch64::LDURBi:
  case AArch64::LDRBui:
    return AArch64::LDRBroW;
  case AArch64::LDRBBroX:
  case AArch64::LDURBBi:
  case AArch64::LDRBBui:
    return AArch64::LDRBBroW;
  case AArch64::LDRSBXroX:
  case AArch64::LDURSBXi:
  case AArch64::LDRSBXui:
    return AArch64::LDRSBXroW;
  case AArch64::LDRSBWroX:
  case AArch64::LDURSBWi:
  case AArch64::LDRSBWui:
    return AArch64::LDRSBWroW;
  case AArch64::STRBroX:
  case AArch64::STURBi:
  case AArch64::STRBui:
    return AArch64::STRBroW;
  case AArch64::STRBBroX:
  case AArch64::STURBBi:
  case AArch64::STRBBui:
    return AArch64::STRBBroW;
  }
}
4077
// Materialize a new load/store that performs the same memory access as \p MemI
// but with the folded addressing mode \p AM. The new instruction is inserted
// before \p MemI (which is left in place for the caller to erase) and reuses
// MemI's data register, memory operands, and MI flags. Returns the new
// instruction. Aborts on addressing-mode forms this function cannot emit.
MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
                                                 const ExtAddrMode &AM) const {

  const DebugLoc &DL = MemI.getDebugLoc();
  MachineBasicBlock &MBB = *MemI.getParent();
  MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();

  if (AM.Form == ExtAddrMode::Formula::Basic) {
    if (AM.ScaledReg) {
      // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
      unsigned Opcode = regOffsetOpcode(Opcode: MemI.getOpcode());
      // The base must be usable where SP is allowed.
      MRI.constrainRegClass(Reg: AM.BaseReg, RC: &AArch64::GPR64spRegClass);
      auto B = BuildMI(BB&: MBB, I&: MemI, MIMD: DL, MCID: get(Opcode))
                   // Data register is a def for loads, a use for stores.
                   .addReg(RegNo: MemI.getOperand(i: 0).getReg(),
                           Flags: getDefRegState(B: MemI.mayLoad()))
                   .addReg(RegNo: AM.BaseReg)
                   .addReg(RegNo: AM.ScaledReg)
                   // Extend-type operand (0) followed by the shift flag:
                   // a non-unit scale selects the shifted `lsl` form.
                   .addImm(Val: 0)
                   .addImm(Val: AM.Scale > 1)
                   .setMemRefs(MemI.memoperands())
                   .setMIFlags(MemI.getFlags());
      return B.getInstr();
    }

    assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
           "Addressing mode not supported for folding");

    // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
    // Prefer the unscaled form when the displacement fits in a signed 9-bit
    // immediate; otherwise fall back to the scaled form and divide the
    // displacement by the access size.
    unsigned Scale = 1;
    unsigned Opcode = MemI.getOpcode();
    if (isInt<9>(x: AM.Displacement))
      Opcode = unscaledOffsetOpcode(Opcode);
    else
      Opcode = scaledOffsetOpcode(Opcode, Scale);

    auto B =
        BuildMI(BB&: MBB, I&: MemI, MIMD: DL, MCID: get(Opcode))
            .addReg(RegNo: MemI.getOperand(i: 0).getReg(), Flags: getDefRegState(B: MemI.mayLoad()))
            .addReg(RegNo: AM.BaseReg)
            .addImm(Val: AM.Displacement / Scale)
            .setMemRefs(MemI.memoperands())
            .setMIFlags(MemI.getFlags());
    return B.getInstr();
  }

  if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
      AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
    // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
    assert(AM.ScaledReg && !AM.Displacement &&
           "Address offset can be a register or an immediate, but not both");
    unsigned Opcode = offsetExtendOpcode(Opcode: MemI.getOpcode());
    MRI.constrainRegClass(Reg: AM.BaseReg, RC: &AArch64::GPR64spRegClass);
    // Make sure the offset register is in the correct register class.
    // If it is a 64-bit vreg, copy its low 32 bits (sub_32) into a fresh
    // GPR32 vreg, since the `roW` forms take a 32-bit offset register.
    Register OffsetReg = AM.ScaledReg;
    const TargetRegisterClass *RC = MRI.getRegClass(Reg: OffsetReg);
    if (RC->hasSuperClassEq(RC: &AArch64::GPR64RegClass)) {
      OffsetReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
      BuildMI(BB&: MBB, I&: MemI, MIMD: DL, MCID: get(Opcode: TargetOpcode::COPY), DestReg: OffsetReg)
          .addReg(RegNo: AM.ScaledReg, Flags: {}, SubReg: AArch64::sub_32);
    }
    auto B =
        BuildMI(BB&: MBB, I&: MemI, MIMD: DL, MCID: get(Opcode))
            .addReg(RegNo: MemI.getOperand(i: 0).getReg(), Flags: getDefRegState(B: MemI.mayLoad()))
            .addReg(RegNo: AM.BaseReg)
            .addReg(RegNo: OffsetReg)
            // Sign-extend flag (sxtw vs uxtw), then the shift flag for a
            // non-unit scale.
            .addImm(Val: AM.Form == ExtAddrMode::Formula::SExtScaledReg)
            .addImm(Val: AM.Scale != 1)
            .setMemRefs(MemI.memoperands())
            .setMIFlags(MemI.getFlags());

    return B.getInstr();
  }

  llvm_unreachable(
      "Function must not be called with an addressing mode it can't handle");
}
4154
/// Return true if the opcode is a post-index ld/st instruction, which really
/// loads from base+0: the memory access happens at the unmodified base
/// address, and the base register is incremented only afterwards.
static bool isPostIndexLdStOpcode(unsigned Opcode) {
  switch (Opcode) {
  default:
    return false;
  // NEON structured loads (LD1-LD4), post-index forms.
  case AArch64::LD1Fourv16b_POST:
  case AArch64::LD1Fourv1d_POST:
  case AArch64::LD1Fourv2d_POST:
  case AArch64::LD1Fourv2s_POST:
  case AArch64::LD1Fourv4h_POST:
  case AArch64::LD1Fourv4s_POST:
  case AArch64::LD1Fourv8b_POST:
  case AArch64::LD1Fourv8h_POST:
  case AArch64::LD1Onev16b_POST:
  case AArch64::LD1Onev1d_POST:
  case AArch64::LD1Onev2d_POST:
  case AArch64::LD1Onev2s_POST:
  case AArch64::LD1Onev4h_POST:
  case AArch64::LD1Onev4s_POST:
  case AArch64::LD1Onev8b_POST:
  case AArch64::LD1Onev8h_POST:
  case AArch64::LD1Rv16b_POST:
  case AArch64::LD1Rv1d_POST:
  case AArch64::LD1Rv2d_POST:
  case AArch64::LD1Rv2s_POST:
  case AArch64::LD1Rv4h_POST:
  case AArch64::LD1Rv4s_POST:
  case AArch64::LD1Rv8b_POST:
  case AArch64::LD1Rv8h_POST:
  case AArch64::LD1Threev16b_POST:
  case AArch64::LD1Threev1d_POST:
  case AArch64::LD1Threev2d_POST:
  case AArch64::LD1Threev2s_POST:
  case AArch64::LD1Threev4h_POST:
  case AArch64::LD1Threev4s_POST:
  case AArch64::LD1Threev8b_POST:
  case AArch64::LD1Threev8h_POST:
  case AArch64::LD1Twov16b_POST:
  case AArch64::LD1Twov1d_POST:
  case AArch64::LD1Twov2d_POST:
  case AArch64::LD1Twov2s_POST:
  case AArch64::LD1Twov4h_POST:
  case AArch64::LD1Twov4s_POST:
  case AArch64::LD1Twov8b_POST:
  case AArch64::LD1Twov8h_POST:
  case AArch64::LD1i16_POST:
  case AArch64::LD1i32_POST:
  case AArch64::LD1i64_POST:
  case AArch64::LD1i8_POST:
  case AArch64::LD2Rv16b_POST:
  case AArch64::LD2Rv1d_POST:
  case AArch64::LD2Rv2d_POST:
  case AArch64::LD2Rv2s_POST:
  case AArch64::LD2Rv4h_POST:
  case AArch64::LD2Rv4s_POST:
  case AArch64::LD2Rv8b_POST:
  case AArch64::LD2Rv8h_POST:
  case AArch64::LD2Twov16b_POST:
  case AArch64::LD2Twov2d_POST:
  case AArch64::LD2Twov2s_POST:
  case AArch64::LD2Twov4h_POST:
  case AArch64::LD2Twov4s_POST:
  case AArch64::LD2Twov8b_POST:
  case AArch64::LD2Twov8h_POST:
  case AArch64::LD2i16_POST:
  case AArch64::LD2i32_POST:
  case AArch64::LD2i64_POST:
  case AArch64::LD2i8_POST:
  case AArch64::LD3Rv16b_POST:
  case AArch64::LD3Rv1d_POST:
  case AArch64::LD3Rv2d_POST:
  case AArch64::LD3Rv2s_POST:
  case AArch64::LD3Rv4h_POST:
  case AArch64::LD3Rv4s_POST:
  case AArch64::LD3Rv8b_POST:
  case AArch64::LD3Rv8h_POST:
  case AArch64::LD3Threev16b_POST:
  case AArch64::LD3Threev2d_POST:
  case AArch64::LD3Threev2s_POST:
  case AArch64::LD3Threev4h_POST:
  case AArch64::LD3Threev4s_POST:
  case AArch64::LD3Threev8b_POST:
  case AArch64::LD3Threev8h_POST:
  case AArch64::LD3i16_POST:
  case AArch64::LD3i32_POST:
  case AArch64::LD3i64_POST:
  case AArch64::LD3i8_POST:
  case AArch64::LD4Fourv16b_POST:
  case AArch64::LD4Fourv2d_POST:
  case AArch64::LD4Fourv2s_POST:
  case AArch64::LD4Fourv4h_POST:
  case AArch64::LD4Fourv4s_POST:
  case AArch64::LD4Fourv8b_POST:
  case AArch64::LD4Fourv8h_POST:
  case AArch64::LD4Rv16b_POST:
  case AArch64::LD4Rv1d_POST:
  case AArch64::LD4Rv2d_POST:
  case AArch64::LD4Rv2s_POST:
  case AArch64::LD4Rv4h_POST:
  case AArch64::LD4Rv4s_POST:
  case AArch64::LD4Rv8b_POST:
  case AArch64::LD4Rv8h_POST:
  case AArch64::LD4i16_POST:
  case AArch64::LD4i32_POST:
  case AArch64::LD4i64_POST:
  case AArch64::LD4i8_POST:
  // Scalar and pair loads, post-index forms (including acquire variants).
  case AArch64::LDAPRWpost:
  case AArch64::LDAPRXpost:
  case AArch64::LDIAPPWpost:
  case AArch64::LDIAPPXpost:
  case AArch64::LDPDpost:
  case AArch64::LDPQpost:
  case AArch64::LDPSWpost:
  case AArch64::LDPSpost:
  case AArch64::LDPWpost:
  case AArch64::LDPXpost:
  case AArch64::LDRBBpost:
  case AArch64::LDRBpost:
  case AArch64::LDRDpost:
  case AArch64::LDRHHpost:
  case AArch64::LDRHpost:
  case AArch64::LDRQpost:
  case AArch64::LDRSBWpost:
  case AArch64::LDRSBXpost:
  case AArch64::LDRSHWpost:
  case AArch64::LDRSHXpost:
  case AArch64::LDRSWpost:
  case AArch64::LDRSpost:
  case AArch64::LDRWpost:
  case AArch64::LDRXpost:
  // NEON structured stores (ST1-ST4), post-index forms.
  case AArch64::ST1Fourv16b_POST:
  case AArch64::ST1Fourv1d_POST:
  case AArch64::ST1Fourv2d_POST:
  case AArch64::ST1Fourv2s_POST:
  case AArch64::ST1Fourv4h_POST:
  case AArch64::ST1Fourv4s_POST:
  case AArch64::ST1Fourv8b_POST:
  case AArch64::ST1Fourv8h_POST:
  case AArch64::ST1Onev16b_POST:
  case AArch64::ST1Onev1d_POST:
  case AArch64::ST1Onev2d_POST:
  case AArch64::ST1Onev2s_POST:
  case AArch64::ST1Onev4h_POST:
  case AArch64::ST1Onev4s_POST:
  case AArch64::ST1Onev8b_POST:
  case AArch64::ST1Onev8h_POST:
  case AArch64::ST1Threev16b_POST:
  case AArch64::ST1Threev1d_POST:
  case AArch64::ST1Threev2d_POST:
  case AArch64::ST1Threev2s_POST:
  case AArch64::ST1Threev4h_POST:
  case AArch64::ST1Threev4s_POST:
  case AArch64::ST1Threev8b_POST:
  case AArch64::ST1Threev8h_POST:
  case AArch64::ST1Twov16b_POST:
  case AArch64::ST1Twov1d_POST:
  case AArch64::ST1Twov2d_POST:
  case AArch64::ST1Twov2s_POST:
  case AArch64::ST1Twov4h_POST:
  case AArch64::ST1Twov4s_POST:
  case AArch64::ST1Twov8b_POST:
  case AArch64::ST1Twov8h_POST:
  case AArch64::ST1i16_POST:
  case AArch64::ST1i32_POST:
  case AArch64::ST1i64_POST:
  case AArch64::ST1i8_POST:
  case AArch64::ST2GPostIndex:
  case AArch64::ST2Twov16b_POST:
  case AArch64::ST2Twov2d_POST:
  case AArch64::ST2Twov2s_POST:
  case AArch64::ST2Twov4h_POST:
  case AArch64::ST2Twov4s_POST:
  case AArch64::ST2Twov8b_POST:
  case AArch64::ST2Twov8h_POST:
  case AArch64::ST2i16_POST:
  case AArch64::ST2i32_POST:
  case AArch64::ST2i64_POST:
  case AArch64::ST2i8_POST:
  case AArch64::ST3Threev16b_POST:
  case AArch64::ST3Threev2d_POST:
  case AArch64::ST3Threev2s_POST:
  case AArch64::ST3Threev4h_POST:
  case AArch64::ST3Threev4s_POST:
  case AArch64::ST3Threev8b_POST:
  case AArch64::ST3Threev8h_POST:
  case AArch64::ST3i16_POST:
  case AArch64::ST3i32_POST:
  case AArch64::ST3i64_POST:
  case AArch64::ST3i8_POST:
  case AArch64::ST4Fourv16b_POST:
  case AArch64::ST4Fourv2d_POST:
  case AArch64::ST4Fourv2s_POST:
  case AArch64::ST4Fourv4h_POST:
  case AArch64::ST4Fourv4s_POST:
  case AArch64::ST4Fourv8b_POST:
  case AArch64::ST4Fourv8h_POST:
  case AArch64::ST4i16_POST:
  case AArch64::ST4i32_POST:
  case AArch64::ST4i64_POST:
  case AArch64::ST4i8_POST:
  // Scalar, pair, and memory-tagging stores, post-index forms.
  case AArch64::STGPostIndex:
  case AArch64::STGPpost:
  case AArch64::STPDpost:
  case AArch64::STPQpost:
  case AArch64::STPSpost:
  case AArch64::STPWpost:
  case AArch64::STPXpost:
  case AArch64::STRBBpost:
  case AArch64::STRBpost:
  case AArch64::STRDpost:
  case AArch64::STRHHpost:
  case AArch64::STRHpost:
  case AArch64::STRQpost:
  case AArch64::STRSpost:
  case AArch64::STRWpost:
  case AArch64::STRXpost:
  case AArch64::STZ2GPostIndex:
  case AArch64::STZGPostIndex:
    return true;
  }
}
4377
// Decompose a load/store into its base operand, byte offset, and access width.
// Returns false for instructions whose operand layout is not the expected
// base-register/frame-index followed by an immediate offset, or which
// getMemOpInfo does not know about. On success, BaseOp points at the base
// operand inside \p LdSt, Offset is the byte offset (immediate times scale),
// and OffsetIsScalable reports whether the scale is SVE-scalable.
bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
    const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
    bool &OffsetIsScalable, TypeSize &Width,
    const TargetRegisterInfo *TRI) const {
  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
  // Handle only loads/stores with base register followed by immediate offset.
  if (LdSt.getNumExplicitOperands() == 3) {
    // Non-paired instruction (e.g., ldr x1, [x0, #8]).
    if ((!LdSt.getOperand(i: 1).isReg() && !LdSt.getOperand(i: 1).isFI()) ||
        !LdSt.getOperand(i: 2).isImm())
      return false;
  } else if (LdSt.getNumExplicitOperands() == 4) {
    // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
    if (!LdSt.getOperand(i: 1).isReg() ||
        (!LdSt.getOperand(i: 2).isReg() && !LdSt.getOperand(i: 2).isFI()) ||
        !LdSt.getOperand(i: 3).isImm())
      return false;
  } else
    return false;

  // Get the scaling factor for the instruction and set the width for the
  // instruction.
  TypeSize Scale(0U, false);
  int64_t Dummy1, Dummy2;

  // If this returns false, then it's an instruction we don't want to handle.
  if (!getMemOpInfo(Opcode: LdSt.getOpcode(), Scale, Width, MinOffset&: Dummy1, MaxOffset&: Dummy2))
    return false;

  // Compute the offset. Offset is calculated as the immediate operand
  // multiplied by the scaling factor. Unscaled instructions have scaling factor
  // set to 1. Postindex are a special case which have an offset of 0.
  if (isPostIndexLdStOpcode(Opcode: LdSt.getOpcode())) {
    // Post-index: the access itself is at base+0 (operand 0 is the
    // writeback def, so the base is operand 2).
    BaseOp = &LdSt.getOperand(i: 2);
    Offset = 0;
  } else if (LdSt.getNumExplicitOperands() == 3) {
    BaseOp = &LdSt.getOperand(i: 1);
    Offset = LdSt.getOperand(i: 2).getImm() * Scale.getKnownMinValue();
  } else {
    assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
    BaseOp = &LdSt.getOperand(i: 2);
    Offset = LdSt.getOperand(i: 3).getImm() * Scale.getKnownMinValue();
  }
  OffsetIsScalable = Scale.isScalable();

  return BaseOp->isReg() || BaseOp->isFI();
}
4425
4426MachineOperand &
4427AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
4428 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4429 MachineOperand &OfsOp = LdSt.getOperand(i: LdSt.getNumExplicitOperands() - 1);
4430 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
4431 return OfsOp;
4432}
4433
4434bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
4435 TypeSize &Width, int64_t &MinOffset,
4436 int64_t &MaxOffset) {
4437 switch (Opcode) {
4438 // Not a memory operation or something we want to handle.
4439 default:
4440 Scale = TypeSize::getFixed(ExactSize: 0);
4441 Width = TypeSize::getFixed(ExactSize: 0);
4442 MinOffset = MaxOffset = 0;
4443 return false;
4444 // LDR / STR
4445 case AArch64::LDRQui:
4446 case AArch64::STRQui:
4447 Scale = TypeSize::getFixed(ExactSize: 16);
4448 Width = TypeSize::getFixed(ExactSize: 16);
4449 MinOffset = 0;
4450 MaxOffset = 4095;
4451 break;
4452 case AArch64::LDRXui:
4453 case AArch64::LDRDui:
4454 case AArch64::STRXui:
4455 case AArch64::STRDui:
4456 case AArch64::PRFMui:
4457 Scale = TypeSize::getFixed(ExactSize: 8);
4458 Width = TypeSize::getFixed(ExactSize: 8);
4459 MinOffset = 0;
4460 MaxOffset = 4095;
4461 break;
4462 case AArch64::LDRWui:
4463 case AArch64::LDRSui:
4464 case AArch64::LDRSWui:
4465 case AArch64::STRWui:
4466 case AArch64::STRSui:
4467 Scale = TypeSize::getFixed(ExactSize: 4);
4468 Width = TypeSize::getFixed(ExactSize: 4);
4469 MinOffset = 0;
4470 MaxOffset = 4095;
4471 break;
4472 case AArch64::LDRHui:
4473 case AArch64::LDRHHui:
4474 case AArch64::LDRSHWui:
4475 case AArch64::LDRSHXui:
4476 case AArch64::STRHui:
4477 case AArch64::STRHHui:
4478 Scale = TypeSize::getFixed(ExactSize: 2);
4479 Width = TypeSize::getFixed(ExactSize: 2);
4480 MinOffset = 0;
4481 MaxOffset = 4095;
4482 break;
4483 case AArch64::LDRBui:
4484 case AArch64::LDRBBui:
4485 case AArch64::LDRSBWui:
4486 case AArch64::LDRSBXui:
4487 case AArch64::STRBui:
4488 case AArch64::STRBBui:
4489 Scale = TypeSize::getFixed(ExactSize: 1);
4490 Width = TypeSize::getFixed(ExactSize: 1);
4491 MinOffset = 0;
4492 MaxOffset = 4095;
4493 break;
4494 // post/pre inc
4495 case AArch64::STRQpre:
4496 case AArch64::LDRQpost:
4497 Scale = TypeSize::getFixed(ExactSize: 1);
4498 Width = TypeSize::getFixed(ExactSize: 16);
4499 MinOffset = -256;
4500 MaxOffset = 255;
4501 break;
4502 case AArch64::LDRDpost:
4503 case AArch64::LDRDpre:
4504 case AArch64::LDRXpost:
4505 case AArch64::LDRXpre:
4506 case AArch64::STRDpost:
4507 case AArch64::STRDpre:
4508 case AArch64::STRXpost:
4509 case AArch64::STRXpre:
4510 Scale = TypeSize::getFixed(ExactSize: 1);
4511 Width = TypeSize::getFixed(ExactSize: 8);
4512 MinOffset = -256;
4513 MaxOffset = 255;
4514 break;
4515 case AArch64::STRWpost:
4516 case AArch64::STRWpre:
4517 case AArch64::LDRWpost:
4518 case AArch64::LDRWpre:
4519 case AArch64::STRSpost:
4520 case AArch64::STRSpre:
4521 case AArch64::LDRSpost:
4522 case AArch64::LDRSpre:
4523 Scale = TypeSize::getFixed(ExactSize: 1);
4524 Width = TypeSize::getFixed(ExactSize: 4);
4525 MinOffset = -256;
4526 MaxOffset = 255;
4527 break;
4528 case AArch64::LDRHpost:
4529 case AArch64::LDRHpre:
4530 case AArch64::STRHpost:
4531 case AArch64::STRHpre:
4532 case AArch64::LDRHHpost:
4533 case AArch64::LDRHHpre:
4534 case AArch64::STRHHpost:
4535 case AArch64::STRHHpre:
4536 Scale = TypeSize::getFixed(ExactSize: 1);
4537 Width = TypeSize::getFixed(ExactSize: 2);
4538 MinOffset = -256;
4539 MaxOffset = 255;
4540 break;
4541 case AArch64::LDRBpost:
4542 case AArch64::LDRBpre:
4543 case AArch64::STRBpost:
4544 case AArch64::STRBpre:
4545 case AArch64::LDRBBpost:
4546 case AArch64::LDRBBpre:
4547 case AArch64::STRBBpost:
4548 case AArch64::STRBBpre:
4549 Scale = TypeSize::getFixed(ExactSize: 1);
4550 Width = TypeSize::getFixed(ExactSize: 1);
4551 MinOffset = -256;
4552 MaxOffset = 255;
4553 break;
4554 // Unscaled
4555 case AArch64::LDURQi:
4556 case AArch64::STURQi:
4557 Scale = TypeSize::getFixed(ExactSize: 1);
4558 Width = TypeSize::getFixed(ExactSize: 16);
4559 MinOffset = -256;
4560 MaxOffset = 255;
4561 break;
4562 case AArch64::LDURXi:
4563 case AArch64::LDURDi:
4564 case AArch64::LDAPURXi:
4565 case AArch64::STURXi:
4566 case AArch64::STURDi:
4567 case AArch64::STLURXi:
4568 case AArch64::PRFUMi:
4569 Scale = TypeSize::getFixed(ExactSize: 1);
4570 Width = TypeSize::getFixed(ExactSize: 8);
4571 MinOffset = -256;
4572 MaxOffset = 255;
4573 break;
4574 case AArch64::LDURWi:
4575 case AArch64::LDURSi:
4576 case AArch64::LDURSWi:
4577 case AArch64::LDAPURi:
4578 case AArch64::LDAPURSWi:
4579 case AArch64::STURWi:
4580 case AArch64::STURSi:
4581 case AArch64::STLURWi:
4582 Scale = TypeSize::getFixed(ExactSize: 1);
4583 Width = TypeSize::getFixed(ExactSize: 4);
4584 MinOffset = -256;
4585 MaxOffset = 255;
4586 break;
4587 case AArch64::LDURHi:
4588 case AArch64::LDURHHi:
4589 case AArch64::LDURSHXi:
4590 case AArch64::LDURSHWi:
4591 case AArch64::LDAPURHi:
4592 case AArch64::LDAPURSHWi:
4593 case AArch64::LDAPURSHXi:
4594 case AArch64::STURHi:
4595 case AArch64::STURHHi:
4596 case AArch64::STLURHi:
4597 Scale = TypeSize::getFixed(ExactSize: 1);
4598 Width = TypeSize::getFixed(ExactSize: 2);
4599 MinOffset = -256;
4600 MaxOffset = 255;
4601 break;
4602 case AArch64::LDURBi:
4603 case AArch64::LDURBBi:
4604 case AArch64::LDURSBXi:
4605 case AArch64::LDURSBWi:
4606 case AArch64::LDAPURBi:
4607 case AArch64::LDAPURSBWi:
4608 case AArch64::LDAPURSBXi:
4609 case AArch64::STURBi:
4610 case AArch64::STURBBi:
4611 case AArch64::STLURBi:
4612 Scale = TypeSize::getFixed(ExactSize: 1);
4613 Width = TypeSize::getFixed(ExactSize: 1);
4614 MinOffset = -256;
4615 MaxOffset = 255;
4616 break;
4617 // LDP / STP (including pre/post inc)
4618 case AArch64::LDPQi:
4619 case AArch64::LDNPQi:
4620 case AArch64::STPQi:
4621 case AArch64::STNPQi:
4622 case AArch64::LDPQpost:
4623 case AArch64::LDPQpre:
4624 case AArch64::STPQpost:
4625 case AArch64::STPQpre:
4626 Scale = TypeSize::getFixed(ExactSize: 16);
4627 Width = TypeSize::getFixed(ExactSize: 16 * 2);
4628 MinOffset = -64;
4629 MaxOffset = 63;
4630 break;
4631 case AArch64::LDPXi:
4632 case AArch64::LDPDi:
4633 case AArch64::LDNPXi:
4634 case AArch64::LDNPDi:
4635 case AArch64::STPXi:
4636 case AArch64::STPDi:
4637 case AArch64::STNPXi:
4638 case AArch64::STNPDi:
4639 case AArch64::LDPDpost:
4640 case AArch64::LDPDpre:
4641 case AArch64::LDPXpost:
4642 case AArch64::LDPXpre:
4643 case AArch64::STPDpost:
4644 case AArch64::STPDpre:
4645 case AArch64::STPXpost:
4646 case AArch64::STPXpre:
4647 Scale = TypeSize::getFixed(ExactSize: 8);
4648 Width = TypeSize::getFixed(ExactSize: 8 * 2);
4649 MinOffset = -64;
4650 MaxOffset = 63;
4651 break;
4652 case AArch64::LDPWi:
4653 case AArch64::LDPSi:
4654 case AArch64::LDNPWi:
4655 case AArch64::LDNPSi:
4656 case AArch64::STPWi:
4657 case AArch64::STPSi:
4658 case AArch64::STNPWi:
4659 case AArch64::STNPSi:
4660 case AArch64::LDPSpost:
4661 case AArch64::LDPSpre:
4662 case AArch64::LDPWpost:
4663 case AArch64::LDPWpre:
4664 case AArch64::STPSpost:
4665 case AArch64::STPSpre:
4666 case AArch64::STPWpost:
4667 case AArch64::STPWpre:
4668 Scale = TypeSize::getFixed(ExactSize: 4);
4669 Width = TypeSize::getFixed(ExactSize: 4 * 2);
4670 MinOffset = -64;
4671 MaxOffset = 63;
4672 break;
4673 case AArch64::StoreSwiftAsyncContext:
4674 // Store is an STRXui, but there might be an ADDXri in the expansion too.
4675 Scale = TypeSize::getFixed(ExactSize: 1);
4676 Width = TypeSize::getFixed(ExactSize: 8);
4677 MinOffset = 0;
4678 MaxOffset = 4095;
4679 break;
4680 case AArch64::ADDG:
4681 Scale = TypeSize::getFixed(ExactSize: 16);
4682 Width = TypeSize::getFixed(ExactSize: 0);
4683 MinOffset = 0;
4684 MaxOffset = 63;
4685 break;
4686 case AArch64::TAGPstack:
4687 Scale = TypeSize::getFixed(ExactSize: 16);
4688 Width = TypeSize::getFixed(ExactSize: 0);
4689 // TAGP with a negative offset turns into SUBP, which has a maximum offset
4690 // of 63 (not 64!).
4691 MinOffset = -63;
4692 MaxOffset = 63;
4693 break;
4694 case AArch64::LDG:
4695 case AArch64::STGi:
4696 case AArch64::STGPreIndex:
4697 case AArch64::STGPostIndex:
4698 case AArch64::STZGi:
4699 case AArch64::STZGPreIndex:
4700 case AArch64::STZGPostIndex:
4701 Scale = TypeSize::getFixed(ExactSize: 16);
4702 Width = TypeSize::getFixed(ExactSize: 16);
4703 MinOffset = -256;
4704 MaxOffset = 255;
4705 break;
4706 // SVE
4707 case AArch64::STR_ZZZZXI:
4708 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
4709 case AArch64::LDR_ZZZZXI:
4710 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
4711 Scale = TypeSize::getScalable(MinimumSize: 16);
4712 Width = TypeSize::getScalable(MinimumSize: 16 * 4);
4713 MinOffset = -256;
4714 MaxOffset = 252;
4715 break;
4716 case AArch64::STR_ZZZXI:
4717 case AArch64::LDR_ZZZXI:
4718 Scale = TypeSize::getScalable(MinimumSize: 16);
4719 Width = TypeSize::getScalable(MinimumSize: 16 * 3);
4720 MinOffset = -256;
4721 MaxOffset = 253;
4722 break;
4723 case AArch64::STR_ZZXI:
4724 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
4725 case AArch64::LDR_ZZXI:
4726 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
4727 Scale = TypeSize::getScalable(MinimumSize: 16);
4728 Width = TypeSize::getScalable(MinimumSize: 16 * 2);
4729 MinOffset = -256;
4730 MaxOffset = 254;
4731 break;
4732 case AArch64::LDR_PXI:
4733 case AArch64::STR_PXI:
4734 Scale = TypeSize::getScalable(MinimumSize: 2);
4735 Width = TypeSize::getScalable(MinimumSize: 2);
4736 MinOffset = -256;
4737 MaxOffset = 255;
4738 break;
4739 case AArch64::LDR_PPXI:
4740 case AArch64::STR_PPXI:
4741 Scale = TypeSize::getScalable(MinimumSize: 2);
4742 Width = TypeSize::getScalable(MinimumSize: 2 * 2);
4743 MinOffset = -256;
4744 MaxOffset = 254;
4745 break;
4746 case AArch64::LDR_ZXI:
4747 case AArch64::STR_ZXI:
4748 Scale = TypeSize::getScalable(MinimumSize: 16);
4749 Width = TypeSize::getScalable(MinimumSize: 16);
4750 MinOffset = -256;
4751 MaxOffset = 255;
4752 break;
4753 case AArch64::LD1B_IMM:
4754 case AArch64::LD1H_IMM:
4755 case AArch64::LD1W_IMM:
4756 case AArch64::LD1D_IMM:
4757 case AArch64::LDNT1B_ZRI:
4758 case AArch64::LDNT1H_ZRI:
4759 case AArch64::LDNT1W_ZRI:
4760 case AArch64::LDNT1D_ZRI:
4761 case AArch64::ST1B_IMM:
4762 case AArch64::ST1H_IMM:
4763 case AArch64::ST1W_IMM:
4764 case AArch64::ST1D_IMM:
4765 case AArch64::STNT1B_ZRI:
4766 case AArch64::STNT1H_ZRI:
4767 case AArch64::STNT1W_ZRI:
4768 case AArch64::STNT1D_ZRI:
4769 case AArch64::LDNF1B_IMM:
4770 case AArch64::LDNF1H_IMM:
4771 case AArch64::LDNF1W_IMM:
4772 case AArch64::LDNF1D_IMM:
4773 // A full vectors worth of data
4774 // Width = mbytes * elements
4775 Scale = TypeSize::getScalable(MinimumSize: 16);
4776 Width = TypeSize::getScalable(MinimumSize: 16);
4777 MinOffset = -8;
4778 MaxOffset = 7;
4779 break;
4780 case AArch64::LD2B_IMM:
4781 case AArch64::LD2H_IMM:
4782 case AArch64::LD2W_IMM:
4783 case AArch64::LD2D_IMM:
4784 case AArch64::ST2B_IMM:
4785 case AArch64::ST2H_IMM:
4786 case AArch64::ST2W_IMM:
4787 case AArch64::ST2D_IMM:
4788 Scale = TypeSize::getScalable(MinimumSize: 32);
4789 Width = TypeSize::getScalable(MinimumSize: 16 * 2);
4790 MinOffset = -8;
4791 MaxOffset = 7;
4792 break;
4793 case AArch64::LD3B_IMM:
4794 case AArch64::LD3H_IMM:
4795 case AArch64::LD3W_IMM:
4796 case AArch64::LD3D_IMM:
4797 case AArch64::ST3B_IMM:
4798 case AArch64::ST3H_IMM:
4799 case AArch64::ST3W_IMM:
4800 case AArch64::ST3D_IMM:
4801 Scale = TypeSize::getScalable(MinimumSize: 48);
4802 Width = TypeSize::getScalable(MinimumSize: 16 * 3);
4803 MinOffset = -8;
4804 MaxOffset = 7;
4805 break;
4806 case AArch64::LD4B_IMM:
4807 case AArch64::LD4H_IMM:
4808 case AArch64::LD4W_IMM:
4809 case AArch64::LD4D_IMM:
4810 case AArch64::ST4B_IMM:
4811 case AArch64::ST4H_IMM:
4812 case AArch64::ST4W_IMM:
4813 case AArch64::ST4D_IMM:
4814 Scale = TypeSize::getScalable(MinimumSize: 64);
4815 Width = TypeSize::getScalable(MinimumSize: 16 * 4);
4816 MinOffset = -8;
4817 MaxOffset = 7;
4818 break;
4819 case AArch64::LD1B_H_IMM:
4820 case AArch64::LD1SB_H_IMM:
4821 case AArch64::LD1H_S_IMM:
4822 case AArch64::LD1SH_S_IMM:
4823 case AArch64::LD1W_D_IMM:
4824 case AArch64::LD1SW_D_IMM:
4825 case AArch64::ST1B_H_IMM:
4826 case AArch64::ST1H_S_IMM:
4827 case AArch64::ST1W_D_IMM:
4828 case AArch64::LDNF1B_H_IMM:
4829 case AArch64::LDNF1SB_H_IMM:
4830 case AArch64::LDNF1H_S_IMM:
4831 case AArch64::LDNF1SH_S_IMM:
4832 case AArch64::LDNF1W_D_IMM:
4833 case AArch64::LDNF1SW_D_IMM:
4834 // A half vector worth of data
4835 // Width = mbytes * elements
4836 Scale = TypeSize::getScalable(MinimumSize: 8);
4837 Width = TypeSize::getScalable(MinimumSize: 8);
4838 MinOffset = -8;
4839 MaxOffset = 7;
4840 break;
4841 case AArch64::LD1B_S_IMM:
4842 case AArch64::LD1SB_S_IMM:
4843 case AArch64::LD1H_D_IMM:
4844 case AArch64::LD1SH_D_IMM:
4845 case AArch64::ST1B_S_IMM:
4846 case AArch64::ST1H_D_IMM:
4847 case AArch64::LDNF1B_S_IMM:
4848 case AArch64::LDNF1SB_S_IMM:
4849 case AArch64::LDNF1H_D_IMM:
4850 case AArch64::LDNF1SH_D_IMM:
4851 // A quarter vector worth of data
4852 // Width = mbytes * elements
4853 Scale = TypeSize::getScalable(MinimumSize: 4);
4854 Width = TypeSize::getScalable(MinimumSize: 4);
4855 MinOffset = -8;
4856 MaxOffset = 7;
4857 break;
4858 case AArch64::LD1B_D_IMM:
4859 case AArch64::LD1SB_D_IMM:
4860 case AArch64::ST1B_D_IMM:
4861 case AArch64::LDNF1B_D_IMM:
4862 case AArch64::LDNF1SB_D_IMM:
4863 // A eighth vector worth of data
4864 // Width = mbytes * elements
4865 Scale = TypeSize::getScalable(MinimumSize: 2);
4866 Width = TypeSize::getScalable(MinimumSize: 2);
4867 MinOffset = -8;
4868 MaxOffset = 7;
4869 break;
4870 case AArch64::ST2Gi:
4871 case AArch64::ST2GPreIndex:
4872 case AArch64::ST2GPostIndex:
4873 case AArch64::STZ2Gi:
4874 case AArch64::STZ2GPreIndex:
4875 case AArch64::STZ2GPostIndex:
4876 Scale = TypeSize::getFixed(ExactSize: 16);
4877 Width = TypeSize::getFixed(ExactSize: 32);
4878 MinOffset = -256;
4879 MaxOffset = 255;
4880 break;
4881 case AArch64::STGPi:
4882 case AArch64::STGPpost:
4883 case AArch64::STGPpre:
4884 Scale = TypeSize::getFixed(ExactSize: 16);
4885 Width = TypeSize::getFixed(ExactSize: 16);
4886 MinOffset = -64;
4887 MaxOffset = 63;
4888 break;
4889 case AArch64::LD1RB_IMM:
4890 case AArch64::LD1RB_H_IMM:
4891 case AArch64::LD1RB_S_IMM:
4892 case AArch64::LD1RB_D_IMM:
4893 case AArch64::LD1RSB_H_IMM:
4894 case AArch64::LD1RSB_S_IMM:
4895 case AArch64::LD1RSB_D_IMM:
4896 Scale = TypeSize::getFixed(ExactSize: 1);
4897 Width = TypeSize::getFixed(ExactSize: 1);
4898 MinOffset = 0;
4899 MaxOffset = 63;
4900 break;
4901 case AArch64::LD1RH_IMM:
4902 case AArch64::LD1RH_S_IMM:
4903 case AArch64::LD1RH_D_IMM:
4904 case AArch64::LD1RSH_S_IMM:
4905 case AArch64::LD1RSH_D_IMM:
4906 Scale = TypeSize::getFixed(ExactSize: 2);
4907 Width = TypeSize::getFixed(ExactSize: 2);
4908 MinOffset = 0;
4909 MaxOffset = 63;
4910 break;
4911 case AArch64::LD1RW_IMM:
4912 case AArch64::LD1RW_D_IMM:
4913 case AArch64::LD1RSW_IMM:
4914 Scale = TypeSize::getFixed(ExactSize: 4);
4915 Width = TypeSize::getFixed(ExactSize: 4);
4916 MinOffset = 0;
4917 MaxOffset = 63;
4918 break;
4919 case AArch64::LD1RD_IMM:
4920 Scale = TypeSize::getFixed(ExactSize: 8);
4921 Width = TypeSize::getFixed(ExactSize: 8);
4922 MinOffset = 0;
4923 MaxOffset = 63;
4924 break;
4925 }
4926
4927 return true;
4928}
4929
// Scaling factor for unscaled load or store.
//
// Returns the number of bytes transferred per register operand for the given
// load/store opcode (1, 2, 4, 8 or 16). Dividing an unscaled byte offset by
// this value yields the element offset used by the scaled and paired forms.
// Opcodes without a known scale are a programming error (llvm_unreachable).
int AArch64InstrInfo::getMemScale(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has unknown scale!");
  // Byte-sized accesses (B registers, byte loads/stores).
  case AArch64::LDRBui:
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
  case AArch64::STRBui:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
    return 1;
  // Half-word accesses (H registers, half-word loads/stores).
  case AArch64::LDRHui:
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
  case AArch64::STRHui:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return 2;
  // Word accesses (W GPRs and S FP registers, including pairs).
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSpre:
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
  case AArch64::LDRSWpre:
  case AArch64::LDRWpre:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRSpre:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::STRWpre:
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPWi:
  case AArch64::STPSi:
  case AArch64::STPWi:
    return 4;
  // Double-word accesses (X GPRs and D FP registers, including pairs).
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRDpre:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::LDRXpre:
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRDpre:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::STRXpre:
  case AArch64::LDPDi:
  case AArch64::LDPXi:
  case AArch64::STPDi:
  case AArch64::STPXi:
    return 8;
  // Quad-word accesses (Q FP registers) and memory-tagging stores, which
  // operate on 16-byte tag granules.
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRQpre:
  case AArch64::LDPQi:
  case AArch64::LDRQpre:
  case AArch64::STPQi:
  case AArch64::STGi:
  case AArch64::STZGi:
  case AArch64::ST2Gi:
  case AArch64::STZ2Gi:
  case AArch64::STGPi:
    return 16;
  }
}
5007
/// Returns true if \p MI is one of the pre-indexed (base write-back) load
/// opcodes: W/X/SW GPR loads and S/D/Q FP loads.
bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::LDRWpre:
  case AArch64::LDRXpre:
  case AArch64::LDRSWpre:
  case AArch64::LDRSpre:
  case AArch64::LDRDpre:
  case AArch64::LDRQpre:
    return true;
  }
}
5021
/// Returns true if \p MI is one of the pre-indexed (base write-back) store
/// opcodes: W/X GPR stores and S/D/Q FP stores.
bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::STRWpre:
  case AArch64::STRXpre:
  case AArch64::STRSpre:
  case AArch64::STRDpre:
  case AArch64::STRQpre:
    return true;
  }
}
5034
5035bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
5036 return isPreLd(MI) || isPreSt(MI);
5037}
5038
/// Returns true if \p MI is one of the paired load/store opcodes: LDP/STP of
/// GPR (W/X) or FP (S/D/Q) registers, the sign-extending LDPSW pair, or the
/// tag-pair store STGP.
bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
  case AArch64::STGPi:
    return true;
  }
}
5058
5059const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
5060 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5061 unsigned Idx =
5062 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
5063 : 1;
5064 return MI.getOperand(i: Idx);
5065}
5066
5067const MachineOperand &
5068AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
5069 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5070 unsigned Idx =
5071 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
5072 : 2;
5073 return MI.getOperand(i: Idx);
5074}
5075
/// Returns the shift/extend-amount operand (operand 4) of a register-offset
/// ("roX") load. Only the extended-register load opcodes listed below are
/// supported; any other opcode is a programming error (llvm_unreachable).
const MachineOperand &
AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode");
  case AArch64::LDRBroX:
  case AArch64::LDRBBroX:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSBWroX:
  case AArch64::LDRHroX:
  case AArch64::LDRHHroX:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSHWroX:
  case AArch64::LDRWroX:
  case AArch64::LDRSroX:
  case AArch64::LDRSWroX:
  case AArch64::LDRDroX:
  case AArch64::LDRXroX:
  case AArch64::LDRQroX:
    return MI.getOperand(i: 4);
  }
}
5098
5099static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
5100 Register Reg) {
5101 if (MI.getParent() == nullptr)
5102 return nullptr;
5103 const MachineFunction *MF = MI.getParent()->getParent();
5104 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
5105}
5106
5107bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
5108 auto IsHFPR = [&](const MachineOperand &Op) {
5109 if (!Op.isReg())
5110 return false;
5111 auto Reg = Op.getReg();
5112 if (Reg.isPhysical())
5113 return AArch64::FPR16RegClass.contains(Reg);
5114 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5115 return TRC == &AArch64::FPR16RegClass ||
5116 TRC == &AArch64::FPR16_loRegClass;
5117 };
5118 return llvm::any_of(Range: MI.operands(), P: IsHFPR);
5119}
5120
5121bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
5122 auto IsQFPR = [&](const MachineOperand &Op) {
5123 if (!Op.isReg())
5124 return false;
5125 auto Reg = Op.getReg();
5126 if (Reg.isPhysical())
5127 return AArch64::FPR128RegClass.contains(Reg);
5128 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5129 return TRC == &AArch64::FPR128RegClass ||
5130 TRC == &AArch64::FPR128_loRegClass;
5131 };
5132 return llvm::any_of(Range: MI.operands(), P: IsQFPR);
5133}
5134
5135bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) {
5136 switch (MI.getOpcode()) {
5137 case AArch64::BRK:
5138 case AArch64::HLT:
5139 case AArch64::PACIASP:
5140 case AArch64::PACIBSP:
5141 // Implicit BTI behavior.
5142 return true;
5143 case AArch64::PAUTH_PROLOGUE:
5144 // PAUTH_PROLOGUE expands to PACI(A|B)SP.
5145 return true;
5146 case AArch64::HINT: {
5147 unsigned Imm = MI.getOperand(i: 0).getImm();
5148 // Explicit BTI instruction.
5149 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
5150 return true;
5151 // PACI(A|B)SP instructions.
5152 if (Imm == 25 || Imm == 27)
5153 return true;
5154 return false;
5155 }
5156 default:
5157 return false;
5158 }
5159}
5160
5161bool AArch64InstrInfo::isFpOrNEON(Register Reg) {
5162 if (Reg == 0)
5163 return false;
5164 assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
5165 return AArch64::FPR128RegClass.contains(Reg) ||
5166 AArch64::FPR64RegClass.contains(Reg) ||
5167 AArch64::FPR32RegClass.contains(Reg) ||
5168 AArch64::FPR16RegClass.contains(Reg) ||
5169 AArch64::FPR8RegClass.contains(Reg);
5170}
5171
5172bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
5173 auto IsFPR = [&](const MachineOperand &Op) {
5174 if (!Op.isReg())
5175 return false;
5176 auto Reg = Op.getReg();
5177 if (Reg.isPhysical())
5178 return isFpOrNEON(Reg);
5179
5180 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5181 return TRC == &AArch64::FPR128RegClass ||
5182 TRC == &AArch64::FPR128_loRegClass ||
5183 TRC == &AArch64::FPR64RegClass ||
5184 TRC == &AArch64::FPR64_loRegClass ||
5185 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
5186 TRC == &AArch64::FPR8RegClass;
5187 };
5188 return llvm::any_of(Range: MI.operands(), P: IsFPR);
5189}
5190
5191// Scale the unscaled offsets. Returns false if the unscaled offset can't be
5192// scaled.
5193static bool scaleOffset(unsigned Opc, int64_t &Offset) {
5194 int Scale = AArch64InstrInfo::getMemScale(Opc);
5195
5196 // If the byte-offset isn't a multiple of the stride, we can't scale this
5197 // offset.
5198 if (Offset % Scale != 0)
5199 return false;
5200
5201 // Convert the byte-offset used by unscaled into an "element" offset used
5202 // by the scaled pair load/store instructions.
5203 Offset /= Scale;
5204 return true;
5205}
5206
/// Returns true if loads/stores with opcodes \p FirstOpc and \p SecondOpc
/// are candidates to be combined into a load/store pair. Identical opcodes
/// always qualify; otherwise only the same-width scaled ("ui") / unscaled
/// ("U") variants listed below may pair, plus the W <-> SW (zero-extend /
/// sign-extend) load combination.
static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
  if (FirstOpc == SecondOpc)
    return true;
  // We can also pair sign-ext and zero-ext instructions.
  switch (FirstOpc) {
  default:
    return false;
  case AArch64::STRSui:
  case AArch64::STURSi:
    return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    // Zero-extending W load pairs with a sign-extending SW load.
    return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    // Sign-extending SW load pairs with a zero-extending W load.
    return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
  }
  // These instructions can't be paired based on their opcodes.
  // (Unreachable: every path through the switch above returns.)
  return false;
}
5251
5252static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
5253 int64_t Offset1, unsigned Opcode1, int FI2,
5254 int64_t Offset2, unsigned Opcode2) {
5255 // Accesses through fixed stack object frame indices may access a different
5256 // fixed stack slot. Check that the object offsets + offsets match.
5257 if (MFI.isFixedObjectIndex(ObjectIdx: FI1) && MFI.isFixedObjectIndex(ObjectIdx: FI2)) {
5258 int64_t ObjectOffset1 = MFI.getObjectOffset(ObjectIdx: FI1);
5259 int64_t ObjectOffset2 = MFI.getObjectOffset(ObjectIdx: FI2);
5260 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
5261 // Convert to scaled object offsets.
5262 int Scale1 = AArch64InstrInfo::getMemScale(Opc: Opcode1);
5263 if (ObjectOffset1 % Scale1 != 0)
5264 return false;
5265 ObjectOffset1 /= Scale1;
5266 int Scale2 = AArch64InstrInfo::getMemScale(Opc: Opcode2);
5267 if (ObjectOffset2 % Scale2 != 0)
5268 return false;
5269 ObjectOffset2 /= Scale2;
5270 ObjectOffset1 += Offset1;
5271 ObjectOffset2 += Offset2;
5272 return ObjectOffset1 + 1 == ObjectOffset2;
5273 }
5274
5275 return FI1 == FI2;
5276}
5277
/// Detect opportunities for ldp/stp formation.
///
/// Only called for LdSt for which getMemOperandWithOffset returns true.
///
/// \param BaseOps1 / BaseOps2 single base operand (register or frame index)
///        of each memory instruction, as reported by getMemOperandWithOffset.
/// \param ClusterSize number of operations in the prospective cluster.
/// \returns true if the two accesses use the same base, have pairable
///          opcodes, and their scaled offsets are exactly adjacent.
bool AArch64InstrInfo::shouldClusterMemOps(
    ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
    bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
    int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
    unsigned NumBytes) const {
  assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
  const MachineOperand &BaseOp1 = *BaseOps1.front();
  const MachineOperand &BaseOp2 = *BaseOps2.front();
  const MachineInstr &FirstLdSt = *BaseOp1.getParent();
  const MachineInstr &SecondLdSt = *BaseOp2.getParent();
  // A register base cannot cluster with a frame-index base.
  if (BaseOp1.getType() != BaseOp2.getType())
    return false;

  assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
         "Only base registers and frame indices are supported.");

  // Check for both base regs and base FI.
  if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
    return false;

  // Only cluster up to a single pair.
  if (ClusterSize > 2)
    return false;

  if (!isPairableLdStInst(MI: FirstLdSt) || !isPairableLdStInst(MI: SecondLdSt))
    return false;

  // Can we pair these instructions based on their opcodes?
  unsigned FirstOpc = FirstLdSt.getOpcode();
  unsigned SecondOpc = SecondLdSt.getOpcode();
  if (!canPairLdStOpc(FirstOpc, SecondOpc))
    return false;

  // Can't merge volatiles or load/stores that have a hint to avoid pair
  // formation, for example.
  if (!isCandidateToMergeOrPair(MI: FirstLdSt) ||
      !isCandidateToMergeOrPair(MI: SecondLdSt))
    return false;

  // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
  // Unscaled opcodes must have their byte offsets converted to element
  // offsets; an unaligned byte offset disqualifies the pair.
  int64_t Offset1 = FirstLdSt.getOperand(i: 2).getImm();
  if (hasUnscaledLdStOffset(Opc: FirstOpc) && !scaleOffset(Opc: FirstOpc, Offset&: Offset1))
    return false;

  int64_t Offset2 = SecondLdSt.getOperand(i: 2).getImm();
  if (hasUnscaledLdStOffset(Opc: SecondOpc) && !scaleOffset(Opc: SecondOpc, Offset&: Offset2))
    return false;

  // Pairwise instructions have a 7-bit signed offset field. (Only Offset1
  // needs the range check: clustering below requires Offset2 == Offset1 + 1.)
  if (Offset1 > 63 || Offset1 < -64)
    return false;

  // The caller should already have ordered First/SecondLdSt by offset.
  // Note: except for non-equal frame index bases
  if (BaseOp1.isFI()) {
    assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
           "Caller should have ordered offsets.");

    const MachineFrameInfo &MFI =
        FirstLdSt.getParent()->getParent()->getFrameInfo();
    return shouldClusterFI(MFI, FI1: BaseOp1.getIndex(), Offset1, Opcode1: FirstOpc,
                           FI2: BaseOp2.getIndex(), Offset2, Opcode2: SecondOpc);
  }

  assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");

  return Offset1 + 1 == Offset2;
}
5349
5350static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
5351 MCRegister Reg, unsigned SubIdx,
5352 RegState State,
5353 const TargetRegisterInfo *TRI) {
5354 if (!SubIdx)
5355 return MIB.addReg(RegNo: Reg, Flags: State);
5356
5357 if (Reg.isPhysical())
5358 return MIB.addReg(RegNo: TRI->getSubReg(Reg, Idx: SubIdx), Flags: State);
5359 return MIB.addReg(RegNo: Reg, Flags: State, SubReg: SubIdx);
5360}
5361
/// Returns true if copying a register tuple from \p SrcReg to \p DestReg in
/// forward (low-to-high) order would overwrite not-yet-copied source
/// sub-registers, i.e. the destination starts within the source tuple.
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
                                        unsigned NumRegs) {
  // Register encodings wrap mod 32, so the positive remainder of the
  // source-to-destination distance is what matters; unsigned wrap-around
  // plus % 32 yields exactly that.
  unsigned Distance = (DestReg - SrcReg) % 32u;
  return Distance < NumRegs;
}
5368
/// Copies a NEON register tuple sub-register by sub-register.
///
/// For each sub-register index in \p Indices, emits one instruction of the
/// form "Opcode DstSub, SrcSub, SrcSub" (the source is added twice, matching
/// the two source operands of the copy opcode; the kill flag goes only on
/// the final use). When the destination tuple overlaps the source such that
/// a forward copy would clobber not-yet-copied source sub-registers, the
/// sub-registers are copied in reverse order instead.
void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator I,
                                        const DebugLoc &DL, MCRegister DestReg,
                                        MCRegister SrcReg, bool KillSrc,
                                        unsigned Opcode,
                                        ArrayRef<unsigned> Indices) const {
  assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  uint16_t DestEncoding = TRI->getEncodingValue(Reg: DestReg);
  uint16_t SrcEncoding = TRI->getEncodingValue(Reg: SrcReg);
  unsigned NumRegs = Indices.size();

  // Default to a forward (low-to-high) copy; flip to reverse if forward
  // order would overwrite source sub-registers before they are read.
  int SubReg = 0, End = NumRegs, Incr = 1;
  if (forwardCopyWillClobberTuple(DestReg: DestEncoding, SrcReg: SrcEncoding, NumRegs)) {
    SubReg = NumRegs - 1;
    End = -1;
    Incr = -1;
  }

  for (; SubReg != End; SubReg += Incr) {
    const MachineInstrBuilder MIB = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode));
    AddSubReg(MIB, Reg: DestReg, SubIdx: Indices[SubReg], State: RegState::Define, TRI);
    AddSubReg(MIB, Reg: SrcReg, SubIdx: Indices[SubReg], State: {}, TRI);
    AddSubReg(MIB, Reg: SrcReg, SubIdx: Indices[SubReg], State: getKillRegState(B: KillSrc), TRI);
  }
}
5395
/// Copies a GPR register tuple sub-register by sub-register.
///
/// For each sub-register index in \p Indices, emits one instruction of the
/// form "Opcode DstSub, ZeroReg, SrcSub, #0" (e.g. an ORR-with-zero-register
/// copy). Unlike copyPhysRegTuple, no reverse-order handling is needed: the
/// encoding-alignment assertion below guarantees the tuples cannot partially
/// overlap.
void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator I,
                                       const DebugLoc &DL, MCRegister DestReg,
                                       MCRegister SrcReg, bool KillSrc,
                                       unsigned Opcode, unsigned ZeroReg,
                                       llvm::ArrayRef<unsigned> Indices) const {
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  unsigned NumRegs = Indices.size();

#ifndef NDEBUG
  // Tuple encodings are NumRegs-aligned, so distinct tuples are disjoint.
  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
  assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
         "GPR reg sequences should not be able to overlap");
#endif

  for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
    const MachineInstrBuilder MIB = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode));
    AddSubReg(MIB, Reg: DestReg, SubIdx: Indices[SubReg], State: RegState::Define, TRI);
    MIB.addReg(RegNo: ZeroReg);
    AddSubReg(MIB, Reg: SrcReg, SubIdx: Indices[SubReg], State: getKillRegState(B: KillSrc), TRI);
    MIB.addImm(Val: 0);
  }
}
5420
5421/// Returns true if the instruction at I is in a streaming call site region,
5422/// within a single basic block.
5423/// A "call site streaming region" starts after smstart and ends at smstop
5424/// around a call to a streaming function. This walks backward from I.
5425static bool isInStreamingCallSiteRegion(MachineBasicBlock &MBB,
5426 MachineBasicBlock::iterator I) {
5427 MachineFunction &MF = *MBB.getParent();
5428 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
5429 if (!AFI->hasStreamingModeChanges())
5430 return false;
5431 // Walk backwards to find smstart/smstop
5432 for (MachineInstr &MI : reverse(C: make_range(x: MBB.begin(), y: I))) {
5433 unsigned Opc = MI.getOpcode();
5434 if (Opc == AArch64::MSRpstatesvcrImm1 || Opc == AArch64::MSRpstatePseudo) {
5435 // Check if this is SM change (not ZA)
5436 int64_t PState = MI.getOperand(i: 0).getImm();
5437 if (PState == AArch64SVCR::SVCRSM || PState == AArch64SVCR::SVCRSMZA) {
5438 // Operand 1 is 1 for start, 0 for stop
5439 return MI.getOperand(i: 1).getImm() == 1;
5440 }
5441 }
5442 }
5443 return false;
5444}
5445
5446/// Returns true if in a streaming call site region without SME-FA64.
5447static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget,
5448 MachineBasicBlock &MBB,
5449 MachineBasicBlock::iterator I) {
5450 return !Subtarget.hasSMEFA64() && isInStreamingCallSiteRegion(MBB, I);
5451}
5452
5453void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
5454 MachineBasicBlock::iterator I,
5455 const DebugLoc &DL, Register DestReg,
5456 Register SrcReg, bool KillSrc,
5457 bool RenamableDest,
5458 bool RenamableSrc) const {
5459 ++NumCopyInstrs;
5460 if (AArch64::GPR32spRegClass.contains(Reg: DestReg) &&
5461 AArch64::GPR32spRegClass.contains(Reg: SrcReg)) {
5462 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
5463 // If either operand is WSP, expand to ADD #0.
5464 if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5465 !Subtarget.hasZeroCycleRegMoveGPR32()) {
5466 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
5467 MCRegister DestRegX = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::sub_32,
5468 RC: &AArch64::GPR64spRegClass);
5469 MCRegister SrcRegX = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::sub_32,
5470 RC: &AArch64::GPR64spRegClass);
5471 // This instruction is reading and writing X registers. This may upset
5472 // the register scavenger and machine verifier, so we need to indicate
5473 // that we are reading an undefined value from SrcRegX, but a proper
5474 // value from SrcReg.
5475 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ADDXri), DestReg: DestRegX)
5476 .addReg(RegNo: SrcRegX, Flags: RegState::Undef)
5477 .addImm(Val: 0)
5478 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0))
5479 .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
5480 ++NumZCRegMoveInstrsGPR;
5481 } else {
5482 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ADDWri), DestReg)
5483 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc))
5484 .addImm(Val: 0)
5485 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0));
5486 if (Subtarget.hasZeroCycleRegMoveGPR32())
5487 ++NumZCRegMoveInstrsGPR;
5488 }
5489 } else if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5490 !Subtarget.hasZeroCycleRegMoveGPR32()) {
5491 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
5492 MCRegister DestRegX = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::sub_32,
5493 RC: &AArch64::GPR64spRegClass);
5494 assert(DestRegX.isValid() && "Destination super-reg not valid");
5495 MCRegister SrcRegX = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::sub_32,
5496 RC: &AArch64::GPR64spRegClass);
5497 assert(SrcRegX.isValid() && "Source super-reg not valid");
5498 // This instruction is reading and writing X registers. This may upset
5499 // the register scavenger and machine verifier, so we need to indicate
5500 // that we are reading an undefined value from SrcRegX, but a proper
5501 // value from SrcReg.
5502 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRXrr), DestReg: DestRegX)
5503 .addReg(RegNo: AArch64::XZR)
5504 .addReg(RegNo: SrcRegX, Flags: RegState::Undef)
5505 .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
5506 ++NumZCRegMoveInstrsGPR;
5507 } else {
5508 // Otherwise, expand to ORR WZR.
5509 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRWrr), DestReg)
5510 .addReg(RegNo: AArch64::WZR)
5511 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5512 if (Subtarget.hasZeroCycleRegMoveGPR32())
5513 ++NumZCRegMoveInstrsGPR;
5514 }
5515 return;
5516 }
5517
5518 // GPR32 zeroing
5519 if (AArch64::GPR32spRegClass.contains(Reg: DestReg) && SrcReg == AArch64::WZR) {
5520 if (Subtarget.hasZeroCycleZeroingGPR64() &&
5521 !Subtarget.hasZeroCycleZeroingGPR32()) {
5522 MCRegister DestRegX = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::sub_32,
5523 RC: &AArch64::GPR64spRegClass);
5524 assert(DestRegX.isValid() && "Destination super-reg not valid");
5525 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::MOVZXi), DestReg: DestRegX)
5526 .addImm(Val: 0)
5527 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0));
5528 ++NumZCZeroingInstrsGPR;
5529 } else if (Subtarget.hasZeroCycleZeroingGPR32()) {
5530 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::MOVZWi), DestReg)
5531 .addImm(Val: 0)
5532 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0));
5533 ++NumZCZeroingInstrsGPR;
5534 } else {
5535 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRWrr), DestReg)
5536 .addReg(RegNo: AArch64::WZR)
5537 .addReg(RegNo: AArch64::WZR);
5538 }
5539 return;
5540 }
5541
5542 if (AArch64::GPR64spRegClass.contains(Reg: DestReg) &&
5543 AArch64::GPR64spRegClass.contains(Reg: SrcReg)) {
5544 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
5545 // If either operand is SP, expand to ADD #0.
5546 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ADDXri), DestReg)
5547 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc))
5548 .addImm(Val: 0)
5549 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0));
5550 if (Subtarget.hasZeroCycleRegMoveGPR64())
5551 ++NumZCRegMoveInstrsGPR;
5552 } else {
5553 // Otherwise, expand to ORR XZR.
5554 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRXrr), DestReg)
5555 .addReg(RegNo: AArch64::XZR)
5556 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5557 if (Subtarget.hasZeroCycleRegMoveGPR64())
5558 ++NumZCRegMoveInstrsGPR;
5559 }
5560 return;
5561 }
5562
5563 // GPR64 zeroing
5564 if (AArch64::GPR64spRegClass.contains(Reg: DestReg) && SrcReg == AArch64::XZR) {
5565 if (Subtarget.hasZeroCycleZeroingGPR64()) {
5566 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::MOVZXi), DestReg)
5567 .addImm(Val: 0)
5568 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0));
5569 ++NumZCZeroingInstrsGPR;
5570 } else {
5571 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRXrr), DestReg)
5572 .addReg(RegNo: AArch64::XZR)
5573 .addReg(RegNo: AArch64::XZR);
5574 }
5575 return;
5576 }
5577
5578 // Copy a Predicate register by ORRing with itself.
5579 if (AArch64::PPRRegClass.contains(Reg: DestReg) &&
5580 AArch64::PPRRegClass.contains(Reg: SrcReg)) {
5581 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5582 "Unexpected SVE register.");
5583 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORR_PPzPP), DestReg)
5584 .addReg(RegNo: SrcReg) // Pg
5585 .addReg(RegNo: SrcReg)
5586 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5587 return;
5588 }
5589
5590 // Copy a predicate-as-counter register by ORRing with itself as if it
5591 // were a regular predicate (mask) register.
5592 bool DestIsPNR = AArch64::PNRRegClass.contains(Reg: DestReg);
5593 bool SrcIsPNR = AArch64::PNRRegClass.contains(Reg: SrcReg);
5594 if (DestIsPNR || SrcIsPNR) {
5595 auto ToPPR = [](MCRegister R) -> MCRegister {
5596 return (R - AArch64::PN0) + AArch64::P0;
5597 };
5598 MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg.asMCReg();
5599 MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg.asMCReg();
5600
5601 if (PPRSrcReg != PPRDestReg) {
5602 auto NewMI = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORR_PPzPP), DestReg: PPRDestReg)
5603 .addReg(RegNo: PPRSrcReg) // Pg
5604 .addReg(RegNo: PPRSrcReg)
5605 .addReg(RegNo: PPRSrcReg, Flags: getKillRegState(B: KillSrc));
5606 if (DestIsPNR)
5607 NewMI.addDef(RegNo: DestReg, Flags: RegState::Implicit);
5608 }
5609 return;
5610 }
5611
5612 // Copy a Z register by ORRing with itself.
5613 if (AArch64::ZPRRegClass.contains(Reg: DestReg) &&
5614 AArch64::ZPRRegClass.contains(Reg: SrcReg)) {
5615 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5616 "Unexpected SVE register.");
5617 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORR_ZZZ), DestReg)
5618 .addReg(RegNo: SrcReg)
5619 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5620 return;
5621 }
5622
5623 // Copy a Z register pair by copying the individual sub-registers.
5624 if ((AArch64::ZPR2RegClass.contains(Reg: DestReg) ||
5625 AArch64::ZPR2StridedOrContiguousRegClass.contains(Reg: DestReg)) &&
5626 (AArch64::ZPR2RegClass.contains(Reg: SrcReg) ||
5627 AArch64::ZPR2StridedOrContiguousRegClass.contains(Reg: SrcReg))) {
5628 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5629 "Unexpected SVE register.");
5630 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
5631 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORR_ZZZ,
5632 Indices);
5633 return;
5634 }
5635
5636 // Copy a Z register triple by copying the individual sub-registers.
5637 if (AArch64::ZPR3RegClass.contains(Reg: DestReg) &&
5638 AArch64::ZPR3RegClass.contains(Reg: SrcReg)) {
5639 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5640 "Unexpected SVE register.");
5641 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5642 AArch64::zsub2};
5643 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORR_ZZZ,
5644 Indices);
5645 return;
5646 }
5647
5648 // Copy a Z register quad by copying the individual sub-registers.
5649 if ((AArch64::ZPR4RegClass.contains(Reg: DestReg) ||
5650 AArch64::ZPR4StridedOrContiguousRegClass.contains(Reg: DestReg)) &&
5651 (AArch64::ZPR4RegClass.contains(Reg: SrcReg) ||
5652 AArch64::ZPR4StridedOrContiguousRegClass.contains(Reg: SrcReg))) {
5653 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5654 "Unexpected SVE register.");
5655 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5656 AArch64::zsub2, AArch64::zsub3};
5657 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORR_ZZZ,
5658 Indices);
5659 return;
5660 }
5661
5662 // Copy a DDDD register quad by copying the individual sub-registers.
5663 if (AArch64::DDDDRegClass.contains(Reg: DestReg) &&
5664 AArch64::DDDDRegClass.contains(Reg: SrcReg)) {
5665 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5666 AArch64::dsub2, AArch64::dsub3};
5667 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv8i8,
5668 Indices);
5669 return;
5670 }
5671
5672 // Copy a DDD register triple by copying the individual sub-registers.
5673 if (AArch64::DDDRegClass.contains(Reg: DestReg) &&
5674 AArch64::DDDRegClass.contains(Reg: SrcReg)) {
5675 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5676 AArch64::dsub2};
5677 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv8i8,
5678 Indices);
5679 return;
5680 }
5681
5682 // Copy a DD register pair by copying the individual sub-registers.
5683 if (AArch64::DDRegClass.contains(Reg: DestReg) &&
5684 AArch64::DDRegClass.contains(Reg: SrcReg)) {
5685 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
5686 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv8i8,
5687 Indices);
5688 return;
5689 }
5690
5691 // Copy a QQQQ register quad by copying the individual sub-registers.
5692 if (AArch64::QQQQRegClass.contains(Reg: DestReg) &&
5693 AArch64::QQQQRegClass.contains(Reg: SrcReg)) {
5694 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5695 AArch64::qsub2, AArch64::qsub3};
5696 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv16i8,
5697 Indices);
5698 return;
5699 }
5700
5701 // Copy a QQQ register triple by copying the individual sub-registers.
5702 if (AArch64::QQQRegClass.contains(Reg: DestReg) &&
5703 AArch64::QQQRegClass.contains(Reg: SrcReg)) {
5704 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5705 AArch64::qsub2};
5706 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv16i8,
5707 Indices);
5708 return;
5709 }
5710
5711 // Copy a QQ register pair by copying the individual sub-registers.
5712 if (AArch64::QQRegClass.contains(Reg: DestReg) &&
5713 AArch64::QQRegClass.contains(Reg: SrcReg)) {
5714 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
5715 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv16i8,
5716 Indices);
5717 return;
5718 }
5719
5720 if (AArch64::XSeqPairsClassRegClass.contains(Reg: DestReg) &&
5721 AArch64::XSeqPairsClassRegClass.contains(Reg: SrcReg)) {
5722 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
5723 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRXrs,
5724 ZeroReg: AArch64::XZR, Indices);
5725 return;
5726 }
5727
5728 if (AArch64::WSeqPairsClassRegClass.contains(Reg: DestReg) &&
5729 AArch64::WSeqPairsClassRegClass.contains(Reg: SrcReg)) {
5730 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
5731 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRWrs,
5732 ZeroReg: AArch64::WZR, Indices);
5733 return;
5734 }
5735
5736 if (AArch64::FPR128RegClass.contains(Reg: DestReg) &&
5737 AArch64::FPR128RegClass.contains(Reg: SrcReg)) {
5738 // In streaming regions, NEON is illegal but streaming-SVE is available.
5739 // Use SVE for copies if we're in a streaming region and SME is available.
5740 // With +sme-fa64, NEON is legal in streaming mode so we can use it.
5741 if ((Subtarget.isSVEorStreamingSVEAvailable() &&
5742 !Subtarget.isNeonAvailable()) ||
5743 mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5744 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORR_ZZZ))
5745 .addReg(RegNo: AArch64::Z0 + (DestReg - AArch64::Q0), Flags: RegState::Define)
5746 .addReg(RegNo: AArch64::Z0 + (SrcReg - AArch64::Q0))
5747 .addReg(RegNo: AArch64::Z0 + (SrcReg - AArch64::Q0));
5748 } else if (Subtarget.isNeonAvailable()) {
5749 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRv16i8), DestReg)
5750 .addReg(RegNo: SrcReg)
5751 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5752 if (Subtarget.hasZeroCycleRegMoveFPR128())
5753 ++NumZCRegMoveInstrsFPR;
5754 } else {
5755 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::STRQpre))
5756 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
5757 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc))
5758 .addReg(RegNo: AArch64::SP)
5759 .addImm(Val: -16);
5760 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::LDRQpost))
5761 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
5762 .addReg(RegNo: DestReg, Flags: RegState::Define)
5763 .addReg(RegNo: AArch64::SP)
5764 .addImm(Val: 16);
5765 }
5766 return;
5767 }
5768
5769 if (AArch64::FPR64RegClass.contains(Reg: DestReg) &&
5770 AArch64::FPR64RegClass.contains(Reg: SrcReg)) {
5771 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5772 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5773 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
5774 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5775 MCRegister DestRegQ = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::dsub,
5776 RC: &AArch64::FPR128RegClass);
5777 MCRegister SrcRegQ = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::dsub,
5778 RC: &AArch64::FPR128RegClass);
5779 // This instruction is reading and writing Q registers. This may upset
5780 // the register scavenger and machine verifier, so we need to indicate
5781 // that we are reading an undefined value from SrcRegQ, but a proper
5782 // value from SrcReg.
5783 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRv16i8), DestReg: DestRegQ)
5784 .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
5785 .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
5786 .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
5787 ++NumZCRegMoveInstrsFPR;
5788 } else {
5789 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVDr), DestReg)
5790 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5791 if (Subtarget.hasZeroCycleRegMoveFPR64())
5792 ++NumZCRegMoveInstrsFPR;
5793 }
5794 return;
5795 }
5796
5797 if (AArch64::FPR32RegClass.contains(Reg: DestReg) &&
5798 AArch64::FPR32RegClass.contains(Reg: SrcReg)) {
5799 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5800 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5801 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
5802 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5803 MCRegister DestRegQ = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::ssub,
5804 RC: &AArch64::FPR128RegClass);
5805 MCRegister SrcRegQ = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::ssub,
5806 RC: &AArch64::FPR128RegClass);
5807 // This instruction is reading and writing Q registers. This may upset
5808 // the register scavenger and machine verifier, so we need to indicate
5809 // that we are reading an undefined value from SrcRegQ, but a proper
5810 // value from SrcReg.
5811 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRv16i8), DestReg: DestRegQ)
5812 .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
5813 .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
5814 .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
5815 ++NumZCRegMoveInstrsFPR;
5816 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5817 !Subtarget.hasZeroCycleRegMoveFPR32()) {
5818 MCRegister DestRegD = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::ssub,
5819 RC: &AArch64::FPR64RegClass);
5820 MCRegister SrcRegD = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::ssub,
5821 RC: &AArch64::FPR64RegClass);
5822 // This instruction is reading and writing D registers. This may upset
5823 // the register scavenger and machine verifier, so we need to indicate
5824 // that we are reading an undefined value from SrcRegD, but a proper
5825 // value from SrcReg.
5826 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVDr), DestReg: DestRegD)
5827 .addReg(RegNo: SrcRegD, Flags: RegState::Undef)
5828 .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
5829 ++NumZCRegMoveInstrsFPR;
5830 } else {
5831 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVSr), DestReg)
5832 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5833 if (Subtarget.hasZeroCycleRegMoveFPR32())
5834 ++NumZCRegMoveInstrsFPR;
5835 }
5836 return;
5837 }
5838
5839 if (AArch64::FPR16RegClass.contains(Reg: DestReg) &&
5840 AArch64::FPR16RegClass.contains(Reg: SrcReg)) {
5841 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5842 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5843 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
5844 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5845 MCRegister DestRegQ = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::hsub,
5846 RC: &AArch64::FPR128RegClass);
5847 MCRegister SrcRegQ = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::hsub,
5848 RC: &AArch64::FPR128RegClass);
5849 // This instruction is reading and writing Q registers. This may upset
5850 // the register scavenger and machine verifier, so we need to indicate
5851 // that we are reading an undefined value from SrcRegQ, but a proper
5852 // value from SrcReg.
5853 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRv16i8), DestReg: DestRegQ)
5854 .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
5855 .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
5856 .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
5857 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5858 !Subtarget.hasZeroCycleRegMoveFPR32()) {
5859 MCRegister DestRegD = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::hsub,
5860 RC: &AArch64::FPR64RegClass);
5861 MCRegister SrcRegD = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::hsub,
5862 RC: &AArch64::FPR64RegClass);
5863 // This instruction is reading and writing D registers. This may upset
5864 // the register scavenger and machine verifier, so we need to indicate
5865 // that we are reading an undefined value from SrcRegD, but a proper
5866 // value from SrcReg.
5867 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVDr), DestReg: DestRegD)
5868 .addReg(RegNo: SrcRegD, Flags: RegState::Undef)
5869 .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
5870 } else {
5871 DestReg = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::hsub,
5872 RC: &AArch64::FPR32RegClass);
5873 SrcReg = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::hsub,
5874 RC: &AArch64::FPR32RegClass);
5875 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVSr), DestReg)
5876 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5877 }
5878 return;
5879 }
5880
5881 if (AArch64::FPR8RegClass.contains(Reg: DestReg) &&
5882 AArch64::FPR8RegClass.contains(Reg: SrcReg)) {
5883 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5884 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5885 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
5886 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5887 MCRegister DestRegQ = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::bsub,
5888 RC: &AArch64::FPR128RegClass);
5889 MCRegister SrcRegQ = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::bsub,
5890 RC: &AArch64::FPR128RegClass);
5891 // This instruction is reading and writing Q registers. This may upset
5892 // the register scavenger and machine verifier, so we need to indicate
5893 // that we are reading an undefined value from SrcRegQ, but a proper
5894 // value from SrcReg.
5895 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRv16i8), DestReg: DestRegQ)
5896 .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
5897 .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
5898 .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
5899 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5900 !Subtarget.hasZeroCycleRegMoveFPR32()) {
5901 MCRegister DestRegD = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::bsub,
5902 RC: &AArch64::FPR64RegClass);
5903 MCRegister SrcRegD = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::bsub,
5904 RC: &AArch64::FPR64RegClass);
5905 // This instruction is reading and writing D registers. This may upset
5906 // the register scavenger and machine verifier, so we need to indicate
5907 // that we are reading an undefined value from SrcRegD, but a proper
5908 // value from SrcReg.
5909 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVDr), DestReg: DestRegD)
5910 .addReg(RegNo: SrcRegD, Flags: RegState::Undef)
5911 .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
5912 } else {
5913 DestReg = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::bsub,
5914 RC: &AArch64::FPR32RegClass);
5915 SrcReg = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::bsub,
5916 RC: &AArch64::FPR32RegClass);
5917 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVSr), DestReg)
5918 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5919 }
5920 return;
5921 }
5922
5923 // Copies between GPR64 and FPR64.
5924 if (AArch64::FPR64RegClass.contains(Reg: DestReg) &&
5925 AArch64::GPR64RegClass.contains(Reg: SrcReg)) {
5926 if (AArch64::XZR == SrcReg) {
5927 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVD0), DestReg);
5928 } else {
5929 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVXDr), DestReg)
5930 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5931 }
5932 return;
5933 }
5934 if (AArch64::GPR64RegClass.contains(Reg: DestReg) &&
5935 AArch64::FPR64RegClass.contains(Reg: SrcReg)) {
5936 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVDXr), DestReg)
5937 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5938 return;
5939 }
5940 // Copies between GPR32 and FPR32.
5941 if (AArch64::FPR32RegClass.contains(Reg: DestReg) &&
5942 AArch64::GPR32RegClass.contains(Reg: SrcReg)) {
5943 if (AArch64::WZR == SrcReg) {
5944 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVS0), DestReg);
5945 } else {
5946 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVWSr), DestReg)
5947 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5948 }
5949 return;
5950 }
5951 if (AArch64::GPR32RegClass.contains(Reg: DestReg) &&
5952 AArch64::FPR32RegClass.contains(Reg: SrcReg)) {
5953 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVSWr), DestReg)
5954 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
5955 return;
5956 }
5957
5958 if (DestReg == AArch64::NZCV) {
5959 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
5960 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::MSR))
5961 .addImm(Val: AArch64SysReg::NZCV)
5962 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc))
5963 .addReg(RegNo: AArch64::NZCV, Flags: RegState::Implicit | RegState::Define);
5964 return;
5965 }
5966
5967 if (SrcReg == AArch64::NZCV) {
5968 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
5969 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::MRS), DestReg)
5970 .addImm(Val: AArch64SysReg::NZCV)
5971 .addReg(RegNo: AArch64::NZCV, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
5972 return;
5973 }
5974
5975#ifndef NDEBUG
5976 errs() << RI.getRegAsmName(DestReg) << " = COPY " << RI.getRegAsmName(SrcReg)
5977 << "\n";
5978#endif
5979 llvm_unreachable("unimplemented reg-to-reg copy");
5980}
5981
5982static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
5983 MachineBasicBlock &MBB,
5984 MachineBasicBlock::iterator InsertBefore,
5985 const MCInstrDesc &MCID,
5986 Register SrcReg, bool IsKill,
5987 unsigned SubIdx0, unsigned SubIdx1, int FI,
5988 MachineMemOperand *MMO) {
5989 Register SrcReg0 = SrcReg;
5990 Register SrcReg1 = SrcReg;
5991 if (SrcReg.isPhysical()) {
5992 SrcReg0 = TRI.getSubReg(Reg: SrcReg, Idx: SubIdx0);
5993 SubIdx0 = 0;
5994 SrcReg1 = TRI.getSubReg(Reg: SrcReg, Idx: SubIdx1);
5995 SubIdx1 = 0;
5996 }
5997 BuildMI(BB&: MBB, I: InsertBefore, MIMD: DebugLoc(), MCID)
5998 .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: IsKill), SubReg: SubIdx0)
5999 .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: IsKill), SubReg: SubIdx1)
6000 .addFrameIndex(Idx: FI)
6001 .addImm(Val: 0)
6002 .addMemOperand(MMO);
6003}
6004
6005void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
6006 MachineBasicBlock::iterator MBBI,
6007 Register SrcReg, bool isKill, int FI,
6008 const TargetRegisterClass *RC,
6009 Register VReg,
6010 MachineInstr::MIFlag Flags) const {
6011 MachineFunction &MF = *MBB.getParent();
6012 MachineFrameInfo &MFI = MF.getFrameInfo();
6013
6014 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
6015 MachineMemOperand *MMO =
6016 MF.getMachineMemOperand(PtrInfo, F: MachineMemOperand::MOStore,
6017 Size: MFI.getObjectSize(ObjectIdx: FI), BaseAlignment: MFI.getObjectAlign(ObjectIdx: FI));
6018 unsigned Opc = 0;
6019 bool Offset = true;
6020 MCRegister PNRReg = MCRegister::NoRegister;
6021 unsigned StackID = TargetStackID::Default;
6022 switch (RI.getSpillSize(RC: *RC)) {
6023 case 1:
6024 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6025 Opc = AArch64::STRBui;
6026 break;
6027 case 2: {
6028 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6029 Opc = AArch64::STRHui;
6030 else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
6031 AArch64::PPRRegClass.hasSubClassEq(RC)) {
6032 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6033 "Unexpected register store without SVE store instructions");
6034 Opc = AArch64::STR_PXI;
6035 StackID = TargetStackID::ScalablePredicateVector;
6036 }
6037 break;
6038 }
6039 case 4:
6040 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6041 Opc = AArch64::STRWui;
6042 if (SrcReg.isVirtual())
6043 MF.getRegInfo().constrainRegClass(Reg: SrcReg, RC: &AArch64::GPR32RegClass);
6044 else
6045 assert(SrcReg != AArch64::WSP);
6046 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6047 Opc = AArch64::STRSui;
6048 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6049 Opc = AArch64::STR_PPXI;
6050 StackID = TargetStackID::ScalablePredicateVector;
6051 }
6052 break;
6053 case 8:
6054 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6055 Opc = AArch64::STRXui;
6056 if (SrcReg.isVirtual())
6057 MF.getRegInfo().constrainRegClass(Reg: SrcReg, RC: &AArch64::GPR64RegClass);
6058 else
6059 assert(SrcReg != AArch64::SP);
6060 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6061 Opc = AArch64::STRDui;
6062 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
6063 storeRegPairToStackSlot(TRI: getRegisterInfo(), MBB, InsertBefore: MBBI,
6064 MCID: get(Opcode: AArch64::STPWi), SrcReg, IsKill: isKill,
6065 SubIdx0: AArch64::sube32, SubIdx1: AArch64::subo32, FI, MMO);
6066 return;
6067 }
6068 break;
6069 case 16:
6070 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6071 Opc = AArch64::STRQui;
6072 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6073 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6074 Opc = AArch64::ST1Twov1d;
6075 Offset = false;
6076 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
6077 storeRegPairToStackSlot(TRI: getRegisterInfo(), MBB, InsertBefore: MBBI,
6078 MCID: get(Opcode: AArch64::STPXi), SrcReg, IsKill: isKill,
6079 SubIdx0: AArch64::sube64, SubIdx1: AArch64::subo64, FI, MMO);
6080 return;
6081 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6082 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6083 "Unexpected register store without SVE store instructions");
6084 Opc = AArch64::STR_ZXI;
6085 StackID = TargetStackID::ScalableVector;
6086 }
6087 break;
6088 case 24:
6089 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6090 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6091 Opc = AArch64::ST1Threev1d;
6092 Offset = false;
6093 }
6094 break;
6095 case 32:
6096 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6097 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6098 Opc = AArch64::ST1Fourv1d;
6099 Offset = false;
6100 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6101 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6102 Opc = AArch64::ST1Twov2d;
6103 Offset = false;
6104 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6105 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6106 "Unexpected register store without SVE store instructions");
6107 Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
6108 StackID = TargetStackID::ScalableVector;
6109 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6110 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6111 "Unexpected register store without SVE store instructions");
6112 Opc = AArch64::STR_ZZXI;
6113 StackID = TargetStackID::ScalableVector;
6114 }
6115 break;
6116 case 48:
6117 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6118 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6119 Opc = AArch64::ST1Threev2d;
6120 Offset = false;
6121 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6122 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6123 "Unexpected register store without SVE store instructions");
6124 Opc = AArch64::STR_ZZZXI;
6125 StackID = TargetStackID::ScalableVector;
6126 }
6127 break;
6128 case 64:
6129 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6130 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6131 Opc = AArch64::ST1Fourv2d;
6132 Offset = false;
6133 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6134 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6135 "Unexpected register store without SVE store instructions");
6136 Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
6137 StackID = TargetStackID::ScalableVector;
6138 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6139 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6140 "Unexpected register store without SVE store instructions");
6141 Opc = AArch64::STR_ZZZZXI;
6142 StackID = TargetStackID::ScalableVector;
6143 }
6144 break;
6145 }
6146 assert(Opc && "Unknown register class");
6147 MFI.setStackID(ObjectIdx: FI, ID: StackID);
6148
6149 const MachineInstrBuilder MI = BuildMI(BB&: MBB, I: MBBI, MIMD: DebugLoc(), MCID: get(Opcode: Opc))
6150 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: isKill))
6151 .addFrameIndex(Idx: FI);
6152
6153 if (Offset)
6154 MI.addImm(Val: 0);
6155 if (PNRReg.isValid())
6156 MI.addDef(RegNo: PNRReg, Flags: RegState::Implicit);
6157 MI.addMemOperand(MMO);
6158}
6159
6160static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
6161 MachineBasicBlock &MBB,
6162 MachineBasicBlock::iterator InsertBefore,
6163 const MCInstrDesc &MCID,
6164 Register DestReg, unsigned SubIdx0,
6165 unsigned SubIdx1, int FI,
6166 MachineMemOperand *MMO) {
6167 Register DestReg0 = DestReg;
6168 Register DestReg1 = DestReg;
6169 bool IsUndef = true;
6170 if (DestReg.isPhysical()) {
6171 DestReg0 = TRI.getSubReg(Reg: DestReg, Idx: SubIdx0);
6172 SubIdx0 = 0;
6173 DestReg1 = TRI.getSubReg(Reg: DestReg, Idx: SubIdx1);
6174 SubIdx1 = 0;
6175 IsUndef = false;
6176 }
6177 BuildMI(BB&: MBB, I: InsertBefore, MIMD: DebugLoc(), MCID)
6178 .addReg(RegNo: DestReg0, Flags: RegState::Define | getUndefRegState(B: IsUndef), SubReg: SubIdx0)
6179 .addReg(RegNo: DestReg1, Flags: RegState::Define | getUndefRegState(B: IsUndef), SubReg: SubIdx1)
6180 .addFrameIndex(Idx: FI)
6181 .addImm(Val: 0)
6182 .addMemOperand(MMO);
6183}
6184
// Load DestReg (of register class RC) from stack slot FI, inserting the load
// before MBBI. Mirrors storeRegToStackSlot: the opcode is selected from RC's
// spill size, SVE classes retag the frame object with a scalable stack ID,
// and register pairs with no single load instruction are delegated to
// loadRegPairFromStackSlot.
void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                            MachineBasicBlock::iterator MBBI,
                                            Register DestReg, int FI,
                                            const TargetRegisterClass *RC,
                                            Register VReg, unsigned SubReg,
                                            MachineInstr::MIFlag Flags) const {
  MachineFunction &MF = *MBB.getParent();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO =
      MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
                              MFI.getObjectSize(FI), MFI.getObjectAlign(FI));

  unsigned Opc = 0;
  // Whether the selected opcode takes a trailing immediate offset operand.
  // The LD1 multi-vector loads do not.
  bool Offset = true;
  unsigned StackID = TargetStackID::Default;
  // Set when loading a predicate-as-counter (PNR) register: the fill is done
  // through the plain predicate form, and the physical PNR register is added
  // as an implicit def at the end.
  Register PNRReg = MCRegister::NoRegister;
  switch (TRI.getSpillSize(*RC)) {
  case 1:
    if (AArch64::FPR8RegClass.hasSubClassEq(RC))
      Opc = AArch64::LDRBui;
    break;
  case 2: {
    bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
    if (AArch64::FPR16RegClass.hasSubClassEq(RC))
      Opc = AArch64::LDRHui;
    else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      if (IsPNR)
        PNRReg = DestReg;
      Opc = AArch64::LDR_PXI;
      StackID = TargetStackID::ScalablePredicateVector;
    }
    break;
  }
  case 4:
    if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
      Opc = AArch64::LDRWui;
      if (DestReg.isVirtual())
        MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
      else
        assert(DestReg != AArch64::WSP);
    } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
      Opc = AArch64::LDRSui;
    else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
      Opc = AArch64::LDR_PPXI;
      StackID = TargetStackID::ScalablePredicateVector;
    }
    break;
  case 8:
    if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
      Opc = AArch64::LDRXui;
      if (DestReg.isVirtual())
        MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
      else
        assert(DestReg != AArch64::SP);
    } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
      Opc = AArch64::LDRDui;
    } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
      // No single load for a W register pair; emit LDP of the halves.
      loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
                               get(AArch64::LDPWi), DestReg, AArch64::sube32,
                               AArch64::subo32, FI, MMO);
      return;
    }
    break;
  case 16:
    if (AArch64::FPR128RegClass.hasSubClassEq(RC))
      Opc = AArch64::LDRQui;
    else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Twov1d;
      Offset = false;
    } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
      // No single load for an X register pair; emit LDP of the halves.
      loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
                               get(AArch64::LDPXi), DestReg, AArch64::sube64,
                               AArch64::subo64, FI, MMO);
      return;
    } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 24:
    if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Threev1d;
      Offset = false;
    }
    break;
  case 32:
    if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Fourv1d;
      Offset = false;
    } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Twov2d;
      Offset = false;
    } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
      StackID = TargetStackID::ScalableVector;
    } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 48:
    if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Threev2d;
      Offset = false;
    } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 64:
    if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Fourv2d;
      Offset = false;
    } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
      StackID = TargetStackID::ScalableVector;
    } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  }

  assert(Opc && "Unknown register class");
  MFI.setStackID(FI, StackID);

  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
                                     .addReg(DestReg, getDefRegState(true))
                                     .addFrameIndex(FI);
  if (Offset)
    MI.addImm(0);
  // PNR fills define the whole counter register via an implicit def; only
  // physical registers can carry that extra operand here.
  if (PNRReg.isValid() && !PNRReg.isVirtual())
    MI.addDef(PNRReg, RegState::Implicit);
  MI.addMemOperand(MMO);
}
6341
6342bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
6343 const MachineInstr &UseMI,
6344 const TargetRegisterInfo *TRI) {
6345 return any_of(Range: instructionsWithoutDebug(It: std::next(x: DefMI.getIterator()),
6346 End: UseMI.getIterator()),
6347 P: [TRI](const MachineInstr &I) {
6348 return I.modifiesRegister(Reg: AArch64::NZCV, TRI) ||
6349 I.readsRegister(Reg: AArch64::NZCV, TRI);
6350 });
6351}
6352
6353void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6354 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
6355 // The smallest scalable element supported by scaled SVE addressing
6356 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6357 // byte offset must always be a multiple of 2.
6358 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6359
6360 // VGSized offsets are divided by '2', because the VG register is the
6361 // the number of 64bit granules as opposed to 128bit vector chunks,
6362 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
6363 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
6364 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
6365 ByteSized = Offset.getFixed();
6366 VGSized = Offset.getScalable() / 2;
6367}
6368
6369/// Returns the offset in parts to which this frame offset can be
6370/// decomposed for the purpose of describing a frame offset.
6371/// For non-scalable offsets this is simply its byte size.
6372void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6373 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
6374 int64_t &NumDataVectors) {
6375 // The smallest scalable element supported by scaled SVE addressing
6376 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6377 // byte offset must always be a multiple of 2.
6378 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6379
6380 NumBytes = Offset.getFixed();
6381 NumDataVectors = 0;
6382 NumPredicateVectors = Offset.getScalable() / 2;
6383 // This method is used to get the offsets to adjust the frame offset.
6384 // If the function requires ADDPL to be used and needs more than two ADDPL
6385 // instructions, part of the offset is folded into NumDataVectors so that it
6386 // uses ADDVL for part of it, reducing the number of ADDPL instructions.
6387 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
6388 NumPredicateVectors > 62) {
6389 NumDataVectors = NumPredicateVectors / 8;
6390 NumPredicateVectors -= NumDataVectors * 8;
6391 }
6392}
6393
6394// Convenience function to create a DWARF expression for: Constant `Operation`.
6395// This helper emits compact sequences for common cases. For example, for`-15
6396// DW_OP_plus`, this helper would create DW_OP_lit15 DW_OP_minus.
6397static void appendConstantExpr(SmallVectorImpl<char> &Expr, int64_t Constant,
6398 dwarf::LocationAtom Operation) {
6399 if (Operation == dwarf::DW_OP_plus && Constant < 0 && -Constant <= 31) {
6400 // -Constant (1 to 31)
6401 Expr.push_back(Elt: dwarf::DW_OP_lit0 - Constant);
6402 Operation = dwarf::DW_OP_minus;
6403 } else if (Constant >= 0 && Constant <= 31) {
6404 // Literal value 0 to 31
6405 Expr.push_back(Elt: dwarf::DW_OP_lit0 + Constant);
6406 } else {
6407 // Signed constant
6408 Expr.push_back(Elt: dwarf::DW_OP_consts);
6409 appendLEB128<LEB128Sign::Signed>(Buffer&: Expr, Value: Constant);
6410 }
6411 return Expr.push_back(Elt: Operation);
6412}
6413
6414// Convenience function to create a DWARF expression for a register.
6415static void appendReadRegExpr(SmallVectorImpl<char> &Expr, unsigned RegNum) {
6416 Expr.push_back(Elt: (char)dwarf::DW_OP_bregx);
6417 appendLEB128<LEB128Sign::Unsigned>(Buffer&: Expr, Value: RegNum);
6418 Expr.push_back(Elt: 0);
6419}
6420
6421// Convenience function to create a DWARF expression for loading a register from
6422// a CFA offset.
6423static void appendLoadRegExpr(SmallVectorImpl<char> &Expr,
6424 int64_t OffsetFromDefCFA) {
6425 // This assumes the top of the DWARF stack contains the CFA.
6426 Expr.push_back(Elt: dwarf::DW_OP_dup);
6427 // Add the offset to the register.
6428 appendConstantExpr(Expr, Constant: OffsetFromDefCFA, Operation: dwarf::DW_OP_plus);
6429 // Dereference the address (loads a 64 bit value)..
6430 Expr.push_back(Elt: dwarf::DW_OP_deref);
6431}
6432
6433// Convenience function to create a comment for
6434// (+/-) NumBytes (* RegScale)?
6435static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment,
6436 StringRef RegScale = {}) {
6437 if (NumBytes) {
6438 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(x: NumBytes);
6439 if (!RegScale.empty())
6440 Comment << ' ' << RegScale;
6441 }
6442}
6443
6444// Creates an MCCFIInstruction:
6445// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
6446static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
6447 unsigned Reg,
6448 const StackOffset &Offset) {
6449 int64_t NumBytes, NumVGScaledBytes;
6450 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, ByteSized&: NumBytes,
6451 VGSized&: NumVGScaledBytes);
6452 std::string CommentBuffer;
6453 llvm::raw_string_ostream Comment(CommentBuffer);
6454
6455 if (Reg == AArch64::SP)
6456 Comment << "sp";
6457 else if (Reg == AArch64::FP)
6458 Comment << "fp";
6459 else
6460 Comment << printReg(Reg, TRI: &TRI);
6461
6462 // Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes)
6463 SmallString<64> Expr;
6464 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, isEH: true);
6465 assert(DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
6466 // Reg + NumBytes
6467 Expr.push_back(Elt: dwarf::DW_OP_breg0 + DwarfReg);
6468 appendLEB128<LEB128Sign::Signed>(Buffer&: Expr, Value: NumBytes);
6469 appendOffsetComment(NumBytes, Comment);
6470 if (NumVGScaledBytes) {
6471 // + VG * NumVGScaledBytes
6472 appendOffsetComment(NumBytes: NumVGScaledBytes, Comment, RegScale: "* VG");
6473 appendReadRegExpr(Expr, RegNum: TRI.getDwarfRegNum(Reg: AArch64::VG, isEH: true));
6474 appendConstantExpr(Expr, Constant: NumVGScaledBytes, Operation: dwarf::DW_OP_mul);
6475 Expr.push_back(Elt: dwarf::DW_OP_plus);
6476 }
6477
6478 // Wrap this into DW_CFA_def_cfa.
6479 SmallString<64> DefCfaExpr;
6480 DefCfaExpr.push_back(Elt: dwarf::DW_CFA_def_cfa_expression);
6481 appendLEB128<LEB128Sign::Unsigned>(Buffer&: DefCfaExpr, Value: Expr.size());
6482 DefCfaExpr.append(RHS: Expr.str());
6483 return MCCFIInstruction::createEscape(L: nullptr, Vals: DefCfaExpr.str(), Loc: SMLoc(),
6484 Comment: Comment.str());
6485}
6486
6487MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
6488 unsigned FrameReg, unsigned Reg,
6489 const StackOffset &Offset,
6490 bool LastAdjustmentWasScalable) {
6491 if (Offset.getScalable())
6492 return createDefCFAExpression(TRI, Reg, Offset);
6493
6494 if (FrameReg == Reg && !LastAdjustmentWasScalable)
6495 return MCCFIInstruction::cfiDefCfaOffset(L: nullptr, Offset: int(Offset.getFixed()));
6496
6497 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, isEH: true);
6498 return MCCFIInstruction::cfiDefCfa(L: nullptr, Register: DwarfReg, Offset: (int)Offset.getFixed());
6499}
6500
6501MCCFIInstruction
6502llvm::createCFAOffset(const TargetRegisterInfo &TRI, unsigned Reg,
6503 const StackOffset &OffsetFromDefCFA,
6504 std::optional<int64_t> IncomingVGOffsetFromDefCFA) {
6505 int64_t NumBytes, NumVGScaledBytes;
6506 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6507 Offset: OffsetFromDefCFA, ByteSized&: NumBytes, VGSized&: NumVGScaledBytes);
6508
6509 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, isEH: true);
6510
6511 // Non-scalable offsets can use DW_CFA_offset directly.
6512 if (!NumVGScaledBytes)
6513 return MCCFIInstruction::createOffset(L: nullptr, Register: DwarfReg, Offset: NumBytes);
6514
6515 std::string CommentBuffer;
6516 llvm::raw_string_ostream Comment(CommentBuffer);
6517 Comment << printReg(Reg, TRI: &TRI) << " @ cfa";
6518
6519 // Build up expression (CFA + VG * NumVGScaledBytes + NumBytes)
6520 assert(NumVGScaledBytes && "Expected scalable offset");
6521 SmallString<64> OffsetExpr;
6522 // + VG * NumVGScaledBytes
6523 StringRef VGRegScale;
6524 if (IncomingVGOffsetFromDefCFA) {
6525 appendLoadRegExpr(Expr&: OffsetExpr, OffsetFromDefCFA: *IncomingVGOffsetFromDefCFA);
6526 VGRegScale = "* IncomingVG";
6527 } else {
6528 appendReadRegExpr(Expr&: OffsetExpr, RegNum: TRI.getDwarfRegNum(Reg: AArch64::VG, isEH: true));
6529 VGRegScale = "* VG";
6530 }
6531 appendConstantExpr(Expr&: OffsetExpr, Constant: NumVGScaledBytes, Operation: dwarf::DW_OP_mul);
6532 appendOffsetComment(NumBytes: NumVGScaledBytes, Comment, RegScale: VGRegScale);
6533 OffsetExpr.push_back(Elt: dwarf::DW_OP_plus);
6534 if (NumBytes) {
6535 // + NumBytes
6536 appendOffsetComment(NumBytes, Comment);
6537 appendConstantExpr(Expr&: OffsetExpr, Constant: NumBytes, Operation: dwarf::DW_OP_plus);
6538 }
6539
6540 // Wrap this into DW_CFA_expression
6541 SmallString<64> CfaExpr;
6542 CfaExpr.push_back(Elt: dwarf::DW_CFA_expression);
6543 appendLEB128<LEB128Sign::Unsigned>(Buffer&: CfaExpr, Value: DwarfReg);
6544 appendLEB128<LEB128Sign::Unsigned>(Buffer&: CfaExpr, Value: OffsetExpr.size());
6545 CfaExpr.append(RHS: OffsetExpr.str());
6546
6547 return MCCFIInstruction::createEscape(L: nullptr, Vals: CfaExpr.str(), Loc: SMLoc(),
6548 Comment: Comment.str());
6549}
6550
// Helper function to emit a frame offset adjustment from a given
// pointer (SrcReg), stored into DestReg. This function is explicit
// in that it requires the opcode.
//
// The adjustment is emitted as one or more instructions of opcode `Opc`,
// each consuming as much of `Offset` as its immediate encoding allows.
// Alongside each instruction this can optionally emit a CFA-defining CFI
// directive (EmitCFAOffset) and/or Windows SEH unwind pseudos (NeedsWinCFI).
static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
                               const DebugLoc &DL, unsigned DestReg,
                               unsigned SrcReg, int64_t Offset, unsigned Opc,
                               const TargetInstrInfo *TII,
                               MachineInstr::MIFlag Flag, bool NeedsWinCFI,
                               bool *HasWinCFI, bool EmitCFAOffset,
                               StackOffset CFAOffset, unsigned FrameReg) {
  int Sign = 1;
  unsigned MaxEncoding, ShiftSize;
  switch (Opc) {
  case AArch64::ADDXri:
  case AArch64::ADDSXri:
  case AArch64::SUBXri:
  case AArch64::SUBSXri:
    // Plain immediate forms: a 12-bit unsigned value, optionally LSL #12.
    MaxEncoding = 0xfff;
    ShiftSize = 12;
    break;
  case AArch64::ADDVL_XXI:
  case AArch64::ADDPL_XXI:
  case AArch64::ADDSVL_XXI:
  case AArch64::ADDSPL_XXI:
    // Scalable forms take a signed multiplier; negative offsets are handled
    // by negating the offset and remembering the sign in `Sign`.
    MaxEncoding = 31;
    ShiftSize = 0;
    if (Offset < 0) {
      MaxEncoding = 32;
      Sign = -1;
      Offset = -Offset;
    }
    break;
  default:
    llvm_unreachable("Unsupported opcode");
  }

  // `Offset` can be in bytes or in "scalable bytes".
  // VScale is the size in (scalable) bytes of one immediate unit: an ADDVL
  // unit is a 16-byte SVE data vector, an ADDPL unit a 2-byte predicate.
  int VScale = 1;
  if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
    VScale = 16;
  else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
    VScale = 2;

  // FIXME: If the offset won't fit in 24-bits, compute the offset into a
  // scratch register. If DestReg is a virtual register, use it as the
  // scratch register; otherwise, create a new virtual register (to be
  // replaced by the scavenger at the end of PEI). That case can be optimized
  // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
  // register can be loaded with offset%8 and the add/sub can use an extending
  // instruction with LSL#3.
  // Currently the function handles any offsets but generates a poor sequence
  // of code.
  // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");

  const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
  // When the destination is XZR (e.g. a compare), intermediate partial sums
  // must go through a real scratch register.
  Register TmpReg = DestReg;
  if (TmpReg == AArch64::XZR)
    TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
        RegClass: &AArch64::GPR64RegClass);
  do {
    // Consume the largest encodable chunk of the remaining offset.
    uint64_t ThisVal = std::min<uint64_t>(a: Offset, b: MaxEncodableValue);
    unsigned LocalShiftSize = 0;
    if (ThisVal > MaxEncoding) {
      ThisVal = ThisVal >> ShiftSize;
      LocalShiftSize = ShiftSize;
    }
    assert((ThisVal >> ShiftSize) <= MaxEncoding &&
           "Encoding cannot handle value that big");

    Offset -= ThisVal << LocalShiftSize;
    // The final instruction of the sequence writes the real destination.
    if (Offset == 0)
      TmpReg = DestReg;
    auto MBI = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: TmpReg)
                   .addReg(RegNo: SrcReg)
                   .addImm(Val: Sign * (int)ThisVal);
    if (ShiftSize)
      MBI = MBI.addImm(
          Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: LocalShiftSize));
    MBI = MBI.setMIFlag(Flag);

    // Track how this instruction moves the CFA. Note `Change` is expressed
    // in the direction of stack growth, hence the sign flip below.
    auto Change =
        VScale == 1
            ? StackOffset::getFixed(Fixed: ThisVal << LocalShiftSize)
            : StackOffset::getScalable(Scalable: VScale * (ThisVal << LocalShiftSize));
    if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
      CFAOffset += Change;
    else
      CFAOffset -= Change;
    // Only emit CFI once the value lands in the real destination register.
    if (EmitCFAOffset && DestReg == TmpReg) {
      MachineFunction &MF = *MBB.getParent();
      const TargetSubtargetInfo &STI = MF.getSubtarget();
      const TargetRegisterInfo &TRI = *STI.getRegisterInfo();

      unsigned CFIIndex = MF.addFrameInst(
          Inst: createDefCFA(TRI, FrameReg, Reg: DestReg, Offset: CFAOffset, LastAdjustmentWasScalable: VScale != 1));
      BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::CFI_INSTRUCTION))
          .addCFIIndex(CFIIndex)
          .setMIFlags(Flag);
    }

    if (NeedsWinCFI) {
      int Imm = (int)(ThisVal << LocalShiftSize);
      // Pick the SEH pseudo matching what this instruction does: scalable SP
      // allocation, FP establishment, or plain SP allocation.
      if (VScale != 1 && DestReg == AArch64::SP) {
        if (HasWinCFI)
          *HasWinCFI = true;
        BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_AllocZ))
            .addImm(Val: ThisVal)
            .setMIFlag(Flag);
      } else if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
                 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
        assert(VScale == 1 && "Expected non-scalable operation");
        if (HasWinCFI)
          *HasWinCFI = true;
        if (Imm == 0)
          BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_SetFP)).setMIFlag(Flag);
        else
          BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_AddFP))
              .addImm(Val: Imm)
              .setMIFlag(Flag);
        assert(Offset == 0 && "Expected remaining offset to be zero to "
                              "emit a single SEH directive");
      } else if (DestReg == AArch64::SP) {
        assert(VScale == 1 && "Expected non-scalable operation");
        if (HasWinCFI)
          *HasWinCFI = true;
        assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
        BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_StackAlloc))
            .addImm(Val: Imm)
            .setMIFlag(Flag);
      }
    }

    // Chain subsequent chunks off the partial result.
    SrcReg = TmpReg;
  } while (Offset);
}
6687
// Emit instructions computing DestReg = SrcReg + Offset. The offset is
// decomposed into a fixed byte part (ADD/SUB Xri), a whole number of SVE
// data vectors (ADDVL/ADDSVL) and predicate-sized increments (ADDPL/ADDSPL),
// emitted in that order via emitFrameOffsetAdj. SetNZCV requests that the
// final instruction of the sequence sets the flags.
void llvm::emitFrameOffset(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
                           unsigned DestReg, unsigned SrcReg,
                           StackOffset Offset, const TargetInstrInfo *TII,
                           MachineInstr::MIFlag Flag, bool SetNZCV,
                           bool NeedsWinCFI, bool *HasWinCFI,
                           bool EmitCFAOffset, StackOffset CFAOffset,
                           unsigned FrameReg) {
  // If a function is marked as arm_locally_streaming, then the runtime value of
  // vscale in the prologue/epilogue is different the runtime value of vscale
  // in the function's body. To avoid having to consider multiple vscales,
  // we can use `addsvl` to allocate any scalable stack-slots, which under
  // most circumstances will be only locals, not callee-save slots.
  const Function &F = MBB.getParent()->getFunction();
  bool UseSVL = F.hasFnAttribute(Kind: "aarch64_pstate_sm_body");

  int64_t Bytes, NumPredicateVectors, NumDataVectors;
  AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
      Offset, NumBytes&: Bytes, NumPredicateVectors, NumDataVectors);

  // Insert ADDSXri for scalable offset at the end.
  // The scalable add opcodes cannot set flags, so when flags are requested
  // and a scalable part exists, defer flag-setting to a trailing ADDS #0.
  bool NeedsFinalDefNZCV = SetNZCV && (NumPredicateVectors || NumDataVectors);
  if (NeedsFinalDefNZCV)
    SetNZCV = false;

  // First emit non-scalable frame offsets, or a simple 'mov'.
  // (A zero total offset with distinct registers still needs a register copy,
  // emitted as an ADD #0.)
  if (Bytes || (!Offset && SrcReg != DestReg)) {
    assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
           "SP increment/decrement not 8-byte aligned");
    unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
    if (Bytes < 0) {
      Bytes = -Bytes;
      Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
    }
    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Offset: Bytes, Opc, TII, Flag,
                       NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
                       FrameReg);
    // Keep the running CFA offset in sync for the scalable stages below.
    CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
                     ? StackOffset::getFixed(Fixed: -Bytes)
                     : StackOffset::getFixed(Fixed: Bytes);
    SrcReg = DestReg;
    FrameReg = DestReg;
  }

  assert(!(NeedsWinCFI && NumPredicateVectors) &&
         "WinCFI can't allocate fractions of an SVE data vector");

  if (NumDataVectors) {
    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Offset: NumDataVectors,
                       Opc: UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, TII,
                       Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
                       FrameReg);
    // One data vector is 16 scalable bytes.
    CFAOffset += StackOffset::getScalable(Scalable: -NumDataVectors * 16);
    SrcReg = DestReg;
  }

  if (NumPredicateVectors) {
    // Predicate increments are 2 scalable bytes, which cannot keep SP
    // 16-byte aligned.
    assert(DestReg != AArch64::SP && "Unaligned access to SP");
    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Offset: NumPredicateVectors,
                       Opc: UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, TII,
                       Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
                       FrameReg);
  }

  // Deferred flag-setting: ADDS DestReg, DestReg, #0 defines NZCV from the
  // final value.
  if (NeedsFinalDefNZCV)
    BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::ADDSXri), DestReg)
        .addReg(RegNo: DestReg)
        .addImm(Val: 0)
        .addImm(Val: 0);
}
6758
// Try to fold the memory operand (spill/fill of FrameIndex) directly into MI,
// returning the new instruction, or nullptr if no folding is possible. Only
// COPY instructions are handled here; everything else falls back to the
// generic TargetInstrInfo logic in the caller.
MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
    MachineBasicBlock::iterator InsertPt, int FrameIndex,
    LiveIntervals *LIS, VirtRegMap *VRM) const {
  // This is a bit of a hack. Consider this instruction:
  //
  // %0 = COPY %sp; GPR64all:%0
  //
  // We explicitly chose GPR64all for the virtual register so such a copy might
  // be eliminated by RegisterCoalescer. However, that may not be possible, and
  // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
  // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
  //
  // To prevent that, we are going to constrain the %0 register class here.
  if (MI.isFullCopy()) {
    Register DstReg = MI.getOperand(i: 0).getReg();
    Register SrcReg = MI.getOperand(i: 1).getReg();
    if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
      MF.getRegInfo().constrainRegClass(Reg: DstReg, RC: &AArch64::GPR64RegClass);
      return nullptr;
    }
    if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
      MF.getRegInfo().constrainRegClass(Reg: SrcReg, RC: &AArch64::GPR64RegClass);
      return nullptr;
    }
    // Nothing can be folded with a copy from/to NZCV.
    if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
      return nullptr;
  }

  // Handle the case where a copy is being spilled or filled but the source
  // and destination register class don't match. For example:
  //
  // %0 = COPY %xzr; GPR64common:%0
  //
  // In this case we can still safely fold away the COPY and generate the
  // following spill code:
  //
  // STRXui %xzr, %stack.0
  //
  // This also eliminates spilled cross register class COPYs (e.g. between x and
  // d regs) of the same size. For example:
  //
  // %0 = COPY %1; GPR64:%0, FPR64:%1
  //
  // will be filled as
  //
  // LDRDui %0, fi<#0>
  //
  // instead of
  //
  // LDRXui %Temp, fi<#0>
  // %0 = FMOV %Temp
  //
  if (MI.isCopy() && Ops.size() == 1 &&
      // Make sure we're only folding the explicit COPY defs/uses.
      (Ops[0] == 0 || Ops[0] == 1)) {
    bool IsSpill = Ops[0] == 0;
    bool IsFill = !IsSpill;
    const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    MachineBasicBlock &MBB = *MI.getParent();
    const MachineOperand &DstMO = MI.getOperand(i: 0);
    const MachineOperand &SrcMO = MI.getOperand(i: 1);
    Register DstReg = DstMO.getReg();
    Register SrcReg = SrcMO.getReg();
    // This is slightly expensive to compute for physical regs since
    // getMinimalPhysRegClass is slow.
    auto getRegClass = [&](unsigned Reg) {
      return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
                                              : TRI.getMinimalPhysRegClass(Reg);
    };

    // Simple case: no subregisters on either side; spill the source register
    // class or fill the destination register class directly.
    if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
      assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
                 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
             "Mismatched register size in non subreg COPY");
      if (IsSpill)
        storeRegToStackSlot(MBB, MBBI: InsertPt, SrcReg, isKill: SrcMO.isKill(), FI: FrameIndex,
                            RC: getRegClass(SrcReg), VReg: Register());
      else
        loadRegFromStackSlot(MBB, MBBI: InsertPt, DestReg: DstReg, FI: FrameIndex,
                             RC: getRegClass(DstReg), VReg: Register());
      return &*--InsertPt;
    }

    // Handle cases like spilling def of:
    //
    // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
    //
    // where the physical register source can be widened and stored to the full
    // virtual reg destination stack slot, in this case producing:
    //
    // STRXui %xzr, %stack.0
    //
    if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
        TRI.getRegSizeInBits(RC: *getRegClass(DstReg)) == 64) {
      assert(SrcMO.getSubReg() == 0 &&
             "Unexpected subreg on physical register");
      // WZR's 64-bit counterpart is XZR; storing XZR zero-fills the slot.
      storeRegToStackSlot(MBB, MBBI: InsertPt, SrcReg: AArch64::XZR, isKill: SrcMO.isKill(),
                          FI: FrameIndex, RC: &AArch64::GPR64RegClass, VReg: Register());
      return &*--InsertPt;
    }

    // Handle cases like filling use of:
    //
    // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
    //
    // where we can load the full virtual reg source stack slot, into the subreg
    // destination, in this case producing:
    //
    // LDRWui %0:sub_32<def,read-undef>, %stack.0
    //
    if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
      // Pick the narrow register class matching the destination subregister.
      const TargetRegisterClass *FillRC = nullptr;
      switch (DstMO.getSubReg()) {
      default:
        break;
      case AArch64::sub_32:
        if (AArch64::GPR64RegClass.hasSubClassEq(RC: getRegClass(DstReg)))
          FillRC = &AArch64::GPR32RegClass;
        break;
      case AArch64::ssub:
        FillRC = &AArch64::FPR32RegClass;
        break;
      case AArch64::dsub:
        FillRC = &AArch64::FPR64RegClass;
        break;
      }

      if (FillRC) {
        assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
                   TRI.getRegSizeInBits(*FillRC) &&
               "Mismatched regclass size on folded subreg COPY");
        loadRegFromStackSlot(MBB, MBBI: InsertPt, DestReg: DstReg, FI: FrameIndex, RC: FillRC,
                             VReg: Register());
        // Rewrite the load's destination to target the subregister, keeping
        // the read-undef semantics of the original COPY.
        MachineInstr &LoadMI = *--InsertPt;
        MachineOperand &LoadDst = LoadMI.getOperand(i: 0);
        assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
        LoadDst.setSubReg(DstMO.getSubReg());
        LoadDst.setIsUndef();
        return &LoadMI;
      }
    }
  }

  // Cannot fold.
  return nullptr;
}
6908
// Determine whether the stack offset SOffset can be encoded in MI's immediate
// field (possibly by switching to an unscaled load/store variant). On return,
// SOffset holds the residual offset that could NOT be folded into the
// instruction, *EmittableOffset holds the immediate that can be emitted, and
// *OutUseUnscaledOp / *OutUnscaledOp report whether (and which) unscaled
// opcode should be used. The result is a mask of AArch64FrameOffset* flags.
int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
                                    StackOffset &SOffset,
                                    bool *OutUseUnscaledOp,
                                    unsigned *OutUnscaledOp,
                                    int64_t *EmittableOffset) {
  // Set output values in case of early exit.
  if (EmittableOffset)
    *EmittableOffset = 0;
  if (OutUseUnscaledOp)
    *OutUseUnscaledOp = false;
  if (OutUnscaledOp)
    *OutUnscaledOp = 0;

  // Exit early for structured vector spills/fills as they can't take an
  // immediate offset.
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::LD1Rv1d:
  case AArch64::LD1Rv2s:
  case AArch64::LD1Rv2d:
  case AArch64::LD1Rv4h:
  case AArch64::LD1Rv4s:
  case AArch64::LD1Rv8b:
  case AArch64::LD1Rv8h:
  case AArch64::LD1Rv16b:
  case AArch64::LD1Twov2d:
  case AArch64::LD1Threev2d:
  case AArch64::LD1Fourv2d:
  case AArch64::LD1Twov1d:
  case AArch64::LD1Threev1d:
  case AArch64::LD1Fourv1d:
  case AArch64::ST1Twov2d:
  case AArch64::ST1Threev2d:
  case AArch64::ST1Fourv2d:
  case AArch64::ST1Twov1d:
  case AArch64::ST1Threev1d:
  case AArch64::ST1Fourv1d:
  case AArch64::ST1i8:
  case AArch64::ST1i16:
  case AArch64::ST1i32:
  case AArch64::ST1i64:
  case AArch64::IRG:
  case AArch64::IRGstack:
  case AArch64::STGloop:
  case AArch64::STZGloop:
    return AArch64FrameOffsetCannotUpdate;
  }

  // Get the min/max offset and the scale.
  TypeSize ScaleValue(0U, false), Width(0U, false);
  int64_t MinOff, MaxOff;
  if (!AArch64InstrInfo::getMemOpInfo(Opcode: MI.getOpcode(), Scale&: ScaleValue, Width, MinOffset&: MinOff,
                                      MaxOffset&: MaxOff))
    llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");

  // Construct the complete offset. Scalable instructions fold the scalable
  // part of SOffset; all others fold the fixed part.
  bool IsMulVL = ScaleValue.isScalable();
  unsigned Scale = ScaleValue.getKnownMinValue();
  int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();

  // Include the instruction's existing (scaled) immediate in the total.
  const MachineOperand &ImmOpnd =
      MI.getOperand(i: AArch64InstrInfo::getLoadStoreImmIdx(Opc: MI.getOpcode()));
  Offset += ImmOpnd.getImm() * Scale;

  // If the offset doesn't match the scale, we rewrite the instruction to
  // use the unscaled instruction instead. Likewise, if we have a negative
  // offset and there is an unscaled op to use.
  std::optional<unsigned> UnscaledOp =
      AArch64InstrInfo::getUnscaledLdSt(Opc: MI.getOpcode());
  bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
  if (useUnscaledOp &&
      !AArch64InstrInfo::getMemOpInfo(Opcode: *UnscaledOp, Scale&: ScaleValue, Width, MinOffset&: MinOff,
                                      MaxOffset&: MaxOff))
    llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");

  // Re-read the scale/bounds: they may differ for the unscaled variant
  // (unscaled ops have Scale == 1).
  Scale = ScaleValue.getKnownMinValue();
  assert(IsMulVL == ScaleValue.isScalable() &&
         "Unscaled opcode has different value for scalable");

  int64_t Remainder = Offset % Scale;
  assert(!(Remainder && useUnscaledOp) &&
         "Cannot have remainder when using unscaled op");

  assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
  int64_t NewOffset = Offset / Scale;
  if (MinOff <= NewOffset && NewOffset <= MaxOff)
    // The scaled value fits entirely; only a sub-scale remainder is left over.
    Offset = Remainder;
  else {
    // Saturate the immediate at the encodable bound and leave the rest in
    // Offset for the caller to materialize separately.
    NewOffset = NewOffset < 0 ? MinOff : MaxOff;
    Offset = Offset - (NewOffset * Scale);
  }

  if (EmittableOffset)
    *EmittableOffset = NewOffset;
  if (OutUseUnscaledOp)
    *OutUseUnscaledOp = useUnscaledOp;
  if (OutUnscaledOp && UnscaledOp)
    *OutUnscaledOp = *UnscaledOp;

  // Write the residual back into the component of SOffset it came from.
  if (IsMulVL)
    SOffset = StackOffset::get(Fixed: SOffset.getFixed(), Scalable: Offset);
  else
    SOffset = StackOffset::get(Fixed: Offset, Scalable: SOffset.getScalable());
  // Legal only if nothing is left over.
  return AArch64FrameOffsetCanUpdate |
         (SOffset ? 0 : AArch64FrameOffsetIsLegal);
}
7016
7017bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
7018 unsigned FrameReg, StackOffset &Offset,
7019 const AArch64InstrInfo *TII) {
7020 unsigned Opcode = MI.getOpcode();
7021 unsigned ImmIdx = FrameRegIdx + 1;
7022
7023 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
7024 Offset += StackOffset::getFixed(Fixed: MI.getOperand(i: ImmIdx).getImm());
7025 emitFrameOffset(MBB&: *MI.getParent(), MBBI: MI, DL: MI.getDebugLoc(),
7026 DestReg: MI.getOperand(i: 0).getReg(), SrcReg: FrameReg, Offset, TII,
7027 Flag: MachineInstr::NoFlags, SetNZCV: (Opcode == AArch64::ADDSXri));
7028 MI.eraseFromParent();
7029 Offset = StackOffset();
7030 return true;
7031 }
7032
7033 int64_t NewOffset;
7034 unsigned UnscaledOp;
7035 bool UseUnscaledOp;
7036 int Status = isAArch64FrameOffsetLegal(MI, SOffset&: Offset, OutUseUnscaledOp: &UseUnscaledOp,
7037 OutUnscaledOp: &UnscaledOp, EmittableOffset: &NewOffset);
7038 if (Status & AArch64FrameOffsetCanUpdate) {
7039 if (Status & AArch64FrameOffsetIsLegal)
7040 // Replace the FrameIndex with FrameReg.
7041 MI.getOperand(i: FrameRegIdx).ChangeToRegister(Reg: FrameReg, isDef: false);
7042 if (UseUnscaledOp)
7043 MI.setDesc(TII->get(Opcode: UnscaledOp));
7044
7045 MI.getOperand(i: ImmIdx).ChangeToImmediate(ImmVal: NewOffset);
7046 return !Offset;
7047 }
7048
7049 return false;
7050}
7051
7052void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB,
7053 MachineBasicBlock::iterator MI) const {
7054 DebugLoc DL;
7055 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AArch64::NOP));
7056}
7057
7058MCInst AArch64InstrInfo::getNop() const { return MCInstBuilder(AArch64::NOP); }
7059
7060// AArch64 supports MachineCombiner.
7061bool AArch64InstrInfo::useMachineCombiner() const { return true; }
7062
7063// True when Opc sets flag
7064static bool isCombineInstrSettingFlag(unsigned Opc) {
7065 switch (Opc) {
7066 case AArch64::ADDSWrr:
7067 case AArch64::ADDSWri:
7068 case AArch64::ADDSXrr:
7069 case AArch64::ADDSXri:
7070 case AArch64::SUBSWrr:
7071 case AArch64::SUBSXrr:
7072 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7073 case AArch64::SUBSWri:
7074 case AArch64::SUBSXri:
7075 return true;
7076 default:
7077 break;
7078 }
7079 return false;
7080}
7081
7082// 32b Opcodes that can be combined with a MUL
7083static bool isCombineInstrCandidate32(unsigned Opc) {
7084 switch (Opc) {
7085 case AArch64::ADDWrr:
7086 case AArch64::ADDWri:
7087 case AArch64::SUBWrr:
7088 case AArch64::ADDSWrr:
7089 case AArch64::ADDSWri:
7090 case AArch64::SUBSWrr:
7091 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7092 case AArch64::SUBWri:
7093 case AArch64::SUBSWri:
7094 return true;
7095 default:
7096 break;
7097 }
7098 return false;
7099}
7100
7101// 64b Opcodes that can be combined with a MUL
7102static bool isCombineInstrCandidate64(unsigned Opc) {
7103 switch (Opc) {
7104 case AArch64::ADDXrr:
7105 case AArch64::ADDXri:
7106 case AArch64::SUBXrr:
7107 case AArch64::ADDSXrr:
7108 case AArch64::ADDSXri:
7109 case AArch64::SUBSXrr:
7110 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7111 case AArch64::SUBXri:
7112 case AArch64::SUBSXri:
7113 case AArch64::ADDv8i8:
7114 case AArch64::ADDv16i8:
7115 case AArch64::ADDv4i16:
7116 case AArch64::ADDv8i16:
7117 case AArch64::ADDv2i32:
7118 case AArch64::ADDv4i32:
7119 case AArch64::SUBv8i8:
7120 case AArch64::SUBv16i8:
7121 case AArch64::SUBv4i16:
7122 case AArch64::SUBv8i16:
7123 case AArch64::SUBv2i32:
7124 case AArch64::SUBv4i32:
7125 return true;
7126 default:
7127 break;
7128 }
7129 return false;
7130}
7131
7132// FP Opcodes that can be combined with a FMUL.
7133static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
7134 switch (Inst.getOpcode()) {
7135 default:
7136 break;
7137 case AArch64::FADDHrr:
7138 case AArch64::FADDSrr:
7139 case AArch64::FADDDrr:
7140 case AArch64::FADDv4f16:
7141 case AArch64::FADDv8f16:
7142 case AArch64::FADDv2f32:
7143 case AArch64::FADDv2f64:
7144 case AArch64::FADDv4f32:
7145 case AArch64::FSUBHrr:
7146 case AArch64::FSUBSrr:
7147 case AArch64::FSUBDrr:
7148 case AArch64::FSUBv4f16:
7149 case AArch64::FSUBv8f16:
7150 case AArch64::FSUBv2f32:
7151 case AArch64::FSUBv2f64:
7152 case AArch64::FSUBv4f32:
7153 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
7154 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
7155 // the target options or if FADD/FSUB has the contract fast-math flag.
7156 return Options.AllowFPOpFusion == FPOpFusion::Fast ||
7157 Inst.getFlag(Flag: MachineInstr::FmContract);
7158 }
7159 return false;
7160}
7161
7162// Opcodes that can be combined with a MUL
7163static bool isCombineInstrCandidate(unsigned Opc) {
7164 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
7165}
7166
7167//
7168// Utility routine that checks if \param MO is defined by an
7169// \param CombineOpc instruction in the basic block \param MBB
7170static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
7171 unsigned CombineOpc, unsigned ZeroReg = 0,
7172 bool CheckZeroReg = false) {
7173 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7174 MachineInstr *MI = nullptr;
7175
7176 if (MO.isReg() && MO.getReg().isVirtual())
7177 MI = MRI.getUniqueVRegDef(Reg: MO.getReg());
7178 // And it needs to be in the trace (otherwise, it won't have a depth).
7179 if (!MI || MI->getParent() != &MBB || MI->getOpcode() != CombineOpc)
7180 return false;
7181 // Must only used by the user we combine with.
7182 if (!MRI.hasOneNonDBGUse(RegNo: MI->getOperand(i: 0).getReg()))
7183 return false;
7184
7185 if (CheckZeroReg) {
7186 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
7187 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
7188 MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
7189 // The third input reg must be zero.
7190 if (MI->getOperand(i: 3).getReg() != ZeroReg)
7191 return false;
7192 }
7193
7194 if (isCombineInstrSettingFlag(Opc: CombineOpc) &&
7195 MI->findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true) == -1)
7196 return false;
7197
7198 return true;
7199}
7200
7201//
7202// Is \param MO defined by an integer multiply and can be combined?
7203static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
7204 unsigned MulOpc, unsigned ZeroReg) {
7205 return canCombine(MBB, MO, CombineOpc: MulOpc, ZeroReg, CheckZeroReg: true);
7206}
7207
7208//
7209// Is \param MO defined by a floating-point multiply and can be combined?
7210static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
7211 unsigned MulOpc) {
7212 return canCombine(MBB, MO, CombineOpc: MulOpc);
7213}
7214
// TODO: There are many more machine instruction opcodes to match:
// 1. Other data types (integer, vectors)
// 2. Other math / logic operations (xor, or)
// 3. Other forms of the same operation (intrinsics and other variants)
//
// \returns true when \p Inst is an opcode the machine-combiner may freely
// reassociate. Integer/logic opcodes qualify unconditionally; FP opcodes
// qualify only with both the 'reassoc' and 'nsz' fast-math flags. Inverted
// reassociation (e.g. rewriting through the inverse operation) is not
// supported for any AArch64 opcode.
bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
                                                   bool Invert) const {
  if (Invert)
    return false;
  switch (Inst.getOpcode()) {
  // == Floating-point types ==
  // -- Floating-point instructions --
  case AArch64::FADDHrr:
  case AArch64::FADDSrr:
  case AArch64::FADDDrr:
  case AArch64::FMULHrr:
  case AArch64::FMULSrr:
  case AArch64::FMULDrr:
  case AArch64::FMULX16:
  case AArch64::FMULX32:
  case AArch64::FMULX64:
  // -- Advanced SIMD instructions --
  case AArch64::FADDv4f16:
  case AArch64::FADDv8f16:
  case AArch64::FADDv2f32:
  case AArch64::FADDv4f32:
  case AArch64::FADDv2f64:
  case AArch64::FMULv4f16:
  case AArch64::FMULv8f16:
  case AArch64::FMULv2f32:
  case AArch64::FMULv4f32:
  case AArch64::FMULv2f64:
  case AArch64::FMULXv4f16:
  case AArch64::FMULXv8f16:
  case AArch64::FMULXv2f32:
  case AArch64::FMULXv4f32:
  case AArch64::FMULXv2f64:
  // -- SVE instructions --
  // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
  // in the SVE instruction set (though there are predicated ones).
  case AArch64::FADD_ZZZ_H:
  case AArch64::FADD_ZZZ_S:
  case AArch64::FADD_ZZZ_D:
  case AArch64::FMUL_ZZZ_H:
  case AArch64::FMUL_ZZZ_S:
  case AArch64::FMUL_ZZZ_D:
    // FP reassociation changes rounding and sign-of-zero behavior, so it is
    // only legal when the instruction carries both relaxation flags.
    return Inst.getFlag(Flag: MachineInstr::MIFlag::FmReassoc) &&
           Inst.getFlag(Flag: MachineInstr::MIFlag::FmNsz);

  // == Integer types ==
  // -- Base instructions --
  // Opcodes MULWrr and MULXrr don't exist because
  // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
  // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
  // The machine-combiner does not support three-source-operands machine
  // instruction. So we cannot reassociate MULs.
  case AArch64::ADDWrr:
  case AArch64::ADDXrr:
  case AArch64::ANDWrr:
  case AArch64::ANDXrr:
  case AArch64::ORRWrr:
  case AArch64::ORRXrr:
  case AArch64::EORWrr:
  case AArch64::EORXrr:
  case AArch64::EONWrr:
  case AArch64::EONXrr:
  // -- Advanced SIMD instructions --
  // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
  // in the Advanced SIMD instruction set.
  case AArch64::ADDv8i8:
  case AArch64::ADDv16i8:
  case AArch64::ADDv4i16:
  case AArch64::ADDv8i16:
  case AArch64::ADDv2i32:
  case AArch64::ADDv4i32:
  case AArch64::ADDv1i64:
  case AArch64::ADDv2i64:
  case AArch64::MULv8i8:
  case AArch64::MULv16i8:
  case AArch64::MULv4i16:
  case AArch64::MULv8i16:
  case AArch64::MULv2i32:
  case AArch64::MULv4i32:
  case AArch64::ANDv8i8:
  case AArch64::ANDv16i8:
  case AArch64::ORRv8i8:
  case AArch64::ORRv16i8:
  case AArch64::EORv8i8:
  case AArch64::EORv16i8:
  // -- SVE instructions --
  case AArch64::ADD_ZZZ_B:
  case AArch64::ADD_ZZZ_H:
  case AArch64::ADD_ZZZ_S:
  case AArch64::ADD_ZZZ_D:
  case AArch64::MUL_ZZZ_B:
  case AArch64::MUL_ZZZ_H:
  case AArch64::MUL_ZZZ_S:
  case AArch64::MUL_ZZZ_D:
  case AArch64::AND_ZZZ:
  case AArch64::ORR_ZZZ:
  case AArch64::EOR_ZZZ:
    return true;

  default:
    return false;
  }
}
7321
/// Find instructions that can be turned into madd.
///
/// Inspects \p Root (an add/sub candidate) and, for each source operand that
/// is fed by a lone multiply in the same block, records the corresponding
/// MULADD*/MULSUB* combiner pattern in \p Patterns.
/// \returns true if at least one pattern was recorded.
static bool getMaddPatterns(MachineInstr &Root,
                            SmallVectorImpl<unsigned> &Patterns) {
  unsigned Opc = Root.getOpcode();
  MachineBasicBlock &MBB = *Root.getParent();
  bool Found = false;

  if (!isCombineInstrCandidate(Opc))
    return false;
  if (isCombineInstrSettingFlag(Opc)) {
    int Cmp_NZCV =
        Root.findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true);
    // When NZCV is live bail out.
    if (Cmp_NZCV == -1)
      return false;
    unsigned NewOpc = convertToNonFlagSettingOpc(MI: Root);
    // When opcode can't change bail out.
    // CHECKME: do we miss any cases for opcode conversion?
    if (NewOpc == Opc)
      return false;
    // Match patterns against the equivalent non-flag-setting opcode.
    Opc = NewOpc;
  }

  // Record \p Pattern if operand \p Operand of Root is defined by a scalar
  // multiply (\p Opcode accumulating from \p ZeroReg, i.e. a plain MUL).
  auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
                      unsigned Pattern) {
    if (canCombineWithMUL(MBB, MO&: Root.getOperand(i: Operand), MulOpc: Opcode, ZeroReg)) {
      Patterns.push_back(Elt: Pattern);
      Found = true;
    }
  };

  // Same, for vector multiplies (no zero-register accumulator to check).
  auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
    if (canCombine(MBB, MO&: Root.getOperand(i: Operand), CombineOpc: Opcode)) {
      Patterns.push_back(Elt: Pattern);
      Found = true;
    }
  };

  typedef AArch64MachineCombinerPattern MCP;

  switch (Opc) {
  default:
    break;
  case AArch64::ADDWrr:
    assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
           "ADDWrr does not have register operands");
    setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
    setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
    break;
  case AArch64::ADDXrr:
    setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
    setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
    break;
  case AArch64::SUBWrr:
    setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
    setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
    break;
  case AArch64::SUBXrr:
    setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
    setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
    break;
  case AArch64::ADDWri:
    setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
    break;
  case AArch64::ADDXri:
    setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
    break;
  case AArch64::SUBWri:
    setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
    break;
  case AArch64::SUBXri:
    setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
    break;
  case AArch64::ADDv8i8:
    setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
    setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
    break;
  case AArch64::ADDv16i8:
    setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
    setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
    break;
  case AArch64::ADDv4i16:
    setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
    setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
    setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
    setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
    break;
  case AArch64::ADDv8i16:
    setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
    setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
    setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
    setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
    break;
  case AArch64::ADDv2i32:
    setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
    setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
    setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
    setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
    break;
  case AArch64::ADDv4i32:
    setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
    setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
    setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
    setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
    break;
  case AArch64::SUBv8i8:
    setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
    setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
    break;
  case AArch64::SUBv16i8:
    setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
    setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
    break;
  case AArch64::SUBv4i16:
    setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
    setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
    setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
    setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
    break;
  case AArch64::SUBv8i16:
    setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
    setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
    setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
    setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
    break;
  case AArch64::SUBv2i32:
    setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
    setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
    setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
    setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
    break;
  case AArch64::SUBv4i32:
    setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
    setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
    setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
    setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
    break;
  }
  return Found;
}
7462
/// \returns true if \p Opcode is an accumulating absolute-difference
/// instruction — UABA/SABA and the widening UABAL(B/T)/SABAL(B/T) forms,
/// in both NEON and SVE encodings — i.e. an opcode that can participate in
/// an accumulator chain.
bool AArch64InstrInfo::isAccumulationOpcode(unsigned Opcode) const {
  switch (Opcode) {
  default:
    break;
  case AArch64::UABALB_ZZZ_D:
  case AArch64::UABALB_ZZZ_H:
  case AArch64::UABALB_ZZZ_S:
  case AArch64::UABALT_ZZZ_D:
  case AArch64::UABALT_ZZZ_H:
  case AArch64::UABALT_ZZZ_S:
  case AArch64::SABALB_ZZZ_D:
  case AArch64::SABALB_ZZZ_S:
  case AArch64::SABALB_ZZZ_H:
  case AArch64::SABALT_ZZZ_D:
  case AArch64::SABALT_ZZZ_S:
  case AArch64::SABALT_ZZZ_H:
  case AArch64::UABALv16i8_v8i16:
  case AArch64::UABALv2i32_v2i64:
  case AArch64::UABALv4i16_v4i32:
  case AArch64::UABALv4i32_v2i64:
  case AArch64::UABALv8i16_v4i32:
  case AArch64::UABALv8i8_v8i16:
  case AArch64::UABAv16i8:
  case AArch64::UABAv2i32:
  case AArch64::UABAv4i16:
  case AArch64::UABAv4i32:
  case AArch64::UABAv8i16:
  case AArch64::UABAv8i8:
  case AArch64::SABALv16i8_v8i16:
  case AArch64::SABALv2i32_v2i64:
  case AArch64::SABALv4i16_v4i32:
  case AArch64::SABALv4i32_v2i64:
  case AArch64::SABALv8i16_v4i32:
  case AArch64::SABALv8i8_v8i16:
  case AArch64::SABAv16i8:
  case AArch64::SABAv2i32:
  case AArch64::SABAv4i16:
  case AArch64::SABAv4i32:
  case AArch64::SABAv8i16:
  case AArch64::SABAv8i8:
    return true;
  }

  return false;
}
7508
7509unsigned AArch64InstrInfo::getAccumulationStartOpcode(
7510 unsigned AccumulationOpcode) const {
7511 switch (AccumulationOpcode) {
7512 default:
7513 llvm_unreachable("Unsupported accumulation Opcode!");
7514 case AArch64::UABALB_ZZZ_D:
7515 return AArch64::UABDLB_ZZZ_D;
7516 case AArch64::UABALB_ZZZ_H:
7517 return AArch64::UABDLB_ZZZ_H;
7518 case AArch64::UABALB_ZZZ_S:
7519 return AArch64::UABDLB_ZZZ_S;
7520 case AArch64::UABALT_ZZZ_D:
7521 return AArch64::UABDLT_ZZZ_D;
7522 case AArch64::UABALT_ZZZ_H:
7523 return AArch64::UABDLT_ZZZ_H;
7524 case AArch64::UABALT_ZZZ_S:
7525 return AArch64::UABDLT_ZZZ_S;
7526 case AArch64::UABALv16i8_v8i16:
7527 return AArch64::UABDLv16i8_v8i16;
7528 case AArch64::UABALv2i32_v2i64:
7529 return AArch64::UABDLv2i32_v2i64;
7530 case AArch64::UABALv4i16_v4i32:
7531 return AArch64::UABDLv4i16_v4i32;
7532 case AArch64::UABALv4i32_v2i64:
7533 return AArch64::UABDLv4i32_v2i64;
7534 case AArch64::UABALv8i16_v4i32:
7535 return AArch64::UABDLv8i16_v4i32;
7536 case AArch64::UABALv8i8_v8i16:
7537 return AArch64::UABDLv8i8_v8i16;
7538 case AArch64::UABAv16i8:
7539 return AArch64::UABDv16i8;
7540 case AArch64::UABAv2i32:
7541 return AArch64::UABDv2i32;
7542 case AArch64::UABAv4i16:
7543 return AArch64::UABDv4i16;
7544 case AArch64::UABAv4i32:
7545 return AArch64::UABDv4i32;
7546 case AArch64::UABAv8i16:
7547 return AArch64::UABDv8i16;
7548 case AArch64::UABAv8i8:
7549 return AArch64::UABDv8i8;
7550 case AArch64::SABALB_ZZZ_D:
7551 return AArch64::SABDLB_ZZZ_D;
7552 case AArch64::SABALB_ZZZ_S:
7553 return AArch64::SABDLB_ZZZ_S;
7554 case AArch64::SABALB_ZZZ_H:
7555 return AArch64::SABDLB_ZZZ_H;
7556 case AArch64::SABALT_ZZZ_D:
7557 return AArch64::SABDLT_ZZZ_D;
7558 case AArch64::SABALT_ZZZ_S:
7559 return AArch64::SABDLT_ZZZ_S;
7560 case AArch64::SABALT_ZZZ_H:
7561 return AArch64::SABDLT_ZZZ_H;
7562 case AArch64::SABALv16i8_v8i16:
7563 return AArch64::SABDLv16i8_v8i16;
7564 case AArch64::SABALv2i32_v2i64:
7565 return AArch64::SABDLv2i32_v2i64;
7566 case AArch64::SABALv4i16_v4i32:
7567 return AArch64::SABDLv4i16_v4i32;
7568 case AArch64::SABALv4i32_v2i64:
7569 return AArch64::SABDLv4i32_v2i64;
7570 case AArch64::SABALv8i16_v4i32:
7571 return AArch64::SABDLv8i16_v4i32;
7572 case AArch64::SABALv8i8_v8i16:
7573 return AArch64::SABDLv8i8_v8i16;
7574 case AArch64::SABAv16i8:
7575 return AArch64::SABDv16i8;
7576 case AArch64::SABAv2i32:
7577 return AArch64::SABAv2i32;
7578 case AArch64::SABAv4i16:
7579 return AArch64::SABDv4i16;
7580 case AArch64::SABAv4i32:
7581 return AArch64::SABDv4i32;
7582 case AArch64::SABAv8i16:
7583 return AArch64::SABDv8i16;
7584 case AArch64::SABAv8i8:
7585 return AArch64::SABDv8i8;
7586 }
7587}
7588
/// Floating-Point Support

/// Find instructions that can be turned into madd.
///
/// For an FADD/FSUB \p Root whose operand 1 or 2 is produced by a lone FMUL
/// (plain, indexed, or FNMUL) in the same block, record the matching
/// FMULADD*/FMULSUB*/FMLA*/FMLS* combiner pattern in \p Patterns.
/// \returns true if at least one pattern was recorded.
static bool getFMAPatterns(MachineInstr &Root,
                           SmallVectorImpl<unsigned> &Patterns) {

  if (!isCombineInstrCandidateFP(Inst: Root))
    return false;

  MachineBasicBlock &MBB = *Root.getParent();
  bool Found = false;

  // Record \p Pattern if operand \p Operand of Root is defined by \p Opcode.
  // Within each `||` chain below the first successful match wins, so the
  // plain/indexed alternatives are tried in preference order.
  auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
    if (canCombineWithFMUL(MBB, MO&: Root.getOperand(i: Operand), MulOpc: Opcode)) {
      Patterns.push_back(Elt: Pattern);
      return true;
    }
    return false;
  };

  typedef AArch64MachineCombinerPattern MCP;

  switch (Root.getOpcode()) {
  default:
    // isCombineInstrCandidateFP() already filtered to FADD/FSUB opcodes.
    assert(false && "Unsupported FP instruction in combiner\n");
    break;
  case AArch64::FADDHrr:
    assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
           "FADDHrr does not have register operands");

    Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
    Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
    break;
  case AArch64::FADDSrr:
    assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
           "FADDSrr does not have register operands");

    Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
             Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);

    Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
             Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
    break;
  case AArch64::FADDDrr:
    Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
             Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);

    Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
             Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
    break;
  case AArch64::FADDv4f16:
    Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
             Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);

    Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
             Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
    break;
  case AArch64::FADDv8f16:
    Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
             Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);

    Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
             Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
    break;
  case AArch64::FADDv2f32:
    Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
             Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);

    Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
             Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
    break;
  case AArch64::FADDv2f64:
    Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
             Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);

    Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
             Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
    break;
  case AArch64::FADDv4f32:
    Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
             Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);

    Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
             Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
    break;
  case AArch64::FSUBHrr:
    Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
    Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
    Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
    break;
  case AArch64::FSUBSrr:
    Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);

    Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
             Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);

    Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
    break;
  case AArch64::FSUBDrr:
    Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);

    Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
             Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);

    Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
    break;
  case AArch64::FSUBv4f16:
    Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
             Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);

    Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
             Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
    break;
  case AArch64::FSUBv8f16:
    Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
             Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);

    Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
             Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
    break;
  case AArch64::FSUBv2f32:
    Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
             Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);

    Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
             Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
    break;
  case AArch64::FSUBv2f64:
    Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
             Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);

    Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
             Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
    break;
  case AArch64::FSUBv4f32:
    Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
             Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);

    Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
             Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
    break;
  }
  return Found;
}
7733
/// Find FMUL(DUP(lane)) chains that can be rewritten as lane-indexed FMULs.
///
/// For a vector FMUL \p Root whose operand 1 or 2 is (possibly through a
/// no-op COPY) produced by a DUP-from-lane, record the corresponding
/// FMULv*_indexed combiner pattern in \p Patterns.
/// \returns true if at least one pattern was recorded.
static bool getFMULPatterns(MachineInstr &Root,
                            SmallVectorImpl<unsigned> &Patterns) {
  MachineBasicBlock &MBB = *Root.getParent();
  bool Found = false;

  // Record \p Pattern if operand \p Operand of Root is defined (through an
  // optional COPY) by \p Opcode. Note: unlike canCombine(), this does not
  // require the def to be in the same block or single-use — presumably the
  // rewrite is safe regardless; TODO confirm.
  auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
    MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    MachineOperand &MO = Root.getOperand(i: Operand);
    MachineInstr *MI = nullptr;
    if (MO.isReg() && MO.getReg().isVirtual())
      MI = MRI.getUniqueVRegDef(Reg: MO.getReg());
    // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
    if (MI && MI->getOpcode() == TargetOpcode::COPY &&
        MI->getOperand(i: 1).getReg().isVirtual())
      MI = MRI.getUniqueVRegDef(Reg: MI->getOperand(i: 1).getReg());
    if (MI && MI->getOpcode() == Opcode) {
      Patterns.push_back(Elt: Pattern);
      return true;
    }
    return false;
  };

  typedef AArch64MachineCombinerPattern MCP;

  switch (Root.getOpcode()) {
  default:
    return false;
  case AArch64::FMULv2f32:
    Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
    Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
    break;
  case AArch64::FMULv2f64:
    Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
    Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
    break;
  case AArch64::FMULv4f16:
    Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
    Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
    break;
  case AArch64::FMULv4f32:
    Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
    Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
    break;
  case AArch64::FMULv8f16:
    Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
    Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
    break;
  }

  return Found;
}
7785
/// Match FNEG(FMADD(..)) so it can be combined into FNMADD.
///
/// The combine requires the 'contract' and 'nsz' fast-math flags on both the
/// FNEG \p Root and the defining FMADD, and the FMADD result must have a
/// single (non-debug) use. Records the FNMADD pattern in \p Patterns and
/// \returns true on a match.
static bool getFNEGPatterns(MachineInstr &Root,
                            SmallVectorImpl<unsigned> &Patterns) {
  unsigned Opc = Root.getOpcode();
  MachineBasicBlock &MBB = *Root.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Record \p Pattern if Root's single source operand is defined by \p Opcode
  // and all flag/use preconditions hold.
  auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
    MachineOperand &MO = Root.getOperand(i: 1);
    MachineInstr *MI = MRI.getUniqueVRegDef(Reg: MO.getReg());
    if (MI != nullptr && (MI->getOpcode() == Opcode) &&
        MRI.hasOneNonDBGUse(RegNo: MI->getOperand(i: 0).getReg()) &&
        Root.getFlag(Flag: MachineInstr::MIFlag::FmContract) &&
        Root.getFlag(Flag: MachineInstr::MIFlag::FmNsz) &&
        MI->getFlag(Flag: MachineInstr::MIFlag::FmContract) &&
        MI->getFlag(Flag: MachineInstr::MIFlag::FmNsz)) {
      Patterns.push_back(Elt: Pattern);
      return true;
    }
    return false;
  };

  switch (Opc) {
  default:
    break;
  case AArch64::FNEGDr:
    return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
  case AArch64::FNEGSr:
    return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
  }

  return false;
}
7818
/// Return true when a code sequence can improve throughput. It
/// should be called only for instructions in loops.
/// \param Pattern - combiner pattern
///
/// All fused multiply-add/sub and indexed-multiply patterns below are treated
/// as throughput-improving; anything else is not.
bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
  switch (Pattern) {
  default:
    break;
  case AArch64MachineCombinerPattern::FMULADDH_OP1:
  case AArch64MachineCombinerPattern::FMULADDH_OP2:
  case AArch64MachineCombinerPattern::FMULSUBH_OP1:
  case AArch64MachineCombinerPattern::FMULSUBH_OP2:
  case AArch64MachineCombinerPattern::FMULADDS_OP1:
  case AArch64MachineCombinerPattern::FMULADDS_OP2:
  case AArch64MachineCombinerPattern::FMULSUBS_OP1:
  case AArch64MachineCombinerPattern::FMULSUBS_OP2:
  case AArch64MachineCombinerPattern::FMULADDD_OP1:
  case AArch64MachineCombinerPattern::FMULADDD_OP2:
  case AArch64MachineCombinerPattern::FMULSUBD_OP1:
  case AArch64MachineCombinerPattern::FMULSUBD_OP2:
  case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
  case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
  case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
  case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
  case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
  case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
  case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
  case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
  case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
  case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
  case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
  case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
  case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
  case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
  case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
  case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
  case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
  case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
  case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
  case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
  case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2:
  case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2:
  case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2:
  case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
  case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
  case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
  case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
  case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
  case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
  case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
  case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
  case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
  case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
  case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
  case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
  case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
  case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
  case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
  case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
  case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
  case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
  case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
  case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
  case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
  case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
  case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
  case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
  case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
  case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
  case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
  case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
  case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
  case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
  case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
  case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
  case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
  case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
  case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
  case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
  case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
  case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
  case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
  case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
    return true;
  } // end switch (Pattern)
  return false;
}
7935
7936/// Find other MI combine patterns.
7937static bool getMiscPatterns(MachineInstr &Root,
7938 SmallVectorImpl<unsigned> &Patterns) {
7939 // A - (B + C) ==> (A - B) - C or (A - C) - B
7940 unsigned Opc = Root.getOpcode();
7941 MachineBasicBlock &MBB = *Root.getParent();
7942
7943 switch (Opc) {
7944 case AArch64::SUBWrr:
7945 case AArch64::SUBSWrr:
7946 case AArch64::SUBXrr:
7947 case AArch64::SUBSXrr:
7948 // Found candidate root.
7949 break;
7950 default:
7951 return false;
7952 }
7953
7954 if (isCombineInstrSettingFlag(Opc) &&
7955 Root.findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true) ==
7956 -1)
7957 return false;
7958
7959 if (canCombine(MBB, MO&: Root.getOperand(i: 2), CombineOpc: AArch64::ADDWrr) ||
7960 canCombine(MBB, MO&: Root.getOperand(i: 2), CombineOpc: AArch64::ADDSWrr) ||
7961 canCombine(MBB, MO&: Root.getOperand(i: 2), CombineOpc: AArch64::ADDXrr) ||
7962 canCombine(MBB, MO&: Root.getOperand(i: 2), CombineOpc: AArch64::ADDSXrr)) {
7963 Patterns.push_back(Elt: AArch64MachineCombinerPattern::SUBADD_OP1);
7964 Patterns.push_back(Elt: AArch64MachineCombinerPattern::SUBADD_OP2);
7965 return true;
7966 }
7967
7968 return false;
7969}
7970
/// Check if the given instruction forms a gather load pattern that can be
/// optimized for better Memory-Level Parallelism (MLP). This function
/// identifies chains of NEON lane load instructions that load data from
/// different memory addresses into individual lanes of a 128-bit vector
/// register, then attempts to split the pattern into parallel loads to break
/// the serial dependency between instructions.
///
/// Pattern Matched:
/// Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) ->
/// LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root)
///
/// Transformed Into:
/// Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64
/// to combine the results, enabling better memory-level parallelism.
///
/// Supported Element Types:
/// - 32-bit elements (LD1i32, 4 lanes total)
/// - 16-bit elements (LD1i16, 8 lanes total)
/// - 8-bit elements (LD1i8, 16 lanes total)
///
/// \param Root The LD1i* instruction that loads the final (highest) lane.
/// \param Patterns Output list the matched pattern kind is appended to.
/// \param LoadLaneOpCode Opcode every lane load in the chain must have.
/// \param NumLanes Total number of lanes in the 128-bit vector (4, 8 or 16).
/// \returns true if the whole chain was matched and a pattern was recorded.
static bool getGatherLanePattern(MachineInstr &Root,
                                 SmallVectorImpl<unsigned> &Patterns,
                                 unsigned LoadLaneOpCode, unsigned NumLanes) {
  const MachineFunction *MF = Root.getMF();

  // Early exit if optimizing for size.
  if (MF->getFunction().hasMinSize())
    return false;

  const MachineRegisterInfo &MRI = MF->getRegInfo();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();

  // The root of the pattern must load into the last lane of the vector.
  if (Root.getOperand(i: 2).getImm() != NumLanes - 1)
    return false;

  // Check that we have load into all lanes except lane 0.
  // For each load we also want to check that:
  // 1. It has a single non-debug use (since we will be replacing the virtual
  // register)
  // 2. That the addressing mode only uses a single pointer operand
  //
  // Walk the use-def chain upward from Root, ticking off each lane index we
  // see. NumOperands == 4 corresponds to (dst, src vector, lane, pointer),
  // i.e. a single-pointer addressing mode.
  auto *CurrInstr = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: 1).getReg());
  auto Range = llvm::seq<unsigned>(Begin: 1, End: NumLanes - 1);
  SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end());
  SmallVector<const MachineInstr *, 16> LoadInstrs;
  while (!RemainingLanes.empty() && CurrInstr &&
         CurrInstr->getOpcode() == LoadLaneOpCode &&
         MRI.hasOneNonDBGUse(RegNo: CurrInstr->getOperand(i: 0).getReg()) &&
         CurrInstr->getNumOperands() == 4) {
    RemainingLanes.erase(V: CurrInstr->getOperand(i: 2).getImm());
    LoadInstrs.push_back(Elt: CurrInstr);
    CurrInstr = MRI.getUniqueVRegDef(Reg: CurrInstr->getOperand(i: 1).getReg());
  }

  // Check that we have found a match for lanes N-1.. 1.
  if (!RemainingLanes.empty())
    return false;

  // Match the SUBREG_TO_REG sequence.
  if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
    return false;

  // Verify that the subreg to reg loads an integer into the first lane.
  auto Lane0LoadReg = CurrInstr->getOperand(i: 1).getReg();
  unsigned SingleLaneSizeInBits = 128 / NumLanes;
  if (TRI->getRegSizeInBits(Reg: Lane0LoadReg, MRI) != SingleLaneSizeInBits)
    return false;

  // Verify that it also has a single non debug use.
  if (!MRI.hasOneNonDBGUse(RegNo: Lane0LoadReg))
    return false;

  LoadInstrs.push_back(Elt: MRI.getUniqueVRegDef(Reg: Lane0LoadReg));

  // If there is any chance of aliasing, do not apply the pattern.
  // Walk backward through the MBB starting from Root.
  // Exit early if we've encountered all load instructions or hit the search
  // limit.
  auto MBBItr = Root.getIterator();
  unsigned RemainingSteps = GatherOptSearchLimit;
  SmallPtrSet<const MachineInstr *, 16> RemainingLoadInstrs;
  RemainingLoadInstrs.insert(I: LoadInstrs.begin(), E: LoadInstrs.end());
  const MachineBasicBlock *MBB = Root.getParent();

  for (; MBBItr != MBB->begin() && RemainingSteps > 0 &&
         !RemainingLoadInstrs.empty();
       --MBBItr, --RemainingSteps) {
    const MachineInstr &CurrInstr = *MBBItr;

    // Remove this instruction from remaining loads if it's one we're tracking.
    RemainingLoadInstrs.erase(Ptr: &CurrInstr);

    // Check for potential aliasing with any of the load instructions to
    // optimize.
    if (CurrInstr.isLoadFoldBarrier())
      return false;
  }

  // If we hit the search limit without finding all load instructions,
  // don't match the pattern.
  if (RemainingSteps == 0 && !RemainingLoadInstrs.empty())
    return false;

  // Record the pattern variant matching the element width.
  switch (NumLanes) {
  case 4:
    Patterns.push_back(Elt: AArch64MachineCombinerPattern::GATHER_LANE_i32);
    break;
  case 8:
    Patterns.push_back(Elt: AArch64MachineCombinerPattern::GATHER_LANE_i16);
    break;
  case 16:
    Patterns.push_back(Elt: AArch64MachineCombinerPattern::GATHER_LANE_i8);
    break;
  default:
    llvm_unreachable("Got bad number of lanes for gather pattern.");
  }

  return true;
}
8089
8090/// Search for patterns of LD instructions we can optimize.
8091static bool getLoadPatterns(MachineInstr &Root,
8092 SmallVectorImpl<unsigned> &Patterns) {
8093
8094 // The pattern searches for loads into single lanes.
8095 switch (Root.getOpcode()) {
8096 case AArch64::LD1i32:
8097 return getGatherLanePattern(Root, Patterns, LoadLaneOpCode: Root.getOpcode(), NumLanes: 4);
8098 case AArch64::LD1i16:
8099 return getGatherLanePattern(Root, Patterns, LoadLaneOpCode: Root.getOpcode(), NumLanes: 8);
8100 case AArch64::LD1i8:
8101 return getGatherLanePattern(Root, Patterns, LoadLaneOpCode: Root.getOpcode(), NumLanes: 16);
8102 default:
8103 return false;
8104 }
8105}
8106
/// Generate optimized instruction sequence for gather load patterns to improve
/// Memory-Level Parallelism (MLP). This function transforms a chain of
/// sequential NEON lane loads into parallel vector loads that can execute
/// concurrently.
///
/// \param Root The LD1i* instruction loading the final lane of the chain.
/// \param InsInstrs Output list of newly built instructions.
/// \param DelInstrs Output list of original instructions to delete.
/// \param InstrIdxForVirtReg Map from each new virtual register to the index
///        in \p InsInstrs of the instruction defining it.
/// \param Pattern The matched gather pattern kind (not consulted here).
/// \param NumLanes Total number of lanes of the vector being assembled.
static void
generateGatherLanePattern(MachineInstr &Root,
                          SmallVectorImpl<MachineInstr *> &InsInstrs,
                          SmallVectorImpl<MachineInstr *> &DelInstrs,
                          DenseMap<Register, unsigned> &InstrIdxForVirtReg,
                          unsigned Pattern, unsigned NumLanes) {
  MachineFunction &MF = *Root.getParent()->getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();

  // Gather the initial load instructions to build the pattern.
  // Walk from Root (last lane) up the use-def chain to the lane-1 load.
  SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
  MachineInstr *CurrInstr = &Root;
  for (unsigned i = 0; i < NumLanes - 1; ++i) {
    LoadToLaneInstrs.push_back(Elt: CurrInstr);
    CurrInstr = MRI.getUniqueVRegDef(Reg: CurrInstr->getOperand(i: 1).getReg());
  }

  // Sort the load instructions according to the lane.
  // Descending order here; reversed below to get ascending lane order.
  llvm::sort(C&: LoadToLaneInstrs,
             Comp: [](const MachineInstr *A, const MachineInstr *B) {
               return A->getOperand(i: 2).getImm() > B->getOperand(i: 2).getImm();
             });

  // After the walk above, CurrInstr is the SUBREG_TO_REG feeding lane 0;
  // its input is the initial scalar load.
  MachineInstr *SubregToReg = CurrInstr;
  LoadToLaneInstrs.push_back(
      Elt: MRI.getUniqueVRegDef(Reg: SubregToReg->getOperand(i: 1).getReg()));
  auto LoadToLaneInstrsAscending = llvm::reverse(C&: LoadToLaneInstrs);

  const TargetRegisterClass *FPR128RegClass =
      MRI.getRegClass(Reg: Root.getOperand(i: 0).getReg());

  // Helper lambda to create a LD1 instruction.
  auto CreateLD1Instruction = [&](MachineInstr *OriginalInstr,
                                  Register SrcRegister, unsigned Lane,
                                  Register OffsetRegister,
                                  bool OffsetRegisterKillState) {
    auto NewRegister = MRI.createVirtualRegister(RegClass: FPR128RegClass);
    MachineInstrBuilder LoadIndexIntoRegister =
        BuildMI(MF, MIMD: MIMetadata(*OriginalInstr), MCID: TII->get(Opcode: Root.getOpcode()),
                DestReg: NewRegister)
            .addReg(RegNo: SrcRegister)
            .addImm(Val: Lane)
            .addReg(RegNo: OffsetRegister, Flags: getKillRegState(B: OffsetRegisterKillState));
    InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewRegister, y: InsInstrs.size()));
    InsInstrs.push_back(Elt: LoadIndexIntoRegister);
    return NewRegister;
  };

  // Helper to create load instruction based on the NumLanes in the NEON
  // register we are rewriting.
  auto CreateLDRInstruction = [&](unsigned NumLanes, Register DestReg,
                                  Register OffsetReg,
                                  bool KillState) -> MachineInstrBuilder {
    unsigned Opcode;
    switch (NumLanes) {
    case 4:
      Opcode = AArch64::LDRSui;
      break;
    case 8:
      Opcode = AArch64::LDRHui;
      break;
    case 16:
      Opcode = AArch64::LDRBui;
      break;
    default:
      llvm_unreachable(
          "Got unsupported number of lanes in machine-combiner gather pattern");
    }
    // Immediate offset load
    return BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode), DestReg)
        .addReg(RegNo: OffsetReg)
        .addImm(Val: 0);
  };

  // Load the remaining lanes into register 0.
  // Lanes 1 .. NumLanes/2 - 1 are chained onto the original SUBREG_TO_REG
  // result (lane 0 stays where it was).
  auto LanesToLoadToReg0 =
      llvm::make_range(x: LoadToLaneInstrsAscending.begin() + 1,
                       y: LoadToLaneInstrsAscending.begin() + NumLanes / 2);
  Register PrevReg = SubregToReg->getOperand(i: 0).getReg();
  for (auto [Index, LoadInstr] : llvm::enumerate(First&: LanesToLoadToReg0)) {
    const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(i: 3);
    PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
                                   OffsetRegOperand.getReg(),
                                   OffsetRegOperand.isKill());
    DelInstrs.push_back(Elt: LoadInstr);
  }
  Register LastLoadReg0 = PrevReg;

  // First load into register 1. Perform an integer load to zero out the upper
  // lanes in a single instruction.
  MachineInstr *Lane0Load = *LoadToLaneInstrsAscending.begin();
  MachineInstr *OriginalSplitLoad =
      *std::next(x: LoadToLaneInstrsAscending.begin(), n: NumLanes / 2);
  Register DestRegForMiddleIndex = MRI.createVirtualRegister(
      RegClass: MRI.getRegClass(Reg: Lane0Load->getOperand(i: 0).getReg()));

  const MachineOperand &OriginalSplitToLoadOffsetOperand =
      OriginalSplitLoad->getOperand(i: 3);
  MachineInstrBuilder MiddleIndexLoadInstr =
      CreateLDRInstruction(NumLanes, DestRegForMiddleIndex,
                           OriginalSplitToLoadOffsetOperand.getReg(),
                           OriginalSplitToLoadOffsetOperand.isKill());

  InstrIdxForVirtReg.insert(
      KV: std::make_pair(x&: DestRegForMiddleIndex, y: InsInstrs.size()));
  InsInstrs.push_back(Elt: MiddleIndexLoadInstr);
  DelInstrs.push_back(Elt: OriginalSplitLoad);

  // Subreg To Reg instruction for register 1.
  // Pick the sub-register index matching the element width.
  Register DestRegForSubregToReg = MRI.createVirtualRegister(RegClass: FPR128RegClass);
  unsigned SubregType;
  switch (NumLanes) {
  case 4:
    SubregType = AArch64::ssub;
    break;
  case 8:
    SubregType = AArch64::hsub;
    break;
  case 16:
    SubregType = AArch64::bsub;
    break;
  default:
    llvm_unreachable(
        "Got invalid NumLanes for machine-combiner gather pattern");
  }

  auto SubRegToRegInstr =
      BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: SubregToReg->getOpcode()),
              DestReg: DestRegForSubregToReg)
          .addReg(RegNo: DestRegForMiddleIndex, Flags: getKillRegState(B: true))
          .addImm(Val: SubregType);
  InstrIdxForVirtReg.insert(
      KV: std::make_pair(x&: DestRegForSubregToReg, y: InsInstrs.size()));
  InsInstrs.push_back(Elt: SubRegToRegInstr);

  // Load remaining lanes into register 1.
  auto LanesToLoadToReg1 =
      llvm::make_range(x: LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
                       y: LoadToLaneInstrsAscending.end());
  PrevReg = SubRegToRegInstr->getOperand(i: 0).getReg();
  for (auto [Index, LoadInstr] : llvm::enumerate(First&: LanesToLoadToReg1)) {
    const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(i: 3);
    PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
                                   OffsetRegOperand.getReg(),
                                   OffsetRegOperand.isKill());

    // Do not add the last reg to DelInstrs - it will be removed later.
    if (Index == NumLanes / 2 - 2) {
      break;
    }
    DelInstrs.push_back(Elt: LoadInstr);
  }
  Register LastLoadReg1 = PrevReg;

  // Create the final zip instruction to combine the results.
  MachineInstrBuilder ZipInstr =
      BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::ZIP1v2i64),
              DestReg: Root.getOperand(i: 0).getReg())
          .addReg(RegNo: LastLoadReg0)
          .addReg(RegNo: LastLoadReg1);
  InsInstrs.push_back(Elt: ZipInstr);
}
8274
8275CombinerObjective
8276AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
8277 switch (Pattern) {
8278 case AArch64MachineCombinerPattern::SUBADD_OP1:
8279 case AArch64MachineCombinerPattern::SUBADD_OP2:
8280 case AArch64MachineCombinerPattern::GATHER_LANE_i32:
8281 case AArch64MachineCombinerPattern::GATHER_LANE_i16:
8282 case AArch64MachineCombinerPattern::GATHER_LANE_i8:
8283 return CombinerObjective::MustReduceDepth;
8284 default:
8285 return TargetInstrInfo::getCombinerObjective(Pattern);
8286 }
8287}
8288
8289/// Return true when there is potentially a faster code sequence for an
8290/// instruction chain ending in \p Root. All potential patterns are listed in
8291/// the \p Pattern vector. Pattern should be sorted in priority order since the
8292/// pattern evaluator stops checking as soon as it finds a faster sequence.
8293
8294bool AArch64InstrInfo::getMachineCombinerPatterns(
8295 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
8296 bool DoRegPressureReduce) const {
8297 // Integer patterns
8298 if (getMaddPatterns(Root, Patterns))
8299 return true;
8300 // Floating point patterns
8301 if (getFMULPatterns(Root, Patterns))
8302 return true;
8303 if (getFMAPatterns(Root, Patterns))
8304 return true;
8305 if (getFNEGPatterns(Root, Patterns))
8306 return true;
8307
8308 // Other patterns
8309 if (getMiscPatterns(Root, Patterns))
8310 return true;
8311
8312 // Load patterns
8313 if (getLoadPatterns(Root, Patterns))
8314 return true;
8315
8316 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
8317 DoRegPressureReduce);
8318}
8319
// Addressing-mode flavor of the fused multiply instruction to emit: plain
// three-register form, lane-indexed form, or accumulator form.
enum class FMAInstKind { Default, Indexed, Accumulator };
/// genFusedMultiply - Generate fused multiply instructions.
/// This function supports both integer and floating point instructions.
/// A typical example:
///  F|MUL I=A,B,0
///  F|ADD R,I,C
///  ==> F|MADD R,A,B,C
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the F|ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the F|MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the f|madd instruction
/// \param RC Register class of operands
/// \param kind Kind of fma instruction (addressing mode) to be generated
/// \param ReplacedAddend is the result register from the instruction
/// replacing the non-combined operand, if any.
/// \returns the original F|MUL instruction (so the caller can delete it).
static MachineInstr *
genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
                 const TargetInstrInfo *TII, MachineInstr &Root,
                 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
                 unsigned MaddOpc, const TargetRegisterClass *RC,
                 FMAInstKind kind = FMAInstKind::Default,
                 const Register *ReplacedAddend = nullptr) {
  assert(IdxMulOpd == 1 || IdxMulOpd == 2);

  unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
  MachineInstr *MUL = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: IdxMulOpd).getReg());
  Register ResultReg = Root.getOperand(i: 0).getReg();
  Register SrcReg0 = MUL->getOperand(i: 1).getReg();
  bool Src0IsKill = MUL->getOperand(i: 1).isKill();
  Register SrcReg1 = MUL->getOperand(i: 2).getReg();
  bool Src1IsKill = MUL->getOperand(i: 2).isKill();

  Register SrcReg2;
  bool Src2IsKill;
  if (ReplacedAddend) {
    // If we just generated a new addend, we must be its only use.
    SrcReg2 = *ReplacedAddend;
    Src2IsKill = true;
  } else {
    SrcReg2 = Root.getOperand(i: IdxOtherOpd).getReg();
    Src2IsKill = Root.getOperand(i: IdxOtherOpd).isKill();
  }

  // Constrain all virtual registers involved to the class required by the
  // fused opcode.
  if (ResultReg.isVirtual())
    MRI.constrainRegClass(Reg: ResultReg, RC);
  if (SrcReg0.isVirtual())
    MRI.constrainRegClass(Reg: SrcReg0, RC);
  if (SrcReg1.isVirtual())
    MRI.constrainRegClass(Reg: SrcReg1, RC);
  if (SrcReg2.isVirtual())
    MRI.constrainRegClass(Reg: SrcReg2, RC);

  // Note the operand-order difference: Default places the addend last,
  // while the Indexed and Accumulator forms place it first.
  MachineInstrBuilder MIB;
  if (kind == FMAInstKind::Default)
    MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
              .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: Src0IsKill))
              .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: Src1IsKill))
              .addReg(RegNo: SrcReg2, Flags: getKillRegState(B: Src2IsKill));
  else if (kind == FMAInstKind::Indexed)
    MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
              .addReg(RegNo: SrcReg2, Flags: getKillRegState(B: Src2IsKill))
              .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: Src0IsKill))
              .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: Src1IsKill))
              .addImm(Val: MUL->getOperand(i: 3).getImm());
  else if (kind == FMAInstKind::Accumulator)
    MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
              .addReg(RegNo: SrcReg2, Flags: getKillRegState(B: Src2IsKill))
              .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: Src0IsKill))
              .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: Src1IsKill));
  else
    assert(false && "Invalid FMA instruction kind \n");
  // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL)
  InsInstrs.push_back(Elt: MIB);
  return MUL;
}
8400
8401static MachineInstr *
8402genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
8403 const TargetInstrInfo *TII, MachineInstr &Root,
8404 SmallVectorImpl<MachineInstr *> &InsInstrs) {
8405 MachineInstr *MAD = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: 1).getReg());
8406
8407 unsigned Opc = 0;
8408 const TargetRegisterClass *RC = MRI.getRegClass(Reg: MAD->getOperand(i: 0).getReg());
8409 if (AArch64::FPR32RegClass.hasSubClassEq(RC))
8410 Opc = AArch64::FNMADDSrrr;
8411 else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
8412 Opc = AArch64::FNMADDDrrr;
8413 else
8414 return nullptr;
8415
8416 Register ResultReg = Root.getOperand(i: 0).getReg();
8417 Register SrcReg0 = MAD->getOperand(i: 1).getReg();
8418 Register SrcReg1 = MAD->getOperand(i: 2).getReg();
8419 Register SrcReg2 = MAD->getOperand(i: 3).getReg();
8420 bool Src0IsKill = MAD->getOperand(i: 1).isKill();
8421 bool Src1IsKill = MAD->getOperand(i: 2).isKill();
8422 bool Src2IsKill = MAD->getOperand(i: 3).isKill();
8423 if (ResultReg.isVirtual())
8424 MRI.constrainRegClass(Reg: ResultReg, RC);
8425 if (SrcReg0.isVirtual())
8426 MRI.constrainRegClass(Reg: SrcReg0, RC);
8427 if (SrcReg1.isVirtual())
8428 MRI.constrainRegClass(Reg: SrcReg1, RC);
8429 if (SrcReg2.isVirtual())
8430 MRI.constrainRegClass(Reg: SrcReg2, RC);
8431
8432 MachineInstrBuilder MIB =
8433 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: Opc), DestReg: ResultReg)
8434 .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: Src0IsKill))
8435 .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: Src1IsKill))
8436 .addReg(RegNo: SrcReg2, Flags: getKillRegState(B: Src2IsKill));
8437 InsInstrs.push_back(Elt: MIB);
8438
8439 return MAD;
8440}
8441
8442/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
8443static MachineInstr *
8444genIndexedMultiply(MachineInstr &Root,
8445 SmallVectorImpl<MachineInstr *> &InsInstrs,
8446 unsigned IdxDupOp, unsigned MulOpc,
8447 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
8448 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
8449 "Invalid index of FMUL operand");
8450
8451 MachineFunction &MF = *Root.getMF();
8452 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8453
8454 MachineInstr *Dup =
8455 MF.getRegInfo().getUniqueVRegDef(Reg: Root.getOperand(i: IdxDupOp).getReg());
8456
8457 if (Dup->getOpcode() == TargetOpcode::COPY)
8458 Dup = MRI.getUniqueVRegDef(Reg: Dup->getOperand(i: 1).getReg());
8459
8460 Register DupSrcReg = Dup->getOperand(i: 1).getReg();
8461 MRI.clearKillFlags(Reg: DupSrcReg);
8462 MRI.constrainRegClass(Reg: DupSrcReg, RC);
8463
8464 unsigned DupSrcLane = Dup->getOperand(i: 2).getImm();
8465
8466 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
8467 MachineOperand &MulOp = Root.getOperand(i: IdxMulOp);
8468
8469 Register ResultReg = Root.getOperand(i: 0).getReg();
8470
8471 MachineInstrBuilder MIB;
8472 MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MulOpc), DestReg: ResultReg)
8473 .add(MO: MulOp)
8474 .addReg(RegNo: DupSrcReg)
8475 .addImm(Val: DupSrcLane);
8476
8477 InsInstrs.push_back(Elt: MIB);
8478 return &Root;
8479}
8480
8481/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
8482/// instructions.
8483///
8484/// \see genFusedMultiply
8485static MachineInstr *genFusedMultiplyAcc(
8486 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8487 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8488 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8489 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8490 kind: FMAInstKind::Accumulator);
8491}
8492
8493/// genNeg - Helper to generate an intermediate negation of the second operand
8494/// of Root
8495static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
8496 const TargetInstrInfo *TII, MachineInstr &Root,
8497 SmallVectorImpl<MachineInstr *> &InsInstrs,
8498 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8499 unsigned MnegOpc, const TargetRegisterClass *RC) {
8500 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
8501 MachineInstrBuilder MIB =
8502 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MnegOpc), DestReg: NewVR)
8503 .add(MO: Root.getOperand(i: 2));
8504 InsInstrs.push_back(Elt: MIB);
8505
8506 assert(InstrIdxForVirtReg.empty());
8507 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
8508
8509 return NewVR;
8510}
8511
8512/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
8513/// instructions with an additional negation of the accumulator
8514static MachineInstr *genFusedMultiplyAccNeg(
8515 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8516 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8517 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8518 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8519 assert(IdxMulOpd == 1);
8520
8521 Register NewVR =
8522 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8523 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8524 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
8525}
8526
8527/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
8528/// instructions.
8529///
8530/// \see genFusedMultiply
8531static MachineInstr *genFusedMultiplyIdx(
8532 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8533 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8534 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8535 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8536 kind: FMAInstKind::Indexed);
8537}
8538
/// genFusedMultiplyIdxNeg - Helper to generate lane-indexed fused multiply
/// accumulate instructions with an additional negation of the accumulator
static MachineInstr *genFusedMultiplyIdxNeg(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
    unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1);

  Register NewVR =
      genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);

  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
}
8554
/// genMaddR - Generate madd instruction and combine mul and add using
/// an extra virtual register
/// Example - an ADD intermediate needs to be stored in a register:
///   MUL I=A,B,0
///   ADD R,I,Imm
///   ==> ORR  V, ZR, Imm
///   ==> MADD R,A,B,V
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the madd instruction
/// \param VR is a virtual register that holds the value of an ADD operand
/// (V in the example above).
/// \param RC Register class of operands
/// \returns the original MUL instruction (so the caller can delete it).
static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
                              const TargetInstrInfo *TII, MachineInstr &Root,
                              SmallVectorImpl<MachineInstr *> &InsInstrs,
                              unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
                              const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1 || IdxMulOpd == 2);

  MachineInstr *MUL = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: IdxMulOpd).getReg());
  Register ResultReg = Root.getOperand(i: 0).getReg();
  Register SrcReg0 = MUL->getOperand(i: 1).getReg();
  bool Src0IsKill = MUL->getOperand(i: 1).isKill();
  Register SrcReg1 = MUL->getOperand(i: 2).getReg();
  bool Src1IsKill = MUL->getOperand(i: 2).isKill();

  // Constrain every virtual register involved to the class required by the
  // MADD opcode.
  if (ResultReg.isVirtual())
    MRI.constrainRegClass(Reg: ResultReg, RC);
  if (SrcReg0.isVirtual())
    MRI.constrainRegClass(Reg: SrcReg0, RC);
  if (SrcReg1.isVirtual())
    MRI.constrainRegClass(Reg: SrcReg1, RC);
  if (Register::isVirtualRegister(Reg: VR))
    MRI.constrainRegClass(Reg: VR, RC);

  MachineInstrBuilder MIB =
      BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
          .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: Src0IsKill))
          .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: Src1IsKill))
          .addReg(RegNo: VR);
  // Insert the MADD
  InsInstrs.push_back(Elt: MIB);
  return MUL;
}
8606
8607/// Do the following transformation
8608/// A - (B + C) ==> (A - B) - C
8609/// A - (B + C) ==> (A - C) - B
8610static void genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
8611 const TargetInstrInfo *TII, MachineInstr &Root,
8612 SmallVectorImpl<MachineInstr *> &InsInstrs,
8613 SmallVectorImpl<MachineInstr *> &DelInstrs,
8614 unsigned IdxOpd1,
8615 DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
8616 assert(IdxOpd1 == 1 || IdxOpd1 == 2);
8617 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
8618 MachineInstr *AddMI = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: 2).getReg());
8619
8620 Register ResultReg = Root.getOperand(i: 0).getReg();
8621 Register RegA = Root.getOperand(i: 1).getReg();
8622 bool RegAIsKill = Root.getOperand(i: 1).isKill();
8623 Register RegB = AddMI->getOperand(i: IdxOpd1).getReg();
8624 bool RegBIsKill = AddMI->getOperand(i: IdxOpd1).isKill();
8625 Register RegC = AddMI->getOperand(i: IdxOtherOpd).getReg();
8626 bool RegCIsKill = AddMI->getOperand(i: IdxOtherOpd).isKill();
8627 Register NewVR =
8628 MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: Root.getOperand(i: 2).getReg()));
8629
8630 unsigned Opcode = Root.getOpcode();
8631 if (Opcode == AArch64::SUBSWrr)
8632 Opcode = AArch64::SUBWrr;
8633 else if (Opcode == AArch64::SUBSXrr)
8634 Opcode = AArch64::SUBXrr;
8635 else
8636 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
8637 "Unexpected instruction opcode.");
8638
8639 uint32_t Flags = Root.mergeFlagsWith(Other: *AddMI);
8640 Flags &= ~MachineInstr::NoSWrap;
8641 Flags &= ~MachineInstr::NoUWrap;
8642
8643 MachineInstrBuilder MIB1 =
8644 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode), DestReg: NewVR)
8645 .addReg(RegNo: RegA, Flags: getKillRegState(B: RegAIsKill))
8646 .addReg(RegNo: RegB, Flags: getKillRegState(B: RegBIsKill))
8647 .setMIFlags(Flags);
8648 MachineInstrBuilder MIB2 =
8649 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode), DestReg: ResultReg)
8650 .addReg(RegNo: NewVR, Flags: getKillRegState(B: true))
8651 .addReg(RegNo: RegC, Flags: getKillRegState(B: RegCIsKill))
8652 .setMIFlags(Flags);
8653
8654 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
8655 InsInstrs.push_back(Elt: MIB1);
8656 InsInstrs.push_back(Elt: MIB2);
8657 DelInstrs.push_back(Elt: AddMI);
8658 DelInstrs.push_back(Elt: &Root);
8659}
8660
8661unsigned AArch64InstrInfo::getReduceOpcodeForAccumulator(
8662 unsigned int AccumulatorOpCode) const {
8663 switch (AccumulatorOpCode) {
8664 case AArch64::UABALB_ZZZ_D:
8665 case AArch64::SABALB_ZZZ_D:
8666 case AArch64::UABALT_ZZZ_D:
8667 case AArch64::SABALT_ZZZ_D:
8668 return AArch64::ADD_ZZZ_D;
8669 case AArch64::UABALB_ZZZ_H:
8670 case AArch64::SABALB_ZZZ_H:
8671 case AArch64::UABALT_ZZZ_H:
8672 case AArch64::SABALT_ZZZ_H:
8673 return AArch64::ADD_ZZZ_H;
8674 case AArch64::UABALB_ZZZ_S:
8675 case AArch64::SABALB_ZZZ_S:
8676 case AArch64::UABALT_ZZZ_S:
8677 case AArch64::SABALT_ZZZ_S:
8678 return AArch64::ADD_ZZZ_S;
8679 case AArch64::UABALv16i8_v8i16:
8680 case AArch64::SABALv8i8_v8i16:
8681 case AArch64::SABAv8i16:
8682 case AArch64::UABAv8i16:
8683 return AArch64::ADDv8i16;
8684 case AArch64::SABALv2i32_v2i64:
8685 case AArch64::UABALv2i32_v2i64:
8686 case AArch64::SABALv4i32_v2i64:
8687 return AArch64::ADDv2i64;
8688 case AArch64::UABALv4i16_v4i32:
8689 case AArch64::SABALv4i16_v4i32:
8690 case AArch64::SABALv8i16_v4i32:
8691 case AArch64::SABAv4i32:
8692 case AArch64::UABAv4i32:
8693 return AArch64::ADDv4i32;
8694 case AArch64::UABALv4i32_v2i64:
8695 return AArch64::ADDv2i64;
8696 case AArch64::UABALv8i16_v4i32:
8697 return AArch64::ADDv4i32;
8698 case AArch64::UABALv8i8_v8i16:
8699 case AArch64::SABALv16i8_v8i16:
8700 return AArch64::ADDv8i16;
8701 case AArch64::UABAv16i8:
8702 case AArch64::SABAv16i8:
8703 return AArch64::ADDv16i8;
8704 case AArch64::UABAv4i16:
8705 case AArch64::SABAv4i16:
8706 return AArch64::ADDv4i16;
8707 case AArch64::UABAv2i32:
8708 case AArch64::SABAv2i32:
8709 return AArch64::ADDv2i32;
8710 case AArch64::UABAv8i8:
8711 case AArch64::SABAv8i8:
8712 return AArch64::ADDv8i8;
8713 default:
8714 llvm_unreachable("Unknown accumulator opcode");
8715 }
8716}
8717
8718/// When getMachineCombinerPatterns() finds potential patterns,
8719/// this function generates the instructions that could replace the
8720/// original code sequence
8721void AArch64InstrInfo::genAlternativeCodeSequence(
8722 MachineInstr &Root, unsigned Pattern,
8723 SmallVectorImpl<MachineInstr *> &InsInstrs,
8724 SmallVectorImpl<MachineInstr *> &DelInstrs,
8725 DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
8726 MachineBasicBlock &MBB = *Root.getParent();
8727 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8728 MachineFunction &MF = *MBB.getParent();
8729 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8730
8731 MachineInstr *MUL = nullptr;
8732 const TargetRegisterClass *RC;
8733 unsigned Opc;
8734 switch (Pattern) {
8735 default:
8736 // Reassociate instructions.
8737 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
8738 DelInstrs, InstIdxForVirtReg&: InstrIdxForVirtReg);
8739 return;
8740 case AArch64MachineCombinerPattern::SUBADD_OP1:
8741 // A - (B + C)
8742 // ==> (A - B) - C
8743 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, IdxOpd1: 1,
8744 InstrIdxForVirtReg);
8745 return;
8746 case AArch64MachineCombinerPattern::SUBADD_OP2:
8747 // A - (B + C)
8748 // ==> (A - C) - B
8749 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, IdxOpd1: 2,
8750 InstrIdxForVirtReg);
8751 return;
8752 case AArch64MachineCombinerPattern::MULADDW_OP1:
8753 case AArch64MachineCombinerPattern::MULADDX_OP1:
8754 // MUL I=A,B,0
8755 // ADD R,I,C
8756 // ==> MADD R,A,B,C
8757 // --- Create(MADD);
8758 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP1) {
8759 Opc = AArch64::MADDWrrr;
8760 RC = &AArch64::GPR32RegClass;
8761 } else {
8762 Opc = AArch64::MADDXrrr;
8763 RC = &AArch64::GPR64RegClass;
8764 }
8765 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
8766 break;
8767 case AArch64MachineCombinerPattern::MULADDW_OP2:
8768 case AArch64MachineCombinerPattern::MULADDX_OP2:
8769 // MUL I=A,B,0
8770 // ADD R,C,I
8771 // ==> MADD R,A,B,C
8772 // --- Create(MADD);
8773 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP2) {
8774 Opc = AArch64::MADDWrrr;
8775 RC = &AArch64::GPR32RegClass;
8776 } else {
8777 Opc = AArch64::MADDXrrr;
8778 RC = &AArch64::GPR64RegClass;
8779 }
8780 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8781 break;
8782 case AArch64MachineCombinerPattern::MULADDWI_OP1:
8783 case AArch64MachineCombinerPattern::MULADDXI_OP1:
8784 case AArch64MachineCombinerPattern::MULSUBWI_OP1:
8785 case AArch64MachineCombinerPattern::MULSUBXI_OP1: {
8786 // MUL I=A,B,0
8787 // ADD/SUB R,I,Imm
8788 // ==> MOV V, Imm/-Imm
8789 // ==> MADD R,A,B,V
8790 // --- Create(MADD);
8791 const TargetRegisterClass *RC;
8792 unsigned BitSize, MovImm;
8793 if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1 ||
8794 Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) {
8795 MovImm = AArch64::MOVi32imm;
8796 RC = &AArch64::GPR32spRegClass;
8797 BitSize = 32;
8798 Opc = AArch64::MADDWrrr;
8799 RC = &AArch64::GPR32RegClass;
8800 } else {
8801 MovImm = AArch64::MOVi64imm;
8802 RC = &AArch64::GPR64spRegClass;
8803 BitSize = 64;
8804 Opc = AArch64::MADDXrrr;
8805 RC = &AArch64::GPR64RegClass;
8806 }
8807 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
8808 uint64_t Imm = Root.getOperand(i: 2).getImm();
8809
8810 if (Root.getOperand(i: 3).isImm()) {
8811 unsigned Val = Root.getOperand(i: 3).getImm();
8812 Imm = Imm << Val;
8813 }
8814 bool IsSub = Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1 ||
8815 Pattern == AArch64MachineCombinerPattern::MULSUBXI_OP1;
8816 uint64_t UImm = SignExtend64(X: IsSub ? -Imm : Imm, B: BitSize);
8817 // Check that the immediate can be composed via a single instruction.
8818 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
8819 AArch64_IMM::expandMOVImm(Imm: UImm, BitSize, Insn);
8820 if (Insn.size() != 1)
8821 return;
8822 MachineInstrBuilder MIB1 =
8823 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MovImm), DestReg: NewVR)
8824 .addImm(Val: IsSub ? -Imm : Imm);
8825 InsInstrs.push_back(Elt: MIB1);
8826 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
8827 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, VR: NewVR, RC);
8828 break;
8829 }
8830 case AArch64MachineCombinerPattern::MULSUBW_OP1:
8831 case AArch64MachineCombinerPattern::MULSUBX_OP1: {
8832 // MUL I=A,B,0
8833 // SUB R,I, C
8834 // ==> SUB V, 0, C
8835 // ==> MADD R,A,B,V // = -C + A*B
8836 // --- Create(MADD);
8837 const TargetRegisterClass *SubRC;
8838 unsigned SubOpc, ZeroReg;
8839 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP1) {
8840 SubOpc = AArch64::SUBWrr;
8841 SubRC = &AArch64::GPR32spRegClass;
8842 ZeroReg = AArch64::WZR;
8843 Opc = AArch64::MADDWrrr;
8844 RC = &AArch64::GPR32RegClass;
8845 } else {
8846 SubOpc = AArch64::SUBXrr;
8847 SubRC = &AArch64::GPR64spRegClass;
8848 ZeroReg = AArch64::XZR;
8849 Opc = AArch64::MADDXrrr;
8850 RC = &AArch64::GPR64RegClass;
8851 }
8852 Register NewVR = MRI.createVirtualRegister(RegClass: SubRC);
8853 // SUB NewVR, 0, C
8854 MachineInstrBuilder MIB1 =
8855 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: SubOpc), DestReg: NewVR)
8856 .addReg(RegNo: ZeroReg)
8857 .add(MO: Root.getOperand(i: 2));
8858 InsInstrs.push_back(Elt: MIB1);
8859 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
8860 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, VR: NewVR, RC);
8861 break;
8862 }
8863 case AArch64MachineCombinerPattern::MULSUBW_OP2:
8864 case AArch64MachineCombinerPattern::MULSUBX_OP2:
8865 // MUL I=A,B,0
8866 // SUB R,C,I
8867 // ==> MSUB R,A,B,C (computes C - A*B)
8868 // --- Create(MSUB);
8869 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP2) {
8870 Opc = AArch64::MSUBWrrr;
8871 RC = &AArch64::GPR32RegClass;
8872 } else {
8873 Opc = AArch64::MSUBXrrr;
8874 RC = &AArch64::GPR64RegClass;
8875 }
8876 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8877 break;
8878 case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
8879 Opc = AArch64::MLAv8i8;
8880 RC = &AArch64::FPR64RegClass;
8881 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
8882 break;
8883 case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
8884 Opc = AArch64::MLAv8i8;
8885 RC = &AArch64::FPR64RegClass;
8886 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8887 break;
8888 case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
8889 Opc = AArch64::MLAv16i8;
8890 RC = &AArch64::FPR128RegClass;
8891 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
8892 break;
8893 case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
8894 Opc = AArch64::MLAv16i8;
8895 RC = &AArch64::FPR128RegClass;
8896 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8897 break;
8898 case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
8899 Opc = AArch64::MLAv4i16;
8900 RC = &AArch64::FPR64RegClass;
8901 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
8902 break;
8903 case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
8904 Opc = AArch64::MLAv4i16;
8905 RC = &AArch64::FPR64RegClass;
8906 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8907 break;
8908 case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
8909 Opc = AArch64::MLAv8i16;
8910 RC = &AArch64::FPR128RegClass;
8911 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
8912 break;
8913 case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
8914 Opc = AArch64::MLAv8i16;
8915 RC = &AArch64::FPR128RegClass;
8916 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8917 break;
8918 case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
8919 Opc = AArch64::MLAv2i32;
8920 RC = &AArch64::FPR64RegClass;
8921 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
8922 break;
8923 case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
8924 Opc = AArch64::MLAv2i32;
8925 RC = &AArch64::FPR64RegClass;
8926 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8927 break;
8928 case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
8929 Opc = AArch64::MLAv4i32;
8930 RC = &AArch64::FPR128RegClass;
8931 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
8932 break;
8933 case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
8934 Opc = AArch64::MLAv4i32;
8935 RC = &AArch64::FPR128RegClass;
8936 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8937 break;
8938
8939 case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
8940 Opc = AArch64::MLAv8i8;
8941 RC = &AArch64::FPR64RegClass;
8942 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8943 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv8i8,
8944 RC);
8945 break;
8946 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
8947 Opc = AArch64::MLSv8i8;
8948 RC = &AArch64::FPR64RegClass;
8949 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8950 break;
8951 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
8952 Opc = AArch64::MLAv16i8;
8953 RC = &AArch64::FPR128RegClass;
8954 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8955 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv16i8,
8956 RC);
8957 break;
8958 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
8959 Opc = AArch64::MLSv16i8;
8960 RC = &AArch64::FPR128RegClass;
8961 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8962 break;
8963 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
8964 Opc = AArch64::MLAv4i16;
8965 RC = &AArch64::FPR64RegClass;
8966 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8967 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv4i16,
8968 RC);
8969 break;
8970 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
8971 Opc = AArch64::MLSv4i16;
8972 RC = &AArch64::FPR64RegClass;
8973 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8974 break;
8975 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
8976 Opc = AArch64::MLAv8i16;
8977 RC = &AArch64::FPR128RegClass;
8978 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8979 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv8i16,
8980 RC);
8981 break;
8982 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
8983 Opc = AArch64::MLSv8i16;
8984 RC = &AArch64::FPR128RegClass;
8985 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8986 break;
8987 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
8988 Opc = AArch64::MLAv2i32;
8989 RC = &AArch64::FPR64RegClass;
8990 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8991 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv2i32,
8992 RC);
8993 break;
8994 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
8995 Opc = AArch64::MLSv2i32;
8996 RC = &AArch64::FPR64RegClass;
8997 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8998 break;
8999 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
9000 Opc = AArch64::MLAv4i32;
9001 RC = &AArch64::FPR128RegClass;
9002 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9003 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv4i32,
9004 RC);
9005 break;
9006 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
9007 Opc = AArch64::MLSv4i32;
9008 RC = &AArch64::FPR128RegClass;
9009 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9010 break;
9011
9012 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
9013 Opc = AArch64::MLAv4i16_indexed;
9014 RC = &AArch64::FPR64RegClass;
9015 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9016 break;
9017 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
9018 Opc = AArch64::MLAv4i16_indexed;
9019 RC = &AArch64::FPR64RegClass;
9020 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9021 break;
9022 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
9023 Opc = AArch64::MLAv8i16_indexed;
9024 RC = &AArch64::FPR128RegClass;
9025 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9026 break;
9027 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
9028 Opc = AArch64::MLAv8i16_indexed;
9029 RC = &AArch64::FPR128RegClass;
9030 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9031 break;
9032 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
9033 Opc = AArch64::MLAv2i32_indexed;
9034 RC = &AArch64::FPR64RegClass;
9035 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9036 break;
9037 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
9038 Opc = AArch64::MLAv2i32_indexed;
9039 RC = &AArch64::FPR64RegClass;
9040 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9041 break;
9042 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
9043 Opc = AArch64::MLAv4i32_indexed;
9044 RC = &AArch64::FPR128RegClass;
9045 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9046 break;
9047 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
9048 Opc = AArch64::MLAv4i32_indexed;
9049 RC = &AArch64::FPR128RegClass;
9050 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9051 break;
9052
9053 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
9054 Opc = AArch64::MLAv4i16_indexed;
9055 RC = &AArch64::FPR64RegClass;
9056 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9057 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv4i16,
9058 RC);
9059 break;
9060 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
9061 Opc = AArch64::MLSv4i16_indexed;
9062 RC = &AArch64::FPR64RegClass;
9063 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9064 break;
9065 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
9066 Opc = AArch64::MLAv8i16_indexed;
9067 RC = &AArch64::FPR128RegClass;
9068 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9069 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv8i16,
9070 RC);
9071 break;
9072 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
9073 Opc = AArch64::MLSv8i16_indexed;
9074 RC = &AArch64::FPR128RegClass;
9075 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9076 break;
9077 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
9078 Opc = AArch64::MLAv2i32_indexed;
9079 RC = &AArch64::FPR64RegClass;
9080 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9081 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv2i32,
9082 RC);
9083 break;
9084 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
9085 Opc = AArch64::MLSv2i32_indexed;
9086 RC = &AArch64::FPR64RegClass;
9087 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9088 break;
9089 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
9090 Opc = AArch64::MLAv4i32_indexed;
9091 RC = &AArch64::FPR128RegClass;
9092 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9093 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv4i32,
9094 RC);
9095 break;
9096 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
9097 Opc = AArch64::MLSv4i32_indexed;
9098 RC = &AArch64::FPR128RegClass;
9099 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9100 break;
9101
9102 // Floating Point Support
9103 case AArch64MachineCombinerPattern::FMULADDH_OP1:
9104 Opc = AArch64::FMADDHrrr;
9105 RC = &AArch64::FPR16RegClass;
9106 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9107 break;
9108 case AArch64MachineCombinerPattern::FMULADDS_OP1:
9109 Opc = AArch64::FMADDSrrr;
9110 RC = &AArch64::FPR32RegClass;
9111 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9112 break;
9113 case AArch64MachineCombinerPattern::FMULADDD_OP1:
9114 Opc = AArch64::FMADDDrrr;
9115 RC = &AArch64::FPR64RegClass;
9116 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9117 break;
9118
9119 case AArch64MachineCombinerPattern::FMULADDH_OP2:
9120 Opc = AArch64::FMADDHrrr;
9121 RC = &AArch64::FPR16RegClass;
9122 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9123 break;
9124 case AArch64MachineCombinerPattern::FMULADDS_OP2:
9125 Opc = AArch64::FMADDSrrr;
9126 RC = &AArch64::FPR32RegClass;
9127 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9128 break;
9129 case AArch64MachineCombinerPattern::FMULADDD_OP2:
9130 Opc = AArch64::FMADDDrrr;
9131 RC = &AArch64::FPR64RegClass;
9132 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9133 break;
9134
9135 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
9136 Opc = AArch64::FMLAv1i32_indexed;
9137 RC = &AArch64::FPR32RegClass;
9138 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9139 kind: FMAInstKind::Indexed);
9140 break;
9141 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
9142 Opc = AArch64::FMLAv1i32_indexed;
9143 RC = &AArch64::FPR32RegClass;
9144 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9145 kind: FMAInstKind::Indexed);
9146 break;
9147
9148 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
9149 Opc = AArch64::FMLAv1i64_indexed;
9150 RC = &AArch64::FPR64RegClass;
9151 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9152 kind: FMAInstKind::Indexed);
9153 break;
9154 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
9155 Opc = AArch64::FMLAv1i64_indexed;
9156 RC = &AArch64::FPR64RegClass;
9157 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9158 kind: FMAInstKind::Indexed);
9159 break;
9160
9161 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
9162 RC = &AArch64::FPR64RegClass;
9163 Opc = AArch64::FMLAv4i16_indexed;
9164 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9165 kind: FMAInstKind::Indexed);
9166 break;
9167 case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
9168 RC = &AArch64::FPR64RegClass;
9169 Opc = AArch64::FMLAv4f16;
9170 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9171 kind: FMAInstKind::Accumulator);
9172 break;
9173 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
9174 RC = &AArch64::FPR64RegClass;
9175 Opc = AArch64::FMLAv4i16_indexed;
9176 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9177 kind: FMAInstKind::Indexed);
9178 break;
9179 case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
9180 RC = &AArch64::FPR64RegClass;
9181 Opc = AArch64::FMLAv4f16;
9182 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9183 kind: FMAInstKind::Accumulator);
9184 break;
9185
9186 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
9187 case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
9188 RC = &AArch64::FPR64RegClass;
9189 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
9190 Opc = AArch64::FMLAv2i32_indexed;
9191 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9192 kind: FMAInstKind::Indexed);
9193 } else {
9194 Opc = AArch64::FMLAv2f32;
9195 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9196 kind: FMAInstKind::Accumulator);
9197 }
9198 break;
9199 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
9200 case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
9201 RC = &AArch64::FPR64RegClass;
9202 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
9203 Opc = AArch64::FMLAv2i32_indexed;
9204 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9205 kind: FMAInstKind::Indexed);
9206 } else {
9207 Opc = AArch64::FMLAv2f32;
9208 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9209 kind: FMAInstKind::Accumulator);
9210 }
9211 break;
9212
9213 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
9214 RC = &AArch64::FPR128RegClass;
9215 Opc = AArch64::FMLAv8i16_indexed;
9216 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9217 kind: FMAInstKind::Indexed);
9218 break;
9219 case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
9220 RC = &AArch64::FPR128RegClass;
9221 Opc = AArch64::FMLAv8f16;
9222 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9223 kind: FMAInstKind::Accumulator);
9224 break;
9225 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
9226 RC = &AArch64::FPR128RegClass;
9227 Opc = AArch64::FMLAv8i16_indexed;
9228 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9229 kind: FMAInstKind::Indexed);
9230 break;
9231 case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
9232 RC = &AArch64::FPR128RegClass;
9233 Opc = AArch64::FMLAv8f16;
9234 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9235 kind: FMAInstKind::Accumulator);
9236 break;
9237
9238 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
9239 case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
9240 RC = &AArch64::FPR128RegClass;
9241 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
9242 Opc = AArch64::FMLAv2i64_indexed;
9243 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9244 kind: FMAInstKind::Indexed);
9245 } else {
9246 Opc = AArch64::FMLAv2f64;
9247 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9248 kind: FMAInstKind::Accumulator);
9249 }
9250 break;
9251 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
9252 case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
9253 RC = &AArch64::FPR128RegClass;
9254 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
9255 Opc = AArch64::FMLAv2i64_indexed;
9256 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9257 kind: FMAInstKind::Indexed);
9258 } else {
9259 Opc = AArch64::FMLAv2f64;
9260 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9261 kind: FMAInstKind::Accumulator);
9262 }
9263 break;
9264
9265 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
9266 case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
9267 RC = &AArch64::FPR128RegClass;
9268 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
9269 Opc = AArch64::FMLAv4i32_indexed;
9270 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9271 kind: FMAInstKind::Indexed);
9272 } else {
9273 Opc = AArch64::FMLAv4f32;
9274 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9275 kind: FMAInstKind::Accumulator);
9276 }
9277 break;
9278
9279 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
9280 case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
9281 RC = &AArch64::FPR128RegClass;
9282 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
9283 Opc = AArch64::FMLAv4i32_indexed;
9284 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9285 kind: FMAInstKind::Indexed);
9286 } else {
9287 Opc = AArch64::FMLAv4f32;
9288 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9289 kind: FMAInstKind::Accumulator);
9290 }
9291 break;
9292
9293 case AArch64MachineCombinerPattern::FMULSUBH_OP1:
9294 Opc = AArch64::FNMSUBHrrr;
9295 RC = &AArch64::FPR16RegClass;
9296 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9297 break;
9298 case AArch64MachineCombinerPattern::FMULSUBS_OP1:
9299 Opc = AArch64::FNMSUBSrrr;
9300 RC = &AArch64::FPR32RegClass;
9301 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9302 break;
9303 case AArch64MachineCombinerPattern::FMULSUBD_OP1:
9304 Opc = AArch64::FNMSUBDrrr;
9305 RC = &AArch64::FPR64RegClass;
9306 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9307 break;
9308
9309 case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
9310 Opc = AArch64::FNMADDHrrr;
9311 RC = &AArch64::FPR16RegClass;
9312 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9313 break;
9314 case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
9315 Opc = AArch64::FNMADDSrrr;
9316 RC = &AArch64::FPR32RegClass;
9317 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9318 break;
9319 case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
9320 Opc = AArch64::FNMADDDrrr;
9321 RC = &AArch64::FPR64RegClass;
9322 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9323 break;
9324
9325 case AArch64MachineCombinerPattern::FMULSUBH_OP2:
9326 Opc = AArch64::FMSUBHrrr;
9327 RC = &AArch64::FPR16RegClass;
9328 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9329 break;
9330 case AArch64MachineCombinerPattern::FMULSUBS_OP2:
9331 Opc = AArch64::FMSUBSrrr;
9332 RC = &AArch64::FPR32RegClass;
9333 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9334 break;
9335 case AArch64MachineCombinerPattern::FMULSUBD_OP2:
9336 Opc = AArch64::FMSUBDrrr;
9337 RC = &AArch64::FPR64RegClass;
9338 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9339 break;
9340
9341 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
9342 Opc = AArch64::FMLSv1i32_indexed;
9343 RC = &AArch64::FPR32RegClass;
9344 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9345 kind: FMAInstKind::Indexed);
9346 break;
9347
9348 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
9349 Opc = AArch64::FMLSv1i64_indexed;
9350 RC = &AArch64::FPR64RegClass;
9351 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9352 kind: FMAInstKind::Indexed);
9353 break;
9354
9355 case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
9356 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
9357 RC = &AArch64::FPR64RegClass;
9358 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
9359 MachineInstrBuilder MIB1 =
9360 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv4f16), DestReg: NewVR)
9361 .add(MO: Root.getOperand(i: 2));
9362 InsInstrs.push_back(Elt: MIB1);
9363 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9364 if (Pattern == AArch64MachineCombinerPattern::FMLSv4f16_OP1) {
9365 Opc = AArch64::FMLAv4f16;
9366 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9367 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
9368 } else {
9369 Opc = AArch64::FMLAv4i16_indexed;
9370 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9371 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
9372 }
9373 break;
9374 }
9375 case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
9376 RC = &AArch64::FPR64RegClass;
9377 Opc = AArch64::FMLSv4f16;
9378 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9379 kind: FMAInstKind::Accumulator);
9380 break;
9381 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
9382 RC = &AArch64::FPR64RegClass;
9383 Opc = AArch64::FMLSv4i16_indexed;
9384 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9385 kind: FMAInstKind::Indexed);
9386 break;
9387
9388 case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
9389 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
9390 RC = &AArch64::FPR64RegClass;
9391 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
9392 Opc = AArch64::FMLSv2i32_indexed;
9393 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9394 kind: FMAInstKind::Indexed);
9395 } else {
9396 Opc = AArch64::FMLSv2f32;
9397 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9398 kind: FMAInstKind::Accumulator);
9399 }
9400 break;
9401
9402 case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
9403 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
9404 RC = &AArch64::FPR128RegClass;
9405 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
9406 MachineInstrBuilder MIB1 =
9407 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv8f16), DestReg: NewVR)
9408 .add(MO: Root.getOperand(i: 2));
9409 InsInstrs.push_back(Elt: MIB1);
9410 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9411 if (Pattern == AArch64MachineCombinerPattern::FMLSv8f16_OP1) {
9412 Opc = AArch64::FMLAv8f16;
9413 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9414 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
9415 } else {
9416 Opc = AArch64::FMLAv8i16_indexed;
9417 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9418 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
9419 }
9420 break;
9421 }
9422 case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
9423 RC = &AArch64::FPR128RegClass;
9424 Opc = AArch64::FMLSv8f16;
9425 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9426 kind: FMAInstKind::Accumulator);
9427 break;
9428 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
9429 RC = &AArch64::FPR128RegClass;
9430 Opc = AArch64::FMLSv8i16_indexed;
9431 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9432 kind: FMAInstKind::Indexed);
9433 break;
9434
9435 case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
9436 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
9437 RC = &AArch64::FPR128RegClass;
9438 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
9439 Opc = AArch64::FMLSv2i64_indexed;
9440 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9441 kind: FMAInstKind::Indexed);
9442 } else {
9443 Opc = AArch64::FMLSv2f64;
9444 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9445 kind: FMAInstKind::Accumulator);
9446 }
9447 break;
9448
9449 case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
9450 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
9451 RC = &AArch64::FPR128RegClass;
9452 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
9453 Opc = AArch64::FMLSv4i32_indexed;
9454 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9455 kind: FMAInstKind::Indexed);
9456 } else {
9457 Opc = AArch64::FMLSv4f32;
9458 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9459 kind: FMAInstKind::Accumulator);
9460 }
9461 break;
9462 case AArch64MachineCombinerPattern::FMLSv2f32_OP1:
9463 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
9464 RC = &AArch64::FPR64RegClass;
9465 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
9466 MachineInstrBuilder MIB1 =
9467 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv2f32), DestReg: NewVR)
9468 .add(MO: Root.getOperand(i: 2));
9469 InsInstrs.push_back(Elt: MIB1);
9470 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9471 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
9472 Opc = AArch64::FMLAv2i32_indexed;
9473 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9474 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
9475 } else {
9476 Opc = AArch64::FMLAv2f32;
9477 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9478 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
9479 }
9480 break;
9481 }
9482 case AArch64MachineCombinerPattern::FMLSv4f32_OP1:
9483 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
9484 RC = &AArch64::FPR128RegClass;
9485 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
9486 MachineInstrBuilder MIB1 =
9487 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv4f32), DestReg: NewVR)
9488 .add(MO: Root.getOperand(i: 2));
9489 InsInstrs.push_back(Elt: MIB1);
9490 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9491 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
9492 Opc = AArch64::FMLAv4i32_indexed;
9493 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9494 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
9495 } else {
9496 Opc = AArch64::FMLAv4f32;
9497 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9498 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
9499 }
9500 break;
9501 }
9502 case AArch64MachineCombinerPattern::FMLSv2f64_OP1:
9503 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
9504 RC = &AArch64::FPR128RegClass;
9505 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
9506 MachineInstrBuilder MIB1 =
9507 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv2f64), DestReg: NewVR)
9508 .add(MO: Root.getOperand(i: 2));
9509 InsInstrs.push_back(Elt: MIB1);
9510 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9511 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
9512 Opc = AArch64::FMLAv2i64_indexed;
9513 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9514 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
9515 } else {
9516 Opc = AArch64::FMLAv2f64;
9517 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9518 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
9519 }
9520 break;
9521 }
9522 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
9523 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2: {
9524 unsigned IdxDupOp =
9525 (Pattern == AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1
9526 : 2;
9527 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv2i32_indexed,
9528 RC: &AArch64::FPR128RegClass, MRI);
9529 break;
9530 }
9531 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
9532 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2: {
9533 unsigned IdxDupOp =
9534 (Pattern == AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1
9535 : 2;
9536 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv2i64_indexed,
9537 RC: &AArch64::FPR128RegClass, MRI);
9538 break;
9539 }
9540 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
9541 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2: {
9542 unsigned IdxDupOp =
9543 (Pattern == AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1
9544 : 2;
9545 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv4i16_indexed,
9546 RC: &AArch64::FPR128_loRegClass, MRI);
9547 break;
9548 }
9549 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
9550 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2: {
9551 unsigned IdxDupOp =
9552 (Pattern == AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1
9553 : 2;
9554 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv4i32_indexed,
9555 RC: &AArch64::FPR128RegClass, MRI);
9556 break;
9557 }
9558 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
9559 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2: {
9560 unsigned IdxDupOp =
9561 (Pattern == AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1
9562 : 2;
9563 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv8i16_indexed,
9564 RC: &AArch64::FPR128_loRegClass, MRI);
9565 break;
9566 }
9567 case AArch64MachineCombinerPattern::FNMADD: {
9568 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
9569 break;
9570 }
9571 case AArch64MachineCombinerPattern::GATHER_LANE_i32: {
9572 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9573 Pattern, NumLanes: 4);
9574 break;
9575 }
9576 case AArch64MachineCombinerPattern::GATHER_LANE_i16: {
9577 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9578 Pattern, NumLanes: 8);
9579 break;
9580 }
9581 case AArch64MachineCombinerPattern::GATHER_LANE_i8: {
9582 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9583 Pattern, NumLanes: 16);
9584 break;
9585 }
9586
9587 } // end switch (Pattern)
9588 // Record MUL and ADD/SUB for deletion
9589 if (MUL)
9590 DelInstrs.push_back(Elt: MUL);
9591 DelInstrs.push_back(Elt: &Root);
9592
9593 // Set the flags on the inserted instructions to be the merged flags of the
9594 // instructions that we have combined.
9595 uint32_t Flags = Root.getFlags();
9596 if (MUL)
9597 Flags = Root.mergeFlagsWith(Other: *MUL);
9598 for (auto *MI : InsInstrs)
9599 MI->setFlags(Flags);
9600}
9601
9602/// Replace csincr-branch sequence by simple conditional branch
9603///
9604/// Examples:
9605/// 1. \code
9606/// csinc w9, wzr, wzr, <condition code>
9607/// tbnz w9, #0, 0x44
9608/// \endcode
9609/// to
9610/// \code
9611/// b.<inverted condition code>
9612/// \endcode
9613///
9614/// 2. \code
9615/// csinc w9, wzr, wzr, <condition code>
9616/// tbz w9, #0, 0x44
9617/// \endcode
9618/// to
9619/// \code
9620/// b.<condition code>
9621/// \endcode
9622///
9623/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
9624/// compare's constant operand is power of 2.
9625///
9626/// Examples:
9627/// \code
9628/// and w8, w8, #0x400
9629/// cbnz w8, L1
9630/// \endcode
9631/// to
9632/// \code
9633/// tbnz w8, #10, L1
9634/// \endcode
9635///
9636/// \param MI Conditional Branch
9637/// \return True when the simple conditional branch is generated
9638///
bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
  // True for CBNZ/TBNZ: the branch is taken on a non-zero/set value, so a
  // folded CSINC condition code must be inverted before emitting the Bcc.
  bool IsNegativeBranch = false;
  // True for TBZ/TBNZ: the instruction carries a bit index in operand 1.
  bool IsTestAndBranch = false;
  // Operand index of the destination MBB in MI (1 for CBZ/CBNZ, 2 for
  // TBZ/TBNZ, which have the tested bit number in operand 1).
  unsigned TargetBBInMI = 0;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  // These forms are not rewritten by this optimization.
  case AArch64::Bcc:
  case AArch64::CBWPri:
  case AArch64::CBXPri:
  case AArch64::CBBAssertExt:
  case AArch64::CBHAssertExt:
  case AArch64::CBWPrr:
  case AArch64::CBXPrr:
    return false;
  case AArch64::CBZW:
  case AArch64::CBZX:
    TargetBBInMI = 1;
    break;
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    TargetBBInMI = 1;
    IsNegativeBranch = true;
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
    TargetBBInMI = 2;
    IsTestAndBranch = true;
    break;
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    TargetBBInMI = 2;
    IsNegativeBranch = true;
    IsTestAndBranch = true;
    break;
  }
  // So we increment a zero register and test for bits other
  // than bit 0? Conservatively bail out in case the verifier
  // missed this case.
  if (IsTestAndBranch && MI.getOperand(i: 1).getImm())
    return false;

  // Find Definition.
  assert(MI.getParent() && "Incomplete machine instruction\n");
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  // Only virtual registers have a traceable single definition here.
  Register VReg = MI.getOperand(i: 0).getReg();
  if (!VReg.isVirtual())
    return false;

  MachineInstr *DefMI = MRI->getVRegDef(Reg: VReg);

  // Look through COPY instructions to find definition.
  while (DefMI->isCopy()) {
    Register CopyVReg = DefMI->getOperand(i: 1).getReg();
    // Bail if the copied register has other uses or multiple defs: rewriting
    // the branch could then change behavior for those other users.
    if (!MRI->hasOneNonDBGUse(RegNo: CopyVReg))
      return false;
    if (!MRI->hasOneDef(RegNo: CopyVReg))
      return false;
    DefMI = MRI->getVRegDef(Reg: CopyVReg);
  }

  switch (DefMI->getOpcode()) {
  default:
    return false;
  // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
  case AArch64::ANDWri:
  case AArch64::ANDXri: {
    // A TBZ/TBNZ already tests a single bit; nothing to fold.
    if (IsTestAndBranch)
      return false;
    // Stay within one block so the AND's result is not live elsewhere.
    if (DefMI->getParent() != MBB)
      return false;
    if (!MRI->hasOneNonDBGUse(RegNo: VReg))
      return false;

    bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
    uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
        val: DefMI->getOperand(i: 2).getImm(), regSize: Is32Bit ? 32 : 64);
    // A single-bit mask is required so that AND+CB(N)Z becomes TB(N)Z on
    // exactly that bit.
    if (!isPowerOf2_64(Value: Mask))
      return false;

    MachineOperand &MO = DefMI->getOperand(i: 1);
    Register NewReg = MO.getReg();
    if (!NewReg.isVirtual())
      return false;

    assert(!MRI->def_empty(NewReg) && "Register must be defined.");

    MachineBasicBlock &RefToMBB = *MBB;
    MachineBasicBlock *TBB = MI.getOperand(i: 1).getMBB();
    DebugLoc DL = MI.getDebugLoc();
    unsigned Imm = Log2_64(Value: Mask);
    // Bit numbers 0..31 must use the W form; 32..63 require the X form.
    unsigned Opc = (Imm < 32)
                       ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
                       : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
    MachineInstr *NewMI = BuildMI(BB&: RefToMBB, I&: MI, MIMD: DL, MCID: get(Opcode: Opc))
                              .addReg(RegNo: NewReg)
                              .addImm(Val: Imm)
                              .addMBB(MBB: TBB);
    // Register lives on to the CBZ now.
    MO.setIsKill(false);

    // For immediate smaller than 32, we need to use the 32-bit
    // variant (W) in all cases. Indeed the 64-bit variant does not
    // allow to encode them.
    // Therefore, if the input register is 64-bit, we need to take the
    // 32-bit sub-part.
    if (!Is32Bit && Imm < 32)
      NewMI->getOperand(i: 0).setSubReg(AArch64::sub_32);
    MI.eraseFromParent();
    return true;
  }
  // Look for CSINC
  case AArch64::CSINCWr:
  case AArch64::CSINCXr: {
    // Only the materialize-condition idiom "csinc Rd, zr, zr, cc" (Rd is
    // 0 or 1 depending on cc) can be folded into a conditional branch.
    if (!(DefMI->getOperand(i: 1).getReg() == AArch64::WZR &&
          DefMI->getOperand(i: 2).getReg() == AArch64::WZR) &&
        !(DefMI->getOperand(i: 1).getReg() == AArch64::XZR &&
          DefMI->getOperand(i: 2).getReg() == AArch64::XZR))
      return false;

    // Bail if the CSINC itself also defines NZCV.
    if (DefMI->findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr,
                                         isDead: true) != -1)
      return false;

    AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(i: 3).getImm();
    // Convert only when the condition code is not modified between
    // the CSINC and the branch. The CC may be used by other
    // instructions in between.
    if (areCFlagsAccessedBetweenInstrs(From: DefMI, To: MI, TRI: &getRegisterInfo(), AccessToCheck: AK_Write))
      return false;
    MachineBasicBlock &RefToMBB = *MBB;
    MachineBasicBlock *TBB = MI.getOperand(i: TargetBBInMI).getMBB();
    DebugLoc DL = MI.getDebugLoc();
    if (IsNegativeBranch)
      CC = AArch64CC::getInvertedCondCode(Code: CC);
    BuildMI(BB&: RefToMBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::Bcc)).addImm(Val: CC).addMBB(MBB: TBB);
    MI.eraseFromParent();
    return true;
  }
  }
}
9782
9783std::pair<unsigned, unsigned>
9784AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9785 const unsigned Mask = AArch64II::MO_FRAGMENT;
9786 return std::make_pair(x: TF & Mask, y: TF & ~Mask);
9787}
9788
9789ArrayRef<std::pair<unsigned, const char *>>
9790AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9791 using namespace AArch64II;
9792
9793 static const std::pair<unsigned, const char *> TargetFlags[] = {
9794 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
9795 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
9796 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
9797 {MO_HI12, "aarch64-hi12"}};
9798 return ArrayRef(TargetFlags);
9799}
9800
9801ArrayRef<std::pair<unsigned, const char *>>
9802AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
9803 using namespace AArch64II;
9804
9805 static const std::pair<unsigned, const char *> TargetFlags[] = {
9806 {MO_COFFSTUB, "aarch64-coffstub"},
9807 {MO_GOT, "aarch64-got"},
9808 {MO_NC, "aarch64-nc"},
9809 {MO_S, "aarch64-s"},
9810 {MO_TLS, "aarch64-tls"},
9811 {MO_DLLIMPORT, "aarch64-dllimport"},
9812 {MO_PREL, "aarch64-prel"},
9813 {MO_TAGGED, "aarch64-tagged"},
9814 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
9815 };
9816 return ArrayRef(TargetFlags);
9817}
9818
9819ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9820AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9821 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9822 {{MOSuppressPair, "aarch64-suppress-pair"},
9823 {MOStridedAccess, "aarch64-strided-access"}};
9824 return ArrayRef(TargetFlags);
9825}
9826
9827/// Constants defining how certain sequences should be outlined.
9828/// This encompasses how an outlined function should be called, and what kind of
9829/// frame should be emitted for that outlined function.
9830///
9831/// \p MachineOutlinerDefault implies that the function should be called with
9832/// a save and restore of LR to the stack.
9833///
9834/// That is,
9835///
9836/// I1 Save LR OUTLINED_FUNCTION:
9837/// I2 --> BL OUTLINED_FUNCTION I1
9838/// I3 Restore LR I2
9839/// I3
9840/// RET
9841///
9842/// * Call construction overhead: 3 (save + BL + restore)
9843/// * Frame construction overhead: 1 (ret)
9844/// * Requires stack fixups? Yes
9845///
9846/// \p MachineOutlinerTailCall implies that the function is being created from
9847/// a sequence of instructions ending in a return.
9848///
9849/// That is,
9850///
9851/// I1 OUTLINED_FUNCTION:
9852/// I2 --> B OUTLINED_FUNCTION I1
9853/// RET I2
9854/// RET
9855///
9856/// * Call construction overhead: 1 (B)
9857/// * Frame construction overhead: 0 (Return included in sequence)
9858/// * Requires stack fixups? No
9859///
9860/// \p MachineOutlinerNoLRSave implies that the function should be called using
9861/// a BL instruction, but doesn't require LR to be saved and restored. This
9862/// happens when LR is known to be dead.
9863///
9864/// That is,
9865///
9866/// I1 OUTLINED_FUNCTION:
9867/// I2 --> BL OUTLINED_FUNCTION I1
9868/// I3 I2
9869/// I3
9870/// RET
9871///
9872/// * Call construction overhead: 1 (BL)
9873/// * Frame construction overhead: 1 (RET)
9874/// * Requires stack fixups? No
9875///
9876/// \p MachineOutlinerThunk implies that the function is being created from
9877/// a sequence of instructions ending in a call. The outlined function is
9878/// called with a BL instruction, and the outlined function tail-calls the
9879/// original call destination.
9880///
9881/// That is,
9882///
9883/// I1 OUTLINED_FUNCTION:
9884/// I2 --> BL OUTLINED_FUNCTION I1
9885/// BL f I2
9886/// B f
9887/// * Call construction overhead: 1 (BL)
9888/// * Frame construction overhead: 0
9889/// * Requires stack fixups? No
9890///
9891/// \p MachineOutlinerRegSave implies that the function should be called with a
9892/// save and restore of LR to an available register. This allows us to avoid
9893/// stack fixups. Note that this outlining variant is compatible with the
9894/// NoLRSave case.
9895///
9896/// That is,
9897///
9898/// I1 Save LR OUTLINED_FUNCTION:
9899/// I2 --> BL OUTLINED_FUNCTION I1
9900/// I3 Restore LR I2
9901/// I3
9902/// RET
9903///
9904/// * Call construction overhead: 3 (save + BL + restore)
9905/// * Frame construction overhead: 1 (ret)
9906/// * Requires stack fixups? No
/// Kinds of outlined-function call/frame construction; see the block comment
/// preceding this enum for diagrams and cost breakdowns.
// Note: `///<` (not `///`) is required so each Doxygen comment attaches to
// the enumerator on its own line rather than the *following* one.
enum MachineOutlinerClass {
  MachineOutlinerDefault, ///< Emit a save, restore, call, and return.
  MachineOutlinerTailCall, ///< Only emit a branch.
  MachineOutlinerNoLRSave, ///< Emit a call and return.
  MachineOutlinerThunk, ///< Emit a call and tail-call.
  MachineOutlinerRegSave ///< Same as default, but save to a register.
};
9914
/// Per-candidate basic-block properties, OR'ed into Candidate::Flags.
/// NOTE(review): bit 0x1 is not defined here — presumably reserved by the
/// generic outliner; confirm before reusing it.
enum MachineOutlinerMBBFlags {
  LRUnavailableSomewhere = 0x2, ///< When set, LR availability must be checked
                                ///< explicitly per candidate.
  HasCalls = 0x4,               ///< Block contains calls.
  UnsafeRegsDead = 0x8          ///< Registers unsafe to clobber are dead.
};
9920
9921Register
9922AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
9923 MachineFunction *MF = C.getMF();
9924 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
9925 const AArch64RegisterInfo *ARI =
9926 static_cast<const AArch64RegisterInfo *>(&TRI);
9927 // Check if there is an available register across the sequence that we can
9928 // use.
9929 for (unsigned Reg : AArch64::GPR64RegClass) {
9930 if (!ARI->isReservedReg(MF: *MF, Reg) &&
9931 Reg != AArch64::LR && // LR is not reserved, but don't use it.
9932 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
9933 Reg != AArch64::X17 && // Ditto for X17.
9934 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
9935 C.isAvailableInsideSeq(Reg, TRI))
9936 return Reg;
9937 }
9938 return Register();
9939}
9940
9941static bool
9942outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
9943 const outliner::Candidate &b) {
9944 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
9945 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
9946
9947 return MFIa->getSignReturnAddressCondition() ==
9948 MFIb->getSignReturnAddressCondition();
9949}
9950
9951static bool
9952outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
9953 const outliner::Candidate &b) {
9954 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
9955 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
9956
9957 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
9958}
9959
9960static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
9961 const outliner::Candidate &b) {
9962 const AArch64Subtarget &SubtargetA =
9963 a.getMF()->getSubtarget<AArch64Subtarget>();
9964 const AArch64Subtarget &SubtargetB =
9965 b.getMF()->getSubtarget<AArch64Subtarget>();
9966 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
9967}
9968
9969std::optional<std::unique_ptr<outliner::OutlinedFunction>>
9970AArch64InstrInfo::getOutliningCandidateInfo(
9971 const MachineModuleInfo &MMI,
9972 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
9973 unsigned MinRepeats) const {
9974 unsigned SequenceSize = 0;
9975 for (auto &MI : RepeatedSequenceLocs[0])
9976 SequenceSize += getInstSizeInBytes(MI);
9977
9978 unsigned NumBytesToCreateFrame = 0;
9979
9980 // Avoid splitting ADRP ADD/LDR pair into outlined functions.
9981 // These instructions are fused together by the scheduler.
9982 // Any candidate where ADRP is the last instruction should be rejected
9983 // as that will lead to splitting ADRP pair.
9984 MachineInstr &LastMI = RepeatedSequenceLocs[0].back();
9985 MachineInstr &FirstMI = RepeatedSequenceLocs[0].front();
9986 if (LastMI.getOpcode() == AArch64::ADRP &&
9987 (LastMI.getOperand(i: 1).getTargetFlags() & AArch64II::MO_PAGE) != 0 &&
9988 (LastMI.getOperand(i: 1).getTargetFlags() & AArch64II::MO_GOT) != 0) {
9989 return std::nullopt;
9990 }
9991
9992 // Similarly any candidate where the first instruction is ADD/LDR with a
9993 // page offset should be rejected to avoid ADRP splitting.
9994 if ((FirstMI.getOpcode() == AArch64::ADDXri ||
9995 FirstMI.getOpcode() == AArch64::LDRXui) &&
9996 (FirstMI.getOperand(i: 2).getTargetFlags() & AArch64II::MO_PAGEOFF) != 0 &&
9997 (FirstMI.getOperand(i: 2).getTargetFlags() & AArch64II::MO_GOT) != 0) {
9998 return std::nullopt;
9999 }
10000
10001 // We only allow outlining for functions having exactly matching return
10002 // address signing attributes, i.e., all share the same value for the
10003 // attribute "sign-return-address" and all share the same type of key they
10004 // are signed with.
10005 // Additionally we require all functions to simultaneously either support
10006 // v8.3a features or not. Otherwise an outlined function could get signed
10007 // using dedicated v8.3 instructions and a call from a function that doesn't
10008 // support v8.3 instructions would therefore be invalid.
10009 if (std::adjacent_find(
10010 first: RepeatedSequenceLocs.begin(), last: RepeatedSequenceLocs.end(),
10011 binary_pred: [](const outliner::Candidate &a, const outliner::Candidate &b) {
10012 // Return true if a and b are non-equal w.r.t. return address
10013 // signing or support of v8.3a features
10014 if (outliningCandidatesSigningScopeConsensus(a, b) &&
10015 outliningCandidatesSigningKeyConsensus(a, b) &&
10016 outliningCandidatesV8_3OpsConsensus(a, b)) {
10017 return false;
10018 }
10019 return true;
10020 }) != RepeatedSequenceLocs.end()) {
10021 return std::nullopt;
10022 }
10023
10024 // Since at this point all candidates agree on their return address signing
10025 // picking just one is fine. If the candidate functions potentially sign their
10026 // return addresses, the outlined function should do the same. Note that in
10027 // the case of "sign-return-address"="non-leaf" this is an assumption: It is
10028 // not certainly true that the outlined function will have to sign its return
10029 // address but this decision is made later, when the decision to outline
10030 // has already been made.
10031 // The same holds for the number of additional instructions we need: On
10032 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
10033 // necessary. However, at this point we don't know if the outlined function
10034 // will have a RET instruction so we assume the worst.
10035 const TargetRegisterInfo &TRI = getRegisterInfo();
10036 // Performing a tail call may require extra checks when PAuth is enabled.
10037 // If PAuth is disabled, set it to zero for uniformity.
10038 unsigned NumBytesToCheckLRInTCEpilogue = 0;
10039 const auto RASignCondition = RepeatedSequenceLocs[0]
10040 .getMF()
10041 ->getInfo<AArch64FunctionInfo>()
10042 ->getSignReturnAddressCondition();
10043 if (RASignCondition != SignReturnAddress::None) {
10044 // One PAC and one AUT instructions
10045 NumBytesToCreateFrame += 8;
10046
10047 // PAuth is enabled - set extra tail call cost, if any.
10048 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
10049 MF: *RepeatedSequenceLocs[0].getMF());
10050 NumBytesToCheckLRInTCEpilogue =
10051 AArch64PAuth::getCheckerSizeInBytes(Method: LRCheckMethod);
10052 // Checking the authenticated LR value may significantly impact
10053 // SequenceSize, so account for it for more precise results.
10054 if (isTailCallReturnInst(MI: RepeatedSequenceLocs[0].back()))
10055 SequenceSize += NumBytesToCheckLRInTCEpilogue;
10056
10057 // We have to check if sp modifying instructions would get outlined.
10058 // If so we only allow outlining if sp is unchanged overall, so matching
10059 // sub and add instructions are okay to outline, all other sp modifications
10060 // are not
10061 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
10062 int SPValue = 0;
10063 for (auto &MI : C) {
10064 if (MI.modifiesRegister(Reg: AArch64::SP, TRI: &TRI)) {
10065 switch (MI.getOpcode()) {
10066 case AArch64::ADDXri:
10067 case AArch64::ADDWri:
10068 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10069 assert(MI.getOperand(2).isImm() &&
10070 "Expected operand to be immediate");
10071 assert(MI.getOperand(1).isReg() &&
10072 "Expected operand to be a register");
10073 // Check if the add just increments sp. If so, we search for
10074 // matching sub instructions that decrement sp. If not, the
10075 // modification is illegal
10076 if (MI.getOperand(i: 1).getReg() == AArch64::SP)
10077 SPValue += MI.getOperand(i: 2).getImm();
10078 else
10079 return true;
10080 break;
10081 case AArch64::SUBXri:
10082 case AArch64::SUBWri:
10083 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10084 assert(MI.getOperand(2).isImm() &&
10085 "Expected operand to be immediate");
10086 assert(MI.getOperand(1).isReg() &&
10087 "Expected operand to be a register");
10088 // Check if the sub just decrements sp. If so, we search for
10089 // matching add instructions that increment sp. If not, the
10090 // modification is illegal
10091 if (MI.getOperand(i: 1).getReg() == AArch64::SP)
10092 SPValue -= MI.getOperand(i: 2).getImm();
10093 else
10094 return true;
10095 break;
10096 default:
10097 return true;
10098 }
10099 }
10100 }
10101 if (SPValue)
10102 return true;
10103 return false;
10104 };
10105 // Remove candidates with illegal stack modifying instructions
10106 llvm::erase_if(C&: RepeatedSequenceLocs, P: hasIllegalSPModification);
10107
10108 // If the sequence doesn't have enough candidates left, then we're done.
10109 if (RepeatedSequenceLocs.size() < MinRepeats)
10110 return std::nullopt;
10111 }
10112
10113 // Properties about candidate MBBs that hold for all of them.
10114 unsigned FlagsSetInAll = 0xF;
10115
10116 // Compute liveness information for each candidate, and set FlagsSetInAll.
10117 for (outliner::Candidate &C : RepeatedSequenceLocs)
10118 FlagsSetInAll &= C.Flags;
10119
10120 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
10121
10122 // Helper lambda which sets call information for every candidate.
10123 auto SetCandidateCallInfo =
10124 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
10125 for (outliner::Candidate &C : RepeatedSequenceLocs)
10126 C.setCallInfo(CID: CallID, CO: NumBytesForCall);
10127 };
10128
10129 unsigned FrameID = MachineOutlinerDefault;
10130 NumBytesToCreateFrame += 4;
10131
10132 bool HasBTI = any_of(Range&: RepeatedSequenceLocs, P: [](outliner::Candidate &C) {
10133 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
10134 });
10135
10136 // We check to see if CFI Instructions are present, and if they are
10137 // we find the number of CFI Instructions in the candidates.
10138 unsigned CFICount = 0;
10139 for (auto &I : RepeatedSequenceLocs[0]) {
10140 if (I.isCFIInstruction())
10141 CFICount++;
10142 }
10143
10144 // We compare the number of found CFI Instructions to the number of CFI
10145 // instructions in the parent function for each candidate. We must check this
10146 // since if we outline one of the CFI instructions in a function, we have to
10147 // outline them all for correctness. If we do not, the address offsets will be
10148 // incorrect between the two sections of the program.
10149 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10150 std::vector<MCCFIInstruction> CFIInstructions =
10151 C.getMF()->getFrameInstructions();
10152
10153 if (CFICount > 0 && CFICount != CFIInstructions.size())
10154 return std::nullopt;
10155 }
10156
10157 // Returns true if an instructions is safe to fix up, false otherwise.
10158 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
10159 if (MI.isCall())
10160 return true;
10161
10162 if (!MI.modifiesRegister(Reg: AArch64::SP, TRI: &TRI) &&
10163 !MI.readsRegister(Reg: AArch64::SP, TRI: &TRI))
10164 return true;
10165
10166 // Any modification of SP will break our code to save/restore LR.
10167 // FIXME: We could handle some instructions which add a constant
10168 // offset to SP, with a bit more work.
10169 if (MI.modifiesRegister(Reg: AArch64::SP, TRI: &TRI))
10170 return false;
10171
10172 // At this point, we have a stack instruction that we might need to
10173 // fix up. We'll handle it if it's a load or store.
10174 if (MI.mayLoadOrStore()) {
10175 const MachineOperand *Base; // Filled with the base operand of MI.
10176 int64_t Offset; // Filled with the offset of MI.
10177 bool OffsetIsScalable;
10178
10179 // Does it allow us to offset the base operand and is the base the
10180 // register SP?
10181 if (!getMemOperandWithOffset(MI, BaseOp&: Base, Offset, OffsetIsScalable, TRI: &TRI) ||
10182 !Base->isReg() || Base->getReg() != AArch64::SP)
10183 return false;
10184
      // Fix-up code below assumes bytes.
10186 if (OffsetIsScalable)
10187 return false;
10188
10189 // Find the minimum/maximum offset for this instruction and check
10190 // if fixing it up would be in range.
10191 int64_t MinOffset,
10192 MaxOffset; // Unscaled offsets for the instruction.
10193 // The scale to multiply the offsets by.
10194 TypeSize Scale(0U, false), DummyWidth(0U, false);
10195 getMemOpInfo(Opcode: MI.getOpcode(), Scale, Width&: DummyWidth, MinOffset, MaxOffset);
10196
10197 Offset += 16; // Update the offset to what it would be if we outlined.
10198 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
10199 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
10200 return false;
10201
10202 // It's in range, so we can outline it.
10203 return true;
10204 }
10205
10206 // FIXME: Add handling for instructions like "add x0, sp, #8".
10207
10208 // We can't fix it up, so don't outline it.
10209 return false;
10210 };
10211
10212 // True if it's possible to fix up each stack instruction in this sequence.
10213 // Important for frames/call variants that modify the stack.
10214 bool AllStackInstrsSafe =
10215 llvm::all_of(Range&: RepeatedSequenceLocs[0], P: IsSafeToFixup);
10216
10217 // If the last instruction in any candidate is a terminator, then we should
10218 // tail call all of the candidates.
10219 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10220 FrameID = MachineOutlinerTailCall;
10221 NumBytesToCreateFrame = 0;
10222 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
10223 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
10224 }
10225
10226 else if (LastInstrOpcode == AArch64::BL ||
10227 ((LastInstrOpcode == AArch64::BLR ||
10228 LastInstrOpcode == AArch64::BLRNoIP) &&
10229 !HasBTI)) {
10230 // FIXME: Do we need to check if the code after this uses the value of LR?
10231 FrameID = MachineOutlinerThunk;
10232 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
10233 SetCandidateCallInfo(MachineOutlinerThunk, 4);
10234 }
10235
10236 else {
10237 // We need to decide how to emit calls + frames. We can always emit the same
10238 // frame if we don't need to save to the stack. If we have to save to the
10239 // stack, then we need a different frame.
10240 unsigned NumBytesNoStackCalls = 0;
10241 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
10242
10243 // Check if we have to save LR.
10244 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10245 bool LRAvailable =
10246 (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere)
10247 ? C.isAvailableAcrossAndOutOfSeq(Reg: AArch64::LR, TRI)
10248 : true;
10249 // If we have a noreturn caller, then we're going to be conservative and
10250 // say that we have to save LR. If we don't have a ret at the end of the
10251 // block, then we can't reason about liveness accurately.
10252 //
10253 // FIXME: We can probably do better than always disabling this in
10254 // noreturn functions by fixing up the liveness info.
10255 bool IsNoReturn =
10256 C.getMF()->getFunction().hasFnAttribute(Kind: Attribute::NoReturn);
10257
10258 // Is LR available? If so, we don't need a save.
10259 if (LRAvailable && !IsNoReturn) {
10260 NumBytesNoStackCalls += 4;
10261 C.setCallInfo(CID: MachineOutlinerNoLRSave, CO: 4);
10262 CandidatesWithoutStackFixups.push_back(x: C);
10263 }
10264
10265 // Is an unused register available? If so, we won't modify the stack, so
10266 // we can outline with the same frame type as those that don't save LR.
10267 else if (findRegisterToSaveLRTo(C)) {
10268 NumBytesNoStackCalls += 12;
10269 C.setCallInfo(CID: MachineOutlinerRegSave, CO: 12);
10270 CandidatesWithoutStackFixups.push_back(x: C);
10271 }
10272
10273 // Is SP used in the sequence at all? If not, we don't have to modify
10274 // the stack, so we are guaranteed to get the same frame.
10275 else if (C.isAvailableInsideSeq(Reg: AArch64::SP, TRI)) {
10276 NumBytesNoStackCalls += 12;
10277 C.setCallInfo(CID: MachineOutlinerDefault, CO: 12);
10278 CandidatesWithoutStackFixups.push_back(x: C);
10279 }
10280
10281 // If we outline this, we need to modify the stack. Pretend we don't
10282 // outline this by saving all of its bytes.
10283 else {
10284 NumBytesNoStackCalls += SequenceSize;
10285 }
10286 }
10287
10288 // If there are no places where we have to save LR, then note that we
10289 // don't have to update the stack. Otherwise, give every candidate the
10290 // default call type, as long as it's safe to do so.
10291 if (!AllStackInstrsSafe ||
10292 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
10293 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
10294 FrameID = MachineOutlinerNoLRSave;
10295 if (RepeatedSequenceLocs.size() < MinRepeats)
10296 return std::nullopt;
10297 } else {
10298 SetCandidateCallInfo(MachineOutlinerDefault, 12);
10299
10300 // Bugzilla ID: 46767
10301 // TODO: Check if fixing up the stack more than once is safe so we can
10302 // outline these.
10303 //
10304 // An outline resulting in a caller that requires stack fixups at the
10305 // callsite to a callee that also requires stack fixups can happen when
10306 // there are no available registers at the candidate callsite for a
10307 // candidate that itself also has calls.
10308 //
10309 // In other words if function_containing_sequence in the following pseudo
10310 // assembly requires that we save LR at the point of the call, but there
10311 // are no available registers: in this case we save using SP and as a
10312 // result the SP offsets requires stack fixups by multiples of 16.
10313 //
10314 // function_containing_sequence:
10315 // ...
10316 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10317 // call OUTLINED_FUNCTION_N
10318 // restore LR from SP
10319 // ...
10320 //
10321 // OUTLINED_FUNCTION_N:
10322 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10323 // ...
10324 // bl foo
10325 // restore LR from SP
10326 // ret
10327 //
10328 // Because the code to handle more than one stack fixup does not
10329 // currently have the proper checks for legality, these cases will assert
10330 // in the AArch64 MachineOutliner. This is because the code to do this
10331 // needs more hardening, testing, better checks that generated code is
10332 // legal, etc and because it is only verified to handle a single pass of
10333 // stack fixup.
10334 //
10335 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
10336 // these cases until they are known to be handled. Bugzilla 46767 is
10337 // referenced in comments at the assert site.
10338 //
10339 // To avoid asserting (or generating non-legal code on noassert builds)
10340 // we remove all candidates which would need more than one stack fixup by
10341 // pruning the cases where the candidate has calls while also having no
10342 // available LR and having no available general purpose registers to copy
10343 // LR to (ie one extra stack save/restore).
10344 //
10345 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10346 erase_if(C&: RepeatedSequenceLocs, P: [this, &TRI](outliner::Candidate &C) {
10347 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
10348 return (llvm::any_of(Range&: C, P: IsCall)) &&
10349 (!C.isAvailableAcrossAndOutOfSeq(Reg: AArch64::LR, TRI) ||
10350 !findRegisterToSaveLRTo(C));
10351 });
10352 }
10353 }
10354
10355 // If we dropped all of the candidates, bail out here.
10356 if (RepeatedSequenceLocs.size() < MinRepeats)
10357 return std::nullopt;
10358 }
10359
10360 // Does every candidate's MBB contain a call? If so, then we might have a call
10361 // in the range.
10362 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10363 // Check if the range contains a call. These require a save + restore of the
10364 // link register.
10365 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
10366 bool ModStackToSaveLR = false;
10367 if (any_of(Range: drop_end(RangeOrContainer&: FirstCand),
10368 P: [](const MachineInstr &MI) { return MI.isCall(); }))
10369 ModStackToSaveLR = true;
10370
10371 // Handle the last instruction separately. If this is a tail call, then the
10372 // last instruction is a call. We don't want to save + restore in this case.
10373 // However, it could be possible that the last instruction is a call without
10374 // it being valid to tail call this sequence. We should consider this as
10375 // well.
10376 else if (FrameID != MachineOutlinerThunk &&
10377 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
10378 ModStackToSaveLR = true;
10379
10380 if (ModStackToSaveLR) {
10381 // We can't fix up the stack. Bail out.
10382 if (!AllStackInstrsSafe)
10383 return std::nullopt;
10384
10385 // Save + restore LR.
10386 NumBytesToCreateFrame += 8;
10387 }
10388 }
10389
10390 // If we have CFI instructions, we can only outline if the outlined section
10391 // can be a tail call
10392 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
10393 return std::nullopt;
10394
10395 return std::make_unique<outliner::OutlinedFunction>(
10396 args&: RepeatedSequenceLocs, args&: SequenceSize, args&: NumBytesToCreateFrame, args&: FrameID);
10397}
10398
10399void AArch64InstrInfo::mergeOutliningCandidateAttributes(
10400 Function &F, std::vector<outliner::Candidate> &Candidates) const {
10401 // If a bunch of candidates reach this point they must agree on their return
10402 // address signing. It is therefore enough to just consider the signing
10403 // behaviour of one of them
10404 const auto &CFn = Candidates.front().getMF()->getFunction();
10405
10406 if (CFn.hasFnAttribute(Kind: "ptrauth-returns"))
10407 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "ptrauth-returns"));
10408 if (CFn.hasFnAttribute(Kind: "ptrauth-auth-traps"))
10409 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "ptrauth-auth-traps"));
10410 // Since all candidates belong to the same module, just copy the
10411 // function-level attributes of an arbitrary function.
10412 if (CFn.hasFnAttribute(Kind: "sign-return-address"))
10413 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "sign-return-address"));
10414 if (CFn.hasFnAttribute(Kind: "sign-return-address-key"))
10415 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "sign-return-address-key"));
10416
10417 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
10418}
10419
10420bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
10421 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
10422 const Function &F = MF.getFunction();
10423
10424 // Can F be deduplicated by the linker? If it can, don't outline from it.
10425 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10426 return false;
10427
10428 // Don't outline from functions with section markings; the program could
10429 // expect that all the code is in the named section.
10430 // FIXME: Allow outlining from multiple functions with the same section
10431 // marking.
10432 if (F.hasSection())
10433 return false;
10434
10435 // Outlining from functions with redzones is unsafe since the outliner may
10436 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
10437 // outline from it.
10438 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
10439 if (!AFI || AFI->hasRedZone().value_or(u: true))
10440 return false;
10441
10442 // FIXME: Determine whether it is safe to outline from functions which contain
10443 // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
10444 // outlined together and ensure it is safe to outline with async unwind info,
10445 // required for saving & restoring VG around calls.
10446 if (AFI->hasStreamingModeChanges())
10447 return false;
10448
10449 // FIXME: Teach the outliner to generate/handle Windows unwind info.
10450 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
10451 return false;
10452
10453 // It's safe to outline from MF.
10454 return true;
10455}
10456
/// Partition \p MBB into the sub-ranges of instructions that are candidates
/// for outlining, i.e. the maximal ranges where x16/x17/NZCV are all dead.
/// Also updates \p Flags with whole-block facts (HasCalls,
/// LRUnavailableSomewhere) that later outlining decisions depend on.
SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
                                      unsigned &Flags) const {
  assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
         "Must track liveness!");
  SmallVector<
      std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
      Ranges;
  // According to the AArch64 Procedure Call Standard, the following are
  // undefined on entry/exit from a function call:
  //
  // * Registers x16, x17, (and thus w16, w17)
  // * Condition codes (and thus the NZCV register)
  //
  // If any of these registers are used inside or live across an outlined
  // function, then they may be modified later, either by the compiler or
  // some other tool (like the linker).
  //
  // To avoid outlining in these situations, partition each block into ranges
  // where these registers are dead. We will only outline from those ranges.
  LiveRegUnits LRU(getRegisterInfo());
  // True iff none of the "unsafe" registers (w16, w17, NZCV) is live at the
  // current liveness point.
  auto AreAllUnsafeRegsDead = [&LRU]() {
    return LRU.available(Reg: AArch64::W16) && LRU.available(Reg: AArch64::W17) &&
           LRU.available(Reg: AArch64::NZCV);
  };

  // We need to know if LR is live across an outlining boundary later on in
  // order to decide how we'll create the outlined call, frame, etc.
  //
  // It's pretty expensive to check this for *every candidate* within a block.
  // That's some potentially n^2 behaviour, since in the worst case, we'd need
  // to compute liveness from the end of the block for O(n) candidates within
  // the block.
  //
  // So, to improve the average case, let's keep track of liveness from the end
  // of the block to the beginning of *every outlinable range*. If we know that
  // LR is available in every range we could outline from, then we know that
  // we don't need to check liveness for any candidate within that range.
  bool LRAvailableEverywhere = true;
  // Compute liveness bottom-up.
  LRU.addLiveOuts(MBB);
  // Update flags that require info about the entire MBB.
  auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
    // Non-terminator calls mean the block may need an LR save/restore.
    if (MI.isCall() && !MI.isTerminator())
      Flags |= MachineOutlinerMBBFlags::HasCalls;
  };
  // Range: [RangeBegin, RangeEnd)
  MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
  unsigned RangeLen;
  // Reset the running range so that it starts (and currently ends) at
  // NewBegin, with no instructions accumulated yet.
  auto CreateNewRangeStartingAt =
      [&RangeBegin, &RangeEnd,
       &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
        RangeBegin = NewBegin;
        RangeEnd = std::next(x: RangeBegin);
        RangeLen = 0;
      };
  auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
    // At least one unsafe register is not dead. We do not want to outline at
    // this point. If it is long enough to outline from and does not cross a
    // bundle boundary, save the range [RangeBegin, RangeEnd).
    if (RangeLen <= 1)
      return;
    if (!RangeBegin.isEnd() && RangeBegin->isBundledWithPred())
      return;
    if (!RangeEnd.isEnd() && RangeEnd->isBundledWithPred())
      return;
    Ranges.emplace_back(Args&: RangeBegin, Args&: RangeEnd);
  };
  // Find the first point where all unsafe registers are dead.
  // FIND: <safe instr> <-- end of first potential range
  // SKIP: <unsafe def>
  // SKIP: ... everything between ...
  // SKIP: <unsafe use>
  auto FirstPossibleEndPt = MBB.instr_rbegin();
  for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
    LRU.stepBackward(MI: *FirstPossibleEndPt);
    // Update flags that impact how we outline across the entire block,
    // regardless of safety.
    UpdateWholeMBBFlags(*FirstPossibleEndPt);
    if (AreAllUnsafeRegsDead())
      break;
  }
  // If we exhausted the entire block, we have no safe ranges to outline.
  if (FirstPossibleEndPt == MBB.instr_rend())
    return Ranges;
  // Current range.
  CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
  // StartPt points to the first place where all unsafe registers
  // are dead (if there is any such point). Begin partitioning the MBB into
  // ranges.
  for (auto &MI : make_range(x: FirstPossibleEndPt, y: MBB.instr_rend())) {
    LRU.stepBackward(MI);
    UpdateWholeMBBFlags(MI);
    if (!AreAllUnsafeRegsDead()) {
      // MI makes an unsafe register live: close off the range accumulated so
      // far (if worthwhile) and start a fresh one just above MI.
      SaveRangeIfNonEmpty();
      CreateNewRangeStartingAt(MI.getIterator());
      continue;
    }
    // MI is safe: extend the current range upward (we iterate bottom-up) and
    // note whether LR was free at this point.
    LRAvailableEverywhere &= LRU.available(Reg: AArch64::LR);
    RangeBegin = MI.getIterator();
    ++RangeLen;
  }
  // Above loop misses the last (or only) range. If we are still safe, then
  // let's save the range.
  if (AreAllUnsafeRegsDead())
    SaveRangeIfNonEmpty();
  if (Ranges.empty())
    return Ranges;
  // We found the ranges bottom-up. Mapping expects the top-down. Reverse
  // the order.
  std::reverse(first: Ranges.begin(), last: Ranges.end());
  // If there is at least one outlinable range where LR is unavailable
  // somewhere, remember that.
  if (!LRAvailableEverywhere)
    Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
  return Ranges;
}
10574
/// Classify a single instruction for the machine outliner: Legal (may appear
/// anywhere in an outlined sequence), LegalTerminator (may only end one), or
/// Illegal (must never be outlined). The order of the checks below is
/// significant.
outliner::InstrType
AArch64InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
                                       MachineBasicBlock::iterator &MIT,
                                       unsigned Flags) const {
  MachineInstr &MI = *MIT;

  // Don't outline anything used for return address signing. The outlined
  // function will get signed later if needed
  switch (MI.getOpcode()) {
  case AArch64::PACM:
  case AArch64::PACIASP:
  case AArch64::PACIBSP:
  case AArch64::PACIASPPC:
  case AArch64::PACIBSPPC:
  case AArch64::AUTIASP:
  case AArch64::AUTIBSP:
  case AArch64::AUTIASPPCi:
  case AArch64::AUTIASPPCr:
  case AArch64::AUTIBSPPCi:
  case AArch64::AUTIBSPPCr:
  case AArch64::RETAA:
  case AArch64::RETAB:
  case AArch64::RETAASPPCi:
  case AArch64::RETAASPPCr:
  case AArch64::RETABSPPCi:
  case AArch64::RETABSPPCr:
  case AArch64::EMITBKEY:
  case AArch64::PAUTH_PROLOGUE:
  case AArch64::PAUTH_EPILOGUE:
    return outliner::InstrType::Illegal;
  }

  // We can only outline these if we will tail call the outlined function, or
  // fix up the CFI offsets. Currently, CFI instructions are outlined only if
  // in a tail call. (That restriction is enforced later, via the CFICount
  // check in the candidate-info computation, so Legal is correct here.)
  //
  // FIXME: If the proper fixups for the offset are implemented, this should be
  // possible.
  if (MI.isCFIInstruction())
    return outliner::InstrType::Legal;

  // Is this a terminator for a basic block?
  if (MI.isTerminator())
    // TargetInstrInfo::getOutliningType has already filtered out anything
    // that would break this, so we can allow it here.
    return outliner::InstrType::Legal;

  // Make sure none of the operands are un-outlinable.
  for (const MachineOperand &MOP : MI.operands()) {
    // A check preventing CFI indices was here before, but only CFI
    // instructions should have those.
    assert(!MOP.isCFIIndex());

    // If it uses LR or W30 explicitly, then don't touch it.
    if (MOP.isReg() && !MOP.isImplicit() &&
        (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
      return outliner::InstrType::Illegal;
  }

  // Special cases for instructions that can always be outlined, but will fail
  // the later tests. e.g, ADRPs, which are PC-relative use LR, but can always
  // be outlined because they don't require a *specific* value to be in LR.
  if (MI.getOpcode() == AArch64::ADRP)
    return outliner::InstrType::Legal;

  // If MI is a call we might be able to outline it. We don't want to outline
  // any calls that rely on the position of items on the stack. When we outline
  // something containing a call, we have to emit a save and restore of LR in
  // the outlined function. Currently, this always happens by saving LR to the
  // stack. Thus, if we outline, say, half the parameters for a function call
  // plus the call, then we'll break the callee's expectations for the layout
  // of the stack.
  //
  // FIXME: Allow calls to functions which construct a stack frame, as long
  // as they don't access arguments on the stack.
  // FIXME: Figure out some way to analyze functions defined in other modules.
  // We should be able to compute the memory usage based on the IR calling
  // convention, even if we can't see the definition.
  if (MI.isCall()) {
    // Get the function associated with the call. Look at each operand and find
    // the one that represents the callee and get its name.
    const Function *Callee = nullptr;
    for (const MachineOperand &MOP : MI.operands()) {
      if (MOP.isGlobal()) {
        Callee = dyn_cast<Function>(Val: MOP.getGlobal());
        break;
      }
    }

    // Never outline calls to mcount. There isn't any rule that would require
    // this, but the Linux kernel's "ftrace" feature depends on it.
    if (Callee && Callee->getName() == "\01_mcount")
      return outliner::InstrType::Illegal;

    // If we don't know anything about the callee, assume it depends on the
    // stack layout of the caller. In that case, it's only legal to outline
    // as a tail-call. Explicitly list the call instructions we know about so we
    // don't get unexpected results with call pseudo-instructions.
    auto UnknownCallOutlineType = outliner::InstrType::Illegal;
    if (MI.getOpcode() == AArch64::BLR ||
        MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
      UnknownCallOutlineType = outliner::InstrType::LegalTerminator;

    if (!Callee)
      return UnknownCallOutlineType;

    // We have a function we have information about. Check it if it's something
    // can safely outline.
    MachineFunction *CalleeMF = MMI.getMachineFunction(F: *Callee);

    // We don't know what's going on with the callee at all. Don't touch it.
    if (!CalleeMF)
      return UnknownCallOutlineType;

    // Check if we know anything about the callee saves on the function. If we
    // don't, then don't touch it, since that implies that we haven't
    // computed anything about its stack frame yet.
    MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
    if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
        MFI.getNumObjects() > 0)
      return UnknownCallOutlineType;

    // At this point, we can say that CalleeMF ought to not pass anything on the
    // stack. Therefore, we can outline it.
    return outliner::InstrType::Legal;
  }

  // Don't touch the link register or W30.
  if (MI.readsRegister(Reg: AArch64::W30, TRI: &getRegisterInfo()) ||
      MI.modifiesRegister(Reg: AArch64::W30, TRI: &getRegisterInfo()))
    return outliner::InstrType::Illegal;

  // Don't outline BTI instructions, because that will prevent the outlining
  // site from being indirectly callable.
  if (hasBTISemantics(MI))
    return outliner::InstrType::Illegal;

  return outliner::InstrType::Legal;
}
10714
10715void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
10716 for (MachineInstr &MI : MBB) {
10717 const MachineOperand *Base;
10718 TypeSize Width(0, false);
10719 int64_t Offset;
10720 bool OffsetIsScalable;
10721
10722 // Is this a load or store with an immediate offset with SP as the base?
10723 if (!MI.mayLoadOrStore() ||
10724 !getMemOperandWithOffsetWidth(LdSt: MI, BaseOp&: Base, Offset, OffsetIsScalable, Width,
10725 TRI: &RI) ||
10726 (Base->isReg() && Base->getReg() != AArch64::SP))
10727 continue;
10728
10729 // It is, so we have to fix it up.
10730 TypeSize Scale(0U, false);
10731 int64_t Dummy1, Dummy2;
10732
10733 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(LdSt&: MI);
10734 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
10735 getMemOpInfo(Opcode: MI.getOpcode(), Scale, Width, MinOffset&: Dummy1, MaxOffset&: Dummy2);
10736 assert(Scale != 0 && "Unexpected opcode!");
10737 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
10738
10739 // We've pushed the return address to the stack, so add 16 to the offset.
10740 // This is safe, since we already checked if it would overflow when we
10741 // checked if this instruction was legal to outline.
10742 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
10743 StackOffsetOperand.setImm(NewImm);
10744 }
10745}
10746
10747static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
10748 const AArch64InstrInfo *TII,
10749 bool ShouldSignReturnAddr) {
10750 if (!ShouldSignReturnAddr)
10751 return;
10752
10753 BuildMI(BB&: MBB, I: MBB.begin(), MIMD: DebugLoc(), MCID: TII->get(Opcode: AArch64::PAUTH_PROLOGUE))
10754 .setMIFlag(MachineInstr::FrameSetup);
10755 BuildMI(BB&: MBB, I: MBB.getFirstInstrTerminator(), MIMD: DebugLoc(),
10756 MCID: TII->get(Opcode: AArch64::PAUTH_EPILOGUE))
10757 .setMIFlag(MachineInstr::FrameDestroy);
10758}
10759
/// Build the frame for the outlined function living in \p MBB/\p MF: rewrite
/// a trailing call into a tail-call for thunks, insert an LR spill/reload
/// around the body when the body itself contains calls, emit a return when
/// the outlined function is not a tail call, and sign the return address if
/// required.
void AArch64InstrInfo::buildOutlinedFrame(
    MachineBasicBlock &MBB, MachineFunction &MF,
    const outliner::OutlinedFunction &OF) const {

  AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();

  if (OF.FrameConstructionID == MachineOutlinerTailCall)
    FI->setOutliningStyle("Tail Call");
  else if (OF.FrameConstructionID == MachineOutlinerThunk) {
    // For thunk outlining, rewrite the last instruction from a call to a
    // tail-call.
    MachineInstr *Call = &*--MBB.instr_end();
    unsigned TailOpcode;
    if (Call->getOpcode() == AArch64::BL) {
      TailOpcode = AArch64::TCRETURNdi;
    } else {
      // Indirect calls (BLR/BLRNoIP) become an indirect tail-call.
      assert(Call->getOpcode() == AArch64::BLR ||
             Call->getOpcode() == AArch64::BLRNoIP);
      TailOpcode = AArch64::TCRETURNriALL;
    }
    MachineInstr *TC = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: TailOpcode))
                           .add(MO: Call->getOperand(i: 0))
                           .addImm(Val: 0);
    MBB.insert(I: MBB.end(), MI: TC);
    Call->eraseFromParent();

    FI->setOutliningStyle("Thunk");
  }

  // Tracks whether the outlined body contains a (non-tail) call; used below
  // to decide whether the return address must be signed as "LR spilled".
  bool IsLeafFunction = true;

  // Is there a call in the outlined range?
  auto IsNonTailCall = [](const MachineInstr &MI) {
    return MI.isCall() && !MI.isReturn();
  };

  if (llvm::any_of(Range: MBB.instrs(), P: IsNonTailCall)) {
    // Fix up the instructions in the range, since we're going to modify the
    // stack.

    // Bugzilla ID: 46767
    // TODO: Check if fixing up twice is safe so we can outline these.
    assert(OF.FrameConstructionID != MachineOutlinerDefault &&
           "Can only fix up stack references once");
    fixupPostOutline(MBB);

    IsLeafFunction = false;

    // LR has to be a live in so that we can save it.
    if (!MBB.isLiveIn(Reg: AArch64::LR))
      MBB.addLiveIn(PhysReg: AArch64::LR);

    MachineBasicBlock::iterator It = MBB.begin();
    MachineBasicBlock::iterator Et = MBB.end();

    // For tail calls and thunks the final instruction must stay last, so the
    // LR reload goes right before it.
    if (OF.FrameConstructionID == MachineOutlinerTailCall ||
        OF.FrameConstructionID == MachineOutlinerThunk)
      Et = std::prev(x: MBB.end());

    // Insert a save before the outlined region: str lr, [sp, #-16]!
    MachineInstr *STRXpre = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::STRXpre))
                                .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
                                .addReg(RegNo: AArch64::LR)
                                .addReg(RegNo: AArch64::SP)
                                .addImm(Val: -16);
    It = MBB.insert(I: It, MI: STRXpre);

    if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
      CFIInstBuilder CFIBuilder(MBB, It, MachineInstr::FrameSetup);

      // Add a CFI saying the stack was moved 16 B down.
      CFIBuilder.buildDefCFAOffset(Offset: 16);

      // Add a CFI saying that the LR that we want to find is now 16 B higher
      // than before.
      CFIBuilder.buildOffset(Reg: AArch64::LR, Offset: -16);
    }

    // Insert a restore before the terminator for the function:
    // ldr lr, [sp], #16
    MachineInstr *LDRXpost = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::LDRXpost))
                                 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
                                 .addReg(RegNo: AArch64::LR, Flags: RegState::Define)
                                 .addReg(RegNo: AArch64::SP)
                                 .addImm(Val: 16);
    Et = MBB.insert(I: Et, MI: LDRXpost);
  }

  auto RASignCondition = FI->getSignReturnAddressCondition();
  bool ShouldSignReturnAddr = AArch64FunctionInfo::shouldSignReturnAddress(
      Condition: RASignCondition, IsLRSpilled: !IsLeafFunction);

  // If this is a tail call outlined function, then there's already a return.
  if (OF.FrameConstructionID == MachineOutlinerTailCall ||
      OF.FrameConstructionID == MachineOutlinerThunk) {
    signOutlinedFunction(MF, MBB, TII: this, ShouldSignReturnAddr);
    return;
  }

  // It's not a tail call, so we have to insert the return ourselves.

  // LR has to be a live in so that we can return to it.
  if (!MBB.isLiveIn(Reg: AArch64::LR))
    MBB.addLiveIn(PhysReg: AArch64::LR);

  MachineInstr *ret = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::RET))
                          .addReg(RegNo: AArch64::LR);
  MBB.insert(I: MBB.end(), MI: ret);

  signOutlinedFunction(MF, MBB, TII: this, ShouldSignReturnAddr);

  FI->setOutliningStyle("Function");

  // Did we have to modify the stack by saving the link register?
  if (OF.FrameConstructionID != MachineOutlinerDefault)
    return;

  // We modified the stack.
  // Walk over the basic block and fix up all the stack accesses.
  fixupPostOutline(MBB);
}
10880
10881MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
10882 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
10883 MachineFunction &MF, outliner::Candidate &C) const {
10884
10885 // Are we tail calling?
10886 if (C.CallConstructionID == MachineOutlinerTailCall) {
10887 // If yes, then we can just branch to the label.
10888 It = MBB.insert(I: It, MI: BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::TCRETURNdi))
10889 .addGlobalAddress(GV: M.getNamedValue(Name: MF.getName()))
10890 .addImm(Val: 0));
10891 return It;
10892 }
10893
10894 // Are we saving the link register?
10895 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
10896 C.CallConstructionID == MachineOutlinerThunk) {
10897 // No, so just insert the call.
10898 It = MBB.insert(I: It, MI: BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::BL))
10899 .addGlobalAddress(GV: M.getNamedValue(Name: MF.getName())));
10900 return It;
10901 }
10902
10903 // We want to return the spot where we inserted the call.
10904 MachineBasicBlock::iterator CallPt;
10905
10906 // Instructions for saving and restoring LR around the call instruction we're
10907 // going to insert.
10908 MachineInstr *Save;
10909 MachineInstr *Restore;
10910 // Can we save to a register?
10911 if (C.CallConstructionID == MachineOutlinerRegSave) {
10912 // FIXME: This logic should be sunk into a target-specific interface so that
10913 // we don't have to recompute the register.
10914 Register Reg = findRegisterToSaveLRTo(C);
10915 assert(Reg && "No callee-saved register available?");
10916
10917 // LR has to be a live in so that we can save it.
10918 if (!MBB.isLiveIn(Reg: AArch64::LR))
10919 MBB.addLiveIn(PhysReg: AArch64::LR);
10920
10921 // Save and restore LR from Reg.
10922 Save = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::ORRXrs), DestReg: Reg)
10923 .addReg(RegNo: AArch64::XZR)
10924 .addReg(RegNo: AArch64::LR)
10925 .addImm(Val: 0);
10926 Restore = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::ORRXrs), DestReg: AArch64::LR)
10927 .addReg(RegNo: AArch64::XZR)
10928 .addReg(RegNo: Reg)
10929 .addImm(Val: 0);
10930 } else {
10931 // We have the default case. Save and restore from SP.
10932 Save = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::STRXpre))
10933 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
10934 .addReg(RegNo: AArch64::LR)
10935 .addReg(RegNo: AArch64::SP)
10936 .addImm(Val: -16);
10937 Restore = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::LDRXpost))
10938 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
10939 .addReg(RegNo: AArch64::LR, Flags: RegState::Define)
10940 .addReg(RegNo: AArch64::SP)
10941 .addImm(Val: 16);
10942 }
10943
10944 It = MBB.insert(I: It, MI: Save);
10945 It++;
10946
10947 // Insert the call.
10948 It = MBB.insert(I: It, MI: BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::BL))
10949 .addGlobalAddress(GV: M.getNamedValue(Name: MF.getName())));
10950 CallPt = It;
10951 It++;
10952
10953 It = MBB.insert(I: It, MI: Restore);
10954 return CallPt;
10955}
10956
10957bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
10958 MachineFunction &MF) const {
10959 return MF.getFunction().hasMinSize();
10960}
10961
10962void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
10963 MachineBasicBlock::iterator Iter,
10964 DebugLoc &DL,
10965 bool AllowSideEffects) const {
10966 const MachineFunction &MF = *MBB.getParent();
10967 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
10968 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
10969
10970 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
10971 BuildMI(BB&: MBB, I: Iter, MIMD: DL, MCID: get(Opcode: AArch64::MOVZXi), DestReg: Reg).addImm(Val: 0).addImm(Val: 0);
10972 } else if (STI.isSVEorStreamingSVEAvailable()) {
10973 BuildMI(BB&: MBB, I: Iter, MIMD: DL, MCID: get(Opcode: AArch64::DUP_ZI_D), DestReg: Reg)
10974 .addImm(Val: 0)
10975 .addImm(Val: 0);
10976 } else if (STI.isNeonAvailable()) {
10977 BuildMI(BB&: MBB, I: Iter, MIMD: DL, MCID: get(Opcode: AArch64::MOVIv2d_ns), DestReg: Reg)
10978 .addImm(Val: 0);
10979 } else {
10980 // This is a streaming-compatible function without SVE. We don't have full
10981 // Neon (just FPRs), so we can at most use the first 64-bit sub-register.
10982 // So given `movi v..` would be illegal use `fmov d..` instead.
10983 assert(STI.hasNEON() && "Expected to have NEON.");
10984 Register Reg64 = TRI.getSubReg(Reg, Idx: AArch64::dsub);
10985 BuildMI(BB&: MBB, I: Iter, MIMD: DL, MCID: get(Opcode: AArch64::FMOVD0), DestReg: Reg64);
10986 }
10987}
10988
/// Recognize register-to-register moves spelled as ORR with a zero register,
/// returning the (destination, source) operand pair when \p MI is such a
/// copy, and std::nullopt otherwise.
std::optional<DestSourcePair>
AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {

  // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg
  // and zero immediate operands used as an alias for mov instruction.
  if (((MI.getOpcode() == AArch64::ORRWrs &&
        MI.getOperand(i: 1).getReg() == AArch64::WZR &&
        MI.getOperand(i: 3).getImm() == 0x0) ||
       (MI.getOpcode() == AArch64::ORRWrr &&
        MI.getOperand(i: 1).getReg() == AArch64::WZR)) &&
      // Check that the w->w move is not a zero-extending w->x mov.
      // Virtual registers: reject when the def writes through a sub-register
      // (the instruction then defines more than the visible 32 bits).
      (!MI.getOperand(i: 0).getReg().isVirtual() ||
       MI.getOperand(i: 0).getSubReg() == 0) &&
      // Physical registers: reject when the corresponding X register is also
      // listed as a def, i.e. the upper half is meaningfully written too.
      (!MI.getOperand(i: 0).getReg().isPhysical() ||
       MI.findRegisterDefOperandIdx(Reg: getXRegFromWReg(Reg: MI.getOperand(i: 0).getReg()),
                                    /*TRI=*/nullptr) == -1))
    return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 2)};

  // 64-bit form: orr Xd, xzr, Xm, lsl #0 is a plain x->x move.
  if (MI.getOpcode() == AArch64::ORRXrs &&
      MI.getOperand(i: 1).getReg() == AArch64::XZR &&
      MI.getOperand(i: 3).getImm() == 0x0)
    return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 2)};

  return std::nullopt;
}
11014
11015std::optional<DestSourcePair>
11016AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {
11017 if ((MI.getOpcode() == AArch64::ORRWrs &&
11018 MI.getOperand(i: 1).getReg() == AArch64::WZR &&
11019 MI.getOperand(i: 3).getImm() == 0x0) ||
11020 (MI.getOpcode() == AArch64::ORRWrr &&
11021 MI.getOperand(i: 1).getReg() == AArch64::WZR))
11022 return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 2)};
11023 return std::nullopt;
11024}
11025
// If MI is an immediate add/sub defining \p Reg, return the base register
// and the signed addend it applies; std::nullopt otherwise.
std::optional<RegImmPair>
AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
  // Sign becomes -1 for the SUB forms so the immediate is reported as a
  // signed addend.
  int Sign = 1;
  int64_t Offset = 0;

  // TODO: Handle cases where Reg is a super- or sub-register of the
  // destination register.
  const MachineOperand &Op0 = MI.getOperand(i: 0);
  if (!Op0.isReg() || Reg != Op0.getReg())
    return std::nullopt;

  switch (MI.getOpcode()) {
  default:
    return std::nullopt;
  case AArch64::SUBWri:
  case AArch64::SUBXri:
  case AArch64::SUBSWri:
  case AArch64::SUBSXri:
    Sign *= -1;
    [[fallthrough]];
  case AArch64::ADDSWri:
  case AArch64::ADDSXri:
  case AArch64::ADDWri:
  case AArch64::ADDXri: {
    // TODO: Third operand can be global address (usually some string).
    if (!MI.getOperand(i: 0).isReg() || !MI.getOperand(i: 1).isReg() ||
        !MI.getOperand(i: 2).isImm())
      return std::nullopt;
    // ADD/SUB immediate instructions may encode an optional LSL #12 on the
    // immediate in operand 3.
    int Shift = MI.getOperand(i: 3).getImm();
    assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
    Offset = Sign * (MI.getOperand(i: 2).getImm() << Shift);
  }
  }
  // Operand 1 is the base register the offset applies to.
  return RegImmPair{MI.getOperand(i: 1).getReg(), Offset};
}
11061
11062/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
11063/// the destination register then, if possible, describe the value in terms of
11064/// the source register.
/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
/// the destination register then, if possible, describe the value in terms of
/// the source register.
static std::optional<ParamLoadedValue>
describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
                       const TargetInstrInfo *TII,
                       const TargetRegisterInfo *TRI) {
  auto DestSrc = TII->isCopyLikeInstr(MI);
  if (!DestSrc)
    return std::nullopt;

  Register DestReg = DestSrc->Destination->getReg();
  Register SrcReg = DestSrc->Source->getReg();

  if (!DestReg.isValid() || !SrcReg.isValid())
    return std::nullopt;

  // The value is described by the source register itself, so the DWARF
  // expression is empty.
  auto Expr = DIExpression::get(Context&: MI.getMF()->getFunction().getContext(), Elements: {});

  // If the described register is the destination, just return the source.
  if (DestReg == DescribedReg)
    return ParamLoadedValue(MachineOperand::CreateReg(Reg: SrcReg, isDef: false), Expr);

  // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
  if (MI.getOpcode() == AArch64::ORRWrs &&
      TRI->isSuperRegister(RegA: DestReg, RegB: DescribedReg))
    return ParamLoadedValue(MachineOperand::CreateReg(Reg: SrcReg, isDef: false), Expr);

  // We may need to describe the lower part of a ORRXrs move.
  if (MI.getOpcode() == AArch64::ORRXrs &&
      TRI->isSubRegister(RegA: DestReg, RegB: DescribedReg)) {
    // Describe the W half of the destination with the W half of the source.
    Register SrcSubReg = TRI->getSubReg(Reg: SrcReg, Idx: AArch64::sub_32);
    return ParamLoadedValue(MachineOperand::CreateReg(Reg: SrcSubReg, isDef: false), Expr);
  }

  assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
         "Unhandled ORR[XW]rs copy case");

  return std::nullopt;
}
11102
bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
  // Functions cannot be split to different sections on AArch64 if they have
  // a red zone. This is because relaxing a cross-section branch may require
  // incrementing the stack pointer to spill a register, which would overwrite
  // the red zone.
  // An unknown red-zone status (std::nullopt) is treated conservatively as
  // having one.
  if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(u: true))
    return false;

  // Defer the remaining checks to the target-independent implementation.
  return TargetInstrInfo::isFunctionSafeToSplit(MF);
}
11113
11114bool AArch64InstrInfo::isMBBSafeToSplitToCold(
11115 const MachineBasicBlock &MBB) const {
11116 // Asm Goto blocks can contain conditional branches to goto labels, which can
11117 // get moved out of range of the branch instruction.
11118 auto isAsmGoto = [](const MachineInstr &MI) {
11119 return MI.getOpcode() == AArch64::INLINEASM_BR;
11120 };
11121 if (llvm::any_of(Range: MBB, P: isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
11122 return false;
11123
11124 // Because jump tables are label-relative instead of table-relative, they all
11125 // must be in the same section or relocation fixup handling will fail.
11126
11127 // Check if MBB is a jump table target
11128 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
11129 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
11130 return llvm::is_contained(Range: JTE.MBBs, Element: &MBB);
11131 };
11132 if (MJTI != nullptr && llvm::any_of(Range: MJTI->getJumpTables(), P: containsMBB))
11133 return false;
11134
11135 // Check if MBB contains a jump table lookup
11136 for (const MachineInstr &MI : MBB) {
11137 switch (MI.getOpcode()) {
11138 case TargetOpcode::G_BRJT:
11139 case AArch64::JumpTableDest32:
11140 case AArch64::JumpTableDest16:
11141 case AArch64::JumpTableDest8:
11142 return false;
11143 default:
11144 continue;
11145 }
11146 }
11147
11148 // MBB isn't a special case, so it's safe to be split to the cold section.
11149 return true;
11150}
11151
// Describe the value loaded into \p Reg by \p MI for debug-entry-value
// purposes; handles the AArch64 MOVZ and ORR-move special cases before
// deferring to the generic implementation.
std::optional<ParamLoadedValue>
AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
                                      Register Reg) const {
  const MachineFunction *MF = MI.getMF();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  switch (MI.getOpcode()) {
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: {
    // MOVZWi may be used for producing zero-extended 32-bit immediates in
    // 64-bit parameters, so we need to consider super-registers.
    if (!TRI->isSuperRegisterEq(RegA: MI.getOperand(i: 0).getReg(), RegB: Reg))
      return std::nullopt;

    if (!MI.getOperand(i: 1).isImm())
      return std::nullopt;
    // The loaded value is the immediate shifted into its final position.
    int64_t Immediate = MI.getOperand(i: 1).getImm();
    int Shift = MI.getOperand(i: 2).getImm();
    return ParamLoadedValue(MachineOperand::CreateImm(Val: Immediate << Shift),
                            nullptr);
  }
  case AArch64::ORRWrs:
  case AArch64::ORRXrs:
    // ORR with a zero register is a move; describe it via the source.
    return describeORRLoadedValue(MI, DescribedReg: Reg, TII: this, TRI);
  }

  return TargetInstrInfo::describeLoadedValue(MI, Reg);
}
11179
11180bool AArch64InstrInfo::isExtendLikelyToBeFolded(
11181 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
11182 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
11183 ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
11184 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
11185
11186 // Anyexts are nops.
11187 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
11188 return true;
11189
11190 Register DefReg = ExtMI.getOperand(i: 0).getReg();
11191 if (!MRI.hasOneNonDBGUse(RegNo: DefReg))
11192 return false;
11193
11194 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
11195 // addressing mode.
11196 auto *UserMI = &*MRI.use_instr_nodbg_begin(RegNo: DefReg);
11197 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
11198}
11199
11200uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
11201 return get(Opcode: Opc).TSFlags & AArch64::ElementSizeMask;
11202}
11203
11204bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
11205 return get(Opcode: Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
11206}
11207
11208bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
11209 return get(Opcode: Opc).TSFlags & AArch64::InstrFlagIsWhile;
11210}
11211
11212unsigned int
11213AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
11214 return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
11215}
11216
// Return true if a load/store of NumBytes can encode the given addressing
// mode: either base + immediate (Scale == 0) or base + scaled register.
bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
                                             unsigned Scale) const {
  // A mode cannot combine an immediate offset with a scaled register.
  if (Offset && Scale)
    return false;

  // Check Reg + Imm
  if (!Scale) {
    // 9-bit signed offset
    if (isInt<9>(x: Offset))
      return true;

    // 12-bit unsigned offset
    unsigned Shift = Log2_64(Value: NumBytes);
    if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
        // Must be a multiple of NumBytes (NumBytes is a power of 2)
        (Offset >> Shift) << Shift == Offset)
      return true;
    return false;
  }

  // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
  return Scale == 1 || (Scale > 0 && Scale == NumBytes);
}
11240
11241unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
11242 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
11243 return AArch64::BLRNoIP;
11244 else
11245 return AArch64::BLR;
11246}
11247
// Emit a stack-probing loop that lowers SP to TargetReg in ProbeSize chunks,
// touching each page on the way down:
//
//   LoopTest: SUB  SP, SP, #ProbeSize
//             CMP  SP, TargetReg
//             B.LE Exit
//   LoopBody: LDR  XZR, [SP]         ; probe
//             B    LoopTest
//   Exit:     MOV  SP, TargetReg
//             LDR  XZR, [SP]         ; probe the final page
//
// Returns an iterator to the first instruction after the sequence (in the
// new exit block).
MachineBasicBlock::iterator
AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
                                   Register TargetReg, bool FrameSetup) const {
  assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");

  MachineBasicBlock &MBB = *MBBI->getParent();
  MachineFunction &MF = *MBB.getParent();
  const AArch64InstrInfo *TII =
      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
  int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
  DebugLoc DL = MBB.findDebugLoc(MBBI);

  // Split the control flow into test, body and exit blocks inserted right
  // after MBB.
  MachineFunction::iterator MBBInsertPoint = std::next(x: MBB.getIterator());
  MachineBasicBlock *LoopTestMBB =
      MF.CreateMachineBasicBlock(BB: MBB.getBasicBlock());
  MF.insert(MBBI: MBBInsertPoint, MBB: LoopTestMBB);
  MachineBasicBlock *LoopBodyMBB =
      MF.CreateMachineBasicBlock(BB: MBB.getBasicBlock());
  MF.insert(MBBI: MBBInsertPoint, MBB: LoopBodyMBB);
  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(BB: MBB.getBasicBlock());
  MF.insert(MBBI: MBBInsertPoint, MBB: ExitMBB);
  MachineInstr::MIFlag Flags =
      FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;

  // LoopTest:
  //   SUB SP, SP, #ProbeSize
  emitFrameOffset(MBB&: *LoopTestMBB, MBBI: LoopTestMBB->end(), DL, DestReg: AArch64::SP,
                  SrcReg: AArch64::SP, Offset: StackOffset::getFixed(Fixed: -ProbeSize), TII, Flag: Flags);

  // CMP SP, TargetReg
  BuildMI(BB&: *LoopTestMBB, I: LoopTestMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::SUBSXrx64),
          DestReg: AArch64::XZR)
      .addReg(RegNo: AArch64::SP)
      .addReg(RegNo: TargetReg)
      .addImm(Val: AArch64_AM::getArithExtendImm(ET: AArch64_AM::UXTX, Imm: 0))
      .setMIFlags(Flags);

  // B.<Cond> LoopExit
  BuildMI(BB&: *LoopTestMBB, I: LoopTestMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::Bcc))
      .addImm(Val: AArch64CC::LE)
      .addMBB(MBB: ExitMBB)
      .setMIFlags(Flags);

  // LDR XZR, [SP]
  // Volatile load that touches the page so the OS can grow the stack.
  BuildMI(BB&: *LoopBodyMBB, I: LoopBodyMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::LDRXui))
      .addDef(RegNo: AArch64::XZR)
      .addReg(RegNo: AArch64::SP)
      .addImm(Val: 0)
      .addMemOperand(MMO: MF.getMachineMemOperand(
          PtrInfo: MachinePointerInfo::getUnknownStack(MF),
          F: MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, Size: 8,
          BaseAlignment: Align(8)))
      .setMIFlags(Flags);

  // B loop
  BuildMI(BB&: *LoopBodyMBB, I: LoopBodyMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::B))
      .addMBB(MBB: LoopTestMBB)
      .setMIFlags(Flags);

  // LoopExit:
  // MOV SP, TargetReg
  BuildMI(BB&: *ExitMBB, I: ExitMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::ADDXri), DestReg: AArch64::SP)
      .addReg(RegNo: TargetReg)
      .addImm(Val: 0)
      .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0))
      .setMIFlags(Flags);

  // LDR XZR, [SP]
  BuildMI(BB&: *ExitMBB, I: ExitMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::LDRXui))
      .addReg(RegNo: AArch64::XZR, Flags: RegState::Define)
      .addReg(RegNo: AArch64::SP)
      .addImm(Val: 0)
      .setMIFlags(Flags);

  // Move everything after the original insertion point into the exit block
  // and rewire the CFG edges for the new loop.
  ExitMBB->splice(Where: ExitMBB->end(), Other: &MBB, From: std::next(x: MBBI), To: MBB.end());
  ExitMBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);

  LoopTestMBB->addSuccessor(Succ: ExitMBB);
  LoopTestMBB->addSuccessor(Succ: LoopBodyMBB);
  LoopBodyMBB->addSuccessor(Succ: LoopTestMBB);
  MBB.addSuccessor(Succ: LoopTestMBB);

  // Update liveins.
  if (MF.getRegInfo().reservedRegsFrozen())
    fullyRecomputeLiveIns(MBBs: {ExitMBB, LoopBodyMBB, LoopTestMBB});

  return ExitMBB->begin();
}
11336
namespace {
/// Pipeliner bookkeeping for a single-basic-block AArch64 loop whose
/// back-branch is a Bcc driven by a compare (Comp) on an induction variable
/// updated by a single instruction (Update). Built by
/// AArch64InstrInfo::analyzeLoopForPipelining.
class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
  MachineFunction *MF;
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MachineRegisterInfo &MRI;

  /// The block of the loop
  MachineBasicBlock *LoopBB;
  /// The conditional branch of the loop
  MachineInstr *CondBranch;
  /// The compare instruction for loop control
  MachineInstr *Comp;
  /// The number of the operand of the loop counter value in Comp
  unsigned CompCounterOprNum;
  /// The instruction that updates the loop counter value
  MachineInstr *Update;
  /// The number of the operand of the loop counter value in Update
  unsigned UpdateCounterOprNum;
  /// The initial value of the loop counter
  Register Init;
  /// True iff Update is a predecessor of Comp
  bool IsUpdatePriorComp;

  /// The normalized condition used by createTripCountGreaterCondition()
  SmallVector<MachineOperand, 4> Cond;

public:
  AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
                           MachineInstr *Comp, unsigned CompCounterOprNum,
                           MachineInstr *Update, unsigned UpdateCounterOprNum,
                           Register Init, bool IsUpdatePriorComp,
                           const SmallVectorImpl<MachineOperand> &Cond)
      : MF(Comp->getParent()->getParent()),
        TII(MF->getSubtarget().getInstrInfo()),
        TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
        LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
        CompCounterOprNum(CompCounterOprNum), Update(Update),
        UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
        IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}

  bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
    // Make the instructions for loop control be placed in stage 0.
    // The predecessors of Comp are considered by the caller.
    return MI == Comp;
  }

  std::optional<bool> createTripCountGreaterCondition(
      int TC, MachineBasicBlock &MBB,
      SmallVectorImpl<MachineOperand> &CondParam) override {
    // A branch instruction will be inserted as "if (Cond) goto epilogue".
    // Cond is normalized for such use.
    // The predecessors of the branch are assumed to have already been inserted.
    CondParam = Cond;
    return {};
  }

  void createRemainingIterationsGreaterCondition(
      int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
      DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;

  // No extra work is needed when the preheader changes.
  void setPreheader(MachineBasicBlock *NewPreheader) override {}

  // The trip count is not tracked explicitly, so there is nothing to adjust.
  void adjustTripCount(int TripCountAdjust) override {}

  bool isMVEExpanderSupported() override { return true; }
};
} // namespace
11405
/// Clone an instruction from MI. The register of ReplaceOprNum-th operand
/// is replaced by ReplaceReg. The output register is newly created.
/// The other operands are unchanged from MI.
/// Returns the new def register, or 0 (invalid) when operand 0 of the clone
/// is not a virtual register.
static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
                           Register ReplaceReg, MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator InsertTo) {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
  MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(Orig: MI);
  Register Result = 0;
  for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
    if (I == 0 && NewMI->getOperand(i: 0).getReg().isVirtual()) {
      // Give the clone a fresh def register of the same class.
      Result = MRI.createVirtualRegister(
          RegClass: MRI.getRegClass(Reg: NewMI->getOperand(i: 0).getReg()));
      NewMI->getOperand(i: I).setReg(Result);
    } else if (I == ReplaceOprNum) {
      // Constrain the replacement register to the class the operand requires.
      MRI.constrainRegClass(Reg: ReplaceReg, RC: TII->getRegClass(MCID: NewMI->getDesc(), OpNum: I));
      NewMI->getOperand(i: I).setReg(ReplaceReg);
    }
  }
  MBB.insert(I: InsertTo, MI: NewMI);
  return Result;
}
11429
void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
    int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
    DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {
  // Create and accumulate conditions for next TC iterations.
  // Example:
  //   SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
  //                                          # iteration of the kernel
  //
  //   # insert the following instructions
  //   cond = CSINCXr 0, 0, C, implicit $nzcv
  //   counter = ADDXri counter, 1            # clone from this->Update
  //   SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
  //   cond = CSINCXr cond, cond, C, implicit $nzcv
  //   ... (repeat TC times)
  //   SUBSXri cond, 0, implicit-def $nzcv

  assert(CondBranch->getOpcode() == AArch64::Bcc);
  // CondCode to exit the loop
  AArch64CC::CondCode CC =
      (AArch64CC::CondCode)CondBranch->getOperand(i: 0).getImm();
  // If the branch jumps back into the loop, the exit condition is the
  // inverse of the branch condition.
  if (CondBranch->getOperand(i: 1).getMBB() == LoopBB)
    CC = AArch64CC::getInvertedCondCode(Code: CC);

  // Accumulate conditions to exit the loop
  Register AccCond = AArch64::XZR;

  // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
  auto AccumulateCond = [&](Register CurCond,
                            AArch64CC::CondCode CC) -> Register {
    Register NewCond = MRI.createVirtualRegister(RegClass: &AArch64::GPR64commonRegClass);
    BuildMI(BB&: MBB, I: MBB.end(), MIMD: Comp->getDebugLoc(), MCID: TII->get(Opcode: AArch64::CSINCXr))
        .addReg(RegNo: NewCond, Flags: RegState::Define)
        .addReg(RegNo: CurCond)
        .addReg(RegNo: CurCond)
        .addImm(Val: AArch64CC::getInvertedCondCode(Code: CC));
    return NewCond;
  };

  if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
    // Update and Comp for I==0 are already exists in MBB
    // (MBB is an unrolled kernel)
    Register Counter;
    for (int I = 0; I <= TC; ++I) {
      Register NextCounter;
      // For I==0 the compare already exists; only clone it from I==1 on.
      if (I != 0)
        NextCounter =
            cloneInstr(MI: Comp, ReplaceOprNum: CompCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());

      AccCond = AccumulateCond(AccCond, CC);

      if (I != TC) {
        if (I == 0) {
          if (Update != Comp && IsUpdatePriorComp) {
            // The value the existing compare read is the pre-update counter;
            // re-apply the update to advance it.
            Counter =
                LastStage0Insts[Comp]->getOperand(i: CompCounterOprNum).getReg();
            NextCounter = cloneInstr(MI: Update, ReplaceOprNum: UpdateCounterOprNum, ReplaceReg: Counter, MBB,
                                     InsertTo: MBB.end());
          } else {
            // can use already calculated value
            NextCounter = LastStage0Insts[Update]->getOperand(i: 0).getReg();
          }
        } else if (Update != Comp) {
          NextCounter =
              cloneInstr(MI: Update, ReplaceOprNum: UpdateCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
        }
      }
      Counter = NextCounter;
    }
  } else {
    Register Counter;
    if (LastStage0Insts.empty()) {
      // use initial counter value (testing if the trip count is sufficient to
      // be executed by pipelined code)
      Counter = Init;
      if (IsUpdatePriorComp)
        Counter =
            cloneInstr(MI: Update, ReplaceOprNum: UpdateCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
    } else {
      // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
      Counter = LastStage0Insts[Comp]->getOperand(i: CompCounterOprNum).getReg();
    }

    // Clone the compare (and update) chain TC+1 times, accumulating the exit
    // condition after each compare.
    for (int I = 0; I <= TC; ++I) {
      Register NextCounter;
      NextCounter =
          cloneInstr(MI: Comp, ReplaceOprNum: CompCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
      AccCond = AccumulateCond(AccCond, CC);
      if (I != TC && Update != Comp)
        NextCounter =
            cloneInstr(MI: Update, ReplaceOprNum: UpdateCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
      Counter = NextCounter;
    }
  }

  // If AccCond == 0, the remainder is greater than TC.
  BuildMI(BB&: MBB, I: MBB.end(), MIMD: Comp->getDebugLoc(), MCID: TII->get(Opcode: AArch64::SUBSXri))
      .addReg(RegNo: AArch64::XZR, Flags: RegState::Define | RegState::Dead)
      .addReg(RegNo: AccCond)
      .addImm(Val: 0)
      .addImm(Val: 0);
  Cond.clear();
  Cond.push_back(Elt: MachineOperand::CreateImm(Val: AArch64CC::EQ));
}
11533
11534static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
11535 Register &RegMBB, Register &RegOther) {
11536 assert(Phi.getNumOperands() == 5);
11537 if (Phi.getOperand(i: 2).getMBB() == MBB) {
11538 RegMBB = Phi.getOperand(i: 1).getReg();
11539 RegOther = Phi.getOperand(i: 3).getReg();
11540 } else {
11541 assert(Phi.getOperand(4).getMBB() == MBB);
11542 RegMBB = Phi.getOperand(i: 3).getReg();
11543 RegOther = Phi.getOperand(i: 1).getReg();
11544 }
11545}
11546
11547static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {
11548 if (!Reg.isVirtual())
11549 return false;
11550 const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11551 return MRI.getVRegDef(Reg)->getParent() != BB;
11552}
11553
/// If Reg is an induction variable, return true and set some parameters
static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
                          MachineInstr *&UpdateInst,
                          unsigned &UpdateCounterOprNum, Register &InitReg,
                          bool &IsUpdatePriorComp) {
  // Example:
  //
  // Preheader:
  //   InitReg = ...
  // LoopBB:
  //   Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
  //   Reg = COPY Reg0 ; COPY is ignored.
  //   Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
  //                     ; Reg is the value calculated in the previous
  //                     ; iteration, so IsUpdatePriorComp == false.

  // Require the canonical single-preheader/single-latch shape.
  if (LoopBB->pred_size() != 2)
    return false;
  if (!Reg.isVirtual())
    return false;
  const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
  UpdateInst = nullptr;
  UpdateCounterOprNum = 0;
  InitReg = 0;
  IsUpdatePriorComp = true;
  // Walk the def-use cycle backwards from Reg until we return to Reg; along
  // the way we must see exactly one PHI and exactly one update instruction.
  Register CurReg = Reg;
  while (true) {
    MachineInstr *Def = MRI.getVRegDef(Reg: CurReg);
    if (Def->getParent() != LoopBB)
      return false;
    if (Def->isCopy()) {
      // Ignore copy instructions unless they contain subregisters
      if (Def->getOperand(i: 0).getSubReg() || Def->getOperand(i: 1).getSubReg())
        return false;
      CurReg = Def->getOperand(i: 1).getReg();
    } else if (Def->isPHI()) {
      // A second PHI means this is not a simple induction cycle.
      if (InitReg != 0)
        return false;
      // Hitting the PHI before the update means the compare reads the
      // previous iteration's value.
      if (!UpdateInst)
        IsUpdatePriorComp = false;
      extractPhiReg(Phi: *Def, MBB: LoopBB, RegMBB&: CurReg, RegOther&: InitReg);
    } else {
      if (UpdateInst)
        return false;
      switch (Def->getOpcode()) {
      case AArch64::ADDSXri:
      case AArch64::ADDSWri:
      case AArch64::SUBSXri:
      case AArch64::SUBSWri:
      case AArch64::ADDXri:
      case AArch64::ADDWri:
      case AArch64::SUBXri:
      case AArch64::SUBWri:
        // Immediate add/sub: the counter is always operand 1.
        UpdateInst = Def;
        UpdateCounterOprNum = 1;
        break;
      case AArch64::ADDSXrr:
      case AArch64::ADDSWrr:
      case AArch64::SUBSXrr:
      case AArch64::SUBSWrr:
      case AArch64::ADDXrr:
      case AArch64::ADDWrr:
      case AArch64::SUBXrr:
      case AArch64::SUBWrr:
        // Register add/sub: the counter is whichever operand is defined
        // inside the loop; the other must be loop-invariant.
        UpdateInst = Def;
        if (isDefinedOutside(Reg: Def->getOperand(i: 2).getReg(), BB: LoopBB))
          UpdateCounterOprNum = 1;
        else if (isDefinedOutside(Reg: Def->getOperand(i: 1).getReg(), BB: LoopBB))
          UpdateCounterOprNum = 2;
        else
          return false;
        break;
      default:
        return false;
      }
      CurReg = Def->getOperand(i: UpdateCounterOprNum).getReg();
    }

    if (!CurReg.isVirtual())
      return false;
    // Arrived back at the starting register: the cycle is closed.
    if (Reg == CurReg)
      break;
  }

  if (!UpdateInst)
    return false;

  return true;
}
11643
std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
  // Accept loops that meet the following conditions
  // * The conditional branch is BCC
  // * The compare instruction is ADDS/SUBS/WHILEXX
  // * One operand of the compare is an induction variable and the other is a
  //   loop invariant value
  // * The induction variable is incremented/decremented by a single instruction
  // * Does not contain CALL or instructions which have unmodeled side effects

  for (MachineInstr &MI : *LoopBB)
    if (MI.isCall() || MI.hasUnmodeledSideEffects())
      // This instruction may use NZCV, which interferes with the instruction to
      // be inserted for loop control.
      return nullptr;

  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  // Reject loops whose terminators cannot be analyzed.
  if (analyzeBranch(MBB&: *LoopBB, TBB, FBB, Cond))
    return nullptr;

  // Infinite loops are not supported
  if (TBB == LoopBB && FBB == LoopBB)
    return nullptr;

  // Must be conditional branch
  if (TBB != LoopBB && FBB == nullptr)
    return nullptr;

  assert((TBB == LoopBB || FBB == LoopBB) &&
         "The Loop must be a single-basic-block loop");

  MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
  const TargetRegisterInfo &TRI = getRegisterInfo();

  if (CondBranch->getOpcode() != AArch64::Bcc)
    return nullptr;

  // Normalization for createTripCountGreaterCondition()
  if (TBB == LoopBB)
    reverseBranchCondition(Cond);

  // Find the last NZCV-writing instruction: it must be the loop-control
  // compare.
  MachineInstr *Comp = nullptr;
  unsigned CompCounterOprNum = 0;
  for (MachineInstr &MI : reverse(C&: *LoopBB)) {
    if (MI.modifiesRegister(Reg: AArch64::NZCV, TRI: &TRI)) {
      // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
      // operands is a loop invariant value

      switch (MI.getOpcode()) {
      case AArch64::SUBSXri:
      case AArch64::SUBSWri:
      case AArch64::ADDSXri:
      case AArch64::ADDSWri:
        // Immediate compare: the counter is operand 1.
        Comp = &MI;
        CompCounterOprNum = 1;
        break;
      case AArch64::ADDSWrr:
      case AArch64::ADDSXrr:
      case AArch64::SUBSWrr:
      case AArch64::SUBSXrr:
        // Register compare: the counter operand is determined below.
        Comp = &MI;
        break;
      default:
        if (isWhileOpcode(Opc: MI.getOpcode())) {
          Comp = &MI;
          break;
        }
        return nullptr;
      }

      // For register compares, the counter is the operand defined inside the
      // loop; the other operand must be loop-invariant.
      if (CompCounterOprNum == 0) {
        if (isDefinedOutside(Reg: Comp->getOperand(i: 1).getReg(), BB: LoopBB))
          CompCounterOprNum = 2;
        else if (isDefinedOutside(Reg: Comp->getOperand(i: 2).getReg(), BB: LoopBB))
          CompCounterOprNum = 1;
        else
          return nullptr;
      }
      break;
    }
  }
  if (!Comp)
    return nullptr;

  MachineInstr *Update = nullptr;
  Register Init;
  bool IsUpdatePriorComp;
  unsigned UpdateCounterOprNum;
  if (!getIndVarInfo(Reg: Comp->getOperand(i: CompCounterOprNum).getReg(), LoopBB,
                     UpdateInst&: Update, UpdateCounterOprNum, InitReg&: Init, IsUpdatePriorComp))
    return nullptr;

  return std::make_unique<AArch64PipelinerLoopInfo>(
      args&: LoopBB, args&: CondBranch, args&: Comp, args&: CompCounterOprNum, args&: Update, args&: UpdateCounterOprNum,
      args&: Init, args&: IsUpdatePriorComp, args&: Cond);
}
11741
/// verifyInstruction - Perform target specific instruction verification.
bool AArch64InstrInfo::verifyInstruction(const MachineInstr &MI,
                                         StringRef &ErrInfo) const {
  // Verify that immediate offsets on load/store instructions are within range.
  // Stack objects with an FI operand are excluded as they can be fixed up
  // during PEI.
  TypeSize Scale(0U, false), Width(0U, false);
  int64_t MinOffset, MaxOffset;
  if (getMemOpInfo(Opcode: MI.getOpcode(), Scale, Width, MinOffset, MaxOffset)) {
    unsigned ImmIdx = getLoadStoreImmIdx(Opc: MI.getOpcode());
    // The operand preceding the immediate is the base; skip frame indices.
    if (MI.getOperand(i: ImmIdx).isImm() && !MI.getOperand(i: ImmIdx - 1).isFI()) {
      int64_t Imm = MI.getOperand(i: ImmIdx).getImm();
      if (Imm < MinOffset || Imm > MaxOffset) {
        ErrInfo = "Unexpected immediate on load/store instruction";
        return false;
      }
    }
  }

  // Verify operand-type-specific constraints declared in the instruction
  // descriptions.
  const MCInstrDesc &MCID = MI.getDesc();
  for (unsigned Op = 0; Op < MCID.getNumOperands(); Op++) {
    const MachineOperand &MO = MI.getOperand(i: Op);
    switch (MCID.operands()[Op].OperandType) {
    case AArch64::OPERAND_IMPLICIT_IMM_0:
      if (!MO.isImm() || MO.getImm() != 0) {
        ErrInfo = "OPERAND_IMPLICIT_IMM_0 should be 0";
        return false;
      }
      break;
    case AArch64::OPERAND_SHIFT_MSL:
      // MSL shifts are only encodable with amounts 8 or 16.
      if (!MO.isImm() ||
          AArch64_AM::getShiftType(Imm: MO.getImm()) != AArch64_AM::MSL ||
          (AArch64_AM::getShiftValue(Imm: MO.getImm()) != 8 &&
           AArch64_AM::getShiftValue(Imm: MO.getImm()) != 16)) {
        ErrInfo = "OPERAND_SHIFT_MSL should be msl shift of 8 or 16";
        return false;
      }
      break;
    default:
      break;
    }
  }
  return true;
}
11786
11787#define GET_INSTRINFO_HELPERS
11788#define GET_INSTRMAP_INFO
11789#include "AArch64GenInstrInfo.inc"
11790