//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64ExpandImm.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PointerAuth.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/CFIInstBuilder.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>
using namespace llvm;

#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"

#define DEBUG_TYPE "AArch64InstrInfo"

STATISTIC(NumCopyInstrs, "Number of COPY instructions expanded");
STATISTIC(NumZCRegMoveInstrsGPR, "Number of zero-cycle GPR register move "
                                 "instructions expanded from canonical COPY");
STATISTIC(NumZCRegMoveInstrsFPR, "Number of zero-cycle FPR register move "
                                 "instructions expanded from canonical COPY");
STATISTIC(NumZCZeroingInstrsGPR, "Number of zero-cycle GPR zeroing "
                                 "instructions expanded from canonical COPY");
// NumZCZeroingInstrsFPR is counted in AArch64AsmPrinter.

static cl::opt<unsigned>
    CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9),
                       cl::desc("Restrict range of CB instructions (DEBUG)"));

static cl::opt<unsigned> TBZDisplacementBits(
    "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
    cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned> CBZDisplacementBits(
    "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
    cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned>
    BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
                        cl::desc("Restrict range of Bcc instructions (DEBUG)"));

static cl::opt<unsigned>
    BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
                      cl::desc("Restrict range of B instructions (DEBUG)"));

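// Note: the defaults above correspond to the architectural displacement
// ranges of each branch instruction; per the (DEBUG) markers, these options
// exist mainly to stress-test branch relaxation with artificially small
// ranges.
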
static cl::opt<unsigned> GatherOptSearchLimit(
    "aarch64-search-limit", cl::Hidden, cl::init(2048),
    cl::desc("Restrict range of instructions to search for the "
             "machine-combiner gather pattern optimization"));

AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
    : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN,
                          AArch64::ADJCALLSTACKUP, AArch64::CATCHRET),
      RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}

/// getInstSizeInBytes - Return the number of bytes of code the specified
/// instruction may occupy. This returns the maximum number of bytes.
unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  const MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction *MF = MBB.getParent();
  const Function &F = MF->getFunction();
  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();

  {
    auto Op = MI.getOpcode();
    if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
      return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
  }

  // Meta-instructions emit no code.
  if (MI.isMetaInstruction())
    return 0;

  // FIXME: We currently only handle pseudoinstructions that don't get expanded
  // before the assembly printer.
  unsigned NumBytes = 0;
  const MCInstrDesc &Desc = MI.getDesc();

  if (!MI.isBundle() && isTailCallReturnInst(MI)) {
    NumBytes = Desc.getSize() ? Desc.getSize() : 4;

    const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
    if (!MFI->shouldSignReturnAddress(*MF))
      return NumBytes;

    const auto &STI = MF->getSubtarget<AArch64Subtarget>();
    auto Method = STI.getAuthenticatedLRCheckMethod(*MF);
    NumBytes += AArch64PAuth::getCheckerSizeInBytes(Method);
    return NumBytes;
  }

  // The size is preferably set in llvm/lib/Target/AArch64/AArch64InstrInfo.td
  // (the default case below); the remaining cases handle instructions of
  // variable size.
  switch (Desc.getOpcode()) {
  default:
    if (Desc.getSize())
      return Desc.getSize();

    // Anything not explicitly designated otherwise (i.e. pseudo-instructions
    // with fixed constant size but not specified in .td file) is a normal
    // 4-byte insn.
    NumBytes = 4;
    break;
  case TargetOpcode::STACKMAP:
    // The upper bound for a stackmap intrinsic is the full length of its shadow
    NumBytes = StackMapOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::PATCHPOINT:
    // The size of the patchpoint intrinsic is the number of bytes requested
    NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::STATEPOINT:
    NumBytes = StatepointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    // No patch bytes means a normal call inst is emitted
    if (NumBytes == 0)
      NumBytes = 4;
    break;
  case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
    // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
    // instructions are expanded to the specified number of NOPs. Otherwise,
    // they are expanded to 36-byte XRay sleds.
    NumBytes =
        F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
    break;
  case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
  case TargetOpcode::PATCHABLE_TAIL_CALL:
  case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
    // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
    NumBytes = 36;
    break;
  case TargetOpcode::PATCHABLE_EVENT_CALL:
    // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
    NumBytes = 24;
    break;

  case AArch64::SPACE:
    NumBytes = MI.getOperand(1).getImm();
    break;
  case TargetOpcode::BUNDLE:
    NumBytes = getInstBundleLength(MI);
    break;
  }

  return NumBytes;
}

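/// Compute the size of an instruction bundle by summing the sizes of all
/// instructions inside it, excluding the BUNDLE header itself.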
unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
  unsigned Size = 0;
  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
  while (++I != E && I->isInsideBundle()) {
    assert(!I->isBundle() && "No nested bundle!");
    Size += getInstSizeInBytes(*I);
  }
  return Size;
}

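// Decode a conditional branch terminator into the Cond vector consumed by
// analyzeBranch() and friends. The encodings produced here are:
//   Bcc:                          { CC }
//   CBZ/CBNZ:                     { -1, Opcode, Reg }
//   TBZ/TBNZ:                     { -1, Opcode, Reg, BitIndex }
//   CBWPri/CBXPri/CBWPrr/CBXPrr:  { -1, Opcode, CC, Op0, Op1 }
//   CBBAssertExt/CBHAssertExt:    { -1, Opcode, CC, Op0, Op1, Ext0, Ext1 }
// reverseBranchCondition() and instantiateCondBranch() rely on this layout.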
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
                            SmallVectorImpl<MachineOperand> &Cond) {
  // Block ends with fall-through condbranch.
  switch (LastInst->getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    Target = LastInst->getOperand(2).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
    break;
  case AArch64::CBWPri:
  case AArch64::CBXPri:
  case AArch64::CBWPrr:
  case AArch64::CBXPrr:
    Target = LastInst->getOperand(3).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
    Cond.push_back(LastInst->getOperand(2));
    break;
  case AArch64::CBBAssertExt:
  case AArch64::CBHAssertExt:
    Target = LastInst->getOperand(3).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));                    // -1
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); // Opc
    Cond.push_back(LastInst->getOperand(0));                          // Cond
    Cond.push_back(LastInst->getOperand(1));                          // Op0
    Cond.push_back(LastInst->getOperand(2));                          // Op1
    Cond.push_back(LastInst->getOperand(4));                          // Ext0
    Cond.push_back(LastInst->getOperand(5));                          // Ext1
    break;
  }
}

static unsigned getBranchDisplacementBits(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return BDisplacementBits;
  case AArch64::TBNZW:
  case AArch64::TBZW:
  case AArch64::TBNZX:
  case AArch64::TBZX:
    return TBZDisplacementBits;
  case AArch64::CBNZW:
  case AArch64::CBZW:
  case AArch64::CBNZX:
  case AArch64::CBZX:
    return CBZDisplacementBits;
  case AArch64::Bcc:
    return BCCDisplacementBits;
  case AArch64::CBWPri:
  case AArch64::CBXPri:
  case AArch64::CBBAssertExt:
  case AArch64::CBHAssertExt:
  case AArch64::CBWPrr:
  case AArch64::CBXPrr:
    return CBDisplacementBits;
  }
}

bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                             int64_t BrOffset) const {
  unsigned Bits = getBranchDisplacementBits(BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump "
                      "over conditional branch expansion");
  return isIntN(Bits, BrOffset / 4);
}

MachineBasicBlock *
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return MI.getOperand(0).getMBB();
  case AArch64::TBZW:
  case AArch64::TBNZW:
  case AArch64::TBZX:
  case AArch64::TBNZX:
    return MI.getOperand(2).getMBB();
  case AArch64::CBZW:
  case AArch64::CBNZW:
  case AArch64::CBZX:
  case AArch64::CBNZX:
  case AArch64::Bcc:
    return MI.getOperand(1).getMBB();
  case AArch64::CBWPri:
  case AArch64::CBXPri:
  case AArch64::CBBAssertExt:
  case AArch64::CBHAssertExt:
  case AArch64::CBWPrr:
  case AArch64::CBXPrr:
    return MI.getOperand(3).getMBB();
  }
}

void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
                                            MachineBasicBlock &NewDestBB,
                                            MachineBasicBlock &RestoreBB,
                                            const DebugLoc &DL,
                                            int64_t BrOffset,
                                            RegScavenger *RS) const {
  assert(RS && "RegScavenger required for long branching");
  assert(MBB.empty() &&
         "new block should be inserted for expanding unconditional branch");
  assert(MBB.pred_size() == 1);
  assert(RestoreBB.empty() &&
         "restore block should be inserted for restoring clobbered registers");

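  // Materialize the absolute address of the destination block in Reg and
  // branch through it, i.e. emit:
  //   adrp Reg, DestBB@PAGE
  //   add  Reg, Reg, DestBB@PAGEOFF
  //   br   Reg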
  auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
    // Offsets outside of the signed 33-bit range are not supported for ADRP +
    // ADD.
    if (!isInt<33>(BrOffset))
      report_fatal_error(
          "Branch offsets outside of the signed 33-bit range not supported");

    BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
        .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
    BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
        .addReg(Reg)
        .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
        .addImm(0);
    BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
  };

  RS->enterBasicBlockEnd(MBB);
  // If X16 is unused, we can rely on the linker to insert a range extension
  // thunk if NewDestBB is out of range of a single B instruction.
  constexpr Register Reg = AArch64::X16;
  if (!RS->isRegUsed(Reg)) {
    insertUnconditionalBranch(MBB, &NewDestBB, DL);
    RS->setRegUsed(Reg);
    return;
  }

  // If there's a free register and it's worth inflating the code size,
  // manually insert the indirect branch.
  Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
  if (Scavenged != AArch64::NoRegister &&
      MBB.getSectionID() == MBBSectionID::ColdSectionID) {
    buildIndirectBranch(Scavenged, NewDestBB);
    RS->setRegUsed(Scavenged);
    return;
  }

  // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
  // with red zones.
  AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
  if (!AFI || AFI->hasRedZone().value_or(true))
    report_fatal_error(
        "Unable to insert indirect branch inside function that has red zone");

  // Otherwise, spill X16 and defer range extension to the linker.
  BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
      .addReg(AArch64::SP, RegState::Define)
      .addReg(Reg)
      .addReg(AArch64::SP)
      .addImm(-16);

  BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);

  BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
      .addReg(AArch64::SP, RegState::Define)
      .addReg(Reg, RegState::Define)
      .addReg(AArch64::SP)
      .addImm(16);
}

// Branch analysis.
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *&TBB,
                                     MachineBasicBlock *&FBB,
                                     SmallVectorImpl<MachineOperand> &Cond,
                                     bool AllowModify) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return false;

  // Skip over SpeculationBarrierEndBB terminators
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return false;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;

  // If there is only one terminator instruction, process it.
  unsigned LastOpc = LastInst->getOpcode();
  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
    if (isUncondBranchOpcode(LastOpc)) {
      TBB = LastInst->getOperand(0).getMBB();
      return false;
    }
    if (isCondBranchOpcode(LastOpc)) {
      // Block ends with fall-through condbranch.
      parseCondBranch(LastInst, TBB, Cond);
      return false;
    }
    return true; // Can't handle indirect branch.
  }

  // Get the instruction before it if it is a terminator.
  MachineInstr *SecondLastInst = &*I;
  unsigned SecondLastOpc = SecondLastInst->getOpcode();

  // If AllowModify is true and the block ends with two or more unconditional
  // branches, delete all but the first unconditional branch.
  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
    while (isUncondBranchOpcode(SecondLastOpc)) {
      LastInst->eraseFromParent();
      LastInst = SecondLastInst;
      LastOpc = LastInst->getOpcode();
      if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
        // Return now; the only remaining terminator is an unconditional
        // branch.
        TBB = LastInst->getOperand(0).getMBB();
        return false;
      }
      SecondLastInst = &*I;
      SecondLastOpc = SecondLastInst->getOpcode();
    }
  }

  // If we're allowed to modify and the block ends in an unconditional branch
  // which could simply fall through, remove the branch. (Note: This case only
  // matters when we can't understand the whole sequence; otherwise it's also
  // handled by BranchFolding.cpp.)
  if (AllowModify && isUncondBranchOpcode(LastOpc) &&
      MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
    LastInst->eraseFromParent();
    LastInst = SecondLastInst;
    LastOpc = LastInst->getOpcode();
    if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
      assert(!isUncondBranchOpcode(LastOpc) &&
             "unreachable unconditional branches removed above");

      if (isCondBranchOpcode(LastOpc)) {
        // Block ends with fall-through condbranch.
        parseCondBranch(LastInst, TBB, Cond);
        return false;
      }
      return true; // Can't handle indirect branch.
    }
    SecondLastInst = &*I;
    SecondLastOpc = SecondLastInst->getOpcode();
  }

  // If there are three terminators, we don't know what sort of block this is.
  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
    return true;

  // If the block ends with a B and a Bcc, handle it.
  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    parseCondBranch(SecondLastInst, TBB, Cond);
    FBB = LastInst->getOperand(0).getMBB();
    return false;
  }

  // If the block ends with two unconditional branches, handle it. The second
  // one is not executed, so remove it.
  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    TBB = SecondLastInst->getOperand(0).getMBB();
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return false;
  }

  // ...likewise if it ends with an indirect branch followed by an
  // unconditional branch.
  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return true;
  }

  // Otherwise, can't handle this.
  return true;
}

bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
                                              MachineBranchPredicate &MBP,
                                              bool AllowModify) const {
  // Use analyzeBranch to validate the branch pattern.
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  if (analyzeBranch(MBB, TBB, FBB, Cond, AllowModify))
    return true;

  // analyzeBranch returns success with empty Cond for unconditional branches.
  if (Cond.empty())
    return true;

  MBP.TrueDest = TBB;
  assert(MBP.TrueDest && "expected!");
  MBP.FalseDest = FBB ? FBB : MBB.getNextNode();

  MBP.ConditionDef = nullptr;
  MBP.SingleUseCondition = false;

  // Find the conditional branch. After analyzeBranch succeeds with non-empty
  // Cond, there's exactly one conditional branch - either last (fallthrough)
  // or second-to-last (followed by unconditional B).
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return true;

  if (isUncondBranchOpcode(I->getOpcode())) {
    if (I == MBB.begin())
      return true;
    --I;
  }

  MachineInstr *CondBranch = &*I;
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  switch (CondBranch->getOpcode()) {
  default:
    return true;

  case AArch64::Bcc:
    // Bcc takes the NZCV flag as the operand to branch on; walk up the
    // instruction stream to find the last instruction that defines NZCV.
    for (MachineInstr &MI : llvm::drop_begin(llvm::reverse(MBB))) {
      if (MI.modifiesRegister(AArch64::NZCV, /*TRI=*/nullptr)) {
        MBP.ConditionDef = &MI;
        break;
      }
    }
    return false;

  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX: {
    MBP.LHS = CondBranch->getOperand(0);
    MBP.RHS = MachineOperand::CreateImm(0);
    unsigned Opc = CondBranch->getOpcode();
    MBP.Predicate = (Opc == AArch64::CBNZX || Opc == AArch64::CBNZW)
                        ? MachineBranchPredicate::PRED_NE
                        : MachineBranchPredicate::PRED_EQ;
    Register CondReg = MBP.LHS.getReg();
    if (CondReg.isVirtual())
      MBP.ConditionDef = MRI.getVRegDef(CondReg);
    return false;
  }

  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX: {
    Register CondReg = CondBranch->getOperand(0).getReg();
    if (CondReg.isVirtual())
      MBP.ConditionDef = MRI.getVRegDef(CondReg);
    return false;
  }
  }
}

bool AArch64InstrInfo::reverseBranchCondition(
    SmallVectorImpl<MachineOperand> &Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
    Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
  } else {
    // Folded compare-and-branch
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown conditional branch!");
    case AArch64::CBZW:
      Cond[1].setImm(AArch64::CBNZW);
      break;
    case AArch64::CBNZW:
      Cond[1].setImm(AArch64::CBZW);
      break;
    case AArch64::CBZX:
      Cond[1].setImm(AArch64::CBNZX);
      break;
    case AArch64::CBNZX:
      Cond[1].setImm(AArch64::CBZX);
      break;
    case AArch64::TBZW:
      Cond[1].setImm(AArch64::TBNZW);
      break;
    case AArch64::TBNZW:
      Cond[1].setImm(AArch64::TBZW);
      break;
    case AArch64::TBZX:
      Cond[1].setImm(AArch64::TBNZX);
      break;
    case AArch64::TBNZX:
      Cond[1].setImm(AArch64::TBZX);
      break;

    // Cond is { -1, Opcode, CC, Op0, Op1, ... }
    case AArch64::CBWPri:
    case AArch64::CBXPri:
    case AArch64::CBBAssertExt:
    case AArch64::CBHAssertExt:
    case AArch64::CBWPrr:
    case AArch64::CBXPrr: {
      // Pseudos using standard 4-bit Arm condition codes
      AArch64CC::CondCode CC =
          static_cast<AArch64CC::CondCode>(Cond[2].getImm());
      Cond[2].setImm(AArch64CC::getInvertedCondCode(CC));
    }
    }
  }

  return false;
}

unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
                                        int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return 0;

  if (!isUncondBranchOpcode(I->getOpcode()) &&
      !isCondBranchOpcode(I->getOpcode()))
    return 0;

  // Remove the branch.
  I->eraseFromParent();

  I = MBB.end();

  if (I == MBB.begin()) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }
  --I;
  if (!isCondBranchOpcode(I->getOpcode())) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }

  // Remove the branch.
  I->eraseFromParent();
  if (BytesRemoved)
    *BytesRemoved = 8;

  return 2;
}

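/// Emit a single conditional branch to TBB from the condition encoding that
/// parseCondBranch() stored in Cond.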
void AArch64InstrInfo::instantiateCondBranch(
    MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
    ArrayRef<MachineOperand> Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
  } else {
    // Folded compare-and-branch
    // Note that we use add(MachineOperand) instead of addReg to preserve the
    // operand flags.

    // cbz, cbnz
    const MachineInstrBuilder MIB =
        BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);

    // tbz/tbnz
    if (Cond.size() > 3)
      MIB.add(Cond[3]);

    // cb
    if (Cond.size() > 4)
      MIB.add(Cond[4]);

    MIB.addMBB(TBB);

    // cb[b,h]
    if (Cond.size() > 5) {
      MIB.addImm(Cond[5].getImm());
      MIB.addImm(Cond[6].getImm());
    }
  }
}

unsigned AArch64InstrInfo::insertBranch(
    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
    ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
  // Shouldn't be a fall through.
  assert(TBB && "insertBranch must not be told to insert a fallthrough");

  if (!FBB) {
    if (Cond.empty()) // Unconditional branch?
      BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
    else
      instantiateCondBranch(MBB, DL, TBB, Cond);

    if (BytesAdded)
      *BytesAdded = 4;

    return 1;
  }

  // Two-way conditional branch.
  instantiateCondBranch(MBB, DL, TBB, Cond);
  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);

  if (BytesAdded)
    *BytesAdded = 8;

  return 2;
}

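/// Fold away terminators whose outcome is known because they test WZR/XZR:
/// a CBZ/TBZ of a zero register is always taken and becomes an unconditional
/// B, while a CBNZ/TBNZ of a zero register is never taken and is removed.
/// Returns true if the block was modified.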
bool llvm::optimizeTerminators(MachineBasicBlock *MBB,
                               const TargetInstrInfo &TII) {
  for (MachineInstr &MI : MBB->terminators()) {
    unsigned Opc = MI.getOpcode();
    switch (Opc) {
    case AArch64::CBZW:
    case AArch64::CBZX:
    case AArch64::TBZW:
    case AArch64::TBZX:
      // CBZ/TBZ with WZR/XZR -> unconditional B
      if (MI.getOperand(0).getReg() == AArch64::WZR ||
          MI.getOperand(0).getReg() == AArch64::XZR) {
        DEBUG_WITH_TYPE("optimizeTerminators",
                        dbgs() << "Removing always taken branch: " << MI);
        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
        SmallVector<MachineBasicBlock *> Succs(MBB->successors());
        for (auto *S : Succs)
          if (S != Target)
            MBB->removeSuccessor(S);
        DebugLoc DL = MI.getDebugLoc();
        while (MBB->rbegin() != &MI)
          MBB->rbegin()->eraseFromParent();
        MI.eraseFromParent();
        BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
        return true;
      }
      break;
    case AArch64::CBNZW:
    case AArch64::CBNZX:
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
      if (MI.getOperand(0).getReg() == AArch64::WZR ||
          MI.getOperand(0).getReg() == AArch64::XZR) {
        DEBUG_WITH_TYPE("optimizeTerminators",
                        dbgs() << "Removing never taken branch: " << MI);
        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
        MI.getParent()->removeSuccessor(Target);
        MI.eraseFromParent();
        return true;
      }
      break;
    }
  }
  return false;
}

// Find the original register that VReg is copied from.
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
  while (Register::isVirtualRegister(VReg)) {
    const MachineInstr *DefMI = MRI.getVRegDef(VReg);
    if (!DefMI->isFullCopy())
      return VReg;
    VReg = DefMI->getOperand(1).getReg();
  }
  return VReg;
}

// Determine if VReg is defined by an instruction that can be folded into a
// csel instruction. If so, return the folded opcode, and the replacement
// register.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
                                unsigned *NewReg = nullptr) {
  VReg = removeCopies(MRI, VReg);
  if (!Register::isVirtualRegister(VReg))
    return 0;

  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
  unsigned Opc = 0;
  unsigned SrcReg = 0;
  switch (DefMI->getOpcode()) {
  case AArch64::SUBREG_TO_REG:
    // Check for the following way to define a 64-bit immediate:
    //   %0:gpr32 = MOVi32imm 1
    //   %1:gpr64 = SUBREG_TO_REG %0:gpr32, %subreg.sub_32
    if (!DefMI->getOperand(1).isReg())
      return 0;
    if (!DefMI->getOperand(2).isImm() ||
        DefMI->getOperand(2).getImm() != AArch64::sub_32)
      return 0;
    DefMI = MRI.getVRegDef(DefMI->getOperand(1).getReg());
    if (DefMI->getOpcode() != AArch64::MOVi32imm)
      return 0;
    if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
      return 0;
    assert(Is64Bit);
    SrcReg = AArch64::XZR;
    Opc = AArch64::CSINCXr;
    break;

  case AArch64::MOVi32imm:
  case AArch64::MOVi64imm:
    if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
      return 0;
    SrcReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
    break;

  case AArch64::ADDSXri:
  case AArch64::ADDSWri:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
                                         /*isDead=*/true) == -1)
      return 0;
    // fall-through to ADDXri and ADDWri.
    [[fallthrough]];
  case AArch64::ADDXri:
  case AArch64::ADDWri:
    // add x, 1 -> csinc.
    if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
        DefMI->getOperand(3).getImm() != 0)
      return 0;
    SrcReg = DefMI->getOperand(1).getReg();
    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
    break;

  case AArch64::ORNXrr:
  case AArch64::ORNWrr: {
    // not x -> csinv, represented as orn dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcReg = DefMI->getOperand(2).getReg();
    Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
    break;
  }

  case AArch64::SUBSXrr:
  case AArch64::SUBSWrr:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
                                         /*isDead=*/true) == -1)
      return 0;
    // fall-through to SUBXrr and SUBWrr.
    [[fallthrough]];
  case AArch64::SUBXrr:
  case AArch64::SUBWrr: {
    // neg x -> csneg, represented as sub dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcReg = DefMI->getOperand(2).getReg();
    Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
    break;
  }
  default:
    return 0;
  }
  assert(Opc && SrcReg && "Missing parameters");

  if (NewReg)
    *NewReg = SrcReg;
  return Opc;
}

bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
                                       ArrayRef<MachineOperand> Cond,
                                       Register DstReg, Register TrueReg,
                                       Register FalseReg, int &CondCycles,
                                       int &TrueCycles,
                                       int &FalseCycles) const {
  // Check register classes.
  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RC =
      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
  if (!RC)
    return false;

  // Also need to check the dest regclass, in case we're trying to optimize
  // something like:
  // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
  if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
    return false;

  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
  unsigned ExtraCondLat = Cond.size() != 1;

  // GPRs are handled by csel.
  // FIXME: Fold in x+1, -x, and ~x when applicable.
  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
      AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
    // Single-cycle csel, csinc, csinv, and csneg.
    CondCycles = 1 + ExtraCondLat;
    TrueCycles = FalseCycles = 1;
    if (canFoldIntoCSel(MRI, TrueReg))
      TrueCycles = 0;
    else if (canFoldIntoCSel(MRI, FalseReg))
      FalseCycles = 0;
    return true;
  }

  // Scalar floating point is handled by fcsel.
  // FIXME: Form fabs, fmin, and fmax when applicable.
  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
      AArch64::FPR32RegClass.hasSubClassEq(RC)) {
    CondCycles = 5 + ExtraCondLat;
    TrueCycles = FalseCycles = 2;
    return true;
  }

  // Can't do vectors.
  return false;
}

void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const DebugLoc &DL, Register DstReg,
                                    ArrayRef<MachineOperand> Cond,
                                    Register TrueReg, Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Parse the condition code, see parseCondBranch() above.
  AArch64CC::CondCode CC;
  switch (Cond.size()) {
  default:
    llvm_unreachable("Unknown condition opcode in Cond");
  case 1: // b.cc
    CC = AArch64CC::CondCode(Cond[0].getImm());
    break;
  case 3: { // cbz/cbnz
    // We must insert a compare against 0.
    bool Is64Bit;
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBZW:
      Is64Bit = false;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBZX:
      Is64Bit = true;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBNZW:
      Is64Bit = false;
      CC = AArch64CC::NE;
      break;
    case AArch64::CBNZX:
      Is64Bit = true;
      CC = AArch64CC::NE;
      break;
    }
    Register SrcReg = Cond[2].getReg();
    if (Is64Bit) {
      // cmp reg, #0 is actually subs xzr, reg, #0.
      MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    } else {
      MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    }
    break;
  }
  case 4: { // tbz/tbnz
    // We must insert a tst instruction.
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::TBZW:
    case AArch64::TBZX:
      CC = AArch64CC::EQ;
      break;
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      CC = AArch64CC::NE;
      break;
    }
    // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
    else
      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
    break;
  }
  case 5: { // cb
    // We must insert a cmp, i.e. a subs.
    //           0   1       2   3    4
    // Cond is { -1, Opcode, CC, Op0, Op1 }

    unsigned SubsOpc, SubsDestReg;
    bool IsImm = false;
    CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBWPri:
      SubsOpc = AArch64::SUBSWri;
      SubsDestReg = AArch64::WZR;
      IsImm = true;
      break;
    case AArch64::CBXPri:
      SubsOpc = AArch64::SUBSXri;
      SubsDestReg = AArch64::XZR;
      IsImm = true;
      break;
    case AArch64::CBWPrr:
      SubsOpc = AArch64::SUBSWrr;
      SubsDestReg = AArch64::WZR;
      IsImm = false;
      break;
    case AArch64::CBXPrr:
      SubsOpc = AArch64::SUBSXrr;
      SubsDestReg = AArch64::XZR;
      IsImm = false;
      break;
    }

    if (IsImm)
      BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
          .addReg(Cond[3].getReg())
          .addImm(Cond[4].getImm())
          .addImm(0);
    else
      BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
          .addReg(Cond[3].getReg())
          .addReg(Cond[4].getReg());
  } break;
  case 7: { // cb[b,h]
    // We must insert a cmp (i.e. a subs), together with the zero- or
    // sign-extensions that were folded into the branch. The first operand
    // gets an explicit extension instruction; for the second operand the
    // extension is folded into the cmp itself.
    //           0   1       2   3    4    5     6
    // Cond is { -1, Opcode, CC, Op0, Op1, Ext0, Ext1 }

    // We need a new register for the now explicitly extended register
    Register Reg = Cond[4].getReg();
    if (Cond[5].getImm() != AArch64_AM::InvalidShiftExtend) {
      unsigned ExtOpc;
      unsigned ExtBits;
      AArch64_AM::ShiftExtendType ExtendType =
          AArch64_AM::getExtendType(Cond[5].getImm());
      switch (ExtendType) {
      default:
        llvm_unreachable("Unknown shift-extend for CB instruction");
      case AArch64_AM::SXTB:
        assert(
            Cond[1].getImm() == AArch64::CBBAssertExt &&
            "Unexpected compare-and-branch instruction for SXTB shift-extend");
        ExtOpc = AArch64::SBFMWri;
        ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
        break;
      case AArch64_AM::SXTH:
        assert(
            Cond[1].getImm() == AArch64::CBHAssertExt &&
            "Unexpected compare-and-branch instruction for SXTH shift-extend");
        ExtOpc = AArch64::SBFMWri;
        ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
        break;
      case AArch64_AM::UXTB:
        assert(
            Cond[1].getImm() == AArch64::CBBAssertExt &&
            "Unexpected compare-and-branch instruction for UXTB shift-extend");
        ExtOpc = AArch64::ANDWri;
        ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
        break;
      case AArch64_AM::UXTH:
        assert(
            Cond[1].getImm() == AArch64::CBHAssertExt &&
            "Unexpected compare-and-branch instruction for UXTH shift-extend");
        ExtOpc = AArch64::ANDWri;
        ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
        break;
      }

      // Build the explicit extension of the first operand
      Reg = MRI.createVirtualRegister(&AArch64::GPR32spRegClass);
      MachineInstrBuilder MBBI =
          BuildMI(MBB, I, DL, get(ExtOpc), Reg).addReg(Cond[4].getReg());
      if (ExtOpc != AArch64::ANDWri)
        MBBI.addImm(0);
      MBBI.addImm(ExtBits);
    }

    // Now, subs with an extended second operand
    if (Cond[6].getImm() != AArch64_AM::InvalidShiftExtend) {
      AArch64_AM::ShiftExtendType ExtendType =
          AArch64_AM::getExtendType(Cond[6].getImm());
      MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
      MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWrx), AArch64::WZR)
          .addReg(Cond[3].getReg())
          .addReg(Reg)
          .addImm(AArch64_AM::getArithExtendImm(ExtendType, 0));
    } // If no extension is needed, just a regular subs
    else {
      MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
      MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWrr), AArch64::WZR)
          .addReg(Cond[3].getReg())
          .addReg(Reg);
    }

    CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
  } break;
  }

  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewReg);
    if (FoldedOpc) {
      // The folded opcodes csinc, csinv and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      CC = AArch64CC::getInvertedCondCode(CC);
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewReg;
      Opc = FoldedOpc;
      // Extend the live range of NewReg.
      MRI.clearKillFlags(NewReg);
    }
  }

  // Pull all virtual registers into the appropriate class.
  MRI.constrainRegClass(TrueReg, RC);
  // FalseReg might be WZR or XZR if the folded operand is a literal 1.
  assert(
      (FalseReg.isVirtual() || FalseReg == AArch64::WZR ||
       FalseReg == AArch64::XZR) &&
      "FalseReg was folded into a non-virtual register other than WZR or XZR");
  if (FalseReg.isVirtual())
    MRI.constrainRegClass(FalseReg, RC);

  // Insert the csel.
  BuildMI(MBB, I, DL, get(Opc), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg)
      .addImm(CC);
}

// Return true if Imm can be loaded into a register by a "cheap" sequence of
// instructions. For now, "cheap" means at most two instructions.
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
  if (BitSize == 32)
    return true;

  assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
  uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
  AArch64_IMM::expandMOVImm(Imm, BitSize, Is);

  return Is.size() <= 2;
}

// Check if a COPY instruction is cheap.
static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) {
  assert(MI.isCopy() && "Expected COPY instruction");
  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();

  // Cross-bank copies (e.g., between GPR and FPR) are expensive on AArch64,
  // typically requiring an FMOV instruction with a 2-6 cycle latency.
  auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * {
    if (Reg.isVirtual())
      return MRI.getRegClass(Reg);
    if (Reg.isPhysical())
      return RI.getMinimalPhysRegClass(Reg);
    return nullptr;
  };
  const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(0).getReg());
  const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(1).getReg());
  if (DstRC && SrcRC && !RI.getCommonSubClass(DstRC, SrcRC))
    return false;

  return MI.isAsCheapAsAMove();
}

// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  if (Subtarget.hasExynosCheapAsMoveHandling()) {
    if (isExynosCheapAsMove(MI))
      return true;
    return MI.isAsCheapAsAMove();
  }

  switch (MI.getOpcode()) {
  default:
    return MI.isAsCheapAsAMove();

  case TargetOpcode::COPY:
    return isCheapCopy(MI, RI);

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::SUBWrs:
  case AArch64::SUBXrs:
    return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;

  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
  // ORRXri, it is as cheap as MOV.
  // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
  case AArch64::MOVi32imm:
    return isCheapImmediate(MI, 32);
  case AArch64::MOVi64imm:
    return isCheapImmediate(MI, 64);
  }
}

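// Return true when MI's shift or extend amount (or, for the register-offset
// loads and stores below, its sign-extend flag) is one that Falkor is
// presumed to handle without extra latency.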
bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::ADDSWrs:
  case AArch64::ADDSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    if (ShiftVal == 0)
      return true;
    return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
  }

  case AArch64::ADDWrx:
  case AArch64::ADDXrx:
  case AArch64::ADDXrx64:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::ADDSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) <= 4;
    }
  }

  case AArch64::SUBWrs:
  case AArch64::SUBSWrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
  }

  case AArch64::SUBXrs:
  case AArch64::SUBSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
  }

  case AArch64::SUBWrx:
  case AArch64::SUBXrx:
  case AArch64::SUBXrx64:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrx:
  case AArch64::SUBSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) == 0;
    }
  }

  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::PRFMroW:
  case AArch64::PRFMroX:
  case AArch64::STRBBroW:
  case AArch64::STRBBroX:
  case AArch64::STRBroW:
  case AArch64::STRBroX:
  case AArch64::STRDroW:
  case AArch64::STRDroX:
  case AArch64::STRHHroW:
  case AArch64::STRHHroX:
  case AArch64::STRHroW:
  case AArch64::STRHroX:
  case AArch64::STRQroW:
  case AArch64::STRQroX:
  case AArch64::STRSroW:
  case AArch64::STRSroX:
  case AArch64::STRWroW:
  case AArch64::STRWroX:
  case AArch64::STRXroW:
  case AArch64::STRXroX: {
    unsigned IsSigned = MI.getOperand(3).getImm();
    return !IsSigned;
  }
  }
}

bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  case AArch64::SEH_StackAlloc:
  case AArch64::SEH_SaveFPLR:
  case AArch64::SEH_SaveFPLR_X:
  case AArch64::SEH_SaveReg:
  case AArch64::SEH_SaveReg_X:
  case AArch64::SEH_SaveRegP:
  case AArch64::SEH_SaveRegP_X:
  case AArch64::SEH_SaveFReg:
  case AArch64::SEH_SaveFReg_X:
  case AArch64::SEH_SaveFRegP:
  case AArch64::SEH_SaveFRegP_X:
  case AArch64::SEH_SetFP:
  case AArch64::SEH_AddFP:
  case AArch64::SEH_Nop:
  case AArch64::SEH_PrologEnd:
  case AArch64::SEH_EpilogStart:
  case AArch64::SEH_EpilogEnd:
  case AArch64::SEH_PACSignLR:
  case AArch64::SEH_SaveAnyRegI:
  case AArch64::SEH_SaveAnyRegIP:
  case AArch64::SEH_SaveAnyRegQP:
  case AArch64::SEH_SaveAnyRegQPX:
  case AArch64::SEH_AllocZ:
  case AArch64::SEH_SaveZReg:
  case AArch64::SEH_SavePReg:
    return true;
  }
}

bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                             Register &SrcReg, Register &DstReg,
                                             unsigned &SubIdx) const {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::SBFMXri: // aka sxtw
  case AArch64::UBFMXri: // aka uxtw
    // Check for the 32 -> 64 bit extension case, these instructions can do
    // much more.
    if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
      return false;
    // This is a signed or unsigned 32 -> 64 bit extension.
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    SubIdx = AArch64::sub_32;
    return true;
  }
}

bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
    const MachineInstr &MIa, const MachineInstr &MIb) const {
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
  int64_t OffsetA = 0, OffsetB = 0;
  TypeSize WidthA(0, false), WidthB(0, false);
  bool OffsetAIsScalable = false, OffsetBIsScalable = false;

  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");

  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
      MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  // Retrieve the base register, the offset from the base, and the width. The
  // width is the size of the memory access (e.g. 1, 2, 4, 8). If the bases
  // are identical, and the offset of the lower memory access plus its width
  // does not overlap the offset of the higher memory access, then the
  // accesses are disjoint.
  // If OffsetAIsScalable and OffsetBIsScalable are both true, they are
  // assumed to have the same scale (vscale).
  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
                                   WidthA, TRI) &&
      getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
                                   WidthB, TRI)) {
    if (BaseOpA->isIdenticalTo(*BaseOpB) &&
        OffsetAIsScalable == OffsetBIsScalable) {
      int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
      int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
      TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
      if (LowWidth.isScalable() == OffsetAIsScalable &&
          LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
        return true;
    }
  }
  return false;
}

bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                            const MachineBasicBlock *MBB,
                                            const MachineFunction &MF) const {
  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
    return true;

  // Do not move an instruction that can be recognized as a branch target.
  if (hasBTISemantics(MI))
    return true;

  switch (MI.getOpcode()) {
  case AArch64::HINT:
    // CSDB hints are scheduling barriers.
    if (MI.getOperand(0).getImm() == 0x14)
      return true;
    break;
  case AArch64::DSB:
  case AArch64::ISB:
    // DSB and ISB also are scheduling barriers.
    return true;
  case AArch64::MSRpstatesvcrImm1:
    // SMSTART and SMSTOP are also scheduling barriers.
    return true;
  default:;
  }
  if (isSEHInstruction(MI))
    return true;
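  // Keep CFI instructions glued to the instruction they describe: treat MI as
  // a boundary when the next instruction is a CFI instruction.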
  auto Next = std::next(MI.getIterator());
  return Next != MBB->end() && Next->isCFIInstruction();
}

/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
                                      Register &SrcReg2, int64_t &CmpMask,
                                      int64_t &CmpValue) const {
  // The first operand can be a frame index where we'd normally expect a
  // register.
  // FIXME: Pass subregisters out of analyzeCompare
  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
  if (!MI.getOperand(1).isReg() || MI.getOperand(1).getSubReg())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::PTEST_PP:
  case AArch64::PTEST_PP_ANY:
  case AArch64::PTEST_PP_FIRST:
    SrcReg = MI.getOperand(0).getReg();
    SrcReg2 = MI.getOperand(1).getReg();
    if (MI.getOperand(2).getSubReg())
      return false;

    // Not sure about the mask and value for now...
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWrr:
  case AArch64::SUBSWrs:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXrs:
  case AArch64::SUBSXrx:
  case AArch64::ADDSWrr:
  case AArch64::ADDSWrs:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXrs:
  case AArch64::ADDSXrx:
    // Replace SUBSWrr with SUBWrr if NZCV is not used.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = MI.getOperand(2).getReg();

    // FIXME: Pass subregisters out of analyzeCompare
    if (MI.getOperand(2).getSubReg())
      return false;

    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWri:
  case AArch64::ADDSWri:
  case AArch64::SUBSXri:
  case AArch64::ADDSXri:
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = MI.getOperand(2).getImm();
    return true;
  case AArch64::ANDSWri:
  case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the other xxxS
    // instructions.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = AArch64_AM::decodeLogicalImmediate(
        MI.getOperand(2).getImm(),
        MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
    return true;
  }

  return false;
}

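// Constrain every register operand of Instr to the register class its
// instruction description requires. Returns false if some operand cannot be
// constrained; in that case Instr keeps whatever constraints were already
// applied.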
1608static bool UpdateOperandRegClass(MachineInstr &Instr) {
1609 MachineBasicBlock *MBB = Instr.getParent();
1610 assert(MBB && "Can't get MachineBasicBlock here");
1611 MachineFunction *MF = MBB->getParent();
1612 assert(MF && "Can't get MachineFunction here");
1613 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1614 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1615 MachineRegisterInfo *MRI = &MF->getRegInfo();
1616
1617 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1618 ++OpIdx) {
1619 MachineOperand &MO = Instr.getOperand(i: OpIdx);
    const TargetRegisterClass *OpRegConstraints =
        Instr.getRegClassConstraint(OpIdx, TII, TRI);

    // If there's no constraint, there's nothing to do.
    if (!OpRegConstraints)
      continue;
    // If the operand is a frame index, there's nothing to do here.
    // A frame index operand will resolve correctly during PEI.
    if (MO.isFI())
      continue;

    assert(MO.isReg() &&
           "Operand has register constraints without being a register!");

    Register Reg = MO.getReg();
    if (Reg.isPhysical()) {
      if (!OpRegConstraints->contains(Reg))
        return false;
    } else if (!OpRegConstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
               !MRI->constrainRegClass(Reg, OpRegConstraints))
      return false;
1641 }
1642
1643 return true;
1644}
1645
/// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible for doing the
/// actual substitution and legality checking.
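/// For example, ADDSWrr maps to ADDWrr, while ADDSWri maps to itself when the
/// instruction writes the zero register (see below).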
1649static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
  // Don't convert all compare instructions, because for some of them the
  // zero register encoding becomes the sp register in the non-flag-setting
  // form.
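  // For example, 'subs wzr, w0, #1' (i.e. cmp w0, #1) must not become
  // 'sub wzr, w0, #1': in the non-flag-setting immediate encoding a
  // destination of 31 selects WSP, so the result would be written to the
  // stack pointer.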
1652 bool MIDefinesZeroReg = false;
1653 if (MI.definesRegister(Reg: AArch64::WZR, /*TRI=*/nullptr) ||
1654 MI.definesRegister(Reg: AArch64::XZR, /*TRI=*/nullptr))
1655 MIDefinesZeroReg = true;
1656
1657 switch (MI.getOpcode()) {
1658 default:
1659 return MI.getOpcode();
1660 case AArch64::ADDSWrr:
1661 return AArch64::ADDWrr;
1662 case AArch64::ADDSWri:
1663 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1664 case AArch64::ADDSWrs:
1665 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1666 case AArch64::ADDSWrx:
1667 return AArch64::ADDWrx;
1668 case AArch64::ADDSXrr:
1669 return AArch64::ADDXrr;
1670 case AArch64::ADDSXri:
1671 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1672 case AArch64::ADDSXrs:
1673 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1674 case AArch64::ADDSXrx:
1675 return AArch64::ADDXrx;
1676 case AArch64::SUBSWrr:
1677 return AArch64::SUBWrr;
1678 case AArch64::SUBSWri:
1679 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1680 case AArch64::SUBSWrs:
1681 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1682 case AArch64::SUBSWrx:
1683 return AArch64::SUBWrx;
1684 case AArch64::SUBSXrr:
1685 return AArch64::SUBXrr;
1686 case AArch64::SUBSXri:
1687 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1688 case AArch64::SUBSXrs:
1689 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1690 case AArch64::SUBSXrx:
1691 return AArch64::SUBXrx;
1692 }
1693}
1694
1695enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1696
/// True when condition flags are accessed (written or read) on the
/// instruction trace starting at From and ending at To.
///
/// Note: If From and To are in different blocks, the condition flags are
/// conservatively assumed to be accessed on the path.
1702static bool areCFlagsAccessedBetweenInstrs(
1703 MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1704 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
  // Early exit if To is at the beginning of the BB: there is nothing to
  // scan, so conservatively report an access.
1706 if (To == To->getParent()->begin())
1707 return true;
1708
1709 // Check whether the instructions are in the same basic block
1710 // If not, assume the condition flags might get modified somewhere.
1711 if (To->getParent() != From->getParent())
1712 return true;
1713
1714 // From must be above To.
1715 assert(std::any_of(
1716 ++To.getReverse(), To->getParent()->rend(),
1717 [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1718
1719 // We iterate backward starting at \p To until we hit \p From.
1720 for (const MachineInstr &Instr :
1721 instructionsWithoutDebug(It: ++To.getReverse(), End: From.getReverse())) {
1722 if (((AccessToCheck & AK_Write) &&
1723 Instr.modifiesRegister(Reg: AArch64::NZCV, TRI)) ||
1724 ((AccessToCheck & AK_Read) && Instr.readsRegister(Reg: AArch64::NZCV, TRI)))
1725 return true;
1726 }
1727 return false;
1728}
1729
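/// Return the opcode \p Pred should have so that the PTEST becomes redundant:
/// either \p Pred's existing opcode, when it already sets the flags exactly as
/// the PTEST would, or its flag-setting variant. Returns std::nullopt when the
/// PTEST cannot be removed.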
1730std::optional<unsigned>
1731AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
1732 MachineInstr *Pred,
1733 const MachineRegisterInfo *MRI) const {
1734 unsigned MaskOpcode = Mask->getOpcode();
1735 unsigned PredOpcode = Pred->getOpcode();
1736 bool PredIsPTestLike = isPTestLikeOpcode(Opc: PredOpcode);
1737 bool PredIsWhileLike = isWhileOpcode(Opc: PredOpcode);
1738
1739 if (PredIsWhileLike) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
    // instruction and the condition is "any", since WHILEcc does an implicit
    // PTEST(ALL, PG) check and PG is always a subset of ALL.
1743 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1744 return PredOpcode;
1745
1746 // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1747 // redundant since WHILE performs an implicit PTEST with an all active
1748 // mask.
1749 if (isPTrueOpcode(Opc: MaskOpcode) && Mask->getOperand(i: 1).getImm() == 31 &&
1750 getElementSizeForOpcode(Opc: MaskOpcode) ==
1751 getElementSizeForOpcode(Opc: PredOpcode))
1752 return PredOpcode;
1753
1754 // For PTEST_FIRST(PTRUE_ALL, WHILE), the PTEST_FIRST is redundant since
1755 // WHILEcc performs an implicit PTEST with an all active mask, setting
1756 // the N flag as the PTEST_FIRST would.
1757 if (PTest->getOpcode() == AArch64::PTEST_PP_FIRST &&
1758 isPTrueOpcode(Opc: MaskOpcode) && Mask->getOperand(i: 1).getImm() == 31)
1759 return PredOpcode;
1760
1761 return {};
1762 }
1763
1764 if (PredIsPTestLike) {
1765 // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1766 // instruction that sets the flags as PTEST would and the condition is
1767 // "any" since PG is always a subset of the governing predicate of the
1768 // ptest-like instruction.
1769 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1770 return PredOpcode;
1771
1772 auto PTestLikeMask = MRI->getUniqueVRegDef(Reg: Pred->getOperand(i: 1).getReg());
1773
1774 // If the PTEST like instruction's general predicate is not `Mask`, attempt
1775 // to look through a copy and try again. This is because some instructions
1776 // take a predicate whose register class is a subset of its result class.
1777 if (Mask != PTestLikeMask && PTestLikeMask->isFullCopy() &&
1778 PTestLikeMask->getOperand(i: 1).getReg().isVirtual())
1779 PTestLikeMask =
1780 MRI->getUniqueVRegDef(Reg: PTestLikeMask->getOperand(i: 1).getReg());
1781
    // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the element
    // size matches and either the PTEST_LIKE instruction uses the same all
    // active mask or the condition is "any".
1785 if (isPTrueOpcode(Opc: MaskOpcode) && Mask->getOperand(i: 1).getImm() == 31 &&
1786 getElementSizeForOpcode(Opc: MaskOpcode) ==
1787 getElementSizeForOpcode(Opc: PredOpcode)) {
1788 if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1789 return PredOpcode;
1790 }
1791
1792 // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1793 // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1794 // on 8-bit predicates like the PTEST. Otherwise, for instructions like
1795 // compare that also support 16/32/64-bit predicates, the implicit PTEST
1796 // performed by the compare could consider fewer lanes for these element
1797 // sizes.
1798 //
1799 // For example, consider
1800 //
1801 // ptrue p0.b ; P0=1111-1111-1111-1111
1802 // index z0.s, #0, #1 ; Z0=<0,1,2,3>
1803 // index z1.s, #1, #1 ; Z1=<1,2,3,4>
1804 // cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001
1805 // ; ^ last active
1806 // ptest p0, p1.b ; P1=0001-0001-0001-0001
1807 // ; ^ last active
1808 //
1809 // where the compare generates a canonical all active 32-bit predicate
1810 // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1811 // active flag, whereas the PTEST instruction with the same mask doesn't.
1812 // For PTEST_ANY this doesn't apply as the flags in this case would be
1813 // identical regardless of element size.
1814 uint64_t PredElementSize = getElementSizeForOpcode(Opc: PredOpcode);
1815 if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
1816 PTest->getOpcode() == AArch64::PTEST_PP_ANY))
1817 return PredOpcode;
1818
1819 return {};
1820 }
1821
1822 // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1823 // opcode so the PTEST becomes redundant.
1824 switch (PredOpcode) {
1825 case AArch64::AND_PPzPP:
1826 case AArch64::BIC_PPzPP:
1827 case AArch64::EOR_PPzPP:
1828 case AArch64::NAND_PPzPP:
1829 case AArch64::NOR_PPzPP:
1830 case AArch64::ORN_PPzPP:
1831 case AArch64::ORR_PPzPP:
1832 case AArch64::BRKA_PPzP:
1833 case AArch64::BRKPA_PPzPP:
1834 case AArch64::BRKB_PPzP:
1835 case AArch64::BRKPB_PPzPP:
1836 case AArch64::RDFFR_PPz: {
    // Check to see if our mask is the same. If not, the resulting flag bits
    // may differ and we can't remove the ptest.
1839 auto *PredMask = MRI->getUniqueVRegDef(Reg: Pred->getOperand(i: 1).getReg());
1840 if (Mask != PredMask)
1841 return {};
1842 break;
1843 }
1844 case AArch64::BRKN_PPzP: {
1845 // BRKN uses an all active implicit mask to set flags unlike the other
1846 // flag-setting instructions.
1847 // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1848 if ((MaskOpcode != AArch64::PTRUE_B) ||
1849 (Mask->getOperand(i: 1).getImm() != 31))
1850 return {};
1851 break;
1852 }
1853 case AArch64::PTRUE_B:
1854 // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1855 break;
1856 default:
1857 // Bail out if we don't recognize the input
1858 return {};
1859 }
1860
1861 return convertToFlagSettingOpc(Opc: PredOpcode);
1862}
1863
/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
/// operation that could set the flags in an identical manner.
1866bool AArch64InstrInfo::optimizePTestInstr(
1867 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1868 const MachineRegisterInfo *MRI) const {
1869 auto *Mask = MRI->getUniqueVRegDef(Reg: MaskReg);
1870 auto *Pred = MRI->getUniqueVRegDef(Reg: PredReg);
1871
1872 if (Pred->isCopy() && PTest->getOpcode() == AArch64::PTEST_PP_FIRST) {
1873 // Instructions which return a multi-vector (e.g. WHILECC_x2) require copies
1874 // before the branch to extract each subregister.
1875 auto Op = Pred->getOperand(i: 1);
1876 if (Op.isReg() && Op.getReg().isVirtual() &&
1877 Op.getSubReg() == AArch64::psub0)
1878 Pred = MRI->getUniqueVRegDef(Reg: Op.getReg());
1879 }
1880
1881 unsigned PredOpcode = Pred->getOpcode();
1882 auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1883 if (!NewOp)
1884 return false;
1885
1886 const TargetRegisterInfo *TRI = &getRegisterInfo();
1887
1888 // If another instruction between Pred and PTest accesses flags, don't remove
1889 // the ptest or update the earlier instruction to modify them.
1890 if (areCFlagsAccessedBetweenInstrs(From: Pred, To: PTest, TRI))
1891 return false;
1892
1893 // If we pass all the checks, it's safe to remove the PTEST and use the flags
1894 // as they are prior to PTEST. Sometimes this requires the tested PTEST
1895 // operand to be replaced with an equivalent instruction that also sets the
1896 // flags.
1897 PTest->eraseFromParent();
1898 if (*NewOp != PredOpcode) {
1899 Pred->setDesc(get(Opcode: *NewOp));
1900 bool succeeded = UpdateOperandRegClass(Instr&: *Pred);
1901 (void)succeeded;
1902 assert(succeeded && "Operands have incompatible register classes!");
1903 Pred->addRegisterDefined(Reg: AArch64::NZCV, RegInfo: TRI);
1904 }
1905
1906 // Ensure that the flags def is live.
1907 if (Pred->registerDefIsDead(Reg: AArch64::NZCV, TRI)) {
1908 unsigned i = 0, e = Pred->getNumOperands();
1909 for (; i != e; ++i) {
1910 MachineOperand &MO = Pred->getOperand(i);
1911 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1912 MO.setIsDead(false);
1913 break;
1914 }
1915 }
1916 }
1917 return true;
1918}
1919
/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It can truly act as a compare
/// only when there are no uses of its destination register.
///
/// The following steps are tried in order:
/// 1. Convert CmpInstr into its non-flag-setting equivalent if NZCV is unused.
/// 2. Remove CmpInstr if, earlier in the block, there is an instruction that
///    produces the needed condition code, or one which can be converted into
///    such an instruction.
/// Only comparison with zero is supported.
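/// For example (schematically):
///   %w0 = SUBWrr %w1, %w2
///   SUBSWri %w0, 0        ; compare %w0 with zero
/// becomes
///   %w0 = SUBSWrr %w1, %w2
/// with the compare removed, provided no instruction in between accesses NZCV.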
1931bool AArch64InstrInfo::optimizeCompareInstr(
1932 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1933 int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1934 assert(CmpInstr.getParent());
1935 assert(MRI);
1936
1937 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1938 int DeadNZCVIdx =
1939 CmpInstr.findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true);
1940 if (DeadNZCVIdx != -1) {
1941 if (CmpInstr.definesRegister(Reg: AArch64::WZR, /*TRI=*/nullptr) ||
1942 CmpInstr.definesRegister(Reg: AArch64::XZR, /*TRI=*/nullptr)) {
1943 CmpInstr.eraseFromParent();
1944 return true;
1945 }
1946 unsigned Opc = CmpInstr.getOpcode();
1947 unsigned NewOpc = convertToNonFlagSettingOpc(MI: CmpInstr);
1948 if (NewOpc == Opc)
1949 return false;
1950 const MCInstrDesc &MCID = get(Opcode: NewOpc);
1951 CmpInstr.setDesc(MCID);
1952 CmpInstr.removeOperand(OpNo: DeadNZCVIdx);
1953 bool succeeded = UpdateOperandRegClass(Instr&: CmpInstr);
1954 (void)succeeded;
1955 assert(succeeded && "Some operands reg class are incompatible!");
1956 return true;
1957 }
1958
1959 if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
1960 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY ||
1961 CmpInstr.getOpcode() == AArch64::PTEST_PP_FIRST)
1962 return optimizePTestInstr(PTest: &CmpInstr, MaskReg: SrcReg, PredReg: SrcReg2, MRI);
1963
1964 if (SrcReg2 != 0)
1965 return false;
1966
1967 // CmpInstr is a Compare instruction if destination register is not used.
1968 if (!MRI->use_nodbg_empty(RegNo: CmpInstr.getOperand(i: 0).getReg()))
1969 return false;
1970
1971 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, MRI: *MRI))
1972 return true;
1973 return (CmpValue == 0 || CmpValue == 1) &&
1974 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, MRI: *MRI);
1975}
1976
/// Get the opcode of the S (flag-setting) version of Instr, e.g. ADDSWrr for
/// ADDWrr. If Instr is already an S version, its own opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
/// version or we are not interested in it.
1981static unsigned sForm(MachineInstr &Instr) {
1982 switch (Instr.getOpcode()) {
1983 default:
1984 return AArch64::INSTRUCTION_LIST_END;
1985
1986 case AArch64::ADDSWrr:
1987 case AArch64::ADDSWri:
1988 case AArch64::ADDSXrr:
1989 case AArch64::ADDSXri:
1990 case AArch64::ADDSWrx:
1991 case AArch64::ADDSXrx:
1992 case AArch64::SUBSWrr:
1993 case AArch64::SUBSWri:
1994 case AArch64::SUBSWrx:
1995 case AArch64::SUBSXrr:
1996 case AArch64::SUBSXri:
1997 case AArch64::SUBSXrx:
1998 case AArch64::ANDSWri:
1999 case AArch64::ANDSWrr:
2000 case AArch64::ANDSWrs:
2001 case AArch64::ANDSXri:
2002 case AArch64::ANDSXrr:
2003 case AArch64::ANDSXrs:
2004 case AArch64::BICSWrr:
2005 case AArch64::BICSXrr:
2006 case AArch64::BICSWrs:
2007 case AArch64::BICSXrs:
2008 return Instr.getOpcode();
2009
2010 case AArch64::ADDWrr:
2011 return AArch64::ADDSWrr;
2012 case AArch64::ADDWri:
2013 return AArch64::ADDSWri;
2014 case AArch64::ADDXrr:
2015 return AArch64::ADDSXrr;
2016 case AArch64::ADDXri:
2017 return AArch64::ADDSXri;
2018 case AArch64::ADDWrx:
2019 return AArch64::ADDSWrx;
2020 case AArch64::ADDXrx:
2021 return AArch64::ADDSXrx;
2022 case AArch64::ADCWr:
2023 return AArch64::ADCSWr;
2024 case AArch64::ADCXr:
2025 return AArch64::ADCSXr;
2026 case AArch64::SUBWrr:
2027 return AArch64::SUBSWrr;
2028 case AArch64::SUBWri:
2029 return AArch64::SUBSWri;
2030 case AArch64::SUBXrr:
2031 return AArch64::SUBSXrr;
2032 case AArch64::SUBXri:
2033 return AArch64::SUBSXri;
2034 case AArch64::SUBWrx:
2035 return AArch64::SUBSWrx;
2036 case AArch64::SUBXrx:
2037 return AArch64::SUBSXrx;
2038 case AArch64::SBCWr:
2039 return AArch64::SBCSWr;
2040 case AArch64::SBCXr:
2041 return AArch64::SBCSXr;
2042 case AArch64::ANDWri:
2043 return AArch64::ANDSWri;
2044 case AArch64::ANDXri:
2045 return AArch64::ANDSXri;
2046 case AArch64::ANDWrr:
2047 return AArch64::ANDSWrr;
2048 case AArch64::ANDWrs:
2049 return AArch64::ANDSWrs;
2050 case AArch64::ANDXrr:
2051 return AArch64::ANDSXrr;
2052 case AArch64::ANDXrs:
2053 return AArch64::ANDSXrs;
2054 case AArch64::BICWrr:
2055 return AArch64::BICSWrr;
2056 case AArch64::BICXrr:
2057 return AArch64::BICSXrr;
2058 case AArch64::BICWrs:
2059 return AArch64::BICSWrs;
2060 case AArch64::BICXrs:
2061 return AArch64::BICSXrs;
2062 }
2063}
2064
2065/// Check if AArch64::NZCV should be alive in successors of MBB.
2066static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
2067 for (auto *BB : MBB->successors())
2068 if (BB->isLiveIn(Reg: AArch64::NZCV))
2069 return true;
2070 return false;
2071}
2072
2073/// \returns The condition code operand index for \p Instr if it is a branch
2074/// or select and -1 otherwise.
2075int AArch64InstrInfo::findCondCodeUseOperandIdxForBranchOrSelect(
2076 const MachineInstr &Instr) {
2077 switch (Instr.getOpcode()) {
2078 default:
2079 return -1;
2080
2081 case AArch64::Bcc: {
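    // Bcc's operands are (cc, target), followed by the implicit NZCV use, so
    // the condition-code operand index is the NZCV use index minus 2.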
2082 int Idx = Instr.findRegisterUseOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr);
2083 assert(Idx >= 2);
2084 return Idx - 2;
2085 }
2086
2087 case AArch64::CSINVWr:
2088 case AArch64::CSINVXr:
2089 case AArch64::CSINCWr:
2090 case AArch64::CSINCXr:
2091 case AArch64::CSELWr:
2092 case AArch64::CSELXr:
2093 case AArch64::CSNEGWr:
2094 case AArch64::CSNEGXr:
2095 case AArch64::FCSELSrrr:
2096 case AArch64::FCSELDrrr: {
2097 int Idx = Instr.findRegisterUseOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr);
2098 assert(Idx >= 1);
2099 return Idx - 1;
2100 }
2101 }
2102}
2103
2104/// Find a condition code used by the instruction.
2105/// Returns AArch64CC::Invalid if either the instruction does not use condition
2106/// codes or we don't optimize CmpInstr in the presence of such instructions.
2107static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
2108 int CCIdx =
2109 AArch64InstrInfo::findCondCodeUseOperandIdxForBranchOrSelect(Instr);
2110 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
2111 Instr.getOperand(i: CCIdx).getImm())
2112 : AArch64CC::Invalid;
2113}
2114
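/// Return the set of NZCV flags that must be read to evaluate condition \p CC.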
2115static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
2116 assert(CC != AArch64CC::Invalid);
2117 UsedNZCV UsedFlags;
2118 switch (CC) {
2119 default:
2120 break;
2121
2122 case AArch64CC::EQ: // Z set
2123 case AArch64CC::NE: // Z clear
2124 UsedFlags.Z = true;
2125 break;
2126
2127 case AArch64CC::HI: // Z clear and C set
2128 case AArch64CC::LS: // Z set or C clear
2129 UsedFlags.Z = true;
2130 [[fallthrough]];
2131 case AArch64CC::HS: // C set
2132 case AArch64CC::LO: // C clear
2133 UsedFlags.C = true;
2134 break;
2135
2136 case AArch64CC::MI: // N set
2137 case AArch64CC::PL: // N clear
2138 UsedFlags.N = true;
2139 break;
2140
2141 case AArch64CC::VS: // V set
2142 case AArch64CC::VC: // V clear
2143 UsedFlags.V = true;
2144 break;
2145
2146 case AArch64CC::GT: // Z clear, N and V the same
2147 case AArch64CC::LE: // Z set, N and V differ
2148 UsedFlags.Z = true;
2149 [[fallthrough]];
2150 case AArch64CC::GE: // N and V the same
2151 case AArch64CC::LT: // N and V differ
2152 UsedFlags.N = true;
2153 UsedFlags.V = true;
2154 break;
2155 }
2156 return UsedFlags;
2157}
2158
/// \returns The condition flags used after \p CmpInstr within its basic block,
/// provided the NZCV flags are not alive in successors of the block containing
/// both \p CmpInstr and \p MI.
/// \returns std::nullopt otherwise.
///
/// Collects the instructions using those flags in \p CCUseInstrs if provided.
2164std::optional<UsedNZCV>
2165llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
2166 const TargetRegisterInfo &TRI,
2167 SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
2168 MachineBasicBlock *CmpParent = CmpInstr.getParent();
2169 if (MI.getParent() != CmpParent)
2170 return std::nullopt;
2171
2172 if (areCFlagsAliveInSuccessors(MBB: CmpParent))
2173 return std::nullopt;
2174
2175 UsedNZCV NZCVUsedAfterCmp;
2176 for (MachineInstr &Instr : instructionsWithoutDebug(
2177 It: std::next(x: CmpInstr.getIterator()), End: CmpParent->instr_end())) {
2178 if (Instr.readsRegister(Reg: AArch64::NZCV, TRI: &TRI)) {
2179 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
2180 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
2181 return std::nullopt;
2182 NZCVUsedAfterCmp |= getUsedNZCV(CC);
2183 if (CCUseInstrs)
2184 CCUseInstrs->push_back(Elt: &Instr);
2185 }
2186 if (Instr.modifiesRegister(Reg: AArch64::NZCV, TRI: &TRI))
2187 break;
2188 }
2189 return NZCVUsedAfterCmp;
2190}
2191
2192static bool isADDSRegImm(unsigned Opcode) {
2193 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
2194}
2195
2196static bool isSUBSRegImm(unsigned Opcode) {
2197 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
2198}
2199
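// True if the S form of MI's opcode is an ANDS/BICS-style logical operation;
// such flag-setting logical instructions always clear the V flag.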
2200static bool isANDOpcode(MachineInstr &MI) {
2201 unsigned Opc = sForm(Instr&: MI);
2202 switch (Opc) {
2203 case AArch64::ANDSWri:
2204 case AArch64::ANDSWrr:
2205 case AArch64::ANDSWrs:
2206 case AArch64::ANDSXri:
2207 case AArch64::ANDSXrr:
2208 case AArch64::ANDSXrs:
2209 case AArch64::BICSWrr:
2210 case AArch64::BICSXrr:
2211 case AArch64::BICSWrs:
2212 case AArch64::BICSXrs:
2213 return true;
2214 default:
2215 return false;
2216 }
2217}
2218
/// Check if CmpInstr can be substituted by MI.
///
/// CmpInstr can be substituted when:
/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
/// - and, MI and CmpInstr are in the same MachineBB
/// - and, condition flags are not alive in successors of the CmpInstr parent
/// - and, if MI's opcode is the S form, there are no defs of flags between
///        MI and CmpInstr,
///        or, if MI's opcode is not the S form, there are neither defs nor
///        uses of flags between MI and CmpInstr
/// - and, the C flag is not used after CmpInstr, and the V flag is either
///        not used, or MI produces a poison value on signed overflow (nsw),
///        or MI is a flag-setting logical operation that always clears V.
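/// For example, if the only user of the flags after 'SUBSWri %w0, 0' is a
/// b.eq or b.ne (Z flag only), and %w0 is defined by a SUBWrr, the compare
/// can be removed by converting the SUBWrr into SUBSWrr.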
2232static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
2233 const TargetRegisterInfo &TRI) {
  // NOTE: This assertion guarantees that MI.getOpcode() is an add, subtract,
  // or flag-clearing logical (AND/BIC) operation that may or may not set
  // flags.
  assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
2237
2238 const unsigned CmpOpcode = CmpInstr.getOpcode();
2239 if (!isADDSRegImm(Opcode: CmpOpcode) && !isSUBSRegImm(Opcode: CmpOpcode))
2240 return false;
2241
2242 assert((CmpInstr.getOperand(2).isImm() &&
2243 CmpInstr.getOperand(2).getImm() == 0) &&
2244 "Caller guarantees that CmpInstr compares with constant 0");
2245
  std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(MI, CmpInstr, TRI);
  if (!NZCVUsed || NZCVUsed->C)
    return false;
2249
2250 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
2251 // '%vreg = add ...' or '%vreg = sub ...'.
2252 // Condition flag V is used to indicate signed overflow.
2253 // 1) MI and CmpInstr set N and V to the same value.
2254 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
2255 // signed overflow occurs, so CmpInstr could still be simplified away.
2256 // Note that Ands and Bics instructions always clear the V flag.
  if (NZCVUsed->V && !MI.getFlag(MachineInstr::NoSWrap) && !isANDOpcode(MI))
    return false;
2259
2260 AccessKind AccessToCheck = AK_Write;
2261 if (sForm(Instr&: MI) != MI.getOpcode())
2262 AccessToCheck = AK_All;
2263 return !areCFlagsAccessedBetweenInstrs(From: &MI, To: &CmpInstr, TRI: &TRI, AccessToCheck);
2264}
2265
2266/// Substitute an instruction comparing to zero with another instruction
2267/// which produces needed condition flags.
2268///
2269/// Return true on success.
2270bool AArch64InstrInfo::substituteCmpToZero(
2271 MachineInstr &CmpInstr, unsigned SrcReg,
2272 const MachineRegisterInfo &MRI) const {
2273 // Get the unique definition of SrcReg.
2274 MachineInstr *MI = MRI.getUniqueVRegDef(Reg: SrcReg);
2275 if (!MI)
2276 return false;
2277
2278 const TargetRegisterInfo &TRI = getRegisterInfo();
2279
2280 unsigned NewOpc = sForm(Instr&: *MI);
2281 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
2282 return false;
2283
2284 if (!canInstrSubstituteCmpInstr(MI&: *MI, CmpInstr, TRI))
2285 return false;
2286
2287 // Update the instruction to set NZCV.
2288 MI->setDesc(get(Opcode: NewOpc));
2289 CmpInstr.eraseFromParent();
2290 bool succeeded = UpdateOperandRegClass(Instr&: *MI);
2291 (void)succeeded;
2292 assert(succeeded && "Some operands reg class are incompatible!");
2293 MI->addRegisterDefined(Reg: AArch64::NZCV, RegInfo: &TRI);
2294 return true;
2295}
2296
2297/// \returns True if \p CmpInstr can be removed.
2298///
2299/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
2300/// codes used in \p CCUseInstrs must be inverted.
2301static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
2302 int CmpValue, const TargetRegisterInfo &TRI,
2303 SmallVectorImpl<MachineInstr *> &CCUseInstrs,
2304 bool &IsInvertCC) {
2305 assert((CmpValue == 0 || CmpValue == 1) &&
2306 "Only comparisons to 0 or 1 considered for removal!");
2307
2308 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
2309 unsigned MIOpc = MI.getOpcode();
2310 if (MIOpc == AArch64::CSINCWr) {
2311 if (MI.getOperand(i: 1).getReg() != AArch64::WZR ||
2312 MI.getOperand(i: 2).getReg() != AArch64::WZR)
2313 return false;
2314 } else if (MIOpc == AArch64::CSINCXr) {
2315 if (MI.getOperand(i: 1).getReg() != AArch64::XZR ||
2316 MI.getOperand(i: 2).getReg() != AArch64::XZR)
2317 return false;
2318 } else {
2319 return false;
2320 }
2321 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(Instr: MI);
2322 if (MICC == AArch64CC::Invalid)
2323 return false;
2324
  // Bail if MI carries a dead definition of NZCV.
2326 if (MI.findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true) != -1)
2327 return false;
2328
2329 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
2330 const unsigned CmpOpcode = CmpInstr.getOpcode();
2331 bool IsSubsRegImm = isSUBSRegImm(Opcode: CmpOpcode);
2332 if (CmpValue && !IsSubsRegImm)
2333 return false;
2334 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(Opcode: CmpOpcode))
2335 return false;
2336
2337 // MI conditions allowed: eq, ne, mi, pl
2338 UsedNZCV MIUsedNZCV = getUsedNZCV(CC: MICC);
2339 if (MIUsedNZCV.C || MIUsedNZCV.V)
2340 return false;
2341
2342 std::optional<UsedNZCV> NZCVUsedAfterCmp =
2343 examineCFlagsUse(MI, CmpInstr, TRI, CCUseInstrs: &CCUseInstrs);
  // Condition flags must not be used in the CmpInstr basic block successors,
  // and only the Z or N flags are allowed to be used after CmpInstr within
  // its basic block.
2346 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
2347 return false;
2348 // Z or N flag used after CmpInstr must correspond to the flag used in MI
2349 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
2350 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
2351 return false;
  // If CmpInstr is a comparison to zero, MI conditions are limited to eq, ne.
2353 if (MIUsedNZCV.N && !CmpValue)
2354 return false;
2355
2356 // There must be no defs of flags between MI and CmpInstr
2357 if (areCFlagsAccessedBetweenInstrs(From: &MI, To: &CmpInstr, TRI: &TRI, AccessToCheck: AK_Write))
2358 return false;
2359
2360 // Condition code is inverted in the following cases:
2361 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2362 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
2363 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
2364 (!CmpValue && MICC == AArch64CC::NE);
2365 return true;
2366}
2367
2368/// Remove comparison in csinc-cmp sequence
2369///
2370/// Examples:
2371/// 1. \code
2372/// csinc w9, wzr, wzr, ne
2373/// cmp w9, #0
2374/// b.eq
2375/// \endcode
2376/// to
2377/// \code
2378/// csinc w9, wzr, wzr, ne
2379/// b.ne
2380/// \endcode
2381///
2382/// 2. \code
2383/// csinc x2, xzr, xzr, mi
2384/// cmp x2, #1
2385/// b.pl
2386/// \endcode
2387/// to
2388/// \code
2389/// csinc x2, xzr, xzr, mi
2390/// b.pl
2391/// \endcode
2392///
2393/// \param CmpInstr comparison instruction
2394/// \return True when comparison removed
2395bool AArch64InstrInfo::removeCmpToZeroOrOne(
2396 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
2397 const MachineRegisterInfo &MRI) const {
2398 MachineInstr *MI = MRI.getUniqueVRegDef(Reg: SrcReg);
2399 if (!MI)
2400 return false;
2401 const TargetRegisterInfo &TRI = getRegisterInfo();
2402 SmallVector<MachineInstr *, 4> CCUseInstrs;
2403 bool IsInvertCC = false;
2404 if (!canCmpInstrBeRemoved(MI&: *MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
2405 IsInvertCC))
2406 return false;
2407 // Make transformation
2408 CmpInstr.eraseFromParent();
2409 if (IsInvertCC) {
2410 // Invert condition codes in CmpInstr CC users
2411 for (MachineInstr *CCUseInstr : CCUseInstrs) {
2412 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(Instr: *CCUseInstr);
2413 assert(Idx >= 0 && "Unexpected instruction using CC.");
2414 MachineOperand &CCOperand = CCUseInstr->getOperand(i: Idx);
2415 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
2416 Code: static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
2417 CCOperand.setImm(CCUse);
2418 }
2419 }
2420 return true;
2421}
2422
2423bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2424 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
2425 MI.getOpcode() != AArch64::CATCHRET)
2426 return false;
2427
2428 MachineBasicBlock &MBB = *MI.getParent();
2429 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
2430 auto TRI = Subtarget.getRegisterInfo();
2431 DebugLoc DL = MI.getDebugLoc();
2432
2433 if (MI.getOpcode() == AArch64::CATCHRET) {
2434 // Skip to the first instruction before the epilog.
2435 const TargetInstrInfo *TII =
2436 MBB.getParent()->getSubtarget().getInstrInfo();
2437 MachineBasicBlock *TargetMBB = MI.getOperand(i: 0).getMBB();
2438 auto MBBI = MachineBasicBlock::iterator(MI);
2439 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(x: MBBI);
2440 while (FirstEpilogSEH->getFlag(Flag: MachineInstr::FrameDestroy) &&
2441 FirstEpilogSEH != MBB.begin())
2442 FirstEpilogSEH = std::prev(x: FirstEpilogSEH);
2443 if (FirstEpilogSEH != MBB.begin())
2444 FirstEpilogSEH = std::next(x: FirstEpilogSEH);
2445 BuildMI(BB&: MBB, I: FirstEpilogSEH, MIMD: DL, MCID: TII->get(Opcode: AArch64::ADRP))
2446 .addReg(RegNo: AArch64::X0, Flags: RegState::Define)
2447 .addMBB(MBB: TargetMBB);
2448 BuildMI(BB&: MBB, I: FirstEpilogSEH, MIMD: DL, MCID: TII->get(Opcode: AArch64::ADDXri))
2449 .addReg(RegNo: AArch64::X0, Flags: RegState::Define)
2450 .addReg(RegNo: AArch64::X0)
2451 .addMBB(MBB: TargetMBB)
2452 .addImm(Val: 0);
2453 TargetMBB->setMachineBlockAddressTaken();
2454 return true;
2455 }
2456
2457 Register Reg = MI.getOperand(i: 0).getReg();
2458 Module &M = *MBB.getParent()->getFunction().getParent();
2459 if (M.getStackProtectorGuard() == "sysreg") {
2460 const AArch64SysReg::SysReg *SrcReg =
2461 AArch64SysReg::lookupSysRegByName(Name: M.getStackProtectorGuardReg());
2462 if (!SrcReg)
2463 report_fatal_error(reason: "Unknown SysReg for Stack Protector Guard Register");
2464
2465 // mrs xN, sysreg
2466 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MRS))
2467 .addDef(RegNo: Reg, Flags: RegState::Renamable)
2468 .addImm(Val: SrcReg->Encoding);
2469 int Offset = M.getStackProtectorGuardOffset();
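    // LDRXui takes a scaled unsigned 12-bit offset (a multiple of 8 up to
    // 32760); LDURXi takes an unscaled signed 9-bit offset (-256..255);
    // larger offsets are materialized with an ADD/SUB first.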
2470 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2471 // ldr xN, [xN, #offset]
2472 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui))
2473 .addDef(RegNo: Reg)
2474 .addUse(RegNo: Reg, Flags: RegState::Kill)
2475 .addImm(Val: Offset / 8);
2476 } else if (Offset >= -256 && Offset <= 255) {
2477 // ldur xN, [xN, #offset]
2478 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDURXi))
2479 .addDef(RegNo: Reg)
2480 .addUse(RegNo: Reg, Flags: RegState::Kill)
2481 .addImm(Val: Offset);
2482 } else if (Offset >= -4095 && Offset <= 4095) {
2483 if (Offset > 0) {
2484 // add xN, xN, #offset
2485 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::ADDXri))
2486 .addDef(RegNo: Reg)
2487 .addUse(RegNo: Reg, Flags: RegState::Kill)
2488 .addImm(Val: Offset)
2489 .addImm(Val: 0);
2490 } else {
2491 // sub xN, xN, #offset
2492 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::SUBXri))
2493 .addDef(RegNo: Reg)
2494 .addUse(RegNo: Reg, Flags: RegState::Kill)
2495 .addImm(Val: -Offset)
2496 .addImm(Val: 0);
2497 }
2498 // ldr xN, [xN]
2499 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui))
2500 .addDef(RegNo: Reg)
2501 .addUse(RegNo: Reg, Flags: RegState::Kill)
2502 .addImm(Val: 0);
2503 } else {
      // Remaining cases: offsets outside the range +/- 4095 that are either
      // not a multiple of 8 or larger than 32760.
2506 // It might be nice to use AArch64::MOVi32imm here, which would get
2507 // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2508 // contains the MRS result. findScratchNonCalleeSaveRegister() in
2509 // AArch64FrameLowering might help us find such a scratch register
2510 // though. If we failed to find a scratch register, we could emit a
2511 // stream of add instructions to build up the immediate. Or, we could try
2512 // to insert a AArch64::MOVi32imm before register allocation so that we
2513 // didn't need to scavenge for a scratch register.
2514 report_fatal_error(reason: "Unable to encode Stack Protector Guard Offset");
2515 }
2516 MBB.erase(I: MI);
2517 return true;
2518 }
2519
2520 const GlobalValue *GV =
2521 cast<GlobalValue>(Val: (*MI.memoperands_begin())->getValue());
2522 const TargetMachine &TM = MBB.getParent()->getTarget();
2523 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2524 const unsigned char MO_NC = AArch64II::MO_NC;
2525
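  // Expand LOAD_STACK_GUARD: load the guard pointer through the GOT, through
  // an absolute MOVZ/MOVK sequence (large code model), with ADR (tiny code
  // model), or with ADRP plus a page-offset load otherwise.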
2526 if ((OpFlags & AArch64II::MO_GOT) != 0) {
2527 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LOADgot), DestReg: Reg)
2528 .addGlobalAddress(GV, Offset: 0, TargetFlags: OpFlags);
2529 if (Subtarget.isTargetILP32()) {
2530 unsigned Reg32 = TRI->getSubReg(Reg, Idx: AArch64::sub_32);
2531 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRWui))
2532 .addDef(RegNo: Reg32, Flags: RegState::Dead)
2533 .addUse(RegNo: Reg, Flags: RegState::Kill)
2534 .addImm(Val: 0)
2535 .addMemOperand(MMO: *MI.memoperands_begin())
2536 .addDef(RegNo: Reg, Flags: RegState::Implicit);
2537 } else {
2538 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui), DestReg: Reg)
2539 .addReg(RegNo: Reg, Flags: RegState::Kill)
2540 .addImm(Val: 0)
2541 .addMemOperand(MMO: *MI.memoperands_begin());
2542 }
2543 } else if (TM.getCodeModel() == CodeModel::Large) {
2544 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
2545 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MOVZXi), DestReg: Reg)
2546 .addGlobalAddress(GV, Offset: 0, TargetFlags: AArch64II::MO_G0 | MO_NC)
2547 .addImm(Val: 0);
2548 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MOVKXi), DestReg: Reg)
2549 .addReg(RegNo: Reg, Flags: RegState::Kill)
2550 .addGlobalAddress(GV, Offset: 0, TargetFlags: AArch64II::MO_G1 | MO_NC)
2551 .addImm(Val: 16);
2552 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MOVKXi), DestReg: Reg)
2553 .addReg(RegNo: Reg, Flags: RegState::Kill)
2554 .addGlobalAddress(GV, Offset: 0, TargetFlags: AArch64II::MO_G2 | MO_NC)
2555 .addImm(Val: 32);
2556 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MOVKXi), DestReg: Reg)
2557 .addReg(RegNo: Reg, Flags: RegState::Kill)
2558 .addGlobalAddress(GV, Offset: 0, TargetFlags: AArch64II::MO_G3)
2559 .addImm(Val: 48);
2560 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui), DestReg: Reg)
2561 .addReg(RegNo: Reg, Flags: RegState::Kill)
2562 .addImm(Val: 0)
2563 .addMemOperand(MMO: *MI.memoperands_begin());
2564 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2565 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::ADR), DestReg: Reg)
2566 .addGlobalAddress(GV, Offset: 0, TargetFlags: OpFlags);
2567 } else {
2568 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::ADRP), DestReg: Reg)
2569 .addGlobalAddress(GV, Offset: 0, TargetFlags: OpFlags | AArch64II::MO_PAGE);
2570 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2571 if (Subtarget.isTargetILP32()) {
2572 unsigned Reg32 = TRI->getSubReg(Reg, Idx: AArch64::sub_32);
2573 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRWui))
2574 .addDef(RegNo: Reg32, Flags: RegState::Dead)
2575 .addUse(RegNo: Reg, Flags: RegState::Kill)
2576 .addGlobalAddress(GV, Offset: 0, TargetFlags: LoFlags)
2577 .addMemOperand(MMO: *MI.memoperands_begin())
2578 .addDef(RegNo: Reg, Flags: RegState::Implicit);
2579 } else {
2580 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui), DestReg: Reg)
2581 .addReg(RegNo: Reg, Flags: RegState::Kill)
2582 .addGlobalAddress(GV, Offset: 0, TargetFlags: LoFlags)
2583 .addMemOperand(MMO: *MI.memoperands_begin());
2584 }
2585 }
2586
2587 MBB.erase(I: MI);
2588
2589 return true;
2590}
2591
2592// Return true if this instruction simply sets its single destination register
2593// to zero. This is equivalent to a register rename of the zero-register.
2594bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
2595 switch (MI.getOpcode()) {
2596 default:
2597 break;
2598 case AArch64::MOVZWi:
2599 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2600 if (MI.getOperand(i: 1).isImm() && MI.getOperand(i: 1).getImm() == 0) {
2601 assert(MI.getDesc().getNumOperands() == 3 &&
2602 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2603 return true;
2604 }
2605 break;
2606 case AArch64::ANDWri: // and Rd, Rzr, #imm
2607 return MI.getOperand(i: 1).getReg() == AArch64::WZR;
2608 case AArch64::ANDXri:
2609 return MI.getOperand(i: 1).getReg() == AArch64::XZR;
2610 case TargetOpcode::COPY:
2611 return MI.getOperand(i: 1).getReg() == AArch64::WZR;
2612 }
2613 return false;
2614}
2615
2616// Return true if this instruction simply renames a general register without
2617// modifying bits.
2618bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
2619 switch (MI.getOpcode()) {
2620 default:
2621 break;
2622 case TargetOpcode::COPY: {
    // GPR32 copies will be lowered to ORRXrs.
2624 Register DstReg = MI.getOperand(i: 0).getReg();
2625 return (AArch64::GPR32RegClass.contains(Reg: DstReg) ||
2626 AArch64::GPR64RegClass.contains(Reg: DstReg));
2627 }
2628 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2629 if (MI.getOperand(i: 1).getReg() == AArch64::XZR) {
2630 assert(MI.getDesc().getNumOperands() == 4 &&
2631 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2632 return true;
2633 }
2634 break;
2635 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2636 if (MI.getOperand(i: 2).getImm() == 0) {
2637 assert(MI.getDesc().getNumOperands() == 4 &&
2638 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2639 return true;
2640 }
2641 break;
2642 }
2643 return false;
2644}
2645
// Return true if this instruction simply renames a floating-point register
// without modifying bits.
2648bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
2649 switch (MI.getOpcode()) {
2650 default:
2651 break;
2652 case TargetOpcode::COPY: {
2653 Register DstReg = MI.getOperand(i: 0).getReg();
2654 return AArch64::FPR128RegClass.contains(Reg: DstReg);
2655 }
2656 case AArch64::ORRv16i8:
2657 if (MI.getOperand(i: 1).getReg() == MI.getOperand(i: 2).getReg()) {
2658 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2659 "invalid ORRv16i8 operands");
2660 return true;
2661 }
2662 break;
2663 }
2664 return false;
2665}
2666
2667static bool isFrameLoadOpcode(int Opcode) {
2668 switch (Opcode) {
2669 default:
2670 return false;
2671 case AArch64::LDRWui:
2672 case AArch64::LDRXui:
2673 case AArch64::LDRBui:
2674 case AArch64::LDRHui:
2675 case AArch64::LDRSui:
2676 case AArch64::LDRDui:
2677 case AArch64::LDRQui:
2678 case AArch64::LDR_PXI:
2679 return true;
2680 }
2681}
2682
2683Register AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
2684 int &FrameIndex) const {
2685 if (!isFrameLoadOpcode(Opcode: MI.getOpcode()))
2686 return Register();
2687
2688 if (MI.getOperand(i: 0).getSubReg() == 0 && MI.getOperand(i: 1).isFI() &&
2689 MI.getOperand(i: 2).isImm() && MI.getOperand(i: 2).getImm() == 0) {
2690 FrameIndex = MI.getOperand(i: 1).getIndex();
2691 return MI.getOperand(i: 0).getReg();
2692 }
2693 return Register();
2694}
2695
2696static bool isFrameStoreOpcode(int Opcode) {
2697 switch (Opcode) {
2698 default:
2699 return false;
2700 case AArch64::STRWui:
2701 case AArch64::STRXui:
2702 case AArch64::STRBui:
2703 case AArch64::STRHui:
2704 case AArch64::STRSui:
2705 case AArch64::STRDui:
2706 case AArch64::STRQui:
2707 case AArch64::STR_PXI:
2708 return true;
2709 }
2710}
2711
2712Register AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
2713 int &FrameIndex) const {
2714 if (!isFrameStoreOpcode(Opcode: MI.getOpcode()))
2715 return Register();
2716
2717 if (MI.getOperand(i: 0).getSubReg() == 0 && MI.getOperand(i: 1).isFI() &&
2718 MI.getOperand(i: 2).isImm() && MI.getOperand(i: 2).getImm() == 0) {
2719 FrameIndex = MI.getOperand(i: 1).getIndex();
2720 return MI.getOperand(i: 0).getReg();
2721 }
2722 return Register();
2723}
2724
2725Register AArch64InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
2726 int &FrameIndex) const {
2727 if (!isFrameStoreOpcode(Opcode: MI.getOpcode()))
2728 return Register();
2729
2730 if (Register Reg = isStoreToStackSlot(MI, FrameIndex))
2731 return Reg;
2732
2733 SmallVector<const MachineMemOperand *, 1> Accesses;
2734 if (hasStoreToStackSlot(MI, Accesses)) {
2735 if (Accesses.size() > 1)
2736 return Register();
2737
2738 FrameIndex =
2739 cast<FixedStackPseudoSourceValue>(Val: Accesses.front()->getPseudoValue())
2740 ->getFrameIndex();
2741 return MI.getOperand(i: 0).getReg();
2742 }
2743 return Register();
2744}
2745
2746Register AArch64InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
2747 int &FrameIndex) const {
2748 if (!isFrameLoadOpcode(Opcode: MI.getOpcode()))
2749 return Register();
2750
2751 if (Register Reg = isLoadFromStackSlot(MI, FrameIndex))
2752 return Reg;
2753
2754 SmallVector<const MachineMemOperand *, 1> Accesses;
2755 if (hasLoadFromStackSlot(MI, Accesses)) {
2756 if (Accesses.size() > 1)
2757 return Register();
2758
2759 FrameIndex =
2760 cast<FixedStackPseudoSourceValue>(Val: Accesses.front()->getPseudoValue())
2761 ->getFrameIndex();
2762 return MI.getOperand(i: 0).getReg();
2763 }
2764 return Register();
2765}
2766
2767/// Check all MachineMemOperands for a hint to suppress pairing.
2768bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2769 return llvm::any_of(Range: MI.memoperands(), P: [](MachineMemOperand *MMO) {
2770 return MMO->getFlags() & MOSuppressPair;
2771 });
2772}
2773
2774/// Set a flag on the first MachineMemOperand to suppress pairing.
2775void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2776 if (MI.memoperands_empty())
2777 return;
2778 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2779}
2780
2781/// Check all MachineMemOperands for a hint that the load/store is strided.
2782bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
2783 return llvm::any_of(Range: MI.memoperands(), P: [](MachineMemOperand *MMO) {
2784 return MMO->getFlags() & MOStridedAccess;
2785 });
2786}
2787
2788bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
2789 switch (Opc) {
2790 default:
2791 return false;
2792 case AArch64::STURSi:
2793 case AArch64::STRSpre:
2794 case AArch64::STURDi:
2795 case AArch64::STRDpre:
2796 case AArch64::STURQi:
2797 case AArch64::STRQpre:
2798 case AArch64::STURBBi:
2799 case AArch64::STURHHi:
2800 case AArch64::STURWi:
2801 case AArch64::STRWpre:
2802 case AArch64::STURXi:
2803 case AArch64::STRXpre:
2804 case AArch64::LDURSi:
2805 case AArch64::LDRSpre:
2806 case AArch64::LDURDi:
2807 case AArch64::LDRDpre:
2808 case AArch64::LDURQi:
2809 case AArch64::LDRQpre:
2810 case AArch64::LDURWi:
2811 case AArch64::LDRWpre:
2812 case AArch64::LDURXi:
2813 case AArch64::LDRXpre:
2814 case AArch64::LDRSWpre:
2815 case AArch64::LDURSWi:
2816 case AArch64::LDURHHi:
2817 case AArch64::LDURBBi:
2818 case AArch64::LDURSBWi:
2819 case AArch64::LDURSHWi:
2820 return true;
2821 }
2822}
2823
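/// Map a scaled load/store opcode to its unscaled (LDUR/STUR-style)
/// equivalent, if one exists.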
2824std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2825 switch (Opc) {
2826 default: return {};
2827 case AArch64::PRFMui: return AArch64::PRFUMi;
2828 case AArch64::LDRXui: return AArch64::LDURXi;
2829 case AArch64::LDRWui: return AArch64::LDURWi;
2830 case AArch64::LDRBui: return AArch64::LDURBi;
2831 case AArch64::LDRHui: return AArch64::LDURHi;
2832 case AArch64::LDRSui: return AArch64::LDURSi;
2833 case AArch64::LDRDui: return AArch64::LDURDi;
2834 case AArch64::LDRQui: return AArch64::LDURQi;
2835 case AArch64::LDRBBui: return AArch64::LDURBBi;
2836 case AArch64::LDRHHui: return AArch64::LDURHHi;
2837 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2838 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2839 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2840 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2841 case AArch64::LDRSWui: return AArch64::LDURSWi;
2842 case AArch64::STRXui: return AArch64::STURXi;
2843 case AArch64::STRWui: return AArch64::STURWi;
2844 case AArch64::STRBui: return AArch64::STURBi;
2845 case AArch64::STRHui: return AArch64::STURHi;
2846 case AArch64::STRSui: return AArch64::STURSi;
2847 case AArch64::STRDui: return AArch64::STURDi;
2848 case AArch64::STRQui: return AArch64::STURQi;
2849 case AArch64::STRBBui: return AArch64::STURBBi;
2850 case AArch64::STRHHui: return AArch64::STURHHi;
2851 }
2852}
2853
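// Return the operand index of the immediate offset: 2 for simple reg+imm
// forms, 3 where an extra operand precedes the base register (a write-back
// def or the second register of a pair), and 4 for write-back pair forms.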
2854unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
2855 switch (Opc) {
2856 default:
2857 llvm_unreachable("Unhandled Opcode in getLoadStoreImmIdx");
2858 case AArch64::ADDG:
2859 case AArch64::LDAPURBi:
2860 case AArch64::LDAPURHi:
2861 case AArch64::LDAPURi:
2862 case AArch64::LDAPURSBWi:
2863 case AArch64::LDAPURSBXi:
2864 case AArch64::LDAPURSHWi:
2865 case AArch64::LDAPURSHXi:
2866 case AArch64::LDAPURSWi:
2867 case AArch64::LDAPURXi:
2868 case AArch64::LDR_PPXI:
2869 case AArch64::LDR_PXI:
2870 case AArch64::LDR_ZXI:
2871 case AArch64::LDR_ZZXI:
2872 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
2873 case AArch64::LDR_ZZZXI:
2874 case AArch64::LDR_ZZZZXI:
2875 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
2876 case AArch64::LDRBBui:
2877 case AArch64::LDRBui:
2878 case AArch64::LDRDui:
2879 case AArch64::LDRHHui:
2880 case AArch64::LDRHui:
2881 case AArch64::LDRQui:
2882 case AArch64::LDRSBWui:
2883 case AArch64::LDRSBXui:
2884 case AArch64::LDRSHWui:
2885 case AArch64::LDRSHXui:
2886 case AArch64::LDRSui:
2887 case AArch64::LDRSWui:
2888 case AArch64::LDRWui:
2889 case AArch64::LDRXui:
2890 case AArch64::LDURBBi:
2891 case AArch64::LDURBi:
2892 case AArch64::LDURDi:
2893 case AArch64::LDURHHi:
2894 case AArch64::LDURHi:
2895 case AArch64::LDURQi:
2896 case AArch64::LDURSBWi:
2897 case AArch64::LDURSBXi:
2898 case AArch64::LDURSHWi:
2899 case AArch64::LDURSHXi:
2900 case AArch64::LDURSi:
2901 case AArch64::LDURSWi:
2902 case AArch64::LDURWi:
2903 case AArch64::LDURXi:
2904 case AArch64::PRFMui:
2905 case AArch64::PRFUMi:
2906 case AArch64::ST2Gi:
2907 case AArch64::STGi:
2908 case AArch64::STLURBi:
2909 case AArch64::STLURHi:
2910 case AArch64::STLURWi:
2911 case AArch64::STLURXi:
2912 case AArch64::StoreSwiftAsyncContext:
2913 case AArch64::STR_PPXI:
2914 case AArch64::STR_PXI:
2915 case AArch64::STR_ZXI:
2916 case AArch64::STR_ZZXI:
2917 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
2918 case AArch64::STR_ZZZXI:
2919 case AArch64::STR_ZZZZXI:
2920 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
2921 case AArch64::STRBBui:
2922 case AArch64::STRBui:
2923 case AArch64::STRDui:
2924 case AArch64::STRHHui:
2925 case AArch64::STRHui:
2926 case AArch64::STRQui:
2927 case AArch64::STRSui:
2928 case AArch64::STRWui:
2929 case AArch64::STRXui:
2930 case AArch64::STURBBi:
2931 case AArch64::STURBi:
2932 case AArch64::STURDi:
2933 case AArch64::STURHHi:
2934 case AArch64::STURHi:
2935 case AArch64::STURQi:
2936 case AArch64::STURSi:
2937 case AArch64::STURWi:
2938 case AArch64::STURXi:
2939 case AArch64::STZ2Gi:
2940 case AArch64::STZGi:
2941 case AArch64::TAGPstack:
2942 return 2;
2943 case AArch64::LD1B_D_IMM:
2944 case AArch64::LD1B_H_IMM:
2945 case AArch64::LD1B_IMM:
2946 case AArch64::LD1B_S_IMM:
2947 case AArch64::LD1D_IMM:
2948 case AArch64::LD1H_D_IMM:
2949 case AArch64::LD1H_IMM:
2950 case AArch64::LD1H_S_IMM:
2951 case AArch64::LD1RB_D_IMM:
2952 case AArch64::LD1RB_H_IMM:
2953 case AArch64::LD1RB_IMM:
2954 case AArch64::LD1RB_S_IMM:
2955 case AArch64::LD1RD_IMM:
2956 case AArch64::LD1RH_D_IMM:
2957 case AArch64::LD1RH_IMM:
2958 case AArch64::LD1RH_S_IMM:
2959 case AArch64::LD1RSB_D_IMM:
2960 case AArch64::LD1RSB_H_IMM:
2961 case AArch64::LD1RSB_S_IMM:
2962 case AArch64::LD1RSH_D_IMM:
2963 case AArch64::LD1RSH_S_IMM:
2964 case AArch64::LD1RSW_IMM:
2965 case AArch64::LD1RW_D_IMM:
2966 case AArch64::LD1RW_IMM:
2967 case AArch64::LD1SB_D_IMM:
2968 case AArch64::LD1SB_H_IMM:
2969 case AArch64::LD1SB_S_IMM:
2970 case AArch64::LD1SH_D_IMM:
2971 case AArch64::LD1SH_S_IMM:
2972 case AArch64::LD1SW_D_IMM:
2973 case AArch64::LD1W_D_IMM:
2974 case AArch64::LD1W_IMM:
2975 case AArch64::LD2B_IMM:
2976 case AArch64::LD2D_IMM:
2977 case AArch64::LD2H_IMM:
2978 case AArch64::LD2W_IMM:
2979 case AArch64::LD3B_IMM:
2980 case AArch64::LD3D_IMM:
2981 case AArch64::LD3H_IMM:
2982 case AArch64::LD3W_IMM:
2983 case AArch64::LD4B_IMM:
2984 case AArch64::LD4D_IMM:
2985 case AArch64::LD4H_IMM:
2986 case AArch64::LD4W_IMM:
2987 case AArch64::LDG:
2988 case AArch64::LDNF1B_D_IMM:
2989 case AArch64::LDNF1B_H_IMM:
2990 case AArch64::LDNF1B_IMM:
2991 case AArch64::LDNF1B_S_IMM:
2992 case AArch64::LDNF1D_IMM:
2993 case AArch64::LDNF1H_D_IMM:
2994 case AArch64::LDNF1H_IMM:
2995 case AArch64::LDNF1H_S_IMM:
2996 case AArch64::LDNF1SB_D_IMM:
2997 case AArch64::LDNF1SB_H_IMM:
2998 case AArch64::LDNF1SB_S_IMM:
2999 case AArch64::LDNF1SH_D_IMM:
3000 case AArch64::LDNF1SH_S_IMM:
3001 case AArch64::LDNF1SW_D_IMM:
3002 case AArch64::LDNF1W_D_IMM:
3003 case AArch64::LDNF1W_IMM:
3004 case AArch64::LDNPDi:
3005 case AArch64::LDNPQi:
3006 case AArch64::LDNPSi:
3007 case AArch64::LDNPWi:
3008 case AArch64::LDNPXi:
3009 case AArch64::LDNT1B_ZRI:
3010 case AArch64::LDNT1D_ZRI:
3011 case AArch64::LDNT1H_ZRI:
3012 case AArch64::LDNT1W_ZRI:
3013 case AArch64::LDPDi:
3014 case AArch64::LDPQi:
3015 case AArch64::LDPSi:
3016 case AArch64::LDPWi:
3017 case AArch64::LDPXi:
3018 case AArch64::LDRBBpost:
3019 case AArch64::LDRBBpre:
3020 case AArch64::LDRBpost:
3021 case AArch64::LDRBpre:
3022 case AArch64::LDRDpost:
3023 case AArch64::LDRDpre:
3024 case AArch64::LDRHHpost:
3025 case AArch64::LDRHHpre:
3026 case AArch64::LDRHpost:
3027 case AArch64::LDRHpre:
3028 case AArch64::LDRQpost:
3029 case AArch64::LDRQpre:
3030 case AArch64::LDRSpost:
3031 case AArch64::LDRSpre:
3032 case AArch64::LDRWpost:
3033 case AArch64::LDRWpre:
3034 case AArch64::LDRXpost:
3035 case AArch64::LDRXpre:
3036 case AArch64::ST1B_D_IMM:
3037 case AArch64::ST1B_H_IMM:
3038 case AArch64::ST1B_IMM:
3039 case AArch64::ST1B_S_IMM:
3040 case AArch64::ST1D_IMM:
3041 case AArch64::ST1H_D_IMM:
3042 case AArch64::ST1H_IMM:
3043 case AArch64::ST1H_S_IMM:
3044 case AArch64::ST1W_D_IMM:
3045 case AArch64::ST1W_IMM:
3046 case AArch64::ST2B_IMM:
3047 case AArch64::ST2D_IMM:
3048 case AArch64::ST2H_IMM:
3049 case AArch64::ST2W_IMM:
3050 case AArch64::ST3B_IMM:
3051 case AArch64::ST3D_IMM:
3052 case AArch64::ST3H_IMM:
3053 case AArch64::ST3W_IMM:
3054 case AArch64::ST4B_IMM:
3055 case AArch64::ST4D_IMM:
3056 case AArch64::ST4H_IMM:
3057 case AArch64::ST4W_IMM:
3058 case AArch64::STGPi:
3059 case AArch64::STGPreIndex:
3060 case AArch64::STZGPreIndex:
3061 case AArch64::ST2GPreIndex:
3062 case AArch64::STZ2GPreIndex:
3063 case AArch64::STGPostIndex:
3064 case AArch64::STZGPostIndex:
3065 case AArch64::ST2GPostIndex:
3066 case AArch64::STZ2GPostIndex:
3067 case AArch64::STNPDi:
3068 case AArch64::STNPQi:
3069 case AArch64::STNPSi:
3070 case AArch64::STNPWi:
3071 case AArch64::STNPXi:
3072 case AArch64::STNT1B_ZRI:
3073 case AArch64::STNT1D_ZRI:
3074 case AArch64::STNT1H_ZRI:
3075 case AArch64::STNT1W_ZRI:
3076 case AArch64::STPDi:
3077 case AArch64::STPQi:
3078 case AArch64::STPSi:
3079 case AArch64::STPWi:
3080 case AArch64::STPXi:
3081 case AArch64::STRBBpost:
3082 case AArch64::STRBBpre:
3083 case AArch64::STRBpost:
3084 case AArch64::STRBpre:
3085 case AArch64::STRDpost:
3086 case AArch64::STRDpre:
3087 case AArch64::STRHHpost:
3088 case AArch64::STRHHpre:
3089 case AArch64::STRHpost:
3090 case AArch64::STRHpre:
3091 case AArch64::STRQpost:
3092 case AArch64::STRQpre:
3093 case AArch64::STRSpost:
3094 case AArch64::STRSpre:
3095 case AArch64::STRWpost:
3096 case AArch64::STRWpre:
3097 case AArch64::STRXpost:
3098 case AArch64::STRXpre:
3099 return 3;
3100 case AArch64::LDPDpost:
3101 case AArch64::LDPDpre:
3102 case AArch64::LDPQpost:
3103 case AArch64::LDPQpre:
3104 case AArch64::LDPSpost:
3105 case AArch64::LDPSpre:
3106 case AArch64::LDPWpost:
3107 case AArch64::LDPWpre:
3108 case AArch64::LDPXpost:
3109 case AArch64::LDPXpre:
3110 case AArch64::STGPpre:
3111 case AArch64::STGPpost:
3112 case AArch64::STPDpost:
3113 case AArch64::STPDpre:
3114 case AArch64::STPQpost:
3115 case AArch64::STPQpre:
3116 case AArch64::STPSpost:
3117 case AArch64::STPSpre:
3118 case AArch64::STPWpost:
3119 case AArch64::STPWpre:
3120 case AArch64::STPXpost:
3121 case AArch64::STPXpre:
3122 return 4;
3123 }
3124}
3125
3126bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
3127 switch (MI.getOpcode()) {
3128 default:
3129 return false;
3130 // Scaled instructions.
3131 case AArch64::STRSui:
3132 case AArch64::STRDui:
3133 case AArch64::STRQui:
3134 case AArch64::STRXui:
3135 case AArch64::STRWui:
3136 case AArch64::LDRSui:
3137 case AArch64::LDRDui:
3138 case AArch64::LDRQui:
3139 case AArch64::LDRXui:
3140 case AArch64::LDRWui:
3141 case AArch64::LDRSWui:
3142 // Unscaled instructions.
3143 case AArch64::STURSi:
3144 case AArch64::STRSpre:
3145 case AArch64::STURDi:
3146 case AArch64::STRDpre:
3147 case AArch64::STURQi:
3148 case AArch64::STRQpre:
3149 case AArch64::STURWi:
3150 case AArch64::STRWpre:
3151 case AArch64::STURXi:
3152 case AArch64::STRXpre:
3153 case AArch64::LDURSi:
3154 case AArch64::LDRSpre:
3155 case AArch64::LDURDi:
3156 case AArch64::LDRDpre:
3157 case AArch64::LDURQi:
3158 case AArch64::LDRQpre:
3159 case AArch64::LDURWi:
3160 case AArch64::LDRWpre:
3161 case AArch64::LDURXi:
3162 case AArch64::LDRXpre:
3163 case AArch64::LDURSWi:
3164 case AArch64::LDRSWpre:
3165 // SVE instructions.
3166 case AArch64::LDR_ZXI:
3167 case AArch64::STR_ZXI:
3168 return true;
3169 }
3170}
3171
3172bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
3173 switch (MI.getOpcode()) {
3174 default:
3175 assert((!MI.isCall() || !MI.isReturn()) &&
3176 "Unexpected instruction - was a new tail call opcode introduced?");
3177 return false;
3178 case AArch64::TCRETURNdi:
3179 case AArch64::TCRETURNri:
3180 case AArch64::TCRETURNrix16x17:
3181 case AArch64::TCRETURNrix17:
3182 case AArch64::TCRETURNrinotx16:
3183 case AArch64::TCRETURNriALL:
3184 case AArch64::AUTH_TCRETURN:
3185 case AArch64::AUTH_TCRETURN_BTI:
3186 return true;
3187 }
3188}
3189
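// Return the flag-setting (S) variant of the given opcode, e.g. ADDWri to
// ADDSWri. Unlike sForm, this asserts when no such variant exists.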
3190unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
3191 switch (Opc) {
3192 default:
3193 llvm_unreachable("Opcode has no flag setting equivalent!");
3194 // 32-bit cases:
3195 case AArch64::ADDWri:
3196 return AArch64::ADDSWri;
3197 case AArch64::ADDWrr:
3198 return AArch64::ADDSWrr;
3199 case AArch64::ADDWrs:
3200 return AArch64::ADDSWrs;
3201 case AArch64::ADDWrx:
3202 return AArch64::ADDSWrx;
3203 case AArch64::ANDWri:
3204 return AArch64::ANDSWri;
3205 case AArch64::ANDWrr:
3206 return AArch64::ANDSWrr;
3207 case AArch64::ANDWrs:
3208 return AArch64::ANDSWrs;
3209 case AArch64::BICWrr:
3210 return AArch64::BICSWrr;
3211 case AArch64::BICWrs:
3212 return AArch64::BICSWrs;
3213 case AArch64::SUBWri:
3214 return AArch64::SUBSWri;
3215 case AArch64::SUBWrr:
3216 return AArch64::SUBSWrr;
3217 case AArch64::SUBWrs:
3218 return AArch64::SUBSWrs;
3219 case AArch64::SUBWrx:
3220 return AArch64::SUBSWrx;
3221 // 64-bit cases:
3222 case AArch64::ADDXri:
3223 return AArch64::ADDSXri;
3224 case AArch64::ADDXrr:
3225 return AArch64::ADDSXrr;
3226 case AArch64::ADDXrs:
3227 return AArch64::ADDSXrs;
3228 case AArch64::ADDXrx:
3229 return AArch64::ADDSXrx;
3230 case AArch64::ANDXri:
3231 return AArch64::ANDSXri;
3232 case AArch64::ANDXrr:
3233 return AArch64::ANDSXrr;
3234 case AArch64::ANDXrs:
3235 return AArch64::ANDSXrs;
3236 case AArch64::BICXrr:
3237 return AArch64::BICSXrr;
3238 case AArch64::BICXrs:
3239 return AArch64::BICSXrs;
3240 case AArch64::SUBXri:
3241 return AArch64::SUBSXri;
3242 case AArch64::SUBXrr:
3243 return AArch64::SUBSXrr;
3244 case AArch64::SUBXrs:
3245 return AArch64::SUBSXrs;
3246 case AArch64::SUBXrx:
3247 return AArch64::SUBSXrx;
3248 // SVE instructions:
3249 case AArch64::AND_PPzPP:
3250 return AArch64::ANDS_PPzPP;
3251 case AArch64::BIC_PPzPP:
3252 return AArch64::BICS_PPzPP;
3253 case AArch64::EOR_PPzPP:
3254 return AArch64::EORS_PPzPP;
3255 case AArch64::NAND_PPzPP:
3256 return AArch64::NANDS_PPzPP;
3257 case AArch64::NOR_PPzPP:
3258 return AArch64::NORS_PPzPP;
3259 case AArch64::ORN_PPzPP:
3260 return AArch64::ORNS_PPzPP;
3261 case AArch64::ORR_PPzPP:
3262 return AArch64::ORRS_PPzPP;
3263 case AArch64::BRKA_PPzP:
3264 return AArch64::BRKAS_PPzP;
3265 case AArch64::BRKPA_PPzPP:
3266 return AArch64::BRKPAS_PPzPP;
3267 case AArch64::BRKB_PPzP:
3268 return AArch64::BRKBS_PPzP;
3269 case AArch64::BRKPB_PPzPP:
3270 return AArch64::BRKPBS_PPzPP;
3271 case AArch64::BRKN_PPzP:
3272 return AArch64::BRKNS_PPzP;
3273 case AArch64::RDFFR_PPz:
3274 return AArch64::RDFFRS_PPz;
3275 case AArch64::PTRUE_B:
3276 return AArch64::PTRUES_B;
3277 }
3278}
3279
3280// Is this a candidate for ld/st merging or pairing? For example, we don't
3281// touch volatiles or load/stores that have a hint to avoid pair formation.
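// For example, two adjacent 8-byte loads
//   ldr x0, [x2]
//   ldr x1, [x2, #8]
// may be combined into
//   ldp x0, x1, [x2]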
3282bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
3283
3284 bool IsPreLdSt = isPreLdSt(MI);
3285
3286 // If this is a volatile load/store, don't mess with it.
3287 if (MI.hasOrderedMemoryRef())
3288 return false;
3289
3290 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
3291 // For Pre-inc LD/ST, the operand is shifted by one.
3292 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
3293 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
3294 "Expected a reg or frame index operand.");
3295
  // For pre-indexed instructions, the base-writeback def shifts the operands
  // by one, so the immediate value is operand 3.
3298 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(i: 3).isImm();
3299
3300 if (!MI.getOperand(i: 2).isImm() && !IsImmPreLdSt)
3301 return false;
3302
3303 // Can't merge/pair if the instruction modifies the base register.
3304 // e.g., ldr x0, [x0]
3305 // This case will never occur with an FI base.
3306 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
3307 // STR<S,D,Q,W,X>pre, it can be merged.
3308 // For example:
3309 // ldr q0, [x11, #32]!
3310 // ldr q1, [x11, #16]
3311 // to
3312 // ldp q0, q1, [x11, #32]!
3313 if (MI.getOperand(i: 1).isReg() && !IsPreLdSt) {
3314 Register BaseReg = MI.getOperand(i: 1).getReg();
3315 const TargetRegisterInfo *TRI = &getRegisterInfo();
3316 if (MI.modifiesRegister(Reg: BaseReg, TRI))
3317 return false;
3318 }
3319
3320 // Pairing SVE fills/spills is only valid for little-endian targets that
3321 // implement VLS 128.
3322 switch (MI.getOpcode()) {
3323 default:
3324 break;
3325 case AArch64::LDR_ZXI:
3326 case AArch64::STR_ZXI:
3327 if (!Subtarget.isLittleEndian() ||
3328 Subtarget.getSVEVectorSizeInBits() != 128)
3329 return false;
3330 }
3331
3332 // Check if this load/store has a hint to avoid pair formation.
3333 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
3334 if (isLdStPairSuppressed(MI))
3335 return false;
3336
  // Do not pair any callee-save store/reload instructions in the
  // prologue/epilogue if the CFI information encoded the operations as
  // separate instructions, as that would make the size of the actual prologue
  // differ from the prologue size recorded in the Windows CFI.
3341 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
3342 bool NeedsWinCFI = MAI->usesWindowsCFI() &&
3343 MI.getMF()->getFunction().needsUnwindTableEntry();
3344 if (NeedsWinCFI && (MI.getFlag(Flag: MachineInstr::FrameSetup) ||
3345 MI.getFlag(Flag: MachineInstr::FrameDestroy)))
3346 return false;
3347
3348 // On some CPUs quad load/store pairs are slower than two single load/stores.
3349 if (Subtarget.isPaired128Slow()) {
3350 switch (MI.getOpcode()) {
3351 default:
3352 break;
3353 case AArch64::LDURQi:
3354 case AArch64::STURQi:
3355 case AArch64::LDRQui:
3356 case AArch64::STRQui:
3357 return false;
3358 }
3359 }
3360
3361 return true;
3362}
3363
3364bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
3365 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
3366 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
3367 const TargetRegisterInfo *TRI) const {
3368 if (!LdSt.mayLoadOrStore())
3369 return false;
3370
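  // AArch64 load/store instructions have a single base operand, so report at
  // most one base in the BaseOps list.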
3371 const MachineOperand *BaseOp;
3372 TypeSize WidthN(0, false);
3373 if (!getMemOperandWithOffsetWidth(MI: LdSt, BaseOp, Offset, OffsetIsScalable,
3374 Width&: WidthN, TRI))
3375 return false;
  // The maximum vscale is 16 under AArch64; return the maximal extent for
  // the vector.
3378 Width = LocationSize::precise(Value: WidthN);
3379 BaseOps.push_back(Elt: BaseOp);
3380 return true;
3381}
3382
3383std::optional<ExtAddrMode>
3384AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
3385 const TargetRegisterInfo *TRI) const {
3386 const MachineOperand *Base; // Filled with the base operand of MI.
3387 int64_t Offset; // Filled with the offset of MI.
3388 bool OffsetIsScalable;
3389 if (!getMemOperandWithOffset(MI: MemI, BaseOp&: Base, Offset, OffsetIsScalable, TRI))
3390 return std::nullopt;
3391
3392 if (!Base->isReg())
3393 return std::nullopt;
3394 ExtAddrMode AM;
3395 AM.BaseReg = Base->getReg();
3396 AM.Displacement = Offset;
3397 AM.ScaledReg = 0;
3398 AM.Scale = 0;
3399 return AM;
3400}
3401
3402bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
3403 Register Reg,
3404 const MachineInstr &AddrI,
3405 ExtAddrMode &AM) const {
3406 // Filter out instructions into which we cannot fold.
3407 unsigned NumBytes;
3408 int64_t OffsetScale = 1;
3409 switch (MemI.getOpcode()) {
3410 default:
3411 return false;
3412
3413 case AArch64::LDURQi:
3414 case AArch64::STURQi:
3415 NumBytes = 16;
3416 break;
3417
3418 case AArch64::LDURDi:
3419 case AArch64::STURDi:
3420 case AArch64::LDURXi:
3421 case AArch64::STURXi:
3422 NumBytes = 8;
3423 break;
3424
3425 case AArch64::LDURWi:
3426 case AArch64::LDURSWi:
3427 case AArch64::STURWi:
3428 NumBytes = 4;
3429 break;
3430
3431 case AArch64::LDURHi:
3432 case AArch64::STURHi:
3433 case AArch64::LDURHHi:
3434 case AArch64::STURHHi:
3435 case AArch64::LDURSHXi:
3436 case AArch64::LDURSHWi:
3437 NumBytes = 2;
3438 break;
3439
3440 case AArch64::LDRBroX:
3441 case AArch64::LDRBBroX:
3442 case AArch64::LDRSBXroX:
3443 case AArch64::LDRSBWroX:
3444 case AArch64::STRBroX:
3445 case AArch64::STRBBroX:
3446 case AArch64::LDURBi:
3447 case AArch64::LDURBBi:
3448 case AArch64::LDURSBXi:
3449 case AArch64::LDURSBWi:
3450 case AArch64::STURBi:
3451 case AArch64::STURBBi:
3452 case AArch64::LDRBui:
3453 case AArch64::LDRBBui:
3454 case AArch64::LDRSBXui:
3455 case AArch64::LDRSBWui:
3456 case AArch64::STRBui:
3457 case AArch64::STRBBui:
3458 NumBytes = 1;
3459 break;
3460
3461 case AArch64::LDRQroX:
3462 case AArch64::STRQroX:
3463 case AArch64::LDRQui:
3464 case AArch64::STRQui:
3465 NumBytes = 16;
3466 OffsetScale = 16;
3467 break;
3468
3469 case AArch64::LDRDroX:
3470 case AArch64::STRDroX:
3471 case AArch64::LDRXroX:
3472 case AArch64::STRXroX:
3473 case AArch64::LDRDui:
3474 case AArch64::STRDui:
3475 case AArch64::LDRXui:
3476 case AArch64::STRXui:
3477 NumBytes = 8;
3478 OffsetScale = 8;
3479 break;
3480
3481 case AArch64::LDRWroX:
3482 case AArch64::LDRSWroX:
3483 case AArch64::STRWroX:
3484 case AArch64::LDRWui:
3485 case AArch64::LDRSWui:
3486 case AArch64::STRWui:
3487 NumBytes = 4;
3488 OffsetScale = 4;
3489 break;
3490
3491 case AArch64::LDRHroX:
3492 case AArch64::STRHroX:
3493 case AArch64::LDRHHroX:
3494 case AArch64::STRHHroX:
3495 case AArch64::LDRSHXroX:
3496 case AArch64::LDRSHWroX:
3497 case AArch64::LDRHui:
3498 case AArch64::STRHui:
3499 case AArch64::LDRHHui:
3500 case AArch64::STRHHui:
3501 case AArch64::LDRSHXui:
3502 case AArch64::LDRSHWui:
3503 NumBytes = 2;
3504 OffsetScale = 2;
3505 break;
3506 }
3507
  // Check that the register to fold is not the loaded/stored value.
  const MachineOperand &RegOp = MemI.getOperand(i: 0);
  if (RegOp.isReg() && RegOp.getReg() == Reg)
    return false;
3512
3513 // Handle memory instructions with a [Reg, Reg] addressing mode.
3514 if (MemI.getOperand(i: 2).isReg()) {
3515 // Bail if the addressing mode already includes extension of the offset
3516 // register.
3517 if (MemI.getOperand(i: 3).getImm())
3518 return false;
3519
    // Operand 4 is the shift flag; if the offset register is not actually
    // scaled, treat the scale as 1.
3521 if (MemI.getOperand(i: 4).getImm() == 0)
3522 OffsetScale = 1;
3523
    // If the address instruction is folded into the base register, the
    // addressing mode must not have a scale; then we can swap the base and
    // the scaled registers.
3527 if (MemI.getOperand(i: 1).getReg() == Reg && OffsetScale != 1)
3528 return false;
3529
3530 switch (AddrI.getOpcode()) {
3531 default:
3532 return false;
3533
3534 case AArch64::SBFMXri:
3535 // sxtw Xa, Wm
3536 // ldr Xd, [Xn, Xa, lsl #N]
3537 // ->
3538 // ldr Xd, [Xn, Wm, sxtw #N]
3539 if (AddrI.getOperand(i: 2).getImm() != 0 ||
3540 AddrI.getOperand(i: 3).getImm() != 31)
3541 return false;
3542
3543 AM.BaseReg = MemI.getOperand(i: 1).getReg();
3544 if (AM.BaseReg == Reg)
3545 AM.BaseReg = MemI.getOperand(i: 2).getReg();
3546 AM.ScaledReg = AddrI.getOperand(i: 1).getReg();
3547 AM.Scale = OffsetScale;
3548 AM.Displacement = 0;
3549 AM.Form = ExtAddrMode::Formula::SExtScaledReg;
3550 return true;
3551
3552 case TargetOpcode::SUBREG_TO_REG: {
3553 // mov Wa, Wm
3554 // ldr Xd, [Xn, Xa, lsl #N]
3555 // ->
3556 // ldr Xd, [Xn, Wm, uxtw #N]
3557
3558 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
3559 if (AddrI.getOperand(i: 2).getImm() != AArch64::sub_32)
3560 return false;
3561
3562 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
3563 Register OffsetReg = AddrI.getOperand(i: 1).getReg();
3564 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(RegNo: OffsetReg))
3565 return false;
3566
3567 const MachineInstr &DefMI = *MRI.getVRegDef(Reg: OffsetReg);
3568 if (DefMI.getOpcode() != AArch64::ORRWrs ||
3569 DefMI.getOperand(i: 1).getReg() != AArch64::WZR ||
3570 DefMI.getOperand(i: 3).getImm() != 0)
3571 return false;
3572
3573 AM.BaseReg = MemI.getOperand(i: 1).getReg();
3574 if (AM.BaseReg == Reg)
3575 AM.BaseReg = MemI.getOperand(i: 2).getReg();
3576 AM.ScaledReg = DefMI.getOperand(i: 2).getReg();
3577 AM.Scale = OffsetScale;
3578 AM.Displacement = 0;
3579 AM.Form = ExtAddrMode::Formula::ZExtScaledReg;
3580 return true;
3581 }
3582 }
3583 }
3584
3585 // Handle memory instructions with a [Reg, #Imm] addressing mode.
3586
3587 // Check we are not breaking a potential conversion to an LDP.
3588 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
3589 int64_t NewOffset) -> bool {
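    // LDP/STP encode a signed 7-bit immediate scaled by the access size, so
    // a pairable byte offset must be a multiple of NumBytes within
    // [-64 * NumBytes, 63 * NumBytes].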
3590 int64_t MinOffset, MaxOffset;
3591 switch (NumBytes) {
3592 default:
3593 return true;
3594 case 4:
3595 MinOffset = -256;
3596 MaxOffset = 252;
3597 break;
3598 case 8:
3599 MinOffset = -512;
3600 MaxOffset = 504;
3601 break;
3602 case 16:
3603 MinOffset = -1024;
3604 MaxOffset = 1008;
3605 break;
3606 }
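    // Allow the fold if the old offset already could not have been paired,
    // or if the new offset still can be.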
3607 return OldOffset < MinOffset || OldOffset > MaxOffset ||
3608 (NewOffset >= MinOffset && NewOffset <= MaxOffset);
3609 };
3610 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
3611 int64_t OldOffset = MemI.getOperand(i: 2).getImm() * OffsetScale;
3612 int64_t NewOffset = OldOffset + Disp;
3613 if (!isLegalAddressingMode(NumBytes, Offset: NewOffset, /* Scale */ 0))
3614 return false;
3615 // If the old offset would fit into an LDP, but the new offset wouldn't,
3616 // bail out.
3617 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
3618 return false;
3619 AM.BaseReg = AddrI.getOperand(i: 1).getReg();
3620 AM.ScaledReg = 0;
3621 AM.Scale = 0;
3622 AM.Displacement = NewOffset;
3623 AM.Form = ExtAddrMode::Formula::Basic;
3624 return true;
3625 };
3626
3627 auto canFoldAddRegIntoAddrMode =
3628 [&](int64_t Scale,
3629 ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool {
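    // The [Reg, Reg] addressing mode has no displacement field, so the
    // memory instruction's immediate offset must be zero.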
3630 if (MemI.getOperand(i: 2).getImm() != 0)
3631 return false;
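    // Bail out if the scale cannot be represented as an unsigned value.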
3632 if ((unsigned)Scale != Scale)
3633 return false;
3634 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
3635 return false;
3636 AM.BaseReg = AddrI.getOperand(i: 1).getReg();
3637 AM.ScaledReg = AddrI.getOperand(i: 2).getReg();
3638 AM.Scale = Scale;
3639 AM.Displacement = 0;
3640 AM.Form = Form;
3641 return true;
3642 };
3643
3644 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
3645 unsigned Opcode = MemI.getOpcode();
3646 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
3647 Subtarget.isSTRQroSlow();
3648 };
3649
3650 int64_t Disp = 0;
3651 const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
3652 switch (AddrI.getOpcode()) {
3653 default:
3654 return false;
3655
3656 case AArch64::ADDXri:
3657 // add Xa, Xn, #N
3658 // ldr Xd, [Xa, #M]
3659 // ->
3660 // ldr Xd, [Xn, #N'+M]
3661 Disp = AddrI.getOperand(i: 2).getImm() << AddrI.getOperand(i: 3).getImm();
3662 return canFoldAddSubImmIntoAddrMode(Disp);
3663
3664 case AArch64::SUBXri:
3665 // sub Xa, Xn, #N
3666 // ldr Xd, [Xa, #M]
3667 // ->
3668 // ldr Xd, [Xn, #N'+M]
3669 Disp = AddrI.getOperand(i: 2).getImm() << AddrI.getOperand(i: 3).getImm();
3670 return canFoldAddSubImmIntoAddrMode(-Disp);
3671
3672 case AArch64::ADDXrs: {
3673 // add Xa, Xn, Xm, lsl #N
3674 // ldr Xd, [Xa]
3675 // ->
3676 // ldr Xd, [Xn, Xm, lsl #N]
3677
3678 // Don't fold the add if the result would be slower, unless optimising for
3679 // size.
3680 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(i: 3).getImm());
3681 if (AArch64_AM::getShiftType(Imm: Shift) != AArch64_AM::ShiftExtendType::LSL)
3682 return false;
3683 Shift = AArch64_AM::getShiftValue(Imm: Shift);
3684 if (!OptSize) {
3685 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3686 return false;
3687 if (avoidSlowSTRQ(MemI))
3688 return false;
3689 }
3690 return canFoldAddRegIntoAddrMode(1ULL << Shift);
3691 }
3692
3693 case AArch64::ADDXrr:
3694 // add Xa, Xn, Xm
3695 // ldr Xd, [Xa]
3696 // ->
3697 // ldr Xd, [Xn, Xm, lsl #0]
3698
3699 // Don't fold the add if the result would be slower, unless optimising for
3700 // size.
3701 if (!OptSize && avoidSlowSTRQ(MemI))
3702 return false;
3703 return canFoldAddRegIntoAddrMode(1);
3704
3705 case AArch64::ADDXrx:
3706 // add Xa, Xn, Wm, {s,u}xtw #N
3707 // ldr Xd, [Xa]
3708 // ->
3709 // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3710
3711 // Don't fold the add if the result would be slower, unless optimising for
3712 // size.
3713 if (!OptSize && avoidSlowSTRQ(MemI))
3714 return false;
3715
3716 // Can fold only sign-/zero-extend of a word.
3717 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(i: 3).getImm());
3718 AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);
3719 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3720 return false;
3721
3722 return canFoldAddRegIntoAddrMode(
3723 1ULL << AArch64_AM::getArithShiftValue(Imm),
3724 (Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg
3725 : ExtAddrMode::Formula::ZExtScaledReg);
3726 }
3727}
3728
3729// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3730// return the opcode of an instruction performing the same operation, but using
3731// the [Reg, Reg] addressing mode.
3732static unsigned regOffsetOpcode(unsigned Opcode) {
3733 switch (Opcode) {
3734 default:
3735 llvm_unreachable("Address folding not implemented for instruction");
3736
3737 case AArch64::LDURQi:
3738 case AArch64::LDRQui:
3739 return AArch64::LDRQroX;
3740 case AArch64::STURQi:
3741 case AArch64::STRQui:
3742 return AArch64::STRQroX;
3743 case AArch64::LDURDi:
3744 case AArch64::LDRDui:
3745 return AArch64::LDRDroX;
3746 case AArch64::STURDi:
3747 case AArch64::STRDui:
3748 return AArch64::STRDroX;
3749 case AArch64::LDURXi:
3750 case AArch64::LDRXui:
3751 return AArch64::LDRXroX;
3752 case AArch64::STURXi:
3753 case AArch64::STRXui:
3754 return AArch64::STRXroX;
3755 case AArch64::LDURWi:
3756 case AArch64::LDRWui:
3757 return AArch64::LDRWroX;
3758 case AArch64::LDURSWi:
3759 case AArch64::LDRSWui:
3760 return AArch64::LDRSWroX;
3761 case AArch64::STURWi:
3762 case AArch64::STRWui:
3763 return AArch64::STRWroX;
3764 case AArch64::LDURHi:
3765 case AArch64::LDRHui:
3766 return AArch64::LDRHroX;
3767 case AArch64::STURHi:
3768 case AArch64::STRHui:
3769 return AArch64::STRHroX;
3770 case AArch64::LDURHHi:
3771 case AArch64::LDRHHui:
3772 return AArch64::LDRHHroX;
3773 case AArch64::STURHHi:
3774 case AArch64::STRHHui:
3775 return AArch64::STRHHroX;
3776 case AArch64::LDURSHXi:
3777 case AArch64::LDRSHXui:
3778 return AArch64::LDRSHXroX;
3779 case AArch64::LDURSHWi:
3780 case AArch64::LDRSHWui:
3781 return AArch64::LDRSHWroX;
3782 case AArch64::LDURBi:
3783 case AArch64::LDRBui:
3784 return AArch64::LDRBroX;
3785 case AArch64::LDURBBi:
3786 case AArch64::LDRBBui:
3787 return AArch64::LDRBBroX;
3788 case AArch64::LDURSBXi:
3789 case AArch64::LDRSBXui:
3790 return AArch64::LDRSBXroX;
3791 case AArch64::LDURSBWi:
3792 case AArch64::LDRSBWui:
3793 return AArch64::LDRSBWroX;
3794 case AArch64::STURBi:
3795 case AArch64::STRBui:
3796 return AArch64::STRBroX;
3797 case AArch64::STURBBi:
3798 case AArch64::STRBBui:
3799 return AArch64::STRBBroX;
3800 }
3801}
3802
3803// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3804// the opcode of an instruction performing the same operation, but using the
3805// [Reg, #Imm] addressing mode with scaled offset.
static unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3807 switch (Opcode) {
3808 default:
3809 llvm_unreachable("Address folding not implemented for instruction");
3810
3811 case AArch64::LDURQi:
3812 Scale = 16;
3813 return AArch64::LDRQui;
3814 case AArch64::STURQi:
3815 Scale = 16;
3816 return AArch64::STRQui;
3817 case AArch64::LDURDi:
3818 Scale = 8;
3819 return AArch64::LDRDui;
3820 case AArch64::STURDi:
3821 Scale = 8;
3822 return AArch64::STRDui;
3823 case AArch64::LDURXi:
3824 Scale = 8;
3825 return AArch64::LDRXui;
3826 case AArch64::STURXi:
3827 Scale = 8;
3828 return AArch64::STRXui;
3829 case AArch64::LDURWi:
3830 Scale = 4;
3831 return AArch64::LDRWui;
3832 case AArch64::LDURSWi:
3833 Scale = 4;
3834 return AArch64::LDRSWui;
3835 case AArch64::STURWi:
3836 Scale = 4;
3837 return AArch64::STRWui;
3838 case AArch64::LDURHi:
3839 Scale = 2;
3840 return AArch64::LDRHui;
3841 case AArch64::STURHi:
3842 Scale = 2;
3843 return AArch64::STRHui;
3844 case AArch64::LDURHHi:
3845 Scale = 2;
3846 return AArch64::LDRHHui;
3847 case AArch64::STURHHi:
3848 Scale = 2;
3849 return AArch64::STRHHui;
3850 case AArch64::LDURSHXi:
3851 Scale = 2;
3852 return AArch64::LDRSHXui;
3853 case AArch64::LDURSHWi:
3854 Scale = 2;
3855 return AArch64::LDRSHWui;
3856 case AArch64::LDURBi:
3857 Scale = 1;
3858 return AArch64::LDRBui;
3859 case AArch64::LDURBBi:
3860 Scale = 1;
3861 return AArch64::LDRBBui;
3862 case AArch64::LDURSBXi:
3863 Scale = 1;
3864 return AArch64::LDRSBXui;
3865 case AArch64::LDURSBWi:
3866 Scale = 1;
3867 return AArch64::LDRSBWui;
3868 case AArch64::STURBi:
3869 Scale = 1;
3870 return AArch64::STRBui;
3871 case AArch64::STURBBi:
3872 Scale = 1;
3873 return AArch64::STRBBui;
3874 case AArch64::LDRQui:
3875 case AArch64::STRQui:
3876 Scale = 16;
3877 return Opcode;
3878 case AArch64::LDRDui:
3879 case AArch64::STRDui:
3880 case AArch64::LDRXui:
3881 case AArch64::STRXui:
3882 Scale = 8;
3883 return Opcode;
3884 case AArch64::LDRWui:
3885 case AArch64::LDRSWui:
3886 case AArch64::STRWui:
3887 Scale = 4;
3888 return Opcode;
3889 case AArch64::LDRHui:
3890 case AArch64::STRHui:
3891 case AArch64::LDRHHui:
3892 case AArch64::STRHHui:
3893 case AArch64::LDRSHXui:
3894 case AArch64::LDRSHWui:
3895 Scale = 2;
3896 return Opcode;
3897 case AArch64::LDRBui:
3898 case AArch64::LDRBBui:
3899 case AArch64::LDRSBXui:
3900 case AArch64::LDRSBWui:
3901 case AArch64::STRBui:
3902 case AArch64::STRBBui:
3903 Scale = 1;
3904 return Opcode;
3905 }
3906}
3907
3908// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3909// the opcode of an instruction performing the same operation, but using the
3910// [Reg, #Imm] addressing mode with unscaled offset.
static unsigned unscaledOffsetOpcode(unsigned Opcode) {
3912 switch (Opcode) {
3913 default:
3914 llvm_unreachable("Address folding not implemented for instruction");
3915
3916 case AArch64::LDURQi:
3917 case AArch64::STURQi:
3918 case AArch64::LDURDi:
3919 case AArch64::STURDi:
3920 case AArch64::LDURXi:
3921 case AArch64::STURXi:
3922 case AArch64::LDURWi:
3923 case AArch64::LDURSWi:
3924 case AArch64::STURWi:
3925 case AArch64::LDURHi:
3926 case AArch64::STURHi:
3927 case AArch64::LDURHHi:
3928 case AArch64::STURHHi:
3929 case AArch64::LDURSHXi:
3930 case AArch64::LDURSHWi:
3931 case AArch64::LDURBi:
3932 case AArch64::STURBi:
3933 case AArch64::LDURBBi:
3934 case AArch64::STURBBi:
3935 case AArch64::LDURSBWi:
3936 case AArch64::LDURSBXi:
3937 return Opcode;
3938 case AArch64::LDRQui:
3939 return AArch64::LDURQi;
3940 case AArch64::STRQui:
3941 return AArch64::STURQi;
3942 case AArch64::LDRDui:
3943 return AArch64::LDURDi;
3944 case AArch64::STRDui:
3945 return AArch64::STURDi;
3946 case AArch64::LDRXui:
3947 return AArch64::LDURXi;
3948 case AArch64::STRXui:
3949 return AArch64::STURXi;
3950 case AArch64::LDRWui:
3951 return AArch64::LDURWi;
3952 case AArch64::LDRSWui:
3953 return AArch64::LDURSWi;
3954 case AArch64::STRWui:
3955 return AArch64::STURWi;
3956 case AArch64::LDRHui:
3957 return AArch64::LDURHi;
3958 case AArch64::STRHui:
3959 return AArch64::STURHi;
3960 case AArch64::LDRHHui:
3961 return AArch64::LDURHHi;
3962 case AArch64::STRHHui:
3963 return AArch64::STURHHi;
3964 case AArch64::LDRSHXui:
3965 return AArch64::LDURSHXi;
3966 case AArch64::LDRSHWui:
3967 return AArch64::LDURSHWi;
3968 case AArch64::LDRBBui:
3969 return AArch64::LDURBBi;
3970 case AArch64::LDRBui:
3971 return AArch64::LDURBi;
3972 case AArch64::STRBBui:
3973 return AArch64::STURBBi;
3974 case AArch64::STRBui:
3975 return AArch64::STURBi;
3976 case AArch64::LDRSBWui:
3977 return AArch64::LDURSBWi;
3978 case AArch64::LDRSBXui:
3979 return AArch64::LDURSBXi;
3980 }
3981}
3982
3983// Given the opcode of a memory load/store instruction, return the opcode of an
3984// instruction performing the same operation, but using
3985// the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
3986// offset register.
3987static unsigned offsetExtendOpcode(unsigned Opcode) {
3988 switch (Opcode) {
3989 default:
3990 llvm_unreachable("Address folding not implemented for instruction");
3991
3992 case AArch64::LDRQroX:
3993 case AArch64::LDURQi:
3994 case AArch64::LDRQui:
3995 return AArch64::LDRQroW;
3996 case AArch64::STRQroX:
3997 case AArch64::STURQi:
3998 case AArch64::STRQui:
3999 return AArch64::STRQroW;
4000 case AArch64::LDRDroX:
4001 case AArch64::LDURDi:
4002 case AArch64::LDRDui:
4003 return AArch64::LDRDroW;
4004 case AArch64::STRDroX:
4005 case AArch64::STURDi:
4006 case AArch64::STRDui:
4007 return AArch64::STRDroW;
4008 case AArch64::LDRXroX:
4009 case AArch64::LDURXi:
4010 case AArch64::LDRXui:
4011 return AArch64::LDRXroW;
4012 case AArch64::STRXroX:
4013 case AArch64::STURXi:
4014 case AArch64::STRXui:
4015 return AArch64::STRXroW;
4016 case AArch64::LDRWroX:
4017 case AArch64::LDURWi:
4018 case AArch64::LDRWui:
4019 return AArch64::LDRWroW;
4020 case AArch64::LDRSWroX:
4021 case AArch64::LDURSWi:
4022 case AArch64::LDRSWui:
4023 return AArch64::LDRSWroW;
4024 case AArch64::STRWroX:
4025 case AArch64::STURWi:
4026 case AArch64::STRWui:
4027 return AArch64::STRWroW;
4028 case AArch64::LDRHroX:
4029 case AArch64::LDURHi:
4030 case AArch64::LDRHui:
4031 return AArch64::LDRHroW;
4032 case AArch64::STRHroX:
4033 case AArch64::STURHi:
4034 case AArch64::STRHui:
4035 return AArch64::STRHroW;
4036 case AArch64::LDRHHroX:
4037 case AArch64::LDURHHi:
4038 case AArch64::LDRHHui:
4039 return AArch64::LDRHHroW;
4040 case AArch64::STRHHroX:
4041 case AArch64::STURHHi:
4042 case AArch64::STRHHui:
4043 return AArch64::STRHHroW;
4044 case AArch64::LDRSHXroX:
4045 case AArch64::LDURSHXi:
4046 case AArch64::LDRSHXui:
4047 return AArch64::LDRSHXroW;
4048 case AArch64::LDRSHWroX:
4049 case AArch64::LDURSHWi:
4050 case AArch64::LDRSHWui:
4051 return AArch64::LDRSHWroW;
4052 case AArch64::LDRBroX:
4053 case AArch64::LDURBi:
4054 case AArch64::LDRBui:
4055 return AArch64::LDRBroW;
4056 case AArch64::LDRBBroX:
4057 case AArch64::LDURBBi:
4058 case AArch64::LDRBBui:
4059 return AArch64::LDRBBroW;
4060 case AArch64::LDRSBXroX:
4061 case AArch64::LDURSBXi:
4062 case AArch64::LDRSBXui:
4063 return AArch64::LDRSBXroW;
4064 case AArch64::LDRSBWroX:
4065 case AArch64::LDURSBWi:
4066 case AArch64::LDRSBWui:
4067 return AArch64::LDRSBWroW;
4068 case AArch64::STRBroX:
4069 case AArch64::STURBi:
4070 case AArch64::STRBui:
4071 return AArch64::STRBroW;
4072 case AArch64::STRBBroX:
4073 case AArch64::STURBBi:
4074 case AArch64::STRBBui:
4075 return AArch64::STRBBroW;
4076 }
4077}
4078
4079MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
4080 const ExtAddrMode &AM) const {
4081
4082 const DebugLoc &DL = MemI.getDebugLoc();
4083 MachineBasicBlock &MBB = *MemI.getParent();
4084 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
4085
4086 if (AM.Form == ExtAddrMode::Formula::Basic) {
4087 if (AM.ScaledReg) {
4088 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
4089 unsigned Opcode = regOffsetOpcode(Opcode: MemI.getOpcode());
4090 MRI.constrainRegClass(Reg: AM.BaseReg, RC: &AArch64::GPR64spRegClass);
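    // The two trailing immediates select the extend type (0, i.e. plain LSL)
    // and whether the offset register is shifted by log2 of the access size.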
4091 auto B = BuildMI(BB&: MBB, I&: MemI, MIMD: DL, MCID: get(Opcode))
4092 .addReg(RegNo: MemI.getOperand(i: 0).getReg(),
4093 Flags: getDefRegState(B: MemI.mayLoad()))
4094 .addReg(RegNo: AM.BaseReg)
4095 .addReg(RegNo: AM.ScaledReg)
4096 .addImm(Val: 0)
4097 .addImm(Val: AM.Scale > 1)
4098 .setMemRefs(MemI.memoperands())
4099 .setMIFlags(MemI.getFlags());
4100 return B.getInstr();
4101 }
4102
4103 assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
4104 "Addressing mode not supported for folding");
4105
4106 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
4107 unsigned Scale = 1;
4108 unsigned Opcode = MemI.getOpcode();
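    // A displacement that fits in a signed 9-bit field can use the unscaled
    // form directly; otherwise use the scaled form, dividing the displacement
    // by the access size.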
4109 if (isInt<9>(x: AM.Displacement))
4110 Opcode = unscaledOffsetOpcode(Opcode);
4111 else
4112 Opcode = scaledOffsetOpcode(Opcode, Scale);
4113
4114 auto B =
4115 BuildMI(BB&: MBB, I&: MemI, MIMD: DL, MCID: get(Opcode))
4116 .addReg(RegNo: MemI.getOperand(i: 0).getReg(), Flags: getDefRegState(B: MemI.mayLoad()))
4117 .addReg(RegNo: AM.BaseReg)
4118 .addImm(Val: AM.Displacement / Scale)
4119 .setMemRefs(MemI.memoperands())
4120 .setMIFlags(MemI.getFlags());
4121 return B.getInstr();
4122 }
4123
4124 if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
4125 AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
4126 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
4127 assert(AM.ScaledReg && !AM.Displacement &&
4128 "Address offset can be a register or an immediate, but not both");
4129 unsigned Opcode = offsetExtendOpcode(Opcode: MemI.getOpcode());
4130 MRI.constrainRegClass(Reg: AM.BaseReg, RC: &AArch64::GPR64spRegClass);
4131 // Make sure the offset register is in the correct register class.
4132 Register OffsetReg = AM.ScaledReg;
4133 const TargetRegisterClass *RC = MRI.getRegClass(Reg: OffsetReg);
4134 if (RC->hasSuperClassEq(RC: &AArch64::GPR64RegClass)) {
4135 OffsetReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
4136 BuildMI(BB&: MBB, I&: MemI, MIMD: DL, MCID: get(Opcode: TargetOpcode::COPY), DestReg: OffsetReg)
4137 .addReg(RegNo: AM.ScaledReg, Flags: {}, SubReg: AArch64::sub_32);
4138 }
4139 auto B =
4140 BuildMI(BB&: MBB, I&: MemI, MIMD: DL, MCID: get(Opcode))
4141 .addReg(RegNo: MemI.getOperand(i: 0).getReg(), Flags: getDefRegState(B: MemI.mayLoad()))
4142 .addReg(RegNo: AM.BaseReg)
4143 .addReg(RegNo: OffsetReg)
4144 .addImm(Val: AM.Form == ExtAddrMode::Formula::SExtScaledReg)
4145 .addImm(Val: AM.Scale != 1)
4146 .setMemRefs(MemI.memoperands())
4147 .setMIFlags(MemI.getFlags());
4148
4149 return B.getInstr();
4150 }
4151
4152 llvm_unreachable(
4153 "Function must not be called with an addressing mode it can't handle");
4154}
4155
/// Return true if the opcode is a post-index ld/st instruction, which really
/// accesses memory at base+0; the offset is applied to the base afterwards.
4158static bool isPostIndexLdStOpcode(unsigned Opcode) {
4159 switch (Opcode) {
4160 default:
4161 return false;
4162 case AArch64::LD1Fourv16b_POST:
4163 case AArch64::LD1Fourv1d_POST:
4164 case AArch64::LD1Fourv2d_POST:
4165 case AArch64::LD1Fourv2s_POST:
4166 case AArch64::LD1Fourv4h_POST:
4167 case AArch64::LD1Fourv4s_POST:
4168 case AArch64::LD1Fourv8b_POST:
4169 case AArch64::LD1Fourv8h_POST:
4170 case AArch64::LD1Onev16b_POST:
4171 case AArch64::LD1Onev1d_POST:
4172 case AArch64::LD1Onev2d_POST:
4173 case AArch64::LD1Onev2s_POST:
4174 case AArch64::LD1Onev4h_POST:
4175 case AArch64::LD1Onev4s_POST:
4176 case AArch64::LD1Onev8b_POST:
4177 case AArch64::LD1Onev8h_POST:
4178 case AArch64::LD1Rv16b_POST:
4179 case AArch64::LD1Rv1d_POST:
4180 case AArch64::LD1Rv2d_POST:
4181 case AArch64::LD1Rv2s_POST:
4182 case AArch64::LD1Rv4h_POST:
4183 case AArch64::LD1Rv4s_POST:
4184 case AArch64::LD1Rv8b_POST:
4185 case AArch64::LD1Rv8h_POST:
4186 case AArch64::LD1Threev16b_POST:
4187 case AArch64::LD1Threev1d_POST:
4188 case AArch64::LD1Threev2d_POST:
4189 case AArch64::LD1Threev2s_POST:
4190 case AArch64::LD1Threev4h_POST:
4191 case AArch64::LD1Threev4s_POST:
4192 case AArch64::LD1Threev8b_POST:
4193 case AArch64::LD1Threev8h_POST:
4194 case AArch64::LD1Twov16b_POST:
4195 case AArch64::LD1Twov1d_POST:
4196 case AArch64::LD1Twov2d_POST:
4197 case AArch64::LD1Twov2s_POST:
4198 case AArch64::LD1Twov4h_POST:
4199 case AArch64::LD1Twov4s_POST:
4200 case AArch64::LD1Twov8b_POST:
4201 case AArch64::LD1Twov8h_POST:
4202 case AArch64::LD1i16_POST:
4203 case AArch64::LD1i32_POST:
4204 case AArch64::LD1i64_POST:
4205 case AArch64::LD1i8_POST:
4206 case AArch64::LD2Rv16b_POST:
4207 case AArch64::LD2Rv1d_POST:
4208 case AArch64::LD2Rv2d_POST:
4209 case AArch64::LD2Rv2s_POST:
4210 case AArch64::LD2Rv4h_POST:
4211 case AArch64::LD2Rv4s_POST:
4212 case AArch64::LD2Rv8b_POST:
4213 case AArch64::LD2Rv8h_POST:
4214 case AArch64::LD2Twov16b_POST:
4215 case AArch64::LD2Twov2d_POST:
4216 case AArch64::LD2Twov2s_POST:
4217 case AArch64::LD2Twov4h_POST:
4218 case AArch64::LD2Twov4s_POST:
4219 case AArch64::LD2Twov8b_POST:
4220 case AArch64::LD2Twov8h_POST:
4221 case AArch64::LD2i16_POST:
4222 case AArch64::LD2i32_POST:
4223 case AArch64::LD2i64_POST:
4224 case AArch64::LD2i8_POST:
4225 case AArch64::LD3Rv16b_POST:
4226 case AArch64::LD3Rv1d_POST:
4227 case AArch64::LD3Rv2d_POST:
4228 case AArch64::LD3Rv2s_POST:
4229 case AArch64::LD3Rv4h_POST:
4230 case AArch64::LD3Rv4s_POST:
4231 case AArch64::LD3Rv8b_POST:
4232 case AArch64::LD3Rv8h_POST:
4233 case AArch64::LD3Threev16b_POST:
4234 case AArch64::LD3Threev2d_POST:
4235 case AArch64::LD3Threev2s_POST:
4236 case AArch64::LD3Threev4h_POST:
4237 case AArch64::LD3Threev4s_POST:
4238 case AArch64::LD3Threev8b_POST:
4239 case AArch64::LD3Threev8h_POST:
4240 case AArch64::LD3i16_POST:
4241 case AArch64::LD3i32_POST:
4242 case AArch64::LD3i64_POST:
4243 case AArch64::LD3i8_POST:
4244 case AArch64::LD4Fourv16b_POST:
4245 case AArch64::LD4Fourv2d_POST:
4246 case AArch64::LD4Fourv2s_POST:
4247 case AArch64::LD4Fourv4h_POST:
4248 case AArch64::LD4Fourv4s_POST:
4249 case AArch64::LD4Fourv8b_POST:
4250 case AArch64::LD4Fourv8h_POST:
4251 case AArch64::LD4Rv16b_POST:
4252 case AArch64::LD4Rv1d_POST:
4253 case AArch64::LD4Rv2d_POST:
4254 case AArch64::LD4Rv2s_POST:
4255 case AArch64::LD4Rv4h_POST:
4256 case AArch64::LD4Rv4s_POST:
4257 case AArch64::LD4Rv8b_POST:
4258 case AArch64::LD4Rv8h_POST:
4259 case AArch64::LD4i16_POST:
4260 case AArch64::LD4i32_POST:
4261 case AArch64::LD4i64_POST:
4262 case AArch64::LD4i8_POST:
4263 case AArch64::LDAPRWpost:
4264 case AArch64::LDAPRXpost:
4265 case AArch64::LDIAPPWpost:
4266 case AArch64::LDIAPPXpost:
4267 case AArch64::LDPDpost:
4268 case AArch64::LDPQpost:
4269 case AArch64::LDPSWpost:
4270 case AArch64::LDPSpost:
4271 case AArch64::LDPWpost:
4272 case AArch64::LDPXpost:
4273 case AArch64::LDRBBpost:
4274 case AArch64::LDRBpost:
4275 case AArch64::LDRDpost:
4276 case AArch64::LDRHHpost:
4277 case AArch64::LDRHpost:
4278 case AArch64::LDRQpost:
4279 case AArch64::LDRSBWpost:
4280 case AArch64::LDRSBXpost:
4281 case AArch64::LDRSHWpost:
4282 case AArch64::LDRSHXpost:
4283 case AArch64::LDRSWpost:
4284 case AArch64::LDRSpost:
4285 case AArch64::LDRWpost:
4286 case AArch64::LDRXpost:
4287 case AArch64::ST1Fourv16b_POST:
4288 case AArch64::ST1Fourv1d_POST:
4289 case AArch64::ST1Fourv2d_POST:
4290 case AArch64::ST1Fourv2s_POST:
4291 case AArch64::ST1Fourv4h_POST:
4292 case AArch64::ST1Fourv4s_POST:
4293 case AArch64::ST1Fourv8b_POST:
4294 case AArch64::ST1Fourv8h_POST:
4295 case AArch64::ST1Onev16b_POST:
4296 case AArch64::ST1Onev1d_POST:
4297 case AArch64::ST1Onev2d_POST:
4298 case AArch64::ST1Onev2s_POST:
4299 case AArch64::ST1Onev4h_POST:
4300 case AArch64::ST1Onev4s_POST:
4301 case AArch64::ST1Onev8b_POST:
4302 case AArch64::ST1Onev8h_POST:
4303 case AArch64::ST1Threev16b_POST:
4304 case AArch64::ST1Threev1d_POST:
4305 case AArch64::ST1Threev2d_POST:
4306 case AArch64::ST1Threev2s_POST:
4307 case AArch64::ST1Threev4h_POST:
4308 case AArch64::ST1Threev4s_POST:
4309 case AArch64::ST1Threev8b_POST:
4310 case AArch64::ST1Threev8h_POST:
4311 case AArch64::ST1Twov16b_POST:
4312 case AArch64::ST1Twov1d_POST:
4313 case AArch64::ST1Twov2d_POST:
4314 case AArch64::ST1Twov2s_POST:
4315 case AArch64::ST1Twov4h_POST:
4316 case AArch64::ST1Twov4s_POST:
4317 case AArch64::ST1Twov8b_POST:
4318 case AArch64::ST1Twov8h_POST:
4319 case AArch64::ST1i16_POST:
4320 case AArch64::ST1i32_POST:
4321 case AArch64::ST1i64_POST:
4322 case AArch64::ST1i8_POST:
4323 case AArch64::ST2GPostIndex:
4324 case AArch64::ST2Twov16b_POST:
4325 case AArch64::ST2Twov2d_POST:
4326 case AArch64::ST2Twov2s_POST:
4327 case AArch64::ST2Twov4h_POST:
4328 case AArch64::ST2Twov4s_POST:
4329 case AArch64::ST2Twov8b_POST:
4330 case AArch64::ST2Twov8h_POST:
4331 case AArch64::ST2i16_POST:
4332 case AArch64::ST2i32_POST:
4333 case AArch64::ST2i64_POST:
4334 case AArch64::ST2i8_POST:
4335 case AArch64::ST3Threev16b_POST:
4336 case AArch64::ST3Threev2d_POST:
4337 case AArch64::ST3Threev2s_POST:
4338 case AArch64::ST3Threev4h_POST:
4339 case AArch64::ST3Threev4s_POST:
4340 case AArch64::ST3Threev8b_POST:
4341 case AArch64::ST3Threev8h_POST:
4342 case AArch64::ST3i16_POST:
4343 case AArch64::ST3i32_POST:
4344 case AArch64::ST3i64_POST:
4345 case AArch64::ST3i8_POST:
4346 case AArch64::ST4Fourv16b_POST:
4347 case AArch64::ST4Fourv2d_POST:
4348 case AArch64::ST4Fourv2s_POST:
4349 case AArch64::ST4Fourv4h_POST:
4350 case AArch64::ST4Fourv4s_POST:
4351 case AArch64::ST4Fourv8b_POST:
4352 case AArch64::ST4Fourv8h_POST:
4353 case AArch64::ST4i16_POST:
4354 case AArch64::ST4i32_POST:
4355 case AArch64::ST4i64_POST:
4356 case AArch64::ST4i8_POST:
4357 case AArch64::STGPostIndex:
4358 case AArch64::STGPpost:
4359 case AArch64::STPDpost:
4360 case AArch64::STPQpost:
4361 case AArch64::STPSpost:
4362 case AArch64::STPWpost:
4363 case AArch64::STPXpost:
4364 case AArch64::STRBBpost:
4365 case AArch64::STRBpost:
4366 case AArch64::STRDpost:
4367 case AArch64::STRHHpost:
4368 case AArch64::STRHpost:
4369 case AArch64::STRQpost:
4370 case AArch64::STRSpost:
4371 case AArch64::STRWpost:
4372 case AArch64::STRXpost:
4373 case AArch64::STZ2GPostIndex:
4374 case AArch64::STZGPostIndex:
4375 return true;
4376 }
4377}
4378
4379bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
4380 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
4381 bool &OffsetIsScalable, TypeSize &Width,
4382 const TargetRegisterInfo *TRI) const {
4383 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4384 // Handle only loads/stores with base register followed by immediate offset.
4385 if (LdSt.getNumExplicitOperands() == 3) {
4386 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
4387 if ((!LdSt.getOperand(i: 1).isReg() && !LdSt.getOperand(i: 1).isFI()) ||
4388 !LdSt.getOperand(i: 2).isImm())
4389 return false;
4390 } else if (LdSt.getNumExplicitOperands() == 4) {
4391 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
4392 if (!LdSt.getOperand(i: 1).isReg() ||
4393 (!LdSt.getOperand(i: 2).isReg() && !LdSt.getOperand(i: 2).isFI()) ||
4394 !LdSt.getOperand(i: 3).isImm())
4395 return false;
4396 } else
4397 return false;
4398
  // Get the instruction's scaling factor and access width.
4401 TypeSize Scale(0U, false);
4402 int64_t Dummy1, Dummy2;
4403
4404 // If this returns false, then it's an instruction we don't want to handle.
4405 if (!getMemOpInfo(Opcode: LdSt.getOpcode(), Scale, Width, MinOffset&: Dummy1, MaxOffset&: Dummy2))
4406 return false;
4407
  // Compute the offset: the immediate operand multiplied by the scaling
  // factor. Unscaled instructions have a scaling factor of 1. Post-indexed
  // instructions are a special case with an offset of 0.
4411 if (isPostIndexLdStOpcode(Opcode: LdSt.getOpcode())) {
4412 BaseOp = &LdSt.getOperand(i: 2);
4413 Offset = 0;
4414 } else if (LdSt.getNumExplicitOperands() == 3) {
4415 BaseOp = &LdSt.getOperand(i: 1);
4416 Offset = LdSt.getOperand(i: 2).getImm() * Scale.getKnownMinValue();
4417 } else {
4418 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
4419 BaseOp = &LdSt.getOperand(i: 2);
4420 Offset = LdSt.getOperand(i: 3).getImm() * Scale.getKnownMinValue();
4421 }
4422 OffsetIsScalable = Scale.isScalable();
4423
4424 return BaseOp->isReg() || BaseOp->isFI();
4425}
4426
4427MachineOperand &
4428AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
4429 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4430 MachineOperand &OfsOp = LdSt.getOperand(i: LdSt.getNumExplicitOperands() - 1);
4431 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
4432 return OfsOp;
4433}
4434
4435bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
4436 TypeSize &Width, int64_t &MinOffset,
4437 int64_t &MaxOffset) {
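  // For each opcode, Scale is the unit of the immediate operand, Width is
  // the extent of the access (where meaningful), and [MinOffset, MaxOffset]
  // is the legal range of the immediate; the byte offset is Imm * Scale.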
4438 switch (Opcode) {
4439 // Not a memory operation or something we want to handle.
4440 default:
4441 Scale = TypeSize::getFixed(ExactSize: 0);
4442 Width = TypeSize::getFixed(ExactSize: 0);
4443 MinOffset = MaxOffset = 0;
4444 return false;
  // LDR / STR (unsigned scaled 12-bit immediate)
4446 case AArch64::LDRQui:
4447 case AArch64::STRQui:
4448 Scale = TypeSize::getFixed(ExactSize: 16);
4449 Width = TypeSize::getFixed(ExactSize: 16);
4450 MinOffset = 0;
4451 MaxOffset = 4095;
4452 break;
4453 case AArch64::LDRXui:
4454 case AArch64::LDRDui:
4455 case AArch64::STRXui:
4456 case AArch64::STRDui:
4457 case AArch64::PRFMui:
4458 Scale = TypeSize::getFixed(ExactSize: 8);
4459 Width = TypeSize::getFixed(ExactSize: 8);
4460 MinOffset = 0;
4461 MaxOffset = 4095;
4462 break;
4463 case AArch64::LDRWui:
4464 case AArch64::LDRSui:
4465 case AArch64::LDRSWui:
4466 case AArch64::STRWui:
4467 case AArch64::STRSui:
4468 Scale = TypeSize::getFixed(ExactSize: 4);
4469 Width = TypeSize::getFixed(ExactSize: 4);
4470 MinOffset = 0;
4471 MaxOffset = 4095;
4472 break;
4473 case AArch64::LDRHui:
4474 case AArch64::LDRHHui:
4475 case AArch64::LDRSHWui:
4476 case AArch64::LDRSHXui:
4477 case AArch64::STRHui:
4478 case AArch64::STRHHui:
4479 Scale = TypeSize::getFixed(ExactSize: 2);
4480 Width = TypeSize::getFixed(ExactSize: 2);
4481 MinOffset = 0;
4482 MaxOffset = 4095;
4483 break;
4484 case AArch64::LDRBui:
4485 case AArch64::LDRBBui:
4486 case AArch64::LDRSBWui:
4487 case AArch64::LDRSBXui:
4488 case AArch64::STRBui:
4489 case AArch64::STRBBui:
4490 Scale = TypeSize::getFixed(ExactSize: 1);
4491 Width = TypeSize::getFixed(ExactSize: 1);
4492 MinOffset = 0;
4493 MaxOffset = 4095;
4494 break;
  // Post/pre-indexed (signed 9-bit immediate, unscaled)
4496 case AArch64::STRQpre:
4497 case AArch64::LDRQpost:
4498 Scale = TypeSize::getFixed(ExactSize: 1);
4499 Width = TypeSize::getFixed(ExactSize: 16);
4500 MinOffset = -256;
4501 MaxOffset = 255;
4502 break;
4503 case AArch64::LDRDpost:
4504 case AArch64::LDRDpre:
4505 case AArch64::LDRXpost:
4506 case AArch64::LDRXpre:
4507 case AArch64::STRDpost:
4508 case AArch64::STRDpre:
4509 case AArch64::STRXpost:
4510 case AArch64::STRXpre:
4511 Scale = TypeSize::getFixed(ExactSize: 1);
4512 Width = TypeSize::getFixed(ExactSize: 8);
4513 MinOffset = -256;
4514 MaxOffset = 255;
4515 break;
4516 case AArch64::STRWpost:
4517 case AArch64::STRWpre:
4518 case AArch64::LDRWpost:
4519 case AArch64::LDRWpre:
4520 case AArch64::STRSpost:
4521 case AArch64::STRSpre:
4522 case AArch64::LDRSpost:
4523 case AArch64::LDRSpre:
4524 Scale = TypeSize::getFixed(ExactSize: 1);
4525 Width = TypeSize::getFixed(ExactSize: 4);
4526 MinOffset = -256;
4527 MaxOffset = 255;
4528 break;
4529 case AArch64::LDRHpost:
4530 case AArch64::LDRHpre:
4531 case AArch64::STRHpost:
4532 case AArch64::STRHpre:
4533 case AArch64::LDRHHpost:
4534 case AArch64::LDRHHpre:
4535 case AArch64::STRHHpost:
4536 case AArch64::STRHHpre:
4537 Scale = TypeSize::getFixed(ExactSize: 1);
4538 Width = TypeSize::getFixed(ExactSize: 2);
4539 MinOffset = -256;
4540 MaxOffset = 255;
4541 break;
4542 case AArch64::LDRBpost:
4543 case AArch64::LDRBpre:
4544 case AArch64::STRBpost:
4545 case AArch64::STRBpre:
4546 case AArch64::LDRBBpost:
4547 case AArch64::LDRBBpre:
4548 case AArch64::STRBBpost:
4549 case AArch64::STRBBpre:
4550 Scale = TypeSize::getFixed(ExactSize: 1);
4551 Width = TypeSize::getFixed(ExactSize: 1);
4552 MinOffset = -256;
4553 MaxOffset = 255;
4554 break;
  // Unscaled (signed 9-bit immediate)
4556 case AArch64::LDURQi:
4557 case AArch64::STURQi:
4558 Scale = TypeSize::getFixed(ExactSize: 1);
4559 Width = TypeSize::getFixed(ExactSize: 16);
4560 MinOffset = -256;
4561 MaxOffset = 255;
4562 break;
4563 case AArch64::LDURXi:
4564 case AArch64::LDURDi:
4565 case AArch64::LDAPURXi:
4566 case AArch64::STURXi:
4567 case AArch64::STURDi:
4568 case AArch64::STLURXi:
4569 case AArch64::PRFUMi:
4570 Scale = TypeSize::getFixed(ExactSize: 1);
4571 Width = TypeSize::getFixed(ExactSize: 8);
4572 MinOffset = -256;
4573 MaxOffset = 255;
4574 break;
4575 case AArch64::LDURWi:
4576 case AArch64::LDURSi:
4577 case AArch64::LDURSWi:
4578 case AArch64::LDAPURi:
4579 case AArch64::LDAPURSWi:
4580 case AArch64::STURWi:
4581 case AArch64::STURSi:
4582 case AArch64::STLURWi:
4583 Scale = TypeSize::getFixed(ExactSize: 1);
4584 Width = TypeSize::getFixed(ExactSize: 4);
4585 MinOffset = -256;
4586 MaxOffset = 255;
4587 break;
4588 case AArch64::LDURHi:
4589 case AArch64::LDURHHi:
4590 case AArch64::LDURSHXi:
4591 case AArch64::LDURSHWi:
4592 case AArch64::LDAPURHi:
4593 case AArch64::LDAPURSHWi:
4594 case AArch64::LDAPURSHXi:
4595 case AArch64::STURHi:
4596 case AArch64::STURHHi:
4597 case AArch64::STLURHi:
4598 Scale = TypeSize::getFixed(ExactSize: 1);
4599 Width = TypeSize::getFixed(ExactSize: 2);
4600 MinOffset = -256;
4601 MaxOffset = 255;
4602 break;
4603 case AArch64::LDURBi:
4604 case AArch64::LDURBBi:
4605 case AArch64::LDURSBXi:
4606 case AArch64::LDURSBWi:
4607 case AArch64::LDAPURBi:
4608 case AArch64::LDAPURSBWi:
4609 case AArch64::LDAPURSBXi:
4610 case AArch64::STURBi:
4611 case AArch64::STURBBi:
4612 case AArch64::STLURBi:
4613 Scale = TypeSize::getFixed(ExactSize: 1);
4614 Width = TypeSize::getFixed(ExactSize: 1);
4615 MinOffset = -256;
4616 MaxOffset = 255;
4617 break;
  // LDP / STP (including pre/post-indexed; signed 7-bit immediate scaled by
  // the access size)
4619 case AArch64::LDPQi:
4620 case AArch64::LDNPQi:
4621 case AArch64::STPQi:
4622 case AArch64::STNPQi:
4623 case AArch64::LDPQpost:
4624 case AArch64::LDPQpre:
4625 case AArch64::STPQpost:
4626 case AArch64::STPQpre:
4627 Scale = TypeSize::getFixed(ExactSize: 16);
4628 Width = TypeSize::getFixed(ExactSize: 16 * 2);
4629 MinOffset = -64;
4630 MaxOffset = 63;
4631 break;
4632 case AArch64::LDPXi:
4633 case AArch64::LDPDi:
4634 case AArch64::LDNPXi:
4635 case AArch64::LDNPDi:
4636 case AArch64::STPXi:
4637 case AArch64::STPDi:
4638 case AArch64::STNPXi:
4639 case AArch64::STNPDi:
4640 case AArch64::LDPDpost:
4641 case AArch64::LDPDpre:
4642 case AArch64::LDPXpost:
4643 case AArch64::LDPXpre:
4644 case AArch64::STPDpost:
4645 case AArch64::STPDpre:
4646 case AArch64::STPXpost:
4647 case AArch64::STPXpre:
4648 Scale = TypeSize::getFixed(ExactSize: 8);
4649 Width = TypeSize::getFixed(ExactSize: 8 * 2);
4650 MinOffset = -64;
4651 MaxOffset = 63;
4652 break;
4653 case AArch64::LDPWi:
4654 case AArch64::LDPSi:
4655 case AArch64::LDNPWi:
4656 case AArch64::LDNPSi:
4657 case AArch64::STPWi:
4658 case AArch64::STPSi:
4659 case AArch64::STNPWi:
4660 case AArch64::STNPSi:
4661 case AArch64::LDPSpost:
4662 case AArch64::LDPSpre:
4663 case AArch64::LDPWpost:
4664 case AArch64::LDPWpre:
4665 case AArch64::STPSpost:
4666 case AArch64::STPSpre:
4667 case AArch64::STPWpost:
4668 case AArch64::STPWpre:
4669 Scale = TypeSize::getFixed(ExactSize: 4);
4670 Width = TypeSize::getFixed(ExactSize: 4 * 2);
4671 MinOffset = -64;
4672 MaxOffset = 63;
4673 break;
4674 case AArch64::StoreSwiftAsyncContext:
4675 // Store is an STRXui, but there might be an ADDXri in the expansion too.
4676 Scale = TypeSize::getFixed(ExactSize: 1);
4677 Width = TypeSize::getFixed(ExactSize: 8);
4678 MinOffset = 0;
4679 MaxOffset = 4095;
4680 break;
4681 case AArch64::ADDG:
4682 Scale = TypeSize::getFixed(ExactSize: 16);
4683 Width = TypeSize::getFixed(ExactSize: 0);
4684 MinOffset = 0;
4685 MaxOffset = 63;
4686 break;
4687 case AArch64::TAGPstack:
4688 Scale = TypeSize::getFixed(ExactSize: 16);
4689 Width = TypeSize::getFixed(ExactSize: 0);
4690 // TAGP with a negative offset turns into SUBP, which has a maximum offset
4691 // of 63 (not 64!).
4692 MinOffset = -63;
4693 MaxOffset = 63;
4694 break;
4695 case AArch64::LDG:
4696 case AArch64::STGi:
4697 case AArch64::STGPreIndex:
4698 case AArch64::STGPostIndex:
4699 case AArch64::STZGi:
4700 case AArch64::STZGPreIndex:
4701 case AArch64::STZGPostIndex:
4702 Scale = TypeSize::getFixed(ExactSize: 16);
4703 Width = TypeSize::getFixed(ExactSize: 16);
4704 MinOffset = -256;
4705 MaxOffset = 255;
4706 break;
4707 // SVE
4708 case AArch64::STR_ZZZZXI:
4709 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
4710 case AArch64::LDR_ZZZZXI:
4711 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
4712 Scale = TypeSize::getScalable(MinimumSize: 16);
4713 Width = TypeSize::getScalable(MinimumSize: 16 * 4);
4714 MinOffset = -256;
4715 MaxOffset = 252;
4716 break;
4717 case AArch64::STR_ZZZXI:
4718 case AArch64::LDR_ZZZXI:
4719 Scale = TypeSize::getScalable(MinimumSize: 16);
4720 Width = TypeSize::getScalable(MinimumSize: 16 * 3);
4721 MinOffset = -256;
4722 MaxOffset = 253;
4723 break;
4724 case AArch64::STR_ZZXI:
4725 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
4726 case AArch64::LDR_ZZXI:
4727 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
4728 Scale = TypeSize::getScalable(MinimumSize: 16);
4729 Width = TypeSize::getScalable(MinimumSize: 16 * 2);
4730 MinOffset = -256;
4731 MaxOffset = 254;
4732 break;
4733 case AArch64::LDR_PXI:
4734 case AArch64::STR_PXI:
4735 Scale = TypeSize::getScalable(MinimumSize: 2);
4736 Width = TypeSize::getScalable(MinimumSize: 2);
4737 MinOffset = -256;
4738 MaxOffset = 255;
4739 break;
4740 case AArch64::LDR_PPXI:
4741 case AArch64::STR_PPXI:
4742 Scale = TypeSize::getScalable(MinimumSize: 2);
4743 Width = TypeSize::getScalable(MinimumSize: 2 * 2);
4744 MinOffset = -256;
4745 MaxOffset = 254;
4746 break;
4747 case AArch64::LDR_ZXI:
4748 case AArch64::STR_ZXI:
4749 Scale = TypeSize::getScalable(MinimumSize: 16);
4750 Width = TypeSize::getScalable(MinimumSize: 16);
4751 MinOffset = -256;
4752 MaxOffset = 255;
4753 break;
4754 case AArch64::LD1B_IMM:
4755 case AArch64::LD1H_IMM:
4756 case AArch64::LD1W_IMM:
4757 case AArch64::LD1D_IMM:
4758 case AArch64::LDNT1B_ZRI:
4759 case AArch64::LDNT1H_ZRI:
4760 case AArch64::LDNT1W_ZRI:
4761 case AArch64::LDNT1D_ZRI:
4762 case AArch64::ST1B_IMM:
4763 case AArch64::ST1H_IMM:
4764 case AArch64::ST1W_IMM:
4765 case AArch64::ST1D_IMM:
4766 case AArch64::STNT1B_ZRI:
4767 case AArch64::STNT1H_ZRI:
4768 case AArch64::STNT1W_ZRI:
4769 case AArch64::STNT1D_ZRI:
4770 case AArch64::LDNF1B_IMM:
4771 case AArch64::LDNF1H_IMM:
4772 case AArch64::LDNF1W_IMM:
4773 case AArch64::LDNF1D_IMM:
    // A full vector's worth of data
    // Width = mbytes * elements
4776 Scale = TypeSize::getScalable(MinimumSize: 16);
4777 Width = TypeSize::getScalable(MinimumSize: 16);
4778 MinOffset = -8;
4779 MaxOffset = 7;
4780 break;
4781 case AArch64::LD2B_IMM:
4782 case AArch64::LD2H_IMM:
4783 case AArch64::LD2W_IMM:
4784 case AArch64::LD2D_IMM:
4785 case AArch64::ST2B_IMM:
4786 case AArch64::ST2H_IMM:
4787 case AArch64::ST2W_IMM:
4788 case AArch64::ST2D_IMM:
4789 Scale = TypeSize::getScalable(MinimumSize: 32);
4790 Width = TypeSize::getScalable(MinimumSize: 16 * 2);
4791 MinOffset = -8;
4792 MaxOffset = 7;
4793 break;
4794 case AArch64::LD3B_IMM:
4795 case AArch64::LD3H_IMM:
4796 case AArch64::LD3W_IMM:
4797 case AArch64::LD3D_IMM:
4798 case AArch64::ST3B_IMM:
4799 case AArch64::ST3H_IMM:
4800 case AArch64::ST3W_IMM:
4801 case AArch64::ST3D_IMM:
4802 Scale = TypeSize::getScalable(MinimumSize: 48);
4803 Width = TypeSize::getScalable(MinimumSize: 16 * 3);
4804 MinOffset = -8;
4805 MaxOffset = 7;
4806 break;
4807 case AArch64::LD4B_IMM:
4808 case AArch64::LD4H_IMM:
4809 case AArch64::LD4W_IMM:
4810 case AArch64::LD4D_IMM:
4811 case AArch64::ST4B_IMM:
4812 case AArch64::ST4H_IMM:
4813 case AArch64::ST4W_IMM:
4814 case AArch64::ST4D_IMM:
4815 Scale = TypeSize::getScalable(MinimumSize: 64);
4816 Width = TypeSize::getScalable(MinimumSize: 16 * 4);
4817 MinOffset = -8;
4818 MaxOffset = 7;
4819 break;
4820 case AArch64::LD1B_H_IMM:
4821 case AArch64::LD1SB_H_IMM:
4822 case AArch64::LD1H_S_IMM:
4823 case AArch64::LD1SH_S_IMM:
4824 case AArch64::LD1W_D_IMM:
4825 case AArch64::LD1SW_D_IMM:
4826 case AArch64::ST1B_H_IMM:
4827 case AArch64::ST1H_S_IMM:
4828 case AArch64::ST1W_D_IMM:
4829 case AArch64::LDNF1B_H_IMM:
4830 case AArch64::LDNF1SB_H_IMM:
4831 case AArch64::LDNF1H_S_IMM:
4832 case AArch64::LDNF1SH_S_IMM:
4833 case AArch64::LDNF1W_D_IMM:
4834 case AArch64::LDNF1SW_D_IMM:
    // A half vector's worth of data
    // Width = mbytes * elements
4837 Scale = TypeSize::getScalable(MinimumSize: 8);
4838 Width = TypeSize::getScalable(MinimumSize: 8);
4839 MinOffset = -8;
4840 MaxOffset = 7;
4841 break;
4842 case AArch64::LD1B_S_IMM:
4843 case AArch64::LD1SB_S_IMM:
4844 case AArch64::LD1H_D_IMM:
4845 case AArch64::LD1SH_D_IMM:
4846 case AArch64::ST1B_S_IMM:
4847 case AArch64::ST1H_D_IMM:
4848 case AArch64::LDNF1B_S_IMM:
4849 case AArch64::LDNF1SB_S_IMM:
4850 case AArch64::LDNF1H_D_IMM:
4851 case AArch64::LDNF1SH_D_IMM:
    // A quarter vector's worth of data
    // Width = mbytes * elements
4854 Scale = TypeSize::getScalable(MinimumSize: 4);
4855 Width = TypeSize::getScalable(MinimumSize: 4);
4856 MinOffset = -8;
4857 MaxOffset = 7;
4858 break;
4859 case AArch64::LD1B_D_IMM:
4860 case AArch64::LD1SB_D_IMM:
4861 case AArch64::ST1B_D_IMM:
4862 case AArch64::LDNF1B_D_IMM:
4863 case AArch64::LDNF1SB_D_IMM:
    // An eighth vector's worth of data
    // Width = mbytes * elements
4866 Scale = TypeSize::getScalable(MinimumSize: 2);
4867 Width = TypeSize::getScalable(MinimumSize: 2);
4868 MinOffset = -8;
4869 MaxOffset = 7;
4870 break;
4871 case AArch64::ST2Gi:
4872 case AArch64::ST2GPreIndex:
4873 case AArch64::ST2GPostIndex:
4874 case AArch64::STZ2Gi:
4875 case AArch64::STZ2GPreIndex:
4876 case AArch64::STZ2GPostIndex:
4877 Scale = TypeSize::getFixed(ExactSize: 16);
4878 Width = TypeSize::getFixed(ExactSize: 32);
4879 MinOffset = -256;
4880 MaxOffset = 255;
4881 break;
4882 case AArch64::STGPi:
4883 case AArch64::STGPpost:
4884 case AArch64::STGPpre:
4885 Scale = TypeSize::getFixed(ExactSize: 16);
4886 Width = TypeSize::getFixed(ExactSize: 16);
4887 MinOffset = -64;
4888 MaxOffset = 63;
4889 break;
4890 case AArch64::LD1RB_IMM:
4891 case AArch64::LD1RB_H_IMM:
4892 case AArch64::LD1RB_S_IMM:
4893 case AArch64::LD1RB_D_IMM:
4894 case AArch64::LD1RSB_H_IMM:
4895 case AArch64::LD1RSB_S_IMM:
4896 case AArch64::LD1RSB_D_IMM:
4897 Scale = TypeSize::getFixed(ExactSize: 1);
4898 Width = TypeSize::getFixed(ExactSize: 1);
4899 MinOffset = 0;
4900 MaxOffset = 63;
4901 break;
4902 case AArch64::LD1RH_IMM:
4903 case AArch64::LD1RH_S_IMM:
4904 case AArch64::LD1RH_D_IMM:
4905 case AArch64::LD1RSH_S_IMM:
4906 case AArch64::LD1RSH_D_IMM:
4907 Scale = TypeSize::getFixed(ExactSize: 2);
4908 Width = TypeSize::getFixed(ExactSize: 2);
4909 MinOffset = 0;
4910 MaxOffset = 63;
4911 break;
4912 case AArch64::LD1RW_IMM:
4913 case AArch64::LD1RW_D_IMM:
4914 case AArch64::LD1RSW_IMM:
4915 Scale = TypeSize::getFixed(ExactSize: 4);
4916 Width = TypeSize::getFixed(ExactSize: 4);
4917 MinOffset = 0;
4918 MaxOffset = 63;
4919 break;
4920 case AArch64::LD1RD_IMM:
4921 Scale = TypeSize::getFixed(ExactSize: 8);
4922 Width = TypeSize::getFixed(ExactSize: 8);
4923 MinOffset = 0;
4924 MaxOffset = 63;
4925 break;
4926 }
4927
4928 return true;
4929}
4930
// Return the size in bytes of the memory access, which is also the scaling
// factor for the immediate of the corresponding scaled (ui) form.
4932int AArch64InstrInfo::getMemScale(unsigned Opc) {
4933 switch (Opc) {
4934 default:
4935 llvm_unreachable("Opcode has unknown scale!");
4936 case AArch64::LDRBui:
4937 case AArch64::LDRBBui:
4938 case AArch64::LDURBBi:
4939 case AArch64::LDRSBWui:
4940 case AArch64::LDURSBWi:
4941 case AArch64::STRBui:
4942 case AArch64::STRBBui:
4943 case AArch64::STURBBi:
4944 return 1;
4945 case AArch64::LDRHui:
4946 case AArch64::LDRHHui:
4947 case AArch64::LDURHHi:
4948 case AArch64::LDRSHWui:
4949 case AArch64::LDURSHWi:
4950 case AArch64::STRHui:
4951 case AArch64::STRHHui:
4952 case AArch64::STURHHi:
4953 return 2;
4954 case AArch64::LDRSui:
4955 case AArch64::LDURSi:
4956 case AArch64::LDRSpre:
4957 case AArch64::LDRSWui:
4958 case AArch64::LDURSWi:
4959 case AArch64::LDRSWpre:
4960 case AArch64::LDRWpre:
4961 case AArch64::LDRWui:
4962 case AArch64::LDURWi:
4963 case AArch64::STRSui:
4964 case AArch64::STURSi:
4965 case AArch64::STRSpre:
4966 case AArch64::STRWui:
4967 case AArch64::STURWi:
4968 case AArch64::STRWpre:
4969 case AArch64::LDPSi:
4970 case AArch64::LDPSWi:
4971 case AArch64::LDPWi:
4972 case AArch64::STPSi:
4973 case AArch64::STPWi:
4974 return 4;
4975 case AArch64::LDRDui:
4976 case AArch64::LDURDi:
4977 case AArch64::LDRDpre:
4978 case AArch64::LDRXui:
4979 case AArch64::LDURXi:
4980 case AArch64::LDRXpre:
4981 case AArch64::STRDui:
4982 case AArch64::STURDi:
4983 case AArch64::STRDpre:
4984 case AArch64::STRXui:
4985 case AArch64::STURXi:
4986 case AArch64::STRXpre:
4987 case AArch64::LDPDi:
4988 case AArch64::LDPXi:
4989 case AArch64::STPDi:
4990 case AArch64::STPXi:
4991 return 8;
4992 case AArch64::LDRQui:
4993 case AArch64::LDURQi:
4994 case AArch64::STRQui:
4995 case AArch64::STURQi:
4996 case AArch64::STRQpre:
4997 case AArch64::LDPQi:
4998 case AArch64::LDRQpre:
4999 case AArch64::STPQi:
5000 case AArch64::STGi:
5001 case AArch64::STZGi:
5002 case AArch64::ST2Gi:
5003 case AArch64::STZ2Gi:
5004 case AArch64::STGPi:
5005 return 16;
5006 }
5007}
5008
5009bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
5010 switch (MI.getOpcode()) {
5011 default:
5012 return false;
5013 case AArch64::LDRWpre:
5014 case AArch64::LDRXpre:
5015 case AArch64::LDRSWpre:
5016 case AArch64::LDRSpre:
5017 case AArch64::LDRDpre:
5018 case AArch64::LDRQpre:
5019 return true;
5020 }
5021}
5022
5023bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
5024 switch (MI.getOpcode()) {
5025 default:
5026 return false;
5027 case AArch64::STRWpre:
5028 case AArch64::STRXpre:
5029 case AArch64::STRSpre:
5030 case AArch64::STRDpre:
5031 case AArch64::STRQpre:
5032 return true;
5033 }
5034}
5035
5036bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
5037 return isPreLd(MI) || isPreSt(MI);
5038}
5039
5040bool AArch64InstrInfo::isZExtLoad(const MachineInstr &MI) {
5041 switch (MI.getOpcode()) {
5042 default:
5043 return false;
5044 case AArch64::LDURBBi:
5045 case AArch64::LDURHHi:
5046 case AArch64::LDURWi:
5047 case AArch64::LDRBBui:
5048 case AArch64::LDRHHui:
5049 case AArch64::LDRWui:
5050 case AArch64::LDRBBroX:
5051 case AArch64::LDRHHroX:
5052 case AArch64::LDRWroX:
5053 case AArch64::LDRBBroW:
5054 case AArch64::LDRHHroW:
5055 case AArch64::LDRWroW:
5056 return true;
5057 }
5058}
5059
5060bool AArch64InstrInfo::isSExtLoad(const MachineInstr &MI) {
5061 switch (MI.getOpcode()) {
5062 default:
5063 return false;
5064 case AArch64::LDURSBWi:
5065 case AArch64::LDURSHWi:
5066 case AArch64::LDURSBXi:
5067 case AArch64::LDURSHXi:
5068 case AArch64::LDURSWi:
5069 case AArch64::LDRSBWui:
5070 case AArch64::LDRSHWui:
5071 case AArch64::LDRSBXui:
5072 case AArch64::LDRSHXui:
5073 case AArch64::LDRSWui:
5074 case AArch64::LDRSBWroX:
5075 case AArch64::LDRSHWroX:
5076 case AArch64::LDRSBXroX:
5077 case AArch64::LDRSHXroX:
5078 case AArch64::LDRSWroX:
5079 case AArch64::LDRSBWroW:
5080 case AArch64::LDRSHWroW:
5081 case AArch64::LDRSBXroW:
5082 case AArch64::LDRSHXroW:
5083 case AArch64::LDRSWroW:
5084 return true;
5085 }
5086}
5087
5088bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
5089 switch (MI.getOpcode()) {
5090 default:
5091 return false;
5092 case AArch64::LDPSi:
5093 case AArch64::LDPSWi:
5094 case AArch64::LDPDi:
5095 case AArch64::LDPQi:
5096 case AArch64::LDPWi:
5097 case AArch64::LDPXi:
5098 case AArch64::STPSi:
5099 case AArch64::STPDi:
5100 case AArch64::STPQi:
5101 case AArch64::STPWi:
5102 case AArch64::STPXi:
5103 case AArch64::STGPi:
5104 return true;
5105 }
5106}
5107
5108const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
5109 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
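  // Paired and pre-indexed forms carry one extra leading operand (the second
  // data register, or the write-back base def), so the base register sits at
  // operand index 2 instead of 1.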
  unsigned Idx =
      AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
                                                                            : 1;
  return MI.getOperand(Idx);
}

const MachineOperand &
AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
  assert(MI.mayLoadOrStore() && "Load or store instruction expected");
  unsigned Idx =
      AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
                                                                            : 2;
  return MI.getOperand(Idx);
}

const MachineOperand &
AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode");
  case AArch64::LDRBroX:
  case AArch64::LDRBBroX:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSBWroX:
  case AArch64::LDRHroX:
  case AArch64::LDRHHroX:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSHWroX:
  case AArch64::LDRWroX:
  case AArch64::LDRSroX:
  case AArch64::LDRSWroX:
  case AArch64::LDRDroX:
  case AArch64::LDRXroX:
  case AArch64::LDRQroX:
    return MI.getOperand(4);
  }
}

static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
                                              Register Reg) {
  if (MI.getParent() == nullptr)
    return nullptr;
  const MachineFunction *MF = MI.getParent()->getParent();
  return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
}

bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
  auto IsHFPR = [&](const MachineOperand &Op) {
    if (!Op.isReg())
      return false;
    auto Reg = Op.getReg();
    if (Reg.isPhysical())
      return AArch64::FPR16RegClass.contains(Reg);
    const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
    return TRC == &AArch64::FPR16RegClass ||
           TRC == &AArch64::FPR16_loRegClass;
  };
  return llvm::any_of(MI.operands(), IsHFPR);
}

bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
  auto IsQFPR = [&](const MachineOperand &Op) {
    if (!Op.isReg())
      return false;
    auto Reg = Op.getReg();
    if (Reg.isPhysical())
      return AArch64::FPR128RegClass.contains(Reg);
    const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
    return TRC == &AArch64::FPR128RegClass ||
           TRC == &AArch64::FPR128_loRegClass;
  };
  return llvm::any_of(MI.operands(), IsQFPR);
}

bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AArch64::BRK:
  case AArch64::HLT:
  case AArch64::PACIASP:
  case AArch64::PACIBSP:
    // Implicit BTI behavior.
    return true;
  case AArch64::PAUTH_PROLOGUE:
    // PAUTH_PROLOGUE expands to PACI(A|B)SP.
    return true;
  case AArch64::HINT: {
    unsigned Imm = MI.getOperand(0).getImm();
    // Explicit BTI instruction.
    if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
      return true;
    // PACI(A|B)SP instructions.
    if (Imm == 25 || Imm == 27)
      return true;
    return false;
  }
  default:
    return false;
  }
}

bool AArch64InstrInfo::isFpOrNEON(Register Reg) {
  if (Reg == 0)
    return false;
  assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
  return AArch64::FPR128RegClass.contains(Reg) ||
         AArch64::FPR64RegClass.contains(Reg) ||
         AArch64::FPR32RegClass.contains(Reg) ||
         AArch64::FPR16RegClass.contains(Reg) ||
         AArch64::FPR8RegClass.contains(Reg);
}

bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
  auto IsFPR = [&](const MachineOperand &Op) {
    if (!Op.isReg())
      return false;
    auto Reg = Op.getReg();
    if (Reg.isPhysical())
      return isFpOrNEON(Reg);

    const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
    return TRC == &AArch64::FPR128RegClass ||
           TRC == &AArch64::FPR128_loRegClass ||
           TRC == &AArch64::FPR64RegClass ||
           TRC == &AArch64::FPR64_loRegClass ||
           TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
           TRC == &AArch64::FPR8RegClass;
  };
  return llvm::any_of(MI.operands(), IsFPR);
}

// Scale the unscaled offsets. Returns false if the unscaled offset can't be
// scaled.
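// For example, an unscaled STURXi byte offset of 24 scales to the element
// offset 3 (24 / 8), while a byte offset of 20 is not a multiple of the
// 8-byte stride and cannot be scaled.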
static bool scaleOffset(unsigned Opc, int64_t &Offset) {
  int Scale = AArch64InstrInfo::getMemScale(Opc);

  // If the byte-offset isn't a multiple of the stride, we can't scale this
  // offset.
  if (Offset % Scale != 0)
    return false;

  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  Offset /= Scale;
  return true;
}

static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
  if (FirstOpc == SecondOpc)
    return true;
  // We can also pair sign-ext and zero-ext instructions.
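  // For example, a zero-extending LDRWui can pair with a sign-extending
  // LDRSWui (see the LDRWui/LDRSWui cases below).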
  switch (FirstOpc) {
  default:
    return false;
  case AArch64::STRSui:
  case AArch64::STURSi:
    return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
  }
  // These instructions can't be paired based on their opcodes.
  return false;
}

static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
                            int64_t Offset1, unsigned Opcode1, int FI2,
                            int64_t Offset2, unsigned Opcode2) {
  // Accesses through fixed stack object frame indices may access a different
  // fixed stack slot. Check that the object offsets + offsets match.
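  // For example, two 8-byte fixed slots at object offsets -16 and -8 accessed
  // with LDRXui scale to -2 and -1, which are adjacent and can be clustered.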
  if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
    int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
    int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
    assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
    // Convert to scaled object offsets.
    int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
    if (ObjectOffset1 % Scale1 != 0)
      return false;
    ObjectOffset1 /= Scale1;
    int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
    if (ObjectOffset2 % Scale2 != 0)
      return false;
    ObjectOffset2 /= Scale2;
    ObjectOffset1 += Offset1;
    ObjectOffset2 += Offset2;
    return ObjectOffset1 + 1 == ObjectOffset2;
  }

  return FI1 == FI2;
}

/// Detect opportunities for ldp/stp formation.
///
/// Only called for LdSt for which getMemOperandWithOffset returns true.
bool AArch64InstrInfo::shouldClusterMemOps(
    ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
    bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
    int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
    unsigned NumBytes) const {
  assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
  const MachineOperand &BaseOp1 = *BaseOps1.front();
  const MachineOperand &BaseOp2 = *BaseOps2.front();
  const MachineInstr &FirstLdSt = *BaseOp1.getParent();
  const MachineInstr &SecondLdSt = *BaseOp2.getParent();
  if (BaseOp1.getType() != BaseOp2.getType())
    return false;

  assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
         "Only base registers and frame indices are supported.");

  // Check for both base regs and base FI.
  if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
    return false;

  // Only cluster up to a single pair.
  if (ClusterSize > 2)
    return false;

  if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
    return false;

  // Can we pair these instructions based on their opcodes?
  unsigned FirstOpc = FirstLdSt.getOpcode();
  unsigned SecondOpc = SecondLdSt.getOpcode();
  if (!canPairLdStOpc(FirstOpc, SecondOpc))
    return false;

  // Can't merge volatiles or load/stores that have a hint to avoid pair
  // formation, for example.
  if (!isCandidateToMergeOrPair(FirstLdSt) ||
      !isCandidateToMergeOrPair(SecondLdSt))
    return false;

  // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
  int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
  if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
    return false;

  int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
  if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
    return false;

  // Pairwise instructions have a 7-bit signed offset field.
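  // That is, the scaled offset must lie in the range [-64, 63].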
  if (Offset1 > 63 || Offset1 < -64)
    return false;

  // The caller should already have ordered First/SecondLdSt by offset.
  // Note: except for non-equal frame index bases
  if (BaseOp1.isFI()) {
    assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
           "Caller should have ordered offsets.");

    const MachineFrameInfo &MFI =
        FirstLdSt.getParent()->getParent()->getFrameInfo();
    return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
                           BaseOp2.getIndex(), Offset2, SecondOpc);
  }

  assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");

  return Offset1 + 1 == Offset2;
}

static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
                                            MCRegister Reg, unsigned SubIdx,
                                            RegState State,
                                            const TargetRegisterInfo *TRI) {
  if (!SubIdx)
    return MIB.addReg(Reg, State);

  if (Reg.isPhysical())
    return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
  return MIB.addReg(Reg, State, SubIdx);
}

static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
                                        unsigned NumRegs) {
  // We really want the positive remainder mod 32 here, that happens to be
  // easily obtainable with a mask.
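  // For example, copying a 3-register tuple starting at encoding 1 into one
  // starting at encoding 2 gives (2 - 1) & 0x1f == 1 < 3: a forward copy
  // would overwrite the second and third sources before reading them, so the
  // caller must copy the sub-registers in reverse order.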
  return ((DestReg - SrcReg) & 0x1f) < NumRegs;
}

void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator I,
                                        const DebugLoc &DL, MCRegister DestReg,
                                        MCRegister SrcReg, bool KillSrc,
                                        unsigned Opcode,
                                        ArrayRef<unsigned> Indices) const {
  assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
  unsigned NumRegs = Indices.size();

  int SubReg = 0, End = NumRegs, Incr = 1;
  if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
    SubReg = NumRegs - 1;
    End = -1;
    Incr = -1;
  }

  for (; SubReg != End; SubReg += Incr) {
    const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
    AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
    AddSubReg(MIB, SrcReg, Indices[SubReg], {}, TRI);
    AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
  }
}

void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator I,
                                       const DebugLoc &DL, MCRegister DestReg,
                                       MCRegister SrcReg, bool KillSrc,
                                       unsigned Opcode, unsigned ZeroReg,
                                       llvm::ArrayRef<unsigned> Indices) const {
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  unsigned NumRegs = Indices.size();

#ifndef NDEBUG
  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
  assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
         "GPR reg sequences should not be able to overlap");
#endif

  for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
    const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
    AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
    MIB.addReg(ZeroReg);
    AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
    MIB.addImm(0);
  }
}
/// Returns true if the instruction at I is inside a streaming call site
/// region, considering only I's own basic block.
/// A streaming call site region starts after an smstart and ends at the
/// matching smstop around a call to a streaming function. This walks
/// backward from I to find the most recent streaming-mode change.
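/// For example:
///   smstart sm          ; region begins
///   <instruction at I>  ; isInStreamingCallSiteRegion(MBB, I) == true
///   bl streaming_callee
///   smstop sm           ; region ends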
static bool isInStreamingCallSiteRegion(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator I) {
  MachineFunction &MF = *MBB.getParent();
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  if (!AFI->hasStreamingModeChanges())
    return false;
  // Walk backwards to find smstart/smstop.
  for (MachineInstr &MI : reverse(make_range(MBB.begin(), I))) {
    unsigned Opc = MI.getOpcode();
    if (Opc == AArch64::MSRpstatesvcrImm1 || Opc == AArch64::MSRpstatePseudo) {
      // Check if this is an SM change (not a ZA-only change).
      int64_t PState = MI.getOperand(0).getImm();
      if (PState == AArch64SVCR::SVCRSM || PState == AArch64SVCR::SVCRSMZA) {
        // Operand 1 is 1 for start, 0 for stop.
        return MI.getOperand(1).getImm() == 1;
      }
    }
  }
  return false;
}

/// Returns true if in a streaming call site region without SME-FA64.
static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget,
                                MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator I) {
  return !Subtarget.hasSMEFA64() && isInStreamingCallSiteRegion(MBB, I);
}

void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator I,
                                   const DebugLoc &DL, Register DestReg,
                                   Register SrcReg, bool KillSrc,
                                   bool RenamableDest,
                                   bool RenamableSrc) const {
  ++NumCopyInstrs;
  if (AArch64::GPR32spRegClass.contains(DestReg) &&
      AArch64::GPR32spRegClass.contains(SrcReg)) {
    if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
      // If either operand is WSP, expand to ADD #0.
      if (Subtarget.hasZeroCycleRegMoveGPR64() &&
          !Subtarget.hasZeroCycleRegMoveGPR32()) {
        // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
        MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
                                                     &AArch64::GPR64spRegClass);
        MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
                                                    &AArch64::GPR64spRegClass);
        // This instruction is reading and writing X registers. This may upset
        // the register scavenger and machine verifier, so we need to indicate
        // that we are reading an undefined value from SrcRegX, but a proper
        // value from SrcReg.
        BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
            .addReg(SrcRegX, RegState::Undef)
            .addImm(0)
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
            .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
        ++NumZCRegMoveInstrsGPR;
      } else {
        BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc))
            .addImm(0)
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
        if (Subtarget.hasZeroCycleRegMoveGPR32())
          ++NumZCRegMoveInstrsGPR;
      }
    } else if (Subtarget.hasZeroCycleRegMoveGPR64() &&
               !Subtarget.hasZeroCycleRegMoveGPR32()) {
      // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
      MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
                                                   &AArch64::GPR64spRegClass);
      assert(DestRegX.isValid() && "Destination super-reg not valid");
      MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
                                                  &AArch64::GPR64spRegClass);
      assert(SrcRegX.isValid() && "Source super-reg not valid");
      // This instruction is reading and writing X registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegX, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
          .addReg(AArch64::XZR)
          .addReg(SrcRegX, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
      ++NumZCRegMoveInstrsGPR;
    } else {
      // Otherwise, expand to ORR WZR.
      BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
          .addReg(AArch64::WZR)
          .addReg(SrcReg, getKillRegState(KillSrc));
      if (Subtarget.hasZeroCycleRegMoveGPR32())
        ++NumZCRegMoveInstrsGPR;
    }
    return;
  }

  // GPR32 zeroing
  if (AArch64::GPR32spRegClass.contains(DestReg) && SrcReg == AArch64::WZR) {
    if (Subtarget.hasZeroCycleZeroingGPR64() &&
        !Subtarget.hasZeroCycleZeroingGPR32()) {
      MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
                                                   &AArch64::GPR64spRegClass);
      assert(DestRegX.isValid() && "Destination super-reg not valid");
      BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestRegX)
          .addImm(0)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
      ++NumZCZeroingInstrsGPR;
    } else if (Subtarget.hasZeroCycleZeroingGPR32()) {
      BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
          .addImm(0)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
      ++NumZCZeroingInstrsGPR;
    } else {
      BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
          .addReg(AArch64::WZR)
          .addReg(AArch64::WZR);
    }
    return;
  }

  if (AArch64::GPR64spRegClass.contains(DestReg) &&
      AArch64::GPR64spRegClass.contains(SrcReg)) {
    if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
      // If either operand is SP, expand to ADD #0.
      BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addImm(0)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
      if (Subtarget.hasZeroCycleRegMoveGPR64())
        ++NumZCRegMoveInstrsGPR;
    } else {
      // Otherwise, expand to ORR XZR.
      BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
          .addReg(AArch64::XZR)
          .addReg(SrcReg, getKillRegState(KillSrc));
      if (Subtarget.hasZeroCycleRegMoveGPR64())
        ++NumZCRegMoveInstrsGPR;
    }
    return;
  }

  // GPR64 zeroing
  if (AArch64::GPR64spRegClass.contains(DestReg) && SrcReg == AArch64::XZR) {
    if (Subtarget.hasZeroCycleZeroingGPR64()) {
      BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
          .addImm(0)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
      ++NumZCZeroingInstrsGPR;
    } else {
      BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
          .addReg(AArch64::XZR)
          .addReg(AArch64::XZR);
    }
    return;
  }

  // Copy a Predicate register by ORRing with itself.
  if (AArch64::PPRRegClass.contains(DestReg) &&
      AArch64::PPRRegClass.contains(SrcReg)) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
        .addReg(SrcReg) // Pg
        .addReg(SrcReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  // Copy a predicate-as-counter register by ORRing with itself as if it
  // were a regular predicate (mask) register.
  bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
  bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
  if (DestIsPNR || SrcIsPNR) {
    auto ToPPR = [](MCRegister R) -> MCRegister {
      return (R - AArch64::PN0) + AArch64::P0;
    };
    MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg.asMCReg();
    MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg.asMCReg();

    if (PPRSrcReg != PPRDestReg) {
      auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
                       .addReg(PPRSrcReg) // Pg
                       .addReg(PPRSrcReg)
                       .addReg(PPRSrcReg, getKillRegState(KillSrc));
      if (DestIsPNR)
        NewMI.addDef(DestReg, RegState::Implicit);
    }
    return;
  }

  // Copy a Z register by ORRing with itself.
  if (AArch64::ZPRRegClass.contains(DestReg) &&
      AArch64::ZPRRegClass.contains(SrcReg)) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
        .addReg(SrcReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  // Copy a Z register pair by copying the individual sub-registers.
  if ((AArch64::ZPR2RegClass.contains(DestReg) ||
       AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
      (AArch64::ZPR2RegClass.contains(SrcReg) ||
       AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
                     Indices);
    return;
  }

  // Copy a Z register triple by copying the individual sub-registers.
  if (AArch64::ZPR3RegClass.contains(DestReg) &&
      AArch64::ZPR3RegClass.contains(SrcReg)) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
                                       AArch64::zsub2};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
                     Indices);
    return;
  }

  // Copy a Z register quad by copying the individual sub-registers.
  if ((AArch64::ZPR4RegClass.contains(DestReg) ||
       AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
      (AArch64::ZPR4RegClass.contains(SrcReg) ||
       AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
                                       AArch64::zsub2, AArch64::zsub3};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
                     Indices);
    return;
  }

  // Copy a DDDD register quad by copying the individual sub-registers.
  if (AArch64::DDDDRegClass.contains(DestReg) &&
      AArch64::DDDDRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
                                       AArch64::dsub2, AArch64::dsub3};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
                     Indices);
    return;
  }

  // Copy a DDD register triple by copying the individual sub-registers.
  if (AArch64::DDDRegClass.contains(DestReg) &&
      AArch64::DDDRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
                                       AArch64::dsub2};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
                     Indices);
    return;
  }

  // Copy a DD register pair by copying the individual sub-registers.
  if (AArch64::DDRegClass.contains(DestReg) &&
      AArch64::DDRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
                     Indices);
    return;
  }

  // Copy a QQQQ register quad by copying the individual sub-registers.
  if (AArch64::QQQQRegClass.contains(DestReg) &&
      AArch64::QQQQRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
                                       AArch64::qsub2, AArch64::qsub3};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
                     Indices);
    return;
  }

  // Copy a QQQ register triple by copying the individual sub-registers.
  if (AArch64::QQQRegClass.contains(DestReg) &&
      AArch64::QQQRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
                                       AArch64::qsub2};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
                     Indices);
    return;
  }

  // Copy a QQ register pair by copying the individual sub-registers.
  if (AArch64::QQRegClass.contains(DestReg) &&
      AArch64::QQRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
                     Indices);
    return;
  }

  if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
      AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
    copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
                    AArch64::XZR, Indices);
    return;
  }

  if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
      AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
    copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
                    AArch64::WZR, Indices);
    return;
  }

  if (AArch64::FPR128RegClass.contains(DestReg) &&
      AArch64::FPR128RegClass.contains(SrcReg)) {
    // In streaming regions, NEON is illegal but streaming-SVE is available.
    // Use SVE for copies if we're in a streaming region and SME is available.
    // With +sme-fa64, NEON is legal in streaming mode so we can use it.
    if ((Subtarget.isSVEorStreamingSVEAvailable() &&
         !Subtarget.isNeonAvailable()) ||
        mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
      BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
          .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
          .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
          .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
    } else if (Subtarget.isNeonAvailable()) {
      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
          .addReg(SrcReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
      if (Subtarget.hasZeroCycleRegMoveFPR128())
        ++NumZCRegMoveInstrsFPR;
    } else {
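      // Without NEON or SVE there is no direct Q-register move, so bounce the
      // value through the stack: a pre-decrement store followed by a
      // post-increment reload leaves SP unchanged.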
      BuildMI(MBB, I, DL, get(AArch64::STRQpre))
          .addReg(AArch64::SP, RegState::Define)
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addReg(AArch64::SP)
          .addImm(-16);
      BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
          .addReg(AArch64::SP, RegState::Define)
          .addReg(DestReg, RegState::Define)
          .addReg(AArch64::SP)
          .addImm(16);
    }
    return;
  }

  if (AArch64::FPR64RegClass.contains(DestReg) &&
      AArch64::FPR64RegClass.contains(SrcReg)) {
    if (Subtarget.hasZeroCycleRegMoveFPR128() &&
        !Subtarget.hasZeroCycleRegMoveFPR64() &&
        !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
        !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
      MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
                                                   &AArch64::FPR128RegClass);
      MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
                                                  &AArch64::FPR128RegClass);
      // This instruction is reading and writing Q registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegQ, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
      ++NumZCRegMoveInstrsFPR;
    } else {
      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
      if (Subtarget.hasZeroCycleRegMoveFPR64())
        ++NumZCRegMoveInstrsFPR;
    }
    return;
  }

  if (AArch64::FPR32RegClass.contains(DestReg) &&
      AArch64::FPR32RegClass.contains(SrcReg)) {
    if (Subtarget.hasZeroCycleRegMoveFPR128() &&
        !Subtarget.hasZeroCycleRegMoveFPR64() &&
        !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
        !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
      MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
                                                   &AArch64::FPR128RegClass);
      MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
                                                  &AArch64::FPR128RegClass);
      // This instruction is reading and writing Q registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegQ, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
      ++NumZCRegMoveInstrsFPR;
    } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
               !Subtarget.hasZeroCycleRegMoveFPR32()) {
      MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
                                                   &AArch64::FPR64RegClass);
      MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
                                                  &AArch64::FPR64RegClass);
      // This instruction is reading and writing D registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegD, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
          .addReg(SrcRegD, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
      ++NumZCRegMoveInstrsFPR;
    } else {
      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
      if (Subtarget.hasZeroCycleRegMoveFPR32())
        ++NumZCRegMoveInstrsFPR;
    }
    return;
  }

  if (AArch64::FPR16RegClass.contains(DestReg) &&
      AArch64::FPR16RegClass.contains(SrcReg)) {
    if (Subtarget.hasZeroCycleRegMoveFPR128() &&
        !Subtarget.hasZeroCycleRegMoveFPR64() &&
        !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
        !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
      MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
                                                   &AArch64::FPR128RegClass);
      MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
                                                  &AArch64::FPR128RegClass);
      // This instruction is reading and writing Q registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegQ, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
    } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
               !Subtarget.hasZeroCycleRegMoveFPR32()) {
      MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
                                                   &AArch64::FPR64RegClass);
      MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
                                                  &AArch64::FPR64RegClass);
      // This instruction is reading and writing D registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegD, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
          .addReg(SrcRegD, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
    } else {
      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
                                       &AArch64::FPR32RegClass);
      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
                                      &AArch64::FPR32RegClass);
      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
    }
    return;
  }

  if (AArch64::FPR8RegClass.contains(DestReg) &&
      AArch64::FPR8RegClass.contains(SrcReg)) {
    if (Subtarget.hasZeroCycleRegMoveFPR128() &&
        !Subtarget.hasZeroCycleRegMoveFPR64() &&
        !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
        !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
      MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
                                                   &AArch64::FPR128RegClass);
      MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
                                                  &AArch64::FPR128RegClass);
      // This instruction is reading and writing Q registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegQ, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
    } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
               !Subtarget.hasZeroCycleRegMoveFPR32()) {
      MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
                                                   &AArch64::FPR64RegClass);
      MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
                                                  &AArch64::FPR64RegClass);
      // This instruction is reading and writing D registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegD, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
          .addReg(SrcRegD, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
    } else {
      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
                                       &AArch64::FPR32RegClass);
      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
                                      &AArch64::FPR32RegClass);
      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
    }
    return;
  }

  // Copies between GPR64 and FPR64.
  if (AArch64::FPR64RegClass.contains(DestReg) &&
      AArch64::GPR64RegClass.contains(SrcReg)) {
    if (AArch64::XZR == SrcReg) {
      BuildMI(MBB, I, DL, get(AArch64::FMOVD0), DestReg);
    } else {
      BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
    }
    return;
  }
  if (AArch64::GPR64RegClass.contains(DestReg) &&
      AArch64::FPR64RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }
  // Copies between GPR32 and FPR32.
  if (AArch64::FPR32RegClass.contains(DestReg) &&
      AArch64::GPR32RegClass.contains(SrcReg)) {
    if (AArch64::WZR == SrcReg) {
      BuildMI(MBB, I, DL, get(AArch64::FMOVS0), DestReg);
    } else {
      BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
    }
    return;
  }
  if (AArch64::GPR32RegClass.contains(DestReg) &&
      AArch64::FPR32RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (DestReg == AArch64::NZCV) {
    assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
    BuildMI(MBB, I, DL, get(AArch64::MSR))
        .addImm(AArch64SysReg::NZCV)
        .addReg(SrcReg, getKillRegState(KillSrc))
        .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
    return;
  }

  if (SrcReg == AArch64::NZCV) {
    assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
    BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
        .addImm(AArch64SysReg::NZCV)
        .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
    return;
  }

#ifndef NDEBUG
  errs() << RI.getRegAsmName(DestReg) << " = COPY " << RI.getRegAsmName(SrcReg)
         << "\n";
#endif
  llvm_unreachable("unimplemented reg-to-reg copy");
}

static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
                                    MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator InsertBefore,
                                    const MCInstrDesc &MCID,
                                    Register SrcReg, bool IsKill,
                                    unsigned SubIdx0, unsigned SubIdx1, int FI,
                                    MachineMemOperand *MMO) {
  Register SrcReg0 = SrcReg;
  Register SrcReg1 = SrcReg;
  if (SrcReg.isPhysical()) {
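    // For a physical tuple, name the concrete sub-registers directly and
    // clear the sub-register indices.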
    SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
    SubIdx0 = 0;
    SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
    SubIdx1 = 0;
  }
  BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
      .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
      .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
      .addFrameIndex(FI)
      .addImm(0)
      .addMemOperand(MMO);
}

void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MBBI,
                                           Register SrcReg, bool isKill, int FI,
                                           const TargetRegisterClass *RC,
                                           Register VReg,
                                           MachineInstr::MIFlag Flags) const {
  MachineFunction &MF = *MBB.getParent();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO =
      MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                              MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
  unsigned Opc = 0;
  bool Offset = true;
  MCRegister PNRReg = MCRegister::NoRegister;
  unsigned StackID = TargetStackID::Default;
  switch (RI.getSpillSize(*RC)) {
  case 1:
    if (AArch64::FPR8RegClass.hasSubClassEq(RC))
      Opc = AArch64::STRBui;
    break;
  case 2: {
    if (AArch64::FPR16RegClass.hasSubClassEq(RC))
      Opc = AArch64::STRHui;
    else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
             AArch64::PPRRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_PXI;
      StackID = TargetStackID::ScalablePredicateVector;
    }
    break;
  }
  case 4:
    if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
      Opc = AArch64::STRWui;
      if (SrcReg.isVirtual())
        MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
      else
        assert(SrcReg != AArch64::WSP);
    } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
      Opc = AArch64::STRSui;
    else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
      Opc = AArch64::STR_PPXI;
      StackID = TargetStackID::ScalablePredicateVector;
    }
    break;
  case 8:
    if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
      Opc = AArch64::STRXui;
      if (SrcReg.isVirtual())
        MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
      else
        assert(SrcReg != AArch64::SP);
    } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
      Opc = AArch64::STRDui;
    } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
      storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
                              get(AArch64::STPWi), SrcReg, isKill,
                              AArch64::sube32, AArch64::subo32, FI, MMO);
      return;
    }
    break;
  case 16:
    if (AArch64::FPR128RegClass.hasSubClassEq(RC))
      Opc = AArch64::STRQui;
    else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Twov1d;
      Offset = false;
    } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
      storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
                              get(AArch64::STPXi), SrcReg, isKill,
                              AArch64::sube64, AArch64::subo64, FI, MMO);
      return;
    } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 24:
    if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Threev1d;
      Offset = false;
    }
    break;
  case 32:
    if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Fourv1d;
      Offset = false;
    } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Twov2d;
      Offset = false;
    } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
      StackID = TargetStackID::ScalableVector;
    } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 48:
    if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Threev2d;
      Offset = false;
    } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 64:
    if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Fourv2d;
      Offset = false;
    } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
      StackID = TargetStackID::ScalableVector;
    } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZZZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  }
  assert(Opc && "Unknown register class");
  MFI.setStackID(FI, StackID);

  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
                                     .addReg(SrcReg, getKillRegState(isKill))
                                     .addFrameIndex(FI);

  if (Offset)
    MI.addImm(0);
  if (PNRReg.isValid())
    MI.addDef(PNRReg, RegState::Implicit);
  MI.addMemOperand(MMO);
}

static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator InsertBefore,
                                     const MCInstrDesc &MCID,
                                     Register DestReg, unsigned SubIdx0,
                                     unsigned SubIdx1, int FI,
                                     MachineMemOperand *MMO) {
  Register DestReg0 = DestReg;
  Register DestReg1 = DestReg;
  bool IsUndef = true;
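  // A def of a sub-register of a virtual tuple only partially writes the
  // register, so the remaining lanes must be flagged undef; for a physical
  // tuple the concrete sub-registers are written directly instead.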
  if (DestReg.isPhysical()) {
    DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
    SubIdx0 = 0;
    DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
    SubIdx1 = 0;
    IsUndef = false;
  }
  BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
      .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
      .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
      .addFrameIndex(FI)
      .addImm(0)
      .addMemOperand(MMO);
}

void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                            MachineBasicBlock::iterator MBBI,
                                            Register DestReg, int FI,
                                            const TargetRegisterClass *RC,
                                            Register VReg, unsigned SubReg,
                                            MachineInstr::MIFlag Flags) const {
  MachineFunction &MF = *MBB.getParent();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO =
      MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
                              MFI.getObjectSize(FI), MFI.getObjectAlign(FI));

  unsigned Opc = 0;
  bool Offset = true;
  unsigned StackID = TargetStackID::Default;
  Register PNRReg = MCRegister::NoRegister;
  switch (TRI.getSpillSize(*RC)) {
  case 1:
    if (AArch64::FPR8RegClass.hasSubClassEq(RC))
      Opc = AArch64::LDRBui;
    break;
  case 2: {
    bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
    if (AArch64::FPR16RegClass.hasSubClassEq(RC))
      Opc = AArch64::LDRHui;
    else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      if (IsPNR)
        PNRReg = DestReg;
      Opc = AArch64::LDR_PXI;
      StackID = TargetStackID::ScalablePredicateVector;
    }
    break;
  }
  case 4:
    if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
      Opc = AArch64::LDRWui;
      if (DestReg.isVirtual())
        MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
      else
        assert(DestReg != AArch64::WSP);
    } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
      Opc = AArch64::LDRSui;
    else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
      Opc = AArch64::LDR_PPXI;
      StackID = TargetStackID::ScalablePredicateVector;
    }
    break;
  case 8:
    if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
      Opc = AArch64::LDRXui;
      if (DestReg.isVirtual())
        MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
      else
        assert(DestReg != AArch64::SP);
    } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
      Opc = AArch64::LDRDui;
    } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
      loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
                               get(AArch64::LDPWi), DestReg, AArch64::sube32,
                               AArch64::subo32, FI, MMO);
      return;
    }
    break;
  case 16:
    if (AArch64::FPR128RegClass.hasSubClassEq(RC))
      Opc = AArch64::LDRQui;
    else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Twov1d;
      Offset = false;
    } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
      loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
                               get(AArch64::LDPXi), DestReg, AArch64::sube64,
                               AArch64::subo64, FI, MMO);
      return;
    } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 24:
    if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Threev1d;
      Offset = false;
    }
    break;
  case 32:
    if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Fourv1d;
      Offset = false;
    } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Twov2d;
      Offset = false;
    } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
      StackID = TargetStackID::ScalableVector;
    } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 48:
    if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Threev2d;
      Offset = false;
    } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 64:
    if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Fourv2d;
      Offset = false;
    } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
      StackID = TargetStackID::ScalableVector;
    } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  }

  assert(Opc && "Unknown register class");
  MFI.setStackID(FI, StackID);

  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
                                     .addReg(DestReg, getDefRegState(true))
                                     .addFrameIndex(FI);
  if (Offset)
    MI.addImm(0);
  if (PNRReg.isValid() && !PNRReg.isVirtual())
    MI.addDef(PNRReg, RegState::Implicit);
  MI.addMemOperand(MMO);
}

bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
                                           const MachineInstr &UseMI,
                                           const TargetRegisterInfo *TRI) {
  return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
                                         UseMI.getIterator()),
                [TRI](const MachineInstr &I) {
                  return I.modifiesRegister(AArch64::NZCV, TRI) ||
                         I.readsRegister(AArch64::NZCV, TRI);
                });
}

void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
    const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
6404 // The smallest scalable element supported by scaled SVE addressing
6405 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6406 // byte offset must always be a multiple of 2.
6407 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6408
  // VGSized offsets are divided by '2', because the VG register is the
  // number of 64-bit granules as opposed to 128-bit vector chunks,
  // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
  // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
  // VG = n * 2 and the DWARF offset must be VG * 8 bytes.
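  // For example, StackOffset::get(/*Fixed=*/8, /*Scalable=*/16) describes
  // 8 + 16 * vscale bytes; since VG == 2 * vscale, this decomposes into
  // ByteSized = 8 and VGSized = 16 / 2 = 8, i.e. "8 + VG * 8".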
6414 ByteSized = Offset.getFixed();
6415 VGSized = Offset.getScalable() / 2;
6416}
6417
/// Decomposes the given frame offset into the parts needed to materialize
/// an adjustment by that offset.
/// For non-scalable offsets this is simply the byte size.
6421void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6422 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
6423 int64_t &NumDataVectors) {
  // The smallest scalable elements supported by scaled SVE addressing
  // modes are predicates, which are 2 scalable bytes in size, so the
  // scalable byte offset must always be a multiple of 2.
6427 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6428
6429 NumBytes = Offset.getFixed();
6430 NumDataVectors = 0;
6431 NumPredicateVectors = Offset.getScalable() / 2;
  // This method is used to get the offsets for adjusting the frame offset.
  // If the scalable offset is a multiple of a whole data vector, or would
  // otherwise require more than two ADDPL instructions, part of it is folded
  // into NumDataVectors so that ADDVL can be used for that part, reducing the
  // number of ADDPL instructions.
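  // For example, NumPredicateVectors == 70 exceeds 62, so it is split into
  // NumDataVectors = 70 / 8 = 8 and NumPredicateVectors = 70 - 8 * 8 = 6
  // (one ADDVL #8 followed by one ADDPL #6).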
6436 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
6437 NumPredicateVectors > 62) {
6438 NumDataVectors = NumPredicateVectors / 8;
6439 NumPredicateVectors -= NumDataVectors * 8;
6440 }
6441}
6442
// Convenience function to create a DWARF expression for: Constant `Operation`.
// This helper emits compact sequences for common cases. For example, for
// `-15 DW_OP_plus`, it would create DW_OP_lit15 DW_OP_minus.
6446static void appendConstantExpr(SmallVectorImpl<char> &Expr, int64_t Constant,
6447 dwarf::LocationAtom Operation) {
6448 if (Operation == dwarf::DW_OP_plus && Constant < 0 && -Constant <= 31) {
6449 // -Constant (1 to 31)
6450 Expr.push_back(Elt: dwarf::DW_OP_lit0 - Constant);
6451 Operation = dwarf::DW_OP_minus;
6452 } else if (Constant >= 0 && Constant <= 31) {
6453 // Literal value 0 to 31
6454 Expr.push_back(Elt: dwarf::DW_OP_lit0 + Constant);
6455 } else {
6456 // Signed constant
6457 Expr.push_back(Elt: dwarf::DW_OP_consts);
6458 appendLEB128<LEB128Sign::Signed>(Buffer&: Expr, Value: Constant);
6459 }
  Expr.push_back(Elt: Operation);
6461}
6462
// Convenience function to create a DWARF expression that reads the value of
// a register (DW_OP_bregx <RegNum> 0).
6464static void appendReadRegExpr(SmallVectorImpl<char> &Expr, unsigned RegNum) {
6465 Expr.push_back(Elt: (char)dwarf::DW_OP_bregx);
6466 appendLEB128<LEB128Sign::Unsigned>(Buffer&: Expr, Value: RegNum);
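  // bregx takes an SLEB128 offset; an offset of zero is a single 0x00 byte.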
6467 Expr.push_back(Elt: 0);
6468}
6469
6470// Convenience function to create a DWARF expression for loading a register from
6471// a CFA offset.
6472static void appendLoadRegExpr(SmallVectorImpl<char> &Expr,
6473 int64_t OffsetFromDefCFA) {
6474 // This assumes the top of the DWARF stack contains the CFA.
6475 Expr.push_back(Elt: dwarf::DW_OP_dup);
6476 // Add the offset to the register.
6477 appendConstantExpr(Expr, Constant: OffsetFromDefCFA, Operation: dwarf::DW_OP_plus);
  // Dereference the address (loads a 64-bit value).
6479 Expr.push_back(Elt: dwarf::DW_OP_deref);
6480}
6481
6482// Convenience function to create a comment for
6483// (+/-) NumBytes (* RegScale)?
6484static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment,
6485 StringRef RegScale = {}) {
6486 if (NumBytes) {
6487 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(x: NumBytes);
6488 if (!RegScale.empty())
6489 Comment << ' ' << RegScale;
6490 }
6491}
6492
6493// Creates an MCCFIInstruction:
6494// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
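// For example, "CFA = sp + 16 + VG * 8" is emitted as:
//   DW_OP_breg31 +16, DW_OP_bregx <VG> 0, DW_OP_lit8, DW_OP_mul, DW_OP_plus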
6495static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
6496 unsigned Reg,
6497 const StackOffset &Offset) {
6498 int64_t NumBytes, NumVGScaledBytes;
6499 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, ByteSized&: NumBytes,
6500 VGSized&: NumVGScaledBytes);
6501 std::string CommentBuffer;
6502 llvm::raw_string_ostream Comment(CommentBuffer);
6503
6504 if (Reg == AArch64::SP)
6505 Comment << "sp";
6506 else if (Reg == AArch64::FP)
6507 Comment << "fp";
6508 else
6509 Comment << printReg(Reg, TRI: &TRI);
6510
6511 // Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes)
6512 SmallString<64> Expr;
6513 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, isEH: true);
6514 assert(DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
6515 // Reg + NumBytes
6516 Expr.push_back(Elt: dwarf::DW_OP_breg0 + DwarfReg);
6517 appendLEB128<LEB128Sign::Signed>(Buffer&: Expr, Value: NumBytes);
6518 appendOffsetComment(NumBytes, Comment);
6519 if (NumVGScaledBytes) {
6520 // + VG * NumVGScaledBytes
6521 appendOffsetComment(NumBytes: NumVGScaledBytes, Comment, RegScale: "* VG");
6522 appendReadRegExpr(Expr, RegNum: TRI.getDwarfRegNum(Reg: AArch64::VG, isEH: true));
6523 appendConstantExpr(Expr, Constant: NumVGScaledBytes, Operation: dwarf::DW_OP_mul);
6524 Expr.push_back(Elt: dwarf::DW_OP_plus);
6525 }
6526
6527 // Wrap this into DW_CFA_def_cfa.
6528 SmallString<64> DefCfaExpr;
6529 DefCfaExpr.push_back(Elt: dwarf::DW_CFA_def_cfa_expression);
6530 appendLEB128<LEB128Sign::Unsigned>(Buffer&: DefCfaExpr, Value: Expr.size());
6531 DefCfaExpr.append(RHS: Expr.str());
6532 return MCCFIInstruction::createEscape(L: nullptr, Vals: DefCfaExpr.str(), Loc: SMLoc(),
6533 Comment: Comment.str());
6534}
6535
6536MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
6537 unsigned FrameReg, unsigned Reg,
6538 const StackOffset &Offset,
6539 bool LastAdjustmentWasScalable) {
6540 if (Offset.getScalable())
6541 return createDefCFAExpression(TRI, Reg, Offset);
6542
6543 if (FrameReg == Reg && !LastAdjustmentWasScalable)
6544 return MCCFIInstruction::cfiDefCfaOffset(L: nullptr, Offset: int(Offset.getFixed()));
6545
6546 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, isEH: true);
6547 return MCCFIInstruction::cfiDefCfa(L: nullptr, Register: DwarfReg, Offset: (int)Offset.getFixed());
6548}
6549
6550MCCFIInstruction
6551llvm::createCFAOffset(const TargetRegisterInfo &TRI, unsigned Reg,
6552 const StackOffset &OffsetFromDefCFA,
6553 std::optional<int64_t> IncomingVGOffsetFromDefCFA) {
6554 int64_t NumBytes, NumVGScaledBytes;
6555 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6556 Offset: OffsetFromDefCFA, ByteSized&: NumBytes, VGSized&: NumVGScaledBytes);
6557
6558 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, isEH: true);
6559
6560 // Non-scalable offsets can use DW_CFA_offset directly.
6561 if (!NumVGScaledBytes)
6562 return MCCFIInstruction::createOffset(L: nullptr, Register: DwarfReg, Offset: NumBytes);
6563
6564 std::string CommentBuffer;
6565 llvm::raw_string_ostream Comment(CommentBuffer);
6566 Comment << printReg(Reg, TRI: &TRI) << " @ cfa";
6567
6568 // Build up expression (CFA + VG * NumVGScaledBytes + NumBytes)
6569 assert(NumVGScaledBytes && "Expected scalable offset");
6570 SmallString<64> OffsetExpr;
6571 // + VG * NumVGScaledBytes
6572 StringRef VGRegScale;
6573 if (IncomingVGOffsetFromDefCFA) {
6574 appendLoadRegExpr(Expr&: OffsetExpr, OffsetFromDefCFA: *IncomingVGOffsetFromDefCFA);
6575 VGRegScale = "* IncomingVG";
6576 } else {
6577 appendReadRegExpr(Expr&: OffsetExpr, RegNum: TRI.getDwarfRegNum(Reg: AArch64::VG, isEH: true));
6578 VGRegScale = "* VG";
6579 }
6580 appendConstantExpr(Expr&: OffsetExpr, Constant: NumVGScaledBytes, Operation: dwarf::DW_OP_mul);
6581 appendOffsetComment(NumBytes: NumVGScaledBytes, Comment, RegScale: VGRegScale);
6582 OffsetExpr.push_back(Elt: dwarf::DW_OP_plus);
6583 if (NumBytes) {
6584 // + NumBytes
6585 appendOffsetComment(NumBytes, Comment);
6586 appendConstantExpr(Expr&: OffsetExpr, Constant: NumBytes, Operation: dwarf::DW_OP_plus);
6587 }
6588
6589 // Wrap this into DW_CFA_expression
6590 SmallString<64> CfaExpr;
6591 CfaExpr.push_back(Elt: dwarf::DW_CFA_expression);
6592 appendLEB128<LEB128Sign::Unsigned>(Buffer&: CfaExpr, Value: DwarfReg);
6593 appendLEB128<LEB128Sign::Unsigned>(Buffer&: CfaExpr, Value: OffsetExpr.size());
6594 CfaExpr.append(RHS: OffsetExpr.str());
6595
6596 return MCCFIInstruction::createEscape(L: nullptr, Vals: CfaExpr.str(), Loc: SMLoc(),
6597 Comment: Comment.str());
6598}
6599
// Helper function to emit a frame offset adjustment from a given
// pointer (SrcReg), stored into DestReg. This helper is explicit in
// that the caller must supply the add/sub opcode to use.
6603static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
6604 MachineBasicBlock::iterator MBBI,
6605 const DebugLoc &DL, unsigned DestReg,
6606 unsigned SrcReg, int64_t Offset, unsigned Opc,
6607 const TargetInstrInfo *TII,
6608 MachineInstr::MIFlag Flag, bool NeedsWinCFI,
6609 bool *HasWinCFI, bool EmitCFAOffset,
6610 StackOffset CFAOffset, unsigned FrameReg) {
6611 int Sign = 1;
6612 unsigned MaxEncoding, ShiftSize;
6613 switch (Opc) {
6614 case AArch64::ADDXri:
6615 case AArch64::ADDSXri:
6616 case AArch64::SUBXri:
6617 case AArch64::SUBSXri:
6618 MaxEncoding = 0xfff;
6619 ShiftSize = 12;
6620 break;
6621 case AArch64::ADDVL_XXI:
6622 case AArch64::ADDPL_XXI:
6623 case AArch64::ADDSVL_XXI:
6624 case AArch64::ADDSPL_XXI:
6625 MaxEncoding = 31;
6626 ShiftSize = 0;
6627 if (Offset < 0) {
6628 MaxEncoding = 32;
6629 Sign = -1;
6630 Offset = -Offset;
6631 }
6632 break;
6633 default:
6634 llvm_unreachable("Unsupported opcode");
6635 }
6636
6637 // `Offset` can be in bytes or in "scalable bytes".
6638 int VScale = 1;
6639 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
6640 VScale = 16;
6641 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
6642 VScale = 2;
6643
6644 // FIXME: If the offset won't fit in 24-bits, compute the offset into a
6645 // scratch register. If DestReg is a virtual register, use it as the
6646 // scratch register; otherwise, create a new virtual register (to be
6647 // replaced by the scavenger at the end of PEI). That case can be optimized
6648 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
6649 // register can be loaded with offset%8 and the add/sub can use an extending
6650 // instruction with LSL#3.
6651 // Currently the function handles any offsets but generates a poor sequence
6652 // of code.
6653 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
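  // For example, adding 0x123456 with ADDXri is split into two instructions:
  //   add x0, x0, #0x123, lsl #12   // adds 0x123000
  //   add x0, x0, #0x456            // adds the remaining 0x456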
6654
6655 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
6656 Register TmpReg = DestReg;
6657 if (TmpReg == AArch64::XZR)
6658 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
6659 RegClass: &AArch64::GPR64RegClass);
6660 do {
6661 uint64_t ThisVal = std::min<uint64_t>(a: Offset, b: MaxEncodableValue);
6662 unsigned LocalShiftSize = 0;
6663 if (ThisVal > MaxEncoding) {
6664 ThisVal = ThisVal >> ShiftSize;
6665 LocalShiftSize = ShiftSize;
6666 }
6667 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
6668 "Encoding cannot handle value that big");
6669
6670 Offset -= ThisVal << LocalShiftSize;
6671 if (Offset == 0)
6672 TmpReg = DestReg;
6673 auto MBI = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: TmpReg)
6674 .addReg(RegNo: SrcReg)
6675 .addImm(Val: Sign * (int)ThisVal);
6676 if (ShiftSize)
6677 MBI = MBI.addImm(
6678 Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: LocalShiftSize));
6679 MBI = MBI.setMIFlag(Flag);
6680
6681 auto Change =
6682 VScale == 1
6683 ? StackOffset::getFixed(Fixed: ThisVal << LocalShiftSize)
6684 : StackOffset::getScalable(Scalable: VScale * (ThisVal << LocalShiftSize));
6685 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
6686 CFAOffset += Change;
6687 else
6688 CFAOffset -= Change;
6689 if (EmitCFAOffset && DestReg == TmpReg) {
6690 MachineFunction &MF = *MBB.getParent();
6691 const TargetSubtargetInfo &STI = MF.getSubtarget();
6692 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
6693
6694 unsigned CFIIndex = MF.addFrameInst(
6695 Inst: createDefCFA(TRI, FrameReg, Reg: DestReg, Offset: CFAOffset, LastAdjustmentWasScalable: VScale != 1));
6696 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::CFI_INSTRUCTION))
6697 .addCFIIndex(CFIIndex)
6698 .setMIFlags(Flag);
6699 }
6700
6701 if (NeedsWinCFI) {
6702 int Imm = (int)(ThisVal << LocalShiftSize);
6703 if (VScale != 1 && DestReg == AArch64::SP) {
6704 if (HasWinCFI)
6705 *HasWinCFI = true;
6706 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_AllocZ))
6707 .addImm(Val: ThisVal)
6708 .setMIFlag(Flag);
6709 } else if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
6710 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
6711 assert(VScale == 1 && "Expected non-scalable operation");
6712 if (HasWinCFI)
6713 *HasWinCFI = true;
6714 if (Imm == 0)
6715 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_SetFP)).setMIFlag(Flag);
6716 else
6717 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_AddFP))
6718 .addImm(Val: Imm)
6719 .setMIFlag(Flag);
6720 assert(Offset == 0 && "Expected remaining offset to be zero to "
6721 "emit a single SEH directive");
6722 } else if (DestReg == AArch64::SP) {
6723 assert(VScale == 1 && "Expected non-scalable operation");
6724 if (HasWinCFI)
6725 *HasWinCFI = true;
6726 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
6727 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_StackAlloc))
6728 .addImm(Val: Imm)
6729 .setMIFlag(Flag);
6730 }
6731 }
6732
6733 SrcReg = TmpReg;
6734 } while (Offset);
6735}
6736
6737void llvm::emitFrameOffset(MachineBasicBlock &MBB,
6738 MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
6739 unsigned DestReg, unsigned SrcReg,
6740 StackOffset Offset, const TargetInstrInfo *TII,
6741 MachineInstr::MIFlag Flag, bool SetNZCV,
6742 bool NeedsWinCFI, bool *HasWinCFI,
6743 bool EmitCFAOffset, StackOffset CFAOffset,
6744 unsigned FrameReg) {
  // If a function is marked as arm_locally_streaming, then the runtime value
  // of vscale in the prologue/epilogue is different from the runtime value of
  // vscale in the function's body. To avoid having to consider multiple
  // vscales, we can use `addsvl` to allocate any scalable stack-slots, which
  // under most circumstances will be only locals, not callee-save slots.
6750 const Function &F = MBB.getParent()->getFunction();
6751 bool UseSVL = F.hasFnAttribute(Kind: "aarch64_pstate_sm_body");
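  // For example, materializing "sp + 16 + 2 * VL" into x9 in such a function
  // emits "add x9, sp, #16" followed by "addsvl x9, x9, #2".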
6752
6753 int64_t Bytes, NumPredicateVectors, NumDataVectors;
6754 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6755 Offset, NumBytes&: Bytes, NumPredicateVectors, NumDataVectors);
6756
  // If NZCV must be set and the offset has scalable parts, set the flags with
  // a final ADDSXri (adds #0) after all adjustments, since the ADDVL/ADDPL
  // opcodes do not set flags.
6758 bool NeedsFinalDefNZCV = SetNZCV && (NumPredicateVectors || NumDataVectors);
6759 if (NeedsFinalDefNZCV)
6760 SetNZCV = false;
6761
6762 // First emit non-scalable frame offsets, or a simple 'mov'.
6763 if (Bytes || (!Offset && SrcReg != DestReg)) {
6764 assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
6765 "SP increment/decrement not 8-byte aligned");
6766 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
6767 if (Bytes < 0) {
6768 Bytes = -Bytes;
6769 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
6770 }
6771 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Offset: Bytes, Opc, TII, Flag,
6772 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6773 FrameReg);
6774 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
6775 ? StackOffset::getFixed(Fixed: -Bytes)
6776 : StackOffset::getFixed(Fixed: Bytes);
6777 SrcReg = DestReg;
6778 FrameReg = DestReg;
6779 }
6780
6781 assert(!(NeedsWinCFI && NumPredicateVectors) &&
6782 "WinCFI can't allocate fractions of an SVE data vector");
6783
6784 if (NumDataVectors) {
6785 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Offset: NumDataVectors,
6786 Opc: UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, TII,
6787 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6788 FrameReg);
6789 CFAOffset += StackOffset::getScalable(Scalable: -NumDataVectors * 16);
6790 SrcReg = DestReg;
6791 }
6792
6793 if (NumPredicateVectors) {
6794 assert(DestReg != AArch64::SP && "Unaligned access to SP");
6795 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Offset: NumPredicateVectors,
6796 Opc: UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, TII,
6797 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6798 FrameReg);
6799 }
6800
6801 if (NeedsFinalDefNZCV)
6802 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::ADDSXri), DestReg)
6803 .addReg(RegNo: DestReg)
6804 .addImm(Val: 0)
6805 .addImm(Val: 0);
6806}
6807
6808MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
6809 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
6810 MachineBasicBlock::iterator InsertPt, int FrameIndex,
6811 LiveIntervals *LIS, VirtRegMap *VRM) const {
6812 // This is a bit of a hack. Consider this instruction:
6813 //
6814 // %0 = COPY %sp; GPR64all:%0
6815 //
6816 // We explicitly chose GPR64all for the virtual register so such a copy might
6817 // be eliminated by RegisterCoalescer. However, that may not be possible, and
6818 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
6819 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
6820 //
6821 // To prevent that, we are going to constrain the %0 register class here.
6822 if (MI.isFullCopy()) {
6823 Register DstReg = MI.getOperand(i: 0).getReg();
6824 Register SrcReg = MI.getOperand(i: 1).getReg();
6825 if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
6826 MF.getRegInfo().constrainRegClass(Reg: DstReg, RC: &AArch64::GPR64RegClass);
6827 return nullptr;
6828 }
6829 if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
6830 MF.getRegInfo().constrainRegClass(Reg: SrcReg, RC: &AArch64::GPR64RegClass);
6831 return nullptr;
6832 }
    // Nothing can be folded with a copy from/to NZCV.
6834 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
6835 return nullptr;
6836 }
6837
6838 // Handle the case where a copy is being spilled or filled but the source
6839 // and destination register class don't match. For example:
6840 //
6841 // %0 = COPY %xzr; GPR64common:%0
6842 //
6843 // In this case we can still safely fold away the COPY and generate the
6844 // following spill code:
6845 //
6846 // STRXui %xzr, %stack.0
6847 //
6848 // This also eliminates spilled cross register class COPYs (e.g. between x and
6849 // d regs) of the same size. For example:
6850 //
6851 // %0 = COPY %1; GPR64:%0, FPR64:%1
6852 //
6853 // will be filled as
6854 //
6855 // LDRDui %0, fi<#0>
6856 //
6857 // instead of
6858 //
6859 // LDRXui %Temp, fi<#0>
6860 // %0 = FMOV %Temp
6861 //
6862 if (MI.isCopy() && Ops.size() == 1 &&
6863 // Make sure we're only folding the explicit COPY defs/uses.
6864 (Ops[0] == 0 || Ops[0] == 1)) {
6865 bool IsSpill = Ops[0] == 0;
6866 bool IsFill = !IsSpill;
6867 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
6868 const MachineRegisterInfo &MRI = MF.getRegInfo();
6869 MachineBasicBlock &MBB = *MI.getParent();
6870 const MachineOperand &DstMO = MI.getOperand(i: 0);
6871 const MachineOperand &SrcMO = MI.getOperand(i: 1);
6872 Register DstReg = DstMO.getReg();
6873 Register SrcReg = SrcMO.getReg();
6874 // This is slightly expensive to compute for physical regs since
6875 // getMinimalPhysRegClass is slow.
6876 auto getRegClass = [&](unsigned Reg) {
6877 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
6878 : TRI.getMinimalPhysRegClass(Reg);
6879 };
6880
6881 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
6882 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
6883 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
6884 "Mismatched register size in non subreg COPY");
6885 if (IsSpill)
6886 storeRegToStackSlot(MBB, MBBI: InsertPt, SrcReg, isKill: SrcMO.isKill(), FI: FrameIndex,
6887 RC: getRegClass(SrcReg), VReg: Register());
6888 else
6889 loadRegFromStackSlot(MBB, MBBI: InsertPt, DestReg: DstReg, FI: FrameIndex,
6890 RC: getRegClass(DstReg), VReg: Register());
6891 return &*--InsertPt;
6892 }
6893
6894 // Handle cases like spilling def of:
6895 //
6896 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
6897 //
6898 // where the physical register source can be widened and stored to the full
6899 // virtual reg destination stack slot, in this case producing:
6900 //
6901 // STRXui %xzr, %stack.0
6902 //
6903 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
6904 TRI.getRegSizeInBits(RC: *getRegClass(DstReg)) == 64) {
6905 assert(SrcMO.getSubReg() == 0 &&
6906 "Unexpected subreg on physical register");
6907 storeRegToStackSlot(MBB, MBBI: InsertPt, SrcReg: AArch64::XZR, isKill: SrcMO.isKill(),
6908 FI: FrameIndex, RC: &AArch64::GPR64RegClass, VReg: Register());
6909 return &*--InsertPt;
6910 }
6911
6912 // Handle cases like filling use of:
6913 //
6914 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
6915 //
  // where we can load the full virtual reg source stack slot into the subreg
  // destination, in this case producing:
6918 //
6919 // LDRWui %0:sub_32<def,read-undef>, %stack.0
6920 //
6921 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
6922 const TargetRegisterClass *FillRC = nullptr;
6923 switch (DstMO.getSubReg()) {
6924 default:
6925 break;
6926 case AArch64::sub_32:
6927 if (AArch64::GPR64RegClass.hasSubClassEq(RC: getRegClass(DstReg)))
6928 FillRC = &AArch64::GPR32RegClass;
6929 break;
6930 case AArch64::ssub:
6931 FillRC = &AArch64::FPR32RegClass;
6932 break;
6933 case AArch64::dsub:
6934 FillRC = &AArch64::FPR64RegClass;
6935 break;
6936 }
6937
6938 if (FillRC) {
6939 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
6940 TRI.getRegSizeInBits(*FillRC) &&
6941 "Mismatched regclass size on folded subreg COPY");
6942 loadRegFromStackSlot(MBB, MBBI: InsertPt, DestReg: DstReg, FI: FrameIndex, RC: FillRC,
6943 VReg: Register());
6944 MachineInstr &LoadMI = *--InsertPt;
6945 MachineOperand &LoadDst = LoadMI.getOperand(i: 0);
6946 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
6947 LoadDst.setSubReg(DstMO.getSubReg());
6948 LoadDst.setIsUndef();
6949 return &LoadMI;
6950 }
6951 }
6952 }
6953
6954 // Cannot fold.
6955 return nullptr;
6956}
6957
6958int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
6959 StackOffset &SOffset,
6960 bool *OutUseUnscaledOp,
6961 unsigned *OutUnscaledOp,
6962 int64_t *EmittableOffset) {
6963 // Set output values in case of early exit.
6964 if (EmittableOffset)
6965 *EmittableOffset = 0;
6966 if (OutUseUnscaledOp)
6967 *OutUseUnscaledOp = false;
6968 if (OutUnscaledOp)
6969 *OutUnscaledOp = 0;
6970
6971 // Exit early for structured vector spills/fills as they can't take an
6972 // immediate offset.
6973 switch (MI.getOpcode()) {
6974 default:
6975 break;
6976 case AArch64::LD1Rv1d:
6977 case AArch64::LD1Rv2s:
6978 case AArch64::LD1Rv2d:
6979 case AArch64::LD1Rv4h:
6980 case AArch64::LD1Rv4s:
6981 case AArch64::LD1Rv8b:
6982 case AArch64::LD1Rv8h:
6983 case AArch64::LD1Rv16b:
6984 case AArch64::LD1Twov2d:
6985 case AArch64::LD1Threev2d:
6986 case AArch64::LD1Fourv2d:
6987 case AArch64::LD1Twov1d:
6988 case AArch64::LD1Threev1d:
6989 case AArch64::LD1Fourv1d:
6990 case AArch64::ST1Twov2d:
6991 case AArch64::ST1Threev2d:
6992 case AArch64::ST1Fourv2d:
6993 case AArch64::ST1Twov1d:
6994 case AArch64::ST1Threev1d:
6995 case AArch64::ST1Fourv1d:
6996 case AArch64::ST1i8:
6997 case AArch64::ST1i16:
6998 case AArch64::ST1i32:
6999 case AArch64::ST1i64:
7000 case AArch64::IRG:
7001 case AArch64::IRGstack:
7002 case AArch64::STGloop:
7003 case AArch64::STZGloop:
7004 return AArch64FrameOffsetCannotUpdate;
7005 }
7006
7007 // Get the min/max offset and the scale.
7008 TypeSize ScaleValue(0U, false), Width(0U, false);
7009 int64_t MinOff, MaxOff;
7010 if (!AArch64InstrInfo::getMemOpInfo(Opcode: MI.getOpcode(), Scale&: ScaleValue, Width, MinOffset&: MinOff,
7011 MaxOffset&: MaxOff))
7012 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
7013
7014 // Construct the complete offset.
7015 bool IsMulVL = ScaleValue.isScalable();
7016 unsigned Scale = ScaleValue.getKnownMinValue();
7017 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
7018
7019 const MachineOperand &ImmOpnd =
7020 MI.getOperand(i: AArch64InstrInfo::getLoadStoreImmIdx(Opc: MI.getOpcode()));
7021 Offset += ImmOpnd.getImm() * Scale;
7022
  // If the offset is not a multiple of the scale, or is negative, rewrite the
  // instruction to use the unscaled variant instead, if one exists.
7026 std::optional<unsigned> UnscaledOp =
7027 AArch64InstrInfo::getUnscaledLdSt(Opc: MI.getOpcode());
7028 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
7029 if (useUnscaledOp &&
7030 !AArch64InstrInfo::getMemOpInfo(Opcode: *UnscaledOp, Scale&: ScaleValue, Width, MinOffset&: MinOff,
7031 MaxOffset&: MaxOff))
7032 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
7033
7034 Scale = ScaleValue.getKnownMinValue();
7035 assert(IsMulVL == ScaleValue.isScalable() &&
7036 "Unscaled opcode has different value for scalable");
7037
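  // For example, an LDRXui (Scale = 8, unsigned 12-bit immediate) with a byte
  // offset of 32 yields NewOffset = 4 and Remainder = 0, so the instruction
  // can be updated in place with no residual offset.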
7038 int64_t Remainder = Offset % Scale;
7039 assert(!(Remainder && useUnscaledOp) &&
7040 "Cannot have remainder when using unscaled op");
7041
7042 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
7043 int64_t NewOffset = Offset / Scale;
7044 if (MinOff <= NewOffset && NewOffset <= MaxOff)
7045 Offset = Remainder;
7046 else {
7047 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
7048 Offset = Offset - (NewOffset * Scale);
7049 }
7050
7051 if (EmittableOffset)
7052 *EmittableOffset = NewOffset;
7053 if (OutUseUnscaledOp)
7054 *OutUseUnscaledOp = useUnscaledOp;
7055 if (OutUnscaledOp && UnscaledOp)
7056 *OutUnscaledOp = *UnscaledOp;
7057
7058 if (IsMulVL)
7059 SOffset = StackOffset::get(Fixed: SOffset.getFixed(), Scalable: Offset);
7060 else
7061 SOffset = StackOffset::get(Fixed: Offset, Scalable: SOffset.getScalable());
7062 return AArch64FrameOffsetCanUpdate |
7063 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
7064}
7065
7066bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
7067 unsigned FrameReg, StackOffset &Offset,
7068 const AArch64InstrInfo *TII) {
7069 unsigned Opcode = MI.getOpcode();
7070 unsigned ImmIdx = FrameRegIdx + 1;
7071
7072 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
7073 Offset += StackOffset::getFixed(Fixed: MI.getOperand(i: ImmIdx).getImm());
7074 emitFrameOffset(MBB&: *MI.getParent(), MBBI: MI, DL: MI.getDebugLoc(),
7075 DestReg: MI.getOperand(i: 0).getReg(), SrcReg: FrameReg, Offset, TII,
7076 Flag: MachineInstr::NoFlags, SetNZCV: (Opcode == AArch64::ADDSXri));
7077 MI.eraseFromParent();
7078 Offset = StackOffset();
7079 return true;
7080 }
7081
7082 int64_t NewOffset;
7083 unsigned UnscaledOp;
7084 bool UseUnscaledOp;
7085 int Status = isAArch64FrameOffsetLegal(MI, SOffset&: Offset, OutUseUnscaledOp: &UseUnscaledOp,
7086 OutUnscaledOp: &UnscaledOp, EmittableOffset: &NewOffset);
7087 if (Status & AArch64FrameOffsetCanUpdate) {
7088 if (Status & AArch64FrameOffsetIsLegal)
7089 // Replace the FrameIndex with FrameReg.
7090 MI.getOperand(i: FrameRegIdx).ChangeToRegister(Reg: FrameReg, isDef: false);
7091 if (UseUnscaledOp)
7092 MI.setDesc(TII->get(Opcode: UnscaledOp));
7093
7094 MI.getOperand(i: ImmIdx).ChangeToImmediate(ImmVal: NewOffset);
7095 return !Offset;
7096 }
7097
7098 return false;
7099}
7100
7101void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB,
7102 MachineBasicBlock::iterator MI) const {
7103 DebugLoc DL;
7104 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AArch64::NOP));
7105}
7106
7107MCInst AArch64InstrInfo::getNop() const { return MCInstBuilder(AArch64::NOP); }
7108
7109// AArch64 supports MachineCombiner.
7110bool AArch64InstrInfo::useMachineCombiner() const { return true; }
7111
// True when Opc sets the NZCV flags.
7113static bool isCombineInstrSettingFlag(unsigned Opc) {
7114 switch (Opc) {
7115 case AArch64::ADDSWrr:
7116 case AArch64::ADDSWri:
7117 case AArch64::ADDSXrr:
7118 case AArch64::ADDSXri:
7119 case AArch64::SUBSWrr:
7120 case AArch64::SUBSXrr:
7121 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7122 case AArch64::SUBSWri:
7123 case AArch64::SUBSXri:
7124 return true;
7125 default:
7126 break;
7127 }
7128 return false;
7129}
7130
7131// 32b Opcodes that can be combined with a MUL
7132static bool isCombineInstrCandidate32(unsigned Opc) {
7133 switch (Opc) {
7134 case AArch64::ADDWrr:
7135 case AArch64::ADDWri:
7136 case AArch64::SUBWrr:
7137 case AArch64::ADDSWrr:
7138 case AArch64::ADDSWri:
7139 case AArch64::SUBSWrr:
7140 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7141 case AArch64::SUBWri:
7142 case AArch64::SUBSWri:
7143 return true;
7144 default:
7145 break;
7146 }
7147 return false;
7148}
7149
7150// 64b Opcodes that can be combined with a MUL
7151static bool isCombineInstrCandidate64(unsigned Opc) {
7152 switch (Opc) {
7153 case AArch64::ADDXrr:
7154 case AArch64::ADDXri:
7155 case AArch64::SUBXrr:
7156 case AArch64::ADDSXrr:
7157 case AArch64::ADDSXri:
7158 case AArch64::SUBSXrr:
7159 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7160 case AArch64::SUBXri:
7161 case AArch64::SUBSXri:
7162 case AArch64::ADDv8i8:
7163 case AArch64::ADDv16i8:
7164 case AArch64::ADDv4i16:
7165 case AArch64::ADDv8i16:
7166 case AArch64::ADDv2i32:
7167 case AArch64::ADDv4i32:
7168 case AArch64::SUBv8i8:
7169 case AArch64::SUBv16i8:
7170 case AArch64::SUBv4i16:
7171 case AArch64::SUBv8i16:
7172 case AArch64::SUBv2i32:
7173 case AArch64::SUBv4i32:
7174 return true;
7175 default:
7176 break;
7177 }
7178 return false;
7179}
7180
7181// FP Opcodes that can be combined with a FMUL.
7182static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
7183 switch (Inst.getOpcode()) {
7184 default:
7185 break;
7186 case AArch64::FADDHrr:
7187 case AArch64::FADDSrr:
7188 case AArch64::FADDDrr:
7189 case AArch64::FADDv4f16:
7190 case AArch64::FADDv8f16:
7191 case AArch64::FADDv2f32:
7192 case AArch64::FADDv2f64:
7193 case AArch64::FADDv4f32:
7194 case AArch64::FSUBHrr:
7195 case AArch64::FSUBSrr:
7196 case AArch64::FSUBDrr:
7197 case AArch64::FSUBv4f16:
7198 case AArch64::FSUBv8f16:
7199 case AArch64::FSUBv2f32:
7200 case AArch64::FSUBv2f64:
7201 case AArch64::FSUBv4f32:
7202 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
    // We can fuse FADD/FSUB with FMUL if fusion is allowed globally by the
    // target options, or if the FADD/FSUB has the contract fast-math flag.
7205 return Options.AllowFPOpFusion == FPOpFusion::Fast ||
7206 Inst.getFlag(Flag: MachineInstr::FmContract);
7207 }
7208 return false;
7209}
7210
7211// Opcodes that can be combined with a MUL
7212static bool isCombineInstrCandidate(unsigned Opc) {
7213 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
7214}
7215
7216//
7217// Utility routine that checks if \param MO is defined by an
// \param CombineOpc instruction in the basic block \param MBB.
7219static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
7220 unsigned CombineOpc, unsigned ZeroReg = 0,
7221 bool CheckZeroReg = false) {
7222 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7223 MachineInstr *MI = nullptr;
7224
7225 if (MO.isReg() && MO.getReg().isVirtual())
7226 MI = MRI.getUniqueVRegDef(Reg: MO.getReg());
7227 // And it needs to be in the trace (otherwise, it won't have a depth).
7228 if (!MI || MI->getParent() != &MBB || MI->getOpcode() != CombineOpc)
7229 return false;
  // It must only be used by the instruction we combine with.
7231 if (!MRI.hasOneNonDBGUse(RegNo: MI->getOperand(i: 0).getReg()))
7232 return false;
7233
7234 if (CheckZeroReg) {
    assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
           MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
           MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
7238 // The third input reg must be zero.
7239 if (MI->getOperand(i: 3).getReg() != ZeroReg)
7240 return false;
7241 }
7242
7243 if (isCombineInstrSettingFlag(Opc: CombineOpc) &&
7244 MI->findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true) == -1)
7245 return false;
7246
7247 return true;
7248}
7249
7250//
// Is \param MO defined by an integer multiply that can be combined?
7252static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
7253 unsigned MulOpc, unsigned ZeroReg) {
7254 return canCombine(MBB, MO, CombineOpc: MulOpc, ZeroReg, CheckZeroReg: true);
7255}
7256
7257//
// Is \param MO defined by a floating-point multiply that can be combined?
7259static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
7260 unsigned MulOpc) {
7261 return canCombine(MBB, MO, CombineOpc: MulOpc);
7262}
7263
7264// TODO: There are many more machine instruction opcodes to match:
7265// 1. Other data types (integer, vectors)
7266// 2. Other math / logic operations (xor, or)
7267// 3. Other forms of the same operation (intrinsics and other variants)
7268bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
7269 bool Invert) const {
7270 if (Invert)
7271 return false;
7272 switch (Inst.getOpcode()) {
7273 // == Floating-point types ==
7274 // -- Floating-point instructions --
7275 case AArch64::FADDHrr:
7276 case AArch64::FADDSrr:
7277 case AArch64::FADDDrr:
7278 case AArch64::FMULHrr:
7279 case AArch64::FMULSrr:
7280 case AArch64::FMULDrr:
7281 case AArch64::FMULX16:
7282 case AArch64::FMULX32:
7283 case AArch64::FMULX64:
7284 // -- Advanced SIMD instructions --
7285 case AArch64::FADDv4f16:
7286 case AArch64::FADDv8f16:
7287 case AArch64::FADDv2f32:
7288 case AArch64::FADDv4f32:
7289 case AArch64::FADDv2f64:
7290 case AArch64::FMULv4f16:
7291 case AArch64::FMULv8f16:
7292 case AArch64::FMULv2f32:
7293 case AArch64::FMULv4f32:
7294 case AArch64::FMULv2f64:
7295 case AArch64::FMULXv4f16:
7296 case AArch64::FMULXv8f16:
7297 case AArch64::FMULXv2f32:
7298 case AArch64::FMULXv4f32:
7299 case AArch64::FMULXv2f64:
7300 // -- SVE instructions --
7301 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
7302 // in the SVE instruction set (though there are predicated ones).
7303 case AArch64::FADD_ZZZ_H:
7304 case AArch64::FADD_ZZZ_S:
7305 case AArch64::FADD_ZZZ_D:
7306 case AArch64::FMUL_ZZZ_H:
7307 case AArch64::FMUL_ZZZ_S:
7308 case AArch64::FMUL_ZZZ_D:
7309 return Inst.getFlag(Flag: MachineInstr::MIFlag::FmReassoc) &&
7310 Inst.getFlag(Flag: MachineInstr::MIFlag::FmNsz);
7311
7312 // == Integer types ==
7313 // -- Base instructions --
7314 // Opcodes MULWrr and MULXrr don't exist because
7315 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
7316 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
  // The machine-combiner does not support machine instructions with three
  // source operands, so we cannot reassociate MULs.
7319 case AArch64::ADDWrr:
7320 case AArch64::ADDXrr:
7321 case AArch64::ANDWrr:
7322 case AArch64::ANDXrr:
7323 case AArch64::ORRWrr:
7324 case AArch64::ORRXrr:
7325 case AArch64::EORWrr:
7326 case AArch64::EORXrr:
7327 case AArch64::EONWrr:
7328 case AArch64::EONXrr:
7329 // -- Advanced SIMD instructions --
7330 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
7331 // in the Advanced SIMD instruction set.
7332 case AArch64::ADDv8i8:
7333 case AArch64::ADDv16i8:
7334 case AArch64::ADDv4i16:
7335 case AArch64::ADDv8i16:
7336 case AArch64::ADDv2i32:
7337 case AArch64::ADDv4i32:
7338 case AArch64::ADDv1i64:
7339 case AArch64::ADDv2i64:
7340 case AArch64::MULv8i8:
7341 case AArch64::MULv16i8:
7342 case AArch64::MULv4i16:
7343 case AArch64::MULv8i16:
7344 case AArch64::MULv2i32:
7345 case AArch64::MULv4i32:
7346 case AArch64::ANDv8i8:
7347 case AArch64::ANDv16i8:
7348 case AArch64::ORRv8i8:
7349 case AArch64::ORRv16i8:
7350 case AArch64::EORv8i8:
7351 case AArch64::EORv16i8:
7352 // -- SVE instructions --
7353 case AArch64::ADD_ZZZ_B:
7354 case AArch64::ADD_ZZZ_H:
7355 case AArch64::ADD_ZZZ_S:
7356 case AArch64::ADD_ZZZ_D:
7357 case AArch64::MUL_ZZZ_B:
7358 case AArch64::MUL_ZZZ_H:
7359 case AArch64::MUL_ZZZ_S:
7360 case AArch64::MUL_ZZZ_D:
7361 case AArch64::AND_ZZZ:
7362 case AArch64::ORR_ZZZ:
7363 case AArch64::EOR_ZZZ:
7364 return true;
7365
7366 default:
7367 return false;
7368 }
7369}
7370
7371/// Find instructions that can be turned into madd.
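/// For example (using the MUL alias of MADD with the zero register):
///   %3:gpr32 = MADDWrrr %1, %2, $wzr
///   %4:gpr32 = ADDWrr %0, %3
/// can be combined into a single %4:gpr32 = MADDWrrr %1, %2, %0.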
7372static bool getMaddPatterns(MachineInstr &Root,
7373 SmallVectorImpl<unsigned> &Patterns) {
7374 unsigned Opc = Root.getOpcode();
7375 MachineBasicBlock &MBB = *Root.getParent();
7376 bool Found = false;
7377
7378 if (!isCombineInstrCandidate(Opc))
7379 return false;
7380 if (isCombineInstrSettingFlag(Opc)) {
7381 int Cmp_NZCV =
7382 Root.findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true);
    // If NZCV is live, bail out.
7384 if (Cmp_NZCV == -1)
7385 return false;
7386 unsigned NewOpc = convertToNonFlagSettingOpc(MI: Root);
    // If the opcode can't be converted, bail out.
7388 // CHECKME: do we miss any cases for opcode conversion?
7389 if (NewOpc == Opc)
7390 return false;
7391 Opc = NewOpc;
7392 }
7393
7394 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
7395 unsigned Pattern) {
7396 if (canCombineWithMUL(MBB, MO&: Root.getOperand(i: Operand), MulOpc: Opcode, ZeroReg)) {
7397 Patterns.push_back(Elt: Pattern);
7398 Found = true;
7399 }
7400 };
7401
7402 auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
7403 if (canCombine(MBB, MO&: Root.getOperand(i: Operand), CombineOpc: Opcode)) {
7404 Patterns.push_back(Elt: Pattern);
7405 Found = true;
7406 }
7407 };
7408
7409 typedef AArch64MachineCombinerPattern MCP;
7410
7411 switch (Opc) {
7412 default:
7413 break;
7414 case AArch64::ADDWrr:
7415 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7416 "ADDWrr does not have register operands");
7417 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
7418 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
7419 break;
7420 case AArch64::ADDXrr:
7421 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
7422 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
7423 break;
7424 case AArch64::SUBWrr:
7425 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
7426 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
7427 break;
7428 case AArch64::SUBXrr:
7429 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
7430 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
7431 break;
7432 case AArch64::ADDWri:
7433 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
7434 break;
7435 case AArch64::ADDXri:
7436 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
7437 break;
7438 case AArch64::SUBWri:
7439 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
7440 break;
7441 case AArch64::SUBXri:
7442 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
7443 break;
7444 case AArch64::ADDv8i8:
7445 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
7446 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
7447 break;
7448 case AArch64::ADDv16i8:
7449 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
7450 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
7451 break;
7452 case AArch64::ADDv4i16:
7453 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
7454 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
7455 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
7456 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
7457 break;
7458 case AArch64::ADDv8i16:
7459 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
7460 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
7461 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
7462 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
7463 break;
7464 case AArch64::ADDv2i32:
7465 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
7466 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
7467 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
7468 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
7469 break;
7470 case AArch64::ADDv4i32:
7471 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
7472 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
7473 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
7474 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
7475 break;
7476 case AArch64::SUBv8i8:
7477 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
7478 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
7479 break;
7480 case AArch64::SUBv16i8:
7481 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
7482 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
7483 break;
7484 case AArch64::SUBv4i16:
7485 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
7486 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
7487 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
7488 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
7489 break;
7490 case AArch64::SUBv8i16:
7491 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
7492 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
7493 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
7494 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
7495 break;
7496 case AArch64::SUBv2i32:
7497 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
7498 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
7499 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
7500 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
7501 break;
7502 case AArch64::SUBv4i32:
7503 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
7504 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
7505 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
7506 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
7507 break;
7508 }
7509 return Found;
7510}
7511
7512bool AArch64InstrInfo::isAccumulationOpcode(unsigned Opcode) const {
7513 switch (Opcode) {
7514 default:
7515 break;
7516 case AArch64::UABALB_ZZZ_D:
7517 case AArch64::UABALB_ZZZ_H:
7518 case AArch64::UABALB_ZZZ_S:
7519 case AArch64::UABALT_ZZZ_D:
7520 case AArch64::UABALT_ZZZ_H:
7521 case AArch64::UABALT_ZZZ_S:
7522 case AArch64::SABALB_ZZZ_D:
7523 case AArch64::SABALB_ZZZ_S:
7524 case AArch64::SABALB_ZZZ_H:
7525 case AArch64::SABALT_ZZZ_D:
7526 case AArch64::SABALT_ZZZ_S:
7527 case AArch64::SABALT_ZZZ_H:
7528 case AArch64::UABALv16i8_v8i16:
7529 case AArch64::UABALv2i32_v2i64:
7530 case AArch64::UABALv4i16_v4i32:
7531 case AArch64::UABALv4i32_v2i64:
7532 case AArch64::UABALv8i16_v4i32:
7533 case AArch64::UABALv8i8_v8i16:
7534 case AArch64::UABAv16i8:
7535 case AArch64::UABAv2i32:
7536 case AArch64::UABAv4i16:
7537 case AArch64::UABAv4i32:
7538 case AArch64::UABAv8i16:
7539 case AArch64::UABAv8i8:
7540 case AArch64::SABALv16i8_v8i16:
7541 case AArch64::SABALv2i32_v2i64:
7542 case AArch64::SABALv4i16_v4i32:
7543 case AArch64::SABALv4i32_v2i64:
7544 case AArch64::SABALv8i16_v4i32:
7545 case AArch64::SABALv8i8_v8i16:
7546 case AArch64::SABAv16i8:
7547 case AArch64::SABAv2i32:
7548 case AArch64::SABAv4i16:
7549 case AArch64::SABAv4i32:
7550 case AArch64::SABAv8i16:
7551 case AArch64::SABAv8i8:
7552 return true;
7553 }
7554
7555 return false;
7556}
7557
7558unsigned AArch64InstrInfo::getAccumulationStartOpcode(
7559 unsigned AccumulationOpcode) const {
7560 switch (AccumulationOpcode) {
7561 default:
7562 llvm_unreachable("Unsupported accumulation Opcode!");
7563 case AArch64::UABALB_ZZZ_D:
7564 return AArch64::UABDLB_ZZZ_D;
7565 case AArch64::UABALB_ZZZ_H:
7566 return AArch64::UABDLB_ZZZ_H;
7567 case AArch64::UABALB_ZZZ_S:
7568 return AArch64::UABDLB_ZZZ_S;
7569 case AArch64::UABALT_ZZZ_D:
7570 return AArch64::UABDLT_ZZZ_D;
7571 case AArch64::UABALT_ZZZ_H:
7572 return AArch64::UABDLT_ZZZ_H;
7573 case AArch64::UABALT_ZZZ_S:
7574 return AArch64::UABDLT_ZZZ_S;
7575 case AArch64::UABALv16i8_v8i16:
7576 return AArch64::UABDLv16i8_v8i16;
7577 case AArch64::UABALv2i32_v2i64:
7578 return AArch64::UABDLv2i32_v2i64;
7579 case AArch64::UABALv4i16_v4i32:
7580 return AArch64::UABDLv4i16_v4i32;
7581 case AArch64::UABALv4i32_v2i64:
7582 return AArch64::UABDLv4i32_v2i64;
7583 case AArch64::UABALv8i16_v4i32:
7584 return AArch64::UABDLv8i16_v4i32;
7585 case AArch64::UABALv8i8_v8i16:
7586 return AArch64::UABDLv8i8_v8i16;
7587 case AArch64::UABAv16i8:
7588 return AArch64::UABDv16i8;
7589 case AArch64::UABAv2i32:
7590 return AArch64::UABDv2i32;
7591 case AArch64::UABAv4i16:
7592 return AArch64::UABDv4i16;
7593 case AArch64::UABAv4i32:
7594 return AArch64::UABDv4i32;
7595 case AArch64::UABAv8i16:
7596 return AArch64::UABDv8i16;
7597 case AArch64::UABAv8i8:
7598 return AArch64::UABDv8i8;
7599 case AArch64::SABALB_ZZZ_D:
7600 return AArch64::SABDLB_ZZZ_D;
7601 case AArch64::SABALB_ZZZ_S:
7602 return AArch64::SABDLB_ZZZ_S;
7603 case AArch64::SABALB_ZZZ_H:
7604 return AArch64::SABDLB_ZZZ_H;
7605 case AArch64::SABALT_ZZZ_D:
7606 return AArch64::SABDLT_ZZZ_D;
7607 case AArch64::SABALT_ZZZ_S:
7608 return AArch64::SABDLT_ZZZ_S;
7609 case AArch64::SABALT_ZZZ_H:
7610 return AArch64::SABDLT_ZZZ_H;
7611 case AArch64::SABALv16i8_v8i16:
7612 return AArch64::SABDLv16i8_v8i16;
7613 case AArch64::SABALv2i32_v2i64:
7614 return AArch64::SABDLv2i32_v2i64;
7615 case AArch64::SABALv4i16_v4i32:
7616 return AArch64::SABDLv4i16_v4i32;
7617 case AArch64::SABALv4i32_v2i64:
7618 return AArch64::SABDLv4i32_v2i64;
7619 case AArch64::SABALv8i16_v4i32:
7620 return AArch64::SABDLv8i16_v4i32;
7621 case AArch64::SABALv8i8_v8i16:
7622 return AArch64::SABDLv8i8_v8i16;
7623 case AArch64::SABAv16i8:
7624 return AArch64::SABDv16i8;
7625 case AArch64::SABAv2i32:
    return AArch64::SABDv2i32;
7627 case AArch64::SABAv4i16:
7628 return AArch64::SABDv4i16;
7629 case AArch64::SABAv4i32:
7630 return AArch64::SABDv4i32;
7631 case AArch64::SABAv8i16:
7632 return AArch64::SABDv8i16;
7633 case AArch64::SABAv8i8:
7634 return AArch64::SABDv8i8;
7635 }
7636}
7637
7638/// Floating-Point Support
7639
/// Find instructions that can be turned into an FMA (fused multiply-add).
7641static bool getFMAPatterns(MachineInstr &Root,
7642 SmallVectorImpl<unsigned> &Patterns) {
7643
7644 if (!isCombineInstrCandidateFP(Inst: Root))
7645 return false;
7646
7647 MachineBasicBlock &MBB = *Root.getParent();
7648 bool Found = false;
7649
7650 auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
7651 if (canCombineWithFMUL(MBB, MO&: Root.getOperand(i: Operand), MulOpc: Opcode)) {
7652 Patterns.push_back(Elt: Pattern);
7653 return true;
7654 }
7655 return false;
7656 };
7657
7658 typedef AArch64MachineCombinerPattern MCP;
7659
7660 switch (Root.getOpcode()) {
7661 default:
7662 assert(false && "Unsupported FP instruction in combiner\n");
7663 break;
7664 case AArch64::FADDHrr:
7665 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7666 "FADDHrr does not have register operands");
7667
7668 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
7669 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
7670 break;
7671 case AArch64::FADDSrr:
7672 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7673 "FADDSrr does not have register operands");
7674
7675 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
7676 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
7677
7678 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
7679 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
7680 break;
7681 case AArch64::FADDDrr:
7682 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
7683 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
7684
7685 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
7686 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
7687 break;
7688 case AArch64::FADDv4f16:
7689 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
7690 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
7691
7692 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
7693 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
7694 break;
7695 case AArch64::FADDv8f16:
7696 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
7697 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
7698
7699 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
7700 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
7701 break;
7702 case AArch64::FADDv2f32:
7703 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
7704 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
7705
7706 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
7707 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
7708 break;
7709 case AArch64::FADDv2f64:
7710 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
7711 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
7712
7713 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
7714 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
7715 break;
7716 case AArch64::FADDv4f32:
7717 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
7718 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
7719
7720 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
7721 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
7722 break;
7723 case AArch64::FSUBHrr:
7724 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
7725 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
7726 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
7727 break;
7728 case AArch64::FSUBSrr:
7729 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
7730
7731 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
7732 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
7733
7734 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
7735 break;
7736 case AArch64::FSUBDrr:
7737 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
7738
7739 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
7740 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
7741
7742 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
7743 break;
7744 case AArch64::FSUBv4f16:
7745 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
7746 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
7747
7748 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
7749 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
7750 break;
7751 case AArch64::FSUBv8f16:
7752 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
7753 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
7754
7755 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
7756 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
7757 break;
7758 case AArch64::FSUBv2f32:
7759 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
7760 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
7761
7762 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
7763 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
7764 break;
7765 case AArch64::FSUBv2f64:
7766 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
7767 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
7768
7769 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
7770 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
7771 break;
7772 case AArch64::FSUBv4f32:
7773 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
7774 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
7775
7776 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
7777 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
7778 break;
7779 }
7780 return Found;
7781}
7782
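// Find FMULs fed by a lane DUP that can use the indexed FMUL form instead,
// e.g. (fmul x, (dup v, lane)) can become (fmul_indexed x, v, lane).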
7783static bool getFMULPatterns(MachineInstr &Root,
7784 SmallVectorImpl<unsigned> &Patterns) {
7785 MachineBasicBlock &MBB = *Root.getParent();
7786 bool Found = false;
7787
7788 auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
7789 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7790 MachineOperand &MO = Root.getOperand(i: Operand);
7791 MachineInstr *MI = nullptr;
7792 if (MO.isReg() && MO.getReg().isVirtual())
7793 MI = MRI.getUniqueVRegDef(Reg: MO.getReg());
7794 // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
7795 if (MI && MI->getOpcode() == TargetOpcode::COPY &&
7796 MI->getOperand(i: 1).getReg().isVirtual())
7797 MI = MRI.getUniqueVRegDef(Reg: MI->getOperand(i: 1).getReg());
7798 if (MI && MI->getOpcode() == Opcode) {
7799 Patterns.push_back(Elt: Pattern);
7800 return true;
7801 }
7802 return false;
7803 };
7804
7805 typedef AArch64MachineCombinerPattern MCP;
7806
7807 switch (Root.getOpcode()) {
7808 default:
7809 return false;
7810 case AArch64::FMULv2f32:
7811 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
7812 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
7813 break;
7814 case AArch64::FMULv2f64:
7815 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
7816 Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
7817 break;
7818 case AArch64::FMULv4f16:
7819 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
7820 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
7821 break;
7822 case AArch64::FMULv4f32:
7823 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
7824 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
7825 break;
7826 case AArch64::FMULv8f16:
7827 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
7828 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
7829 break;
7830 }
7831
7832 return Found;
7833}
7834
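// Match an FNEG whose source is a contractable FMADD, e.g.
// (fneg (fmadd a, b, c)), which can later be combined into a single FNMADD
// when both instructions carry the 'contract' and 'nsz' fast-math flags.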
7835static bool getFNEGPatterns(MachineInstr &Root,
7836 SmallVectorImpl<unsigned> &Patterns) {
7837 unsigned Opc = Root.getOpcode();
7838 MachineBasicBlock &MBB = *Root.getParent();
7839 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7840
7841 auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
7842 MachineOperand &MO = Root.getOperand(i: 1);
7843 MachineInstr *MI = MRI.getUniqueVRegDef(Reg: MO.getReg());
7844 if (MI != nullptr && (MI->getOpcode() == Opcode) &&
7845 MRI.hasOneNonDBGUse(RegNo: MI->getOperand(i: 0).getReg()) &&
7846 Root.getFlag(Flag: MachineInstr::MIFlag::FmContract) &&
7847 Root.getFlag(Flag: MachineInstr::MIFlag::FmNsz) &&
7848 MI->getFlag(Flag: MachineInstr::MIFlag::FmContract) &&
7849 MI->getFlag(Flag: MachineInstr::MIFlag::FmNsz)) {
7850 Patterns.push_back(Elt: Pattern);
7851 return true;
7852 }
7853 return false;
7854 };
7855
7856 switch (Opc) {
7857 default:
7858 break;
7859 case AArch64::FNEGDr:
7860 return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
7861 case AArch64::FNEGSr:
7862 return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
7863 }
7864
7865 return false;
7866}
7867
7868/// Return true when a code sequence can improve throughput. It
7869/// should be called only for instructions in loops.
7870/// \param Pattern - combiner pattern
7871bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
7872 switch (Pattern) {
7873 default:
7874 break;
7875 case AArch64MachineCombinerPattern::FMULADDH_OP1:
7876 case AArch64MachineCombinerPattern::FMULADDH_OP2:
7877 case AArch64MachineCombinerPattern::FMULSUBH_OP1:
7878 case AArch64MachineCombinerPattern::FMULSUBH_OP2:
7879 case AArch64MachineCombinerPattern::FMULADDS_OP1:
7880 case AArch64MachineCombinerPattern::FMULADDS_OP2:
7881 case AArch64MachineCombinerPattern::FMULSUBS_OP1:
7882 case AArch64MachineCombinerPattern::FMULSUBS_OP2:
7883 case AArch64MachineCombinerPattern::FMULADDD_OP1:
7884 case AArch64MachineCombinerPattern::FMULADDD_OP2:
7885 case AArch64MachineCombinerPattern::FMULSUBD_OP1:
7886 case AArch64MachineCombinerPattern::FMULSUBD_OP2:
7887 case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
7888 case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
7889 case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
7890 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
7891 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
7892 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
7893 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
7894 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
7895 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
7896 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
7897 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
7898 case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
7899 case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
7900 case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
7901 case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
7902 case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
7903 case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
7904 case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
7905 case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
7906 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
7907 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
7908 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
7909 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
7910 case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
7911 case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
7912 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
7913 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
7914 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1:
7915 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
7916 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1:
7917 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
7918 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
7919 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
7920 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
7921 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
7922 case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
7923 case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
7924 case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
7925 case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
7926 case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
7927 case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
7928 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
7929 case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
7930 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
7931 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2:
7932 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
7933 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2:
7934 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
7935 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2:
7936 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
7937 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2:
7938 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
7939 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2:
7940 case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
7941 case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
7942 case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
7943 case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
7944 case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
7945 case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
7946 case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
7947 case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
7948 case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
7949 case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
7950 case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
7951 case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
7952 case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
7953 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
7954 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
7955 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
7956 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
7957 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
7958 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
7959 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
7960 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
7961 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
7962 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
7963 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
7964 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
7965 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
7966 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
7967 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
7968 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
7969 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
7970 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
7971 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
7972 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
7973 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
7974 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
7975 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
7976 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
7977 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
7978 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
7979 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
7980 return true;
7981 } // end switch (Pattern)
7982 return false;
7983}
7984
7985/// Find other MI combine patterns.
7986static bool getMiscPatterns(MachineInstr &Root,
7987 SmallVectorImpl<unsigned> &Patterns) {
7988 // A - (B + C) ==> (A - B) - C or (A - C) - B
7989 unsigned Opc = Root.getOpcode();
7990 MachineBasicBlock &MBB = *Root.getParent();
7991
7992 switch (Opc) {
7993 case AArch64::SUBWrr:
7994 case AArch64::SUBSWrr:
7995 case AArch64::SUBXrr:
7996 case AArch64::SUBSXrr:
7997 // Found candidate root.
7998 break;
7999 default:
8000 return false;
8001 }
8002
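  // For the flag-setting SUBS forms the rewrite is only safe when NZCV is
  // dead, since the split sub/sub sequence does not recreate the original
  // compare result.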
8003 if (isCombineInstrSettingFlag(Opc) &&
8004 Root.findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true) ==
8005 -1)
8006 return false;
8007
8008 if (canCombine(MBB, MO&: Root.getOperand(i: 2), CombineOpc: AArch64::ADDWrr) ||
8009 canCombine(MBB, MO&: Root.getOperand(i: 2), CombineOpc: AArch64::ADDSWrr) ||
8010 canCombine(MBB, MO&: Root.getOperand(i: 2), CombineOpc: AArch64::ADDXrr) ||
8011 canCombine(MBB, MO&: Root.getOperand(i: 2), CombineOpc: AArch64::ADDSXrr)) {
8012 Patterns.push_back(Elt: AArch64MachineCombinerPattern::SUBADD_OP1);
8013 Patterns.push_back(Elt: AArch64MachineCombinerPattern::SUBADD_OP2);
8014 return true;
8015 }
8016
8017 return false;
8018}
8019
8020/// Check if the given instruction forms a gather load pattern that can be
8021/// optimized for better Memory-Level Parallelism (MLP). This function
8022/// identifies chains of NEON lane load instructions that load data from
8023/// different memory addresses into individual lanes of a 128-bit vector
8024/// register, then attempts to split the pattern into parallel loads to break
8025/// the serial dependency between instructions.
8026///
8027/// Pattern Matched:
8028/// Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) ->
8029/// LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root)
8030///
8031/// Transformed Into:
8032/// Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64
8033/// to combine the results, enabling better memory-level parallelism.
8034///
8035/// Supported Element Types:
8036/// - 32-bit elements (LD1i32, 4 lanes total)
8037/// - 16-bit elements (LD1i16, 8 lanes total)
8038/// - 8-bit elements (LD1i8, 16 lanes total)
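///
/// Illustrative i32 shape (operands simplified; the actual MIR uses distinct
/// virtual registers and carries offsets and kill flags):
///   %v  = SUBREG_TO_REG 0, (LDRSui %p0, 0), ssub   ; lane 0
///   %v  = LD1i32 %v, 1, %p1
///   %v  = LD1i32 %v, 2, %p2
///   %v  = LD1i32 %v, 3, %p3                        ; Root, serial chain
/// ==>
///   %lo = LD1i32 (SUBREG_TO_REG ... %p0 ...), 1, %p1
///   %hi = LD1i32 (SUBREG_TO_REG ... %p2 ...), 1, %p3
///   %v  = ZIP1v2i64 %lo, %hi                       ; two independent chains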
8039static bool getGatherLanePattern(MachineInstr &Root,
8040 SmallVectorImpl<unsigned> &Patterns,
8041 unsigned LoadLaneOpCode, unsigned NumLanes) {
8042 const MachineFunction *MF = Root.getMF();
8043
8044 // Early exit if optimizing for size.
8045 if (MF->getFunction().hasMinSize())
8046 return false;
8047
8048 const MachineRegisterInfo &MRI = MF->getRegInfo();
8049 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
8050
8051 // The root of the pattern must load into the last lane of the vector.
8052 if (Root.getOperand(i: 2).getImm() != NumLanes - 1)
8053 return false;
8054
8055  // Check that we have loads into all lanes except lane 0.
8056  // For each load we also want to check that:
8057  // 1. It has a single non-debug use (since we will be replacing the virtual
8058  //    register), and
8059  // 2. Its addressing mode uses only a single pointer operand.
8060 auto *CurrInstr = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: 1).getReg());
8061 auto Range = llvm::seq<unsigned>(Begin: 1, End: NumLanes - 1);
8062 SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end());
8063 SmallVector<const MachineInstr *, 16> LoadInstrs;
8064 while (!RemainingLanes.empty() && CurrInstr &&
8065 CurrInstr->getOpcode() == LoadLaneOpCode &&
8066 MRI.hasOneNonDBGUse(RegNo: CurrInstr->getOperand(i: 0).getReg()) &&
8067 CurrInstr->getNumOperands() == 4) {
8068 RemainingLanes.erase(V: CurrInstr->getOperand(i: 2).getImm());
8069 LoadInstrs.push_back(Elt: CurrInstr);
8070 CurrInstr = MRI.getUniqueVRegDef(Reg: CurrInstr->getOperand(i: 1).getReg());
8071 }
8072
8073  // Check that we have found a match for lanes N-1..1.
8074 if (!RemainingLanes.empty())
8075 return false;
8076
8077 // Match the SUBREG_TO_REG sequence.
8078 if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
8079 return false;
8080
8081 // Verify that the subreg to reg loads an integer into the first lane.
8082 auto Lane0LoadReg = CurrInstr->getOperand(i: 1).getReg();
8083 unsigned SingleLaneSizeInBits = 128 / NumLanes;
8084 if (TRI->getRegSizeInBits(Reg: Lane0LoadReg, MRI) != SingleLaneSizeInBits)
8085 return false;
8086
8087  // Verify that it also has a single non-debug use.
8088 if (!MRI.hasOneNonDBGUse(RegNo: Lane0LoadReg))
8089 return false;
8090
8091 LoadInstrs.push_back(Elt: MRI.getUniqueVRegDef(Reg: Lane0LoadReg));
8092
8093 // If there is any chance of aliasing, do not apply the pattern.
8094 // Walk backward through the MBB starting from Root.
8095 // Exit early if we've encountered all load instructions or hit the search
8096 // limit.
8097 auto MBBItr = Root.getIterator();
8098 unsigned RemainingSteps = GatherOptSearchLimit;
8099 SmallPtrSet<const MachineInstr *, 16> RemainingLoadInstrs;
8100 RemainingLoadInstrs.insert(I: LoadInstrs.begin(), E: LoadInstrs.end());
8101 const MachineBasicBlock *MBB = Root.getParent();
8102
8103 for (; MBBItr != MBB->begin() && RemainingSteps > 0 &&
8104 !RemainingLoadInstrs.empty();
8105 --MBBItr, --RemainingSteps) {
8106 const MachineInstr &CurrInstr = *MBBItr;
8107
8108 // Remove this instruction from remaining loads if it's one we're tracking.
8109 RemainingLoadInstrs.erase(Ptr: &CurrInstr);
8110
8111 // Check for potential aliasing with any of the load instructions to
8112 // optimize.
8113 if (CurrInstr.isLoadFoldBarrier())
8114 return false;
8115 }
8116
8117 // If we hit the search limit without finding all load instructions,
8118 // don't match the pattern.
8119 if (RemainingSteps == 0 && !RemainingLoadInstrs.empty())
8120 return false;
8121
8122 switch (NumLanes) {
8123 case 4:
8124 Patterns.push_back(Elt: AArch64MachineCombinerPattern::GATHER_LANE_i32);
8125 break;
8126 case 8:
8127 Patterns.push_back(Elt: AArch64MachineCombinerPattern::GATHER_LANE_i16);
8128 break;
8129 case 16:
8130 Patterns.push_back(Elt: AArch64MachineCombinerPattern::GATHER_LANE_i8);
8131 break;
8132 default:
8133 llvm_unreachable("Got bad number of lanes for gather pattern.");
8134 }
8135
8136 return true;
8137}
8138
8139/// Search for patterns of LD instructions we can optimize.
8140static bool getLoadPatterns(MachineInstr &Root,
8141 SmallVectorImpl<unsigned> &Patterns) {
8143 // The pattern searches for loads into single lanes.
8144 switch (Root.getOpcode()) {
8145 case AArch64::LD1i32:
8146 return getGatherLanePattern(Root, Patterns, LoadLaneOpCode: Root.getOpcode(), NumLanes: 4);
8147 case AArch64::LD1i16:
8148 return getGatherLanePattern(Root, Patterns, LoadLaneOpCode: Root.getOpcode(), NumLanes: 8);
8149 case AArch64::LD1i8:
8150 return getGatherLanePattern(Root, Patterns, LoadLaneOpCode: Root.getOpcode(), NumLanes: 16);
8151 default:
8152 return false;
8153 }
8154}
8155
8156/// Generate optimized instruction sequence for gather load patterns to improve
8157/// Memory-Level Parallelism (MLP). This function transforms a chain of
8158/// sequential NEON lane loads into parallel vector loads that can execute
8159/// concurrently.
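/// The lower NumLanes/2 lanes are rebuilt on top of the original
/// SUBREG_TO_REG value; the upper half starts from a fresh scalar load
/// (which implicitly zeroes the rest of that register) and the two halves
/// are then joined with ZIP1v2i64.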
8160static void
8161generateGatherLanePattern(MachineInstr &Root,
8162 SmallVectorImpl<MachineInstr *> &InsInstrs,
8163 SmallVectorImpl<MachineInstr *> &DelInstrs,
8164 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8165 unsigned Pattern, unsigned NumLanes) {
8166 MachineFunction &MF = *Root.getParent()->getParent();
8167 MachineRegisterInfo &MRI = MF.getRegInfo();
8168 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8169
8170 // Gather the initial load instructions to build the pattern.
8171 SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
8172 MachineInstr *CurrInstr = &Root;
8173 for (unsigned i = 0; i < NumLanes - 1; ++i) {
8174 LoadToLaneInstrs.push_back(Elt: CurrInstr);
8175 CurrInstr = MRI.getUniqueVRegDef(Reg: CurrInstr->getOperand(i: 1).getReg());
8176 }
8177
8178  // Sort the load instructions by lane index, in descending order.
8179 llvm::sort(C&: LoadToLaneInstrs,
8180 Comp: [](const MachineInstr *A, const MachineInstr *B) {
8181 return A->getOperand(i: 2).getImm() > B->getOperand(i: 2).getImm();
8182 });
8183
8184 MachineInstr *SubregToReg = CurrInstr;
8185 LoadToLaneInstrs.push_back(
8186 Elt: MRI.getUniqueVRegDef(Reg: SubregToReg->getOperand(i: 1).getReg()));
8187 auto LoadToLaneInstrsAscending = llvm::reverse(C&: LoadToLaneInstrs);
8188
8189 const TargetRegisterClass *FPR128RegClass =
8190 MRI.getRegClass(Reg: Root.getOperand(i: 0).getReg());
8191
8192 // Helper lambda to create a LD1 instruction.
8193 auto CreateLD1Instruction = [&](MachineInstr *OriginalInstr,
8194 Register SrcRegister, unsigned Lane,
8195 Register OffsetRegister,
8196 bool OffsetRegisterKillState) {
8197 auto NewRegister = MRI.createVirtualRegister(RegClass: FPR128RegClass);
8198 MachineInstrBuilder LoadIndexIntoRegister =
8199 BuildMI(MF, MIMD: MIMetadata(*OriginalInstr), MCID: TII->get(Opcode: Root.getOpcode()),
8200 DestReg: NewRegister)
8201 .addReg(RegNo: SrcRegister)
8202 .addImm(Val: Lane)
8203 .addReg(RegNo: OffsetRegister, Flags: getKillRegState(B: OffsetRegisterKillState));
8204 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewRegister, y: InsInstrs.size()));
8205 InsInstrs.push_back(Elt: LoadIndexIntoRegister);
8206 return NewRegister;
8207 };
8208
8209 // Helper to create load instruction based on the NumLanes in the NEON
8210 // register we are rewriting.
8211 auto CreateLDRInstruction = [&](unsigned NumLanes, Register DestReg,
8212 Register OffsetReg,
8213 bool KillState) -> MachineInstrBuilder {
8214 unsigned Opcode;
8215 switch (NumLanes) {
8216 case 4:
8217 Opcode = AArch64::LDRSui;
8218 break;
8219 case 8:
8220 Opcode = AArch64::LDRHui;
8221 break;
8222 case 16:
8223 Opcode = AArch64::LDRBui;
8224 break;
8225 default:
8226 llvm_unreachable(
8227 "Got unsupported number of lanes in machine-combiner gather pattern");
8228 }
8229 // Immediate offset load
8230 return BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode), DestReg)
8231 .addReg(RegNo: OffsetReg)
8232 .addImm(Val: 0);
8233 };
8234
8235 // Load the remaining lanes into register 0.
8236 auto LanesToLoadToReg0 =
8237 llvm::make_range(x: LoadToLaneInstrsAscending.begin() + 1,
8238 y: LoadToLaneInstrsAscending.begin() + NumLanes / 2);
8239 Register PrevReg = SubregToReg->getOperand(i: 0).getReg();
8240 for (auto [Index, LoadInstr] : llvm::enumerate(First&: LanesToLoadToReg0)) {
8241 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(i: 3);
8242 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8243 OffsetRegOperand.getReg(),
8244 OffsetRegOperand.isKill());
8245 DelInstrs.push_back(Elt: LoadInstr);
8246 }
8247 Register LastLoadReg0 = PrevReg;
8248
8249 // First load into register 1. Perform an integer load to zero out the upper
8250 // lanes in a single instruction.
8251 MachineInstr *Lane0Load = *LoadToLaneInstrsAscending.begin();
8252 MachineInstr *OriginalSplitLoad =
8253 *std::next(x: LoadToLaneInstrsAscending.begin(), n: NumLanes / 2);
8254 Register DestRegForMiddleIndex = MRI.createVirtualRegister(
8255 RegClass: MRI.getRegClass(Reg: Lane0Load->getOperand(i: 0).getReg()));
8256
8257 const MachineOperand &OriginalSplitToLoadOffsetOperand =
8258 OriginalSplitLoad->getOperand(i: 3);
8259 MachineInstrBuilder MiddleIndexLoadInstr =
8260 CreateLDRInstruction(NumLanes, DestRegForMiddleIndex,
8261 OriginalSplitToLoadOffsetOperand.getReg(),
8262 OriginalSplitToLoadOffsetOperand.isKill());
8263
8264 InstrIdxForVirtReg.insert(
8265 KV: std::make_pair(x&: DestRegForMiddleIndex, y: InsInstrs.size()));
8266 InsInstrs.push_back(Elt: MiddleIndexLoadInstr);
8267 DelInstrs.push_back(Elt: OriginalSplitLoad);
8268
8269 // Subreg To Reg instruction for register 1.
8270 Register DestRegForSubregToReg = MRI.createVirtualRegister(RegClass: FPR128RegClass);
8271 unsigned SubregType;
8272 switch (NumLanes) {
8273 case 4:
8274 SubregType = AArch64::ssub;
8275 break;
8276 case 8:
8277 SubregType = AArch64::hsub;
8278 break;
8279 case 16:
8280 SubregType = AArch64::bsub;
8281 break;
8282 default:
8283 llvm_unreachable(
8284 "Got invalid NumLanes for machine-combiner gather pattern");
8285 }
8286
8287 auto SubRegToRegInstr =
8288 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: SubregToReg->getOpcode()),
8289 DestReg: DestRegForSubregToReg)
8290 .addReg(RegNo: DestRegForMiddleIndex, Flags: getKillRegState(B: true))
8291 .addImm(Val: SubregType);
8292 InstrIdxForVirtReg.insert(
8293 KV: std::make_pair(x&: DestRegForSubregToReg, y: InsInstrs.size()));
8294 InsInstrs.push_back(Elt: SubRegToRegInstr);
8295
8296 // Load remaining lanes into register 1.
8297 auto LanesToLoadToReg1 =
8298 llvm::make_range(x: LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
8299 y: LoadToLaneInstrsAscending.end());
8300 PrevReg = SubRegToRegInstr->getOperand(i: 0).getReg();
8301 for (auto [Index, LoadInstr] : llvm::enumerate(First&: LanesToLoadToReg1)) {
8302 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(i: 3);
8303 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8304 OffsetRegOperand.getReg(),
8305 OffsetRegOperand.isKill());
8306
8307    // Do not add the last instruction (the Root) to DelInstrs - it will be
    // removed later.
8308 if (Index == NumLanes / 2 - 2) {
8309 break;
8310 }
8311 DelInstrs.push_back(Elt: LoadInstr);
8312 }
8313 Register LastLoadReg1 = PrevReg;
8314
8315 // Create the final zip instruction to combine the results.
8316 MachineInstrBuilder ZipInstr =
8317 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::ZIP1v2i64),
8318 DestReg: Root.getOperand(i: 0).getReg())
8319 .addReg(RegNo: LastLoadReg0)
8320 .addReg(RegNo: LastLoadReg1);
8321 InsInstrs.push_back(Elt: ZipInstr);
8322}
8323
8324CombinerObjective
8325AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
8326 switch (Pattern) {
8327 case AArch64MachineCombinerPattern::SUBADD_OP1:
8328 case AArch64MachineCombinerPattern::SUBADD_OP2:
8329 case AArch64MachineCombinerPattern::GATHER_LANE_i32:
8330 case AArch64MachineCombinerPattern::GATHER_LANE_i16:
8331 case AArch64MachineCombinerPattern::GATHER_LANE_i8:
8332 return CombinerObjective::MustReduceDepth;
8333 default:
8334 return TargetInstrInfo::getCombinerObjective(Pattern);
8335 }
8336}
8337
8338/// Return true when there is potentially a faster code sequence for an
8339/// instruction chain ending in \p Root. All potential patterns are listed in
8340/// the \p Patterns vector. Patterns should be sorted in priority order since
8341/// the pattern evaluator stops checking as soon as it finds a faster sequence.
8343bool AArch64InstrInfo::getMachineCombinerPatterns(
8344 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
8345 bool DoRegPressureReduce) const {
8346 // Integer patterns
8347 if (getMaddPatterns(Root, Patterns))
8348 return true;
8349 // Floating point patterns
8350 if (getFMULPatterns(Root, Patterns))
8351 return true;
8352 if (getFMAPatterns(Root, Patterns))
8353 return true;
8354 if (getFNEGPatterns(Root, Patterns))
8355 return true;
8356
8357 // Other patterns
8358 if (getMiscPatterns(Root, Patterns))
8359 return true;
8360
8361 // Load patterns
8362 if (getLoadPatterns(Root, Patterns))
8363 return true;
8364
8365 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
8366 DoRegPressureReduce);
8367}
8368
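/// Operand order of the fused instruction built by genFusedMultiply:
/// Default places the addend last (MADD-style, e.g. FMADD Rd, Rn, Rm, Ra),
/// while Accumulator places it first (FMLA-style); Indexed is the
/// FMLA-by-lane form, which places the addend first and additionally appends
/// the lane immediate of the original indexed multiply.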
8369enum class FMAInstKind { Default, Indexed, Accumulator };
8370/// genFusedMultiply - Generate fused multiply instructions.
8371/// This function supports both integer and floating point instructions.
8372/// A typical example:
8373/// F|MUL I=A,B,0
8374/// F|ADD R,I,C
8375/// ==> F|MADD R,A,B,C
8376/// \param MF Containing MachineFunction
8377/// \param MRI Register information
8378/// \param TII Target information
8379/// \param Root is the F|ADD instruction
8380/// \param [out] InsInstrs is a vector of machine instructions and will
8381/// contain the generated madd instruction
8382/// \param IdxMulOpd is index of operand in Root that is the result of
8383/// the F|MUL. In the example above IdxMulOpd is 1.
8384/// \param MaddOpc the opcode of the f|madd instruction
8385/// \param RC Register class of operands
8386/// \param kind Kind of FMA instruction (addressing mode) to be generated
8387/// \param ReplacedAddend is the result register from the instruction
8388/// replacing the non-combined operand, if any.
8389static MachineInstr *
8390genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
8391 const TargetInstrInfo *TII, MachineInstr &Root,
8392 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
8393 unsigned MaddOpc, const TargetRegisterClass *RC,
8394 FMAInstKind kind = FMAInstKind::Default,
8395 const Register *ReplacedAddend = nullptr) {
8396 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8397
8398 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
8399 MachineInstr *MUL = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: IdxMulOpd).getReg());
8400 Register ResultReg = Root.getOperand(i: 0).getReg();
8401 Register SrcReg0 = MUL->getOperand(i: 1).getReg();
8402 bool Src0IsKill = MUL->getOperand(i: 1).isKill();
8403 Register SrcReg1 = MUL->getOperand(i: 2).getReg();
8404 bool Src1IsKill = MUL->getOperand(i: 2).isKill();
8405
8406 Register SrcReg2;
8407 bool Src2IsKill;
8408 if (ReplacedAddend) {
8409    // If we just generated a new addend, this instruction must be its only use.
8410 SrcReg2 = *ReplacedAddend;
8411 Src2IsKill = true;
8412 } else {
8413 SrcReg2 = Root.getOperand(i: IdxOtherOpd).getReg();
8414 Src2IsKill = Root.getOperand(i: IdxOtherOpd).isKill();
8415 }
8416
8417 if (ResultReg.isVirtual())
8418 MRI.constrainRegClass(Reg: ResultReg, RC);
8419 if (SrcReg0.isVirtual())
8420 MRI.constrainRegClass(Reg: SrcReg0, RC);
8421 if (SrcReg1.isVirtual())
8422 MRI.constrainRegClass(Reg: SrcReg1, RC);
8423 if (SrcReg2.isVirtual())
8424 MRI.constrainRegClass(Reg: SrcReg2, RC);
8425
8426 MachineInstrBuilder MIB;
8427 if (kind == FMAInstKind::Default)
8428 MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
8429 .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: Src0IsKill))
8430 .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: Src1IsKill))
8431 .addReg(RegNo: SrcReg2, Flags: getKillRegState(B: Src2IsKill));
8432 else if (kind == FMAInstKind::Indexed)
8433 MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
8434 .addReg(RegNo: SrcReg2, Flags: getKillRegState(B: Src2IsKill))
8435 .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: Src0IsKill))
8436 .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: Src1IsKill))
8437 .addImm(Val: MUL->getOperand(i: 3).getImm());
8438 else if (kind == FMAInstKind::Accumulator)
8439 MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
8440 .addReg(RegNo: SrcReg2, Flags: getKillRegState(B: Src2IsKill))
8441 .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: Src0IsKill))
8442 .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: Src1IsKill));
8443 else
8444 assert(false && "Invalid FMA instruction kind \n");
8445  // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
8446 InsInstrs.push_back(Elt: MIB);
8447 return MUL;
8448}
8449
8450static MachineInstr *
8451genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
8452 const TargetInstrInfo *TII, MachineInstr &Root,
8453 SmallVectorImpl<MachineInstr *> &InsInstrs) {
8454 MachineInstr *MAD = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: 1).getReg());
8455
8456 unsigned Opc = 0;
8457 const TargetRegisterClass *RC = MRI.getRegClass(Reg: MAD->getOperand(i: 0).getReg());
8458 if (AArch64::FPR32RegClass.hasSubClassEq(RC))
8459 Opc = AArch64::FNMADDSrrr;
8460 else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
8461 Opc = AArch64::FNMADDDrrr;
8462 else
8463 return nullptr;
8464
8465 Register ResultReg = Root.getOperand(i: 0).getReg();
8466 Register SrcReg0 = MAD->getOperand(i: 1).getReg();
8467 Register SrcReg1 = MAD->getOperand(i: 2).getReg();
8468 Register SrcReg2 = MAD->getOperand(i: 3).getReg();
8469 bool Src0IsKill = MAD->getOperand(i: 1).isKill();
8470 bool Src1IsKill = MAD->getOperand(i: 2).isKill();
8471 bool Src2IsKill = MAD->getOperand(i: 3).isKill();
8472 if (ResultReg.isVirtual())
8473 MRI.constrainRegClass(Reg: ResultReg, RC);
8474 if (SrcReg0.isVirtual())
8475 MRI.constrainRegClass(Reg: SrcReg0, RC);
8476 if (SrcReg1.isVirtual())
8477 MRI.constrainRegClass(Reg: SrcReg1, RC);
8478 if (SrcReg2.isVirtual())
8479 MRI.constrainRegClass(Reg: SrcReg2, RC);
8480
8481 MachineInstrBuilder MIB =
8482 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: Opc), DestReg: ResultReg)
8483 .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: Src0IsKill))
8484 .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: Src1IsKill))
8485 .addReg(RegNo: SrcReg2, Flags: getKillRegState(B: Src2IsKill));
8486 InsInstrs.push_back(Elt: MIB);
8487
8488 return MAD;
8489}
8490
8491/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
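/// e.g. (illustrative, operand details elided):
///   %d = DUPv2i32lane %y, 1
///   %r = FMULv2f32 %x, %d
/// ==> %r = FMULv2i32_indexed %x, %y, 1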
8492static MachineInstr *
8493genIndexedMultiply(MachineInstr &Root,
8494 SmallVectorImpl<MachineInstr *> &InsInstrs,
8495 unsigned IdxDupOp, unsigned MulOpc,
8496 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
8497 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
8498 "Invalid index of FMUL operand");
8499
8500 MachineFunction &MF = *Root.getMF();
8501 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8502
8503 MachineInstr *Dup =
8504 MF.getRegInfo().getUniqueVRegDef(Reg: Root.getOperand(i: IdxDupOp).getReg());
8505
8506 if (Dup->getOpcode() == TargetOpcode::COPY)
8507 Dup = MRI.getUniqueVRegDef(Reg: Dup->getOperand(i: 1).getReg());
8508
8509 Register DupSrcReg = Dup->getOperand(i: 1).getReg();
8510 MRI.clearKillFlags(Reg: DupSrcReg);
8511 MRI.constrainRegClass(Reg: DupSrcReg, RC);
8512
8513 unsigned DupSrcLane = Dup->getOperand(i: 2).getImm();
8514
8515 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
8516 MachineOperand &MulOp = Root.getOperand(i: IdxMulOp);
8517
8518 Register ResultReg = Root.getOperand(i: 0).getReg();
8519
8520 MachineInstrBuilder MIB;
8521 MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MulOpc), DestReg: ResultReg)
8522 .add(MO: MulOp)
8523 .addReg(RegNo: DupSrcReg)
8524 .addImm(Val: DupSrcLane);
8525
8526 InsInstrs.push_back(Elt: MIB);
8527 return &Root;
8528}
8529
8530/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
8531/// instructions.
8532///
8533/// \see genFusedMultiply
8534static MachineInstr *genFusedMultiplyAcc(
8535 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8536 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8537 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8538 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8539 kind: FMAInstKind::Accumulator);
8540}
8541
8542/// genNeg - Helper to generate an intermediate negation of the second operand
8543/// of Root
8544static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
8545 const TargetInstrInfo *TII, MachineInstr &Root,
8546 SmallVectorImpl<MachineInstr *> &InsInstrs,
8547 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8548 unsigned MnegOpc, const TargetRegisterClass *RC) {
8549 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
8550 MachineInstrBuilder MIB =
8551 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MnegOpc), DestReg: NewVR)
8552 .add(MO: Root.getOperand(i: 2));
8553 InsInstrs.push_back(Elt: MIB);
8554
8555 assert(InstrIdxForVirtReg.empty());
8556 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
8557
8558 return NewVR;
8559}
8560
8561/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
8562/// instructions with an additional negation of the accumulator
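/// Used for the MULSUB*_OP1 patterns, where Root computes (A*B) - C: the
/// accumulator is negated first (V = NEG C) and the multiply is then folded
/// into V + A*B via the accumulating form.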
8563static MachineInstr *genFusedMultiplyAccNeg(
8564 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8565 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8566 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8567 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8568 assert(IdxMulOpd == 1);
8569
8570 Register NewVR =
8571 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8572 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8573 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
8574}
8575
8576/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
8577/// instructions with an indexed (lane) operand.
8578///
8579/// \see genFusedMultiply
8580static MachineInstr *genFusedMultiplyIdx(
8581 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8582 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8583 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8584 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8585 kind: FMAInstKind::Indexed);
8586}
8587
8588/// genFusedMultiplyIdxNeg - Helper to generate fused multiply accumulate
8589/// instructions with an indexed operand and a negated accumulator
8590static MachineInstr *genFusedMultiplyIdxNeg(
8591 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8592 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8593 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8594 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8595 assert(IdxMulOpd == 1);
8596
8597 Register NewVR =
8598 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8599
8600 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8601 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
8602}
8603
8604/// genMaddR - Generate madd instruction and combine mul and add using
8605/// an extra virtual register
8606/// Example - an immediate ADD operand must first be materialized in a register:
8607/// MUL I=A,B,0
8608/// ADD R,I,Imm
8609/// ==> ORR V, ZR, Imm
8610/// ==> MADD R,A,B,V
8611/// \param MF Containing MachineFunction
8612/// \param MRI Register information
8613/// \param TII Target information
8614/// \param Root is the ADD instruction
8615/// \param [out] InsInstrs is a vector of machine instructions and will
8616/// contain the generated madd instruction
8617/// \param IdxMulOpd is index of operand in Root that is the result of
8618/// the MUL. In the example above IdxMulOpd is 1.
8619/// \param MaddOpc the opcode of the madd instruction
8620/// \param VR is a virtual register that holds the value of an ADD operand
8621/// (V in the example above).
8622/// \param RC Register class of operands
8623static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
8624 const TargetInstrInfo *TII, MachineInstr &Root,
8625 SmallVectorImpl<MachineInstr *> &InsInstrs,
8626 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
8627 const TargetRegisterClass *RC) {
8628 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8629
8630 MachineInstr *MUL = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: IdxMulOpd).getReg());
8631 Register ResultReg = Root.getOperand(i: 0).getReg();
8632 Register SrcReg0 = MUL->getOperand(i: 1).getReg();
8633 bool Src0IsKill = MUL->getOperand(i: 1).isKill();
8634 Register SrcReg1 = MUL->getOperand(i: 2).getReg();
8635 bool Src1IsKill = MUL->getOperand(i: 2).isKill();
8636
8637 if (ResultReg.isVirtual())
8638 MRI.constrainRegClass(Reg: ResultReg, RC);
8639 if (SrcReg0.isVirtual())
8640 MRI.constrainRegClass(Reg: SrcReg0, RC);
8641 if (SrcReg1.isVirtual())
8642 MRI.constrainRegClass(Reg: SrcReg1, RC);
8643 if (Register::isVirtualRegister(Reg: VR))
8644 MRI.constrainRegClass(Reg: VR, RC);
8645
8646 MachineInstrBuilder MIB =
8647 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
8648 .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: Src0IsKill))
8649 .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: Src1IsKill))
8650 .addReg(RegNo: VR);
8651 // Insert the MADD
8652 InsInstrs.push_back(Elt: MIB);
8653 return MUL;
8654}
8655
8656/// Do the following transformation
8657/// A - (B + C) ==> (A - B) - C
8658/// A - (B + C) ==> (A - C) - B
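/// Illustrative, using the W-register forms (X forms are analogous):
///   add w8, w1, w2        ; B + C
///   sub w0, w0, w8        ; A - (B + C)
/// ==>
///   sub w8, w0, w1        ; A - B
///   sub w0, w8, w2        ; (A - B) - C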
8659static void genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
8660 const TargetInstrInfo *TII, MachineInstr &Root,
8661 SmallVectorImpl<MachineInstr *> &InsInstrs,
8662 SmallVectorImpl<MachineInstr *> &DelInstrs,
8663 unsigned IdxOpd1,
8664 DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
8665 assert(IdxOpd1 == 1 || IdxOpd1 == 2);
8666 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
8667 MachineInstr *AddMI = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: 2).getReg());
8668
8669 Register ResultReg = Root.getOperand(i: 0).getReg();
8670 Register RegA = Root.getOperand(i: 1).getReg();
8671 bool RegAIsKill = Root.getOperand(i: 1).isKill();
8672 Register RegB = AddMI->getOperand(i: IdxOpd1).getReg();
8673 bool RegBIsKill = AddMI->getOperand(i: IdxOpd1).isKill();
8674 Register RegC = AddMI->getOperand(i: IdxOtherOpd).getReg();
8675 bool RegCIsKill = AddMI->getOperand(i: IdxOtherOpd).isKill();
8676 Register NewVR =
8677 MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: Root.getOperand(i: 2).getReg()));
8678
8679 unsigned Opcode = Root.getOpcode();
8680 if (Opcode == AArch64::SUBSWrr)
8681 Opcode = AArch64::SUBWrr;
8682 else if (Opcode == AArch64::SUBSXrr)
8683 Opcode = AArch64::SUBXrr;
8684 else
8685 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
8686 "Unexpected instruction opcode.");
8687
8688 uint32_t Flags = Root.mergeFlagsWith(Other: *AddMI);
8689 Flags &= ~MachineInstr::NoSWrap;
8690 Flags &= ~MachineInstr::NoUWrap;
8691
8692 MachineInstrBuilder MIB1 =
8693 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode), DestReg: NewVR)
8694 .addReg(RegNo: RegA, Flags: getKillRegState(B: RegAIsKill))
8695 .addReg(RegNo: RegB, Flags: getKillRegState(B: RegBIsKill))
8696 .setMIFlags(Flags);
8697 MachineInstrBuilder MIB2 =
8698 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode), DestReg: ResultReg)
8699 .addReg(RegNo: NewVR, Flags: getKillRegState(B: true))
8700 .addReg(RegNo: RegC, Flags: getKillRegState(B: RegCIsKill))
8701 .setMIFlags(Flags);
8702
8703 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
8704 InsInstrs.push_back(Elt: MIB1);
8705 InsInstrs.push_back(Elt: MIB2);
8706 DelInstrs.push_back(Elt: AddMI);
8707 DelInstrs.push_back(Elt: &Root);
8708}
8709
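/// Map a chainable absolute-difference accumulator opcode (the NEON ABA/ABAL
/// family and their SVE ABALB/ABALT equivalents) to the plain vector ADD of
/// matching element width that is used to sum the partial accumulators when
/// the machine combiner splits a long accumulation chain.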
8710unsigned AArch64InstrInfo::getReduceOpcodeForAccumulator(
8711 unsigned int AccumulatorOpCode) const {
8712 switch (AccumulatorOpCode) {
8713 case AArch64::UABALB_ZZZ_D:
8714 case AArch64::SABALB_ZZZ_D:
8715 case AArch64::UABALT_ZZZ_D:
8716 case AArch64::SABALT_ZZZ_D:
8717 return AArch64::ADD_ZZZ_D;
8718 case AArch64::UABALB_ZZZ_H:
8719 case AArch64::SABALB_ZZZ_H:
8720 case AArch64::UABALT_ZZZ_H:
8721 case AArch64::SABALT_ZZZ_H:
8722 return AArch64::ADD_ZZZ_H;
8723 case AArch64::UABALB_ZZZ_S:
8724 case AArch64::SABALB_ZZZ_S:
8725 case AArch64::UABALT_ZZZ_S:
8726 case AArch64::SABALT_ZZZ_S:
8727 return AArch64::ADD_ZZZ_S;
8728  case AArch64::UABALv8i8_v8i16:
8729  case AArch64::SABALv8i8_v8i16:
8730  case AArch64::UABALv16i8_v8i16:
8731  case AArch64::SABALv16i8_v8i16:
8732  case AArch64::UABAv8i16:
8733  case AArch64::SABAv8i16:
8734    return AArch64::ADDv8i16;
8735  case AArch64::UABALv4i16_v4i32:
8736  case AArch64::SABALv4i16_v4i32:
8737  case AArch64::UABALv8i16_v4i32:
8738  case AArch64::SABALv8i16_v4i32:
8739  case AArch64::UABAv4i32:
8740  case AArch64::SABAv4i32:
8741    return AArch64::ADDv4i32;
8742  case AArch64::UABALv2i32_v2i64:
8743  case AArch64::SABALv2i32_v2i64:
8744  case AArch64::UABALv4i32_v2i64:
8745  case AArch64::SABALv4i32_v2i64:
8746    return AArch64::ADDv2i64;
8750 case AArch64::UABAv16i8:
8751 case AArch64::SABAv16i8:
8752 return AArch64::ADDv16i8;
8753 case AArch64::UABAv4i16:
8754 case AArch64::SABAv4i16:
8755 return AArch64::ADDv4i16;
8756 case AArch64::UABAv2i32:
8757 case AArch64::SABAv2i32:
8758 return AArch64::ADDv2i32;
8759 case AArch64::UABAv8i8:
8760 case AArch64::SABAv8i8:
8761 return AArch64::ADDv8i8;
8762 default:
8763 llvm_unreachable("Unknown accumulator opcode");
8764 }
8765}
8766
8767/// When getMachineCombinerPatterns() finds potential patterns,
8768/// this function generates the instructions that could replace the
8769/// original code sequence
8770void AArch64InstrInfo::genAlternativeCodeSequence(
8771 MachineInstr &Root, unsigned Pattern,
8772 SmallVectorImpl<MachineInstr *> &InsInstrs,
8773 SmallVectorImpl<MachineInstr *> &DelInstrs,
8774 DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
8775 MachineBasicBlock &MBB = *Root.getParent();
8776 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8777 MachineFunction &MF = *MBB.getParent();
8778 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8779
8780 MachineInstr *MUL = nullptr;
8781 const TargetRegisterClass *RC;
8782 unsigned Opc;
8783 switch (Pattern) {
8784 default:
8785 // Reassociate instructions.
8786 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
8787 DelInstrs, InstIdxForVirtReg&: InstrIdxForVirtReg);
8788 return;
8789 case AArch64MachineCombinerPattern::SUBADD_OP1:
8790 // A - (B + C)
8791 // ==> (A - B) - C
8792 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, IdxOpd1: 1,
8793 InstrIdxForVirtReg);
8794 return;
8795 case AArch64MachineCombinerPattern::SUBADD_OP2:
8796 // A - (B + C)
8797 // ==> (A - C) - B
8798 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, IdxOpd1: 2,
8799 InstrIdxForVirtReg);
8800 return;
8801 case AArch64MachineCombinerPattern::MULADDW_OP1:
8802 case AArch64MachineCombinerPattern::MULADDX_OP1:
8803 // MUL I=A,B,0
8804 // ADD R,I,C
8805 // ==> MADD R,A,B,C
8806 // --- Create(MADD);
8807 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP1) {
8808 Opc = AArch64::MADDWrrr;
8809 RC = &AArch64::GPR32RegClass;
8810 } else {
8811 Opc = AArch64::MADDXrrr;
8812 RC = &AArch64::GPR64RegClass;
8813 }
8814 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
8815 break;
8816 case AArch64MachineCombinerPattern::MULADDW_OP2:
8817 case AArch64MachineCombinerPattern::MULADDX_OP2:
8818 // MUL I=A,B,0
8819 // ADD R,C,I
8820 // ==> MADD R,A,B,C
8821 // --- Create(MADD);
8822 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP2) {
8823 Opc = AArch64::MADDWrrr;
8824 RC = &AArch64::GPR32RegClass;
8825 } else {
8826 Opc = AArch64::MADDXrrr;
8827 RC = &AArch64::GPR64RegClass;
8828 }
8829 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8830 break;
8831 case AArch64MachineCombinerPattern::MULADDWI_OP1:
8832 case AArch64MachineCombinerPattern::MULADDXI_OP1:
8833 case AArch64MachineCombinerPattern::MULSUBWI_OP1:
8834 case AArch64MachineCombinerPattern::MULSUBXI_OP1: {
8835 // MUL I=A,B,0
8836 // ADD/SUB R,I,Imm
8837 // ==> MOV V, Imm/-Imm
8838 // ==> MADD R,A,B,V
8839 // --- Create(MADD);
8840 const TargetRegisterClass *RC;
8841 unsigned BitSize, MovImm;
8842 if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1 ||
8843 Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) {
8844      MovImm = AArch64::MOVi32imm;
8846      BitSize = 32;
8847      Opc = AArch64::MADDWrrr;
8848      RC = &AArch64::GPR32RegClass;
8849 } else {
8850      MovImm = AArch64::MOVi64imm;
8852      BitSize = 64;
8853      Opc = AArch64::MADDXrrr;
8854      RC = &AArch64::GPR64RegClass;
8855 }
8856 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
8857 uint64_t Imm = Root.getOperand(i: 2).getImm();
8858
8859 if (Root.getOperand(i: 3).isImm()) {
8860 unsigned Val = Root.getOperand(i: 3).getImm();
8861 Imm = Imm << Val;
8862 }
8863 bool IsSub = Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1 ||
8864 Pattern == AArch64MachineCombinerPattern::MULSUBXI_OP1;
8865 uint64_t UImm = SignExtend64(X: IsSub ? -Imm : Imm, B: BitSize);
8866 // Check that the immediate can be composed via a single instruction.
8867 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
8868 AArch64_IMM::expandMOVImm(Imm: UImm, BitSize, Insn);
8869 if (Insn.size() != 1)
8870 return;
8871 MachineInstrBuilder MIB1 =
8872 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MovImm), DestReg: NewVR)
8873 .addImm(Val: IsSub ? -Imm : Imm);
8874 InsInstrs.push_back(Elt: MIB1);
8875 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
8876 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, VR: NewVR, RC);
8877 break;
8878 }
8879 case AArch64MachineCombinerPattern::MULSUBW_OP1:
8880 case AArch64MachineCombinerPattern::MULSUBX_OP1: {
8881 // MUL I=A,B,0
8882 // SUB R,I, C
8883 // ==> SUB V, 0, C
8884 // ==> MADD R,A,B,V // = -C + A*B
8885 // --- Create(MADD);
8886 const TargetRegisterClass *SubRC;
8887 unsigned SubOpc, ZeroReg;
8888 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP1) {
8889 SubOpc = AArch64::SUBWrr;
8890 SubRC = &AArch64::GPR32spRegClass;
8891 ZeroReg = AArch64::WZR;
8892 Opc = AArch64::MADDWrrr;
8893 RC = &AArch64::GPR32RegClass;
8894 } else {
8895 SubOpc = AArch64::SUBXrr;
8896 SubRC = &AArch64::GPR64spRegClass;
8897 ZeroReg = AArch64::XZR;
8898 Opc = AArch64::MADDXrrr;
8899 RC = &AArch64::GPR64RegClass;
8900 }
8901 Register NewVR = MRI.createVirtualRegister(RegClass: SubRC);
8902 // SUB NewVR, 0, C
8903 MachineInstrBuilder MIB1 =
8904 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: SubOpc), DestReg: NewVR)
8905 .addReg(RegNo: ZeroReg)
8906 .add(MO: Root.getOperand(i: 2));
8907 InsInstrs.push_back(Elt: MIB1);
8908 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
8909 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, VR: NewVR, RC);
8910 break;
8911 }
8912 case AArch64MachineCombinerPattern::MULSUBW_OP2:
8913 case AArch64MachineCombinerPattern::MULSUBX_OP2:
8914 // MUL I=A,B,0
8915 // SUB R,C,I
8916 // ==> MSUB R,A,B,C (computes C - A*B)
8917 // --- Create(MSUB);
8918 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP2) {
8919 Opc = AArch64::MSUBWrrr;
8920 RC = &AArch64::GPR32RegClass;
8921 } else {
8922 Opc = AArch64::MSUBXrrr;
8923 RC = &AArch64::GPR64RegClass;
8924 }
8925 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8926 break;
8927 case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
8928 Opc = AArch64::MLAv8i8;
8929 RC = &AArch64::FPR64RegClass;
8930 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
8931 break;
8932 case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
8933 Opc = AArch64::MLAv8i8;
8934 RC = &AArch64::FPR64RegClass;
8935 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8936 break;
8937 case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
8938 Opc = AArch64::MLAv16i8;
8939 RC = &AArch64::FPR128RegClass;
8940 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
8941 break;
8942 case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
8943 Opc = AArch64::MLAv16i8;
8944 RC = &AArch64::FPR128RegClass;
8945 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8946 break;
8947 case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
8948 Opc = AArch64::MLAv4i16;
8949 RC = &AArch64::FPR64RegClass;
8950 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
8951 break;
8952 case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
8953 Opc = AArch64::MLAv4i16;
8954 RC = &AArch64::FPR64RegClass;
8955 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8956 break;
8957 case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
8958 Opc = AArch64::MLAv8i16;
8959 RC = &AArch64::FPR128RegClass;
8960 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
8961 break;
8962 case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
8963 Opc = AArch64::MLAv8i16;
8964 RC = &AArch64::FPR128RegClass;
8965 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8966 break;
8967 case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
8968 Opc = AArch64::MLAv2i32;
8969 RC = &AArch64::FPR64RegClass;
8970 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
8971 break;
8972 case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
8973 Opc = AArch64::MLAv2i32;
8974 RC = &AArch64::FPR64RegClass;
8975 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8976 break;
8977 case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
8978 Opc = AArch64::MLAv4i32;
8979 RC = &AArch64::FPR128RegClass;
8980 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
8981 break;
8982 case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
8983 Opc = AArch64::MLAv4i32;
8984 RC = &AArch64::FPR128RegClass;
8985 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8986 break;
8987
8988 case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
8989 Opc = AArch64::MLAv8i8;
8990 RC = &AArch64::FPR64RegClass;
8991 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8992 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv8i8,
8993 RC);
8994 break;
8995 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
8996 Opc = AArch64::MLSv8i8;
8997 RC = &AArch64::FPR64RegClass;
8998 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
8999 break;
9000 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
9001 Opc = AArch64::MLAv16i8;
9002 RC = &AArch64::FPR128RegClass;
9003 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9004 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv16i8,
9005 RC);
9006 break;
9007 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
9008 Opc = AArch64::MLSv16i8;
9009 RC = &AArch64::FPR128RegClass;
9010 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9011 break;
9012 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
9013 Opc = AArch64::MLAv4i16;
9014 RC = &AArch64::FPR64RegClass;
9015 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9016 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv4i16,
9017 RC);
9018 break;
9019 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
9020 Opc = AArch64::MLSv4i16;
9021 RC = &AArch64::FPR64RegClass;
9022 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9023 break;
9024 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
9025 Opc = AArch64::MLAv8i16;
9026 RC = &AArch64::FPR128RegClass;
9027 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9028 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv8i16,
9029 RC);
9030 break;
9031 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
9032 Opc = AArch64::MLSv8i16;
9033 RC = &AArch64::FPR128RegClass;
9034 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9035 break;
9036 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
9037 Opc = AArch64::MLAv2i32;
9038 RC = &AArch64::FPR64RegClass;
9039 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9040 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv2i32,
9041 RC);
9042 break;
9043 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
9044 Opc = AArch64::MLSv2i32;
9045 RC = &AArch64::FPR64RegClass;
9046 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9047 break;
9048 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
9049 Opc = AArch64::MLAv4i32;
9050 RC = &AArch64::FPR128RegClass;
9051 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9052 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv4i32,
9053 RC);
9054 break;
9055 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
9056 Opc = AArch64::MLSv4i32;
9057 RC = &AArch64::FPR128RegClass;
9058 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9059 break;
9060
9061 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
9062 Opc = AArch64::MLAv4i16_indexed;
9063 RC = &AArch64::FPR64RegClass;
9064 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9065 break;
9066 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
9067 Opc = AArch64::MLAv4i16_indexed;
9068 RC = &AArch64::FPR64RegClass;
9069 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9070 break;
9071 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
9072 Opc = AArch64::MLAv8i16_indexed;
9073 RC = &AArch64::FPR128RegClass;
9074 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9075 break;
9076 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
9077 Opc = AArch64::MLAv8i16_indexed;
9078 RC = &AArch64::FPR128RegClass;
9079 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9080 break;
9081 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
9082 Opc = AArch64::MLAv2i32_indexed;
9083 RC = &AArch64::FPR64RegClass;
9084 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9085 break;
9086 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
9087 Opc = AArch64::MLAv2i32_indexed;
9088 RC = &AArch64::FPR64RegClass;
9089 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9090 break;
9091 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
9092 Opc = AArch64::MLAv4i32_indexed;
9093 RC = &AArch64::FPR128RegClass;
9094 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9095 break;
9096 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
9097 Opc = AArch64::MLAv4i32_indexed;
9098 RC = &AArch64::FPR128RegClass;
9099 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9100 break;
9101
9102 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
9103 Opc = AArch64::MLAv4i16_indexed;
9104 RC = &AArch64::FPR64RegClass;
9105 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9106 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv4i16,
9107 RC);
9108 break;
9109 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
9110 Opc = AArch64::MLSv4i16_indexed;
9111 RC = &AArch64::FPR64RegClass;
9112 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9113 break;
9114 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
9115 Opc = AArch64::MLAv8i16_indexed;
9116 RC = &AArch64::FPR128RegClass;
9117 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9118 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv8i16,
9119 RC);
9120 break;
9121 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
9122 Opc = AArch64::MLSv8i16_indexed;
9123 RC = &AArch64::FPR128RegClass;
9124 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9125 break;
9126 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
9127 Opc = AArch64::MLAv2i32_indexed;
9128 RC = &AArch64::FPR64RegClass;
9129 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9130 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv2i32,
9131 RC);
9132 break;
9133 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
9134 Opc = AArch64::MLSv2i32_indexed;
9135 RC = &AArch64::FPR64RegClass;
9136 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9137 break;
9138 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
9139 Opc = AArch64::MLAv4i32_indexed;
9140 RC = &AArch64::FPR128RegClass;
9141 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9142 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv4i32,
9143 RC);
9144 break;
9145 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
9146 Opc = AArch64::MLSv4i32_indexed;
9147 RC = &AArch64::FPR128RegClass;
9148 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9149 break;
9150
9151 // Floating Point Support
9152 case AArch64MachineCombinerPattern::FMULADDH_OP1:
9153 Opc = AArch64::FMADDHrrr;
9154 RC = &AArch64::FPR16RegClass;
9155 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9156 break;
9157 case AArch64MachineCombinerPattern::FMULADDS_OP1:
9158 Opc = AArch64::FMADDSrrr;
9159 RC = &AArch64::FPR32RegClass;
9160 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9161 break;
9162 case AArch64MachineCombinerPattern::FMULADDD_OP1:
9163 Opc = AArch64::FMADDDrrr;
9164 RC = &AArch64::FPR64RegClass;
9165 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9166 break;
9167
9168 case AArch64MachineCombinerPattern::FMULADDH_OP2:
9169 Opc = AArch64::FMADDHrrr;
9170 RC = &AArch64::FPR16RegClass;
9171 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9172 break;
9173 case AArch64MachineCombinerPattern::FMULADDS_OP2:
9174 Opc = AArch64::FMADDSrrr;
9175 RC = &AArch64::FPR32RegClass;
9176 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9177 break;
9178 case AArch64MachineCombinerPattern::FMULADDD_OP2:
9179 Opc = AArch64::FMADDDrrr;
9180 RC = &AArch64::FPR64RegClass;
9181 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9182 break;
9183
9184 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
9185 Opc = AArch64::FMLAv1i32_indexed;
9186 RC = &AArch64::FPR32RegClass;
9187 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9188 kind: FMAInstKind::Indexed);
9189 break;
9190 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
9191 Opc = AArch64::FMLAv1i32_indexed;
9192 RC = &AArch64::FPR32RegClass;
9193 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9194 kind: FMAInstKind::Indexed);
9195 break;
9196
9197 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
9198 Opc = AArch64::FMLAv1i64_indexed;
9199 RC = &AArch64::FPR64RegClass;
9200 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9201 kind: FMAInstKind::Indexed);
9202 break;
9203 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
9204 Opc = AArch64::FMLAv1i64_indexed;
9205 RC = &AArch64::FPR64RegClass;
9206 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9207 kind: FMAInstKind::Indexed);
9208 break;
9209
9210 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
9211 RC = &AArch64::FPR64RegClass;
9212 Opc = AArch64::FMLAv4i16_indexed;
9213 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9214 kind: FMAInstKind::Indexed);
9215 break;
9216 case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
9217 RC = &AArch64::FPR64RegClass;
9218 Opc = AArch64::FMLAv4f16;
9219 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9220 kind: FMAInstKind::Accumulator);
9221 break;
9222 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
9223 RC = &AArch64::FPR64RegClass;
9224 Opc = AArch64::FMLAv4i16_indexed;
9225 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9226 kind: FMAInstKind::Indexed);
9227 break;
9228 case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
9229 RC = &AArch64::FPR64RegClass;
9230 Opc = AArch64::FMLAv4f16;
9231 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9232 kind: FMAInstKind::Accumulator);
9233 break;
9234
9235 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
9236 case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
9237 RC = &AArch64::FPR64RegClass;
9238 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
9239 Opc = AArch64::FMLAv2i32_indexed;
9240 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9241 kind: FMAInstKind::Indexed);
9242 } else {
9243 Opc = AArch64::FMLAv2f32;
9244 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9245 kind: FMAInstKind::Accumulator);
9246 }
9247 break;
9248 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
9249 case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
9250 RC = &AArch64::FPR64RegClass;
9251 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
9252 Opc = AArch64::FMLAv2i32_indexed;
9253 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9254 kind: FMAInstKind::Indexed);
9255 } else {
9256 Opc = AArch64::FMLAv2f32;
9257 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9258 kind: FMAInstKind::Accumulator);
9259 }
9260 break;
9261
9262 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
9263 RC = &AArch64::FPR128RegClass;
9264 Opc = AArch64::FMLAv8i16_indexed;
9265 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9266 kind: FMAInstKind::Indexed);
9267 break;
9268 case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
9269 RC = &AArch64::FPR128RegClass;
9270 Opc = AArch64::FMLAv8f16;
9271 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9272 kind: FMAInstKind::Accumulator);
9273 break;
9274 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
9275 RC = &AArch64::FPR128RegClass;
9276 Opc = AArch64::FMLAv8i16_indexed;
9277 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9278 kind: FMAInstKind::Indexed);
9279 break;
9280 case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
9281 RC = &AArch64::FPR128RegClass;
9282 Opc = AArch64::FMLAv8f16;
9283 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9284 kind: FMAInstKind::Accumulator);
9285 break;
9286
9287 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
9288 case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
9289 RC = &AArch64::FPR128RegClass;
9290 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
9291 Opc = AArch64::FMLAv2i64_indexed;
9292 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9293 kind: FMAInstKind::Indexed);
9294 } else {
9295 Opc = AArch64::FMLAv2f64;
9296 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9297 kind: FMAInstKind::Accumulator);
9298 }
9299 break;
9300 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
9301 case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
9302 RC = &AArch64::FPR128RegClass;
9303 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
9304 Opc = AArch64::FMLAv2i64_indexed;
9305 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9306 kind: FMAInstKind::Indexed);
9307 } else {
9308 Opc = AArch64::FMLAv2f64;
9309 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9310 kind: FMAInstKind::Accumulator);
9311 }
9312 break;
9313
9314 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
9315 case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
9316 RC = &AArch64::FPR128RegClass;
9317 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
9318 Opc = AArch64::FMLAv4i32_indexed;
9319 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9320 kind: FMAInstKind::Indexed);
9321 } else {
9322 Opc = AArch64::FMLAv4f32;
9323 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9324 kind: FMAInstKind::Accumulator);
9325 }
9326 break;
9327
9328 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
9329 case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
9330 RC = &AArch64::FPR128RegClass;
9331 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
9332 Opc = AArch64::FMLAv4i32_indexed;
9333 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9334 kind: FMAInstKind::Indexed);
9335 } else {
9336 Opc = AArch64::FMLAv4f32;
9337 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9338 kind: FMAInstKind::Accumulator);
9339 }
9340 break;
9341
9342 case AArch64MachineCombinerPattern::FMULSUBH_OP1:
9343 Opc = AArch64::FNMSUBHrrr;
9344 RC = &AArch64::FPR16RegClass;
9345 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9346 break;
9347 case AArch64MachineCombinerPattern::FMULSUBS_OP1:
9348 Opc = AArch64::FNMSUBSrrr;
9349 RC = &AArch64::FPR32RegClass;
9350 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9351 break;
9352 case AArch64MachineCombinerPattern::FMULSUBD_OP1:
9353 Opc = AArch64::FNMSUBDrrr;
9354 RC = &AArch64::FPR64RegClass;
9355 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9356 break;
9357
9358 case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
9359 Opc = AArch64::FNMADDHrrr;
9360 RC = &AArch64::FPR16RegClass;
9361 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9362 break;
9363 case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
9364 Opc = AArch64::FNMADDSrrr;
9365 RC = &AArch64::FPR32RegClass;
9366 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9367 break;
9368 case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
9369 Opc = AArch64::FNMADDDrrr;
9370 RC = &AArch64::FPR64RegClass;
9371 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9372 break;
9373
9374 case AArch64MachineCombinerPattern::FMULSUBH_OP2:
9375 Opc = AArch64::FMSUBHrrr;
9376 RC = &AArch64::FPR16RegClass;
9377 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9378 break;
9379 case AArch64MachineCombinerPattern::FMULSUBS_OP2:
9380 Opc = AArch64::FMSUBSrrr;
9381 RC = &AArch64::FPR32RegClass;
9382 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9383 break;
9384 case AArch64MachineCombinerPattern::FMULSUBD_OP2:
9385 Opc = AArch64::FMSUBDrrr;
9386 RC = &AArch64::FPR64RegClass;
9387 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9388 break;
9389
9390 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
9391 Opc = AArch64::FMLSv1i32_indexed;
9392 RC = &AArch64::FPR32RegClass;
9393 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9394 kind: FMAInstKind::Indexed);
9395 break;
9396
9397 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
9398 Opc = AArch64::FMLSv1i64_indexed;
9399 RC = &AArch64::FPR64RegClass;
9400 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9401 kind: FMAInstKind::Indexed);
9402 break;
9403
9404 case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
9405 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
9406 RC = &AArch64::FPR64RegClass;
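    // Root is fsub(fmul(a, b), c) with the multiply as operand 1. Negate the
    // addend c, then fuse: fmla(fneg(c), a, b) computes a * b - c. The
    // FMLS*_OP1 cases below follow this same FNEG + FMLA scheme.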
9407 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
9408 MachineInstrBuilder MIB1 =
9409 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv4f16), DestReg: NewVR)
9410 .add(MO: Root.getOperand(i: 2));
9411 InsInstrs.push_back(Elt: MIB1);
9412 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9413 if (Pattern == AArch64MachineCombinerPattern::FMLSv4f16_OP1) {
9414 Opc = AArch64::FMLAv4f16;
9415 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9416 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
9417 } else {
9418 Opc = AArch64::FMLAv4i16_indexed;
9419 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9420 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
9421 }
9422 break;
9423 }
9424 case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
9425 RC = &AArch64::FPR64RegClass;
9426 Opc = AArch64::FMLSv4f16;
9427 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9428 kind: FMAInstKind::Accumulator);
9429 break;
9430 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
9431 RC = &AArch64::FPR64RegClass;
9432 Opc = AArch64::FMLSv4i16_indexed;
9433 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9434 kind: FMAInstKind::Indexed);
9435 break;
9436
9437 case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
9438 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
9439 RC = &AArch64::FPR64RegClass;
9440 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
9441 Opc = AArch64::FMLSv2i32_indexed;
9442 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9443 kind: FMAInstKind::Indexed);
9444 } else {
9445 Opc = AArch64::FMLSv2f32;
9446 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9447 kind: FMAInstKind::Accumulator);
9448 }
9449 break;
9450
9451 case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
9452 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
9453 RC = &AArch64::FPR128RegClass;
9454 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
9455 MachineInstrBuilder MIB1 =
9456 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv8f16), DestReg: NewVR)
9457 .add(MO: Root.getOperand(i: 2));
9458 InsInstrs.push_back(Elt: MIB1);
9459 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9460 if (Pattern == AArch64MachineCombinerPattern::FMLSv8f16_OP1) {
9461 Opc = AArch64::FMLAv8f16;
9462 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9463 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
9464 } else {
9465 Opc = AArch64::FMLAv8i16_indexed;
9466 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9467 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
9468 }
9469 break;
9470 }
9471 case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
9472 RC = &AArch64::FPR128RegClass;
9473 Opc = AArch64::FMLSv8f16;
9474 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9475 kind: FMAInstKind::Accumulator);
9476 break;
9477 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
9478 RC = &AArch64::FPR128RegClass;
9479 Opc = AArch64::FMLSv8i16_indexed;
9480 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9481 kind: FMAInstKind::Indexed);
9482 break;
9483
9484 case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
9485 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
9486 RC = &AArch64::FPR128RegClass;
9487 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
9488 Opc = AArch64::FMLSv2i64_indexed;
9489 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9490 kind: FMAInstKind::Indexed);
9491 } else {
9492 Opc = AArch64::FMLSv2f64;
9493 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9494 kind: FMAInstKind::Accumulator);
9495 }
9496 break;
9497
9498 case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
9499 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
9500 RC = &AArch64::FPR128RegClass;
9501 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
9502 Opc = AArch64::FMLSv4i32_indexed;
9503 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9504 kind: FMAInstKind::Indexed);
9505 } else {
9506 Opc = AArch64::FMLSv4f32;
9507 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9508 kind: FMAInstKind::Accumulator);
9509 }
9510 break;
9511 case AArch64MachineCombinerPattern::FMLSv2f32_OP1:
9512 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
9513 RC = &AArch64::FPR64RegClass;
9514 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
9515 MachineInstrBuilder MIB1 =
9516 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv2f32), DestReg: NewVR)
9517 .add(MO: Root.getOperand(i: 2));
9518 InsInstrs.push_back(Elt: MIB1);
9519 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9520 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
9521 Opc = AArch64::FMLAv2i32_indexed;
9522 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9523 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
9524 } else {
9525 Opc = AArch64::FMLAv2f32;
9526 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9527 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
9528 }
9529 break;
9530 }
9531 case AArch64MachineCombinerPattern::FMLSv4f32_OP1:
9532 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
9533 RC = &AArch64::FPR128RegClass;
9534 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
9535 MachineInstrBuilder MIB1 =
9536 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv4f32), DestReg: NewVR)
9537 .add(MO: Root.getOperand(i: 2));
9538 InsInstrs.push_back(Elt: MIB1);
9539 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9540 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
9541 Opc = AArch64::FMLAv4i32_indexed;
9542 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9543 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
9544 } else {
9545 Opc = AArch64::FMLAv4f32;
9546 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9547 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
9548 }
9549 break;
9550 }
9551 case AArch64MachineCombinerPattern::FMLSv2f64_OP1:
9552 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
9553 RC = &AArch64::FPR128RegClass;
9554 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
9555 MachineInstrBuilder MIB1 =
9556 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv2f64), DestReg: NewVR)
9557 .add(MO: Root.getOperand(i: 2));
9558 InsInstrs.push_back(Elt: MIB1);
9559 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9560 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
9561 Opc = AArch64::FMLAv2i64_indexed;
9562 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9563 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
9564 } else {
9565 Opc = AArch64::FMLAv2f64;
9566 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9567 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
9568 }
9569 break;
9570 }
9571 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
9572 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2: {
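    // One operand of the FMUL is a DUP of a single vector lane; IdxDupOp
    // records which one, so genIndexedMultiply can fold the DUP away and use
    // the lane-indexed form of the multiply directly.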
9573 unsigned IdxDupOp =
9574 (Pattern == AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1
9575 : 2;
9576 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv2i32_indexed,
9577 RC: &AArch64::FPR128RegClass, MRI);
9578 break;
9579 }
9580 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
9581 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2: {
9582 unsigned IdxDupOp =
9583 (Pattern == AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1
9584 : 2;
9585 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv2i64_indexed,
9586 RC: &AArch64::FPR128RegClass, MRI);
9587 break;
9588 }
9589 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
9590 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2: {
9591 unsigned IdxDupOp =
9592 (Pattern == AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1
9593 : 2;
9594 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv4i16_indexed,
9595 RC: &AArch64::FPR128_loRegClass, MRI);
9596 break;
9597 }
9598 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
9599 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2: {
9600 unsigned IdxDupOp =
9601 (Pattern == AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1
9602 : 2;
9603 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv4i32_indexed,
9604 RC: &AArch64::FPR128RegClass, MRI);
9605 break;
9606 }
9607 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
9608 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2: {
9609 unsigned IdxDupOp =
9610 (Pattern == AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1
9611 : 2;
9612 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv8i16_indexed,
9613 RC: &AArch64::FPR128_loRegClass, MRI);
9614 break;
9615 }
9616 case AArch64MachineCombinerPattern::FNMADD: {
9617 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
9618 break;
9619 }
9620 case AArch64MachineCombinerPattern::GATHER_LANE_i32: {
9621 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9622 Pattern, NumLanes: 4);
9623 break;
9624 }
9625 case AArch64MachineCombinerPattern::GATHER_LANE_i16: {
9626 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9627 Pattern, NumLanes: 8);
9628 break;
9629 }
9630 case AArch64MachineCombinerPattern::GATHER_LANE_i8: {
9631 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9632 Pattern, NumLanes: 16);
9633 break;
9634 }
9635
9636 } // end switch (Pattern)
9637 // Record MUL and ADD/SUB for deletion
9638 if (MUL)
9639 DelInstrs.push_back(Elt: MUL);
9640 DelInstrs.push_back(Elt: &Root);
9641
9642 // Set the flags on the inserted instructions to be the merged flags of the
9643 // instructions that we have combined.
9644 uint32_t Flags = Root.getFlags();
9645 if (MUL)
9646 Flags = Root.mergeFlagsWith(Other: *MUL);
9647 for (auto *MI : InsInstrs)
9648 MI->setFlags(Flags);
9649}
9650
/// Replace a csinc-branch sequence with a simple conditional branch
9652///
9653/// Examples:
9654/// 1. \code
9655/// csinc w9, wzr, wzr, <condition code>
9656/// tbnz w9, #0, 0x44
9657/// \endcode
9658/// to
9659/// \code
9660/// b.<inverted condition code>
9661/// \endcode
9662///
9663/// 2. \code
9664/// csinc w9, wzr, wzr, <condition code>
9665/// tbz w9, #0, 0x44
9666/// \endcode
9667/// to
9668/// \code
9669/// b.<condition code>
9670/// \endcode
9671///
/// Replace a compare-and-branch sequence with a TBZ/TBNZ instruction when the
/// AND feeding the compare has a constant mask that is a power of 2.
9674///
9675/// Examples:
9676/// \code
9677/// and w8, w8, #0x400
9678/// cbnz w8, L1
9679/// \endcode
9680/// to
9681/// \code
9682/// tbnz w8, #10, L1
9683/// \endcode
9684///
9685/// \param MI Conditional Branch
9686/// \return True when the simple conditional branch is generated
9687///
9688bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
9689 bool IsNegativeBranch = false;
9690 bool IsTestAndBranch = false;
9691 unsigned TargetBBInMI = 0;
9692 switch (MI.getOpcode()) {
9693 default:
9694 llvm_unreachable("Unknown branch instruction?");
9695 case AArch64::Bcc:
9696 case AArch64::CBWPri:
9697 case AArch64::CBXPri:
9698 case AArch64::CBBAssertExt:
9699 case AArch64::CBHAssertExt:
9700 case AArch64::CBWPrr:
9701 case AArch64::CBXPrr:
9702 return false;
9703 case AArch64::CBZW:
9704 case AArch64::CBZX:
9705 TargetBBInMI = 1;
9706 break;
9707 case AArch64::CBNZW:
9708 case AArch64::CBNZX:
9709 TargetBBInMI = 1;
9710 IsNegativeBranch = true;
9711 break;
9712 case AArch64::TBZW:
9713 case AArch64::TBZX:
9714 TargetBBInMI = 2;
9715 IsTestAndBranch = true;
9716 break;
9717 case AArch64::TBNZW:
9718 case AArch64::TBNZX:
9719 TargetBBInMI = 2;
9720 IsNegativeBranch = true;
9721 IsTestAndBranch = true;
9722 break;
9723 }
  // The only definition folded into a TB[N]Z below is a CSINC of zero
  // registers, which can only produce 0 or 1, so testing a bit other than
  // bit 0 makes no sense. Conservatively bail out in case the verifier
  // missed this case.
9727 if (IsTestAndBranch && MI.getOperand(i: 1).getImm())
9728 return false;
9729
9730 // Find Definition.
9731 assert(MI.getParent() && "Incomplete machine instruction\n");
9732 MachineBasicBlock *MBB = MI.getParent();
9733 MachineFunction *MF = MBB->getParent();
9734 MachineRegisterInfo *MRI = &MF->getRegInfo();
9735 Register VReg = MI.getOperand(i: 0).getReg();
9736 if (!VReg.isVirtual())
9737 return false;
9738
9739 MachineInstr *DefMI = MRI->getVRegDef(Reg: VReg);
9740
9741 // Look through COPY instructions to find definition.
9742 while (DefMI->isCopy()) {
9743 Register CopyVReg = DefMI->getOperand(i: 1).getReg();
9744 if (!MRI->hasOneNonDBGUse(RegNo: CopyVReg))
9745 return false;
9746 if (!MRI->hasOneDef(RegNo: CopyVReg))
9747 return false;
9748 DefMI = MRI->getVRegDef(Reg: CopyVReg);
9749 }
9750
9751 switch (DefMI->getOpcode()) {
9752 default:
9753 return false;
9754 // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
9755 case AArch64::ANDWri:
9756 case AArch64::ANDXri: {
9757 if (IsTestAndBranch)
9758 return false;
9759 if (DefMI->getParent() != MBB)
9760 return false;
9761 if (!MRI->hasOneNonDBGUse(RegNo: VReg))
9762 return false;
9763
9764 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
9765 uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
9766 val: DefMI->getOperand(i: 2).getImm(), regSize: Is32Bit ? 32 : 64);
9767 if (!isPowerOf2_64(Value: Mask))
9768 return false;
9769
9770 MachineOperand &MO = DefMI->getOperand(i: 1);
9771 Register NewReg = MO.getReg();
9772 if (!NewReg.isVirtual())
9773 return false;
9774
9775 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
9776
9777 MachineBasicBlock &RefToMBB = *MBB;
9778 MachineBasicBlock *TBB = MI.getOperand(i: 1).getMBB();
9779 DebugLoc DL = MI.getDebugLoc();
9780 unsigned Imm = Log2_64(Value: Mask);
9781 unsigned Opc = (Imm < 32)
9782 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
9783 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
9784 MachineInstr *NewMI = BuildMI(BB&: RefToMBB, I&: MI, MIMD: DL, MCID: get(Opcode: Opc))
9785 .addReg(RegNo: NewReg)
9786 .addImm(Val: Imm)
9787 .addMBB(MBB: TBB);
    // Register lives on to the TBZ/TBNZ now.
9789 MO.setIsKill(false);
9790
    // For immediates smaller than 32, we need to use the 32-bit
    // variant (W) in all cases; the 64-bit variant cannot encode them.
    // Therefore, if the input register is 64-bit, we need to take its
    // 32-bit sub-register.
9796 if (!Is32Bit && Imm < 32)
9797 NewMI->getOperand(i: 0).setSubReg(AArch64::sub_32);
9798 MI.eraseFromParent();
9799 return true;
9800 }
9801 // Look for CSINC
9802 case AArch64::CSINCWr:
9803 case AArch64::CSINCXr: {
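    // Only consider csinc Wd, wzr, wzr, cc (or its 64-bit form): it
    // materializes the inverse of cc, i.e. Wd is 1 when cc does not hold and
    // 0 when it does.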
9804 if (!(DefMI->getOperand(i: 1).getReg() == AArch64::WZR &&
9805 DefMI->getOperand(i: 2).getReg() == AArch64::WZR) &&
9806 !(DefMI->getOperand(i: 1).getReg() == AArch64::XZR &&
9807 DefMI->getOperand(i: 2).getReg() == AArch64::XZR))
9808 return false;
9809
9810 if (DefMI->findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr,
9811 isDead: true) != -1)
9812 return false;
9813
9814 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(i: 3).getImm();
9815 // Convert only when the condition code is not modified between
9816 // the CSINC and the branch. The CC may be used by other
9817 // instructions in between.
9818 if (areCFlagsAccessedBetweenInstrs(From: DefMI, To: MI, TRI: &getRegisterInfo(), AccessToCheck: AK_Write))
9819 return false;
9820 MachineBasicBlock &RefToMBB = *MBB;
9821 MachineBasicBlock *TBB = MI.getOperand(i: TargetBBInMI).getMBB();
9822 DebugLoc DL = MI.getDebugLoc();
9823 if (IsNegativeBranch)
9824 CC = AArch64CC::getInvertedCondCode(Code: CC);
9825 BuildMI(BB&: RefToMBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::Bcc)).addImm(Val: CC).addMBB(MBB: TBB);
9826 MI.eraseFromParent();
9827 return true;
9828 }
9829 }
9830}
9831
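// Split target flags into the mutually exclusive MO_FRAGMENT part (MO_PAGE,
// MO_PAGEOFF, MO_G0..MO_G3, MO_HI12) and the remaining orthogonal bitmask
// flags.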
9832std::pair<unsigned, unsigned>
9833AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9834 const unsigned Mask = AArch64II::MO_FRAGMENT;
9835 return std::make_pair(x: TF & Mask, y: TF & ~Mask);
9836}
9837
9838ArrayRef<std::pair<unsigned, const char *>>
9839AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9840 using namespace AArch64II;
9841
9842 static const std::pair<unsigned, const char *> TargetFlags[] = {
9843 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
9844 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
9845 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
9846 {MO_HI12, "aarch64-hi12"}};
9847 return ArrayRef(TargetFlags);
9848}
9849
9850ArrayRef<std::pair<unsigned, const char *>>
9851AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
9852 using namespace AArch64II;
9853
9854 static const std::pair<unsigned, const char *> TargetFlags[] = {
9855 {MO_COFFSTUB, "aarch64-coffstub"},
9856 {MO_GOT, "aarch64-got"},
9857 {MO_NC, "aarch64-nc"},
9858 {MO_S, "aarch64-s"},
9859 {MO_TLS, "aarch64-tls"},
9860 {MO_DLLIMPORT, "aarch64-dllimport"},
9861 {MO_PREL, "aarch64-prel"},
9862 {MO_TAGGED, "aarch64-tagged"},
9863 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
9864 };
9865 return ArrayRef(TargetFlags);
9866}
9867
9868ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9869AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9870 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9871 {{MOSuppressPair, "aarch64-suppress-pair"},
9872 {MOStridedAccess, "aarch64-strided-access"}};
9873 return ArrayRef(TargetFlags);
9874}
9875
9876/// Constants defining how certain sequences should be outlined.
9877/// This encompasses how an outlined function should be called, and what kind of
9878/// frame should be emitted for that outlined function.
9879///
9880/// \p MachineOutlinerDefault implies that the function should be called with
9881/// a save and restore of LR to the stack.
9882///
9883/// That is,
9884///
9885/// I1 Save LR OUTLINED_FUNCTION:
9886/// I2 --> BL OUTLINED_FUNCTION I1
9887/// I3 Restore LR I2
9888/// I3
9889/// RET
9890///
9891/// * Call construction overhead: 3 (save + BL + restore)
9892/// * Frame construction overhead: 1 (ret)
9893/// * Requires stack fixups? Yes
9894///
9895/// \p MachineOutlinerTailCall implies that the function is being created from
9896/// a sequence of instructions ending in a return.
9897///
9898/// That is,
9899///
9900/// I1 OUTLINED_FUNCTION:
9901/// I2 --> B OUTLINED_FUNCTION I1
9902/// RET I2
9903/// RET
9904///
9905/// * Call construction overhead: 1 (B)
9906/// * Frame construction overhead: 0 (Return included in sequence)
9907/// * Requires stack fixups? No
9908///
9909/// \p MachineOutlinerNoLRSave implies that the function should be called using
9910/// a BL instruction, but doesn't require LR to be saved and restored. This
9911/// happens when LR is known to be dead.
9912///
9913/// That is,
9914///
9915/// I1 OUTLINED_FUNCTION:
9916/// I2 --> BL OUTLINED_FUNCTION I1
9917/// I3 I2
9918/// I3
9919/// RET
9920///
9921/// * Call construction overhead: 1 (BL)
9922/// * Frame construction overhead: 1 (RET)
9923/// * Requires stack fixups? No
9924///
9925/// \p MachineOutlinerThunk implies that the function is being created from
9926/// a sequence of instructions ending in a call. The outlined function is
9927/// called with a BL instruction, and the outlined function tail-calls the
9928/// original call destination.
9929///
9930/// That is,
9931///
9932/// I1 OUTLINED_FUNCTION:
9933/// I2 --> BL OUTLINED_FUNCTION I1
9934/// BL f I2
9935/// B f
9936/// * Call construction overhead: 1 (BL)
9937/// * Frame construction overhead: 0
9938/// * Requires stack fixups? No
9939///
9940/// \p MachineOutlinerRegSave implies that the function should be called with a
9941/// save and restore of LR to an available register. This allows us to avoid
9942/// stack fixups. Note that this outlining variant is compatible with the
9943/// NoLRSave case.
9944///
9945/// That is,
9946///
9947/// I1 Save LR OUTLINED_FUNCTION:
9948/// I2 --> BL OUTLINED_FUNCTION I1
9949/// I3 Restore LR I2
9950/// I3
9951/// RET
9952///
9953/// * Call construction overhead: 3 (save + BL + restore)
9954/// * Frame construction overhead: 1 (ret)
9955/// * Requires stack fixups? No
9956enum MachineOutlinerClass {
9957 MachineOutlinerDefault, /// Emit a save, restore, call, and return.
9958 MachineOutlinerTailCall, /// Only emit a branch.
9959 MachineOutlinerNoLRSave, /// Emit a call and return.
9960 MachineOutlinerThunk, /// Emit a call and tail-call.
9961 MachineOutlinerRegSave /// Same as default, but save to a register.
9962};
9963
9964enum MachineOutlinerMBBFlags {
9965 LRUnavailableSomewhere = 0x2,
9966 HasCalls = 0x4,
9967 UnsafeRegsDead = 0x8
9968};
9969
9970Register
9971AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
9972 MachineFunction *MF = C.getMF();
9973 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
9974 const AArch64RegisterInfo *ARI =
9975 static_cast<const AArch64RegisterInfo *>(&TRI);
9976 // Check if there is an available register across the sequence that we can
9977 // use.
9978 for (unsigned Reg : AArch64::GPR64RegClass) {
9979 if (!ARI->isReservedReg(MF: *MF, Reg) &&
9980 Reg != AArch64::LR && // LR is not reserved, but don't use it.
9981 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
9982 Reg != AArch64::X17 && // Ditto for X17.
9983 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
9984 C.isAvailableInsideSeq(Reg, TRI))
9985 return Reg;
9986 }
9987 return Register();
9988}
9989
9990static bool
9991outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
9992 const outliner::Candidate &b) {
9993 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
9994 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
9995
9996 return MFIa->getSignReturnAddressCondition() ==
9997 MFIb->getSignReturnAddressCondition();
9998}
9999
10000static bool
10001outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
10002 const outliner::Candidate &b) {
10003 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
10004 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
10005
10006 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
10007}
10008
10009static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
10010 const outliner::Candidate &b) {
10011 const AArch64Subtarget &SubtargetA =
10012 a.getMF()->getSubtarget<AArch64Subtarget>();
10013 const AArch64Subtarget &SubtargetB =
10014 b.getMF()->getSubtarget<AArch64Subtarget>();
10015 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
10016}
10017
10018std::optional<std::unique_ptr<outliner::OutlinedFunction>>
10019AArch64InstrInfo::getOutliningCandidateInfo(
10020 const MachineModuleInfo &MMI,
10021 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
10022 unsigned MinRepeats) const {
10023 unsigned SequenceSize = 0;
10024 for (auto &MI : RepeatedSequenceLocs[0])
10025 SequenceSize += getInstSizeInBytes(MI);
10026
10027 unsigned NumBytesToCreateFrame = 0;
10028
  // Avoid splitting an ADRP/ADD or ADRP/LDR pair into separate outlined
  // functions; these instructions are fused together by the scheduler. Any
  // candidate where ADRP is the last instruction should be rejected, as that
  // would split the pair.
10033 MachineInstr &LastMI = RepeatedSequenceLocs[0].back();
10034 MachineInstr &FirstMI = RepeatedSequenceLocs[0].front();
10035 if (LastMI.getOpcode() == AArch64::ADRP &&
10036 (LastMI.getOperand(i: 1).getTargetFlags() & AArch64II::MO_PAGE) != 0 &&
10037 (LastMI.getOperand(i: 1).getTargetFlags() & AArch64II::MO_GOT) != 0) {
10038 return std::nullopt;
10039 }
10040
  // Similarly, any candidate where the first instruction is an ADD/LDR with a
  // page offset should be rejected, to avoid splitting an ADRP pair.
10043 if ((FirstMI.getOpcode() == AArch64::ADDXri ||
10044 FirstMI.getOpcode() == AArch64::LDRXui) &&
10045 (FirstMI.getOperand(i: 2).getTargetFlags() & AArch64II::MO_PAGEOFF) != 0 &&
10046 (FirstMI.getOperand(i: 2).getTargetFlags() & AArch64II::MO_GOT) != 0) {
10047 return std::nullopt;
10048 }
10049
10050 // We only allow outlining for functions having exactly matching return
10051 // address signing attributes, i.e., all share the same value for the
10052 // attribute "sign-return-address" and all share the same type of key they
10053 // are signed with.
10054 // Additionally we require all functions to simultaneously either support
10055 // v8.3a features or not. Otherwise an outlined function could get signed
10056 // using dedicated v8.3 instructions and a call from a function that doesn't
10057 // support v8.3 instructions would therefore be invalid.
10058 if (std::adjacent_find(
10059 first: RepeatedSequenceLocs.begin(), last: RepeatedSequenceLocs.end(),
10060 binary_pred: [](const outliner::Candidate &a, const outliner::Candidate &b) {
10061 // Return true if a and b are non-equal w.r.t. return address
10062 // signing or support of v8.3a features
10063 if (outliningCandidatesSigningScopeConsensus(a, b) &&
10064 outliningCandidatesSigningKeyConsensus(a, b) &&
10065 outliningCandidatesV8_3OpsConsensus(a, b)) {
10066 return false;
10067 }
10068 return true;
10069 }) != RepeatedSequenceLocs.end()) {
10070 return std::nullopt;
10071 }
10072
  // Since at this point all candidates agree on their return address signing,
  // picking just one is fine. If the candidate functions potentially sign
  // their return addresses, the outlined function should do the same. Note
  // that in the case of "sign-return-address"="non-leaf" this is an
  // assumption: it is not certain that the outlined function will have to
  // sign its return address, but that decision is made later, once the
  // decision to outline has already been made.
10080 // The same holds for the number of additional instructions we need: On
10081 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
10082 // necessary. However, at this point we don't know if the outlined function
10083 // will have a RET instruction so we assume the worst.
10084 const TargetRegisterInfo &TRI = getRegisterInfo();
  // Performing a tail call may require extra checks when PAuth is enabled.
  // If PAuth is disabled, set the extra-check cost to zero for uniformity.
10087 unsigned NumBytesToCheckLRInTCEpilogue = 0;
10088 const auto RASignCondition = RepeatedSequenceLocs[0]
10089 .getMF()
10090 ->getInfo<AArch64FunctionInfo>()
10091 ->getSignReturnAddressCondition();
10092 if (RASignCondition != SignReturnAddress::None) {
10093 // One PAC and one AUT instructions
10094 NumBytesToCreateFrame += 8;
10095
10096 // PAuth is enabled - set extra tail call cost, if any.
10097 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
10098 MF: *RepeatedSequenceLocs[0].getMF());
10099 NumBytesToCheckLRInTCEpilogue =
10100 AArch64PAuth::getCheckerSizeInBytes(Method: LRCheckMethod);
10101 // Checking the authenticated LR value may significantly impact
10102 // SequenceSize, so account for it for more precise results.
10103 if (isTailCallReturnInst(MI: RepeatedSequenceLocs[0].back()))
10104 SequenceSize += NumBytesToCheckLRInTCEpilogue;
10105
    // We have to check whether SP-modifying instructions would get outlined.
    // If so, we only allow outlining if SP is unchanged overall: matching sub
    // and add instructions are okay to outline; all other SP modifications
    // are not.
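    // For example, outlining a matched "sub sp, sp, #16 ... add sp, sp, #16"
    // pair is fine, but outlining only the sub would leave SP changed.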
10110 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
10111 int SPValue = 0;
10112 for (auto &MI : C) {
10113 if (MI.modifiesRegister(Reg: AArch64::SP, TRI: &TRI)) {
10114 switch (MI.getOpcode()) {
10115 case AArch64::ADDXri:
10116 case AArch64::ADDWri:
10117 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10118 assert(MI.getOperand(2).isImm() &&
10119 "Expected operand to be immediate");
10120 assert(MI.getOperand(1).isReg() &&
10121 "Expected operand to be a register");
            // Check if the add just increments sp. If so, we search for
            // matching sub instructions that decrement sp. If not, the
            // modification is illegal.
10125 if (MI.getOperand(i: 1).getReg() == AArch64::SP)
10126 SPValue += MI.getOperand(i: 2).getImm();
10127 else
10128 return true;
10129 break;
10130 case AArch64::SUBXri:
10131 case AArch64::SUBWri:
10132 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10133 assert(MI.getOperand(2).isImm() &&
10134 "Expected operand to be immediate");
10135 assert(MI.getOperand(1).isReg() &&
10136 "Expected operand to be a register");
            // Check if the sub just decrements sp. If so, we search for
            // matching add instructions that increment sp. If not, the
            // modification is illegal.
10140 if (MI.getOperand(i: 1).getReg() == AArch64::SP)
10141 SPValue -= MI.getOperand(i: 2).getImm();
10142 else
10143 return true;
10144 break;
10145 default:
10146 return true;
10147 }
10148 }
10149 }
10150 if (SPValue)
10151 return true;
10152 return false;
10153 };
10154 // Remove candidates with illegal stack modifying instructions
10155 llvm::erase_if(C&: RepeatedSequenceLocs, P: hasIllegalSPModification);
10156
10157 // If the sequence doesn't have enough candidates left, then we're done.
10158 if (RepeatedSequenceLocs.size() < MinRepeats)
10159 return std::nullopt;
10160 }
10161
10162 // Properties about candidate MBBs that hold for all of them.
10163 unsigned FlagsSetInAll = 0xF;
10164
10165 // Compute liveness information for each candidate, and set FlagsSetInAll.
10166 for (outliner::Candidate &C : RepeatedSequenceLocs)
10167 FlagsSetInAll &= C.Flags;
10168
10169 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
10170
10171 // Helper lambda which sets call information for every candidate.
10172 auto SetCandidateCallInfo =
10173 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
10174 for (outliner::Candidate &C : RepeatedSequenceLocs)
10175 C.setCallInfo(CID: CallID, CO: NumBytesForCall);
10176 };
10177
10178 unsigned FrameID = MachineOutlinerDefault;
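  // Assume the default frame for now; its frame construction overhead is a
  // single 4-byte RET (see the MachineOutlinerClass description above).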
10179 NumBytesToCreateFrame += 4;
10180
10181 bool HasBTI = any_of(Range&: RepeatedSequenceLocs, P: [](outliner::Candidate &C) {
10182 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
10183 });
10184
  // We check to see if CFI instructions are present, and if they are, we
  // count how many occur in the candidate sequence.
10187 unsigned CFICount = 0;
10188 for (auto &I : RepeatedSequenceLocs[0]) {
10189 if (I.isCFIInstruction())
10190 CFICount++;
10191 }
10192
10193 // We compare the number of found CFI Instructions to the number of CFI
10194 // instructions in the parent function for each candidate. We must check this
10195 // since if we outline one of the CFI instructions in a function, we have to
10196 // outline them all for correctness. If we do not, the address offsets will be
10197 // incorrect between the two sections of the program.
10198 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10199 std::vector<MCCFIInstruction> CFIInstructions =
10200 C.getMF()->getFrameInstructions();
10201
10202 if (CFICount > 0 && CFICount != CFIInstructions.size())
10203 return std::nullopt;
10204 }
10205
  // Returns true if an instruction is safe to fix up, false otherwise.
10207 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
10208 if (MI.isCall())
10209 return true;
10210
10211 if (!MI.modifiesRegister(Reg: AArch64::SP, TRI: &TRI) &&
10212 !MI.readsRegister(Reg: AArch64::SP, TRI: &TRI))
10213 return true;
10214
10215 // Any modification of SP will break our code to save/restore LR.
10216 // FIXME: We could handle some instructions which add a constant
10217 // offset to SP, with a bit more work.
10218 if (MI.modifiesRegister(Reg: AArch64::SP, TRI: &TRI))
10219 return false;
10220
10221 // At this point, we have a stack instruction that we might need to
10222 // fix up. We'll handle it if it's a load or store.
10223 if (MI.mayLoadOrStore()) {
10224 const MachineOperand *Base; // Filled with the base operand of MI.
10225 int64_t Offset; // Filled with the offset of MI.
10226 bool OffsetIsScalable;
10227
10228 // Does it allow us to offset the base operand and is the base the
10229 // register SP?
10230 if (!getMemOperandWithOffset(MI, BaseOp&: Base, Offset, OffsetIsScalable, TRI: &TRI) ||
10231 !Base->isReg() || Base->getReg() != AArch64::SP)
10232 return false;
10233
      // The fix-up code below assumes byte offsets.
10235 if (OffsetIsScalable)
10236 return false;
10237
10238 // Find the minimum/maximum offset for this instruction and check
10239 // if fixing it up would be in range.
10240 int64_t MinOffset,
10241 MaxOffset; // Unscaled offsets for the instruction.
10242 // The scale to multiply the offsets by.
10243 TypeSize Scale(0U, false), DummyWidth(0U, false);
10244 getMemOpInfo(Opcode: MI.getOpcode(), Scale, Width&: DummyWidth, MinOffset, MaxOffset);
10245
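      // The outlined frame saves LR on the stack, so SP inside the outlined
      // body sits 16 bytes lower than at the original candidate site.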
10246 Offset += 16; // Update the offset to what it would be if we outlined.
10247 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
10248 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
10249 return false;
10250
10251 // It's in range, so we can outline it.
10252 return true;
10253 }
10254
10255 // FIXME: Add handling for instructions like "add x0, sp, #8".
10256
10257 // We can't fix it up, so don't outline it.
10258 return false;
10259 };
10260
10261 // True if it's possible to fix up each stack instruction in this sequence.
10262 // Important for frames/call variants that modify the stack.
10263 bool AllStackInstrsSafe =
10264 llvm::all_of(Range&: RepeatedSequenceLocs[0], P: IsSafeToFixup);
10265
10266 // If the last instruction in any candidate is a terminator, then we should
10267 // tail call all of the candidates.
10268 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10269 FrameID = MachineOutlinerTailCall;
10270 NumBytesToCreateFrame = 0;
10271 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
10272 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
10273 }
10274
10275 else if (LastInstrOpcode == AArch64::BL ||
10276 ((LastInstrOpcode == AArch64::BLR ||
10277 LastInstrOpcode == AArch64::BLRNoIP) &&
10278 !HasBTI)) {
10279 // FIXME: Do we need to check if the code after this uses the value of LR?
10280 FrameID = MachineOutlinerThunk;
10281 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
10282 SetCandidateCallInfo(MachineOutlinerThunk, 4);
10283 }
10284
10285 else {
10286 // We need to decide how to emit calls + frames. We can always emit the same
10287 // frame if we don't need to save to the stack. If we have to save to the
10288 // stack, then we need a different frame.
10289 unsigned NumBytesNoStackCalls = 0;
10290 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
10291
10292 // Check if we have to save LR.
10293 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10294 bool LRAvailable =
10295 (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere)
10296 ? C.isAvailableAcrossAndOutOfSeq(Reg: AArch64::LR, TRI)
10297 : true;
10298 // If we have a noreturn caller, then we're going to be conservative and
10299 // say that we have to save LR. If we don't have a ret at the end of the
10300 // block, then we can't reason about liveness accurately.
10301 //
10302 // FIXME: We can probably do better than always disabling this in
10303 // noreturn functions by fixing up the liveness info.
10304 bool IsNoReturn =
10305 C.getMF()->getFunction().hasFnAttribute(Kind: Attribute::NoReturn);
10306
10307 // Is LR available? If so, we don't need a save.
10308 if (LRAvailable && !IsNoReturn) {
10309 NumBytesNoStackCalls += 4;
10310 C.setCallInfo(CID: MachineOutlinerNoLRSave, CO: 4);
10311 CandidatesWithoutStackFixups.push_back(x: C);
10312 }
10313
10314 // Is an unused register available? If so, we won't modify the stack, so
10315 // we can outline with the same frame type as those that don't save LR.
10316 else if (findRegisterToSaveLRTo(C)) {
10317 NumBytesNoStackCalls += 12;
10318 C.setCallInfo(CID: MachineOutlinerRegSave, CO: 12);
10319 CandidatesWithoutStackFixups.push_back(x: C);
10320 }
10321
10322 // Is SP used in the sequence at all? If not, we don't have to modify
10323 // the stack, so we are guaranteed to get the same frame.
10324 else if (C.isAvailableInsideSeq(Reg: AArch64::SP, TRI)) {
10325 NumBytesNoStackCalls += 12;
10326 C.setCallInfo(CID: MachineOutlinerDefault, CO: 12);
10327 CandidatesWithoutStackFixups.push_back(x: C);
10328 }
10329
      // If we outline this, we need to modify the stack. Pretend we don't
      // outline this by counting all of its bytes as call overhead.
10332 else {
10333 NumBytesNoStackCalls += SequenceSize;
10334 }
10335 }
10336
10337 // If there are no places where we have to save LR, then note that we
10338 // don't have to update the stack. Otherwise, give every candidate the
10339 // default call type, as long as it's safe to do so.
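    // That is: prefer the no-fixup candidates whenever the total cost of
    // their calls is no worse than giving every candidate a default 12-byte
    // call.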
10340 if (!AllStackInstrsSafe ||
10341 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
10342 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
10343 FrameID = MachineOutlinerNoLRSave;
10344 if (RepeatedSequenceLocs.size() < MinRepeats)
10345 return std::nullopt;
10346 } else {
10347 SetCandidateCallInfo(MachineOutlinerDefault, 12);
10348
10349 // Bugzilla ID: 46767
10350 // TODO: Check if fixing up the stack more than once is safe so we can
10351 // outline these.
10352 //
10353 // An outline resulting in a caller that requires stack fixups at the
10354 // callsite to a callee that also requires stack fixups can happen when
10355 // there are no available registers at the candidate callsite for a
10356 // candidate that itself also has calls.
10357 //
    // In other words: if function_containing_sequence in the following
    // pseudo-assembly requires that we save LR at the point of the call, but
    // there are no available registers, then we save using SP, and as a
    // result the SP offsets require stack fixups by multiples of 16.
10362 //
10363 // function_containing_sequence:
10364 // ...
10365 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10366 // call OUTLINED_FUNCTION_N
10367 // restore LR from SP
10368 // ...
10369 //
10370 // OUTLINED_FUNCTION_N:
10371 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10372 // ...
10373 // bl foo
10374 // restore LR from SP
10375 // ret
10376 //
10377 // Because the code to handle more than one stack fixup does not
10378 // currently have the proper checks for legality, these cases will assert
10379 // in the AArch64 MachineOutliner. This is because the code to do this
10380 // needs more hardening, testing, better checks that generated code is
10381 // legal, etc and because it is only verified to handle a single pass of
10382 // stack fixup.
10383 //
10384 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
10385 // these cases until they are known to be handled. Bugzilla 46767 is
10386 // referenced in comments at the assert site.
10387 //
    // To avoid asserting (or generating illegal code on no-assert builds),
    // we remove all candidates which would need more than one stack fixup by
    // pruning the cases where the candidate has calls while also having no
    // available LR and no available general purpose registers to copy LR to
    // (i.e. one extra stack save/restore).
10393 //
10394 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10395 erase_if(C&: RepeatedSequenceLocs, P: [this, &TRI](outliner::Candidate &C) {
10396 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
10397 return (llvm::any_of(Range&: C, P: IsCall)) &&
10398 (!C.isAvailableAcrossAndOutOfSeq(Reg: AArch64::LR, TRI) ||
10399 !findRegisterToSaveLRTo(C));
10400 });
10401 }
10402 }
10403
10404 // If we dropped all of the candidates, bail out here.
10405 if (RepeatedSequenceLocs.size() < MinRepeats)
10406 return std::nullopt;
10407 }
10408
10409 // Does every candidate's MBB contain a call? If so, then we might have a call
10410 // in the range.
10411 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10412 // Check if the range contains a call. These require a save + restore of the
10413 // link register.
10414 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
10415 bool ModStackToSaveLR = false;
10416 if (any_of(Range: drop_end(RangeOrContainer&: FirstCand),
10417 P: [](const MachineInstr &MI) { return MI.isCall(); }))
10418 ModStackToSaveLR = true;
10419
    // Handle the last instruction separately. If this is a tail call, then
    // the last instruction is a call. We don't want to save + restore in
    // this case. However, it is possible that the last instruction is a call
    // without it being valid to tail call this sequence. We should consider
    // this as well.
10425 else if (FrameID != MachineOutlinerThunk &&
10426 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
10427 ModStackToSaveLR = true;
10428
10429 if (ModStackToSaveLR) {
10430 // We can't fix up the stack. Bail out.
10431 if (!AllStackInstrsSafe)
10432 return std::nullopt;
10433
10434 // Save + restore LR.
10435 NumBytesToCreateFrame += 8;
10436 }
10437 }
10438
  // If we have CFI instructions, we can only outline if the outlined section
  // can be a tail call.
10441 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
10442 return std::nullopt;
10443
10444 return std::make_unique<outliner::OutlinedFunction>(
10445 args&: RepeatedSequenceLocs, args&: SequenceSize, args&: NumBytesToCreateFrame, args&: FrameID);
10446}
10447
10448void AArch64InstrInfo::mergeOutliningCandidateAttributes(
10449 Function &F, std::vector<outliner::Candidate> &Candidates) const {
  // If a bunch of candidates reach this point, they must agree on their
  // return address signing. It is therefore enough to just consider the
  // signing behaviour of one of them.
10453 const auto &CFn = Candidates.front().getMF()->getFunction();
10454
10455 if (CFn.hasFnAttribute(Kind: "ptrauth-returns"))
10456 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "ptrauth-returns"));
10457 if (CFn.hasFnAttribute(Kind: "ptrauth-auth-traps"))
10458 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "ptrauth-auth-traps"));
10459 // Since all candidates belong to the same module, just copy the
10460 // function-level attributes of an arbitrary function.
10461 if (CFn.hasFnAttribute(Kind: "sign-return-address"))
10462 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "sign-return-address"));
10463 if (CFn.hasFnAttribute(Kind: "sign-return-address-key"))
10464 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "sign-return-address-key"));
10465
10466 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
10467}
10468
10469bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
10470 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
10471 const Function &F = MF.getFunction();
10472
10473 // Can F be deduplicated by the linker? If it can, don't outline from it.
10474 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10475 return false;
10476
10477 // Don't outline from functions with section markings; the program could
10478 // expect that all the code is in the named section.
10479 // FIXME: Allow outlining from multiple functions with the same section
10480 // marking.
10481 if (F.hasSection())
10482 return false;
10483
10484 // Outlining from functions with redzones is unsafe since the outliner may
10485 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
10486 // outline from it.
10487 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
10488 if (!AFI || AFI->hasRedZone().value_or(u: true))
10489 return false;
10490
10491 // FIXME: Determine whether it is safe to outline from functions which contain
10492 // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
10493 // outlined together and ensure it is safe to outline with async unwind info,
10494 // required for saving & restoring VG around calls.
10495 if (AFI->hasStreamingModeChanges())
10496 return false;
10497
10498 // FIXME: Teach the outliner to generate/handle Windows unwind info.
10499 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
10500 return false;
10501
10502 // It's safe to outline from MF.
10503 return true;
10504}
10505
10506SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
10507AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
10508 unsigned &Flags) const {
10509 assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
10510 "Must track liveness!");
10511 SmallVector<
10512 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
10513 Ranges;
10514 // According to the AArch64 Procedure Call Standard, the following are
10515 // undefined on entry/exit from a function call:
10516 //
10517 // * Registers x16, x17, (and thus w16, w17)
10518 // * Condition codes (and thus the NZCV register)
10519 //
10520 // If any of these registers are used inside or live across an outlined
10521 // function, then they may be modified later, either by the compiler or
10522 // some other tool (like the linker).
10523 //
10524 // To avoid outlining in these situations, partition each block into ranges
10525 // where these registers are dead. We will only outline from those ranges.
10526 LiveRegUnits LRU(getRegisterInfo());
10527 auto AreAllUnsafeRegsDead = [&LRU]() {
10528 return LRU.available(Reg: AArch64::W16) && LRU.available(Reg: AArch64::W17) &&
10529 LRU.available(Reg: AArch64::NZCV);
10530 };
10531
10532 // We need to know if LR is live across an outlining boundary later on in
10533 // order to decide how we'll create the outlined call, frame, etc.
10534 //
10535 // It's pretty expensive to check this for *every candidate* within a block.
10536 // That's some potentially n^2 behaviour, since in the worst case, we'd need
10537 // to compute liveness from the end of the block for O(n) candidates within
10538 // the block.
10539 //
10540 // So, to improve the average case, let's keep track of liveness from the end
10541 // of the block to the beginning of *every outlinable range*. If we know that
10542 // LR is available in every range we could outline from, then we know that
10543 // we don't need to check liveness for any candidate within that range.
10544 bool LRAvailableEverywhere = true;
10545 // Compute liveness bottom-up.
10546 LRU.addLiveOuts(MBB);
10547 // Update flags that require info about the entire MBB.
10548 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
10549 if (MI.isCall() && !MI.isTerminator())
10550 Flags |= MachineOutlinerMBBFlags::HasCalls;
10551 };
10552 // Range: [RangeBegin, RangeEnd)
10553 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
10554 unsigned RangeLen;
10555 auto CreateNewRangeStartingAt =
10556 [&RangeBegin, &RangeEnd,
10557 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
10558 RangeBegin = NewBegin;
10559 RangeEnd = std::next(x: RangeBegin);
10560 RangeLen = 0;
10561 };
10562 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
    // Save the range we have been building, if any. Only do so if it is long
    // enough to outline from and does not cross a bundle boundary; the saved
    // range is [RangeBegin, RangeEnd).
10566 if (RangeLen <= 1)
10567 return;
10568 if (!RangeBegin.isEnd() && RangeBegin->isBundledWithPred())
10569 return;
10570 if (!RangeEnd.isEnd() && RangeEnd->isBundledWithPred())
10571 return;
10572 Ranges.emplace_back(Args&: RangeBegin, Args&: RangeEnd);
10573 };
10574 // Find the first point where all unsafe registers are dead.
10575 // FIND: <safe instr> <-- end of first potential range
10576 // SKIP: <unsafe def>
10577 // SKIP: ... everything between ...
10578 // SKIP: <unsafe use>
10579 auto FirstPossibleEndPt = MBB.instr_rbegin();
10580 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
10581 LRU.stepBackward(MI: *FirstPossibleEndPt);
10582 // Update flags that impact how we outline across the entire block,
10583 // regardless of safety.
10584 UpdateWholeMBBFlags(*FirstPossibleEndPt);
10585 if (AreAllUnsafeRegsDead())
10586 break;
10587 }
10588 // If we exhausted the entire block, we have no safe ranges to outline.
10589 if (FirstPossibleEndPt == MBB.instr_rend())
10590 return Ranges;
10591 // Current range.
10592 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
  // FirstPossibleEndPt points to the first place where all unsafe registers
  // are dead (if there is any such point). Begin partitioning the MBB into
  // ranges.
10596 for (auto &MI : make_range(x: FirstPossibleEndPt, y: MBB.instr_rend())) {
10597 LRU.stepBackward(MI);
10598 UpdateWholeMBBFlags(MI);
10599 if (!AreAllUnsafeRegsDead()) {
10600 SaveRangeIfNonEmpty();
10601 CreateNewRangeStartingAt(MI.getIterator());
10602 continue;
10603 }
10604 LRAvailableEverywhere &= LRU.available(Reg: AArch64::LR);
10605 RangeBegin = MI.getIterator();
10606 ++RangeLen;
10607 }
  // The above loop misses the last (or only) range. If we are still safe,
  // save that range too.
10610 if (AreAllUnsafeRegsDead())
10611 SaveRangeIfNonEmpty();
10612 if (Ranges.empty())
10613 return Ranges;
  // We found the ranges bottom-up, but the mapping expects them top-down.
  // Reverse the order.
10616 std::reverse(first: Ranges.begin(), last: Ranges.end());
10617 // If there is at least one outlinable range where LR is unavailable
10618 // somewhere, remember that.
10619 if (!LRAvailableEverywhere)
10620 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
10621 return Ranges;
10622}
10623
10624outliner::InstrType
10625AArch64InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
10626 MachineBasicBlock::iterator &MIT,
10627 unsigned Flags) const {
10628 MachineInstr &MI = *MIT;
10629
  // Don't outline anything used for return address signing. The outlined
  // function will get signed later if needed.
10632 switch (MI.getOpcode()) {
10633 case AArch64::PACM:
10634 case AArch64::PACIASP:
10635 case AArch64::PACIBSP:
10636 case AArch64::PACIASPPC:
10637 case AArch64::PACIBSPPC:
10638 case AArch64::AUTIASP:
10639 case AArch64::AUTIBSP:
10640 case AArch64::AUTIASPPCi:
10641 case AArch64::AUTIASPPCr:
10642 case AArch64::AUTIBSPPCi:
10643 case AArch64::AUTIBSPPCr:
10644 case AArch64::RETAA:
10645 case AArch64::RETAB:
10646 case AArch64::RETAASPPCi:
10647 case AArch64::RETAASPPCr:
10648 case AArch64::RETABSPPCi:
10649 case AArch64::RETABSPPCr:
10650 case AArch64::EMITBKEY:
10651 case AArch64::PAUTH_PROLOGUE:
10652 case AArch64::PAUTH_EPILOGUE:
10653 return outliner::InstrType::Illegal;
10654 }
10655
  // We can only outline these if we will tail call the outlined function, or
  // fix up the CFI offsets. Currently, CFI instructions are outlined only
  // when the outlined function is tail called.
10659 //
10660 // FIXME: If the proper fixups for the offset are implemented, this should be
10661 // possible.
10662 if (MI.isCFIInstruction())
10663 return outliner::InstrType::Legal;
10664
10665 // Is this a terminator for a basic block?
10666 if (MI.isTerminator())
10667 // TargetInstrInfo::getOutliningType has already filtered out anything
10668 // that would break this, so we can allow it here.
10669 return outliner::InstrType::Legal;
10670
10671 // Make sure none of the operands are un-outlinable.
10672 for (const MachineOperand &MOP : MI.operands()) {
10673 // A check preventing CFI indices was here before, but only CFI
10674 // instructions should have those.
10675 assert(!MOP.isCFIIndex());
10676
10677 // If it uses LR or W30 explicitly, then don't touch it.
10678 if (MOP.isReg() && !MOP.isImplicit() &&
10679 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
10680 return outliner::InstrType::Illegal;
10681 }
10682
  // Special cases for instructions that can always be outlined, but will fail
  // the later tests. E.g. ADRPs are PC-relative and may use LR as their
  // destination, but can always be outlined because they don't require a
  // *specific* value to be in LR.
10686 if (MI.getOpcode() == AArch64::ADRP)
10687 return outliner::InstrType::Legal;
10688
10689 // If MI is a call we might be able to outline it. We don't want to outline
10690 // any calls that rely on the position of items on the stack. When we outline
10691 // something containing a call, we have to emit a save and restore of LR in
10692 // the outlined function. Currently, this always happens by saving LR to the
10693 // stack. Thus, if we outline, say, half the parameters for a function call
10694 // plus the call, then we'll break the callee's expectations for the layout
10695 // of the stack.
10696 //
10697 // FIXME: Allow calls to functions which construct a stack frame, as long
10698 // as they don't access arguments on the stack.
10699 // FIXME: Figure out some way to analyze functions defined in other modules.
10700 // We should be able to compute the memory usage based on the IR calling
10701 // convention, even if we can't see the definition.
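  //
  // For instance (hypothetical), outlining both instructions of
  //   STRXui $x0, $sp, 0   ; pass an argument on the stack
  //   BL @callee
  // would make the outlined function push LR first, so at the BL the callee
  // would find the saved LR at [sp] instead of its stack argument.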
10702 if (MI.isCall()) {
10703 // Get the function associated with the call. Look at each operand and find
10704 // the one that represents the callee and get its name.
10705 const Function *Callee = nullptr;
10706 for (const MachineOperand &MOP : MI.operands()) {
10707 if (MOP.isGlobal()) {
10708 Callee = dyn_cast<Function>(Val: MOP.getGlobal());
10709 break;
10710 }
10711 }
10712
10713 // Never outline calls to mcount. There isn't any rule that would require
10714 // this, but the Linux kernel's "ftrace" feature depends on it.
10715 if (Callee && Callee->getName() == "\01_mcount")
10716 return outliner::InstrType::Illegal;
10717
10718 // If we don't know anything about the callee, assume it depends on the
10719 // stack layout of the caller. In that case, it's only legal to outline
10720 // as a tail-call. Explicitly list the call instructions we know about so we
10721 // don't get unexpected results with call pseudo-instructions.
10722 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
10723 if (MI.getOpcode() == AArch64::BLR ||
10724 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
10725 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
10726
10727 if (!Callee)
10728 return UnknownCallOutlineType;
10729
    // We have a function we have information about. Check if it's something
    // we can safely outline.
10732 MachineFunction *CalleeMF = MMI.getMachineFunction(F: *Callee);
10733
10734 // We don't know what's going on with the callee at all. Don't touch it.
10735 if (!CalleeMF)
10736 return UnknownCallOutlineType;
10737
10738 // Check if we know anything about the callee saves on the function. If we
10739 // don't, then don't touch it, since that implies that we haven't
10740 // computed anything about its stack frame yet.
10741 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
10742 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
10743 MFI.getNumObjects() > 0)
10744 return UnknownCallOutlineType;
10745
    // At this point, we can say that CalleeMF ought not to pass anything on
    // the stack. Therefore, we can outline it.
10748 return outliner::InstrType::Legal;
10749 }
10750
10751 // Don't touch the link register or W30.
10752 if (MI.readsRegister(Reg: AArch64::W30, TRI: &getRegisterInfo()) ||
10753 MI.modifiesRegister(Reg: AArch64::W30, TRI: &getRegisterInfo()))
10754 return outliner::InstrType::Illegal;
10755
10756 // Don't outline BTI instructions, because that will prevent the outlining
10757 // site from being indirectly callable.
10758 if (hasBTISemantics(MI))
10759 return outliner::InstrType::Illegal;
10760
10761 return outliner::InstrType::Legal;
10762}
10763
10764void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
10765 for (MachineInstr &MI : MBB) {
10766 const MachineOperand *Base;
10767 TypeSize Width(0, false);
10768 int64_t Offset;
10769 bool OffsetIsScalable;
10770
10771 // Is this a load or store with an immediate offset with SP as the base?
10772 if (!MI.mayLoadOrStore() ||
10773 !getMemOperandWithOffsetWidth(LdSt: MI, BaseOp&: Base, Offset, OffsetIsScalable, Width,
10774 TRI: &RI) ||
10775 (Base->isReg() && Base->getReg() != AArch64::SP))
10776 continue;
10777
10778 // It is, so we have to fix it up.
10779 TypeSize Scale(0U, false);
10780 int64_t Dummy1, Dummy2;
10781
10782 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(LdSt&: MI);
10783 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
10784 getMemOpInfo(Opcode: MI.getOpcode(), Scale, Width, MinOffset&: Dummy1, MaxOffset&: Dummy2);
10785 assert(Scale != 0 && "Unexpected opcode!");
10786 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
10787
10788 // We've pushed the return address to the stack, so add 16 to the offset.
10789 // This is safe, since we already checked if it would overflow when we
10790 // checked if this instruction was legal to outline.
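    // E.g. "LDRXui $x0, $sp, 2" (scale 8, byte offset 16) becomes
    // "LDRXui $x0, $sp, 4", since (16 + 16) / 8 == 4.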
10791 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
10792 StackOffsetOperand.setImm(NewImm);
10793 }
10794}
10795
10796static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
10797 const AArch64InstrInfo *TII,
10798 bool ShouldSignReturnAddr) {
10799 if (!ShouldSignReturnAddr)
10800 return;
10801
10802 BuildMI(BB&: MBB, I: MBB.begin(), MIMD: DebugLoc(), MCID: TII->get(Opcode: AArch64::PAUTH_PROLOGUE))
10803 .setMIFlag(MachineInstr::FrameSetup);
10804 BuildMI(BB&: MBB, I: MBB.getFirstInstrTerminator(), MIMD: DebugLoc(),
10805 MCID: TII->get(Opcode: AArch64::PAUTH_EPILOGUE))
10806 .setMIFlag(MachineInstr::FrameDestroy);
10807}
10808
10809void AArch64InstrInfo::buildOutlinedFrame(
10810 MachineBasicBlock &MBB, MachineFunction &MF,
10811 const outliner::OutlinedFunction &OF) const {
10812
10813 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
10814
10815 if (OF.FrameConstructionID == MachineOutlinerTailCall)
10816 FI->setOutliningStyle("Tail Call");
10817 else if (OF.FrameConstructionID == MachineOutlinerThunk) {
10818 // For thunk outlining, rewrite the last instruction from a call to a
10819 // tail-call.
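    // E.g. a trailing "BL @callee" becomes "TCRETURNdi @callee, 0".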
10820 MachineInstr *Call = &*--MBB.instr_end();
10821 unsigned TailOpcode;
10822 if (Call->getOpcode() == AArch64::BL) {
10823 TailOpcode = AArch64::TCRETURNdi;
10824 } else {
10825 assert(Call->getOpcode() == AArch64::BLR ||
10826 Call->getOpcode() == AArch64::BLRNoIP);
10827 TailOpcode = AArch64::TCRETURNriALL;
10828 }
10829 MachineInstr *TC = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: TailOpcode))
10830 .add(MO: Call->getOperand(i: 0))
10831 .addImm(Val: 0);
10832 MBB.insert(I: MBB.end(), MI: TC);
10833 Call->eraseFromParent();
10834
10835 FI->setOutliningStyle("Thunk");
10836 }
10837
10838 bool IsLeafFunction = true;
10839
10840 // Is there a call in the outlined range?
10841 auto IsNonTailCall = [](const MachineInstr &MI) {
10842 return MI.isCall() && !MI.isReturn();
10843 };
10844
10845 if (llvm::any_of(Range: MBB.instrs(), P: IsNonTailCall)) {
10846 // Fix up the instructions in the range, since we're going to modify the
10847 // stack.
10848
10849 // Bugzilla ID: 46767
10850 // TODO: Check if fixing up twice is safe so we can outline these.
10851 assert(OF.FrameConstructionID != MachineOutlinerDefault &&
10852 "Can only fix up stack references once");
10853 fixupPostOutline(MBB);
10854
10855 IsLeafFunction = false;
10856
10857 // LR has to be a live in so that we can save it.
10858 if (!MBB.isLiveIn(Reg: AArch64::LR))
10859 MBB.addLiveIn(PhysReg: AArch64::LR);
10860
10861 MachineBasicBlock::iterator It = MBB.begin();
10862 MachineBasicBlock::iterator Et = MBB.end();
10863
10864 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
10865 OF.FrameConstructionID == MachineOutlinerThunk)
10866 Et = std::prev(x: MBB.end());
10867
10868 // Insert a save before the outlined region
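    // This is "str x30, [sp, #-16]!" in assembly.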
10869 MachineInstr *STRXpre = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::STRXpre))
10870 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
10871 .addReg(RegNo: AArch64::LR)
10872 .addReg(RegNo: AArch64::SP)
10873 .addImm(Val: -16);
10874 It = MBB.insert(I: It, MI: STRXpre);
10875
10876 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
10877 CFIInstBuilder CFIBuilder(MBB, It, MachineInstr::FrameSetup);
10878
10879 // Add a CFI saying the stack was moved 16 B down.
10880 CFIBuilder.buildDefCFAOffset(Offset: 16);
10881
10882 // Add a CFI saying that the LR that we want to find is now 16 B higher
10883 // than before.
10884 CFIBuilder.buildOffset(Reg: AArch64::LR, Offset: -16);
10885 }
10886
10887 // Insert a restore before the terminator for the function.
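    // This is "ldr x30, [sp], #16" in assembly.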
10888 MachineInstr *LDRXpost = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::LDRXpost))
10889 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
10890 .addReg(RegNo: AArch64::LR, Flags: RegState::Define)
10891 .addReg(RegNo: AArch64::SP)
10892 .addImm(Val: 16);
10893 Et = MBB.insert(I: Et, MI: LDRXpost);
10894 }
10895
10896 auto RASignCondition = FI->getSignReturnAddressCondition();
10897 bool ShouldSignReturnAddr = AArch64FunctionInfo::shouldSignReturnAddress(
10898 Condition: RASignCondition, IsLRSpilled: !IsLeafFunction);
10899
10900 // If this is a tail call outlined function, then there's already a return.
10901 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
10902 OF.FrameConstructionID == MachineOutlinerThunk) {
10903 signOutlinedFunction(MF, MBB, TII: this, ShouldSignReturnAddr);
10904 return;
10905 }
10906
10907 // It's not a tail call, so we have to insert the return ourselves.
10908
10909 // LR has to be a live in so that we can return to it.
10910 if (!MBB.isLiveIn(Reg: AArch64::LR))
10911 MBB.addLiveIn(PhysReg: AArch64::LR);
10912
10913 MachineInstr *ret = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::RET))
10914 .addReg(RegNo: AArch64::LR);
10915 MBB.insert(I: MBB.end(), MI: ret);
10916
10917 signOutlinedFunction(MF, MBB, TII: this, ShouldSignReturnAddr);
10918
10919 FI->setOutliningStyle("Function");
10920
10921 // Did we have to modify the stack by saving the link register?
10922 if (OF.FrameConstructionID != MachineOutlinerDefault)
10923 return;
10924
10925 // We modified the stack.
10926 // Walk over the basic block and fix up all the stack accesses.
10927 fixupPostOutline(MBB);
10928}
10929
10930MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
10931 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
10932 MachineFunction &MF, outliner::Candidate &C) const {
10933
10934 // Are we tail calling?
10935 if (C.CallConstructionID == MachineOutlinerTailCall) {
10936 // If yes, then we can just branch to the label.
10937 It = MBB.insert(I: It, MI: BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::TCRETURNdi))
10938 .addGlobalAddress(GV: M.getNamedValue(Name: MF.getName()))
10939 .addImm(Val: 0));
10940 return It;
10941 }
10942
10943 // Are we saving the link register?
10944 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
10945 C.CallConstructionID == MachineOutlinerThunk) {
10946 // No, so just insert the call.
10947 It = MBB.insert(I: It, MI: BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::BL))
10948 .addGlobalAddress(GV: M.getNamedValue(Name: MF.getName())));
10949 return It;
10950 }
10951
10952 // We want to return the spot where we inserted the call.
10953 MachineBasicBlock::iterator CallPt;
10954
10955 // Instructions for saving and restoring LR around the call instruction we're
10956 // going to insert.
10957 MachineInstr *Save;
10958 MachineInstr *Restore;
10959 // Can we save to a register?
10960 if (C.CallConstructionID == MachineOutlinerRegSave) {
10961 // FIXME: This logic should be sunk into a target-specific interface so that
10962 // we don't have to recompute the register.
10963 Register Reg = findRegisterToSaveLRTo(C);
10964 assert(Reg && "No callee-saved register available?");
10965
10966 // LR has to be a live in so that we can save it.
10967 if (!MBB.isLiveIn(Reg: AArch64::LR))
10968 MBB.addLiveIn(PhysReg: AArch64::LR);
10969
10970 // Save and restore LR from Reg.
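    // ORRXrs with XZR is the canonical register move, i.e. "mov Reg, x30"
    // and "mov x30, Reg".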
10971 Save = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::ORRXrs), DestReg: Reg)
10972 .addReg(RegNo: AArch64::XZR)
10973 .addReg(RegNo: AArch64::LR)
10974 .addImm(Val: 0);
10975 Restore = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::ORRXrs), DestReg: AArch64::LR)
10976 .addReg(RegNo: AArch64::XZR)
10977 .addReg(RegNo: Reg)
10978 .addImm(Val: 0);
10979 } else {
10980 // We have the default case. Save and restore from SP.
10981 Save = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::STRXpre))
10982 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
10983 .addReg(RegNo: AArch64::LR)
10984 .addReg(RegNo: AArch64::SP)
10985 .addImm(Val: -16);
10986 Restore = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::LDRXpost))
10987 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
10988 .addReg(RegNo: AArch64::LR, Flags: RegState::Define)
10989 .addReg(RegNo: AArch64::SP)
10990 .addImm(Val: 16);
10991 }
10992
10993 It = MBB.insert(I: It, MI: Save);
10994 It++;
10995
10996 // Insert the call.
10997 It = MBB.insert(I: It, MI: BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::BL))
10998 .addGlobalAddress(GV: M.getNamedValue(Name: MF.getName())));
10999 CallPt = It;
11000 It++;
11001
11002 It = MBB.insert(I: It, MI: Restore);
11003 return CallPt;
11004}
11005
11006bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
11007 MachineFunction &MF) const {
11008 return MF.getFunction().hasMinSize();
11009}
11010
11011void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
11012 MachineBasicBlock::iterator Iter,
11013 DebugLoc &DL,
11014 bool AllowSideEffects) const {
11015 const MachineFunction &MF = *MBB.getParent();
11016 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
11017 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
11018
11019 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
11020 BuildMI(BB&: MBB, I: Iter, MIMD: DL, MCID: get(Opcode: AArch64::MOVZXi), DestReg: Reg).addImm(Val: 0).addImm(Val: 0);
11021 } else if (STI.isSVEorStreamingSVEAvailable()) {
11022 BuildMI(BB&: MBB, I: Iter, MIMD: DL, MCID: get(Opcode: AArch64::DUP_ZI_D), DestReg: Reg)
11023 .addImm(Val: 0)
11024 .addImm(Val: 0);
11025 } else if (STI.isNeonAvailable()) {
11026 BuildMI(BB&: MBB, I: Iter, MIMD: DL, MCID: get(Opcode: AArch64::MOVIv2d_ns), DestReg: Reg)
11027 .addImm(Val: 0);
11028 } else {
    // This is a streaming-compatible function without SVE. We don't have full
    // Neon (just FPRs), so we can at most use the first 64-bit sub-register.
    // Since `movi v..` would be illegal here, use `fmov d..` instead.
11032 assert(STI.hasNEON() && "Expected to have NEON.");
11033 Register Reg64 = TRI.getSubReg(Reg, Idx: AArch64::dsub);
11034 BuildMI(BB&: MBB, I: Iter, MIMD: DL, MCID: get(Opcode: AArch64::FMOVD0), DestReg: Reg64);
11035 }
11036}
11037
11038std::optional<DestSourcePair>
11039AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
11040
11041 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg
11042 // and zero immediate operands used as an alias for mov instruction.
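  // E.g. "ORRWrs $w0, $wzr, $w1, 0" is the canonical form of "mov w0, w1".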
11043 if (((MI.getOpcode() == AArch64::ORRWrs &&
11044 MI.getOperand(i: 1).getReg() == AArch64::WZR &&
11045 MI.getOperand(i: 3).getImm() == 0x0) ||
11046 (MI.getOpcode() == AArch64::ORRWrr &&
11047 MI.getOperand(i: 1).getReg() == AArch64::WZR)) &&
11048 // Check that the w->w move is not a zero-extending w->x mov.
11049 (!MI.getOperand(i: 0).getReg().isVirtual() ||
11050 MI.getOperand(i: 0).getSubReg() == 0) &&
11051 (!MI.getOperand(i: 0).getReg().isPhysical() ||
11052 MI.findRegisterDefOperandIdx(Reg: getXRegFromWReg(Reg: MI.getOperand(i: 0).getReg()),
11053 /*TRI=*/nullptr) == -1))
11054 return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 2)};
11055
11056 if (MI.getOpcode() == AArch64::ORRXrs &&
11057 MI.getOperand(i: 1).getReg() == AArch64::XZR &&
11058 MI.getOperand(i: 3).getImm() == 0x0)
11059 return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 2)};
11060
11061 return std::nullopt;
11062}
11063
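// Like the ORRWrs/ORRWrr cases of isCopyInstrImpl above, but without the
// zero-extension check: a w->w move is reported even when its implicit
// zero-extension of the corresponding x register is significant.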
11064std::optional<DestSourcePair>
11065AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {
11066 if ((MI.getOpcode() == AArch64::ORRWrs &&
11067 MI.getOperand(i: 1).getReg() == AArch64::WZR &&
11068 MI.getOperand(i: 3).getImm() == 0x0) ||
11069 (MI.getOpcode() == AArch64::ORRWrr &&
11070 MI.getOperand(i: 1).getReg() == AArch64::WZR))
11071 return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 2)};
11072 return std::nullopt;
11073}
11074
11075std::optional<RegImmPair>
11076AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
11077 int Sign = 1;
11078 int64_t Offset = 0;
11079
11080 // TODO: Handle cases where Reg is a super- or sub-register of the
11081 // destination register.
11082 const MachineOperand &Op0 = MI.getOperand(i: 0);
11083 if (!Op0.isReg() || Reg != Op0.getReg())
11084 return std::nullopt;
11085
11086 switch (MI.getOpcode()) {
11087 default:
11088 return std::nullopt;
11089 case AArch64::SUBWri:
11090 case AArch64::SUBXri:
11091 case AArch64::SUBSWri:
11092 case AArch64::SUBSXri:
11093 Sign *= -1;
11094 [[fallthrough]];
11095 case AArch64::ADDSWri:
11096 case AArch64::ADDSXri:
11097 case AArch64::ADDWri:
11098 case AArch64::ADDXri: {
11099 // TODO: Third operand can be global address (usually some string).
11100 if (!MI.getOperand(i: 0).isReg() || !MI.getOperand(i: 1).isReg() ||
11101 !MI.getOperand(i: 2).isImm())
11102 return std::nullopt;
11103 int Shift = MI.getOperand(i: 3).getImm();
11104 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
11105 Offset = Sign * (MI.getOperand(i: 2).getImm() << Shift);
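    // E.g. "ADDXri $x0, $x1, 4, 12" yields Offset = 4 << 12 = 16384.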
11106 }
11107 }
11108 return RegImmPair{MI.getOperand(i: 1).getReg(), Offset};
11109}
11110
11111/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
11112/// the destination register then, if possible, describe the value in terms of
11113/// the source register.
11114static std::optional<ParamLoadedValue>
11115describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
11116 const TargetInstrInfo *TII,
11117 const TargetRegisterInfo *TRI) {
11118 auto DestSrc = TII->isCopyLikeInstr(MI);
11119 if (!DestSrc)
11120 return std::nullopt;
11121
11122 Register DestReg = DestSrc->Destination->getReg();
11123 Register SrcReg = DestSrc->Source->getReg();
11124
11125 if (!DestReg.isValid() || !SrcReg.isValid())
11126 return std::nullopt;
11127
11128 auto Expr = DIExpression::get(Context&: MI.getMF()->getFunction().getContext(), Elements: {});
11129
11130 // If the described register is the destination, just return the source.
11131 if (DestReg == DescribedReg)
11132 return ParamLoadedValue(MachineOperand::CreateReg(Reg: SrcReg, isDef: false), Expr);
11133
11134 // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
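  // E.g. when asked about $x0, "ORRWrs $w0, $wzr, $w1, 0" is a full
  // description, since the write to $w0 zeroes the top 32 bits of $x0.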
11135 if (MI.getOpcode() == AArch64::ORRWrs &&
11136 TRI->isSuperRegister(RegA: DestReg, RegB: DescribedReg))
11137 return ParamLoadedValue(MachineOperand::CreateReg(Reg: SrcReg, isDef: false), Expr);
11138
  // We may need to describe the lower part of an ORRXrs move.
11140 if (MI.getOpcode() == AArch64::ORRXrs &&
11141 TRI->isSubRegister(RegA: DestReg, RegB: DescribedReg)) {
11142 Register SrcSubReg = TRI->getSubReg(Reg: SrcReg, Idx: AArch64::sub_32);
11143 return ParamLoadedValue(MachineOperand::CreateReg(Reg: SrcSubReg, isDef: false), Expr);
11144 }
11145
11146 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
11147 "Unhandled ORR[XW]rs copy case");
11148
11149 return std::nullopt;
11150}
11151
11152bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
11153 // Functions cannot be split to different sections on AArch64 if they have
11154 // a red zone. This is because relaxing a cross-section branch may require
11155 // incrementing the stack pointer to spill a register, which would overwrite
11156 // the red zone.
11157 if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(u: true))
11158 return false;
11159
11160 return TargetInstrInfo::isFunctionSafeToSplit(MF);
11161}
11162
11163bool AArch64InstrInfo::isMBBSafeToSplitToCold(
11164 const MachineBasicBlock &MBB) const {
11165 // Asm Goto blocks can contain conditional branches to goto labels, which can
11166 // get moved out of range of the branch instruction.
11167 auto isAsmGoto = [](const MachineInstr &MI) {
11168 return MI.getOpcode() == AArch64::INLINEASM_BR;
11169 };
11170 if (llvm::any_of(Range: MBB, P: isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
11171 return false;
11172
11173 // Because jump tables are label-relative instead of table-relative, they all
11174 // must be in the same section or relocation fixup handling will fail.
11175
11176 // Check if MBB is a jump table target
11177 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
11178 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
11179 return llvm::is_contained(Range: JTE.MBBs, Element: &MBB);
11180 };
11181 if (MJTI != nullptr && llvm::any_of(Range: MJTI->getJumpTables(), P: containsMBB))
11182 return false;
11183
11184 // Check if MBB contains a jump table lookup
11185 for (const MachineInstr &MI : MBB) {
11186 switch (MI.getOpcode()) {
11187 case TargetOpcode::G_BRJT:
11188 case AArch64::JumpTableDest32:
11189 case AArch64::JumpTableDest16:
11190 case AArch64::JumpTableDest8:
11191 return false;
11192 default:
11193 continue;
11194 }
11195 }
11196
11197 // MBB isn't a special case, so it's safe to be split to the cold section.
11198 return true;
11199}
11200
11201std::optional<ParamLoadedValue>
11202AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
11203 Register Reg) const {
11204 const MachineFunction *MF = MI.getMF();
11205 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
11206 switch (MI.getOpcode()) {
11207 case AArch64::MOVZWi:
11208 case AArch64::MOVZXi: {
11209 // MOVZWi may be used for producing zero-extended 32-bit immediates in
11210 // 64-bit parameters, so we need to consider super-registers.
11211 if (!TRI->isSuperRegisterEq(RegA: MI.getOperand(i: 0).getReg(), RegB: Reg))
11212 return std::nullopt;
11213
11214 if (!MI.getOperand(i: 1).isImm())
11215 return std::nullopt;
11216 int64_t Immediate = MI.getOperand(i: 1).getImm();
11217 int Shift = MI.getOperand(i: 2).getImm();
11218 return ParamLoadedValue(MachineOperand::CreateImm(Val: Immediate << Shift),
11219 nullptr);
11220 }
11221 case AArch64::ORRWrs:
11222 case AArch64::ORRXrs:
11223 return describeORRLoadedValue(MI, DescribedReg: Reg, TII: this, TRI);
11224 }
11225
11226 return TargetInstrInfo::describeLoadedValue(MI, Reg);
11227}
11228
11229bool AArch64InstrInfo::isExtendLikelyToBeFolded(
11230 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
11231 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
11232 ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
11233 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
11234
11235 // Anyexts are nops.
11236 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
11237 return true;
11238
11239 Register DefReg = ExtMI.getOperand(i: 0).getReg();
11240 if (!MRI.hasOneNonDBGUse(RegNo: DefReg))
11241 return false;
11242
11243 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
11244 // addressing mode.
11245 auto *UserMI = &*MRI.use_instr_nodbg_begin(RegNo: DefReg);
11246 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
11247}
11248
11249uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
11250 return get(Opcode: Opc).TSFlags & AArch64::ElementSizeMask;
11251}
11252
11253bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
11254 return get(Opcode: Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
11255}
11256
11257bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
11258 return get(Opcode: Opc).TSFlags & AArch64::InstrFlagIsWhile;
11259}
11260
11261unsigned int
11262AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
11263 return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
11264}
11265
11266bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
11267 unsigned Scale) const {
11268 if (Offset && Scale)
11269 return false;
11270
11271 // Check Reg + Imm
11272 if (!Scale) {
11273 // 9-bit signed offset
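    // (-256..255, matching the unscaled LDUR/STUR forms)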
11274 if (isInt<9>(x: Offset))
11275 return true;
11276
11277 // 12-bit unsigned offset
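    // (e.g. for NumBytes == 8 this accepts offsets up to 32760 in multiples
    // of 8, matching the scaled LDR/STR unsigned-offset form)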
11278 unsigned Shift = Log2_64(Value: NumBytes);
11279 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
11280 // Must be a multiple of NumBytes (NumBytes is a power of 2)
11281 (Offset >> Shift) << Shift == Offset)
11282 return true;
11283 return false;
11284 }
11285
11286 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
11287 return Scale == 1 || (Scale > 0 && Scale == NumBytes);
11288}
11289
11290unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
11291 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
11292 return AArch64::BLRNoIP;
11293 else
11294 return AArch64::BLR;
11295}
11296
11297MachineBasicBlock::iterator
11298AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
11299 Register TargetReg, bool FrameSetup) const {
11300 assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
11301
11302 MachineBasicBlock &MBB = *MBBI->getParent();
11303 MachineFunction &MF = *MBB.getParent();
11304 const AArch64InstrInfo *TII =
11305 MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
11306 int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
11307 DebugLoc DL = MBB.findDebugLoc(MBBI);
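
  // The probe loop built below has the following shape (sketch):
  //
  // LoopTest:
  //   sub  sp, sp, #ProbeSize
  //   cmp  sp, TargetReg
  //   b.le LoopExit
  // LoopBody:
  //   ldr  xzr, [sp]
  //   b    LoopTest
  // LoopExit:
  //   mov  sp, TargetReg
  //   ldr  xzr, [sp]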
11308
11309 MachineFunction::iterator MBBInsertPoint = std::next(x: MBB.getIterator());
11310 MachineBasicBlock *LoopTestMBB =
11311 MF.CreateMachineBasicBlock(BB: MBB.getBasicBlock());
11312 MF.insert(MBBI: MBBInsertPoint, MBB: LoopTestMBB);
11313 MachineBasicBlock *LoopBodyMBB =
11314 MF.CreateMachineBasicBlock(BB: MBB.getBasicBlock());
11315 MF.insert(MBBI: MBBInsertPoint, MBB: LoopBodyMBB);
11316 MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(BB: MBB.getBasicBlock());
11317 MF.insert(MBBI: MBBInsertPoint, MBB: ExitMBB);
11318 MachineInstr::MIFlag Flags =
11319 FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;
11320
11321 // LoopTest:
11322 // SUB SP, SP, #ProbeSize
11323 emitFrameOffset(MBB&: *LoopTestMBB, MBBI: LoopTestMBB->end(), DL, DestReg: AArch64::SP,
11324 SrcReg: AArch64::SP, Offset: StackOffset::getFixed(Fixed: -ProbeSize), TII, Flag: Flags);
11325
11326 // CMP SP, TargetReg
11327 BuildMI(BB&: *LoopTestMBB, I: LoopTestMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::SUBSXrx64),
11328 DestReg: AArch64::XZR)
11329 .addReg(RegNo: AArch64::SP)
11330 .addReg(RegNo: TargetReg)
11331 .addImm(Val: AArch64_AM::getArithExtendImm(ET: AArch64_AM::UXTX, Imm: 0))
11332 .setMIFlags(Flags);
11333
11334 // B.<Cond> LoopExit
11335 BuildMI(BB&: *LoopTestMBB, I: LoopTestMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::Bcc))
11336 .addImm(Val: AArch64CC::LE)
11337 .addMBB(MBB: ExitMBB)
11338 .setMIFlags(Flags);
11339
11340 // LDR XZR, [SP]
11341 BuildMI(BB&: *LoopBodyMBB, I: LoopBodyMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::LDRXui))
11342 .addDef(RegNo: AArch64::XZR)
11343 .addReg(RegNo: AArch64::SP)
11344 .addImm(Val: 0)
11345 .addMemOperand(MMO: MF.getMachineMemOperand(
11346 PtrInfo: MachinePointerInfo::getUnknownStack(MF),
11347 F: MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, Size: 8,
11348 BaseAlignment: Align(8)))
11349 .setMIFlags(Flags);
11350
11351 // B loop
11352 BuildMI(BB&: *LoopBodyMBB, I: LoopBodyMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::B))
11353 .addMBB(MBB: LoopTestMBB)
11354 .setMIFlags(Flags);
11355
11356 // LoopExit:
11357 // MOV SP, TargetReg
11358 BuildMI(BB&: *ExitMBB, I: ExitMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::ADDXri), DestReg: AArch64::SP)
11359 .addReg(RegNo: TargetReg)
11360 .addImm(Val: 0)
11361 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0))
11362 .setMIFlags(Flags);
11363
11364 // LDR XZR, [SP]
11365 BuildMI(BB&: *ExitMBB, I: ExitMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::LDRXui))
11366 .addReg(RegNo: AArch64::XZR, Flags: RegState::Define)
11367 .addReg(RegNo: AArch64::SP)
11368 .addImm(Val: 0)
11369 .setMIFlags(Flags);
11370
11371 ExitMBB->splice(Where: ExitMBB->end(), Other: &MBB, From: std::next(x: MBBI), To: MBB.end());
11372 ExitMBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
11373
11374 LoopTestMBB->addSuccessor(Succ: ExitMBB);
11375 LoopTestMBB->addSuccessor(Succ: LoopBodyMBB);
11376 LoopBodyMBB->addSuccessor(Succ: LoopTestMBB);
11377 MBB.addSuccessor(Succ: LoopTestMBB);
11378
11379 // Update liveins.
11380 if (MF.getRegInfo().reservedRegsFrozen())
11381 fullyRecomputeLiveIns(MBBs: {ExitMBB, LoopBodyMBB, LoopTestMBB});
11382
11383 return ExitMBB->begin();
11384}
11385
11386namespace {
11387class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
11388 MachineFunction *MF;
11389 const TargetInstrInfo *TII;
11390 const TargetRegisterInfo *TRI;
11391 MachineRegisterInfo &MRI;
11392
11393 /// The block of the loop
11394 MachineBasicBlock *LoopBB;
11395 /// The conditional branch of the loop
11396 MachineInstr *CondBranch;
11397 /// The compare instruction for loop control
11398 MachineInstr *Comp;
  /// The operand index of the loop counter value in Comp
11400 unsigned CompCounterOprNum;
11401 /// The instruction that updates the loop counter value
11402 MachineInstr *Update;
  /// The operand index of the loop counter value in Update
11404 unsigned UpdateCounterOprNum;
11405 /// The initial value of the loop counter
11406 Register Init;
11407 /// True iff Update is a predecessor of Comp
11408 bool IsUpdatePriorComp;
11409
11410 /// The normalized condition used by createTripCountGreaterCondition()
11411 SmallVector<MachineOperand, 4> Cond;
11412
11413public:
11414 AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
11415 MachineInstr *Comp, unsigned CompCounterOprNum,
11416 MachineInstr *Update, unsigned UpdateCounterOprNum,
11417 Register Init, bool IsUpdatePriorComp,
11418 const SmallVectorImpl<MachineOperand> &Cond)
11419 : MF(Comp->getParent()->getParent()),
11420 TII(MF->getSubtarget().getInstrInfo()),
11421 TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
11422 LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
11423 CompCounterOprNum(CompCounterOprNum), Update(Update),
11424 UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
11425 IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
11426
11427 bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
11428 // Make the instructions for loop control be placed in stage 0.
11429 // The predecessors of Comp are considered by the caller.
11430 return MI == Comp;
11431 }
11432
11433 std::optional<bool> createTripCountGreaterCondition(
11434 int TC, MachineBasicBlock &MBB,
11435 SmallVectorImpl<MachineOperand> &CondParam) override {
11436 // A branch instruction will be inserted as "if (Cond) goto epilogue".
11437 // Cond is normalized for such use.
11438 // The predecessors of the branch are assumed to have already been inserted.
11439 CondParam = Cond;
11440 return {};
11441 }
11442
11443 void createRemainingIterationsGreaterCondition(
11444 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
11445 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
11446
11447 void setPreheader(MachineBasicBlock *NewPreheader) override {}
11448
11449 void adjustTripCount(int TripCountAdjust) override {}
11450
11451 bool isMVEExpanderSupported() override { return true; }
11452};
11453} // namespace
11454
/// Clone an instruction from MI. The register of the ReplaceOprNum-th operand
/// is replaced by ReplaceReg. The output register is newly created.
/// The other operands are unchanged from MI.
11458static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
11459 Register ReplaceReg, MachineBasicBlock &MBB,
11460 MachineBasicBlock::iterator InsertTo) {
11461 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
11462 const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
11463 MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(Orig: MI);
11464 Register Result = 0;
11465 for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
11466 if (I == 0 && NewMI->getOperand(i: 0).getReg().isVirtual()) {
11467 Result = MRI.createVirtualRegister(
11468 RegClass: MRI.getRegClass(Reg: NewMI->getOperand(i: 0).getReg()));
11469 NewMI->getOperand(i: I).setReg(Result);
11470 } else if (I == ReplaceOprNum) {
11471 MRI.constrainRegClass(Reg: ReplaceReg, RC: TII->getRegClass(MCID: NewMI->getDesc(), OpNum: I));
11472 NewMI->getOperand(i: I).setReg(ReplaceReg);
11473 }
11474 }
11475 MBB.insert(I: InsertTo, MI: NewMI);
11476 return Result;
11477}
11478
11479void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
11480 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
11481 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {
11482 // Create and accumulate conditions for next TC iterations.
11483 // Example:
11484 // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
11485 // # iteration of the kernel
11486 //
11487 // # insert the following instructions
11488 // cond = CSINCXr 0, 0, C, implicit $nzcv
11489 // counter = ADDXri counter, 1 # clone from this->Update
11490 // SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
11491 // cond = CSINCXr cond, cond, C, implicit $nzcv
11492 // ... (repeat TC times)
11493 // SUBSXri cond, 0, implicit-def $nzcv
11494
11495 assert(CondBranch->getOpcode() == AArch64::Bcc);
11496 // CondCode to exit the loop
11497 AArch64CC::CondCode CC =
11498 (AArch64CC::CondCode)CondBranch->getOperand(i: 0).getImm();
11499 if (CondBranch->getOperand(i: 1).getMBB() == LoopBB)
11500 CC = AArch64CC::getInvertedCondCode(Code: CC);
11501
11502 // Accumulate conditions to exit the loop
11503 Register AccCond = AArch64::XZR;
11504
11505 // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
11506 auto AccumulateCond = [&](Register CurCond,
11507 AArch64CC::CondCode CC) -> Register {
11508 Register NewCond = MRI.createVirtualRegister(RegClass: &AArch64::GPR64commonRegClass);
11509 BuildMI(BB&: MBB, I: MBB.end(), MIMD: Comp->getDebugLoc(), MCID: TII->get(Opcode: AArch64::CSINCXr))
11510 .addReg(RegNo: NewCond, Flags: RegState::Define)
11511 .addReg(RegNo: CurCond)
11512 .addReg(RegNo: CurCond)
11513 .addImm(Val: AArch64CC::getInvertedCondCode(Code: CC));
11514 return NewCond;
11515 };
11516
11517 if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
    // Update and Comp for I == 0 already exist in MBB
    // (MBB is an unrolled kernel).
11520 Register Counter;
11521 for (int I = 0; I <= TC; ++I) {
11522 Register NextCounter;
11523 if (I != 0)
11524 NextCounter =
11525 cloneInstr(MI: Comp, ReplaceOprNum: CompCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
11526
11527 AccCond = AccumulateCond(AccCond, CC);
11528
11529 if (I != TC) {
11530 if (I == 0) {
11531 if (Update != Comp && IsUpdatePriorComp) {
11532 Counter =
11533 LastStage0Insts[Comp]->getOperand(i: CompCounterOprNum).getReg();
11534 NextCounter = cloneInstr(MI: Update, ReplaceOprNum: UpdateCounterOprNum, ReplaceReg: Counter, MBB,
11535 InsertTo: MBB.end());
11536 } else {
          // We can use the already-calculated value.
11538 NextCounter = LastStage0Insts[Update]->getOperand(i: 0).getReg();
11539 }
11540 } else if (Update != Comp) {
11541 NextCounter =
11542 cloneInstr(MI: Update, ReplaceOprNum: UpdateCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
11543 }
11544 }
11545 Counter = NextCounter;
11546 }
11547 } else {
11548 Register Counter;
11549 if (LastStage0Insts.empty()) {
      // Use the initial counter value (this tests whether the trip count is
      // sufficient for the pipelined code to be executed).
11552 Counter = Init;
11553 if (IsUpdatePriorComp)
11554 Counter =
11555 cloneInstr(MI: Update, ReplaceOprNum: UpdateCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
11556 } else {
11557 // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
11558 Counter = LastStage0Insts[Comp]->getOperand(i: CompCounterOprNum).getReg();
11559 }
11560
11561 for (int I = 0; I <= TC; ++I) {
11562 Register NextCounter;
11563 NextCounter =
11564 cloneInstr(MI: Comp, ReplaceOprNum: CompCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
11565 AccCond = AccumulateCond(AccCond, CC);
11566 if (I != TC && Update != Comp)
11567 NextCounter =
11568 cloneInstr(MI: Update, ReplaceOprNum: UpdateCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
11569 Counter = NextCounter;
11570 }
11571 }
11572
11573 // If AccCond == 0, the remainder is greater than TC.
11574 BuildMI(BB&: MBB, I: MBB.end(), MIMD: Comp->getDebugLoc(), MCID: TII->get(Opcode: AArch64::SUBSXri))
11575 .addReg(RegNo: AArch64::XZR, Flags: RegState::Define | RegState::Dead)
11576 .addReg(RegNo: AccCond)
11577 .addImm(Val: 0)
11578 .addImm(Val: 0);
11579 Cond.clear();
11580 Cond.push_back(Elt: MachineOperand::CreateImm(Val: AArch64CC::EQ));
11581}
11582
11583static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
11584 Register &RegMBB, Register &RegOther) {
11585 assert(Phi.getNumOperands() == 5);
11586 if (Phi.getOperand(i: 2).getMBB() == MBB) {
11587 RegMBB = Phi.getOperand(i: 1).getReg();
11588 RegOther = Phi.getOperand(i: 3).getReg();
11589 } else {
11590 assert(Phi.getOperand(4).getMBB() == MBB);
11591 RegMBB = Phi.getOperand(i: 3).getReg();
11592 RegOther = Phi.getOperand(i: 1).getReg();
11593 }
11594}
11595
11596static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {
11597 if (!Reg.isVirtual())
11598 return false;
11599 const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11600 return MRI.getVRegDef(Reg)->getParent() != BB;
11601}
11602
/// If Reg is an induction variable, return true and fill in the output
/// parameters.
11604static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
11605 MachineInstr *&UpdateInst,
11606 unsigned &UpdateCounterOprNum, Register &InitReg,
11607 bool &IsUpdatePriorComp) {
11608 // Example:
11609 //
11610 // Preheader:
11611 // InitReg = ...
11612 // LoopBB:
11613 // Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
11614 // Reg = COPY Reg0 ; COPY is ignored.
11615 // Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
11616 // ; Reg is the value calculated in the previous
11617 // ; iteration, so IsUpdatePriorComp == false.
11618
11619 if (LoopBB->pred_size() != 2)
11620 return false;
11621 if (!Reg.isVirtual())
11622 return false;
11623 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
11624 UpdateInst = nullptr;
11625 UpdateCounterOprNum = 0;
11626 InitReg = 0;
11627 IsUpdatePriorComp = true;
11628 Register CurReg = Reg;
11629 while (true) {
11630 MachineInstr *Def = MRI.getVRegDef(Reg: CurReg);
11631 if (Def->getParent() != LoopBB)
11632 return false;
11633 if (Def->isCopy()) {
11634 // Ignore copy instructions unless they contain subregisters
11635 if (Def->getOperand(i: 0).getSubReg() || Def->getOperand(i: 1).getSubReg())
11636 return false;
11637 CurReg = Def->getOperand(i: 1).getReg();
11638 } else if (Def->isPHI()) {
11639 if (InitReg != 0)
11640 return false;
11641 if (!UpdateInst)
11642 IsUpdatePriorComp = false;
11643 extractPhiReg(Phi: *Def, MBB: LoopBB, RegMBB&: CurReg, RegOther&: InitReg);
11644 } else {
11645 if (UpdateInst)
11646 return false;
11647 switch (Def->getOpcode()) {
11648 case AArch64::ADDSXri:
11649 case AArch64::ADDSWri:
11650 case AArch64::SUBSXri:
11651 case AArch64::SUBSWri:
11652 case AArch64::ADDXri:
11653 case AArch64::ADDWri:
11654 case AArch64::SUBXri:
11655 case AArch64::SUBWri:
11656 UpdateInst = Def;
11657 UpdateCounterOprNum = 1;
11658 break;
11659 case AArch64::ADDSXrr:
11660 case AArch64::ADDSWrr:
11661 case AArch64::SUBSXrr:
11662 case AArch64::SUBSWrr:
11663 case AArch64::ADDXrr:
11664 case AArch64::ADDWrr:
11665 case AArch64::SUBXrr:
11666 case AArch64::SUBWrr:
11667 UpdateInst = Def;
11668 if (isDefinedOutside(Reg: Def->getOperand(i: 2).getReg(), BB: LoopBB))
11669 UpdateCounterOprNum = 1;
11670 else if (isDefinedOutside(Reg: Def->getOperand(i: 1).getReg(), BB: LoopBB))
11671 UpdateCounterOprNum = 2;
11672 else
11673 return false;
11674 break;
11675 default:
11676 return false;
11677 }
11678 CurReg = Def->getOperand(i: UpdateCounterOprNum).getReg();
11679 }
11680
11681 if (!CurReg.isVirtual())
11682 return false;
11683 if (Reg == CurReg)
11684 break;
11685 }
11686
11687 if (!UpdateInst)
11688 return false;
11689
11690 return true;
11691}
11692
11693std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
11694AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
11695 // Accept loops that meet the following conditions
11696 // * The conditional branch is BCC
11697 // * The compare instruction is ADDS/SUBS/WHILEXX
11698 // * One operand of the compare is an induction variable and the other is a
11699 // loop invariant value
11700 // * The induction variable is incremented/decremented by a single instruction
11701 // * Does not contain CALL or instructions which have unmodeled side effects
11702
11703 for (MachineInstr &MI : *LoopBB)
11704 if (MI.isCall() || MI.hasUnmodeledSideEffects())
11705 // This instruction may use NZCV, which interferes with the instruction to
11706 // be inserted for loop control.
11707 return nullptr;
11708
11709 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
11710 SmallVector<MachineOperand, 4> Cond;
11711 if (analyzeBranch(MBB&: *LoopBB, TBB, FBB, Cond))
11712 return nullptr;
11713
11714 // Infinite loops are not supported
11715 if (TBB == LoopBB && FBB == LoopBB)
11716 return nullptr;
11717
  // Must be a conditional branch
11719 if (TBB != LoopBB && FBB == nullptr)
11720 return nullptr;
11721
11722 assert((TBB == LoopBB || FBB == LoopBB) &&
11723 "The Loop must be a single-basic-block loop");
11724
11725 MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
11726 const TargetRegisterInfo &TRI = getRegisterInfo();
11727
11728 if (CondBranch->getOpcode() != AArch64::Bcc)
11729 return nullptr;
11730
11731 // Normalization for createTripCountGreaterCondition()
11732 if (TBB == LoopBB)
11733 reverseBranchCondition(Cond);
11734
11735 MachineInstr *Comp = nullptr;
11736 unsigned CompCounterOprNum = 0;
11737 for (MachineInstr &MI : reverse(C&: *LoopBB)) {
11738 if (MI.modifiesRegister(Reg: AArch64::NZCV, TRI: &TRI)) {
11739 // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
11740 // operands is a loop invariant value
11741
11742 switch (MI.getOpcode()) {
11743 case AArch64::SUBSXri:
11744 case AArch64::SUBSWri:
11745 case AArch64::ADDSXri:
11746 case AArch64::ADDSWri:
11747 Comp = &MI;
11748 CompCounterOprNum = 1;
11749 break;
11750 case AArch64::ADDSWrr:
11751 case AArch64::ADDSXrr:
11752 case AArch64::SUBSWrr:
11753 case AArch64::SUBSXrr:
11754 Comp = &MI;
11755 break;
11756 default:
11757 if (isWhileOpcode(Opc: MI.getOpcode())) {
11758 Comp = &MI;
11759 break;
11760 }
11761 return nullptr;
11762 }
11763
11764 if (CompCounterOprNum == 0) {
11765 if (isDefinedOutside(Reg: Comp->getOperand(i: 1).getReg(), BB: LoopBB))
11766 CompCounterOprNum = 2;
11767 else if (isDefinedOutside(Reg: Comp->getOperand(i: 2).getReg(), BB: LoopBB))
11768 CompCounterOprNum = 1;
11769 else
11770 return nullptr;
11771 }
11772 break;
11773 }
11774 }
11775 if (!Comp)
11776 return nullptr;
11777
11778 MachineInstr *Update = nullptr;
11779 Register Init;
11780 bool IsUpdatePriorComp;
11781 unsigned UpdateCounterOprNum;
11782 if (!getIndVarInfo(Reg: Comp->getOperand(i: CompCounterOprNum).getReg(), LoopBB,
11783 UpdateInst&: Update, UpdateCounterOprNum, InitReg&: Init, IsUpdatePriorComp))
11784 return nullptr;
11785
11786 return std::make_unique<AArch64PipelinerLoopInfo>(
11787 args&: LoopBB, args&: CondBranch, args&: Comp, args&: CompCounterOprNum, args&: Update, args&: UpdateCounterOprNum,
11788 args&: Init, args&: IsUpdatePriorComp, args&: Cond);
11789}
11790
11791/// verifyInstruction - Perform target specific instruction verification.
11792bool AArch64InstrInfo::verifyInstruction(const MachineInstr &MI,
11793 StringRef &ErrInfo) const {
11794 // Verify that immediate offsets on load/store instructions are within range.
11795 // Stack objects with an FI operand are excluded as they can be fixed up
11796 // during PEI.
11797 TypeSize Scale(0U, false), Width(0U, false);
11798 int64_t MinOffset, MaxOffset;
11799 if (getMemOpInfo(Opcode: MI.getOpcode(), Scale, Width, MinOffset, MaxOffset)) {
11800 unsigned ImmIdx = getLoadStoreImmIdx(Opc: MI.getOpcode());
11801 if (MI.getOperand(i: ImmIdx).isImm() && !MI.getOperand(i: ImmIdx - 1).isFI()) {
11802 int64_t Imm = MI.getOperand(i: ImmIdx).getImm();
11803 if (Imm < MinOffset || Imm > MaxOffset) {
11804 ErrInfo = "Unexpected immediate on load/store instruction";
11805 return false;
11806 }
11807 }
11808 }
11809
11810 const MCInstrDesc &MCID = MI.getDesc();
11811 for (unsigned Op = 0; Op < MCID.getNumOperands(); Op++) {
11812 const MachineOperand &MO = MI.getOperand(i: Op);
11813 switch (MCID.operands()[Op].OperandType) {
11814 case AArch64::OPERAND_IMPLICIT_IMM_0:
11815 if (!MO.isImm() || MO.getImm() != 0) {
11816 ErrInfo = "OPERAND_IMPLICIT_IMM_0 should be 0";
11817 return false;
11818 }
11819 break;
11820 case AArch64::OPERAND_SHIFT_MSL:
11821 if (!MO.isImm() ||
11822 AArch64_AM::getShiftType(Imm: MO.getImm()) != AArch64_AM::MSL ||
11823 (AArch64_AM::getShiftValue(Imm: MO.getImm()) != 8 &&
11824 AArch64_AM::getShiftValue(Imm: MO.getImm()) != 16)) {
11825 ErrInfo = "OPERAND_SHIFT_MSL should be msl shift of 8 or 16";
11826 return false;
11827 }
11828 break;
11829 default:
11830 break;
11831 }
11832 }
11833 return true;
11834}
11835
11836#define GET_INSTRINFO_HELPERS
11837#define GET_INSTRMAP_INFO
11838#include "AArch64GenInstrInfo.inc"
11839