1//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the AArch64 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64InstrInfo.h"
14#include "AArch64ExpandImm.h"
15#include "AArch64MachineFunctionInfo.h"
16#include "AArch64PointerAuth.h"
17#include "AArch64Subtarget.h"
18#include "MCTargetDesc/AArch64AddressingModes.h"
19#include "MCTargetDesc/AArch64MCLFIRewriter.h"
20#include "MCTargetDesc/AArch64MCTargetDesc.h"
21#include "Utils/AArch64BaseInfo.h"
22#include "llvm/ADT/ArrayRef.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/SmallVector.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/Analysis/AliasAnalysis.h"
28#include "llvm/CodeGen/CFIInstBuilder.h"
29#include "llvm/CodeGen/LivePhysRegs.h"
30#include "llvm/CodeGen/MachineBasicBlock.h"
31#include "llvm/CodeGen/MachineCombinerPattern.h"
32#include "llvm/CodeGen/MachineFrameInfo.h"
33#include "llvm/CodeGen/MachineFunction.h"
34#include "llvm/CodeGen/MachineInstr.h"
35#include "llvm/CodeGen/MachineInstrBuilder.h"
36#include "llvm/CodeGen/MachineMemOperand.h"
37#include "llvm/CodeGen/MachineModuleInfo.h"
38#include "llvm/CodeGen/MachineOperand.h"
39#include "llvm/CodeGen/MachineRegisterInfo.h"
40#include "llvm/CodeGen/RegisterScavenging.h"
41#include "llvm/CodeGen/StackMaps.h"
42#include "llvm/CodeGen/TargetRegisterInfo.h"
43#include "llvm/CodeGen/TargetSubtargetInfo.h"
44#include "llvm/IR/DebugInfoMetadata.h"
45#include "llvm/IR/DebugLoc.h"
46#include "llvm/IR/GlobalValue.h"
47#include "llvm/IR/Module.h"
48#include "llvm/MC/MCAsmInfo.h"
49#include "llvm/MC/MCInst.h"
50#include "llvm/MC/MCInstBuilder.h"
51#include "llvm/MC/MCInstrDesc.h"
52#include "llvm/Support/Casting.h"
53#include "llvm/Support/CodeGen.h"
54#include "llvm/Support/CommandLine.h"
55#include "llvm/Support/ErrorHandling.h"
56#include "llvm/Support/LEB128.h"
57#include "llvm/Support/MathExtras.h"
58#include "llvm/Target/TargetMachine.h"
59#include "llvm/Target/TargetOptions.h"
60#include <cassert>
61#include <cstdint>
62#include <iterator>
63#include <utility>
64
65using namespace llvm;
66
67#define GET_INSTRINFO_CTOR_DTOR
68#include "AArch64GenInstrInfo.inc"
69
70#define DEBUG_TYPE "AArch64InstrInfo"
71
72STATISTIC(NumCopyInstrs, "Number of COPY instructions expanded");
73STATISTIC(NumZCRegMoveInstrsGPR, "Number of zero-cycle GPR register move "
74 "instructions expanded from canonical COPY");
75STATISTIC(NumZCRegMoveInstrsFPR, "Number of zero-cycle FPR register move "
76 "instructions expanded from canonical COPY");
77STATISTIC(NumZCZeroingInstrsGPR, "Number of zero-cycle GPR zeroing "
78 "instructions expanded from canonical COPY");
79// NumZCZeroingInstrsFPR is counted at AArch64AsmPrinter
80
81static cl::opt<unsigned>
82 CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(Val: 9),
83 cl::desc("Restrict range of CB instructions (DEBUG)"));
84
85static cl::opt<unsigned> TBZDisplacementBits(
86 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(Val: 14),
87 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
88
89static cl::opt<unsigned> CBZDisplacementBits(
90 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(Val: 19),
91 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
92
93static cl::opt<unsigned>
94 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(Val: 19),
95 cl::desc("Restrict range of Bcc instructions (DEBUG)"));
96
97static cl::opt<unsigned>
98 BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(Val: 26),
99 cl::desc("Restrict range of B instructions (DEBUG)"));
100
101static cl::opt<unsigned> GatherOptSearchLimit(
102 "aarch64-search-limit", cl::Hidden, cl::init(Val: 2048),
103 cl::desc("Restrict range of instructions to search for the "
104 "machine-combiner gather pattern optimization"));
105
106AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
107 : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN,
108 AArch64::ADJCALLSTACKUP, AArch64::CATCHRET),
109 RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}
110
111/// Return the maximum number of bytes of code the specified instruction may be
112/// after LFI rewriting. If the instruction is not rewritten, std::nullopt is
113/// returned (use default sizing).
114///
115/// NOTE: the size estimates here must be kept in sync with the rewrites in
116/// AArch64MCLFIRewriter.cpp. Sizes may be overestimates of the rewritten
117/// instruction sequences.
118static std::optional<unsigned> getLFIInstSizeInBytes(const MachineInstr &MI) {
119 switch (MI.getOpcode()) {
120 case AArch64::SVC:
121 // SVC expands to 4 instructions.
122 return 16;
123 case AArch64::BR:
124 case AArch64::BLR:
125 // Indirect branches/calls expand to 2 instructions (guard + br/blr).
126 return 8;
127 case AArch64::RET:
128 // RET through LR is not rewritten, but RET through another register
129 // expands to 2 instructions (guard + ret).
130 if (MI.getOperand(i: 0).getReg() != AArch64::LR)
131 return 8;
132 return 4;
133 case AArch64::SYSxt:
134 // VA-based DC/IC ops (op1=3, Cn=7, op2=1) expand to 2 instructions.
135 if (MI.getOperand(i: 0).getImm() == 3 && MI.getOperand(i: 1).getImm() == 7 &&
136 MI.getOperand(i: 3).getImm() == 1)
137 return 8;
138 return std::nullopt;
139 default:
140 break;
141 }
142
143 // Detect instructions that explicitly define SP or LR.
144 bool ModifiesLR = false;
145 bool ModifiesSP = false;
146 for (const MachineOperand &MO : MI.defs()) {
147 if (!MO.isReg())
148 continue;
149 if (MO.getReg() == AArch64::LR)
150 ModifiesLR = true;
151 else if (MO.getReg() == AArch64::SP)
152 ModifiesSP = true;
153 }
154
155 // Memory accesses expand to a base-register guard plus the rewritten access
156 // (8 bytes), with an extra base-register update for pre/post-index forms (12
157 // bytes total). If the access also defines LR, an LR mask is appended (+4
158 // bytes). Depending on additional optimizations that the rewriter performs,
159 // this may be an overestimate.
160 if (MI.mayLoadOrStore()) {
161 unsigned Size = isLFIPrePostMemAccess(Opcode: MI.getOpcode()) ? 12 : 8;
162 if (ModifiesLR)
163 Size += 4;
164 return Size;
165 }
166
167 // Non memory operations that modify LR or SP expand to 2 instructions.
168 if (ModifiesSP || ModifiesLR)
169 return 8;
170
171 // Default case: instructions that don't cause expansion.
172 // - TP accesses in LFI are a single load/store, so no expansion.
173 // - All remaining instructions are not rewritten.
174 return std::nullopt;
175}
176
177/// GetInstSize - Return the number of bytes of code the specified
178/// instruction may be. This returns the maximum number of bytes.
179unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
180 const MachineBasicBlock &MBB = *MI.getParent();
181 const MachineFunction *MF = MBB.getParent();
182 const Function &F = MF->getFunction();
183 const MCAsmInfo &MAI = MF->getTarget().getMCAsmInfo();
184
185 {
186 auto Op = MI.getOpcode();
187 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
188 return getInlineAsmLength(Str: MI.getOperand(i: 0).getSymbolName(), MAI);
189 }
190
191 // Meta-instructions emit no code.
192 if (MI.isMetaInstruction())
193 return 0;
194
195 // FIXME: We currently only handle pseudoinstructions that don't get expanded
196 // before the assembly printer.
197 unsigned NumBytes = 0;
198 const MCInstrDesc &Desc = MI.getDesc();
199
200 // LFI rewriter expansions that supersede normal sizing.
201 const auto &STI = MF->getSubtarget<AArch64Subtarget>();
202 if (STI.isLFI())
203 if (auto Size = getLFIInstSizeInBytes(MI))
204 return *Size;
205
206 if (!MI.isBundle() && isTailCallReturnInst(MI)) {
207 NumBytes = Desc.getSize() ? Desc.getSize() : 4;
208
209 const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
210 if (!MFI->shouldSignReturnAddress(MF: *MF))
211 return NumBytes;
212
213 auto Method = STI.getAuthenticatedLRCheckMethod(MF: *MF);
214 NumBytes += AArch64PAuth::getCheckerSizeInBytes(Method);
215 return NumBytes;
216 }
217
218 // Size should be preferably set in
219 // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
220 // Specific cases handle instructions of variable sizes
221 switch (Desc.getOpcode()) {
222 default:
223 if (Desc.getSize())
224 return Desc.getSize();
225
226 // Anything not explicitly designated otherwise (i.e. pseudo-instructions
227 // with fixed constant size but not specified in .td file) is a normal
228 // 4-byte insn.
229 NumBytes = 4;
230 break;
231 case TargetOpcode::STACKMAP:
232 // The upper bound for a stackmap intrinsic is the full length of its shadow
233 NumBytes = StackMapOpers(&MI).getNumPatchBytes();
234 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
235 break;
236 case TargetOpcode::PATCHPOINT:
237 // The size of the patchpoint intrinsic is the number of bytes requested
238 NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
239 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
240 break;
241 case TargetOpcode::STATEPOINT:
242 NumBytes = StatepointOpers(&MI).getNumPatchBytes();
243 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
244 // No patch bytes means a normal call inst is emitted
245 if (NumBytes == 0)
246 NumBytes = 4;
247 break;
248 case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
249 // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
250 // instructions are expanded to the specified number of NOPs. Otherwise,
251 // they are expanded to 36-byte XRay sleds.
252 NumBytes =
253 F.getFnAttributeAsParsedInteger(Kind: "patchable-function-entry", Default: 9) * 4;
254 break;
255 case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
256 case TargetOpcode::PATCHABLE_TAIL_CALL:
257 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
258 // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
259 NumBytes = 36;
260 break;
261 case TargetOpcode::PATCHABLE_EVENT_CALL:
262 // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
263 NumBytes = 24;
264 break;
265
266 case AArch64::SPACE:
267 NumBytes = MI.getOperand(i: 1).getImm();
268 break;
269 case AArch64::MOVaddr:
270 case AArch64::MOVaddrJT:
271 case AArch64::MOVaddrCP:
272 case AArch64::MOVaddrBA:
273 case AArch64::MOVaddrTLS:
274 case AArch64::MOVaddrEXT: {
275 // Use the same logic as the pseudo expansion to count instructions.
276 SmallVector<AArch64_IMM::AddrInsnModel, 3> Insn;
277 AArch64_IMM::expandMOVAddr(Opcode: Desc.getOpcode(),
278 TargetFlags: MI.getOperand(i: 1).getTargetFlags(),
279 IsTargetMachO: Subtarget.isTargetMachO(), Insn);
280 NumBytes = Insn.size() * 4;
281 break;
282 }
283
284 case AArch64::MOVi32imm:
285 case AArch64::MOVi64imm: {
286 // Use the same logic as the pseudo expansion to count instructions.
287 unsigned BitSize = Desc.getOpcode() == AArch64::MOVi32imm ? 32 : 64;
288 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
289 AArch64_IMM::expandMOVImm(Imm: MI.getOperand(i: 1).getImm(), BitSize, Insn);
290 NumBytes = Insn.size() * 4;
291 break;
292 }
293
294 case TargetOpcode::BUNDLE:
295 NumBytes = getInstBundleSize(MI);
296 break;
297 }
298
299 return NumBytes;
300}
301
302static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
303 SmallVectorImpl<MachineOperand> &Cond) {
304 // Block ends with fall-through condbranch.
305 switch (LastInst->getOpcode()) {
306 default:
307 llvm_unreachable("Unknown branch instruction?");
308 case AArch64::Bcc:
309 Target = LastInst->getOperand(i: 1).getMBB();
310 Cond.push_back(Elt: LastInst->getOperand(i: 0));
311 break;
312 case AArch64::CBZW:
313 case AArch64::CBZX:
314 case AArch64::CBNZW:
315 case AArch64::CBNZX:
316 Target = LastInst->getOperand(i: 1).getMBB();
317 Cond.push_back(Elt: MachineOperand::CreateImm(Val: -1));
318 Cond.push_back(Elt: MachineOperand::CreateImm(Val: LastInst->getOpcode()));
319 Cond.push_back(Elt: LastInst->getOperand(i: 0));
320 break;
321 case AArch64::TBZW:
322 case AArch64::TBZX:
323 case AArch64::TBNZW:
324 case AArch64::TBNZX:
325 Target = LastInst->getOperand(i: 2).getMBB();
326 Cond.push_back(Elt: MachineOperand::CreateImm(Val: -1));
327 Cond.push_back(Elt: MachineOperand::CreateImm(Val: LastInst->getOpcode()));
328 Cond.push_back(Elt: LastInst->getOperand(i: 0));
329 Cond.push_back(Elt: LastInst->getOperand(i: 1));
330 break;
331 case AArch64::CBWPri:
332 case AArch64::CBXPri:
333 case AArch64::CBWPrr:
334 case AArch64::CBXPrr:
335 Target = LastInst->getOperand(i: 3).getMBB();
336 Cond.push_back(Elt: MachineOperand::CreateImm(Val: -1));
337 Cond.push_back(Elt: MachineOperand::CreateImm(Val: LastInst->getOpcode()));
338 Cond.push_back(Elt: LastInst->getOperand(i: 0));
339 Cond.push_back(Elt: LastInst->getOperand(i: 1));
340 Cond.push_back(Elt: LastInst->getOperand(i: 2));
341 break;
342 case AArch64::CBBAssertExt:
343 case AArch64::CBHAssertExt:
344 Target = LastInst->getOperand(i: 3).getMBB();
345 Cond.push_back(Elt: MachineOperand::CreateImm(Val: -1)); // -1
346 Cond.push_back(Elt: MachineOperand::CreateImm(Val: LastInst->getOpcode())); // Opc
347 Cond.push_back(Elt: LastInst->getOperand(i: 0)); // Cond
348 Cond.push_back(Elt: LastInst->getOperand(i: 1)); // Op0
349 Cond.push_back(Elt: LastInst->getOperand(i: 2)); // Op1
350 Cond.push_back(Elt: LastInst->getOperand(i: 4)); // Ext0
351 Cond.push_back(Elt: LastInst->getOperand(i: 5)); // Ext1
352 break;
353 }
354}
355
356static unsigned getBranchDisplacementBits(unsigned Opc) {
357 switch (Opc) {
358 default:
359 llvm_unreachable("unexpected opcode!");
360 case AArch64::B:
361 return BDisplacementBits;
362 case AArch64::TBNZW:
363 case AArch64::TBZW:
364 case AArch64::TBNZX:
365 case AArch64::TBZX:
366 return TBZDisplacementBits;
367 case AArch64::CBNZW:
368 case AArch64::CBZW:
369 case AArch64::CBNZX:
370 case AArch64::CBZX:
371 return CBZDisplacementBits;
372 case AArch64::Bcc:
373 return BCCDisplacementBits;
374 case AArch64::CBWPri:
375 case AArch64::CBXPri:
376 case AArch64::CBBAssertExt:
377 case AArch64::CBHAssertExt:
378 case AArch64::CBWPrr:
379 case AArch64::CBXPrr:
380 return CBDisplacementBits;
381 }
382}
383
384bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
385 int64_t BrOffset) const {
386 unsigned Bits = getBranchDisplacementBits(Opc: BranchOp);
387 assert(Bits >= 3 && "max branch displacement must be enough to jump"
388 "over conditional branch expansion");
389 return isIntN(N: Bits, x: BrOffset / 4);
390}
391
392MachineBasicBlock *
393AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
394 switch (MI.getOpcode()) {
395 default:
396 llvm_unreachable("unexpected opcode!");
397 case AArch64::B:
398 return MI.getOperand(i: 0).getMBB();
399 case AArch64::TBZW:
400 case AArch64::TBNZW:
401 case AArch64::TBZX:
402 case AArch64::TBNZX:
403 return MI.getOperand(i: 2).getMBB();
404 case AArch64::CBZW:
405 case AArch64::CBNZW:
406 case AArch64::CBZX:
407 case AArch64::CBNZX:
408 case AArch64::Bcc:
409 return MI.getOperand(i: 1).getMBB();
410 case AArch64::CBWPri:
411 case AArch64::CBXPri:
412 case AArch64::CBBAssertExt:
413 case AArch64::CBHAssertExt:
414 case AArch64::CBWPrr:
415 case AArch64::CBXPrr:
416 return MI.getOperand(i: 3).getMBB();
417 }
418}
419
420void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
421 MachineBasicBlock &NewDestBB,
422 MachineBasicBlock &RestoreBB,
423 const DebugLoc &DL,
424 int64_t BrOffset,
425 RegScavenger *RS) const {
426 assert(RS && "RegScavenger required for long branching");
427 assert(MBB.empty() &&
428 "new block should be inserted for expanding unconditional branch");
429 assert(MBB.pred_size() == 1);
430 assert(RestoreBB.empty() &&
431 "restore block should be inserted for restoring clobbered registers");
432
433 auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
434 // Offsets outside of the signed 33-bit range are not supported for ADRP +
435 // ADD.
436 if (!isInt<33>(x: BrOffset))
437 report_fatal_error(
438 reason: "Branch offsets outside of the signed 33-bit range not supported");
439
440 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::ADRP), DestReg: Reg)
441 .addSym(Sym: DestBB.getSymbol(), TargetFlags: AArch64II::MO_PAGE);
442 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::ADDXri), DestReg: Reg)
443 .addReg(RegNo: Reg)
444 .addSym(Sym: DestBB.getSymbol(), TargetFlags: AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
445 .addImm(Val: 0);
446 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::BR)).addReg(RegNo: Reg);
447 };
448
449 RS->enterBasicBlockEnd(MBB);
450 // If X16 is unused, we can rely on the linker to insert a range extension
451 // thunk if NewDestBB is out of range of a single B instruction.
452 constexpr Register Reg = AArch64::X16;
453 if (!RS->isRegUsed(Reg)) {
454 insertUnconditionalBranch(MBB, DestBB: &NewDestBB, DL);
455 RS->setRegUsed(Reg);
456 return;
457 }
458
459 // In a cold block without BTI, insert the indirect branch if a register is
460 // free. Skip this if BTI is enabled to avoid inserting a BTI at the target,
461 // prioritizing a dynamic cost in cold code over a static cost in hot code.
462 AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
463 bool HasBTI = AFI && AFI->branchTargetEnforcement();
464 if (MBB.getSectionID() == MBBSectionID::ColdSectionID && !HasBTI) {
465 Register Scavenged = RS->FindUnusedReg(RC: &AArch64::GPR64RegClass);
466 if (Scavenged != AArch64::NoRegister) {
467 buildIndirectBranch(Scavenged, NewDestBB);
468 RS->setRegUsed(Reg: Scavenged);
469 return;
470 }
471 }
472
473 // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
474 // with red zones.
475 if (!AFI || AFI->hasRedZone().value_or(u: true))
476 report_fatal_error(
477 reason: "Unable to insert indirect branch inside function that has red zone");
478
479 // Otherwise, spill X16 and defer range extension to the linker.
480 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::STRXpre))
481 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
482 .addReg(RegNo: Reg)
483 .addReg(RegNo: AArch64::SP)
484 .addImm(Val: -16);
485
486 BuildMI(BB&: MBB, I: MBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::B)).addMBB(MBB: &RestoreBB);
487
488 BuildMI(BB&: RestoreBB, I: RestoreBB.end(), MIMD: DL, MCID: get(Opcode: AArch64::LDRXpost))
489 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
490 .addReg(RegNo: Reg, Flags: RegState::Define)
491 .addReg(RegNo: AArch64::SP)
492 .addImm(Val: 16);
493}
494
495// Branch analysis.
496bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
497 MachineBasicBlock *&TBB,
498 MachineBasicBlock *&FBB,
499 SmallVectorImpl<MachineOperand> &Cond,
500 bool AllowModify) const {
501 // If the block has no terminators, it just falls into the block after it.
502 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
503 if (I == MBB.end())
504 return false;
505
506 // Skip over SpeculationBarrierEndBB terminators
507 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
508 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
509 --I;
510 }
511
512 if (!isUnpredicatedTerminator(MI: *I))
513 return false;
514
515 // Get the last instruction in the block.
516 MachineInstr *LastInst = &*I;
517
518 // If there is only one terminator instruction, process it.
519 unsigned LastOpc = LastInst->getOpcode();
520 if (I == MBB.begin() || !isUnpredicatedTerminator(MI: *--I)) {
521 if (isUncondBranchOpcode(Opc: LastOpc)) {
522 TBB = LastInst->getOperand(i: 0).getMBB();
523 return false;
524 }
525 if (isCondBranchOpcode(Opc: LastOpc)) {
526 // Block ends with fall-through condbranch.
527 parseCondBranch(LastInst, Target&: TBB, Cond);
528 return false;
529 }
530 return true; // Can't handle indirect branch.
531 }
532
533 // Get the instruction before it if it is a terminator.
534 MachineInstr *SecondLastInst = &*I;
535 unsigned SecondLastOpc = SecondLastInst->getOpcode();
536
537 // If AllowModify is true and the block ends with two or more unconditional
538 // branches, delete all but the first unconditional branch.
539 if (AllowModify && isUncondBranchOpcode(Opc: LastOpc)) {
540 while (isUncondBranchOpcode(Opc: SecondLastOpc)) {
541 LastInst->eraseFromParent();
542 LastInst = SecondLastInst;
543 LastOpc = LastInst->getOpcode();
544 if (I == MBB.begin() || !isUnpredicatedTerminator(MI: *--I)) {
545 // Return now the only terminator is an unconditional branch.
546 TBB = LastInst->getOperand(i: 0).getMBB();
547 return false;
548 }
549 SecondLastInst = &*I;
550 SecondLastOpc = SecondLastInst->getOpcode();
551 }
552 }
553
554 // If we're allowed to modify and the block ends in a unconditional branch
555 // which could simply fallthrough, remove the branch. (Note: This case only
556 // matters when we can't understand the whole sequence, otherwise it's also
557 // handled by BranchFolding.cpp.)
558 if (AllowModify && isUncondBranchOpcode(Opc: LastOpc) &&
559 MBB.isLayoutSuccessor(MBB: getBranchDestBlock(MI: *LastInst))) {
560 LastInst->eraseFromParent();
561 LastInst = SecondLastInst;
562 LastOpc = LastInst->getOpcode();
563 if (I == MBB.begin() || !isUnpredicatedTerminator(MI: *--I)) {
564 assert(!isUncondBranchOpcode(LastOpc) &&
565 "unreachable unconditional branches removed above");
566
567 if (isCondBranchOpcode(Opc: LastOpc)) {
568 // Block ends with fall-through condbranch.
569 parseCondBranch(LastInst, Target&: TBB, Cond);
570 return false;
571 }
572 return true; // Can't handle indirect branch.
573 }
574 SecondLastInst = &*I;
575 SecondLastOpc = SecondLastInst->getOpcode();
576 }
577
578 // If there are three terminators, we don't know what sort of block this is.
579 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(MI: *--I))
580 return true;
581
582 // If the block ends with a B and a Bcc, handle it.
583 if (isCondBranchOpcode(Opc: SecondLastOpc) && isUncondBranchOpcode(Opc: LastOpc)) {
584 parseCondBranch(LastInst: SecondLastInst, Target&: TBB, Cond);
585 FBB = LastInst->getOperand(i: 0).getMBB();
586 return false;
587 }
588
589 // If the block ends with two unconditional branches, handle it. The second
590 // one is not executed, so remove it.
591 if (isUncondBranchOpcode(Opc: SecondLastOpc) && isUncondBranchOpcode(Opc: LastOpc)) {
592 TBB = SecondLastInst->getOperand(i: 0).getMBB();
593 I = LastInst;
594 if (AllowModify)
595 I->eraseFromParent();
596 return false;
597 }
598
599 // ...likewise if it ends with an indirect branch followed by an unconditional
600 // branch.
601 if (isIndirectBranchOpcode(Opc: SecondLastOpc) && isUncondBranchOpcode(Opc: LastOpc)) {
602 I = LastInst;
603 if (AllowModify)
604 I->eraseFromParent();
605 return true;
606 }
607
608 // Otherwise, can't handle this.
609 return true;
610}
611
612bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
613 MachineBranchPredicate &MBP,
614 bool AllowModify) const {
615 // Use analyzeBranch to validate the branch pattern.
616 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
617 SmallVector<MachineOperand, 4> Cond;
618 if (analyzeBranch(MBB, TBB, FBB, Cond, AllowModify))
619 return true;
620
621 // analyzeBranch returns success with empty Cond for unconditional branches.
622 if (Cond.empty())
623 return true;
624
625 MBP.TrueDest = TBB;
626 assert(MBP.TrueDest && "expected!");
627 MBP.FalseDest = FBB ? FBB : MBB.getNextNode();
628
629 MBP.ConditionDef = nullptr;
630 MBP.SingleUseCondition = false;
631
632 // Find the conditional branch. After analyzeBranch succeeds with non-empty
633 // Cond, there's exactly one conditional branch - either last (fallthrough)
634 // or second-to-last (followed by unconditional B).
635 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
636 if (I == MBB.end())
637 return true;
638
639 if (isUncondBranchOpcode(Opc: I->getOpcode())) {
640 if (I == MBB.begin())
641 return true;
642 --I;
643 }
644
645 MachineInstr *CondBranch = &*I;
646 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
647
648 switch (CondBranch->getOpcode()) {
649 default:
650 return true;
651
652 case AArch64::Bcc:
653 // Bcc takes the NZCV flag as the operand to branch on, walk up the
654 // instruction stream to find the last instruction to define NZCV.
655 for (MachineInstr &MI : llvm::drop_begin(RangeOrContainer: llvm::reverse(C&: MBB))) {
656 if (MI.modifiesRegister(Reg: AArch64::NZCV, /*TRI=*/nullptr)) {
657 MBP.ConditionDef = &MI;
658 break;
659 }
660 }
661 return false;
662
663 case AArch64::CBZW:
664 case AArch64::CBZX:
665 case AArch64::CBNZW:
666 case AArch64::CBNZX: {
667 MBP.LHS = CondBranch->getOperand(i: 0);
668 MBP.RHS = MachineOperand::CreateImm(Val: 0);
669 unsigned Opc = CondBranch->getOpcode();
670 MBP.Predicate = (Opc == AArch64::CBNZX || Opc == AArch64::CBNZW)
671 ? MachineBranchPredicate::PRED_NE
672 : MachineBranchPredicate::PRED_EQ;
673 Register CondReg = MBP.LHS.getReg();
674 if (CondReg.isVirtual())
675 MBP.ConditionDef = MRI.getVRegDef(Reg: CondReg);
676 return false;
677 }
678
679 case AArch64::TBZW:
680 case AArch64::TBZX:
681 case AArch64::TBNZW:
682 case AArch64::TBNZX: {
683 Register CondReg = CondBranch->getOperand(i: 0).getReg();
684 if (CondReg.isVirtual())
685 MBP.ConditionDef = MRI.getVRegDef(Reg: CondReg);
686 return false;
687 }
688 }
689}
690
691bool AArch64InstrInfo::reverseBranchCondition(
692 SmallVectorImpl<MachineOperand> &Cond) const {
693 if (Cond[0].getImm() != -1) {
694 // Regular Bcc
695 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
696 Cond[0].setImm(AArch64CC::getInvertedCondCode(Code: CC));
697 } else {
698 // Folded compare-and-branch
699 switch (Cond[1].getImm()) {
700 default:
701 llvm_unreachable("Unknown conditional branch!");
702 case AArch64::CBZW:
703 Cond[1].setImm(AArch64::CBNZW);
704 break;
705 case AArch64::CBNZW:
706 Cond[1].setImm(AArch64::CBZW);
707 break;
708 case AArch64::CBZX:
709 Cond[1].setImm(AArch64::CBNZX);
710 break;
711 case AArch64::CBNZX:
712 Cond[1].setImm(AArch64::CBZX);
713 break;
714 case AArch64::TBZW:
715 Cond[1].setImm(AArch64::TBNZW);
716 break;
717 case AArch64::TBNZW:
718 Cond[1].setImm(AArch64::TBZW);
719 break;
720 case AArch64::TBZX:
721 Cond[1].setImm(AArch64::TBNZX);
722 break;
723 case AArch64::TBNZX:
724 Cond[1].setImm(AArch64::TBZX);
725 break;
726
727 // Cond is { -1, Opcode, CC, Op0, Op1, ... }
728 case AArch64::CBWPri:
729 case AArch64::CBXPri:
730 case AArch64::CBBAssertExt:
731 case AArch64::CBHAssertExt:
732 case AArch64::CBWPrr:
733 case AArch64::CBXPrr: {
734 // Pseudos using standard 4bit Arm condition codes
735 AArch64CC::CondCode CC =
736 static_cast<AArch64CC::CondCode>(Cond[2].getImm());
737 Cond[2].setImm(AArch64CC::getInvertedCondCode(Code: CC));
738 }
739 }
740 }
741
742 return false;
743}
744
745unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
746 int *BytesRemoved) const {
747 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
748 if (I == MBB.end())
749 return 0;
750
751 if (!isUncondBranchOpcode(Opc: I->getOpcode()) &&
752 !isCondBranchOpcode(Opc: I->getOpcode()))
753 return 0;
754
755 // Remove the branch.
756 I->eraseFromParent();
757
758 I = MBB.end();
759
760 if (I == MBB.begin()) {
761 if (BytesRemoved)
762 *BytesRemoved = 4;
763 return 1;
764 }
765 --I;
766 if (!isCondBranchOpcode(Opc: I->getOpcode())) {
767 if (BytesRemoved)
768 *BytesRemoved = 4;
769 return 1;
770 }
771
772 // Remove the branch.
773 I->eraseFromParent();
774 if (BytesRemoved)
775 *BytesRemoved = 8;
776
777 return 2;
778}
779
780void AArch64InstrInfo::instantiateCondBranch(
781 MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
782 ArrayRef<MachineOperand> Cond) const {
783 if (Cond[0].getImm() != -1) {
784 // Regular Bcc
785 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AArch64::Bcc)).addImm(Val: Cond[0].getImm()).addMBB(MBB: TBB);
786 } else {
787 // Folded compare-and-branch
788 // Note that we use addOperand instead of addReg to keep the flags.
789
790 // cbz, cbnz
791 const MachineInstrBuilder MIB =
792 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: Cond[1].getImm())).add(MO: Cond[2]);
793
794 // tbz/tbnz
795 if (Cond.size() > 3)
796 MIB.add(MO: Cond[3]);
797
798 // cb
799 if (Cond.size() > 4)
800 MIB.add(MO: Cond[4]);
801
802 MIB.addMBB(MBB: TBB);
803
804 // cb[b,h]
805 if (Cond.size() > 5) {
806 MIB.addImm(Val: Cond[5].getImm());
807 MIB.addImm(Val: Cond[6].getImm());
808 }
809 }
810}
811
812unsigned AArch64InstrInfo::insertBranch(
813 MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
814 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
815 // Shouldn't be a fall through.
816 assert(TBB && "insertBranch must not be told to insert a fallthrough");
817
818 if (!FBB) {
819 if (Cond.empty()) // Unconditional branch?
820 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AArch64::B)).addMBB(MBB: TBB);
821 else
822 instantiateCondBranch(MBB, DL, TBB, Cond);
823
824 if (BytesAdded)
825 *BytesAdded = 4;
826
827 return 1;
828 }
829
830 // Two-way conditional branch.
831 instantiateCondBranch(MBB, DL, TBB, Cond);
832 BuildMI(BB: &MBB, MIMD: DL, MCID: get(Opcode: AArch64::B)).addMBB(MBB: FBB);
833
834 if (BytesAdded)
835 *BytesAdded = 8;
836
837 return 2;
838}
839
840bool llvm::optimizeTerminators(MachineBasicBlock *MBB,
841 const TargetInstrInfo &TII) {
842 for (MachineInstr &MI : MBB->terminators()) {
843 unsigned Opc = MI.getOpcode();
844 switch (Opc) {
845 case AArch64::CBZW:
846 case AArch64::CBZX:
847 case AArch64::TBZW:
848 case AArch64::TBZX:
849 // CBZ/TBZ with WZR/XZR -> unconditional B
850 if (MI.getOperand(i: 0).getReg() == AArch64::WZR ||
851 MI.getOperand(i: 0).getReg() == AArch64::XZR) {
852 DEBUG_WITH_TYPE("optimizeTerminators",
853 dbgs() << "Removing always taken branch: " << MI);
854 MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
855 SmallVector<MachineBasicBlock *> Succs(MBB->successors());
856 for (auto *S : Succs)
857 if (S != Target)
858 MBB->removeSuccessor(Succ: S);
859 DebugLoc DL = MI.getDebugLoc();
860 while (MBB->rbegin() != &MI)
861 MBB->rbegin()->eraseFromParent();
862 MI.eraseFromParent();
863 BuildMI(BB: MBB, MIMD: DL, MCID: TII.get(Opcode: AArch64::B)).addMBB(MBB: Target);
864 return true;
865 }
866 break;
867 case AArch64::CBNZW:
868 case AArch64::CBNZX:
869 case AArch64::TBNZW:
870 case AArch64::TBNZX:
871 // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
872 if (MI.getOperand(i: 0).getReg() == AArch64::WZR ||
873 MI.getOperand(i: 0).getReg() == AArch64::XZR) {
874 DEBUG_WITH_TYPE("optimizeTerminators",
875 dbgs() << "Removing never taken branch: " << MI);
876 MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
877 MI.getParent()->removeSuccessor(Succ: Target);
878 MI.eraseFromParent();
879 return true;
880 }
881 break;
882 }
883 }
884 return false;
885}
886
887// Find the original register that VReg is copied from.
888static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
889 while (Register::isVirtualRegister(Reg: VReg)) {
890 const MachineInstr *DefMI = MRI.getVRegDef(Reg: VReg);
891 if (!DefMI->isFullCopy())
892 return VReg;
893 VReg = DefMI->getOperand(i: 1).getReg();
894 }
895 return VReg;
896}
897
898// Determine if VReg is defined by an instruction that can be folded into a
899// csel instruction. If so, return the folded opcode, and the replacement
900// register.
901static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
902 unsigned *NewReg = nullptr) {
903 VReg = removeCopies(MRI, VReg);
904 if (!Register::isVirtualRegister(Reg: VReg))
905 return 0;
906
907 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(RC: MRI.getRegClass(Reg: VReg));
908 const MachineInstr *DefMI = MRI.getVRegDef(Reg: VReg);
909 unsigned Opc = 0;
910 unsigned SrcReg = 0;
911 switch (DefMI->getOpcode()) {
912 case AArch64::SUBREG_TO_REG:
913 // Check for the following way to define an 64-bit immediate:
914 // %0:gpr32 = MOVi32imm 1
915 // %1:gpr64 = SUBREG_TO_REG %0:gpr32, %subreg.sub_32
916 if (!DefMI->getOperand(i: 1).isReg())
917 return 0;
918 if (!DefMI->getOperand(i: 2).isImm() ||
919 DefMI->getOperand(i: 2).getImm() != AArch64::sub_32)
920 return 0;
921 DefMI = MRI.getVRegDef(Reg: DefMI->getOperand(i: 1).getReg());
922 if (DefMI->getOpcode() != AArch64::MOVi32imm)
923 return 0;
924 if (!DefMI->getOperand(i: 1).isImm() || DefMI->getOperand(i: 1).getImm() != 1)
925 return 0;
926 assert(Is64Bit);
927 SrcReg = AArch64::XZR;
928 Opc = AArch64::CSINCXr;
929 break;
930
931 case AArch64::MOVi32imm:
932 case AArch64::MOVi64imm:
933 if (!DefMI->getOperand(i: 1).isImm() || DefMI->getOperand(i: 1).getImm() != 1)
934 return 0;
935 SrcReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
936 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
937 break;
938
939 case AArch64::ADDSXri:
940 case AArch64::ADDSWri:
941 // if NZCV is used, do not fold.
942 if (DefMI->findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr,
943 isDead: true) == -1)
944 return 0;
945 // fall-through to ADDXri and ADDWri.
946 [[fallthrough]];
947 case AArch64::ADDXri:
948 case AArch64::ADDWri:
949 // add x, 1 -> csinc.
950 if (!DefMI->getOperand(i: 2).isImm() || DefMI->getOperand(i: 2).getImm() != 1 ||
951 DefMI->getOperand(i: 3).getImm() != 0)
952 return 0;
953 SrcReg = DefMI->getOperand(i: 1).getReg();
954 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
955 break;
956
957 case AArch64::ORNXrr:
958 case AArch64::ORNWrr: {
959 // not x -> csinv, represented as orn dst, xzr, src.
960 unsigned ZReg = removeCopies(MRI, VReg: DefMI->getOperand(i: 1).getReg());
961 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
962 return 0;
963 SrcReg = DefMI->getOperand(i: 2).getReg();
964 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
965 break;
966 }
967
968 case AArch64::SUBSXrr:
969 case AArch64::SUBSWrr:
970 // if NZCV is used, do not fold.
971 if (DefMI->findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr,
972 isDead: true) == -1)
973 return 0;
974 // fall-through to SUBXrr and SUBWrr.
975 [[fallthrough]];
976 case AArch64::SUBXrr:
977 case AArch64::SUBWrr: {
978 // neg x -> csneg, represented as sub dst, xzr, src.
979 unsigned ZReg = removeCopies(MRI, VReg: DefMI->getOperand(i: 1).getReg());
980 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
981 return 0;
982 SrcReg = DefMI->getOperand(i: 2).getReg();
983 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
984 break;
985 }
986 default:
987 return 0;
988 }
989 assert(Opc && SrcReg && "Missing parameters");
990
991 if (NewReg)
992 *NewReg = SrcReg;
993 return Opc;
994}
995
996bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
997 ArrayRef<MachineOperand> Cond,
998 Register DstReg, Register TrueReg,
999 Register FalseReg, int &CondCycles,
1000 int &TrueCycles,
1001 int &FalseCycles) const {
1002 // Check register classes.
1003 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1004 const TargetRegisterClass *RC =
1005 RI.getCommonSubClass(A: MRI.getRegClass(Reg: TrueReg), B: MRI.getRegClass(Reg: FalseReg));
1006 if (!RC)
1007 return false;
1008
1009 // Also need to check the dest regclass, in case we're trying to optimize
1010 // something like:
1011 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
1012 if (!RI.getCommonSubClass(A: RC, B: MRI.getRegClass(Reg: DstReg)))
1013 return false;
1014
1015 // Expanding cbz/tbz requires an extra cycle of latency on the condition.
1016 unsigned ExtraCondLat = Cond.size() != 1;
1017
1018 // GPRs are handled by csel.
1019 // FIXME: Fold in x+1, -x, and ~x when applicable.
1020 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
1021 AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
1022 // Single-cycle csel, csinc, csinv, and csneg.
1023 CondCycles = 1 + ExtraCondLat;
1024 TrueCycles = FalseCycles = 1;
1025 if (canFoldIntoCSel(MRI, VReg: TrueReg))
1026 TrueCycles = 0;
1027 else if (canFoldIntoCSel(MRI, VReg: FalseReg))
1028 FalseCycles = 0;
1029 return true;
1030 }
1031
1032 // Scalar floating point is handled by fcsel.
1033 // FIXME: Form fabs, fmin, and fmax when applicable.
1034 if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
1035 AArch64::FPR32RegClass.hasSubClassEq(RC)) {
1036 CondCycles = 5 + ExtraCondLat;
1037 TrueCycles = FalseCycles = 2;
1038 return true;
1039 }
1040
1041 // Can't do vectors.
1042 return false;
1043}
1044
1045void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
1046 MachineBasicBlock::iterator I,
1047 const DebugLoc &DL, Register DstReg,
1048 ArrayRef<MachineOperand> Cond,
1049 Register TrueReg, Register FalseReg) const {
1050 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1051
1052 // Parse the condition code, see parseCondBranch() above.
1053 AArch64CC::CondCode CC;
1054 switch (Cond.size()) {
1055 default:
1056 llvm_unreachable("Unknown condition opcode in Cond");
1057 case 1: // b.cc
1058 CC = AArch64CC::CondCode(Cond[0].getImm());
1059 break;
1060 case 3: { // cbz/cbnz
1061 // We must insert a compare against 0.
1062 bool Is64Bit;
1063 switch (Cond[1].getImm()) {
1064 default:
1065 llvm_unreachable("Unknown branch opcode in Cond");
1066 case AArch64::CBZW:
1067 Is64Bit = false;
1068 CC = AArch64CC::EQ;
1069 break;
1070 case AArch64::CBZX:
1071 Is64Bit = true;
1072 CC = AArch64CC::EQ;
1073 break;
1074 case AArch64::CBNZW:
1075 Is64Bit = false;
1076 CC = AArch64CC::NE;
1077 break;
1078 case AArch64::CBNZX:
1079 Is64Bit = true;
1080 CC = AArch64CC::NE;
1081 break;
1082 }
1083 Register SrcReg = Cond[2].getReg();
1084 if (Is64Bit) {
1085 // cmp reg, #0 is actually subs xzr, reg, #0.
1086 MRI.constrainRegClass(Reg: SrcReg, RC: &AArch64::GPR64spRegClass);
1087 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::SUBSXri), DestReg: AArch64::XZR)
1088 .addReg(RegNo: SrcReg)
1089 .addImm(Val: 0)
1090 .addImm(Val: 0);
1091 } else {
1092 MRI.constrainRegClass(Reg: SrcReg, RC: &AArch64::GPR32spRegClass);
1093 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::SUBSWri), DestReg: AArch64::WZR)
1094 .addReg(RegNo: SrcReg)
1095 .addImm(Val: 0)
1096 .addImm(Val: 0);
1097 }
1098 break;
1099 }
1100 case 4: { // tbz/tbnz
1101 // We must insert a tst instruction.
1102 switch (Cond[1].getImm()) {
1103 default:
1104 llvm_unreachable("Unknown branch opcode in Cond");
1105 case AArch64::TBZW:
1106 case AArch64::TBZX:
1107 CC = AArch64CC::EQ;
1108 break;
1109 case AArch64::TBNZW:
1110 case AArch64::TBNZX:
1111 CC = AArch64CC::NE;
1112 break;
1113 }
1114 // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
1115 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
1116 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ANDSWri), DestReg: AArch64::WZR)
1117 .addReg(RegNo: Cond[2].getReg())
1118 .addImm(
1119 Val: AArch64_AM::encodeLogicalImmediate(imm: 1ull << Cond[3].getImm(), regSize: 32));
1120 else
1121 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ANDSXri), DestReg: AArch64::XZR)
1122 .addReg(RegNo: Cond[2].getReg())
1123 .addImm(
1124 Val: AArch64_AM::encodeLogicalImmediate(imm: 1ull << Cond[3].getImm(), regSize: 64));
1125 break;
1126 }
1127 case 5: { // cb
1128 // We must insert a cmp, that is a subs
1129 // 0 1 2 3 4
1130 // Cond is { -1, Opcode, CC, Op0, Op1 }
1131
1132 unsigned SubsOpc, SubsDestReg;
1133 bool IsImm = false;
1134 CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
1135 switch (Cond[1].getImm()) {
1136 default:
1137 llvm_unreachable("Unknown branch opcode in Cond");
1138 case AArch64::CBWPri:
1139 SubsOpc = AArch64::SUBSWri;
1140 SubsDestReg = AArch64::WZR;
1141 IsImm = true;
1142 break;
1143 case AArch64::CBXPri:
1144 SubsOpc = AArch64::SUBSXri;
1145 SubsDestReg = AArch64::XZR;
1146 IsImm = true;
1147 break;
1148 case AArch64::CBWPrr:
1149 SubsOpc = AArch64::SUBSWrr;
1150 SubsDestReg = AArch64::WZR;
1151 IsImm = false;
1152 break;
1153 case AArch64::CBXPrr:
1154 SubsOpc = AArch64::SUBSXrr;
1155 SubsDestReg = AArch64::XZR;
1156 IsImm = false;
1157 break;
1158 }
1159
1160 if (IsImm)
1161 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: SubsOpc), DestReg: SubsDestReg)
1162 .addReg(RegNo: Cond[3].getReg())
1163 .addImm(Val: Cond[4].getImm())
1164 .addImm(Val: 0);
1165 else
1166 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: SubsOpc), DestReg: SubsDestReg)
1167 .addReg(RegNo: Cond[3].getReg())
1168 .addReg(RegNo: Cond[4].getReg());
1169 } break;
1170 case 7: { // cb[b,h]
1171 // We must insert a cmp, that is a subs, but also zero- or sign-extensions
1172 // that have been folded. For the first operand we codegen an explicit
1173 // extension, for the second operand we fold the extension into cmp.
1174 // 0 1 2 3 4 5 6
1175 // Cond is { -1, Opcode, CC, Op0, Op1, Ext0, Ext1 }
1176
1177 // We need a new register for the now explicitly extended register
1178 Register Reg = Cond[4].getReg();
1179 if (Cond[5].getImm() != AArch64_AM::InvalidShiftExtend) {
1180 unsigned ExtOpc;
1181 unsigned ExtBits;
1182 AArch64_AM::ShiftExtendType ExtendType =
1183 AArch64_AM::getExtendType(Imm: Cond[5].getImm());
1184 switch (ExtendType) {
1185 default:
1186 llvm_unreachable("Unknown shift-extend for CB instruction");
1187 case AArch64_AM::SXTB:
1188 assert(
1189 Cond[1].getImm() == AArch64::CBBAssertExt &&
1190 "Unexpected compare-and-branch instruction for SXTB shift-extend");
1191 ExtOpc = AArch64::SBFMWri;
1192 ExtBits = AArch64_AM::encodeLogicalImmediate(imm: 0xff, regSize: 32);
1193 break;
1194 case AArch64_AM::SXTH:
1195 assert(
1196 Cond[1].getImm() == AArch64::CBHAssertExt &&
1197 "Unexpected compare-and-branch instruction for SXTH shift-extend");
1198 ExtOpc = AArch64::SBFMWri;
1199 ExtBits = AArch64_AM::encodeLogicalImmediate(imm: 0xffff, regSize: 32);
1200 break;
1201 case AArch64_AM::UXTB:
1202 assert(
1203 Cond[1].getImm() == AArch64::CBBAssertExt &&
1204 "Unexpected compare-and-branch instruction for UXTB shift-extend");
1205 ExtOpc = AArch64::ANDWri;
1206 ExtBits = AArch64_AM::encodeLogicalImmediate(imm: 0xff, regSize: 32);
1207 break;
1208 case AArch64_AM::UXTH:
1209 assert(
1210 Cond[1].getImm() == AArch64::CBHAssertExt &&
1211 "Unexpected compare-and-branch instruction for UXTH shift-extend");
1212 ExtOpc = AArch64::ANDWri;
1213 ExtBits = AArch64_AM::encodeLogicalImmediate(imm: 0xffff, regSize: 32);
1214 break;
1215 }
1216
1217 // Build the explicit extension of the first operand
1218 Reg = MRI.createVirtualRegister(RegClass: &AArch64::GPR32spRegClass);
1219 MachineInstrBuilder MBBI =
1220 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: ExtOpc), DestReg: Reg).addReg(RegNo: Cond[4].getReg());
1221 if (ExtOpc != AArch64::ANDWri)
1222 MBBI.addImm(Val: 0);
1223 MBBI.addImm(Val: ExtBits);
1224 }
1225
1226 // Now, subs with an extended second operand
1227 if (Cond[6].getImm() != AArch64_AM::InvalidShiftExtend) {
1228 AArch64_AM::ShiftExtendType ExtendType =
1229 AArch64_AM::getExtendType(Imm: Cond[6].getImm());
1230 MRI.constrainRegClass(Reg, RC: MRI.getRegClass(Reg: Cond[3].getReg()));
1231 MRI.constrainRegClass(Reg: Cond[3].getReg(), RC: &AArch64::GPR32spRegClass);
1232 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::SUBSWrx), DestReg: AArch64::WZR)
1233 .addReg(RegNo: Cond[3].getReg())
1234 .addReg(RegNo: Reg)
1235 .addImm(Val: AArch64_AM::getArithExtendImm(ET: ExtendType, Imm: 0));
1236 } // If no extension is needed, just a regular subs
1237 else {
1238 MRI.constrainRegClass(Reg, RC: MRI.getRegClass(Reg: Cond[3].getReg()));
1239 MRI.constrainRegClass(Reg: Cond[3].getReg(), RC: &AArch64::GPR32spRegClass);
1240 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::SUBSWrr), DestReg: AArch64::WZR)
1241 .addReg(RegNo: Cond[3].getReg())
1242 .addReg(RegNo: Reg);
1243 }
1244
1245 CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
1246 } break;
1247 }
1248
1249 unsigned Opc = 0;
1250 const TargetRegisterClass *RC = nullptr;
1251 bool TryFold = false;
1252 if (MRI.constrainRegClass(Reg: DstReg, RC: &AArch64::GPR64RegClass)) {
1253 RC = &AArch64::GPR64RegClass;
1254 Opc = AArch64::CSELXr;
1255 TryFold = true;
1256 } else if (MRI.constrainRegClass(Reg: DstReg, RC: &AArch64::GPR32RegClass)) {
1257 RC = &AArch64::GPR32RegClass;
1258 Opc = AArch64::CSELWr;
1259 TryFold = true;
1260 } else if (MRI.constrainRegClass(Reg: DstReg, RC: &AArch64::FPR64RegClass)) {
1261 RC = &AArch64::FPR64RegClass;
1262 Opc = AArch64::FCSELDrrr;
1263 } else if (MRI.constrainRegClass(Reg: DstReg, RC: &AArch64::FPR32RegClass)) {
1264 RC = &AArch64::FPR32RegClass;
1265 Opc = AArch64::FCSELSrrr;
1266 }
1267 assert(RC && "Unsupported regclass");
1268
1269 // Try folding simple instructions into the csel.
1270 if (TryFold) {
1271 unsigned NewReg = 0;
1272 unsigned FoldedOpc = canFoldIntoCSel(MRI, VReg: TrueReg, NewReg: &NewReg);
1273 if (FoldedOpc) {
1274 // The folded opcodes csinc, csinc and csneg apply the operation to
1275 // FalseReg, so we need to invert the condition.
1276 CC = AArch64CC::getInvertedCondCode(Code: CC);
1277 TrueReg = FalseReg;
1278 } else
1279 FoldedOpc = canFoldIntoCSel(MRI, VReg: FalseReg, NewReg: &NewReg);
1280
1281 // Fold the operation. Leave any dead instructions for DCE to clean up.
1282 if (FoldedOpc) {
1283 FalseReg = NewReg;
1284 Opc = FoldedOpc;
1285 // Extend the live range of NewReg.
1286 MRI.clearKillFlags(Reg: NewReg);
1287 }
1288 }
1289
1290 // Pull all virtual register into the appropriate class.
1291 MRI.constrainRegClass(Reg: TrueReg, RC);
1292 // FalseReg might be WZR or XZR if the folded operand is a literal 1.
1293 assert(
1294 (FalseReg.isVirtual() || FalseReg == AArch64::WZR ||
1295 FalseReg == AArch64::XZR) &&
1296 "FalseReg was folded into a non-virtual register other than WZR or XZR");
1297 if (FalseReg.isVirtual())
1298 MRI.constrainRegClass(Reg: FalseReg, RC);
1299
1300 // Insert the csel.
1301 BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: Opc), DestReg: DstReg)
1302 .addReg(RegNo: TrueReg)
1303 .addReg(RegNo: FalseReg)
1304 .addImm(Val: CC);
1305}
1306
1307// Return true if Imm can be loaded into a register by a "cheap" sequence of
1308// instructions. For now, "cheap" means at most two instructions.
1309static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
1310 if (BitSize == 32)
1311 return true;
1312
1313 assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
1314 uint64_t Imm = static_cast<uint64_t>(MI.getOperand(i: 1).getImm());
1315 SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
1316 AArch64_IMM::expandMOVImm(Imm, BitSize, Insn&: Is);
1317
1318 return Is.size() <= 2;
1319}
1320
1321// Check if a COPY instruction is cheap.
1322static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) {
1323 assert(MI.isCopy() && "Expected COPY instruction");
1324 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
1325
1326 // Cross-bank copies (e.g., between GPR and FPR) are expensive on AArch64,
1327 // typically requiring an FMOV instruction with a 2-6 cycle latency.
1328 auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * {
1329 if (Reg.isVirtual())
1330 return MRI.getRegClass(Reg);
1331 if (Reg.isPhysical())
1332 return RI.getMinimalPhysRegClass(Reg);
1333 return nullptr;
1334 };
1335 const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(i: 0).getReg());
1336 const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(i: 1).getReg());
1337 if (DstRC && SrcRC && !RI.getCommonSubClass(A: DstRC, B: SrcRC))
1338 return false;
1339
1340 return MI.isAsCheapAsAMove();
1341}
1342
1343// FIXME: this implementation should be micro-architecture dependent, so a
1344// micro-architecture target hook should be introduced here in future.
1345bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
1346 if (Subtarget.hasExynosCheapAsMoveHandling()) {
1347 if (isExynosCheapAsMove(MI))
1348 return true;
1349 return MI.isAsCheapAsAMove();
1350 }
1351
1352 switch (MI.getOpcode()) {
1353 default:
1354 return MI.isAsCheapAsAMove();
1355
1356 case TargetOpcode::COPY:
1357 return isCheapCopy(MI, RI);
1358
1359 case AArch64::ADDWrs:
1360 case AArch64::ADDXrs:
1361 case AArch64::SUBWrs:
1362 case AArch64::SUBXrs:
1363 return Subtarget.hasALULSLFast() && MI.getOperand(i: 3).getImm() <= 4;
1364
1365 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
1366 // ORRXri, it is as cheap as MOV.
1367 // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
1368 case AArch64::MOVi32imm:
1369 return isCheapImmediate(MI, BitSize: 32);
1370 case AArch64::MOVi64imm:
1371 return isCheapImmediate(MI, BitSize: 64);
1372 }
1373}
1374
1375bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
1376 switch (MI.getOpcode()) {
1377 default:
1378 return false;
1379
1380 case AArch64::ADDWrs:
1381 case AArch64::ADDXrs:
1382 case AArch64::ADDSWrs:
1383 case AArch64::ADDSXrs: {
1384 unsigned Imm = MI.getOperand(i: 3).getImm();
1385 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
1386 if (ShiftVal == 0)
1387 return true;
1388 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
1389 }
1390
1391 case AArch64::ADDWrx:
1392 case AArch64::ADDXrx:
1393 case AArch64::ADDXrx64:
1394 case AArch64::ADDSWrx:
1395 case AArch64::ADDSXrx:
1396 case AArch64::ADDSXrx64: {
1397 unsigned Imm = MI.getOperand(i: 3).getImm();
1398 switch (AArch64_AM::getArithExtendType(Imm)) {
1399 default:
1400 return false;
1401 case AArch64_AM::UXTB:
1402 case AArch64_AM::UXTH:
1403 case AArch64_AM::UXTW:
1404 case AArch64_AM::UXTX:
1405 return AArch64_AM::getArithShiftValue(Imm) <= 4;
1406 }
1407 }
1408
1409 case AArch64::SUBWrs:
1410 case AArch64::SUBSWrs: {
1411 unsigned Imm = MI.getOperand(i: 3).getImm();
1412 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
1413 return ShiftVal == 0 ||
1414 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
1415 }
1416
1417 case AArch64::SUBXrs:
1418 case AArch64::SUBSXrs: {
1419 unsigned Imm = MI.getOperand(i: 3).getImm();
1420 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
1421 return ShiftVal == 0 ||
1422 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
1423 }
1424
1425 case AArch64::SUBWrx:
1426 case AArch64::SUBXrx:
1427 case AArch64::SUBXrx64:
1428 case AArch64::SUBSWrx:
1429 case AArch64::SUBSXrx:
1430 case AArch64::SUBSXrx64: {
1431 unsigned Imm = MI.getOperand(i: 3).getImm();
1432 switch (AArch64_AM::getArithExtendType(Imm)) {
1433 default:
1434 return false;
1435 case AArch64_AM::UXTB:
1436 case AArch64_AM::UXTH:
1437 case AArch64_AM::UXTW:
1438 case AArch64_AM::UXTX:
1439 return AArch64_AM::getArithShiftValue(Imm) == 0;
1440 }
1441 }
1442
1443 case AArch64::LDRBBroW:
1444 case AArch64::LDRBBroX:
1445 case AArch64::LDRBroW:
1446 case AArch64::LDRBroX:
1447 case AArch64::LDRDroW:
1448 case AArch64::LDRDroX:
1449 case AArch64::LDRHHroW:
1450 case AArch64::LDRHHroX:
1451 case AArch64::LDRHroW:
1452 case AArch64::LDRHroX:
1453 case AArch64::LDRQroW:
1454 case AArch64::LDRQroX:
1455 case AArch64::LDRSBWroW:
1456 case AArch64::LDRSBWroX:
1457 case AArch64::LDRSBXroW:
1458 case AArch64::LDRSBXroX:
1459 case AArch64::LDRSHWroW:
1460 case AArch64::LDRSHWroX:
1461 case AArch64::LDRSHXroW:
1462 case AArch64::LDRSHXroX:
1463 case AArch64::LDRSWroW:
1464 case AArch64::LDRSWroX:
1465 case AArch64::LDRSroW:
1466 case AArch64::LDRSroX:
1467 case AArch64::LDRWroW:
1468 case AArch64::LDRWroX:
1469 case AArch64::LDRXroW:
1470 case AArch64::LDRXroX:
1471 case AArch64::PRFMroW:
1472 case AArch64::PRFMroX:
1473 case AArch64::STRBBroW:
1474 case AArch64::STRBBroX:
1475 case AArch64::STRBroW:
1476 case AArch64::STRBroX:
1477 case AArch64::STRDroW:
1478 case AArch64::STRDroX:
1479 case AArch64::STRHHroW:
1480 case AArch64::STRHHroX:
1481 case AArch64::STRHroW:
1482 case AArch64::STRHroX:
1483 case AArch64::STRQroW:
1484 case AArch64::STRQroX:
1485 case AArch64::STRSroW:
1486 case AArch64::STRSroX:
1487 case AArch64::STRWroW:
1488 case AArch64::STRWroX:
1489 case AArch64::STRXroW:
1490 case AArch64::STRXroX: {
1491 unsigned IsSigned = MI.getOperand(i: 3).getImm();
1492 return !IsSigned;
1493 }
1494 }
1495}
1496
1497bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1498 unsigned Opc = MI.getOpcode();
1499 switch (Opc) {
1500 default:
1501 return false;
1502 case AArch64::SEH_StackAlloc:
1503 case AArch64::SEH_SaveFPLR:
1504 case AArch64::SEH_SaveFPLR_X:
1505 case AArch64::SEH_SaveReg:
1506 case AArch64::SEH_SaveReg_X:
1507 case AArch64::SEH_SaveRegP:
1508 case AArch64::SEH_SaveRegP_X:
1509 case AArch64::SEH_SaveFReg:
1510 case AArch64::SEH_SaveFReg_X:
1511 case AArch64::SEH_SaveFRegP:
1512 case AArch64::SEH_SaveFRegP_X:
1513 case AArch64::SEH_SetFP:
1514 case AArch64::SEH_AddFP:
1515 case AArch64::SEH_Nop:
1516 case AArch64::SEH_PrologEnd:
1517 case AArch64::SEH_EpilogStart:
1518 case AArch64::SEH_EpilogEnd:
1519 case AArch64::SEH_PACSignLR:
1520 case AArch64::SEH_SaveAnyRegI:
1521 case AArch64::SEH_SaveAnyRegIP:
1522 case AArch64::SEH_SaveAnyRegQP:
1523 case AArch64::SEH_SaveAnyRegQPX:
1524 case AArch64::SEH_AllocZ:
1525 case AArch64::SEH_SaveZReg:
1526 case AArch64::SEH_SavePReg:
1527 return true;
1528 }
1529}
1530
1531bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
1532 Register &SrcReg, Register &DstReg,
1533 unsigned &SubIdx) const {
1534 switch (MI.getOpcode()) {
1535 default:
1536 return false;
1537 case AArch64::SBFMXri: // aka sxtw
1538 case AArch64::UBFMXri: // aka uxtw
1539 // Check for the 32 -> 64 bit extension case, these instructions can do
1540 // much more.
1541 if (MI.getOperand(i: 2).getImm() != 0 || MI.getOperand(i: 3).getImm() != 31)
1542 return false;
1543 // This is a signed or unsigned 32 -> 64 bit extension.
1544 SrcReg = MI.getOperand(i: 1).getReg();
1545 DstReg = MI.getOperand(i: 0).getReg();
1546 SubIdx = AArch64::sub_32;
1547 return true;
1548 }
1549}
1550
1551bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
1552 const MachineInstr &MIa, const MachineInstr &MIb) const {
1553 const TargetRegisterInfo *TRI = &getRegisterInfo();
1554 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1555 int64_t OffsetA = 0, OffsetB = 0;
1556 TypeSize WidthA(0, false), WidthB(0, false);
1557 bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1558
1559 assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1560 assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1561
1562 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
1563 MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1564 return false;
1565
1566 // Retrieve the base, offset from the base and width. Width
1567 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
1568 // base are identical, and the offset of a lower memory access +
1569 // the width doesn't overlap the offset of a higher memory access,
1570 // then the memory accesses are different.
1571 // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1572 // are assumed to have the same scale (vscale).
1573 if (getMemOperandWithOffsetWidth(MI: MIa, BaseOp&: BaseOpA, Offset&: OffsetA, OffsetIsScalable&: OffsetAIsScalable,
1574 Width&: WidthA, TRI) &&
1575 getMemOperandWithOffsetWidth(MI: MIb, BaseOp&: BaseOpB, Offset&: OffsetB, OffsetIsScalable&: OffsetBIsScalable,
1576 Width&: WidthB, TRI)) {
1577 if (BaseOpA->isIdenticalTo(Other: *BaseOpB) &&
1578 OffsetAIsScalable == OffsetBIsScalable) {
1579 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1580 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1581 TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1582 if (LowWidth.isScalable() == OffsetAIsScalable &&
1583 LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
1584 return true;
1585 }
1586 }
1587 return false;
1588}
1589
1590bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1591 const MachineBasicBlock *MBB,
1592 const MachineFunction &MF) const {
1593 if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1594 return true;
1595
1596 // Do not move an instruction that can be recognized as a branch target.
1597 if (hasBTISemantics(MI))
1598 return true;
1599
1600 switch (MI.getOpcode()) {
1601 case AArch64::HINT:
1602 // CSDB hints are scheduling barriers.
1603 if (MI.getOperand(i: 0).getImm() == 0x14)
1604 return true;
1605 break;
1606 case AArch64::DSB:
1607 case AArch64::ISB:
1608 // DSB and ISB also are scheduling barriers.
1609 return true;
1610 case AArch64::MSRpstatesvcrImm1:
1611 // SMSTART and SMSTOP are also scheduling barriers.
1612 return true;
1613 default:;
1614 }
1615 if (isSEHInstruction(MI))
1616 return true;
1617 auto Next = std::next(x: MI.getIterator());
1618 return Next != MBB->end() && Next->isCFIInstruction();
1619}
1620
1621/// analyzeCompare - For a comparison instruction, return the source registers
1622/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1623/// Return true if the comparison instruction can be analyzed.
1624bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
1625 Register &SrcReg2, int64_t &CmpMask,
1626 int64_t &CmpValue) const {
1627 // The first operand can be a frame index where we'd normally expect a
1628 // register.
1629 // FIXME: Pass subregisters out of analyzeCompare
1630 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1631 if (!MI.getOperand(i: 1).isReg() || MI.getOperand(i: 1).getSubReg())
1632 return false;
1633
1634 switch (MI.getOpcode()) {
1635 default:
1636 break;
1637 case AArch64::PTEST_PP:
1638 case AArch64::PTEST_PP_ANY:
1639 case AArch64::PTEST_PP_FIRST:
1640 SrcReg = MI.getOperand(i: 0).getReg();
1641 SrcReg2 = MI.getOperand(i: 1).getReg();
1642 if (MI.getOperand(i: 2).getSubReg())
1643 return false;
1644
1645 // Not sure about the mask and value for now...
1646 CmpMask = ~0;
1647 CmpValue = 0;
1648 return true;
1649 case AArch64::SUBSWrr:
1650 case AArch64::SUBSWrs:
1651 case AArch64::SUBSWrx:
1652 case AArch64::SUBSXrr:
1653 case AArch64::SUBSXrs:
1654 case AArch64::SUBSXrx:
1655 case AArch64::ADDSWrr:
1656 case AArch64::ADDSWrs:
1657 case AArch64::ADDSWrx:
1658 case AArch64::ADDSXrr:
1659 case AArch64::ADDSXrs:
1660 case AArch64::ADDSXrx:
1661 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1662 SrcReg = MI.getOperand(i: 1).getReg();
1663 SrcReg2 = MI.getOperand(i: 2).getReg();
1664
1665 // FIXME: Pass subregisters out of analyzeCompare
1666 if (MI.getOperand(i: 2).getSubReg())
1667 return false;
1668
1669 CmpMask = ~0;
1670 CmpValue = 0;
1671 return true;
1672 case AArch64::SUBSWri:
1673 case AArch64::ADDSWri:
1674 case AArch64::SUBSXri:
1675 case AArch64::ADDSXri:
1676 SrcReg = MI.getOperand(i: 1).getReg();
1677 SrcReg2 = 0;
1678 CmpMask = ~0;
1679 CmpValue = MI.getOperand(i: 2).getImm();
1680 return true;
1681 case AArch64::ANDSWri:
1682 case AArch64::ANDSXri:
1683 // ANDS does not use the same encoding scheme as the others xxxS
1684 // instructions.
1685 SrcReg = MI.getOperand(i: 1).getReg();
1686 SrcReg2 = 0;
1687 CmpMask = ~0;
1688 CmpValue = AArch64_AM::decodeLogicalImmediate(
1689 val: MI.getOperand(i: 2).getImm(),
1690 regSize: MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1691 return true;
1692 }
1693
1694 return false;
1695}
1696
1697static bool UpdateOperandRegClass(MachineInstr &Instr) {
1698 MachineBasicBlock *MBB = Instr.getParent();
1699 assert(MBB && "Can't get MachineBasicBlock here");
1700 MachineFunction *MF = MBB->getParent();
1701 assert(MF && "Can't get MachineFunction here");
1702 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1703 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1704 MachineRegisterInfo *MRI = &MF->getRegInfo();
1705
1706 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1707 ++OpIdx) {
1708 MachineOperand &MO = Instr.getOperand(i: OpIdx);
1709 const TargetRegisterClass *OpRegCstraints =
1710 Instr.getRegClassConstraint(OpIdx, TII, TRI);
1711
1712 // If there's no constraint, there's nothing to do.
1713 if (!OpRegCstraints)
1714 continue;
1715 // If the operand is a frame index, there's nothing to do here.
1716 // A frame index operand will resolve correctly during PEI.
1717 if (MO.isFI())
1718 continue;
1719
1720 assert(MO.isReg() &&
1721 "Operand has register constraints without being a register!");
1722
1723 Register Reg = MO.getReg();
1724 if (Reg.isPhysical()) {
1725 if (!OpRegCstraints->contains(Reg))
1726 return false;
1727 } else if (!OpRegCstraints->hasSubClassEq(RC: MRI->getRegClass(Reg)) &&
1728 !MRI->constrainRegClass(Reg, RC: OpRegCstraints))
1729 return false;
1730 }
1731
1732 return true;
1733}
1734
1735/// Return the opcode that does not set flags when possible - otherwise
1736/// return the original opcode. The caller is responsible to do the actual
1737/// substitution and legality checking.
1738static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1739 // Don't convert all compare instructions, because for some the zero register
1740 // encoding becomes the sp register.
1741 bool MIDefinesZeroReg = false;
1742 if (MI.definesRegister(Reg: AArch64::WZR, /*TRI=*/nullptr) ||
1743 MI.definesRegister(Reg: AArch64::XZR, /*TRI=*/nullptr))
1744 MIDefinesZeroReg = true;
1745
1746 switch (MI.getOpcode()) {
1747 default:
1748 return MI.getOpcode();
1749 case AArch64::ADDSWrr:
1750 return AArch64::ADDWrr;
1751 case AArch64::ADDSWri:
1752 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1753 case AArch64::ADDSWrs:
1754 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1755 case AArch64::ADDSWrx:
1756 return AArch64::ADDWrx;
1757 case AArch64::ADDSXrr:
1758 return AArch64::ADDXrr;
1759 case AArch64::ADDSXri:
1760 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1761 case AArch64::ADDSXrs:
1762 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1763 case AArch64::ADDSXrx:
1764 return AArch64::ADDXrx;
1765 case AArch64::SUBSWrr:
1766 return AArch64::SUBWrr;
1767 case AArch64::SUBSWri:
1768 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1769 case AArch64::SUBSWrs:
1770 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1771 case AArch64::SUBSWrx:
1772 return AArch64::SUBWrx;
1773 case AArch64::SUBSXrr:
1774 return AArch64::SUBXrr;
1775 case AArch64::SUBSXri:
1776 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1777 case AArch64::SUBSXrs:
1778 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1779 case AArch64::SUBSXrx:
1780 return AArch64::SUBXrx;
1781 }
1782}
1783
/// Bitmask describing which kinds of access to check for: writes, reads, or
/// both.
enum AccessKind {
  AK_Write = 0x01,
  AK_Read = 0x10,
  AK_All = AK_Write | AK_Read // 0x11
};
1785
1786/// True when condition flags are accessed (either by writing or reading)
1787/// on the instruction trace starting at From and ending at To.
1788///
1789/// Note: If From and To are from different blocks it's assumed CC are accessed
1790/// on the path.
1791static bool areCFlagsAccessedBetweenInstrs(
1792 MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1793 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1794 // Early exit if To is at the beginning of the BB.
1795 if (To == To->getParent()->begin())
1796 return true;
1797
1798 // Check whether the instructions are in the same basic block
1799 // If not, assume the condition flags might get modified somewhere.
1800 if (To->getParent() != From->getParent())
1801 return true;
1802
1803 // From must be above To.
1804 assert(std::any_of(
1805 ++To.getReverse(), To->getParent()->rend(),
1806 [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1807
1808 // We iterate backward starting at \p To until we hit \p From.
1809 for (const MachineInstr &Instr :
1810 instructionsWithoutDebug(It: ++To.getReverse(), End: From.getReverse())) {
1811 if (((AccessToCheck & AK_Write) &&
1812 Instr.modifiesRegister(Reg: AArch64::NZCV, TRI)) ||
1813 ((AccessToCheck & AK_Read) && Instr.readsRegister(Reg: AArch64::NZCV, TRI)))
1814 return true;
1815 }
1816 return false;
1817}
1818
1819std::optional<unsigned>
1820AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
1821 MachineInstr *Pred,
1822 const MachineRegisterInfo *MRI) const {
1823 unsigned MaskOpcode = Mask->getOpcode();
1824 unsigned PredOpcode = Pred->getOpcode();
1825 bool PredIsPTestLike = isPTestLikeOpcode(Opc: PredOpcode);
1826 bool PredIsWhileLike = isWhileOpcode(Opc: PredOpcode);
1827
1828 if (PredIsWhileLike) {
1829 // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
1830 // instruction and the condition is "any" since WHILcc does an implicit
1831 // PTEST(ALL, PG) check and PG is always a subset of ALL.
1832 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1833 return PredOpcode;
1834
1835 // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1836 // redundant since WHILE performs an implicit PTEST with an all active
1837 // mask.
1838 if (isPTrueOpcode(Opc: MaskOpcode) && Mask->getOperand(i: 1).getImm() == 31 &&
1839 getElementSizeForOpcode(Opc: MaskOpcode) ==
1840 getElementSizeForOpcode(Opc: PredOpcode))
1841 return PredOpcode;
1842
1843 // For PTEST_FIRST(PTRUE_ALL, WHILE), the PTEST_FIRST is redundant since
1844 // WHILEcc performs an implicit PTEST with an all active mask, setting
1845 // the N flag as the PTEST_FIRST would.
1846 if (PTest->getOpcode() == AArch64::PTEST_PP_FIRST &&
1847 isPTrueOpcode(Opc: MaskOpcode) && Mask->getOperand(i: 1).getImm() == 31)
1848 return PredOpcode;
1849
1850 return {};
1851 }
1852
1853 if (PredIsPTestLike) {
1854 // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1855 // instruction that sets the flags as PTEST would and the condition is
1856 // "any" since PG is always a subset of the governing predicate of the
1857 // ptest-like instruction.
1858 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1859 return PredOpcode;
1860
1861 auto PTestLikeMask = MRI->getUniqueVRegDef(Reg: Pred->getOperand(i: 1).getReg());
1862
1863 // If the PTEST like instruction's general predicate is not `Mask`, attempt
1864 // to look through a copy and try again. This is because some instructions
1865 // take a predicate whose register class is a subset of its result class.
1866 if (Mask != PTestLikeMask && PTestLikeMask->isFullCopy() &&
1867 PTestLikeMask->getOperand(i: 1).getReg().isVirtual())
1868 PTestLikeMask =
1869 MRI->getUniqueVRegDef(Reg: PTestLikeMask->getOperand(i: 1).getReg());
1870
1871 // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
1872 // the element size matches and either the PTEST_LIKE instruction uses
1873 // the same all active mask or the condition is "any".
1874 if (isPTrueOpcode(Opc: MaskOpcode) && Mask->getOperand(i: 1).getImm() == 31 &&
1875 getElementSizeForOpcode(Opc: MaskOpcode) ==
1876 getElementSizeForOpcode(Opc: PredOpcode)) {
1877 if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1878 return PredOpcode;
1879 }
1880
1881 // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1882 // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1883 // on 8-bit predicates like the PTEST. Otherwise, for instructions like
1884 // compare that also support 16/32/64-bit predicates, the implicit PTEST
1885 // performed by the compare could consider fewer lanes for these element
1886 // sizes.
1887 //
1888 // For example, consider
1889 //
1890 // ptrue p0.b ; P0=1111-1111-1111-1111
1891 // index z0.s, #0, #1 ; Z0=<0,1,2,3>
1892 // index z1.s, #1, #1 ; Z1=<1,2,3,4>
1893 // cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001
1894 // ; ^ last active
1895 // ptest p0, p1.b ; P1=0001-0001-0001-0001
1896 // ; ^ last active
1897 //
1898 // where the compare generates a canonical all active 32-bit predicate
1899 // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1900 // active flag, whereas the PTEST instruction with the same mask doesn't.
1901 // For PTEST_ANY this doesn't apply as the flags in this case would be
1902 // identical regardless of element size.
1903 uint64_t PredElementSize = getElementSizeForOpcode(Opc: PredOpcode);
1904 if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
1905 PTest->getOpcode() == AArch64::PTEST_PP_ANY))
1906 return PredOpcode;
1907
1908 return {};
1909 }
1910
1911 // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1912 // opcode so the PTEST becomes redundant.
1913 switch (PredOpcode) {
1914 case AArch64::AND_PPzPP:
1915 case AArch64::BIC_PPzPP:
1916 case AArch64::EOR_PPzPP:
1917 case AArch64::NAND_PPzPP:
1918 case AArch64::NOR_PPzPP:
1919 case AArch64::ORN_PPzPP:
1920 case AArch64::ORR_PPzPP:
1921 case AArch64::BRKA_PPzP:
1922 case AArch64::BRKPA_PPzPP:
1923 case AArch64::BRKB_PPzP:
1924 case AArch64::BRKPB_PPzPP:
1925 case AArch64::RDFFR_PPz: {
1926 // Check to see if our mask is the same. If not the resulting flag bits
1927 // may be different and we can't remove the ptest.
1928 auto *PredMask = MRI->getUniqueVRegDef(Reg: Pred->getOperand(i: 1).getReg());
1929 if (Mask != PredMask)
1930 return {};
1931 break;
1932 }
1933 case AArch64::BRKN_PPzP: {
1934 // BRKN uses an all active implicit mask to set flags unlike the other
1935 // flag-setting instructions.
1936 // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1937 if ((MaskOpcode != AArch64::PTRUE_B) ||
1938 (Mask->getOperand(i: 1).getImm() != 31))
1939 return {};
1940 break;
1941 }
1942 case AArch64::PTRUE_B:
1943 // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1944 break;
1945 default:
1946 // Bail out if we don't recognize the input
1947 return {};
1948 }
1949
1950 return convertToFlagSettingOpc(Opc: PredOpcode);
1951}
1952
1953/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1954/// operation which could set the flags in an identical manner
1955bool AArch64InstrInfo::optimizePTestInstr(
1956 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1957 const MachineRegisterInfo *MRI) const {
1958 auto *Mask = MRI->getUniqueVRegDef(Reg: MaskReg);
1959 auto *Pred = MRI->getUniqueVRegDef(Reg: PredReg);
1960
1961 if (Pred->isCopy() && PTest->getOpcode() == AArch64::PTEST_PP_FIRST) {
1962 // Instructions which return a multi-vector (e.g. WHILECC_x2) require copies
1963 // before the branch to extract each subregister.
1964 auto Op = Pred->getOperand(i: 1);
1965 if (Op.isReg() && Op.getReg().isVirtual() &&
1966 Op.getSubReg() == AArch64::psub0)
1967 Pred = MRI->getUniqueVRegDef(Reg: Op.getReg());
1968 }
1969
1970 unsigned PredOpcode = Pred->getOpcode();
1971 auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1972 if (!NewOp)
1973 return false;
1974
1975 const TargetRegisterInfo *TRI = &getRegisterInfo();
1976
1977 // If another instruction between Pred and PTest accesses flags, don't remove
1978 // the ptest or update the earlier instruction to modify them.
1979 if (areCFlagsAccessedBetweenInstrs(From: Pred, To: PTest, TRI))
1980 return false;
1981
1982 // If we pass all the checks, it's safe to remove the PTEST and use the flags
1983 // as they are prior to PTEST. Sometimes this requires the tested PTEST
1984 // operand to be replaced with an equivalent instruction that also sets the
1985 // flags.
1986 PTest->eraseFromParent();
1987 if (*NewOp != PredOpcode) {
1988 Pred->setDesc(get(Opcode: *NewOp));
1989 bool succeeded = UpdateOperandRegClass(Instr&: *Pred);
1990 (void)succeeded;
1991 assert(succeeded && "Operands have incompatible register classes!");
1992 Pred->addRegisterDefined(Reg: AArch64::NZCV, RegInfo: TRI);
1993 }
1994
1995 // Ensure that the flags def is live.
1996 if (Pred->registerDefIsDead(Reg: AArch64::NZCV, TRI)) {
1997 unsigned i = 0, e = Pred->getNumOperands();
1998 for (; i != e; ++i) {
1999 MachineOperand &MO = Pred->getOperand(i);
2000 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
2001 MO.setIsDead(false);
2002 break;
2003 }
2004 }
2005 }
2006 return true;
2007}
2008
2009/// Try to optimize a compare instruction. A compare instruction is an
2010/// instruction which produces AArch64::NZCV. It can be truly compare
2011/// instruction
2012/// when there are no uses of its destination register.
2013///
2014/// The following steps are tried in order:
2015/// 1. Convert CmpInstr into an unconditional version.
2016/// 2. Remove CmpInstr if above there is an instruction producing a needed
2017/// condition code or an instruction which can be converted into such an
2018/// instruction.
2019/// Only comparison with zero is supported.
2020bool AArch64InstrInfo::optimizeCompareInstr(
2021 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
2022 int64_t CmpValue, const MachineRegisterInfo *MRI) const {
2023 assert(CmpInstr.getParent());
2024 assert(MRI);
2025
2026 // Replace SUBSWrr with SUBWrr if NZCV is not used.
2027 int DeadNZCVIdx =
2028 CmpInstr.findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true);
2029 if (DeadNZCVIdx != -1) {
2030 if (CmpInstr.definesRegister(Reg: AArch64::WZR, /*TRI=*/nullptr) ||
2031 CmpInstr.definesRegister(Reg: AArch64::XZR, /*TRI=*/nullptr)) {
2032 CmpInstr.eraseFromParent();
2033 return true;
2034 }
2035 unsigned Opc = CmpInstr.getOpcode();
2036 unsigned NewOpc = convertToNonFlagSettingOpc(MI: CmpInstr);
2037 if (NewOpc == Opc)
2038 return false;
2039 const MCInstrDesc &MCID = get(Opcode: NewOpc);
2040 CmpInstr.setDesc(MCID);
2041 CmpInstr.removeOperand(OpNo: DeadNZCVIdx);
2042 bool succeeded = UpdateOperandRegClass(Instr&: CmpInstr);
2043 (void)succeeded;
2044 assert(succeeded && "Some operands reg class are incompatible!");
2045 return true;
2046 }
2047
2048 if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
2049 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY ||
2050 CmpInstr.getOpcode() == AArch64::PTEST_PP_FIRST)
2051 return optimizePTestInstr(PTest: &CmpInstr, MaskReg: SrcReg, PredReg: SrcReg2, MRI);
2052
2053 if (SrcReg2 != 0)
2054 return false;
2055
2056 // CmpInstr is a Compare instruction if destination register is not used.
2057 if (!MRI->use_nodbg_empty(RegNo: CmpInstr.getOperand(i: 0).getReg()))
2058 return false;
2059
2060 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, MRI: *MRI))
2061 return true;
2062 return (CmpValue == 0 || CmpValue == 1) &&
2063 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, MRI: *MRI);
2064}
2065
2066/// Get opcode of S version of Instr.
2067/// If Instr is S version its opcode is returned.
2068/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
2069/// or we are not interested in it.
2070static unsigned sForm(MachineInstr &Instr) {
2071 switch (Instr.getOpcode()) {
2072 default:
2073 return AArch64::INSTRUCTION_LIST_END;
2074
2075 case AArch64::ADDSWrr:
2076 case AArch64::ADDSWri:
2077 case AArch64::ADDSXrr:
2078 case AArch64::ADDSXri:
2079 case AArch64::ADDSWrx:
2080 case AArch64::ADDSXrx:
2081 case AArch64::ADDSWrs:
2082 case AArch64::ADDSXrs:
2083 case AArch64::SUBSWrr:
2084 case AArch64::SUBSWri:
2085 case AArch64::SUBSWrx:
2086 case AArch64::SUBSWrs:
2087 case AArch64::SUBSXrr:
2088 case AArch64::SUBSXri:
2089 case AArch64::SUBSXrx:
2090 case AArch64::SUBSXrs:
2091 case AArch64::ANDSWri:
2092 case AArch64::ANDSWrr:
2093 case AArch64::ANDSWrs:
2094 case AArch64::ANDSXri:
2095 case AArch64::ANDSXrr:
2096 case AArch64::ANDSXrs:
2097 case AArch64::BICSWrr:
2098 case AArch64::BICSXrr:
2099 case AArch64::BICSWrs:
2100 case AArch64::BICSXrs:
2101 case AArch64::ADCSWr:
2102 case AArch64::ADCSXr:
2103 case AArch64::SBCSWr:
2104 case AArch64::SBCSXr:
2105 return Instr.getOpcode();
2106
2107 case AArch64::ADDWrr:
2108 return AArch64::ADDSWrr;
2109 case AArch64::ADDWri:
2110 return AArch64::ADDSWri;
2111 case AArch64::ADDXrr:
2112 return AArch64::ADDSXrr;
2113 case AArch64::ADDXri:
2114 return AArch64::ADDSXri;
2115 case AArch64::ADDWrx:
2116 return AArch64::ADDSWrx;
2117 case AArch64::ADDXrx:
2118 return AArch64::ADDSXrx;
2119 case AArch64::ADDWrs:
2120 return AArch64::ADDSWrs;
2121 case AArch64::ADDXrs:
2122 return AArch64::ADDSXrs;
2123 case AArch64::ADCWr:
2124 return AArch64::ADCSWr;
2125 case AArch64::ADCXr:
2126 return AArch64::ADCSXr;
2127 case AArch64::SUBWrr:
2128 return AArch64::SUBSWrr;
2129 case AArch64::SUBWri:
2130 return AArch64::SUBSWri;
2131 case AArch64::SUBXrr:
2132 return AArch64::SUBSXrr;
2133 case AArch64::SUBXri:
2134 return AArch64::SUBSXri;
2135 case AArch64::SUBWrx:
2136 return AArch64::SUBSWrx;
2137 case AArch64::SUBXrx:
2138 return AArch64::SUBSXrx;
2139 case AArch64::SUBWrs:
2140 return AArch64::SUBSWrs;
2141 case AArch64::SUBXrs:
2142 return AArch64::SUBSXrs;
2143 case AArch64::SBCWr:
2144 return AArch64::SBCSWr;
2145 case AArch64::SBCXr:
2146 return AArch64::SBCSXr;
2147 case AArch64::ANDWri:
2148 return AArch64::ANDSWri;
2149 case AArch64::ANDXri:
2150 return AArch64::ANDSXri;
2151 case AArch64::ANDWrr:
2152 return AArch64::ANDSWrr;
2153 case AArch64::ANDWrs:
2154 return AArch64::ANDSWrs;
2155 case AArch64::ANDXrr:
2156 return AArch64::ANDSXrr;
2157 case AArch64::ANDXrs:
2158 return AArch64::ANDSXrs;
2159 case AArch64::BICWrr:
2160 return AArch64::BICSWrr;
2161 case AArch64::BICXrr:
2162 return AArch64::BICSXrr;
2163 case AArch64::BICWrs:
2164 return AArch64::BICSWrs;
2165 case AArch64::BICXrs:
2166 return AArch64::BICSXrs;
2167 }
2168}
2169
2170/// Check if AArch64::NZCV should be alive in successors of MBB.
2171static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
2172 for (auto *BB : MBB->successors())
2173 if (BB->isLiveIn(Reg: AArch64::NZCV))
2174 return true;
2175 return false;
2176}
2177
2178/// \returns The condition code operand index for \p Instr if it is a branch
2179/// or select and -1 otherwise.
2180int AArch64InstrInfo::findCondCodeUseOperandIdxForBranchOrSelect(
2181 const MachineInstr &Instr) {
2182 switch (Instr.getOpcode()) {
2183 default:
2184 return -1;
2185
2186 case AArch64::Bcc: {
2187 int Idx = Instr.findRegisterUseOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr);
2188 assert(Idx >= 2);
2189 return Idx - 2;
2190 }
2191
2192 case AArch64::CSINVWr:
2193 case AArch64::CSINVXr:
2194 case AArch64::CSINCWr:
2195 case AArch64::CSINCXr:
2196 case AArch64::CSELWr:
2197 case AArch64::CSELXr:
2198 case AArch64::CSNEGWr:
2199 case AArch64::CSNEGXr:
2200 case AArch64::FCSELSrrr:
2201 case AArch64::FCSELDrrr: {
2202 int Idx = Instr.findRegisterUseOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr);
2203 assert(Idx >= 1);
2204 return Idx - 1;
2205 }
2206 }
2207}
2208
2209/// Find a condition code used by the instruction.
2210/// Returns AArch64CC::Invalid if either the instruction does not use condition
2211/// codes or we don't optimize CmpInstr in the presence of such instructions.
2212static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
2213 int CCIdx =
2214 AArch64InstrInfo::findCondCodeUseOperandIdxForBranchOrSelect(Instr);
2215 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
2216 Instr.getOperand(i: CCIdx).getImm())
2217 : AArch64CC::Invalid;
2218}
2219
2220static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
2221 assert(CC != AArch64CC::Invalid);
2222 UsedNZCV UsedFlags;
2223 switch (CC) {
2224 default:
2225 break;
2226
2227 case AArch64CC::EQ: // Z set
2228 case AArch64CC::NE: // Z clear
2229 UsedFlags.Z = true;
2230 break;
2231
2232 case AArch64CC::HI: // Z clear and C set
2233 case AArch64CC::LS: // Z set or C clear
2234 UsedFlags.Z = true;
2235 [[fallthrough]];
2236 case AArch64CC::HS: // C set
2237 case AArch64CC::LO: // C clear
2238 UsedFlags.C = true;
2239 break;
2240
2241 case AArch64CC::MI: // N set
2242 case AArch64CC::PL: // N clear
2243 UsedFlags.N = true;
2244 break;
2245
2246 case AArch64CC::VS: // V set
2247 case AArch64CC::VC: // V clear
2248 UsedFlags.V = true;
2249 break;
2250
2251 case AArch64CC::GT: // Z clear, N and V the same
2252 case AArch64CC::LE: // Z set, N and V differ
2253 UsedFlags.Z = true;
2254 [[fallthrough]];
2255 case AArch64CC::GE: // N and V the same
2256 case AArch64CC::LT: // N and V differ
2257 UsedFlags.N = true;
2258 UsedFlags.V = true;
2259 break;
2260 }
2261 return UsedFlags;
2262}
2263
2264/// \returns Conditions flags used after \p CmpInstr in its MachineBB if NZCV
2265/// flags are not alive in successors of the same \p CmpInstr and \p MI parent.
2266/// \returns std::nullopt otherwise.
2267///
2268/// Collect instructions using that flags in \p CCUseInstrs if provided.
2269std::optional<UsedNZCV>
2270llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
2271 const TargetRegisterInfo &TRI,
2272 SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
2273 MachineBasicBlock *CmpParent = CmpInstr.getParent();
2274 if (MI.getParent() != CmpParent)
2275 return std::nullopt;
2276
2277 if (areCFlagsAliveInSuccessors(MBB: CmpParent))
2278 return std::nullopt;
2279
2280 UsedNZCV NZCVUsedAfterCmp;
2281 for (MachineInstr &Instr : instructionsWithoutDebug(
2282 It: std::next(x: CmpInstr.getIterator()), End: CmpParent->instr_end())) {
2283 if (Instr.readsRegister(Reg: AArch64::NZCV, TRI: &TRI)) {
2284 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
2285 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
2286 return std::nullopt;
2287 NZCVUsedAfterCmp |= getUsedNZCV(CC);
2288 if (CCUseInstrs)
2289 CCUseInstrs->push_back(Elt: &Instr);
2290 }
2291 if (Instr.modifiesRegister(Reg: AArch64::NZCV, TRI: &TRI))
2292 break;
2293 }
2294 return NZCVUsedAfterCmp;
2295}
2296
2297static bool isADDSRegImm(unsigned Opcode) {
2298 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
2299}
2300
2301static bool isSUBSRegImm(unsigned Opcode) {
2302 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
2303}
2304
2305static bool isANDOpcode(MachineInstr &MI) {
2306 unsigned Opc = sForm(Instr&: MI);
2307 switch (Opc) {
2308 case AArch64::ANDSWri:
2309 case AArch64::ANDSWrr:
2310 case AArch64::ANDSWrs:
2311 case AArch64::ANDSXri:
2312 case AArch64::ANDSXrr:
2313 case AArch64::ANDSXrs:
2314 case AArch64::BICSWrr:
2315 case AArch64::BICSXrr:
2316 case AArch64::BICSWrs:
2317 case AArch64::BICSXrs:
2318 return true;
2319 default:
2320 return false;
2321 }
2322}
2323
2324/// Check if CmpInstr can be substituted by MI.
2325///
2326/// CmpInstr can be substituted:
2327/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2328/// - and, MI and CmpInstr are from the same MachineBB
2329/// - and, condition flags are not alive in successors of the CmpInstr parent
2330/// - and, if MI opcode is the S form there must be no defs of flags between
2331/// MI and CmpInstr
2332/// or if MI opcode is not the S form there must be neither defs of flags
2333/// nor uses of flags between MI and CmpInstr.
2334/// - and, C is not used after CmpInstr; CmpInstr's C is from adds/subs #0 on
2335/// SrcReg and can differ from MI (e.g. carry out of ADCS/SBCS).
2336/// - and, V is not used after CmpInstr unless MI is AND/BIC (V cleared) or MI
2337/// has NoSWrap (overflow is poison and the fold is still safe).
2338static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
2339 const TargetRegisterInfo &TRI) {
2340 // MI is an opcode sForm maps (add/sub/adc/sbc/and/bic and their S forms).
2341 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
2342
2343 const unsigned CmpOpcode = CmpInstr.getOpcode();
2344 if (!isADDSRegImm(Opcode: CmpOpcode) && !isSUBSRegImm(Opcode: CmpOpcode))
2345 return false;
2346
2347 assert((CmpInstr.getOperand(2).isImm() &&
2348 CmpInstr.getOperand(2).getImm() == 0) &&
2349 "Caller guarantees that CmpInstr compares with constant 0");
2350
2351 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
2352 if (!NZVCUsed || NZVCUsed->C)
2353 return false;
2354
2355 // CmpInstr is ADDS/SUBS with immediate 0 on SrcReg (compare SrcReg to zero).
2356 // After the fold, users see NZCV from MI (or its S form), not from CmpInstr.
2357 // N/Z match CmpInstr for the value in SrcReg; C/V need not match in general
2358 // (e.g. ADCS vs adds #0), so we require C unused after CmpInstr and gate V
2359 // as below. NoSWrap makes signed overflow poison; AND/BIC clear V.
2360 if (NZVCUsed->V && !MI.getFlag(Flag: MachineInstr::NoSWrap) && !isANDOpcode(MI))
2361 return false;
2362
2363 AccessKind AccessToCheck = AK_Write;
2364 if (sForm(Instr&: MI) != MI.getOpcode())
2365 AccessToCheck = AK_All;
2366 return !areCFlagsAccessedBetweenInstrs(From: &MI, To: &CmpInstr, TRI: &TRI, AccessToCheck);
2367}
2368
2369/// Substitute an instruction comparing to zero with another instruction
2370/// which produces needed condition flags.
2371///
2372/// Return true on success.
2373bool AArch64InstrInfo::substituteCmpToZero(
2374 MachineInstr &CmpInstr, unsigned SrcReg,
2375 const MachineRegisterInfo &MRI) const {
2376 // Get the unique definition of SrcReg.
2377 MachineInstr *MI = MRI.getUniqueVRegDef(Reg: SrcReg);
2378 if (!MI)
2379 return false;
2380
2381 const TargetRegisterInfo &TRI = getRegisterInfo();
2382
2383 unsigned NewOpc = sForm(Instr&: *MI);
2384 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
2385 return false;
2386
2387 if (!canInstrSubstituteCmpInstr(MI&: *MI, CmpInstr, TRI))
2388 return false;
2389
2390 // Update the instruction to set NZCV.
2391 MI->setDesc(get(Opcode: NewOpc));
2392 CmpInstr.eraseFromParent();
2393 bool succeeded = UpdateOperandRegClass(Instr&: *MI);
2394 (void)succeeded;
2395 assert(succeeded && "Some operands reg class are incompatible!");
2396 MI->addRegisterDefined(Reg: AArch64::NZCV, RegInfo: &TRI);
2397 return true;
2398}
2399
2400/// \returns True if \p CmpInstr can be removed.
2401///
2402/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
2403/// codes used in \p CCUseInstrs must be inverted.
2404static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
2405 int CmpValue, const TargetRegisterInfo &TRI,
2406 SmallVectorImpl<MachineInstr *> &CCUseInstrs,
2407 bool &IsInvertCC) {
2408 assert((CmpValue == 0 || CmpValue == 1) &&
2409 "Only comparisons to 0 or 1 considered for removal!");
2410
2411 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
2412 unsigned MIOpc = MI.getOpcode();
2413 if (MIOpc == AArch64::CSINCWr) {
2414 if (MI.getOperand(i: 1).getReg() != AArch64::WZR ||
2415 MI.getOperand(i: 2).getReg() != AArch64::WZR)
2416 return false;
2417 } else if (MIOpc == AArch64::CSINCXr) {
2418 if (MI.getOperand(i: 1).getReg() != AArch64::XZR ||
2419 MI.getOperand(i: 2).getReg() != AArch64::XZR)
2420 return false;
2421 } else {
2422 return false;
2423 }
2424 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(Instr: MI);
2425 if (MICC == AArch64CC::Invalid)
2426 return false;
2427
2428 // NZCV needs to be defined
2429 if (MI.findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true) != -1)
2430 return false;
2431
2432 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
2433 const unsigned CmpOpcode = CmpInstr.getOpcode();
2434 bool IsSubsRegImm = isSUBSRegImm(Opcode: CmpOpcode);
2435 if (CmpValue && !IsSubsRegImm)
2436 return false;
2437 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(Opcode: CmpOpcode))
2438 return false;
2439
2440 // MI conditions allowed: eq, ne, mi, pl
2441 UsedNZCV MIUsedNZCV = getUsedNZCV(CC: MICC);
2442 if (MIUsedNZCV.C || MIUsedNZCV.V)
2443 return false;
2444
2445 std::optional<UsedNZCV> NZCVUsedAfterCmp =
2446 examineCFlagsUse(MI, CmpInstr, TRI, CCUseInstrs: &CCUseInstrs);
2447 // Condition flags are not used in CmpInstr basic block successors and only
2448 // Z or N flags allowed to be used after CmpInstr within its basic block
2449 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
2450 return false;
2451 // Z or N flag used after CmpInstr must correspond to the flag used in MI
2452 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
2453 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
2454 return false;
2455 // If CmpInstr is comparison to zero MI conditions are limited to eq, ne
2456 if (MIUsedNZCV.N && !CmpValue)
2457 return false;
2458
2459 // There must be no defs of flags between MI and CmpInstr
2460 if (areCFlagsAccessedBetweenInstrs(From: &MI, To: &CmpInstr, TRI: &TRI, AccessToCheck: AK_Write))
2461 return false;
2462
2463 // Condition code is inverted in the following cases:
2464 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2465 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
2466 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
2467 (!CmpValue && MICC == AArch64CC::NE);
2468 return true;
2469}
2470
2471/// Remove comparison in csinc-cmp sequence
2472///
2473/// Examples:
2474/// 1. \code
2475/// csinc w9, wzr, wzr, ne
2476/// cmp w9, #0
2477/// b.eq
2478/// \endcode
2479/// to
2480/// \code
2481/// csinc w9, wzr, wzr, ne
2482/// b.ne
2483/// \endcode
2484///
2485/// 2. \code
2486/// csinc x2, xzr, xzr, mi
2487/// cmp x2, #1
2488/// b.pl
2489/// \endcode
2490/// to
2491/// \code
2492/// csinc x2, xzr, xzr, mi
2493/// b.pl
2494/// \endcode
2495///
2496/// \param CmpInstr comparison instruction
2497/// \return True when comparison removed
2498bool AArch64InstrInfo::removeCmpToZeroOrOne(
2499 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
2500 const MachineRegisterInfo &MRI) const {
2501 MachineInstr *MI = MRI.getUniqueVRegDef(Reg: SrcReg);
2502 if (!MI)
2503 return false;
2504 const TargetRegisterInfo &TRI = getRegisterInfo();
2505 SmallVector<MachineInstr *, 4> CCUseInstrs;
2506 bool IsInvertCC = false;
2507 if (!canCmpInstrBeRemoved(MI&: *MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
2508 IsInvertCC))
2509 return false;
2510 // Make transformation
2511 CmpInstr.eraseFromParent();
2512 if (IsInvertCC) {
2513 // Invert condition codes in CmpInstr CC users
2514 for (MachineInstr *CCUseInstr : CCUseInstrs) {
2515 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(Instr: *CCUseInstr);
2516 assert(Idx >= 0 && "Unexpected instruction using CC.");
2517 MachineOperand &CCOperand = CCUseInstr->getOperand(i: Idx);
2518 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
2519 Code: static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
2520 CCOperand.setImm(CCUse);
2521 }
2522 }
2523 return true;
2524}
2525
2526bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2527 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
2528 MI.getOpcode() != AArch64::CATCHRET)
2529 return false;
2530
2531 MachineBasicBlock &MBB = *MI.getParent();
2532 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
2533 auto TRI = Subtarget.getRegisterInfo();
2534 DebugLoc DL = MI.getDebugLoc();
2535
2536 if (MI.getOpcode() == AArch64::CATCHRET) {
2537 // Skip to the first instruction before the epilog.
2538 const TargetInstrInfo *TII =
2539 MBB.getParent()->getSubtarget().getInstrInfo();
2540 MachineBasicBlock *TargetMBB = MI.getOperand(i: 0).getMBB();
2541 auto MBBI = MachineBasicBlock::iterator(MI);
2542 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(x: MBBI);
2543 while (FirstEpilogSEH->getFlag(Flag: MachineInstr::FrameDestroy) &&
2544 FirstEpilogSEH != MBB.begin())
2545 FirstEpilogSEH = std::prev(x: FirstEpilogSEH);
2546 if (FirstEpilogSEH != MBB.begin())
2547 FirstEpilogSEH = std::next(x: FirstEpilogSEH);
2548 BuildMI(BB&: MBB, I: FirstEpilogSEH, MIMD: DL, MCID: TII->get(Opcode: AArch64::ADRP))
2549 .addReg(RegNo: AArch64::X0, Flags: RegState::Define)
2550 .addMBB(MBB: TargetMBB);
2551 BuildMI(BB&: MBB, I: FirstEpilogSEH, MIMD: DL, MCID: TII->get(Opcode: AArch64::ADDXri))
2552 .addReg(RegNo: AArch64::X0, Flags: RegState::Define)
2553 .addReg(RegNo: AArch64::X0)
2554 .addMBB(MBB: TargetMBB)
2555 .addImm(Val: 0);
2556 TargetMBB->setMachineBlockAddressTaken();
2557 return true;
2558 }
2559
2560 Register Reg = MI.getOperand(i: 0).getReg();
2561 Module &M = *MBB.getParent()->getFunction().getParent();
2562 if (M.getStackProtectorGuard() == "sysreg") {
2563 const AArch64SysReg::SysReg *SrcReg =
2564 AArch64SysReg::lookupSysRegByName(Name: M.getStackProtectorGuardReg());
2565 if (!SrcReg)
2566 report_fatal_error(reason: "Unknown SysReg for Stack Protector Guard Register");
2567
2568 // mrs xN, sysreg
2569 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MRS))
2570 .addDef(RegNo: Reg, Flags: RegState::Renamable)
2571 .addImm(Val: SrcReg->Encoding);
2572 int Offset = M.getStackProtectorGuardOffset();
2573 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2574 // ldr xN, [xN, #offset]
2575 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui))
2576 .addDef(RegNo: Reg)
2577 .addUse(RegNo: Reg, Flags: RegState::Kill)
2578 .addImm(Val: Offset / 8);
2579 } else if (Offset >= -256 && Offset <= 255) {
2580 // ldur xN, [xN, #offset]
2581 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDURXi))
2582 .addDef(RegNo: Reg)
2583 .addUse(RegNo: Reg, Flags: RegState::Kill)
2584 .addImm(Val: Offset);
2585 } else if (Offset >= -4095 && Offset <= 4095) {
2586 if (Offset > 0) {
2587 // add xN, xN, #offset
2588 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::ADDXri))
2589 .addDef(RegNo: Reg)
2590 .addUse(RegNo: Reg, Flags: RegState::Kill)
2591 .addImm(Val: Offset)
2592 .addImm(Val: 0);
2593 } else {
2594 // sub xN, xN, #offset
2595 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::SUBXri))
2596 .addDef(RegNo: Reg)
2597 .addUse(RegNo: Reg, Flags: RegState::Kill)
2598 .addImm(Val: -Offset)
2599 .addImm(Val: 0);
2600 }
2601 // ldr xN, [xN]
2602 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui))
2603 .addDef(RegNo: Reg)
2604 .addUse(RegNo: Reg, Flags: RegState::Kill)
2605 .addImm(Val: 0);
2606 } else {
2607 // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
2608 // than 23760.
2609 // It might be nice to use AArch64::MOVi32imm here, which would get
2610 // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2611 // contains the MRS result. findScratchNonCalleeSaveRegister() in
2612 // AArch64FrameLowering might help us find such a scratch register
2613 // though. If we failed to find a scratch register, we could emit a
2614 // stream of add instructions to build up the immediate. Or, we could try
2615 // to insert a AArch64::MOVi32imm before register allocation so that we
2616 // didn't need to scavenge for a scratch register.
2617 report_fatal_error(reason: "Unable to encode Stack Protector Guard Offset");
2618 }
2619 MBB.erase(I: MI);
2620 return true;
2621 }
2622
2623 const GlobalValue *GV =
2624 cast<GlobalValue>(Val: (*MI.memoperands_begin())->getValue());
2625 const TargetMachine &TM = MBB.getParent()->getTarget();
2626 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2627 const unsigned char MO_NC = AArch64II::MO_NC;
2628
2629 unsigned GuardWidth = M.getStackProtectorGuardValueWidth().value_or(
2630 u: Subtarget.isTargetILP32() ? 4 : 8);
2631 if (GuardWidth != 4 && GuardWidth != 8)
2632 report_fatal_error(reason: "Unsupported stack protector value width");
2633 if ((OpFlags & AArch64II::MO_GOT) != 0) {
2634 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LOADgot), DestReg: Reg)
2635 .addGlobalAddress(GV, Offset: 0, TargetFlags: OpFlags);
2636 if (GuardWidth == 4) {
2637 unsigned Reg32 = TRI->getSubReg(Reg, Idx: AArch64::sub_32);
2638 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRWui))
2639 .addDef(RegNo: Reg32, Flags: RegState::Dead)
2640 .addUse(RegNo: Reg, Flags: RegState::Kill)
2641 .addImm(Val: 0)
2642 .addMemOperand(MMO: *MI.memoperands_begin())
2643 .addDef(RegNo: Reg, Flags: RegState::Implicit);
2644 } else {
2645 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui), DestReg: Reg)
2646 .addReg(RegNo: Reg, Flags: RegState::Kill)
2647 .addImm(Val: 0)
2648 .addMemOperand(MMO: *MI.memoperands_begin());
2649 }
2650 } else if (TM.getCodeModel() == CodeModel::Large) {
2651 if (GuardWidth == 4)
2652 report_fatal_error(reason: "Large code model with 4-byte stack protector not yet "
2653 "supported");
2654 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MOVZXi), DestReg: Reg)
2655 .addGlobalAddress(GV, Offset: 0, TargetFlags: AArch64II::MO_G0 | MO_NC)
2656 .addImm(Val: 0);
2657 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MOVKXi), DestReg: Reg)
2658 .addReg(RegNo: Reg, Flags: RegState::Kill)
2659 .addGlobalAddress(GV, Offset: 0, TargetFlags: AArch64II::MO_G1 | MO_NC)
2660 .addImm(Val: 16);
2661 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MOVKXi), DestReg: Reg)
2662 .addReg(RegNo: Reg, Flags: RegState::Kill)
2663 .addGlobalAddress(GV, Offset: 0, TargetFlags: AArch64II::MO_G2 | MO_NC)
2664 .addImm(Val: 32);
2665 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::MOVKXi), DestReg: Reg)
2666 .addReg(RegNo: Reg, Flags: RegState::Kill)
2667 .addGlobalAddress(GV, Offset: 0, TargetFlags: AArch64II::MO_G3)
2668 .addImm(Val: 48);
2669 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui), DestReg: Reg)
2670 .addReg(RegNo: Reg, Flags: RegState::Kill)
2671 .addImm(Val: 0)
2672 .addMemOperand(MMO: *MI.memoperands_begin());
2673 } else {
2674 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::ADRP), DestReg: Reg)
2675 .addGlobalAddress(GV, Offset: 0, TargetFlags: OpFlags | AArch64II::MO_PAGE);
2676 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2677 if (GuardWidth == 4) {
2678 unsigned Reg32 = TRI->getSubReg(Reg, Idx: AArch64::sub_32);
2679 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRWui))
2680 .addDef(RegNo: Reg32, Flags: RegState::Dead)
2681 .addUse(RegNo: Reg, Flags: RegState::Kill)
2682 .addGlobalAddress(GV, Offset: 0, TargetFlags: LoFlags)
2683 .addMemOperand(MMO: *MI.memoperands_begin())
2684 .addDef(RegNo: Reg, Flags: RegState::Implicit);
2685 } else {
2686 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::LDRXui), DestReg: Reg)
2687 .addReg(RegNo: Reg, Flags: RegState::Kill)
2688 .addGlobalAddress(GV, Offset: 0, TargetFlags: LoFlags)
2689 .addMemOperand(MMO: *MI.memoperands_begin());
2690 }
2691 }
2692
2693 MBB.erase(I: MI);
2694
2695 return true;
2696}
2697
2698// Return true if this instruction simply sets its single destination register
2699// to zero. This is equivalent to a register rename of the zero-register.
2700bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
2701 switch (MI.getOpcode()) {
2702 default:
2703 break;
2704 case AArch64::MOVZWi:
2705 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2706 if (MI.getOperand(i: 1).isImm() && MI.getOperand(i: 1).getImm() == 0) {
2707 assert(MI.getDesc().getNumOperands() == 3 &&
2708 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2709 return true;
2710 }
2711 break;
2712 case AArch64::ANDWri: // and Rd, Rzr, #imm
2713 return MI.getOperand(i: 1).getReg() == AArch64::WZR;
2714 case AArch64::ANDXri:
2715 return MI.getOperand(i: 1).getReg() == AArch64::XZR;
2716 case TargetOpcode::COPY:
2717 return MI.getOperand(i: 1).getReg() == AArch64::WZR;
2718 }
2719 return false;
2720}
2721
2722// Return true if this instruction simply renames a general register without
2723// modifying bits.
2724bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
2725 switch (MI.getOpcode()) {
2726 default:
2727 break;
2728 case TargetOpcode::COPY: {
2729 // GPR32 copies will by lowered to ORRXrs
2730 Register DstReg = MI.getOperand(i: 0).getReg();
2731 return (AArch64::GPR32RegClass.contains(Reg: DstReg) ||
2732 AArch64::GPR64RegClass.contains(Reg: DstReg));
2733 }
2734 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2735 if (MI.getOperand(i: 1).getReg() == AArch64::XZR) {
2736 assert(MI.getDesc().getNumOperands() == 4 &&
2737 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2738 return true;
2739 }
2740 break;
2741 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2742 if (MI.getOperand(i: 2).getImm() == 0) {
2743 assert(MI.getDesc().getNumOperands() == 4 &&
2744 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2745 return true;
2746 }
2747 break;
2748 }
2749 return false;
2750}
2751
2752// Return true if this instruction simply renames a general register without
2753// modifying bits.
2754bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
2755 switch (MI.getOpcode()) {
2756 default:
2757 break;
2758 case TargetOpcode::COPY: {
2759 Register DstReg = MI.getOperand(i: 0).getReg();
2760 return AArch64::FPR128RegClass.contains(Reg: DstReg);
2761 }
2762 case AArch64::ORRv16i8:
2763 if (MI.getOperand(i: 1).getReg() == MI.getOperand(i: 2).getReg()) {
2764 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2765 "invalid ORRv16i8 operands");
2766 return true;
2767 }
2768 break;
2769 }
2770 return false;
2771}
2772
2773static bool isFrameLoadOpcode(int Opcode) {
2774 switch (Opcode) {
2775 default:
2776 return false;
2777 case AArch64::LDRWui:
2778 case AArch64::LDRXui:
2779 case AArch64::LDRBui:
2780 case AArch64::LDRHui:
2781 case AArch64::LDRSui:
2782 case AArch64::LDRDui:
2783 case AArch64::LDRQui:
2784 case AArch64::LDR_PXI:
2785 return true;
2786 }
2787}
2788
2789Register AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
2790 int &FrameIndex) const {
2791 if (!isFrameLoadOpcode(Opcode: MI.getOpcode()))
2792 return Register();
2793
2794 if (MI.getOperand(i: 0).getSubReg() == 0 && MI.getOperand(i: 1).isFI() &&
2795 MI.getOperand(i: 2).isImm() && MI.getOperand(i: 2).getImm() == 0) {
2796 FrameIndex = MI.getOperand(i: 1).getIndex();
2797 return MI.getOperand(i: 0).getReg();
2798 }
2799 return Register();
2800}
2801
2802static bool isFrameStoreOpcode(int Opcode) {
2803 switch (Opcode) {
2804 default:
2805 return false;
2806 case AArch64::STRWui:
2807 case AArch64::STRXui:
2808 case AArch64::STRBui:
2809 case AArch64::STRHui:
2810 case AArch64::STRSui:
2811 case AArch64::STRDui:
2812 case AArch64::STRQui:
2813 case AArch64::STR_PXI:
2814 return true;
2815 }
2816}
2817
2818Register AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
2819 int &FrameIndex) const {
2820 if (!isFrameStoreOpcode(Opcode: MI.getOpcode()))
2821 return Register();
2822
2823 if (MI.getOperand(i: 0).getSubReg() == 0 && MI.getOperand(i: 1).isFI() &&
2824 MI.getOperand(i: 2).isImm() && MI.getOperand(i: 2).getImm() == 0) {
2825 FrameIndex = MI.getOperand(i: 1).getIndex();
2826 return MI.getOperand(i: 0).getReg();
2827 }
2828 return Register();
2829}
2830
2831Register AArch64InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
2832 int &FrameIndex) const {
2833 if (!isFrameStoreOpcode(Opcode: MI.getOpcode()))
2834 return Register();
2835
2836 if (Register Reg = isStoreToStackSlot(MI, FrameIndex))
2837 return Reg;
2838
2839 SmallVector<const MachineMemOperand *, 1> Accesses;
2840 if (hasStoreToStackSlot(MI, Accesses)) {
2841 if (Accesses.size() > 1)
2842 return Register();
2843
2844 FrameIndex =
2845 cast<FixedStackPseudoSourceValue>(Val: Accesses.front()->getPseudoValue())
2846 ->getFrameIndex();
2847 return MI.getOperand(i: 0).getReg();
2848 }
2849 return Register();
2850}
2851
2852Register AArch64InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
2853 int &FrameIndex) const {
2854 if (!isFrameLoadOpcode(Opcode: MI.getOpcode()))
2855 return Register();
2856
2857 if (Register Reg = isLoadFromStackSlot(MI, FrameIndex))
2858 return Reg;
2859
2860 SmallVector<const MachineMemOperand *, 1> Accesses;
2861 if (hasLoadFromStackSlot(MI, Accesses)) {
2862 if (Accesses.size() > 1)
2863 return Register();
2864
2865 FrameIndex =
2866 cast<FixedStackPseudoSourceValue>(Val: Accesses.front()->getPseudoValue())
2867 ->getFrameIndex();
2868 return MI.getOperand(i: 0).getReg();
2869 }
2870 return Register();
2871}
2872
2873/// Check all MachineMemOperands for a hint to suppress pairing.
2874bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2875 return llvm::any_of(Range: MI.memoperands(), P: [](MachineMemOperand *MMO) {
2876 return MMO->getFlags() & MOSuppressPair;
2877 });
2878}
2879
2880/// Set a flag on the first MachineMemOperand to suppress pairing.
2881void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2882 if (MI.memoperands_empty())
2883 return;
2884 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2885}
2886
2887/// Check all MachineMemOperands for a hint that the load/store is strided.
2888bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
2889 return llvm::any_of(Range: MI.memoperands(), P: [](MachineMemOperand *MMO) {
2890 return MMO->getFlags() & MOStridedAccess;
2891 });
2892}
2893
2894bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
2895 switch (Opc) {
2896 default:
2897 return false;
2898 case AArch64::STURSi:
2899 case AArch64::STRSpre:
2900 case AArch64::STURDi:
2901 case AArch64::STRDpre:
2902 case AArch64::STURQi:
2903 case AArch64::STRQpre:
2904 case AArch64::STURBBi:
2905 case AArch64::STURHHi:
2906 case AArch64::STURWi:
2907 case AArch64::STRWpre:
2908 case AArch64::STURXi:
2909 case AArch64::STRXpre:
2910 case AArch64::LDURSi:
2911 case AArch64::LDRSpre:
2912 case AArch64::LDURDi:
2913 case AArch64::LDRDpre:
2914 case AArch64::LDURQi:
2915 case AArch64::LDRQpre:
2916 case AArch64::LDURWi:
2917 case AArch64::LDRWpre:
2918 case AArch64::LDURXi:
2919 case AArch64::LDRXpre:
2920 case AArch64::LDRSWpre:
2921 case AArch64::LDURSWi:
2922 case AArch64::LDURHHi:
2923 case AArch64::LDURBBi:
2924 case AArch64::LDURSBWi:
2925 case AArch64::LDURSHWi:
2926 return true;
2927 }
2928}
2929
2930std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2931 switch (Opc) {
2932 default: return {};
2933 case AArch64::PRFMui: return AArch64::PRFUMi;
2934 case AArch64::LDRXui: return AArch64::LDURXi;
2935 case AArch64::LDRWui: return AArch64::LDURWi;
2936 case AArch64::LDRBui: return AArch64::LDURBi;
2937 case AArch64::LDRHui: return AArch64::LDURHi;
2938 case AArch64::LDRSui: return AArch64::LDURSi;
2939 case AArch64::LDRDui: return AArch64::LDURDi;
2940 case AArch64::LDRQui: return AArch64::LDURQi;
2941 case AArch64::LDRBBui: return AArch64::LDURBBi;
2942 case AArch64::LDRHHui: return AArch64::LDURHHi;
2943 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2944 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2945 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2946 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2947 case AArch64::LDRSWui: return AArch64::LDURSWi;
2948 case AArch64::STRXui: return AArch64::STURXi;
2949 case AArch64::STRWui: return AArch64::STURWi;
2950 case AArch64::STRBui: return AArch64::STURBi;
2951 case AArch64::STRHui: return AArch64::STURHi;
2952 case AArch64::STRSui: return AArch64::STURSi;
2953 case AArch64::STRDui: return AArch64::STURDi;
2954 case AArch64::STRQui: return AArch64::STURQi;
2955 case AArch64::STRBBui: return AArch64::STURBBi;
2956 case AArch64::STRHHui: return AArch64::STURHHi;
2957 }
2958}
2959
2960unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
2961 switch (Opc) {
2962 default:
2963 llvm_unreachable("Unhandled Opcode in getLoadStoreImmIdx");
2964 case AArch64::ADDG:
2965 case AArch64::LDAPURBi:
2966 case AArch64::LDAPURHi:
2967 case AArch64::LDAPURi:
2968 case AArch64::LDAPURSBWi:
2969 case AArch64::LDAPURSBXi:
2970 case AArch64::LDAPURSHWi:
2971 case AArch64::LDAPURSHXi:
2972 case AArch64::LDAPURSWi:
2973 case AArch64::LDAPURXi:
2974 case AArch64::LDR_PPXI:
2975 case AArch64::LDR_PXI:
2976 case AArch64::LDR_ZXI:
2977 case AArch64::LDR_ZZXI:
2978 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
2979 case AArch64::LDR_ZZZXI:
2980 case AArch64::LDR_ZZZZXI:
2981 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
2982 case AArch64::LDRBBui:
2983 case AArch64::LDRBui:
2984 case AArch64::LDRDui:
2985 case AArch64::LDRHHui:
2986 case AArch64::LDRHui:
2987 case AArch64::LDRQui:
2988 case AArch64::LDRSBWui:
2989 case AArch64::LDRSBXui:
2990 case AArch64::LDRSHWui:
2991 case AArch64::LDRSHXui:
2992 case AArch64::LDRSui:
2993 case AArch64::LDRSWui:
2994 case AArch64::LDRWui:
2995 case AArch64::LDRXui:
2996 case AArch64::LDURBBi:
2997 case AArch64::LDURBi:
2998 case AArch64::LDURDi:
2999 case AArch64::LDURHHi:
3000 case AArch64::LDURHi:
3001 case AArch64::LDURQi:
3002 case AArch64::LDURSBWi:
3003 case AArch64::LDURSBXi:
3004 case AArch64::LDURSHWi:
3005 case AArch64::LDURSHXi:
3006 case AArch64::LDURSi:
3007 case AArch64::LDURSWi:
3008 case AArch64::LDURWi:
3009 case AArch64::LDURXi:
3010 case AArch64::PRFMui:
3011 case AArch64::PRFUMi:
3012 case AArch64::ST2Gi:
3013 case AArch64::STGi:
3014 case AArch64::STLURBi:
3015 case AArch64::STLURHi:
3016 case AArch64::STLURWi:
3017 case AArch64::STLURXi:
3018 case AArch64::StoreSwiftAsyncContext:
3019 case AArch64::STR_PPXI:
3020 case AArch64::STR_PXI:
3021 case AArch64::STR_ZXI:
3022 case AArch64::STR_ZZXI:
3023 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
3024 case AArch64::STR_ZZZXI:
3025 case AArch64::STR_ZZZZXI:
3026 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
3027 case AArch64::STRBBui:
3028 case AArch64::STRBui:
3029 case AArch64::STRDui:
3030 case AArch64::STRHHui:
3031 case AArch64::STRHui:
3032 case AArch64::STRQui:
3033 case AArch64::STRSui:
3034 case AArch64::STRWui:
3035 case AArch64::STRXui:
3036 case AArch64::STURBBi:
3037 case AArch64::STURBi:
3038 case AArch64::STURDi:
3039 case AArch64::STURHHi:
3040 case AArch64::STURHi:
3041 case AArch64::STURQi:
3042 case AArch64::STURSi:
3043 case AArch64::STURWi:
3044 case AArch64::STURXi:
3045 case AArch64::STZ2Gi:
3046 case AArch64::STZGi:
3047 case AArch64::TAGPstack:
3048 return 2;
3049 case AArch64::LD1B_D_IMM:
3050 case AArch64::LD1B_H_IMM:
3051 case AArch64::LD1B_IMM:
3052 case AArch64::LD1B_S_IMM:
3053 case AArch64::LD1D_IMM:
3054 case AArch64::LD1H_D_IMM:
3055 case AArch64::LD1H_IMM:
3056 case AArch64::LD1H_S_IMM:
3057 case AArch64::LD1RB_D_IMM:
3058 case AArch64::LD1RB_H_IMM:
3059 case AArch64::LD1RB_IMM:
3060 case AArch64::LD1RB_S_IMM:
3061 case AArch64::LD1RD_IMM:
3062 case AArch64::LD1RH_D_IMM:
3063 case AArch64::LD1RH_IMM:
3064 case AArch64::LD1RH_S_IMM:
3065 case AArch64::LD1RSB_D_IMM:
3066 case AArch64::LD1RSB_H_IMM:
3067 case AArch64::LD1RSB_S_IMM:
3068 case AArch64::LD1RSH_D_IMM:
3069 case AArch64::LD1RSH_S_IMM:
3070 case AArch64::LD1RSW_IMM:
3071 case AArch64::LD1RW_D_IMM:
3072 case AArch64::LD1RW_IMM:
3073 case AArch64::LD1SB_D_IMM:
3074 case AArch64::LD1SB_H_IMM:
3075 case AArch64::LD1SB_S_IMM:
3076 case AArch64::LD1SH_D_IMM:
3077 case AArch64::LD1SH_S_IMM:
3078 case AArch64::LD1SW_D_IMM:
3079 case AArch64::LD1W_D_IMM:
3080 case AArch64::LD1W_IMM:
3081 case AArch64::LD2B_IMM:
3082 case AArch64::LD2D_IMM:
3083 case AArch64::LD2H_IMM:
3084 case AArch64::LD2W_IMM:
3085 case AArch64::LD3B_IMM:
3086 case AArch64::LD3D_IMM:
3087 case AArch64::LD3H_IMM:
3088 case AArch64::LD3W_IMM:
3089 case AArch64::LD4B_IMM:
3090 case AArch64::LD4D_IMM:
3091 case AArch64::LD4H_IMM:
3092 case AArch64::LD4W_IMM:
3093 case AArch64::LDG:
3094 case AArch64::LDNF1B_D_IMM:
3095 case AArch64::LDNF1B_H_IMM:
3096 case AArch64::LDNF1B_IMM:
3097 case AArch64::LDNF1B_S_IMM:
3098 case AArch64::LDNF1D_IMM:
3099 case AArch64::LDNF1H_D_IMM:
3100 case AArch64::LDNF1H_IMM:
3101 case AArch64::LDNF1H_S_IMM:
3102 case AArch64::LDNF1SB_D_IMM:
3103 case AArch64::LDNF1SB_H_IMM:
3104 case AArch64::LDNF1SB_S_IMM:
3105 case AArch64::LDNF1SH_D_IMM:
3106 case AArch64::LDNF1SH_S_IMM:
3107 case AArch64::LDNF1SW_D_IMM:
3108 case AArch64::LDNF1W_D_IMM:
3109 case AArch64::LDNF1W_IMM:
3110 case AArch64::LDNPDi:
3111 case AArch64::LDNPQi:
3112 case AArch64::LDNPSi:
3113 case AArch64::LDNPWi:
3114 case AArch64::LDNPXi:
3115 case AArch64::LDNT1B_ZRI:
3116 case AArch64::LDNT1D_ZRI:
3117 case AArch64::LDNT1H_ZRI:
3118 case AArch64::LDNT1W_ZRI:
3119 case AArch64::LDPDi:
3120 case AArch64::LDPQi:
3121 case AArch64::LDPSi:
3122 case AArch64::LDPWi:
3123 case AArch64::LDPXi:
3124 case AArch64::LDRBBpost:
3125 case AArch64::LDRBBpre:
3126 case AArch64::LDRBpost:
3127 case AArch64::LDRBpre:
3128 case AArch64::LDRDpost:
3129 case AArch64::LDRDpre:
3130 case AArch64::LDRHHpost:
3131 case AArch64::LDRHHpre:
3132 case AArch64::LDRHpost:
3133 case AArch64::LDRHpre:
3134 case AArch64::LDRQpost:
3135 case AArch64::LDRQpre:
3136 case AArch64::LDRSpost:
3137 case AArch64::LDRSpre:
3138 case AArch64::LDRWpost:
3139 case AArch64::LDRWpre:
3140 case AArch64::LDRXpost:
3141 case AArch64::LDRXpre:
3142 case AArch64::ST1B_D_IMM:
3143 case AArch64::ST1B_H_IMM:
3144 case AArch64::ST1B_IMM:
3145 case AArch64::ST1B_S_IMM:
3146 case AArch64::ST1D_IMM:
3147 case AArch64::ST1H_D_IMM:
3148 case AArch64::ST1H_IMM:
3149 case AArch64::ST1H_S_IMM:
3150 case AArch64::ST1W_D_IMM:
3151 case AArch64::ST1W_IMM:
3152 case AArch64::ST2B_IMM:
3153 case AArch64::ST2D_IMM:
3154 case AArch64::ST2H_IMM:
3155 case AArch64::ST2W_IMM:
3156 case AArch64::ST3B_IMM:
3157 case AArch64::ST3D_IMM:
3158 case AArch64::ST3H_IMM:
3159 case AArch64::ST3W_IMM:
3160 case AArch64::ST4B_IMM:
3161 case AArch64::ST4D_IMM:
3162 case AArch64::ST4H_IMM:
3163 case AArch64::ST4W_IMM:
3164 case AArch64::STGPi:
3165 case AArch64::STGPreIndex:
3166 case AArch64::STZGPreIndex:
3167 case AArch64::ST2GPreIndex:
3168 case AArch64::STZ2GPreIndex:
3169 case AArch64::STGPostIndex:
3170 case AArch64::STZGPostIndex:
3171 case AArch64::ST2GPostIndex:
3172 case AArch64::STZ2GPostIndex:
3173 case AArch64::STNPDi:
3174 case AArch64::STNPQi:
3175 case AArch64::STNPSi:
3176 case AArch64::STNPWi:
3177 case AArch64::STNPXi:
3178 case AArch64::STNT1B_ZRI:
3179 case AArch64::STNT1D_ZRI:
3180 case AArch64::STNT1H_ZRI:
3181 case AArch64::STNT1W_ZRI:
3182 case AArch64::STPDi:
3183 case AArch64::STPQi:
3184 case AArch64::STPSi:
3185 case AArch64::STPWi:
3186 case AArch64::STPXi:
3187 case AArch64::STRBBpost:
3188 case AArch64::STRBBpre:
3189 case AArch64::STRBpost:
3190 case AArch64::STRBpre:
3191 case AArch64::STRDpost:
3192 case AArch64::STRDpre:
3193 case AArch64::STRHHpost:
3194 case AArch64::STRHHpre:
3195 case AArch64::STRHpost:
3196 case AArch64::STRHpre:
3197 case AArch64::STRQpost:
3198 case AArch64::STRQpre:
3199 case AArch64::STRSpost:
3200 case AArch64::STRSpre:
3201 case AArch64::STRWpost:
3202 case AArch64::STRWpre:
3203 case AArch64::STRXpost:
3204 case AArch64::STRXpre:
3205 case AArch64::LD1B_2Z_IMM:
3206 case AArch64::LD1B_2Z_STRIDED_IMM:
3207 case AArch64::LD1H_2Z_IMM:
3208 case AArch64::LD1H_2Z_STRIDED_IMM:
3209 case AArch64::LD1W_2Z_IMM:
3210 case AArch64::LD1W_2Z_STRIDED_IMM:
3211 case AArch64::LD1D_2Z_IMM:
3212 case AArch64::LD1D_2Z_STRIDED_IMM:
3213 case AArch64::LD1B_4Z_IMM:
3214 case AArch64::LD1B_4Z_STRIDED_IMM:
3215 case AArch64::LD1H_4Z_IMM:
3216 case AArch64::LD1H_4Z_STRIDED_IMM:
3217 case AArch64::LD1W_4Z_IMM:
3218 case AArch64::LD1W_4Z_STRIDED_IMM:
3219 case AArch64::LD1D_4Z_IMM:
3220 case AArch64::LD1D_4Z_STRIDED_IMM:
3221 case AArch64::LD1B_2Z_IMM_PSEUDO:
3222 case AArch64::LD1H_2Z_IMM_PSEUDO:
3223 case AArch64::LD1W_2Z_IMM_PSEUDO:
3224 case AArch64::LD1D_2Z_IMM_PSEUDO:
3225 case AArch64::LD1B_4Z_IMM_PSEUDO:
3226 case AArch64::LD1H_4Z_IMM_PSEUDO:
3227 case AArch64::LD1W_4Z_IMM_PSEUDO:
3228 case AArch64::LD1D_4Z_IMM_PSEUDO:
3229 case AArch64::ST1B_2Z_IMM:
3230 case AArch64::ST1B_2Z_STRIDED_IMM:
3231 case AArch64::ST1H_2Z_IMM:
3232 case AArch64::ST1H_2Z_STRIDED_IMM:
3233 case AArch64::ST1W_2Z_IMM:
3234 case AArch64::ST1W_2Z_STRIDED_IMM:
3235 case AArch64::ST1D_2Z_IMM:
3236 case AArch64::ST1D_2Z_STRIDED_IMM:
3237 case AArch64::LDNT1B_2Z_IMM_PSEUDO:
3238 case AArch64::LDNT1B_2Z_IMM:
3239 case AArch64::LDNT1B_2Z_STRIDED_IMM:
3240 case AArch64::LDNT1H_2Z_IMM_PSEUDO:
3241 case AArch64::LDNT1H_2Z_IMM:
3242 case AArch64::LDNT1H_2Z_STRIDED_IMM:
3243 case AArch64::LDNT1W_2Z_IMM_PSEUDO:
3244 case AArch64::LDNT1W_2Z_IMM:
3245 case AArch64::LDNT1W_2Z_STRIDED_IMM:
3246 case AArch64::LDNT1D_2Z_IMM_PSEUDO:
3247 case AArch64::LDNT1D_2Z_IMM:
3248 case AArch64::LDNT1D_2Z_STRIDED_IMM:
3249 case AArch64::STNT1B_2Z_IMM:
3250 case AArch64::STNT1B_2Z_STRIDED_IMM:
3251 case AArch64::STNT1H_2Z_IMM:
3252 case AArch64::STNT1H_2Z_STRIDED_IMM:
3253 case AArch64::STNT1W_2Z_IMM:
3254 case AArch64::STNT1W_2Z_STRIDED_IMM:
3255 case AArch64::STNT1D_2Z_IMM:
3256 case AArch64::STNT1D_2Z_STRIDED_IMM:
3257 case AArch64::ST1B_4Z_IMM:
3258 case AArch64::ST1B_4Z_STRIDED_IMM:
3259 case AArch64::ST1H_4Z_IMM:
3260 case AArch64::ST1H_4Z_STRIDED_IMM:
3261 case AArch64::ST1W_4Z_IMM:
3262 case AArch64::ST1W_4Z_STRIDED_IMM:
3263 case AArch64::ST1D_4Z_IMM:
3264 case AArch64::ST1D_4Z_STRIDED_IMM:
3265 case AArch64::LDNT1B_4Z_IMM_PSEUDO:
3266 case AArch64::LDNT1B_4Z_IMM:
3267 case AArch64::LDNT1B_4Z_STRIDED_IMM:
3268 case AArch64::LDNT1H_4Z_IMM_PSEUDO:
3269 case AArch64::LDNT1H_4Z_IMM:
3270 case AArch64::LDNT1H_4Z_STRIDED_IMM:
3271 case AArch64::LDNT1W_4Z_IMM_PSEUDO:
3272 case AArch64::LDNT1W_4Z_IMM:
3273 case AArch64::LDNT1W_4Z_STRIDED_IMM:
3274 case AArch64::LDNT1D_4Z_IMM_PSEUDO:
3275 case AArch64::LDNT1D_4Z_IMM:
3276 case AArch64::LDNT1D_4Z_STRIDED_IMM:
3277 case AArch64::STNT1B_4Z_IMM:
3278 case AArch64::STNT1B_4Z_STRIDED_IMM:
3279 case AArch64::STNT1H_4Z_IMM:
3280 case AArch64::STNT1H_4Z_STRIDED_IMM:
3281 case AArch64::STNT1W_4Z_IMM:
3282 case AArch64::STNT1W_4Z_STRIDED_IMM:
3283 case AArch64::STNT1D_4Z_IMM:
3284 case AArch64::STNT1D_4Z_STRIDED_IMM:
3285 return 3;
3286 case AArch64::LDPDpost:
3287 case AArch64::LDPDpre:
3288 case AArch64::LDPQpost:
3289 case AArch64::LDPQpre:
3290 case AArch64::LDPSpost:
3291 case AArch64::LDPSpre:
3292 case AArch64::LDPWpost:
3293 case AArch64::LDPWpre:
3294 case AArch64::LDPXpost:
3295 case AArch64::LDPXpre:
3296 case AArch64::STGPpre:
3297 case AArch64::STGPpost:
3298 case AArch64::STPDpost:
3299 case AArch64::STPDpre:
3300 case AArch64::STPQpost:
3301 case AArch64::STPQpre:
3302 case AArch64::STPSpost:
3303 case AArch64::STPSpre:
3304 case AArch64::STPWpost:
3305 case AArch64::STPWpre:
3306 case AArch64::STPXpost:
3307 case AArch64::STPXpre:
3308 return 4;
3309 }
3310}
3311
3312bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
3313 switch (MI.getOpcode()) {
3314 default:
3315 return false;
3316 // Scaled instructions.
3317 case AArch64::STRSui:
3318 case AArch64::STRDui:
3319 case AArch64::STRQui:
3320 case AArch64::STRXui:
3321 case AArch64::STRWui:
3322 case AArch64::LDRSui:
3323 case AArch64::LDRDui:
3324 case AArch64::LDRQui:
3325 case AArch64::LDRXui:
3326 case AArch64::LDRWui:
3327 case AArch64::LDRSWui:
3328 // Unscaled instructions.
3329 case AArch64::STURSi:
3330 case AArch64::STRSpre:
3331 case AArch64::STURDi:
3332 case AArch64::STRDpre:
3333 case AArch64::STURQi:
3334 case AArch64::STRQpre:
3335 case AArch64::STURWi:
3336 case AArch64::STRWpre:
3337 case AArch64::STURXi:
3338 case AArch64::STRXpre:
3339 case AArch64::LDURSi:
3340 case AArch64::LDRSpre:
3341 case AArch64::LDURDi:
3342 case AArch64::LDRDpre:
3343 case AArch64::LDURQi:
3344 case AArch64::LDRQpre:
3345 case AArch64::LDURWi:
3346 case AArch64::LDRWpre:
3347 case AArch64::LDURXi:
3348 case AArch64::LDRXpre:
3349 case AArch64::LDURSWi:
3350 case AArch64::LDRSWpre:
3351 // SVE instructions.
3352 case AArch64::LDR_ZXI:
3353 case AArch64::STR_ZXI:
3354 return true;
3355 }
3356}
3357
3358bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
3359 switch (MI.getOpcode()) {
3360 default:
3361 assert((!MI.isCall() || !MI.isReturn()) &&
3362 "Unexpected instruction - was a new tail call opcode introduced?");
3363 return false;
3364 case AArch64::TCRETURNdi:
3365 case AArch64::TCRETURNri:
3366 case AArch64::TCRETURNrix16x17:
3367 case AArch64::TCRETURNrix17:
3368 case AArch64::TCRETURNrinotx16:
3369 case AArch64::TCRETURNriALL:
3370 case AArch64::AUTH_TCRETURN:
3371 case AArch64::AUTH_TCRETURN_BTI:
3372 return true;
3373 }
3374}
3375
3376unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
3377 switch (Opc) {
3378 default:
3379 llvm_unreachable("Opcode has no flag setting equivalent!");
3380 // 32-bit cases:
3381 case AArch64::ADDWri:
3382 return AArch64::ADDSWri;
3383 case AArch64::ADDWrr:
3384 return AArch64::ADDSWrr;
3385 case AArch64::ADDWrs:
3386 return AArch64::ADDSWrs;
3387 case AArch64::ADDWrx:
3388 return AArch64::ADDSWrx;
3389 case AArch64::ANDWri:
3390 return AArch64::ANDSWri;
3391 case AArch64::ANDWrr:
3392 return AArch64::ANDSWrr;
3393 case AArch64::ANDWrs:
3394 return AArch64::ANDSWrs;
3395 case AArch64::BICWrr:
3396 return AArch64::BICSWrr;
3397 case AArch64::BICWrs:
3398 return AArch64::BICSWrs;
3399 case AArch64::SUBWri:
3400 return AArch64::SUBSWri;
3401 case AArch64::SUBWrr:
3402 return AArch64::SUBSWrr;
3403 case AArch64::SUBWrs:
3404 return AArch64::SUBSWrs;
3405 case AArch64::SUBWrx:
3406 return AArch64::SUBSWrx;
3407 // 64-bit cases:
3408 case AArch64::ADDXri:
3409 return AArch64::ADDSXri;
3410 case AArch64::ADDXrr:
3411 return AArch64::ADDSXrr;
3412 case AArch64::ADDXrs:
3413 return AArch64::ADDSXrs;
3414 case AArch64::ADDXrx:
3415 return AArch64::ADDSXrx;
3416 case AArch64::ANDXri:
3417 return AArch64::ANDSXri;
3418 case AArch64::ANDXrr:
3419 return AArch64::ANDSXrr;
3420 case AArch64::ANDXrs:
3421 return AArch64::ANDSXrs;
3422 case AArch64::BICXrr:
3423 return AArch64::BICSXrr;
3424 case AArch64::BICXrs:
3425 return AArch64::BICSXrs;
3426 case AArch64::SUBXri:
3427 return AArch64::SUBSXri;
3428 case AArch64::SUBXrr:
3429 return AArch64::SUBSXrr;
3430 case AArch64::SUBXrs:
3431 return AArch64::SUBSXrs;
3432 case AArch64::SUBXrx:
3433 return AArch64::SUBSXrx;
3434 // SVE instructions:
3435 case AArch64::AND_PPzPP:
3436 return AArch64::ANDS_PPzPP;
3437 case AArch64::BIC_PPzPP:
3438 return AArch64::BICS_PPzPP;
3439 case AArch64::EOR_PPzPP:
3440 return AArch64::EORS_PPzPP;
3441 case AArch64::NAND_PPzPP:
3442 return AArch64::NANDS_PPzPP;
3443 case AArch64::NOR_PPzPP:
3444 return AArch64::NORS_PPzPP;
3445 case AArch64::ORN_PPzPP:
3446 return AArch64::ORNS_PPzPP;
3447 case AArch64::ORR_PPzPP:
3448 return AArch64::ORRS_PPzPP;
3449 case AArch64::BRKA_PPzP:
3450 return AArch64::BRKAS_PPzP;
3451 case AArch64::BRKPA_PPzPP:
3452 return AArch64::BRKPAS_PPzPP;
3453 case AArch64::BRKB_PPzP:
3454 return AArch64::BRKBS_PPzP;
3455 case AArch64::BRKPB_PPzPP:
3456 return AArch64::BRKPBS_PPzPP;
3457 case AArch64::BRKN_PPzP:
3458 return AArch64::BRKNS_PPzP;
3459 case AArch64::RDFFR_PPz:
3460 return AArch64::RDFFRS_PPz;
3461 case AArch64::PTRUE_B:
3462 return AArch64::PTRUES_B;
3463 }
3464}
3465
3466// Is this a candidate for ld/st merging or pairing? For example, we don't
3467// touch volatiles or load/stores that have a hint to avoid pair formation.
3468bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
3469
3470 bool IsPreLdSt = isPreLdSt(MI);
3471
3472 // If this is a volatile load/store, don't mess with it.
3473 if (MI.hasOrderedMemoryRef())
3474 return false;
3475
3476 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
3477 // For Pre-inc LD/ST, the operand is shifted by one.
3478 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
3479 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
3480 "Expected a reg or frame index operand.");
3481
3482 // For Pre-indexed addressing quadword instructions, the third operand is the
3483 // immediate value.
3484 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(i: 3).isImm();
3485
3486 if (!MI.getOperand(i: 2).isImm() && !IsImmPreLdSt)
3487 return false;
3488
3489 // Can't merge/pair if the instruction modifies the base register.
3490 // e.g., ldr x0, [x0]
3491 // This case will never occur with an FI base.
3492 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
3493 // STR<S,D,Q,W,X>pre, it can be merged.
3494 // For example:
3495 // ldr q0, [x11, #32]!
3496 // ldr q1, [x11, #16]
3497 // to
3498 // ldp q0, q1, [x11, #32]!
3499 if (MI.getOperand(i: 1).isReg() && !IsPreLdSt) {
3500 Register BaseReg = MI.getOperand(i: 1).getReg();
3501 const TargetRegisterInfo *TRI = &getRegisterInfo();
3502 if (MI.modifiesRegister(Reg: BaseReg, TRI))
3503 return false;
3504 }
3505
3506 // Pairing SVE fills/spills is only valid for little-endian targets that
3507 // implement VLS 128.
3508 switch (MI.getOpcode()) {
3509 default:
3510 break;
3511 case AArch64::LDR_ZXI:
3512 case AArch64::STR_ZXI:
3513 if (!Subtarget.isLittleEndian() ||
3514 Subtarget.getSVEVectorSizeInBits() != 128)
3515 return false;
3516 }
3517
3518 // Check if this load/store has a hint to avoid pair formation.
3519 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
3520 if (isLdStPairSuppressed(MI))
3521 return false;
3522
3523 // Do not pair any callee-save store/reload instructions in the
3524 // prologue/epilogue if the CFI information encoded the operations as separate
3525 // instructions, as that will cause the size of the actual prologue to mismatch
3526 // with the prologue size recorded in the Windows CFI.
3527 const MCAsmInfo &MAI = MI.getMF()->getTarget().getMCAsmInfo();
3528 bool NeedsWinCFI =
3529 MAI.usesWindowsCFI() && MI.getMF()->getFunction().needsUnwindTableEntry();
3530 if (NeedsWinCFI && (MI.getFlag(Flag: MachineInstr::FrameSetup) ||
3531 MI.getFlag(Flag: MachineInstr::FrameDestroy)))
3532 return false;
3533
3534 // On some CPUs quad load/store pairs are slower than two single load/stores.
3535 if (Subtarget.isPaired128Slow()) {
3536 switch (MI.getOpcode()) {
3537 default:
3538 break;
3539 case AArch64::LDURQi:
3540 case AArch64::STURQi:
3541 case AArch64::LDRQui:
3542 case AArch64::STRQui:
3543 return false;
3544 }
3545 }
3546
3547 return true;
3548}
3549
3550bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
3551 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
3552 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
3553 const TargetRegisterInfo *TRI) const {
3554 if (!LdSt.mayLoadOrStore())
3555 return false;
3556
3557 const MachineOperand *BaseOp;
3558 TypeSize WidthN(0, false);
3559 if (!getMemOperandWithOffsetWidth(MI: LdSt, BaseOp, Offset, OffsetIsScalable,
3560 Width&: WidthN, TRI))
3561 return false;
3562 // The maximum vscale is 16 under AArch64, return the maximal extent for the
3563 // vector.
3564 Width = LocationSize::precise(Value: WidthN);
3565 BaseOps.push_back(Elt: BaseOp);
3566 return true;
3567}
3568
3569std::optional<ExtAddrMode>
3570AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
3571 const TargetRegisterInfo *TRI) const {
3572 const MachineOperand *Base; // Filled with the base operand of MI.
3573 int64_t Offset; // Filled with the offset of MI.
3574 bool OffsetIsScalable;
3575 if (!getMemOperandWithOffset(MI: MemI, BaseOp&: Base, Offset, OffsetIsScalable, TRI))
3576 return std::nullopt;
3577
3578 if (!Base->isReg())
3579 return std::nullopt;
3580 ExtAddrMode AM;
3581 AM.BaseReg = Base->getReg();
3582 AM.Displacement = Offset;
3583 AM.ScaledReg = 0;
3584 AM.Scale = 0;
3585 return AM;
3586}
3587
3588bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
3589 Register Reg,
3590 const MachineInstr &AddrI,
3591 ExtAddrMode &AM) const {
3592 // Filter out instructions into which we cannot fold.
3593 unsigned NumBytes;
3594 int64_t OffsetScale = 1;
3595 switch (MemI.getOpcode()) {
3596 default:
3597 return false;
3598
3599 case AArch64::LDURQi:
3600 case AArch64::STURQi:
3601 NumBytes = 16;
3602 break;
3603
3604 case AArch64::LDURDi:
3605 case AArch64::STURDi:
3606 case AArch64::LDURXi:
3607 case AArch64::STURXi:
3608 NumBytes = 8;
3609 break;
3610
3611 case AArch64::LDURWi:
3612 case AArch64::LDURSWi:
3613 case AArch64::STURWi:
3614 NumBytes = 4;
3615 break;
3616
3617 case AArch64::LDURHi:
3618 case AArch64::STURHi:
3619 case AArch64::LDURHHi:
3620 case AArch64::STURHHi:
3621 case AArch64::LDURSHXi:
3622 case AArch64::LDURSHWi:
3623 NumBytes = 2;
3624 break;
3625
3626 case AArch64::LDRBroX:
3627 case AArch64::LDRBBroX:
3628 case AArch64::LDRSBXroX:
3629 case AArch64::LDRSBWroX:
3630 case AArch64::STRBroX:
3631 case AArch64::STRBBroX:
3632 case AArch64::LDURBi:
3633 case AArch64::LDURBBi:
3634 case AArch64::LDURSBXi:
3635 case AArch64::LDURSBWi:
3636 case AArch64::STURBi:
3637 case AArch64::STURBBi:
3638 case AArch64::LDRBui:
3639 case AArch64::LDRBBui:
3640 case AArch64::LDRSBXui:
3641 case AArch64::LDRSBWui:
3642 case AArch64::STRBui:
3643 case AArch64::STRBBui:
3644 NumBytes = 1;
3645 break;
3646
3647 case AArch64::LDRQroX:
3648 case AArch64::STRQroX:
3649 case AArch64::LDRQui:
3650 case AArch64::STRQui:
3651 NumBytes = 16;
3652 OffsetScale = 16;
3653 break;
3654
3655 case AArch64::LDRDroX:
3656 case AArch64::STRDroX:
3657 case AArch64::LDRXroX:
3658 case AArch64::STRXroX:
3659 case AArch64::LDRDui:
3660 case AArch64::STRDui:
3661 case AArch64::LDRXui:
3662 case AArch64::STRXui:
3663 NumBytes = 8;
3664 OffsetScale = 8;
3665 break;
3666
3667 case AArch64::LDRWroX:
3668 case AArch64::LDRSWroX:
3669 case AArch64::STRWroX:
3670 case AArch64::LDRWui:
3671 case AArch64::LDRSWui:
3672 case AArch64::STRWui:
3673 NumBytes = 4;
3674 OffsetScale = 4;
3675 break;
3676
3677 case AArch64::LDRHroX:
3678 case AArch64::STRHroX:
3679 case AArch64::LDRHHroX:
3680 case AArch64::STRHHroX:
3681 case AArch64::LDRSHXroX:
3682 case AArch64::LDRSHWroX:
3683 case AArch64::LDRHui:
3684 case AArch64::STRHui:
3685 case AArch64::LDRHHui:
3686 case AArch64::STRHHui:
3687 case AArch64::LDRSHXui:
3688 case AArch64::LDRSHWui:
3689 NumBytes = 2;
3690 OffsetScale = 2;
3691 break;
3692 }
3693
3694 // Check the fold operand is not the loaded/stored value.
3695 const MachineOperand &BaseRegOp = MemI.getOperand(i: 0);
3696 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
3697 return false;
3698
3699 // Handle memory instructions with a [Reg, Reg] addressing mode.
3700 if (MemI.getOperand(i: 2).isReg()) {
3701 // Bail if the addressing mode already includes extension of the offset
3702 // register.
3703 if (MemI.getOperand(i: 3).getImm())
3704 return false;
3705
3706 // Check if we actually have a scaled offset.
3707 if (MemI.getOperand(i: 4).getImm() == 0)
3708 OffsetScale = 1;
3709
3710 // If the address instructions is folded into the base register, then the
3711 // addressing mode must not have a scale. Then we can swap the base and the
3712 // scaled registers.
3713 if (MemI.getOperand(i: 1).getReg() == Reg && OffsetScale != 1)
3714 return false;
3715
3716 switch (AddrI.getOpcode()) {
3717 default:
3718 return false;
3719
3720 case AArch64::SBFMXri:
3721 // sxtw Xa, Wm
3722 // ldr Xd, [Xn, Xa, lsl #N]
3723 // ->
3724 // ldr Xd, [Xn, Wm, sxtw #N]
3725 if (AddrI.getOperand(i: 2).getImm() != 0 ||
3726 AddrI.getOperand(i: 3).getImm() != 31)
3727 return false;
3728
3729 AM.BaseReg = MemI.getOperand(i: 1).getReg();
3730 if (AM.BaseReg == Reg)
3731 AM.BaseReg = MemI.getOperand(i: 2).getReg();
3732 AM.ScaledReg = AddrI.getOperand(i: 1).getReg();
3733 AM.Scale = OffsetScale;
3734 AM.Displacement = 0;
3735 AM.Form = ExtAddrMode::Formula::SExtScaledReg;
3736 return true;
3737
3738 case TargetOpcode::SUBREG_TO_REG: {
3739 // mov Wa, Wm
3740 // ldr Xd, [Xn, Xa, lsl #N]
3741 // ->
3742 // ldr Xd, [Xn, Wm, uxtw #N]
3743
3744 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
3745 if (AddrI.getOperand(i: 2).getImm() != AArch64::sub_32)
3746 return false;
3747
3748 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
3749 Register OffsetReg = AddrI.getOperand(i: 1).getReg();
3750 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(RegNo: OffsetReg))
3751 return false;
3752
3753 const MachineInstr &DefMI = *MRI.getVRegDef(Reg: OffsetReg);
3754 if (DefMI.getOpcode() != AArch64::ORRWrs ||
3755 DefMI.getOperand(i: 1).getReg() != AArch64::WZR ||
3756 DefMI.getOperand(i: 3).getImm() != 0)
3757 return false;
3758
3759 AM.BaseReg = MemI.getOperand(i: 1).getReg();
3760 if (AM.BaseReg == Reg)
3761 AM.BaseReg = MemI.getOperand(i: 2).getReg();
3762 AM.ScaledReg = DefMI.getOperand(i: 2).getReg();
3763 AM.Scale = OffsetScale;
3764 AM.Displacement = 0;
3765 AM.Form = ExtAddrMode::Formula::ZExtScaledReg;
3766 return true;
3767 }
3768 }
3769 }
3770
3771 // Handle memory instructions with a [Reg, #Imm] addressing mode.
3772
3773 // Check we are not breaking a potential conversion to an LDP.
3774 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
3775 int64_t NewOffset) -> bool {
3776 int64_t MinOffset, MaxOffset;
3777 switch (NumBytes) {
3778 default:
3779 return true;
3780 case 4:
3781 MinOffset = -256;
3782 MaxOffset = 252;
3783 break;
3784 case 8:
3785 MinOffset = -512;
3786 MaxOffset = 504;
3787 break;
3788 case 16:
3789 MinOffset = -1024;
3790 MaxOffset = 1008;
3791 break;
3792 }
3793 return OldOffset < MinOffset || OldOffset > MaxOffset ||
3794 (NewOffset >= MinOffset && NewOffset <= MaxOffset);
3795 };
3796 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
3797 int64_t OldOffset = MemI.getOperand(i: 2).getImm() * OffsetScale;
3798 int64_t NewOffset = OldOffset + Disp;
3799 if (!isLegalAddressingMode(NumBytes, Offset: NewOffset, /* Scale */ 0))
3800 return false;
3801 // If the old offset would fit into an LDP, but the new offset wouldn't,
3802 // bail out.
3803 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
3804 return false;
3805 AM.BaseReg = AddrI.getOperand(i: 1).getReg();
3806 AM.ScaledReg = 0;
3807 AM.Scale = 0;
3808 AM.Displacement = NewOffset;
3809 AM.Form = ExtAddrMode::Formula::Basic;
3810 return true;
3811 };
3812
3813 auto canFoldAddRegIntoAddrMode =
3814 [&](int64_t Scale,
3815 ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool {
3816 if (MemI.getOperand(i: 2).getImm() != 0)
3817 return false;
3818 if ((unsigned)Scale != Scale)
3819 return false;
3820 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
3821 return false;
3822 AM.BaseReg = AddrI.getOperand(i: 1).getReg();
3823 AM.ScaledReg = AddrI.getOperand(i: 2).getReg();
3824 AM.Scale = Scale;
3825 AM.Displacement = 0;
3826 AM.Form = Form;
3827 return true;
3828 };
3829
3830 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
3831 unsigned Opcode = MemI.getOpcode();
3832 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
3833 Subtarget.isSTRQroSlow();
3834 };
3835
3836 int64_t Disp = 0;
3837 const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
3838 switch (AddrI.getOpcode()) {
3839 default:
3840 return false;
3841
3842 case AArch64::ADDXri:
3843 // add Xa, Xn, #N
3844 // ldr Xd, [Xa, #M]
3845 // ->
3846 // ldr Xd, [Xn, #N'+M]
3847 Disp = AddrI.getOperand(i: 2).getImm() << AddrI.getOperand(i: 3).getImm();
3848 return canFoldAddSubImmIntoAddrMode(Disp);
3849
3850 case AArch64::SUBXri:
3851 // sub Xa, Xn, #N
3852 // ldr Xd, [Xa, #M]
3853 // ->
3854 // ldr Xd, [Xn, #N'+M]
3855 Disp = AddrI.getOperand(i: 2).getImm() << AddrI.getOperand(i: 3).getImm();
3856 return canFoldAddSubImmIntoAddrMode(-Disp);
3857
3858 case AArch64::ADDXrs: {
3859 // add Xa, Xn, Xm, lsl #N
3860 // ldr Xd, [Xa]
3861 // ->
3862 // ldr Xd, [Xn, Xm, lsl #N]
3863
3864 // Don't fold the add if the result would be slower, unless optimising for
3865 // size.
3866 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(i: 3).getImm());
3867 if (AArch64_AM::getShiftType(Imm: Shift) != AArch64_AM::ShiftExtendType::LSL)
3868 return false;
3869 Shift = AArch64_AM::getShiftValue(Imm: Shift);
3870 if (!OptSize) {
3871 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3872 return false;
3873 if (avoidSlowSTRQ(MemI))
3874 return false;
3875 }
3876 return canFoldAddRegIntoAddrMode(1ULL << Shift);
3877 }
3878
3879 case AArch64::ADDXrr:
3880 // add Xa, Xn, Xm
3881 // ldr Xd, [Xa]
3882 // ->
3883 // ldr Xd, [Xn, Xm, lsl #0]
3884
3885 // Don't fold the add if the result would be slower, unless optimising for
3886 // size.
3887 if (!OptSize && avoidSlowSTRQ(MemI))
3888 return false;
3889 return canFoldAddRegIntoAddrMode(1);
3890
3891 case AArch64::ADDXrx:
3892 // add Xa, Xn, Wm, {s,u}xtw #N
3893 // ldr Xd, [Xa]
3894 // ->
3895 // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3896
3897 // Don't fold the add if the result would be slower, unless optimising for
3898 // size.
3899 if (!OptSize && avoidSlowSTRQ(MemI))
3900 return false;
3901
3902 // Can fold only sign-/zero-extend of a word.
3903 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(i: 3).getImm());
3904 AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);
3905 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3906 return false;
3907
3908 return canFoldAddRegIntoAddrMode(
3909 1ULL << AArch64_AM::getArithShiftValue(Imm),
3910 (Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg
3911 : ExtAddrMode::Formula::ZExtScaledReg);
3912 }
3913}
3914
3915// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3916// return the opcode of an instruction performing the same operation, but using
3917// the [Reg, Reg] addressing mode.
3918static unsigned regOffsetOpcode(unsigned Opcode) {
3919 switch (Opcode) {
3920 default:
3921 llvm_unreachable("Address folding not implemented for instruction");
3922
3923 case AArch64::LDURQi:
3924 case AArch64::LDRQui:
3925 return AArch64::LDRQroX;
3926 case AArch64::STURQi:
3927 case AArch64::STRQui:
3928 return AArch64::STRQroX;
3929 case AArch64::LDURDi:
3930 case AArch64::LDRDui:
3931 return AArch64::LDRDroX;
3932 case AArch64::STURDi:
3933 case AArch64::STRDui:
3934 return AArch64::STRDroX;
3935 case AArch64::LDURXi:
3936 case AArch64::LDRXui:
3937 return AArch64::LDRXroX;
3938 case AArch64::STURXi:
3939 case AArch64::STRXui:
3940 return AArch64::STRXroX;
3941 case AArch64::LDURWi:
3942 case AArch64::LDRWui:
3943 return AArch64::LDRWroX;
3944 case AArch64::LDURSWi:
3945 case AArch64::LDRSWui:
3946 return AArch64::LDRSWroX;
3947 case AArch64::STURWi:
3948 case AArch64::STRWui:
3949 return AArch64::STRWroX;
3950 case AArch64::LDURHi:
3951 case AArch64::LDRHui:
3952 return AArch64::LDRHroX;
3953 case AArch64::STURHi:
3954 case AArch64::STRHui:
3955 return AArch64::STRHroX;
3956 case AArch64::LDURHHi:
3957 case AArch64::LDRHHui:
3958 return AArch64::LDRHHroX;
3959 case AArch64::STURHHi:
3960 case AArch64::STRHHui:
3961 return AArch64::STRHHroX;
3962 case AArch64::LDURSHXi:
3963 case AArch64::LDRSHXui:
3964 return AArch64::LDRSHXroX;
3965 case AArch64::LDURSHWi:
3966 case AArch64::LDRSHWui:
3967 return AArch64::LDRSHWroX;
3968 case AArch64::LDURBi:
3969 case AArch64::LDRBui:
3970 return AArch64::LDRBroX;
3971 case AArch64::LDURBBi:
3972 case AArch64::LDRBBui:
3973 return AArch64::LDRBBroX;
3974 case AArch64::LDURSBXi:
3975 case AArch64::LDRSBXui:
3976 return AArch64::LDRSBXroX;
3977 case AArch64::LDURSBWi:
3978 case AArch64::LDRSBWui:
3979 return AArch64::LDRSBWroX;
3980 case AArch64::STURBi:
3981 case AArch64::STRBui:
3982 return AArch64::STRBroX;
3983 case AArch64::STURBBi:
3984 case AArch64::STRBBui:
3985 return AArch64::STRBBroX;
3986 }
3987}
3988
3989// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3990// the opcode of an instruction performing the same operation, but using the
3991// [Reg, #Imm] addressing mode with scaled offset.
3992unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3993 switch (Opcode) {
3994 default:
3995 llvm_unreachable("Address folding not implemented for instruction");
3996
3997 case AArch64::LDURQi:
3998 Scale = 16;
3999 return AArch64::LDRQui;
4000 case AArch64::STURQi:
4001 Scale = 16;
4002 return AArch64::STRQui;
4003 case AArch64::LDURDi:
4004 Scale = 8;
4005 return AArch64::LDRDui;
4006 case AArch64::STURDi:
4007 Scale = 8;
4008 return AArch64::STRDui;
4009 case AArch64::LDURXi:
4010 Scale = 8;
4011 return AArch64::LDRXui;
4012 case AArch64::STURXi:
4013 Scale = 8;
4014 return AArch64::STRXui;
4015 case AArch64::LDURWi:
4016 Scale = 4;
4017 return AArch64::LDRWui;
4018 case AArch64::LDURSWi:
4019 Scale = 4;
4020 return AArch64::LDRSWui;
4021 case AArch64::STURWi:
4022 Scale = 4;
4023 return AArch64::STRWui;
4024 case AArch64::LDURHi:
4025 Scale = 2;
4026 return AArch64::LDRHui;
4027 case AArch64::STURHi:
4028 Scale = 2;
4029 return AArch64::STRHui;
4030 case AArch64::LDURHHi:
4031 Scale = 2;
4032 return AArch64::LDRHHui;
4033 case AArch64::STURHHi:
4034 Scale = 2;
4035 return AArch64::STRHHui;
4036 case AArch64::LDURSHXi:
4037 Scale = 2;
4038 return AArch64::LDRSHXui;
4039 case AArch64::LDURSHWi:
4040 Scale = 2;
4041 return AArch64::LDRSHWui;
4042 case AArch64::LDURBi:
4043 Scale = 1;
4044 return AArch64::LDRBui;
4045 case AArch64::LDURBBi:
4046 Scale = 1;
4047 return AArch64::LDRBBui;
4048 case AArch64::LDURSBXi:
4049 Scale = 1;
4050 return AArch64::LDRSBXui;
4051 case AArch64::LDURSBWi:
4052 Scale = 1;
4053 return AArch64::LDRSBWui;
4054 case AArch64::STURBi:
4055 Scale = 1;
4056 return AArch64::STRBui;
4057 case AArch64::STURBBi:
4058 Scale = 1;
4059 return AArch64::STRBBui;
4060 case AArch64::LDRQui:
4061 case AArch64::STRQui:
4062 Scale = 16;
4063 return Opcode;
4064 case AArch64::LDRDui:
4065 case AArch64::STRDui:
4066 case AArch64::LDRXui:
4067 case AArch64::STRXui:
4068 Scale = 8;
4069 return Opcode;
4070 case AArch64::LDRWui:
4071 case AArch64::LDRSWui:
4072 case AArch64::STRWui:
4073 Scale = 4;
4074 return Opcode;
4075 case AArch64::LDRHui:
4076 case AArch64::STRHui:
4077 case AArch64::LDRHHui:
4078 case AArch64::STRHHui:
4079 case AArch64::LDRSHXui:
4080 case AArch64::LDRSHWui:
4081 Scale = 2;
4082 return Opcode;
4083 case AArch64::LDRBui:
4084 case AArch64::LDRBBui:
4085 case AArch64::LDRSBXui:
4086 case AArch64::LDRSBWui:
4087 case AArch64::STRBui:
4088 case AArch64::STRBBui:
4089 Scale = 1;
4090 return Opcode;
4091 }
4092}
4093
4094// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
4095// the opcode of an instruction performing the same operation, but using the
4096// [Reg, #Imm] addressing mode with unscaled offset.
4097unsigned unscaledOffsetOpcode(unsigned Opcode) {
4098 switch (Opcode) {
4099 default:
4100 llvm_unreachable("Address folding not implemented for instruction");
4101
4102 case AArch64::LDURQi:
4103 case AArch64::STURQi:
4104 case AArch64::LDURDi:
4105 case AArch64::STURDi:
4106 case AArch64::LDURXi:
4107 case AArch64::STURXi:
4108 case AArch64::LDURWi:
4109 case AArch64::LDURSWi:
4110 case AArch64::STURWi:
4111 case AArch64::LDURHi:
4112 case AArch64::STURHi:
4113 case AArch64::LDURHHi:
4114 case AArch64::STURHHi:
4115 case AArch64::LDURSHXi:
4116 case AArch64::LDURSHWi:
4117 case AArch64::LDURBi:
4118 case AArch64::STURBi:
4119 case AArch64::LDURBBi:
4120 case AArch64::STURBBi:
4121 case AArch64::LDURSBWi:
4122 case AArch64::LDURSBXi:
4123 return Opcode;
4124 case AArch64::LDRQui:
4125 return AArch64::LDURQi;
4126 case AArch64::STRQui:
4127 return AArch64::STURQi;
4128 case AArch64::LDRDui:
4129 return AArch64::LDURDi;
4130 case AArch64::STRDui:
4131 return AArch64::STURDi;
4132 case AArch64::LDRXui:
4133 return AArch64::LDURXi;
4134 case AArch64::STRXui:
4135 return AArch64::STURXi;
4136 case AArch64::LDRWui:
4137 return AArch64::LDURWi;
4138 case AArch64::LDRSWui:
4139 return AArch64::LDURSWi;
4140 case AArch64::STRWui:
4141 return AArch64::STURWi;
4142 case AArch64::LDRHui:
4143 return AArch64::LDURHi;
4144 case AArch64::STRHui:
4145 return AArch64::STURHi;
4146 case AArch64::LDRHHui:
4147 return AArch64::LDURHHi;
4148 case AArch64::STRHHui:
4149 return AArch64::STURHHi;
4150 case AArch64::LDRSHXui:
4151 return AArch64::LDURSHXi;
4152 case AArch64::LDRSHWui:
4153 return AArch64::LDURSHWi;
4154 case AArch64::LDRBBui:
4155 return AArch64::LDURBBi;
4156 case AArch64::LDRBui:
4157 return AArch64::LDURBi;
4158 case AArch64::STRBBui:
4159 return AArch64::STURBBi;
4160 case AArch64::STRBui:
4161 return AArch64::STURBi;
4162 case AArch64::LDRSBWui:
4163 return AArch64::LDURSBWi;
4164 case AArch64::LDRSBXui:
4165 return AArch64::LDURSBXi;
4166 }
4167}
4168
4169// Given the opcode of a memory load/store instruction, return the opcode of an
4170// instruction performing the same operation, but using
4171// the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
4172// offset register.
4173static unsigned offsetExtendOpcode(unsigned Opcode) {
4174 switch (Opcode) {
4175 default:
4176 llvm_unreachable("Address folding not implemented for instruction");
4177
4178 case AArch64::LDRQroX:
4179 case AArch64::LDURQi:
4180 case AArch64::LDRQui:
4181 return AArch64::LDRQroW;
4182 case AArch64::STRQroX:
4183 case AArch64::STURQi:
4184 case AArch64::STRQui:
4185 return AArch64::STRQroW;
4186 case AArch64::LDRDroX:
4187 case AArch64::LDURDi:
4188 case AArch64::LDRDui:
4189 return AArch64::LDRDroW;
4190 case AArch64::STRDroX:
4191 case AArch64::STURDi:
4192 case AArch64::STRDui:
4193 return AArch64::STRDroW;
4194 case AArch64::LDRXroX:
4195 case AArch64::LDURXi:
4196 case AArch64::LDRXui:
4197 return AArch64::LDRXroW;
4198 case AArch64::STRXroX:
4199 case AArch64::STURXi:
4200 case AArch64::STRXui:
4201 return AArch64::STRXroW;
4202 case AArch64::LDRWroX:
4203 case AArch64::LDURWi:
4204 case AArch64::LDRWui:
4205 return AArch64::LDRWroW;
4206 case AArch64::LDRSWroX:
4207 case AArch64::LDURSWi:
4208 case AArch64::LDRSWui:
4209 return AArch64::LDRSWroW;
4210 case AArch64::STRWroX:
4211 case AArch64::STURWi:
4212 case AArch64::STRWui:
4213 return AArch64::STRWroW;
4214 case AArch64::LDRHroX:
4215 case AArch64::LDURHi:
4216 case AArch64::LDRHui:
4217 return AArch64::LDRHroW;
4218 case AArch64::STRHroX:
4219 case AArch64::STURHi:
4220 case AArch64::STRHui:
4221 return AArch64::STRHroW;
4222 case AArch64::LDRHHroX:
4223 case AArch64::LDURHHi:
4224 case AArch64::LDRHHui:
4225 return AArch64::LDRHHroW;
4226 case AArch64::STRHHroX:
4227 case AArch64::STURHHi:
4228 case AArch64::STRHHui:
4229 return AArch64::STRHHroW;
4230 case AArch64::LDRSHXroX:
4231 case AArch64::LDURSHXi:
4232 case AArch64::LDRSHXui:
4233 return AArch64::LDRSHXroW;
4234 case AArch64::LDRSHWroX:
4235 case AArch64::LDURSHWi:
4236 case AArch64::LDRSHWui:
4237 return AArch64::LDRSHWroW;
4238 case AArch64::LDRBroX:
4239 case AArch64::LDURBi:
4240 case AArch64::LDRBui:
4241 return AArch64::LDRBroW;
4242 case AArch64::LDRBBroX:
4243 case AArch64::LDURBBi:
4244 case AArch64::LDRBBui:
4245 return AArch64::LDRBBroW;
4246 case AArch64::LDRSBXroX:
4247 case AArch64::LDURSBXi:
4248 case AArch64::LDRSBXui:
4249 return AArch64::LDRSBXroW;
4250 case AArch64::LDRSBWroX:
4251 case AArch64::LDURSBWi:
4252 case AArch64::LDRSBWui:
4253 return AArch64::LDRSBWroW;
4254 case AArch64::STRBroX:
4255 case AArch64::STURBi:
4256 case AArch64::STRBui:
4257 return AArch64::STRBroW;
4258 case AArch64::STRBBroX:
4259 case AArch64::STURBBi:
4260 case AArch64::STRBBui:
4261 return AArch64::STRBBroW;
4262 }
4263}
4264
4265MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
4266 const ExtAddrMode &AM) const {
4267
4268 const DebugLoc &DL = MemI.getDebugLoc();
4269 MachineBasicBlock &MBB = *MemI.getParent();
4270 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
4271
4272 if (AM.Form == ExtAddrMode::Formula::Basic) {
4273 if (AM.ScaledReg) {
4274 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
4275 unsigned Opcode = regOffsetOpcode(Opcode: MemI.getOpcode());
4276 MRI.constrainRegClass(Reg: AM.BaseReg, RC: &AArch64::GPR64spRegClass);
4277 auto B = BuildMI(BB&: MBB, I&: MemI, MIMD: DL, MCID: get(Opcode))
4278 .addReg(RegNo: MemI.getOperand(i: 0).getReg(),
4279 Flags: getDefRegState(B: MemI.mayLoad()))
4280 .addReg(RegNo: AM.BaseReg)
4281 .addReg(RegNo: AM.ScaledReg)
4282 .addImm(Val: 0)
4283 .addImm(Val: AM.Scale > 1)
4284 .setMemRefs(MemI.memoperands())
4285 .setMIFlags(MemI.getFlags());
4286 return B.getInstr();
4287 }
4288
4289 assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
4290 "Addressing mode not supported for folding");
4291
4292 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
4293 unsigned Scale = 1;
4294 unsigned Opcode = MemI.getOpcode();
4295 if (isInt<9>(x: AM.Displacement))
4296 Opcode = unscaledOffsetOpcode(Opcode);
4297 else
4298 Opcode = scaledOffsetOpcode(Opcode, Scale);
4299
4300 auto B =
4301 BuildMI(BB&: MBB, I&: MemI, MIMD: DL, MCID: get(Opcode))
4302 .addReg(RegNo: MemI.getOperand(i: 0).getReg(), Flags: getDefRegState(B: MemI.mayLoad()))
4303 .addReg(RegNo: AM.BaseReg)
4304 .addImm(Val: AM.Displacement / Scale)
4305 .setMemRefs(MemI.memoperands())
4306 .setMIFlags(MemI.getFlags());
4307 return B.getInstr();
4308 }
4309
4310 if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
4311 AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
4312 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
4313 assert(AM.ScaledReg && !AM.Displacement &&
4314 "Address offset can be a register or an immediate, but not both");
4315 unsigned Opcode = offsetExtendOpcode(Opcode: MemI.getOpcode());
4316 MRI.constrainRegClass(Reg: AM.BaseReg, RC: &AArch64::GPR64spRegClass);
4317 // Make sure the offset register is in the correct register class.
4318 Register OffsetReg = AM.ScaledReg;
4319 const TargetRegisterClass *RC = MRI.getRegClass(Reg: OffsetReg);
4320 if (RC->hasSuperClassEq(RC: &AArch64::GPR64RegClass)) {
4321 OffsetReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
4322 BuildMI(BB&: MBB, I&: MemI, MIMD: DL, MCID: get(Opcode: TargetOpcode::COPY), DestReg: OffsetReg)
4323 .addReg(RegNo: AM.ScaledReg, Flags: {}, SubReg: AArch64::sub_32);
4324 }
4325 auto B =
4326 BuildMI(BB&: MBB, I&: MemI, MIMD: DL, MCID: get(Opcode))
4327 .addReg(RegNo: MemI.getOperand(i: 0).getReg(), Flags: getDefRegState(B: MemI.mayLoad()))
4328 .addReg(RegNo: AM.BaseReg)
4329 .addReg(RegNo: OffsetReg)
4330 .addImm(Val: AM.Form == ExtAddrMode::Formula::SExtScaledReg)
4331 .addImm(Val: AM.Scale != 1)
4332 .setMemRefs(MemI.memoperands())
4333 .setMIFlags(MemI.getFlags());
4334
4335 return B.getInstr();
4336 }
4337
4338 llvm_unreachable(
4339 "Function must not be called with an addressing mode it can't handle");
4340}
4341
4342/// Return true if the opcode is a post-index ld/st instruction, which really
4343/// loads from base+0.
4344static bool isPostIndexLdStOpcode(unsigned Opcode) {
4345 switch (Opcode) {
4346 default:
4347 return false;
4348 case AArch64::LD1Fourv16b_POST:
4349 case AArch64::LD1Fourv1d_POST:
4350 case AArch64::LD1Fourv2d_POST:
4351 case AArch64::LD1Fourv2s_POST:
4352 case AArch64::LD1Fourv4h_POST:
4353 case AArch64::LD1Fourv4s_POST:
4354 case AArch64::LD1Fourv8b_POST:
4355 case AArch64::LD1Fourv8h_POST:
4356 case AArch64::LD1Onev16b_POST:
4357 case AArch64::LD1Onev1d_POST:
4358 case AArch64::LD1Onev2d_POST:
4359 case AArch64::LD1Onev2s_POST:
4360 case AArch64::LD1Onev4h_POST:
4361 case AArch64::LD1Onev4s_POST:
4362 case AArch64::LD1Onev8b_POST:
4363 case AArch64::LD1Onev8h_POST:
4364 case AArch64::LD1Rv16b_POST:
4365 case AArch64::LD1Rv1d_POST:
4366 case AArch64::LD1Rv2d_POST:
4367 case AArch64::LD1Rv2s_POST:
4368 case AArch64::LD1Rv4h_POST:
4369 case AArch64::LD1Rv4s_POST:
4370 case AArch64::LD1Rv8b_POST:
4371 case AArch64::LD1Rv8h_POST:
4372 case AArch64::LD1Threev16b_POST:
4373 case AArch64::LD1Threev1d_POST:
4374 case AArch64::LD1Threev2d_POST:
4375 case AArch64::LD1Threev2s_POST:
4376 case AArch64::LD1Threev4h_POST:
4377 case AArch64::LD1Threev4s_POST:
4378 case AArch64::LD1Threev8b_POST:
4379 case AArch64::LD1Threev8h_POST:
4380 case AArch64::LD1Twov16b_POST:
4381 case AArch64::LD1Twov1d_POST:
4382 case AArch64::LD1Twov2d_POST:
4383 case AArch64::LD1Twov2s_POST:
4384 case AArch64::LD1Twov4h_POST:
4385 case AArch64::LD1Twov4s_POST:
4386 case AArch64::LD1Twov8b_POST:
4387 case AArch64::LD1Twov8h_POST:
4388 case AArch64::LD1i16_POST:
4389 case AArch64::LD1i32_POST:
4390 case AArch64::LD1i64_POST:
4391 case AArch64::LD1i8_POST:
4392 case AArch64::LD2Rv16b_POST:
4393 case AArch64::LD2Rv1d_POST:
4394 case AArch64::LD2Rv2d_POST:
4395 case AArch64::LD2Rv2s_POST:
4396 case AArch64::LD2Rv4h_POST:
4397 case AArch64::LD2Rv4s_POST:
4398 case AArch64::LD2Rv8b_POST:
4399 case AArch64::LD2Rv8h_POST:
4400 case AArch64::LD2Twov16b_POST:
4401 case AArch64::LD2Twov2d_POST:
4402 case AArch64::LD2Twov2s_POST:
4403 case AArch64::LD2Twov4h_POST:
4404 case AArch64::LD2Twov4s_POST:
4405 case AArch64::LD2Twov8b_POST:
4406 case AArch64::LD2Twov8h_POST:
4407 case AArch64::LD2i16_POST:
4408 case AArch64::LD2i32_POST:
4409 case AArch64::LD2i64_POST:
4410 case AArch64::LD2i8_POST:
4411 case AArch64::LD3Rv16b_POST:
4412 case AArch64::LD3Rv1d_POST:
4413 case AArch64::LD3Rv2d_POST:
4414 case AArch64::LD3Rv2s_POST:
4415 case AArch64::LD3Rv4h_POST:
4416 case AArch64::LD3Rv4s_POST:
4417 case AArch64::LD3Rv8b_POST:
4418 case AArch64::LD3Rv8h_POST:
4419 case AArch64::LD3Threev16b_POST:
4420 case AArch64::LD3Threev2d_POST:
4421 case AArch64::LD3Threev2s_POST:
4422 case AArch64::LD3Threev4h_POST:
4423 case AArch64::LD3Threev4s_POST:
4424 case AArch64::LD3Threev8b_POST:
4425 case AArch64::LD3Threev8h_POST:
4426 case AArch64::LD3i16_POST:
4427 case AArch64::LD3i32_POST:
4428 case AArch64::LD3i64_POST:
4429 case AArch64::LD3i8_POST:
4430 case AArch64::LD4Fourv16b_POST:
4431 case AArch64::LD4Fourv2d_POST:
4432 case AArch64::LD4Fourv2s_POST:
4433 case AArch64::LD4Fourv4h_POST:
4434 case AArch64::LD4Fourv4s_POST:
4435 case AArch64::LD4Fourv8b_POST:
4436 case AArch64::LD4Fourv8h_POST:
4437 case AArch64::LD4Rv16b_POST:
4438 case AArch64::LD4Rv1d_POST:
4439 case AArch64::LD4Rv2d_POST:
4440 case AArch64::LD4Rv2s_POST:
4441 case AArch64::LD4Rv4h_POST:
4442 case AArch64::LD4Rv4s_POST:
4443 case AArch64::LD4Rv8b_POST:
4444 case AArch64::LD4Rv8h_POST:
4445 case AArch64::LD4i16_POST:
4446 case AArch64::LD4i32_POST:
4447 case AArch64::LD4i64_POST:
4448 case AArch64::LD4i8_POST:
4449 case AArch64::LDAPRWpost:
4450 case AArch64::LDAPRXpost:
4451 case AArch64::LDIAPPWpost:
4452 case AArch64::LDIAPPXpost:
4453 case AArch64::LDPDpost:
4454 case AArch64::LDPQpost:
4455 case AArch64::LDPSWpost:
4456 case AArch64::LDPSpost:
4457 case AArch64::LDPWpost:
4458 case AArch64::LDPXpost:
4459 case AArch64::LDRBBpost:
4460 case AArch64::LDRBpost:
4461 case AArch64::LDRDpost:
4462 case AArch64::LDRHHpost:
4463 case AArch64::LDRHpost:
4464 case AArch64::LDRQpost:
4465 case AArch64::LDRSBWpost:
4466 case AArch64::LDRSBXpost:
4467 case AArch64::LDRSHWpost:
4468 case AArch64::LDRSHXpost:
4469 case AArch64::LDRSWpost:
4470 case AArch64::LDRSpost:
4471 case AArch64::LDRWpost:
4472 case AArch64::LDRXpost:
4473 case AArch64::ST1Fourv16b_POST:
4474 case AArch64::ST1Fourv1d_POST:
4475 case AArch64::ST1Fourv2d_POST:
4476 case AArch64::ST1Fourv2s_POST:
4477 case AArch64::ST1Fourv4h_POST:
4478 case AArch64::ST1Fourv4s_POST:
4479 case AArch64::ST1Fourv8b_POST:
4480 case AArch64::ST1Fourv8h_POST:
4481 case AArch64::ST1Onev16b_POST:
4482 case AArch64::ST1Onev1d_POST:
4483 case AArch64::ST1Onev2d_POST:
4484 case AArch64::ST1Onev2s_POST:
4485 case AArch64::ST1Onev4h_POST:
4486 case AArch64::ST1Onev4s_POST:
4487 case AArch64::ST1Onev8b_POST:
4488 case AArch64::ST1Onev8h_POST:
4489 case AArch64::ST1Threev16b_POST:
4490 case AArch64::ST1Threev1d_POST:
4491 case AArch64::ST1Threev2d_POST:
4492 case AArch64::ST1Threev2s_POST:
4493 case AArch64::ST1Threev4h_POST:
4494 case AArch64::ST1Threev4s_POST:
4495 case AArch64::ST1Threev8b_POST:
4496 case AArch64::ST1Threev8h_POST:
4497 case AArch64::ST1Twov16b_POST:
4498 case AArch64::ST1Twov1d_POST:
4499 case AArch64::ST1Twov2d_POST:
4500 case AArch64::ST1Twov2s_POST:
4501 case AArch64::ST1Twov4h_POST:
4502 case AArch64::ST1Twov4s_POST:
4503 case AArch64::ST1Twov8b_POST:
4504 case AArch64::ST1Twov8h_POST:
4505 case AArch64::ST1i16_POST:
4506 case AArch64::ST1i32_POST:
4507 case AArch64::ST1i64_POST:
4508 case AArch64::ST1i8_POST:
4509 case AArch64::ST2GPostIndex:
4510 case AArch64::ST2Twov16b_POST:
4511 case AArch64::ST2Twov2d_POST:
4512 case AArch64::ST2Twov2s_POST:
4513 case AArch64::ST2Twov4h_POST:
4514 case AArch64::ST2Twov4s_POST:
4515 case AArch64::ST2Twov8b_POST:
4516 case AArch64::ST2Twov8h_POST:
4517 case AArch64::ST2i16_POST:
4518 case AArch64::ST2i32_POST:
4519 case AArch64::ST2i64_POST:
4520 case AArch64::ST2i8_POST:
4521 case AArch64::ST3Threev16b_POST:
4522 case AArch64::ST3Threev2d_POST:
4523 case AArch64::ST3Threev2s_POST:
4524 case AArch64::ST3Threev4h_POST:
4525 case AArch64::ST3Threev4s_POST:
4526 case AArch64::ST3Threev8b_POST:
4527 case AArch64::ST3Threev8h_POST:
4528 case AArch64::ST3i16_POST:
4529 case AArch64::ST3i32_POST:
4530 case AArch64::ST3i64_POST:
4531 case AArch64::ST3i8_POST:
4532 case AArch64::ST4Fourv16b_POST:
4533 case AArch64::ST4Fourv2d_POST:
4534 case AArch64::ST4Fourv2s_POST:
4535 case AArch64::ST4Fourv4h_POST:
4536 case AArch64::ST4Fourv4s_POST:
4537 case AArch64::ST4Fourv8b_POST:
4538 case AArch64::ST4Fourv8h_POST:
4539 case AArch64::ST4i16_POST:
4540 case AArch64::ST4i32_POST:
4541 case AArch64::ST4i64_POST:
4542 case AArch64::ST4i8_POST:
4543 case AArch64::STGPostIndex:
4544 case AArch64::STGPpost:
4545 case AArch64::STPDpost:
4546 case AArch64::STPQpost:
4547 case AArch64::STPSpost:
4548 case AArch64::STPWpost:
4549 case AArch64::STPXpost:
4550 case AArch64::STRBBpost:
4551 case AArch64::STRBpost:
4552 case AArch64::STRDpost:
4553 case AArch64::STRHHpost:
4554 case AArch64::STRHpost:
4555 case AArch64::STRQpost:
4556 case AArch64::STRSpost:
4557 case AArch64::STRWpost:
4558 case AArch64::STRXpost:
4559 case AArch64::STZ2GPostIndex:
4560 case AArch64::STZGPostIndex:
4561 return true;
4562 }
4563}
4564
4565bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
4566 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
4567 bool &OffsetIsScalable, TypeSize &Width,
4568 const TargetRegisterInfo *TRI) const {
4569 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4570 // Handle only loads/stores with base register followed by immediate offset.
4571 if (LdSt.getNumExplicitOperands() == 3) {
4572 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
4573 if ((!LdSt.getOperand(i: 1).isReg() && !LdSt.getOperand(i: 1).isFI()) ||
4574 !LdSt.getOperand(i: 2).isImm())
4575 return false;
4576 } else if (LdSt.getNumExplicitOperands() == 4) {
4577 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
4578 if (!LdSt.getOperand(i: 1).isReg() ||
4579 (!LdSt.getOperand(i: 2).isReg() && !LdSt.getOperand(i: 2).isFI()) ||
4580 !LdSt.getOperand(i: 3).isImm())
4581 return false;
4582 } else
4583 return false;
4584
4585 // Get the scaling factor for the instruction and set the width for the
4586 // instruction.
4587 TypeSize Scale(0U, false);
4588 int64_t Dummy1, Dummy2;
4589
4590 // If this returns false, then it's an instruction we don't want to handle.
4591 if (!getMemOpInfo(Opcode: LdSt.getOpcode(), Scale, Width, MinOffset&: Dummy1, MaxOffset&: Dummy2))
4592 return false;
4593
4594 // Compute the offset. Offset is calculated as the immediate operand
4595 // multiplied by the scaling factor. Unscaled instructions have scaling factor
4596 // set to 1. Postindex are a special case which have an offset of 0.
4597 if (isPostIndexLdStOpcode(Opcode: LdSt.getOpcode())) {
4598 BaseOp = &LdSt.getOperand(i: 2);
4599 Offset = 0;
4600 } else if (LdSt.getNumExplicitOperands() == 3) {
4601 BaseOp = &LdSt.getOperand(i: 1);
4602 Offset = LdSt.getOperand(i: 2).getImm() * Scale.getKnownMinValue();
4603 } else {
4604 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
4605 BaseOp = &LdSt.getOperand(i: 2);
4606 Offset = LdSt.getOperand(i: 3).getImm() * Scale.getKnownMinValue();
4607 }
4608 OffsetIsScalable = Scale.isScalable();
4609
4610 return BaseOp->isReg() || BaseOp->isFI();
4611}
4612
4613MachineOperand &
4614AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
4615 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4616 MachineOperand &OfsOp = LdSt.getOperand(i: LdSt.getNumExplicitOperands() - 1);
4617 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
4618 return OfsOp;
4619}
4620
4621bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
4622 TypeSize &Width, int64_t &MinOffset,
4623 int64_t &MaxOffset) {
4624 switch (Opcode) {
4625 // Not a memory operation or something we want to handle.
4626 default:
4627 Scale = Width = TypeSize::getFixed(ExactSize: 0);
4628 MinOffset = MaxOffset = 0;
4629 return false;
4630 // LDR / STR
4631 case AArch64::LDRQui:
4632 case AArch64::STRQui:
4633 Scale = Width = TypeSize::getFixed(ExactSize: 16);
4634 MinOffset = 0;
4635 MaxOffset = 4095;
4636 break;
4637 case AArch64::LDRXui:
4638 case AArch64::LDRDui:
4639 case AArch64::STRXui:
4640 case AArch64::STRDui:
4641 case AArch64::PRFMui:
4642 Scale = Width = TypeSize::getFixed(ExactSize: 8);
4643 MinOffset = 0;
4644 MaxOffset = 4095;
4645 break;
4646 case AArch64::LDRWui:
4647 case AArch64::LDRSui:
4648 case AArch64::LDRSWui:
4649 case AArch64::STRWui:
4650 case AArch64::STRSui:
4651 Scale = Width = TypeSize::getFixed(ExactSize: 4);
4652 MinOffset = 0;
4653 MaxOffset = 4095;
4654 break;
4655 case AArch64::LDRHui:
4656 case AArch64::LDRHHui:
4657 case AArch64::LDRSHWui:
4658 case AArch64::LDRSHXui:
4659 case AArch64::STRHui:
4660 case AArch64::STRHHui:
4661 Scale = Width = TypeSize::getFixed(ExactSize: 2);
4662 MinOffset = 0;
4663 MaxOffset = 4095;
4664 break;
4665 case AArch64::LDRBui:
4666 case AArch64::LDRBBui:
4667 case AArch64::LDRSBWui:
4668 case AArch64::LDRSBXui:
4669 case AArch64::STRBui:
4670 case AArch64::STRBBui:
4671 Scale = Width = TypeSize::getFixed(ExactSize: 1);
4672 MinOffset = 0;
4673 MaxOffset = 4095;
4674 break;
4675 // post/pre inc
4676 case AArch64::STRQpre:
4677 case AArch64::LDRQpost:
4678 Scale = TypeSize::getFixed(ExactSize: 1);
4679 Width = TypeSize::getFixed(ExactSize: 16);
4680 MinOffset = -256;
4681 MaxOffset = 255;
4682 break;
4683 case AArch64::LDRDpost:
4684 case AArch64::LDRDpre:
4685 case AArch64::LDRXpost:
4686 case AArch64::LDRXpre:
4687 case AArch64::STRDpost:
4688 case AArch64::STRDpre:
4689 case AArch64::STRXpost:
4690 case AArch64::STRXpre:
4691 Scale = TypeSize::getFixed(ExactSize: 1);
4692 Width = TypeSize::getFixed(ExactSize: 8);
4693 MinOffset = -256;
4694 MaxOffset = 255;
4695 break;
4696 case AArch64::STRWpost:
4697 case AArch64::STRWpre:
4698 case AArch64::LDRWpost:
4699 case AArch64::LDRWpre:
4700 case AArch64::STRSpost:
4701 case AArch64::STRSpre:
4702 case AArch64::LDRSpost:
4703 case AArch64::LDRSpre:
4704 Scale = TypeSize::getFixed(ExactSize: 1);
4705 Width = TypeSize::getFixed(ExactSize: 4);
4706 MinOffset = -256;
4707 MaxOffset = 255;
4708 break;
4709 case AArch64::LDRHpost:
4710 case AArch64::LDRHpre:
4711 case AArch64::STRHpost:
4712 case AArch64::STRHpre:
4713 case AArch64::LDRHHpost:
4714 case AArch64::LDRHHpre:
4715 case AArch64::STRHHpost:
4716 case AArch64::STRHHpre:
4717 Scale = TypeSize::getFixed(ExactSize: 1);
4718 Width = TypeSize::getFixed(ExactSize: 2);
4719 MinOffset = -256;
4720 MaxOffset = 255;
4721 break;
4722 case AArch64::LDRBpost:
4723 case AArch64::LDRBpre:
4724 case AArch64::STRBpost:
4725 case AArch64::STRBpre:
4726 case AArch64::LDRBBpost:
4727 case AArch64::LDRBBpre:
4728 case AArch64::STRBBpost:
4729 case AArch64::STRBBpre:
4730 Scale = Width = TypeSize::getFixed(ExactSize: 1);
4731 MinOffset = -256;
4732 MaxOffset = 255;
4733 break;
4734 // Unscaled
4735 case AArch64::LDURQi:
4736 case AArch64::STURQi:
4737 Scale = TypeSize::getFixed(ExactSize: 1);
4738 Width = TypeSize::getFixed(ExactSize: 16);
4739 MinOffset = -256;
4740 MaxOffset = 255;
4741 break;
4742 case AArch64::LDURXi:
4743 case AArch64::LDURDi:
4744 case AArch64::LDAPURXi:
4745 case AArch64::STURXi:
4746 case AArch64::STURDi:
4747 case AArch64::STLURXi:
4748 case AArch64::PRFUMi:
4749 Scale = TypeSize::getFixed(ExactSize: 1);
4750 Width = TypeSize::getFixed(ExactSize: 8);
4751 MinOffset = -256;
4752 MaxOffset = 255;
4753 break;
4754 case AArch64::LDURWi:
4755 case AArch64::LDURSi:
4756 case AArch64::LDURSWi:
4757 case AArch64::LDAPURi:
4758 case AArch64::LDAPURSWi:
4759 case AArch64::STURWi:
4760 case AArch64::STURSi:
4761 case AArch64::STLURWi:
4762 Scale = TypeSize::getFixed(ExactSize: 1);
4763 Width = TypeSize::getFixed(ExactSize: 4);
4764 MinOffset = -256;
4765 MaxOffset = 255;
4766 break;
4767 case AArch64::LDURHi:
4768 case AArch64::LDURHHi:
4769 case AArch64::LDURSHXi:
4770 case AArch64::LDURSHWi:
4771 case AArch64::LDAPURHi:
4772 case AArch64::LDAPURSHWi:
4773 case AArch64::LDAPURSHXi:
4774 case AArch64::STURHi:
4775 case AArch64::STURHHi:
4776 case AArch64::STLURHi:
4777 Scale = TypeSize::getFixed(ExactSize: 1);
4778 Width = TypeSize::getFixed(ExactSize: 2);
4779 MinOffset = -256;
4780 MaxOffset = 255;
4781 break;
4782 case AArch64::LDURBi:
4783 case AArch64::LDURBBi:
4784 case AArch64::LDURSBXi:
4785 case AArch64::LDURSBWi:
4786 case AArch64::LDAPURBi:
4787 case AArch64::LDAPURSBWi:
4788 case AArch64::LDAPURSBXi:
4789 case AArch64::STURBi:
4790 case AArch64::STURBBi:
4791 case AArch64::STLURBi:
4792 Scale = Width = TypeSize::getFixed(ExactSize: 1);
4793 MinOffset = -256;
4794 MaxOffset = 255;
4795 break;
4796 // LDP / STP (including pre/post inc)
4797 case AArch64::LDPQi:
4798 case AArch64::LDNPQi:
4799 case AArch64::STPQi:
4800 case AArch64::STNPQi:
4801 case AArch64::LDPQpost:
4802 case AArch64::LDPQpre:
4803 case AArch64::STPQpost:
4804 case AArch64::STPQpre:
4805 Scale = TypeSize::getFixed(ExactSize: 16);
4806 Width = TypeSize::getFixed(ExactSize: 16 * 2);
4807 MinOffset = -64;
4808 MaxOffset = 63;
4809 break;
4810 case AArch64::LDPXi:
4811 case AArch64::LDPDi:
4812 case AArch64::LDNPXi:
4813 case AArch64::LDNPDi:
4814 case AArch64::STPXi:
4815 case AArch64::STPDi:
4816 case AArch64::STNPXi:
4817 case AArch64::STNPDi:
4818 case AArch64::LDPDpost:
4819 case AArch64::LDPDpre:
4820 case AArch64::LDPXpost:
4821 case AArch64::LDPXpre:
4822 case AArch64::STPDpost:
4823 case AArch64::STPDpre:
4824 case AArch64::STPXpost:
4825 case AArch64::STPXpre:
4826 Scale = TypeSize::getFixed(ExactSize: 8);
4827 Width = TypeSize::getFixed(ExactSize: 8 * 2);
4828 MinOffset = -64;
4829 MaxOffset = 63;
4830 break;
4831 case AArch64::LDPWi:
4832 case AArch64::LDPSi:
4833 case AArch64::LDNPWi:
4834 case AArch64::LDNPSi:
4835 case AArch64::STPWi:
4836 case AArch64::STPSi:
4837 case AArch64::STNPWi:
4838 case AArch64::STNPSi:
4839 case AArch64::LDPSpost:
4840 case AArch64::LDPSpre:
4841 case AArch64::LDPWpost:
4842 case AArch64::LDPWpre:
4843 case AArch64::STPSpost:
4844 case AArch64::STPSpre:
4845 case AArch64::STPWpost:
4846 case AArch64::STPWpre:
4847 Scale = TypeSize::getFixed(ExactSize: 4);
4848 Width = TypeSize::getFixed(ExactSize: 4 * 2);
4849 MinOffset = -64;
4850 MaxOffset = 63;
4851 break;
4852 case AArch64::StoreSwiftAsyncContext:
4853 // Store is an STRXui, but there might be an ADDXri in the expansion too.
4854 Scale = TypeSize::getFixed(ExactSize: 1);
4855 Width = TypeSize::getFixed(ExactSize: 8);
4856 MinOffset = 0;
4857 MaxOffset = 4095;
4858 break;
4859 case AArch64::ADDG:
4860 Scale = TypeSize::getFixed(ExactSize: 16);
4861 Width = TypeSize::getFixed(ExactSize: 0);
4862 MinOffset = 0;
4863 MaxOffset = 63;
4864 break;
4865 case AArch64::TAGPstack:
4866 Scale = TypeSize::getFixed(ExactSize: 16);
4867 Width = TypeSize::getFixed(ExactSize: 0);
4868 // TAGP with a negative offset turns into SUBP, which has a maximum offset
4869 // of 63 (not 64!).
4870 MinOffset = -63;
4871 MaxOffset = 63;
4872 break;
4873 case AArch64::LDG:
4874 case AArch64::STGi:
4875 case AArch64::STGPreIndex:
4876 case AArch64::STGPostIndex:
4877 case AArch64::STZGi:
4878 case AArch64::STZGPreIndex:
4879 case AArch64::STZGPostIndex:
4880 Scale = Width = TypeSize::getFixed(ExactSize: 16);
4881 MinOffset = -256;
4882 MaxOffset = 255;
4883 break;
4884 // SVE
4885 case AArch64::STR_ZZZZXI:
4886 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
4887 case AArch64::LDR_ZZZZXI:
4888 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
4889 Scale = TypeSize::getScalable(MinimumSize: 16);
4890 Width = TypeSize::getScalable(MinimumSize: 16 * 4);
4891 MinOffset = -256;
4892 MaxOffset = 252;
4893 break;
4894 case AArch64::STR_ZZZXI:
4895 case AArch64::LDR_ZZZXI:
4896 Scale = TypeSize::getScalable(MinimumSize: 16);
4897 Width = TypeSize::getScalable(MinimumSize: 16 * 3);
4898 MinOffset = -256;
4899 MaxOffset = 253;
4900 break;
4901 case AArch64::STR_ZZXI:
4902 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
4903 case AArch64::LDR_ZZXI:
4904 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
4905 Scale = TypeSize::getScalable(MinimumSize: 16);
4906 Width = TypeSize::getScalable(MinimumSize: 16 * 2);
4907 MinOffset = -256;
4908 MaxOffset = 254;
4909 break;
4910 case AArch64::LDR_PXI:
4911 case AArch64::STR_PXI:
4912 Scale = Width = TypeSize::getScalable(MinimumSize: 2);
4913 MinOffset = -256;
4914 MaxOffset = 255;
4915 break;
4916 case AArch64::LDR_PPXI:
4917 case AArch64::STR_PPXI:
4918 Scale = TypeSize::getScalable(MinimumSize: 2);
4919 Width = TypeSize::getScalable(MinimumSize: 2 * 2);
4920 MinOffset = -256;
4921 MaxOffset = 254;
4922 break;
4923 case AArch64::LDR_ZXI:
4924 case AArch64::STR_ZXI:
4925 Scale = Width = TypeSize::getScalable(MinimumSize: 16);
4926 MinOffset = -256;
4927 MaxOffset = 255;
4928 break;
4929 case AArch64::LD1B_IMM:
4930 case AArch64::LD1H_IMM:
4931 case AArch64::LD1W_IMM:
4932 case AArch64::LD1D_IMM:
4933 case AArch64::LDNT1B_ZRI:
4934 case AArch64::LDNT1H_ZRI:
4935 case AArch64::LDNT1W_ZRI:
4936 case AArch64::LDNT1D_ZRI:
4937 case AArch64::ST1B_IMM:
4938 case AArch64::ST1H_IMM:
4939 case AArch64::ST1W_IMM:
4940 case AArch64::ST1D_IMM:
4941 case AArch64::STNT1B_ZRI:
4942 case AArch64::STNT1H_ZRI:
4943 case AArch64::STNT1W_ZRI:
4944 case AArch64::STNT1D_ZRI:
4945 case AArch64::LDNF1B_IMM:
4946 case AArch64::LDNF1H_IMM:
4947 case AArch64::LDNF1W_IMM:
4948 case AArch64::LDNF1D_IMM:
4949 // A full vectors worth of data
4950 // Width = mbytes * elements
4951 Scale = Width = TypeSize::getScalable(MinimumSize: 16);
4952 MinOffset = -8;
4953 MaxOffset = 7;
4954 break;
4955 case AArch64::LD2B_IMM:
4956 case AArch64::LD2H_IMM:
4957 case AArch64::LD2W_IMM:
4958 case AArch64::LD2D_IMM:
4959 case AArch64::ST2B_IMM:
4960 case AArch64::ST2H_IMM:
4961 case AArch64::ST2W_IMM:
4962 case AArch64::ST2D_IMM:
4963 case AArch64::LD1B_2Z_IMM:
4964 case AArch64::LD1B_2Z_STRIDED_IMM:
4965 case AArch64::LD1H_2Z_IMM:
4966 case AArch64::LD1H_2Z_STRIDED_IMM:
4967 case AArch64::LD1W_2Z_IMM:
4968 case AArch64::LD1W_2Z_STRIDED_IMM:
4969 case AArch64::LD1D_2Z_IMM:
4970 case AArch64::LD1D_2Z_STRIDED_IMM:
4971 case AArch64::LD1B_2Z_IMM_PSEUDO:
4972 case AArch64::LD1H_2Z_IMM_PSEUDO:
4973 case AArch64::LD1W_2Z_IMM_PSEUDO:
4974 case AArch64::LD1D_2Z_IMM_PSEUDO:
4975 case AArch64::ST1B_2Z_IMM:
4976 case AArch64::ST1B_2Z_STRIDED_IMM:
4977 case AArch64::ST1H_2Z_IMM:
4978 case AArch64::ST1H_2Z_STRIDED_IMM:
4979 case AArch64::ST1W_2Z_IMM:
4980 case AArch64::ST1W_2Z_STRIDED_IMM:
4981 case AArch64::ST1D_2Z_IMM:
4982 case AArch64::ST1D_2Z_STRIDED_IMM:
4983 case AArch64::LDNT1B_2Z_IMM_PSEUDO:
4984 case AArch64::LDNT1B_2Z_IMM:
4985 case AArch64::LDNT1B_2Z_STRIDED_IMM:
4986 case AArch64::LDNT1H_2Z_IMM_PSEUDO:
4987 case AArch64::LDNT1H_2Z_IMM:
4988 case AArch64::LDNT1H_2Z_STRIDED_IMM:
4989 case AArch64::LDNT1W_2Z_IMM_PSEUDO:
4990 case AArch64::LDNT1W_2Z_IMM:
4991 case AArch64::LDNT1W_2Z_STRIDED_IMM:
4992 case AArch64::LDNT1D_2Z_IMM_PSEUDO:
4993 case AArch64::LDNT1D_2Z_IMM:
4994 case AArch64::LDNT1D_2Z_STRIDED_IMM:
4995 case AArch64::STNT1B_2Z_IMM:
4996 case AArch64::STNT1B_2Z_STRIDED_IMM:
4997 case AArch64::STNT1H_2Z_IMM:
4998 case AArch64::STNT1H_2Z_STRIDED_IMM:
4999 case AArch64::STNT1W_2Z_IMM:
5000 case AArch64::STNT1W_2Z_STRIDED_IMM:
5001 case AArch64::STNT1D_2Z_IMM:
5002 case AArch64::STNT1D_2Z_STRIDED_IMM:
5003 Scale = Width = TypeSize::getScalable(MinimumSize: 16 * 2);
5004 MinOffset = -8;
5005 MaxOffset = 7;
5006 break;
5007 case AArch64::LD3B_IMM:
5008 case AArch64::LD3H_IMM:
5009 case AArch64::LD3W_IMM:
5010 case AArch64::LD3D_IMM:
5011 case AArch64::ST3B_IMM:
5012 case AArch64::ST3H_IMM:
5013 case AArch64::ST3W_IMM:
5014 case AArch64::ST3D_IMM:
5015 Scale = Width = TypeSize::getScalable(MinimumSize: 16 * 3);
5016 MinOffset = -8;
5017 MaxOffset = 7;
5018 break;
5019 case AArch64::LD4B_IMM:
5020 case AArch64::LD4H_IMM:
5021 case AArch64::LD4W_IMM:
5022 case AArch64::LD4D_IMM:
5023 case AArch64::ST4B_IMM:
5024 case AArch64::ST4H_IMM:
5025 case AArch64::ST4W_IMM:
5026 case AArch64::ST4D_IMM:
5027 case AArch64::LD1B_4Z_IMM:
5028 case AArch64::LD1B_4Z_STRIDED_IMM:
5029 case AArch64::LD1H_4Z_IMM:
5030 case AArch64::LD1H_4Z_STRIDED_IMM:
5031 case AArch64::LD1W_4Z_IMM:
5032 case AArch64::LD1W_4Z_STRIDED_IMM:
5033 case AArch64::LD1D_4Z_IMM:
5034 case AArch64::LD1D_4Z_STRIDED_IMM:
5035 case AArch64::LD1B_4Z_IMM_PSEUDO:
5036 case AArch64::LD1H_4Z_IMM_PSEUDO:
5037 case AArch64::LD1W_4Z_IMM_PSEUDO:
5038 case AArch64::LD1D_4Z_IMM_PSEUDO:
5039 case AArch64::ST1B_4Z_IMM:
5040 case AArch64::ST1B_4Z_STRIDED_IMM:
5041 case AArch64::ST1H_4Z_IMM:
5042 case AArch64::ST1H_4Z_STRIDED_IMM:
5043 case AArch64::ST1W_4Z_IMM:
5044 case AArch64::ST1W_4Z_STRIDED_IMM:
5045 case AArch64::ST1D_4Z_IMM:
5046 case AArch64::ST1D_4Z_STRIDED_IMM:
5047 case AArch64::LDNT1B_4Z_IMM_PSEUDO:
5048 case AArch64::LDNT1B_4Z_IMM:
5049 case AArch64::LDNT1B_4Z_STRIDED_IMM:
5050 case AArch64::LDNT1H_4Z_IMM_PSEUDO:
5051 case AArch64::LDNT1H_4Z_IMM:
5052 case AArch64::LDNT1H_4Z_STRIDED_IMM:
5053 case AArch64::LDNT1W_4Z_IMM_PSEUDO:
5054 case AArch64::LDNT1W_4Z_IMM:
5055 case AArch64::LDNT1W_4Z_STRIDED_IMM:
5056 case AArch64::LDNT1D_4Z_IMM_PSEUDO:
5057 case AArch64::LDNT1D_4Z_IMM:
5058 case AArch64::LDNT1D_4Z_STRIDED_IMM:
5059 case AArch64::STNT1B_4Z_IMM:
5060 case AArch64::STNT1B_4Z_STRIDED_IMM:
5061 case AArch64::STNT1H_4Z_IMM:
5062 case AArch64::STNT1H_4Z_STRIDED_IMM:
5063 case AArch64::STNT1W_4Z_IMM:
5064 case AArch64::STNT1W_4Z_STRIDED_IMM:
5065 case AArch64::STNT1D_4Z_IMM:
5066 case AArch64::STNT1D_4Z_STRIDED_IMM:
5067 Scale = Width = TypeSize::getScalable(MinimumSize: 16 * 4);
5068 MinOffset = -8;
5069 MaxOffset = 7;
5070 break;
5071 case AArch64::LD1B_H_IMM:
5072 case AArch64::LD1SB_H_IMM:
5073 case AArch64::LD1H_S_IMM:
5074 case AArch64::LD1SH_S_IMM:
5075 case AArch64::LD1W_D_IMM:
5076 case AArch64::LD1SW_D_IMM:
5077 case AArch64::ST1B_H_IMM:
5078 case AArch64::ST1H_S_IMM:
5079 case AArch64::ST1W_D_IMM:
5080 case AArch64::LDNF1B_H_IMM:
5081 case AArch64::LDNF1SB_H_IMM:
5082 case AArch64::LDNF1H_S_IMM:
5083 case AArch64::LDNF1SH_S_IMM:
5084 case AArch64::LDNF1W_D_IMM:
5085 case AArch64::LDNF1SW_D_IMM:
5086 // A half vector worth of data
5087 // Width = mbytes * elements
5088 Scale = Width = TypeSize::getScalable(MinimumSize: 8);
5089 MinOffset = -8;
5090 MaxOffset = 7;
5091 break;
5092 case AArch64::LD1B_S_IMM:
5093 case AArch64::LD1SB_S_IMM:
5094 case AArch64::LD1H_D_IMM:
5095 case AArch64::LD1SH_D_IMM:
5096 case AArch64::ST1B_S_IMM:
5097 case AArch64::ST1H_D_IMM:
5098 case AArch64::LDNF1B_S_IMM:
5099 case AArch64::LDNF1SB_S_IMM:
5100 case AArch64::LDNF1H_D_IMM:
5101 case AArch64::LDNF1SH_D_IMM:
5102 // A quarter vector worth of data
5103 // Width = mbytes * elements
5104 Scale = Width = TypeSize::getScalable(MinimumSize: 4);
5105 MinOffset = -8;
5106 MaxOffset = 7;
5107 break;
5108 case AArch64::LD1B_D_IMM:
5109 case AArch64::LD1SB_D_IMM:
5110 case AArch64::ST1B_D_IMM:
5111 case AArch64::LDNF1B_D_IMM:
5112 case AArch64::LDNF1SB_D_IMM:
5113 // A eighth vector worth of data
5114 // Width = mbytes * elements
5115 Scale = Width = TypeSize::getScalable(MinimumSize: 2);
5116 MinOffset = -8;
5117 MaxOffset = 7;
5118 break;
5119 case AArch64::ST2Gi:
5120 case AArch64::ST2GPreIndex:
5121 case AArch64::ST2GPostIndex:
5122 case AArch64::STZ2Gi:
5123 case AArch64::STZ2GPreIndex:
5124 case AArch64::STZ2GPostIndex:
5125 Scale = TypeSize::getFixed(ExactSize: 16);
5126 Width = TypeSize::getFixed(ExactSize: 32);
5127 MinOffset = -256;
5128 MaxOffset = 255;
5129 break;
5130 case AArch64::STGPi:
5131 case AArch64::STGPpost:
5132 case AArch64::STGPpre:
5133 Scale = Width = TypeSize::getFixed(ExactSize: 16);
5134 MinOffset = -64;
5135 MaxOffset = 63;
5136 break;
5137 case AArch64::LD1RB_IMM:
5138 case AArch64::LD1RB_H_IMM:
5139 case AArch64::LD1RB_S_IMM:
5140 case AArch64::LD1RB_D_IMM:
5141 case AArch64::LD1RSB_H_IMM:
5142 case AArch64::LD1RSB_S_IMM:
5143 case AArch64::LD1RSB_D_IMM:
5144 Scale = Width = TypeSize::getFixed(ExactSize: 1);
5145 MinOffset = 0;
5146 MaxOffset = 63;
5147 break;
5148 case AArch64::LD1RH_IMM:
5149 case AArch64::LD1RH_S_IMM:
5150 case AArch64::LD1RH_D_IMM:
5151 case AArch64::LD1RSH_S_IMM:
5152 case AArch64::LD1RSH_D_IMM:
5153 Scale = Width = TypeSize::getFixed(ExactSize: 2);
5154 MinOffset = 0;
5155 MaxOffset = 63;
5156 break;
5157 case AArch64::LD1RW_IMM:
5158 case AArch64::LD1RW_D_IMM:
5159 case AArch64::LD1RSW_IMM:
5160 Scale = Width = TypeSize::getFixed(ExactSize: 4);
5161 MinOffset = 0;
5162 MaxOffset = 63;
5163 break;
5164 case AArch64::LD1RD_IMM:
5165 Scale = Width = TypeSize::getFixed(ExactSize: 8);
5166 MinOffset = 0;
5167 MaxOffset = 63;
5168 break;
5169 }
5170
5171 return true;
5172}
5173
5174// Scaling factor for unscaled load or store.
5175int AArch64InstrInfo::getMemScale(unsigned Opc) {
5176 switch (Opc) {
5177 default:
5178 llvm_unreachable("Opcode has unknown scale!");
5179 case AArch64::LDRBui:
5180 case AArch64::LDRBBui:
5181 case AArch64::LDURBBi:
5182 case AArch64::LDRSBWui:
5183 case AArch64::LDURSBWi:
5184 case AArch64::STRBui:
5185 case AArch64::STRBBui:
5186 case AArch64::STURBBi:
5187 return 1;
5188 case AArch64::LDRHui:
5189 case AArch64::LDRHHui:
5190 case AArch64::LDURHHi:
5191 case AArch64::LDRSHWui:
5192 case AArch64::LDURSHWi:
5193 case AArch64::STRHui:
5194 case AArch64::STRHHui:
5195 case AArch64::STURHHi:
5196 return 2;
5197 case AArch64::LDRSui:
5198 case AArch64::LDURSi:
5199 case AArch64::LDRSpre:
5200 case AArch64::LDRSWui:
5201 case AArch64::LDURSWi:
5202 case AArch64::LDRSWpre:
5203 case AArch64::LDRWpre:
5204 case AArch64::LDRWui:
5205 case AArch64::LDURWi:
5206 case AArch64::STRSui:
5207 case AArch64::STURSi:
5208 case AArch64::STRSpre:
5209 case AArch64::STRWui:
5210 case AArch64::STURWi:
5211 case AArch64::STRWpre:
5212 case AArch64::LDPSi:
5213 case AArch64::LDPSWi:
5214 case AArch64::LDPWi:
5215 case AArch64::STPSi:
5216 case AArch64::STPWi:
5217 return 4;
5218 case AArch64::LDRDui:
5219 case AArch64::LDURDi:
5220 case AArch64::LDRDpre:
5221 case AArch64::LDRXui:
5222 case AArch64::LDURXi:
5223 case AArch64::LDRXpre:
5224 case AArch64::STRDui:
5225 case AArch64::STURDi:
5226 case AArch64::STRDpre:
5227 case AArch64::STRXui:
5228 case AArch64::STURXi:
5229 case AArch64::STRXpre:
5230 case AArch64::LDPDi:
5231 case AArch64::LDPXi:
5232 case AArch64::STPDi:
5233 case AArch64::STPXi:
5234 return 8;
5235 case AArch64::LDRQui:
5236 case AArch64::LDURQi:
5237 case AArch64::STRQui:
5238 case AArch64::STURQi:
5239 case AArch64::STRQpre:
5240 case AArch64::LDPQi:
5241 case AArch64::LDRQpre:
5242 case AArch64::STPQi:
5243 case AArch64::STGi:
5244 case AArch64::STZGi:
5245 case AArch64::ST2Gi:
5246 case AArch64::STZ2Gi:
5247 case AArch64::STGPi:
5248 return 16;
5249 }
5250}
5251
5252bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
5253 switch (MI.getOpcode()) {
5254 default:
5255 return false;
5256 case AArch64::LDRWpre:
5257 case AArch64::LDRXpre:
5258 case AArch64::LDRSWpre:
5259 case AArch64::LDRSpre:
5260 case AArch64::LDRDpre:
5261 case AArch64::LDRQpre:
5262 return true;
5263 }
5264}
5265
5266bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
5267 switch (MI.getOpcode()) {
5268 default:
5269 return false;
5270 case AArch64::STRWpre:
5271 case AArch64::STRXpre:
5272 case AArch64::STRSpre:
5273 case AArch64::STRDpre:
5274 case AArch64::STRQpre:
5275 return true;
5276 }
5277}
5278
5279bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
5280 return isPreLd(MI) || isPreSt(MI);
5281}
5282
5283bool AArch64InstrInfo::isZExtLoad(const MachineInstr &MI) {
5284 switch (MI.getOpcode()) {
5285 default:
5286 return false;
5287 case AArch64::LDURBBi:
5288 case AArch64::LDURHHi:
5289 case AArch64::LDURWi:
5290 case AArch64::LDRBBui:
5291 case AArch64::LDRHHui:
5292 case AArch64::LDRWui:
5293 case AArch64::LDRBBroX:
5294 case AArch64::LDRHHroX:
5295 case AArch64::LDRWroX:
5296 case AArch64::LDRBBroW:
5297 case AArch64::LDRHHroW:
5298 case AArch64::LDRWroW:
5299 return true;
5300 }
5301}
5302
5303bool AArch64InstrInfo::isSExtLoad(const MachineInstr &MI) {
5304 switch (MI.getOpcode()) {
5305 default:
5306 return false;
5307 case AArch64::LDURSBWi:
5308 case AArch64::LDURSHWi:
5309 case AArch64::LDURSBXi:
5310 case AArch64::LDURSHXi:
5311 case AArch64::LDURSWi:
5312 case AArch64::LDRSBWui:
5313 case AArch64::LDRSHWui:
5314 case AArch64::LDRSBXui:
5315 case AArch64::LDRSHXui:
5316 case AArch64::LDRSWui:
5317 case AArch64::LDRSBWroX:
5318 case AArch64::LDRSHWroX:
5319 case AArch64::LDRSBXroX:
5320 case AArch64::LDRSHXroX:
5321 case AArch64::LDRSWroX:
5322 case AArch64::LDRSBWroW:
5323 case AArch64::LDRSHWroW:
5324 case AArch64::LDRSBXroW:
5325 case AArch64::LDRSHXroW:
5326 case AArch64::LDRSWroW:
5327 return true;
5328 }
5329}
5330
5331bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
5332 switch (MI.getOpcode()) {
5333 default:
5334 return false;
5335 case AArch64::LDPSi:
5336 case AArch64::LDPSWi:
5337 case AArch64::LDPDi:
5338 case AArch64::LDPQi:
5339 case AArch64::LDPWi:
5340 case AArch64::LDPXi:
5341 case AArch64::STPSi:
5342 case AArch64::STPDi:
5343 case AArch64::STPQi:
5344 case AArch64::STPWi:
5345 case AArch64::STPXi:
5346 case AArch64::STGPi:
5347 return true;
5348 }
5349}
5350
5351const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
5352 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5353 unsigned Idx =
5354 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
5355 : 1;
5356 return MI.getOperand(i: Idx);
5357}
5358
5359const MachineOperand &
5360AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
5361 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5362 unsigned Idx =
5363 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
5364 : 2;
5365 return MI.getOperand(i: Idx);
5366}
5367
5368const MachineOperand &
5369AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
5370 switch (MI.getOpcode()) {
5371 default:
5372 llvm_unreachable("Unexpected opcode");
5373 case AArch64::LDRBroX:
5374 case AArch64::LDRBBroX:
5375 case AArch64::LDRSBXroX:
5376 case AArch64::LDRSBWroX:
5377 case AArch64::LDRHroX:
5378 case AArch64::LDRHHroX:
5379 case AArch64::LDRSHXroX:
5380 case AArch64::LDRSHWroX:
5381 case AArch64::LDRWroX:
5382 case AArch64::LDRSroX:
5383 case AArch64::LDRSWroX:
5384 case AArch64::LDRDroX:
5385 case AArch64::LDRXroX:
5386 case AArch64::LDRQroX:
5387 return MI.getOperand(i: 4);
5388 }
5389}
5390
5391static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
5392 Register Reg) {
5393 if (MI.getParent() == nullptr)
5394 return nullptr;
5395 const MachineFunction *MF = MI.getParent()->getParent();
5396 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
5397}
5398
5399bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
5400 auto IsHFPR = [&](const MachineOperand &Op) {
5401 if (!Op.isReg())
5402 return false;
5403 auto Reg = Op.getReg();
5404 if (Reg.isPhysical())
5405 return AArch64::FPR16RegClass.contains(Reg);
5406 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5407 return TRC == &AArch64::FPR16RegClass ||
5408 TRC == &AArch64::FPR16_loRegClass;
5409 };
5410 return llvm::any_of(Range: MI.operands(), P: IsHFPR);
5411}
5412
5413bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
5414 auto IsQFPR = [&](const MachineOperand &Op) {
5415 if (!Op.isReg())
5416 return false;
5417 auto Reg = Op.getReg();
5418 if (Reg.isPhysical())
5419 return AArch64::FPR128RegClass.contains(Reg);
5420 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5421 return TRC == &AArch64::FPR128RegClass ||
5422 TRC == &AArch64::FPR128_loRegClass;
5423 };
5424 return llvm::any_of(Range: MI.operands(), P: IsQFPR);
5425}
5426
5427bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) {
5428 switch (MI.getOpcode()) {
5429 case AArch64::BRK:
5430 case AArch64::HLT:
5431 case AArch64::PACIASP:
5432 case AArch64::PACIBSP:
5433 // Implicit BTI behavior.
5434 return true;
5435 case AArch64::PAUTH_PROLOGUE:
5436 // PAUTH_PROLOGUE expands to PACI(A|B)SP.
5437 return true;
5438 case AArch64::HINT: {
5439 unsigned Imm = MI.getOperand(i: 0).getImm();
5440 // Explicit BTI instruction.
5441 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
5442 return true;
5443 // PACI(A|B)SP instructions.
5444 if (Imm == 25 || Imm == 27)
5445 return true;
5446 return false;
5447 }
5448 default:
5449 return false;
5450 }
5451}
5452
5453bool AArch64InstrInfo::isFpOrNEON(Register Reg) {
5454 if (Reg == 0)
5455 return false;
5456 assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
5457 return AArch64::FPR128RegClass.contains(Reg) ||
5458 AArch64::FPR64RegClass.contains(Reg) ||
5459 AArch64::FPR32RegClass.contains(Reg) ||
5460 AArch64::FPR16RegClass.contains(Reg) ||
5461 AArch64::FPR8RegClass.contains(Reg);
5462}
5463
5464bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
5465 auto IsFPR = [&](const MachineOperand &Op) {
5466 if (!Op.isReg())
5467 return false;
5468 auto Reg = Op.getReg();
5469 if (Reg.isPhysical())
5470 return isFpOrNEON(Reg);
5471
5472 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5473 return TRC == &AArch64::FPR128RegClass ||
5474 TRC == &AArch64::FPR128_loRegClass ||
5475 TRC == &AArch64::FPR64RegClass ||
5476 TRC == &AArch64::FPR64_loRegClass ||
5477 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
5478 TRC == &AArch64::FPR8RegClass;
5479 };
5480 return llvm::any_of(Range: MI.operands(), P: IsFPR);
5481}
5482
5483// Scale the unscaled offsets. Returns false if the unscaled offset can't be
5484// scaled.
5485static bool scaleOffset(unsigned Opc, int64_t &Offset) {
5486 int Scale = AArch64InstrInfo::getMemScale(Opc);
5487
5488 // If the byte-offset isn't a multiple of the stride, we can't scale this
5489 // offset.
5490 if (Offset % Scale != 0)
5491 return false;
5492
5493 // Convert the byte-offset used by unscaled into an "element" offset used
5494 // by the scaled pair load/store instructions.
5495 Offset /= Scale;
5496 return true;
5497}
5498
5499static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
5500 if (FirstOpc == SecondOpc)
5501 return true;
5502 // We can also pair sign-ext and zero-ext instructions.
5503 switch (FirstOpc) {
5504 default:
5505 return false;
5506 case AArch64::STRSui:
5507 case AArch64::STURSi:
5508 return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
5509 case AArch64::STRDui:
5510 case AArch64::STURDi:
5511 return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
5512 case AArch64::STRQui:
5513 case AArch64::STURQi:
5514 return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
5515 case AArch64::STRWui:
5516 case AArch64::STURWi:
5517 return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
5518 case AArch64::STRXui:
5519 case AArch64::STURXi:
5520 return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
5521 case AArch64::LDRSui:
5522 case AArch64::LDURSi:
5523 return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
5524 case AArch64::LDRDui:
5525 case AArch64::LDURDi:
5526 return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
5527 case AArch64::LDRQui:
5528 case AArch64::LDURQi:
5529 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
5530 case AArch64::LDRWui:
5531 case AArch64::LDURWi:
5532 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
5533 case AArch64::LDRSWui:
5534 case AArch64::LDURSWi:
5535 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
5536 case AArch64::LDRXui:
5537 case AArch64::LDURXi:
5538 return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
5539 }
5540 // These instructions can't be paired based on their opcodes.
5541 return false;
5542}
5543
5544static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
5545 int64_t Offset1, unsigned Opcode1, int FI2,
5546 int64_t Offset2, unsigned Opcode2) {
5547 // Accesses through fixed stack object frame indices may access a different
5548 // fixed stack slot. Check that the object offsets + offsets match.
5549 if (MFI.isFixedObjectIndex(ObjectIdx: FI1) && MFI.isFixedObjectIndex(ObjectIdx: FI2)) {
5550 int64_t ObjectOffset1 = MFI.getObjectOffset(ObjectIdx: FI1);
5551 int64_t ObjectOffset2 = MFI.getObjectOffset(ObjectIdx: FI2);
5552 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
5553 // Convert to scaled object offsets.
5554 int Scale1 = AArch64InstrInfo::getMemScale(Opc: Opcode1);
5555 if (ObjectOffset1 % Scale1 != 0)
5556 return false;
5557 ObjectOffset1 /= Scale1;
5558 int Scale2 = AArch64InstrInfo::getMemScale(Opc: Opcode2);
5559 if (ObjectOffset2 % Scale2 != 0)
5560 return false;
5561 ObjectOffset2 /= Scale2;
5562 ObjectOffset1 += Offset1;
5563 ObjectOffset2 += Offset2;
5564 return ObjectOffset1 + 1 == ObjectOffset2;
5565 }
5566
5567 return FI1 == FI2;
5568}
5569
5570/// Detect opportunities for ldp/stp formation.
5571///
5572/// Only called for LdSt for which getMemOperandWithOffset returns true.
5573bool AArch64InstrInfo::shouldClusterMemOps(
5574 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
5575 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
5576 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
5577 unsigned NumBytes) const {
5578 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
5579 const MachineOperand &BaseOp1 = *BaseOps1.front();
5580 const MachineOperand &BaseOp2 = *BaseOps2.front();
5581 const MachineInstr &FirstLdSt = *BaseOp1.getParent();
5582 const MachineInstr &SecondLdSt = *BaseOp2.getParent();
5583 if (BaseOp1.getType() != BaseOp2.getType())
5584 return false;
5585
5586 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
5587 "Only base registers and frame indices are supported.");
5588
5589 // Check for both base regs and base FI.
5590 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
5591 return false;
5592
5593 // Only cluster up to a single pair.
5594 if (ClusterSize > 2)
5595 return false;
5596
5597 if (!isPairableLdStInst(MI: FirstLdSt) || !isPairableLdStInst(MI: SecondLdSt))
5598 return false;
5599
5600 // Can we pair these instructions based on their opcodes?
5601 unsigned FirstOpc = FirstLdSt.getOpcode();
5602 unsigned SecondOpc = SecondLdSt.getOpcode();
5603 if (!canPairLdStOpc(FirstOpc, SecondOpc))
5604 return false;
5605
5606 // Can't merge volatiles or load/stores that have a hint to avoid pair
5607 // formation, for example.
5608 if (!isCandidateToMergeOrPair(MI: FirstLdSt) ||
5609 !isCandidateToMergeOrPair(MI: SecondLdSt))
5610 return false;
5611
5612 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
5613 int64_t Offset1 = FirstLdSt.getOperand(i: 2).getImm();
5614 if (hasUnscaledLdStOffset(Opc: FirstOpc) && !scaleOffset(Opc: FirstOpc, Offset&: Offset1))
5615 return false;
5616
5617 int64_t Offset2 = SecondLdSt.getOperand(i: 2).getImm();
5618 if (hasUnscaledLdStOffset(Opc: SecondOpc) && !scaleOffset(Opc: SecondOpc, Offset&: Offset2))
5619 return false;
5620
5621 // Pairwise instructions have a 7-bit signed offset field.
5622 if (Offset1 > 63 || Offset1 < -64)
5623 return false;
5624
5625 // The caller should already have ordered First/SecondLdSt by offset.
5626 // Note: except for non-equal frame index bases
5627 if (BaseOp1.isFI()) {
5628 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
5629 "Caller should have ordered offsets.");
5630
5631 const MachineFrameInfo &MFI =
5632 FirstLdSt.getParent()->getParent()->getFrameInfo();
5633 return shouldClusterFI(MFI, FI1: BaseOp1.getIndex(), Offset1, Opcode1: FirstOpc,
5634 FI2: BaseOp2.getIndex(), Offset2, Opcode2: SecondOpc);
5635 }
5636
5637 assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
5638
5639 return Offset1 + 1 == Offset2;
5640}
5641
5642static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
5643 MCRegister Reg, unsigned SubIdx,
5644 RegState State,
5645 const TargetRegisterInfo *TRI) {
5646 if (!SubIdx)
5647 return MIB.addReg(RegNo: Reg, Flags: State);
5648
5649 if (Reg.isPhysical())
5650 return MIB.addReg(RegNo: TRI->getSubReg(Reg, Idx: SubIdx), Flags: State);
5651 return MIB.addReg(RegNo: Reg, Flags: State, SubReg: SubIdx);
5652}
5653
/// Returns true when (DestReg - SrcReg) modulo 32 is smaller than \p NumRegs.
/// copyPhysRegTuple uses this to decide whether to copy a tuple's elements in
/// reverse order.
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
                                        unsigned NumRegs) {
  // The subtraction is unsigned and therefore wraps; keeping the low five bits
  // gives the non-negative remainder modulo 32.
  const unsigned DistanceMod32 = (DestReg - SrcReg) & 0x1f;
  return DistanceMod32 < NumRegs;
}
5660
5661void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
5662 MachineBasicBlock::iterator I,
5663 const DebugLoc &DL, MCRegister DestReg,
5664 MCRegister SrcReg, bool KillSrc,
5665 unsigned Opcode,
5666 ArrayRef<unsigned> Indices) const {
5667 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
5668 const TargetRegisterInfo *TRI = &getRegisterInfo();
5669 uint16_t DestEncoding = TRI->getEncodingValue(Reg: DestReg);
5670 uint16_t SrcEncoding = TRI->getEncodingValue(Reg: SrcReg);
5671 unsigned NumRegs = Indices.size();
5672
5673 int SubReg = 0, End = NumRegs, Incr = 1;
5674 if (forwardCopyWillClobberTuple(DestReg: DestEncoding, SrcReg: SrcEncoding, NumRegs)) {
5675 SubReg = NumRegs - 1;
5676 End = -1;
5677 Incr = -1;
5678 }
5679
5680 for (; SubReg != End; SubReg += Incr) {
5681 const MachineInstrBuilder MIB = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode));
5682 AddSubReg(MIB, Reg: DestReg, SubIdx: Indices[SubReg], State: RegState::Define, TRI);
5683 AddSubReg(MIB, Reg: SrcReg, SubIdx: Indices[SubReg], State: {}, TRI);
5684 AddSubReg(MIB, Reg: SrcReg, SubIdx: Indices[SubReg], State: getKillRegState(B: KillSrc), TRI);
5685 }
5686}
5687
5688void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
5689 MachineBasicBlock::iterator I,
5690 const DebugLoc &DL, MCRegister DestReg,
5691 MCRegister SrcReg, bool KillSrc,
5692 unsigned Opcode, unsigned ZeroReg,
5693 llvm::ArrayRef<unsigned> Indices) const {
5694 const TargetRegisterInfo *TRI = &getRegisterInfo();
5695 unsigned NumRegs = Indices.size();
5696
5697#ifndef NDEBUG
5698 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
5699 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
5700 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
5701 "GPR reg sequences should not be able to overlap");
5702#endif
5703
5704 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
5705 const MachineInstrBuilder MIB = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode));
5706 AddSubReg(MIB, Reg: DestReg, SubIdx: Indices[SubReg], State: RegState::Define, TRI);
5707 MIB.addReg(RegNo: ZeroReg);
5708 AddSubReg(MIB, Reg: SrcReg, SubIdx: Indices[SubReg], State: getKillRegState(B: KillSrc), TRI);
5709 MIB.addImm(Val: 0);
5710 }
5711}
5712
5713/// Returns true if the instruction at I is in a streaming call site region,
5714/// within a single basic block.
5715/// A "call site streaming region" starts after smstart and ends at smstop
5716/// around a call to a streaming function. This walks backward from I.
5717static bool isInStreamingCallSiteRegion(MachineBasicBlock &MBB,
5718 MachineBasicBlock::iterator I) {
5719 MachineFunction &MF = *MBB.getParent();
5720 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
5721 if (!AFI->hasStreamingModeChanges())
5722 return false;
5723 // Walk backwards to find smstart/smstop
5724 for (MachineInstr &MI : reverse(C: make_range(x: MBB.begin(), y: I))) {
5725 unsigned Opc = MI.getOpcode();
5726 if (Opc == AArch64::MSRpstatesvcrImm1 || Opc == AArch64::MSRpstatePseudo) {
5727 // Check if this is SM change (not ZA)
5728 int64_t PState = MI.getOperand(i: 0).getImm();
5729 if (PState == AArch64SVCR::SVCRSM || PState == AArch64SVCR::SVCRSMZA) {
5730 // Operand 1 is 1 for start, 0 for stop
5731 return MI.getOperand(i: 1).getImm() == 1;
5732 }
5733 }
5734 }
5735 return false;
5736}
5737
5738/// Returns true if in a streaming call site region without SME-FA64.
5739static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget,
5740 MachineBasicBlock &MBB,
5741 MachineBasicBlock::iterator I) {
5742 return !Subtarget.hasSMEFA64() && isInStreamingCallSiteRegion(MBB, I);
5743}
5744
/// Emit, before \p I in \p MBB, the instruction(s) that copy the physical
/// register \p SrcReg into \p DestReg.
///
/// The expansion is chosen by testing which register classes contain the two
/// registers, in the order written below; the first matching case emits its
/// instruction(s) and returns. In the cases that read the source register,
/// \p KillSrc is converted with getKillRegState and attached to a source
/// operand (the SVE-based FPR128 copy is the exception: it adds no kill flag).
/// \p RenamableDest and \p RenamableSrc are not read by this body. If no case
/// matches, the registers are printed in debug builds and the function ends in
/// llvm_unreachable.
void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator I,
                                   const DebugLoc &DL, Register DestReg,
                                   Register SrcReg, bool KillSrc,
                                   bool RenamableDest,
                                   bool RenamableSrc) const {
  // Statistic: counted once per call, whichever expansion is chosen.
  ++NumCopyInstrs;
  // Copies within GPR32sp (32-bit general purpose registers including WSP).
  if (AArch64::GPR32spRegClass.contains(Reg: DestReg) &&
      AArch64::GPR32spRegClass.contains(Reg: SrcReg)) {
    if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
      // If either operand is WSP, expand to ADD #0.
      if (Subtarget.hasZeroCycleRegMoveGPR64() &&
          !Subtarget.hasZeroCycleRegMoveGPR32()) {
        // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
        MCRegister DestRegX = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::sub_32,
                                                     RC: &AArch64::GPR64spRegClass);
        MCRegister SrcRegX = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::sub_32,
                                                    RC: &AArch64::GPR64spRegClass);
        // This instruction is reading and writing X registers.  This may upset
        // the register scavenger and machine verifier, so we need to indicate
        // that we are reading an undefined value from SrcRegX, but a proper
        // value from SrcReg.
        BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ADDXri), DestReg: DestRegX)
            .addReg(RegNo: SrcRegX, Flags: RegState::Undef)
            .addImm(Val: 0)
            .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0))
            .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
        ++NumZCRegMoveInstrsGPR;
      } else {
        // Plain 32-bit ADD #0; counted as a zero-cycle move only when the
        // subtarget reports zero-cycle GPR32 moves.
        BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ADDWri), DestReg)
            .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc))
            .addImm(Val: 0)
            .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0));
        if (Subtarget.hasZeroCycleRegMoveGPR32())
          ++NumZCRegMoveInstrsGPR;
      }
    } else if (Subtarget.hasZeroCycleRegMoveGPR64() &&
               !Subtarget.hasZeroCycleRegMoveGPR32()) {
      // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
      MCRegister DestRegX = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::sub_32,
                                                   RC: &AArch64::GPR64spRegClass);
      assert(DestRegX.isValid() && "Destination super-reg not valid");
      MCRegister SrcRegX = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::sub_32,
                                                  RC: &AArch64::GPR64spRegClass);
      assert(SrcRegX.isValid() && "Source super-reg not valid");
      // This instruction is reading and writing X registers.  This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegX, but a proper
      // value from SrcReg.
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRXrr), DestReg: DestRegX)
          .addReg(RegNo: AArch64::XZR)
          .addReg(RegNo: SrcRegX, Flags: RegState::Undef)
          .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
      ++NumZCRegMoveInstrsGPR;
    } else {
      // Otherwise, expand to ORR WZR.
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRWrr), DestReg)
          .addReg(RegNo: AArch64::WZR)
          .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
      if (Subtarget.hasZeroCycleRegMoveGPR32())
        ++NumZCRegMoveInstrsGPR;
    }
    return;
  }

  // GPR32 zeroing
  if (AArch64::GPR32spRegClass.contains(Reg: DestReg) && SrcReg == AArch64::WZR) {
    if (Subtarget.hasZeroCycleZeroingGPR64() &&
        !Subtarget.hasZeroCycleZeroingGPR32()) {
      // Only 64-bit zeroing is zero-cycle: write zero to the X super-register
      // with MOVZXi instead.
      MCRegister DestRegX = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::sub_32,
                                                   RC: &AArch64::GPR64spRegClass);
      assert(DestRegX.isValid() && "Destination super-reg not valid");
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::MOVZXi), DestReg: DestRegX)
          .addImm(Val: 0)
          .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0));
      ++NumZCZeroingInstrsGPR;
    } else if (Subtarget.hasZeroCycleZeroingGPR32()) {
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::MOVZWi), DestReg)
          .addImm(Val: 0)
          .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0));
      ++NumZCZeroingInstrsGPR;
    } else {
      // No zero-cycle zeroing available: ORR of WZR with itself.
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRWrr), DestReg)
          .addReg(RegNo: AArch64::WZR)
          .addReg(RegNo: AArch64::WZR);
    }
    return;
  }

  // Copies within GPR64sp (64-bit general purpose registers including SP).
  if (AArch64::GPR64spRegClass.contains(Reg: DestReg) &&
      AArch64::GPR64spRegClass.contains(Reg: SrcReg)) {
    if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
      // If either operand is SP, expand to ADD #0.
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ADDXri), DestReg)
          .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc))
          .addImm(Val: 0)
          .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0));
      if (Subtarget.hasZeroCycleRegMoveGPR64())
        ++NumZCRegMoveInstrsGPR;
    } else {
      // Otherwise, expand to ORR XZR.
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRXrr), DestReg)
          .addReg(RegNo: AArch64::XZR)
          .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
      if (Subtarget.hasZeroCycleRegMoveGPR64())
        ++NumZCRegMoveInstrsGPR;
    }
    return;
  }

  // GPR64 zeroing
  if (AArch64::GPR64spRegClass.contains(Reg: DestReg) && SrcReg == AArch64::XZR) {
    if (Subtarget.hasZeroCycleZeroingGPR64()) {
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::MOVZXi), DestReg)
          .addImm(Val: 0)
          .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0));
      ++NumZCZeroingInstrsGPR;
    } else {
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRXrr), DestReg)
          .addReg(RegNo: AArch64::XZR)
          .addReg(RegNo: AArch64::XZR);
    }
    return;
  }

  // Copy a Predicate register by ORRing with itself.
  if (AArch64::PPRRegClass.contains(Reg: DestReg) &&
      AArch64::PPRRegClass.contains(Reg: SrcReg)) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORR_PPzPP), DestReg)
        .addReg(RegNo: SrcReg) // Pg
        .addReg(RegNo: SrcReg)
        .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
    return;
  }

  // Copy a predicate-as-counter register by ORRing with itself as if it
  // were a regular predicate (mask) register.
  bool DestIsPNR = AArch64::PNRRegClass.contains(Reg: DestReg);
  bool SrcIsPNR = AArch64::PNRRegClass.contains(Reg: SrcReg);
  if (DestIsPNR || SrcIsPNR) {
    // Map a PNn register to the P register at the same offset from P0.
    auto ToPPR = [](MCRegister R) -> MCRegister {
      return (R - AArch64::PN0) + AArch64::P0;
    };
    MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg.asMCReg();
    MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg.asMCReg();

    // Nothing is emitted when both sides map to the same P register.
    if (PPRSrcReg != PPRDestReg) {
      auto NewMI = BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORR_PPzPP), DestReg: PPRDestReg)
                       .addReg(RegNo: PPRSrcReg) // Pg
                       .addReg(RegNo: PPRSrcReg)
                       .addReg(RegNo: PPRSrcReg, Flags: getKillRegState(B: KillSrc));
      // The instruction names the P register; also record the PN register as
      // an implicit def when the requested destination was a PNR.
      if (DestIsPNR)
        NewMI.addDef(RegNo: DestReg, Flags: RegState::Implicit);
    }
    return;
  }

  // Copy a Z register by ORRing with itself.
  if (AArch64::ZPRRegClass.contains(Reg: DestReg) &&
      AArch64::ZPRRegClass.contains(Reg: SrcReg)) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORR_ZZZ), DestReg)
        .addReg(RegNo: SrcReg)
        .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
    return;
  }

  // Copy a Z register pair by copying the individual sub-registers.
  if ((AArch64::ZPR2RegClass.contains(Reg: DestReg) ||
       AArch64::ZPR2StridedOrContiguousRegClass.contains(Reg: DestReg)) &&
      (AArch64::ZPR2RegClass.contains(Reg: SrcReg) ||
       AArch64::ZPR2StridedOrContiguousRegClass.contains(Reg: SrcReg))) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORR_ZZZ,
                     Indices);
    return;
  }

  // Copy a Z register triple by copying the individual sub-registers.
  if (AArch64::ZPR3RegClass.contains(Reg: DestReg) &&
      AArch64::ZPR3RegClass.contains(Reg: SrcReg)) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
                                       AArch64::zsub2};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORR_ZZZ,
                     Indices);
    return;
  }

  // Copy a Z register quad by copying the individual sub-registers.
  if ((AArch64::ZPR4RegClass.contains(Reg: DestReg) ||
       AArch64::ZPR4StridedOrContiguousRegClass.contains(Reg: DestReg)) &&
      (AArch64::ZPR4RegClass.contains(Reg: SrcReg) ||
       AArch64::ZPR4StridedOrContiguousRegClass.contains(Reg: SrcReg))) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
                                       AArch64::zsub2, AArch64::zsub3};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORR_ZZZ,
                     Indices);
    return;
  }

  // Copy a DDDD register quad by copying the individual sub-registers.
  if (AArch64::DDDDRegClass.contains(Reg: DestReg) &&
      AArch64::DDDDRegClass.contains(Reg: SrcReg)) {
    static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
                                       AArch64::dsub2, AArch64::dsub3};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv8i8,
                     Indices);
    return;
  }

  // Copy a DDD register triple by copying the individual sub-registers.
  if (AArch64::DDDRegClass.contains(Reg: DestReg) &&
      AArch64::DDDRegClass.contains(Reg: SrcReg)) {
    static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
                                       AArch64::dsub2};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv8i8,
                     Indices);
    return;
  }

  // Copy a DD register pair by copying the individual sub-registers.
  if (AArch64::DDRegClass.contains(Reg: DestReg) &&
      AArch64::DDRegClass.contains(Reg: SrcReg)) {
    static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv8i8,
                     Indices);
    return;
  }

  // Copy a QQQQ register quad by copying the individual sub-registers.
  if (AArch64::QQQQRegClass.contains(Reg: DestReg) &&
      AArch64::QQQQRegClass.contains(Reg: SrcReg)) {
    static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
                                       AArch64::qsub2, AArch64::qsub3};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv16i8,
                     Indices);
    return;
  }

  // Copy a QQQ register triple by copying the individual sub-registers.
  if (AArch64::QQQRegClass.contains(Reg: DestReg) &&
      AArch64::QQQRegClass.contains(Reg: SrcReg)) {
    static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
                                       AArch64::qsub2};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv16i8,
                     Indices);
    return;
  }

  // Copy a QQ register pair by copying the individual sub-registers.
  if (AArch64::QQRegClass.contains(Reg: DestReg) &&
      AArch64::QQRegClass.contains(Reg: SrcReg)) {
    static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRv16i8,
                     Indices);
    return;
  }

  // Copy an X sequential pair element-wise (sube64/subo64) with ORRXrs and
  // XZR.
  if (AArch64::XSeqPairsClassRegClass.contains(Reg: DestReg) &&
      AArch64::XSeqPairsClassRegClass.contains(Reg: SrcReg)) {
    static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
    copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRXrs,
                    ZeroReg: AArch64::XZR, Indices);
    return;
  }

  // Copy a W sequential pair element-wise (sube32/subo32) with ORRWrs and
  // WZR.
  if (AArch64::WSeqPairsClassRegClass.contains(Reg: DestReg) &&
      AArch64::WSeqPairsClassRegClass.contains(Reg: SrcReg)) {
    static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
    copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, Opcode: AArch64::ORRWrs,
                    ZeroReg: AArch64::WZR, Indices);
    return;
  }

  // Copies within FPR128 (Q registers).
  if (AArch64::FPR128RegClass.contains(Reg: DestReg) &&
      AArch64::FPR128RegClass.contains(Reg: SrcReg)) {
    // In streaming regions, NEON is illegal but streaming-SVE is available.
    // Use SVE for copies if we're in a streaming region and SME is available.
    // With +sme-fa64, NEON is legal in streaming mode so we can use it.
    if ((Subtarget.isSVEorStreamingSVEAvailable() &&
         !Subtarget.isNeonAvailable()) ||
        mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
      // The Z register is selected by applying the Q register's offset from
      // Q0 to Z0. No kill flag is attached to the source operands here.
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORR_ZZZ))
          .addReg(RegNo: AArch64::Z0 + (DestReg - AArch64::Q0), Flags: RegState::Define)
          .addReg(RegNo: AArch64::Z0 + (SrcReg - AArch64::Q0))
          .addReg(RegNo: AArch64::Z0 + (SrcReg - AArch64::Q0));
    } else if (Subtarget.isNeonAvailable()) {
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRv16i8), DestReg)
          .addReg(RegNo: SrcReg)
          .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
      if (Subtarget.hasZeroCycleRegMoveFPR128())
        ++NumZCRegMoveInstrsFPR;
    } else {
      // Neither vector path applies: go through the stack. STRQpre stores
      // SrcReg with SP and immediate -16, then LDRQpost loads DestReg with SP
      // and immediate 16; both define SP.
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::STRQpre))
          .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
          .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc))
          .addReg(RegNo: AArch64::SP)
          .addImm(Val: -16);
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::LDRQpost))
          .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
          .addReg(RegNo: DestReg, Flags: RegState::Define)
          .addReg(RegNo: AArch64::SP)
          .addImm(Val: 16);
    }
    return;
  }

  // Copies within FPR64 (D registers).
  if (AArch64::FPR64RegClass.contains(Reg: DestReg) &&
      AArch64::FPR64RegClass.contains(Reg: SrcReg)) {
    // When only 128-bit FPR moves are zero-cycle and NEON may be used at this
    // point, copy the containing Q registers instead.
    if (Subtarget.hasZeroCycleRegMoveFPR128() &&
        !Subtarget.hasZeroCycleRegMoveFPR64() &&
        !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
        !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
      MCRegister DestRegQ = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::dsub,
                                                   RC: &AArch64::FPR128RegClass);
      MCRegister SrcRegQ = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::dsub,
                                                  RC: &AArch64::FPR128RegClass);
      // This instruction is reading and writing Q registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegQ, but a proper
      // value from SrcReg.
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRv16i8), DestReg: DestRegQ)
          .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
          .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
          .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
      ++NumZCRegMoveInstrsFPR;
    } else {
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVDr), DestReg)
          .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
      if (Subtarget.hasZeroCycleRegMoveFPR64())
        ++NumZCRegMoveInstrsFPR;
    }
    return;
  }

  // Copies within FPR32 (S registers): widen to Q or D when only the wider
  // move is zero-cycle, otherwise use FMOVSr.
  if (AArch64::FPR32RegClass.contains(Reg: DestReg) &&
      AArch64::FPR32RegClass.contains(Reg: SrcReg)) {
    if (Subtarget.hasZeroCycleRegMoveFPR128() &&
        !Subtarget.hasZeroCycleRegMoveFPR64() &&
        !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
        !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
      MCRegister DestRegQ = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::ssub,
                                                   RC: &AArch64::FPR128RegClass);
      MCRegister SrcRegQ = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::ssub,
                                                  RC: &AArch64::FPR128RegClass);
      // This instruction is reading and writing Q registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegQ, but a proper
      // value from SrcReg.
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRv16i8), DestReg: DestRegQ)
          .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
          .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
          .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
      ++NumZCRegMoveInstrsFPR;
    } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
               !Subtarget.hasZeroCycleRegMoveFPR32()) {
      MCRegister DestRegD = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::ssub,
                                                   RC: &AArch64::FPR64RegClass);
      MCRegister SrcRegD = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::ssub,
                                                  RC: &AArch64::FPR64RegClass);
      // This instruction is reading and writing D registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegD, but a proper
      // value from SrcReg.
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVDr), DestReg: DestRegD)
          .addReg(RegNo: SrcRegD, Flags: RegState::Undef)
          .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
      ++NumZCRegMoveInstrsFPR;
    } else {
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVSr), DestReg)
          .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
      if (Subtarget.hasZeroCycleRegMoveFPR32())
        ++NumZCRegMoveInstrsFPR;
    }
    return;
  }

  // Copies within FPR16 (H registers). Same widening choices as FPR32; the
  // fallback copies the containing S registers with FMOVSr. None of the
  // branches below updates NumZCRegMoveInstrsFPR.
  if (AArch64::FPR16RegClass.contains(Reg: DestReg) &&
      AArch64::FPR16RegClass.contains(Reg: SrcReg)) {
    if (Subtarget.hasZeroCycleRegMoveFPR128() &&
        !Subtarget.hasZeroCycleRegMoveFPR64() &&
        !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
        !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
      MCRegister DestRegQ = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::hsub,
                                                   RC: &AArch64::FPR128RegClass);
      MCRegister SrcRegQ = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::hsub,
                                                  RC: &AArch64::FPR128RegClass);
      // This instruction is reading and writing Q registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegQ, but a proper
      // value from SrcReg.
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRv16i8), DestReg: DestRegQ)
          .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
          .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
          .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
    } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
               !Subtarget.hasZeroCycleRegMoveFPR32()) {
      MCRegister DestRegD = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::hsub,
                                                   RC: &AArch64::FPR64RegClass);
      MCRegister SrcRegD = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::hsub,
                                                  RC: &AArch64::FPR64RegClass);
      // This instruction is reading and writing D registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegD, but a proper
      // value from SrcReg.
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVDr), DestReg: DestRegD)
          .addReg(RegNo: SrcRegD, Flags: RegState::Undef)
          .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
    } else {
      // Rewrite both registers to their FPR32 super-registers and copy those.
      DestReg = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::hsub,
                                       RC: &AArch64::FPR32RegClass);
      SrcReg = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::hsub,
                                      RC: &AArch64::FPR32RegClass);
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVSr), DestReg)
          .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
    }
    return;
  }

  // Copies within FPR8 (B registers). Mirrors the FPR16 case with bsub as the
  // sub-register index.
  if (AArch64::FPR8RegClass.contains(Reg: DestReg) &&
      AArch64::FPR8RegClass.contains(Reg: SrcReg)) {
    if (Subtarget.hasZeroCycleRegMoveFPR128() &&
        !Subtarget.hasZeroCycleRegMoveFPR64() &&
        !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
        !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
      MCRegister DestRegQ = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::bsub,
                                                   RC: &AArch64::FPR128RegClass);
      MCRegister SrcRegQ = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::bsub,
                                                  RC: &AArch64::FPR128RegClass);
      // This instruction is reading and writing Q registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegQ, but a proper
      // value from SrcReg.
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::ORRv16i8), DestReg: DestRegQ)
          .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
          .addReg(RegNo: SrcRegQ, Flags: RegState::Undef)
          .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
    } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
               !Subtarget.hasZeroCycleRegMoveFPR32()) {
      MCRegister DestRegD = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::bsub,
                                                   RC: &AArch64::FPR64RegClass);
      MCRegister SrcRegD = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::bsub,
                                                  RC: &AArch64::FPR64RegClass);
      // This instruction is reading and writing D registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegD, but a proper
      // value from SrcReg.
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVDr), DestReg: DestRegD)
          .addReg(RegNo: SrcRegD, Flags: RegState::Undef)
          .addReg(RegNo: SrcReg, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
    } else {
      // Rewrite both registers to their FPR32 super-registers and copy those.
      DestReg = RI.getMatchingSuperReg(Reg: DestReg, SubIdx: AArch64::bsub,
                                       RC: &AArch64::FPR32RegClass);
      SrcReg = RI.getMatchingSuperReg(Reg: SrcReg, SubIdx: AArch64::bsub,
                                      RC: &AArch64::FPR32RegClass);
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVSr), DestReg)
          .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
    }
    return;
  }

  // Copies between GPR64 and FPR64.
  if (AArch64::FPR64RegClass.contains(Reg: DestReg) &&
      AArch64::GPR64RegClass.contains(Reg: SrcReg)) {
    // A copy from XZR becomes FMOVD0, which takes no source operand.
    if (AArch64::XZR == SrcReg) {
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVD0), DestReg);
    } else {
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVXDr), DestReg)
          .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
    }
    return;
  }
  if (AArch64::GPR64RegClass.contains(Reg: DestReg) &&
      AArch64::FPR64RegClass.contains(Reg: SrcReg)) {
    BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVDXr), DestReg)
        .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
    return;
  }
  // Copies between GPR32 and FPR32.
  if (AArch64::FPR32RegClass.contains(Reg: DestReg) &&
      AArch64::GPR32RegClass.contains(Reg: SrcReg)) {
    // A copy from WZR becomes FMOVS0, which takes no source operand.
    if (AArch64::WZR == SrcReg) {
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVS0), DestReg);
    } else {
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVWSr), DestReg)
          .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
    }
    return;
  }
  if (AArch64::GPR32RegClass.contains(Reg: DestReg) &&
      AArch64::FPR32RegClass.contains(Reg: SrcReg)) {
    BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::FMOVSWr), DestReg)
        .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc));
    return;
  }

  // Write NZCV from a GPR64 with MSR; NZCV is added as an implicit def.
  if (DestReg == AArch64::NZCV) {
    assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
    BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::MSR))
        .addImm(Val: AArch64SysReg::NZCV)
        .addReg(RegNo: SrcReg, Flags: getKillRegState(B: KillSrc))
        .addReg(RegNo: AArch64::NZCV, Flags: RegState::Implicit | RegState::Define);
    return;
  }

  // Read NZCV into a GPR64 with MRS; NZCV is added as an implicit use.
  if (SrcReg == AArch64::NZCV) {
    assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
    BuildMI(BB&: MBB, I, MIMD: DL, MCID: get(Opcode: AArch64::MRS), DestReg)
        .addImm(Val: AArch64SysReg::NZCV)
        .addReg(RegNo: AArch64::NZCV, Flags: RegState::Implicit | getKillRegState(B: KillSrc));
    return;
  }

  // No case matched: report the offending pair in debug builds, then abort.
#ifndef NDEBUG
  errs() << RI.getRegAsmName(DestReg) << " = COPY " << RI.getRegAsmName(SrcReg)
         << "\n";
#endif
  llvm_unreachable("unimplemented reg-to-reg copy");
}
6273
6274static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
6275 MachineBasicBlock &MBB,
6276 MachineBasicBlock::iterator InsertBefore,
6277 const MCInstrDesc &MCID,
6278 Register SrcReg, bool IsKill,
6279 unsigned SubIdx0, unsigned SubIdx1, int FI,
6280 MachineMemOperand *MMO) {
6281 Register SrcReg0 = SrcReg;
6282 Register SrcReg1 = SrcReg;
6283 if (SrcReg.isPhysical()) {
6284 SrcReg0 = TRI.getSubReg(Reg: SrcReg, Idx: SubIdx0);
6285 SubIdx0 = 0;
6286 SrcReg1 = TRI.getSubReg(Reg: SrcReg, Idx: SubIdx1);
6287 SubIdx1 = 0;
6288 }
6289 BuildMI(BB&: MBB, I: InsertBefore, MIMD: DebugLoc(), MCID)
6290 .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: IsKill), SubReg: SubIdx0)
6291 .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: IsKill), SubReg: SubIdx1)
6292 .addFrameIndex(Idx: FI)
6293 .addImm(Val: 0)
6294 .addMemOperand(MMO);
6295}
6296
6297void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
6298 MachineBasicBlock::iterator MBBI,
6299 Register SrcReg, bool isKill, int FI,
6300 const TargetRegisterClass *RC,
6301 Register VReg,
6302 MachineInstr::MIFlag Flags) const {
6303 MachineFunction &MF = *MBB.getParent();
6304 MachineFrameInfo &MFI = MF.getFrameInfo();
6305
6306 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
6307 MachineMemOperand *MMO =
6308 MF.getMachineMemOperand(PtrInfo, F: MachineMemOperand::MOStore,
6309 Size: MFI.getObjectSize(ObjectIdx: FI), BaseAlignment: MFI.getObjectAlign(ObjectIdx: FI));
6310 unsigned Opc = 0;
6311 bool Offset = true;
6312 MCRegister PNRReg = MCRegister::NoRegister;
6313 unsigned StackID = TargetStackID::Default;
6314 switch (RI.getSpillSize(RC: *RC)) {
6315 case 1:
6316 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6317 Opc = AArch64::STRBui;
6318 break;
6319 case 2: {
6320 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6321 Opc = AArch64::STRHui;
6322 else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
6323 AArch64::PPRRegClass.hasSubClassEq(RC)) {
6324 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6325 "Unexpected register store without SVE store instructions");
6326 Opc = AArch64::STR_PXI;
6327 StackID = TargetStackID::ScalablePredicateVector;
6328 }
6329 break;
6330 }
6331 case 4:
6332 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6333 Opc = AArch64::STRWui;
6334 if (SrcReg.isVirtual())
6335 MF.getRegInfo().constrainRegClass(Reg: SrcReg, RC: &AArch64::GPR32RegClass);
6336 else
6337 assert(SrcReg != AArch64::WSP);
6338 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6339 Opc = AArch64::STRSui;
6340 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6341 Opc = AArch64::STR_PPXI;
6342 StackID = TargetStackID::ScalablePredicateVector;
6343 }
6344 break;
6345 case 8:
6346 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6347 Opc = AArch64::STRXui;
6348 if (SrcReg.isVirtual())
6349 MF.getRegInfo().constrainRegClass(Reg: SrcReg, RC: &AArch64::GPR64RegClass);
6350 else
6351 assert(SrcReg != AArch64::SP);
6352 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6353 Opc = AArch64::STRDui;
6354 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
6355 storeRegPairToStackSlot(TRI: getRegisterInfo(), MBB, InsertBefore: MBBI,
6356 MCID: get(Opcode: AArch64::STPWi), SrcReg, IsKill: isKill,
6357 SubIdx0: AArch64::sube32, SubIdx1: AArch64::subo32, FI, MMO);
6358 return;
6359 }
6360 break;
6361 case 16:
6362 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6363 Opc = AArch64::STRQui;
6364 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6365 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6366 Opc = AArch64::ST1Twov1d;
6367 Offset = false;
6368 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
6369 storeRegPairToStackSlot(TRI: getRegisterInfo(), MBB, InsertBefore: MBBI,
6370 MCID: get(Opcode: AArch64::STPXi), SrcReg, IsKill: isKill,
6371 SubIdx0: AArch64::sube64, SubIdx1: AArch64::subo64, FI, MMO);
6372 return;
6373 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6374 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6375 "Unexpected register store without SVE store instructions");
6376 Opc = AArch64::STR_ZXI;
6377 StackID = TargetStackID::ScalableVector;
6378 }
6379 break;
6380 case 24:
6381 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6382 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6383 Opc = AArch64::ST1Threev1d;
6384 Offset = false;
6385 }
6386 break;
6387 case 32:
6388 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6389 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6390 Opc = AArch64::ST1Fourv1d;
6391 Offset = false;
6392 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6393 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6394 Opc = AArch64::ST1Twov2d;
6395 Offset = false;
6396 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6397 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6398 "Unexpected register store without SVE store instructions");
6399 Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
6400 StackID = TargetStackID::ScalableVector;
6401 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6402 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6403 "Unexpected register store without SVE store instructions");
6404 Opc = AArch64::STR_ZZXI;
6405 StackID = TargetStackID::ScalableVector;
6406 }
6407 break;
6408 case 48:
6409 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6410 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6411 Opc = AArch64::ST1Threev2d;
6412 Offset = false;
6413 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6414 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6415 "Unexpected register store without SVE store instructions");
6416 Opc = AArch64::STR_ZZZXI;
6417 StackID = TargetStackID::ScalableVector;
6418 }
6419 break;
6420 case 64:
6421 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6422 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6423 Opc = AArch64::ST1Fourv2d;
6424 Offset = false;
6425 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6426 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6427 "Unexpected register store without SVE store instructions");
6428 Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
6429 StackID = TargetStackID::ScalableVector;
6430 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6431 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6432 "Unexpected register store without SVE store instructions");
6433 Opc = AArch64::STR_ZZZZXI;
6434 StackID = TargetStackID::ScalableVector;
6435 }
6436 break;
6437 }
6438 assert(Opc && "Unknown register class");
6439 MFI.setStackID(ObjectIdx: FI, ID: StackID);
6440
6441 const MachineInstrBuilder MI = BuildMI(BB&: MBB, I: MBBI, MIMD: DebugLoc(), MCID: get(Opcode: Opc))
6442 .addReg(RegNo: SrcReg, Flags: getKillRegState(B: isKill))
6443 .addFrameIndex(Idx: FI);
6444
6445 if (Offset)
6446 MI.addImm(Val: 0);
6447 if (PNRReg.isValid())
6448 MI.addDef(RegNo: PNRReg, Flags: RegState::Implicit);
6449 MI.addMemOperand(MMO);
6450}
6451
6452static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
6453 MachineBasicBlock &MBB,
6454 MachineBasicBlock::iterator InsertBefore,
6455 const MCInstrDesc &MCID,
6456 Register DestReg, unsigned SubIdx0,
6457 unsigned SubIdx1, int FI,
6458 MachineMemOperand *MMO) {
6459 Register DestReg0 = DestReg;
6460 Register DestReg1 = DestReg;
6461 bool IsUndef = true;
6462 if (DestReg.isPhysical()) {
6463 DestReg0 = TRI.getSubReg(Reg: DestReg, Idx: SubIdx0);
6464 SubIdx0 = 0;
6465 DestReg1 = TRI.getSubReg(Reg: DestReg, Idx: SubIdx1);
6466 SubIdx1 = 0;
6467 IsUndef = false;
6468 }
6469 BuildMI(BB&: MBB, I: InsertBefore, MIMD: DebugLoc(), MCID)
6470 .addReg(RegNo: DestReg0, Flags: RegState::Define | getUndefRegState(B: IsUndef), SubReg: SubIdx0)
6471 .addReg(RegNo: DestReg1, Flags: RegState::Define | getUndefRegState(B: IsUndef), SubReg: SubIdx1)
6472 .addFrameIndex(Idx: FI)
6473 .addImm(Val: 0)
6474 .addMemOperand(MMO);
6475}
6476
6477void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
6478 MachineBasicBlock::iterator MBBI,
6479 Register DestReg, int FI,
6480 const TargetRegisterClass *RC,
6481 Register VReg, unsigned SubReg,
6482 MachineInstr::MIFlag Flags) const {
6483 MachineFunction &MF = *MBB.getParent();
6484 MachineFrameInfo &MFI = MF.getFrameInfo();
6485 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
6486 MachineMemOperand *MMO =
6487 MF.getMachineMemOperand(PtrInfo, F: MachineMemOperand::MOLoad,
6488 Size: MFI.getObjectSize(ObjectIdx: FI), BaseAlignment: MFI.getObjectAlign(ObjectIdx: FI));
6489
6490 unsigned Opc = 0;
6491 bool Offset = true;
6492 unsigned StackID = TargetStackID::Default;
6493 Register PNRReg = MCRegister::NoRegister;
6494 switch (TRI.getSpillSize(RC: *RC)) {
6495 case 1:
6496 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6497 Opc = AArch64::LDRBui;
6498 break;
6499 case 2: {
6500 bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
6501 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6502 Opc = AArch64::LDRHui;
6503 else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
6504 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6505 "Unexpected register load without SVE load instructions");
6506 if (IsPNR)
6507 PNRReg = DestReg;
6508 Opc = AArch64::LDR_PXI;
6509 StackID = TargetStackID::ScalablePredicateVector;
6510 }
6511 break;
6512 }
6513 case 4:
6514 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6515 Opc = AArch64::LDRWui;
6516 if (DestReg.isVirtual())
6517 MF.getRegInfo().constrainRegClass(Reg: DestReg, RC: &AArch64::GPR32RegClass);
6518 else
6519 assert(DestReg != AArch64::WSP);
6520 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6521 Opc = AArch64::LDRSui;
6522 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6523 Opc = AArch64::LDR_PPXI;
6524 StackID = TargetStackID::ScalablePredicateVector;
6525 }
6526 break;
6527 case 8:
6528 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6529 Opc = AArch64::LDRXui;
6530 if (DestReg.isVirtual())
6531 MF.getRegInfo().constrainRegClass(Reg: DestReg, RC: &AArch64::GPR64RegClass);
6532 else
6533 assert(DestReg != AArch64::SP);
6534 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6535 Opc = AArch64::LDRDui;
6536 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
6537 loadRegPairFromStackSlot(TRI: getRegisterInfo(), MBB, InsertBefore: MBBI,
6538 MCID: get(Opcode: AArch64::LDPWi), DestReg, SubIdx0: AArch64::sube32,
6539 SubIdx1: AArch64::subo32, FI, MMO);
6540 return;
6541 }
6542 break;
6543 case 16:
6544 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6545 Opc = AArch64::LDRQui;
6546 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6547 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6548 Opc = AArch64::LD1Twov1d;
6549 Offset = false;
6550 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
6551 loadRegPairFromStackSlot(TRI: getRegisterInfo(), MBB, InsertBefore: MBBI,
6552 MCID: get(Opcode: AArch64::LDPXi), DestReg, SubIdx0: AArch64::sube64,
6553 SubIdx1: AArch64::subo64, FI, MMO);
6554 return;
6555 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6556 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6557 "Unexpected register load without SVE load instructions");
6558 Opc = AArch64::LDR_ZXI;
6559 StackID = TargetStackID::ScalableVector;
6560 }
6561 break;
6562 case 24:
6563 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6564 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6565 Opc = AArch64::LD1Threev1d;
6566 Offset = false;
6567 }
6568 break;
6569 case 32:
6570 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6571 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6572 Opc = AArch64::LD1Fourv1d;
6573 Offset = false;
6574 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6575 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6576 Opc = AArch64::LD1Twov2d;
6577 Offset = false;
6578 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6579 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6580 "Unexpected register load without SVE load instructions");
6581 Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
6582 StackID = TargetStackID::ScalableVector;
6583 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6584 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6585 "Unexpected register load without SVE load instructions");
6586 Opc = AArch64::LDR_ZZXI;
6587 StackID = TargetStackID::ScalableVector;
6588 }
6589 break;
6590 case 48:
6591 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6592 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6593 Opc = AArch64::LD1Threev2d;
6594 Offset = false;
6595 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6596 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6597 "Unexpected register load without SVE load instructions");
6598 Opc = AArch64::LDR_ZZZXI;
6599 StackID = TargetStackID::ScalableVector;
6600 }
6601 break;
6602 case 64:
6603 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6604 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6605 Opc = AArch64::LD1Fourv2d;
6606 Offset = false;
6607 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6608 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6609 "Unexpected register load without SVE load instructions");
6610 Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
6611 StackID = TargetStackID::ScalableVector;
6612 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6613 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6614 "Unexpected register load without SVE load instructions");
6615 Opc = AArch64::LDR_ZZZZXI;
6616 StackID = TargetStackID::ScalableVector;
6617 }
6618 break;
6619 }
6620
6621 assert(Opc && "Unknown register class");
6622 MFI.setStackID(ObjectIdx: FI, ID: StackID);
6623
6624 const MachineInstrBuilder MI = BuildMI(BB&: MBB, I: MBBI, MIMD: DebugLoc(), MCID: get(Opcode: Opc))
6625 .addReg(RegNo: DestReg, Flags: getDefRegState(B: true))
6626 .addFrameIndex(Idx: FI);
6627 if (Offset)
6628 MI.addImm(Val: 0);
6629 if (PNRReg.isValid() && !PNRReg.isVirtual())
6630 MI.addDef(RegNo: PNRReg, Flags: RegState::Implicit);
6631 MI.addMemOperand(MMO);
6632}
6633
6634bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
6635 const MachineInstr &UseMI,
6636 const TargetRegisterInfo *TRI) {
6637 return any_of(Range: instructionsWithoutDebug(It: std::next(x: DefMI.getIterator()),
6638 End: UseMI.getIterator()),
6639 P: [TRI](const MachineInstr &I) {
6640 return I.modifiesRegister(Reg: AArch64::NZCV, TRI) ||
6641 I.readsRegister(Reg: AArch64::NZCV, TRI);
6642 });
6643}
6644
6645void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6646 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
6647 // The smallest scalable element supported by scaled SVE addressing
6648 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6649 // byte offset must always be a multiple of 2.
6650 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6651
6652 // VGSized offsets are divided by '2', because the VG register is the
6653 // the number of 64bit granules as opposed to 128bit vector chunks,
6654 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
6655 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
6656 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
6657 ByteSized = Offset.getFixed();
6658 VGSized = Offset.getScalable() / 2;
6659}
6660
6661/// Returns the offset in parts to which this frame offset can be
6662/// decomposed for the purpose of describing a frame offset.
6663/// For non-scalable offsets this is simply its byte size.
6664void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6665 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
6666 int64_t &NumDataVectors) {
6667 // The smallest scalable element supported by scaled SVE addressing
6668 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6669 // byte offset must always be a multiple of 2.
6670 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6671
6672 NumBytes = Offset.getFixed();
6673 NumDataVectors = 0;
6674 NumPredicateVectors = Offset.getScalable() / 2;
6675 // This method is used to get the offsets to adjust the frame offset.
6676 // If the function requires ADDPL to be used and needs more than two ADDPL
6677 // instructions, part of the offset is folded into NumDataVectors so that it
6678 // uses ADDVL for part of it, reducing the number of ADDPL instructions.
6679 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
6680 NumPredicateVectors > 62) {
6681 NumDataVectors = NumPredicateVectors / 8;
6682 NumPredicateVectors -= NumDataVectors * 8;
6683 }
6684}
6685
6686// Convenience function to create a DWARF expression for: Constant `Operation`.
6687// This helper emits compact sequences for common cases. For example, for`-15
6688// DW_OP_plus`, this helper would create DW_OP_lit15 DW_OP_minus.
6689static void appendConstantExpr(SmallVectorImpl<char> &Expr, int64_t Constant,
6690 dwarf::LocationAtom Operation) {
6691 if (Operation == dwarf::DW_OP_plus && Constant < 0 && -Constant <= 31) {
6692 // -Constant (1 to 31)
6693 Expr.push_back(Elt: dwarf::DW_OP_lit0 - Constant);
6694 Operation = dwarf::DW_OP_minus;
6695 } else if (Constant >= 0 && Constant <= 31) {
6696 // Literal value 0 to 31
6697 Expr.push_back(Elt: dwarf::DW_OP_lit0 + Constant);
6698 } else {
6699 // Signed constant
6700 Expr.push_back(Elt: dwarf::DW_OP_consts);
6701 appendLEB128<LEB128Sign::Signed>(Buffer&: Expr, Value: Constant);
6702 }
6703 return Expr.push_back(Elt: Operation);
6704}
6705
6706// Convenience function to create a DWARF expression for a register.
6707static void appendReadRegExpr(SmallVectorImpl<char> &Expr, unsigned RegNum) {
6708 Expr.push_back(Elt: (char)dwarf::DW_OP_bregx);
6709 appendLEB128<LEB128Sign::Unsigned>(Buffer&: Expr, Value: RegNum);
6710 Expr.push_back(Elt: 0);
6711}
6712
6713// Convenience function to create a DWARF expression for loading a register from
6714// a CFA offset.
6715static void appendLoadRegExpr(SmallVectorImpl<char> &Expr,
6716 int64_t OffsetFromDefCFA) {
6717 // This assumes the top of the DWARF stack contains the CFA.
6718 Expr.push_back(Elt: dwarf::DW_OP_dup);
6719 // Add the offset to the register.
6720 appendConstantExpr(Expr, Constant: OffsetFromDefCFA, Operation: dwarf::DW_OP_plus);
6721 // Dereference the address (loads a 64 bit value)..
6722 Expr.push_back(Elt: dwarf::DW_OP_deref);
6723}
6724
6725// Convenience function to create a comment for
6726// (+/-) NumBytes (* RegScale)?
6727static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment,
6728 StringRef RegScale = {}) {
6729 if (NumBytes) {
6730 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(x: NumBytes);
6731 if (!RegScale.empty())
6732 Comment << ' ' << RegScale;
6733 }
6734}
6735
6736// Creates an MCCFIInstruction:
6737// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
6738static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
6739 unsigned Reg,
6740 const StackOffset &Offset) {
6741 int64_t NumBytes, NumVGScaledBytes;
6742 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, ByteSized&: NumBytes,
6743 VGSized&: NumVGScaledBytes);
6744 std::string CommentBuffer;
6745 llvm::raw_string_ostream Comment(CommentBuffer);
6746
6747 if (Reg == AArch64::SP)
6748 Comment << "sp";
6749 else if (Reg == AArch64::FP)
6750 Comment << "fp";
6751 else
6752 Comment << printReg(Reg, TRI: &TRI);
6753
6754 // Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes)
6755 SmallString<64> Expr;
6756 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, isEH: true);
6757 assert(DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
6758 // Reg + NumBytes
6759 Expr.push_back(Elt: dwarf::DW_OP_breg0 + DwarfReg);
6760 appendLEB128<LEB128Sign::Signed>(Buffer&: Expr, Value: NumBytes);
6761 appendOffsetComment(NumBytes, Comment);
6762 if (NumVGScaledBytes) {
6763 // + VG * NumVGScaledBytes
6764 appendOffsetComment(NumBytes: NumVGScaledBytes, Comment, RegScale: "* VG");
6765 appendReadRegExpr(Expr, RegNum: TRI.getDwarfRegNum(Reg: AArch64::VG, isEH: true));
6766 appendConstantExpr(Expr, Constant: NumVGScaledBytes, Operation: dwarf::DW_OP_mul);
6767 Expr.push_back(Elt: dwarf::DW_OP_plus);
6768 }
6769
6770 // Wrap this into DW_CFA_def_cfa.
6771 SmallString<64> DefCfaExpr;
6772 DefCfaExpr.push_back(Elt: dwarf::DW_CFA_def_cfa_expression);
6773 appendLEB128<LEB128Sign::Unsigned>(Buffer&: DefCfaExpr, Value: Expr.size());
6774 DefCfaExpr.append(RHS: Expr.str());
6775 return MCCFIInstruction::createEscape(L: nullptr, Vals: DefCfaExpr.str(), Loc: SMLoc(),
6776 Comment: Comment.str());
6777}
6778
6779MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
6780 unsigned FrameReg, unsigned Reg,
6781 const StackOffset &Offset,
6782 bool LastAdjustmentWasScalable) {
6783 if (Offset.getScalable())
6784 return createDefCFAExpression(TRI, Reg, Offset);
6785
6786 if (FrameReg == Reg && !LastAdjustmentWasScalable)
6787 return MCCFIInstruction::cfiDefCfaOffset(L: nullptr, Offset: int(Offset.getFixed()));
6788
6789 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, isEH: true);
6790 return MCCFIInstruction::cfiDefCfa(L: nullptr, Register: DwarfReg, Offset: (int)Offset.getFixed());
6791}
6792
6793MCCFIInstruction
6794llvm::createCFAOffset(const TargetRegisterInfo &TRI, unsigned Reg,
6795 const StackOffset &OffsetFromDefCFA,
6796 std::optional<int64_t> IncomingVGOffsetFromDefCFA) {
6797 int64_t NumBytes, NumVGScaledBytes;
6798 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6799 Offset: OffsetFromDefCFA, ByteSized&: NumBytes, VGSized&: NumVGScaledBytes);
6800
6801 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, isEH: true);
6802
6803 // Non-scalable offsets can use DW_CFA_offset directly.
6804 if (!NumVGScaledBytes)
6805 return MCCFIInstruction::createOffset(L: nullptr, Register: DwarfReg, Offset: NumBytes);
6806
6807 std::string CommentBuffer;
6808 llvm::raw_string_ostream Comment(CommentBuffer);
6809 Comment << printReg(Reg, TRI: &TRI) << " @ cfa";
6810
6811 // Build up expression (CFA + VG * NumVGScaledBytes + NumBytes)
6812 assert(NumVGScaledBytes && "Expected scalable offset");
6813 SmallString<64> OffsetExpr;
6814 // + VG * NumVGScaledBytes
6815 StringRef VGRegScale;
6816 if (IncomingVGOffsetFromDefCFA) {
6817 appendLoadRegExpr(Expr&: OffsetExpr, OffsetFromDefCFA: *IncomingVGOffsetFromDefCFA);
6818 VGRegScale = "* IncomingVG";
6819 } else {
6820 appendReadRegExpr(Expr&: OffsetExpr, RegNum: TRI.getDwarfRegNum(Reg: AArch64::VG, isEH: true));
6821 VGRegScale = "* VG";
6822 }
6823 appendConstantExpr(Expr&: OffsetExpr, Constant: NumVGScaledBytes, Operation: dwarf::DW_OP_mul);
6824 appendOffsetComment(NumBytes: NumVGScaledBytes, Comment, RegScale: VGRegScale);
6825 OffsetExpr.push_back(Elt: dwarf::DW_OP_plus);
6826 if (NumBytes) {
6827 // + NumBytes
6828 appendOffsetComment(NumBytes, Comment);
6829 appendConstantExpr(Expr&: OffsetExpr, Constant: NumBytes, Operation: dwarf::DW_OP_plus);
6830 }
6831
6832 // Wrap this into DW_CFA_expression
6833 SmallString<64> CfaExpr;
6834 CfaExpr.push_back(Elt: dwarf::DW_CFA_expression);
6835 appendLEB128<LEB128Sign::Unsigned>(Buffer&: CfaExpr, Value: DwarfReg);
6836 appendLEB128<LEB128Sign::Unsigned>(Buffer&: CfaExpr, Value: OffsetExpr.size());
6837 CfaExpr.append(RHS: OffsetExpr.str());
6838
6839 return MCCFIInstruction::createEscape(L: nullptr, Vals: CfaExpr.str(), Loc: SMLoc(),
6840 Comment: Comment.str());
6841}
6842
6843// Helper function to emit a frame offset adjustment from a given
6844// pointer (SrcReg), stored into DestReg. This function is explicit
6845// in that it requires the opcode.
6846static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
6847 MachineBasicBlock::iterator MBBI,
6848 const DebugLoc &DL, unsigned DestReg,
6849 unsigned SrcReg, int64_t Offset, unsigned Opc,
6850 const TargetInstrInfo *TII,
6851 MachineInstr::MIFlag Flag, bool NeedsWinCFI,
6852 bool *HasWinCFI, bool EmitCFAOffset,
6853 StackOffset CFAOffset, unsigned FrameReg) {
6854 int Sign = 1;
6855 unsigned MaxEncoding, ShiftSize;
6856 switch (Opc) {
6857 case AArch64::ADDXri:
6858 case AArch64::ADDSXri:
6859 case AArch64::SUBXri:
6860 case AArch64::SUBSXri:
6861 MaxEncoding = 0xfff;
6862 ShiftSize = 12;
6863 break;
6864 case AArch64::ADDVL_XXI:
6865 case AArch64::ADDPL_XXI:
6866 case AArch64::ADDSVL_XXI:
6867 case AArch64::ADDSPL_XXI:
6868 MaxEncoding = 31;
6869 ShiftSize = 0;
6870 if (Offset < 0) {
6871 MaxEncoding = 32;
6872 Sign = -1;
6873 Offset = -Offset;
6874 }
6875 break;
6876 default:
6877 llvm_unreachable("Unsupported opcode");
6878 }
6879
6880 // `Offset` can be in bytes or in "scalable bytes".
6881 int VScale = 1;
6882 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
6883 VScale = 16;
6884 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
6885 VScale = 2;
6886
6887 // FIXME: If the offset won't fit in 24-bits, compute the offset into a
6888 // scratch register. If DestReg is a virtual register, use it as the
6889 // scratch register; otherwise, create a new virtual register (to be
6890 // replaced by the scavenger at the end of PEI). That case can be optimized
6891 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
6892 // register can be loaded with offset%8 and the add/sub can use an extending
6893 // instruction with LSL#3.
6894 // Currently the function handles any offsets but generates a poor sequence
6895 // of code.
6896 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
6897
6898 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
6899 Register TmpReg = DestReg;
6900 if (TmpReg == AArch64::XZR)
6901 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
6902 RegClass: &AArch64::GPR64RegClass);
6903 do {
6904 uint64_t ThisVal = std::min<uint64_t>(a: Offset, b: MaxEncodableValue);
6905 unsigned LocalShiftSize = 0;
6906 if (ThisVal > MaxEncoding) {
6907 ThisVal = ThisVal >> ShiftSize;
6908 LocalShiftSize = ShiftSize;
6909 }
6910 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
6911 "Encoding cannot handle value that big");
6912
6913 Offset -= ThisVal << LocalShiftSize;
6914 if (Offset == 0)
6915 TmpReg = DestReg;
6916 auto MBI = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: TmpReg)
6917 .addReg(RegNo: SrcReg)
6918 .addImm(Val: Sign * (int)ThisVal);
6919 if (ShiftSize)
6920 MBI = MBI.addImm(
6921 Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: LocalShiftSize));
6922 MBI = MBI.setMIFlag(Flag);
6923
6924 auto Change =
6925 VScale == 1
6926 ? StackOffset::getFixed(Fixed: ThisVal << LocalShiftSize)
6927 : StackOffset::getScalable(Scalable: VScale * (ThisVal << LocalShiftSize));
6928 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
6929 CFAOffset += Change;
6930 else
6931 CFAOffset -= Change;
6932 if (EmitCFAOffset && DestReg == TmpReg) {
6933 MachineFunction &MF = *MBB.getParent();
6934 const TargetSubtargetInfo &STI = MF.getSubtarget();
6935 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
6936
6937 unsigned CFIIndex = MF.addFrameInst(
6938 Inst: createDefCFA(TRI, FrameReg, Reg: DestReg, Offset: CFAOffset, LastAdjustmentWasScalable: VScale != 1));
6939 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::CFI_INSTRUCTION))
6940 .addCFIIndex(CFIIndex)
6941 .setMIFlags(Flag);
6942 }
6943
6944 if (NeedsWinCFI) {
6945 int Imm = (int)(ThisVal << LocalShiftSize);
6946 if (VScale != 1 && DestReg == AArch64::SP) {
6947 if (HasWinCFI)
6948 *HasWinCFI = true;
6949 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_AllocZ))
6950 .addImm(Val: ThisVal)
6951 .setMIFlag(Flag);
6952 } else if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
6953 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
6954 assert(VScale == 1 && "Expected non-scalable operation");
6955 if (HasWinCFI)
6956 *HasWinCFI = true;
6957 if (Imm == 0)
6958 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_SetFP)).setMIFlag(Flag);
6959 else
6960 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_AddFP))
6961 .addImm(Val: Imm)
6962 .setMIFlag(Flag);
6963 assert(Offset == 0 && "Expected remaining offset to be zero to "
6964 "emit a single SEH directive");
6965 } else if (DestReg == AArch64::SP) {
6966 assert(VScale == 1 && "Expected non-scalable operation");
6967 if (HasWinCFI)
6968 *HasWinCFI = true;
6969 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
6970 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::SEH_StackAlloc))
6971 .addImm(Val: Imm)
6972 .setMIFlag(Flag);
6973 }
6974 }
6975
6976 SrcReg = TmpReg;
6977 } while (Offset);
6978}
6979
6980void llvm::emitFrameOffset(MachineBasicBlock &MBB,
6981 MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
6982 unsigned DestReg, unsigned SrcReg,
6983 StackOffset Offset, const TargetInstrInfo *TII,
6984 MachineInstr::MIFlag Flag, bool SetNZCV,
6985 bool NeedsWinCFI, bool *HasWinCFI,
6986 bool EmitCFAOffset, StackOffset CFAOffset,
6987 unsigned FrameReg) {
6988 // If a function is marked as arm_locally_streaming, then the runtime value of
6989 // vscale in the prologue/epilogue is different the runtime value of vscale
6990 // in the function's body. To avoid having to consider multiple vscales,
6991 // we can use `addsvl` to allocate any scalable stack-slots, which under
6992 // most circumstances will be only locals, not callee-save slots.
6993 const Function &F = MBB.getParent()->getFunction();
6994 bool UseSVL = F.hasFnAttribute(Kind: "aarch64_pstate_sm_body");
6995
6996 int64_t Bytes, NumPredicateVectors, NumDataVectors;
6997 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6998 Offset, NumBytes&: Bytes, NumPredicateVectors, NumDataVectors);
6999
7000 // Insert ADDSXri for scalable offset at the end.
7001 bool NeedsFinalDefNZCV = SetNZCV && (NumPredicateVectors || NumDataVectors);
7002 if (NeedsFinalDefNZCV)
7003 SetNZCV = false;
7004
7005 // First emit non-scalable frame offsets, or a simple 'mov'.
7006 if (Bytes || (!Offset && SrcReg != DestReg)) {
7007 assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
7008 "SP increment/decrement not 8-byte aligned");
7009 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
7010 if (Bytes < 0) {
7011 Bytes = -Bytes;
7012 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
7013 }
7014 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Offset: Bytes, Opc, TII, Flag,
7015 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
7016 FrameReg);
7017 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
7018 ? StackOffset::getFixed(Fixed: -Bytes)
7019 : StackOffset::getFixed(Fixed: Bytes);
7020 SrcReg = DestReg;
7021 FrameReg = DestReg;
7022 }
7023
7024 assert(!(NeedsWinCFI && NumPredicateVectors) &&
7025 "WinCFI can't allocate fractions of an SVE data vector");
7026
7027 if (NumDataVectors) {
7028 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Offset: NumDataVectors,
7029 Opc: UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, TII,
7030 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
7031 FrameReg);
7032 CFAOffset += StackOffset::getScalable(Scalable: -NumDataVectors * 16);
7033 SrcReg = DestReg;
7034 }
7035
7036 if (NumPredicateVectors) {
7037 assert(DestReg != AArch64::SP && "Unaligned access to SP");
7038 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Offset: NumPredicateVectors,
7039 Opc: UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, TII,
7040 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
7041 FrameReg);
7042 }
7043
7044 if (NeedsFinalDefNZCV)
7045 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: AArch64::ADDSXri), DestReg)
7046 .addReg(RegNo: DestReg)
7047 .addImm(Val: 0)
7048 .addImm(Val: 0);
7049}
7050
7051MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
7052 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
7053 int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS,
7054 VirtRegMap *VRM) const {
7055 MachineBasicBlock::iterator InsertPt = MI;
7056 // This is a bit of a hack. Consider this instruction:
7057 //
7058 // %0 = COPY %sp; GPR64all:%0
7059 //
7060 // We explicitly chose GPR64all for the virtual register so such a copy might
7061 // be eliminated by RegisterCoalescer. However, that may not be possible, and
7062 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
7063 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
7064 //
7065 // To prevent that, we are going to constrain the %0 register class here.
7066 if (MI.isFullCopy()) {
7067 Register DstReg = MI.getOperand(i: 0).getReg();
7068 Register SrcReg = MI.getOperand(i: 1).getReg();
7069 if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
7070 MF.getRegInfo().constrainRegClass(Reg: DstReg, RC: &AArch64::GPR64RegClass);
7071 return nullptr;
7072 }
7073 if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
7074 MF.getRegInfo().constrainRegClass(Reg: SrcReg, RC: &AArch64::GPR64RegClass);
7075 return nullptr;
7076 }
7077 // Nothing can folded with copy from/to NZCV.
7078 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
7079 return nullptr;
7080 }
7081
7082 // Handle the case where a copy is being spilled or filled but the source
7083 // and destination register class don't match. For example:
7084 //
7085 // %0 = COPY %xzr; GPR64common:%0
7086 //
7087 // In this case we can still safely fold away the COPY and generate the
7088 // following spill code:
7089 //
7090 // STRXui %xzr, %stack.0
7091 //
7092 // This also eliminates spilled cross register class COPYs (e.g. between x and
7093 // d regs) of the same size. For example:
7094 //
7095 // %0 = COPY %1; GPR64:%0, FPR64:%1
7096 //
7097 // will be filled as
7098 //
7099 // LDRDui %0, fi<#0>
7100 //
7101 // instead of
7102 //
7103 // LDRXui %Temp, fi<#0>
7104 // %0 = FMOV %Temp
7105 //
7106 if (MI.isCopy() && Ops.size() == 1 &&
7107 // Make sure we're only folding the explicit COPY defs/uses.
7108 (Ops[0] == 0 || Ops[0] == 1)) {
7109 bool IsSpill = Ops[0] == 0;
7110 bool IsFill = !IsSpill;
7111 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7112 const MachineRegisterInfo &MRI = MF.getRegInfo();
7113 MachineBasicBlock &MBB = *MI.getParent();
7114 const MachineOperand &DstMO = MI.getOperand(i: 0);
7115 const MachineOperand &SrcMO = MI.getOperand(i: 1);
7116 Register DstReg = DstMO.getReg();
7117 Register SrcReg = SrcMO.getReg();
7118 // This is slightly expensive to compute for physical regs since
7119 // getMinimalPhysRegClass is slow.
7120 auto getRegClass = [&](unsigned Reg) {
7121 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
7122 : TRI.getMinimalPhysRegClass(Reg);
7123 };
7124
7125 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
7126 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
7127 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
7128 "Mismatched register size in non subreg COPY");
7129 if (IsSpill)
7130 storeRegToStackSlot(MBB, MBBI: InsertPt, SrcReg, isKill: SrcMO.isKill(), FI: FrameIndex,
7131 RC: getRegClass(SrcReg), VReg: Register());
7132 else
7133 loadRegFromStackSlot(MBB, MBBI: InsertPt, DestReg: DstReg, FI: FrameIndex,
7134 RC: getRegClass(DstReg), VReg: Register());
7135 return &*--InsertPt;
7136 }
7137
7138 // Handle cases like spilling def of:
7139 //
7140 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
7141 //
7142 // where the physical register source can be widened and stored to the full
7143 // virtual reg destination stack slot, in this case producing:
7144 //
7145 // STRXui %xzr, %stack.0
7146 //
7147 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
7148 TRI.getRegSizeInBits(RC: *getRegClass(DstReg)) == 64) {
7149 assert(SrcMO.getSubReg() == 0 &&
7150 "Unexpected subreg on physical register");
7151 storeRegToStackSlot(MBB, MBBI: InsertPt, SrcReg: AArch64::XZR, isKill: SrcMO.isKill(),
7152 FI: FrameIndex, RC: &AArch64::GPR64RegClass, VReg: Register());
7153 return &*--InsertPt;
7154 }
7155
7156 // Handle cases like filling use of:
7157 //
7158 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
7159 //
7160 // where we can load the full virtual reg source stack slot, into the subreg
7161 // destination, in this case producing:
7162 //
7163 // LDRWui %0:sub_32<def,read-undef>, %stack.0
7164 //
7165 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
7166 const TargetRegisterClass *FillRC = nullptr;
7167 switch (DstMO.getSubReg()) {
7168 default:
7169 break;
7170 case AArch64::sub_32:
7171 if (AArch64::GPR64RegClass.hasSubClassEq(RC: getRegClass(DstReg)))
7172 FillRC = &AArch64::GPR32RegClass;
7173 break;
7174 case AArch64::ssub:
7175 FillRC = &AArch64::FPR32RegClass;
7176 break;
7177 case AArch64::dsub:
7178 FillRC = &AArch64::FPR64RegClass;
7179 break;
7180 }
7181
7182 if (FillRC) {
7183 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
7184 TRI.getRegSizeInBits(*FillRC) &&
7185 "Mismatched regclass size on folded subreg COPY");
7186 loadRegFromStackSlot(MBB, MBBI: InsertPt, DestReg: DstReg, FI: FrameIndex, RC: FillRC,
7187 VReg: Register());
7188 MachineInstr &LoadMI = *--InsertPt;
7189 MachineOperand &LoadDst = LoadMI.getOperand(i: 0);
7190 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
7191 LoadDst.setSubReg(DstMO.getSubReg());
7192 LoadDst.setIsUndef();
7193 return &LoadMI;
7194 }
7195 }
7196 }
7197
7198 // Cannot fold.
7199 return nullptr;
7200}
7201
7202int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
7203 StackOffset &SOffset,
7204 bool *OutUseUnscaledOp,
7205 unsigned *OutUnscaledOp,
7206 int64_t *EmittableOffset) {
7207 // Set output values in case of early exit.
7208 if (EmittableOffset)
7209 *EmittableOffset = 0;
7210 if (OutUseUnscaledOp)
7211 *OutUseUnscaledOp = false;
7212 if (OutUnscaledOp)
7213 *OutUnscaledOp = 0;
7214
7215 // Exit early for structured vector spills/fills as they can't take an
7216 // immediate offset.
7217 switch (MI.getOpcode()) {
7218 default:
7219 break;
7220 case AArch64::LD1Rv1d:
7221 case AArch64::LD1Rv2s:
7222 case AArch64::LD1Rv2d:
7223 case AArch64::LD1Rv4h:
7224 case AArch64::LD1Rv4s:
7225 case AArch64::LD1Rv8b:
7226 case AArch64::LD1Rv8h:
7227 case AArch64::LD1Rv16b:
7228 case AArch64::LD1Twov2d:
7229 case AArch64::LD1Threev2d:
7230 case AArch64::LD1Fourv2d:
7231 case AArch64::LD1Twov1d:
7232 case AArch64::LD1Threev1d:
7233 case AArch64::LD1Fourv1d:
7234 case AArch64::ST1Twov2d:
7235 case AArch64::ST1Threev2d:
7236 case AArch64::ST1Fourv2d:
7237 case AArch64::ST1Twov1d:
7238 case AArch64::ST1Threev1d:
7239 case AArch64::ST1Fourv1d:
7240 case AArch64::ST1i8:
7241 case AArch64::ST1i16:
7242 case AArch64::ST1i32:
7243 case AArch64::ST1i64:
7244 case AArch64::IRG:
7245 case AArch64::IRGstack:
7246 case AArch64::STGloop:
7247 case AArch64::STZGloop:
7248 return AArch64FrameOffsetCannotUpdate;
7249 }
7250
7251 // Get the min/max offset and the scale.
7252 TypeSize ScaleValue(0U, false), Width(0U, false);
7253 int64_t MinOff, MaxOff;
7254 if (!AArch64InstrInfo::getMemOpInfo(Opcode: MI.getOpcode(), Scale&: ScaleValue, Width, MinOffset&: MinOff,
7255 MaxOffset&: MaxOff))
7256 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
7257
7258 // Construct the complete offset.
7259 bool IsMulVL = ScaleValue.isScalable();
7260 unsigned Scale = ScaleValue.getKnownMinValue();
7261 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
7262
7263 const MachineOperand &ImmOpnd =
7264 MI.getOperand(i: AArch64InstrInfo::getLoadStoreImmIdx(Opc: MI.getOpcode()));
7265 Offset += ImmOpnd.getImm() * Scale;
7266
7267 // If the offset doesn't match the scale, we rewrite the instruction to
7268 // use the unscaled instruction instead. Likewise, if we have a negative
7269 // offset and there is an unscaled op to use.
7270 std::optional<unsigned> UnscaledOp =
7271 AArch64InstrInfo::getUnscaledLdSt(Opc: MI.getOpcode());
7272 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
7273 if (useUnscaledOp &&
7274 !AArch64InstrInfo::getMemOpInfo(Opcode: *UnscaledOp, Scale&: ScaleValue, Width, MinOffset&: MinOff,
7275 MaxOffset&: MaxOff))
7276 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
7277
7278 Scale = ScaleValue.getKnownMinValue();
7279 assert(IsMulVL == ScaleValue.isScalable() &&
7280 "Unscaled opcode has different value for scalable");
7281
7282 int64_t Remainder = Offset % Scale;
7283 assert(!(Remainder && useUnscaledOp) &&
7284 "Cannot have remainder when using unscaled op");
7285
7286 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
7287 int64_t NewOffset = Offset / Scale;
7288 if (MinOff <= NewOffset && NewOffset <= MaxOff)
7289 Offset = Remainder;
7290 else {
7291 // Try to minimise the number of instructions required to materialise the
7292 // offset calculation. Specifically, for fixed offsets, if masking out the
7293 // low 12 bits leaves a legal add immediate, we can realise the offset
7294 // calculation with a single add instruction. Whenever this is possible,
7295 // prefer this split.
7296 int64_t HighPart = Offset & ~0xFFF;
7297 int64_t LowPart = Offset & 0xFFF;
7298 int64_t LowScaled = LowPart / Scale;
7299 if (!IsMulVL && NewOffset >= 0 && LowPart % Scale == 0 &&
7300 MinOff <= LowScaled && LowScaled <= MaxOff &&
7301 AArch64_AM::isLegalArithImmed(C: HighPart)) {
7302 NewOffset = LowScaled;
7303 Offset = HighPart;
7304 } else {
7305 // Default to a greedy split: take the memop immediate to be maximum /
7306 // minimum expressible offset and materialise the remainder.
7307 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
7308 Offset = Offset - (NewOffset * Scale);
7309 }
7310 }
7311
7312 if (EmittableOffset)
7313 *EmittableOffset = NewOffset;
7314 if (OutUseUnscaledOp)
7315 *OutUseUnscaledOp = useUnscaledOp;
7316 if (OutUnscaledOp && UnscaledOp)
7317 *OutUnscaledOp = *UnscaledOp;
7318
7319 if (IsMulVL)
7320 SOffset = StackOffset::get(Fixed: SOffset.getFixed(), Scalable: Offset);
7321 else
7322 SOffset = StackOffset::get(Fixed: Offset, Scalable: SOffset.getScalable());
7323 return AArch64FrameOffsetCanUpdate |
7324 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
7325}
7326
7327bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
7328 unsigned FrameReg, StackOffset &Offset,
7329 const AArch64InstrInfo *TII) {
7330 unsigned Opcode = MI.getOpcode();
7331 unsigned ImmIdx = FrameRegIdx + 1;
7332
7333 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
7334 Offset += StackOffset::getFixed(Fixed: MI.getOperand(i: ImmIdx).getImm());
7335 emitFrameOffset(MBB&: *MI.getParent(), MBBI: MI, DL: MI.getDebugLoc(),
7336 DestReg: MI.getOperand(i: 0).getReg(), SrcReg: FrameReg, Offset, TII,
7337 Flag: MachineInstr::NoFlags, SetNZCV: (Opcode == AArch64::ADDSXri));
7338 MI.eraseFromParent();
7339 Offset = StackOffset();
7340 return true;
7341 }
7342
7343 int64_t NewOffset;
7344 unsigned UnscaledOp;
7345 bool UseUnscaledOp;
7346 int Status = isAArch64FrameOffsetLegal(MI, SOffset&: Offset, OutUseUnscaledOp: &UseUnscaledOp,
7347 OutUnscaledOp: &UnscaledOp, EmittableOffset: &NewOffset);
7348 if (Status & AArch64FrameOffsetCanUpdate) {
7349 if (Status & AArch64FrameOffsetIsLegal)
7350 // Replace the FrameIndex with FrameReg.
7351 MI.getOperand(i: FrameRegIdx).ChangeToRegister(Reg: FrameReg, isDef: false);
7352 if (UseUnscaledOp)
7353 MI.setDesc(TII->get(Opcode: UnscaledOp));
7354
7355 MI.getOperand(i: ImmIdx).ChangeToImmediate(ImmVal: NewOffset);
7356 return !Offset;
7357 }
7358
7359 return false;
7360}
7361
7362void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB,
7363 MachineBasicBlock::iterator MI) const {
7364 DebugLoc DL;
7365 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: get(Opcode: AArch64::NOP));
7366}
7367
7368MCInst AArch64InstrInfo::getNop() const { return MCInstBuilder(AArch64::NOP); }
7369
7370// AArch64 supports MachineCombiner.
7371bool AArch64InstrInfo::useMachineCombiner() const { return true; }
7372
7373// True when Opc sets flag
7374static bool isCombineInstrSettingFlag(unsigned Opc) {
7375 switch (Opc) {
7376 case AArch64::ADDSWrr:
7377 case AArch64::ADDSWri:
7378 case AArch64::ADDSXrr:
7379 case AArch64::ADDSXri:
7380 case AArch64::SUBSWrr:
7381 case AArch64::SUBSXrr:
7382 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7383 case AArch64::SUBSWri:
7384 case AArch64::SUBSXri:
7385 return true;
7386 default:
7387 break;
7388 }
7389 return false;
7390}
7391
7392// 32b Opcodes that can be combined with a MUL
7393static bool isCombineInstrCandidate32(unsigned Opc) {
7394 switch (Opc) {
7395 case AArch64::ADDWrr:
7396 case AArch64::ADDWri:
7397 case AArch64::SUBWrr:
7398 case AArch64::ADDSWrr:
7399 case AArch64::ADDSWri:
7400 case AArch64::SUBSWrr:
7401 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7402 case AArch64::SUBWri:
7403 case AArch64::SUBSWri:
7404 return true;
7405 default:
7406 break;
7407 }
7408 return false;
7409}
7410
7411// 64b Opcodes that can be combined with a MUL
7412static bool isCombineInstrCandidate64(unsigned Opc) {
7413 switch (Opc) {
7414 case AArch64::ADDXrr:
7415 case AArch64::ADDXri:
7416 case AArch64::SUBXrr:
7417 case AArch64::ADDSXrr:
7418 case AArch64::ADDSXri:
7419 case AArch64::SUBSXrr:
7420 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7421 case AArch64::SUBXri:
7422 case AArch64::SUBSXri:
7423 case AArch64::ADDv8i8:
7424 case AArch64::ADDv16i8:
7425 case AArch64::ADDv4i16:
7426 case AArch64::ADDv8i16:
7427 case AArch64::ADDv2i32:
7428 case AArch64::ADDv4i32:
7429 case AArch64::SUBv8i8:
7430 case AArch64::SUBv16i8:
7431 case AArch64::SUBv4i16:
7432 case AArch64::SUBv8i16:
7433 case AArch64::SUBv2i32:
7434 case AArch64::SUBv4i32:
7435 return true;
7436 default:
7437 break;
7438 }
7439 return false;
7440}
7441
7442// FP Opcodes that can be combined with a FMUL.
7443static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
7444 switch (Inst.getOpcode()) {
7445 default:
7446 break;
7447 case AArch64::FADDHrr:
7448 case AArch64::FADDSrr:
7449 case AArch64::FADDDrr:
7450 case AArch64::FADDv4f16:
7451 case AArch64::FADDv8f16:
7452 case AArch64::FADDv2f32:
7453 case AArch64::FADDv2f64:
7454 case AArch64::FADDv4f32:
7455 case AArch64::FSUBHrr:
7456 case AArch64::FSUBSrr:
7457 case AArch64::FSUBDrr:
7458 case AArch64::FSUBv4f16:
7459 case AArch64::FSUBv8f16:
7460 case AArch64::FSUBv2f32:
7461 case AArch64::FSUBv2f64:
7462 case AArch64::FSUBv4f32:
7463 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
7464 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
7465 // the target options or if FADD/FSUB has the contract fast-math flag.
7466 return Options.AllowFPOpFusion == FPOpFusion::Fast ||
7467 Inst.getFlag(Flag: MachineInstr::FmContract);
7468 }
7469 return false;
7470}
7471
7472// Opcodes that can be combined with a MUL
7473static bool isCombineInstrCandidate(unsigned Opc) {
7474 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
7475}
7476
7477//
7478// Utility routine that checks if \param MO is defined by an
7479// \param CombineOpc instruction in the basic block \param MBB
7480static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
7481 unsigned CombineOpc, unsigned ZeroReg = 0,
7482 bool CheckZeroReg = false) {
7483 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7484 MachineInstr *MI = nullptr;
7485
7486 if (MO.isReg() && MO.getReg().isVirtual())
7487 MI = MRI.getUniqueVRegDef(Reg: MO.getReg());
7488 // And it needs to be in the trace (otherwise, it won't have a depth).
7489 if (!MI || MI->getParent() != &MBB || MI->getOpcode() != CombineOpc)
7490 return false;
7491 // Must only used by the user we combine with.
7492 if (!MRI.hasOneNonDBGUse(RegNo: MI->getOperand(i: 0).getReg()))
7493 return false;
7494
7495 if (CheckZeroReg) {
7496 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
7497 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
7498 MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
7499 // The third input reg must be zero.
7500 if (MI->getOperand(i: 3).getReg() != ZeroReg)
7501 return false;
7502 }
7503
7504 if (isCombineInstrSettingFlag(Opc: CombineOpc) &&
7505 MI->findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true) == -1)
7506 return false;
7507
7508 return true;
7509}
7510
7511//
7512// Is \param MO defined by an integer multiply and can be combined?
7513static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
7514 unsigned MulOpc, unsigned ZeroReg) {
7515 return canCombine(MBB, MO, CombineOpc: MulOpc, ZeroReg, CheckZeroReg: true);
7516}
7517
7518//
7519// Is \param MO defined by a floating-point multiply and can be combined?
7520static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
7521 unsigned MulOpc) {
7522 return canCombine(MBB, MO, CombineOpc: MulOpc);
7523}
7524
7525// TODO: There are many more machine instruction opcodes to match:
7526// 1. Other data types (integer, vectors)
7527// 2. Other math / logic operations (xor, or)
7528// 3. Other forms of the same operation (intrinsics and other variants)
7529bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
7530 bool Invert) const {
7531 if (Invert)
7532 return false;
7533 switch (Inst.getOpcode()) {
7534 // == Floating-point types ==
7535 // -- Floating-point instructions --
7536 case AArch64::FADDHrr:
7537 case AArch64::FADDSrr:
7538 case AArch64::FADDDrr:
7539 case AArch64::FMULHrr:
7540 case AArch64::FMULSrr:
7541 case AArch64::FMULDrr:
7542 case AArch64::FMULX16:
7543 case AArch64::FMULX32:
7544 case AArch64::FMULX64:
7545 // -- Advanced SIMD instructions --
7546 case AArch64::FADDv4f16:
7547 case AArch64::FADDv8f16:
7548 case AArch64::FADDv2f32:
7549 case AArch64::FADDv4f32:
7550 case AArch64::FADDv2f64:
7551 case AArch64::FMULv4f16:
7552 case AArch64::FMULv8f16:
7553 case AArch64::FMULv2f32:
7554 case AArch64::FMULv4f32:
7555 case AArch64::FMULv2f64:
7556 case AArch64::FMULXv4f16:
7557 case AArch64::FMULXv8f16:
7558 case AArch64::FMULXv2f32:
7559 case AArch64::FMULXv4f32:
7560 case AArch64::FMULXv2f64:
7561 // -- SVE instructions --
7562 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
7563 // in the SVE instruction set (though there are predicated ones).
7564 case AArch64::FADD_ZZZ_H:
7565 case AArch64::FADD_ZZZ_S:
7566 case AArch64::FADD_ZZZ_D:
7567 case AArch64::FMUL_ZZZ_H:
7568 case AArch64::FMUL_ZZZ_S:
7569 case AArch64::FMUL_ZZZ_D:
7570 return Inst.getFlag(Flag: MachineInstr::MIFlag::FmReassoc) &&
7571 Inst.getFlag(Flag: MachineInstr::MIFlag::FmNsz);
7572
7573 // == Integer types ==
7574 // -- Base instructions --
7575 // Opcodes MULWrr and MULXrr don't exist because
7576 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
7577 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
7578 // The machine-combiner does not support three-source-operands machine
7579 // instruction. So we cannot reassociate MULs.
7580 case AArch64::ADDWrr:
7581 case AArch64::ADDXrr:
7582 case AArch64::ANDWrr:
7583 case AArch64::ANDXrr:
7584 case AArch64::ORRWrr:
7585 case AArch64::ORRXrr:
7586 case AArch64::EORWrr:
7587 case AArch64::EORXrr:
7588 case AArch64::EONWrr:
7589 case AArch64::EONXrr:
7590 // -- Advanced SIMD instructions --
7591 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
7592 // in the Advanced SIMD instruction set.
7593 case AArch64::ADDv8i8:
7594 case AArch64::ADDv16i8:
7595 case AArch64::ADDv4i16:
7596 case AArch64::ADDv8i16:
7597 case AArch64::ADDv2i32:
7598 case AArch64::ADDv4i32:
7599 case AArch64::ADDv1i64:
7600 case AArch64::ADDv2i64:
7601 case AArch64::MULv8i8:
7602 case AArch64::MULv16i8:
7603 case AArch64::MULv4i16:
7604 case AArch64::MULv8i16:
7605 case AArch64::MULv2i32:
7606 case AArch64::MULv4i32:
7607 case AArch64::ANDv8i8:
7608 case AArch64::ANDv16i8:
7609 case AArch64::ORRv8i8:
7610 case AArch64::ORRv16i8:
7611 case AArch64::EORv8i8:
7612 case AArch64::EORv16i8:
7613 // -- SVE instructions --
7614 case AArch64::ADD_ZZZ_B:
7615 case AArch64::ADD_ZZZ_H:
7616 case AArch64::ADD_ZZZ_S:
7617 case AArch64::ADD_ZZZ_D:
7618 case AArch64::MUL_ZZZ_B:
7619 case AArch64::MUL_ZZZ_H:
7620 case AArch64::MUL_ZZZ_S:
7621 case AArch64::MUL_ZZZ_D:
7622 case AArch64::AND_ZZZ:
7623 case AArch64::ORR_ZZZ:
7624 case AArch64::EOR_ZZZ:
7625 return true;
7626
7627 default:
7628 return false;
7629 }
7630}
7631
7632/// Find instructions that can be turned into madd.
7633static bool getMaddPatterns(MachineInstr &Root,
7634 SmallVectorImpl<unsigned> &Patterns) {
7635 unsigned Opc = Root.getOpcode();
7636 MachineBasicBlock &MBB = *Root.getParent();
7637 bool Found = false;
7638
7639 if (!isCombineInstrCandidate(Opc))
7640 return false;
7641 if (isCombineInstrSettingFlag(Opc)) {
7642 int Cmp_NZCV =
7643 Root.findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true);
7644 // When NZCV is live bail out.
7645 if (Cmp_NZCV == -1)
7646 return false;
7647 unsigned NewOpc = convertToNonFlagSettingOpc(MI: Root);
7648 // When opcode can't change bail out.
7649 // CHECKME: do we miss any cases for opcode conversion?
7650 if (NewOpc == Opc)
7651 return false;
7652 Opc = NewOpc;
7653 }
7654
7655 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
7656 unsigned Pattern) {
7657 if (canCombineWithMUL(MBB, MO&: Root.getOperand(i: Operand), MulOpc: Opcode, ZeroReg)) {
7658 Patterns.push_back(Elt: Pattern);
7659 Found = true;
7660 }
7661 };
7662
7663 auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
7664 if (canCombine(MBB, MO&: Root.getOperand(i: Operand), CombineOpc: Opcode)) {
7665 Patterns.push_back(Elt: Pattern);
7666 Found = true;
7667 }
7668 };
7669
7670 typedef AArch64MachineCombinerPattern MCP;
7671
7672 switch (Opc) {
7673 default:
7674 break;
7675 case AArch64::ADDWrr:
7676 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7677 "ADDWrr does not have register operands");
7678 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
7679 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
7680 break;
7681 case AArch64::ADDXrr:
7682 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
7683 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
7684 break;
7685 case AArch64::SUBWrr:
7686 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
7687 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
7688 break;
7689 case AArch64::SUBXrr:
7690 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
7691 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
7692 break;
7693 case AArch64::ADDWri:
7694 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
7695 break;
7696 case AArch64::ADDXri:
7697 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
7698 break;
7699 case AArch64::SUBWri:
7700 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
7701 break;
7702 case AArch64::SUBXri:
7703 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
7704 break;
7705 case AArch64::ADDv8i8:
7706 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
7707 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
7708 break;
7709 case AArch64::ADDv16i8:
7710 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
7711 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
7712 break;
7713 case AArch64::ADDv4i16:
7714 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
7715 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
7716 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
7717 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
7718 break;
7719 case AArch64::ADDv8i16:
7720 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
7721 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
7722 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
7723 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
7724 break;
7725 case AArch64::ADDv2i32:
7726 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
7727 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
7728 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
7729 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
7730 break;
7731 case AArch64::ADDv4i32:
7732 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
7733 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
7734 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
7735 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
7736 break;
7737 case AArch64::SUBv8i8:
7738 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
7739 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
7740 break;
7741 case AArch64::SUBv16i8:
7742 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
7743 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
7744 break;
7745 case AArch64::SUBv4i16:
7746 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
7747 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
7748 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
7749 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
7750 break;
7751 case AArch64::SUBv8i16:
7752 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
7753 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
7754 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
7755 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
7756 break;
7757 case AArch64::SUBv2i32:
7758 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
7759 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
7760 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
7761 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
7762 break;
7763 case AArch64::SUBv4i32:
7764 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
7765 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
7766 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
7767 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
7768 break;
7769 }
7770 return Found;
7771}
7772
7773bool AArch64InstrInfo::isAccumulationOpcode(unsigned Opcode) const {
7774 switch (Opcode) {
7775 default:
7776 break;
7777 case AArch64::UABALB_ZZZ_D:
7778 case AArch64::UABALB_ZZZ_H:
7779 case AArch64::UABALB_ZZZ_S:
7780 case AArch64::UABALT_ZZZ_D:
7781 case AArch64::UABALT_ZZZ_H:
7782 case AArch64::UABALT_ZZZ_S:
7783 case AArch64::SABALB_ZZZ_D:
7784 case AArch64::SABALB_ZZZ_S:
7785 case AArch64::SABALB_ZZZ_H:
7786 case AArch64::SABALT_ZZZ_D:
7787 case AArch64::SABALT_ZZZ_S:
7788 case AArch64::SABALT_ZZZ_H:
7789 case AArch64::UABALv16i8_v8i16:
7790 case AArch64::UABALv2i32_v2i64:
7791 case AArch64::UABALv4i16_v4i32:
7792 case AArch64::UABALv4i32_v2i64:
7793 case AArch64::UABALv8i16_v4i32:
7794 case AArch64::UABALv8i8_v8i16:
7795 case AArch64::UABAv16i8:
7796 case AArch64::UABAv2i32:
7797 case AArch64::UABAv4i16:
7798 case AArch64::UABAv4i32:
7799 case AArch64::UABAv8i16:
7800 case AArch64::UABAv8i8:
7801 case AArch64::SABALv16i8_v8i16:
7802 case AArch64::SABALv2i32_v2i64:
7803 case AArch64::SABALv4i16_v4i32:
7804 case AArch64::SABALv4i32_v2i64:
7805 case AArch64::SABALv8i16_v4i32:
7806 case AArch64::SABALv8i8_v8i16:
7807 case AArch64::SABAv16i8:
7808 case AArch64::SABAv2i32:
7809 case AArch64::SABAv4i16:
7810 case AArch64::SABAv4i32:
7811 case AArch64::SABAv8i16:
7812 case AArch64::SABAv8i8:
7813 return true;
7814 }
7815
7816 return false;
7817}
7818
7819unsigned AArch64InstrInfo::getAccumulationStartOpcode(
7820 unsigned AccumulationOpcode) const {
7821 switch (AccumulationOpcode) {
7822 default:
7823 llvm_unreachable("Unsupported accumulation Opcode!");
7824 case AArch64::UABALB_ZZZ_D:
7825 return AArch64::UABDLB_ZZZ_D;
7826 case AArch64::UABALB_ZZZ_H:
7827 return AArch64::UABDLB_ZZZ_H;
7828 case AArch64::UABALB_ZZZ_S:
7829 return AArch64::UABDLB_ZZZ_S;
7830 case AArch64::UABALT_ZZZ_D:
7831 return AArch64::UABDLT_ZZZ_D;
7832 case AArch64::UABALT_ZZZ_H:
7833 return AArch64::UABDLT_ZZZ_H;
7834 case AArch64::UABALT_ZZZ_S:
7835 return AArch64::UABDLT_ZZZ_S;
7836 case AArch64::UABALv16i8_v8i16:
7837 return AArch64::UABDLv16i8_v8i16;
7838 case AArch64::UABALv2i32_v2i64:
7839 return AArch64::UABDLv2i32_v2i64;
7840 case AArch64::UABALv4i16_v4i32:
7841 return AArch64::UABDLv4i16_v4i32;
7842 case AArch64::UABALv4i32_v2i64:
7843 return AArch64::UABDLv4i32_v2i64;
7844 case AArch64::UABALv8i16_v4i32:
7845 return AArch64::UABDLv8i16_v4i32;
7846 case AArch64::UABALv8i8_v8i16:
7847 return AArch64::UABDLv8i8_v8i16;
7848 case AArch64::UABAv16i8:
7849 return AArch64::UABDv16i8;
7850 case AArch64::UABAv2i32:
7851 return AArch64::UABDv2i32;
7852 case AArch64::UABAv4i16:
7853 return AArch64::UABDv4i16;
7854 case AArch64::UABAv4i32:
7855 return AArch64::UABDv4i32;
7856 case AArch64::UABAv8i16:
7857 return AArch64::UABDv8i16;
7858 case AArch64::UABAv8i8:
7859 return AArch64::UABDv8i8;
7860 case AArch64::SABALB_ZZZ_D:
7861 return AArch64::SABDLB_ZZZ_D;
7862 case AArch64::SABALB_ZZZ_S:
7863 return AArch64::SABDLB_ZZZ_S;
7864 case AArch64::SABALB_ZZZ_H:
7865 return AArch64::SABDLB_ZZZ_H;
7866 case AArch64::SABALT_ZZZ_D:
7867 return AArch64::SABDLT_ZZZ_D;
7868 case AArch64::SABALT_ZZZ_S:
7869 return AArch64::SABDLT_ZZZ_S;
7870 case AArch64::SABALT_ZZZ_H:
7871 return AArch64::SABDLT_ZZZ_H;
7872 case AArch64::SABALv16i8_v8i16:
7873 return AArch64::SABDLv16i8_v8i16;
7874 case AArch64::SABALv2i32_v2i64:
7875 return AArch64::SABDLv2i32_v2i64;
7876 case AArch64::SABALv4i16_v4i32:
7877 return AArch64::SABDLv4i16_v4i32;
7878 case AArch64::SABALv4i32_v2i64:
7879 return AArch64::SABDLv4i32_v2i64;
7880 case AArch64::SABALv8i16_v4i32:
7881 return AArch64::SABDLv8i16_v4i32;
7882 case AArch64::SABALv8i8_v8i16:
7883 return AArch64::SABDLv8i8_v8i16;
7884 case AArch64::SABAv16i8:
7885 return AArch64::SABDv16i8;
7886 case AArch64::SABAv2i32:
7887 return AArch64::SABAv2i32;
7888 case AArch64::SABAv4i16:
7889 return AArch64::SABDv4i16;
7890 case AArch64::SABAv4i32:
7891 return AArch64::SABDv4i32;
7892 case AArch64::SABAv8i16:
7893 return AArch64::SABDv8i16;
7894 case AArch64::SABAv8i8:
7895 return AArch64::SABDv8i8;
7896 }
7897}
7898
7899/// Floating-Point Support
7900
7901/// Find instructions that can be turned into madd.
7902static bool getFMAPatterns(MachineInstr &Root,
7903 SmallVectorImpl<unsigned> &Patterns) {
7904
7905 if (!isCombineInstrCandidateFP(Inst: Root))
7906 return false;
7907
7908 MachineBasicBlock &MBB = *Root.getParent();
7909 bool Found = false;
7910
7911 auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
7912 if (canCombineWithFMUL(MBB, MO&: Root.getOperand(i: Operand), MulOpc: Opcode)) {
7913 Patterns.push_back(Elt: Pattern);
7914 return true;
7915 }
7916 return false;
7917 };
7918
7919 typedef AArch64MachineCombinerPattern MCP;
7920
7921 switch (Root.getOpcode()) {
7922 default:
7923 assert(false && "Unsupported FP instruction in combiner\n");
7924 break;
7925 case AArch64::FADDHrr:
7926 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7927 "FADDHrr does not have register operands");
7928
7929 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
7930 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
7931 break;
7932 case AArch64::FADDSrr:
7933 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7934 "FADDSrr does not have register operands");
7935
7936 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
7937 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
7938
7939 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
7940 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
7941 break;
7942 case AArch64::FADDDrr:
7943 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
7944 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
7945
7946 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
7947 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
7948 break;
7949 case AArch64::FADDv4f16:
7950 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
7951 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
7952
7953 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
7954 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
7955 break;
7956 case AArch64::FADDv8f16:
7957 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
7958 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
7959
7960 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
7961 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
7962 break;
7963 case AArch64::FADDv2f32:
7964 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
7965 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
7966
7967 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
7968 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
7969 break;
7970 case AArch64::FADDv2f64:
7971 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
7972 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
7973
7974 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
7975 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
7976 break;
7977 case AArch64::FADDv4f32:
7978 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
7979 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
7980
7981 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
7982 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
7983 break;
7984 case AArch64::FSUBHrr:
7985 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
7986 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
7987 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
7988 break;
7989 case AArch64::FSUBSrr:
7990 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
7991
7992 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
7993 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
7994
7995 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
7996 break;
7997 case AArch64::FSUBDrr:
7998 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
7999
8000 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
8001 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
8002
8003 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
8004 break;
8005 case AArch64::FSUBv4f16:
8006 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
8007 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
8008
8009 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
8010 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
8011 break;
8012 case AArch64::FSUBv8f16:
8013 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
8014 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
8015
8016 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
8017 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
8018 break;
8019 case AArch64::FSUBv2f32:
8020 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
8021 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
8022
8023 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
8024 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
8025 break;
8026 case AArch64::FSUBv2f64:
8027 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
8028 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
8029
8030 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
8031 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
8032 break;
8033 case AArch64::FSUBv4f32:
8034 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
8035 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
8036
8037 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
8038 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
8039 break;
8040 }
8041 return Found;
8042}
8043
8044static bool getFMULPatterns(MachineInstr &Root,
8045 SmallVectorImpl<unsigned> &Patterns) {
8046 MachineBasicBlock &MBB = *Root.getParent();
8047 bool Found = false;
8048
8049 auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
8050 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8051 MachineOperand &MO = Root.getOperand(i: Operand);
8052 MachineInstr *MI = nullptr;
8053 if (MO.isReg() && MO.getReg().isVirtual())
8054 MI = MRI.getUniqueVRegDef(Reg: MO.getReg());
8055 // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
8056 if (MI && MI->getOpcode() == TargetOpcode::COPY &&
8057 MI->getOperand(i: 1).getReg().isVirtual())
8058 MI = MRI.getUniqueVRegDef(Reg: MI->getOperand(i: 1).getReg());
8059 if (MI && MI->getOpcode() == Opcode) {
8060 Patterns.push_back(Elt: Pattern);
8061 return true;
8062 }
8063 return false;
8064 };
8065
8066 typedef AArch64MachineCombinerPattern MCP;
8067
8068 switch (Root.getOpcode()) {
8069 default:
8070 return false;
8071 case AArch64::FMULv2f32:
8072 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
8073 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
8074 break;
8075 case AArch64::FMULv2f64:
8076 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
8077 Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
8078 break;
8079 case AArch64::FMULv4f16:
8080 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
8081 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
8082 break;
8083 case AArch64::FMULv4f32:
8084 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
8085 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
8086 break;
8087 case AArch64::FMULv8f16:
8088 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
8089 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
8090 break;
8091 }
8092
8093 return Found;
8094}
8095
8096static bool getFNEGPatterns(MachineInstr &Root,
8097 SmallVectorImpl<unsigned> &Patterns) {
8098 unsigned Opc = Root.getOpcode();
8099 MachineBasicBlock &MBB = *Root.getParent();
8100 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8101
8102 auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
8103 MachineOperand &MO = Root.getOperand(i: 1);
8104 MachineInstr *MI = MRI.getUniqueVRegDef(Reg: MO.getReg());
8105 if (MI != nullptr && (MI->getOpcode() == Opcode) &&
8106 MRI.hasOneNonDBGUse(RegNo: MI->getOperand(i: 0).getReg()) &&
8107 Root.getFlag(Flag: MachineInstr::MIFlag::FmContract) &&
8108 Root.getFlag(Flag: MachineInstr::MIFlag::FmNsz) &&
8109 MI->getFlag(Flag: MachineInstr::MIFlag::FmContract) &&
8110 MI->getFlag(Flag: MachineInstr::MIFlag::FmNsz)) {
8111 Patterns.push_back(Elt: Pattern);
8112 return true;
8113 }
8114 return false;
8115 };
8116
8117 switch (Opc) {
8118 default:
8119 break;
8120 case AArch64::FNEGDr:
8121 return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
8122 case AArch64::FNEGSr:
8123 return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
8124 }
8125
8126 return false;
8127}
8128
8129/// Return true when a code sequence can improve throughput. It
8130/// should be called only for instructions in loops.
8131/// \param Pattern - combiner pattern
8132bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
8133 switch (Pattern) {
8134 default:
8135 break;
8136 case AArch64MachineCombinerPattern::FMULADDH_OP1:
8137 case AArch64MachineCombinerPattern::FMULADDH_OP2:
8138 case AArch64MachineCombinerPattern::FMULSUBH_OP1:
8139 case AArch64MachineCombinerPattern::FMULSUBH_OP2:
8140 case AArch64MachineCombinerPattern::FMULADDS_OP1:
8141 case AArch64MachineCombinerPattern::FMULADDS_OP2:
8142 case AArch64MachineCombinerPattern::FMULSUBS_OP1:
8143 case AArch64MachineCombinerPattern::FMULSUBS_OP2:
8144 case AArch64MachineCombinerPattern::FMULADDD_OP1:
8145 case AArch64MachineCombinerPattern::FMULADDD_OP2:
8146 case AArch64MachineCombinerPattern::FMULSUBD_OP1:
8147 case AArch64MachineCombinerPattern::FMULSUBD_OP2:
8148 case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
8149 case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
8150 case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
8151 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
8152 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
8153 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
8154 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
8155 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
8156 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
8157 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
8158 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
8159 case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
8160 case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
8161 case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
8162 case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
8163 case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
8164 case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
8165 case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
8166 case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
8167 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
8168 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
8169 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
8170 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
8171 case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
8172 case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
8173 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
8174 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
8175 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1:
8176 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
8177 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1:
8178 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
8179 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
8180 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
8181 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
8182 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
8183 case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
8184 case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
8185 case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
8186 case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
8187 case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
8188 case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
8189 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
8190 case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
8191 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
8192 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2:
8193 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
8194 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2:
8195 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
8196 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2:
8197 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
8198 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2:
8199 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
8200 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2:
8201 case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
8202 case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
8203 case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
8204 case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
8205 case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
8206 case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
8207 case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
8208 case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
8209 case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
8210 case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
8211 case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
8212 case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
8213 case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
8214 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
8215 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
8216 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
8217 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
8218 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
8219 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
8220 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
8221 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
8222 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
8223 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
8224 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
8225 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
8226 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
8227 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
8228 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
8229 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
8230 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
8231 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
8232 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
8233 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
8234 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
8235 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
8236 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
8237 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
8238 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
8239 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
8240 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
8241 return true;
8242 } // end switch (Pattern)
8243 return false;
8244}
8245
8246/// Find other MI combine patterns.
8247static bool getMiscPatterns(MachineInstr &Root,
8248 SmallVectorImpl<unsigned> &Patterns) {
8249 // A - (B + C) ==> (A - B) - C or (A - C) - B
8250 unsigned Opc = Root.getOpcode();
8251 MachineBasicBlock &MBB = *Root.getParent();
8252
8253 switch (Opc) {
8254 case AArch64::SUBWrr:
8255 case AArch64::SUBSWrr:
8256 case AArch64::SUBXrr:
8257 case AArch64::SUBSXrr:
8258 // Found candidate root.
8259 break;
8260 default:
8261 return false;
8262 }
8263
8264 if (isCombineInstrSettingFlag(Opc) &&
8265 Root.findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr, isDead: true) ==
8266 -1)
8267 return false;
8268
8269 if (canCombine(MBB, MO&: Root.getOperand(i: 2), CombineOpc: AArch64::ADDWrr) ||
8270 canCombine(MBB, MO&: Root.getOperand(i: 2), CombineOpc: AArch64::ADDSWrr) ||
8271 canCombine(MBB, MO&: Root.getOperand(i: 2), CombineOpc: AArch64::ADDXrr) ||
8272 canCombine(MBB, MO&: Root.getOperand(i: 2), CombineOpc: AArch64::ADDSXrr)) {
8273 Patterns.push_back(Elt: AArch64MachineCombinerPattern::SUBADD_OP1);
8274 Patterns.push_back(Elt: AArch64MachineCombinerPattern::SUBADD_OP2);
8275 return true;
8276 }
8277
8278 return false;
8279}
8280
8281/// Check if the given instruction forms a gather load pattern that can be
8282/// optimized for better Memory-Level Parallelism (MLP). This function
8283/// identifies chains of NEON lane load instructions that load data from
8284/// different memory addresses into individual lanes of a 128-bit vector
8285/// register, then attempts to split the pattern into parallel loads to break
8286/// the serial dependency between instructions.
8287///
8288/// Pattern Matched:
8289/// Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) ->
8290/// LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root)
8291///
8292/// Transformed Into:
8293/// Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64
8294/// to combine the results, enabling better memory-level parallelism.
8295///
8296/// Supported Element Types:
8297/// - 32-bit elements (LD1i32, 4 lanes total)
8298/// - 16-bit elements (LD1i16, 8 lanes total)
8299/// - 8-bit elements (LD1i8, 16 lanes total)
8300static bool getGatherLanePattern(MachineInstr &Root,
8301 SmallVectorImpl<unsigned> &Patterns,
8302 unsigned LoadLaneOpCode, unsigned NumLanes) {
8303 const MachineFunction *MF = Root.getMF();
8304
8305 // Early exit if optimizing for size.
8306 if (MF->getFunction().hasMinSize())
8307 return false;
8308
8309 const MachineRegisterInfo &MRI = MF->getRegInfo();
8310 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
8311
8312 // The root of the pattern must load into the last lane of the vector.
8313 if (Root.getOperand(i: 2).getImm() != NumLanes - 1)
8314 return false;
8315
8316 // Check that we have load into all lanes except lane 0.
8317 // For each load we also want to check that:
8318 // 1. It has a single non-debug use (since we will be replacing the virtual
8319 // register)
8320 // 2. That the addressing mode only uses a single pointer operand
8321 auto *CurrInstr = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: 1).getReg());
8322 auto Range = llvm::seq<unsigned>(Begin: 1, End: NumLanes - 1);
8323 SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end());
8324 SmallVector<const MachineInstr *, 16> LoadInstrs;
8325 while (!RemainingLanes.empty() && CurrInstr &&
8326 CurrInstr->getOpcode() == LoadLaneOpCode &&
8327 MRI.hasOneNonDBGUse(RegNo: CurrInstr->getOperand(i: 0).getReg()) &&
8328 CurrInstr->getNumOperands() == 4) {
8329 RemainingLanes.erase(V: CurrInstr->getOperand(i: 2).getImm());
8330 LoadInstrs.push_back(Elt: CurrInstr);
8331 CurrInstr = MRI.getUniqueVRegDef(Reg: CurrInstr->getOperand(i: 1).getReg());
8332 }
8333
8334 // Check that we have found a match for lanes N-1.. 1.
8335 if (!RemainingLanes.empty())
8336 return false;
8337
8338 // Match the SUBREG_TO_REG sequence.
8339 if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
8340 return false;
8341
8342 // Verify that the subreg to reg loads an integer into the first lane.
8343 auto Lane0LoadReg = CurrInstr->getOperand(i: 1).getReg();
8344 unsigned SingleLaneSizeInBits = 128 / NumLanes;
8345 if (TRI->getRegSizeInBits(Reg: Lane0LoadReg, MRI) != SingleLaneSizeInBits)
8346 return false;
8347
8348 // Verify that it also has a single non debug use.
8349 if (!MRI.hasOneNonDBGUse(RegNo: Lane0LoadReg))
8350 return false;
8351
8352 LoadInstrs.push_back(Elt: MRI.getUniqueVRegDef(Reg: Lane0LoadReg));
8353
8354 // If there is any chance of aliasing, do not apply the pattern.
8355 // Walk backward through the MBB starting from Root.
8356 // Exit early if we've encountered all load instructions or hit the search
8357 // limit.
8358 auto MBBItr = Root.getIterator();
8359 unsigned RemainingSteps = GatherOptSearchLimit;
8360 SmallPtrSet<const MachineInstr *, 16> RemainingLoadInstrs;
8361 RemainingLoadInstrs.insert(I: LoadInstrs.begin(), E: LoadInstrs.end());
8362 const MachineBasicBlock *MBB = Root.getParent();
8363
8364 for (; MBBItr != MBB->begin() && RemainingSteps > 0 &&
8365 !RemainingLoadInstrs.empty();
8366 --MBBItr, --RemainingSteps) {
8367 const MachineInstr &CurrInstr = *MBBItr;
8368
8369 // Remove this instruction from remaining loads if it's one we're tracking.
8370 RemainingLoadInstrs.erase(Ptr: &CurrInstr);
8371
8372 // Check for potential aliasing with any of the load instructions to
8373 // optimize.
8374 if (CurrInstr.isLoadFoldBarrier())
8375 return false;
8376 }
8377
8378 // If we hit the search limit without finding all load instructions,
8379 // don't match the pattern.
8380 if (RemainingSteps == 0 && !RemainingLoadInstrs.empty())
8381 return false;
8382
8383 switch (NumLanes) {
8384 case 4:
8385 Patterns.push_back(Elt: AArch64MachineCombinerPattern::GATHER_LANE_i32);
8386 break;
8387 case 8:
8388 Patterns.push_back(Elt: AArch64MachineCombinerPattern::GATHER_LANE_i16);
8389 break;
8390 case 16:
8391 Patterns.push_back(Elt: AArch64MachineCombinerPattern::GATHER_LANE_i8);
8392 break;
8393 default:
8394 llvm_unreachable("Got bad number of lanes for gather pattern.");
8395 }
8396
8397 return true;
8398}
8399
8400/// Search for patterns of LD instructions we can optimize.
8401static bool getLoadPatterns(MachineInstr &Root,
8402 SmallVectorImpl<unsigned> &Patterns) {
8403
8404 // The pattern searches for loads into single lanes.
8405 switch (Root.getOpcode()) {
8406 case AArch64::LD1i32:
8407 return getGatherLanePattern(Root, Patterns, LoadLaneOpCode: Root.getOpcode(), NumLanes: 4);
8408 case AArch64::LD1i16:
8409 return getGatherLanePattern(Root, Patterns, LoadLaneOpCode: Root.getOpcode(), NumLanes: 8);
8410 case AArch64::LD1i8:
8411 return getGatherLanePattern(Root, Patterns, LoadLaneOpCode: Root.getOpcode(), NumLanes: 16);
8412 default:
8413 return false;
8414 }
8415}
8416
8417/// Generate optimized instruction sequence for gather load patterns to improve
8418/// Memory-Level Parallelism (MLP). This function transforms a chain of
8419/// sequential NEON lane loads into parallel vector loads that can execute
8420/// concurrently.
8421static void
8422generateGatherLanePattern(MachineInstr &Root,
8423 SmallVectorImpl<MachineInstr *> &InsInstrs,
8424 SmallVectorImpl<MachineInstr *> &DelInstrs,
8425 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8426 unsigned Pattern, unsigned NumLanes) {
8427 MachineFunction &MF = *Root.getParent()->getParent();
8428 MachineRegisterInfo &MRI = MF.getRegInfo();
8429 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8430
8431 // Gather the initial load instructions to build the pattern.
8432 SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
8433 MachineInstr *CurrInstr = &Root;
8434 for (unsigned i = 0; i < NumLanes - 1; ++i) {
8435 LoadToLaneInstrs.push_back(Elt: CurrInstr);
8436 CurrInstr = MRI.getUniqueVRegDef(Reg: CurrInstr->getOperand(i: 1).getReg());
8437 }
8438
8439 // Sort the load instructions according to the lane.
8440 llvm::sort(C&: LoadToLaneInstrs,
8441 Comp: [](const MachineInstr *A, const MachineInstr *B) {
8442 return A->getOperand(i: 2).getImm() > B->getOperand(i: 2).getImm();
8443 });
8444
8445 MachineInstr *SubregToReg = CurrInstr;
8446 LoadToLaneInstrs.push_back(
8447 Elt: MRI.getUniqueVRegDef(Reg: SubregToReg->getOperand(i: 1).getReg()));
8448 auto LoadToLaneInstrsAscending = llvm::reverse(C&: LoadToLaneInstrs);
8449
8450 const TargetRegisterClass *FPR128RegClass =
8451 MRI.getRegClass(Reg: Root.getOperand(i: 0).getReg());
8452
8453 // Helper lambda to create a LD1 instruction.
8454 auto CreateLD1Instruction = [&](MachineInstr *OriginalInstr,
8455 Register SrcRegister, unsigned Lane,
8456 Register OffsetRegister,
8457 bool OffsetRegisterKillState) {
8458 auto NewRegister = MRI.createVirtualRegister(RegClass: FPR128RegClass);
8459 MachineInstrBuilder LoadIndexIntoRegister =
8460 BuildMI(MF, MIMD: MIMetadata(*OriginalInstr), MCID: TII->get(Opcode: Root.getOpcode()),
8461 DestReg: NewRegister)
8462 .addReg(RegNo: SrcRegister)
8463 .addImm(Val: Lane)
8464 .addReg(RegNo: OffsetRegister, Flags: getKillRegState(B: OffsetRegisterKillState))
8465 .setMemRefs(OriginalInstr->memoperands());
8466 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewRegister, y: InsInstrs.size()));
8467 InsInstrs.push_back(Elt: LoadIndexIntoRegister);
8468 return NewRegister;
8469 };
8470
8471 // Helper to create load instruction based on the NumLanes in the NEON
8472 // register we are rewriting.
8473 auto CreateLDRInstruction =
8474 [&](unsigned NumLanes, Register DestReg, Register OffsetReg,
8475 ArrayRef<MachineMemOperand *> MMOs) -> MachineInstrBuilder {
8476 unsigned Opcode;
8477 switch (NumLanes) {
8478 case 4:
8479 Opcode = AArch64::LDRSui;
8480 break;
8481 case 8:
8482 Opcode = AArch64::LDRHui;
8483 break;
8484 case 16:
8485 Opcode = AArch64::LDRBui;
8486 break;
8487 default:
8488 llvm_unreachable(
8489 "Got unsupported number of lanes in machine-combiner gather pattern");
8490 }
8491 // Immediate offset load
8492 return BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode), DestReg)
8493 .addReg(RegNo: OffsetReg)
8494 .addImm(Val: 0)
8495 .setMemRefs(MMOs);
8496 };
8497
8498 // Load the remaining lanes into register 0.
8499 auto LanesToLoadToReg0 =
8500 llvm::make_range(x: LoadToLaneInstrsAscending.begin() + 1,
8501 y: LoadToLaneInstrsAscending.begin() + NumLanes / 2);
8502 Register PrevReg = SubregToReg->getOperand(i: 0).getReg();
8503 for (auto [Index, LoadInstr] : llvm::enumerate(First&: LanesToLoadToReg0)) {
8504 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(i: 3);
8505 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8506 OffsetRegOperand.getReg(),
8507 OffsetRegOperand.isKill());
8508 DelInstrs.push_back(Elt: LoadInstr);
8509 }
8510 Register LastLoadReg0 = PrevReg;
8511
8512 // First load into register 1. Perform an integer load to zero out the upper
8513 // lanes in a single instruction.
8514 MachineInstr *Lane0Load = *LoadToLaneInstrsAscending.begin();
8515 MachineInstr *OriginalSplitLoad =
8516 *std::next(x: LoadToLaneInstrsAscending.begin(), n: NumLanes / 2);
8517 Register DestRegForMiddleIndex = MRI.createVirtualRegister(
8518 RegClass: MRI.getRegClass(Reg: Lane0Load->getOperand(i: 0).getReg()));
8519
8520 const MachineOperand &OriginalSplitToLoadOffsetOperand =
8521 OriginalSplitLoad->getOperand(i: 3);
8522 MachineInstrBuilder MiddleIndexLoadInstr =
8523 CreateLDRInstruction(NumLanes, DestRegForMiddleIndex,
8524 OriginalSplitToLoadOffsetOperand.getReg(),
8525 OriginalSplitLoad->memoperands());
8526
8527 InstrIdxForVirtReg.insert(
8528 KV: std::make_pair(x&: DestRegForMiddleIndex, y: InsInstrs.size()));
8529 InsInstrs.push_back(Elt: MiddleIndexLoadInstr);
8530 DelInstrs.push_back(Elt: OriginalSplitLoad);
8531
8532 // Subreg To Reg instruction for register 1.
8533 Register DestRegForSubregToReg = MRI.createVirtualRegister(RegClass: FPR128RegClass);
8534 unsigned SubregType;
8535 switch (NumLanes) {
8536 case 4:
8537 SubregType = AArch64::ssub;
8538 break;
8539 case 8:
8540 SubregType = AArch64::hsub;
8541 break;
8542 case 16:
8543 SubregType = AArch64::bsub;
8544 break;
8545 default:
8546 llvm_unreachable(
8547 "Got invalid NumLanes for machine-combiner gather pattern");
8548 }
8549
8550 auto SubRegToRegInstr =
8551 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: SubregToReg->getOpcode()),
8552 DestReg: DestRegForSubregToReg)
8553 .addReg(RegNo: DestRegForMiddleIndex, Flags: getKillRegState(B: true))
8554 .addImm(Val: SubregType);
8555 InstrIdxForVirtReg.insert(
8556 KV: std::make_pair(x&: DestRegForSubregToReg, y: InsInstrs.size()));
8557 InsInstrs.push_back(Elt: SubRegToRegInstr);
8558
8559 // Load remaining lanes into register 1.
8560 auto LanesToLoadToReg1 =
8561 llvm::make_range(x: LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
8562 y: LoadToLaneInstrsAscending.end());
8563 PrevReg = SubRegToRegInstr->getOperand(i: 0).getReg();
8564 for (auto [Index, LoadInstr] : llvm::enumerate(First&: LanesToLoadToReg1)) {
8565 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(i: 3);
8566 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8567 OffsetRegOperand.getReg(),
8568 OffsetRegOperand.isKill());
8569
8570 // Do not add the last reg to DelInstrs - it will be removed later.
8571 if (Index == NumLanes / 2 - 2) {
8572 break;
8573 }
8574 DelInstrs.push_back(Elt: LoadInstr);
8575 }
8576 Register LastLoadReg1 = PrevReg;
8577
8578 // Create the final zip instruction to combine the results.
8579 MachineInstrBuilder ZipInstr =
8580 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::ZIP1v2i64),
8581 DestReg: Root.getOperand(i: 0).getReg())
8582 .addReg(RegNo: LastLoadReg0)
8583 .addReg(RegNo: LastLoadReg1);
8584 InsInstrs.push_back(Elt: ZipInstr);
8585}
8586
8587CombinerObjective
8588AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
8589 switch (Pattern) {
8590 case AArch64MachineCombinerPattern::SUBADD_OP1:
8591 case AArch64MachineCombinerPattern::SUBADD_OP2:
8592 case AArch64MachineCombinerPattern::GATHER_LANE_i32:
8593 case AArch64MachineCombinerPattern::GATHER_LANE_i16:
8594 case AArch64MachineCombinerPattern::GATHER_LANE_i8:
8595 return CombinerObjective::MustReduceDepth;
8596 default:
8597 return TargetInstrInfo::getCombinerObjective(Pattern);
8598 }
8599}
8600
8601/// Return true when there is potentially a faster code sequence for an
8602/// instruction chain ending in \p Root. All potential patterns are listed in
8603/// the \p Pattern vector. Pattern should be sorted in priority order since the
8604/// pattern evaluator stops checking as soon as it finds a faster sequence.
8605
8606bool AArch64InstrInfo::getMachineCombinerPatterns(
8607 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
8608 bool DoRegPressureReduce) const {
8609 // Integer patterns
8610 if (getMaddPatterns(Root, Patterns))
8611 return true;
8612 // Floating point patterns
8613 if (getFMULPatterns(Root, Patterns))
8614 return true;
8615 if (getFMAPatterns(Root, Patterns))
8616 return true;
8617 if (getFNEGPatterns(Root, Patterns))
8618 return true;
8619
8620 // Other patterns
8621 if (getMiscPatterns(Root, Patterns))
8622 return true;
8623
8624 // Load patterns
8625 if (getLoadPatterns(Root, Patterns))
8626 return true;
8627
8628 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
8629 DoRegPressureReduce);
8630}
8631
8632enum class FMAInstKind { Default, Indexed, Accumulator };
8633/// genFusedMultiply - Generate fused multiply instructions.
8634/// This function supports both integer and floating point instructions.
8635/// A typical example:
8636/// F|MUL I=A,B,0
8637/// F|ADD R,I,C
8638/// ==> F|MADD R,A,B,C
8639/// \param MF Containing MachineFunction
8640/// \param MRI Register information
8641/// \param TII Target information
8642/// \param Root is the F|ADD instruction
8643/// \param [out] InsInstrs is a vector of machine instructions and will
8644/// contain the generated madd instruction
8645/// \param IdxMulOpd is index of operand in Root that is the result of
8646/// the F|MUL. In the example above IdxMulOpd is 1.
8647/// \param MaddOpc the opcode fo the f|madd instruction
8648/// \param RC Register class of operands
8649/// \param kind of fma instruction (addressing mode) to be generated
8650/// \param ReplacedAddend is the result register from the instruction
8651/// replacing the non-combined operand, if any.
8652static MachineInstr *
8653genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
8654 const TargetInstrInfo *TII, MachineInstr &Root,
8655 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
8656 unsigned MaddOpc, const TargetRegisterClass *RC,
8657 FMAInstKind kind = FMAInstKind::Default,
8658 const Register *ReplacedAddend = nullptr) {
8659 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8660
8661 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
8662 MachineInstr *MUL = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: IdxMulOpd).getReg());
8663 Register ResultReg = Root.getOperand(i: 0).getReg();
8664 Register SrcReg0 = MUL->getOperand(i: 1).getReg();
8665 bool Src0IsKill = MUL->getOperand(i: 1).isKill();
8666 Register SrcReg1 = MUL->getOperand(i: 2).getReg();
8667 bool Src1IsKill = MUL->getOperand(i: 2).isKill();
8668
8669 Register SrcReg2;
8670 bool Src2IsKill;
8671 if (ReplacedAddend) {
8672 // If we just generated a new addend, we must be it's only use.
8673 SrcReg2 = *ReplacedAddend;
8674 Src2IsKill = true;
8675 } else {
8676 SrcReg2 = Root.getOperand(i: IdxOtherOpd).getReg();
8677 Src2IsKill = Root.getOperand(i: IdxOtherOpd).isKill();
8678 }
8679
8680 if (ResultReg.isVirtual())
8681 MRI.constrainRegClass(Reg: ResultReg, RC);
8682 if (SrcReg0.isVirtual())
8683 MRI.constrainRegClass(Reg: SrcReg0, RC);
8684 if (SrcReg1.isVirtual())
8685 MRI.constrainRegClass(Reg: SrcReg1, RC);
8686 if (SrcReg2.isVirtual())
8687 MRI.constrainRegClass(Reg: SrcReg2, RC);
8688
8689 MachineInstrBuilder MIB;
8690 if (kind == FMAInstKind::Default)
8691 MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
8692 .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: Src0IsKill))
8693 .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: Src1IsKill))
8694 .addReg(RegNo: SrcReg2, Flags: getKillRegState(B: Src2IsKill));
8695 else if (kind == FMAInstKind::Indexed)
8696 MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
8697 .addReg(RegNo: SrcReg2, Flags: getKillRegState(B: Src2IsKill))
8698 .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: Src0IsKill))
8699 .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: Src1IsKill))
8700 .addImm(Val: MUL->getOperand(i: 3).getImm());
8701 else if (kind == FMAInstKind::Accumulator)
8702 MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
8703 .addReg(RegNo: SrcReg2, Flags: getKillRegState(B: Src2IsKill))
8704 .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: Src0IsKill))
8705 .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: Src1IsKill));
8706 else
8707 assert(false && "Invalid FMA instruction kind \n");
8708 // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL)
8709 InsInstrs.push_back(Elt: MIB);
8710 return MUL;
8711}
8712
8713static MachineInstr *
8714genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
8715 const TargetInstrInfo *TII, MachineInstr &Root,
8716 SmallVectorImpl<MachineInstr *> &InsInstrs) {
8717 MachineInstr *MAD = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: 1).getReg());
8718
8719 unsigned Opc = 0;
8720 const TargetRegisterClass *RC = MRI.getRegClass(Reg: MAD->getOperand(i: 0).getReg());
8721 if (AArch64::FPR32RegClass.hasSubClassEq(RC))
8722 Opc = AArch64::FNMADDSrrr;
8723 else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
8724 Opc = AArch64::FNMADDDrrr;
8725 else
8726 return nullptr;
8727
8728 Register ResultReg = Root.getOperand(i: 0).getReg();
8729 Register SrcReg0 = MAD->getOperand(i: 1).getReg();
8730 Register SrcReg1 = MAD->getOperand(i: 2).getReg();
8731 Register SrcReg2 = MAD->getOperand(i: 3).getReg();
8732 bool Src0IsKill = MAD->getOperand(i: 1).isKill();
8733 bool Src1IsKill = MAD->getOperand(i: 2).isKill();
8734 bool Src2IsKill = MAD->getOperand(i: 3).isKill();
8735 if (ResultReg.isVirtual())
8736 MRI.constrainRegClass(Reg: ResultReg, RC);
8737 if (SrcReg0.isVirtual())
8738 MRI.constrainRegClass(Reg: SrcReg0, RC);
8739 if (SrcReg1.isVirtual())
8740 MRI.constrainRegClass(Reg: SrcReg1, RC);
8741 if (SrcReg2.isVirtual())
8742 MRI.constrainRegClass(Reg: SrcReg2, RC);
8743
8744 MachineInstrBuilder MIB =
8745 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: Opc), DestReg: ResultReg)
8746 .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: Src0IsKill))
8747 .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: Src1IsKill))
8748 .addReg(RegNo: SrcReg2, Flags: getKillRegState(B: Src2IsKill));
8749 InsInstrs.push_back(Elt: MIB);
8750
8751 return MAD;
8752}
8753
8754/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
8755static MachineInstr *
8756genIndexedMultiply(MachineInstr &Root,
8757 SmallVectorImpl<MachineInstr *> &InsInstrs,
8758 unsigned IdxDupOp, unsigned MulOpc,
8759 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
8760 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
8761 "Invalid index of FMUL operand");
8762
8763 MachineFunction &MF = *Root.getMF();
8764 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8765
8766 MachineInstr *Dup =
8767 MF.getRegInfo().getUniqueVRegDef(Reg: Root.getOperand(i: IdxDupOp).getReg());
8768
8769 if (Dup->getOpcode() == TargetOpcode::COPY)
8770 Dup = MRI.getUniqueVRegDef(Reg: Dup->getOperand(i: 1).getReg());
8771
8772 Register DupSrcReg = Dup->getOperand(i: 1).getReg();
8773 MRI.clearKillFlags(Reg: DupSrcReg);
8774 MRI.constrainRegClass(Reg: DupSrcReg, RC);
8775
8776 unsigned DupSrcLane = Dup->getOperand(i: 2).getImm();
8777
8778 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
8779 MachineOperand &MulOp = Root.getOperand(i: IdxMulOp);
8780
8781 Register ResultReg = Root.getOperand(i: 0).getReg();
8782
8783 MachineInstrBuilder MIB;
8784 MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MulOpc), DestReg: ResultReg)
8785 .add(MO: MulOp)
8786 .addReg(RegNo: DupSrcReg)
8787 .addImm(Val: DupSrcLane);
8788
8789 InsInstrs.push_back(Elt: MIB);
8790 return &Root;
8791}
8792
8793/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
8794/// instructions.
8795///
8796/// \see genFusedMultiply
8797static MachineInstr *genFusedMultiplyAcc(
8798 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8799 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8800 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8801 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8802 kind: FMAInstKind::Accumulator);
8803}
8804
8805/// genNeg - Helper to generate an intermediate negation of the second operand
8806/// of Root
8807static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
8808 const TargetInstrInfo *TII, MachineInstr &Root,
8809 SmallVectorImpl<MachineInstr *> &InsInstrs,
8810 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8811 unsigned MnegOpc, const TargetRegisterClass *RC) {
8812 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
8813 MachineInstrBuilder MIB =
8814 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MnegOpc), DestReg: NewVR)
8815 .add(MO: Root.getOperand(i: 2));
8816 InsInstrs.push_back(Elt: MIB);
8817
8818 assert(InstrIdxForVirtReg.empty());
8819 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
8820
8821 return NewVR;
8822}
8823
8824/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
8825/// instructions with an additional negation of the accumulator
8826static MachineInstr *genFusedMultiplyAccNeg(
8827 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8828 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8829 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8830 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8831 assert(IdxMulOpd == 1);
8832
8833 Register NewVR =
8834 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8835 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8836 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
8837}
8838
8839/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
8840/// instructions.
8841///
8842/// \see genFusedMultiply
8843static MachineInstr *genFusedMultiplyIdx(
8844 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8845 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8846 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8847 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8848 kind: FMAInstKind::Indexed);
8849}
8850
8851/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
8852/// instructions with an additional negation of the accumulator
8853static MachineInstr *genFusedMultiplyIdxNeg(
8854 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8855 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8856 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8857 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8858 assert(IdxMulOpd == 1);
8859
8860 Register NewVR =
8861 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8862
8863 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8864 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
8865}
8866
8867/// genMaddR - Generate madd instruction and combine mul and add using
8868/// an extra virtual register
8869/// Example - an ADD intermediate needs to be stored in a register:
8870/// MUL I=A,B,0
8871/// ADD R,I,Imm
8872/// ==> ORR V, ZR, Imm
8873/// ==> MADD R,A,B,V
8874/// \param MF Containing MachineFunction
8875/// \param MRI Register information
8876/// \param TII Target information
8877/// \param Root is the ADD instruction
8878/// \param [out] InsInstrs is a vector of machine instructions and will
8879/// contain the generated madd instruction
8880/// \param IdxMulOpd is index of operand in Root that is the result of
8881/// the MUL. In the example above IdxMulOpd is 1.
8882/// \param MaddOpc the opcode fo the madd instruction
8883/// \param VR is a virtual register that holds the value of an ADD operand
8884/// (V in the example above).
8885/// \param RC Register class of operands
8886static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
8887 const TargetInstrInfo *TII, MachineInstr &Root,
8888 SmallVectorImpl<MachineInstr *> &InsInstrs,
8889 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
8890 const TargetRegisterClass *RC) {
8891 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8892
8893 MachineInstr *MUL = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: IdxMulOpd).getReg());
8894 Register ResultReg = Root.getOperand(i: 0).getReg();
8895 Register SrcReg0 = MUL->getOperand(i: 1).getReg();
8896 bool Src0IsKill = MUL->getOperand(i: 1).isKill();
8897 Register SrcReg1 = MUL->getOperand(i: 2).getReg();
8898 bool Src1IsKill = MUL->getOperand(i: 2).isKill();
8899
8900 if (ResultReg.isVirtual())
8901 MRI.constrainRegClass(Reg: ResultReg, RC);
8902 if (SrcReg0.isVirtual())
8903 MRI.constrainRegClass(Reg: SrcReg0, RC);
8904 if (SrcReg1.isVirtual())
8905 MRI.constrainRegClass(Reg: SrcReg1, RC);
8906 if (Register::isVirtualRegister(Reg: VR))
8907 MRI.constrainRegClass(Reg: VR, RC);
8908
8909 MachineInstrBuilder MIB =
8910 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
8911 .addReg(RegNo: SrcReg0, Flags: getKillRegState(B: Src0IsKill))
8912 .addReg(RegNo: SrcReg1, Flags: getKillRegState(B: Src1IsKill))
8913 .addReg(RegNo: VR);
8914 // Insert the MADD
8915 InsInstrs.push_back(Elt: MIB);
8916 return MUL;
8917}
8918
8919/// Do the following transformation
8920/// A - (B + C) ==> (A - B) - C
8921/// A - (B + C) ==> (A - C) - B
8922static void genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
8923 const TargetInstrInfo *TII, MachineInstr &Root,
8924 SmallVectorImpl<MachineInstr *> &InsInstrs,
8925 SmallVectorImpl<MachineInstr *> &DelInstrs,
8926 unsigned IdxOpd1,
8927 DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
8928 assert(IdxOpd1 == 1 || IdxOpd1 == 2);
8929 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
8930 MachineInstr *AddMI = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: 2).getReg());
8931
8932 Register ResultReg = Root.getOperand(i: 0).getReg();
8933 Register RegA = Root.getOperand(i: 1).getReg();
8934 bool RegAIsKill = Root.getOperand(i: 1).isKill();
8935 Register RegB = AddMI->getOperand(i: IdxOpd1).getReg();
8936 bool RegBIsKill = AddMI->getOperand(i: IdxOpd1).isKill();
8937 Register RegC = AddMI->getOperand(i: IdxOtherOpd).getReg();
8938 bool RegCIsKill = AddMI->getOperand(i: IdxOtherOpd).isKill();
8939 Register NewVR =
8940 MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: Root.getOperand(i: 2).getReg()));
8941
8942 unsigned Opcode = Root.getOpcode();
8943 if (Opcode == AArch64::SUBSWrr)
8944 Opcode = AArch64::SUBWrr;
8945 else if (Opcode == AArch64::SUBSXrr)
8946 Opcode = AArch64::SUBXrr;
8947 else
8948 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
8949 "Unexpected instruction opcode.");
8950
8951 uint32_t Flags = Root.mergeFlagsWith(Other: *AddMI);
8952 Flags &= ~MachineInstr::NoSWrap;
8953 Flags &= ~MachineInstr::NoUWrap;
8954
8955 MachineInstrBuilder MIB1 =
8956 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode), DestReg: NewVR)
8957 .addReg(RegNo: RegA, Flags: getKillRegState(B: RegAIsKill))
8958 .addReg(RegNo: RegB, Flags: getKillRegState(B: RegBIsKill))
8959 .setMIFlags(Flags);
8960 MachineInstrBuilder MIB2 =
8961 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode), DestReg: ResultReg)
8962 .addReg(RegNo: NewVR, Flags: getKillRegState(B: true))
8963 .addReg(RegNo: RegC, Flags: getKillRegState(B: RegCIsKill))
8964 .setMIFlags(Flags);
8965
8966 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
8967 InsInstrs.push_back(Elt: MIB1);
8968 InsInstrs.push_back(Elt: MIB2);
8969 DelInstrs.push_back(Elt: AddMI);
8970 DelInstrs.push_back(Elt: &Root);
8971}
8972
8973unsigned AArch64InstrInfo::getReduceOpcodeForAccumulator(
8974 unsigned int AccumulatorOpCode) const {
8975 switch (AccumulatorOpCode) {
8976 case AArch64::UABALB_ZZZ_D:
8977 case AArch64::SABALB_ZZZ_D:
8978 case AArch64::UABALT_ZZZ_D:
8979 case AArch64::SABALT_ZZZ_D:
8980 return AArch64::ADD_ZZZ_D;
8981 case AArch64::UABALB_ZZZ_H:
8982 case AArch64::SABALB_ZZZ_H:
8983 case AArch64::UABALT_ZZZ_H:
8984 case AArch64::SABALT_ZZZ_H:
8985 return AArch64::ADD_ZZZ_H;
8986 case AArch64::UABALB_ZZZ_S:
8987 case AArch64::SABALB_ZZZ_S:
8988 case AArch64::UABALT_ZZZ_S:
8989 case AArch64::SABALT_ZZZ_S:
8990 return AArch64::ADD_ZZZ_S;
8991 case AArch64::UABALv16i8_v8i16:
8992 case AArch64::SABALv8i8_v8i16:
8993 case AArch64::SABAv8i16:
8994 case AArch64::UABAv8i16:
8995 return AArch64::ADDv8i16;
8996 case AArch64::SABALv2i32_v2i64:
8997 case AArch64::UABALv2i32_v2i64:
8998 case AArch64::SABALv4i32_v2i64:
8999 return AArch64::ADDv2i64;
9000 case AArch64::UABALv4i16_v4i32:
9001 case AArch64::SABALv4i16_v4i32:
9002 case AArch64::SABALv8i16_v4i32:
9003 case AArch64::SABAv4i32:
9004 case AArch64::UABAv4i32:
9005 return AArch64::ADDv4i32;
9006 case AArch64::UABALv4i32_v2i64:
9007 return AArch64::ADDv2i64;
9008 case AArch64::UABALv8i16_v4i32:
9009 return AArch64::ADDv4i32;
9010 case AArch64::UABALv8i8_v8i16:
9011 case AArch64::SABALv16i8_v8i16:
9012 return AArch64::ADDv8i16;
9013 case AArch64::UABAv16i8:
9014 case AArch64::SABAv16i8:
9015 return AArch64::ADDv16i8;
9016 case AArch64::UABAv4i16:
9017 case AArch64::SABAv4i16:
9018 return AArch64::ADDv4i16;
9019 case AArch64::UABAv2i32:
9020 case AArch64::SABAv2i32:
9021 return AArch64::ADDv2i32;
9022 case AArch64::UABAv8i8:
9023 case AArch64::SABAv8i8:
9024 return AArch64::ADDv8i8;
9025 default:
9026 llvm_unreachable("Unknown accumulator opcode");
9027 }
9028}
9029
9030/// When getMachineCombinerPatterns() finds potential patterns,
9031/// this function generates the instructions that could replace the
9032/// original code sequence
9033void AArch64InstrInfo::genAlternativeCodeSequence(
9034 MachineInstr &Root, unsigned Pattern,
9035 SmallVectorImpl<MachineInstr *> &InsInstrs,
9036 SmallVectorImpl<MachineInstr *> &DelInstrs,
9037 DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
9038 MachineBasicBlock &MBB = *Root.getParent();
9039 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9040 MachineFunction &MF = *MBB.getParent();
9041 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
9042
9043 MachineInstr *MUL = nullptr;
9044 const TargetRegisterClass *RC;
9045 unsigned Opc;
9046 switch (Pattern) {
9047 default:
9048 // Reassociate instructions.
9049 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
9050 DelInstrs, InstIdxForVirtReg&: InstrIdxForVirtReg);
9051 return;
9052 case AArch64MachineCombinerPattern::SUBADD_OP1:
9053 // A - (B + C)
9054 // ==> (A - B) - C
9055 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, IdxOpd1: 1,
9056 InstrIdxForVirtReg);
9057 return;
9058 case AArch64MachineCombinerPattern::SUBADD_OP2:
9059 // A - (B + C)
9060 // ==> (A - C) - B
9061 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, IdxOpd1: 2,
9062 InstrIdxForVirtReg);
9063 return;
9064 case AArch64MachineCombinerPattern::MULADDW_OP1:
9065 case AArch64MachineCombinerPattern::MULADDX_OP1:
9066 // MUL I=A,B,0
9067 // ADD R,I,C
9068 // ==> MADD R,A,B,C
9069 // --- Create(MADD);
9070 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP1) {
9071 Opc = AArch64::MADDWrrr;
9072 RC = &AArch64::GPR32RegClass;
9073 } else {
9074 Opc = AArch64::MADDXrrr;
9075 RC = &AArch64::GPR64RegClass;
9076 }
9077 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9078 break;
9079 case AArch64MachineCombinerPattern::MULADDW_OP2:
9080 case AArch64MachineCombinerPattern::MULADDX_OP2:
9081 // MUL I=A,B,0
9082 // ADD R,C,I
9083 // ==> MADD R,A,B,C
9084 // --- Create(MADD);
9085 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP2) {
9086 Opc = AArch64::MADDWrrr;
9087 RC = &AArch64::GPR32RegClass;
9088 } else {
9089 Opc = AArch64::MADDXrrr;
9090 RC = &AArch64::GPR64RegClass;
9091 }
9092 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9093 break;
9094 case AArch64MachineCombinerPattern::MULADDWI_OP1:
9095 case AArch64MachineCombinerPattern::MULADDXI_OP1:
9096 case AArch64MachineCombinerPattern::MULSUBWI_OP1:
9097 case AArch64MachineCombinerPattern::MULSUBXI_OP1: {
9098 // MUL I=A,B,0
9099 // ADD/SUB R,I,Imm
9100 // ==> MOV V, Imm/-Imm
9101 // ==> MADD R,A,B,V
9102 // --- Create(MADD);
9103 const TargetRegisterClass *RC;
9104 unsigned BitSize, MovImm;
9105 if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1 ||
9106 Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) {
9107 MovImm = AArch64::MOVi32imm;
9108 RC = &AArch64::GPR32spRegClass;
9109 BitSize = 32;
9110 Opc = AArch64::MADDWrrr;
9111 RC = &AArch64::GPR32RegClass;
9112 } else {
9113 MovImm = AArch64::MOVi64imm;
9114 RC = &AArch64::GPR64spRegClass;
9115 BitSize = 64;
9116 Opc = AArch64::MADDXrrr;
9117 RC = &AArch64::GPR64RegClass;
9118 }
9119 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
9120 uint64_t Imm = Root.getOperand(i: 2).getImm();
9121
9122 if (Root.getOperand(i: 3).isImm()) {
9123 unsigned Val = Root.getOperand(i: 3).getImm();
9124 Imm = Imm << Val;
9125 }
9126 bool IsSub = Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1 ||
9127 Pattern == AArch64MachineCombinerPattern::MULSUBXI_OP1;
9128 uint64_t UImm = SignExtend64(X: IsSub ? -Imm : Imm, B: BitSize);
9129 // Check that the immediate can be composed via a single instruction.
9130 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
9131 AArch64_IMM::expandMOVImm(Imm: UImm, BitSize, Insn);
9132 if (Insn.size() != 1)
9133 return;
9134 MachineInstrBuilder MIB1 =
9135 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MovImm), DestReg: NewVR)
9136 .addImm(Val: IsSub ? -Imm : Imm);
9137 InsInstrs.push_back(Elt: MIB1);
9138 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9139 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, VR: NewVR, RC);
9140 break;
9141 }
9142 case AArch64MachineCombinerPattern::MULSUBW_OP1:
9143 case AArch64MachineCombinerPattern::MULSUBX_OP1: {
9144 // MUL I=A,B,0
9145 // SUB R,I, C
9146 // ==> SUB V, 0, C
9147 // ==> MADD R,A,B,V // = -C + A*B
9148 // --- Create(MADD);
9149 const TargetRegisterClass *SubRC;
9150 unsigned SubOpc, ZeroReg;
9151 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP1) {
9152 SubOpc = AArch64::SUBWrr;
9153 SubRC = &AArch64::GPR32spRegClass;
9154 ZeroReg = AArch64::WZR;
9155 Opc = AArch64::MADDWrrr;
9156 RC = &AArch64::GPR32RegClass;
9157 } else {
9158 SubOpc = AArch64::SUBXrr;
9159 SubRC = &AArch64::GPR64spRegClass;
9160 ZeroReg = AArch64::XZR;
9161 Opc = AArch64::MADDXrrr;
9162 RC = &AArch64::GPR64RegClass;
9163 }
9164 Register NewVR = MRI.createVirtualRegister(RegClass: SubRC);
9165 // SUB NewVR, 0, C
9166 MachineInstrBuilder MIB1 =
9167 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: SubOpc), DestReg: NewVR)
9168 .addReg(RegNo: ZeroReg)
9169 .add(MO: Root.getOperand(i: 2));
9170 InsInstrs.push_back(Elt: MIB1);
9171 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9172 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, VR: NewVR, RC);
9173 break;
9174 }
9175 case AArch64MachineCombinerPattern::MULSUBW_OP2:
9176 case AArch64MachineCombinerPattern::MULSUBX_OP2:
9177 // MUL I=A,B,0
9178 // SUB R,C,I
9179 // ==> MSUB R,A,B,C (computes C - A*B)
9180 // --- Create(MSUB);
9181 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP2) {
9182 Opc = AArch64::MSUBWrrr;
9183 RC = &AArch64::GPR32RegClass;
9184 } else {
9185 Opc = AArch64::MSUBXrrr;
9186 RC = &AArch64::GPR64RegClass;
9187 }
9188 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9189 break;
9190 case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
9191 Opc = AArch64::MLAv8i8;
9192 RC = &AArch64::FPR64RegClass;
9193 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9194 break;
9195 case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
9196 Opc = AArch64::MLAv8i8;
9197 RC = &AArch64::FPR64RegClass;
9198 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9199 break;
9200 case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
9201 Opc = AArch64::MLAv16i8;
9202 RC = &AArch64::FPR128RegClass;
9203 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9204 break;
9205 case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
9206 Opc = AArch64::MLAv16i8;
9207 RC = &AArch64::FPR128RegClass;
9208 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9209 break;
9210 case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
9211 Opc = AArch64::MLAv4i16;
9212 RC = &AArch64::FPR64RegClass;
9213 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9214 break;
9215 case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
9216 Opc = AArch64::MLAv4i16;
9217 RC = &AArch64::FPR64RegClass;
9218 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9219 break;
9220 case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
9221 Opc = AArch64::MLAv8i16;
9222 RC = &AArch64::FPR128RegClass;
9223 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9224 break;
9225 case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
9226 Opc = AArch64::MLAv8i16;
9227 RC = &AArch64::FPR128RegClass;
9228 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9229 break;
9230 case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
9231 Opc = AArch64::MLAv2i32;
9232 RC = &AArch64::FPR64RegClass;
9233 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9234 break;
9235 case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
9236 Opc = AArch64::MLAv2i32;
9237 RC = &AArch64::FPR64RegClass;
9238 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9239 break;
9240 case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
9241 Opc = AArch64::MLAv4i32;
9242 RC = &AArch64::FPR128RegClass;
9243 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9244 break;
9245 case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
9246 Opc = AArch64::MLAv4i32;
9247 RC = &AArch64::FPR128RegClass;
9248 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9249 break;
9250
9251 case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
9252 Opc = AArch64::MLAv8i8;
9253 RC = &AArch64::FPR64RegClass;
9254 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9255 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv8i8,
9256 RC);
9257 break;
9258 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
9259 Opc = AArch64::MLSv8i8;
9260 RC = &AArch64::FPR64RegClass;
9261 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9262 break;
9263 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
9264 Opc = AArch64::MLAv16i8;
9265 RC = &AArch64::FPR128RegClass;
9266 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9267 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv16i8,
9268 RC);
9269 break;
9270 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
9271 Opc = AArch64::MLSv16i8;
9272 RC = &AArch64::FPR128RegClass;
9273 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9274 break;
9275 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
9276 Opc = AArch64::MLAv4i16;
9277 RC = &AArch64::FPR64RegClass;
9278 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9279 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv4i16,
9280 RC);
9281 break;
9282 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
9283 Opc = AArch64::MLSv4i16;
9284 RC = &AArch64::FPR64RegClass;
9285 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9286 break;
9287 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
9288 Opc = AArch64::MLAv8i16;
9289 RC = &AArch64::FPR128RegClass;
9290 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9291 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv8i16,
9292 RC);
9293 break;
9294 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
9295 Opc = AArch64::MLSv8i16;
9296 RC = &AArch64::FPR128RegClass;
9297 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9298 break;
9299 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
9300 Opc = AArch64::MLAv2i32;
9301 RC = &AArch64::FPR64RegClass;
9302 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9303 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv2i32,
9304 RC);
9305 break;
9306 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
9307 Opc = AArch64::MLSv2i32;
9308 RC = &AArch64::FPR64RegClass;
9309 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9310 break;
9311 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
9312 Opc = AArch64::MLAv4i32;
9313 RC = &AArch64::FPR128RegClass;
9314 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9315 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv4i32,
9316 RC);
9317 break;
9318 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
9319 Opc = AArch64::MLSv4i32;
9320 RC = &AArch64::FPR128RegClass;
9321 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9322 break;
9323
9324 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
9325 Opc = AArch64::MLAv4i16_indexed;
9326 RC = &AArch64::FPR64RegClass;
9327 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9328 break;
9329 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
9330 Opc = AArch64::MLAv4i16_indexed;
9331 RC = &AArch64::FPR64RegClass;
9332 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9333 break;
9334 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
9335 Opc = AArch64::MLAv8i16_indexed;
9336 RC = &AArch64::FPR128RegClass;
9337 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9338 break;
9339 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
9340 Opc = AArch64::MLAv8i16_indexed;
9341 RC = &AArch64::FPR128RegClass;
9342 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9343 break;
9344 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
9345 Opc = AArch64::MLAv2i32_indexed;
9346 RC = &AArch64::FPR64RegClass;
9347 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9348 break;
9349 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
9350 Opc = AArch64::MLAv2i32_indexed;
9351 RC = &AArch64::FPR64RegClass;
9352 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9353 break;
9354 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
9355 Opc = AArch64::MLAv4i32_indexed;
9356 RC = &AArch64::FPR128RegClass;
9357 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9358 break;
9359 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
9360 Opc = AArch64::MLAv4i32_indexed;
9361 RC = &AArch64::FPR128RegClass;
9362 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9363 break;
9364
9365 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
9366 Opc = AArch64::MLAv4i16_indexed;
9367 RC = &AArch64::FPR64RegClass;
9368 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9369 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv4i16,
9370 RC);
9371 break;
9372 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
9373 Opc = AArch64::MLSv4i16_indexed;
9374 RC = &AArch64::FPR64RegClass;
9375 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9376 break;
9377 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
9378 Opc = AArch64::MLAv8i16_indexed;
9379 RC = &AArch64::FPR128RegClass;
9380 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9381 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv8i16,
9382 RC);
9383 break;
9384 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
9385 Opc = AArch64::MLSv8i16_indexed;
9386 RC = &AArch64::FPR128RegClass;
9387 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9388 break;
9389 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
9390 Opc = AArch64::MLAv2i32_indexed;
9391 RC = &AArch64::FPR64RegClass;
9392 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9393 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv2i32,
9394 RC);
9395 break;
9396 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
9397 Opc = AArch64::MLSv2i32_indexed;
9398 RC = &AArch64::FPR64RegClass;
9399 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9400 break;
9401 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
9402 Opc = AArch64::MLAv4i32_indexed;
9403 RC = &AArch64::FPR128RegClass;
9404 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9405 InstrIdxForVirtReg, IdxMulOpd: 1, MaddOpc: Opc, MnegOpc: AArch64::NEGv4i32,
9406 RC);
9407 break;
9408 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
9409 Opc = AArch64::MLSv4i32_indexed;
9410 RC = &AArch64::FPR128RegClass;
9411 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9412 break;
9413
9414 // Floating Point Support
9415 case AArch64MachineCombinerPattern::FMULADDH_OP1:
9416 Opc = AArch64::FMADDHrrr;
9417 RC = &AArch64::FPR16RegClass;
9418 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9419 break;
9420 case AArch64MachineCombinerPattern::FMULADDS_OP1:
9421 Opc = AArch64::FMADDSrrr;
9422 RC = &AArch64::FPR32RegClass;
9423 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9424 break;
9425 case AArch64MachineCombinerPattern::FMULADDD_OP1:
9426 Opc = AArch64::FMADDDrrr;
9427 RC = &AArch64::FPR64RegClass;
9428 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9429 break;
9430
9431 case AArch64MachineCombinerPattern::FMULADDH_OP2:
9432 Opc = AArch64::FMADDHrrr;
9433 RC = &AArch64::FPR16RegClass;
9434 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9435 break;
9436 case AArch64MachineCombinerPattern::FMULADDS_OP2:
9437 Opc = AArch64::FMADDSrrr;
9438 RC = &AArch64::FPR32RegClass;
9439 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9440 break;
9441 case AArch64MachineCombinerPattern::FMULADDD_OP2:
9442 Opc = AArch64::FMADDDrrr;
9443 RC = &AArch64::FPR64RegClass;
9444 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9445 break;
9446
9447 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
9448 Opc = AArch64::FMLAv1i32_indexed;
9449 RC = &AArch64::FPR32RegClass;
9450 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9451 kind: FMAInstKind::Indexed);
9452 break;
9453 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
9454 Opc = AArch64::FMLAv1i32_indexed;
9455 RC = &AArch64::FPR32RegClass;
9456 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9457 kind: FMAInstKind::Indexed);
9458 break;
9459
9460 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
9461 Opc = AArch64::FMLAv1i64_indexed;
9462 RC = &AArch64::FPR64RegClass;
9463 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9464 kind: FMAInstKind::Indexed);
9465 break;
9466 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
9467 Opc = AArch64::FMLAv1i64_indexed;
9468 RC = &AArch64::FPR64RegClass;
9469 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9470 kind: FMAInstKind::Indexed);
9471 break;
9472
9473 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
9474 RC = &AArch64::FPR64RegClass;
9475 Opc = AArch64::FMLAv4i16_indexed;
9476 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9477 kind: FMAInstKind::Indexed);
9478 break;
9479 case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
9480 RC = &AArch64::FPR64RegClass;
9481 Opc = AArch64::FMLAv4f16;
9482 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9483 kind: FMAInstKind::Accumulator);
9484 break;
9485 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
9486 RC = &AArch64::FPR64RegClass;
9487 Opc = AArch64::FMLAv4i16_indexed;
9488 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9489 kind: FMAInstKind::Indexed);
9490 break;
9491 case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
9492 RC = &AArch64::FPR64RegClass;
9493 Opc = AArch64::FMLAv4f16;
9494 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9495 kind: FMAInstKind::Accumulator);
9496 break;
9497
9498 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
9499 case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
9500 RC = &AArch64::FPR64RegClass;
9501 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
9502 Opc = AArch64::FMLAv2i32_indexed;
9503 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9504 kind: FMAInstKind::Indexed);
9505 } else {
9506 Opc = AArch64::FMLAv2f32;
9507 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9508 kind: FMAInstKind::Accumulator);
9509 }
9510 break;
9511 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
9512 case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
9513 RC = &AArch64::FPR64RegClass;
9514 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
9515 Opc = AArch64::FMLAv2i32_indexed;
9516 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9517 kind: FMAInstKind::Indexed);
9518 } else {
9519 Opc = AArch64::FMLAv2f32;
9520 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9521 kind: FMAInstKind::Accumulator);
9522 }
9523 break;
9524
9525 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
9526 RC = &AArch64::FPR128RegClass;
9527 Opc = AArch64::FMLAv8i16_indexed;
9528 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9529 kind: FMAInstKind::Indexed);
9530 break;
9531 case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
9532 RC = &AArch64::FPR128RegClass;
9533 Opc = AArch64::FMLAv8f16;
9534 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9535 kind: FMAInstKind::Accumulator);
9536 break;
9537 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
9538 RC = &AArch64::FPR128RegClass;
9539 Opc = AArch64::FMLAv8i16_indexed;
9540 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9541 kind: FMAInstKind::Indexed);
9542 break;
9543 case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
9544 RC = &AArch64::FPR128RegClass;
9545 Opc = AArch64::FMLAv8f16;
9546 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9547 kind: FMAInstKind::Accumulator);
9548 break;
9549
9550 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
9551 case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
9552 RC = &AArch64::FPR128RegClass;
9553 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
9554 Opc = AArch64::FMLAv2i64_indexed;
9555 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9556 kind: FMAInstKind::Indexed);
9557 } else {
9558 Opc = AArch64::FMLAv2f64;
9559 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9560 kind: FMAInstKind::Accumulator);
9561 }
9562 break;
9563 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
9564 case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
9565 RC = &AArch64::FPR128RegClass;
9566 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
9567 Opc = AArch64::FMLAv2i64_indexed;
9568 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9569 kind: FMAInstKind::Indexed);
9570 } else {
9571 Opc = AArch64::FMLAv2f64;
9572 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9573 kind: FMAInstKind::Accumulator);
9574 }
9575 break;
9576
9577 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
9578 case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
9579 RC = &AArch64::FPR128RegClass;
9580 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
9581 Opc = AArch64::FMLAv4i32_indexed;
9582 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9583 kind: FMAInstKind::Indexed);
9584 } else {
9585 Opc = AArch64::FMLAv4f32;
9586 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9587 kind: FMAInstKind::Accumulator);
9588 }
9589 break;
9590
9591 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
9592 case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
9593 RC = &AArch64::FPR128RegClass;
9594 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
9595 Opc = AArch64::FMLAv4i32_indexed;
9596 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9597 kind: FMAInstKind::Indexed);
9598 } else {
9599 Opc = AArch64::FMLAv4f32;
9600 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9601 kind: FMAInstKind::Accumulator);
9602 }
9603 break;
9604
9605 case AArch64MachineCombinerPattern::FMULSUBH_OP1:
9606 Opc = AArch64::FNMSUBHrrr;
9607 RC = &AArch64::FPR16RegClass;
9608 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9609 break;
9610 case AArch64MachineCombinerPattern::FMULSUBS_OP1:
9611 Opc = AArch64::FNMSUBSrrr;
9612 RC = &AArch64::FPR32RegClass;
9613 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9614 break;
9615 case AArch64MachineCombinerPattern::FMULSUBD_OP1:
9616 Opc = AArch64::FNMSUBDrrr;
9617 RC = &AArch64::FPR64RegClass;
9618 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9619 break;
9620
9621 case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
9622 Opc = AArch64::FNMADDHrrr;
9623 RC = &AArch64::FPR16RegClass;
9624 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9625 break;
9626 case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
9627 Opc = AArch64::FNMADDSrrr;
9628 RC = &AArch64::FPR32RegClass;
9629 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9630 break;
9631 case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
9632 Opc = AArch64::FNMADDDrrr;
9633 RC = &AArch64::FPR64RegClass;
9634 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
9635 break;
9636
9637 case AArch64MachineCombinerPattern::FMULSUBH_OP2:
9638 Opc = AArch64::FMSUBHrrr;
9639 RC = &AArch64::FPR16RegClass;
9640 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9641 break;
9642 case AArch64MachineCombinerPattern::FMULSUBS_OP2:
9643 Opc = AArch64::FMSUBSrrr;
9644 RC = &AArch64::FPR32RegClass;
9645 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9646 break;
9647 case AArch64MachineCombinerPattern::FMULSUBD_OP2:
9648 Opc = AArch64::FMSUBDrrr;
9649 RC = &AArch64::FPR64RegClass;
9650 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
9651 break;
9652
9653 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
9654 Opc = AArch64::FMLSv1i32_indexed;
9655 RC = &AArch64::FPR32RegClass;
9656 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9657 kind: FMAInstKind::Indexed);
9658 break;
9659
9660 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
9661 Opc = AArch64::FMLSv1i64_indexed;
9662 RC = &AArch64::FPR64RegClass;
9663 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9664 kind: FMAInstKind::Indexed);
9665 break;
9666
9667 case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
9668 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
9669 RC = &AArch64::FPR64RegClass;
9670 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
9671 MachineInstrBuilder MIB1 =
9672 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv4f16), DestReg: NewVR)
9673 .add(MO: Root.getOperand(i: 2));
9674 InsInstrs.push_back(Elt: MIB1);
9675 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9676 if (Pattern == AArch64MachineCombinerPattern::FMLSv4f16_OP1) {
9677 Opc = AArch64::FMLAv4f16;
9678 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9679 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
9680 } else {
9681 Opc = AArch64::FMLAv4i16_indexed;
9682 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9683 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
9684 }
9685 break;
9686 }
9687 case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
9688 RC = &AArch64::FPR64RegClass;
9689 Opc = AArch64::FMLSv4f16;
9690 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9691 kind: FMAInstKind::Accumulator);
9692 break;
9693 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
9694 RC = &AArch64::FPR64RegClass;
9695 Opc = AArch64::FMLSv4i16_indexed;
9696 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9697 kind: FMAInstKind::Indexed);
9698 break;
9699
9700 case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
9701 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
9702 RC = &AArch64::FPR64RegClass;
9703 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
9704 Opc = AArch64::FMLSv2i32_indexed;
9705 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9706 kind: FMAInstKind::Indexed);
9707 } else {
9708 Opc = AArch64::FMLSv2f32;
9709 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9710 kind: FMAInstKind::Accumulator);
9711 }
9712 break;
9713
9714 case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
9715 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
9716 RC = &AArch64::FPR128RegClass;
9717 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
9718 MachineInstrBuilder MIB1 =
9719 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv8f16), DestReg: NewVR)
9720 .add(MO: Root.getOperand(i: 2));
9721 InsInstrs.push_back(Elt: MIB1);
9722 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9723 if (Pattern == AArch64MachineCombinerPattern::FMLSv8f16_OP1) {
9724 Opc = AArch64::FMLAv8f16;
9725 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9726 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
9727 } else {
9728 Opc = AArch64::FMLAv8i16_indexed;
9729 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9730 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
9731 }
9732 break;
9733 }
9734 case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
9735 RC = &AArch64::FPR128RegClass;
9736 Opc = AArch64::FMLSv8f16;
9737 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9738 kind: FMAInstKind::Accumulator);
9739 break;
9740 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
9741 RC = &AArch64::FPR128RegClass;
9742 Opc = AArch64::FMLSv8i16_indexed;
9743 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9744 kind: FMAInstKind::Indexed);
9745 break;
9746
9747 case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
9748 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
9749 RC = &AArch64::FPR128RegClass;
9750 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
9751 Opc = AArch64::FMLSv2i64_indexed;
9752 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9753 kind: FMAInstKind::Indexed);
9754 } else {
9755 Opc = AArch64::FMLSv2f64;
9756 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9757 kind: FMAInstKind::Accumulator);
9758 }
9759 break;
9760
9761 case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
9762 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
9763 RC = &AArch64::FPR128RegClass;
9764 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
9765 Opc = AArch64::FMLSv4i32_indexed;
9766 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9767 kind: FMAInstKind::Indexed);
9768 } else {
9769 Opc = AArch64::FMLSv4f32;
9770 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
9771 kind: FMAInstKind::Accumulator);
9772 }
9773 break;
9774 case AArch64MachineCombinerPattern::FMLSv2f32_OP1:
9775 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
9776 RC = &AArch64::FPR64RegClass;
9777 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
9778 MachineInstrBuilder MIB1 =
9779 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv2f32), DestReg: NewVR)
9780 .add(MO: Root.getOperand(i: 2));
9781 InsInstrs.push_back(Elt: MIB1);
9782 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9783 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
9784 Opc = AArch64::FMLAv2i32_indexed;
9785 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9786 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
9787 } else {
9788 Opc = AArch64::FMLAv2f32;
9789 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9790 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
9791 }
9792 break;
9793 }
9794 case AArch64MachineCombinerPattern::FMLSv4f32_OP1:
9795 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
9796 RC = &AArch64::FPR128RegClass;
9797 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
9798 MachineInstrBuilder MIB1 =
9799 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv4f32), DestReg: NewVR)
9800 .add(MO: Root.getOperand(i: 2));
9801 InsInstrs.push_back(Elt: MIB1);
9802 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9803 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
9804 Opc = AArch64::FMLAv4i32_indexed;
9805 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9806 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
9807 } else {
9808 Opc = AArch64::FMLAv4f32;
9809 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9810 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
9811 }
9812 break;
9813 }
9814 case AArch64MachineCombinerPattern::FMLSv2f64_OP1:
9815 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
9816 RC = &AArch64::FPR128RegClass;
9817 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
9818 MachineInstrBuilder MIB1 =
9819 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: AArch64::FNEGv2f64), DestReg: NewVR)
9820 .add(MO: Root.getOperand(i: 2));
9821 InsInstrs.push_back(Elt: MIB1);
9822 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
9823 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
9824 Opc = AArch64::FMLAv2i64_indexed;
9825 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9826 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
9827 } else {
9828 Opc = AArch64::FMLAv2f64;
9829 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
9830 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
9831 }
9832 break;
9833 }
9834 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
9835 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2: {
9836 unsigned IdxDupOp =
9837 (Pattern == AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1
9838 : 2;
9839 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv2i32_indexed,
9840 RC: &AArch64::FPR128RegClass, MRI);
9841 break;
9842 }
9843 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
9844 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2: {
9845 unsigned IdxDupOp =
9846 (Pattern == AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1
9847 : 2;
9848 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv2i64_indexed,
9849 RC: &AArch64::FPR128RegClass, MRI);
9850 break;
9851 }
9852 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
9853 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2: {
9854 unsigned IdxDupOp =
9855 (Pattern == AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1
9856 : 2;
9857 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv4i16_indexed,
9858 RC: &AArch64::FPR128_loRegClass, MRI);
9859 break;
9860 }
9861 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
9862 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2: {
9863 unsigned IdxDupOp =
9864 (Pattern == AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1
9865 : 2;
9866 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv4i32_indexed,
9867 RC: &AArch64::FPR128RegClass, MRI);
9868 break;
9869 }
9870 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
9871 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2: {
9872 unsigned IdxDupOp =
9873 (Pattern == AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1
9874 : 2;
9875 genIndexedMultiply(Root, InsInstrs, IdxDupOp, MulOpc: AArch64::FMULv8i16_indexed,
9876 RC: &AArch64::FPR128_loRegClass, MRI);
9877 break;
9878 }
9879 case AArch64MachineCombinerPattern::FNMADD: {
9880 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
9881 break;
9882 }
9883 case AArch64MachineCombinerPattern::GATHER_LANE_i32: {
9884 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9885 Pattern, NumLanes: 4);
9886 break;
9887 }
9888 case AArch64MachineCombinerPattern::GATHER_LANE_i16: {
9889 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9890 Pattern, NumLanes: 8);
9891 break;
9892 }
9893 case AArch64MachineCombinerPattern::GATHER_LANE_i8: {
9894 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9895 Pattern, NumLanes: 16);
9896 break;
9897 }
9898
9899 } // end switch (Pattern)
9900 // Record MUL and ADD/SUB for deletion
9901 if (MUL)
9902 DelInstrs.push_back(Elt: MUL);
9903 DelInstrs.push_back(Elt: &Root);
9904
9905 // Set the flags on the inserted instructions to be the merged flags of the
9906 // instructions that we have combined.
9907 uint32_t Flags = Root.getFlags();
9908 if (MUL)
9909 Flags = Root.mergeFlagsWith(Other: *MUL);
9910 for (auto *MI : InsInstrs)
9911 MI->setFlags(Flags);
9912}
9913
9914/// Replace csincr-branch sequence by simple conditional branch
9915///
9916/// Examples:
9917/// 1. \code
9918/// csinc w9, wzr, wzr, <condition code>
9919/// tbnz w9, #0, 0x44
9920/// \endcode
9921/// to
9922/// \code
9923/// b.<inverted condition code>
9924/// \endcode
9925///
9926/// 2. \code
9927/// csinc w9, wzr, wzr, <condition code>
9928/// tbz w9, #0, 0x44
9929/// \endcode
9930/// to
9931/// \code
9932/// b.<condition code>
9933/// \endcode
9934///
9935/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
9936/// compare's constant operand is power of 2.
9937///
9938/// Examples:
9939/// \code
9940/// and w8, w8, #0x400
9941/// cbnz w8, L1
9942/// \endcode
9943/// to
9944/// \code
9945/// tbnz w8, #10, L1
9946/// \endcode
9947///
9948/// \param MI Conditional Branch
9949/// \return True when the simple conditional branch is generated
9950///
9951bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
9952 bool IsNegativeBranch = false;
9953 bool IsTestAndBranch = false;
9954 unsigned TargetBBInMI = 0;
9955 switch (MI.getOpcode()) {
9956 default:
9957 llvm_unreachable("Unknown branch instruction?");
9958 case AArch64::Bcc:
9959 case AArch64::CBWPri:
9960 case AArch64::CBXPri:
9961 case AArch64::CBBAssertExt:
9962 case AArch64::CBHAssertExt:
9963 case AArch64::CBWPrr:
9964 case AArch64::CBXPrr:
9965 return false;
9966 case AArch64::CBZW:
9967 case AArch64::CBZX:
9968 TargetBBInMI = 1;
9969 break;
9970 case AArch64::CBNZW:
9971 case AArch64::CBNZX:
9972 TargetBBInMI = 1;
9973 IsNegativeBranch = true;
9974 break;
9975 case AArch64::TBZW:
9976 case AArch64::TBZX:
9977 TargetBBInMI = 2;
9978 IsTestAndBranch = true;
9979 break;
9980 case AArch64::TBNZW:
9981 case AArch64::TBNZX:
9982 TargetBBInMI = 2;
9983 IsNegativeBranch = true;
9984 IsTestAndBranch = true;
9985 break;
9986 }
9987 // So we increment a zero register and test for bits other
9988 // than bit 0? Conservatively bail out in case the verifier
9989 // missed this case.
9990 if (IsTestAndBranch && MI.getOperand(i: 1).getImm())
9991 return false;
9992
9993 // Find Definition.
9994 assert(MI.getParent() && "Incomplete machine instruction\n");
9995 MachineBasicBlock *MBB = MI.getParent();
9996 MachineFunction *MF = MBB->getParent();
9997 MachineRegisterInfo *MRI = &MF->getRegInfo();
9998 Register VReg = MI.getOperand(i: 0).getReg();
9999 if (!VReg.isVirtual())
10000 return false;
10001
10002 MachineInstr *DefMI = MRI->getVRegDef(Reg: VReg);
10003
10004 // Look through COPY instructions to find definition.
10005 while (DefMI->isCopy()) {
10006 Register CopyVReg = DefMI->getOperand(i: 1).getReg();
10007 if (!MRI->hasOneNonDBGUse(RegNo: CopyVReg))
10008 return false;
10009 if (!MRI->hasOneDef(RegNo: CopyVReg))
10010 return false;
10011 DefMI = MRI->getVRegDef(Reg: CopyVReg);
10012 }
10013
10014 switch (DefMI->getOpcode()) {
10015 default:
10016 return false;
10017 // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
10018 case AArch64::ANDWri:
10019 case AArch64::ANDXri: {
10020 if (IsTestAndBranch)
10021 return false;
10022 if (DefMI->getParent() != MBB)
10023 return false;
10024 if (!MRI->hasOneNonDBGUse(RegNo: VReg))
10025 return false;
10026
10027 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
10028 uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
10029 val: DefMI->getOperand(i: 2).getImm(), regSize: Is32Bit ? 32 : 64);
10030 if (!isPowerOf2_64(Value: Mask))
10031 return false;
10032
10033 MachineOperand &MO = DefMI->getOperand(i: 1);
10034 Register NewReg = MO.getReg();
10035 if (!NewReg.isVirtual())
10036 return false;
10037
10038 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
10039
10040 MachineBasicBlock &RefToMBB = *MBB;
10041 MachineBasicBlock *TBB = MI.getOperand(i: 1).getMBB();
10042 DebugLoc DL = MI.getDebugLoc();
10043 unsigned Imm = Log2_64(Value: Mask);
10044 unsigned Opc = (Imm < 32)
10045 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
10046 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
10047 MachineInstr *NewMI = BuildMI(BB&: RefToMBB, I&: MI, MIMD: DL, MCID: get(Opcode: Opc))
10048 .addReg(RegNo: NewReg)
10049 .addImm(Val: Imm)
10050 .addMBB(MBB: TBB);
10051 // Register lives on to the CBZ now.
10052 MO.setIsKill(false);
10053
10054 // For immediate smaller than 32, we need to use the 32-bit
10055 // variant (W) in all cases. Indeed the 64-bit variant does not
10056 // allow to encode them.
10057 // Therefore, if the input register is 64-bit, we need to take the
10058 // 32-bit sub-part.
10059 if (!Is32Bit && Imm < 32)
10060 NewMI->getOperand(i: 0).setSubReg(AArch64::sub_32);
10061 MI.eraseFromParent();
10062 return true;
10063 }
10064 // Look for CSINC
10065 case AArch64::CSINCWr:
10066 case AArch64::CSINCXr: {
10067 if (!(DefMI->getOperand(i: 1).getReg() == AArch64::WZR &&
10068 DefMI->getOperand(i: 2).getReg() == AArch64::WZR) &&
10069 !(DefMI->getOperand(i: 1).getReg() == AArch64::XZR &&
10070 DefMI->getOperand(i: 2).getReg() == AArch64::XZR))
10071 return false;
10072
10073 if (DefMI->findRegisterDefOperandIdx(Reg: AArch64::NZCV, /*TRI=*/nullptr,
10074 isDead: true) != -1)
10075 return false;
10076
10077 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(i: 3).getImm();
10078 // Convert only when the condition code is not modified between
10079 // the CSINC and the branch. The CC may be used by other
10080 // instructions in between.
10081 if (areCFlagsAccessedBetweenInstrs(From: DefMI, To: MI, TRI: &getRegisterInfo(), AccessToCheck: AK_Write))
10082 return false;
10083 MachineBasicBlock &RefToMBB = *MBB;
10084 MachineBasicBlock *TBB = MI.getOperand(i: TargetBBInMI).getMBB();
10085 DebugLoc DL = MI.getDebugLoc();
10086 if (IsNegativeBranch)
10087 CC = AArch64CC::getInvertedCondCode(Code: CC);
10088 BuildMI(BB&: RefToMBB, I&: MI, MIMD: DL, MCID: get(Opcode: AArch64::Bcc)).addImm(Val: CC).addMBB(MBB: TBB);
10089 MI.eraseFromParent();
10090 return true;
10091 }
10092 }
10093}
10094
10095std::pair<unsigned, unsigned>
10096AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
10097 const unsigned Mask = AArch64II::MO_FRAGMENT;
10098 return std::make_pair(x: TF & Mask, y: TF & ~Mask);
10099}
10100
10101ArrayRef<std::pair<unsigned, const char *>>
10102AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
10103 using namespace AArch64II;
10104
10105 static const std::pair<unsigned, const char *> TargetFlags[] = {
10106 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
10107 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
10108 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
10109 {MO_HI12, "aarch64-hi12"}};
10110 return ArrayRef(TargetFlags);
10111}
10112
10113ArrayRef<std::pair<unsigned, const char *>>
10114AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
10115 using namespace AArch64II;
10116
10117 static const std::pair<unsigned, const char *> TargetFlags[] = {
10118 {MO_COFFSTUB, "aarch64-coffstub"},
10119 {MO_GOT, "aarch64-got"},
10120 {MO_NC, "aarch64-nc"},
10121 {MO_S, "aarch64-s"},
10122 {MO_TLS, "aarch64-tls"},
10123 {MO_DLLIMPORT, "aarch64-dllimport"},
10124 {MO_PREL, "aarch64-prel"},
10125 {MO_TAGGED, "aarch64-tagged"},
10126 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
10127 };
10128 return ArrayRef(TargetFlags);
10129}
10130
10131ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
10132AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
10133 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
10134 {{MOSuppressPair, "aarch64-suppress-pair"},
10135 {MOStridedAccess, "aarch64-strided-access"}};
10136 return ArrayRef(TargetFlags);
10137}
10138
/// Constants defining how certain sequences should be outlined.
/// This encompasses how an outlined function should be called, and what kind of
/// frame should be emitted for that outlined function.
///
/// \p MachineOutlinerDefault implies that the function should be called with
/// a save and restore of LR to the stack.
///
/// That is,
///
/// I1     Save LR                    OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3     Restore LR                 I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? Yes
///
/// \p MachineOutlinerTailCall implies that the function is being created from
/// a sequence of instructions ending in a return.
///
/// That is,
///
/// I1                                OUTLINED_FUNCTION:
/// I2 --> B OUTLINED_FUNCTION        I1
/// RET                               I2
///                                   RET
///
/// * Call construction overhead: 1 (B)
/// * Frame construction overhead: 0 (Return included in sequence)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerNoLRSave implies that the function should be called using
/// a BL instruction, but doesn't require LR to be saved and restored. This
/// happens when LR is known to be dead.
///
/// That is,
///
/// I1                                OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3                                I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 1 (RET)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerThunk implies that the function is being created from
/// a sequence of instructions ending in a call. The outlined function is
/// called with a BL instruction, and the outlined function tail-calls the
/// original call destination.
///
/// That is,
///
/// I1                                OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// BL f                              I2
///                                   B f
///
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 0
/// * Requires stack fixups? No
///
/// \p MachineOutlinerRegSave implies that the function should be called with a
/// save and restore of LR to an available register. This allows us to avoid
/// stack fixups. Note that this outlining variant is compatible with the
/// NoLRSave case.
///
/// That is,
///
/// I1     Save LR                    OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3     Restore LR                 I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? No
enum MachineOutlinerClass {
  /// Emit a save, restore, call, and return.
  MachineOutlinerDefault,
  /// Only emit a branch.
  MachineOutlinerTailCall,
  /// Emit a call and return.
  MachineOutlinerNoLRSave,
  /// Emit a call and tail-call.
  MachineOutlinerThunk,
  /// Same as default, but save to a register.
  MachineOutlinerRegSave
};
10226
/// Per-basic-block bit flags used by the outliner support code. Each value
/// is a distinct single bit so the flags can be OR-ed together; bit 0 is not
/// used.
/// NOTE(review): the meaning of each flag is inferred from its name only;
/// confirm against the code that sets and tests them.
enum MachineOutlinerMBBFlags {
  LRUnavailableSomewhere = 1 << 1,
  HasCalls = 1 << 2,
  UnsafeRegsDead = 1 << 3
};
10232
10233Register
10234AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
10235 MachineFunction *MF = C.getMF();
10236 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
10237 const AArch64RegisterInfo *ARI =
10238 static_cast<const AArch64RegisterInfo *>(&TRI);
10239 // Check if there is an available register across the sequence that we can
10240 // use.
10241 for (unsigned Reg : AArch64::GPR64RegClass) {
10242 if (!ARI->isReservedReg(MF: *MF, Reg) &&
10243 Reg != AArch64::LR && // LR is not reserved, but don't use it.
10244 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
10245 Reg != AArch64::X17 && // Ditto for X17.
10246 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
10247 C.isAvailableInsideSeq(Reg, TRI))
10248 return Reg;
10249 }
10250 return Register();
10251}
10252
10253static bool
10254outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
10255 const outliner::Candidate &b) {
10256 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
10257 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
10258
10259 return MFIa->getSignReturnAddressCondition() ==
10260 MFIb->getSignReturnAddressCondition();
10261}
10262
10263static bool
10264outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
10265 const outliner::Candidate &b) {
10266 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
10267 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
10268
10269 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
10270}
10271
10272static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
10273 const outliner::Candidate &b) {
10274 const AArch64Subtarget &SubtargetA =
10275 a.getMF()->getSubtarget<AArch64Subtarget>();
10276 const AArch64Subtarget &SubtargetB =
10277 b.getMF()->getSubtarget<AArch64Subtarget>();
10278 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
10279}
10280
10281std::optional<std::unique_ptr<outliner::OutlinedFunction>>
10282AArch64InstrInfo::getOutliningCandidateInfo(
10283 const MachineModuleInfo &MMI,
10284 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
10285 unsigned MinRepeats) const {
10286 unsigned SequenceSize = 0;
10287 for (auto &MI : RepeatedSequenceLocs[0])
10288 SequenceSize += getInstSizeInBytes(MI);
10289
10290 unsigned NumBytesToCreateFrame = 0;
10291
10292 // Avoid splitting ADRP ADD/LDR pair into outlined functions.
10293 // These instructions are fused together by the scheduler.
10294 // Any candidate where ADRP is the last instruction should be rejected
10295 // as that will lead to splitting ADRP pair.
10296 MachineInstr &LastMI = RepeatedSequenceLocs[0].back();
10297 MachineInstr &FirstMI = RepeatedSequenceLocs[0].front();
10298 if (LastMI.getOpcode() == AArch64::ADRP &&
10299 (LastMI.getOperand(i: 1).getTargetFlags() & AArch64II::MO_PAGE) != 0 &&
10300 (LastMI.getOperand(i: 1).getTargetFlags() & AArch64II::MO_GOT) != 0) {
10301 return std::nullopt;
10302 }
10303
10304 // Similarly any candidate where the first instruction is ADD/LDR with a
10305 // page offset should be rejected to avoid ADRP splitting.
10306 if ((FirstMI.getOpcode() == AArch64::ADDXri ||
10307 FirstMI.getOpcode() == AArch64::LDRXui) &&
10308 (FirstMI.getOperand(i: 2).getTargetFlags() & AArch64II::MO_PAGEOFF) != 0 &&
10309 (FirstMI.getOperand(i: 2).getTargetFlags() & AArch64II::MO_GOT) != 0) {
10310 return std::nullopt;
10311 }
10312
10313 // We only allow outlining for functions having exactly matching return
10314 // address signing attributes, i.e., all share the same value for the
10315 // attribute "sign-return-address" and all share the same type of key they
10316 // are signed with.
10317 // Additionally we require all functions to simultaneously either support
10318 // v8.3a features or not. Otherwise an outlined function could get signed
10319 // using dedicated v8.3 instructions and a call from a function that doesn't
10320 // support v8.3 instructions would therefore be invalid.
10321 if (std::adjacent_find(
10322 first: RepeatedSequenceLocs.begin(), last: RepeatedSequenceLocs.end(),
10323 binary_pred: [](const outliner::Candidate &a, const outliner::Candidate &b) {
10324 // Return true if a and b are non-equal w.r.t. return address
10325 // signing or support of v8.3a features
10326 if (outliningCandidatesSigningScopeConsensus(a, b) &&
10327 outliningCandidatesSigningKeyConsensus(a, b) &&
10328 outliningCandidatesV8_3OpsConsensus(a, b)) {
10329 return false;
10330 }
10331 return true;
10332 }) != RepeatedSequenceLocs.end()) {
10333 return std::nullopt;
10334 }
10335
10336 // Since at this point all candidates agree on their return address signing
10337 // picking just one is fine. If the candidate functions potentially sign their
10338 // return addresses, the outlined function should do the same. Note that in
10339 // the case of "sign-return-address"="non-leaf" this is an assumption: It is
10340 // not certainly true that the outlined function will have to sign its return
10341 // address but this decision is made later, when the decision to outline
10342 // has already been made.
10343 // The same holds for the number of additional instructions we need: On
10344 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
10345 // necessary. However, at this point we don't know if the outlined function
10346 // will have a RET instruction so we assume the worst.
10347 const TargetRegisterInfo &TRI = getRegisterInfo();
10348 // Performing a tail call may require extra checks when PAuth is enabled.
10349 // If PAuth is disabled, set it to zero for uniformity.
10350 unsigned NumBytesToCheckLRInTCEpilogue = 0;
10351 const auto RASignCondition = RepeatedSequenceLocs[0]
10352 .getMF()
10353 ->getInfo<AArch64FunctionInfo>()
10354 ->getSignReturnAddressCondition();
10355 if (RASignCondition != SignReturnAddress::None) {
10356 // One PAC and one AUT instructions
10357 NumBytesToCreateFrame += 8;
10358
10359 // PAuth is enabled - set extra tail call cost, if any.
10360 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
10361 MF: *RepeatedSequenceLocs[0].getMF());
10362 NumBytesToCheckLRInTCEpilogue =
10363 AArch64PAuth::getCheckerSizeInBytes(Method: LRCheckMethod);
10364 // Checking the authenticated LR value may significantly impact
10365 // SequenceSize, so account for it for more precise results.
10366 if (isTailCallReturnInst(MI: RepeatedSequenceLocs[0].back()))
10367 SequenceSize += NumBytesToCheckLRInTCEpilogue;
10368
10369 // We have to check if sp modifying instructions would get outlined.
10370 // If so we only allow outlining if sp is unchanged overall, so matching
10371 // sub and add instructions are okay to outline, all other sp modifications
10372 // are not
10373 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
10374 int SPValue = 0;
10375 for (auto &MI : C) {
10376 if (MI.modifiesRegister(Reg: AArch64::SP, TRI: &TRI)) {
10377 switch (MI.getOpcode()) {
10378 case AArch64::ADDXri:
10379 case AArch64::ADDWri:
10380 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10381 assert(MI.getOperand(2).isImm() &&
10382 "Expected operand to be immediate");
10383 assert(MI.getOperand(1).isReg() &&
10384 "Expected operand to be a register");
10385 // Check if the add just increments sp. If so, we search for
10386 // matching sub instructions that decrement sp. If not, the
10387 // modification is illegal
10388 if (MI.getOperand(i: 1).getReg() == AArch64::SP)
10389 SPValue += MI.getOperand(i: 2).getImm();
10390 else
10391 return true;
10392 break;
10393 case AArch64::SUBXri:
10394 case AArch64::SUBWri:
10395 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10396 assert(MI.getOperand(2).isImm() &&
10397 "Expected operand to be immediate");
10398 assert(MI.getOperand(1).isReg() &&
10399 "Expected operand to be a register");
10400 // Check if the sub just decrements sp. If so, we search for
10401 // matching add instructions that increment sp. If not, the
10402 // modification is illegal
10403 if (MI.getOperand(i: 1).getReg() == AArch64::SP)
10404 SPValue -= MI.getOperand(i: 2).getImm();
10405 else
10406 return true;
10407 break;
10408 default:
10409 return true;
10410 }
10411 }
10412 }
10413 if (SPValue)
10414 return true;
10415 return false;
10416 };
10417 // Remove candidates with illegal stack modifying instructions
10418 llvm::erase_if(C&: RepeatedSequenceLocs, P: hasIllegalSPModification);
10419
10420 // If the sequence doesn't have enough candidates left, then we're done.
10421 if (RepeatedSequenceLocs.size() < MinRepeats)
10422 return std::nullopt;
10423 }
10424
10425 // Properties about candidate MBBs that hold for all of them.
10426 unsigned FlagsSetInAll = 0xF;
10427
10428 // Compute liveness information for each candidate, and set FlagsSetInAll.
10429 for (outliner::Candidate &C : RepeatedSequenceLocs)
10430 FlagsSetInAll &= C.Flags;
10431
10432 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
10433
10434 // Helper lambda which sets call information for every candidate.
10435 auto SetCandidateCallInfo =
10436 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
10437 for (outliner::Candidate &C : RepeatedSequenceLocs)
10438 C.setCallInfo(CID: CallID, CO: NumBytesForCall);
10439 };
10440
10441 unsigned FrameID = MachineOutlinerDefault;
10442 NumBytesToCreateFrame += 4;
10443
10444 bool HasBTI = any_of(Range&: RepeatedSequenceLocs, P: [](outliner::Candidate &C) {
10445 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
10446 });
10447
10448 // We check to see if CFI Instructions are present, and if they are
10449 // we find the number of CFI Instructions in the candidates.
10450 unsigned CFICount = 0;
10451 for (auto &I : RepeatedSequenceLocs[0]) {
10452 if (I.isCFIInstruction())
10453 CFICount++;
10454 }
10455
10456 // We compare the number of found CFI Instructions to the number of CFI
10457 // instructions in the parent function for each candidate. We must check this
10458 // since if we outline one of the CFI instructions in a function, we have to
10459 // outline them all for correctness. If we do not, the address offsets will be
10460 // incorrect between the two sections of the program.
10461 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10462 std::vector<MCCFIInstruction> CFIInstructions =
10463 C.getMF()->getFrameInstructions();
10464
10465 if (CFICount > 0 && CFICount != CFIInstructions.size())
10466 return std::nullopt;
10467 }
10468
10469 // Returns true if an instructions is safe to fix up, false otherwise.
10470 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
10471 if (MI.isCall())
10472 return true;
10473
10474 if (!MI.modifiesRegister(Reg: AArch64::SP, TRI: &TRI) &&
10475 !MI.readsRegister(Reg: AArch64::SP, TRI: &TRI))
10476 return true;
10477
10478 // Any modification of SP will break our code to save/restore LR.
10479 // FIXME: We could handle some instructions which add a constant
10480 // offset to SP, with a bit more work.
10481 if (MI.modifiesRegister(Reg: AArch64::SP, TRI: &TRI))
10482 return false;
10483
10484 // At this point, we have a stack instruction that we might need to
10485 // fix up. We'll handle it if it's a load or store.
10486 if (MI.mayLoadOrStore()) {
10487 const MachineOperand *Base; // Filled with the base operand of MI.
10488 int64_t Offset; // Filled with the offset of MI.
10489 bool OffsetIsScalable;
10490
10491 // Does it allow us to offset the base operand and is the base the
10492 // register SP?
10493 if (!getMemOperandWithOffset(MI, BaseOp&: Base, Offset, OffsetIsScalable, TRI: &TRI) ||
10494 !Base->isReg() || Base->getReg() != AArch64::SP)
10495 return false;
10496
10497 // Fixe-up code below assumes bytes.
10498 if (OffsetIsScalable)
10499 return false;
10500
10501 // Find the minimum/maximum offset for this instruction and check
10502 // if fixing it up would be in range.
10503 int64_t MinOffset,
10504 MaxOffset; // Unscaled offsets for the instruction.
10505 // The scale to multiply the offsets by.
10506 TypeSize Scale(0U, false), DummyWidth(0U, false);
10507 getMemOpInfo(Opcode: MI.getOpcode(), Scale, Width&: DummyWidth, MinOffset, MaxOffset);
10508
10509 Offset += 16; // Update the offset to what it would be if we outlined.
10510 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
10511 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
10512 return false;
10513
10514 // It's in range, so we can outline it.
10515 return true;
10516 }
10517
10518 // FIXME: Add handling for instructions like "add x0, sp, #8".
10519
10520 // We can't fix it up, so don't outline it.
10521 return false;
10522 };
10523
10524 // True if it's possible to fix up each stack instruction in this sequence.
10525 // Important for frames/call variants that modify the stack.
10526 bool AllStackInstrsSafe =
10527 llvm::all_of(Range&: RepeatedSequenceLocs[0], P: IsSafeToFixup);
10528
10529 // If the last instruction in any candidate is a terminator, then we should
10530 // tail call all of the candidates.
10531 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10532 FrameID = MachineOutlinerTailCall;
10533 NumBytesToCreateFrame = 0;
10534 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
10535 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
10536 }
10537
10538 else if (LastInstrOpcode == AArch64::BL ||
10539 ((LastInstrOpcode == AArch64::BLR ||
10540 LastInstrOpcode == AArch64::BLRNoIP) &&
10541 !HasBTI)) {
10542 // FIXME: Do we need to check if the code after this uses the value of LR?
10543 FrameID = MachineOutlinerThunk;
10544 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
10545 SetCandidateCallInfo(MachineOutlinerThunk, 4);
10546 }
10547
10548 else {
10549 // We need to decide how to emit calls + frames. We can always emit the same
10550 // frame if we don't need to save to the stack. If we have to save to the
10551 // stack, then we need a different frame.
10552 unsigned NumBytesNoStackCalls = 0;
10553 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
10554
10555 // Check if we have to save LR.
10556 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10557 bool LRAvailable =
10558 (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere)
10559 ? C.isAvailableAcrossAndOutOfSeq(Reg: AArch64::LR, TRI)
10560 : true;
10561 // If we have a noreturn caller, then we're going to be conservative and
10562 // say that we have to save LR. If we don't have a ret at the end of the
10563 // block, then we can't reason about liveness accurately.
10564 //
10565 // FIXME: We can probably do better than always disabling this in
10566 // noreturn functions by fixing up the liveness info.
10567 bool IsNoReturn =
10568 C.getMF()->getFunction().hasFnAttribute(Kind: Attribute::NoReturn);
10569
10570 // Is LR available? If so, we don't need a save.
10571 if (LRAvailable && !IsNoReturn) {
10572 NumBytesNoStackCalls += 4;
10573 C.setCallInfo(CID: MachineOutlinerNoLRSave, CO: 4);
10574 CandidatesWithoutStackFixups.push_back(x: C);
10575 }
10576
10577 // Is an unused register available? If so, we won't modify the stack, so
10578 // we can outline with the same frame type as those that don't save LR.
10579 else if (findRegisterToSaveLRTo(C)) {
10580 NumBytesNoStackCalls += 12;
10581 C.setCallInfo(CID: MachineOutlinerRegSave, CO: 12);
10582 CandidatesWithoutStackFixups.push_back(x: C);
10583 }
10584
10585 // Is SP used in the sequence at all? If not, we don't have to modify
10586 // the stack, so we are guaranteed to get the same frame.
10587 else if (C.isAvailableInsideSeq(Reg: AArch64::SP, TRI)) {
10588 NumBytesNoStackCalls += 12;
10589 C.setCallInfo(CID: MachineOutlinerDefault, CO: 12);
10590 CandidatesWithoutStackFixups.push_back(x: C);
10591 }
10592
10593 // If we outline this, we need to modify the stack. Pretend we don't
10594 // outline this by saving all of its bytes.
10595 else {
10596 NumBytesNoStackCalls += SequenceSize;
10597 }
10598 }
10599
10600 // If there are no places where we have to save LR, then note that we
10601 // don't have to update the stack. Otherwise, give every candidate the
10602 // default call type, as long as it's safe to do so.
10603 if (!AllStackInstrsSafe ||
10604 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
10605 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
10606 FrameID = MachineOutlinerNoLRSave;
10607 if (RepeatedSequenceLocs.size() < MinRepeats)
10608 return std::nullopt;
10609 } else {
10610 SetCandidateCallInfo(MachineOutlinerDefault, 12);
10611
10612 // Bugzilla ID: 46767
10613 // TODO: Check if fixing up the stack more than once is safe so we can
10614 // outline these.
10615 //
10616 // An outline resulting in a caller that requires stack fixups at the
10617 // callsite to a callee that also requires stack fixups can happen when
10618 // there are no available registers at the candidate callsite for a
10619 // candidate that itself also has calls.
10620 //
10621 // In other words if function_containing_sequence in the following pseudo
10622 // assembly requires that we save LR at the point of the call, but there
10623 // are no available registers: in this case we save using SP and as a
10624 // result the SP offsets requires stack fixups by multiples of 16.
10625 //
10626 // function_containing_sequence:
10627 // ...
10628 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10629 // call OUTLINED_FUNCTION_N
10630 // restore LR from SP
10631 // ...
10632 //
10633 // OUTLINED_FUNCTION_N:
10634 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10635 // ...
10636 // bl foo
10637 // restore LR from SP
10638 // ret
10639 //
10640 // Because the code to handle more than one stack fixup does not
10641 // currently have the proper checks for legality, these cases will assert
10642 // in the AArch64 MachineOutliner. This is because the code to do this
10643 // needs more hardening, testing, better checks that generated code is
10644 // legal, etc and because it is only verified to handle a single pass of
10645 // stack fixup.
10646 //
10647 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
10648 // these cases until they are known to be handled. Bugzilla 46767 is
10649 // referenced in comments at the assert site.
10650 //
10651 // To avoid asserting (or generating non-legal code on noassert builds)
10652 // we remove all candidates which would need more than one stack fixup by
10653 // pruning the cases where the candidate has calls while also having no
10654 // available LR and having no available general purpose registers to copy
10655 // LR to (ie one extra stack save/restore).
10656 //
10657 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10658 erase_if(C&: RepeatedSequenceLocs, P: [this, &TRI](outliner::Candidate &C) {
10659 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
10660 return (llvm::any_of(Range&: C, P: IsCall)) &&
10661 (!C.isAvailableAcrossAndOutOfSeq(Reg: AArch64::LR, TRI) ||
10662 !findRegisterToSaveLRTo(C));
10663 });
10664 }
10665 }
10666
10667 // If we dropped all of the candidates, bail out here.
10668 if (RepeatedSequenceLocs.size() < MinRepeats)
10669 return std::nullopt;
10670 }
10671
10672 // Does every candidate's MBB contain a call? If so, then we might have a call
10673 // in the range.
10674 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10675 // Check if the range contains a call. These require a save + restore of the
10676 // link register.
10677 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
10678 bool ModStackToSaveLR = false;
10679 if (any_of(Range: drop_end(RangeOrContainer&: FirstCand),
10680 P: [](const MachineInstr &MI) { return MI.isCall(); }))
10681 ModStackToSaveLR = true;
10682
10683 // Handle the last instruction separately. If this is a tail call, then the
10684 // last instruction is a call. We don't want to save + restore in this case.
10685 // However, it could be possible that the last instruction is a call without
10686 // it being valid to tail call this sequence. We should consider this as
10687 // well.
10688 else if (FrameID != MachineOutlinerThunk &&
10689 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
10690 ModStackToSaveLR = true;
10691
10692 if (ModStackToSaveLR) {
10693 // We can't fix up the stack. Bail out.
10694 if (!AllStackInstrsSafe)
10695 return std::nullopt;
10696
10697 // Save + restore LR.
10698 NumBytesToCreateFrame += 8;
10699 }
10700 }
10701
10702 // If we have CFI instructions, we can only outline if the outlined section
10703 // can be a tail call
10704 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
10705 return std::nullopt;
10706
10707 return std::make_unique<outliner::OutlinedFunction>(
10708 args&: RepeatedSequenceLocs, args&: SequenceSize, args&: NumBytesToCreateFrame, args&: FrameID);
10709}
10710
10711void AArch64InstrInfo::mergeOutliningCandidateAttributes(
10712 Function &F, std::vector<outliner::Candidate> &Candidates) const {
10713 // If a bunch of candidates reach this point they must agree on their return
10714 // address signing. It is therefore enough to just consider the signing
10715 // behaviour of one of them
10716 const auto &CFn = Candidates.front().getMF()->getFunction();
10717
10718 if (CFn.hasFnAttribute(Kind: "ptrauth-returns"))
10719 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "ptrauth-returns"));
10720 if (CFn.hasFnAttribute(Kind: "ptrauth-auth-traps"))
10721 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "ptrauth-auth-traps"));
10722 // Since all candidates belong to the same module, just copy the
10723 // function-level attributes of an arbitrary function.
10724 if (CFn.hasFnAttribute(Kind: "sign-return-address"))
10725 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "sign-return-address"));
10726 if (CFn.hasFnAttribute(Kind: "sign-return-address-key"))
10727 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "sign-return-address-key"));
10728
10729 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
10730}
10731
10732bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
10733 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
10734 const Function &F = MF.getFunction();
10735
10736 // Can F be deduplicated by the linker? If it can, don't outline from it.
10737 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10738 return false;
10739
10740 // Don't outline from functions with section markings; the program could
10741 // expect that all the code is in the named section.
10742 // FIXME: Allow outlining from multiple functions with the same section
10743 // marking.
10744 if (F.hasSection())
10745 return false;
10746
10747 // Outlining from functions with redzones is unsafe since the outliner may
10748 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
10749 // outline from it.
10750 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
10751 if (!AFI || AFI->hasRedZone().value_or(u: true))
10752 return false;
10753
10754 // FIXME: Determine whether it is safe to outline from functions which contain
10755 // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
10756 // outlined together and ensure it is safe to outline with async unwind info,
10757 // required for saving & restoring VG around calls.
10758 if (AFI->hasStreamingModeChanges())
10759 return false;
10760
10761 // FIXME: Teach the outliner to generate/handle Windows unwind info.
10762 if (MF.getTarget().getMCAsmInfo().usesWindowsCFI())
10763 return false;
10764
10765 // It's safe to outline from MF.
10766 return true;
10767}
10768
10769SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
10770AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
10771 unsigned &Flags) const {
10772 assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
10773 "Must track liveness!");
10774 SmallVector<
10775 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
10776 Ranges;
10777 // According to the AArch64 Procedure Call Standard, the following are
10778 // undefined on entry/exit from a function call:
10779 //
10780 // * Registers x16, x17, (and thus w16, w17)
10781 // * Condition codes (and thus the NZCV register)
10782 //
10783 // If any of these registers are used inside or live across an outlined
10784 // function, then they may be modified later, either by the compiler or
10785 // some other tool (like the linker).
10786 //
10787 // To avoid outlining in these situations, partition each block into ranges
10788 // where these registers are dead. We will only outline from those ranges.
10789 LiveRegUnits LRU(getRegisterInfo());
10790 auto AreAllUnsafeRegsDead = [&LRU]() {
10791 return LRU.available(Reg: AArch64::W16) && LRU.available(Reg: AArch64::W17) &&
10792 LRU.available(Reg: AArch64::NZCV);
10793 };
10794
10795 // We need to know if LR is live across an outlining boundary later on in
10796 // order to decide how we'll create the outlined call, frame, etc.
10797 //
10798 // It's pretty expensive to check this for *every candidate* within a block.
10799 // That's some potentially n^2 behaviour, since in the worst case, we'd need
10800 // to compute liveness from the end of the block for O(n) candidates within
10801 // the block.
10802 //
10803 // So, to improve the average case, let's keep track of liveness from the end
10804 // of the block to the beginning of *every outlinable range*. If we know that
10805 // LR is available in every range we could outline from, then we know that
10806 // we don't need to check liveness for any candidate within that range.
10807 bool LRAvailableEverywhere = true;
10808 // Compute liveness bottom-up.
10809 LRU.addLiveOuts(MBB);
10810 // Update flags that require info about the entire MBB.
10811 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
10812 if (MI.isCall() && !MI.isTerminator())
10813 Flags |= MachineOutlinerMBBFlags::HasCalls;
10814 };
10815 // Range: [RangeBegin, RangeEnd)
10816 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
10817 unsigned RangeLen;
10818 auto CreateNewRangeStartingAt =
10819 [&RangeBegin, &RangeEnd,
10820 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
10821 RangeBegin = NewBegin;
10822 RangeEnd = std::next(x: RangeBegin);
10823 RangeLen = 0;
10824 };
10825 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
10826 // At least one unsafe register is not dead. We do not want to outline at
10827 // this point. If it is long enough to outline from and does not cross a
10828 // bundle boundary, save the range [RangeBegin, RangeEnd).
10829 if (RangeLen <= 1)
10830 return;
10831 if (!RangeBegin.isEnd() && RangeBegin->isBundledWithPred())
10832 return;
10833 if (!RangeEnd.isEnd() && RangeEnd->isBundledWithPred())
10834 return;
10835 Ranges.emplace_back(Args&: RangeBegin, Args&: RangeEnd);
10836 };
10837 // Find the first point where all unsafe registers are dead.
10838 // FIND: <safe instr> <-- end of first potential range
10839 // SKIP: <unsafe def>
10840 // SKIP: ... everything between ...
10841 // SKIP: <unsafe use>
10842 auto FirstPossibleEndPt = MBB.instr_rbegin();
10843 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
10844 if (!FirstPossibleEndPt->isDebugInstr())
10845 LRU.stepBackward(MI: *FirstPossibleEndPt);
10846 // Update flags that impact how we outline across the entire block,
10847 // regardless of safety.
10848 UpdateWholeMBBFlags(*FirstPossibleEndPt);
10849 if (AreAllUnsafeRegsDead())
10850 break;
10851 }
10852 // If we exhausted the entire block, we have no safe ranges to outline.
10853 if (FirstPossibleEndPt == MBB.instr_rend())
10854 return Ranges;
10855 // Current range.
10856 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
10857 // StartPt points to the first place where all unsafe registers
10858 // are dead (if there is any such point). Begin partitioning the MBB into
10859 // ranges.
10860 for (auto &MI : make_range(x: FirstPossibleEndPt, y: MBB.instr_rend())) {
10861 if (!MI.isDebugInstr())
10862 LRU.stepBackward(MI);
10863 UpdateWholeMBBFlags(MI);
10864 if (!AreAllUnsafeRegsDead()) {
10865 SaveRangeIfNonEmpty();
10866 CreateNewRangeStartingAt(MI.getIterator());
10867 continue;
10868 }
10869 LRAvailableEverywhere &= LRU.available(Reg: AArch64::LR);
10870 RangeBegin = MI.getIterator();
10871 ++RangeLen;
10872 }
10873 // Above loop misses the last (or only) range. If we are still safe, then
10874 // let's save the range.
10875 if (AreAllUnsafeRegsDead())
10876 SaveRangeIfNonEmpty();
10877 if (Ranges.empty())
10878 return Ranges;
10879 // We found the ranges bottom-up. Mapping expects the top-down. Reverse
10880 // the order.
10881 std::reverse(first: Ranges.begin(), last: Ranges.end());
10882 // If there is at least one outlinable range where LR is unavailable
10883 // somewhere, remember that.
10884 if (!LRAvailableEverywhere)
10885 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
10886 return Ranges;
10887}
10888
10889outliner::InstrType
10890AArch64InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
10891 MachineBasicBlock::iterator &MIT,
10892 unsigned Flags) const {
10893 MachineInstr &MI = *MIT;
10894
10895 // Don't outline anything used for return address signing. The outlined
10896 // function will get signed later if needed
10897 switch (MI.getOpcode()) {
10898 case AArch64::PACM:
10899 case AArch64::PACIASP:
10900 case AArch64::PACIBSP:
10901 case AArch64::PACIASPPC:
10902 case AArch64::PACIBSPPC:
10903 case AArch64::AUTIASP:
10904 case AArch64::AUTIBSP:
10905 case AArch64::AUTIASPPCi:
10906 case AArch64::AUTIASPPCr:
10907 case AArch64::AUTIBSPPCi:
10908 case AArch64::AUTIBSPPCr:
10909 case AArch64::RETAA:
10910 case AArch64::RETAB:
10911 case AArch64::RETAASPPCi:
10912 case AArch64::RETAASPPCr:
10913 case AArch64::RETABSPPCi:
10914 case AArch64::RETABSPPCr:
10915 case AArch64::EMITBKEY:
10916 case AArch64::PAUTH_PROLOGUE:
10917 case AArch64::PAUTH_EPILOGUE:
10918 return outliner::InstrType::Illegal;
10919 }
10920
10921 // We can only outline these if we will tail call the outlined function, or
10922 // fix up the CFI offsets. Currently, CFI instructions are outlined only if
10923 // in a tail call.
10924 //
10925 // FIXME: If the proper fixups for the offset are implemented, this should be
10926 // possible.
10927 if (MI.isCFIInstruction())
10928 return outliner::InstrType::Legal;
10929
10930 // Is this a terminator for a basic block?
10931 if (MI.isTerminator())
10932 // TargetInstrInfo::getOutliningType has already filtered out anything
10933 // that would break this, so we can allow it here.
10934 return outliner::InstrType::Legal;
10935
10936 // Make sure none of the operands are un-outlinable.
10937 for (const MachineOperand &MOP : MI.operands()) {
10938 // A check preventing CFI indices was here before, but only CFI
10939 // instructions should have those.
10940 assert(!MOP.isCFIIndex());
10941
10942 // If it uses LR or W30 explicitly, then don't touch it.
10943 if (MOP.isReg() && !MOP.isImplicit() &&
10944 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
10945 return outliner::InstrType::Illegal;
10946 }
10947
10948 // Special cases for instructions that can always be outlined, but will fail
10949 // the later tests. e.g, ADRPs, which are PC-relative use LR, but can always
10950 // be outlined because they don't require a *specific* value to be in LR.
10951 if (MI.getOpcode() == AArch64::ADRP)
10952 return outliner::InstrType::Legal;
10953
10954 // If MI is a call we might be able to outline it. We don't want to outline
10955 // any calls that rely on the position of items on the stack. When we outline
10956 // something containing a call, we have to emit a save and restore of LR in
10957 // the outlined function. Currently, this always happens by saving LR to the
10958 // stack. Thus, if we outline, say, half the parameters for a function call
10959 // plus the call, then we'll break the callee's expectations for the layout
10960 // of the stack.
10961 //
10962 // FIXME: Allow calls to functions which construct a stack frame, as long
10963 // as they don't access arguments on the stack.
10964 // FIXME: Figure out some way to analyze functions defined in other modules.
10965 // We should be able to compute the memory usage based on the IR calling
10966 // convention, even if we can't see the definition.
10967 if (MI.isCall()) {
10968 // Get the function associated with the call. Look at each operand and find
10969 // the one that represents the callee and get its name.
10970 const Function *Callee = nullptr;
10971 for (const MachineOperand &MOP : MI.operands()) {
10972 if (MOP.isGlobal()) {
10973 Callee = dyn_cast<Function>(Val: MOP.getGlobal());
10974 break;
10975 }
10976 }
10977
10978 // Never outline calls to mcount. There isn't any rule that would require
10979 // this, but the Linux kernel's "ftrace" feature depends on it.
10980 if (Callee && Callee->getName() == "\01_mcount")
10981 return outliner::InstrType::Illegal;
10982
10983 // If we don't know anything about the callee, assume it depends on the
10984 // stack layout of the caller. In that case, it's only legal to outline
10985 // as a tail-call. Explicitly list the call instructions we know about so we
10986 // don't get unexpected results with call pseudo-instructions.
10987 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
10988 if (MI.getOpcode() == AArch64::BLR ||
10989 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
10990 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
10991
10992 if (!Callee)
10993 return UnknownCallOutlineType;
10994
10995 // We have a function we have information about. Check it if it's something
10996 // can safely outline.
10997 MachineFunction *CalleeMF = MMI.getMachineFunction(F: *Callee);
10998
10999 // We don't know what's going on with the callee at all. Don't touch it.
11000 if (!CalleeMF)
11001 return UnknownCallOutlineType;
11002
11003 // Check if we know anything about the callee saves on the function. If we
11004 // don't, then don't touch it, since that implies that we haven't
11005 // computed anything about its stack frame yet.
11006 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
11007 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
11008 MFI.getNumObjects() > 0)
11009 return UnknownCallOutlineType;
11010
11011 // At this point, we can say that CalleeMF ought to not pass anything on the
11012 // stack. Therefore, we can outline it.
11013 return outliner::InstrType::Legal;
11014 }
11015
11016 // Don't touch the link register or W30.
11017 if (MI.readsRegister(Reg: AArch64::W30, TRI: &getRegisterInfo()) ||
11018 MI.modifiesRegister(Reg: AArch64::W30, TRI: &getRegisterInfo()))
11019 return outliner::InstrType::Illegal;
11020
11021 // Don't outline BTI instructions, because that will prevent the outlining
11022 // site from being indirectly callable.
11023 if (hasBTISemantics(MI))
11024 return outliner::InstrType::Illegal;
11025
11026 return outliner::InstrType::Legal;
11027}
11028
11029void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
11030 for (MachineInstr &MI : MBB) {
11031 const MachineOperand *Base;
11032 TypeSize Width(0, false);
11033 int64_t Offset;
11034 bool OffsetIsScalable;
11035
11036 // Is this a load or store with an immediate offset with SP as the base?
11037 if (!MI.mayLoadOrStore() ||
11038 !getMemOperandWithOffsetWidth(LdSt: MI, BaseOp&: Base, Offset, OffsetIsScalable, Width,
11039 TRI: &RI) ||
11040 (Base->isReg() && Base->getReg() != AArch64::SP))
11041 continue;
11042
11043 // It is, so we have to fix it up.
11044 TypeSize Scale(0U, false);
11045 int64_t Dummy1, Dummy2;
11046
11047 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(LdSt&: MI);
11048 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
11049 getMemOpInfo(Opcode: MI.getOpcode(), Scale, Width, MinOffset&: Dummy1, MaxOffset&: Dummy2);
11050 assert(Scale != 0 && "Unexpected opcode!");
11051 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
11052
11053 // We've pushed the return address to the stack, so add 16 to the offset.
11054 // This is safe, since we already checked if it would overflow when we
11055 // checked if this instruction was legal to outline.
11056 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
11057 StackOffsetOperand.setImm(NewImm);
11058 }
11059}
11060
11061static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
11062 const AArch64InstrInfo *TII,
11063 bool ShouldSignReturnAddr) {
11064 if (!ShouldSignReturnAddr)
11065 return;
11066
11067 BuildMI(BB&: MBB, I: MBB.begin(), MIMD: DebugLoc(), MCID: TII->get(Opcode: AArch64::PAUTH_PROLOGUE))
11068 .setMIFlag(MachineInstr::FrameSetup);
11069 TII->createPauthEpilogueInstr(MBB, DL: DebugLoc());
11070}
11071
11072void AArch64InstrInfo::buildOutlinedFrame(
11073 MachineBasicBlock &MBB, MachineFunction &MF,
11074 const outliner::OutlinedFunction &OF) const {
11075
11076 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
11077
11078 if (OF.FrameConstructionID == MachineOutlinerTailCall)
11079 FI->setOutliningStyle("Tail Call");
11080 else if (OF.FrameConstructionID == MachineOutlinerThunk) {
11081 // For thunk outlining, rewrite the last instruction from a call to a
11082 // tail-call.
11083 MachineInstr *Call = &*--MBB.instr_end();
11084 unsigned TailOpcode;
11085 if (Call->getOpcode() == AArch64::BL) {
11086 TailOpcode = AArch64::TCRETURNdi;
11087 } else {
11088 assert(Call->getOpcode() == AArch64::BLR ||
11089 Call->getOpcode() == AArch64::BLRNoIP);
11090 TailOpcode = AArch64::TCRETURNriALL;
11091 }
11092 MachineInstr *TC = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: TailOpcode))
11093 .add(MO: Call->getOperand(i: 0))
11094 .addImm(Val: 0);
11095 MBB.insert(I: MBB.end(), MI: TC);
11096 Call->eraseFromParent();
11097
11098 FI->setOutliningStyle("Thunk");
11099 }
11100
11101 bool IsLeafFunction = true;
11102
11103 // Is there a call in the outlined range?
11104 auto IsNonTailCall = [](const MachineInstr &MI) {
11105 return MI.isCall() && !MI.isReturn();
11106 };
11107
11108 if (llvm::any_of(Range: MBB.instrs(), P: IsNonTailCall)) {
11109 // Fix up the instructions in the range, since we're going to modify the
11110 // stack.
11111
11112 // Bugzilla ID: 46767
11113 // TODO: Check if fixing up twice is safe so we can outline these.
11114 assert(OF.FrameConstructionID != MachineOutlinerDefault &&
11115 "Can only fix up stack references once");
11116 fixupPostOutline(MBB);
11117
11118 IsLeafFunction = false;
11119
11120 // LR has to be a live in so that we can save it.
11121 if (!MBB.isLiveIn(Reg: AArch64::LR))
11122 MBB.addLiveIn(PhysReg: AArch64::LR);
11123
11124 MachineBasicBlock::iterator It = MBB.begin();
11125 MachineBasicBlock::iterator Et = MBB.end();
11126
11127 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
11128 OF.FrameConstructionID == MachineOutlinerThunk)
11129 Et = std::prev(x: MBB.end());
11130
11131 // Insert a save before the outlined region
11132 MachineInstr *STRXpre = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::STRXpre))
11133 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
11134 .addReg(RegNo: AArch64::LR)
11135 .addReg(RegNo: AArch64::SP)
11136 .addImm(Val: -16);
11137 It = MBB.insert(I: It, MI: STRXpre);
11138
11139 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
11140 CFIInstBuilder CFIBuilder(MBB, It, MachineInstr::FrameSetup);
11141
11142 // Add a CFI saying the stack was moved 16 B down.
11143 CFIBuilder.buildDefCFAOffset(Offset: 16);
11144
11145 // Add a CFI saying that the LR that we want to find is now 16 B higher
11146 // than before.
11147 CFIBuilder.buildOffset(Reg: AArch64::LR, Offset: -16);
11148 }
11149
11150 // Insert a restore before the terminator for the function.
11151 MachineInstr *LDRXpost = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::LDRXpost))
11152 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
11153 .addReg(RegNo: AArch64::LR, Flags: RegState::Define)
11154 .addReg(RegNo: AArch64::SP)
11155 .addImm(Val: 16);
11156 Et = MBB.insert(I: Et, MI: LDRXpost);
11157 }
11158
11159 auto RASignCondition = FI->getSignReturnAddressCondition();
11160 bool ShouldSignReturnAddr = AArch64FunctionInfo::shouldSignReturnAddress(
11161 Condition: RASignCondition, IsLRSpilled: !IsLeafFunction);
11162
11163 // If this is a tail call outlined function, then there's already a return.
11164 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
11165 OF.FrameConstructionID == MachineOutlinerThunk) {
11166 signOutlinedFunction(MF, MBB, TII: this, ShouldSignReturnAddr);
11167 return;
11168 }
11169
11170 // It's not a tail call, so we have to insert the return ourselves.
11171
11172 // LR has to be a live in so that we can return to it.
11173 if (!MBB.isLiveIn(Reg: AArch64::LR))
11174 MBB.addLiveIn(PhysReg: AArch64::LR);
11175
11176 MachineInstr *ret = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::RET))
11177 .addReg(RegNo: AArch64::LR);
11178 MBB.insert(I: MBB.end(), MI: ret);
11179
11180 signOutlinedFunction(MF, MBB, TII: this, ShouldSignReturnAddr);
11181
11182 FI->setOutliningStyle("Function");
11183
11184 // Did we have to modify the stack by saving the link register?
11185 if (OF.FrameConstructionID != MachineOutlinerDefault)
11186 return;
11187
11188 // We modified the stack.
11189 // Walk over the basic block and fix up all the stack accesses.
11190 fixupPostOutline(MBB);
11191}
11192
11193MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
11194 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
11195 MachineFunction &MF, outliner::Candidate &C) const {
11196
11197 // Are we tail calling?
11198 if (C.CallConstructionID == MachineOutlinerTailCall) {
11199 // If yes, then we can just branch to the label.
11200 It = MBB.insert(I: It, MI: BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::TCRETURNdi))
11201 .addGlobalAddress(GV: M.getNamedValue(Name: MF.getName()))
11202 .addImm(Val: 0));
11203 return It;
11204 }
11205
11206 // Are we saving the link register?
11207 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
11208 C.CallConstructionID == MachineOutlinerThunk) {
11209 // No, so just insert the call.
11210 It = MBB.insert(I: It, MI: BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::BL))
11211 .addGlobalAddress(GV: M.getNamedValue(Name: MF.getName())));
11212 return It;
11213 }
11214
11215 // We want to return the spot where we inserted the call.
11216 MachineBasicBlock::iterator CallPt;
11217
11218 // Instructions for saving and restoring LR around the call instruction we're
11219 // going to insert.
11220 MachineInstr *Save;
11221 MachineInstr *Restore;
11222 // Can we save to a register?
11223 if (C.CallConstructionID == MachineOutlinerRegSave) {
11224 // FIXME: This logic should be sunk into a target-specific interface so that
11225 // we don't have to recompute the register.
11226 Register Reg = findRegisterToSaveLRTo(C);
11227 assert(Reg && "No callee-saved register available?");
11228
11229 // LR has to be a live in so that we can save it.
11230 if (!MBB.isLiveIn(Reg: AArch64::LR))
11231 MBB.addLiveIn(PhysReg: AArch64::LR);
11232
11233 // Save and restore LR from Reg.
11234 Save = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::ORRXrs), DestReg: Reg)
11235 .addReg(RegNo: AArch64::XZR)
11236 .addReg(RegNo: AArch64::LR)
11237 .addImm(Val: 0);
11238 Restore = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::ORRXrs), DestReg: AArch64::LR)
11239 .addReg(RegNo: AArch64::XZR)
11240 .addReg(RegNo: Reg)
11241 .addImm(Val: 0);
11242 } else {
11243 // We have the default case. Save and restore from SP.
11244 Save = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::STRXpre))
11245 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
11246 .addReg(RegNo: AArch64::LR)
11247 .addReg(RegNo: AArch64::SP)
11248 .addImm(Val: -16);
11249 Restore = BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::LDRXpost))
11250 .addReg(RegNo: AArch64::SP, Flags: RegState::Define)
11251 .addReg(RegNo: AArch64::LR, Flags: RegState::Define)
11252 .addReg(RegNo: AArch64::SP)
11253 .addImm(Val: 16);
11254 }
11255
11256 It = MBB.insert(I: It, MI: Save);
11257 It++;
11258
11259 // Insert the call.
11260 It = MBB.insert(I: It, MI: BuildMI(MF, MIMD: DebugLoc(), MCID: get(Opcode: AArch64::BL))
11261 .addGlobalAddress(GV: M.getNamedValue(Name: MF.getName())));
11262 CallPt = It;
11263 It++;
11264
11265 It = MBB.insert(I: It, MI: Restore);
11266 return CallPt;
11267}
11268
11269bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
11270 MachineFunction &MF) const {
11271 return MF.getFunction().hasMinSize();
11272}
11273
11274void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
11275 MachineBasicBlock::iterator Iter,
11276 DebugLoc &DL,
11277 bool AllowSideEffects) const {
11278 const MachineFunction &MF = *MBB.getParent();
11279 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
11280 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
11281
11282 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
11283 BuildMI(BB&: MBB, I: Iter, MIMD: DL, MCID: get(Opcode: AArch64::MOVZXi), DestReg: Reg).addImm(Val: 0).addImm(Val: 0);
11284 } else if (STI.isSVEorStreamingSVEAvailable()) {
11285 BuildMI(BB&: MBB, I: Iter, MIMD: DL, MCID: get(Opcode: AArch64::DUP_ZI_D), DestReg: Reg)
11286 .addImm(Val: 0)
11287 .addImm(Val: 0);
11288 } else if (STI.isNeonAvailable()) {
11289 BuildMI(BB&: MBB, I: Iter, MIMD: DL, MCID: get(Opcode: AArch64::MOVIv2d_ns), DestReg: Reg)
11290 .addImm(Val: 0);
11291 } else {
11292 // This is a streaming-compatible function without SVE. We don't have full
11293 // Neon (just FPRs), so we can at most use the first 64-bit sub-register.
11294 // So given `movi v..` would be illegal use `fmov d..` instead.
11295 assert(STI.hasNEON() && "Expected to have NEON.");
11296 Register Reg64 = TRI.getSubReg(Reg, Idx: AArch64::dsub);
11297 BuildMI(BB&: MBB, I: Iter, MIMD: DL, MCID: get(Opcode: AArch64::FMOVD0), DestReg: Reg64);
11298 }
11299}
11300
11301std::optional<DestSourcePair>
11302AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
11303
11304 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg
11305 // and zero immediate operands used as an alias for mov instruction.
11306 if (((MI.getOpcode() == AArch64::ORRWrs &&
11307 MI.getOperand(i: 1).getReg() == AArch64::WZR &&
11308 MI.getOperand(i: 3).getImm() == 0x0) ||
11309 (MI.getOpcode() == AArch64::ORRWrr &&
11310 MI.getOperand(i: 1).getReg() == AArch64::WZR)) &&
11311 // Check that the w->w move is not a zero-extending w->x mov.
11312 (!MI.getOperand(i: 0).getReg().isVirtual() ||
11313 MI.getOperand(i: 0).getSubReg() == 0) &&
11314 (!MI.getOperand(i: 0).getReg().isPhysical() ||
11315 MI.findRegisterDefOperandIdx(Reg: getXRegFromWReg(Reg: MI.getOperand(i: 0).getReg()),
11316 /*TRI=*/nullptr) == -1))
11317 return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 2)};
11318
11319 if (MI.getOpcode() == AArch64::ORRXrs &&
11320 MI.getOperand(i: 1).getReg() == AArch64::XZR &&
11321 MI.getOperand(i: 3).getImm() == 0x0)
11322 return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 2)};
11323
11324 return std::nullopt;
11325}
11326
11327std::optional<DestSourcePair>
11328AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {
11329 if ((MI.getOpcode() == AArch64::ORRWrs &&
11330 MI.getOperand(i: 1).getReg() == AArch64::WZR &&
11331 MI.getOperand(i: 3).getImm() == 0x0) ||
11332 (MI.getOpcode() == AArch64::ORRWrr &&
11333 MI.getOperand(i: 1).getReg() == AArch64::WZR))
11334 return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 2)};
11335 return std::nullopt;
11336}
11337
11338std::optional<RegImmPair>
11339AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
11340 int Sign = 1;
11341 int64_t Offset = 0;
11342
11343 // TODO: Handle cases where Reg is a super- or sub-register of the
11344 // destination register.
11345 const MachineOperand &Op0 = MI.getOperand(i: 0);
11346 if (!Op0.isReg() || Reg != Op0.getReg())
11347 return std::nullopt;
11348
11349 switch (MI.getOpcode()) {
11350 default:
11351 return std::nullopt;
11352 case AArch64::SUBWri:
11353 case AArch64::SUBXri:
11354 case AArch64::SUBSWri:
11355 case AArch64::SUBSXri:
11356 Sign *= -1;
11357 [[fallthrough]];
11358 case AArch64::ADDSWri:
11359 case AArch64::ADDSXri:
11360 case AArch64::ADDWri:
11361 case AArch64::ADDXri: {
11362 // TODO: Third operand can be global address (usually some string).
11363 if (!MI.getOperand(i: 0).isReg() || !MI.getOperand(i: 1).isReg() ||
11364 !MI.getOperand(i: 2).isImm())
11365 return std::nullopt;
11366 int Shift = MI.getOperand(i: 3).getImm();
11367 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
11368 Offset = Sign * (MI.getOperand(i: 2).getImm() << Shift);
11369 }
11370 }
11371 return RegImmPair{MI.getOperand(i: 1).getReg(), Offset};
11372}
11373
11374/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
11375/// the destination register then, if possible, describe the value in terms of
11376/// the source register.
11377static std::optional<ParamLoadedValue>
11378describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
11379 const TargetInstrInfo *TII,
11380 const TargetRegisterInfo *TRI) {
11381 auto DestSrc = TII->isCopyLikeInstr(MI);
11382 if (!DestSrc)
11383 return std::nullopt;
11384
11385 Register DestReg = DestSrc->Destination->getReg();
11386 Register SrcReg = DestSrc->Source->getReg();
11387
11388 if (!DestReg.isValid() || !SrcReg.isValid())
11389 return std::nullopt;
11390
11391 auto Expr = DIExpression::get(Context&: MI.getMF()->getFunction().getContext(), Elements: {});
11392
11393 // If the described register is the destination, just return the source.
11394 if (DestReg == DescribedReg)
11395 return ParamLoadedValue(MachineOperand::CreateReg(Reg: SrcReg, isDef: false), Expr);
11396
11397 // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
11398 if (MI.getOpcode() == AArch64::ORRWrs &&
11399 TRI->isSuperRegister(RegA: DestReg, RegB: DescribedReg))
11400 return ParamLoadedValue(MachineOperand::CreateReg(Reg: SrcReg, isDef: false), Expr);
11401
11402 // We may need to describe the lower part of a ORRXrs move.
11403 if (MI.getOpcode() == AArch64::ORRXrs &&
11404 TRI->isSubRegister(RegA: DestReg, RegB: DescribedReg)) {
11405 Register SrcSubReg = TRI->getSubReg(Reg: SrcReg, Idx: AArch64::sub_32);
11406 return ParamLoadedValue(MachineOperand::CreateReg(Reg: SrcSubReg, isDef: false), Expr);
11407 }
11408
11409 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
11410 "Unhandled ORR[XW]rs copy case");
11411
11412 return std::nullopt;
11413}
11414
11415bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
11416 // Functions cannot be split to different sections on AArch64 if they have
11417 // a red zone. This is because relaxing a cross-section branch may require
11418 // incrementing the stack pointer to spill a register, which would overwrite
11419 // the red zone.
11420 if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(u: true))
11421 return false;
11422
11423 return TargetInstrInfo::isFunctionSafeToSplit(MF);
11424}
11425
11426bool AArch64InstrInfo::isMBBSafeToSplitToCold(
11427 const MachineBasicBlock &MBB) const {
11428 // Asm Goto blocks can contain conditional branches to goto labels, which can
11429 // get moved out of range of the branch instruction.
11430 auto isAsmGoto = [](const MachineInstr &MI) {
11431 return MI.getOpcode() == AArch64::INLINEASM_BR;
11432 };
11433 if (llvm::any_of(Range: MBB, P: isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
11434 return false;
11435
11436 // Because jump tables are label-relative instead of table-relative, they all
11437 // must be in the same section or relocation fixup handling will fail.
11438
11439 // Check if MBB is a jump table target
11440 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
11441 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
11442 return llvm::is_contained(Range: JTE.MBBs, Element: &MBB);
11443 };
11444 if (MJTI != nullptr && llvm::any_of(Range: MJTI->getJumpTables(), P: containsMBB))
11445 return false;
11446
11447 // Check if MBB contains a jump table lookup
11448 for (const MachineInstr &MI : MBB) {
11449 switch (MI.getOpcode()) {
11450 case TargetOpcode::G_BRJT:
11451 case AArch64::JumpTableDest32:
11452 case AArch64::JumpTableDest16:
11453 case AArch64::JumpTableDest8:
11454 return false;
11455 default:
11456 continue;
11457 }
11458 }
11459
11460 // MBB isn't a special case, so it's safe to be split to the cold section.
11461 return true;
11462}
11463
11464std::optional<ParamLoadedValue>
11465AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
11466 Register Reg) const {
11467 const MachineFunction *MF = MI.getMF();
11468 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
11469 switch (MI.getOpcode()) {
11470 case AArch64::MOVZWi:
11471 case AArch64::MOVZXi: {
11472 // MOVZWi may be used for producing zero-extended 32-bit immediates in
11473 // 64-bit parameters, so we need to consider super-registers.
11474 if (!TRI->isSuperRegisterEq(RegA: MI.getOperand(i: 0).getReg(), RegB: Reg))
11475 return std::nullopt;
11476
11477 if (!MI.getOperand(i: 1).isImm())
11478 return std::nullopt;
11479 int64_t Immediate = MI.getOperand(i: 1).getImm();
11480 int Shift = MI.getOperand(i: 2).getImm();
11481 return ParamLoadedValue(MachineOperand::CreateImm(Val: Immediate << Shift),
11482 nullptr);
11483 }
11484 case AArch64::ORRWrs:
11485 case AArch64::ORRXrs:
11486 return describeORRLoadedValue(MI, DescribedReg: Reg, TII: this, TRI);
11487 }
11488
11489 return TargetInstrInfo::describeLoadedValue(MI, Reg);
11490}
11491
11492bool AArch64InstrInfo::isExtendLikelyToBeFolded(
11493 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
11494 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
11495 ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
11496 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
11497
11498 // Anyexts are nops.
11499 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
11500 return true;
11501
11502 Register DefReg = ExtMI.getOperand(i: 0).getReg();
11503 if (!MRI.hasOneNonDBGUse(RegNo: DefReg))
11504 return false;
11505
11506 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
11507 // addressing mode.
11508 auto *UserMI = &*MRI.use_instr_nodbg_begin(RegNo: DefReg);
11509 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
11510}
11511
11512uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
11513 return get(Opcode: Opc).TSFlags & AArch64::ElementSizeMask;
11514}
11515
11516bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
11517 return get(Opcode: Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
11518}
11519
11520bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
11521 return get(Opcode: Opc).TSFlags & AArch64::InstrFlagIsWhile;
11522}
11523
11524unsigned int
11525AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
11526 return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
11527}
11528
11529bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
11530 unsigned Scale) const {
11531 if (Offset && Scale)
11532 return false;
11533
11534 // Check Reg + Imm
11535 if (!Scale) {
11536 // 9-bit signed offset
11537 if (isInt<9>(x: Offset))
11538 return true;
11539
11540 // 12-bit unsigned offset
11541 unsigned Shift = Log2_64(Value: NumBytes);
11542 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
11543 // Must be a multiple of NumBytes (NumBytes is a power of 2)
11544 (Offset >> Shift) << Shift == Offset)
11545 return true;
11546 return false;
11547 }
11548
11549 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
11550 return Scale == 1 || (Scale > 0 && Scale == NumBytes);
11551}
11552
11553unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
11554 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
11555 return AArch64::BLRNoIP;
11556 else
11557 return AArch64::BLR;
11558}
11559
11560void AArch64InstrInfo::createPauthEpilogueInstr(MachineBasicBlock &MBB,
11561 DebugLoc DL) const {
11562 MachineBasicBlock::iterator InsertPt = MBB.getFirstTerminator();
11563 auto Builder = BuildMI(BB&: MBB, I: InsertPt, MIMD: DL, MCID: get(Opcode: AArch64::PAUTH_EPILOGUE))
11564 .setMIFlag(MachineInstr::FrameDestroy);
11565
11566 MachineFunction &MF = *MBB.getParent();
11567 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
11568 auto &AFL = *static_cast<const AArch64FrameLowering *>(
11569 MF.getSubtarget().getFrameLowering());
11570 if (AFL.getArgumentStackToRestore(MF, MBB)) {
11571 Builder.addReg(RegNo: AArch64::X17, Flags: RegState::ImplicitDefine);
11572 Builder.addReg(RegNo: AArch64::X16, Flags: RegState::ImplicitDefine);
11573 if (Subtarget.hasPAuthLR())
11574 Builder.addReg(RegNo: AArch64::X15, Flags: RegState::ImplicitDefine);
11575 return;
11576 }
11577
11578 if (AFI->branchProtectionPAuthLR() && !Subtarget.hasPAuthLR())
11579 Builder.addReg(RegNo: AArch64::X16, Flags: RegState::ImplicitDefine);
11580}
11581
11582MachineBasicBlock::iterator
11583AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
11584 Register TargetReg, bool FrameSetup) const {
11585 assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
11586
11587 MachineBasicBlock &MBB = *MBBI->getParent();
11588 MachineFunction &MF = *MBB.getParent();
11589 const AArch64InstrInfo *TII =
11590 MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
11591 int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
11592 DebugLoc DL = MBB.findDebugLoc(MBBI);
11593
11594 MachineFunction::iterator MBBInsertPoint = std::next(x: MBB.getIterator());
11595 MachineBasicBlock *LoopTestMBB =
11596 MF.CreateMachineBasicBlock(BB: MBB.getBasicBlock());
11597 MF.insert(MBBI: MBBInsertPoint, MBB: LoopTestMBB);
11598 MachineBasicBlock *LoopBodyMBB =
11599 MF.CreateMachineBasicBlock(BB: MBB.getBasicBlock());
11600 MF.insert(MBBI: MBBInsertPoint, MBB: LoopBodyMBB);
11601 MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(BB: MBB.getBasicBlock());
11602 MF.insert(MBBI: MBBInsertPoint, MBB: ExitMBB);
11603 MachineInstr::MIFlag Flags =
11604 FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;
11605
11606 // LoopTest:
11607 // SUB SP, SP, #ProbeSize
11608 emitFrameOffset(MBB&: *LoopTestMBB, MBBI: LoopTestMBB->end(), DL, DestReg: AArch64::SP,
11609 SrcReg: AArch64::SP, Offset: StackOffset::getFixed(Fixed: -ProbeSize), TII, Flag: Flags);
11610
11611 // CMP SP, TargetReg
11612 BuildMI(BB&: *LoopTestMBB, I: LoopTestMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::SUBSXrx64),
11613 DestReg: AArch64::XZR)
11614 .addReg(RegNo: AArch64::SP)
11615 .addReg(RegNo: TargetReg)
11616 .addImm(Val: AArch64_AM::getArithExtendImm(ET: AArch64_AM::UXTX, Imm: 0))
11617 .setMIFlags(Flags);
11618
11619 // B.<Cond> LoopExit
11620 BuildMI(BB&: *LoopTestMBB, I: LoopTestMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::Bcc))
11621 .addImm(Val: AArch64CC::LE)
11622 .addMBB(MBB: ExitMBB)
11623 .setMIFlags(Flags);
11624
11625 // LDR XZR, [SP]
11626 BuildMI(BB&: *LoopBodyMBB, I: LoopBodyMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::LDRXui))
11627 .addDef(RegNo: AArch64::XZR)
11628 .addReg(RegNo: AArch64::SP)
11629 .addImm(Val: 0)
11630 .addMemOperand(MMO: MF.getMachineMemOperand(
11631 PtrInfo: MachinePointerInfo::getUnknownStack(MF),
11632 F: MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, Size: 8,
11633 BaseAlignment: Align(8)))
11634 .setMIFlags(Flags);
11635
11636 // B loop
11637 BuildMI(BB&: *LoopBodyMBB, I: LoopBodyMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::B))
11638 .addMBB(MBB: LoopTestMBB)
11639 .setMIFlags(Flags);
11640
11641 // LoopExit:
11642 // MOV SP, TargetReg
11643 BuildMI(BB&: *ExitMBB, I: ExitMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::ADDXri), DestReg: AArch64::SP)
11644 .addReg(RegNo: TargetReg)
11645 .addImm(Val: 0)
11646 .addImm(Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: 0))
11647 .setMIFlags(Flags);
11648
11649 // LDR XZR, [SP]
11650 BuildMI(BB&: *ExitMBB, I: ExitMBB->end(), MIMD: DL, MCID: TII->get(Opcode: AArch64::LDRXui))
11651 .addReg(RegNo: AArch64::XZR, Flags: RegState::Define)
11652 .addReg(RegNo: AArch64::SP)
11653 .addImm(Val: 0)
11654 .setMIFlags(Flags);
11655
11656 ExitMBB->splice(Where: ExitMBB->end(), Other: &MBB, From: std::next(x: MBBI), To: MBB.end());
11657 ExitMBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
11658
11659 LoopTestMBB->addSuccessor(Succ: ExitMBB);
11660 LoopTestMBB->addSuccessor(Succ: LoopBodyMBB);
11661 LoopBodyMBB->addSuccessor(Succ: LoopTestMBB);
11662 MBB.addSuccessor(Succ: LoopTestMBB);
11663
11664 // Update liveins.
11665 if (MF.getRegInfo().reservedRegsFrozen())
11666 fullyRecomputeLiveIns(MBBs: {ExitMBB, LoopBodyMBB, LoopTestMBB});
11667
11668 return ExitMBB->begin();
11669}
11670
11671namespace {
11672class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
11673 MachineFunction *MF;
11674 const TargetInstrInfo *TII;
11675 const TargetRegisterInfo *TRI;
11676 MachineRegisterInfo &MRI;
11677
11678 /// The block of the loop
11679 MachineBasicBlock *LoopBB;
11680 /// The conditional branch of the loop
11681 MachineInstr *CondBranch;
11682 /// The compare instruction for loop control
11683 MachineInstr *Comp;
11684 /// The number of the operand of the loop counter value in Comp
11685 unsigned CompCounterOprNum;
11686 /// The instruction that updates the loop counter value
11687 MachineInstr *Update;
11688 /// The number of the operand of the loop counter value in Update
11689 unsigned UpdateCounterOprNum;
11690 /// The initial value of the loop counter
11691 Register Init;
11692 /// True iff Update is a predecessor of Comp
11693 bool IsUpdatePriorComp;
11694
11695 /// The normalized condition used by createTripCountGreaterCondition()
11696 SmallVector<MachineOperand, 4> Cond;
11697
11698public:
11699 AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
11700 MachineInstr *Comp, unsigned CompCounterOprNum,
11701 MachineInstr *Update, unsigned UpdateCounterOprNum,
11702 Register Init, bool IsUpdatePriorComp,
11703 const SmallVectorImpl<MachineOperand> &Cond)
11704 : MF(Comp->getParent()->getParent()),
11705 TII(MF->getSubtarget().getInstrInfo()),
11706 TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
11707 LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
11708 CompCounterOprNum(CompCounterOprNum), Update(Update),
11709 UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
11710 IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
11711
11712 bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
11713 // Make the instructions for loop control be placed in stage 0.
11714 // The predecessors of Comp are considered by the caller.
11715 return MI == Comp;
11716 }
11717
11718 std::optional<bool> createTripCountGreaterCondition(
11719 int TC, MachineBasicBlock &MBB,
11720 SmallVectorImpl<MachineOperand> &CondParam) override {
11721 // A branch instruction will be inserted as "if (Cond) goto epilogue".
11722 // Cond is normalized for such use.
11723 // The predecessors of the branch are assumed to have already been inserted.
11724 CondParam = Cond;
11725 return {};
11726 }
11727
11728 void createRemainingIterationsGreaterCondition(
11729 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
11730 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
11731
11732 void setPreheader(MachineBasicBlock *NewPreheader) override {}
11733
11734 void adjustTripCount(int TripCountAdjust) override {}
11735
11736 bool isMVEExpanderSupported() override { return true; }
11737};
11738} // namespace
11739
11740/// Clone an instruction from MI. The register of ReplaceOprNum-th operand
11741/// is replaced by ReplaceReg. The output register is newly created.
11742/// The other operands are unchanged from MI.
11743static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
11744 Register ReplaceReg, MachineBasicBlock &MBB,
11745 MachineBasicBlock::iterator InsertTo) {
11746 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
11747 const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
11748 MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(Orig: MI);
11749 Register Result = 0;
11750 for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
11751 if (I == 0 && NewMI->getOperand(i: 0).getReg().isVirtual()) {
11752 Result = MRI.createVirtualRegister(
11753 RegClass: MRI.getRegClass(Reg: NewMI->getOperand(i: 0).getReg()));
11754 NewMI->getOperand(i: I).setReg(Result);
11755 } else if (I == ReplaceOprNum) {
11756 MRI.constrainRegClass(Reg: ReplaceReg, RC: TII->getRegClass(MCID: NewMI->getDesc(), OpNum: I));
11757 NewMI->getOperand(i: I).setReg(ReplaceReg);
11758 }
11759 }
11760 MBB.insert(I: InsertTo, MI: NewMI);
11761 return Result;
11762}
11763
11764void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
11765 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
11766 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {
11767 // Create and accumulate conditions for next TC iterations.
11768 // Example:
11769 // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
11770 // # iteration of the kernel
11771 //
11772 // # insert the following instructions
11773 // cond = CSINCXr 0, 0, C, implicit $nzcv
11774 // counter = ADDXri counter, 1 # clone from this->Update
11775 // SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
11776 // cond = CSINCXr cond, cond, C, implicit $nzcv
11777 // ... (repeat TC times)
11778 // SUBSXri cond, 0, implicit-def $nzcv
11779
11780 assert(CondBranch->getOpcode() == AArch64::Bcc);
11781 // CondCode to exit the loop
11782 AArch64CC::CondCode CC =
11783 (AArch64CC::CondCode)CondBranch->getOperand(i: 0).getImm();
11784 if (CondBranch->getOperand(i: 1).getMBB() == LoopBB)
11785 CC = AArch64CC::getInvertedCondCode(Code: CC);
11786
11787 // Accumulate conditions to exit the loop
11788 Register AccCond = AArch64::XZR;
11789
11790 // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
11791 auto AccumulateCond = [&](Register CurCond,
11792 AArch64CC::CondCode CC) -> Register {
11793 Register NewCond = MRI.createVirtualRegister(RegClass: &AArch64::GPR64commonRegClass);
11794 BuildMI(BB&: MBB, I: MBB.end(), MIMD: Comp->getDebugLoc(), MCID: TII->get(Opcode: AArch64::CSINCXr))
11795 .addReg(RegNo: NewCond, Flags: RegState::Define)
11796 .addReg(RegNo: CurCond)
11797 .addReg(RegNo: CurCond)
11798 .addImm(Val: AArch64CC::getInvertedCondCode(Code: CC));
11799 return NewCond;
11800 };
11801
11802 if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
11803 // Update and Comp for I==0 are already exists in MBB
11804 // (MBB is an unrolled kernel)
11805 Register Counter;
11806 for (int I = 0; I <= TC; ++I) {
11807 Register NextCounter;
11808 if (I != 0)
11809 NextCounter =
11810 cloneInstr(MI: Comp, ReplaceOprNum: CompCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
11811
11812 AccCond = AccumulateCond(AccCond, CC);
11813
11814 if (I != TC) {
11815 if (I == 0) {
11816 if (Update != Comp && IsUpdatePriorComp) {
11817 Counter =
11818 LastStage0Insts[Comp]->getOperand(i: CompCounterOprNum).getReg();
11819 NextCounter = cloneInstr(MI: Update, ReplaceOprNum: UpdateCounterOprNum, ReplaceReg: Counter, MBB,
11820 InsertTo: MBB.end());
11821 } else {
11822 // can use already calculated value
11823 NextCounter = LastStage0Insts[Update]->getOperand(i: 0).getReg();
11824 }
11825 } else if (Update != Comp) {
11826 NextCounter =
11827 cloneInstr(MI: Update, ReplaceOprNum: UpdateCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
11828 }
11829 }
11830 Counter = NextCounter;
11831 }
11832 } else {
11833 Register Counter;
11834 if (LastStage0Insts.empty()) {
11835 // use initial counter value (testing if the trip count is sufficient to
11836 // be executed by pipelined code)
11837 Counter = Init;
11838 if (IsUpdatePriorComp)
11839 Counter =
11840 cloneInstr(MI: Update, ReplaceOprNum: UpdateCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
11841 } else {
11842 // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
11843 Counter = LastStage0Insts[Comp]->getOperand(i: CompCounterOprNum).getReg();
11844 }
11845
11846 for (int I = 0; I <= TC; ++I) {
11847 Register NextCounter;
11848 NextCounter =
11849 cloneInstr(MI: Comp, ReplaceOprNum: CompCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
11850 AccCond = AccumulateCond(AccCond, CC);
11851 if (I != TC && Update != Comp)
11852 NextCounter =
11853 cloneInstr(MI: Update, ReplaceOprNum: UpdateCounterOprNum, ReplaceReg: Counter, MBB, InsertTo: MBB.end());
11854 Counter = NextCounter;
11855 }
11856 }
11857
11858 // If AccCond == 0, the remainder is greater than TC.
11859 BuildMI(BB&: MBB, I: MBB.end(), MIMD: Comp->getDebugLoc(), MCID: TII->get(Opcode: AArch64::SUBSXri))
11860 .addReg(RegNo: AArch64::XZR, Flags: RegState::Define | RegState::Dead)
11861 .addReg(RegNo: AccCond)
11862 .addImm(Val: 0)
11863 .addImm(Val: 0);
11864 Cond.clear();
11865 Cond.push_back(Elt: MachineOperand::CreateImm(Val: AArch64CC::EQ));
11866}
11867
11868static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
11869 Register &RegMBB, Register &RegOther) {
11870 assert(Phi.getNumOperands() == 5);
11871 if (Phi.getOperand(i: 2).getMBB() == MBB) {
11872 RegMBB = Phi.getOperand(i: 1).getReg();
11873 RegOther = Phi.getOperand(i: 3).getReg();
11874 } else {
11875 assert(Phi.getOperand(4).getMBB() == MBB);
11876 RegMBB = Phi.getOperand(i: 3).getReg();
11877 RegOther = Phi.getOperand(i: 1).getReg();
11878 }
11879}
11880
11881static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {
11882 if (!Reg.isVirtual())
11883 return false;
11884 const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11885 return MRI.getVRegDef(Reg)->getParent() != BB;
11886}
11887
11888/// If Reg is an induction variable, return true and set some parameters
11889static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
11890 MachineInstr *&UpdateInst,
11891 unsigned &UpdateCounterOprNum, Register &InitReg,
11892 bool &IsUpdatePriorComp) {
11893 // Example:
11894 //
11895 // Preheader:
11896 // InitReg = ...
11897 // LoopBB:
11898 // Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
11899 // Reg = COPY Reg0 ; COPY is ignored.
11900 // Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
11901 // ; Reg is the value calculated in the previous
11902 // ; iteration, so IsUpdatePriorComp == false.
11903
11904 if (LoopBB->pred_size() != 2)
11905 return false;
11906 if (!Reg.isVirtual())
11907 return false;
11908 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
11909 UpdateInst = nullptr;
11910 UpdateCounterOprNum = 0;
11911 InitReg = 0;
11912 IsUpdatePriorComp = true;
11913 Register CurReg = Reg;
11914 while (true) {
11915 MachineInstr *Def = MRI.getVRegDef(Reg: CurReg);
11916 if (Def->getParent() != LoopBB)
11917 return false;
11918 if (Def->isCopy()) {
11919 // Ignore copy instructions unless they contain subregisters
11920 if (Def->getOperand(i: 0).getSubReg() || Def->getOperand(i: 1).getSubReg())
11921 return false;
11922 CurReg = Def->getOperand(i: 1).getReg();
11923 } else if (Def->isPHI()) {
11924 if (InitReg != 0)
11925 return false;
11926 if (!UpdateInst)
11927 IsUpdatePriorComp = false;
11928 extractPhiReg(Phi: *Def, MBB: LoopBB, RegMBB&: CurReg, RegOther&: InitReg);
11929 } else {
11930 if (UpdateInst)
11931 return false;
11932 switch (Def->getOpcode()) {
11933 case AArch64::ADDSXri:
11934 case AArch64::ADDSWri:
11935 case AArch64::SUBSXri:
11936 case AArch64::SUBSWri:
11937 case AArch64::ADDXri:
11938 case AArch64::ADDWri:
11939 case AArch64::SUBXri:
11940 case AArch64::SUBWri:
11941 UpdateInst = Def;
11942 UpdateCounterOprNum = 1;
11943 break;
11944 case AArch64::ADDSXrr:
11945 case AArch64::ADDSWrr:
11946 case AArch64::SUBSXrr:
11947 case AArch64::SUBSWrr:
11948 case AArch64::ADDXrr:
11949 case AArch64::ADDWrr:
11950 case AArch64::SUBXrr:
11951 case AArch64::SUBWrr:
11952 UpdateInst = Def;
11953 if (isDefinedOutside(Reg: Def->getOperand(i: 2).getReg(), BB: LoopBB))
11954 UpdateCounterOprNum = 1;
11955 else if (isDefinedOutside(Reg: Def->getOperand(i: 1).getReg(), BB: LoopBB))
11956 UpdateCounterOprNum = 2;
11957 else
11958 return false;
11959 break;
11960 default:
11961 return false;
11962 }
11963 CurReg = Def->getOperand(i: UpdateCounterOprNum).getReg();
11964 }
11965
11966 if (!CurReg.isVirtual())
11967 return false;
11968 if (Reg == CurReg)
11969 break;
11970 }
11971
11972 if (!UpdateInst)
11973 return false;
11974
11975 return true;
11976}
11977
11978std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
11979AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
11980 // Accept loops that meet the following conditions
11981 // * The conditional branch is BCC
11982 // * The compare instruction is ADDS/SUBS/WHILEXX
11983 // * One operand of the compare is an induction variable and the other is a
11984 // loop invariant value
11985 // * The induction variable is incremented/decremented by a single instruction
11986 // * Does not contain CALL or instructions which have unmodeled side effects
11987
11988 for (MachineInstr &MI : *LoopBB)
11989 if (MI.isCall() || MI.hasUnmodeledSideEffects())
11990 // This instruction may use NZCV, which interferes with the instruction to
11991 // be inserted for loop control.
11992 return nullptr;
11993
11994 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
11995 SmallVector<MachineOperand, 4> Cond;
11996 if (analyzeBranch(MBB&: *LoopBB, TBB, FBB, Cond))
11997 return nullptr;
11998
11999 // Infinite loops are not supported
12000 if (TBB == LoopBB && FBB == LoopBB)
12001 return nullptr;
12002
12003 // Must be conditional branch
12004 if (TBB != LoopBB && FBB == nullptr)
12005 return nullptr;
12006
12007 assert((TBB == LoopBB || FBB == LoopBB) &&
12008 "The Loop must be a single-basic-block loop");
12009
12010 MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
12011 const TargetRegisterInfo &TRI = getRegisterInfo();
12012
12013 if (CondBranch->getOpcode() != AArch64::Bcc)
12014 return nullptr;
12015
12016 // Normalization for createTripCountGreaterCondition()
12017 if (TBB == LoopBB)
12018 reverseBranchCondition(Cond);
12019
12020 MachineInstr *Comp = nullptr;
12021 unsigned CompCounterOprNum = 0;
12022 for (MachineInstr &MI : reverse(C&: *LoopBB)) {
12023 if (MI.modifiesRegister(Reg: AArch64::NZCV, TRI: &TRI)) {
12024 // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
12025 // operands is a loop invariant value
12026
12027 switch (MI.getOpcode()) {
12028 case AArch64::SUBSXri:
12029 case AArch64::SUBSWri:
12030 case AArch64::ADDSXri:
12031 case AArch64::ADDSWri:
12032 Comp = &MI;
12033 CompCounterOprNum = 1;
12034 break;
12035 case AArch64::ADDSWrr:
12036 case AArch64::ADDSXrr:
12037 case AArch64::SUBSWrr:
12038 case AArch64::SUBSXrr:
12039 Comp = &MI;
12040 break;
12041 default:
12042 if (isWhileOpcode(Opc: MI.getOpcode())) {
12043 Comp = &MI;
12044 break;
12045 }
12046 return nullptr;
12047 }
12048
12049 if (CompCounterOprNum == 0) {
12050 if (isDefinedOutside(Reg: Comp->getOperand(i: 1).getReg(), BB: LoopBB))
12051 CompCounterOprNum = 2;
12052 else if (isDefinedOutside(Reg: Comp->getOperand(i: 2).getReg(), BB: LoopBB))
12053 CompCounterOprNum = 1;
12054 else
12055 return nullptr;
12056 }
12057 break;
12058 }
12059 }
12060 if (!Comp)
12061 return nullptr;
12062
12063 MachineInstr *Update = nullptr;
12064 Register Init;
12065 bool IsUpdatePriorComp;
12066 unsigned UpdateCounterOprNum;
12067 if (!getIndVarInfo(Reg: Comp->getOperand(i: CompCounterOprNum).getReg(), LoopBB,
12068 UpdateInst&: Update, UpdateCounterOprNum, InitReg&: Init, IsUpdatePriorComp))
12069 return nullptr;
12070
12071 return std::make_unique<AArch64PipelinerLoopInfo>(
12072 args&: LoopBB, args&: CondBranch, args&: Comp, args&: CompCounterOprNum, args&: Update, args&: UpdateCounterOprNum,
12073 args&: Init, args&: IsUpdatePriorComp, args&: Cond);
12074}
12075
12076/// verifyInstruction - Perform target specific instruction verification.
12077bool AArch64InstrInfo::verifyInstruction(const MachineInstr &MI,
12078 StringRef &ErrInfo) const {
12079 // Verify that immediate offsets on load/store instructions are within range.
12080 // Stack objects with an FI operand are excluded as they can be fixed up
12081 // during PEI.
12082 TypeSize Scale(0U, false), Width(0U, false);
12083 int64_t MinOffset, MaxOffset;
12084 if (getMemOpInfo(Opcode: MI.getOpcode(), Scale, Width, MinOffset, MaxOffset)) {
12085 unsigned ImmIdx = getLoadStoreImmIdx(Opc: MI.getOpcode());
12086 if (MI.getOperand(i: ImmIdx).isImm() && !MI.getOperand(i: ImmIdx - 1).isFI()) {
12087 int64_t Imm = MI.getOperand(i: ImmIdx).getImm();
12088 if (Imm < MinOffset || Imm > MaxOffset) {
12089 ErrInfo = "Unexpected immediate on load/store instruction";
12090 return false;
12091 }
12092 }
12093 }
12094
12095 const MCInstrDesc &MCID = MI.getDesc();
12096 for (unsigned Op = 0; Op < MCID.getNumOperands(); Op++) {
12097 const MachineOperand &MO = MI.getOperand(i: Op);
12098 switch (MCID.operands()[Op].OperandType) {
12099 case AArch64::OPERAND_IMPLICIT_IMM_0:
12100 if (!MO.isImm() || MO.getImm() != 0) {
12101 ErrInfo = "OPERAND_IMPLICIT_IMM_0 should be 0";
12102 return false;
12103 }
12104 break;
12105 case AArch64::OPERAND_SHIFT_MSL:
12106 if (!MO.isImm() ||
12107 AArch64_AM::getShiftType(Imm: MO.getImm()) != AArch64_AM::MSL ||
12108 (AArch64_AM::getShiftValue(Imm: MO.getImm()) != 8 &&
12109 AArch64_AM::getShiftValue(Imm: MO.getImm()) != 16)) {
12110 ErrInfo = "OPERAND_SHIFT_MSL should be msl shift of 8 or 16";
12111 return false;
12112 }
12113 break;
12114 default:
12115 break;
12116 }
12117 }
12118 return true;
12119}
12120
12121#define GET_INSTRINFO_HELPERS
12122#define GET_INSTRMAP_INFO
12123#include "AArch64GenInstrInfo.inc"
12124