1//===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AArch64.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AArch64GlobalISelUtils.h"
15#include "AArch64InstrInfo.h"
16#include "AArch64MachineFunctionInfo.h"
17#include "AArch64RegisterBankInfo.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
20#include "AArch64TargetMachine.h"
21#include "MCTargetDesc/AArch64AddressingModes.h"
22#include "MCTargetDesc/AArch64MCTargetDesc.h"
23#include "llvm/BinaryFormat/Dwarf.h"
24#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
25#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
26#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
27#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
28#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
29#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
30#include "llvm/CodeGen/GlobalISel/Utils.h"
31#include "llvm/CodeGen/MachineBasicBlock.h"
32#include "llvm/CodeGen/MachineConstantPool.h"
33#include "llvm/CodeGen/MachineFrameInfo.h"
34#include "llvm/CodeGen/MachineFunction.h"
35#include "llvm/CodeGen/MachineInstr.h"
36#include "llvm/CodeGen/MachineInstrBuilder.h"
37#include "llvm/CodeGen/MachineMemOperand.h"
38#include "llvm/CodeGen/MachineOperand.h"
39#include "llvm/CodeGen/MachineRegisterInfo.h"
40#include "llvm/CodeGen/TargetOpcodes.h"
41#include "llvm/CodeGen/TargetRegisterInfo.h"
42#include "llvm/IR/Constants.h"
43#include "llvm/IR/DerivedTypes.h"
44#include "llvm/IR/Instructions.h"
45#include "llvm/IR/IntrinsicsAArch64.h"
46#include "llvm/IR/Type.h"
47#include "llvm/Pass.h"
48#include "llvm/Support/Debug.h"
49#include "llvm/Support/raw_ostream.h"
50#include <optional>
51
52#define DEBUG_TYPE "aarch64-isel"
53
54using namespace llvm;
55using namespace MIPatternMatch;
56using namespace AArch64GISelUtils;
57
// Forward declarations only: setupMF() below takes these types by pointer, so
// their full definitions are not needed at this point.
namespace llvm {
class BlockFrequencyInfo;
class ProfileSummaryInfo;
} // end namespace llvm
62
63namespace {
64
65#define GET_GLOBALISEL_PREDICATE_BITSET
66#include "AArch64GenGlobalISel.inc"
67#undef GET_GLOBALISEL_PREDICATE_BITSET
68
69
70class AArch64InstructionSelector : public InstructionSelector {
71public:
72 AArch64InstructionSelector(const AArch64TargetMachine &TM,
73 const AArch64Subtarget &STI,
74 const AArch64RegisterBankInfo &RBI);
75
76 bool select(MachineInstr &I) override;
77 static const char *getName() { return DEBUG_TYPE; }
78
79 void setupMF(MachineFunction &MF, GISelValueTracking *VT,
80 CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI,
81 BlockFrequencyInfo *BFI) override {
82 InstructionSelector::setupMF(mf&: MF, vt: VT, covinfo: CoverageInfo, psi: PSI, bfi: BFI);
83 MIB.setMF(MF);
84
85 // hasFnAttribute() is expensive to call on every BRCOND selection, so
86 // cache it here for each run of the selector.
87 ProduceNonFlagSettingCondBr =
88 !MF.getFunction().hasFnAttribute(Kind: Attribute::SpeculativeLoadHardening);
89 MFReturnAddr = Register();
90
91 processPHIs(MF);
92 }
93
94private:
95 /// tblgen-erated 'select' implementation, used as the initial selector for
96 /// the patterns that don't require complex C++.
97 bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
98
99 // A lowering phase that runs before any selection attempts.
100 // Returns true if the instruction was modified.
101 bool preISelLower(MachineInstr &I);
102
103 // An early selection function that runs before the selectImpl() call.
104 bool earlySelect(MachineInstr &I);
105
106 /// Save state that is shared between select calls, call select on \p I and
107 /// then restore the saved state. This can be used to recursively call select
108 /// within a select call.
109 bool selectAndRestoreState(MachineInstr &I);
110
111 // Do some preprocessing of G_PHIs before we begin selection.
112 void processPHIs(MachineFunction &MF);
113
114 bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);
115
116 /// Eliminate same-sized cross-bank copies into stores before selectImpl().
117 bool contractCrossBankCopyIntoStore(MachineInstr &I,
118 MachineRegisterInfo &MRI);
119
120 bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);
121
122 bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
123 MachineRegisterInfo &MRI) const;
124 bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
125 MachineRegisterInfo &MRI) const;
126
127 ///@{
128 /// Helper functions for selectCompareBranch.
129 bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
130 MachineIRBuilder &MIB) const;
131 bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
132 MachineIRBuilder &MIB) const;
133 bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
134 MachineIRBuilder &MIB) const;
135 bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
136 MachineBasicBlock *DstMBB,
137 MachineIRBuilder &MIB) const;
138 ///@}
139
140 bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
141 MachineRegisterInfo &MRI);
142
143 bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
144 bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);
145
146 // Helper to generate an equivalent of scalar_to_vector into a new register,
147 // returned via 'Dst'.
148 MachineInstr *emitScalarToVector(unsigned EltSize,
149 const TargetRegisterClass *DstRC,
150 Register Scalar,
151 MachineIRBuilder &MIRBuilder) const;
152 /// Helper to narrow vector that was widened by emitScalarToVector.
153 /// Copy lowest part of 128-bit or 64-bit vector to 64-bit or 32-bit
154 /// vector, correspondingly.
155 MachineInstr *emitNarrowVector(Register DstReg, Register SrcReg,
156 MachineIRBuilder &MIRBuilder,
157 MachineRegisterInfo &MRI) const;
158
159 /// Emit a lane insert into \p DstReg, or a new vector register if
160 /// std::nullopt is provided.
161 ///
162 /// The lane inserted into is defined by \p LaneIdx. The vector source
163 /// register is given by \p SrcReg. The register containing the element is
164 /// given by \p EltReg.
165 MachineInstr *emitLaneInsert(std::optional<Register> DstReg, Register SrcReg,
166 Register EltReg, unsigned LaneIdx,
167 const RegisterBank &RB,
168 MachineIRBuilder &MIRBuilder) const;
169
170 /// Emit a sequence of instructions representing a constant \p CV for a
171 /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
172 ///
173 /// \returns the last instruction in the sequence on success, and nullptr
174 /// otherwise.
175 MachineInstr *emitConstantVector(Register Dst, Constant *CV,
176 MachineIRBuilder &MIRBuilder,
177 MachineRegisterInfo &MRI);
178
179 MachineInstr *tryAdvSIMDModImm8(Register Dst, unsigned DstSize, APInt Bits,
180 MachineIRBuilder &MIRBuilder);
181
182 MachineInstr *tryAdvSIMDModImm16(Register Dst, unsigned DstSize, APInt Bits,
183 MachineIRBuilder &MIRBuilder, bool Inv);
184
185 MachineInstr *tryAdvSIMDModImm32(Register Dst, unsigned DstSize, APInt Bits,
186 MachineIRBuilder &MIRBuilder, bool Inv);
187 MachineInstr *tryAdvSIMDModImm64(Register Dst, unsigned DstSize, APInt Bits,
188 MachineIRBuilder &MIRBuilder);
189 MachineInstr *tryAdvSIMDModImm321s(Register Dst, unsigned DstSize, APInt Bits,
190 MachineIRBuilder &MIRBuilder, bool Inv);
191 MachineInstr *tryAdvSIMDModImmFP(Register Dst, unsigned DstSize, APInt Bits,
192 MachineIRBuilder &MIRBuilder);
193
194 bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
195 MachineRegisterInfo &MRI);
196 /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a
197 /// SUBREG_TO_REG.
198 bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI);
199 bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
200 bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
201 bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
202
203 bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
204 bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
205 bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
206 bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);
207
208 /// Helper function to select vector load intrinsics like
209 /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
210 /// \p Opc is the opcode that the selected instruction should use.
211 /// \p NumVecs is the number of vector destinations for the instruction.
212 /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
213 bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
214 MachineInstr &I);
215 bool selectVectorLoadLaneIntrinsic(unsigned Opc, unsigned NumVecs,
216 MachineInstr &I);
217 void selectVectorStoreIntrinsic(MachineInstr &I, unsigned NumVecs,
218 unsigned Opc);
219 bool selectVectorStoreLaneIntrinsic(MachineInstr &I, unsigned NumVecs,
220 unsigned Opc);
221 bool selectIntrinsicWithSideEffects(MachineInstr &I,
222 MachineRegisterInfo &MRI);
223 bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
224 bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
225 bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
226 bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
227 bool selectPtrAuthGlobalValue(MachineInstr &I,
228 MachineRegisterInfo &MRI) const;
229 bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
230 bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
231 bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
232 void SelectTable(MachineInstr &I, MachineRegisterInfo &MRI, unsigned NumVecs,
233 unsigned Opc1, unsigned Opc2, bool isExt);
234
235 bool selectIndexedExtLoad(MachineInstr &I, MachineRegisterInfo &MRI);
236 bool selectIndexedLoad(MachineInstr &I, MachineRegisterInfo &MRI);
237 bool selectIndexedStore(GIndexedStore &I, MachineRegisterInfo &MRI);
238
239 unsigned emitConstantPoolEntry(const Constant *CPVal,
240 MachineFunction &MF) const;
241 MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
242 MachineIRBuilder &MIRBuilder) const;
243
244 // Emit a vector concat operation.
245 MachineInstr *emitVectorConcat(std::optional<Register> Dst, Register Op1,
246 Register Op2,
247 MachineIRBuilder &MIRBuilder) const;
248
249 // Emit an integer compare between LHS and RHS, which checks for Predicate.
250 MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
251 MachineOperand &Predicate,
252 MachineIRBuilder &MIRBuilder) const;
253
254 /// Emit a floating point comparison between \p LHS and \p RHS.
255 /// \p Pred if given is the intended predicate to use.
256 MachineInstr *
257 emitFPCompare(Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
258 std::optional<CmpInst::Predicate> = std::nullopt) const;
259
260 MachineInstr *
261 emitInstr(unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
262 std::initializer_list<llvm::SrcOp> SrcOps,
263 MachineIRBuilder &MIRBuilder,
264 const ComplexRendererFns &RenderFns = std::nullopt) const;
265 /// Helper function to emit an add or sub instruction.
266 ///
267 /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants above
268 /// in a specific order.
269 ///
270 /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
271 ///
272 /// \code
273 /// const std::array<std::array<unsigned, 2>, 4> Table {
274 /// {{AArch64::ADDXri, AArch64::ADDWri},
275 /// {AArch64::ADDXrs, AArch64::ADDWrs},
276 /// {AArch64::ADDXrr, AArch64::ADDWrr},
277 /// {AArch64::SUBXri, AArch64::SUBWri},
278 /// {AArch64::ADDXrx, AArch64::ADDWrx}}};
279 /// \endcode
280 ///
281 /// Each row in the table corresponds to a different addressing mode. Each
282 /// column corresponds to a different register size.
283 ///
284 /// \attention Rows must be structured as follows:
285 /// - Row 0: The ri opcode variants
286 /// - Row 1: The rs opcode variants
287 /// - Row 2: The rr opcode variants
288 /// - Row 3: The ri opcode variants for negative immediates
289 /// - Row 4: The rx opcode variants
290 ///
291 /// \attention Columns must be structured as follows:
292 /// - Column 0: The 64-bit opcode variants
293 /// - Column 1: The 32-bit opcode variants
294 ///
295 /// \p Dst is the destination register of the binop to emit.
296 /// \p LHS is the left-hand operand of the binop to emit.
297 /// \p RHS is the right-hand operand of the binop to emit.
298 MachineInstr *emitAddSub(
299 const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
300 Register Dst, MachineOperand &LHS, MachineOperand &RHS,
301 MachineIRBuilder &MIRBuilder) const;
302 MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
303 MachineOperand &RHS,
304 MachineIRBuilder &MIRBuilder) const;
305 MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
306 MachineIRBuilder &MIRBuilder) const;
307 MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
308 MachineIRBuilder &MIRBuilder) const;
309 MachineInstr *emitADCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
310 MachineIRBuilder &MIRBuilder) const;
311 MachineInstr *emitSBCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
312 MachineIRBuilder &MIRBuilder) const;
313 MachineInstr *emitCMP(MachineOperand &LHS, MachineOperand &RHS,
314 MachineIRBuilder &MIRBuilder) const;
315 MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
316 MachineIRBuilder &MIRBuilder) const;
317 MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
318 MachineIRBuilder &MIRBuilder) const;
319 MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
320 AArch64CC::CondCode CC,
321 MachineIRBuilder &MIRBuilder) const;
322 MachineInstr *emitExtractVectorElt(std::optional<Register> DstReg,
323 const RegisterBank &DstRB, LLT ScalarTy,
324 Register VecReg, unsigned LaneIdx,
325 MachineIRBuilder &MIRBuilder) const;
326 MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2,
327 AArch64CC::CondCode Pred,
328 MachineIRBuilder &MIRBuilder) const;
329 /// Emit a CSet for a FP compare.
330 ///
331 /// \p Dst is expected to be a 32-bit scalar register.
332 MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
333 MachineIRBuilder &MIRBuilder) const;
334
335 /// Emit an instruction that sets NZCV to the carry-in expected by \p I.
336 /// Might elide the instruction if the previous instruction already sets NZCV
337 /// correctly.
338 MachineInstr *emitCarryIn(MachineInstr &I, Register CarryReg);
339
340 /// Emit the overflow op for \p Opcode.
341 ///
342 /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
343 /// G_USUBO, etc.
344 std::pair<MachineInstr *, AArch64CC::CondCode>
345 emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
346 MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;
347
348 bool selectOverflowOp(MachineInstr &I, MachineRegisterInfo &MRI);
349
350 /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
351 /// In some cases this is even possible with OR operations in the expression.
352 MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC,
353 MachineIRBuilder &MIB) const;
354 MachineInstr *emitConditionalComparison(Register LHS, Register RHS,
355 CmpInst::Predicate CC,
356 AArch64CC::CondCode Predicate,
357 AArch64CC::CondCode OutCC,
358 MachineIRBuilder &MIB) const;
359 MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC,
360 bool Negate, Register CCOp,
361 AArch64CC::CondCode Predicate,
362 MachineIRBuilder &MIB) const;
363
364 /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
365 /// \p IsNegative is true if the test should be "not zero".
366 /// This will also optimize the test bit instruction when possible.
367 MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
368 MachineBasicBlock *DstMBB,
369 MachineIRBuilder &MIB) const;
370
371 /// Emit a CB(N)Z instruction which branches to \p DestMBB.
372 MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
373 MachineBasicBlock *DestMBB,
374 MachineIRBuilder &MIB) const;
375
376 // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
377 // We use these manually instead of using the importer since it doesn't
378 // support SDNodeXForm.
379 ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
380 ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
381 ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
382 ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;
383
384 ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
385 ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
386 ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;
387
388 ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
389 unsigned Size) const;
390
391 ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
392 return selectAddrModeUnscaled(Root, Size: 1);
393 }
394 ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
395 return selectAddrModeUnscaled(Root, Size: 2);
396 }
397 ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
398 return selectAddrModeUnscaled(Root, Size: 4);
399 }
400 ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
401 return selectAddrModeUnscaled(Root, Size: 8);
402 }
403 ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
404 return selectAddrModeUnscaled(Root, Size: 16);
405 }
406
407 /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used
408 /// from complex pattern matchers like selectAddrModeIndexed().
409 ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
410 MachineRegisterInfo &MRI) const;
411
412 ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
413 unsigned Size) const;
414 template <int Width>
415 ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
416 return selectAddrModeIndexed(Root, Size: Width / 8);
417 }
418
419 std::optional<bool>
420 isWorthFoldingIntoAddrMode(const MachineInstr &MI,
421 const MachineRegisterInfo &MRI) const;
422
423 bool isWorthFoldingIntoExtendedReg(const MachineInstr &MI,
424 const MachineRegisterInfo &MRI,
425 bool IsAddrOperand) const;
426 ComplexRendererFns
427 selectAddrModeShiftedExtendXReg(MachineOperand &Root,
428 unsigned SizeInBytes) const;
429
430 /// Returns a \p ComplexRendererFns which contains a base, offset, and whether
431 /// or not a shift + extend should be folded into an addressing mode. Returns
432 /// None when this is not profitable or possible.
433 ComplexRendererFns
434 selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
435 MachineOperand &Offset, unsigned SizeInBytes,
436 bool WantsExt) const;
437 ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
438 ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
439 unsigned SizeInBytes) const;
440 template <int Width>
441 ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
442 return selectAddrModeXRO(Root, SizeInBytes: Width / 8);
443 }
444
445 ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
446 unsigned SizeInBytes) const;
447 template <int Width>
448 ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
449 return selectAddrModeWRO(Root, SizeInBytes: Width / 8);
450 }
451
452 ComplexRendererFns selectShiftedRegister(MachineOperand &Root,
453 bool AllowROR = false) const;
454
455 ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
456 return selectShiftedRegister(Root);
457 }
458
459 ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
460 return selectShiftedRegister(Root, AllowROR: true);
461 }
462
463 /// Given an extend instruction, determine the correct shift-extend type for
464 /// that instruction.
465 ///
466 /// If the instruction is going to be used in a load or store, pass
467 /// \p IsLoadStore = true.
468 AArch64_AM::ShiftExtendType
469 getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
470 bool IsLoadStore = false) const;
471
472 /// Move \p Reg to \p RC if \p Reg is not already on \p RC.
473 ///
474 /// \returns Either \p Reg if no change was necessary, or the new register
475 /// created by moving \p Reg.
476 ///
477 /// Note: This uses emitCopy right now.
478 Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
479 MachineIRBuilder &MIB) const;
480
481 ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
482
483 ComplexRendererFns selectExtractHigh(MachineOperand &Root) const;
484
485 ComplexRendererFns selectCVTFixedPointVec(MachineOperand &Root) const;
486 ComplexRendererFns
487 selectCVTFixedPosRecipOperandVec(MachineOperand &Root) const;
488 ComplexRendererFns
489 selectCVTFixedPointVecBase(const MachineOperand &Root,
490 bool isReciprocal = false) const;
491 void renderFixedPointXForm(MachineInstrBuilder &MIB, const MachineInstr &MI,
492 int OpIdx = -1) const;
493 void renderFixedPointRecipXForm(MachineInstrBuilder &MIB,
494 const MachineInstr &MI, int OpIdx = -1) const;
495
496 void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
497 int OpIdx = -1) const;
498 void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
499 int OpIdx = -1) const;
500 void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
501 int OpIdx = -1) const;
502 void renderUbsanTrap(MachineInstrBuilder &MIB, const MachineInstr &MI,
503 int OpIdx) const;
504 void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
505 int OpIdx = -1) const;
506 void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
507 int OpIdx = -1) const;
508 void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
509 int OpIdx = -1) const;
510 void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB,
511 const MachineInstr &MI,
512 int OpIdx = -1) const;
513
514 // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
515 void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);
516
517 // Optimization methods.
518 bool tryOptSelect(GSelect &Sel);
519 bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI);
520 MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
521 MachineOperand &Predicate,
522 MachineIRBuilder &MIRBuilder) const;
523
524 /// Return true if \p MI is a load or store of \p NumBytes bytes.
525 bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;
526
527 /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
528 /// register zeroed out. In other words, the result of MI has been explicitly
529 /// zero extended.
530 bool isDef32(const MachineInstr &MI) const;
531
532 const AArch64TargetMachine &TM;
533 const AArch64Subtarget &STI;
534 const AArch64InstrInfo &TII;
535 const AArch64RegisterInfo &TRI;
536 const AArch64RegisterBankInfo &RBI;
537
538 bool ProduceNonFlagSettingCondBr = false;
539
540 // Some cached values used during selection.
541 // We use LR as a live-in register, and we keep track of it here as it can be
542 // clobbered by calls.
543 Register MFReturnAddr;
544
545 MachineIRBuilder MIB;
546
547#define GET_GLOBALISEL_PREDICATES_DECL
548#include "AArch64GenGlobalISel.inc"
549#undef GET_GLOBALISEL_PREDICATES_DECL
550
551// We declare the temporaries used by selectImpl() in the class to minimize the
552// cost of constructing placeholder values.
553#define GET_GLOBALISEL_TEMPORARIES_DECL
554#include "AArch64GenGlobalISel.inc"
555#undef GET_GLOBALISEL_TEMPORARIES_DECL
556};
557
558} // end anonymous namespace
559
560#define GET_GLOBALISEL_IMPL
561#include "AArch64GenGlobalISel.inc"
562#undef GET_GLOBALISEL_IMPL
563
564AArch64InstructionSelector::AArch64InstructionSelector(
565 const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
566 const AArch64RegisterBankInfo &RBI)
567 : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()),
568 RBI(RBI),
569#define GET_GLOBALISEL_PREDICATES_INIT
570#include "AArch64GenGlobalISel.inc"
571#undef GET_GLOBALISEL_PREDICATES_INIT
572#define GET_GLOBALISEL_TEMPORARIES_INIT
573#include "AArch64GenGlobalISel.inc"
574#undef GET_GLOBALISEL_TEMPORARIES_INIT
575{
576}
577
578// FIXME: This should be target-independent, inferred from the types declared
579// for each class in the bank.
580//
581/// Given a register bank, and a type, return the smallest register class that
582/// can represent that combination.
583static const TargetRegisterClass *
584getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
585 bool GetAllRegSet = false) {
586 if (RB.getID() == AArch64::GPRRegBankID) {
587 if (Ty.getSizeInBits() <= 32)
588 return GetAllRegSet ? &AArch64::GPR32allRegClass
589 : &AArch64::GPR32RegClass;
590 if (Ty.getSizeInBits() == 64)
591 return GetAllRegSet ? &AArch64::GPR64allRegClass
592 : &AArch64::GPR64RegClass;
593 if (Ty.getSizeInBits() == 128)
594 return &AArch64::XSeqPairsClassRegClass;
595 return nullptr;
596 }
597
598 if (RB.getID() == AArch64::FPRRegBankID) {
599 switch (Ty.getSizeInBits()) {
600 case 8:
601 return &AArch64::FPR8RegClass;
602 case 16:
603 return &AArch64::FPR16RegClass;
604 case 32:
605 return &AArch64::FPR32RegClass;
606 case 64:
607 return &AArch64::FPR64RegClass;
608 case 128:
609 return &AArch64::FPR128RegClass;
610 }
611 return nullptr;
612 }
613
614 return nullptr;
615}
616
617/// Given a register bank, and size in bits, return the smallest register class
618/// that can represent that combination.
619static const TargetRegisterClass *
620getMinClassForRegBank(const RegisterBank &RB, TypeSize SizeInBits,
621 bool GetAllRegSet = false) {
622 if (SizeInBits.isScalable()) {
623 assert(RB.getID() == AArch64::FPRRegBankID &&
624 "Expected FPR regbank for scalable type size");
625 return &AArch64::ZPRRegClass;
626 }
627
628 unsigned RegBankID = RB.getID();
629
630 if (RegBankID == AArch64::GPRRegBankID) {
631 assert(!SizeInBits.isScalable() && "Unexpected scalable register size");
632 if (SizeInBits <= 32)
633 return GetAllRegSet ? &AArch64::GPR32allRegClass
634 : &AArch64::GPR32RegClass;
635 if (SizeInBits == 64)
636 return GetAllRegSet ? &AArch64::GPR64allRegClass
637 : &AArch64::GPR64RegClass;
638 if (SizeInBits == 128)
639 return &AArch64::XSeqPairsClassRegClass;
640 }
641
642 if (RegBankID == AArch64::FPRRegBankID) {
643 if (SizeInBits.isScalable()) {
644 assert(SizeInBits == TypeSize::getScalable(128) &&
645 "Unexpected scalable register size");
646 return &AArch64::ZPRRegClass;
647 }
648
649 switch (SizeInBits) {
650 default:
651 return nullptr;
652 case 8:
653 return &AArch64::FPR8RegClass;
654 case 16:
655 return &AArch64::FPR16RegClass;
656 case 32:
657 return &AArch64::FPR32RegClass;
658 case 64:
659 return &AArch64::FPR64RegClass;
660 case 128:
661 return &AArch64::FPR128RegClass;
662 }
663 }
664
665 return nullptr;
666}
667
668/// Returns the correct subregister to use for a given register class.
669static bool getSubRegForClass(const TargetRegisterClass *RC,
670 const TargetRegisterInfo &TRI, unsigned &SubReg) {
671 switch (TRI.getRegSizeInBits(RC: *RC)) {
672 case 8:
673 SubReg = AArch64::bsub;
674 break;
675 case 16:
676 SubReg = AArch64::hsub;
677 break;
678 case 32:
679 if (RC != &AArch64::FPR32RegClass)
680 SubReg = AArch64::sub_32;
681 else
682 SubReg = AArch64::ssub;
683 break;
684 case 64:
685 SubReg = AArch64::dsub;
686 break;
687 default:
688 LLVM_DEBUG(
689 dbgs() << "Couldn't find appropriate subregister for register class.");
690 return false;
691 }
692
693 return true;
694}
695
696/// Returns the minimum size the given register bank can hold.
697static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
698 switch (RB.getID()) {
699 case AArch64::GPRRegBankID:
700 return 32;
701 case AArch64::FPRRegBankID:
702 return 8;
703 default:
704 llvm_unreachable("Tried to get minimum size for unknown register bank.");
705 }
706}
707
708/// Create a REG_SEQUENCE instruction using the registers in \p Regs.
709/// Helper function for functions like createDTuple and createQTuple.
710///
711/// \p RegClassIDs - The list of register class IDs available for some tuple of
712/// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
713/// expected to contain between 2 and 4 tuple classes.
714///
715/// \p SubRegs - The list of subregister classes associated with each register
716/// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
717/// subregister class. The index of each subregister class is expected to
718/// correspond with the index of each register class.
719///
720/// \returns Either the destination register of REG_SEQUENCE instruction that
721/// was created, or the 0th element of \p Regs if \p Regs contains a single
722/// element.
723static Register createTuple(ArrayRef<Register> Regs,
724 const unsigned RegClassIDs[],
725 const unsigned SubRegs[], MachineIRBuilder &MIB) {
726 unsigned NumRegs = Regs.size();
727 if (NumRegs == 1)
728 return Regs[0];
729 assert(NumRegs >= 2 && NumRegs <= 4 &&
730 "Only support between two and 4 registers in a tuple!");
731 const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
732 auto *DesiredClass = TRI->getRegClass(i: RegClassIDs[NumRegs - 2]);
733 auto RegSequence =
734 MIB.buildInstr(Opc: TargetOpcode::REG_SEQUENCE, DstOps: {DesiredClass}, SrcOps: {});
735 for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
736 RegSequence.addUse(RegNo: Regs[I]);
737 RegSequence.addImm(Val: SubRegs[I]);
738 }
739 return RegSequence.getReg(Idx: 0);
740}
741
742/// Create a tuple of D-registers using the registers in \p Regs.
743static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
744 static const unsigned RegClassIDs[] = {
745 AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
746 static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
747 AArch64::dsub2, AArch64::dsub3};
748 return createTuple(Regs, RegClassIDs, SubRegs, MIB);
749}
750
751/// Create a tuple of Q-registers using the registers in \p Regs.
752static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
753 static const unsigned RegClassIDs[] = {
754 AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
755 static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
756 AArch64::qsub2, AArch64::qsub3};
757 return createTuple(Regs, RegClassIDs, SubRegs, MIB);
758}
759
760static std::optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
761 auto &MI = *Root.getParent();
762 auto &MBB = *MI.getParent();
763 auto &MF = *MBB.getParent();
764 auto &MRI = MF.getRegInfo();
765 uint64_t Immed;
766 if (Root.isImm())
767 Immed = Root.getImm();
768 else if (Root.isCImm())
769 Immed = Root.getCImm()->getZExtValue();
770 else if (Root.isReg()) {
771 auto ValAndVReg =
772 getIConstantVRegValWithLookThrough(VReg: Root.getReg(), MRI, LookThroughInstrs: true);
773 if (!ValAndVReg)
774 return std::nullopt;
775 Immed = ValAndVReg->Value.getSExtValue();
776 } else
777 return std::nullopt;
778 return Immed;
779}
780
781/// Check whether \p I is a currently unsupported binary operation:
782/// - it has an unsized type
783/// - an operand is not a vreg
784/// - all operands are not in the same bank
785/// These are checks that should someday live in the verifier, but right now,
786/// these are mostly limitations of the aarch64 selector.
787static bool unsupportedBinOp(const MachineInstr &I,
788 const AArch64RegisterBankInfo &RBI,
789 const MachineRegisterInfo &MRI,
790 const AArch64RegisterInfo &TRI) {
791 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
792 if (!Ty.isValid()) {
793 LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
794 return true;
795 }
796
797 const RegisterBank *PrevOpBank = nullptr;
798 for (auto &MO : I.operands()) {
799 // FIXME: Support non-register operands.
800 if (!MO.isReg()) {
801 LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
802 return true;
803 }
804
805 // FIXME: Can generic operations have physical registers operands? If
806 // so, this will need to be taught about that, and we'll need to get the
807 // bank out of the minimal class for the register.
808 // Either way, this needs to be documented (and possibly verified).
809 if (!MO.getReg().isVirtual()) {
810 LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
811 return true;
812 }
813
814 const RegisterBank *OpBank = RBI.getRegBank(Reg: MO.getReg(), MRI, TRI);
815 if (!OpBank) {
816 LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
817 return true;
818 }
819
820 if (PrevOpBank && OpBank != PrevOpBank) {
821 LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
822 return true;
823 }
824 PrevOpBank = OpBank;
825 }
826 return false;
827}
828
829/// Select the AArch64 opcode for the basic binary operation \p GenericOpc
830/// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
831/// and of size \p OpSize.
832/// \returns \p GenericOpc if the combination is unsupported.
833static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
834 unsigned OpSize) {
835 switch (RegBankID) {
836 case AArch64::GPRRegBankID:
837 if (OpSize == 32) {
838 switch (GenericOpc) {
839 case TargetOpcode::G_SHL:
840 return AArch64::LSLVWr;
841 case TargetOpcode::G_LSHR:
842 return AArch64::LSRVWr;
843 case TargetOpcode::G_ASHR:
844 return AArch64::ASRVWr;
845 default:
846 return GenericOpc;
847 }
848 } else if (OpSize == 64) {
849 switch (GenericOpc) {
850 case TargetOpcode::G_PTR_ADD:
851 return AArch64::ADDXrr;
852 case TargetOpcode::G_SHL:
853 return AArch64::LSLVXr;
854 case TargetOpcode::G_LSHR:
855 return AArch64::LSRVXr;
856 case TargetOpcode::G_ASHR:
857 return AArch64::ASRVXr;
858 default:
859 return GenericOpc;
860 }
861 }
862 break;
863 case AArch64::FPRRegBankID:
864 switch (OpSize) {
865 case 32:
866 switch (GenericOpc) {
867 case TargetOpcode::G_FADD:
868 return AArch64::FADDSrr;
869 case TargetOpcode::G_FSUB:
870 return AArch64::FSUBSrr;
871 case TargetOpcode::G_FMUL:
872 return AArch64::FMULSrr;
873 case TargetOpcode::G_FDIV:
874 return AArch64::FDIVSrr;
875 default:
876 return GenericOpc;
877 }
878 case 64:
879 switch (GenericOpc) {
880 case TargetOpcode::G_FADD:
881 return AArch64::FADDDrr;
882 case TargetOpcode::G_FSUB:
883 return AArch64::FSUBDrr;
884 case TargetOpcode::G_FMUL:
885 return AArch64::FMULDrr;
886 case TargetOpcode::G_FDIV:
887 return AArch64::FDIVDrr;
888 case TargetOpcode::G_OR:
889 return AArch64::ORRv8i8;
890 default:
891 return GenericOpc;
892 }
893 }
894 break;
895 }
896 return GenericOpc;
897}
898
899/// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
900/// appropriate for the (value) register bank \p RegBankID and of memory access
901/// size \p OpSize. This returns the variant with the base+unsigned-immediate
902/// addressing mode (e.g., LDRXui).
903/// \returns \p GenericOpc if the combination is unsupported.
904static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
905 unsigned OpSize) {
906 const bool isStore = GenericOpc == TargetOpcode::G_STORE;
907 switch (RegBankID) {
908 case AArch64::GPRRegBankID:
909 switch (OpSize) {
910 case 8:
911 return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
912 case 16:
913 return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
914 case 32:
915 return isStore ? AArch64::STRWui : AArch64::LDRWui;
916 case 64:
917 return isStore ? AArch64::STRXui : AArch64::LDRXui;
918 }
919 break;
920 case AArch64::FPRRegBankID:
921 switch (OpSize) {
922 case 8:
923 return isStore ? AArch64::STRBui : AArch64::LDRBui;
924 case 16:
925 return isStore ? AArch64::STRHui : AArch64::LDRHui;
926 case 32:
927 return isStore ? AArch64::STRSui : AArch64::LDRSui;
928 case 64:
929 return isStore ? AArch64::STRDui : AArch64::LDRDui;
930 case 128:
931 return isStore ? AArch64::STRQui : AArch64::LDRQui;
932 }
933 break;
934 }
935 return GenericOpc;
936}
937
938/// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
939/// to \p *To.
940///
941/// E.g "To = COPY SrcReg:SubReg"
942static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
943 const RegisterBankInfo &RBI, Register SrcReg,
944 const TargetRegisterClass *To, unsigned SubReg) {
945 assert(SrcReg.isValid() && "Expected a valid source register?");
946 assert(To && "Destination register class cannot be null");
947 assert(SubReg && "Expected a valid subregister");
948
949 MachineIRBuilder MIB(I);
950 auto SubRegCopy =
951 MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {To}, SrcOps: {}).addReg(RegNo: SrcReg, Flags: {}, SubReg);
952 MachineOperand &RegOp = I.getOperand(i: 1);
953 RegOp.setReg(SubRegCopy.getReg(Idx: 0));
954
955 // It's possible that the destination register won't be constrained. Make
956 // sure that happens.
957 if (!I.getOperand(i: 0).getReg().isPhysical())
958 RBI.constrainGenericRegister(Reg: I.getOperand(i: 0).getReg(), RC: *To, MRI);
959
960 return true;
961}
962
963/// Helper function to get the source and destination register classes for a
964/// copy. Returns a std::pair containing the source register class for the
965/// copy, and the destination register class for the copy. If a register class
966/// cannot be determined, then it will be nullptr.
967static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
968getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
969 MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
970 const RegisterBankInfo &RBI) {
971 Register DstReg = I.getOperand(i: 0).getReg();
972 Register SrcReg = I.getOperand(i: 1).getReg();
973 const RegisterBank &DstRegBank = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
974 const RegisterBank &SrcRegBank = *RBI.getRegBank(Reg: SrcReg, MRI, TRI);
975
976 TypeSize DstSize = RBI.getSizeInBits(Reg: DstReg, MRI, TRI);
977 TypeSize SrcSize = RBI.getSizeInBits(Reg: SrcReg, MRI, TRI);
978
979 // Special casing for cross-bank copies of s1s. We can technically represent
980 // a 1-bit value with any size of register. The minimum size for a GPR is 32
981 // bits. So, we need to put the FPR on 32 bits as well.
982 //
983 // FIXME: I'm not sure if this case holds true outside of copies. If it does,
984 // then we can pull it into the helpers that get the appropriate class for a
985 // register bank. Or make a new helper that carries along some constraint
986 // information.
987 if (SrcRegBank != DstRegBank &&
988 (DstSize == TypeSize::getFixed(ExactSize: 1) && SrcSize == TypeSize::getFixed(ExactSize: 1)))
989 SrcSize = DstSize = TypeSize::getFixed(ExactSize: 32);
990
991 return {getMinClassForRegBank(RB: SrcRegBank, SizeInBits: SrcSize, GetAllRegSet: true),
992 getMinClassForRegBank(RB: DstRegBank, SizeInBits: DstSize, GetAllRegSet: true)};
993}
994
995// FIXME: We need some sort of API in RBI/TRI to allow generic code to
996// constrain operands of simple instructions given a TargetRegisterClass
997// and LLT
998static bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI,
999 const RegisterBankInfo &RBI) {
1000 for (MachineOperand &MO : I.operands()) {
1001 if (!MO.isReg())
1002 continue;
1003 Register Reg = MO.getReg();
1004 if (!Reg)
1005 continue;
1006 if (Reg.isPhysical())
1007 continue;
1008 LLT Ty = MRI.getType(Reg);
1009 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
1010 const TargetRegisterClass *RC =
1011 dyn_cast<const TargetRegisterClass *>(Val: RegClassOrBank);
1012 if (!RC) {
1013 const RegisterBank &RB = *cast<const RegisterBank *>(Val: RegClassOrBank);
1014 RC = getRegClassForTypeOnBank(Ty, RB);
1015 if (!RC) {
1016 LLVM_DEBUG(
1017 dbgs() << "Warning: DBG_VALUE operand has unexpected size/bank\n");
1018 break;
1019 }
1020 }
1021 RBI.constrainGenericRegister(Reg, RC: *RC, MRI);
1022 }
1023
1024 return true;
1025}
1026
1027static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
1028 MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
1029 const RegisterBankInfo &RBI) {
1030 Register DstReg = I.getOperand(i: 0).getReg();
1031 Register SrcReg = I.getOperand(i: 1).getReg();
1032 const RegisterBank &DstRegBank = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
1033 const RegisterBank &SrcRegBank = *RBI.getRegBank(Reg: SrcReg, MRI, TRI);
1034
1035 // Find the correct register classes for the source and destination registers.
1036 const TargetRegisterClass *SrcRC;
1037 const TargetRegisterClass *DstRC;
1038 std::tie(args&: SrcRC, args&: DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);
1039
1040 if (!DstRC) {
1041 LLVM_DEBUG(dbgs() << "Unexpected dest size "
1042 << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
1043 return false;
1044 }
1045
1046 // Is this a copy? If so, then we may need to insert a subregister copy.
1047 if (I.isCopy()) {
1048 // Yes. Check if there's anything to fix up.
1049 if (!SrcRC) {
1050 LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
1051 return false;
1052 }
1053
1054 const TypeSize SrcSize = TRI.getRegSizeInBits(RC: *SrcRC);
1055 const TypeSize DstSize = TRI.getRegSizeInBits(RC: *DstRC);
1056 unsigned SrcSubReg = I.getOperand(i: 1).getSubReg();
1057 unsigned SubReg;
1058
1059 if (SrcSubReg)
1060 return RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI);
1061
1062 // If the source bank doesn't support a subregister copy small enough,
1063 // then we first need to copy to the destination bank.
1064 if (getMinSizeForRegBank(RB: SrcRegBank) > DstSize) {
1065 const TargetRegisterClass *DstTempRC =
1066 getMinClassForRegBank(RB: DstRegBank, SizeInBits: SrcSize, /* GetAllRegSet */ true);
1067 getSubRegForClass(RC: DstRC, TRI, SubReg);
1068
1069 MachineIRBuilder MIB(I);
1070 auto Copy = MIB.buildCopy(Res: {DstTempRC}, Op: {SrcReg});
1071 copySubReg(I, MRI, RBI, SrcReg: Copy.getReg(Idx: 0), To: DstRC, SubReg);
1072 } else if (SrcSize > DstSize) {
1073 // If the source register is bigger than the destination we need to
1074 // perform a subregister copy.
1075 const TargetRegisterClass *SubRegRC =
1076 getMinClassForRegBank(RB: SrcRegBank, SizeInBits: DstSize, /* GetAllRegSet */ true);
1077 getSubRegForClass(RC: SubRegRC, TRI, SubReg);
1078 copySubReg(I, MRI, RBI, SrcReg, To: DstRC, SubReg);
1079 } else if (DstSize > SrcSize) {
1080 // If the destination register is bigger than the source we need to do
1081 // a promotion using SUBREG_TO_REG.
1082 const TargetRegisterClass *PromotionRC =
1083 getMinClassForRegBank(RB: SrcRegBank, SizeInBits: DstSize, /* GetAllRegSet */ true);
1084 getSubRegForClass(RC: SrcRC, TRI, SubReg);
1085
1086 Register PromoteReg = MRI.createVirtualRegister(RegClass: PromotionRC);
1087 BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(),
1088 MCID: TII.get(Opcode: AArch64::SUBREG_TO_REG), DestReg: PromoteReg)
1089 .addUse(RegNo: SrcReg)
1090 .addImm(Val: SubReg);
1091 MachineOperand &RegOp = I.getOperand(i: 1);
1092 RegOp.setReg(PromoteReg);
1093 }
1094
1095 // If the destination is a physical register, then there's nothing to
1096 // change, so we're done.
1097 if (DstReg.isPhysical())
1098 return true;
1099 }
1100
1101 // No need to constrain SrcReg. It will get constrained when we hit another
1102 // of its use or its defs. Copies do not have constraints.
1103 if (!RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI)) {
1104 LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
1105 << " operand\n");
1106 return false;
1107 }
1108
1109 // If this a GPR ZEXT that we want to just reduce down into a copy.
1110 // The sizes will be mismatched with the source < 32b but that's ok.
1111 if (I.getOpcode() == TargetOpcode::G_ZEXT) {
1112 I.setDesc(TII.get(Opcode: AArch64::COPY));
1113 assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
1114 return selectCopy(I, TII, MRI, TRI, RBI);
1115 }
1116
1117 I.setDesc(TII.get(Opcode: AArch64::COPY));
1118 return true;
1119}
1120
1121MachineInstr *
1122AArch64InstructionSelector::emitSelect(Register Dst, Register True,
1123 Register False, AArch64CC::CondCode CC,
1124 MachineIRBuilder &MIB) const {
1125 MachineRegisterInfo &MRI = *MIB.getMRI();
1126 assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
1127 RBI.getRegBank(True, MRI, TRI)->getID() &&
1128 "Expected both select operands to have the same regbank?");
1129 LLT Ty = MRI.getType(Reg: True);
1130 if (Ty.isVector())
1131 return nullptr;
1132 const unsigned Size = Ty.getSizeInBits();
1133 assert((Size == 32 || Size == 64) &&
1134 "Expected 32 bit or 64 bit select only?");
1135 const bool Is32Bit = Size == 32;
1136 if (RBI.getRegBank(Reg: True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
1137 unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
1138 auto FCSel = MIB.buildInstr(Opc, DstOps: {Dst}, SrcOps: {True, False}).addImm(Val: CC);
1139 constrainSelectedInstRegOperands(I&: *FCSel, TII, TRI, RBI);
1140 return &*FCSel;
1141 }
1142
1143 // By default, we'll try and emit a CSEL.
1144 unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
1145 bool Optimized = false;
1146 auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
1147 &Optimized](Register &Reg, Register &OtherReg,
1148 bool Invert) {
1149 if (Optimized)
1150 return false;
1151
1152 // Attempt to fold:
1153 //
1154 // %sub = G_SUB 0, %x
1155 // %select = G_SELECT cc, %reg, %sub
1156 //
1157 // Into:
1158 // %select = CSNEG %reg, %x, cc
1159 Register MatchReg;
1160 if (mi_match(R: Reg, MRI, P: m_Neg(Src: m_Reg(R&: MatchReg)))) {
1161 Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
1162 Reg = MatchReg;
1163 if (Invert) {
1164 CC = AArch64CC::getInvertedCondCode(Code: CC);
1165 std::swap(a&: Reg, b&: OtherReg);
1166 }
1167 return true;
1168 }
1169
1170 // Attempt to fold:
1171 //
1172 // %xor = G_XOR %x, -1
1173 // %select = G_SELECT cc, %reg, %xor
1174 //
1175 // Into:
1176 // %select = CSINV %reg, %x, cc
1177 if (mi_match(R: Reg, MRI, P: m_Not(Src: m_Reg(R&: MatchReg)))) {
1178 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1179 Reg = MatchReg;
1180 if (Invert) {
1181 CC = AArch64CC::getInvertedCondCode(Code: CC);
1182 std::swap(a&: Reg, b&: OtherReg);
1183 }
1184 return true;
1185 }
1186
1187 // Attempt to fold:
1188 //
1189 // %add = G_ADD %x, 1
1190 // %select = G_SELECT cc, %reg, %add
1191 //
1192 // Into:
1193 // %select = CSINC %reg, %x, cc
1194 if (mi_match(R: Reg, MRI,
1195 P: m_any_of(preds: m_GAdd(L: m_Reg(R&: MatchReg), R: m_SpecificICst(RequestedValue: 1)),
1196 preds: m_GPtrAdd(L: m_Reg(R&: MatchReg), R: m_SpecificICst(RequestedValue: 1))))) {
1197 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1198 Reg = MatchReg;
1199 if (Invert) {
1200 CC = AArch64CC::getInvertedCondCode(Code: CC);
1201 std::swap(a&: Reg, b&: OtherReg);
1202 }
1203 return true;
1204 }
1205
1206 return false;
1207 };
1208
1209 // Helper lambda which tries to use CSINC/CSINV for the instruction when its
1210 // true/false values are constants.
1211 // FIXME: All of these patterns already exist in tablegen. We should be
1212 // able to import these.
1213 auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
1214 &Optimized]() {
1215 if (Optimized)
1216 return false;
1217 auto TrueCst = getIConstantVRegValWithLookThrough(VReg: True, MRI);
1218 auto FalseCst = getIConstantVRegValWithLookThrough(VReg: False, MRI);
1219 if (!TrueCst && !FalseCst)
1220 return false;
1221
1222 Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
1223 if (TrueCst && FalseCst) {
1224 int64_t T = TrueCst->Value.getSExtValue();
1225 int64_t F = FalseCst->Value.getSExtValue();
1226
1227 if (T == 0 && F == 1) {
1228 // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
1229 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1230 True = ZReg;
1231 False = ZReg;
1232 return true;
1233 }
1234
1235 if (T == 0 && F == -1) {
1236 // G_SELECT cc 0, -1 -> CSINV zreg, zreg cc
1237 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1238 True = ZReg;
1239 False = ZReg;
1240 return true;
1241 }
1242 }
1243
1244 if (TrueCst) {
1245 int64_t T = TrueCst->Value.getSExtValue();
1246 if (T == 1) {
1247 // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
1248 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1249 True = False;
1250 False = ZReg;
1251 CC = AArch64CC::getInvertedCondCode(Code: CC);
1252 return true;
1253 }
1254
1255 if (T == -1) {
1256 // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
1257 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1258 True = False;
1259 False = ZReg;
1260 CC = AArch64CC::getInvertedCondCode(Code: CC);
1261 return true;
1262 }
1263 }
1264
1265 if (FalseCst) {
1266 int64_t F = FalseCst->Value.getSExtValue();
1267 if (F == 1) {
1268 // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
1269 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1270 False = ZReg;
1271 return true;
1272 }
1273
1274 if (F == -1) {
1275 // G_SELECT cc, t, -1 -> CSINC t, zreg, cc
1276 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1277 False = ZReg;
1278 return true;
1279 }
1280 }
1281 return false;
1282 };
1283
1284 Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
1285 Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
1286 Optimized |= TryOptSelectCst();
1287 auto SelectInst = MIB.buildInstr(Opc, DstOps: {Dst}, SrcOps: {True, False}).addImm(Val: CC);
1288 constrainSelectedInstRegOperands(I&: *SelectInst, TII, TRI, RBI);
1289 return &*SelectInst;
1290}
1291
1292static AArch64CC::CondCode
1293changeICMPPredToAArch64CC(CmpInst::Predicate P, Register RHS = {},
1294 MachineRegisterInfo *MRI = nullptr) {
1295 switch (P) {
1296 default:
1297 llvm_unreachable("Unknown condition code!");
1298 case CmpInst::ICMP_NE:
1299 return AArch64CC::NE;
1300 case CmpInst::ICMP_EQ:
1301 return AArch64CC::EQ;
1302 case CmpInst::ICMP_SGT:
1303 return AArch64CC::GT;
1304 case CmpInst::ICMP_SGE:
1305 if (RHS && MRI) {
1306 auto ValAndVReg = getIConstantVRegValWithLookThrough(VReg: RHS, MRI: *MRI);
1307 if (ValAndVReg && ValAndVReg->Value == 0)
1308 return AArch64CC::PL;
1309 }
1310 return AArch64CC::GE;
1311 case CmpInst::ICMP_SLT:
1312 if (RHS && MRI) {
1313 auto ValAndVReg = getIConstantVRegValWithLookThrough(VReg: RHS, MRI: *MRI);
1314 if (ValAndVReg && ValAndVReg->Value == 0)
1315 return AArch64CC::MI;
1316 }
1317 return AArch64CC::LT;
1318 case CmpInst::ICMP_SLE:
1319 return AArch64CC::LE;
1320 case CmpInst::ICMP_UGT:
1321 return AArch64CC::HI;
1322 case CmpInst::ICMP_UGE:
1323 return AArch64CC::HS;
1324 case CmpInst::ICMP_ULT:
1325 return AArch64CC::LO;
1326 case CmpInst::ICMP_ULE:
1327 return AArch64CC::LS;
1328 }
1329}
1330
1331/// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC.
1332static void changeFPCCToORAArch64CC(CmpInst::Predicate CC,
1333 AArch64CC::CondCode &CondCode,
1334 AArch64CC::CondCode &CondCode2) {
1335 CondCode2 = AArch64CC::AL;
1336 switch (CC) {
1337 default:
1338 llvm_unreachable("Unknown FP condition!");
1339 case CmpInst::FCMP_OEQ:
1340 CondCode = AArch64CC::EQ;
1341 break;
1342 case CmpInst::FCMP_OGT:
1343 CondCode = AArch64CC::GT;
1344 break;
1345 case CmpInst::FCMP_OGE:
1346 CondCode = AArch64CC::GE;
1347 break;
1348 case CmpInst::FCMP_OLT:
1349 CondCode = AArch64CC::MI;
1350 break;
1351 case CmpInst::FCMP_OLE:
1352 CondCode = AArch64CC::LS;
1353 break;
1354 case CmpInst::FCMP_ONE:
1355 CondCode = AArch64CC::MI;
1356 CondCode2 = AArch64CC::GT;
1357 break;
1358 case CmpInst::FCMP_ORD:
1359 CondCode = AArch64CC::VC;
1360 break;
1361 case CmpInst::FCMP_UNO:
1362 CondCode = AArch64CC::VS;
1363 break;
1364 case CmpInst::FCMP_UEQ:
1365 CondCode = AArch64CC::EQ;
1366 CondCode2 = AArch64CC::VS;
1367 break;
1368 case CmpInst::FCMP_UGT:
1369 CondCode = AArch64CC::HI;
1370 break;
1371 case CmpInst::FCMP_UGE:
1372 CondCode = AArch64CC::PL;
1373 break;
1374 case CmpInst::FCMP_ULT:
1375 CondCode = AArch64CC::LT;
1376 break;
1377 case CmpInst::FCMP_ULE:
1378 CondCode = AArch64CC::LE;
1379 break;
1380 case CmpInst::FCMP_UNE:
1381 CondCode = AArch64CC::NE;
1382 break;
1383 }
1384}
1385
1386/// Convert an IR fp condition code to an AArch64 CC.
1387/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1388/// should be AND'ed instead of OR'ed.
1389static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC,
1390 AArch64CC::CondCode &CondCode,
1391 AArch64CC::CondCode &CondCode2) {
1392 CondCode2 = AArch64CC::AL;
1393 switch (CC) {
1394 default:
1395 changeFPCCToORAArch64CC(CC, CondCode, CondCode2);
1396 assert(CondCode2 == AArch64CC::AL);
1397 break;
1398 case CmpInst::FCMP_ONE:
1399 // (a one b)
1400 // == ((a olt b) || (a ogt b))
1401 // == ((a ord b) && (a une b))
1402 CondCode = AArch64CC::VC;
1403 CondCode2 = AArch64CC::NE;
1404 break;
1405 case CmpInst::FCMP_UEQ:
1406 // (a ueq b)
1407 // == ((a uno b) || (a oeq b))
1408 // == ((a ule b) && (a uge b))
1409 CondCode = AArch64CC::PL;
1410 CondCode2 = AArch64CC::LE;
1411 break;
1412 }
1413}
1414
1415/// Return a register which can be used as a bit to test in a TB(N)Z.
1416static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
1417 MachineRegisterInfo &MRI) {
1418 assert(Reg.isValid() && "Expected valid register!");
1419 bool HasZext = false;
1420 while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
1421 unsigned Opc = MI->getOpcode();
1422
1423 if (!MI->getOperand(i: 0).isReg() ||
1424 !MRI.hasOneNonDBGUse(RegNo: MI->getOperand(i: 0).getReg()))
1425 break;
1426
1427 // (tbz (any_ext x), b) -> (tbz x, b) and
1428 // (tbz (zext x), b) -> (tbz x, b) if we don't use the extended bits.
1429 //
1430 // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
1431 // on the truncated x is the same as the bit number on x.
1432 if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
1433 Opc == TargetOpcode::G_TRUNC) {
1434 if (Opc == TargetOpcode::G_ZEXT)
1435 HasZext = true;
1436
1437 Register NextReg = MI->getOperand(i: 1).getReg();
1438 // Did we find something worth folding?
1439 if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(RegNo: NextReg))
1440 break;
1441 TypeSize InSize = MRI.getType(Reg: NextReg).getSizeInBits();
1442 if (Bit >= InSize)
1443 break;
1444
1445 // NextReg is worth folding. Keep looking.
1446 Reg = NextReg;
1447 continue;
1448 }
1449
1450 // Attempt to find a suitable operation with a constant on one side.
1451 std::optional<uint64_t> C;
1452 Register TestReg;
1453 switch (Opc) {
1454 default:
1455 break;
1456 case TargetOpcode::G_AND:
1457 case TargetOpcode::G_XOR: {
1458 TestReg = MI->getOperand(i: 1).getReg();
1459 Register ConstantReg = MI->getOperand(i: 2).getReg();
1460 auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: ConstantReg, MRI);
1461 if (!VRegAndVal) {
1462 // AND commutes, check the other side for a constant.
1463 // FIXME: Can we canonicalize the constant so that it's always on the
1464 // same side at some point earlier?
1465 std::swap(a&: ConstantReg, b&: TestReg);
1466 VRegAndVal = getIConstantVRegValWithLookThrough(VReg: ConstantReg, MRI);
1467 }
1468 if (VRegAndVal) {
1469 if (HasZext)
1470 C = VRegAndVal->Value.getZExtValue();
1471 else
1472 C = VRegAndVal->Value.getSExtValue();
1473 }
1474 break;
1475 }
1476 case TargetOpcode::G_ASHR:
1477 case TargetOpcode::G_LSHR:
1478 case TargetOpcode::G_SHL: {
1479 TestReg = MI->getOperand(i: 1).getReg();
1480 auto VRegAndVal =
1481 getIConstantVRegValWithLookThrough(VReg: MI->getOperand(i: 2).getReg(), MRI);
1482 if (VRegAndVal)
1483 C = VRegAndVal->Value.getSExtValue();
1484 break;
1485 }
1486 }
1487
1488 // Didn't find a constant or viable register. Bail out of the loop.
1489 if (!C || !TestReg.isValid())
1490 break;
1491
1492 // We found a suitable instruction with a constant. Check to see if we can
1493 // walk through the instruction.
1494 Register NextReg;
1495 unsigned TestRegSize = MRI.getType(Reg: TestReg).getSizeInBits();
1496 switch (Opc) {
1497 default:
1498 break;
1499 case TargetOpcode::G_AND:
1500 // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
1501 if ((*C >> Bit) & 1)
1502 NextReg = TestReg;
1503 break;
1504 case TargetOpcode::G_SHL:
1505 // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
1506 // the type of the register.
1507 if (*C <= Bit && (Bit - *C) < TestRegSize) {
1508 NextReg = TestReg;
1509 Bit = Bit - *C;
1510 }
1511 break;
1512 case TargetOpcode::G_ASHR:
1513 // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits
1514 // in x
1515 NextReg = TestReg;
1516 Bit = Bit + *C;
1517 if (Bit >= TestRegSize)
1518 Bit = TestRegSize - 1;
1519 break;
1520 case TargetOpcode::G_LSHR:
1521 // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
1522 if ((Bit + *C) < TestRegSize) {
1523 NextReg = TestReg;
1524 Bit = Bit + *C;
1525 }
1526 break;
1527 case TargetOpcode::G_XOR:
1528 // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
1529 // appropriate.
1530 //
1531 // e.g. If x' = xor x, c, and the b-th bit is set in c then
1532 //
1533 // tbz x', b -> tbnz x, b
1534 //
1535 // Because x' only has the b-th bit set if x does not.
1536 if ((*C >> Bit) & 1)
1537 Invert = !Invert;
1538 NextReg = TestReg;
1539 break;
1540 }
1541
1542 // Check if we found anything worth folding.
1543 if (!NextReg.isValid())
1544 return Reg;
1545 Reg = NextReg;
1546 }
1547
1548 return Reg;
1549}
1550
1551MachineInstr *AArch64InstructionSelector::emitTestBit(
1552 Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
1553 MachineIRBuilder &MIB) const {
1554 assert(TestReg.isValid());
1555 assert(ProduceNonFlagSettingCondBr &&
1556 "Cannot emit TB(N)Z with speculation tracking!");
1557 MachineRegisterInfo &MRI = *MIB.getMRI();
1558
1559 // Attempt to optimize the test bit by walking over instructions.
1560 TestReg = getTestBitReg(Reg: TestReg, Bit, Invert&: IsNegative, MRI);
1561 LLT Ty = MRI.getType(Reg: TestReg);
1562 unsigned Size = Ty.getSizeInBits();
1563 assert(!Ty.isVector() && "Expected a scalar!");
1564 assert(Bit < 64 && "Bit is too large!");
1565
1566 // When the test register is a 64-bit register, we have to narrow to make
1567 // TBNZW work.
1568 bool UseWReg = Bit < 32;
1569 unsigned NecessarySize = UseWReg ? 32 : 64;
1570 if (Size != NecessarySize)
1571 TestReg = moveScalarRegClass(
1572 Reg: TestReg, RC: UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass,
1573 MIB);
1574
1575 static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
1576 {AArch64::TBZW, AArch64::TBNZW}};
1577 unsigned Opc = OpcTable[UseWReg][IsNegative];
1578 auto TestBitMI =
1579 MIB.buildInstr(Opcode: Opc).addReg(RegNo: TestReg).addImm(Val: Bit).addMBB(MBB: DstMBB);
1580 constrainSelectedInstRegOperands(I&: *TestBitMI, TII, TRI, RBI);
1581 return &*TestBitMI;
1582}
1583
1584bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
1585 MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
1586 MachineIRBuilder &MIB) const {
1587 assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
1588 // Given something like this:
1589 //
1590 // %x = ...Something...
1591 // %one = G_CONSTANT i64 1
1592 // %zero = G_CONSTANT i64 0
1593 // %and = G_AND %x, %one
1594 // %cmp = G_ICMP intpred(ne), %and, %zero
1595 // %cmp_trunc = G_TRUNC %cmp
1596 // G_BRCOND %cmp_trunc, %bb.3
1597 //
1598 // We want to try and fold the AND into the G_BRCOND and produce either a
1599 // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
1600 //
1601 // In this case, we'd get
1602 //
1603 // TBNZ %x %bb.3
1604 //
1605
1606 // Check if the AND has a constant on its RHS which we can use as a mask.
1607 // If it's a power of 2, then it's the same as checking a specific bit.
1608 // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set)
1609 auto MaybeBit = getIConstantVRegValWithLookThrough(
1610 VReg: AndInst.getOperand(i: 2).getReg(), MRI: *MIB.getMRI());
1611 if (!MaybeBit)
1612 return false;
1613
1614 int32_t Bit = MaybeBit->Value.exactLogBase2();
1615 if (Bit < 0)
1616 return false;
1617
1618 Register TestReg = AndInst.getOperand(i: 1).getReg();
1619
1620 // Emit a TB(N)Z.
1621 emitTestBit(TestReg, Bit, IsNegative: Invert, DstMBB, MIB);
1622 return true;
1623}
1624
1625MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
1626 bool IsNegative,
1627 MachineBasicBlock *DestMBB,
1628 MachineIRBuilder &MIB) const {
1629 assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
1630 MachineRegisterInfo &MRI = *MIB.getMRI();
1631 assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
1632 AArch64::GPRRegBankID &&
1633 "Expected GPRs only?");
1634 auto Ty = MRI.getType(Reg: CompareReg);
1635 unsigned Width = Ty.getSizeInBits();
1636 assert(!Ty.isVector() && "Expected scalar only?");
1637 assert(Width <= 64 && "Expected width to be at most 64?");
1638 static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
1639 {AArch64::CBNZW, AArch64::CBNZX}};
1640 unsigned Opc = OpcTable[IsNegative][Width == 64];
1641 auto BranchMI = MIB.buildInstr(Opc, DstOps: {}, SrcOps: {CompareReg}).addMBB(MBB: DestMBB);
1642 constrainSelectedInstRegOperands(I&: *BranchMI, TII, TRI, RBI);
1643 return &*BranchMI;
1644}
1645
1646bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
1647 MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const {
1648 assert(FCmp.getOpcode() == TargetOpcode::G_FCMP);
1649 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1650 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
1651 // totally clean. Some of them require two branches to implement.
1652 auto Pred = (CmpInst::Predicate)FCmp.getOperand(i: 1).getPredicate();
1653 emitFPCompare(LHS: FCmp.getOperand(i: 2).getReg(), RHS: FCmp.getOperand(i: 3).getReg(), MIRBuilder&: MIB,
1654 Pred);
1655 AArch64CC::CondCode CC1, CC2;
1656 changeFCMPPredToAArch64CC(P: Pred, CondCode&: CC1, CondCode2&: CC2);
1657 MachineBasicBlock *DestMBB = I.getOperand(i: 1).getMBB();
1658 MIB.buildInstr(Opc: AArch64::Bcc, DstOps: {}, SrcOps: {}).addImm(Val: CC1).addMBB(MBB: DestMBB);
1659 if (CC2 != AArch64CC::AL)
1660 MIB.buildInstr(Opc: AArch64::Bcc, DstOps: {}, SrcOps: {}).addImm(Val: CC2).addMBB(MBB: DestMBB);
1661 I.eraseFromParent();
1662 return true;
1663}
1664
1665bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
1666 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1667 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1668 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1669 // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z.
1670 //
1671 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1672 // instructions will not be produced, as they are conditional branch
1673 // instructions that do not set flags.
1674 if (!ProduceNonFlagSettingCondBr)
1675 return false;
1676
1677 MachineRegisterInfo &MRI = *MIB.getMRI();
1678 MachineBasicBlock *DestMBB = I.getOperand(i: 1).getMBB();
1679 auto Pred =
1680 static_cast<CmpInst::Predicate>(ICmp.getOperand(i: 1).getPredicate());
1681 Register LHS = ICmp.getOperand(i: 2).getReg();
1682 Register RHS = ICmp.getOperand(i: 3).getReg();
1683
1684 // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
1685 auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: RHS, MRI);
1686 MachineInstr *AndInst = getOpcodeDef(Opcode: TargetOpcode::G_AND, Reg: LHS, MRI);
1687
1688 // When we can emit a TB(N)Z, prefer that.
1689 //
1690 // Handle non-commutative condition codes first.
1691 // Note that we don't want to do this when we have a G_AND because it can
1692 // become a tst. The tst will make the test bit in the TB(N)Z redundant.
1693 if (VRegAndVal && !AndInst) {
1694 int64_t C = VRegAndVal->Value.getSExtValue();
1695
1696 // When we have a greater-than comparison, we can just test if the msb is
1697 // zero.
1698 if (C == -1 && Pred == CmpInst::ICMP_SGT) {
1699 uint64_t Bit = MRI.getType(Reg: LHS).getSizeInBits() - 1;
1700 emitTestBit(TestReg: LHS, Bit, /*IsNegative = */ false, DstMBB: DestMBB, MIB);
1701 I.eraseFromParent();
1702 return true;
1703 }
1704
1705 // When we have a less than comparison, we can just test if the msb is not
1706 // zero.
1707 if (C == 0 && Pred == CmpInst::ICMP_SLT) {
1708 uint64_t Bit = MRI.getType(Reg: LHS).getSizeInBits() - 1;
1709 emitTestBit(TestReg: LHS, Bit, /*IsNegative = */ true, DstMBB: DestMBB, MIB);
1710 I.eraseFromParent();
1711 return true;
1712 }
1713
1714 // Inversely, if we have a signed greater-than-or-equal comparison to zero,
1715 // we can test if the msb is zero.
1716 if (C == 0 && Pred == CmpInst::ICMP_SGE) {
1717 uint64_t Bit = MRI.getType(Reg: LHS).getSizeInBits() - 1;
1718 emitTestBit(TestReg: LHS, Bit, /*IsNegative = */ false, DstMBB: DestMBB, MIB);
1719 I.eraseFromParent();
1720 return true;
1721 }
1722 }
1723
1724 // Attempt to handle commutative condition codes. Right now, that's only
1725 // eq/ne.
1726 if (ICmpInst::isEquality(P: Pred)) {
1727 if (!VRegAndVal) {
1728 std::swap(a&: RHS, b&: LHS);
1729 VRegAndVal = getIConstantVRegValWithLookThrough(VReg: RHS, MRI);
1730 AndInst = getOpcodeDef(Opcode: TargetOpcode::G_AND, Reg: LHS, MRI);
1731 }
1732
1733 if (VRegAndVal && VRegAndVal->Value == 0) {
1734 // If there's a G_AND feeding into this branch, try to fold it away by
1735 // emitting a TB(N)Z instead.
1736 //
1737 // Note: If we have LT, then it *is* possible to fold, but it wouldn't be
1738 // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding
1739 // would be redundant.
1740 if (AndInst &&
1741 tryOptAndIntoCompareBranch(
1742 AndInst&: *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DstMBB: DestMBB, MIB)) {
1743 I.eraseFromParent();
1744 return true;
1745 }
1746
1747 // Otherwise, try to emit a CB(N)Z instead.
1748 auto LHSTy = MRI.getType(Reg: LHS);
1749 if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
1750 emitCBZ(CompareReg: LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
1751 I.eraseFromParent();
1752 return true;
1753 }
1754 }
1755 }
1756
1757 return false;
1758}
1759
1760bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
1761 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1762 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1763 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1764 if (tryOptCompareBranchFedByICmp(I, ICmp, MIB))
1765 return true;
1766
1767 // Couldn't optimize. Emit a compare + a Bcc.
1768 MachineBasicBlock *DestMBB = I.getOperand(i: 1).getMBB();
1769 auto &PredOp = ICmp.getOperand(i: 1);
1770 emitIntegerCompare(LHS&: ICmp.getOperand(i: 2), RHS&: ICmp.getOperand(i: 3), Predicate&: PredOp, MIRBuilder&: MIB);
1771 const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
1772 P: static_cast<CmpInst::Predicate>(PredOp.getPredicate()),
1773 RHS: ICmp.getOperand(i: 3).getReg(), MRI: MIB.getMRI());
1774 MIB.buildInstr(Opc: AArch64::Bcc, DstOps: {}, SrcOps: {}).addImm(Val: CC).addMBB(MBB: DestMBB);
1775 I.eraseFromParent();
1776 return true;
1777}
1778
1779bool AArch64InstructionSelector::selectCompareBranch(
1780 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
1781 Register CondReg = I.getOperand(i: 0).getReg();
1782 MachineInstr *CCMI = MRI.getVRegDef(Reg: CondReg);
1783 // Try to select the G_BRCOND using whatever is feeding the condition if
1784 // possible.
1785 unsigned CCMIOpc = CCMI->getOpcode();
1786 if (CCMIOpc == TargetOpcode::G_FCMP)
1787 return selectCompareBranchFedByFCmp(I, FCmp&: *CCMI, MIB);
1788 if (CCMIOpc == TargetOpcode::G_ICMP)
1789 return selectCompareBranchFedByICmp(I, ICmp&: *CCMI, MIB);
1790
1791 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1792 // instructions will not be produced, as they are conditional branch
1793 // instructions that do not set flags.
1794 if (ProduceNonFlagSettingCondBr) {
1795 emitTestBit(TestReg: CondReg, /*Bit = */ 0, /*IsNegative = */ true,
1796 DstMBB: I.getOperand(i: 1).getMBB(), MIB);
1797 I.eraseFromParent();
1798 return true;
1799 }
1800
1801 // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead.
1802 auto TstMI =
1803 MIB.buildInstr(Opc: AArch64::ANDSWri, DstOps: {LLT::scalar(SizeInBits: 32)}, SrcOps: {CondReg}).addImm(Val: 1);
1804 constrainSelectedInstRegOperands(I&: *TstMI, TII, TRI, RBI);
1805 auto Bcc = MIB.buildInstr(Opcode: AArch64::Bcc)
1806 .addImm(Val: AArch64CC::NE)
1807 .addMBB(MBB: I.getOperand(i: 1).getMBB());
1808 I.eraseFromParent();
1809 constrainSelectedInstRegOperands(I&: *Bcc, TII, TRI, RBI);
1810 return true;
1811}
1812
1813/// Returns the element immediate value of a vector shift operand if found.
1814/// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR.
1815static std::optional<int64_t> getVectorShiftImm(Register Reg,
1816 MachineRegisterInfo &MRI) {
1817 assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
1818 MachineInstr *OpMI = MRI.getVRegDef(Reg);
1819 return getAArch64VectorSplatScalar(MI: *OpMI, MRI);
1820}
1821
1822/// Matches and returns the shift immediate value for a SHL instruction given
1823/// a shift operand.
1824static std::optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg,
1825 MachineRegisterInfo &MRI) {
1826 std::optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
1827 if (!ShiftImm)
1828 return std::nullopt;
1829 // Check the immediate is in range for a SHL.
1830 int64_t Imm = *ShiftImm;
1831 if (Imm < 0)
1832 return std::nullopt;
1833 switch (SrcTy.getElementType().getSizeInBits()) {
1834 default:
1835 LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift");
1836 return std::nullopt;
1837 case 8:
1838 if (Imm > 7)
1839 return std::nullopt;
1840 break;
1841 case 16:
1842 if (Imm > 15)
1843 return std::nullopt;
1844 break;
1845 case 32:
1846 if (Imm > 31)
1847 return std::nullopt;
1848 break;
1849 case 64:
1850 if (Imm > 63)
1851 return std::nullopt;
1852 break;
1853 }
1854 return Imm;
1855}
1856
1857bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I,
1858 MachineRegisterInfo &MRI) {
1859 assert(I.getOpcode() == TargetOpcode::G_SHL);
1860 Register DstReg = I.getOperand(i: 0).getReg();
1861 const LLT Ty = MRI.getType(Reg: DstReg);
1862 Register Src1Reg = I.getOperand(i: 1).getReg();
1863 Register Src2Reg = I.getOperand(i: 2).getReg();
1864
1865 if (!Ty.isVector())
1866 return false;
1867
1868 // Check if we have a vector of constants on RHS that we can select as the
1869 // immediate form.
1870 std::optional<int64_t> ImmVal = getVectorSHLImm(SrcTy: Ty, Reg: Src2Reg, MRI);
1871
1872 unsigned Opc = 0;
1873 if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64)) {
1874 Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
1875 } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32)) {
1876 Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
1877 } else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32)) {
1878 Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
1879 } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16)) {
1880 Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
1881 } else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16)) {
1882 Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
1883 } else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 8)) {
1884 Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
1885 } else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 8)) {
1886 Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
1887 } else {
1888 LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
1889 return false;
1890 }
1891
1892 auto Shl = MIB.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {Src1Reg});
1893 if (ImmVal)
1894 Shl.addImm(Val: *ImmVal);
1895 else
1896 Shl.addUse(RegNo: Src2Reg);
1897 constrainSelectedInstRegOperands(I&: *Shl, TII, TRI, RBI);
1898 I.eraseFromParent();
1899 return true;
1900}
1901
1902bool AArch64InstructionSelector::selectVectorAshrLshr(
1903 MachineInstr &I, MachineRegisterInfo &MRI) {
1904 assert(I.getOpcode() == TargetOpcode::G_ASHR ||
1905 I.getOpcode() == TargetOpcode::G_LSHR);
1906 Register DstReg = I.getOperand(i: 0).getReg();
1907 const LLT Ty = MRI.getType(Reg: DstReg);
1908 Register Src1Reg = I.getOperand(i: 1).getReg();
1909 Register Src2Reg = I.getOperand(i: 2).getReg();
1910
1911 if (!Ty.isVector())
1912 return false;
1913
1914 bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR;
1915
1916 // We expect the immediate case to be lowered in the PostLegalCombiner to
1917 // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents.
1918
1919 // There is not a shift right register instruction, but the shift left
1920 // register instruction takes a signed value, where negative numbers specify a
1921 // right shift.
1922
1923 unsigned Opc = 0;
1924 unsigned NegOpc = 0;
1925 const TargetRegisterClass *RC =
1926 getRegClassForTypeOnBank(Ty, RB: RBI.getRegBank(ID: AArch64::FPRRegBankID));
1927 if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64)) {
1928 Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
1929 NegOpc = AArch64::NEGv2i64;
1930 } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32)) {
1931 Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
1932 NegOpc = AArch64::NEGv4i32;
1933 } else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32)) {
1934 Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
1935 NegOpc = AArch64::NEGv2i32;
1936 } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16)) {
1937 Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
1938 NegOpc = AArch64::NEGv4i16;
1939 } else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16)) {
1940 Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
1941 NegOpc = AArch64::NEGv8i16;
1942 } else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 8)) {
1943 Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
1944 NegOpc = AArch64::NEGv16i8;
1945 } else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 8)) {
1946 Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
1947 NegOpc = AArch64::NEGv8i8;
1948 } else {
1949 LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type");
1950 return false;
1951 }
1952
1953 auto Neg = MIB.buildInstr(Opc: NegOpc, DstOps: {RC}, SrcOps: {Src2Reg});
1954 constrainSelectedInstRegOperands(I&: *Neg, TII, TRI, RBI);
1955 auto SShl = MIB.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {Src1Reg, Neg});
1956 constrainSelectedInstRegOperands(I&: *SShl, TII, TRI, RBI);
1957 I.eraseFromParent();
1958 return true;
1959}
1960
1961bool AArch64InstructionSelector::selectVaStartAAPCS(
1962 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1963
1964 if (STI.isCallingConvWin64(CC: MF.getFunction().getCallingConv(),
1965 IsVarArg: MF.getFunction().isVarArg()))
1966 return false;
1967
1968 // The layout of the va_list struct is specified in the AArch64 Procedure Call
1969 // Standard, section 10.1.5.
1970
1971 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
1972 const unsigned PtrSize = STI.isTargetILP32() ? 4 : 8;
1973 const auto *PtrRegClass =
1974 STI.isTargetILP32() ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
1975
1976 const MCInstrDesc &MCIDAddAddr =
1977 TII.get(Opcode: STI.isTargetILP32() ? AArch64::ADDWri : AArch64::ADDXri);
1978 const MCInstrDesc &MCIDStoreAddr =
1979 TII.get(Opcode: STI.isTargetILP32() ? AArch64::STRWui : AArch64::STRXui);
1980
1981 /*
1982 * typedef struct va_list {
1983 * void * stack; // next stack param
1984 * void * gr_top; // end of GP arg reg save area
1985 * void * vr_top; // end of FP/SIMD arg reg save area
1986 * int gr_offs; // offset from gr_top to next GP register arg
1987 * int vr_offs; // offset from vr_top to next FP/SIMD register arg
1988 * } va_list;
1989 */
1990 const auto VAList = I.getOperand(i: 0).getReg();
1991
1992 // Our current offset in bytes from the va_list struct (VAList).
1993 unsigned OffsetBytes = 0;
1994
1995 // Helper function to store (FrameIndex + Imm) to VAList at offset OffsetBytes
1996 // and increment OffsetBytes by PtrSize.
1997 const auto PushAddress = [&](const int FrameIndex, const int64_t Imm) {
1998 const Register Top = MRI.createVirtualRegister(RegClass: PtrRegClass);
1999 auto MIB = BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(), MCID: MCIDAddAddr)
2000 .addDef(RegNo: Top)
2001 .addFrameIndex(Idx: FrameIndex)
2002 .addImm(Val: Imm)
2003 .addImm(Val: 0);
2004 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2005
2006 const auto *MMO = *I.memoperands_begin();
2007 MIB = BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(), MCID: MCIDStoreAddr)
2008 .addUse(RegNo: Top)
2009 .addUse(RegNo: VAList)
2010 .addImm(Val: OffsetBytes / PtrSize)
2011 .addMemOperand(MMO: MF.getMachineMemOperand(
2012 PtrInfo: MMO->getPointerInfo().getWithOffset(O: OffsetBytes),
2013 F: MachineMemOperand::MOStore, Size: PtrSize, BaseAlignment: MMO->getBaseAlign()));
2014 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2015
2016 OffsetBytes += PtrSize;
2017 };
2018
2019 // void* stack at offset 0
2020 PushAddress(FuncInfo->getVarArgsStackIndex(), 0);
2021
2022 // void* gr_top at offset 8 (4 on ILP32)
2023 const unsigned GPRSize = FuncInfo->getVarArgsGPRSize();
2024 PushAddress(FuncInfo->getVarArgsGPRIndex(), GPRSize);
2025
2026 // void* vr_top at offset 16 (8 on ILP32)
2027 const unsigned FPRSize = FuncInfo->getVarArgsFPRSize();
2028 PushAddress(FuncInfo->getVarArgsFPRIndex(), FPRSize);
2029
2030 // Helper function to store a 4-byte integer constant to VAList at offset
2031 // OffsetBytes, and increment OffsetBytes by 4.
2032 const auto PushIntConstant = [&](const int32_t Value) {
2033 constexpr int IntSize = 4;
2034 const Register Temp = MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
2035 auto MIB =
2036 BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AArch64::MOVi32imm))
2037 .addDef(RegNo: Temp)
2038 .addImm(Val: Value);
2039 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2040
2041 const auto *MMO = *I.memoperands_begin();
2042 MIB = BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AArch64::STRWui))
2043 .addUse(RegNo: Temp)
2044 .addUse(RegNo: VAList)
2045 .addImm(Val: OffsetBytes / IntSize)
2046 .addMemOperand(MMO: MF.getMachineMemOperand(
2047 PtrInfo: MMO->getPointerInfo().getWithOffset(O: OffsetBytes),
2048 F: MachineMemOperand::MOStore, Size: IntSize, BaseAlignment: MMO->getBaseAlign()));
2049 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2050 OffsetBytes += IntSize;
2051 };
2052
2053 // int gr_offs at offset 24 (12 on ILP32)
2054 PushIntConstant(-static_cast<int32_t>(GPRSize));
2055
2056 // int vr_offs at offset 28 (16 on ILP32)
2057 PushIntConstant(-static_cast<int32_t>(FPRSize));
2058
2059 assert(OffsetBytes == (STI.isTargetILP32() ? 20 : 32) && "Unexpected offset");
2060
2061 I.eraseFromParent();
2062 return true;
2063}
2064
2065bool AArch64InstructionSelector::selectVaStartDarwin(
2066 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
2067 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
2068 Register ListReg = I.getOperand(i: 0).getReg();
2069
2070 Register ArgsAddrReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
2071
2072 int FrameIdx = FuncInfo->getVarArgsStackIndex();
2073 if (MF.getSubtarget<AArch64Subtarget>().isCallingConvWin64(
2074 CC: MF.getFunction().getCallingConv(), IsVarArg: MF.getFunction().isVarArg())) {
2075 FrameIdx = FuncInfo->getVarArgsGPRSize() > 0
2076 ? FuncInfo->getVarArgsGPRIndex()
2077 : FuncInfo->getVarArgsStackIndex();
2078 }
2079
2080 auto MIB =
2081 BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AArch64::ADDXri))
2082 .addDef(RegNo: ArgsAddrReg)
2083 .addFrameIndex(Idx: FrameIdx)
2084 .addImm(Val: 0)
2085 .addImm(Val: 0);
2086
2087 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2088
2089 MIB = BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AArch64::STRXui))
2090 .addUse(RegNo: ArgsAddrReg)
2091 .addUse(RegNo: ListReg)
2092 .addImm(Val: 0)
2093 .addMemOperand(MMO: *I.memoperands_begin());
2094
2095 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2096 I.eraseFromParent();
2097 return true;
2098}
2099
2100void AArch64InstructionSelector::materializeLargeCMVal(
2101 MachineInstr &I, const Value *V, unsigned OpFlags) {
2102 MachineBasicBlock &MBB = *I.getParent();
2103 MachineFunction &MF = *MBB.getParent();
2104 MachineRegisterInfo &MRI = MF.getRegInfo();
2105
2106 auto MovZ = MIB.buildInstr(Opc: AArch64::MOVZXi, DstOps: {&AArch64::GPR64RegClass}, SrcOps: {});
2107 MovZ->addOperand(MF, Op: I.getOperand(i: 1));
2108 MovZ->getOperand(i: 1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
2109 AArch64II::MO_NC);
2110 MovZ->addOperand(MF, Op: MachineOperand::CreateImm(Val: 0));
2111 constrainSelectedInstRegOperands(I&: *MovZ, TII, TRI, RBI);
2112
2113 auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
2114 Register ForceDstReg) {
2115 Register DstReg = ForceDstReg
2116 ? ForceDstReg
2117 : MRI.createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
2118 auto MovI = MIB.buildInstr(Opcode: AArch64::MOVKXi).addDef(RegNo: DstReg).addUse(RegNo: SrcReg);
2119 if (auto *GV = dyn_cast<GlobalValue>(Val: V)) {
2120 MovI->addOperand(MF, Op: MachineOperand::CreateGA(
2121 GV, Offset: MovZ->getOperand(i: 1).getOffset(), TargetFlags: Flags));
2122 } else {
2123 MovI->addOperand(
2124 MF, Op: MachineOperand::CreateBA(BA: cast<BlockAddress>(Val: V),
2125 Offset: MovZ->getOperand(i: 1).getOffset(), TargetFlags: Flags));
2126 }
2127 MovI->addOperand(MF, Op: MachineOperand::CreateImm(Val: Offset));
2128 constrainSelectedInstRegOperands(I&: *MovI, TII, TRI, RBI);
2129 return DstReg;
2130 };
2131 Register DstReg = BuildMovK(MovZ.getReg(Idx: 0),
2132 AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
2133 DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
2134 BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(i: 0).getReg());
2135}
2136
2137bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
2138 MachineBasicBlock &MBB = *I.getParent();
2139 MachineFunction &MF = *MBB.getParent();
2140 MachineRegisterInfo &MRI = MF.getRegInfo();
2141
2142 switch (I.getOpcode()) {
2143 case TargetOpcode::G_CONSTANT: {
2144 Register DefReg = I.getOperand(i: 0).getReg();
2145 const LLT DefTy = MRI.getType(Reg: DefReg);
2146 if (!DefTy.isPointer())
2147 return false;
2148 const unsigned PtrSize = DefTy.getSizeInBits();
2149 if (PtrSize != 32 && PtrSize != 64)
2150 return false;
2151 // Convert pointer typed constants to integers so TableGen can select.
2152 MRI.setType(VReg: DefReg, Ty: LLT::integer(SizeInBits: PtrSize));
2153 return true;
2154 }
2155 case TargetOpcode::G_STORE: {
2156 bool Changed = contractCrossBankCopyIntoStore(I, MRI);
2157 MachineOperand &SrcOp = I.getOperand(i: 0);
2158 if (MRI.getType(Reg: SrcOp.getReg()).isPointer()) {
2159 // Allow matching with imported patterns for stores of pointers. Unlike
2160 // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy
2161 // and constrain.
2162 auto Copy = MIB.buildCopy(Res: LLT::scalar(SizeInBits: 64), Op: SrcOp);
2163 Register NewSrc = Copy.getReg(Idx: 0);
2164 SrcOp.setReg(NewSrc);
2165 RBI.constrainGenericRegister(Reg: NewSrc, RC: AArch64::GPR64RegClass, MRI);
2166 Changed = true;
2167 }
2168 return Changed;
2169 }
2170 case TargetOpcode::G_PTR_ADD: {
2171 // If Checked Pointer Arithmetic (FEAT_CPA) is present, preserve the pointer
2172 // arithmetic semantics instead of falling back to regular arithmetic.
2173 const auto &TL = STI.getTargetLowering();
2174 if (TL->shouldPreservePtrArith(F: MF.getFunction(), PtrVT: EVT()))
2175 return false;
2176 return convertPtrAddToAdd(I, MRI);
2177 }
2178 case TargetOpcode::G_LOAD: {
2179 // For scalar loads of pointers, we try to convert the dest type from p0
2180 // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
2181 // conversion, this should be ok because all users should have been
2182 // selected already, so the type doesn't matter for them.
2183 Register DstReg = I.getOperand(i: 0).getReg();
2184 const LLT DstTy = MRI.getType(Reg: DstReg);
2185 if (!DstTy.isPointer())
2186 return false;
2187 MRI.setType(VReg: DstReg, Ty: LLT::scalar(SizeInBits: 64));
2188 return true;
2189 }
2190 case AArch64::G_DUP: {
2191 // Convert the type from p0 to s64 to help selection.
2192 LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
2193 if (!DstTy.isPointerVector())
2194 return false;
2195 auto NewSrc = MIB.buildCopy(Res: LLT::scalar(SizeInBits: 64), Op: I.getOperand(i: 1).getReg());
2196 MRI.setType(VReg: I.getOperand(i: 0).getReg(),
2197 Ty: DstTy.changeElementType(NewEltTy: LLT::scalar(SizeInBits: 64)));
2198 MRI.setRegClass(Reg: NewSrc.getReg(Idx: 0), RC: &AArch64::GPR64RegClass);
2199 I.getOperand(i: 1).setReg(NewSrc.getReg(Idx: 0));
2200 return true;
2201 }
2202 case AArch64::G_INSERT_VECTOR_ELT: {
2203 LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
2204 LLT SrcVecTy = MRI.getType(Reg: I.getOperand(i: 1).getReg());
2205 if (SrcVecTy.isPointerVector()) {
2206 // Convert the type from p0 to s64 to help selection.
2207 auto NewSrc = MIB.buildCopy(Res: LLT::scalar(SizeInBits: 64), Op: I.getOperand(i: 2).getReg());
2208 MRI.setType(VReg: I.getOperand(i: 1).getReg(),
2209 Ty: DstTy.changeElementType(NewEltTy: LLT::scalar(SizeInBits: 64)));
2210 MRI.setType(VReg: I.getOperand(i: 0).getReg(),
2211 Ty: DstTy.changeElementType(NewEltTy: LLT::scalar(SizeInBits: 64)));
2212 MRI.setRegClass(Reg: NewSrc.getReg(Idx: 0), RC: &AArch64::GPR64RegClass);
2213 I.getOperand(i: 2).setReg(NewSrc.getReg(Idx: 0));
2214 return true;
2215 }
2216
2217 Register EltReg = I.getOperand(i: 2).getReg();
2218 LLT EltTy = MRI.getType(Reg: EltReg);
2219 if (EltTy.isScalar() &&
2220 (EltTy.getSizeInBits() == 8 || EltTy.getSizeInBits() == 16) &&
2221 RBI.getRegBank(Reg: EltReg, MRI, TRI)->getID() == AArch64::GPRRegBankID) {
2222 // Convert the type from s8/s16 to s32 to help selection.
2223 auto NewElt = MIB.buildCopy(Res: LLT::scalar(SizeInBits: 32), Op: EltReg);
2224 MRI.setRegClass(Reg: NewElt.getReg(Idx: 0), RC: &AArch64::GPR32RegClass);
2225 I.getOperand(i: 2).setReg(NewElt.getReg(Idx: 0));
2226 return true;
2227 }
2228 return false;
2229 }
2230 case TargetOpcode::G_UITOFP:
2231 case TargetOpcode::G_SITOFP: {
2232 // If both source and destination regbanks are FPR, then convert the opcode
2233 // to G_SITOF so that the importer can select it to an fpr variant.
2234 // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank
2235 // copy.
2236 Register SrcReg = I.getOperand(i: 1).getReg();
2237 LLT SrcTy = MRI.getType(Reg: SrcReg);
2238 LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
2239 if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits())
2240 return false;
2241
2242 if (RBI.getRegBank(Reg: SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) {
2243 // Need to add a copy to change the type so that the existing patterns can
2244 // match when there is an integer on an FPR bank.
2245 if (SrcTy.getScalarType().isInteger()) {
2246 auto Copy = MIB.buildCopy(Res: DstTy, Op: SrcReg);
2247 I.getOperand(i: 1).setReg(Copy.getReg(Idx: 0));
2248 MRI.setRegClass(Reg: Copy.getReg(Idx: 0),
2249 RC: getRegClassForTypeOnBank(
2250 Ty: SrcTy, RB: RBI.getRegBank(ID: AArch64::FPRRegBankID)));
2251 }
2252 if (I.getOpcode() == TargetOpcode::G_SITOFP)
2253 I.setDesc(TII.get(Opcode: AArch64::G_SITOF));
2254 else
2255 I.setDesc(TII.get(Opcode: AArch64::G_UITOF));
2256 return true;
2257 }
2258 return false;
2259 }
2260 default:
2261 return false;
2262 }
2263}
2264
2265/// This lowering tries to look for G_PTR_ADD instructions and then converts
2266/// them to a standard G_ADD with a COPY on the source.
2267///
2268/// The motivation behind this is to expose the add semantics to the imported
2269/// tablegen patterns. We shouldn't need to check for uses being loads/stores,
2270/// because the selector works bottom up, uses before defs. By the time we
2271/// end up trying to select a G_PTR_ADD, we should have already attempted to
2272/// fold this into addressing modes and were therefore unsuccessful.
2273bool AArch64InstructionSelector::convertPtrAddToAdd(
2274 MachineInstr &I, MachineRegisterInfo &MRI) {
2275 assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
2276 Register DstReg = I.getOperand(i: 0).getReg();
2277 Register AddOp1Reg = I.getOperand(i: 1).getReg();
2278 const LLT PtrTy = MRI.getType(Reg: DstReg);
2279 if (PtrTy.getAddressSpace() != 0)
2280 return false;
2281
2282 const LLT CastPtrTy = PtrTy.isVector()
2283 ? LLT::fixed_vector(NumElements: 2, ScalarTy: LLT::integer(SizeInBits: 64))
2284 : LLT::integer(SizeInBits: 64);
2285 auto PtrToInt = MIB.buildPtrToInt(Dst: CastPtrTy, Src: AddOp1Reg);
2286 // Set regbanks on the registers.
2287 if (PtrTy.isVector())
2288 MRI.setRegBank(Reg: PtrToInt.getReg(Idx: 0), RegBank: RBI.getRegBank(ID: AArch64::FPRRegBankID));
2289 else
2290 MRI.setRegBank(Reg: PtrToInt.getReg(Idx: 0), RegBank: RBI.getRegBank(ID: AArch64::GPRRegBankID));
2291
2292 // Now turn the %dst(p0) = G_PTR_ADD %base, off into:
2293 // %dst(intty) = G_ADD %intbase, off
2294 I.setDesc(TII.get(Opcode: TargetOpcode::G_ADD));
2295 MRI.setType(VReg: DstReg, Ty: CastPtrTy);
2296 I.getOperand(i: 1).setReg(PtrToInt.getReg(Idx: 0));
2297 if (!select(I&: *PtrToInt)) {
2298 LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
2299 return false;
2300 }
2301
2302 // Also take the opportunity here to try to do some optimization.
2303 // Try to convert this into a G_SUB if the offset is a 0-x negate idiom.
2304 Register NegatedReg;
2305 if (!mi_match(R: I.getOperand(i: 2).getReg(), MRI, P: m_Neg(Src: m_Reg(R&: NegatedReg))))
2306 return true;
2307 I.getOperand(i: 2).setReg(NegatedReg);
2308 I.setDesc(TII.get(Opcode: TargetOpcode::G_SUB));
2309 return true;
2310}
2311
2312bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
2313 MachineRegisterInfo &MRI) {
2314 // We try to match the immediate variant of LSL, which is actually an alias
2315 // for a special case of UBFM. Otherwise, we fall back to the imported
2316 // selector which will match the register variant.
2317 assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
2318 const auto &MO = I.getOperand(i: 2);
2319 auto VRegAndVal = getIConstantVRegVal(VReg: MO.getReg(), MRI);
2320 if (!VRegAndVal)
2321 return false;
2322
2323 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
2324 if (DstTy.isVector())
2325 return false;
2326 bool Is64Bit = DstTy.getSizeInBits() == 64;
2327 auto Imm1Fn = Is64Bit ? selectShiftA_64(Root: MO) : selectShiftA_32(Root: MO);
2328 auto Imm2Fn = Is64Bit ? selectShiftB_64(Root: MO) : selectShiftB_32(Root: MO);
2329
2330 if (!Imm1Fn || !Imm2Fn)
2331 return false;
2332
2333 auto NewI =
2334 MIB.buildInstr(Opc: Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri,
2335 DstOps: {I.getOperand(i: 0).getReg()}, SrcOps: {I.getOperand(i: 1).getReg()});
2336
2337 for (auto &RenderFn : *Imm1Fn)
2338 RenderFn(NewI);
2339 for (auto &RenderFn : *Imm2Fn)
2340 RenderFn(NewI);
2341
2342 I.eraseFromParent();
2343 constrainSelectedInstRegOperands(I&: *NewI, TII, TRI, RBI);
2344 return true;
2345}
2346
2347bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
2348 MachineInstr &I, MachineRegisterInfo &MRI) {
2349 assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
2350 // If we're storing a scalar, it doesn't matter what register bank that
2351 // scalar is on. All that matters is the size.
2352 //
2353 // So, if we see something like this (with a 32-bit scalar as an example):
2354 //
2355 // %x:gpr(s32) = ... something ...
2356 // %y:fpr(s32) = COPY %x:gpr(s32)
2357 // G_STORE %y:fpr(s32)
2358 //
2359 // We can fix this up into something like this:
2360 //
2361 // G_STORE %x:gpr(s32)
2362 //
2363 // And then continue the selection process normally.
2364 Register DefDstReg = getSrcRegIgnoringCopies(Reg: I.getOperand(i: 0).getReg(), MRI);
2365 if (!DefDstReg.isValid())
2366 return false;
2367 LLT DefDstTy = MRI.getType(Reg: DefDstReg);
2368 Register StoreSrcReg = I.getOperand(i: 0).getReg();
2369 LLT StoreSrcTy = MRI.getType(Reg: StoreSrcReg);
2370
2371 // If we get something strange like a physical register, then we shouldn't
2372 // go any further.
2373 if (!DefDstTy.isValid())
2374 return false;
2375
2376 // Are the source and dst types the same size?
2377 if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
2378 return false;
2379
2380 if (RBI.getRegBank(Reg: StoreSrcReg, MRI, TRI) ==
2381 RBI.getRegBank(Reg: DefDstReg, MRI, TRI))
2382 return false;
2383
2384 // We have a cross-bank copy, which is entering a store. Let's fold it.
2385 I.getOperand(i: 0).setReg(DefDstReg);
2386 return true;
2387}
2388
2389bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
2390 assert(I.getParent() && "Instruction should be in a basic block!");
2391 assert(I.getParent()->getParent() && "Instruction should be in a function!");
2392
2393 MachineBasicBlock &MBB = *I.getParent();
2394 MachineFunction &MF = *MBB.getParent();
2395 MachineRegisterInfo &MRI = MF.getRegInfo();
2396
2397 switch (I.getOpcode()) {
2398 case AArch64::G_DUP: {
2399 // Before selecting a DUP instruction, check if it is better selected as a
2400 // MOV or load from a constant pool.
2401 Register Src = I.getOperand(i: 1).getReg();
2402 auto ValAndVReg = getAnyConstantVRegValWithLookThrough(
2403 VReg: Src, MRI, /*LookThroughInstrs=*/true, /*LookThroughAnyExt=*/true);
2404 if (!ValAndVReg)
2405 return false;
2406 LLVMContext &Ctx = MF.getFunction().getContext();
2407 Register Dst = I.getOperand(i: 0).getReg();
2408 auto *CV = ConstantDataVector::getSplat(
2409 NumElts: MRI.getType(Reg: Dst).getNumElements(),
2410 Elt: ConstantInt::get(
2411 Ty: Type::getIntNTy(C&: Ctx, N: MRI.getType(Reg: Dst).getScalarSizeInBits()),
2412 V: ValAndVReg->Value.trunc(width: MRI.getType(Reg: Dst).getScalarSizeInBits())));
2413 if (!emitConstantVector(Dst, CV, MIRBuilder&: MIB, MRI))
2414 return false;
2415 I.eraseFromParent();
2416 return true;
2417 }
2418 case TargetOpcode::G_SEXT:
2419 // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV
2420 // over a normal extend.
2421 if (selectUSMovFromExtend(I, MRI))
2422 return true;
2423 return false;
2424 case TargetOpcode::G_BR:
2425 return false;
2426 case TargetOpcode::G_SHL:
2427 return earlySelectSHL(I, MRI);
2428 case TargetOpcode::G_CONSTANT: {
2429 bool IsZero = false;
2430 if (I.getOperand(i: 1).isCImm())
2431 IsZero = I.getOperand(i: 1).getCImm()->isZero();
2432 else if (I.getOperand(i: 1).isImm())
2433 IsZero = I.getOperand(i: 1).getImm() == 0;
2434
2435 if (!IsZero)
2436 return false;
2437
2438 Register DefReg = I.getOperand(i: 0).getReg();
2439 LLT Ty = MRI.getType(Reg: DefReg);
2440 if (Ty.getSizeInBits() == 64) {
2441 I.getOperand(i: 1).ChangeToRegister(Reg: AArch64::XZR, isDef: false);
2442 RBI.constrainGenericRegister(Reg: DefReg, RC: AArch64::GPR64RegClass, MRI);
2443 } else if (Ty.getSizeInBits() <= 32) {
2444 I.getOperand(i: 1).ChangeToRegister(Reg: AArch64::WZR, isDef: false);
2445 RBI.constrainGenericRegister(Reg: DefReg, RC: AArch64::GPR32RegClass, MRI);
2446 } else
2447 return false;
2448
2449 I.setDesc(TII.get(Opcode: TargetOpcode::COPY));
2450 return true;
2451 }
2452
2453 case TargetOpcode::G_ADD: {
2454 // Check if this is being fed by a G_ICMP on either side.
2455 //
2456 // (cmp pred, x, y) + z
2457 //
2458 // In the above case, when the cmp is true, we increment z by 1. So, we can
2459 // fold the add into the cset for the cmp by using cinc.
2460 //
2461 // FIXME: This would probably be a lot nicer in PostLegalizerLowering.
2462 Register AddDst = I.getOperand(i: 0).getReg();
2463 Register AddLHS = I.getOperand(i: 1).getReg();
2464 Register AddRHS = I.getOperand(i: 2).getReg();
2465 // Only handle scalars.
2466 LLT Ty = MRI.getType(Reg: AddLHS);
2467 if (Ty.isVector())
2468 return false;
2469 // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64
2470 // bits.
2471 unsigned Size = Ty.getSizeInBits();
2472 if (Size != 32 && Size != 64)
2473 return false;
2474 auto MatchCmp = [&](Register Reg) -> MachineInstr * {
2475 if (!MRI.hasOneNonDBGUse(RegNo: Reg))
2476 return nullptr;
2477 // If the LHS of the add is 32 bits, then we want to fold a 32-bit
2478 // compare.
2479 if (Size == 32)
2480 return getOpcodeDef(Opcode: TargetOpcode::G_ICMP, Reg, MRI);
2481 // We model scalar compares using 32-bit destinations right now.
2482 // If it's a 64-bit compare, it'll have 64-bit sources.
2483 Register ZExt;
2484 if (!mi_match(R: Reg, MRI,
2485 P: m_OneNonDBGUse(SP: m_GZExt(Src: m_OneNonDBGUse(SP: m_Reg(R&: ZExt))))))
2486 return nullptr;
2487 auto *Cmp = getOpcodeDef(Opcode: TargetOpcode::G_ICMP, Reg: ZExt, MRI);
2488 if (!Cmp ||
2489 MRI.getType(Reg: Cmp->getOperand(i: 2).getReg()).getSizeInBits() != 64)
2490 return nullptr;
2491 return Cmp;
2492 };
2493 // Try to match
2494 // z + (cmp pred, x, y)
2495 MachineInstr *Cmp = MatchCmp(AddRHS);
2496 if (!Cmp) {
2497 // (cmp pred, x, y) + z
2498 std::swap(a&: AddLHS, b&: AddRHS);
2499 Cmp = MatchCmp(AddRHS);
2500 if (!Cmp)
2501 return false;
2502 }
2503 auto &PredOp = Cmp->getOperand(i: 1);
2504 MIB.setInstrAndDebugLoc(I);
2505 emitIntegerCompare(/*LHS=*/Cmp->getOperand(i: 2),
2506 /*RHS=*/Cmp->getOperand(i: 3), Predicate&: PredOp, MIRBuilder&: MIB);
2507 auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
2508 const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(
2509 P: CmpInst::getInversePredicate(pred: Pred), RHS: Cmp->getOperand(i: 3).getReg(), MRI: &MRI);
2510 emitCSINC(/*Dst=*/AddDst, /*Src =*/Src1: AddLHS, /*Src2=*/AddLHS, Pred: InvCC, MIRBuilder&: MIB);
2511 I.eraseFromParent();
2512 return true;
2513 }
2514 case TargetOpcode::G_OR: {
2515 // Look for operations that take the lower `Width=Size-ShiftImm` bits of
2516 // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via
2517 // shifting and masking that we can replace with a BFI (encoded as a BFM).
2518 Register Dst = I.getOperand(i: 0).getReg();
2519 LLT Ty = MRI.getType(Reg: Dst);
2520
2521 if (!Ty.isScalar())
2522 return false;
2523
2524 unsigned Size = Ty.getSizeInBits();
2525 if (Size != 32 && Size != 64)
2526 return false;
2527
2528 Register ShiftSrc;
2529 int64_t ShiftImm;
2530 Register MaskSrc;
2531 int64_t MaskImm;
2532 if (!mi_match(
2533 R: Dst, MRI,
2534 P: m_GOr(L: m_OneNonDBGUse(SP: m_GShl(L: m_Reg(R&: ShiftSrc), R: m_ICst(Cst&: ShiftImm))),
2535 R: m_OneNonDBGUse(SP: m_GAnd(L: m_Reg(R&: MaskSrc), R: m_ICst(Cst&: MaskImm))))))
2536 return false;
2537
2538 if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm))
2539 return false;
2540
2541 int64_t Immr = Size - ShiftImm;
2542 int64_t Imms = Size - ShiftImm - 1;
2543 unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri;
2544 emitInstr(Opcode: Opc, DstOps: {Dst}, SrcOps: {MaskSrc, ShiftSrc, Immr, Imms}, MIRBuilder&: MIB);
2545 I.eraseFromParent();
2546 return true;
2547 }
2548 case TargetOpcode::G_FENCE: {
2549 if (I.getOperand(i: 1).getImm() == 0)
2550 BuildMI(BB&: MBB, I, MIMD: MIMetadata(I), MCID: TII.get(Opcode: TargetOpcode::MEMBARRIER));
2551 else
2552 BuildMI(BB&: MBB, I, MIMD: MIMetadata(I), MCID: TII.get(Opcode: AArch64::DMB))
2553 .addImm(Val: I.getOperand(i: 0).getImm() == 4 ? 0x9 : 0xb);
2554 I.eraseFromParent();
2555 return true;
2556 }
2557 default:
2558 return false;
2559 }
2560}
2561
2562bool AArch64InstructionSelector::select(MachineInstr &I) {
2563 assert(I.getParent() && "Instruction should be in a basic block!");
2564 assert(I.getParent()->getParent() && "Instruction should be in a function!");
2565
2566 MachineBasicBlock &MBB = *I.getParent();
2567 MachineFunction &MF = *MBB.getParent();
2568 MachineRegisterInfo &MRI = MF.getRegInfo();
2569
2570 const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>();
2571 if (Subtarget->requiresStrictAlign()) {
2572 // We don't support this feature yet.
2573 LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
2574 return false;
2575 }
2576
2577 MIB.setInstrAndDebugLoc(I);
2578
2579 unsigned Opcode = I.getOpcode();
2580 // G_PHI requires same handling as PHI
2581 if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
2582 // Certain non-generic instructions also need some special handling.
2583
2584 if (Opcode == TargetOpcode::LOAD_STACK_GUARD) {
2585 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2586 return true;
2587 }
2588
2589 if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
2590 const Register DefReg = I.getOperand(i: 0).getReg();
2591 const LLT DefTy = MRI.getType(Reg: DefReg);
2592
2593 const RegClassOrRegBank &RegClassOrBank =
2594 MRI.getRegClassOrRegBank(Reg: DefReg);
2595
2596 const TargetRegisterClass *DefRC =
2597 dyn_cast<const TargetRegisterClass *>(Val: RegClassOrBank);
2598 if (!DefRC) {
2599 if (!DefTy.isValid()) {
2600 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
2601 return false;
2602 }
2603 const RegisterBank &RB = *cast<const RegisterBank *>(Val: RegClassOrBank);
2604 DefRC = getRegClassForTypeOnBank(Ty: DefTy, RB);
2605 if (!DefRC) {
2606 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
2607 return false;
2608 }
2609 }
2610
2611 I.setDesc(TII.get(Opcode: TargetOpcode::PHI));
2612
2613 return RBI.constrainGenericRegister(Reg: DefReg, RC: *DefRC, MRI);
2614 }
2615
2616 if (I.isCopy())
2617 return selectCopy(I, TII, MRI, TRI, RBI);
2618
2619 if (I.isDebugInstr())
2620 return selectDebugInstr(I, MRI, RBI);
2621
2622 return true;
2623 }
2624
2625
2626 if (I.getNumOperands() != I.getNumExplicitOperands()) {
2627 LLVM_DEBUG(
2628 dbgs() << "Generic instruction has unexpected implicit operands\n");
2629 return false;
2630 }
2631
2632 // Try to do some lowering before we start instruction selecting. These
2633 // lowerings are purely transformations on the input G_MIR and so selection
2634 // must continue after any modification of the instruction.
2635 if (preISelLower(I)) {
2636 Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
2637 }
2638
2639 // There may be patterns where the importer can't deal with them optimally,
2640 // but does select it to a suboptimal sequence so our custom C++ selection
2641 // code later never has a chance to work on it. Therefore, we have an early
2642 // selection attempt here to give priority to certain selection routines
2643 // over the imported ones.
2644 if (earlySelect(I))
2645 return true;
2646
2647 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
2648 return true;
2649
2650 LLT Ty =
2651 I.getOperand(i: 0).isReg() ? MRI.getType(Reg: I.getOperand(i: 0).getReg()) : LLT{};
2652
2653 switch (Opcode) {
2654 case TargetOpcode::G_SBFX:
2655 case TargetOpcode::G_UBFX: {
2656 static const unsigned OpcTable[2][2] = {
2657 {AArch64::UBFMWri, AArch64::UBFMXri},
2658 {AArch64::SBFMWri, AArch64::SBFMXri}};
2659 bool IsSigned = Opcode == TargetOpcode::G_SBFX;
2660 unsigned Size = Ty.getSizeInBits();
2661 unsigned Opc = OpcTable[IsSigned][Size == 64];
2662 auto Cst1 =
2663 getIConstantVRegValWithLookThrough(VReg: I.getOperand(i: 2).getReg(), MRI);
2664 assert(Cst1 && "Should have gotten a constant for src 1?");
2665 auto Cst2 =
2666 getIConstantVRegValWithLookThrough(VReg: I.getOperand(i: 3).getReg(), MRI);
2667 assert(Cst2 && "Should have gotten a constant for src 2?");
2668 auto LSB = Cst1->Value.getZExtValue();
2669 auto Width = Cst2->Value.getZExtValue();
2670 auto BitfieldInst =
2671 MIB.buildInstr(Opc, DstOps: {I.getOperand(i: 0)}, SrcOps: {I.getOperand(i: 1)})
2672 .addImm(Val: LSB)
2673 .addImm(Val: LSB + Width - 1);
2674 I.eraseFromParent();
2675 constrainSelectedInstRegOperands(I&: *BitfieldInst, TII, TRI, RBI);
2676 return true;
2677 }
2678 case TargetOpcode::G_BRCOND:
2679 return selectCompareBranch(I, MF, MRI);
2680
2681 case TargetOpcode::G_BRINDIRECT: {
2682 const Function &Fn = MF.getFunction();
2683 if (std::optional<uint16_t> BADisc =
2684 STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(ParentFn: Fn)) {
2685 auto MI = MIB.buildInstr(Opc: AArch64::BRA, DstOps: {}, SrcOps: {I.getOperand(i: 0).getReg()});
2686 MI.addImm(Val: AArch64PACKey::IA);
2687 MI.addImm(Val: *BADisc);
2688 MI.addReg(/*AddrDisc=*/RegNo: AArch64::XZR);
2689 I.eraseFromParent();
2690 constrainSelectedInstRegOperands(I&: *MI, TII, TRI, RBI);
2691 return true;
2692 }
2693 I.setDesc(TII.get(Opcode: AArch64::BR));
2694 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2695 return true;
2696 }
2697
2698 case TargetOpcode::G_BRJT:
2699 return selectBrJT(I, MRI);
2700
2701 case AArch64::G_ADD_LOW: {
2702 // This op may have been separated from its ADRP companion by the localizer
2703 // or some other code motion pass. Given that many CPUs will try to
2704 // macro fuse these operations anyway, select this into a MOVaddr pseudo
2705 // which will later be expanded into an ADRP+ADD pair after scheduling.
2706 MachineInstr *BaseMI = MRI.getVRegDef(Reg: I.getOperand(i: 1).getReg());
2707 if (BaseMI->getOpcode() != AArch64::ADRP) {
2708 I.setDesc(TII.get(Opcode: AArch64::ADDXri));
2709 I.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2710 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2711 return true;
2712 }
2713 assert(TM.getCodeModel() == CodeModel::Small &&
2714 "Expected small code model");
2715 auto Op1 = BaseMI->getOperand(i: 1);
2716 auto Op2 = I.getOperand(i: 2);
2717 auto MovAddr = MIB.buildInstr(Opc: AArch64::MOVaddr, DstOps: {I.getOperand(i: 0)}, SrcOps: {})
2718 .addGlobalAddress(GV: Op1.getGlobal(), Offset: Op1.getOffset(),
2719 TargetFlags: Op1.getTargetFlags())
2720 .addGlobalAddress(GV: Op2.getGlobal(), Offset: Op2.getOffset(),
2721 TargetFlags: Op2.getTargetFlags());
2722 I.eraseFromParent();
2723 constrainSelectedInstRegOperands(I&: *MovAddr, TII, TRI, RBI);
2724 return true;
2725 }
2726
2727 case TargetOpcode::G_FCONSTANT: {
2728 const Register DefReg = I.getOperand(i: 0).getReg();
2729 const LLT DefTy = MRI.getType(Reg: DefReg);
2730 const unsigned DefSize = DefTy.getSizeInBits();
2731 const RegisterBank &RB = *RBI.getRegBank(Reg: DefReg, MRI, TRI);
2732
2733 const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(Ty: DefTy, RB);
2734 // For 16b and 128b values, and for 32b/64b immediates that are not legal, emit a constant pool load.
2735 switch (DefSize) {
2736 default:
2737 llvm_unreachable("Unexpected destination size for G_FCONSTANT?");
2738 case 32:
2739 case 64: {
2740 bool OptForSize = shouldOptForSize(MF: &MF);
2741 const auto &TLI = MF.getSubtarget().getTargetLowering();
2742 // If TLI says that this fpimm is illegal, then we'll expand to a
2743 // constant pool load.
2744 if (TLI->isFPImmLegal(I.getOperand(i: 1).getFPImm()->getValueAPF(),
2745 EVT::getFloatingPointVT(BitWidth: DefSize), ForCodeSize: OptForSize))
2746 break;
2747 [[fallthrough]];
2748 }
2749 case 16:
2750 case 128: {
2751 auto *FPImm = I.getOperand(i: 1).getFPImm();
2752 auto *LoadMI = emitLoadFromConstantPool(CPVal: FPImm, MIRBuilder&: MIB);
2753 if (!LoadMI) {
2754 LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
2755 return false;
2756 }
2757 MIB.buildCopy(Res: {DefReg}, Op: {LoadMI->getOperand(i: 0).getReg()});
2758 I.eraseFromParent();
2759 return RBI.constrainGenericRegister(Reg: DefReg, RC: FPRRC, MRI);
2760 }
2761 }
2762
2763 assert((DefSize == 32 || DefSize == 64) && "Unexpected const def size");
2764 // Either emit a FMOV, or emit a copy to emit a normal mov.
2765 const Register DefGPRReg = MRI.createVirtualRegister(
2766 RegClass: DefSize == 32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass);
2767 MachineOperand &RegOp = I.getOperand(i: 0);
2768 RegOp.setReg(DefGPRReg);
2769 MIB.setInsertPt(MBB&: MIB.getMBB(), II: std::next(x: I.getIterator()));
2770 MIB.buildCopy(Res: {DefReg}, Op: {DefGPRReg});
2771
2772 if (!RBI.constrainGenericRegister(Reg: DefReg, RC: FPRRC, MRI)) {
2773 LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
2774 return false;
2775 }
2776
2777 MachineOperand &ImmOp = I.getOperand(i: 1);
2778 ImmOp.ChangeToImmediate(
2779 ImmVal: ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
2780
2781 const unsigned MovOpc =
2782 DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
2783 I.setDesc(TII.get(Opcode: MovOpc));
2784 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2785 return true;
2786 }
2787 case TargetOpcode::G_EXTRACT: {
2788 Register DstReg = I.getOperand(i: 0).getReg();
2789 Register SrcReg = I.getOperand(i: 1).getReg();
2790 LLT SrcTy = MRI.getType(Reg: SrcReg);
2791 LLT DstTy = MRI.getType(Reg: DstReg);
2792 (void)DstTy;
2793 unsigned SrcSize = SrcTy.getSizeInBits();
2794
2795 if (SrcTy.getSizeInBits() > 64) {
2796 // This should be an extract of an s128, which is like a vector extract.
2797 if (SrcTy.getSizeInBits() != 128)
2798 return false;
2799 // Only support extracting 64 bits from an s128 at the moment.
2800 if (DstTy.getSizeInBits() != 64)
2801 return false;
2802
2803 unsigned Offset = I.getOperand(i: 2).getImm();
2804 if (Offset % 64 != 0)
2805 return false;
2806
2807 // Check we have the right regbank always.
2808 const RegisterBank &SrcRB = *RBI.getRegBank(Reg: SrcReg, MRI, TRI);
2809 const RegisterBank &DstRB = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
2810 assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!");
2811
2812 if (SrcRB.getID() == AArch64::GPRRegBankID) {
2813 auto NewI =
2814 MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {DstReg}, SrcOps: {})
2815 .addUse(RegNo: SrcReg, Flags: {},
2816 SubReg: Offset == 0 ? AArch64::sube64 : AArch64::subo64);
2817 constrainOperandRegClass(MF, TRI, MRI, TII, RBI, InsertPt&: *NewI,
2818 RegClass: AArch64::GPR64RegClass, RegMO&: NewI->getOperand(i: 0));
2819 I.eraseFromParent();
2820 return true;
2821 }
2822
2823 // Emit the same code as a vector extract.
2824 // Offset must be a multiple of 64.
2825 unsigned LaneIdx = Offset / 64;
2826 MachineInstr *Extract = emitExtractVectorElt(
2827 DstReg, DstRB, ScalarTy: LLT::scalar(SizeInBits: 64), VecReg: SrcReg, LaneIdx, MIRBuilder&: MIB);
2828 if (!Extract)
2829 return false;
2830 I.eraseFromParent();
2831 return true;
2832 }
2833
2834 I.setDesc(TII.get(Opcode: SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
2835 MachineInstrBuilder(MF, I).addImm(Val: I.getOperand(i: 2).getImm() +
2836 Ty.getSizeInBits() - 1);
2837
2838 if (SrcSize < 64) {
2839 assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
2840 "unexpected G_EXTRACT types");
2841 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2842 return true;
2843 }
2844
2845 DstReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
2846 MIB.setInsertPt(MBB&: MIB.getMBB(), II: std::next(x: I.getIterator()));
2847 MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {I.getOperand(i: 0).getReg()}, SrcOps: {})
2848 .addReg(RegNo: DstReg, Flags: {}, SubReg: AArch64::sub_32);
2849 RBI.constrainGenericRegister(Reg: I.getOperand(i: 0).getReg(),
2850 RC: AArch64::GPR32RegClass, MRI);
2851 I.getOperand(i: 0).setReg(DstReg);
2852
2853 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2854 return true;
2855 }
2856
2857 case TargetOpcode::G_INSERT: {
2858 LLT SrcTy = MRI.getType(Reg: I.getOperand(i: 2).getReg());
2859 LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
2860 unsigned DstSize = DstTy.getSizeInBits();
2861 // Larger inserts are vectors, same-size ones should be something else by
2862 // now (split up or turned into COPYs).
2863 if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
2864 return false;
2865
2866 I.setDesc(TII.get(Opcode: DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
2867 unsigned LSB = I.getOperand(i: 3).getImm();
2868 unsigned Width = MRI.getType(Reg: I.getOperand(i: 2).getReg()).getSizeInBits();
2869 I.getOperand(i: 3).setImm((DstSize - LSB) % DstSize);
2870 MachineInstrBuilder(MF, I).addImm(Val: Width - 1);
2871
2872 if (DstSize < 64) {
2873 assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
2874 "unexpected G_INSERT types");
2875 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2876 return true;
2877 }
2878
2879 Register SrcReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
2880 BuildMI(BB&: MBB, I: I.getIterator(), MIMD: I.getDebugLoc(),
2881 MCID: TII.get(Opcode: AArch64::SUBREG_TO_REG))
2882 .addDef(RegNo: SrcReg)
2883 .addUse(RegNo: I.getOperand(i: 2).getReg())
2884 .addImm(Val: AArch64::sub_32);
2885 RBI.constrainGenericRegister(Reg: I.getOperand(i: 2).getReg(),
2886 RC: AArch64::GPR32RegClass, MRI);
2887 I.getOperand(i: 2).setReg(SrcReg);
2888
2889 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2890 return true;
2891 }
2892 case TargetOpcode::G_FRAME_INDEX: {
2893 // allocas and G_FRAME_INDEX are only supported in addrspace(0).
2894 if (Ty != LLT::pointer(AddressSpace: 0, SizeInBits: 64)) {
2895 LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
2896 << ", expected: " << LLT::pointer(0, 64) << '\n');
2897 return false;
2898 }
2899 I.setDesc(TII.get(Opcode: AArch64::ADDXri));
2900
2901 // MOs for a #0 shifted immediate.
2902 I.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2903 I.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2904
2905 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2906 return true;
2907 }
2908
2909 case TargetOpcode::G_GLOBAL_VALUE: {
2910 const GlobalValue *GV = nullptr;
2911 unsigned OpFlags;
2912 if (I.getOperand(i: 1).isSymbol()) {
2913 OpFlags = I.getOperand(i: 1).getTargetFlags();
2914 // Currently only used by "RtLibUseGOT".
2915 assert(OpFlags == AArch64II::MO_GOT);
2916 } else {
2917 GV = I.getOperand(i: 1).getGlobal();
2918 if (GV->isThreadLocal()) {
2919 // We don't support instructions with emulated TLS variables yet
2920 if (TM.useEmulatedTLS())
2921 return false;
2922 return selectTLSGlobalValue(I, MRI);
2923 }
2924 OpFlags = STI.ClassifyGlobalReference(GV, TM);
2925 }
2926
2927 if (OpFlags & AArch64II::MO_GOT) {
2928 bool IsGOTSigned = MF.getInfo<AArch64FunctionInfo>()->hasELFSignedGOT();
2929 I.setDesc(TII.get(Opcode: IsGOTSigned ? AArch64::LOADgotAUTH : AArch64::LOADgot));
2930 I.getOperand(i: 1).setTargetFlags(OpFlags);
2931 I.addImplicitDefUseOperands(MF);
2932 } else if (TM.getCodeModel() == CodeModel::Large &&
2933 !TM.isPositionIndependent()) {
2934 // Materialize the global using movz/movk instructions.
2935 materializeLargeCMVal(I, V: GV, OpFlags);
2936 I.eraseFromParent();
2937 return true;
2938 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2939 I.setDesc(TII.get(Opcode: AArch64::ADR));
2940 I.getOperand(i: 1).setTargetFlags(OpFlags);
2941 } else {
2942 I.setDesc(TII.get(Opcode: AArch64::MOVaddr));
2943 I.getOperand(i: 1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
2944 MachineInstrBuilder MIB(MF, I);
2945 MIB.addGlobalAddress(GV, Offset: I.getOperand(i: 1).getOffset(),
2946 TargetFlags: OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
2947 }
2948 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2949 return true;
2950 }
2951
2952 case TargetOpcode::G_PTRAUTH_GLOBAL_VALUE:
2953 return selectPtrAuthGlobalValue(I, MRI);
2954
2955 case TargetOpcode::G_ZEXTLOAD:
2956 case TargetOpcode::G_LOAD:
2957 case TargetOpcode::G_STORE: {
2958 GLoadStore &LdSt = cast<GLoadStore>(Val&: I);
2959 bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
2960 LLT PtrTy = MRI.getType(Reg: LdSt.getPointerReg());
2961
2962 // Can only handle AddressSpace 0, 64-bit pointers.
2963 if (PtrTy != LLT::pointer(AddressSpace: 0, SizeInBits: 64)) {
2964 return false;
2965 }
2966
2967 uint64_t MemSizeInBytes = LdSt.getMemSize().getValue();
2968 unsigned MemSizeInBits = LdSt.getMemSizeInBits().getValue();
2969 AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
2970
2971 // Need special instructions for atomics that affect ordering.
2972 if (isStrongerThanMonotonic(AO: Order)) {
2973 assert(!isa<GZExtLoad>(LdSt));
2974 assert(MemSizeInBytes <= 8 &&
2975 "128-bit atomics should already be custom-legalized");
2976
2977 if (isa<GLoad>(Val: LdSt)) {
2978 static constexpr unsigned LDAPROpcodes[] = {
2979 AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX};
2980 static constexpr unsigned LDAROpcodes[] = {
2981 AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX};
2982 ArrayRef<unsigned> Opcodes =
2983 STI.hasRCPC() && Order != AtomicOrdering::SequentiallyConsistent
2984 ? LDAPROpcodes
2985 : LDAROpcodes;
2986 I.setDesc(TII.get(Opcode: Opcodes[Log2_32(Value: MemSizeInBytes)]));
2987 } else {
2988 static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
2989 AArch64::STLRW, AArch64::STLRX};
2990 Register ValReg = LdSt.getReg(Idx: 0);
2991 if (MRI.getType(Reg: ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) {
2992 // Emit a subreg copy of 32 bits.
2993 Register NewVal = MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
2994 MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {NewVal}, SrcOps: {})
2995 .addReg(RegNo: I.getOperand(i: 0).getReg(), Flags: {}, SubReg: AArch64::sub_32);
2996 I.getOperand(i: 0).setReg(NewVal);
2997 }
2998 I.setDesc(TII.get(Opcode: Opcodes[Log2_32(Value: MemSizeInBytes)]));
2999 }
3000 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3001 return true;
3002 }
3003
3004#ifndef NDEBUG
3005 const Register PtrReg = LdSt.getPointerReg();
3006 const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
3007 // Check that the pointer register is valid.
3008 assert(PtrRB.getID() == AArch64::GPRRegBankID &&
3009 "Load/Store pointer operand isn't a GPR");
3010 assert(MRI.getType(PtrReg).isPointer() &&
3011 "Load/Store pointer operand isn't a pointer");
3012#endif
3013
3014 const Register ValReg = LdSt.getReg(Idx: 0);
3015 const RegisterBank &RB = *RBI.getRegBank(Reg: ValReg, MRI, TRI);
3016 LLT ValTy = MRI.getType(Reg: ValReg);
3017
3018 // The code below doesn't support truncating stores, so we need to split it
3019 // again.
3020 if (isa<GStore>(Val: LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
3021 unsigned SubReg;
3022 LLT MemTy = LdSt.getMMO().getMemoryType();
3023 auto *RC = getRegClassForTypeOnBank(Ty: MemTy, RB);
3024 if (!getSubRegForClass(RC, TRI, SubReg))
3025 return false;
3026
3027 // Generate a subreg copy.
3028 auto Copy = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {MemTy}, SrcOps: {})
3029 .addReg(RegNo: ValReg, Flags: {}, SubReg)
3030 .getReg(Idx: 0);
3031 RBI.constrainGenericRegister(Reg: Copy, RC: *RC, MRI);
3032 LdSt.getOperand(i: 0).setReg(Copy);
3033 } else if (isa<GLoad>(Val: LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
3034 // If this is an any-extending load from the FPR bank, split it into a regular
3035 // load + extend.
3036 if (RB.getID() == AArch64::FPRRegBankID) {
3037 unsigned SubReg;
3038 LLT MemTy = LdSt.getMMO().getMemoryType();
3039 auto *RC = getRegClassForTypeOnBank(Ty: MemTy, RB);
3040 if (!getSubRegForClass(RC, TRI, SubReg))
3041 return false;
3042 Register OldDst = LdSt.getReg(Idx: 0);
3043 Register NewDst =
3044 MRI.createGenericVirtualRegister(Ty: LdSt.getMMO().getMemoryType());
3045 LdSt.getOperand(i: 0).setReg(NewDst);
3046 MRI.setRegBank(Reg: NewDst, RegBank: RB);
3047 // Generate a SUBREG_TO_REG to extend it.
3048 MIB.setInsertPt(MBB&: MIB.getMBB(), II: std::next(x: LdSt.getIterator()));
3049 MIB.buildInstr(Opc: AArch64::SUBREG_TO_REG, DstOps: {OldDst}, SrcOps: {})
3050 .addUse(RegNo: NewDst)
3051 .addImm(Val: SubReg);
3052 auto SubRegRC = getRegClassForTypeOnBank(Ty: MRI.getType(Reg: OldDst), RB);
3053 RBI.constrainGenericRegister(Reg: OldDst, RC: *SubRegRC, MRI);
3054 MIB.setInstr(LdSt);
3055 ValTy = MemTy; // This is no longer an extending load.
3056 }
3057 }
3058
3059 // Helper lambda for partially selecting I. Either returns the original
3060 // instruction with an updated opcode, or a new instruction.
3061 auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
3062 bool IsStore = isa<GStore>(Val: I);
3063 const unsigned NewOpc =
3064 selectLoadStoreUIOp(GenericOpc: I.getOpcode(), RegBankID: RB.getID(), OpSize: MemSizeInBits);
3065 if (NewOpc == I.getOpcode())
3066 return nullptr;
3067 // Check if we can fold anything into the addressing mode.
3068 auto AddrModeFns =
3069 selectAddrModeIndexed(Root&: I.getOperand(i: 1), Size: MemSizeInBytes);
3070 if (!AddrModeFns) {
3071 // Can't fold anything. Use the original instruction.
3072 I.setDesc(TII.get(Opcode: NewOpc));
3073 I.addOperand(Op: MachineOperand::CreateImm(Val: 0));
3074 return &I;
3075 }
3076
3077 // Folded something. Create a new instruction and return it.
3078 auto NewInst = MIB.buildInstr(Opc: NewOpc, DstOps: {}, SrcOps: {}, Flags: I.getFlags());
3079 Register CurValReg = I.getOperand(i: 0).getReg();
3080 IsStore ? NewInst.addUse(RegNo: CurValReg) : NewInst.addDef(RegNo: CurValReg);
3081 NewInst.cloneMemRefs(OtherMI: I);
3082 for (auto &Fn : *AddrModeFns)
3083 Fn(NewInst);
3084 I.eraseFromParent();
3085 return &*NewInst;
3086 };
3087
3088 MachineInstr *LoadStore = SelectLoadStoreAddressingMode();
3089 if (!LoadStore)
3090 return false;
3091
3092 // If we're storing a 0, use WZR/XZR.
3093 if (Opcode == TargetOpcode::G_STORE) {
3094 auto CVal = getIConstantVRegValWithLookThrough(
3095 VReg: LoadStore->getOperand(i: 0).getReg(), MRI);
3096 if (CVal && CVal->Value == 0) {
3097 switch (LoadStore->getOpcode()) {
3098 case AArch64::STRWui:
3099 case AArch64::STRHHui:
3100 case AArch64::STRBBui:
3101 LoadStore->getOperand(i: 0).setReg(AArch64::WZR);
3102 break;
3103 case AArch64::STRXui:
3104 LoadStore->getOperand(i: 0).setReg(AArch64::XZR);
3105 break;
3106 }
3107 }
3108 }
3109
3110 if (IsZExtLoad || (Opcode == TargetOpcode::G_LOAD &&
3111 ValTy == LLT::scalar(SizeInBits: 64) && MemSizeInBits == 32)) {
3112 // The any/zextload from a smaller type to i32 should be handled by the
3113 // importer.
3114 if (MRI.getType(Reg: LoadStore->getOperand(i: 0).getReg()).getSizeInBits() != 64)
3115 return false;
3116 // If we have an extending load then change the load's type to be a
3117 // narrower reg and zero_extend with SUBREG_TO_REG.
3118 Register LdReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
3119 Register DstReg = LoadStore->getOperand(i: 0).getReg();
3120 LoadStore->getOperand(i: 0).setReg(LdReg);
3121
3122 MIB.setInsertPt(MBB&: MIB.getMBB(), II: std::next(x: LoadStore->getIterator()));
3123 MIB.buildInstr(Opc: AArch64::SUBREG_TO_REG, DstOps: {DstReg}, SrcOps: {})
3124 .addUse(RegNo: LdReg)
3125 .addImm(Val: AArch64::sub_32);
3126 constrainSelectedInstRegOperands(I&: *LoadStore, TII, TRI, RBI);
3127 return RBI.constrainGenericRegister(Reg: DstReg, RC: AArch64::GPR64allRegClass,
3128 MRI);
3129 }
3130 constrainSelectedInstRegOperands(I&: *LoadStore, TII, TRI, RBI);
3131 return true;
3132 }
3133
3134 case TargetOpcode::G_INDEXED_ZEXTLOAD:
3135 case TargetOpcode::G_INDEXED_SEXTLOAD:
3136 return selectIndexedExtLoad(I, MRI);
3137 case TargetOpcode::G_INDEXED_LOAD:
3138 return selectIndexedLoad(I, MRI);
3139 case TargetOpcode::G_INDEXED_STORE:
3140 return selectIndexedStore(I&: cast<GIndexedStore>(Val&: I), MRI);
3141
3142 case TargetOpcode::G_LSHR:
3143 case TargetOpcode::G_ASHR:
3144 if (MRI.getType(Reg: I.getOperand(i: 0).getReg()).isVector())
3145 return selectVectorAshrLshr(I, MRI);
3146 [[fallthrough]];
3147 case TargetOpcode::G_SHL:
3148 if (Opcode == TargetOpcode::G_SHL &&
3149 MRI.getType(Reg: I.getOperand(i: 0).getReg()).isVector())
3150 return selectVectorSHL(I, MRI);
3151
3152 // These shifts were legalized to have 64 bit shift amounts because we
3153 // want to take advantage of the selection patterns that assume the
3154 // immediates are s64s, however, selectBinaryOp will assume both operands
3155 // will have the same bit size.
3156 {
3157 Register SrcReg = I.getOperand(i: 1).getReg();
3158 Register ShiftReg = I.getOperand(i: 2).getReg();
3159 const LLT ShiftTy = MRI.getType(Reg: ShiftReg);
3160 const LLT SrcTy = MRI.getType(Reg: SrcReg);
3161 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
3162 ShiftTy.getSizeInBits() == 64) {
3163 assert(!ShiftTy.isVector() && "unexpected vector shift ty");
3164 // Insert a subregister copy to implement a 64->32 trunc
3165 auto Trunc = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {SrcTy}, SrcOps: {})
3166 .addReg(RegNo: ShiftReg, Flags: {}, SubReg: AArch64::sub_32);
3167 MRI.setRegBank(Reg: Trunc.getReg(Idx: 0), RegBank: RBI.getRegBank(ID: AArch64::GPRRegBankID));
3168 I.getOperand(i: 2).setReg(Trunc.getReg(Idx: 0));
3169 }
3170 }
3171 [[fallthrough]];
3172 case TargetOpcode::G_OR: {
3173 // Reject the various things we don't support yet.
3174 if (unsupportedBinOp(I, RBI, MRI, TRI))
3175 return false;
3176
3177 const unsigned OpSize = Ty.getSizeInBits();
3178
3179 const Register DefReg = I.getOperand(i: 0).getReg();
3180 const RegisterBank &RB = *RBI.getRegBank(Reg: DefReg, MRI, TRI);
3181
3182 const unsigned NewOpc = selectBinaryOp(GenericOpc: I.getOpcode(), RegBankID: RB.getID(), OpSize);
3183 if (NewOpc == I.getOpcode())
3184 return false;
3185
3186 I.setDesc(TII.get(Opcode: NewOpc));
3187 // FIXME: Should the type be always reset in setDesc?
3188
3189 // Now that we selected an opcode, we need to constrain the register
3190 // operands to use appropriate classes.
3191 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3192 return true;
3193 }
3194
3195 case TargetOpcode::G_PTR_ADD: {
3196 emitADD(DefReg: I.getOperand(i: 0).getReg(), LHS&: I.getOperand(i: 1), RHS&: I.getOperand(i: 2), MIRBuilder&: MIB);
3197 I.eraseFromParent();
3198 return true;
3199 }
3200
3201 case TargetOpcode::G_SADDE:
3202 case TargetOpcode::G_UADDE:
3203 case TargetOpcode::G_SSUBE:
3204 case TargetOpcode::G_USUBE:
3205 case TargetOpcode::G_SADDO:
3206 case TargetOpcode::G_UADDO:
3207 case TargetOpcode::G_SSUBO:
3208 case TargetOpcode::G_USUBO:
3209 return selectOverflowOp(I, MRI);
3210
3211 case TargetOpcode::G_PTRMASK: {
3212 Register MaskReg = I.getOperand(i: 2).getReg();
3213 std::optional<int64_t> MaskVal = getIConstantVRegSExtVal(VReg: MaskReg, MRI);
3214 // TODO: Implement arbitrary cases
3215 if (!MaskVal || !isShiftedMask_64(Value: *MaskVal))
3216 return false;
3217
3218 uint64_t Mask = *MaskVal;
3219 I.setDesc(TII.get(Opcode: AArch64::ANDXri));
3220 I.getOperand(i: 2).ChangeToImmediate(
3221 ImmVal: AArch64_AM::encodeLogicalImmediate(imm: Mask, regSize: 64));
3222
3223 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3224 return true;
3225 }
3226 case TargetOpcode::G_PTRTOINT:
3227 case TargetOpcode::G_TRUNC: {
3228 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
3229 const LLT SrcTy = MRI.getType(Reg: I.getOperand(i: 1).getReg());
3230
3231 const Register DstReg = I.getOperand(i: 0).getReg();
3232 const Register SrcReg = I.getOperand(i: 1).getReg();
3233
3234 const RegisterBank &DstRB = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
3235 const RegisterBank &SrcRB = *RBI.getRegBank(Reg: SrcReg, MRI, TRI);
3236
3237 if (DstRB.getID() != SrcRB.getID()) {
3238 LLVM_DEBUG(
3239 dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
3240 return false;
3241 }
3242
3243 if (DstRB.getID() == AArch64::GPRRegBankID) {
3244 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(Ty: DstTy, RB: DstRB);
3245 if (!DstRC)
3246 return false;
3247
3248 const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(Ty: SrcTy, RB: SrcRB);
3249 if (!SrcRC)
3250 return false;
3251
3252 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI) ||
3253 !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI)) {
3254 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
3255 return false;
3256 }
3257
3258 if (DstRC == SrcRC) {
3259 // Nothing to be done
3260 } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(SizeInBits: 32) &&
3261 SrcTy == LLT::scalar(SizeInBits: 64)) {
3262 llvm_unreachable("TableGen can import this case");
3263 return false;
3264 } else if (DstRC == &AArch64::GPR32RegClass &&
3265 SrcRC == &AArch64::GPR64RegClass) {
3266 I.getOperand(i: 1).setSubReg(AArch64::sub_32);
3267 } else {
3268 LLVM_DEBUG(
3269 dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
3270 return false;
3271 }
3272
3273 I.setDesc(TII.get(Opcode: TargetOpcode::COPY));
3274 return true;
3275 } else if (DstRB.getID() == AArch64::FPRRegBankID) {
3276 if (DstTy == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16) &&
3277 SrcTy == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32)) {
3278 I.setDesc(TII.get(Opcode: AArch64::XTNv4i16));
3279 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3280 return true;
3281 }
3282
3283 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
3284 MachineInstr *Extract = emitExtractVectorElt(
3285 DstReg, DstRB, ScalarTy: LLT::scalar(SizeInBits: DstTy.getSizeInBits()), VecReg: SrcReg, LaneIdx: 0, MIRBuilder&: MIB);
3286 if (!Extract)
3287 return false;
3288 I.eraseFromParent();
3289 return true;
3290 }
3291
3292 // We might have a vector G_PTRTOINT, in which case just emit a COPY.
3293 if (Opcode == TargetOpcode::G_PTRTOINT) {
3294 assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
3295 I.setDesc(TII.get(Opcode: TargetOpcode::COPY));
3296 return selectCopy(I, TII, MRI, TRI, RBI);
3297 }
3298 }
3299
3300 return false;
3301 }
3302
3303 case TargetOpcode::G_ANYEXT: {
3304 if (selectUSMovFromExtend(I, MRI))
3305 return true;
3306
3307 const Register DstReg = I.getOperand(i: 0).getReg();
3308 const Register SrcReg = I.getOperand(i: 1).getReg();
3309
3310 const RegisterBank &RBDst = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
3311 if (RBDst.getID() != AArch64::GPRRegBankID) {
3312 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
3313 << ", expected: GPR\n");
3314 return false;
3315 }
3316
3317 const RegisterBank &RBSrc = *RBI.getRegBank(Reg: SrcReg, MRI, TRI);
3318 if (RBSrc.getID() != AArch64::GPRRegBankID) {
3319 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
3320 << ", expected: GPR\n");
3321 return false;
3322 }
3323
3324 const unsigned DstSize = MRI.getType(Reg: DstReg).getSizeInBits();
3325
3326 if (DstSize == 0) {
3327 LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
3328 return false;
3329 }
3330
3331 if (DstSize != 64 && DstSize > 32) {
3332 LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
3333 << ", expected: 32 or 64\n");
3334 return false;
3335 }
3336 // At this point G_ANYEXT is just like a plain COPY, but we need
3337 // to explicitly form the 64-bit value if any.
3338 if (DstSize > 32) {
3339 Register ExtSrc = MRI.createVirtualRegister(RegClass: &AArch64::GPR64allRegClass);
3340 BuildMI(BB&: MBB, I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AArch64::SUBREG_TO_REG))
3341 .addDef(RegNo: ExtSrc)
3342 .addUse(RegNo: SrcReg)
3343 .addImm(Val: AArch64::sub_32);
3344 I.getOperand(i: 1).setReg(ExtSrc);
3345 }
3346 return selectCopy(I, TII, MRI, TRI, RBI);
3347 }
3348
3349 case TargetOpcode::G_ZEXT:
3350 case TargetOpcode::G_SEXT_INREG:
3351 case TargetOpcode::G_SEXT: {
3352 if (selectUSMovFromExtend(I, MRI))
3353 return true;
3354
3355 unsigned Opcode = I.getOpcode();
3356 const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
3357 const Register DefReg = I.getOperand(i: 0).getReg();
3358 Register SrcReg = I.getOperand(i: 1).getReg();
3359 const LLT DstTy = MRI.getType(Reg: DefReg);
3360 const LLT SrcTy = MRI.getType(Reg: SrcReg);
3361 unsigned DstSize = DstTy.getSizeInBits();
3362 unsigned SrcSize = SrcTy.getSizeInBits();
3363
3364 // SEXT_INREG has the same src reg size as dst, the size of the value to be
3365 // extended is encoded in the imm.
3366 if (Opcode == TargetOpcode::G_SEXT_INREG)
3367 SrcSize = I.getOperand(i: 2).getImm();
3368
3369 if (DstTy.isVector())
3370 return false; // Should be handled by imported patterns.
3371
3372 assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
3373 AArch64::GPRRegBankID &&
3374 "Unexpected ext regbank");
3375
3376 MachineInstr *ExtI;
3377
3378 // First check if we're extending the result of a load which has a dest type
3379 // smaller than 32 bits, then this zext is redundant. GPR32 is the smallest
3380 // GPR register on AArch64 and all loads which are smaller automatically
3381 // zero-extend the upper bits. E.g.
3382 // %v(s8) = G_LOAD %p, :: (load 1)
3383 // %v2(s32) = G_ZEXT %v(s8)
3384 if (!IsSigned) {
3385 auto *LoadMI = getOpcodeDef(Opcode: TargetOpcode::G_LOAD, Reg: SrcReg, MRI);
3386 bool IsGPR =
3387 RBI.getRegBank(Reg: SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
3388 if (LoadMI && IsGPR) {
3389 const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
3390 unsigned BytesLoaded = MemOp->getSize().getValue();
3391 if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
3392 return selectCopy(I, TII, MRI, TRI, RBI);
3393 }
3394
3395 // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs)
3396 // + SUBREG_TO_REG.
3397 if (IsGPR && SrcSize == 32 && DstSize == 64) {
3398 Register SubregToRegSrc =
3399 MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
3400 const Register ZReg = AArch64::WZR;
3401 MIB.buildInstr(Opc: AArch64::ORRWrs, DstOps: {SubregToRegSrc}, SrcOps: {ZReg, SrcReg})
3402 .addImm(Val: 0);
3403
3404 MIB.buildInstr(Opc: AArch64::SUBREG_TO_REG, DstOps: {DefReg}, SrcOps: {})
3405 .addUse(RegNo: SubregToRegSrc)
3406 .addImm(Val: AArch64::sub_32);
3407
3408 if (!RBI.constrainGenericRegister(Reg: DefReg, RC: AArch64::GPR64RegClass,
3409 MRI)) {
3410 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
3411 return false;
3412 }
3413
3414 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: AArch64::GPR32RegClass,
3415 MRI)) {
3416 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
3417 return false;
3418 }
3419
3420 I.eraseFromParent();
3421 return true;
3422 }
3423 }
3424
3425 if (DstSize == 64) {
3426 if (Opcode != TargetOpcode::G_SEXT_INREG) {
3427 // FIXME: Can we avoid manually doing this?
3428 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: AArch64::GPR32RegClass,
3429 MRI)) {
3430 LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
3431 << " operand\n");
3432 return false;
3433 }
3434 SrcReg = MIB.buildInstr(Opc: AArch64::SUBREG_TO_REG,
3435 DstOps: {&AArch64::GPR64RegClass}, SrcOps: {})
3436 .addUse(RegNo: SrcReg)
3437 .addImm(Val: AArch64::sub_32)
3438 .getReg(Idx: 0);
3439 }
3440
3441 ExtI = MIB.buildInstr(Opc: IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
3442 DstOps: {DefReg}, SrcOps: {SrcReg})
3443 .addImm(Val: 0)
3444 .addImm(Val: SrcSize - 1);
3445 } else if (DstSize <= 32) {
3446 ExtI = MIB.buildInstr(Opc: IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
3447 DstOps: {DefReg}, SrcOps: {SrcReg})
3448 .addImm(Val: 0)
3449 .addImm(Val: SrcSize - 1);
3450 } else {
3451 return false;
3452 }
3453
3454 constrainSelectedInstRegOperands(I&: *ExtI, TII, TRI, RBI);
3455 I.eraseFromParent();
3456 return true;
3457 }
3458
3459 case TargetOpcode::G_FREEZE:
3460 return selectCopy(I, TII, MRI, TRI, RBI);
3461
3462 case TargetOpcode::G_INTTOPTR:
3463 // The importer is currently unable to import pointer types since they
3464 // didn't exist in SelectionDAG.
3465 return selectCopy(I, TII, MRI, TRI, RBI);
3466
3467 case TargetOpcode::G_BITCAST:
3468 // Imported SelectionDAG rules can handle every bitcast except those that
3469 // bitcast from a type to the same type. Ideally, these shouldn't occur
3470 // but we might not run an optimizer that deletes them. The other exception
3471 // is bitcasts involving pointer types, as SelectionDAG has no knowledge
3472 // of them.
3473 return selectCopy(I, TII, MRI, TRI, RBI);
3474
3475 case TargetOpcode::G_SELECT: {
3476 auto &Sel = cast<GSelect>(Val&: I);
3477 const Register CondReg = Sel.getCondReg();
3478 const Register TReg = Sel.getTrueReg();
3479 const Register FReg = Sel.getFalseReg();
3480
3481 if (tryOptSelect(Sel))
3482 return true;
3483
3484 // Make sure to use an unused vreg instead of wzr, so that the peephole
3485 // optimizations will be able to optimize these.
3486 Register DeadVReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
3487 auto TstMI = MIB.buildInstr(Opc: AArch64::ANDSWri, DstOps: {DeadVReg}, SrcOps: {CondReg})
3488 .addImm(Val: AArch64_AM::encodeLogicalImmediate(imm: 1, regSize: 32));
3489 constrainSelectedInstRegOperands(I&: *TstMI, TII, TRI, RBI);
3490 if (!emitSelect(Dst: Sel.getReg(Idx: 0), True: TReg, False: FReg, CC: AArch64CC::NE, MIB))
3491 return false;
3492 Sel.eraseFromParent();
3493 return true;
3494 }
3495 case TargetOpcode::G_ICMP: {
3496 if (Ty.isVector())
3497 return false;
3498
3499 if (Ty != LLT::scalar(SizeInBits: 32)) {
3500 LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
3501 << ", expected: " << LLT::scalar(32) << '\n');
3502 return false;
3503 }
3504
3505 auto &PredOp = I.getOperand(i: 1);
3506 emitIntegerCompare(LHS&: I.getOperand(i: 2), RHS&: I.getOperand(i: 3), Predicate&: PredOp, MIRBuilder&: MIB);
3507 auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
3508 const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(
3509 P: CmpInst::getInversePredicate(pred: Pred), RHS: I.getOperand(i: 3).getReg(), MRI: &MRI);
3510 emitCSINC(/*Dst=*/I.getOperand(i: 0).getReg(), /*Src1=*/AArch64::WZR,
3511 /*Src2=*/AArch64::WZR, Pred: InvCC, MIRBuilder&: MIB);
3512 I.eraseFromParent();
3513 return true;
3514 }
3515
3516 case TargetOpcode::G_FCMP: {
3517 CmpInst::Predicate Pred =
3518 static_cast<CmpInst::Predicate>(I.getOperand(i: 1).getPredicate());
3519 if (!emitFPCompare(LHS: I.getOperand(i: 2).getReg(), RHS: I.getOperand(i: 3).getReg(), MIRBuilder&: MIB,
3520 Pred) ||
3521 !emitCSetForFCmp(Dst: I.getOperand(i: 0).getReg(), Pred, MIRBuilder&: MIB))
3522 return false;
3523 I.eraseFromParent();
3524 return true;
3525 }
3526 case TargetOpcode::G_VASTART:
3527 return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
3528 : selectVaStartAAPCS(I, MF, MRI);
3529 case TargetOpcode::G_INTRINSIC:
3530 return selectIntrinsic(I, MRI);
3531 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3532 return selectIntrinsicWithSideEffects(I, MRI);
3533 case TargetOpcode::G_IMPLICIT_DEF: {
3534 I.setDesc(TII.get(Opcode: TargetOpcode::IMPLICIT_DEF));
3535 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
3536 const Register DstReg = I.getOperand(i: 0).getReg();
3537 const RegisterBank &DstRB = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
3538 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(Ty: DstTy, RB: DstRB);
3539 RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI);
3540 return true;
3541 }
3542 case TargetOpcode::G_BLOCK_ADDR: {
3543 Function *BAFn = I.getOperand(i: 1).getBlockAddress()->getFunction();
3544 if (std::optional<uint16_t> BADisc =
3545 STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(ParentFn: *BAFn)) {
3546 MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {AArch64::X16}, SrcOps: {});
3547 MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {AArch64::X17}, SrcOps: {});
3548 MIB.buildInstr(Opcode: AArch64::MOVaddrPAC)
3549 .addBlockAddress(BA: I.getOperand(i: 1).getBlockAddress())
3550 .addImm(Val: AArch64PACKey::IA)
3551 .addReg(/*AddrDisc=*/RegNo: AArch64::XZR)
3552 .addImm(Val: *BADisc)
3553 .constrainAllUses(TII, TRI, RBI);
3554 MIB.buildCopy(Res: I.getOperand(i: 0).getReg(), Op: Register(AArch64::X16));
3555 RBI.constrainGenericRegister(Reg: I.getOperand(i: 0).getReg(),
3556 RC: AArch64::GPR64RegClass, MRI);
3557 I.eraseFromParent();
3558 return true;
3559 }
3560 if (TM.getCodeModel() == CodeModel::Large && !TM.isPositionIndependent()) {
3561 materializeLargeCMVal(I, V: I.getOperand(i: 1).getBlockAddress(), OpFlags: 0);
3562 I.eraseFromParent();
3563 return true;
3564 } else {
3565 I.setDesc(TII.get(Opcode: AArch64::MOVaddrBA));
3566 auto MovMI = BuildMI(BB&: MBB, I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AArch64::MOVaddrBA),
3567 DestReg: I.getOperand(i: 0).getReg())
3568 .addBlockAddress(BA: I.getOperand(i: 1).getBlockAddress(),
3569 /* Offset */ 0, TargetFlags: AArch64II::MO_PAGE)
3570 .addBlockAddress(
3571 BA: I.getOperand(i: 1).getBlockAddress(), /* Offset */ 0,
3572 TargetFlags: AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3573 I.eraseFromParent();
3574 constrainSelectedInstRegOperands(I&: *MovMI, TII, TRI, RBI);
3575 return true;
3576 }
3577 }
3578 case AArch64::G_DUP: {
3579 // When the scalar of G_DUP is an s8/s16 gpr, they can't be selected by
3580 // imported patterns. Do it manually here. Avoiding generating s16 gpr is
3581 // difficult because at RBS we may end up pessimizing the fpr case if we
3582 // decided to add an anyextend to fix this. Manual selection is the most
3583 // robust solution for now.
3584 if (RBI.getRegBank(Reg: I.getOperand(i: 1).getReg(), MRI, TRI)->getID() !=
3585 AArch64::GPRRegBankID)
3586 return false; // We expect the fpr regbank case to be imported.
3587 LLT VecTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
3588 if (VecTy == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 8))
3589 I.setDesc(TII.get(Opcode: AArch64::DUPv8i8gpr));
3590 else if (VecTy == LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 8))
3591 I.setDesc(TII.get(Opcode: AArch64::DUPv16i8gpr));
3592 else if (VecTy == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16))
3593 I.setDesc(TII.get(Opcode: AArch64::DUPv4i16gpr));
3594 else if (VecTy == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16))
3595 I.setDesc(TII.get(Opcode: AArch64::DUPv8i16gpr));
3596 else
3597 return false;
3598 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3599 return true;
3600 }
3601 case TargetOpcode::G_BUILD_VECTOR:
3602 return selectBuildVector(I, MRI);
3603 case TargetOpcode::G_MERGE_VALUES:
3604 return selectMergeValues(I, MRI);
3605 case TargetOpcode::G_UNMERGE_VALUES:
3606 return selectUnmergeValues(I, MRI);
3607 case TargetOpcode::G_SHUFFLE_VECTOR:
3608 return selectShuffleVector(I, MRI);
3609 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3610 return selectExtractElt(I, MRI);
3611 case TargetOpcode::G_CONCAT_VECTORS:
3612 return selectConcatVectors(I, MRI);
3613 case TargetOpcode::G_JUMP_TABLE:
3614 return selectJumpTable(I, MRI);
3615 case TargetOpcode::G_MEMCPY:
3616 case TargetOpcode::G_MEMCPY_INLINE:
3617 case TargetOpcode::G_MEMMOVE:
3618 case TargetOpcode::G_MEMSET:
3619 case TargetOpcode::G_MEMSET_INLINE:
3620 assert(STI.hasMOPS() && "Shouldn't get here without +mops feature");
3621 return selectMOPS(I, MRI);
3622 }
3623
3624 return false;
3625}
3626
3627bool AArch64InstructionSelector::selectAndRestoreState(MachineInstr &I) {
3628 MachineIRBuilderState OldMIBState = MIB.getState();
3629 bool Success = select(I);
3630 MIB.setState(OldMIBState);
3631 return Success;
3632}
3633
3634bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI,
3635 MachineRegisterInfo &MRI) {
3636 unsigned Mopcode;
3637 switch (GI.getOpcode()) {
3638 case TargetOpcode::G_MEMCPY:
3639 case TargetOpcode::G_MEMCPY_INLINE:
3640 Mopcode = AArch64::MOPSMemoryCopyPseudo;
3641 break;
3642 case TargetOpcode::G_MEMMOVE:
3643 Mopcode = AArch64::MOPSMemoryMovePseudo;
3644 break;
3645 case TargetOpcode::G_MEMSET:
3646 case TargetOpcode::G_MEMSET_INLINE:
3647 // For tagged memset see llvm.aarch64.mops.memset.tag
3648 Mopcode = AArch64::MOPSMemorySetPseudo;
3649 break;
3650 }
3651
3652 auto &DstPtr = GI.getOperand(i: 0);
3653 auto &SrcOrVal = GI.getOperand(i: 1);
3654 auto &Size = GI.getOperand(i: 2);
3655
3656 // Create copies of the registers that can be clobbered.
3657 const Register DstPtrCopy = MRI.cloneVirtualRegister(VReg: DstPtr.getReg());
3658 const Register SrcValCopy = MRI.cloneVirtualRegister(VReg: SrcOrVal.getReg());
3659 const Register SizeCopy = MRI.cloneVirtualRegister(VReg: Size.getReg());
3660
3661 const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo;
3662 const auto &SrcValRegClass =
3663 IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass;
3664
3665 // Constrain to specific registers
3666 RBI.constrainGenericRegister(Reg: DstPtrCopy, RC: AArch64::GPR64commonRegClass, MRI);
3667 RBI.constrainGenericRegister(Reg: SrcValCopy, RC: SrcValRegClass, MRI);
3668 RBI.constrainGenericRegister(Reg: SizeCopy, RC: AArch64::GPR64RegClass, MRI);
3669
3670 MIB.buildCopy(Res: DstPtrCopy, Op: DstPtr);
3671 MIB.buildCopy(Res: SrcValCopy, Op: SrcOrVal);
3672 MIB.buildCopy(Res: SizeCopy, Op: Size);
3673
3674 // New instruction uses the copied registers because it must update them.
3675 // The defs are not used since they don't exist in G_MEM*. They are still
3676 // tied.
3677 // Note: order of operands is different from G_MEMSET, G_MEMCPY, G_MEMMOVE
3678 Register DefDstPtr = MRI.createVirtualRegister(RegClass: &AArch64::GPR64commonRegClass);
3679 Register DefSize = MRI.createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
3680 if (IsSet) {
3681 MIB.buildInstr(Opc: Mopcode, DstOps: {DefDstPtr, DefSize},
3682 SrcOps: {DstPtrCopy, SizeCopy, SrcValCopy});
3683 } else {
3684 Register DefSrcPtr = MRI.createVirtualRegister(RegClass: &SrcValRegClass);
3685 MIB.buildInstr(Opc: Mopcode, DstOps: {DefDstPtr, DefSrcPtr, DefSize},
3686 SrcOps: {DstPtrCopy, SrcValCopy, SizeCopy});
3687 }
3688
3689 GI.eraseFromParent();
3690 return true;
3691}
3692
3693bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
3694 MachineRegisterInfo &MRI) {
3695 assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
3696 Register JTAddr = I.getOperand(i: 0).getReg();
3697 unsigned JTI = I.getOperand(i: 1).getIndex();
3698 Register Index = I.getOperand(i: 2).getReg();
3699
3700 MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(Idx: JTI, Size: 4, PCRelSym: nullptr);
3701
3702 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
3703 // sequence later, to guarantee the integrity of the intermediate values.
3704 if (MF->getFunction().hasFnAttribute(Kind: "aarch64-jump-table-hardening")) {
3705 CodeModel::Model CM = TM.getCodeModel();
3706 if (STI.isTargetMachO()) {
3707 if (CM != CodeModel::Small && CM != CodeModel::Large)
3708 report_fatal_error(reason: "Unsupported code-model for hardened jump-table");
3709 } else {
3710 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
3711 assert(STI.isTargetELF() &&
3712 "jump table hardening only supported on MachO/ELF");
3713 if (CM != CodeModel::Small)
3714 report_fatal_error(reason: "Unsupported code-model for hardened jump-table");
3715 }
3716
3717 MIB.buildCopy(Res: {AArch64::X16}, Op: I.getOperand(i: 2).getReg());
3718 MIB.buildInstr(Opcode: AArch64::BR_JumpTable)
3719 .addJumpTableIndex(Idx: I.getOperand(i: 1).getIndex());
3720 I.eraseFromParent();
3721 return true;
3722 }
3723
3724 Register TargetReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
3725 Register ScratchReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR64spRegClass);
3726
3727 auto JumpTableInst = MIB.buildInstr(Opc: AArch64::JumpTableDest32,
3728 DstOps: {TargetReg, ScratchReg}, SrcOps: {JTAddr, Index})
3729 .addJumpTableIndex(Idx: JTI);
3730 // Save the jump table info.
3731 MIB.buildInstr(Opc: TargetOpcode::JUMP_TABLE_DEBUG_INFO, DstOps: {},
3732 SrcOps: {static_cast<int64_t>(JTI)});
3733 // Build the indirect branch.
3734 MIB.buildInstr(Opc: AArch64::BR, DstOps: {}, SrcOps: {TargetReg});
3735 I.eraseFromParent();
3736 constrainSelectedInstRegOperands(I&: *JumpTableInst, TII, TRI, RBI);
3737 return true;
3738}
3739
3740bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I,
3741 MachineRegisterInfo &MRI) {
3742 assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
3743 assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
3744
3745 Register DstReg = I.getOperand(i: 0).getReg();
3746 unsigned JTI = I.getOperand(i: 1).getIndex();
3747 // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
3748 auto MovMI =
3749 MIB.buildInstr(Opc: AArch64::MOVaddrJT, DstOps: {DstReg}, SrcOps: {})
3750 .addJumpTableIndex(Idx: JTI, TargetFlags: AArch64II::MO_PAGE)
3751 .addJumpTableIndex(Idx: JTI, TargetFlags: AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3752 I.eraseFromParent();
3753 constrainSelectedInstRegOperands(I&: *MovMI, TII, TRI, RBI);
3754 return true;
3755}
3756
3757bool AArch64InstructionSelector::selectTLSGlobalValue(
3758 MachineInstr &I, MachineRegisterInfo &MRI) {
3759 if (!STI.isTargetMachO())
3760 return false;
3761 MachineFunction &MF = *I.getParent()->getParent();
3762 MF.getFrameInfo().setAdjustsStack(true);
3763
3764 const auto &GlobalOp = I.getOperand(i: 1);
3765 assert(GlobalOp.getOffset() == 0 &&
3766 "Shouldn't have an offset on TLS globals!");
3767 const GlobalValue &GV = *GlobalOp.getGlobal();
3768
3769 auto LoadGOT =
3770 MIB.buildInstr(Opc: AArch64::LOADgot, DstOps: {&AArch64::GPR64commonRegClass}, SrcOps: {})
3771 .addGlobalAddress(GV: &GV, Offset: 0, TargetFlags: AArch64II::MO_TLS);
3772
3773 auto Load = MIB.buildInstr(Opc: AArch64::LDRXui, DstOps: {&AArch64::GPR64commonRegClass},
3774 SrcOps: {LoadGOT.getReg(Idx: 0)})
3775 .addImm(Val: 0);
3776
3777 MIB.buildCopy(Res: Register(AArch64::X0), Op: LoadGOT.getReg(Idx: 0));
3778 // TLS calls preserve all registers except those that absolutely must be
3779 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3780 // silly).
3781 unsigned Opcode = getBLRCallOpcode(MF);
3782
3783 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
3784 if (MF.getFunction().hasFnAttribute(Kind: "ptrauth-calls")) {
3785 assert(Opcode == AArch64::BLR);
3786 Opcode = AArch64::BLRAAZ;
3787 }
3788
3789 MIB.buildInstr(Opc: Opcode, DstOps: {}, SrcOps: {Load})
3790 .addUse(RegNo: AArch64::X0, Flags: RegState::Implicit)
3791 .addDef(RegNo: AArch64::X0, Flags: RegState::Implicit)
3792 .addRegMask(Mask: TRI.getTLSCallPreservedMask());
3793
3794 MIB.buildCopy(Res: I.getOperand(i: 0).getReg(), Op: Register(AArch64::X0));
3795 RBI.constrainGenericRegister(Reg: I.getOperand(i: 0).getReg(), RC: AArch64::GPR64RegClass,
3796 MRI);
3797 I.eraseFromParent();
3798 return true;
3799}
3800
3801MachineInstr *AArch64InstructionSelector::emitScalarToVector(
3802 unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
3803 MachineIRBuilder &MIRBuilder) const {
3804 auto Undef = MIRBuilder.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {DstRC}, SrcOps: {});
3805
3806 auto BuildFn = [&](unsigned SubregIndex) {
3807 auto Ins =
3808 MIRBuilder
3809 .buildInstr(Opc: TargetOpcode::INSERT_SUBREG, DstOps: {DstRC}, SrcOps: {Undef, Scalar})
3810 .addImm(Val: SubregIndex);
3811 constrainSelectedInstRegOperands(I&: *Undef, TII, TRI, RBI);
3812 constrainSelectedInstRegOperands(I&: *Ins, TII, TRI, RBI);
3813 return &*Ins;
3814 };
3815
3816 switch (EltSize) {
3817 case 8:
3818 return BuildFn(AArch64::bsub);
3819 case 16:
3820 return BuildFn(AArch64::hsub);
3821 case 32:
3822 return BuildFn(AArch64::ssub);
3823 case 64:
3824 return BuildFn(AArch64::dsub);
3825 default:
3826 return nullptr;
3827 }
3828}
3829
3830MachineInstr *
3831AArch64InstructionSelector::emitNarrowVector(Register DstReg, Register SrcReg,
3832 MachineIRBuilder &MIB,
3833 MachineRegisterInfo &MRI) const {
3834 LLT DstTy = MRI.getType(Reg: DstReg);
3835 const TargetRegisterClass *RC =
3836 getRegClassForTypeOnBank(Ty: DstTy, RB: *RBI.getRegBank(Reg: SrcReg, MRI, TRI));
3837 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
3838 LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
3839 return nullptr;
3840 }
3841 unsigned SubReg = 0;
3842 if (!getSubRegForClass(RC, TRI, SubReg))
3843 return nullptr;
3844 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
3845 LLVM_DEBUG(dbgs() << "Unsupported destination size! ("
3846 << DstTy.getSizeInBits() << "\n");
3847 return nullptr;
3848 }
3849 auto Copy = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {DstReg}, SrcOps: {})
3850 .addReg(RegNo: SrcReg, Flags: {}, SubReg);
3851 RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI);
3852 return Copy;
3853}
3854
3855bool AArch64InstructionSelector::selectMergeValues(
3856 MachineInstr &I, MachineRegisterInfo &MRI) {
3857 assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
3858 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
3859 const LLT SrcTy = MRI.getType(Reg: I.getOperand(i: 1).getReg());
3860 assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
3861 const RegisterBank &RB = *RBI.getRegBank(Reg: I.getOperand(i: 1).getReg(), MRI, TRI);
3862
3863 if (I.getNumOperands() != 3)
3864 return false;
3865
3866 // Merging 2 s64s into an s128.
3867 if (DstTy == LLT::scalar(SizeInBits: 128)) {
3868 if (SrcTy.getSizeInBits() != 64)
3869 return false;
3870 Register DstReg = I.getOperand(i: 0).getReg();
3871 Register Src1Reg = I.getOperand(i: 1).getReg();
3872 Register Src2Reg = I.getOperand(i: 2).getReg();
3873 auto Tmp = MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {DstTy}, SrcOps: {});
3874 MachineInstr *InsMI = emitLaneInsert(DstReg: std::nullopt, SrcReg: Tmp.getReg(Idx: 0), EltReg: Src1Reg,
3875 /* LaneIdx */ 0, RB, MIRBuilder&: MIB);
3876 if (!InsMI)
3877 return false;
3878 MachineInstr *Ins2MI = emitLaneInsert(DstReg, SrcReg: InsMI->getOperand(i: 0).getReg(),
3879 EltReg: Src2Reg, /* LaneIdx */ 1, RB, MIRBuilder&: MIB);
3880 if (!Ins2MI)
3881 return false;
3882 constrainSelectedInstRegOperands(I&: *InsMI, TII, TRI, RBI);
3883 constrainSelectedInstRegOperands(I&: *Ins2MI, TII, TRI, RBI);
3884 I.eraseFromParent();
3885 return true;
3886 }
3887
3888 if (RB.getID() != AArch64::GPRRegBankID)
3889 return false;
3890
3891 if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
3892 return false;
3893
3894 auto *DstRC = &AArch64::GPR64RegClass;
3895 Register SubToRegDef = MRI.createVirtualRegister(RegClass: DstRC);
3896 MachineInstr &SubRegMI = *BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(),
3897 MCID: TII.get(Opcode: TargetOpcode::SUBREG_TO_REG))
3898 .addDef(RegNo: SubToRegDef)
3899 .addUse(RegNo: I.getOperand(i: 1).getReg())
3900 .addImm(Val: AArch64::sub_32);
3901 Register SubToRegDef2 = MRI.createVirtualRegister(RegClass: DstRC);
3902 // Need to anyext the second scalar before we can use bfm
3903 MachineInstr &SubRegMI2 = *BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(),
3904 MCID: TII.get(Opcode: TargetOpcode::SUBREG_TO_REG))
3905 .addDef(RegNo: SubToRegDef2)
3906 .addUse(RegNo: I.getOperand(i: 2).getReg())
3907 .addImm(Val: AArch64::sub_32);
3908 MachineInstr &BFM =
3909 *BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AArch64::BFMXri))
3910 .addDef(RegNo: I.getOperand(i: 0).getReg())
3911 .addUse(RegNo: SubToRegDef)
3912 .addUse(RegNo: SubToRegDef2)
3913 .addImm(Val: 32)
3914 .addImm(Val: 31);
3915 constrainSelectedInstRegOperands(I&: SubRegMI, TII, TRI, RBI);
3916 constrainSelectedInstRegOperands(I&: SubRegMI2, TII, TRI, RBI);
3917 constrainSelectedInstRegOperands(I&: BFM, TII, TRI, RBI);
3918 I.eraseFromParent();
3919 return true;
3920}
3921
3922static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
3923 const unsigned EltSize) {
3924 // Choose a lane copy opcode and subregister based off of the size of the
3925 // vector's elements.
3926 switch (EltSize) {
3927 case 8:
3928 CopyOpc = AArch64::DUPi8;
3929 ExtractSubReg = AArch64::bsub;
3930 break;
3931 case 16:
3932 CopyOpc = AArch64::DUPi16;
3933 ExtractSubReg = AArch64::hsub;
3934 break;
3935 case 32:
3936 CopyOpc = AArch64::DUPi32;
3937 ExtractSubReg = AArch64::ssub;
3938 break;
3939 case 64:
3940 CopyOpc = AArch64::DUPi64;
3941 ExtractSubReg = AArch64::dsub;
3942 break;
3943 default:
3944 // Unknown size, bail out.
3945 LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
3946 return false;
3947 }
3948 return true;
3949}
3950
3951MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
3952 std::optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
3953 Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
3954 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
3955 unsigned CopyOpc = 0;
3956 unsigned ExtractSubReg = 0;
3957 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, EltSize: ScalarTy.getSizeInBits())) {
3958 LLVM_DEBUG(
3959 dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
3960 return nullptr;
3961 }
3962
3963 const TargetRegisterClass *DstRC =
3964 getRegClassForTypeOnBank(Ty: ScalarTy, RB: DstRB, GetAllRegSet: true);
3965 if (!DstRC) {
3966 LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
3967 return nullptr;
3968 }
3969
3970 const RegisterBank &VecRB = *RBI.getRegBank(Reg: VecReg, MRI, TRI);
3971 const LLT &VecTy = MRI.getType(Reg: VecReg);
3972 const TargetRegisterClass *VecRC =
3973 getRegClassForTypeOnBank(Ty: VecTy, RB: VecRB, GetAllRegSet: true);
3974 if (!VecRC) {
3975 LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
3976 return nullptr;
3977 }
3978
3979 // The register that we're going to copy into.
3980 Register InsertReg = VecReg;
3981 if (!DstReg)
3982 DstReg = MRI.createVirtualRegister(RegClass: DstRC);
3983 // If the lane index is 0, we just use a subregister COPY.
3984 if (LaneIdx == 0) {
3985 auto Copy = MIRBuilder.buildInstr(Opc: TargetOpcode::COPY, DstOps: {*DstReg}, SrcOps: {})
3986 .addReg(RegNo: VecReg, Flags: {}, SubReg: ExtractSubReg);
3987 RBI.constrainGenericRegister(Reg: *DstReg, RC: *DstRC, MRI);
3988 return &*Copy;
3989 }
3990
3991 // Lane copies require 128-bit wide registers. If we're dealing with an
3992 // unpacked vector, then we need to move up to that width. Insert an implicit
3993 // def and a subregister insert to get us there.
3994 if (VecTy.getSizeInBits() != 128) {
3995 MachineInstr *ScalarToVector = emitScalarToVector(
3996 EltSize: VecTy.getSizeInBits(), DstRC: &AArch64::FPR128RegClass, Scalar: VecReg, MIRBuilder);
3997 if (!ScalarToVector)
3998 return nullptr;
3999 InsertReg = ScalarToVector->getOperand(i: 0).getReg();
4000 }
4001
4002 MachineInstr *LaneCopyMI =
4003 MIRBuilder.buildInstr(Opc: CopyOpc, DstOps: {*DstReg}, SrcOps: {InsertReg}).addImm(Val: LaneIdx);
4004 constrainSelectedInstRegOperands(I&: *LaneCopyMI, TII, TRI, RBI);
4005
4006 // Make sure that we actually constrain the initial copy.
4007 RBI.constrainGenericRegister(Reg: *DstReg, RC: *DstRC, MRI);
4008 return LaneCopyMI;
4009}
4010
4011bool AArch64InstructionSelector::selectExtractElt(
4012 MachineInstr &I, MachineRegisterInfo &MRI) {
4013 assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
4014 "unexpected opcode!");
4015 Register DstReg = I.getOperand(i: 0).getReg();
4016 const LLT NarrowTy = MRI.getType(Reg: DstReg);
4017 const Register SrcReg = I.getOperand(i: 1).getReg();
4018 const LLT WideTy = MRI.getType(Reg: SrcReg);
4019 (void)WideTy;
4020 assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
4021 "source register size too small!");
4022 assert(!NarrowTy.isVector() && "cannot extract vector into vector!");
4023
4024 // Need the lane index to determine the correct copy opcode.
4025 MachineOperand &LaneIdxOp = I.getOperand(i: 2);
4026 assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");
4027
4028 if (RBI.getRegBank(Reg: DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
4029 LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
4030 return false;
4031 }
4032
4033 // Find the index to extract from.
4034 auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: LaneIdxOp.getReg(), MRI);
4035 if (!VRegAndVal)
4036 return false;
4037 unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
4038
4039
4040 const RegisterBank &DstRB = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
4041 MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, ScalarTy: NarrowTy, VecReg: SrcReg,
4042 LaneIdx, MIRBuilder&: MIB);
4043 if (!Extract)
4044 return false;
4045
4046 I.eraseFromParent();
4047 return true;
4048}
4049
4050bool AArch64InstructionSelector::selectSplitVectorUnmerge(
4051 MachineInstr &I, MachineRegisterInfo &MRI) {
4052 unsigned NumElts = I.getNumOperands() - 1;
4053 Register SrcReg = I.getOperand(i: NumElts).getReg();
4054 const LLT NarrowTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
4055 const LLT SrcTy = MRI.getType(Reg: SrcReg);
4056
4057 assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
4058 if (SrcTy.getSizeInBits() > 128) {
4059 LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
4060 return false;
4061 }
4062
4063 // We implement a split vector operation by treating the sub-vectors as
4064 // scalars and extracting them.
4065 const RegisterBank &DstRB =
4066 *RBI.getRegBank(Reg: I.getOperand(i: 0).getReg(), MRI, TRI);
4067 for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
4068 Register Dst = I.getOperand(i: OpIdx).getReg();
4069 MachineInstr *Extract =
4070 emitExtractVectorElt(DstReg: Dst, DstRB, ScalarTy: NarrowTy, VecReg: SrcReg, LaneIdx: OpIdx, MIRBuilder&: MIB);
4071 if (!Extract)
4072 return false;
4073 }
4074 I.eraseFromParent();
4075 return true;
4076}
4077
4078bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
4079 MachineRegisterInfo &MRI) {
4080 assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
4081 "unexpected opcode");
4082
4083 // TODO: Handle unmerging into GPRs and from scalars to scalars.
4084 if (RBI.getRegBank(Reg: I.getOperand(i: 0).getReg(), MRI, TRI)->getID() !=
4085 AArch64::FPRRegBankID ||
4086 RBI.getRegBank(Reg: I.getOperand(i: 1).getReg(), MRI, TRI)->getID() !=
4087 AArch64::FPRRegBankID) {
4088 LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
4089 "currently unsupported.\n");
4090 return false;
4091 }
4092
4093 // The last operand is the vector source register, and every other operand is
4094 // a register to unpack into.
4095 unsigned NumElts = I.getNumOperands() - 1;
4096 Register SrcReg = I.getOperand(i: NumElts).getReg();
4097 const LLT NarrowTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
4098 const LLT WideTy = MRI.getType(Reg: SrcReg);
4099
4100 assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
4101 "source register size too small!");
4102
4103 if (!NarrowTy.isScalar())
4104 return selectSplitVectorUnmerge(I, MRI);
4105
4106 // Choose a lane copy opcode and subregister based off of the size of the
4107 // vector's elements.
4108 unsigned CopyOpc = 0;
4109 unsigned ExtractSubReg = 0;
4110 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, EltSize: NarrowTy.getSizeInBits()))
4111 return false;
4112
4113 // Set up for the lane copies.
4114 MachineBasicBlock &MBB = *I.getParent();
4115
4116 // Stores the registers we'll be copying from.
4117 SmallVector<Register, 4> InsertRegs;
4118
4119 // We'll use the first register twice, so we only need NumElts-1 registers.
4120 unsigned NumInsertRegs = NumElts - 1;
4121
4122 // If our elements fit into exactly 128 bits, then we can copy from the source
4123 // directly. Otherwise, we need to do a bit of setup with some subregister
4124 // inserts.
4125 if (NarrowTy.getSizeInBits() * NumElts == 128) {
4126 InsertRegs.assign(NumElts: NumInsertRegs, Elt: SrcReg);
4127 } else {
4128 // No. We have to perform subregister inserts. For each insert, create an
4129 // implicit def and a subregister insert, and save the register we create.
4130 // For scalar sources, treat as a pseudo-vector of NarrowTy elements.
4131 unsigned EltSize = WideTy.isVector() ? WideTy.getScalarSizeInBits()
4132 : NarrowTy.getSizeInBits();
4133 const TargetRegisterClass *RC = getRegClassForTypeOnBank(
4134 Ty: LLT::fixed_vector(NumElements: NumElts, ScalarSizeInBits: EltSize), RB: *RBI.getRegBank(Reg: SrcReg, MRI, TRI));
4135 unsigned SubReg = 0;
4136 bool Found = getSubRegForClass(RC, TRI, SubReg);
4137 (void)Found;
4138 assert(Found && "expected to find last operand's subeg idx");
4139 for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
4140 Register ImpDefReg = MRI.createVirtualRegister(RegClass: &AArch64::FPR128RegClass);
4141 MachineInstr &ImpDefMI =
4142 *BuildMI(BB&: MBB, I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: TargetOpcode::IMPLICIT_DEF),
4143 DestReg: ImpDefReg);
4144
4145 // Now, create the subregister insert from SrcReg.
4146 Register InsertReg = MRI.createVirtualRegister(RegClass: &AArch64::FPR128RegClass);
4147 MachineInstr &InsMI =
4148 *BuildMI(BB&: MBB, I, MIMD: I.getDebugLoc(),
4149 MCID: TII.get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: InsertReg)
4150 .addUse(RegNo: ImpDefReg)
4151 .addUse(RegNo: SrcReg)
4152 .addImm(Val: SubReg);
4153
4154 constrainSelectedInstRegOperands(I&: ImpDefMI, TII, TRI, RBI);
4155 constrainSelectedInstRegOperands(I&: InsMI, TII, TRI, RBI);
4156
4157 // Save the register so that we can copy from it after.
4158 InsertRegs.push_back(Elt: InsertReg);
4159 }
4160 }
4161
4162 // Now that we've created any necessary subregister inserts, we can
4163 // create the copies.
4164 //
4165 // Perform the first copy separately as a subregister copy.
4166 Register CopyTo = I.getOperand(i: 0).getReg();
4167 auto FirstCopy = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {CopyTo}, SrcOps: {})
4168 .addReg(RegNo: InsertRegs[0], Flags: {}, SubReg: ExtractSubReg);
4169 constrainSelectedInstRegOperands(I&: *FirstCopy, TII, TRI, RBI);
4170
4171 // Now, perform the remaining copies as vector lane copies.
4172 unsigned LaneIdx = 1;
4173 for (Register InsReg : InsertRegs) {
4174 Register CopyTo = I.getOperand(i: LaneIdx).getReg();
4175 MachineInstr &CopyInst =
4176 *BuildMI(BB&: MBB, I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: CopyOpc), DestReg: CopyTo)
4177 .addUse(RegNo: InsReg)
4178 .addImm(Val: LaneIdx);
4179 constrainSelectedInstRegOperands(I&: CopyInst, TII, TRI, RBI);
4180 ++LaneIdx;
4181 }
4182
4183 // Separately constrain the first copy's destination. Because of the
4184 // limitation in constrainOperandRegClass, we can't guarantee that this will
4185 // actually be constrained. So, do it ourselves using the second operand.
4186 const TargetRegisterClass *RC =
4187 MRI.getRegClassOrNull(Reg: I.getOperand(i: 1).getReg());
4188 if (!RC) {
4189 LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
4190 return false;
4191 }
4192
4193 RBI.constrainGenericRegister(Reg: CopyTo, RC: *RC, MRI);
4194 I.eraseFromParent();
4195 return true;
4196}
4197
4198bool AArch64InstructionSelector::selectConcatVectors(
4199 MachineInstr &I, MachineRegisterInfo &MRI) {
4200 assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
4201 "Unexpected opcode");
4202 Register Dst = I.getOperand(i: 0).getReg();
4203 Register Op1 = I.getOperand(i: 1).getReg();
4204 Register Op2 = I.getOperand(i: 2).getReg();
4205 MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIRBuilder&: MIB);
4206 if (!ConcatMI)
4207 return false;
4208 I.eraseFromParent();
4209 return true;
4210}
4211
4212unsigned
4213AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
4214 MachineFunction &MF) const {
4215 Type *CPTy = CPVal->getType();
4216 Align Alignment = MF.getDataLayout().getPrefTypeAlign(Ty: CPTy);
4217
4218 MachineConstantPool *MCP = MF.getConstantPool();
4219 return MCP->getConstantPoolIndex(C: CPVal, Alignment);
4220}
4221
4222MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
4223 const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
4224 const TargetRegisterClass *RC;
4225 unsigned Opc;
4226 bool IsTiny = TM.getCodeModel() == CodeModel::Tiny;
4227 unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(Ty: CPVal->getType());
4228 switch (Size) {
4229 case 16:
4230 RC = &AArch64::FPR128RegClass;
4231 Opc = IsTiny ? AArch64::LDRQl : AArch64::LDRQui;
4232 break;
4233 case 8:
4234 RC = &AArch64::FPR64RegClass;
4235 Opc = IsTiny ? AArch64::LDRDl : AArch64::LDRDui;
4236 break;
4237 case 4:
4238 RC = &AArch64::FPR32RegClass;
4239 Opc = IsTiny ? AArch64::LDRSl : AArch64::LDRSui;
4240 break;
4241 case 2:
4242 RC = &AArch64::FPR16RegClass;
4243 Opc = AArch64::LDRHui;
4244 break;
4245 default:
4246 LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
4247 << *CPVal->getType());
4248 return nullptr;
4249 }
4250
4251 MachineInstr *LoadMI = nullptr;
4252 auto &MF = MIRBuilder.getMF();
4253 unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
4254 if (IsTiny && (Size == 16 || Size == 8 || Size == 4)) {
4255 // Use load(literal) for tiny code model.
4256 LoadMI = &*MIRBuilder.buildInstr(Opc, DstOps: {RC}, SrcOps: {}).addConstantPoolIndex(Idx: CPIdx);
4257 } else {
4258 auto Adrp =
4259 MIRBuilder.buildInstr(Opc: AArch64::ADRP, DstOps: {&AArch64::GPR64RegClass}, SrcOps: {})
4260 .addConstantPoolIndex(Idx: CPIdx, Offset: 0, TargetFlags: AArch64II::MO_PAGE);
4261
4262 LoadMI = &*MIRBuilder.buildInstr(Opc, DstOps: {RC}, SrcOps: {Adrp})
4263 .addConstantPoolIndex(
4264 Idx: CPIdx, Offset: 0, TargetFlags: AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4265
4266 constrainSelectedInstRegOperands(I&: *Adrp, TII, TRI, RBI);
4267 }
4268
4269 MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
4270 LoadMI->addMemOperand(MF, MO: MF.getMachineMemOperand(PtrInfo,
4271 F: MachineMemOperand::MOLoad,
4272 Size, BaseAlignment: Align(Size)));
4273 constrainSelectedInstRegOperands(I&: *LoadMI, TII, TRI, RBI);
4274 return LoadMI;
4275}
4276
4277/// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given
4278/// size and RB.
4279static std::pair<unsigned, unsigned>
4280getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
4281 unsigned Opc, SubregIdx;
4282 if (RB.getID() == AArch64::GPRRegBankID) {
4283 if (EltSize == 8) {
4284 Opc = AArch64::INSvi8gpr;
4285 SubregIdx = AArch64::bsub;
4286 } else if (EltSize == 16) {
4287 Opc = AArch64::INSvi16gpr;
4288 SubregIdx = AArch64::ssub;
4289 } else if (EltSize == 32) {
4290 Opc = AArch64::INSvi32gpr;
4291 SubregIdx = AArch64::ssub;
4292 } else if (EltSize == 64) {
4293 Opc = AArch64::INSvi64gpr;
4294 SubregIdx = AArch64::dsub;
4295 } else {
4296 llvm_unreachable("invalid elt size!");
4297 }
4298 } else {
4299 if (EltSize == 8) {
4300 Opc = AArch64::INSvi8lane;
4301 SubregIdx = AArch64::bsub;
4302 } else if (EltSize == 16) {
4303 Opc = AArch64::INSvi16lane;
4304 SubregIdx = AArch64::hsub;
4305 } else if (EltSize == 32) {
4306 Opc = AArch64::INSvi32lane;
4307 SubregIdx = AArch64::ssub;
4308 } else if (EltSize == 64) {
4309 Opc = AArch64::INSvi64lane;
4310 SubregIdx = AArch64::dsub;
4311 } else {
4312 llvm_unreachable("invalid elt size!");
4313 }
4314 }
4315 return std::make_pair(x&: Opc, y&: SubregIdx);
4316}
4317
4318MachineInstr *AArch64InstructionSelector::emitInstr(
4319 unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
4320 std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder,
4321 const ComplexRendererFns &RenderFns) const {
4322 assert(Opcode && "Expected an opcode?");
4323 assert(!isPreISelGenericOpcode(Opcode) &&
4324 "Function should only be used to produce selected instructions!");
4325 auto MI = MIRBuilder.buildInstr(Opc: Opcode, DstOps, SrcOps);
4326 if (RenderFns)
4327 for (auto &Fn : *RenderFns)
4328 Fn(MI);
4329 constrainSelectedInstRegOperands(I&: *MI, TII, TRI, RBI);
4330 return &*MI;
4331}
4332
4333MachineInstr *AArch64InstructionSelector::emitAddSub(
4334 const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
4335 Register Dst, MachineOperand &LHS, MachineOperand &RHS,
4336 MachineIRBuilder &MIRBuilder) const {
4337 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4338 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4339 auto Ty = MRI.getType(Reg: LHS.getReg());
4340 assert(!Ty.isVector() && "Expected a scalar or pointer?");
4341 unsigned Size = Ty.getSizeInBits();
4342 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only");
4343 bool Is32Bit = Size == 32;
4344
4345 // INSTRri form with positive arithmetic immediate.
4346 if (auto Fns = selectArithImmed(Root&: RHS))
4347 return emitInstr(Opcode: AddrModeAndSizeToOpcode[0][Is32Bit], DstOps: {Dst}, SrcOps: {LHS},
4348 MIRBuilder, RenderFns: Fns);
4349
4350 // INSTRri form with negative arithmetic immediate.
4351 if (auto Fns = selectNegArithImmed(Root&: RHS))
4352 return emitInstr(Opcode: AddrModeAndSizeToOpcode[3][Is32Bit], DstOps: {Dst}, SrcOps: {LHS},
4353 MIRBuilder, RenderFns: Fns);
4354
4355 // INSTRrx form.
4356 if (auto Fns = selectArithExtendedRegister(Root&: RHS))
4357 return emitInstr(Opcode: AddrModeAndSizeToOpcode[4][Is32Bit], DstOps: {Dst}, SrcOps: {LHS},
4358 MIRBuilder, RenderFns: Fns);
4359
4360 // INSTRrs form.
4361 if (auto Fns = selectShiftedRegister(Root&: RHS))
4362 return emitInstr(Opcode: AddrModeAndSizeToOpcode[1][Is32Bit], DstOps: {Dst}, SrcOps: {LHS},
4363 MIRBuilder, RenderFns: Fns);
4364 return emitInstr(Opcode: AddrModeAndSizeToOpcode[2][Is32Bit], DstOps: {Dst}, SrcOps: {LHS, RHS},
4365 MIRBuilder);
4366}
4367
4368MachineInstr *
4369AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
4370 MachineOperand &RHS,
4371 MachineIRBuilder &MIRBuilder) const {
4372 const std::array<std::array<unsigned, 2>, 5> OpcTable{
4373 ._M_elems: {{AArch64::ADDXri, AArch64::ADDWri},
4374 {AArch64::ADDXrs, AArch64::ADDWrs},
4375 {AArch64::ADDXrr, AArch64::ADDWrr},
4376 {AArch64::SUBXri, AArch64::SUBWri},
4377 {AArch64::ADDXrx, AArch64::ADDWrx}}};
4378 return emitAddSub(AddrModeAndSizeToOpcode: OpcTable, Dst: DefReg, LHS, RHS, MIRBuilder);
4379}
4380
4381MachineInstr *
4382AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS,
4383 MachineOperand &RHS,
4384 MachineIRBuilder &MIRBuilder) const {
4385 const std::array<std::array<unsigned, 2>, 5> OpcTable{
4386 ._M_elems: {{AArch64::ADDSXri, AArch64::ADDSWri},
4387 {AArch64::ADDSXrs, AArch64::ADDSWrs},
4388 {AArch64::ADDSXrr, AArch64::ADDSWrr},
4389 {AArch64::SUBSXri, AArch64::SUBSWri},
4390 {AArch64::ADDSXrx, AArch64::ADDSWrx}}};
4391 return emitAddSub(AddrModeAndSizeToOpcode: OpcTable, Dst, LHS, RHS, MIRBuilder);
4392}
4393
4394MachineInstr *
4395AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS,
4396 MachineOperand &RHS,
4397 MachineIRBuilder &MIRBuilder) const {
4398 const std::array<std::array<unsigned, 2>, 5> OpcTable{
4399 ._M_elems: {{AArch64::SUBSXri, AArch64::SUBSWri},
4400 {AArch64::SUBSXrs, AArch64::SUBSWrs},
4401 {AArch64::SUBSXrr, AArch64::SUBSWrr},
4402 {AArch64::ADDSXri, AArch64::ADDSWri},
4403 {AArch64::SUBSXrx, AArch64::SUBSWrx}}};
4404 return emitAddSub(AddrModeAndSizeToOpcode: OpcTable, Dst, LHS, RHS, MIRBuilder);
4405}
4406
4407MachineInstr *
4408AArch64InstructionSelector::emitADCS(Register Dst, MachineOperand &LHS,
4409 MachineOperand &RHS,
4410 MachineIRBuilder &MIRBuilder) const {
4411 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4412 MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4413 bool Is32Bit = (MRI->getType(Reg: LHS.getReg()).getSizeInBits() == 32);
4414 static const unsigned OpcTable[2] = {AArch64::ADCSXr, AArch64::ADCSWr};
4415 return emitInstr(Opcode: OpcTable[Is32Bit], DstOps: {Dst}, SrcOps: {LHS, RHS}, MIRBuilder);
4416}
4417
4418MachineInstr *
4419AArch64InstructionSelector::emitSBCS(Register Dst, MachineOperand &LHS,
4420 MachineOperand &RHS,
4421 MachineIRBuilder &MIRBuilder) const {
4422 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4423 MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4424 bool Is32Bit = (MRI->getType(Reg: LHS.getReg()).getSizeInBits() == 32);
4425 static const unsigned OpcTable[2] = {AArch64::SBCSXr, AArch64::SBCSWr};
4426 return emitInstr(Opcode: OpcTable[Is32Bit], DstOps: {Dst}, SrcOps: {LHS, RHS}, MIRBuilder);
4427}
4428
4429MachineInstr *
4430AArch64InstructionSelector::emitCMP(MachineOperand &LHS, MachineOperand &RHS,
4431 MachineIRBuilder &MIRBuilder) const {
4432 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4433 bool Is32Bit = MRI.getType(Reg: LHS.getReg()).getSizeInBits() == 32;
4434 auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
4435 return emitSUBS(Dst: MRI.createVirtualRegister(RegClass: RC), LHS, RHS, MIRBuilder);
4436}
4437
4438MachineInstr *
4439AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
4440 MachineIRBuilder &MIRBuilder) const {
4441 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4442 bool Is32Bit = (MRI.getType(Reg: LHS.getReg()).getSizeInBits() == 32);
4443 auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
4444 return emitADDS(Dst: MRI.createVirtualRegister(RegClass: RC), LHS, RHS, MIRBuilder);
4445}
4446
4447MachineInstr *
4448AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
4449 MachineIRBuilder &MIRBuilder) const {
4450 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4451 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4452 LLT Ty = MRI.getType(Reg: LHS.getReg());
4453 unsigned RegSize = Ty.getSizeInBits();
4454 bool Is32Bit = (RegSize == 32);
4455 const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
4456 {AArch64::ANDSXrs, AArch64::ANDSWrs},
4457 {AArch64::ANDSXrr, AArch64::ANDSWrr}};
4458 // ANDS needs a logical immediate for its immediate form. Check if we can
4459 // fold one in.
4460 if (auto ValAndVReg = getIConstantVRegValWithLookThrough(VReg: RHS.getReg(), MRI)) {
4461 int64_t Imm = ValAndVReg->Value.getSExtValue();
4462
4463 if (AArch64_AM::isLogicalImmediate(imm: Imm, regSize: RegSize)) {
4464 auto TstMI = MIRBuilder.buildInstr(Opc: OpcTable[0][Is32Bit], DstOps: {Ty}, SrcOps: {LHS});
4465 TstMI.addImm(Val: AArch64_AM::encodeLogicalImmediate(imm: Imm, regSize: RegSize));
4466 constrainSelectedInstRegOperands(I&: *TstMI, TII, TRI, RBI);
4467 return &*TstMI;
4468 }
4469 }
4470
4471 if (auto Fns = selectLogicalShiftedRegister(Root&: RHS))
4472 return emitInstr(Opcode: OpcTable[1][Is32Bit], DstOps: {Ty}, SrcOps: {LHS}, MIRBuilder, RenderFns: Fns);
4473 return emitInstr(Opcode: OpcTable[2][Is32Bit], DstOps: {Ty}, SrcOps: {LHS, RHS}, MIRBuilder);
4474}
4475
4476MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
4477 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4478 MachineIRBuilder &MIRBuilder) const {
4479 assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
4480 assert(Predicate.isPredicate() && "Expected predicate?");
4481 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4482 LLT CmpTy = MRI.getType(Reg: LHS.getReg());
4483 assert(!CmpTy.isVector() && "Expected scalar or pointer");
4484 unsigned Size = CmpTy.getSizeInBits();
4485 (void)Size;
4486 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
4487 // Fold the compare into a cmn or tst if possible.
4488 if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
4489 return FoldCmp;
4490 return emitCMP(LHS, RHS, MIRBuilder);
4491}
4492
4493MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
4494 Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const {
4495 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4496#ifndef NDEBUG
4497 LLT Ty = MRI.getType(Dst);
4498 assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
4499 "Expected a 32-bit scalar register?");
4500#endif
4501 const Register ZReg = AArch64::WZR;
4502 AArch64CC::CondCode CC1, CC2;
4503 changeFCMPPredToAArch64CC(P: Pred, CondCode&: CC1, CondCode2&: CC2);
4504 auto InvCC1 = AArch64CC::getInvertedCondCode(Code: CC1);
4505 if (CC2 == AArch64CC::AL)
4506 return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, Pred: InvCC1,
4507 MIRBuilder);
4508 const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
4509 Register Def1Reg = MRI.createVirtualRegister(RegClass: RC);
4510 Register Def2Reg = MRI.createVirtualRegister(RegClass: RC);
4511 auto InvCC2 = AArch64CC::getInvertedCondCode(Code: CC2);
4512 emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, Pred: InvCC1, MIRBuilder);
4513 emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, Pred: InvCC2, MIRBuilder);
4514 auto OrMI = MIRBuilder.buildInstr(Opc: AArch64::ORRWrr, DstOps: {Dst}, SrcOps: {Def1Reg, Def2Reg});
4515 constrainSelectedInstRegOperands(I&: *OrMI, TII, TRI, RBI);
4516 return &*OrMI;
4517}
4518
4519MachineInstr *AArch64InstructionSelector::emitFPCompare(
4520 Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
4521 std::optional<CmpInst::Predicate> Pred) const {
4522 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4523 LLT Ty = MRI.getType(Reg: LHS);
4524 if (Ty.isVector())
4525 return nullptr;
4526 unsigned OpSize = Ty.getSizeInBits();
4527 assert(OpSize == 16 || OpSize == 32 || OpSize == 64);
4528
4529 // If this is a compare against +0.0, then we don't have
4530 // to explicitly materialize a constant.
4531 const ConstantFP *FPImm = getConstantFPVRegVal(VReg: RHS, MRI);
4532 bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
4533
4534 auto IsEqualityPred = [](CmpInst::Predicate P) {
4535 return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE ||
4536 P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE;
4537 };
4538 if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) {
4539 // Try commuting the operands.
4540 const ConstantFP *LHSImm = getConstantFPVRegVal(VReg: LHS, MRI);
4541 if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) {
4542 ShouldUseImm = true;
4543 std::swap(a&: LHS, b&: RHS);
4544 }
4545 }
4546 unsigned CmpOpcTbl[2][3] = {
4547 {AArch64::FCMPHrr, AArch64::FCMPSrr, AArch64::FCMPDrr},
4548 {AArch64::FCMPHri, AArch64::FCMPSri, AArch64::FCMPDri}};
4549 unsigned CmpOpc =
4550 CmpOpcTbl[ShouldUseImm][OpSize == 16 ? 0 : (OpSize == 32 ? 1 : 2)];
4551
4552 // Partially build the compare. Decide if we need to add a use for the
4553 // third operand based off whether or not we're comparing against 0.0.
4554 auto CmpMI = MIRBuilder.buildInstr(Opcode: CmpOpc).addUse(RegNo: LHS);
4555 CmpMI.setMIFlags(MachineInstr::NoFPExcept);
4556 if (!ShouldUseImm)
4557 CmpMI.addUse(RegNo: RHS);
4558 constrainSelectedInstRegOperands(I&: *CmpMI, TII, TRI, RBI);
4559 return &*CmpMI;
4560}
4561
4562MachineInstr *AArch64InstructionSelector::emitVectorConcat(
4563 std::optional<Register> Dst, Register Op1, Register Op2,
4564 MachineIRBuilder &MIRBuilder) const {
4565 // We implement a vector concat by:
4566 // 1. Use scalar_to_vector to insert the lower vector into the larger dest
4567 // 2. Insert the upper vector into the destination's upper element
4568 // TODO: some of this code is common with G_BUILD_VECTOR handling.
4569 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4570
4571 const LLT Op1Ty = MRI.getType(Reg: Op1);
4572 const LLT Op2Ty = MRI.getType(Reg: Op2);
4573
4574 if (Op1Ty != Op2Ty) {
4575 LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
4576 return nullptr;
4577 }
4578 assert(Op1Ty.isVector() && "Expected a vector for vector concat");
4579
4580 if (Op1Ty.getSizeInBits() >= 128) {
4581 LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
4582 return nullptr;
4583 }
4584
4585 // At the moment we just support 64 bit vector concats.
4586 if (Op1Ty.getSizeInBits() != 64) {
4587 LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors");
4588 return nullptr;
4589 }
4590
4591 const LLT ScalarTy = LLT::scalar(SizeInBits: Op1Ty.getSizeInBits());
4592 const RegisterBank &FPRBank = *RBI.getRegBank(Reg: Op1, MRI, TRI);
4593 const TargetRegisterClass *DstRC =
4594 getRegClassForTypeOnBank(Ty: Op1Ty.multiplyElements(Factor: 2), RB: FPRBank);
4595
4596 MachineInstr *WidenedOp1 =
4597 emitScalarToVector(EltSize: ScalarTy.getSizeInBits(), DstRC, Scalar: Op1, MIRBuilder);
4598 MachineInstr *WidenedOp2 =
4599 emitScalarToVector(EltSize: ScalarTy.getSizeInBits(), DstRC, Scalar: Op2, MIRBuilder);
4600 if (!WidenedOp1 || !WidenedOp2) {
4601 LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
4602 return nullptr;
4603 }
4604
4605 // Now do the insert of the upper element.
4606 unsigned InsertOpc, InsSubRegIdx;
4607 std::tie(args&: InsertOpc, args&: InsSubRegIdx) =
4608 getInsertVecEltOpInfo(RB: FPRBank, EltSize: ScalarTy.getSizeInBits());
4609
4610 if (!Dst)
4611 Dst = MRI.createVirtualRegister(RegClass: DstRC);
4612 auto InsElt =
4613 MIRBuilder
4614 .buildInstr(Opc: InsertOpc, DstOps: {*Dst}, SrcOps: {WidenedOp1->getOperand(i: 0).getReg()})
4615 .addImm(Val: 1) /* Lane index */
4616 .addUse(RegNo: WidenedOp2->getOperand(i: 0).getReg())
4617 .addImm(Val: 0);
4618 constrainSelectedInstRegOperands(I&: *InsElt, TII, TRI, RBI);
4619 return &*InsElt;
4620}
4621
4622MachineInstr *
4623AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1,
4624 Register Src2, AArch64CC::CondCode Pred,
4625 MachineIRBuilder &MIRBuilder) const {
4626 auto &MRI = *MIRBuilder.getMRI();
4627 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg: Dst);
4628 // If we used a register class, then this won't necessarily have an LLT.
4629 // Compute the size based off whether or not we have a class or bank.
4630 unsigned Size;
4631 if (const auto *RC = dyn_cast<const TargetRegisterClass *>(Val: RegClassOrBank))
4632 Size = TRI.getRegSizeInBits(RC: *RC);
4633 else
4634 Size = MRI.getType(Reg: Dst).getSizeInBits();
4635 // Some opcodes use s1.
4636 assert(Size <= 64 && "Expected 64 bits or less only!");
4637 static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr};
4638 unsigned Opc = OpcTable[Size == 64];
4639 auto CSINC = MIRBuilder.buildInstr(Opc, DstOps: {Dst}, SrcOps: {Src1, Src2}).addImm(Val: Pred);
4640 constrainSelectedInstRegOperands(I&: *CSINC, TII, TRI, RBI);
4641 return &*CSINC;
4642}
4643
4644MachineInstr *AArch64InstructionSelector::emitCarryIn(MachineInstr &I,
4645 Register CarryReg) {
4646 MachineRegisterInfo *MRI = MIB.getMRI();
4647 unsigned Opcode = I.getOpcode();
4648
4649 // If the instruction is a SUB, we need to negate the carry,
4650 // because borrowing is indicated by carry-flag == 0.
4651 bool NeedsNegatedCarry =
4652 (Opcode == TargetOpcode::G_USUBE || Opcode == TargetOpcode::G_SSUBE);
4653
4654 // If the previous instruction will already produce the correct carry, do not
4655 // emit a carry generating instruction. E.g. for G_UADDE/G_USUBE sequences
4656 // generated during legalization of wide add/sub. This optimization depends on
4657 // these sequences not being interrupted by other instructions.
4658 // We have to select the previous instruction before the carry-using
4659 // instruction is deleted by the calling function, otherwise the previous
4660 // instruction might become dead and would get deleted.
4661 MachineInstr *SrcMI = MRI->getVRegDef(Reg: CarryReg);
4662 if (SrcMI == I.getPrevNode()) {
4663 if (auto *CarrySrcMI = dyn_cast<GAddSubCarryOut>(Val: SrcMI)) {
4664 bool ProducesNegatedCarry = CarrySrcMI->isSub();
4665 if (NeedsNegatedCarry == ProducesNegatedCarry &&
4666 CarrySrcMI->isUnsigned() &&
4667 CarrySrcMI->getCarryOutReg() == CarryReg &&
4668 selectAndRestoreState(I&: *SrcMI))
4669 return nullptr;
4670 }
4671 }
4672
4673 Register DeadReg = MRI->createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
4674
4675 if (NeedsNegatedCarry) {
4676 // (0 - Carry) sets !C in NZCV when Carry == 1
4677 Register ZReg = AArch64::WZR;
4678 return emitInstr(Opcode: AArch64::SUBSWrr, DstOps: {DeadReg}, SrcOps: {ZReg, CarryReg}, MIRBuilder&: MIB);
4679 }
4680
4681 // (Carry - 1) sets !C in NZCV when Carry == 0
4682 auto Fns = select12BitValueWithLeftShift(Immed: 1);
4683 return emitInstr(Opcode: AArch64::SUBSWri, DstOps: {DeadReg}, SrcOps: {CarryReg}, MIRBuilder&: MIB, RenderFns: Fns);
4684}
4685
4686bool AArch64InstructionSelector::selectOverflowOp(MachineInstr &I,
4687 MachineRegisterInfo &MRI) {
4688 auto &CarryMI = cast<GAddSubCarryOut>(Val&: I);
4689
4690 if (auto *CarryInMI = dyn_cast<GAddSubCarryInOut>(Val: &I)) {
4691 // Set NZCV carry according to carry-in VReg
4692 emitCarryIn(I, CarryReg: CarryInMI->getCarryInReg());
4693 }
4694
4695 // Emit the operation and get the correct condition code.
4696 auto OpAndCC = emitOverflowOp(Opcode: I.getOpcode(), Dst: CarryMI.getDstReg(),
4697 LHS&: CarryMI.getLHS(), RHS&: CarryMI.getRHS(), MIRBuilder&: MIB);
4698
4699 Register CarryOutReg = CarryMI.getCarryOutReg();
4700
4701 // Don't convert carry-out to VReg if it is never used
4702 if (!MRI.use_nodbg_empty(RegNo: CarryOutReg)) {
4703 // Now, put the overflow result in the register given by the first operand
4704 // to the overflow op. CSINC increments the result when the predicate is
4705 // false, so to get the increment when it's true, we need to use the
4706 // inverse. In this case, we want to increment when carry is set.
4707 Register ZReg = AArch64::WZR;
4708 emitCSINC(/*Dst=*/CarryOutReg, /*Src1=*/ZReg, /*Src2=*/ZReg,
4709 Pred: getInvertedCondCode(Code: OpAndCC.second), MIRBuilder&: MIB);
4710 }
4711
4712 I.eraseFromParent();
4713 return true;
4714}
4715
4716std::pair<MachineInstr *, AArch64CC::CondCode>
4717AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
4718 MachineOperand &LHS,
4719 MachineOperand &RHS,
4720 MachineIRBuilder &MIRBuilder) const {
4721 switch (Opcode) {
4722 default:
4723 llvm_unreachable("Unexpected opcode!");
4724 case TargetOpcode::G_SADDO:
4725 return std::make_pair(x: emitADDS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::VS);
4726 case TargetOpcode::G_UADDO:
4727 return std::make_pair(x: emitADDS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::HS);
4728 case TargetOpcode::G_SSUBO:
4729 return std::make_pair(x: emitSUBS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::VS);
4730 case TargetOpcode::G_USUBO:
4731 return std::make_pair(x: emitSUBS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::LO);
4732 case TargetOpcode::G_SADDE:
4733 return std::make_pair(x: emitADCS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::VS);
4734 case TargetOpcode::G_UADDE:
4735 return std::make_pair(x: emitADCS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::HS);
4736 case TargetOpcode::G_SSUBE:
4737 return std::make_pair(x: emitSBCS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::VS);
4738 case TargetOpcode::G_USUBE:
4739 return std::make_pair(x: emitSBCS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::LO);
4740 }
4741}
4742
4743/// Returns true if @p Val is a tree of AND/OR/CMP operations that can be
4744/// expressed as a conjunction.
4745/// \param CanNegate Set to true if we can negate the whole sub-tree just by
4746/// changing the conditions on the CMP tests.
4747/// (this means we can call emitConjunctionRec() with
4748/// Negate==true on this sub-tree)
4749/// \param MustBeFirst Set to true if this subtree needs to be negated and we
4750/// cannot do the negation naturally. We are required to
4751/// emit the subtree first in this case.
4752/// \param WillNegate Is true if are called when the result of this
4753/// subexpression must be negated. This happens when the
4754/// outer expression is an OR. We can use this fact to know
4755/// that we have a double negation (or (or ...) ...) that
4756/// can be implemented for free.
4757static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst,
4758 bool WillNegate, MachineRegisterInfo &MRI,
4759 unsigned Depth = 0) {
4760 if (!MRI.hasOneNonDBGUse(RegNo: Val))
4761 return false;
4762 MachineInstr *ValDef = MRI.getVRegDef(Reg: Val);
4763 unsigned Opcode = ValDef->getOpcode();
4764 if (isa<GAnyCmp>(Val: ValDef)) {
4765 CanNegate = true;
4766 MustBeFirst = false;
4767 return true;
4768 }
4769 // Protect against exponential runtime and stack overflow.
4770 if (Depth > 6)
4771 return false;
4772 if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) {
4773 bool IsOR = Opcode == TargetOpcode::G_OR;
4774 Register O0 = ValDef->getOperand(i: 1).getReg();
4775 Register O1 = ValDef->getOperand(i: 2).getReg();
4776 bool CanNegateL;
4777 bool MustBeFirstL;
4778 if (!canEmitConjunction(Val: O0, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR, MRI, Depth: Depth + 1))
4779 return false;
4780 bool CanNegateR;
4781 bool MustBeFirstR;
4782 if (!canEmitConjunction(Val: O1, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR, MRI, Depth: Depth + 1))
4783 return false;
4784
4785 if (MustBeFirstL && MustBeFirstR)
4786 return false;
4787
4788 if (IsOR) {
4789 // For an OR expression we need to be able to naturally negate at least
4790 // one side or we cannot do the transformation at all.
4791 if (!CanNegateL && !CanNegateR)
4792 return false;
4793 // If we the result of the OR will be negated and we can naturally negate
4794 // the leaves, then this sub-tree as a whole negates naturally.
4795 CanNegate = WillNegate && CanNegateL && CanNegateR;
4796 // If we cannot naturally negate the whole sub-tree, then this must be
4797 // emitted first.
4798 MustBeFirst = !CanNegate;
4799 } else {
4800 assert(Opcode == TargetOpcode::G_AND && "Must be G_AND");
4801 // We cannot naturally negate an AND operation.
4802 CanNegate = false;
4803 MustBeFirst = MustBeFirstL || MustBeFirstR;
4804 }
4805 return true;
4806 }
4807 return false;
4808}
4809
4810MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
4811 Register LHS, Register RHS, CmpInst::Predicate CC,
4812 AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC,
4813 MachineIRBuilder &MIB) const {
4814 auto &MRI = *MIB.getMRI();
4815 LLT OpTy = MRI.getType(Reg: LHS);
4816 unsigned CCmpOpc;
4817 std::optional<ValueAndVReg> C;
4818 if (CmpInst::isIntPredicate(P: CC)) {
4819 assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
4820 C = getIConstantVRegValWithLookThrough(VReg: RHS, MRI);
4821 if (!C || C->Value.sgt(RHS: 31) || C->Value.slt(RHS: -31))
4822 CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
4823 else if (C->Value.ule(RHS: 31))
4824 CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi;
4825 else
4826 CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMNWi : AArch64::CCMNXi;
4827 } else {
4828 assert(OpTy.getSizeInBits() == 16 || OpTy.getSizeInBits() == 32 ||
4829 OpTy.getSizeInBits() == 64);
4830 switch (OpTy.getSizeInBits()) {
4831 case 16:
4832 assert(STI.hasFullFP16() && "Expected Full FP16 for fp16 comparisons");
4833 CCmpOpc = AArch64::FCCMPHrr;
4834 break;
4835 case 32:
4836 CCmpOpc = AArch64::FCCMPSrr;
4837 break;
4838 case 64:
4839 CCmpOpc = AArch64::FCCMPDrr;
4840 break;
4841 default:
4842 return nullptr;
4843 }
4844 }
4845 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(Code: OutCC);
4846 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: InvOutCC);
4847 auto CCmp =
4848 MIB.buildInstr(Opc: CCmpOpc, DstOps: {}, SrcOps: {LHS});
4849 if (CCmpOpc == AArch64::CCMPWi || CCmpOpc == AArch64::CCMPXi)
4850 CCmp.addImm(Val: C->Value.getZExtValue());
4851 else if (CCmpOpc == AArch64::CCMNWi || CCmpOpc == AArch64::CCMNXi)
4852 CCmp.addImm(Val: C->Value.abs().getZExtValue());
4853 else
4854 CCmp.addReg(RegNo: RHS);
4855 CCmp.addImm(Val: NZCV).addImm(Val: Predicate);
4856 constrainSelectedInstRegOperands(I&: *CCmp, TII, TRI, RBI);
4857 return &*CCmp;
4858}
4859
4860MachineInstr *AArch64InstructionSelector::emitConjunctionRec(
4861 Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp,
4862 AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const {
4863 // We're at a tree leaf, produce a conditional comparison operation.
4864 auto &MRI = *MIB.getMRI();
4865 MachineInstr *ValDef = MRI.getVRegDef(Reg: Val);
4866 unsigned Opcode = ValDef->getOpcode();
4867 if (auto *Cmp = dyn_cast<GAnyCmp>(Val: ValDef)) {
4868 Register LHS = Cmp->getLHSReg();
4869 Register RHS = Cmp->getRHSReg();
4870 CmpInst::Predicate CC = Cmp->getCond();
4871 if (Negate)
4872 CC = CmpInst::getInversePredicate(pred: CC);
4873 if (isa<GICmp>(Val: Cmp)) {
4874 OutCC = changeICMPPredToAArch64CC(P: CC, RHS, MRI: MIB.getMRI());
4875 } else {
4876 // Handle special FP cases.
4877 AArch64CC::CondCode ExtraCC;
4878 changeFPCCToANDAArch64CC(CC, CondCode&: OutCC, CondCode2&: ExtraCC);
4879 // Some floating point conditions can't be tested with a single condition
4880 // code. Construct an additional comparison in this case.
4881 if (ExtraCC != AArch64CC::AL) {
4882 MachineInstr *ExtraCmp;
4883 if (!CCOp)
4884 ExtraCmp = emitFPCompare(LHS, RHS, MIRBuilder&: MIB, Pred: CC);
4885 else
4886 ExtraCmp =
4887 emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC: ExtraCC, MIB);
4888 CCOp = ExtraCmp->getOperand(i: 0).getReg();
4889 Predicate = ExtraCC;
4890 }
4891 }
4892
4893 // Produce a normal comparison if we are first in the chain
4894 if (!CCOp) {
4895 if (isa<GICmp>(Val: Cmp))
4896 return emitCMP(LHS&: Cmp->getOperand(i: 2), RHS&: Cmp->getOperand(i: 3), MIRBuilder&: MIB);
4897 return emitFPCompare(LHS: Cmp->getOperand(i: 2).getReg(),
4898 RHS: Cmp->getOperand(i: 3).getReg(), MIRBuilder&: MIB);
4899 }
4900 // Otherwise produce a ccmp.
4901 return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB);
4902 }
4903 assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree");
4904
4905 bool IsOR = Opcode == TargetOpcode::G_OR;
4906
4907 Register LHS = ValDef->getOperand(i: 1).getReg();
4908 bool CanNegateL;
4909 bool MustBeFirstL;
4910 bool ValidL = canEmitConjunction(Val: LHS, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR, MRI);
4911 assert(ValidL && "Valid conjunction/disjunction tree");
4912 (void)ValidL;
4913
4914 Register RHS = ValDef->getOperand(i: 2).getReg();
4915 bool CanNegateR;
4916 bool MustBeFirstR;
4917 bool ValidR = canEmitConjunction(Val: RHS, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR, MRI);
4918 assert(ValidR && "Valid conjunction/disjunction tree");
4919 (void)ValidR;
4920
4921 // Swap sub-tree that must come first to the right side.
4922 if (MustBeFirstL) {
4923 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
4924 std::swap(a&: LHS, b&: RHS);
4925 std::swap(a&: CanNegateL, b&: CanNegateR);
4926 std::swap(a&: MustBeFirstL, b&: MustBeFirstR);
4927 }
4928
4929 bool NegateR;
4930 bool NegateAfterR;
4931 bool NegateL;
4932 bool NegateAfterAll;
4933 if (Opcode == TargetOpcode::G_OR) {
4934 // Swap the sub-tree that we can negate naturally to the left.
4935 if (!CanNegateL) {
4936 assert(CanNegateR && "at least one side must be negatable");
4937 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
4938 assert(!Negate);
4939 std::swap(a&: LHS, b&: RHS);
4940 NegateR = false;
4941 NegateAfterR = true;
4942 } else {
4943 // Negate the left sub-tree if possible, otherwise negate the result.
4944 NegateR = CanNegateR;
4945 NegateAfterR = !CanNegateR;
4946 }
4947 NegateL = true;
4948 NegateAfterAll = !Negate;
4949 } else {
4950 assert(Opcode == TargetOpcode::G_AND &&
4951 "Valid conjunction/disjunction tree");
4952 assert(!Negate && "Valid conjunction/disjunction tree");
4953
4954 NegateL = false;
4955 NegateR = false;
4956 NegateAfterR = false;
4957 NegateAfterAll = false;
4958 }
4959
4960 // Emit sub-trees.
4961 AArch64CC::CondCode RHSCC;
4962 MachineInstr *CmpR =
4963 emitConjunctionRec(Val: RHS, OutCC&: RHSCC, Negate: NegateR, CCOp, Predicate, MIB);
4964 if (NegateAfterR)
4965 RHSCC = AArch64CC::getInvertedCondCode(Code: RHSCC);
4966 MachineInstr *CmpL = emitConjunctionRec(
4967 Val: LHS, OutCC, Negate: NegateL, CCOp: CmpR->getOperand(i: 0).getReg(), Predicate: RHSCC, MIB);
4968 if (NegateAfterAll)
4969 OutCC = AArch64CC::getInvertedCondCode(Code: OutCC);
4970 return CmpL;
4971}
4972
4973MachineInstr *AArch64InstructionSelector::emitConjunction(
4974 Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const {
4975 bool DummyCanNegate;
4976 bool DummyMustBeFirst;
4977 if (!canEmitConjunction(Val, CanNegate&: DummyCanNegate, MustBeFirst&: DummyMustBeFirst, WillNegate: false,
4978 MRI&: *MIB.getMRI()))
4979 return nullptr;
4980 return emitConjunctionRec(Val, OutCC, Negate: false, CCOp: Register(), Predicate: AArch64CC::AL, MIB);
4981}
4982
4983bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI,
4984 MachineInstr &CondMI) {
4985 AArch64CC::CondCode AArch64CC;
4986 MachineInstr *ConjMI = emitConjunction(Val: SelI.getCondReg(), OutCC&: AArch64CC, MIB);
4987 if (!ConjMI)
4988 return false;
4989
4990 emitSelect(Dst: SelI.getReg(Idx: 0), True: SelI.getTrueReg(), False: SelI.getFalseReg(), CC: AArch64CC, MIB);
4991 SelI.eraseFromParent();
4992 return true;
4993}
4994
4995bool AArch64InstructionSelector::tryOptSelect(GSelect &I) {
4996 MachineRegisterInfo &MRI = *MIB.getMRI();
4997 // We want to recognize this pattern:
4998 //
4999 // $z = G_FCMP pred, $x, $y
5000 // ...
5001 // $w = G_SELECT $z, $a, $b
5002 //
5003 // Where the value of $z is *only* ever used by the G_SELECT (possibly with
5004 // some copies/truncs in between.)
5005 //
5006 // If we see this, then we can emit something like this:
5007 //
5008 // fcmp $x, $y
5009 // fcsel $w, $a, $b, pred
5010 //
5011 // Rather than emitting both of the rather long sequences in the standard
5012 // G_FCMP/G_SELECT select methods.
5013
5014 // First, check if the condition is defined by a compare.
5015 MachineInstr *CondDef = MRI.getVRegDef(Reg: I.getOperand(i: 1).getReg());
5016
5017 // We can only fold if all of the defs have one use.
5018 Register CondDefReg = CondDef->getOperand(i: 0).getReg();
5019 if (!MRI.hasOneNonDBGUse(RegNo: CondDefReg)) {
5020 // Unless it's another select.
5021 for (const MachineInstr &UI : MRI.use_nodbg_instructions(Reg: CondDefReg)) {
5022 if (CondDef == &UI)
5023 continue;
5024 if (UI.getOpcode() != TargetOpcode::G_SELECT)
5025 return false;
5026 }
5027 }
5028
5029 // Is the condition defined by a compare?
5030 unsigned CondOpc = CondDef->getOpcode();
5031 if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) {
5032 if (tryOptSelectConjunction(SelI&: I, CondMI&: *CondDef))
5033 return true;
5034 return false;
5035 }
5036
5037 AArch64CC::CondCode CondCode;
5038 if (CondOpc == TargetOpcode::G_ICMP) {
5039 auto &PredOp = CondDef->getOperand(i: 1);
5040 emitIntegerCompare(LHS&: CondDef->getOperand(i: 2), RHS&: CondDef->getOperand(i: 3), Predicate&: PredOp,
5041 MIRBuilder&: MIB);
5042 auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
5043 CondCode =
5044 changeICMPPredToAArch64CC(P: Pred, RHS: CondDef->getOperand(i: 3).getReg(), MRI: &MRI);
5045 } else {
5046 // Get the condition code for the select.
5047 auto Pred =
5048 static_cast<CmpInst::Predicate>(CondDef->getOperand(i: 1).getPredicate());
5049 AArch64CC::CondCode CondCode2;
5050 changeFCMPPredToAArch64CC(P: Pred, CondCode, CondCode2);
5051
5052 // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two
5053 // instructions to emit the comparison.
5054 // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
5055 // unnecessary.
5056 if (CondCode2 != AArch64CC::AL)
5057 return false;
5058
5059 if (!emitFPCompare(LHS: CondDef->getOperand(i: 2).getReg(),
5060 RHS: CondDef->getOperand(i: 3).getReg(), MIRBuilder&: MIB)) {
5061 LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
5062 return false;
5063 }
5064 }
5065
5066 // Emit the select.
5067 emitSelect(Dst: I.getOperand(i: 0).getReg(), True: I.getOperand(i: 2).getReg(),
5068 False: I.getOperand(i: 3).getReg(), CC: CondCode, MIB);
5069 I.eraseFromParent();
5070 return true;
5071}
5072
5073MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
5074 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
5075 MachineIRBuilder &MIRBuilder) const {
5076 assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
5077 "Unexpected MachineOperand");
5078 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5079 // We want to find this sort of thing:
5080 // x = G_SUB 0, y
5081 // G_ICMP z, x
5082 //
5083 // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
5084 // e.g:
5085 //
5086 // cmn z, y
5087
5088 // Check if the RHS or LHS of the G_ICMP is defined by a SUB
5089 MachineInstr *LHSDef = getDefIgnoringCopies(Reg: LHS.getReg(), MRI);
5090 MachineInstr *RHSDef = getDefIgnoringCopies(Reg: RHS.getReg(), MRI);
5091 auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
5092
5093 // Given this:
5094 //
5095 // x = G_SUB 0, y
5096 // G_ICMP z, x
5097 //
5098 // Produce this:
5099 //
5100 // cmn z, y
5101 if (isCMN(MaybeSub: RHSDef, Pred: P, MRI))
5102 return emitCMN(LHS, RHS&: RHSDef->getOperand(i: 2), MIRBuilder);
5103
5104 // Same idea here, but with the LHS of the compare instead:
5105 //
5106 // Given this:
5107 //
5108 // x = G_SUB 0, y
5109 // G_ICMP x, z
5110 //
5111 // Produce this:
5112 //
5113 // cmn y, z
5114 //
5115 // But be careful! We need to swap the predicate!
5116 if (isCMN(MaybeSub: LHSDef, Pred: P, MRI)) {
5117 if (!CmpInst::isEquality(pred: P)) {
5118 P = CmpInst::getSwappedPredicate(pred: P);
5119 Predicate = MachineOperand::CreatePredicate(Pred: P);
5120 }
5121 return emitCMN(LHS&: LHSDef->getOperand(i: 2), RHS, MIRBuilder);
5122 }
5123
5124 // Given this:
5125 //
5126 // z = G_AND x, y
5127 // G_ICMP z, 0
5128 //
5129 // Produce this if the compare is signed:
5130 //
5131 // tst x, y
5132 if (!CmpInst::isUnsigned(Pred: P) && LHSDef &&
5133 LHSDef->getOpcode() == TargetOpcode::G_AND) {
5134 // Make sure that the RHS is 0.
5135 auto ValAndVReg = getIConstantVRegValWithLookThrough(VReg: RHS.getReg(), MRI);
5136 if (!ValAndVReg || ValAndVReg->Value != 0)
5137 return nullptr;
5138
5139 return emitTST(LHS&: LHSDef->getOperand(i: 1),
5140 RHS&: LHSDef->getOperand(i: 2), MIRBuilder);
5141 }
5142
5143 return nullptr;
5144}
5145
5146bool AArch64InstructionSelector::selectShuffleVector(
5147 MachineInstr &I, MachineRegisterInfo &MRI) {
5148 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
5149 Register Src1Reg = I.getOperand(i: 1).getReg();
5150 Register Src2Reg = I.getOperand(i: 2).getReg();
5151 ArrayRef<int> Mask = I.getOperand(i: 3).getShuffleMask();
5152
5153 MachineBasicBlock &MBB = *I.getParent();
5154 MachineFunction &MF = *MBB.getParent();
5155 LLVMContext &Ctx = MF.getFunction().getContext();
5156
5157 unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
5158
5159 SmallVector<Constant *, 64> CstIdxs;
5160 for (int Val : Mask) {
5161 // For now, any undef indexes we'll just assume to be 0. This should be
5162 // optimized in future, e.g. to select DUP etc.
5163 Val = Val < 0 ? 0 : Val;
5164 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
5165 unsigned Offset = Byte + Val * BytesPerElt;
5166 CstIdxs.emplace_back(Args: ConstantInt::get(Ty: Type::getInt8Ty(C&: Ctx), V: Offset));
5167 }
5168 }
5169
5170 // Use a constant pool to load the index vector for TBL.
5171 Constant *CPVal = ConstantVector::get(V: CstIdxs);
5172 MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder&: MIB);
5173 if (!IndexLoad) {
5174 LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
5175 return false;
5176 }
5177
5178 if (DstTy.getSizeInBits() != 128) {
5179 assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
5180 // This case can be done with TBL1.
5181 MachineInstr *Concat =
5182 emitVectorConcat(Dst: std::nullopt, Op1: Src1Reg, Op2: Src2Reg, MIRBuilder&: MIB);
5183 if (!Concat) {
5184 LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
5185 return false;
5186 }
5187
5188 // The constant pool load will be 64 bits, so need to convert to FPR128 reg.
5189 IndexLoad = emitScalarToVector(EltSize: 64, DstRC: &AArch64::FPR128RegClass,
5190 Scalar: IndexLoad->getOperand(i: 0).getReg(), MIRBuilder&: MIB);
5191
5192 auto TBL1 = MIB.buildInstr(
5193 Opc: AArch64::TBLv16i8One, DstOps: {&AArch64::FPR128RegClass},
5194 SrcOps: {Concat->getOperand(i: 0).getReg(), IndexLoad->getOperand(i: 0).getReg()});
5195 constrainSelectedInstRegOperands(I&: *TBL1, TII, TRI, RBI);
5196
5197 auto Copy =
5198 MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {I.getOperand(i: 0).getReg()}, SrcOps: {})
5199 .addReg(RegNo: TBL1.getReg(Idx: 0), Flags: {}, SubReg: AArch64::dsub);
5200 RBI.constrainGenericRegister(Reg: Copy.getReg(Idx: 0), RC: AArch64::FPR64RegClass, MRI);
5201 I.eraseFromParent();
5202 return true;
5203 }
5204
5205 // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
5206 // Q registers for regalloc.
5207 SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg};
5208 auto RegSeq = createQTuple(Regs, MIB);
5209 auto TBL2 = MIB.buildInstr(Opc: AArch64::TBLv16i8Two, DstOps: {I.getOperand(i: 0)},
5210 SrcOps: {RegSeq, IndexLoad->getOperand(i: 0)});
5211 constrainSelectedInstRegOperands(I&: *TBL2, TII, TRI, RBI);
5212 I.eraseFromParent();
5213 return true;
5214}
5215
5216MachineInstr *AArch64InstructionSelector::emitLaneInsert(
5217 std::optional<Register> DstReg, Register SrcReg, Register EltReg,
5218 unsigned LaneIdx, const RegisterBank &RB,
5219 MachineIRBuilder &MIRBuilder) const {
5220 MachineInstr *InsElt = nullptr;
5221 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5222 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5223
5224 // Create a register to define with the insert if one wasn't passed in.
5225 if (!DstReg)
5226 DstReg = MRI.createVirtualRegister(RegClass: DstRC);
5227
5228 unsigned EltSize = MRI.getType(Reg: EltReg).getSizeInBits();
5229 unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;
5230
5231 if (RB.getID() == AArch64::FPRRegBankID) {
5232 auto InsSub = emitScalarToVector(EltSize, DstRC, Scalar: EltReg, MIRBuilder);
5233 InsElt = MIRBuilder.buildInstr(Opc, DstOps: {*DstReg}, SrcOps: {SrcReg})
5234 .addImm(Val: LaneIdx)
5235 .addUse(RegNo: InsSub->getOperand(i: 0).getReg())
5236 .addImm(Val: 0);
5237 } else {
5238 InsElt = MIRBuilder.buildInstr(Opc, DstOps: {*DstReg}, SrcOps: {SrcReg})
5239 .addImm(Val: LaneIdx)
5240 .addUse(RegNo: EltReg);
5241 }
5242
5243 constrainSelectedInstRegOperands(I&: *InsElt, TII, TRI, RBI);
5244 return InsElt;
5245}
5246
5247bool AArch64InstructionSelector::selectUSMovFromExtend(
5248 MachineInstr &MI, MachineRegisterInfo &MRI) {
5249 if (MI.getOpcode() != TargetOpcode::G_SEXT &&
5250 MI.getOpcode() != TargetOpcode::G_ZEXT &&
5251 MI.getOpcode() != TargetOpcode::G_ANYEXT)
5252 return false;
5253 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT;
5254 const Register DefReg = MI.getOperand(i: 0).getReg();
5255 const LLT DstTy = MRI.getType(Reg: DefReg);
5256 unsigned DstSize = DstTy.getSizeInBits();
5257
5258 if (DstSize != 32 && DstSize != 64)
5259 return false;
5260
5261 MachineInstr *Extract = getOpcodeDef(Opcode: TargetOpcode::G_EXTRACT_VECTOR_ELT,
5262 Reg: MI.getOperand(i: 1).getReg(), MRI);
5263 int64_t Lane;
5264 if (!Extract || !mi_match(R: Extract->getOperand(i: 2).getReg(), MRI, P: m_ICst(Cst&: Lane)))
5265 return false;
5266 Register Src0 = Extract->getOperand(i: 1).getReg();
5267
5268 const LLT VecTy = MRI.getType(Reg: Src0);
5269 if (VecTy.isScalableVector())
5270 return false;
5271
5272 if (VecTy.getSizeInBits() != 128) {
5273 const MachineInstr *ScalarToVector = emitScalarToVector(
5274 EltSize: VecTy.getSizeInBits(), DstRC: &AArch64::FPR128RegClass, Scalar: Src0, MIRBuilder&: MIB);
5275 assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!");
5276 Src0 = ScalarToVector->getOperand(i: 0).getReg();
5277 }
5278
5279 unsigned Opcode;
5280 if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32)
5281 Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32;
5282 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16)
5283 Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16;
5284 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8)
5285 Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8;
5286 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16)
5287 Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16;
5288 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8)
5289 Opcode = IsSigned ? AArch64::SMOVvi8to32 : AArch64::UMOVvi8;
5290 else
5291 llvm_unreachable("Unexpected type combo for S/UMov!");
5292
5293 // We may need to generate one of these, depending on the type and sign of the
5294 // input:
5295 // DstReg = SMOV Src0, Lane;
5296 // NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32;
5297 MachineInstr *ExtI = nullptr;
5298 if (DstSize == 64 && !IsSigned) {
5299 Register NewReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
5300 MIB.buildInstr(Opc: Opcode, DstOps: {NewReg}, SrcOps: {Src0}).addImm(Val: Lane);
5301 ExtI = MIB.buildInstr(Opc: AArch64::SUBREG_TO_REG, DstOps: {DefReg}, SrcOps: {})
5302 .addUse(RegNo: NewReg)
5303 .addImm(Val: AArch64::sub_32);
5304 RBI.constrainGenericRegister(Reg: DefReg, RC: AArch64::GPR64RegClass, MRI);
5305 } else
5306 ExtI = MIB.buildInstr(Opc: Opcode, DstOps: {DefReg}, SrcOps: {Src0}).addImm(Val: Lane);
5307
5308 constrainSelectedInstRegOperands(I&: *ExtI, TII, TRI, RBI);
5309 MI.eraseFromParent();
5310 return true;
5311}
5312
5313MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm8(
5314 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5315 unsigned int Op;
5316 if (DstSize == 128) {
5317 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5318 return nullptr;
5319 Op = AArch64::MOVIv16b_ns;
5320 } else {
5321 Op = AArch64::MOVIv8b_ns;
5322 }
5323
5324 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5325
5326 if (AArch64_AM::isAdvSIMDModImmType9(Imm: Val)) {
5327 Val = AArch64_AM::encodeAdvSIMDModImmType9(Imm: Val);
5328 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val);
5329 constrainSelectedInstRegOperands(I&: *Mov, TII, TRI, RBI);
5330 return &*Mov;
5331 }
5332 return nullptr;
5333}
5334
5335MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm16(
5336 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5337 bool Inv) {
5338
5339 unsigned int Op;
5340 if (DstSize == 128) {
5341 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5342 return nullptr;
5343 Op = Inv ? AArch64::MVNIv8i16 : AArch64::MOVIv8i16;
5344 } else {
5345 Op = Inv ? AArch64::MVNIv4i16 : AArch64::MOVIv4i16;
5346 }
5347
5348 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5349 uint64_t Shift;
5350
5351 if (AArch64_AM::isAdvSIMDModImmType5(Imm: Val)) {
5352 Val = AArch64_AM::encodeAdvSIMDModImmType5(Imm: Val);
5353 Shift = 0;
5354 } else if (AArch64_AM::isAdvSIMDModImmType6(Imm: Val)) {
5355 Val = AArch64_AM::encodeAdvSIMDModImmType6(Imm: Val);
5356 Shift = 8;
5357 } else
5358 return nullptr;
5359
5360 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val).addImm(Val: Shift);
5361 constrainSelectedInstRegOperands(I&: *Mov, TII, TRI, RBI);
5362 return &*Mov;
5363}
5364
5365MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm32(
5366 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5367 bool Inv) {
5368
5369 unsigned int Op;
5370 if (DstSize == 128) {
5371 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5372 return nullptr;
5373 Op = Inv ? AArch64::MVNIv4i32 : AArch64::MOVIv4i32;
5374 } else {
5375 Op = Inv ? AArch64::MVNIv2i32 : AArch64::MOVIv2i32;
5376 }
5377
5378 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5379 uint64_t Shift;
5380
5381 if ((AArch64_AM::isAdvSIMDModImmType1(Imm: Val))) {
5382 Val = AArch64_AM::encodeAdvSIMDModImmType1(Imm: Val);
5383 Shift = 0;
5384 } else if ((AArch64_AM::isAdvSIMDModImmType2(Imm: Val))) {
5385 Val = AArch64_AM::encodeAdvSIMDModImmType2(Imm: Val);
5386 Shift = 8;
5387 } else if ((AArch64_AM::isAdvSIMDModImmType3(Imm: Val))) {
5388 Val = AArch64_AM::encodeAdvSIMDModImmType3(Imm: Val);
5389 Shift = 16;
5390 } else if ((AArch64_AM::isAdvSIMDModImmType4(Imm: Val))) {
5391 Val = AArch64_AM::encodeAdvSIMDModImmType4(Imm: Val);
5392 Shift = 24;
5393 } else
5394 return nullptr;
5395
5396 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val).addImm(Val: Shift);
5397 constrainSelectedInstRegOperands(I&: *Mov, TII, TRI, RBI);
5398 return &*Mov;
5399}
5400
5401MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm64(
5402 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5403
5404 unsigned int Op;
5405 if (DstSize == 128) {
5406 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5407 return nullptr;
5408 Op = AArch64::MOVIv2d_ns;
5409 } else {
5410 Op = AArch64::MOVID;
5411 }
5412
5413 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5414 if (AArch64_AM::isAdvSIMDModImmType10(Imm: Val)) {
5415 Val = AArch64_AM::encodeAdvSIMDModImmType10(Imm: Val);
5416 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val);
5417 constrainSelectedInstRegOperands(I&: *Mov, TII, TRI, RBI);
5418 return &*Mov;
5419 }
5420 return nullptr;
5421}
5422
5423MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm321s(
5424 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5425 bool Inv) {
5426
5427 unsigned int Op;
5428 if (DstSize == 128) {
5429 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5430 return nullptr;
5431 Op = Inv ? AArch64::MVNIv4s_msl : AArch64::MOVIv4s_msl;
5432 } else {
5433 Op = Inv ? AArch64::MVNIv2s_msl : AArch64::MOVIv2s_msl;
5434 }
5435
5436 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5437 uint64_t Shift;
5438
5439 if (AArch64_AM::isAdvSIMDModImmType7(Imm: Val)) {
5440 Val = AArch64_AM::encodeAdvSIMDModImmType7(Imm: Val);
5441 Shift = 264;
5442 } else if (AArch64_AM::isAdvSIMDModImmType8(Imm: Val)) {
5443 Val = AArch64_AM::encodeAdvSIMDModImmType8(Imm: Val);
5444 Shift = 272;
5445 } else
5446 return nullptr;
5447
5448 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val).addImm(Val: Shift);
5449 constrainSelectedInstRegOperands(I&: *Mov, TII, TRI, RBI);
5450 return &*Mov;
5451}
5452
5453MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImmFP(
5454 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5455
5456 unsigned int Op;
5457 bool IsWide = false;
5458 if (DstSize == 128) {
5459 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5460 return nullptr;
5461 Op = AArch64::FMOVv4f32_ns;
5462 IsWide = true;
5463 } else {
5464 Op = AArch64::FMOVv2f32_ns;
5465 }
5466
5467 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5468
5469 if (AArch64_AM::isAdvSIMDModImmType11(Imm: Val)) {
5470 Val = AArch64_AM::encodeAdvSIMDModImmType11(Imm: Val);
5471 } else if (IsWide && AArch64_AM::isAdvSIMDModImmType12(Imm: Val)) {
5472 Val = AArch64_AM::encodeAdvSIMDModImmType12(Imm: Val);
5473 Op = AArch64::FMOVv2f64_ns;
5474 } else
5475 return nullptr;
5476
5477 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val);
5478 constrainSelectedInstRegOperands(I&: *Mov, TII, TRI, RBI);
5479 return &*Mov;
5480}
5481
5482bool AArch64InstructionSelector::selectIndexedExtLoad(
5483 MachineInstr &MI, MachineRegisterInfo &MRI) {
5484 auto &ExtLd = cast<GIndexedAnyExtLoad>(Val&: MI);
5485 Register Dst = ExtLd.getDstReg();
5486 Register WriteBack = ExtLd.getWritebackReg();
5487 Register Base = ExtLd.getBaseReg();
5488 Register Offset = ExtLd.getOffsetReg();
5489 LLT Ty = MRI.getType(Reg: Dst);
5490 assert(Ty.getSizeInBits() <= 64); // Only for scalar GPRs.
5491 unsigned MemSizeBits = ExtLd.getMMO().getMemoryType().getSizeInBits();
5492 bool IsPre = ExtLd.isPre();
5493 bool IsSExt = isa<GIndexedSExtLoad>(Val: ExtLd);
5494 unsigned InsertIntoSubReg = 0;
5495 bool IsDst64 = Ty.getSizeInBits() == 64;
5496
5497 // ZExt/SExt should be on gpr but can handle extload and zextload of fpr, so
5498 // long as they are scalar.
5499 bool IsFPR = RBI.getRegBank(Reg: Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID;
5500 if ((IsSExt && IsFPR) || Ty.isVector())
5501 return false;
5502
5503 unsigned Opc = 0;
5504 LLT NewLdDstTy;
5505 LLT s32 = LLT::scalar(SizeInBits: 32);
5506 LLT s64 = LLT::scalar(SizeInBits: 64);
5507
5508 if (MemSizeBits == 8) {
5509 if (IsSExt) {
5510 if (IsDst64)
5511 Opc = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
5512 else
5513 Opc = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
5514 NewLdDstTy = IsDst64 ? s64 : s32;
5515 } else if (IsFPR) {
5516 Opc = IsPre ? AArch64::LDRBpre : AArch64::LDRBpost;
5517 InsertIntoSubReg = AArch64::bsub;
5518 NewLdDstTy = LLT::scalar(SizeInBits: MemSizeBits);
5519 } else {
5520 Opc = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
5521 InsertIntoSubReg = IsDst64 ? AArch64::sub_32 : 0;
5522 NewLdDstTy = s32;
5523 }
5524 } else if (MemSizeBits == 16) {
5525 if (IsSExt) {
5526 if (IsDst64)
5527 Opc = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
5528 else
5529 Opc = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
5530 NewLdDstTy = IsDst64 ? s64 : s32;
5531 } else if (IsFPR) {
5532 Opc = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
5533 InsertIntoSubReg = AArch64::hsub;
5534 NewLdDstTy = LLT::scalar(SizeInBits: MemSizeBits);
5535 } else {
5536 Opc = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
5537 InsertIntoSubReg = IsDst64 ? AArch64::sub_32 : 0;
5538 NewLdDstTy = s32;
5539 }
5540 } else if (MemSizeBits == 32) {
5541 if (IsSExt) {
5542 Opc = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
5543 NewLdDstTy = s64;
5544 } else if (IsFPR) {
5545 Opc = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
5546 InsertIntoSubReg = AArch64::ssub;
5547 NewLdDstTy = LLT::scalar(SizeInBits: MemSizeBits);
5548 } else {
5549 Opc = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
5550 InsertIntoSubReg = IsDst64 ? AArch64::sub_32 : 0;
5551 NewLdDstTy = s32;
5552 }
5553 } else {
5554 llvm_unreachable("Unexpected size for indexed load");
5555 }
5556
5557 auto Cst = getIConstantVRegVal(VReg: Offset, MRI);
5558 if (!Cst)
5559 return false; // Shouldn't happen, but just in case.
5560
5561 auto LdMI = MIB.buildInstr(Opc, DstOps: {WriteBack, NewLdDstTy}, SrcOps: {Base})
5562 .addImm(Val: Cst->getSExtValue());
5563 LdMI.cloneMemRefs(OtherMI: ExtLd);
5564 constrainSelectedInstRegOperands(I&: *LdMI, TII, TRI, RBI);
5565 // Make sure to select the load with the MemTy as the dest type, and then
5566 // insert into a larger reg if needed.
5567 if (InsertIntoSubReg) {
5568 // Generate a SUBREG_TO_REG.
5569 auto SubToReg = MIB.buildInstr(Opc: TargetOpcode::SUBREG_TO_REG, DstOps: {Dst}, SrcOps: {})
5570 .addUse(RegNo: LdMI.getReg(Idx: 1))
5571 .addImm(Val: InsertIntoSubReg);
5572 RBI.constrainGenericRegister(
5573 Reg: SubToReg.getReg(Idx: 0),
5574 RC: *getRegClassForTypeOnBank(Ty: MRI.getType(Reg: Dst),
5575 RB: *RBI.getRegBank(Reg: Dst, MRI, TRI)),
5576 MRI);
5577 } else {
5578 auto Copy = MIB.buildCopy(Res: Dst, Op: LdMI.getReg(Idx: 1));
5579 selectCopy(I&: *Copy, TII, MRI, TRI, RBI);
5580 }
5581 MI.eraseFromParent();
5582
5583 return true;
5584}
5585
5586bool AArch64InstructionSelector::selectIndexedLoad(MachineInstr &MI,
5587 MachineRegisterInfo &MRI) {
5588 auto &Ld = cast<GIndexedLoad>(Val&: MI);
5589 Register Dst = Ld.getDstReg();
5590 Register WriteBack = Ld.getWritebackReg();
5591 Register Base = Ld.getBaseReg();
5592 Register Offset = Ld.getOffsetReg();
5593 assert(MRI.getType(Dst).getSizeInBits() <= 128 &&
5594 "Unexpected type for indexed load");
5595 unsigned MemSize = Ld.getMMO().getMemoryType().getSizeInBytes();
5596
5597 if (MemSize < MRI.getType(Reg: Dst).getSizeInBytes())
5598 return selectIndexedExtLoad(MI, MRI);
5599
5600 unsigned Opc = 0;
5601 if (Ld.isPre()) {
5602 static constexpr unsigned GPROpcodes[] = {
5603 AArch64::LDRBBpre, AArch64::LDRHHpre, AArch64::LDRWpre,
5604 AArch64::LDRXpre};
5605 static constexpr unsigned FPROpcodes[] = {
5606 AArch64::LDRBpre, AArch64::LDRHpre, AArch64::LDRSpre, AArch64::LDRDpre,
5607 AArch64::LDRQpre};
5608 Opc = (RBI.getRegBank(Reg: Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5609 ? FPROpcodes[Log2_32(Value: MemSize)]
5610 : GPROpcodes[Log2_32(Value: MemSize)];
5611 ;
5612 } else {
5613 static constexpr unsigned GPROpcodes[] = {
5614 AArch64::LDRBBpost, AArch64::LDRHHpost, AArch64::LDRWpost,
5615 AArch64::LDRXpost};
5616 static constexpr unsigned FPROpcodes[] = {
5617 AArch64::LDRBpost, AArch64::LDRHpost, AArch64::LDRSpost,
5618 AArch64::LDRDpost, AArch64::LDRQpost};
5619 Opc = (RBI.getRegBank(Reg: Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5620 ? FPROpcodes[Log2_32(Value: MemSize)]
5621 : GPROpcodes[Log2_32(Value: MemSize)];
5622 ;
5623 }
5624 auto Cst = getIConstantVRegVal(VReg: Offset, MRI);
5625 if (!Cst)
5626 return false; // Shouldn't happen, but just in case.
5627 auto LdMI =
5628 MIB.buildInstr(Opc, DstOps: {WriteBack, Dst}, SrcOps: {Base}).addImm(Val: Cst->getSExtValue());
5629 LdMI.cloneMemRefs(OtherMI: Ld);
5630 constrainSelectedInstRegOperands(I&: *LdMI, TII, TRI, RBI);
5631 MI.eraseFromParent();
5632 return true;
5633}
5634
5635bool AArch64InstructionSelector::selectIndexedStore(GIndexedStore &I,
5636 MachineRegisterInfo &MRI) {
5637 Register Dst = I.getWritebackReg();
5638 Register Val = I.getValueReg();
5639 Register Base = I.getBaseReg();
5640 Register Offset = I.getOffsetReg();
5641 assert(MRI.getType(Val).getSizeInBits() <= 128 &&
5642 "Unexpected type for indexed store");
5643
5644 LocationSize MemSize = I.getMMO().getSize();
5645 unsigned MemSizeInBytes = MemSize.getValue();
5646
5647 assert(MemSizeInBytes && MemSizeInBytes <= 16 &&
5648 "Unexpected indexed store size");
5649 unsigned MemSizeLog2 = Log2_32(Value: MemSizeInBytes);
5650
5651 unsigned Opc = 0;
5652 if (I.isPre()) {
5653 static constexpr unsigned GPROpcodes[] = {
5654 AArch64::STRBBpre, AArch64::STRHHpre, AArch64::STRWpre,
5655 AArch64::STRXpre};
5656 static constexpr unsigned FPROpcodes[] = {
5657 AArch64::STRBpre, AArch64::STRHpre, AArch64::STRSpre, AArch64::STRDpre,
5658 AArch64::STRQpre};
5659
5660 if (RBI.getRegBank(Reg: Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5661 Opc = FPROpcodes[MemSizeLog2];
5662 else
5663 Opc = GPROpcodes[MemSizeLog2];
5664 } else {
5665 static constexpr unsigned GPROpcodes[] = {
5666 AArch64::STRBBpost, AArch64::STRHHpost, AArch64::STRWpost,
5667 AArch64::STRXpost};
5668 static constexpr unsigned FPROpcodes[] = {
5669 AArch64::STRBpost, AArch64::STRHpost, AArch64::STRSpost,
5670 AArch64::STRDpost, AArch64::STRQpost};
5671
5672 if (RBI.getRegBank(Reg: Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5673 Opc = FPROpcodes[MemSizeLog2];
5674 else
5675 Opc = GPROpcodes[MemSizeLog2];
5676 }
5677
5678 auto Cst = getIConstantVRegVal(VReg: Offset, MRI);
5679 if (!Cst)
5680 return false; // Shouldn't happen, but just in case.
5681 auto Str =
5682 MIB.buildInstr(Opc, DstOps: {Dst}, SrcOps: {Val, Base}).addImm(Val: Cst->getSExtValue());
5683 Str.cloneMemRefs(OtherMI: I);
5684 constrainSelectedInstRegOperands(I&: *Str, TII, TRI, RBI);
5685 I.eraseFromParent();
5686 return true;
5687}
5688
5689MachineInstr *
5690AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
5691 MachineIRBuilder &MIRBuilder,
5692 MachineRegisterInfo &MRI) {
5693 LLT DstTy = MRI.getType(Reg: Dst);
5694 unsigned DstSize = DstTy.getSizeInBits();
5695 assert((DstSize == 64 || DstSize == 128) &&
5696 "Unexpected vector constant size");
5697
5698 if (CV->isNullValue()) {
5699 if (DstSize == 128) {
5700 auto Mov =
5701 MIRBuilder.buildInstr(Opc: AArch64::MOVIv2d_ns, DstOps: {Dst}, SrcOps: {}).addImm(Val: 0);
5702 constrainSelectedInstRegOperands(I&: *Mov, TII, TRI, RBI);
5703 return &*Mov;
5704 }
5705
5706 if (DstSize == 64) {
5707 auto Mov =
5708 MIRBuilder
5709 .buildInstr(Opc: AArch64::MOVIv2d_ns, DstOps: {&AArch64::FPR128RegClass}, SrcOps: {})
5710 .addImm(Val: 0);
5711 auto Copy = MIRBuilder.buildInstr(Opc: TargetOpcode::COPY, DstOps: {Dst}, SrcOps: {})
5712 .addReg(RegNo: Mov.getReg(Idx: 0), Flags: {}, SubReg: AArch64::dsub);
5713 RBI.constrainGenericRegister(Reg: Dst, RC: AArch64::FPR64RegClass, MRI);
5714 return &*Copy;
5715 }
5716 }
5717
5718 if (Constant *SplatValue = CV->getSplatValue()) {
5719 APInt SplatValueAsInt =
5720 isa<ConstantFP>(Val: SplatValue)
5721 ? cast<ConstantFP>(Val: SplatValue)->getValueAPF().bitcastToAPInt()
5722 : SplatValue->getUniqueInteger();
5723 APInt DefBits = APInt::getSplat(
5724 NewLen: DstSize, V: SplatValueAsInt.trunc(width: DstTy.getScalarSizeInBits()));
5725 auto TryMOVIWithBits = [&](APInt DefBits) -> MachineInstr * {
5726 MachineInstr *NewOp;
5727 bool Inv = false;
5728 if ((NewOp = tryAdvSIMDModImm64(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder)) ||
5729 (NewOp =
5730 tryAdvSIMDModImm32(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
5731 (NewOp =
5732 tryAdvSIMDModImm321s(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
5733 (NewOp =
5734 tryAdvSIMDModImm16(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
5735 (NewOp = tryAdvSIMDModImm8(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder)) ||
5736 (NewOp = tryAdvSIMDModImmFP(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder)))
5737 return NewOp;
5738
5739 DefBits = ~DefBits;
5740 Inv = true;
5741 if ((NewOp =
5742 tryAdvSIMDModImm32(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
5743 (NewOp =
5744 tryAdvSIMDModImm321s(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
5745 (NewOp = tryAdvSIMDModImm16(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)))
5746 return NewOp;
5747 return nullptr;
5748 };
5749
5750 if (auto *NewOp = TryMOVIWithBits(DefBits))
5751 return NewOp;
5752
5753 // See if a fneg of the constant can be materialized with a MOVI, etc
5754 auto TryWithFNeg = [&](APInt DefBits, int NumBits,
5755 unsigned NegOpc) -> MachineInstr * {
5756 // FNegate each sub-element of the constant
5757 APInt Neg = APInt::getHighBitsSet(numBits: NumBits, hiBitsSet: 1).zext(width: DstSize);
5758 APInt NegBits(DstSize, 0);
5759 unsigned NumElts = DstSize / NumBits;
5760 for (unsigned i = 0; i < NumElts; i++)
5761 NegBits |= Neg << (NumBits * i);
5762 NegBits = DefBits ^ NegBits;
5763
5764 // Try to create the new constants with MOVI, and if so generate a fneg
5765 // for it.
5766 if (auto *NewOp = TryMOVIWithBits(NegBits)) {
5767 Register NewDst = MRI.createVirtualRegister(
5768 RegClass: DstSize == 64 ? &AArch64::FPR64RegClass : &AArch64::FPR128RegClass);
5769 NewOp->getOperand(i: 0).setReg(NewDst);
5770 return MIRBuilder.buildInstr(Opc: NegOpc, DstOps: {Dst}, SrcOps: {NewDst});
5771 }
5772 return nullptr;
5773 };
5774 MachineInstr *R;
5775 if ((R = TryWithFNeg(DefBits, 32,
5776 DstSize == 64 ? AArch64::FNEGv2f32
5777 : AArch64::FNEGv4f32)) ||
5778 (R = TryWithFNeg(DefBits, 64,
5779 DstSize == 64 ? AArch64::FNEGDr
5780 : AArch64::FNEGv2f64)) ||
5781 (STI.hasFullFP16() &&
5782 (R = TryWithFNeg(DefBits, 16,
5783 DstSize == 64 ? AArch64::FNEGv4f16
5784 : AArch64::FNEGv8f16))))
5785 return R;
5786 }
5787
5788 auto *CPLoad = emitLoadFromConstantPool(CPVal: CV, MIRBuilder);
5789 if (!CPLoad) {
5790 LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!");
5791 return nullptr;
5792 }
5793
5794 auto Copy = MIRBuilder.buildCopy(Res: Dst, Op: CPLoad->getOperand(i: 0));
5795 RBI.constrainGenericRegister(
5796 Reg: Dst, RC: *MRI.getRegClass(Reg: CPLoad->getOperand(i: 0).getReg()), MRI);
5797 return &*Copy;
5798}
5799
5800bool AArch64InstructionSelector::tryOptConstantBuildVec(
5801 MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) {
5802 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5803 unsigned DstSize = DstTy.getSizeInBits();
5804 assert(DstSize <= 128 && "Unexpected build_vec type!");
5805 if (DstSize < 32)
5806 return false;
5807 // Check if we're building a constant vector, in which case we want to
5808 // generate a constant pool load instead of a vector insert sequence.
5809 SmallVector<Constant *, 16> Csts;
5810 for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
5811 Register OpReg = I.getOperand(i: Idx).getReg();
5812 if (auto AnyConst = getAnyConstantVRegValWithLookThrough(
5813 VReg: OpReg, MRI, /*LookThroughInstrs=*/true,
5814 /*LookThroughAnyExt=*/true)) {
5815 MachineInstr *DefMI = MRI.getVRegDef(Reg: AnyConst->VReg);
5816
5817 if (DefMI->getOpcode() == TargetOpcode::G_CONSTANT) {
5818 Csts.emplace_back(
5819 Args: ConstantInt::get(Context&: MIB.getMF().getFunction().getContext(),
5820 V: std::move(AnyConst->Value)));
5821 continue;
5822 }
5823
5824 if (DefMI->getOpcode() == TargetOpcode::G_FCONSTANT) {
5825 Csts.emplace_back(
5826 Args: const_cast<ConstantFP *>(DefMI->getOperand(i: 1).getFPImm()));
5827 continue;
5828 }
5829 }
5830 return false;
5831 }
5832 Constant *CV = ConstantVector::get(V: Csts);
5833 if (!emitConstantVector(Dst: I.getOperand(i: 0).getReg(), CV, MIRBuilder&: MIB, MRI))
5834 return false;
5835 I.eraseFromParent();
5836 return true;
5837}
5838
5839bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg(
5840 MachineInstr &I, MachineRegisterInfo &MRI) {
5841 // Given:
5842 // %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef
5843 //
5844 // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt.
5845 Register Dst = I.getOperand(i: 0).getReg();
5846 Register EltReg = I.getOperand(i: 1).getReg();
5847 LLT EltTy = MRI.getType(Reg: EltReg);
5848 // If the index isn't on the same bank as its elements, then this can't be a
5849 // SUBREG_TO_REG.
5850 const RegisterBank &EltRB = *RBI.getRegBank(Reg: EltReg, MRI, TRI);
5851 const RegisterBank &DstRB = *RBI.getRegBank(Reg: Dst, MRI, TRI);
5852 if (EltRB != DstRB)
5853 return false;
5854 if (any_of(Range: drop_begin(RangeOrContainer: I.operands(), N: 2), P: [&MRI](const MachineOperand &Op) {
5855 return !getOpcodeDef(Opcode: TargetOpcode::G_IMPLICIT_DEF, Reg: Op.getReg(), MRI);
5856 }))
5857 return false;
5858 unsigned SubReg;
5859 const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(Ty: EltTy, RB: EltRB);
5860 if (!EltRC)
5861 return false;
5862 const TargetRegisterClass *DstRC =
5863 getRegClassForTypeOnBank(Ty: MRI.getType(Reg: Dst), RB: DstRB);
5864 if (!DstRC)
5865 return false;
5866 if (!getSubRegForClass(RC: EltRC, TRI, SubReg))
5867 return false;
5868 auto SubregToReg = MIB.buildInstr(Opc: AArch64::SUBREG_TO_REG, DstOps: {Dst}, SrcOps: {})
5869 .addUse(RegNo: EltReg)
5870 .addImm(Val: SubReg);
5871 I.eraseFromParent();
5872 constrainSelectedInstRegOperands(I&: *SubregToReg, TII, TRI, RBI);
5873 return RBI.constrainGenericRegister(Reg: Dst, RC: *DstRC, MRI);
5874}
5875
5876bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
5877 MachineRegisterInfo &MRI) {
5878 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5879 // Until we port more of the optimized selections, for now just use a vector
5880 // insert sequence.
5881 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
5882 const LLT EltTy = MRI.getType(Reg: I.getOperand(i: 1).getReg());
5883 unsigned EltSize = EltTy.getSizeInBits();
5884
5885 if (tryOptConstantBuildVec(I, DstTy, MRI))
5886 return true;
5887 if (tryOptBuildVecToSubregToReg(I, MRI))
5888 return true;
5889
5890 if (EltSize != 8 && EltSize != 16 && EltSize != 32 && EltSize != 64)
5891 return false; // Don't support all element types yet.
5892 const RegisterBank &RB = *RBI.getRegBank(Reg: I.getOperand(i: 1).getReg(), MRI, TRI);
5893
5894 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5895 MachineInstr *ScalarToVec =
5896 emitScalarToVector(EltSize: DstTy.getElementType().getSizeInBits(), DstRC,
5897 Scalar: I.getOperand(i: 1).getReg(), MIRBuilder&: MIB);
5898 if (!ScalarToVec)
5899 return false;
5900
5901 Register DstVec = ScalarToVec->getOperand(i: 0).getReg();
5902 unsigned DstSize = DstTy.getSizeInBits();
5903
5904 // Keep track of the last MI we inserted. Later on, we might be able to save
5905 // a copy using it.
5906 MachineInstr *PrevMI = ScalarToVec;
5907 for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
5908 // Note that if we don't do a subregister copy, we can end up making an
5909 // extra register.
5910 Register OpReg = I.getOperand(i).getReg();
5911 // Do not emit inserts for undefs
5912 if (!getOpcodeDef<GImplicitDef>(Reg: OpReg, MRI)) {
5913 PrevMI = &*emitLaneInsert(DstReg: std::nullopt, SrcReg: DstVec, EltReg: OpReg, LaneIdx: i - 1, RB, MIRBuilder&: MIB);
5914 DstVec = PrevMI->getOperand(i: 0).getReg();
5915 }
5916 }
5917
5918 // If DstTy's size in bits is less than 128, then emit a subregister copy
5919 // from DstVec to the last register we've defined.
5920 if (DstSize < 128) {
5921 // Force this to be FPR using the destination vector.
5922 const TargetRegisterClass *RC =
5923 getRegClassForTypeOnBank(Ty: DstTy, RB: *RBI.getRegBank(Reg: DstVec, MRI, TRI));
5924 if (!RC)
5925 return false;
5926 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
5927 LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
5928 return false;
5929 }
5930
5931 unsigned SubReg = 0;
5932 if (!getSubRegForClass(RC, TRI, SubReg))
5933 return false;
5934 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
5935 LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
5936 << "\n");
5937 return false;
5938 }
5939
5940 Register Reg = MRI.createVirtualRegister(RegClass: RC);
5941 Register DstReg = I.getOperand(i: 0).getReg();
5942
5943 MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {DstReg}, SrcOps: {}).addReg(RegNo: DstVec, Flags: {}, SubReg);
5944 MachineOperand &RegOp = I.getOperand(i: 1);
5945 RegOp.setReg(Reg);
5946 RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI);
5947 } else {
5948 // We either have a vector with all elements (except the first one) undef or
5949 // at least one non-undef non-first element. In the first case, we need to
5950 // constrain the output register ourselves as we may have generated an
5951 // INSERT_SUBREG operation which is a generic operation for which the
5952 // output regclass cannot be automatically chosen.
5953 //
5954 // In the second case, there is no need to do this as it may generate an
5955 // instruction like INSvi32gpr where the regclass can be automatically
5956 // chosen.
5957 //
5958 // Also, we save a copy by re-using the destination register on the final
5959 // insert.
5960 PrevMI->getOperand(i: 0).setReg(I.getOperand(i: 0).getReg());
5961 constrainSelectedInstRegOperands(I&: *PrevMI, TII, TRI, RBI);
5962
5963 Register DstReg = PrevMI->getOperand(i: 0).getReg();
5964 if (PrevMI == ScalarToVec && DstReg.isVirtual()) {
5965 const TargetRegisterClass *RC =
5966 getRegClassForTypeOnBank(Ty: DstTy, RB: *RBI.getRegBank(Reg: DstVec, MRI, TRI));
5967 RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI);
5968 }
5969 }
5970
5971 I.eraseFromParent();
5972 return true;
5973}
5974
5975bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
5976 unsigned NumVecs,
5977 MachineInstr &I) {
5978 assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
5979 assert(Opc && "Expected an opcode?");
5980 assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
5981 auto &MRI = *MIB.getMRI();
5982 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
5983 unsigned Size = Ty.getSizeInBits();
5984 assert((Size == 64 || Size == 128) &&
5985 "Destination must be 64 bits or 128 bits?");
5986 unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0;
5987 auto Ptr = I.getOperand(i: I.getNumOperands() - 1).getReg();
5988 assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?");
5989 auto Load = MIB.buildInstr(Opc, DstOps: {Ty}, SrcOps: {Ptr});
5990 Load.cloneMemRefs(OtherMI: I);
5991 constrainSelectedInstRegOperands(I&: *Load, TII, TRI, RBI);
5992 Register SelectedLoadDst = Load->getOperand(i: 0).getReg();
5993 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
5994 auto Vec = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {I.getOperand(i: Idx)}, SrcOps: {})
5995 .addReg(RegNo: SelectedLoadDst, Flags: {}, SubReg: SubReg + Idx);
5996 // Emit the subreg copies and immediately select them.
5997 // FIXME: We should refactor our copy code into an emitCopy helper and
5998 // clean up uses of this pattern elsewhere in the selector.
5999 selectCopy(I&: *Vec, TII, MRI, TRI, RBI);
6000 }
6001 return true;
6002}
6003
6004bool AArch64InstructionSelector::selectVectorLoadLaneIntrinsic(
6005 unsigned Opc, unsigned NumVecs, MachineInstr &I) {
6006 assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
6007 assert(Opc && "Expected an opcode?");
6008 assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
6009 auto &MRI = *MIB.getMRI();
6010 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6011 bool Narrow = Ty.getSizeInBits() == 64;
6012
6013 auto FirstSrcRegIt = I.operands_begin() + NumVecs + 1;
6014 SmallVector<Register, 4> Regs(NumVecs);
6015 std::transform(first: FirstSrcRegIt, last: FirstSrcRegIt + NumVecs, result: Regs.begin(),
6016 unary_op: [](auto MO) { return MO.getReg(); });
6017
6018 if (Narrow) {
6019 transform(Range&: Regs, d_first: Regs.begin(), F: [this](Register Reg) {
6020 return emitScalarToVector(EltSize: 64, DstRC: &AArch64::FPR128RegClass, Scalar: Reg, MIRBuilder&: MIB)
6021 ->getOperand(i: 0)
6022 .getReg();
6023 });
6024 Ty = Ty.multiplyElements(Factor: 2);
6025 }
6026
6027 Register Tuple = createQTuple(Regs, MIB);
6028 auto LaneNo = getIConstantVRegVal(VReg: (FirstSrcRegIt + NumVecs)->getReg(), MRI);
6029 if (!LaneNo)
6030 return false;
6031
6032 Register Ptr = (FirstSrcRegIt + NumVecs + 1)->getReg();
6033 auto Load = MIB.buildInstr(Opc, DstOps: {Ty}, SrcOps: {})
6034 .addReg(RegNo: Tuple)
6035 .addImm(Val: LaneNo->getZExtValue())
6036 .addReg(RegNo: Ptr);
6037 Load.cloneMemRefs(OtherMI: I);
6038 constrainSelectedInstRegOperands(I&: *Load, TII, TRI, RBI);
6039 Register SelectedLoadDst = Load->getOperand(i: 0).getReg();
6040 unsigned SubReg = AArch64::qsub0;
6041 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
6042 auto Vec = MIB.buildInstr(Opc: TargetOpcode::COPY,
6043 DstOps: {Narrow ? DstOp(&AArch64::FPR128RegClass)
6044 : DstOp(I.getOperand(i: Idx).getReg())},
6045 SrcOps: {})
6046 .addReg(RegNo: SelectedLoadDst, Flags: {}, SubReg: SubReg + Idx);
6047 Register WideReg = Vec.getReg(Idx: 0);
6048 // Emit the subreg copies and immediately select them.
6049 selectCopy(I&: *Vec, TII, MRI, TRI, RBI);
6050 if (Narrow &&
6051 !emitNarrowVector(DstReg: I.getOperand(i: Idx).getReg(), SrcReg: WideReg, MIB, MRI))
6052 return false;
6053 }
6054 return true;
6055}
6056
6057void AArch64InstructionSelector::selectVectorStoreIntrinsic(MachineInstr &I,
6058 unsigned NumVecs,
6059 unsigned Opc) {
6060 MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
6061 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6062 Register Ptr = I.getOperand(i: 1 + NumVecs).getReg();
6063
6064 SmallVector<Register, 2> Regs(NumVecs);
6065 std::transform(first: I.operands_begin() + 1, last: I.operands_begin() + 1 + NumVecs,
6066 result: Regs.begin(), unary_op: [](auto MO) { return MO.getReg(); });
6067
6068 Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB)
6069 : createDTuple(Regs, MIB);
6070 auto Store = MIB.buildInstr(Opc, DstOps: {}, SrcOps: {Tuple, Ptr});
6071 Store.cloneMemRefs(OtherMI: I);
6072 constrainSelectedInstRegOperands(I&: *Store, TII, TRI, RBI);
6073}
6074
6075bool AArch64InstructionSelector::selectVectorStoreLaneIntrinsic(
6076 MachineInstr &I, unsigned NumVecs, unsigned Opc) {
6077 MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
6078 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6079 bool Narrow = Ty.getSizeInBits() == 64;
6080
6081 SmallVector<Register, 2> Regs(NumVecs);
6082 std::transform(first: I.operands_begin() + 1, last: I.operands_begin() + 1 + NumVecs,
6083 result: Regs.begin(), unary_op: [](auto MO) { return MO.getReg(); });
6084
6085 if (Narrow)
6086 transform(Range&: Regs, d_first: Regs.begin(), F: [this](Register Reg) {
6087 return emitScalarToVector(EltSize: 64, DstRC: &AArch64::FPR128RegClass, Scalar: Reg, MIRBuilder&: MIB)
6088 ->getOperand(i: 0)
6089 .getReg();
6090 });
6091
6092 Register Tuple = createQTuple(Regs, MIB);
6093
6094 auto LaneNo = getIConstantVRegVal(VReg: I.getOperand(i: 1 + NumVecs).getReg(), MRI);
6095 if (!LaneNo)
6096 return false;
6097 Register Ptr = I.getOperand(i: 1 + NumVecs + 1).getReg();
6098 auto Store = MIB.buildInstr(Opc, DstOps: {}, SrcOps: {})
6099 .addReg(RegNo: Tuple)
6100 .addImm(Val: LaneNo->getZExtValue())
6101 .addReg(RegNo: Ptr);
6102 Store.cloneMemRefs(OtherMI: I);
6103 constrainSelectedInstRegOperands(I&: *Store, TII, TRI, RBI);
6104 return true;
6105}
6106
6107bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
6108 MachineInstr &I, MachineRegisterInfo &MRI) {
6109 // Find the intrinsic ID.
6110 unsigned IntrinID = cast<GIntrinsic>(Val&: I).getIntrinsicID();
6111
6112 const LLT S8 = LLT::scalar(SizeInBits: 8);
6113 const LLT S16 = LLT::scalar(SizeInBits: 16);
6114 const LLT S32 = LLT::scalar(SizeInBits: 32);
6115 const LLT S64 = LLT::scalar(SizeInBits: 64);
6116 const LLT P0 = LLT::pointer(AddressSpace: 0, SizeInBits: 64);
6117 // Select the instruction.
6118 switch (IntrinID) {
6119 default:
6120 return false;
6121 case Intrinsic::aarch64_ldxp:
6122 case Intrinsic::aarch64_ldaxp: {
6123 auto NewI = MIB.buildInstr(
6124 Opc: IntrinID == Intrinsic::aarch64_ldxp ? AArch64::LDXPX : AArch64::LDAXPX,
6125 DstOps: {I.getOperand(i: 0).getReg(), I.getOperand(i: 1).getReg()},
6126 SrcOps: {I.getOperand(i: 3)});
6127 NewI.cloneMemRefs(OtherMI: I);
6128 constrainSelectedInstRegOperands(I&: *NewI, TII, TRI, RBI);
6129 break;
6130 }
6131 case Intrinsic::aarch64_neon_ld1x2: {
6132 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6133 unsigned Opc = 0;
6134 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6135 Opc = AArch64::LD1Twov8b;
6136 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6137 Opc = AArch64::LD1Twov16b;
6138 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6139 Opc = AArch64::LD1Twov4h;
6140 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6141 Opc = AArch64::LD1Twov8h;
6142 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6143 Opc = AArch64::LD1Twov2s;
6144 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6145 Opc = AArch64::LD1Twov4s;
6146 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6147 Opc = AArch64::LD1Twov2d;
6148 else if (Ty == S64 || Ty == P0)
6149 Opc = AArch64::LD1Twov1d;
6150 else
6151 llvm_unreachable("Unexpected type for ld1x2!");
6152 selectVectorLoadIntrinsic(Opc, NumVecs: 2, I);
6153 break;
6154 }
6155 case Intrinsic::aarch64_neon_ld1x3: {
6156 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6157 unsigned Opc = 0;
6158 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6159 Opc = AArch64::LD1Threev8b;
6160 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6161 Opc = AArch64::LD1Threev16b;
6162 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6163 Opc = AArch64::LD1Threev4h;
6164 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6165 Opc = AArch64::LD1Threev8h;
6166 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6167 Opc = AArch64::LD1Threev2s;
6168 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6169 Opc = AArch64::LD1Threev4s;
6170 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6171 Opc = AArch64::LD1Threev2d;
6172 else if (Ty == S64 || Ty == P0)
6173 Opc = AArch64::LD1Threev1d;
6174 else
6175 llvm_unreachable("Unexpected type for ld1x3!");
6176 selectVectorLoadIntrinsic(Opc, NumVecs: 3, I);
6177 break;
6178 }
6179 case Intrinsic::aarch64_neon_ld1x4: {
6180 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6181 unsigned Opc = 0;
6182 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6183 Opc = AArch64::LD1Fourv8b;
6184 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6185 Opc = AArch64::LD1Fourv16b;
6186 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6187 Opc = AArch64::LD1Fourv4h;
6188 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6189 Opc = AArch64::LD1Fourv8h;
6190 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6191 Opc = AArch64::LD1Fourv2s;
6192 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6193 Opc = AArch64::LD1Fourv4s;
6194 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6195 Opc = AArch64::LD1Fourv2d;
6196 else if (Ty == S64 || Ty == P0)
6197 Opc = AArch64::LD1Fourv1d;
6198 else
6199 llvm_unreachable("Unexpected type for ld1x4!");
6200 selectVectorLoadIntrinsic(Opc, NumVecs: 4, I);
6201 break;
6202 }
6203 case Intrinsic::aarch64_neon_ld2: {
6204 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6205 unsigned Opc = 0;
6206 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6207 Opc = AArch64::LD2Twov8b;
6208 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6209 Opc = AArch64::LD2Twov16b;
6210 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6211 Opc = AArch64::LD2Twov4h;
6212 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6213 Opc = AArch64::LD2Twov8h;
6214 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6215 Opc = AArch64::LD2Twov2s;
6216 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6217 Opc = AArch64::LD2Twov4s;
6218 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6219 Opc = AArch64::LD2Twov2d;
6220 else if (Ty == S64 || Ty == P0)
6221 Opc = AArch64::LD1Twov1d;
6222 else
6223 llvm_unreachable("Unexpected type for ld2!");
6224 selectVectorLoadIntrinsic(Opc, NumVecs: 2, I);
6225 break;
6226 }
6227 case Intrinsic::aarch64_neon_ld2lane: {
6228 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6229 unsigned Opc;
6230 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6231 Opc = AArch64::LD2i8;
6232 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6233 Opc = AArch64::LD2i16;
6234 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6235 Opc = AArch64::LD2i32;
6236 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6237 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6238 Opc = AArch64::LD2i64;
6239 else
6240 llvm_unreachable("Unexpected type for st2lane!");
6241 if (!selectVectorLoadLaneIntrinsic(Opc, NumVecs: 2, I))
6242 return false;
6243 break;
6244 }
6245 case Intrinsic::aarch64_neon_ld2r: {
6246 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6247 unsigned Opc = 0;
6248 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6249 Opc = AArch64::LD2Rv8b;
6250 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6251 Opc = AArch64::LD2Rv16b;
6252 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6253 Opc = AArch64::LD2Rv4h;
6254 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6255 Opc = AArch64::LD2Rv8h;
6256 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6257 Opc = AArch64::LD2Rv2s;
6258 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6259 Opc = AArch64::LD2Rv4s;
6260 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6261 Opc = AArch64::LD2Rv2d;
6262 else if (Ty == S64 || Ty == P0)
6263 Opc = AArch64::LD2Rv1d;
6264 else
6265 llvm_unreachable("Unexpected type for ld2r!");
6266 selectVectorLoadIntrinsic(Opc, NumVecs: 2, I);
6267 break;
6268 }
6269 case Intrinsic::aarch64_neon_ld3: {
6270 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6271 unsigned Opc = 0;
6272 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6273 Opc = AArch64::LD3Threev8b;
6274 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6275 Opc = AArch64::LD3Threev16b;
6276 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6277 Opc = AArch64::LD3Threev4h;
6278 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6279 Opc = AArch64::LD3Threev8h;
6280 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6281 Opc = AArch64::LD3Threev2s;
6282 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6283 Opc = AArch64::LD3Threev4s;
6284 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6285 Opc = AArch64::LD3Threev2d;
6286 else if (Ty == S64 || Ty == P0)
6287 Opc = AArch64::LD1Threev1d;
6288 else
6289 llvm_unreachable("Unexpected type for ld3!");
6290 selectVectorLoadIntrinsic(Opc, NumVecs: 3, I);
6291 break;
6292 }
6293 case Intrinsic::aarch64_neon_ld3lane: {
6294 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6295 unsigned Opc;
6296 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6297 Opc = AArch64::LD3i8;
6298 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6299 Opc = AArch64::LD3i16;
6300 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6301 Opc = AArch64::LD3i32;
6302 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6303 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6304 Opc = AArch64::LD3i64;
6305 else
6306 llvm_unreachable("Unexpected type for st3lane!");
6307 if (!selectVectorLoadLaneIntrinsic(Opc, NumVecs: 3, I))
6308 return false;
6309 break;
6310 }
6311 case Intrinsic::aarch64_neon_ld3r: {
6312 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6313 unsigned Opc = 0;
6314 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6315 Opc = AArch64::LD3Rv8b;
6316 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6317 Opc = AArch64::LD3Rv16b;
6318 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6319 Opc = AArch64::LD3Rv4h;
6320 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6321 Opc = AArch64::LD3Rv8h;
6322 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6323 Opc = AArch64::LD3Rv2s;
6324 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6325 Opc = AArch64::LD3Rv4s;
6326 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6327 Opc = AArch64::LD3Rv2d;
6328 else if (Ty == S64 || Ty == P0)
6329 Opc = AArch64::LD3Rv1d;
6330 else
6331 llvm_unreachable("Unexpected type for ld3r!");
6332 selectVectorLoadIntrinsic(Opc, NumVecs: 3, I);
6333 break;
6334 }
6335 case Intrinsic::aarch64_neon_ld4: {
6336 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6337 unsigned Opc = 0;
6338 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6339 Opc = AArch64::LD4Fourv8b;
6340 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6341 Opc = AArch64::LD4Fourv16b;
6342 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6343 Opc = AArch64::LD4Fourv4h;
6344 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6345 Opc = AArch64::LD4Fourv8h;
6346 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6347 Opc = AArch64::LD4Fourv2s;
6348 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6349 Opc = AArch64::LD4Fourv4s;
6350 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6351 Opc = AArch64::LD4Fourv2d;
6352 else if (Ty == S64 || Ty == P0)
6353 Opc = AArch64::LD1Fourv1d;
6354 else
6355 llvm_unreachable("Unexpected type for ld4!");
6356 selectVectorLoadIntrinsic(Opc, NumVecs: 4, I);
6357 break;
6358 }
6359 case Intrinsic::aarch64_neon_ld4lane: {
6360 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6361 unsigned Opc;
6362 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6363 Opc = AArch64::LD4i8;
6364 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6365 Opc = AArch64::LD4i16;
6366 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6367 Opc = AArch64::LD4i32;
6368 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6369 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6370 Opc = AArch64::LD4i64;
6371 else
6372 llvm_unreachable("Unexpected type for st4lane!");
6373 if (!selectVectorLoadLaneIntrinsic(Opc, NumVecs: 4, I))
6374 return false;
6375 break;
6376 }
6377 case Intrinsic::aarch64_neon_ld4r: {
6378 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6379 unsigned Opc = 0;
6380 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6381 Opc = AArch64::LD4Rv8b;
6382 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6383 Opc = AArch64::LD4Rv16b;
6384 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6385 Opc = AArch64::LD4Rv4h;
6386 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6387 Opc = AArch64::LD4Rv8h;
6388 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6389 Opc = AArch64::LD4Rv2s;
6390 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6391 Opc = AArch64::LD4Rv4s;
6392 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6393 Opc = AArch64::LD4Rv2d;
6394 else if (Ty == S64 || Ty == P0)
6395 Opc = AArch64::LD4Rv1d;
6396 else
6397 llvm_unreachable("Unexpected type for ld4r!");
6398 selectVectorLoadIntrinsic(Opc, NumVecs: 4, I);
6399 break;
6400 }
6401 case Intrinsic::aarch64_neon_st1x2: {
6402 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6403 unsigned Opc;
6404 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6405 Opc = AArch64::ST1Twov8b;
6406 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6407 Opc = AArch64::ST1Twov16b;
6408 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6409 Opc = AArch64::ST1Twov4h;
6410 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6411 Opc = AArch64::ST1Twov8h;
6412 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6413 Opc = AArch64::ST1Twov2s;
6414 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6415 Opc = AArch64::ST1Twov4s;
6416 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6417 Opc = AArch64::ST1Twov2d;
6418 else if (Ty == S64 || Ty == P0)
6419 Opc = AArch64::ST1Twov1d;
6420 else
6421 llvm_unreachable("Unexpected type for st1x2!");
6422 selectVectorStoreIntrinsic(I, NumVecs: 2, Opc);
6423 break;
6424 }
6425 case Intrinsic::aarch64_neon_st1x3: {
6426 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6427 unsigned Opc;
6428 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6429 Opc = AArch64::ST1Threev8b;
6430 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6431 Opc = AArch64::ST1Threev16b;
6432 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6433 Opc = AArch64::ST1Threev4h;
6434 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6435 Opc = AArch64::ST1Threev8h;
6436 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6437 Opc = AArch64::ST1Threev2s;
6438 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6439 Opc = AArch64::ST1Threev4s;
6440 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6441 Opc = AArch64::ST1Threev2d;
6442 else if (Ty == S64 || Ty == P0)
6443 Opc = AArch64::ST1Threev1d;
6444 else
6445 llvm_unreachable("Unexpected type for st1x3!");
6446 selectVectorStoreIntrinsic(I, NumVecs: 3, Opc);
6447 break;
6448 }
6449 case Intrinsic::aarch64_neon_st1x4: {
6450 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6451 unsigned Opc;
6452 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6453 Opc = AArch64::ST1Fourv8b;
6454 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6455 Opc = AArch64::ST1Fourv16b;
6456 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6457 Opc = AArch64::ST1Fourv4h;
6458 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6459 Opc = AArch64::ST1Fourv8h;
6460 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6461 Opc = AArch64::ST1Fourv2s;
6462 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6463 Opc = AArch64::ST1Fourv4s;
6464 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6465 Opc = AArch64::ST1Fourv2d;
6466 else if (Ty == S64 || Ty == P0)
6467 Opc = AArch64::ST1Fourv1d;
6468 else
6469 llvm_unreachable("Unexpected type for st1x4!");
6470 selectVectorStoreIntrinsic(I, NumVecs: 4, Opc);
6471 break;
6472 }
6473 case Intrinsic::aarch64_neon_st2: {
6474 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6475 unsigned Opc;
6476 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6477 Opc = AArch64::ST2Twov8b;
6478 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6479 Opc = AArch64::ST2Twov16b;
6480 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6481 Opc = AArch64::ST2Twov4h;
6482 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6483 Opc = AArch64::ST2Twov8h;
6484 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6485 Opc = AArch64::ST2Twov2s;
6486 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6487 Opc = AArch64::ST2Twov4s;
6488 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6489 Opc = AArch64::ST2Twov2d;
6490 else if (Ty == S64 || Ty == P0)
6491 Opc = AArch64::ST1Twov1d;
6492 else
6493 llvm_unreachable("Unexpected type for st2!");
6494 selectVectorStoreIntrinsic(I, NumVecs: 2, Opc);
6495 break;
6496 }
6497 case Intrinsic::aarch64_neon_st3: {
6498 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6499 unsigned Opc;
6500 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6501 Opc = AArch64::ST3Threev8b;
6502 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6503 Opc = AArch64::ST3Threev16b;
6504 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6505 Opc = AArch64::ST3Threev4h;
6506 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6507 Opc = AArch64::ST3Threev8h;
6508 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6509 Opc = AArch64::ST3Threev2s;
6510 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6511 Opc = AArch64::ST3Threev4s;
6512 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6513 Opc = AArch64::ST3Threev2d;
6514 else if (Ty == S64 || Ty == P0)
6515 Opc = AArch64::ST1Threev1d;
6516 else
6517 llvm_unreachable("Unexpected type for st3!");
6518 selectVectorStoreIntrinsic(I, NumVecs: 3, Opc);
6519 break;
6520 }
6521 case Intrinsic::aarch64_neon_st4: {
6522 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6523 unsigned Opc;
6524 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6525 Opc = AArch64::ST4Fourv8b;
6526 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6527 Opc = AArch64::ST4Fourv16b;
6528 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6529 Opc = AArch64::ST4Fourv4h;
6530 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6531 Opc = AArch64::ST4Fourv8h;
6532 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6533 Opc = AArch64::ST4Fourv2s;
6534 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6535 Opc = AArch64::ST4Fourv4s;
6536 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6537 Opc = AArch64::ST4Fourv2d;
6538 else if (Ty == S64 || Ty == P0)
6539 Opc = AArch64::ST1Fourv1d;
6540 else
6541 llvm_unreachable("Unexpected type for st4!");
6542 selectVectorStoreIntrinsic(I, NumVecs: 4, Opc);
6543 break;
6544 }
6545 case Intrinsic::aarch64_neon_st2lane: {
6546 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6547 unsigned Opc;
6548 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6549 Opc = AArch64::ST2i8;
6550 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6551 Opc = AArch64::ST2i16;
6552 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6553 Opc = AArch64::ST2i32;
6554 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6555 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6556 Opc = AArch64::ST2i64;
6557 else
6558 llvm_unreachable("Unexpected type for st2lane!");
6559 if (!selectVectorStoreLaneIntrinsic(I, NumVecs: 2, Opc))
6560 return false;
6561 break;
6562 }
6563 case Intrinsic::aarch64_neon_st3lane: {
6564 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6565 unsigned Opc;
6566 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6567 Opc = AArch64::ST3i8;
6568 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6569 Opc = AArch64::ST3i16;
6570 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6571 Opc = AArch64::ST3i32;
6572 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6573 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6574 Opc = AArch64::ST3i64;
6575 else
6576 llvm_unreachable("Unexpected type for st3lane!");
6577 if (!selectVectorStoreLaneIntrinsic(I, NumVecs: 3, Opc))
6578 return false;
6579 break;
6580 }
6581 case Intrinsic::aarch64_neon_st4lane: {
6582 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6583 unsigned Opc;
6584 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6585 Opc = AArch64::ST4i8;
6586 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6587 Opc = AArch64::ST4i16;
6588 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6589 Opc = AArch64::ST4i32;
6590 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6591 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6592 Opc = AArch64::ST4i64;
6593 else
6594 llvm_unreachable("Unexpected type for st4lane!");
6595 if (!selectVectorStoreLaneIntrinsic(I, NumVecs: 4, Opc))
6596 return false;
6597 break;
6598 }
6599 case Intrinsic::aarch64_mops_memset_tag: {
6600 // Transform
6601 // %dst:gpr(p0) = \
6602 // G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag),
6603 // \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64)
6604 // where %dst is updated, into
6605 // %Rd:GPR64common, %Rn:GPR64) = \
6606 // MOPSMemorySetTaggingPseudo \
6607 // %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64
6608 // where Rd and Rn are tied.
6609 // It is expected that %val has been extended to s64 in legalization.
6610 // Note that the order of the size/value operands are swapped.
6611
6612 Register DstDef = I.getOperand(i: 0).getReg();
6613 // I.getOperand(1) is the intrinsic function
6614 Register DstUse = I.getOperand(i: 2).getReg();
6615 Register ValUse = I.getOperand(i: 3).getReg();
6616 Register SizeUse = I.getOperand(i: 4).getReg();
6617
6618 // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one.
6619 // Therefore an additional virtual register is required for the updated size
6620 // operand. This value is not accessible via the semantics of the intrinsic.
6621 Register SizeDef = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
6622
6623 auto Memset = MIB.buildInstr(Opc: AArch64::MOPSMemorySetTaggingPseudo,
6624 DstOps: {DstDef, SizeDef}, SrcOps: {DstUse, SizeUse, ValUse});
6625 Memset.cloneMemRefs(OtherMI: I);
6626 constrainSelectedInstRegOperands(I&: *Memset, TII, TRI, RBI);
6627 break;
6628 }
6629 case Intrinsic::ptrauth_resign_load_relative: {
6630 Register DstReg = I.getOperand(i: 0).getReg();
6631 Register ValReg = I.getOperand(i: 2).getReg();
6632 uint64_t AUTKey = I.getOperand(i: 3).getImm();
6633 Register AUTDisc = I.getOperand(i: 4).getReg();
6634 uint64_t PACKey = I.getOperand(i: 5).getImm();
6635 Register PACDisc = I.getOperand(i: 6).getReg();
6636 int64_t Addend = I.getOperand(i: 7).getImm();
6637
6638 Register AUTAddrDisc = AUTDisc;
6639 uint16_t AUTConstDiscC = 0;
6640 std::tie(args&: AUTConstDiscC, args&: AUTAddrDisc) =
6641 extractPtrauthBlendDiscriminators(Disc: AUTDisc, MRI);
6642
6643 Register PACAddrDisc = PACDisc;
6644 uint16_t PACConstDiscC = 0;
6645 std::tie(args&: PACConstDiscC, args&: PACAddrDisc) =
6646 extractPtrauthBlendDiscriminators(Disc: PACDisc, MRI);
6647
6648 MIB.buildCopy(Res: {AArch64::X16}, Op: {ValReg});
6649
6650 MIB.buildInstr(Opcode: AArch64::AUTRELLOADPAC)
6651 .addImm(Val: AUTKey)
6652 .addImm(Val: AUTConstDiscC)
6653 .addUse(RegNo: AUTAddrDisc)
6654 .addImm(Val: PACKey)
6655 .addImm(Val: PACConstDiscC)
6656 .addUse(RegNo: PACAddrDisc)
6657 .addImm(Val: Addend)
6658 .constrainAllUses(TII, TRI, RBI);
6659 MIB.buildCopy(Res: {DstReg}, Op: Register(AArch64::X16));
6660
6661 RBI.constrainGenericRegister(Reg: DstReg, RC: AArch64::GPR64RegClass, MRI);
6662 I.eraseFromParent();
6663 return true;
6664 }
6665 }
6666
6667 I.eraseFromParent();
6668 return true;
6669}
6670
6671bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
6672 MachineRegisterInfo &MRI) {
6673 unsigned IntrinID = cast<GIntrinsic>(Val&: I).getIntrinsicID();
6674
6675 switch (IntrinID) {
6676 default:
6677 break;
6678 case Intrinsic::ptrauth_resign: {
6679 Register DstReg = I.getOperand(i: 0).getReg();
6680 Register ValReg = I.getOperand(i: 2).getReg();
6681 uint64_t AUTKey = I.getOperand(i: 3).getImm();
6682 Register AUTDisc = I.getOperand(i: 4).getReg();
6683 uint64_t PACKey = I.getOperand(i: 5).getImm();
6684 Register PACDisc = I.getOperand(i: 6).getReg();
6685
6686 Register AUTAddrDisc = AUTDisc;
6687 uint16_t AUTConstDiscC = 0;
6688 std::tie(args&: AUTConstDiscC, args&: AUTAddrDisc) =
6689 extractPtrauthBlendDiscriminators(Disc: AUTDisc, MRI);
6690
6691 Register PACAddrDisc = PACDisc;
6692 uint16_t PACConstDiscC = 0;
6693 std::tie(args&: PACConstDiscC, args&: PACAddrDisc) =
6694 extractPtrauthBlendDiscriminators(Disc: PACDisc, MRI);
6695
6696 MIB.buildCopy(Res: {AArch64::X16}, Op: {ValReg});
6697 MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {AArch64::X17}, SrcOps: {});
6698 MIB.buildInstr(Opcode: AArch64::AUTPAC)
6699 .addImm(Val: AUTKey)
6700 .addImm(Val: AUTConstDiscC)
6701 .addUse(RegNo: AUTAddrDisc)
6702 .addImm(Val: PACKey)
6703 .addImm(Val: PACConstDiscC)
6704 .addUse(RegNo: PACAddrDisc)
6705 .constrainAllUses(TII, TRI, RBI);
6706 MIB.buildCopy(Res: {DstReg}, Op: Register(AArch64::X16));
6707
6708 RBI.constrainGenericRegister(Reg: DstReg, RC: AArch64::GPR64RegClass, MRI);
6709 I.eraseFromParent();
6710 return true;
6711 }
6712 case Intrinsic::ptrauth_auth: {
6713 Register DstReg = I.getOperand(i: 0).getReg();
6714 Register ValReg = I.getOperand(i: 2).getReg();
6715 uint64_t AUTKey = I.getOperand(i: 3).getImm();
6716 Register AUTDisc = I.getOperand(i: 4).getReg();
6717
6718 Register AUTAddrDisc = AUTDisc;
6719 uint16_t AUTConstDiscC = 0;
6720 std::tie(args&: AUTConstDiscC, args&: AUTAddrDisc) =
6721 extractPtrauthBlendDiscriminators(Disc: AUTDisc, MRI);
6722
6723 if (STI.isX16X17Safer()) {
6724 MIB.buildCopy(Res: {AArch64::X16}, Op: {ValReg});
6725 MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {AArch64::X17}, SrcOps: {});
6726 MIB.buildInstr(Opcode: AArch64::AUTx16x17)
6727 .addImm(Val: AUTKey)
6728 .addImm(Val: AUTConstDiscC)
6729 .addUse(RegNo: AUTAddrDisc)
6730 .constrainAllUses(TII, TRI, RBI);
6731 MIB.buildCopy(Res: {DstReg}, Op: Register(AArch64::X16));
6732 } else {
6733 Register ScratchReg =
6734 MRI.createVirtualRegister(RegClass: &AArch64::GPR64commonRegClass);
6735 MIB.buildInstr(Opcode: AArch64::AUTxMxN)
6736 .addDef(RegNo: DstReg)
6737 .addDef(RegNo: ScratchReg)
6738 .addUse(RegNo: ValReg)
6739 .addImm(Val: AUTKey)
6740 .addImm(Val: AUTConstDiscC)
6741 .addUse(RegNo: AUTAddrDisc)
6742 .constrainAllUses(TII, TRI, RBI);
6743 }
6744
6745 RBI.constrainGenericRegister(Reg: DstReg, RC: AArch64::GPR64RegClass, MRI);
6746 I.eraseFromParent();
6747 return true;
6748 }
6749 case Intrinsic::frameaddress:
6750 case Intrinsic::returnaddress: {
6751 MachineFunction &MF = *I.getParent()->getParent();
6752 MachineFrameInfo &MFI = MF.getFrameInfo();
6753
6754 unsigned Depth = I.getOperand(i: 2).getImm();
6755 Register DstReg = I.getOperand(i: 0).getReg();
6756 RBI.constrainGenericRegister(Reg: DstReg, RC: AArch64::GPR64RegClass, MRI);
6757
6758 if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
6759 if (!MFReturnAddr) {
6760 // Insert the copy from LR/X30 into the entry block, before it can be
6761 // clobbered by anything.
6762 MFI.setReturnAddressIsTaken(true);
6763 MFReturnAddr = getFunctionLiveInPhysReg(
6764 MF, TII, PhysReg: AArch64::LR, RC: AArch64::GPR64RegClass, DL: I.getDebugLoc());
6765 }
6766
6767 if (STI.hasPAuth()) {
6768 MIB.buildInstr(Opc: AArch64::XPACI, DstOps: {DstReg}, SrcOps: {MFReturnAddr});
6769 } else {
6770 MIB.buildCopy(Res: {Register(AArch64::LR)}, Op: {MFReturnAddr});
6771 MIB.buildInstr(Opcode: AArch64::XPACLRI);
6772 MIB.buildCopy(Res: {DstReg}, Op: {Register(AArch64::LR)});
6773 }
6774
6775 I.eraseFromParent();
6776 return true;
6777 }
6778
6779 MFI.setFrameAddressIsTaken(true);
6780 Register FrameAddr(AArch64::FP);
6781 while (Depth--) {
6782 Register NextFrame = MRI.createVirtualRegister(RegClass: &AArch64::GPR64spRegClass);
6783 auto Ldr =
6784 MIB.buildInstr(Opc: AArch64::LDRXui, DstOps: {NextFrame}, SrcOps: {FrameAddr}).addImm(Val: 0);
6785 constrainSelectedInstRegOperands(I&: *Ldr, TII, TRI, RBI);
6786 FrameAddr = NextFrame;
6787 }
6788
6789 if (IntrinID == Intrinsic::frameaddress)
6790 MIB.buildCopy(Res: {DstReg}, Op: {FrameAddr});
6791 else {
6792 MFI.setReturnAddressIsTaken(true);
6793
6794 if (STI.hasPAuth()) {
6795 Register TmpReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
6796 MIB.buildInstr(Opc: AArch64::LDRXui, DstOps: {TmpReg}, SrcOps: {FrameAddr}).addImm(Val: 1);
6797 MIB.buildInstr(Opc: AArch64::XPACI, DstOps: {DstReg}, SrcOps: {TmpReg});
6798 } else {
6799 MIB.buildInstr(Opc: AArch64::LDRXui, DstOps: {Register(AArch64::LR)}, SrcOps: {FrameAddr})
6800 .addImm(Val: 1);
6801 MIB.buildInstr(Opcode: AArch64::XPACLRI);
6802 MIB.buildCopy(Res: {DstReg}, Op: {Register(AArch64::LR)});
6803 }
6804 }
6805
6806 I.eraseFromParent();
6807 return true;
6808 }
6809 case Intrinsic::aarch64_neon_tbl2:
6810 SelectTable(I, MRI, NumVecs: 2, Opc1: AArch64::TBLv8i8Two, Opc2: AArch64::TBLv16i8Two, isExt: false);
6811 return true;
6812 case Intrinsic::aarch64_neon_tbl3:
6813 SelectTable(I, MRI, NumVecs: 3, Opc1: AArch64::TBLv8i8Three, Opc2: AArch64::TBLv16i8Three,
6814 isExt: false);
6815 return true;
6816 case Intrinsic::aarch64_neon_tbl4:
6817 SelectTable(I, MRI, NumVecs: 4, Opc1: AArch64::TBLv8i8Four, Opc2: AArch64::TBLv16i8Four, isExt: false);
6818 return true;
6819 case Intrinsic::aarch64_neon_tbx2:
6820 SelectTable(I, MRI, NumVecs: 2, Opc1: AArch64::TBXv8i8Two, Opc2: AArch64::TBXv16i8Two, isExt: true);
6821 return true;
6822 case Intrinsic::aarch64_neon_tbx3:
6823 SelectTable(I, MRI, NumVecs: 3, Opc1: AArch64::TBXv8i8Three, Opc2: AArch64::TBXv16i8Three, isExt: true);
6824 return true;
6825 case Intrinsic::aarch64_neon_tbx4:
6826 SelectTable(I, MRI, NumVecs: 4, Opc1: AArch64::TBXv8i8Four, Opc2: AArch64::TBXv16i8Four, isExt: true);
6827 return true;
6828 case Intrinsic::swift_async_context_addr:
6829 auto Sub = MIB.buildInstr(Opc: AArch64::SUBXri, DstOps: {I.getOperand(i: 0).getReg()},
6830 SrcOps: {Register(AArch64::FP)})
6831 .addImm(Val: 8)
6832 .addImm(Val: 0);
6833 constrainSelectedInstRegOperands(I&: *Sub, TII, TRI, RBI);
6834
6835 MF->getFrameInfo().setFrameAddressIsTaken(true);
6836 MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
6837 I.eraseFromParent();
6838 return true;
6839 }
6840 return false;
6841}
6842
6843// G_PTRAUTH_GLOBAL_VALUE lowering
6844//
6845// We have 3 lowering alternatives to choose from:
6846// - MOVaddrPAC: similar to MOVaddr, with added PAC.
6847// If the GV doesn't need a GOT load (i.e., is locally defined)
6848// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
6849//
6850// - LOADgotPAC: similar to LOADgot, with added PAC.
6851// If the GV needs a GOT load, materialize the pointer using the usual
6852// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
6853// section is assumed to be read-only (for example, via relro mechanism). See
6854// LowerMOVaddrPAC.
6855//
6856// - LOADauthptrstatic: similar to LOADgot, but use a
6857// special stub slot instead of a GOT slot.
6858// Load a signed pointer for symbol 'sym' from a stub slot named
6859// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
6860// resolving. This usually lowers to adrp+ldr, but also emits an entry into
6861// .data with an
6862// @AUTH relocation. See LowerLOADauthptrstatic.
6863//
// All 3 are pseudos that are expanded late to longer sequences: this lets us
// provide integrity guarantees on the to-be-signed intermediate values.
6866//
6867// LOADauthptrstatic is undesirable because it requires a large section filled
6868// with often similarly-signed pointers, making it a good harvesting target.
6869// Thus, it's only used for ptrauth references to extern_weak to avoid null
6870// checks.
6871
6872bool AArch64InstructionSelector::selectPtrAuthGlobalValue(
6873 MachineInstr &I, MachineRegisterInfo &MRI) const {
6874 Register DefReg = I.getOperand(i: 0).getReg();
6875 Register Addr = I.getOperand(i: 1).getReg();
6876 uint64_t Key = I.getOperand(i: 2).getImm();
6877 Register AddrDisc = I.getOperand(i: 3).getReg();
6878 uint64_t Disc = I.getOperand(i: 4).getImm();
6879 int64_t Offset = 0;
6880
6881 if (Key > AArch64PACKey::LAST)
6882 report_fatal_error(reason: "key in ptrauth global out of range [0, " +
6883 Twine((int)AArch64PACKey::LAST) + "]");
6884
6885 // Blend only works if the integer discriminator is 16-bit wide.
6886 if (!isUInt<16>(x: Disc))
6887 report_fatal_error(
6888 reason: "constant discriminator in ptrauth global out of range [0, 0xffff]");
6889
6890 // Choosing between 3 lowering alternatives is target-specific.
6891 if (!STI.isTargetELF() && !STI.isTargetMachO())
6892 report_fatal_error(reason: "ptrauth global lowering only supported on MachO/ELF");
6893
6894 if (!MRI.hasOneDef(RegNo: Addr))
6895 return false;
6896
6897 // First match any offset we take from the real global.
6898 const MachineInstr *DefMI = &*MRI.def_instr_begin(RegNo: Addr);
6899 if (DefMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6900 Register OffsetReg = DefMI->getOperand(i: 2).getReg();
6901 if (!MRI.hasOneDef(RegNo: OffsetReg))
6902 return false;
6903 const MachineInstr &OffsetMI = *MRI.def_instr_begin(RegNo: OffsetReg);
6904 if (OffsetMI.getOpcode() != TargetOpcode::G_CONSTANT)
6905 return false;
6906
6907 Addr = DefMI->getOperand(i: 1).getReg();
6908 if (!MRI.hasOneDef(RegNo: Addr))
6909 return false;
6910
6911 DefMI = &*MRI.def_instr_begin(RegNo: Addr);
6912 Offset = OffsetMI.getOperand(i: 1).getCImm()->getSExtValue();
6913 }
6914
6915 // We should be left with a genuine unauthenticated GlobalValue.
6916 const GlobalValue *GV;
6917 if (DefMI->getOpcode() == TargetOpcode::G_GLOBAL_VALUE) {
6918 GV = DefMI->getOperand(i: 1).getGlobal();
6919 Offset += DefMI->getOperand(i: 1).getOffset();
6920 } else if (DefMI->getOpcode() == AArch64::G_ADD_LOW) {
6921 GV = DefMI->getOperand(i: 2).getGlobal();
6922 Offset += DefMI->getOperand(i: 2).getOffset();
6923 } else {
6924 return false;
6925 }
6926
6927 MachineIRBuilder MIB(I);
6928
6929 // Classify the reference to determine whether it needs a GOT load.
6930 unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
6931 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
6932 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
6933 "unsupported non-GOT op flags on ptrauth global reference");
6934 assert((!GV->hasExternalWeakLinkage() || NeedsGOTLoad) &&
6935 "unsupported non-GOT reference to weak ptrauth global");
6936
6937 std::optional<APInt> AddrDiscVal = getIConstantVRegVal(VReg: AddrDisc, MRI);
6938 bool HasAddrDisc = !AddrDiscVal || *AddrDiscVal != 0;
6939
6940 // Non-extern_weak:
6941 // - No GOT load needed -> MOVaddrPAC
6942 // - GOT load for non-extern_weak -> LOADgotPAC
6943 // Note that we disallow extern_weak refs to avoid null checks later.
6944 if (!GV->hasExternalWeakLinkage()) {
6945 MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {AArch64::X16}, SrcOps: {});
6946 MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {AArch64::X17}, SrcOps: {});
6947 MIB.buildInstr(Opcode: NeedsGOTLoad ? AArch64::LOADgotPAC : AArch64::MOVaddrPAC)
6948 .addGlobalAddress(GV, Offset)
6949 .addImm(Val: Key)
6950 .addReg(RegNo: HasAddrDisc ? AddrDisc : AArch64::XZR)
6951 .addImm(Val: Disc)
6952 .constrainAllUses(TII, TRI, RBI);
6953 MIB.buildCopy(Res: DefReg, Op: Register(AArch64::X16));
6954 RBI.constrainGenericRegister(Reg: DefReg, RC: AArch64::GPR64RegClass, MRI);
6955 I.eraseFromParent();
6956 return true;
6957 }
6958
6959 // extern_weak -> LOADauthptrstatic
6960
6961 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
6962 // offset alone as a pointer if the symbol wasn't available, which would
6963 // probably break null checks in users. Ptrauth complicates things further:
6964 // error out.
6965 if (Offset != 0)
6966 report_fatal_error(
6967 reason: "unsupported non-zero offset in weak ptrauth global reference");
6968
6969 if (HasAddrDisc)
6970 report_fatal_error(reason: "unsupported weak addr-div ptrauth global");
6971
6972 MIB.buildInstr(Opc: AArch64::LOADauthptrstatic, DstOps: {DefReg}, SrcOps: {})
6973 .addGlobalAddress(GV, Offset)
6974 .addImm(Val: Key)
6975 .addImm(Val: Disc);
6976 RBI.constrainGenericRegister(Reg: DefReg, RC: AArch64::GPR64RegClass, MRI);
6977
6978 I.eraseFromParent();
6979 return true;
6980}
6981
6982void AArch64InstructionSelector::SelectTable(MachineInstr &I,
6983 MachineRegisterInfo &MRI,
6984 unsigned NumVec, unsigned Opc1,
6985 unsigned Opc2, bool isExt) {
6986 Register DstReg = I.getOperand(i: 0).getReg();
6987 unsigned Opc = MRI.getType(Reg: DstReg) == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 8) ? Opc1 : Opc2;
6988
6989 // Create the REG_SEQUENCE
6990 SmallVector<Register, 4> Regs;
6991 for (unsigned i = 0; i < NumVec; i++)
6992 Regs.push_back(Elt: I.getOperand(i: i + 2 + isExt).getReg());
6993 Register RegSeq = createQTuple(Regs, MIB);
6994
6995 Register IdxReg = I.getOperand(i: 2 + NumVec + isExt).getReg();
6996 MachineInstrBuilder Instr;
6997 if (isExt) {
6998 Register Reg = I.getOperand(i: 2).getReg();
6999 Instr = MIB.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {Reg, RegSeq, IdxReg});
7000 } else
7001 Instr = MIB.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {RegSeq, IdxReg});
7002 constrainSelectedInstRegOperands(I&: *Instr, TII, TRI, RBI);
7003 I.eraseFromParent();
7004}
7005
7006InstructionSelector::ComplexRendererFns
7007AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
7008 auto MaybeImmed = getImmedFromMO(Root);
7009 if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
7010 return std::nullopt;
7011 uint64_t Enc = (32 - *MaybeImmed) & 0x1f;
7012 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Enc); }}};
7013}
7014
7015InstructionSelector::ComplexRendererFns
7016AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
7017 auto MaybeImmed = getImmedFromMO(Root);
7018 if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
7019 return std::nullopt;
7020 uint64_t Enc = 31 - *MaybeImmed;
7021 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Enc); }}};
7022}
7023
7024InstructionSelector::ComplexRendererFns
7025AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
7026 auto MaybeImmed = getImmedFromMO(Root);
7027 if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
7028 return std::nullopt;
7029 uint64_t Enc = (64 - *MaybeImmed) & 0x3f;
7030 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Enc); }}};
7031}
7032
7033InstructionSelector::ComplexRendererFns
7034AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
7035 auto MaybeImmed = getImmedFromMO(Root);
7036 if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
7037 return std::nullopt;
7038 uint64_t Enc = 63 - *MaybeImmed;
7039 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Enc); }}};
7040}
7041
7042/// Helper to select an immediate value that can be represented as a 12-bit
7043/// value shifted left by either 0 or 12. If it is possible to do so, return
7044/// the immediate and shift value. If not, return std::nullopt.
7045///
7046/// Used by selectArithImmed and selectNegArithImmed.
7047InstructionSelector::ComplexRendererFns
7048AArch64InstructionSelector::select12BitValueWithLeftShift(
7049 uint64_t Immed) const {
7050 unsigned ShiftAmt;
7051 if (Immed >> 12 == 0) {
7052 ShiftAmt = 0;
7053 } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
7054 ShiftAmt = 12;
7055 Immed = Immed >> 12;
7056 } else
7057 return std::nullopt;
7058
7059 unsigned ShVal = AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: ShiftAmt);
7060 return {{
7061 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Immed); },
7062 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ShVal); },
7063 }};
7064}
7065
7066/// SelectArithImmed - Select an immediate value that can be represented as
7067/// a 12-bit value shifted left by either 0 or 12. If so, return true with
7068/// Val set to the 12-bit value and Shift set to the shifter operand.
7069InstructionSelector::ComplexRendererFns
7070AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
7071 // This function is called from the addsub_shifted_imm ComplexPattern,
7072 // which lists [imm] as the list of opcode it's interested in, however
7073 // we still need to check whether the operand is actually an immediate
7074 // here because the ComplexPattern opcode list is only used in
7075 // root-level opcode matching.
7076 auto MaybeImmed = getImmedFromMO(Root);
7077 if (MaybeImmed == std::nullopt)
7078 return std::nullopt;
7079 return select12BitValueWithLeftShift(Immed: *MaybeImmed);
7080}
7081
7082/// SelectNegArithImmed - As above, but negates the value before trying to
7083/// select it.
7084InstructionSelector::ComplexRendererFns
7085AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
7086 // We need a register here, because we need to know if we have a 64 or 32
7087 // bit immediate.
7088 if (!Root.isReg())
7089 return std::nullopt;
7090 auto MaybeImmed = getImmedFromMO(Root);
7091 if (MaybeImmed == std::nullopt)
7092 return std::nullopt;
7093 uint64_t Immed = *MaybeImmed;
7094
7095 // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
7096 // have the opposite effect on the C flag, so this pattern mustn't match under
7097 // those circumstances.
7098 if (Immed == 0)
7099 return std::nullopt;
7100
7101 // Check if we're dealing with a 32-bit type on the root or a 64-bit type on
7102 // the root.
7103 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7104 if (MRI.getType(Reg: Root.getReg()).getSizeInBits() == 32)
7105 Immed = ~((uint32_t)Immed) + 1;
7106 else
7107 Immed = ~Immed + 1ULL;
7108
7109 if (Immed & 0xFFFFFFFFFF000000ULL)
7110 return std::nullopt;
7111
7112 Immed &= 0xFFFFFFULL;
7113 return select12BitValueWithLeftShift(Immed);
7114}
7115
7116/// Checks if we are sure that folding MI into load/store addressing mode is
7117/// beneficial or not.
7118///
7119/// Returns:
7120/// - true if folding MI would be beneficial.
7121/// - false if folding MI would be bad.
7122/// - std::nullopt if it is not sure whether folding MI is beneficial.
7123///
7124/// \p MI can be the offset operand of G_PTR_ADD, e.g. G_SHL in the example:
7125///
7126/// %13:gpr(s64) = G_CONSTANT i64 1
7127/// %8:gpr(s64) = G_SHL %6, %13(s64)
7128/// %9:gpr(p0) = G_PTR_ADD %0, %8(s64)
7129/// %12:gpr(s32) = G_LOAD %9(p0) :: (load (s16))
7130std::optional<bool> AArch64InstructionSelector::isWorthFoldingIntoAddrMode(
7131 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
7132 if (MI.getOpcode() == AArch64::G_SHL) {
7133 // Address operands with shifts are free, except for running on subtargets
7134 // with AddrLSLSlow14.
7135 if (const auto ValAndVeg = getIConstantVRegValWithLookThrough(
7136 VReg: MI.getOperand(i: 2).getReg(), MRI)) {
7137 const APInt ShiftVal = ValAndVeg->Value;
7138
7139 // Don't fold if we know this will be slow.
7140 return !(STI.hasAddrLSLSlow14() && (ShiftVal == 1 || ShiftVal == 4));
7141 }
7142 }
7143 return std::nullopt;
7144}
7145
7146/// Return true if it is worth folding MI into an extended register. That is,
7147/// if it's safe to pull it into the addressing mode of a load or store as a
7148/// shift.
7149/// \p IsAddrOperand whether the def of MI is used as an address operand
7150/// (e.g. feeding into an LDR/STR).
7151bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
7152 const MachineInstr &MI, const MachineRegisterInfo &MRI,
7153 bool IsAddrOperand) const {
7154
7155 // Always fold if there is one use, or if we're optimizing for size.
7156 Register DefReg = MI.getOperand(i: 0).getReg();
7157 if (MRI.hasOneNonDBGUse(RegNo: DefReg) ||
7158 MI.getParent()->getParent()->getFunction().hasOptSize())
7159 return true;
7160
7161 if (IsAddrOperand) {
7162 // If we are already sure that folding MI is good or bad, return the result.
7163 if (const auto Worth = isWorthFoldingIntoAddrMode(MI, MRI))
7164 return *Worth;
7165
7166 // Fold G_PTR_ADD if its offset operand can be folded
7167 if (MI.getOpcode() == AArch64::G_PTR_ADD) {
7168 MachineInstr *OffsetInst =
7169 getDefIgnoringCopies(Reg: MI.getOperand(i: 2).getReg(), MRI);
7170
7171 // Note, we already know G_PTR_ADD is used by at least two instructions.
7172 // If we are also sure about whether folding is beneficial or not,
7173 // return the result.
7174 if (const auto Worth = isWorthFoldingIntoAddrMode(MI: *OffsetInst, MRI))
7175 return *Worth;
7176 }
7177 }
7178
7179 // FIXME: Consider checking HasALULSLFast as appropriate.
7180
7181 // We have a fastpath, so folding a shift in and potentially computing it
7182 // many times may be beneficial. Check if this is only used in memory ops.
7183 // If it is, then we should fold.
7184 return all_of(Range: MRI.use_nodbg_instructions(Reg: DefReg),
7185 P: [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
7186}
7187
7188InstructionSelector::ComplexRendererFns
7189AArch64InstructionSelector::selectExtendedSHL(
7190 MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
7191 unsigned SizeInBytes, bool WantsExt) const {
7192 assert(Base.isReg() && "Expected base to be a register operand");
7193 assert(Offset.isReg() && "Expected offset to be a register operand");
7194
7195 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7196 MachineInstr *OffsetInst = MRI.getVRegDef(Reg: Offset.getReg());
7197
7198 unsigned OffsetOpc = OffsetInst->getOpcode();
7199 bool LookedThroughZExt = false;
7200 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) {
7201 // Try to look through a ZEXT.
7202 if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt)
7203 return std::nullopt;
7204
7205 OffsetInst = MRI.getVRegDef(Reg: OffsetInst->getOperand(i: 1).getReg());
7206 OffsetOpc = OffsetInst->getOpcode();
7207 LookedThroughZExt = true;
7208
7209 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
7210 return std::nullopt;
7211 }
7212 // Make sure that the memory op is a valid size.
7213 int64_t LegalShiftVal = Log2_32(Value: SizeInBytes);
7214 if (LegalShiftVal == 0)
7215 return std::nullopt;
7216 if (!isWorthFoldingIntoExtendedReg(MI: *OffsetInst, MRI, IsAddrOperand: true))
7217 return std::nullopt;
7218
7219 // Now, try to find the specific G_CONSTANT. Start by assuming that the
7220 // register we will offset is the LHS, and the register containing the
7221 // constant is the RHS.
7222 Register OffsetReg = OffsetInst->getOperand(i: 1).getReg();
7223 Register ConstantReg = OffsetInst->getOperand(i: 2).getReg();
7224 auto ValAndVReg = getIConstantVRegValWithLookThrough(VReg: ConstantReg, MRI);
7225 if (!ValAndVReg) {
7226 // We didn't get a constant on the RHS. If the opcode is a shift, then
7227 // we're done.
7228 if (OffsetOpc == TargetOpcode::G_SHL)
7229 return std::nullopt;
7230
7231 // If we have a G_MUL, we can use either register. Try looking at the RHS.
7232 std::swap(a&: OffsetReg, b&: ConstantReg);
7233 ValAndVReg = getIConstantVRegValWithLookThrough(VReg: ConstantReg, MRI);
7234 if (!ValAndVReg)
7235 return std::nullopt;
7236 }
7237
7238 // The value must fit into 3 bits, and must be positive. Make sure that is
7239 // true.
7240 int64_t ImmVal = ValAndVReg->Value.getSExtValue();
7241
7242 // Since we're going to pull this into a shift, the constant value must be
7243 // a power of 2. If we got a multiply, then we need to check this.
7244 if (OffsetOpc == TargetOpcode::G_MUL) {
7245 if (!llvm::has_single_bit<uint32_t>(Value: ImmVal))
7246 return std::nullopt;
7247
7248 // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
7249 ImmVal = Log2_32(Value: ImmVal);
7250 }
7251
7252 if ((ImmVal & 0x7) != ImmVal)
7253 return std::nullopt;
7254
7255 // We are only allowed to shift by LegalShiftVal. This shift value is built
7256 // into the instruction, so we can't just use whatever we want.
7257 if (ImmVal != LegalShiftVal)
7258 return std::nullopt;
7259
7260 unsigned SignExtend = 0;
7261 if (WantsExt) {
7262 // Check if the offset is defined by an extend, unless we looked through a
7263 // G_ZEXT earlier.
7264 if (!LookedThroughZExt) {
7265 MachineInstr *ExtInst = getDefIgnoringCopies(Reg: OffsetReg, MRI);
7266 auto Ext = getExtendTypeForInst(MI&: *ExtInst, MRI, IsLoadStore: true);
7267 if (Ext == AArch64_AM::InvalidShiftExtend)
7268 return std::nullopt;
7269
7270 SignExtend = AArch64_AM::isSignExtendShiftType(Type: Ext) ? 1 : 0;
7271 // We only support SXTW for signed extension here.
7272 if (SignExtend && Ext != AArch64_AM::SXTW)
7273 return std::nullopt;
7274 OffsetReg = ExtInst->getOperand(i: 1).getReg();
7275 }
7276
7277 // Need a 32-bit wide register here.
7278 MachineIRBuilder MIB(*MRI.getVRegDef(Reg: Root.getReg()));
7279 OffsetReg = moveScalarRegClass(Reg: OffsetReg, RC: AArch64::GPR32RegClass, MIB);
7280 }
7281
7282 // We can use the LHS of the GEP as the base, and the LHS of the shift as an
7283 // offset. Signify that we are shifting by setting the shift flag to 1.
7284 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: Base.getReg()); },
7285 [=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: OffsetReg); },
7286 [=](MachineInstrBuilder &MIB) {
7287 // Need to add both immediates here to make sure that they are both
7288 // added to the instruction.
7289 MIB.addImm(Val: SignExtend);
7290 MIB.addImm(Val: 1);
7291 }}};
7292}
7293
7294/// This is used for computing addresses like this:
7295///
7296/// ldr x1, [x2, x3, lsl #3]
7297///
7298/// Where x2 is the base register, and x3 is an offset register. The shift-left
7299/// is a constant value specific to this load instruction. That is, we'll never
7300/// see anything other than a 3 here (which corresponds to the size of the
7301/// element being loaded.)
7302InstructionSelector::ComplexRendererFns
7303AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
7304 MachineOperand &Root, unsigned SizeInBytes) const {
7305 if (!Root.isReg())
7306 return std::nullopt;
7307 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7308
7309 // We want to find something like this:
7310 //
7311 // val = G_CONSTANT LegalShiftVal
7312 // shift = G_SHL off_reg val
7313 // ptr = G_PTR_ADD base_reg shift
7314 // x = G_LOAD ptr
7315 //
7316 // And fold it into this addressing mode:
7317 //
7318 // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
7319
7320 // Check if we can find the G_PTR_ADD.
7321 MachineInstr *PtrAdd =
7322 getOpcodeDef(Opcode: TargetOpcode::G_PTR_ADD, Reg: Root.getReg(), MRI);
7323 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(MI: *PtrAdd, MRI, IsAddrOperand: true))
7324 return std::nullopt;
7325
7326 // Now, try to match an opcode which will match our specific offset.
7327 // We want a G_SHL or a G_MUL.
7328 MachineInstr *OffsetInst =
7329 getDefIgnoringCopies(Reg: PtrAdd->getOperand(i: 2).getReg(), MRI);
7330 return selectExtendedSHL(Root, Base&: PtrAdd->getOperand(i: 1),
7331 Offset&: OffsetInst->getOperand(i: 0), SizeInBytes,
7332 /*WantsExt=*/false);
7333}
7334
7335/// This is used for computing addresses like this:
7336///
7337/// ldr x1, [x2, x3]
7338///
7339/// Where x2 is the base register, and x3 is an offset register.
7340///
7341/// When possible (or profitable) to fold a G_PTR_ADD into the address
7342/// calculation, this will do so. Otherwise, it will return std::nullopt.
7343InstructionSelector::ComplexRendererFns
7344AArch64InstructionSelector::selectAddrModeRegisterOffset(
7345 MachineOperand &Root) const {
7346 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7347
7348 // We need a GEP.
7349 MachineInstr *Gep = MRI.getVRegDef(Reg: Root.getReg());
7350 if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
7351 return std::nullopt;
7352
7353 // If this is used more than once, let's not bother folding.
7354 // TODO: Check if they are memory ops. If they are, then we can still fold
7355 // without having to recompute anything.
7356 if (!MRI.hasOneNonDBGUse(RegNo: Gep->getOperand(i: 0).getReg()))
7357 return std::nullopt;
7358
7359 // Base is the GEP's LHS, offset is its RHS.
7360 return {{[=](MachineInstrBuilder &MIB) {
7361 MIB.addUse(RegNo: Gep->getOperand(i: 1).getReg());
7362 },
7363 [=](MachineInstrBuilder &MIB) {
7364 MIB.addUse(RegNo: Gep->getOperand(i: 2).getReg());
7365 },
7366 [=](MachineInstrBuilder &MIB) {
7367 // Need to add both immediates here to make sure that they are both
7368 // added to the instruction.
7369 MIB.addImm(Val: 0);
7370 MIB.addImm(Val: 0);
7371 }}};
7372}
7373
7374/// This is intended to be equivalent to selectAddrModeXRO in
7375/// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
7376InstructionSelector::ComplexRendererFns
7377AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
7378 unsigned SizeInBytes) const {
7379 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7380 if (!Root.isReg())
7381 return std::nullopt;
7382 MachineInstr *PtrAdd =
7383 getOpcodeDef(Opcode: TargetOpcode::G_PTR_ADD, Reg: Root.getReg(), MRI);
7384 if (!PtrAdd)
7385 return std::nullopt;
7386
7387 // Check for an immediates which cannot be encoded in the [base + imm]
7388 // addressing mode, and can't be encoded in an add/sub. If this happens, we'll
7389 // end up with code like:
7390 //
7391 // mov x0, wide
7392 // add x1 base, x0
7393 // ldr x2, [x1, x0]
7394 //
7395 // In this situation, we can use the [base, xreg] addressing mode to save an
7396 // add/sub:
7397 //
7398 // mov x0, wide
7399 // ldr x2, [base, x0]
7400 auto ValAndVReg =
7401 getIConstantVRegValWithLookThrough(VReg: PtrAdd->getOperand(i: 2).getReg(), MRI);
7402 if (ValAndVReg) {
7403 unsigned Scale = Log2_32(Value: SizeInBytes);
7404 int64_t ImmOff = ValAndVReg->Value.getSExtValue();
7405
7406 // Skip immediates that can be selected in the load/store addressing
7407 // mode.
7408 if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
7409 ImmOff < (0x1000 << Scale))
7410 return std::nullopt;
7411
7412 // Helper lambda to decide whether or not it is preferable to emit an add.
7413 auto isPreferredADD = [](int64_t ImmOff) {
7414 // Constants in [0x0, 0xfff] can be encoded in an add.
7415 if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
7416 return true;
7417
7418 // Can it be encoded in an add lsl #12?
7419 if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
7420 return false;
7421
7422 // It can be encoded in an add lsl #12, but we may not want to. If it is
7423 // possible to select this as a single movz, then prefer that. A single
7424 // movz is faster than an add with a shift.
7425 return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
7426 (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
7427 };
7428
7429 // If the immediate can be encoded in a single add/sub, then bail out.
7430 if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
7431 return std::nullopt;
7432 }
7433
7434 // Try to fold shifts into the addressing mode.
7435 auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
7436 if (AddrModeFns)
7437 return AddrModeFns;
7438
7439 // If that doesn't work, see if it's possible to fold in registers from
7440 // a GEP.
7441 return selectAddrModeRegisterOffset(Root);
7442}
7443
7444/// This is used for computing addresses like this:
7445///
7446/// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
7447///
7448/// Where we have a 64-bit base register, a 32-bit offset register, and an
7449/// extend (which may or may not be signed).
7450InstructionSelector::ComplexRendererFns
7451AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
7452 unsigned SizeInBytes) const {
7453 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7454
7455 MachineInstr *PtrAdd =
7456 getOpcodeDef(Opcode: TargetOpcode::G_PTR_ADD, Reg: Root.getReg(), MRI);
7457 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(MI: *PtrAdd, MRI, IsAddrOperand: true))
7458 return std::nullopt;
7459
7460 MachineOperand &LHS = PtrAdd->getOperand(i: 1);
7461 MachineOperand &RHS = PtrAdd->getOperand(i: 2);
7462 MachineInstr *OffsetInst = getDefIgnoringCopies(Reg: RHS.getReg(), MRI);
7463
7464 // The first case is the same as selectAddrModeXRO, except we need an extend.
7465 // In this case, we try to find a shift and extend, and fold them into the
7466 // addressing mode.
7467 //
7468 // E.g.
7469 //
7470 // off_reg = G_Z/S/ANYEXT ext_reg
7471 // val = G_CONSTANT LegalShiftVal
7472 // shift = G_SHL off_reg val
7473 // ptr = G_PTR_ADD base_reg shift
7474 // x = G_LOAD ptr
7475 //
7476 // In this case we can get a load like this:
7477 //
7478 // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
7479 auto ExtendedShl = selectExtendedSHL(Root, Base&: LHS, Offset&: OffsetInst->getOperand(i: 0),
7480 SizeInBytes, /*WantsExt=*/true);
7481 if (ExtendedShl)
7482 return ExtendedShl;
7483
7484 // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though.
7485 //
7486 // e.g.
7487 // ldr something, [base_reg, ext_reg, sxtw]
7488 if (!isWorthFoldingIntoExtendedReg(MI: *OffsetInst, MRI, IsAddrOperand: true))
7489 return std::nullopt;
7490
7491 // Check if this is an extend. We'll get an extend type if it is.
7492 AArch64_AM::ShiftExtendType Ext =
7493 getExtendTypeForInst(MI&: *OffsetInst, MRI, /*IsLoadStore=*/true);
7494 if (Ext == AArch64_AM::InvalidShiftExtend)
7495 return std::nullopt;
7496
7497 // Need a 32-bit wide register.
7498 MachineIRBuilder MIB(*PtrAdd);
7499 Register ExtReg = moveScalarRegClass(Reg: OffsetInst->getOperand(i: 1).getReg(),
7500 RC: AArch64::GPR32RegClass, MIB);
7501 unsigned SignExtend = Ext == AArch64_AM::SXTW;
7502
7503 // Base is LHS, offset is ExtReg.
7504 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: LHS.getReg()); },
7505 [=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: ExtReg); },
7506 [=](MachineInstrBuilder &MIB) {
7507 MIB.addImm(Val: SignExtend);
7508 MIB.addImm(Val: 0);
7509 }}};
7510}
7511
7512/// Select a "register plus unscaled signed 9-bit immediate" address. This
7513/// should only match when there is an offset that is not valid for a scaled
7514/// immediate addressing mode. The "Size" argument is the size in bytes of the
7515/// memory reference, which is needed here to know what is valid for a scaled
7516/// immediate.
7517InstructionSelector::ComplexRendererFns
7518AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
7519 unsigned Size) const {
7520 MachineRegisterInfo &MRI =
7521 Root.getParent()->getParent()->getParent()->getRegInfo();
7522
7523 if (!Root.isReg())
7524 return std::nullopt;
7525
7526 if (!isBaseWithConstantOffset(Root, MRI))
7527 return std::nullopt;
7528
7529 MachineInstr *RootDef = MRI.getVRegDef(Reg: Root.getReg());
7530
7531 MachineOperand &OffImm = RootDef->getOperand(i: 2);
7532 if (!OffImm.isReg())
7533 return std::nullopt;
7534 MachineInstr *RHS = MRI.getVRegDef(Reg: OffImm.getReg());
7535 if (RHS->getOpcode() != TargetOpcode::G_CONSTANT)
7536 return std::nullopt;
7537 int64_t RHSC;
7538 MachineOperand &RHSOp1 = RHS->getOperand(i: 1);
7539 if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
7540 return std::nullopt;
7541 RHSC = RHSOp1.getCImm()->getSExtValue();
7542
7543 if (RHSC >= -256 && RHSC < 256) {
7544 MachineOperand &Base = RootDef->getOperand(i: 1);
7545 return {{
7546 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Base); },
7547 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: RHSC); },
7548 }};
7549 }
7550 return std::nullopt;
7551}
7552
7553InstructionSelector::ComplexRendererFns
7554AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
7555 unsigned Size,
7556 MachineRegisterInfo &MRI) const {
7557 if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
7558 return std::nullopt;
7559 MachineInstr &Adrp = *MRI.getVRegDef(Reg: RootDef.getOperand(i: 1).getReg());
7560 if (Adrp.getOpcode() != AArch64::ADRP)
7561 return std::nullopt;
7562
7563 // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
7564 auto Offset = Adrp.getOperand(i: 1).getOffset();
7565 if (Offset % Size != 0)
7566 return std::nullopt;
7567
7568 auto GV = Adrp.getOperand(i: 1).getGlobal();
7569 if (GV->isThreadLocal())
7570 return std::nullopt;
7571
7572 auto &MF = *RootDef.getParent()->getParent();
7573 if (GV->getPointerAlignment(DL: MF.getDataLayout()) < Size)
7574 return std::nullopt;
7575
7576 unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM: MF.getTarget());
7577 MachineIRBuilder MIRBuilder(RootDef);
7578 Register AdrpReg = Adrp.getOperand(i: 0).getReg();
7579 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: AdrpReg); },
7580 [=](MachineInstrBuilder &MIB) {
7581 MIB.addGlobalAddress(GV, Offset,
7582 TargetFlags: OpFlags | AArch64II::MO_PAGEOFF |
7583 AArch64II::MO_NC);
7584 }}};
7585}
7586
7587/// Select a "register plus scaled unsigned 12-bit immediate" address. The
7588/// "Size" argument is the size in bytes of the memory reference, which
7589/// determines the scale.
7590InstructionSelector::ComplexRendererFns
7591AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
7592 unsigned Size) const {
7593 MachineFunction &MF = *Root.getParent()->getParent()->getParent();
7594 MachineRegisterInfo &MRI = MF.getRegInfo();
7595
7596 if (!Root.isReg())
7597 return std::nullopt;
7598
7599 MachineInstr *RootDef = MRI.getVRegDef(Reg: Root.getReg());
7600 if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
7601 return {{
7602 [=](MachineInstrBuilder &MIB) { MIB.add(MO: RootDef->getOperand(i: 1)); },
7603 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); },
7604 }};
7605 }
7606
7607 CodeModel::Model CM = MF.getTarget().getCodeModel();
7608 // Check if we can fold in the ADD of small code model ADRP + ADD address.
7609 // HACK: ld64 on Darwin doesn't support relocations on PRFM, so we can't fold
7610 // globals into the offset.
7611 MachineInstr *RootParent = Root.getParent();
7612 if (CM == CodeModel::Small &&
7613 !(RootParent->getOpcode() == AArch64::G_AARCH64_PREFETCH &&
7614 STI.isTargetDarwin())) {
7615 auto OpFns = tryFoldAddLowIntoImm(RootDef&: *RootDef, Size, MRI);
7616 if (OpFns)
7617 return OpFns;
7618 }
7619
7620 if (isBaseWithConstantOffset(Root, MRI)) {
7621 MachineOperand &LHS = RootDef->getOperand(i: 1);
7622 MachineOperand &RHS = RootDef->getOperand(i: 2);
7623 MachineInstr *LHSDef = MRI.getVRegDef(Reg: LHS.getReg());
7624 MachineInstr *RHSDef = MRI.getVRegDef(Reg: RHS.getReg());
7625
7626 int64_t RHSC = (int64_t)RHSDef->getOperand(i: 1).getCImm()->getZExtValue();
7627 unsigned Scale = Log2_32(Value: Size);
7628 if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
7629 if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
7630 return {{
7631 [=](MachineInstrBuilder &MIB) { MIB.add(MO: LHSDef->getOperand(i: 1)); },
7632 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: RHSC >> Scale); },
7633 }};
7634
7635 return {{
7636 [=](MachineInstrBuilder &MIB) { MIB.add(MO: LHS); },
7637 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: RHSC >> Scale); },
7638 }};
7639 }
7640 }
7641
7642 // Before falling back to our general case, check if the unscaled
7643 // instructions can handle this. If so, that's preferable.
7644 if (selectAddrModeUnscaled(Root, Size))
7645 return std::nullopt;
7646
7647 return {{
7648 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); },
7649 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); },
7650 }};
7651}
7652
7653/// Given a shift instruction, return the correct shift type for that
7654/// instruction.
7655static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
7656 switch (MI.getOpcode()) {
7657 default:
7658 return AArch64_AM::InvalidShiftExtend;
7659 case TargetOpcode::G_SHL:
7660 return AArch64_AM::LSL;
7661 case TargetOpcode::G_LSHR:
7662 return AArch64_AM::LSR;
7663 case TargetOpcode::G_ASHR:
7664 return AArch64_AM::ASR;
7665 case TargetOpcode::G_ROTR:
7666 return AArch64_AM::ROR;
7667 }
7668}
7669
7670/// Select a "shifted register" operand. If the value is not shifted, set the
7671/// shift operand to a default value of "lsl 0".
7672InstructionSelector::ComplexRendererFns
7673AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
7674 bool AllowROR) const {
7675 if (!Root.isReg())
7676 return std::nullopt;
7677 MachineRegisterInfo &MRI =
7678 Root.getParent()->getParent()->getParent()->getRegInfo();
7679
7680 // Check if the operand is defined by an instruction which corresponds to
7681 // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
7682 MachineInstr *ShiftInst = MRI.getVRegDef(Reg: Root.getReg());
7683 AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(MI&: *ShiftInst);
7684 if (ShType == AArch64_AM::InvalidShiftExtend)
7685 return std::nullopt;
7686 if (ShType == AArch64_AM::ROR && !AllowROR)
7687 return std::nullopt;
7688 if (!isWorthFoldingIntoExtendedReg(MI: *ShiftInst, MRI, IsAddrOperand: false))
7689 return std::nullopt;
7690
7691 // Need an immediate on the RHS.
7692 MachineOperand &ShiftRHS = ShiftInst->getOperand(i: 2);
7693 auto Immed = getImmedFromMO(Root: ShiftRHS);
7694 if (!Immed)
7695 return std::nullopt;
7696
7697 // We have something that we can fold. Fold in the shift's LHS and RHS into
7698 // the instruction.
7699 MachineOperand &ShiftLHS = ShiftInst->getOperand(i: 1);
7700 Register ShiftReg = ShiftLHS.getReg();
7701
7702 unsigned NumBits = MRI.getType(Reg: ShiftReg).getSizeInBits();
7703 unsigned Val = *Immed & (NumBits - 1);
7704 unsigned ShiftVal = AArch64_AM::getShifterImm(ST: ShType, Imm: Val);
7705
7706 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: ShiftReg); },
7707 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ShiftVal); }}};
7708}
7709
7710AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
7711 MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
7712 unsigned Opc = MI.getOpcode();
7713
7714 // Handle explicit extend instructions first.
7715 if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
7716 unsigned Size;
7717 if (Opc == TargetOpcode::G_SEXT)
7718 Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
7719 else
7720 Size = MI.getOperand(i: 2).getImm();
7721 assert(Size != 64 && "Extend from 64 bits?");
7722 switch (Size) {
7723 case 8:
7724 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
7725 case 16:
7726 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
7727 case 32:
7728 return AArch64_AM::SXTW;
7729 default:
7730 return AArch64_AM::InvalidShiftExtend;
7731 }
7732 }
7733
7734 if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
7735 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
7736 assert(Size != 64 && "Extend from 64 bits?");
7737 switch (Size) {
7738 case 8:
7739 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
7740 case 16:
7741 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
7742 case 32:
7743 return AArch64_AM::UXTW;
7744 default:
7745 return AArch64_AM::InvalidShiftExtend;
7746 }
7747 }
7748
7749 // Don't have an explicit extend. Try to handle a G_AND with a constant mask
7750 // on the RHS.
7751 if (Opc != TargetOpcode::G_AND)
7752 return AArch64_AM::InvalidShiftExtend;
7753
7754 std::optional<uint64_t> MaybeAndMask = getImmedFromMO(Root: MI.getOperand(i: 2));
7755 if (!MaybeAndMask)
7756 return AArch64_AM::InvalidShiftExtend;
7757 uint64_t AndMask = *MaybeAndMask;
7758 switch (AndMask) {
7759 default:
7760 return AArch64_AM::InvalidShiftExtend;
7761 case 0xFF:
7762 return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
7763 case 0xFFFF:
7764 return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
7765 case 0xFFFFFFFF:
7766 return AArch64_AM::UXTW;
7767 }
7768}
7769
7770Register AArch64InstructionSelector::moveScalarRegClass(
7771 Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
7772 MachineRegisterInfo &MRI = *MIB.getMRI();
7773 auto Ty = MRI.getType(Reg);
7774 assert(!Ty.isVector() && "Expected scalars only!");
7775 if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
7776 return Reg;
7777
7778 // Create a copy and immediately select it.
7779 // FIXME: We should have an emitCopy function?
7780 auto Copy = MIB.buildCopy(Res: {&RC}, Op: {Reg});
7781 selectCopy(I&: *Copy, TII, MRI, TRI, RBI);
7782 return Copy.getReg(Idx: 0);
7783}
7784
7785/// Select an "extended register" operand. This operand folds in an extend
7786/// followed by an optional left shift.
7787InstructionSelector::ComplexRendererFns
7788AArch64InstructionSelector::selectArithExtendedRegister(
7789 MachineOperand &Root) const {
7790 if (!Root.isReg())
7791 return std::nullopt;
7792 MachineRegisterInfo &MRI =
7793 Root.getParent()->getParent()->getParent()->getRegInfo();
7794
7795 uint64_t ShiftVal = 0;
7796 Register ExtReg;
7797 AArch64_AM::ShiftExtendType Ext;
7798 MachineInstr *RootDef = getDefIgnoringCopies(Reg: Root.getReg(), MRI);
7799 if (!RootDef)
7800 return std::nullopt;
7801
7802 if (!isWorthFoldingIntoExtendedReg(MI: *RootDef, MRI, IsAddrOperand: false))
7803 return std::nullopt;
7804
7805 // Check if we can fold a shift and an extend.
7806 if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
7807 // Look for a constant on the RHS of the shift.
7808 MachineOperand &RHS = RootDef->getOperand(i: 2);
7809 std::optional<uint64_t> MaybeShiftVal = getImmedFromMO(Root: RHS);
7810 if (!MaybeShiftVal)
7811 return std::nullopt;
7812 ShiftVal = *MaybeShiftVal;
7813 if (ShiftVal > 4)
7814 return std::nullopt;
7815 // Look for a valid extend instruction on the LHS of the shift.
7816 MachineOperand &LHS = RootDef->getOperand(i: 1);
7817 MachineInstr *ExtDef = getDefIgnoringCopies(Reg: LHS.getReg(), MRI);
7818 if (!ExtDef)
7819 return std::nullopt;
7820 Ext = getExtendTypeForInst(MI&: *ExtDef, MRI);
7821 if (Ext == AArch64_AM::InvalidShiftExtend)
7822 return std::nullopt;
7823 ExtReg = ExtDef->getOperand(i: 1).getReg();
7824 } else {
7825 // Didn't get a shift. Try just folding an extend.
7826 Ext = getExtendTypeForInst(MI&: *RootDef, MRI);
7827 if (Ext == AArch64_AM::InvalidShiftExtend)
7828 return std::nullopt;
7829 ExtReg = RootDef->getOperand(i: 1).getReg();
7830
7831 // If we have a 32 bit instruction which zeroes out the high half of a
7832 // register, we get an implicit zero extend for free. Check if we have one.
7833 // FIXME: We actually emit the extend right now even though we don't have
7834 // to.
7835 if (Ext == AArch64_AM::UXTW && MRI.getType(Reg: ExtReg).getSizeInBits() == 32) {
7836 MachineInstr *ExtInst = MRI.getVRegDef(Reg: ExtReg);
7837 if (isDef32(MI: *ExtInst))
7838 return std::nullopt;
7839 }
7840 }
7841
7842 // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
7843 // copy.
7844 MachineIRBuilder MIB(*RootDef);
7845 ExtReg = moveScalarRegClass(Reg: ExtReg, RC: AArch64::GPR32RegClass, MIB);
7846
7847 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: ExtReg); },
7848 [=](MachineInstrBuilder &MIB) {
7849 MIB.addImm(Val: getArithExtendImm(ET: Ext, Imm: ShiftVal));
7850 }}};
7851}
7852
7853InstructionSelector::ComplexRendererFns
7854AArch64InstructionSelector::selectExtractHigh(MachineOperand &Root) const {
7855 if (!Root.isReg())
7856 return std::nullopt;
7857 MachineRegisterInfo &MRI =
7858 Root.getParent()->getParent()->getParent()->getRegInfo();
7859
7860 auto Extract = getDefSrcRegIgnoringCopies(Reg: Root.getReg(), MRI);
7861 while (Extract && Extract->MI->getOpcode() == TargetOpcode::G_BITCAST &&
7862 STI.isLittleEndian())
7863 Extract =
7864 getDefSrcRegIgnoringCopies(Reg: Extract->MI->getOperand(i: 1).getReg(), MRI);
7865 if (!Extract)
7866 return std::nullopt;
7867
7868 if (auto *Unmerge = dyn_cast<GUnmerge>(Val: Extract->MI)) {
7869 if (Unmerge->getNumDefs() == 2 &&
7870 Extract->Reg == Unmerge->getOperand(i: 1).getReg()) {
7871 Register ExtReg = Unmerge->getSourceReg();
7872 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: ExtReg); }}};
7873 }
7874 }
7875 if (auto *ExtElt = dyn_cast<GExtractVectorElement>(Val: Extract->MI)) {
7876 LLT SrcTy = MRI.getType(Reg: ExtElt->getVectorReg());
7877 auto LaneIdx =
7878 getIConstantVRegValWithLookThrough(VReg: ExtElt->getIndexReg(), MRI);
7879 if (LaneIdx && SrcTy == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64) &&
7880 LaneIdx->Value.getSExtValue() == 1) {
7881 Register ExtReg = ExtElt->getVectorReg();
7882 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: ExtReg); }}};
7883 }
7884 }
7885 if (auto *Subvec = dyn_cast<GExtractSubvector>(Val: Extract->MI)) {
7886 LLT SrcTy = MRI.getType(Reg: Subvec->getSrcVec());
7887 auto LaneIdx = Subvec->getIndexImm();
7888 if (LaneIdx == SrcTy.getNumElements() / 2) {
7889 Register ExtReg = Subvec->getSrcVec();
7890 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: ExtReg); }}};
7891 }
7892 }
7893
7894 return std::nullopt;
7895}
7896
7897InstructionSelector::ComplexRendererFns
7898AArch64InstructionSelector::selectCVTFixedPointVecBase(
7899 const MachineOperand &Root, bool isReciprocal) const {
7900 if (!Root.isReg())
7901 return std::nullopt;
7902 const MachineRegisterInfo &MRI =
7903 Root.getParent()->getParent()->getParent()->getRegInfo();
7904
7905 MachineInstr *Dup = getDefIgnoringCopies(Reg: Root.getReg(), MRI);
7906 if (Dup->getOpcode() != AArch64::G_DUP)
7907 return std::nullopt;
7908 std::optional<ValueAndVReg> CstVal =
7909 getAnyConstantVRegValWithLookThrough(VReg: Dup->getOperand(i: 1).getReg(), MRI);
7910 if (!CstVal)
7911 return std::nullopt;
7912
7913 unsigned RegWidth = MRI.getType(Reg: Root.getReg()).getScalarSizeInBits();
7914 APFloat FVal(0.0);
7915 switch (RegWidth) {
7916 case 16:
7917 FVal = APFloat(APFloat::IEEEhalf(), CstVal->Value);
7918 break;
7919 case 32:
7920 FVal = APFloat(APFloat::IEEEsingle(), CstVal->Value);
7921 break;
7922 case 64:
7923 FVal = APFloat(APFloat::IEEEdouble(), CstVal->Value);
7924 break;
7925 default:
7926 return std::nullopt;
7927 };
7928 if (unsigned FBits =
7929 CheckFixedPointOperandConstant(FVal, RegWidth, isReciprocal))
7930 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: FBits); }}};
7931
7932 return std::nullopt;
7933}
7934
7935InstructionSelector::ComplexRendererFns
7936AArch64InstructionSelector::selectCVTFixedPointVec(MachineOperand &Root) const {
7937 return selectCVTFixedPointVecBase(Root, /*isReciprocal*/ false);
7938}
7939
7940InstructionSelector::ComplexRendererFns
7941AArch64InstructionSelector::selectCVTFixedPosRecipOperandVec(
7942 MachineOperand &Root) const {
7943 return selectCVTFixedPointVecBase(Root, /*isReciprocal*/ true);
7944}
7945
7946void AArch64InstructionSelector::renderFixedPointXForm(MachineInstrBuilder &MIB,
7947 const MachineInstr &MI,
7948 int OpIdx) const {
7949 // FIXME: This is only needed to satisfy the type checking in tablegen, and
7950 // should be able to reuse the Renderers already calculated by
7951 // selectCVTFixedPointVecBase.
7952 InstructionSelector::ComplexRendererFns Renderer =
7953 selectCVTFixedPointVecBase(Root: MI.getOperand(i: OpIdx), /*isReciprocal*/ false);
7954 assert((Renderer && Renderer->size() == 1) &&
7955 "Expected selectCVTFixedPointVec to provide a function\n");
7956 (Renderer->front())(MIB);
7957}
7958
7959void AArch64InstructionSelector::renderFixedPointRecipXForm(
7960 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7961 InstructionSelector::ComplexRendererFns Renderer =
7962 selectCVTFixedPointVecBase(Root: MI.getOperand(i: OpIdx), /*isReciprocal*/ true);
7963 assert((Renderer && Renderer->size() == 1) &&
7964 "Expected selectCVTFixedPosRecipOperandVec to provide a function\n");
7965 (Renderer->front())(MIB);
7966}
7967
7968void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
7969 const MachineInstr &MI,
7970 int OpIdx) const {
7971 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
7972 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7973 "Expected G_CONSTANT");
7974 std::optional<int64_t> CstVal =
7975 getIConstantVRegSExtVal(VReg: MI.getOperand(i: 0).getReg(), MRI);
7976 assert(CstVal && "Expected constant value");
7977 MIB.addImm(Val: *CstVal);
7978}
7979
7980void AArch64InstructionSelector::renderLogicalImm32(
7981 MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
7982 assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7983 "Expected G_CONSTANT");
7984 uint64_t CstVal = I.getOperand(i: 1).getCImm()->getZExtValue();
7985 uint64_t Enc = AArch64_AM::encodeLogicalImmediate(imm: CstVal, regSize: 32);
7986 MIB.addImm(Val: Enc);
7987}
7988
7989void AArch64InstructionSelector::renderLogicalImm64(
7990 MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
7991 assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7992 "Expected G_CONSTANT");
7993 uint64_t CstVal = I.getOperand(i: 1).getCImm()->getZExtValue();
7994 uint64_t Enc = AArch64_AM::encodeLogicalImmediate(imm: CstVal, regSize: 64);
7995 MIB.addImm(Val: Enc);
7996}
7997
7998void AArch64InstructionSelector::renderUbsanTrap(MachineInstrBuilder &MIB,
7999 const MachineInstr &MI,
8000 int OpIdx) const {
8001 assert(MI.getOpcode() == TargetOpcode::G_UBSANTRAP && OpIdx == 0 &&
8002 "Expected G_UBSANTRAP");
8003 MIB.addImm(Val: MI.getOperand(i: 0).getImm() | ('U' << 8));
8004}
8005
8006void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
8007 const MachineInstr &MI,
8008 int OpIdx) const {
8009 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
8010 "Expected G_FCONSTANT");
8011 MIB.addImm(
8012 Val: AArch64_AM::getFP16Imm(FPImm: MI.getOperand(i: 1).getFPImm()->getValueAPF()));
8013}
8014
8015void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
8016 const MachineInstr &MI,
8017 int OpIdx) const {
8018 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
8019 "Expected G_FCONSTANT");
8020 MIB.addImm(
8021 Val: AArch64_AM::getFP32Imm(FPImm: MI.getOperand(i: 1).getFPImm()->getValueAPF()));
8022}
8023
8024void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
8025 const MachineInstr &MI,
8026 int OpIdx) const {
8027 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
8028 "Expected G_FCONSTANT");
8029 MIB.addImm(
8030 Val: AArch64_AM::getFP64Imm(FPImm: MI.getOperand(i: 1).getFPImm()->getValueAPF()));
8031}
8032
8033void AArch64InstructionSelector::renderFPImm32SIMDModImmType4(
8034 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
8035 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
8036 "Expected G_FCONSTANT");
8037 MIB.addImm(Val: AArch64_AM::encodeAdvSIMDModImmType4(Imm: MI.getOperand(i: 1)
8038 .getFPImm()
8039 ->getValueAPF()
8040 .bitcastToAPInt()
8041 .getZExtValue()));
8042}
8043
8044bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
8045 const MachineInstr &MI, unsigned NumBytes) const {
8046 if (!MI.mayLoadOrStore())
8047 return false;
8048 assert(MI.hasOneMemOperand() &&
8049 "Expected load/store to have only one mem op!");
8050 return (*MI.memoperands_begin())->getSize() == NumBytes;
8051}
8052
8053bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
8054 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8055 if (MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits() != 32)
8056 return false;
8057
8058 // Only return true if we know the operation will zero-out the high half of
8059 // the 64-bit register. Truncates can be subregister copies, which don't
8060 // zero out the high bits. Copies and other copy-like instructions can be
8061 // fed by truncates, or could be lowered as subregister copies.
8062 switch (MI.getOpcode()) {
8063 default:
8064 return true;
8065 case TargetOpcode::COPY:
8066 case TargetOpcode::G_BITCAST:
8067 case TargetOpcode::G_TRUNC:
8068 case TargetOpcode::G_PHI:
8069 return false;
8070 }
8071}
8072
8073
8074// Perform fixups on the given PHI instruction's operands to force them all
8075// to be the same as the destination regbank.
8076static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
8077 const AArch64RegisterBankInfo &RBI) {
8078 assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
8079 Register DstReg = MI.getOperand(i: 0).getReg();
8080 const RegisterBank *DstRB = MRI.getRegBankOrNull(Reg: DstReg);
8081 assert(DstRB && "Expected PHI dst to have regbank assigned");
8082 MachineIRBuilder MIB(MI);
8083
8084 // Go through each operand and ensure it has the same regbank.
8085 for (MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI.operands())) {
8086 if (!MO.isReg())
8087 continue;
8088 Register OpReg = MO.getReg();
8089 const RegisterBank *RB = MRI.getRegBankOrNull(Reg: OpReg);
8090 if (RB != DstRB) {
8091 // Insert a cross-bank copy.
8092 auto *OpDef = MRI.getVRegDef(Reg: OpReg);
8093 const LLT &Ty = MRI.getType(Reg: OpReg);
8094 MachineBasicBlock &OpDefBB = *OpDef->getParent();
8095
8096 // Any instruction we insert must appear after all PHIs in the block
8097 // for the block to be valid MIR.
8098 MachineBasicBlock::iterator InsertPt = std::next(x: OpDef->getIterator());
8099 if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
8100 InsertPt = OpDefBB.getFirstNonPHI();
8101 MIB.setInsertPt(MBB&: *OpDef->getParent(), II: InsertPt);
8102 auto Copy = MIB.buildCopy(Res: Ty, Op: OpReg);
8103 MRI.setRegBank(Reg: Copy.getReg(Idx: 0), RegBank: *DstRB);
8104 MO.setReg(Copy.getReg(Idx: 0));
8105 }
8106 }
8107}
8108
8109void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
8110 // We're looking for PHIs, build a list so we don't invalidate iterators.
8111 MachineRegisterInfo &MRI = MF.getRegInfo();
8112 SmallVector<MachineInstr *, 32> Phis;
8113 for (auto &BB : MF) {
8114 for (auto &MI : BB) {
8115 if (MI.getOpcode() == TargetOpcode::G_PHI)
8116 Phis.emplace_back(Args: &MI);
8117 }
8118 }
8119
8120 for (auto *MI : Phis) {
8121 // We need to do some work here if the operand types are < 16 bit and they
8122 // are split across fpr/gpr banks. Since all types <32b on gpr
8123 // end up being assigned gpr32 regclasses, we can end up with PHIs here
8124 // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
8125 // be selecting heterogenous regbanks for operands if possible, but we
8126 // still need to be able to deal with it here.
8127 //
8128 // To fix this, if we have a gpr-bank operand < 32b in size and at least
8129 // one other operand is on the fpr bank, then we add cross-bank copies
8130 // to homogenize the operand banks. For simplicity the bank that we choose
8131 // to settle on is whatever bank the def operand has. For example:
8132 //
8133 // %endbb:
8134 // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
8135 // =>
8136 // %bb2:
8137 // ...
8138 // %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
8139 // ...
8140 // %endbb:
8141 // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
8142 bool HasGPROp = false, HasFPROp = false;
8143 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI->operands())) {
8144 if (!MO.isReg())
8145 continue;
8146 const LLT &Ty = MRI.getType(Reg: MO.getReg());
8147 if (!Ty.isValid() || !Ty.isScalar())
8148 break;
8149 if (Ty.getSizeInBits() >= 32)
8150 break;
8151 const RegisterBank *RB = MRI.getRegBankOrNull(Reg: MO.getReg());
8152 // If for some reason we don't have a regbank yet. Don't try anything.
8153 if (!RB)
8154 break;
8155
8156 if (RB->getID() == AArch64::GPRRegBankID)
8157 HasGPROp = true;
8158 else
8159 HasFPROp = true;
8160 }
8161 // We have heterogenous regbanks, need to fixup.
8162 if (HasGPROp && HasFPROp)
8163 fixupPHIOpBanks(MI&: *MI, MRI, RBI);
8164 }
8165}
8166
8167namespace llvm {
8168InstructionSelector *
8169createAArch64InstructionSelector(const AArch64TargetMachine &TM,
8170 const AArch64Subtarget &Subtarget,
8171 const AArch64RegisterBankInfo &RBI) {
8172 return new AArch64InstructionSelector(TM, Subtarget, RBI);
8173}
8174}
8175