1//===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AArch64.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AArch64GlobalISelUtils.h"
15#include "AArch64InstrInfo.h"
16#include "AArch64MachineFunctionInfo.h"
17#include "AArch64RegisterBankInfo.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
20#include "AArch64TargetMachine.h"
21#include "MCTargetDesc/AArch64AddressingModes.h"
22#include "MCTargetDesc/AArch64MCTargetDesc.h"
23#include "llvm/BinaryFormat/Dwarf.h"
24#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
25#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
26#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
27#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
28#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
29#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
30#include "llvm/CodeGen/GlobalISel/Utils.h"
31#include "llvm/CodeGen/MachineBasicBlock.h"
32#include "llvm/CodeGen/MachineConstantPool.h"
33#include "llvm/CodeGen/MachineFrameInfo.h"
34#include "llvm/CodeGen/MachineFunction.h"
35#include "llvm/CodeGen/MachineInstr.h"
36#include "llvm/CodeGen/MachineInstrBuilder.h"
37#include "llvm/CodeGen/MachineMemOperand.h"
38#include "llvm/CodeGen/MachineOperand.h"
39#include "llvm/CodeGen/MachineRegisterInfo.h"
40#include "llvm/CodeGen/TargetOpcodes.h"
41#include "llvm/CodeGen/TargetRegisterInfo.h"
42#include "llvm/IR/Constants.h"
43#include "llvm/IR/DerivedTypes.h"
44#include "llvm/IR/Instructions.h"
45#include "llvm/IR/IntrinsicsAArch64.h"
46#include "llvm/IR/Type.h"
47#include "llvm/Pass.h"
48#include "llvm/Support/Debug.h"
49#include "llvm/Support/raw_ostream.h"
50#include <optional>
51
52#define DEBUG_TYPE "aarch64-isel"
53
54using namespace llvm;
55using namespace MIPatternMatch;
56using namespace AArch64GISelUtils;
57
58namespace llvm {
59class BlockFrequencyInfo;
60class ProfileSummaryInfo;
61}
62
63namespace {
64
65#define GET_GLOBALISEL_PREDICATE_BITSET
66#include "AArch64GenGlobalISel.inc"
67#undef GET_GLOBALISEL_PREDICATE_BITSET
68
69
70class AArch64InstructionSelector : public InstructionSelector {
71public:
72 AArch64InstructionSelector(const AArch64TargetMachine &TM,
73 const AArch64Subtarget &STI,
74 const AArch64RegisterBankInfo &RBI);
75
76 bool select(MachineInstr &I) override;
77 static const char *getName() { return DEBUG_TYPE; }
78
79 void setupMF(MachineFunction &MF, GISelValueTracking *VT,
80 CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI,
81 BlockFrequencyInfo *BFI) override {
82 InstructionSelector::setupMF(mf&: MF, vt: VT, covinfo: CoverageInfo, psi: PSI, bfi: BFI);
83 MIB.setMF(MF);
84
85 // hasFnAttribute() is expensive to call on every BRCOND selection, so
86 // cache it here for each run of the selector.
87 ProduceNonFlagSettingCondBr =
88 !MF.getFunction().hasFnAttribute(Kind: Attribute::SpeculativeLoadHardening);
89 MFReturnAddr = Register();
90
91 processPHIs(MF);
92 }
93
94private:
95 /// tblgen-erated 'select' implementation, used as the initial selector for
96 /// the patterns that don't require complex C++.
97 bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
98
99 // A lowering phase that runs before any selection attempts.
100 // Returns true if the instruction was modified.
101 bool preISelLower(MachineInstr &I);
102
103 // An early selection function that runs before the selectImpl() call.
104 bool earlySelect(MachineInstr &I);
105
106 /// Save state that is shared between select calls, call select on \p I and
107 /// then restore the saved state. This can be used to recursively call select
108 /// within a select call.
109 bool selectAndRestoreState(MachineInstr &I);
110
111 // Do some preprocessing of G_PHIs before we begin selection.
112 void processPHIs(MachineFunction &MF);
113
114 bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);
115
116 /// Eliminate same-sized cross-bank copies into stores before selectImpl().
117 bool contractCrossBankCopyIntoStore(MachineInstr &I,
118 MachineRegisterInfo &MRI);
119
120 bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);
121
122 bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
123 MachineRegisterInfo &MRI) const;
124 bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
125 MachineRegisterInfo &MRI) const;
126
127 ///@{
128 /// Helper functions for selectCompareBranch.
129 bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
130 MachineIRBuilder &MIB) const;
131 bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
132 MachineIRBuilder &MIB) const;
133 bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
134 MachineIRBuilder &MIB) const;
135 bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
136 MachineBasicBlock *DstMBB,
137 MachineIRBuilder &MIB) const;
138 ///@}
139
140 bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
141 MachineRegisterInfo &MRI);
142
143 bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
144 bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);
145
146 // Helper to generate an equivalent of scalar_to_vector into a new register,
147 // returned via 'Dst'.
148 MachineInstr *emitScalarToVector(unsigned EltSize,
149 const TargetRegisterClass *DstRC,
150 Register Scalar,
151 MachineIRBuilder &MIRBuilder) const;
152 /// Helper to narrow vector that was widened by emitScalarToVector.
153 /// Copy lowest part of 128-bit or 64-bit vector to 64-bit or 32-bit
154 /// vector, correspondingly.
155 MachineInstr *emitNarrowVector(Register DstReg, Register SrcReg,
156 MachineIRBuilder &MIRBuilder,
157 MachineRegisterInfo &MRI) const;
158
159 /// Emit a lane insert into \p DstReg, or a new vector register if
160 /// std::nullopt is provided.
161 ///
162 /// The lane inserted into is defined by \p LaneIdx. The vector source
163 /// register is given by \p SrcReg. The register containing the element is
164 /// given by \p EltReg.
165 MachineInstr *emitLaneInsert(std::optional<Register> DstReg, Register SrcReg,
166 Register EltReg, unsigned LaneIdx,
167 const RegisterBank &RB,
168 MachineIRBuilder &MIRBuilder) const;
169
170 /// Emit a sequence of instructions representing a constant \p CV for a
171 /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
172 ///
173 /// \returns the last instruction in the sequence on success, and nullptr
174 /// otherwise.
175 MachineInstr *emitConstantVector(Register Dst, Constant *CV,
176 MachineIRBuilder &MIRBuilder,
177 MachineRegisterInfo &MRI);
178
179 MachineInstr *tryAdvSIMDModImm8(Register Dst, unsigned DstSize, APInt Bits,
180 MachineIRBuilder &MIRBuilder);
181
182 MachineInstr *tryAdvSIMDModImm16(Register Dst, unsigned DstSize, APInt Bits,
183 MachineIRBuilder &MIRBuilder, bool Inv);
184
185 MachineInstr *tryAdvSIMDModImm32(Register Dst, unsigned DstSize, APInt Bits,
186 MachineIRBuilder &MIRBuilder, bool Inv);
187 MachineInstr *tryAdvSIMDModImm64(Register Dst, unsigned DstSize, APInt Bits,
188 MachineIRBuilder &MIRBuilder);
189 MachineInstr *tryAdvSIMDModImm321s(Register Dst, unsigned DstSize, APInt Bits,
190 MachineIRBuilder &MIRBuilder, bool Inv);
191 MachineInstr *tryAdvSIMDModImmFP(Register Dst, unsigned DstSize, APInt Bits,
192 MachineIRBuilder &MIRBuilder);
193
194 bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
195 MachineRegisterInfo &MRI);
196 /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a
197 /// SUBREG_TO_REG.
198 bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI);
199 bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
200 bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
201 bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
202
203 bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
204 bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
205 bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
206 bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);
207
208 /// Helper function to select vector load intrinsics like
209 /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
210 /// \p Opc is the opcode that the selected instruction should use.
211 /// \p NumVecs is the number of vector destinations for the instruction.
212 /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
213 bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
214 MachineInstr &I);
215 bool selectVectorLoadLaneIntrinsic(unsigned Opc, unsigned NumVecs,
216 MachineInstr &I);
217 void selectVectorStoreIntrinsic(MachineInstr &I, unsigned NumVecs,
218 unsigned Opc);
219 bool selectVectorStoreLaneIntrinsic(MachineInstr &I, unsigned NumVecs,
220 unsigned Opc);
221 bool selectIntrinsicWithSideEffects(MachineInstr &I,
222 MachineRegisterInfo &MRI);
223 bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
224 bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
225 bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
226 bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
227 bool selectPtrAuthGlobalValue(MachineInstr &I,
228 MachineRegisterInfo &MRI) const;
229 bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
230 bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
231 bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
232 void SelectTable(MachineInstr &I, MachineRegisterInfo &MRI, unsigned NumVecs,
233 unsigned Opc1, unsigned Opc2, bool isExt);
234
235 bool selectIndexedExtLoad(MachineInstr &I, MachineRegisterInfo &MRI);
236 bool selectIndexedLoad(MachineInstr &I, MachineRegisterInfo &MRI);
237 bool selectIndexedStore(GIndexedStore &I, MachineRegisterInfo &MRI);
238
239 unsigned emitConstantPoolEntry(const Constant *CPVal,
240 MachineFunction &MF) const;
241 MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
242 MachineIRBuilder &MIRBuilder) const;
243
244 // Emit a vector concat operation.
245 MachineInstr *emitVectorConcat(std::optional<Register> Dst, Register Op1,
246 Register Op2,
247 MachineIRBuilder &MIRBuilder) const;
248
249 // Emit an integer compare between LHS and RHS, which checks for Predicate.
250 MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
251 MachineOperand &Predicate,
252 MachineIRBuilder &MIRBuilder) const;
253
254 /// Emit a floating point comparison between \p LHS and \p RHS.
255 /// \p Pred if given is the intended predicate to use.
256 MachineInstr *
257 emitFPCompare(Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
258 std::optional<CmpInst::Predicate> = std::nullopt) const;
259
260 MachineInstr *
261 emitInstr(unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
262 std::initializer_list<llvm::SrcOp> SrcOps,
263 MachineIRBuilder &MIRBuilder,
264 const ComplexRendererFns &RenderFns = std::nullopt) const;
265 /// Helper function to emit an add or sub instruction.
266 ///
267 /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants above
268 /// in a specific order.
269 ///
270 /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
271 ///
272 /// \code
273 /// const std::array<std::array<unsigned, 2>, 4> Table {
274 /// {{AArch64::ADDXri, AArch64::ADDWri},
275 /// {AArch64::ADDXrs, AArch64::ADDWrs},
276 /// {AArch64::ADDXrr, AArch64::ADDWrr},
277 /// {AArch64::SUBXri, AArch64::SUBWri},
278 /// {AArch64::ADDXrx, AArch64::ADDWrx}}};
279 /// \endcode
280 ///
281 /// Each row in the table corresponds to a different addressing mode. Each
282 /// column corresponds to a different register size.
283 ///
284 /// \attention Rows must be structured as follows:
285 /// - Row 0: The ri opcode variants
286 /// - Row 1: The rs opcode variants
287 /// - Row 2: The rr opcode variants
288 /// - Row 3: The ri opcode variants for negative immediates
289 /// - Row 4: The rx opcode variants
290 ///
291 /// \attention Columns must be structured as follows:
292 /// - Column 0: The 64-bit opcode variants
293 /// - Column 1: The 32-bit opcode variants
294 ///
295 /// \p Dst is the destination register of the binop to emit.
296 /// \p LHS is the left-hand operand of the binop to emit.
297 /// \p RHS is the right-hand operand of the binop to emit.
298 MachineInstr *emitAddSub(
299 const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
300 Register Dst, MachineOperand &LHS, MachineOperand &RHS,
301 MachineIRBuilder &MIRBuilder) const;
302 MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
303 MachineOperand &RHS,
304 MachineIRBuilder &MIRBuilder) const;
305 MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
306 MachineIRBuilder &MIRBuilder) const;
307 MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
308 MachineIRBuilder &MIRBuilder) const;
309 MachineInstr *emitADCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
310 MachineIRBuilder &MIRBuilder) const;
311 MachineInstr *emitSBCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
312 MachineIRBuilder &MIRBuilder) const;
313 MachineInstr *emitCMP(MachineOperand &LHS, MachineOperand &RHS,
314 MachineIRBuilder &MIRBuilder) const;
315 MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
316 MachineIRBuilder &MIRBuilder) const;
317 MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
318 MachineIRBuilder &MIRBuilder) const;
319 MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
320 AArch64CC::CondCode CC,
321 MachineIRBuilder &MIRBuilder) const;
322 MachineInstr *emitExtractVectorElt(std::optional<Register> DstReg,
323 const RegisterBank &DstRB, LLT ScalarTy,
324 Register VecReg, unsigned LaneIdx,
325 MachineIRBuilder &MIRBuilder) const;
326 MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2,
327 AArch64CC::CondCode Pred,
328 MachineIRBuilder &MIRBuilder) const;
329 /// Emit a CSet for a FP compare.
330 ///
331 /// \p Dst is expected to be a 32-bit scalar register.
332 MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
333 MachineIRBuilder &MIRBuilder) const;
334
335 /// Emit an instruction that sets NZCV to the carry-in expected by \p I.
336 /// Might elide the instruction if the previous instruction already sets NZCV
337 /// correctly.
338 MachineInstr *emitCarryIn(MachineInstr &I, Register CarryReg);
339
340 /// Emit the overflow op for \p Opcode.
341 ///
342 /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
343 /// G_USUBO, etc.
344 std::pair<MachineInstr *, AArch64CC::CondCode>
345 emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
346 MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;
347
348 bool selectOverflowOp(MachineInstr &I, MachineRegisterInfo &MRI);
349
350 /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
351 /// In some cases this is even possible with OR operations in the expression.
352 MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC,
353 MachineIRBuilder &MIB) const;
354 MachineInstr *emitConditionalComparison(Register LHS, Register RHS,
355 CmpInst::Predicate CC,
356 AArch64CC::CondCode Predicate,
357 AArch64CC::CondCode OutCC,
358 MachineIRBuilder &MIB) const;
359 MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC,
360 bool Negate, Register CCOp,
361 AArch64CC::CondCode Predicate,
362 MachineIRBuilder &MIB) const;
363
364 /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
365 /// \p IsNegative is true if the test should be "not zero".
366 /// This will also optimize the test bit instruction when possible.
367 MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
368 MachineBasicBlock *DstMBB,
369 MachineIRBuilder &MIB) const;
370
371 /// Emit a CB(N)Z instruction which branches to \p DestMBB.
372 MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
373 MachineBasicBlock *DestMBB,
374 MachineIRBuilder &MIB) const;
375
376 // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
377 // We use these manually instead of using the importer since it doesn't
378 // support SDNodeXForm.
379 ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
380 ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
381 ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
382 ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;
383
384 ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
385 ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
386 ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;
387
388 ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
389 unsigned Size) const;
390
391 ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
392 return selectAddrModeUnscaled(Root, Size: 1);
393 }
394 ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
395 return selectAddrModeUnscaled(Root, Size: 2);
396 }
397 ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
398 return selectAddrModeUnscaled(Root, Size: 4);
399 }
400 ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
401 return selectAddrModeUnscaled(Root, Size: 8);
402 }
403 ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
404 return selectAddrModeUnscaled(Root, Size: 16);
405 }
406
407 /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used
408 /// from complex pattern matchers like selectAddrModeIndexed().
409 ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
410 MachineRegisterInfo &MRI) const;
411
412 ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
413 unsigned Size) const;
414 template <int Width>
415 ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
416 return selectAddrModeIndexed(Root, Size: Width / 8);
417 }
418
419 std::optional<bool>
420 isWorthFoldingIntoAddrMode(const MachineInstr &MI,
421 const MachineRegisterInfo &MRI) const;
422
423 bool isWorthFoldingIntoExtendedReg(const MachineInstr &MI,
424 const MachineRegisterInfo &MRI,
425 bool IsAddrOperand) const;
426 ComplexRendererFns
427 selectAddrModeShiftedExtendXReg(MachineOperand &Root,
428 unsigned SizeInBytes) const;
429
430 /// Returns a \p ComplexRendererFns which contains a base, offset, and whether
431 /// or not a shift + extend should be folded into an addressing mode. Returns
432 /// None when this is not profitable or possible.
433 ComplexRendererFns
434 selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
435 MachineOperand &Offset, unsigned SizeInBytes,
436 bool WantsExt) const;
437 ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
438 ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
439 unsigned SizeInBytes) const;
440 template <int Width>
441 ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
442 return selectAddrModeXRO(Root, SizeInBytes: Width / 8);
443 }
444
445 ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
446 unsigned SizeInBytes) const;
447 template <int Width>
448 ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
449 return selectAddrModeWRO(Root, SizeInBytes: Width / 8);
450 }
451
452 ComplexRendererFns selectShiftedRegister(MachineOperand &Root,
453 bool AllowROR = false) const;
454
455 ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
456 return selectShiftedRegister(Root);
457 }
458
459 ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
460 return selectShiftedRegister(Root, AllowROR: true);
461 }
462
463 /// Given an extend instruction, determine the correct shift-extend type for
464 /// that instruction.
465 ///
466 /// If the instruction is going to be used in a load or store, pass
467 /// \p IsLoadStore = true.
468 AArch64_AM::ShiftExtendType
469 getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
470 bool IsLoadStore = false) const;
471
472 /// Move \p Reg to \p RC if \p Reg is not already on \p RC.
473 ///
474 /// \returns Either \p Reg if no change was necessary, or the new register
475 /// created by moving \p Reg.
476 ///
477 /// Note: This uses emitCopy right now.
478 Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
479 MachineIRBuilder &MIB) const;
480
481 ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
482
483 ComplexRendererFns selectExtractHigh(MachineOperand &Root) const;
484
485 void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
486 int OpIdx = -1) const;
487 void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
488 int OpIdx = -1) const;
489 void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
490 int OpIdx = -1) const;
491 void renderUbsanTrap(MachineInstrBuilder &MIB, const MachineInstr &MI,
492 int OpIdx) const;
493 void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
494 int OpIdx = -1) const;
495 void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
496 int OpIdx = -1) const;
497 void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
498 int OpIdx = -1) const;
499 void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB,
500 const MachineInstr &MI,
501 int OpIdx = -1) const;
502
503 // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
504 void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);
505
506 // Optimization methods.
507 bool tryOptSelect(GSelect &Sel);
508 bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI);
509 MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
510 MachineOperand &Predicate,
511 MachineIRBuilder &MIRBuilder) const;
512
513 /// Return true if \p MI is a load or store of \p NumBytes bytes.
514 bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;
515
516 /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
517 /// register zeroed out. In other words, the result of MI has been explicitly
518 /// zero extended.
519 bool isDef32(const MachineInstr &MI) const;
520
521 const AArch64TargetMachine &TM;
522 const AArch64Subtarget &STI;
523 const AArch64InstrInfo &TII;
524 const AArch64RegisterInfo &TRI;
525 const AArch64RegisterBankInfo &RBI;
526
527 bool ProduceNonFlagSettingCondBr = false;
528
529 // Some cached values used during selection.
530 // We use LR as a live-in register, and we keep track of it here as it can be
531 // clobbered by calls.
532 Register MFReturnAddr;
533
534 MachineIRBuilder MIB;
535
536#define GET_GLOBALISEL_PREDICATES_DECL
537#include "AArch64GenGlobalISel.inc"
538#undef GET_GLOBALISEL_PREDICATES_DECL
539
540// We declare the temporaries used by selectImpl() in the class to minimize the
541// cost of constructing placeholder values.
542#define GET_GLOBALISEL_TEMPORARIES_DECL
543#include "AArch64GenGlobalISel.inc"
544#undef GET_GLOBALISEL_TEMPORARIES_DECL
545};
546
547} // end anonymous namespace
548
549#define GET_GLOBALISEL_IMPL
550#include "AArch64GenGlobalISel.inc"
551#undef GET_GLOBALISEL_IMPL
552
553AArch64InstructionSelector::AArch64InstructionSelector(
554 const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
555 const AArch64RegisterBankInfo &RBI)
556 : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()),
557 RBI(RBI),
558#define GET_GLOBALISEL_PREDICATES_INIT
559#include "AArch64GenGlobalISel.inc"
560#undef GET_GLOBALISEL_PREDICATES_INIT
561#define GET_GLOBALISEL_TEMPORARIES_INIT
562#include "AArch64GenGlobalISel.inc"
563#undef GET_GLOBALISEL_TEMPORARIES_INIT
564{
565}
566
567// FIXME: This should be target-independent, inferred from the types declared
568// for each class in the bank.
569//
570/// Given a register bank, and a type, return the smallest register class that
571/// can represent that combination.
572static const TargetRegisterClass *
573getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
574 bool GetAllRegSet = false) {
575 if (RB.getID() == AArch64::GPRRegBankID) {
576 if (Ty.getSizeInBits() <= 32)
577 return GetAllRegSet ? &AArch64::GPR32allRegClass
578 : &AArch64::GPR32RegClass;
579 if (Ty.getSizeInBits() == 64)
580 return GetAllRegSet ? &AArch64::GPR64allRegClass
581 : &AArch64::GPR64RegClass;
582 if (Ty.getSizeInBits() == 128)
583 return &AArch64::XSeqPairsClassRegClass;
584 return nullptr;
585 }
586
587 if (RB.getID() == AArch64::FPRRegBankID) {
588 switch (Ty.getSizeInBits()) {
589 case 8:
590 return &AArch64::FPR8RegClass;
591 case 16:
592 return &AArch64::FPR16RegClass;
593 case 32:
594 return &AArch64::FPR32RegClass;
595 case 64:
596 return &AArch64::FPR64RegClass;
597 case 128:
598 return &AArch64::FPR128RegClass;
599 }
600 return nullptr;
601 }
602
603 return nullptr;
604}
605
606/// Given a register bank, and size in bits, return the smallest register class
607/// that can represent that combination.
608static const TargetRegisterClass *
609getMinClassForRegBank(const RegisterBank &RB, TypeSize SizeInBits,
610 bool GetAllRegSet = false) {
611 if (SizeInBits.isScalable()) {
612 assert(RB.getID() == AArch64::FPRRegBankID &&
613 "Expected FPR regbank for scalable type size");
614 return &AArch64::ZPRRegClass;
615 }
616
617 unsigned RegBankID = RB.getID();
618
619 if (RegBankID == AArch64::GPRRegBankID) {
620 assert(!SizeInBits.isScalable() && "Unexpected scalable register size");
621 if (SizeInBits <= 32)
622 return GetAllRegSet ? &AArch64::GPR32allRegClass
623 : &AArch64::GPR32RegClass;
624 if (SizeInBits == 64)
625 return GetAllRegSet ? &AArch64::GPR64allRegClass
626 : &AArch64::GPR64RegClass;
627 if (SizeInBits == 128)
628 return &AArch64::XSeqPairsClassRegClass;
629 }
630
631 if (RegBankID == AArch64::FPRRegBankID) {
632 if (SizeInBits.isScalable()) {
633 assert(SizeInBits == TypeSize::getScalable(128) &&
634 "Unexpected scalable register size");
635 return &AArch64::ZPRRegClass;
636 }
637
638 switch (SizeInBits) {
639 default:
640 return nullptr;
641 case 8:
642 return &AArch64::FPR8RegClass;
643 case 16:
644 return &AArch64::FPR16RegClass;
645 case 32:
646 return &AArch64::FPR32RegClass;
647 case 64:
648 return &AArch64::FPR64RegClass;
649 case 128:
650 return &AArch64::FPR128RegClass;
651 }
652 }
653
654 return nullptr;
655}
656
657/// Returns the correct subregister to use for a given register class.
658static bool getSubRegForClass(const TargetRegisterClass *RC,
659 const TargetRegisterInfo &TRI, unsigned &SubReg) {
660 switch (TRI.getRegSizeInBits(RC: *RC)) {
661 case 8:
662 SubReg = AArch64::bsub;
663 break;
664 case 16:
665 SubReg = AArch64::hsub;
666 break;
667 case 32:
668 if (RC != &AArch64::FPR32RegClass)
669 SubReg = AArch64::sub_32;
670 else
671 SubReg = AArch64::ssub;
672 break;
673 case 64:
674 SubReg = AArch64::dsub;
675 break;
676 default:
677 LLVM_DEBUG(
678 dbgs() << "Couldn't find appropriate subregister for register class.");
679 return false;
680 }
681
682 return true;
683}
684
685/// Returns the minimum size the given register bank can hold.
686static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
687 switch (RB.getID()) {
688 case AArch64::GPRRegBankID:
689 return 32;
690 case AArch64::FPRRegBankID:
691 return 8;
692 default:
693 llvm_unreachable("Tried to get minimum size for unknown register bank.");
694 }
695}
696
697/// Create a REG_SEQUENCE instruction using the registers in \p Regs.
698/// Helper function for functions like createDTuple and createQTuple.
699///
700/// \p RegClassIDs - The list of register class IDs available for some tuple of
701/// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
702/// expected to contain between 2 and 4 tuple classes.
703///
704/// \p SubRegs - The list of subregister classes associated with each register
705/// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
706/// subregister class. The index of each subregister class is expected to
707/// correspond with the index of each register class.
708///
709/// \returns Either the destination register of REG_SEQUENCE instruction that
710/// was created, or the 0th element of \p Regs if \p Regs contains a single
711/// element.
712static Register createTuple(ArrayRef<Register> Regs,
713 const unsigned RegClassIDs[],
714 const unsigned SubRegs[], MachineIRBuilder &MIB) {
715 unsigned NumRegs = Regs.size();
716 if (NumRegs == 1)
717 return Regs[0];
718 assert(NumRegs >= 2 && NumRegs <= 4 &&
719 "Only support between two and 4 registers in a tuple!");
720 const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
721 auto *DesiredClass = TRI->getRegClass(i: RegClassIDs[NumRegs - 2]);
722 auto RegSequence =
723 MIB.buildInstr(Opc: TargetOpcode::REG_SEQUENCE, DstOps: {DesiredClass}, SrcOps: {});
724 for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
725 RegSequence.addUse(RegNo: Regs[I]);
726 RegSequence.addImm(Val: SubRegs[I]);
727 }
728 return RegSequence.getReg(Idx: 0);
729}
730
731/// Create a tuple of D-registers using the registers in \p Regs.
732static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
733 static const unsigned RegClassIDs[] = {
734 AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
735 static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
736 AArch64::dsub2, AArch64::dsub3};
737 return createTuple(Regs, RegClassIDs, SubRegs, MIB);
738}
739
740/// Create a tuple of Q-registers using the registers in \p Regs.
741static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
742 static const unsigned RegClassIDs[] = {
743 AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
744 static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
745 AArch64::qsub2, AArch64::qsub3};
746 return createTuple(Regs, RegClassIDs, SubRegs, MIB);
747}
748
749static std::optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
750 auto &MI = *Root.getParent();
751 auto &MBB = *MI.getParent();
752 auto &MF = *MBB.getParent();
753 auto &MRI = MF.getRegInfo();
754 uint64_t Immed;
755 if (Root.isImm())
756 Immed = Root.getImm();
757 else if (Root.isCImm())
758 Immed = Root.getCImm()->getZExtValue();
759 else if (Root.isReg()) {
760 auto ValAndVReg =
761 getIConstantVRegValWithLookThrough(VReg: Root.getReg(), MRI, LookThroughInstrs: true);
762 if (!ValAndVReg)
763 return std::nullopt;
764 Immed = ValAndVReg->Value.getSExtValue();
765 } else
766 return std::nullopt;
767 return Immed;
768}
769
770/// Check whether \p I is a currently unsupported binary operation:
771/// - it has an unsized type
772/// - an operand is not a vreg
773/// - all operands are not in the same bank
774/// These are checks that should someday live in the verifier, but right now,
775/// these are mostly limitations of the aarch64 selector.
776static bool unsupportedBinOp(const MachineInstr &I,
777 const AArch64RegisterBankInfo &RBI,
778 const MachineRegisterInfo &MRI,
779 const AArch64RegisterInfo &TRI) {
780 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
781 if (!Ty.isValid()) {
782 LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
783 return true;
784 }
785
786 const RegisterBank *PrevOpBank = nullptr;
787 for (auto &MO : I.operands()) {
788 // FIXME: Support non-register operands.
789 if (!MO.isReg()) {
790 LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
791 return true;
792 }
793
794 // FIXME: Can generic operations have physical registers operands? If
795 // so, this will need to be taught about that, and we'll need to get the
796 // bank out of the minimal class for the register.
797 // Either way, this needs to be documented (and possibly verified).
798 if (!MO.getReg().isVirtual()) {
799 LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
800 return true;
801 }
802
803 const RegisterBank *OpBank = RBI.getRegBank(Reg: MO.getReg(), MRI, TRI);
804 if (!OpBank) {
805 LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
806 return true;
807 }
808
809 if (PrevOpBank && OpBank != PrevOpBank) {
810 LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
811 return true;
812 }
813 PrevOpBank = OpBank;
814 }
815 return false;
816}
817
818/// Select the AArch64 opcode for the basic binary operation \p GenericOpc
819/// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
820/// and of size \p OpSize.
821/// \returns \p GenericOpc if the combination is unsupported.
822static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
823 unsigned OpSize) {
824 switch (RegBankID) {
825 case AArch64::GPRRegBankID:
826 if (OpSize == 32) {
827 switch (GenericOpc) {
828 case TargetOpcode::G_SHL:
829 return AArch64::LSLVWr;
830 case TargetOpcode::G_LSHR:
831 return AArch64::LSRVWr;
832 case TargetOpcode::G_ASHR:
833 return AArch64::ASRVWr;
834 default:
835 return GenericOpc;
836 }
837 } else if (OpSize == 64) {
838 switch (GenericOpc) {
839 case TargetOpcode::G_PTR_ADD:
840 return AArch64::ADDXrr;
841 case TargetOpcode::G_SHL:
842 return AArch64::LSLVXr;
843 case TargetOpcode::G_LSHR:
844 return AArch64::LSRVXr;
845 case TargetOpcode::G_ASHR:
846 return AArch64::ASRVXr;
847 default:
848 return GenericOpc;
849 }
850 }
851 break;
852 case AArch64::FPRRegBankID:
853 switch (OpSize) {
854 case 32:
855 switch (GenericOpc) {
856 case TargetOpcode::G_FADD:
857 return AArch64::FADDSrr;
858 case TargetOpcode::G_FSUB:
859 return AArch64::FSUBSrr;
860 case TargetOpcode::G_FMUL:
861 return AArch64::FMULSrr;
862 case TargetOpcode::G_FDIV:
863 return AArch64::FDIVSrr;
864 default:
865 return GenericOpc;
866 }
867 case 64:
868 switch (GenericOpc) {
869 case TargetOpcode::G_FADD:
870 return AArch64::FADDDrr;
871 case TargetOpcode::G_FSUB:
872 return AArch64::FSUBDrr;
873 case TargetOpcode::G_FMUL:
874 return AArch64::FMULDrr;
875 case TargetOpcode::G_FDIV:
876 return AArch64::FDIVDrr;
877 case TargetOpcode::G_OR:
878 return AArch64::ORRv8i8;
879 default:
880 return GenericOpc;
881 }
882 }
883 break;
884 }
885 return GenericOpc;
886}
887
888/// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
889/// appropriate for the (value) register bank \p RegBankID and of memory access
890/// size \p OpSize. This returns the variant with the base+unsigned-immediate
891/// addressing mode (e.g., LDRXui).
892/// \returns \p GenericOpc if the combination is unsupported.
893static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
894 unsigned OpSize) {
895 const bool isStore = GenericOpc == TargetOpcode::G_STORE;
896 switch (RegBankID) {
897 case AArch64::GPRRegBankID:
898 switch (OpSize) {
899 case 8:
900 return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
901 case 16:
902 return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
903 case 32:
904 return isStore ? AArch64::STRWui : AArch64::LDRWui;
905 case 64:
906 return isStore ? AArch64::STRXui : AArch64::LDRXui;
907 }
908 break;
909 case AArch64::FPRRegBankID:
910 switch (OpSize) {
911 case 8:
912 return isStore ? AArch64::STRBui : AArch64::LDRBui;
913 case 16:
914 return isStore ? AArch64::STRHui : AArch64::LDRHui;
915 case 32:
916 return isStore ? AArch64::STRSui : AArch64::LDRSui;
917 case 64:
918 return isStore ? AArch64::STRDui : AArch64::LDRDui;
919 case 128:
920 return isStore ? AArch64::STRQui : AArch64::LDRQui;
921 }
922 break;
923 }
924 return GenericOpc;
925}
926
927/// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
928/// to \p *To.
929///
930/// E.g "To = COPY SrcReg:SubReg"
931static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
932 const RegisterBankInfo &RBI, Register SrcReg,
933 const TargetRegisterClass *To, unsigned SubReg) {
934 assert(SrcReg.isValid() && "Expected a valid source register?");
935 assert(To && "Destination register class cannot be null");
936 assert(SubReg && "Expected a valid subregister");
937
938 MachineIRBuilder MIB(I);
939 auto SubRegCopy =
940 MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {To}, SrcOps: {}).addReg(RegNo: SrcReg, Flags: {}, SubReg);
941 MachineOperand &RegOp = I.getOperand(i: 1);
942 RegOp.setReg(SubRegCopy.getReg(Idx: 0));
943
944 // It's possible that the destination register won't be constrained. Make
945 // sure that happens.
946 if (!I.getOperand(i: 0).getReg().isPhysical())
947 RBI.constrainGenericRegister(Reg: I.getOperand(i: 0).getReg(), RC: *To, MRI);
948
949 return true;
950}
951
952/// Helper function to get the source and destination register classes for a
953/// copy. Returns a std::pair containing the source register class for the
954/// copy, and the destination register class for the copy. If a register class
955/// cannot be determined, then it will be nullptr.
956static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
957getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
958 MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
959 const RegisterBankInfo &RBI) {
960 Register DstReg = I.getOperand(i: 0).getReg();
961 Register SrcReg = I.getOperand(i: 1).getReg();
962 const RegisterBank &DstRegBank = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
963 const RegisterBank &SrcRegBank = *RBI.getRegBank(Reg: SrcReg, MRI, TRI);
964
965 TypeSize DstSize = RBI.getSizeInBits(Reg: DstReg, MRI, TRI);
966 TypeSize SrcSize = RBI.getSizeInBits(Reg: SrcReg, MRI, TRI);
967
968 // Special casing for cross-bank copies of s1s. We can technically represent
969 // a 1-bit value with any size of register. The minimum size for a GPR is 32
970 // bits. So, we need to put the FPR on 32 bits as well.
971 //
972 // FIXME: I'm not sure if this case holds true outside of copies. If it does,
973 // then we can pull it into the helpers that get the appropriate class for a
974 // register bank. Or make a new helper that carries along some constraint
975 // information.
976 if (SrcRegBank != DstRegBank &&
977 (DstSize == TypeSize::getFixed(ExactSize: 1) && SrcSize == TypeSize::getFixed(ExactSize: 1)))
978 SrcSize = DstSize = TypeSize::getFixed(ExactSize: 32);
979
980 return {getMinClassForRegBank(RB: SrcRegBank, SizeInBits: SrcSize, GetAllRegSet: true),
981 getMinClassForRegBank(RB: DstRegBank, SizeInBits: DstSize, GetAllRegSet: true)};
982}
983
984// FIXME: We need some sort of API in RBI/TRI to allow generic code to
985// constrain operands of simple instructions given a TargetRegisterClass
986// and LLT
987static bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI,
988 const RegisterBankInfo &RBI) {
989 for (MachineOperand &MO : I.operands()) {
990 if (!MO.isReg())
991 continue;
992 Register Reg = MO.getReg();
993 if (!Reg)
994 continue;
995 if (Reg.isPhysical())
996 continue;
997 LLT Ty = MRI.getType(Reg);
998 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
999 const TargetRegisterClass *RC =
1000 dyn_cast<const TargetRegisterClass *>(Val: RegClassOrBank);
1001 if (!RC) {
1002 const RegisterBank &RB = *cast<const RegisterBank *>(Val: RegClassOrBank);
1003 RC = getRegClassForTypeOnBank(Ty, RB);
1004 if (!RC) {
1005 LLVM_DEBUG(
1006 dbgs() << "Warning: DBG_VALUE operand has unexpected size/bank\n");
1007 break;
1008 }
1009 }
1010 RBI.constrainGenericRegister(Reg, RC: *RC, MRI);
1011 }
1012
1013 return true;
1014}
1015
1016static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
1017 MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
1018 const RegisterBankInfo &RBI) {
1019 Register DstReg = I.getOperand(i: 0).getReg();
1020 Register SrcReg = I.getOperand(i: 1).getReg();
1021 const RegisterBank &DstRegBank = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
1022 const RegisterBank &SrcRegBank = *RBI.getRegBank(Reg: SrcReg, MRI, TRI);
1023
1024 // Find the correct register classes for the source and destination registers.
1025 const TargetRegisterClass *SrcRC;
1026 const TargetRegisterClass *DstRC;
1027 std::tie(args&: SrcRC, args&: DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);
1028
1029 if (!DstRC) {
1030 LLVM_DEBUG(dbgs() << "Unexpected dest size "
1031 << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
1032 return false;
1033 }
1034
1035 // Is this a copy? If so, then we may need to insert a subregister copy.
1036 if (I.isCopy()) {
1037 // Yes. Check if there's anything to fix up.
1038 if (!SrcRC) {
1039 LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
1040 return false;
1041 }
1042
1043 const TypeSize SrcSize = TRI.getRegSizeInBits(RC: *SrcRC);
1044 const TypeSize DstSize = TRI.getRegSizeInBits(RC: *DstRC);
1045 unsigned SubReg;
1046
1047 // If the source bank doesn't support a subregister copy small enough,
1048 // then we first need to copy to the destination bank.
1049 if (getMinSizeForRegBank(RB: SrcRegBank) > DstSize) {
1050 const TargetRegisterClass *DstTempRC =
1051 getMinClassForRegBank(RB: DstRegBank, SizeInBits: SrcSize, /* GetAllRegSet */ true);
1052 getSubRegForClass(RC: DstRC, TRI, SubReg);
1053
1054 MachineIRBuilder MIB(I);
1055 auto Copy = MIB.buildCopy(Res: {DstTempRC}, Op: {SrcReg});
1056 copySubReg(I, MRI, RBI, SrcReg: Copy.getReg(Idx: 0), To: DstRC, SubReg);
1057 } else if (SrcSize > DstSize) {
1058 // If the source register is bigger than the destination we need to
1059 // perform a subregister copy.
1060 const TargetRegisterClass *SubRegRC =
1061 getMinClassForRegBank(RB: SrcRegBank, SizeInBits: DstSize, /* GetAllRegSet */ true);
1062 getSubRegForClass(RC: SubRegRC, TRI, SubReg);
1063 copySubReg(I, MRI, RBI, SrcReg, To: DstRC, SubReg);
1064 } else if (DstSize > SrcSize) {
1065 // If the destination register is bigger than the source we need to do
1066 // a promotion using SUBREG_TO_REG.
1067 const TargetRegisterClass *PromotionRC =
1068 getMinClassForRegBank(RB: SrcRegBank, SizeInBits: DstSize, /* GetAllRegSet */ true);
1069 getSubRegForClass(RC: SrcRC, TRI, SubReg);
1070
1071 Register PromoteReg = MRI.createVirtualRegister(RegClass: PromotionRC);
1072 BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(),
1073 MCID: TII.get(Opcode: AArch64::SUBREG_TO_REG), DestReg: PromoteReg)
1074 .addImm(Val: 0)
1075 .addUse(RegNo: SrcReg)
1076 .addImm(Val: SubReg);
1077 MachineOperand &RegOp = I.getOperand(i: 1);
1078 RegOp.setReg(PromoteReg);
1079 }
1080
1081 // If the destination is a physical register, then there's nothing to
1082 // change, so we're done.
1083 if (DstReg.isPhysical())
1084 return true;
1085 }
1086
1087 // No need to constrain SrcReg. It will get constrained when we hit another
1088 // of its use or its defs. Copies do not have constraints.
1089 if (!RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI)) {
1090 LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
1091 << " operand\n");
1092 return false;
1093 }
1094
1095 // If this a GPR ZEXT that we want to just reduce down into a copy.
1096 // The sizes will be mismatched with the source < 32b but that's ok.
1097 if (I.getOpcode() == TargetOpcode::G_ZEXT) {
1098 I.setDesc(TII.get(Opcode: AArch64::COPY));
1099 assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
1100 return selectCopy(I, TII, MRI, TRI, RBI);
1101 }
1102
1103 I.setDesc(TII.get(Opcode: AArch64::COPY));
1104 return true;
1105}
1106
1107MachineInstr *
1108AArch64InstructionSelector::emitSelect(Register Dst, Register True,
1109 Register False, AArch64CC::CondCode CC,
1110 MachineIRBuilder &MIB) const {
1111 MachineRegisterInfo &MRI = *MIB.getMRI();
1112 assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
1113 RBI.getRegBank(True, MRI, TRI)->getID() &&
1114 "Expected both select operands to have the same regbank?");
1115 LLT Ty = MRI.getType(Reg: True);
1116 if (Ty.isVector())
1117 return nullptr;
1118 const unsigned Size = Ty.getSizeInBits();
1119 assert((Size == 32 || Size == 64) &&
1120 "Expected 32 bit or 64 bit select only?");
1121 const bool Is32Bit = Size == 32;
1122 if (RBI.getRegBank(Reg: True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
1123 unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
1124 auto FCSel = MIB.buildInstr(Opc, DstOps: {Dst}, SrcOps: {True, False}).addImm(Val: CC);
1125 constrainSelectedInstRegOperands(I&: *FCSel, TII, TRI, RBI);
1126 return &*FCSel;
1127 }
1128
1129 // By default, we'll try and emit a CSEL.
1130 unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
1131 bool Optimized = false;
1132 auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
1133 &Optimized](Register &Reg, Register &OtherReg,
1134 bool Invert) {
1135 if (Optimized)
1136 return false;
1137
1138 // Attempt to fold:
1139 //
1140 // %sub = G_SUB 0, %x
1141 // %select = G_SELECT cc, %reg, %sub
1142 //
1143 // Into:
1144 // %select = CSNEG %reg, %x, cc
1145 Register MatchReg;
1146 if (mi_match(R: Reg, MRI, P: m_Neg(Src: m_Reg(R&: MatchReg)))) {
1147 Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
1148 Reg = MatchReg;
1149 if (Invert) {
1150 CC = AArch64CC::getInvertedCondCode(Code: CC);
1151 std::swap(a&: Reg, b&: OtherReg);
1152 }
1153 return true;
1154 }
1155
1156 // Attempt to fold:
1157 //
1158 // %xor = G_XOR %x, -1
1159 // %select = G_SELECT cc, %reg, %xor
1160 //
1161 // Into:
1162 // %select = CSINV %reg, %x, cc
1163 if (mi_match(R: Reg, MRI, P: m_Not(Src: m_Reg(R&: MatchReg)))) {
1164 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1165 Reg = MatchReg;
1166 if (Invert) {
1167 CC = AArch64CC::getInvertedCondCode(Code: CC);
1168 std::swap(a&: Reg, b&: OtherReg);
1169 }
1170 return true;
1171 }
1172
1173 // Attempt to fold:
1174 //
1175 // %add = G_ADD %x, 1
1176 // %select = G_SELECT cc, %reg, %add
1177 //
1178 // Into:
1179 // %select = CSINC %reg, %x, cc
1180 if (mi_match(R: Reg, MRI,
1181 P: m_any_of(preds: m_GAdd(L: m_Reg(R&: MatchReg), R: m_SpecificICst(RequestedValue: 1)),
1182 preds: m_GPtrAdd(L: m_Reg(R&: MatchReg), R: m_SpecificICst(RequestedValue: 1))))) {
1183 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1184 Reg = MatchReg;
1185 if (Invert) {
1186 CC = AArch64CC::getInvertedCondCode(Code: CC);
1187 std::swap(a&: Reg, b&: OtherReg);
1188 }
1189 return true;
1190 }
1191
1192 return false;
1193 };
1194
1195 // Helper lambda which tries to use CSINC/CSINV for the instruction when its
1196 // true/false values are constants.
1197 // FIXME: All of these patterns already exist in tablegen. We should be
1198 // able to import these.
1199 auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
1200 &Optimized]() {
1201 if (Optimized)
1202 return false;
1203 auto TrueCst = getIConstantVRegValWithLookThrough(VReg: True, MRI);
1204 auto FalseCst = getIConstantVRegValWithLookThrough(VReg: False, MRI);
1205 if (!TrueCst && !FalseCst)
1206 return false;
1207
1208 Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
1209 if (TrueCst && FalseCst) {
1210 int64_t T = TrueCst->Value.getSExtValue();
1211 int64_t F = FalseCst->Value.getSExtValue();
1212
1213 if (T == 0 && F == 1) {
1214 // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
1215 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1216 True = ZReg;
1217 False = ZReg;
1218 return true;
1219 }
1220
1221 if (T == 0 && F == -1) {
1222 // G_SELECT cc 0, -1 -> CSINV zreg, zreg cc
1223 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1224 True = ZReg;
1225 False = ZReg;
1226 return true;
1227 }
1228 }
1229
1230 if (TrueCst) {
1231 int64_t T = TrueCst->Value.getSExtValue();
1232 if (T == 1) {
1233 // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
1234 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1235 True = False;
1236 False = ZReg;
1237 CC = AArch64CC::getInvertedCondCode(Code: CC);
1238 return true;
1239 }
1240
1241 if (T == -1) {
1242 // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
1243 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1244 True = False;
1245 False = ZReg;
1246 CC = AArch64CC::getInvertedCondCode(Code: CC);
1247 return true;
1248 }
1249 }
1250
1251 if (FalseCst) {
1252 int64_t F = FalseCst->Value.getSExtValue();
1253 if (F == 1) {
1254 // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
1255 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1256 False = ZReg;
1257 return true;
1258 }
1259
1260 if (F == -1) {
1261 // G_SELECT cc, t, -1 -> CSINC t, zreg, cc
1262 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1263 False = ZReg;
1264 return true;
1265 }
1266 }
1267 return false;
1268 };
1269
1270 Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
1271 Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
1272 Optimized |= TryOptSelectCst();
1273 auto SelectInst = MIB.buildInstr(Opc, DstOps: {Dst}, SrcOps: {True, False}).addImm(Val: CC);
1274 constrainSelectedInstRegOperands(I&: *SelectInst, TII, TRI, RBI);
1275 return &*SelectInst;
1276}
1277
1278static AArch64CC::CondCode
1279changeICMPPredToAArch64CC(CmpInst::Predicate P, Register RHS = {},
1280 MachineRegisterInfo *MRI = nullptr) {
1281 switch (P) {
1282 default:
1283 llvm_unreachable("Unknown condition code!");
1284 case CmpInst::ICMP_NE:
1285 return AArch64CC::NE;
1286 case CmpInst::ICMP_EQ:
1287 return AArch64CC::EQ;
1288 case CmpInst::ICMP_SGT:
1289 return AArch64CC::GT;
1290 case CmpInst::ICMP_SGE:
1291 if (RHS && MRI) {
1292 auto ValAndVReg = getIConstantVRegValWithLookThrough(VReg: RHS, MRI: *MRI);
1293 if (ValAndVReg && ValAndVReg->Value == 0)
1294 return AArch64CC::PL;
1295 }
1296 return AArch64CC::GE;
1297 case CmpInst::ICMP_SLT:
1298 if (RHS && MRI) {
1299 auto ValAndVReg = getIConstantVRegValWithLookThrough(VReg: RHS, MRI: *MRI);
1300 if (ValAndVReg && ValAndVReg->Value == 0)
1301 return AArch64CC::MI;
1302 }
1303 return AArch64CC::LT;
1304 case CmpInst::ICMP_SLE:
1305 return AArch64CC::LE;
1306 case CmpInst::ICMP_UGT:
1307 return AArch64CC::HI;
1308 case CmpInst::ICMP_UGE:
1309 return AArch64CC::HS;
1310 case CmpInst::ICMP_ULT:
1311 return AArch64CC::LO;
1312 case CmpInst::ICMP_ULE:
1313 return AArch64CC::LS;
1314 }
1315}
1316
1317/// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC.
1318static void changeFPCCToORAArch64CC(CmpInst::Predicate CC,
1319 AArch64CC::CondCode &CondCode,
1320 AArch64CC::CondCode &CondCode2) {
1321 CondCode2 = AArch64CC::AL;
1322 switch (CC) {
1323 default:
1324 llvm_unreachable("Unknown FP condition!");
1325 case CmpInst::FCMP_OEQ:
1326 CondCode = AArch64CC::EQ;
1327 break;
1328 case CmpInst::FCMP_OGT:
1329 CondCode = AArch64CC::GT;
1330 break;
1331 case CmpInst::FCMP_OGE:
1332 CondCode = AArch64CC::GE;
1333 break;
1334 case CmpInst::FCMP_OLT:
1335 CondCode = AArch64CC::MI;
1336 break;
1337 case CmpInst::FCMP_OLE:
1338 CondCode = AArch64CC::LS;
1339 break;
1340 case CmpInst::FCMP_ONE:
1341 CondCode = AArch64CC::MI;
1342 CondCode2 = AArch64CC::GT;
1343 break;
1344 case CmpInst::FCMP_ORD:
1345 CondCode = AArch64CC::VC;
1346 break;
1347 case CmpInst::FCMP_UNO:
1348 CondCode = AArch64CC::VS;
1349 break;
1350 case CmpInst::FCMP_UEQ:
1351 CondCode = AArch64CC::EQ;
1352 CondCode2 = AArch64CC::VS;
1353 break;
1354 case CmpInst::FCMP_UGT:
1355 CondCode = AArch64CC::HI;
1356 break;
1357 case CmpInst::FCMP_UGE:
1358 CondCode = AArch64CC::PL;
1359 break;
1360 case CmpInst::FCMP_ULT:
1361 CondCode = AArch64CC::LT;
1362 break;
1363 case CmpInst::FCMP_ULE:
1364 CondCode = AArch64CC::LE;
1365 break;
1366 case CmpInst::FCMP_UNE:
1367 CondCode = AArch64CC::NE;
1368 break;
1369 }
1370}
1371
1372/// Convert an IR fp condition code to an AArch64 CC.
1373/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1374/// should be AND'ed instead of OR'ed.
1375static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC,
1376 AArch64CC::CondCode &CondCode,
1377 AArch64CC::CondCode &CondCode2) {
1378 CondCode2 = AArch64CC::AL;
1379 switch (CC) {
1380 default:
1381 changeFPCCToORAArch64CC(CC, CondCode, CondCode2);
1382 assert(CondCode2 == AArch64CC::AL);
1383 break;
1384 case CmpInst::FCMP_ONE:
1385 // (a one b)
1386 // == ((a olt b) || (a ogt b))
1387 // == ((a ord b) && (a une b))
1388 CondCode = AArch64CC::VC;
1389 CondCode2 = AArch64CC::NE;
1390 break;
1391 case CmpInst::FCMP_UEQ:
1392 // (a ueq b)
1393 // == ((a uno b) || (a oeq b))
1394 // == ((a ule b) && (a uge b))
1395 CondCode = AArch64CC::PL;
1396 CondCode2 = AArch64CC::LE;
1397 break;
1398 }
1399}
1400
1401/// Return a register which can be used as a bit to test in a TB(N)Z.
1402static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
1403 MachineRegisterInfo &MRI) {
1404 assert(Reg.isValid() && "Expected valid register!");
1405 bool HasZext = false;
1406 while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
1407 unsigned Opc = MI->getOpcode();
1408
1409 if (!MI->getOperand(i: 0).isReg() ||
1410 !MRI.hasOneNonDBGUse(RegNo: MI->getOperand(i: 0).getReg()))
1411 break;
1412
1413 // (tbz (any_ext x), b) -> (tbz x, b) and
1414 // (tbz (zext x), b) -> (tbz x, b) if we don't use the extended bits.
1415 //
1416 // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
1417 // on the truncated x is the same as the bit number on x.
1418 if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
1419 Opc == TargetOpcode::G_TRUNC) {
1420 if (Opc == TargetOpcode::G_ZEXT)
1421 HasZext = true;
1422
1423 Register NextReg = MI->getOperand(i: 1).getReg();
1424 // Did we find something worth folding?
1425 if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(RegNo: NextReg))
1426 break;
1427 TypeSize InSize = MRI.getType(Reg: NextReg).getSizeInBits();
1428 if (Bit >= InSize)
1429 break;
1430
1431 // NextReg is worth folding. Keep looking.
1432 Reg = NextReg;
1433 continue;
1434 }
1435
1436 // Attempt to find a suitable operation with a constant on one side.
1437 std::optional<uint64_t> C;
1438 Register TestReg;
1439 switch (Opc) {
1440 default:
1441 break;
1442 case TargetOpcode::G_AND:
1443 case TargetOpcode::G_XOR: {
1444 TestReg = MI->getOperand(i: 1).getReg();
1445 Register ConstantReg = MI->getOperand(i: 2).getReg();
1446 auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: ConstantReg, MRI);
1447 if (!VRegAndVal) {
1448 // AND commutes, check the other side for a constant.
1449 // FIXME: Can we canonicalize the constant so that it's always on the
1450 // same side at some point earlier?
1451 std::swap(a&: ConstantReg, b&: TestReg);
1452 VRegAndVal = getIConstantVRegValWithLookThrough(VReg: ConstantReg, MRI);
1453 }
1454 if (VRegAndVal) {
1455 if (HasZext)
1456 C = VRegAndVal->Value.getZExtValue();
1457 else
1458 C = VRegAndVal->Value.getSExtValue();
1459 }
1460 break;
1461 }
1462 case TargetOpcode::G_ASHR:
1463 case TargetOpcode::G_LSHR:
1464 case TargetOpcode::G_SHL: {
1465 TestReg = MI->getOperand(i: 1).getReg();
1466 auto VRegAndVal =
1467 getIConstantVRegValWithLookThrough(VReg: MI->getOperand(i: 2).getReg(), MRI);
1468 if (VRegAndVal)
1469 C = VRegAndVal->Value.getSExtValue();
1470 break;
1471 }
1472 }
1473
1474 // Didn't find a constant or viable register. Bail out of the loop.
1475 if (!C || !TestReg.isValid())
1476 break;
1477
1478 // We found a suitable instruction with a constant. Check to see if we can
1479 // walk through the instruction.
1480 Register NextReg;
1481 unsigned TestRegSize = MRI.getType(Reg: TestReg).getSizeInBits();
1482 switch (Opc) {
1483 default:
1484 break;
1485 case TargetOpcode::G_AND:
1486 // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
1487 if ((*C >> Bit) & 1)
1488 NextReg = TestReg;
1489 break;
1490 case TargetOpcode::G_SHL:
1491 // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
1492 // the type of the register.
1493 if (*C <= Bit && (Bit - *C) < TestRegSize) {
1494 NextReg = TestReg;
1495 Bit = Bit - *C;
1496 }
1497 break;
1498 case TargetOpcode::G_ASHR:
1499 // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits
1500 // in x
1501 NextReg = TestReg;
1502 Bit = Bit + *C;
1503 if (Bit >= TestRegSize)
1504 Bit = TestRegSize - 1;
1505 break;
1506 case TargetOpcode::G_LSHR:
1507 // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
1508 if ((Bit + *C) < TestRegSize) {
1509 NextReg = TestReg;
1510 Bit = Bit + *C;
1511 }
1512 break;
1513 case TargetOpcode::G_XOR:
1514 // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
1515 // appropriate.
1516 //
1517 // e.g. If x' = xor x, c, and the b-th bit is set in c then
1518 //
1519 // tbz x', b -> tbnz x, b
1520 //
1521 // Because x' only has the b-th bit set if x does not.
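// E.g. with c = 0b0100 and b = 2: bit 2 of x' is the inverse of bit 2 of x,
// so (tbz x', 2) becomes (tbnz x, 2). If bit b of c is clear, that bit passes
// through the xor unchanged and no inversion is needed.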
1522 if ((*C >> Bit) & 1)
1523 Invert = !Invert;
1524 NextReg = TestReg;
1525 break;
1526 }
1527
1528 // Check if we found anything worth folding.
1529 if (!NextReg.isValid())
1530 return Reg;
1531 Reg = NextReg;
1532 }
1533
1534 return Reg;
1535}
1536
1537MachineInstr *AArch64InstructionSelector::emitTestBit(
1538 Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
1539 MachineIRBuilder &MIB) const {
1540 assert(TestReg.isValid());
1541 assert(ProduceNonFlagSettingCondBr &&
1542 "Cannot emit TB(N)Z with speculation tracking!");
1543 MachineRegisterInfo &MRI = *MIB.getMRI();
1544
1545 // Attempt to optimize the test bit by walking over instructions.
1546 TestReg = getTestBitReg(Reg: TestReg, Bit, Invert&: IsNegative, MRI);
1547 LLT Ty = MRI.getType(Reg: TestReg);
1548 unsigned Size = Ty.getSizeInBits();
1549 assert(!Ty.isVector() && "Expected a scalar!");
1550 assert(Bit < 64 && "Bit is too large!");
1551
// If the test register doesn't already have the width required by the chosen
// TB(N)Z variant, copy it to the right register class: a 32-bit register when
// testing a bit below 32 (so TB(N)ZW works), a 64-bit register otherwise.
1554 bool UseWReg = Bit < 32;
1555 unsigned NecessarySize = UseWReg ? 32 : 64;
1556 if (Size != NecessarySize)
1557 TestReg = moveScalarRegClass(
1558 Reg: TestReg, RC: UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass,
1559 MIB);
1560
1561 static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
1562 {AArch64::TBZW, AArch64::TBNZW}};
1563 unsigned Opc = OpcTable[UseWReg][IsNegative];
1564 auto TestBitMI =
1565 MIB.buildInstr(Opcode: Opc).addReg(RegNo: TestReg).addImm(Val: Bit).addMBB(MBB: DstMBB);
1566 constrainSelectedInstRegOperands(I&: *TestBitMI, TII, TRI, RBI);
1567 return &*TestBitMI;
1568}
1569
1570bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
1571 MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
1572 MachineIRBuilder &MIB) const {
1573 assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
1574 // Given something like this:
1575 //
1576 // %x = ...Something...
1577 // %one = G_CONSTANT i64 1
1578 // %zero = G_CONSTANT i64 0
1579 // %and = G_AND %x, %one
1580 // %cmp = G_ICMP intpred(ne), %and, %zero
1581 // %cmp_trunc = G_TRUNC %cmp
1582 // G_BRCOND %cmp_trunc, %bb.3
1583 //
1584 // We want to try and fold the AND into the G_BRCOND and produce either a
1585 // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
1586 //
1587 // In this case, we'd get
1588 //
// TBNZ %x, %bb.3
1590 //
1591
1592 // Check if the AND has a constant on its RHS which we can use as a mask.
1593 // If it's a power of 2, then it's the same as checking a specific bit.
// (e.g., ANDing with 8 == ANDing with 0b1000 == testing if bit 3 is set)
1595 auto MaybeBit = getIConstantVRegValWithLookThrough(
1596 VReg: AndInst.getOperand(i: 2).getReg(), MRI: *MIB.getMRI());
1597 if (!MaybeBit)
1598 return false;
1599
1600 int32_t Bit = MaybeBit->Value.exactLogBase2();
1601 if (Bit < 0)
1602 return false;
1603
1604 Register TestReg = AndInst.getOperand(i: 1).getReg();
1605
1606 // Emit a TB(N)Z.
1607 emitTestBit(TestReg, Bit, IsNegative: Invert, DstMBB, MIB);
1608 return true;
1609}
1610
1611MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
1612 bool IsNegative,
1613 MachineBasicBlock *DestMBB,
1614 MachineIRBuilder &MIB) const {
1615 assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
1616 MachineRegisterInfo &MRI = *MIB.getMRI();
1617 assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
1618 AArch64::GPRRegBankID &&
1619 "Expected GPRs only?");
1620 auto Ty = MRI.getType(Reg: CompareReg);
1621 unsigned Width = Ty.getSizeInBits();
1622 assert(!Ty.isVector() && "Expected scalar only?");
1623 assert(Width <= 64 && "Expected width to be at most 64?");
1624 static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
1625 {AArch64::CBNZW, AArch64::CBNZX}};
1626 unsigned Opc = OpcTable[IsNegative][Width == 64];
1627 auto BranchMI = MIB.buildInstr(Opc, DstOps: {}, SrcOps: {CompareReg}).addMBB(MBB: DestMBB);
1628 constrainSelectedInstRegOperands(I&: *BranchMI, TII, TRI, RBI);
1629 return &*BranchMI;
1630}
1631
1632bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
1633 MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const {
1634 assert(FCmp.getOpcode() == TargetOpcode::G_FCMP);
1635 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1636 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
1637 // totally clean. Some of them require two branches to implement.
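// E.g. an ordered not-equal compare needs two condition codes, so we may emit
// two Bcc instructions targeting the same destination block below.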
1638 auto Pred = (CmpInst::Predicate)FCmp.getOperand(i: 1).getPredicate();
1639 emitFPCompare(LHS: FCmp.getOperand(i: 2).getReg(), RHS: FCmp.getOperand(i: 3).getReg(), MIRBuilder&: MIB,
1640 Pred);
1641 AArch64CC::CondCode CC1, CC2;
1642 changeFCMPPredToAArch64CC(P: Pred, CondCode&: CC1, CondCode2&: CC2);
1643 MachineBasicBlock *DestMBB = I.getOperand(i: 1).getMBB();
1644 MIB.buildInstr(Opc: AArch64::Bcc, DstOps: {}, SrcOps: {}).addImm(Val: CC1).addMBB(MBB: DestMBB);
1645 if (CC2 != AArch64CC::AL)
1646 MIB.buildInstr(Opc: AArch64::Bcc, DstOps: {}, SrcOps: {}).addImm(Val: CC2).addMBB(MBB: DestMBB);
1647 I.eraseFromParent();
1648 return true;
1649}
1650
1651bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
1652 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1653 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1654 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1655 // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z.
1656 //
1657 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1658 // instructions will not be produced, as they are conditional branch
1659 // instructions that do not set flags.
1660 if (!ProduceNonFlagSettingCondBr)
1661 return false;
1662
1663 MachineRegisterInfo &MRI = *MIB.getMRI();
1664 MachineBasicBlock *DestMBB = I.getOperand(i: 1).getMBB();
1665 auto Pred =
1666 static_cast<CmpInst::Predicate>(ICmp.getOperand(i: 1).getPredicate());
1667 Register LHS = ICmp.getOperand(i: 2).getReg();
1668 Register RHS = ICmp.getOperand(i: 3).getReg();
1669
1670 // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
1671 auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: RHS, MRI);
1672 MachineInstr *AndInst = getOpcodeDef(Opcode: TargetOpcode::G_AND, Reg: LHS, MRI);
1673
1674 // When we can emit a TB(N)Z, prefer that.
1675 //
1676 // Handle non-commutative condition codes first.
1677 // Note that we don't want to do this when we have a G_AND because it can
1678 // become a tst. The tst will make the test bit in the TB(N)Z redundant.
1679 if (VRegAndVal && !AndInst) {
1680 int64_t C = VRegAndVal->Value.getSExtValue();
1681
// When we have a signed greater-than comparison against -1 (x > -1, i.e. x is
// non-negative), we can just test if the msb is zero.
1684 if (C == -1 && Pred == CmpInst::ICMP_SGT) {
1685 uint64_t Bit = MRI.getType(Reg: LHS).getSizeInBits() - 1;
1686 emitTestBit(TestReg: LHS, Bit, /*IsNegative = */ false, DstMBB: DestMBB, MIB);
1687 I.eraseFromParent();
1688 return true;
1689 }
1690
// When we have a signed less-than comparison against zero (x < 0), we can
// just test if the msb is not zero.
1693 if (C == 0 && Pred == CmpInst::ICMP_SLT) {
1694 uint64_t Bit = MRI.getType(Reg: LHS).getSizeInBits() - 1;
1695 emitTestBit(TestReg: LHS, Bit, /*IsNegative = */ true, DstMBB: DestMBB, MIB);
1696 I.eraseFromParent();
1697 return true;
1698 }
1699
// Similarly, if we have a signed greater-than-or-equal comparison against
// zero (x >= 0), we can test if the msb is zero.
1702 if (C == 0 && Pred == CmpInst::ICMP_SGE) {
1703 uint64_t Bit = MRI.getType(Reg: LHS).getSizeInBits() - 1;
1704 emitTestBit(TestReg: LHS, Bit, /*IsNegative = */ false, DstMBB: DestMBB, MIB);
1705 I.eraseFromParent();
1706 return true;
1707 }
1708 }
1709
1710 // Attempt to handle commutative condition codes. Right now, that's only
1711 // eq/ne.
1712 if (ICmpInst::isEquality(P: Pred)) {
1713 if (!VRegAndVal) {
1714 std::swap(a&: RHS, b&: LHS);
1715 VRegAndVal = getIConstantVRegValWithLookThrough(VReg: RHS, MRI);
1716 AndInst = getOpcodeDef(Opcode: TargetOpcode::G_AND, Reg: LHS, MRI);
1717 }
1718
1719 if (VRegAndVal && VRegAndVal->Value == 0) {
1720 // If there's a G_AND feeding into this branch, try to fold it away by
1721 // emitting a TB(N)Z instead.
1722 //
1723 // Note: If we have LT, then it *is* possible to fold, but it wouldn't be
1724 // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding
1725 // would be redundant.
1726 if (AndInst &&
1727 tryOptAndIntoCompareBranch(
1728 AndInst&: *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DstMBB: DestMBB, MIB)) {
1729 I.eraseFromParent();
1730 return true;
1731 }
1732
1733 // Otherwise, try to emit a CB(N)Z instead.
1734 auto LHSTy = MRI.getType(Reg: LHS);
1735 if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
1736 emitCBZ(CompareReg: LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
1737 I.eraseFromParent();
1738 return true;
1739 }
1740 }
1741 }
1742
1743 return false;
1744}
1745
1746bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
1747 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1748 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1749 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1750 if (tryOptCompareBranchFedByICmp(I, ICmp, MIB))
1751 return true;
1752
1753 // Couldn't optimize. Emit a compare + a Bcc.
1754 MachineBasicBlock *DestMBB = I.getOperand(i: 1).getMBB();
1755 auto &PredOp = ICmp.getOperand(i: 1);
1756 emitIntegerCompare(LHS&: ICmp.getOperand(i: 2), RHS&: ICmp.getOperand(i: 3), Predicate&: PredOp, MIRBuilder&: MIB);
1757 const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
1758 P: static_cast<CmpInst::Predicate>(PredOp.getPredicate()),
1759 RHS: ICmp.getOperand(i: 3).getReg(), MRI: MIB.getMRI());
1760 MIB.buildInstr(Opc: AArch64::Bcc, DstOps: {}, SrcOps: {}).addImm(Val: CC).addMBB(MBB: DestMBB);
1761 I.eraseFromParent();
1762 return true;
1763}
1764
1765bool AArch64InstructionSelector::selectCompareBranch(
1766 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
1767 Register CondReg = I.getOperand(i: 0).getReg();
1768 MachineInstr *CCMI = MRI.getVRegDef(Reg: CondReg);
1769 // Try to select the G_BRCOND using whatever is feeding the condition if
1770 // possible.
1771 unsigned CCMIOpc = CCMI->getOpcode();
1772 if (CCMIOpc == TargetOpcode::G_FCMP)
1773 return selectCompareBranchFedByFCmp(I, FCmp&: *CCMI, MIB);
1774 if (CCMIOpc == TargetOpcode::G_ICMP)
1775 return selectCompareBranchFedByICmp(I, ICmp&: *CCMI, MIB);
1776
1777 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1778 // instructions will not be produced, as they are conditional branch
1779 // instructions that do not set flags.
1780 if (ProduceNonFlagSettingCondBr) {
1781 emitTestBit(TestReg: CondReg, /*Bit = */ 0, /*IsNegative = */ true,
1782 DstMBB: I.getOperand(i: 1).getMBB(), MIB);
1783 I.eraseFromParent();
1784 return true;
1785 }
1786
1787 // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead.
1788 auto TstMI =
1789 MIB.buildInstr(Opc: AArch64::ANDSWri, DstOps: {LLT::scalar(SizeInBits: 32)}, SrcOps: {CondReg}).addImm(Val: 1);
1790 constrainSelectedInstRegOperands(I&: *TstMI, TII, TRI, RBI);
1791 auto Bcc = MIB.buildInstr(Opcode: AArch64::Bcc)
1792 .addImm(Val: AArch64CC::NE)
1793 .addMBB(MBB: I.getOperand(i: 1).getMBB());
1794 I.eraseFromParent();
1795 return constrainSelectedInstRegOperands(I&: *Bcc, TII, TRI, RBI);
1796}
1797
1798/// Returns the element immediate value of a vector shift operand if found.
1799/// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR.
1800static std::optional<int64_t> getVectorShiftImm(Register Reg,
1801 MachineRegisterInfo &MRI) {
1802 assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
1803 MachineInstr *OpMI = MRI.getVRegDef(Reg);
1804 return getAArch64VectorSplatScalar(MI: *OpMI, MRI);
1805}
1806
1807/// Matches and returns the shift immediate value for a SHL instruction given
1808/// a shift operand.
1809static std::optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg,
1810 MachineRegisterInfo &MRI) {
1811 std::optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
1812 if (!ShiftImm)
1813 return std::nullopt;
1814 // Check the immediate is in range for a SHL.
1815 int64_t Imm = *ShiftImm;
1816 if (Imm < 0)
1817 return std::nullopt;
1818 switch (SrcTy.getElementType().getSizeInBits()) {
1819 default:
1820 LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift");
1821 return std::nullopt;
1822 case 8:
1823 if (Imm > 7)
1824 return std::nullopt;
1825 break;
1826 case 16:
1827 if (Imm > 15)
1828 return std::nullopt;
1829 break;
1830 case 32:
1831 if (Imm > 31)
1832 return std::nullopt;
1833 break;
1834 case 64:
1835 if (Imm > 63)
1836 return std::nullopt;
1837 break;
1838 }
1839 return Imm;
1840}
1841
1842bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I,
1843 MachineRegisterInfo &MRI) {
1844 assert(I.getOpcode() == TargetOpcode::G_SHL);
1845 Register DstReg = I.getOperand(i: 0).getReg();
1846 const LLT Ty = MRI.getType(Reg: DstReg);
1847 Register Src1Reg = I.getOperand(i: 1).getReg();
1848 Register Src2Reg = I.getOperand(i: 2).getReg();
1849
1850 if (!Ty.isVector())
1851 return false;
1852
1853 // Check if we have a vector of constants on RHS that we can select as the
1854 // immediate form.
1855 std::optional<int64_t> ImmVal = getVectorSHLImm(SrcTy: Ty, Reg: Src2Reg, MRI);
1856
1857 unsigned Opc = 0;
1858 if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64)) {
1859 Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
1860 } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32)) {
1861 Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
1862 } else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32)) {
1863 Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
1864 } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16)) {
1865 Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
1866 } else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16)) {
1867 Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
1868 } else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 8)) {
1869 Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
1870 } else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 8)) {
1871 Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
1872 } else {
1873 LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
1874 return false;
1875 }
1876
1877 auto Shl = MIB.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {Src1Reg});
1878 if (ImmVal)
1879 Shl.addImm(Val: *ImmVal);
1880 else
1881 Shl.addUse(RegNo: Src2Reg);
1882 constrainSelectedInstRegOperands(I&: *Shl, TII, TRI, RBI);
1883 I.eraseFromParent();
1884 return true;
1885}
1886
1887bool AArch64InstructionSelector::selectVectorAshrLshr(
1888 MachineInstr &I, MachineRegisterInfo &MRI) {
1889 assert(I.getOpcode() == TargetOpcode::G_ASHR ||
1890 I.getOpcode() == TargetOpcode::G_LSHR);
1891 Register DstReg = I.getOperand(i: 0).getReg();
1892 const LLT Ty = MRI.getType(Reg: DstReg);
1893 Register Src1Reg = I.getOperand(i: 1).getReg();
1894 Register Src2Reg = I.getOperand(i: 2).getReg();
1895
1896 if (!Ty.isVector())
1897 return false;
1898
1899 bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR;
1900
1901 // We expect the immediate case to be lowered in the PostLegalCombiner to
1902 // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents.
1903
// There is no shift-right-by-register instruction, but the shift-left-by-
// register instruction takes a signed shift amount, where a negative amount
// specifies a right shift.
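// So to select e.g. (G_ASHR x, amt) we emit a NEG of the shift amounts and
// feed it into SSHL (USHL for G_LSHR): sshl x, neg(amt) shifts right by amt.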
1907
1908 unsigned Opc = 0;
1909 unsigned NegOpc = 0;
1910 const TargetRegisterClass *RC =
1911 getRegClassForTypeOnBank(Ty, RB: RBI.getRegBank(ID: AArch64::FPRRegBankID));
1912 if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64)) {
1913 Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
1914 NegOpc = AArch64::NEGv2i64;
1915 } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32)) {
1916 Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
1917 NegOpc = AArch64::NEGv4i32;
1918 } else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32)) {
1919 Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
1920 NegOpc = AArch64::NEGv2i32;
1921 } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16)) {
1922 Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
1923 NegOpc = AArch64::NEGv4i16;
1924 } else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16)) {
1925 Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
1926 NegOpc = AArch64::NEGv8i16;
1927 } else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 8)) {
1928 Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
1929 NegOpc = AArch64::NEGv16i8;
1930 } else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 8)) {
1931 Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
1932 NegOpc = AArch64::NEGv8i8;
1933 } else {
LLVM_DEBUG(dbgs() << "Unhandled G_ASHR/G_LSHR type");
1935 return false;
1936 }
1937
1938 auto Neg = MIB.buildInstr(Opc: NegOpc, DstOps: {RC}, SrcOps: {Src2Reg});
1939 constrainSelectedInstRegOperands(I&: *Neg, TII, TRI, RBI);
1940 auto SShl = MIB.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {Src1Reg, Neg});
1941 constrainSelectedInstRegOperands(I&: *SShl, TII, TRI, RBI);
1942 I.eraseFromParent();
1943 return true;
1944}
1945
1946bool AArch64InstructionSelector::selectVaStartAAPCS(
1947 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1948
1949 if (STI.isCallingConvWin64(CC: MF.getFunction().getCallingConv(),
1950 IsVarArg: MF.getFunction().isVarArg()))
1951 return false;
1952
1953 // The layout of the va_list struct is specified in the AArch64 Procedure Call
1954 // Standard, section 10.1.5.
1955
1956 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
1957 const unsigned PtrSize = STI.isTargetILP32() ? 4 : 8;
1958 const auto *PtrRegClass =
1959 STI.isTargetILP32() ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
1960
1961 const MCInstrDesc &MCIDAddAddr =
1962 TII.get(Opcode: STI.isTargetILP32() ? AArch64::ADDWri : AArch64::ADDXri);
1963 const MCInstrDesc &MCIDStoreAddr =
1964 TII.get(Opcode: STI.isTargetILP32() ? AArch64::STRWui : AArch64::STRXui);
1965
1966 /*
1967 * typedef struct va_list {
1968 * void * stack; // next stack param
1969 * void * gr_top; // end of GP arg reg save area
1970 * void * vr_top; // end of FP/SIMD arg reg save area
1971 * int gr_offs; // offset from gr_top to next GP register arg
1972 * int vr_offs; // offset from vr_top to next FP/SIMD register arg
1973 * } va_list;
1974 */
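// With 8-byte pointers the fields therefore sit at byte offsets 0, 8, 16, 24
// and 28; on ILP32 (4-byte pointers) they sit at 0, 4, 8, 12 and 16.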
1975 const auto VAList = I.getOperand(i: 0).getReg();
1976
1977 // Our current offset in bytes from the va_list struct (VAList).
1978 unsigned OffsetBytes = 0;
1979
1980 // Helper function to store (FrameIndex + Imm) to VAList at offset OffsetBytes
1981 // and increment OffsetBytes by PtrSize.
1982 const auto PushAddress = [&](const int FrameIndex, const int64_t Imm) {
1983 const Register Top = MRI.createVirtualRegister(RegClass: PtrRegClass);
1984 auto MIB = BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(), MCID: MCIDAddAddr)
1985 .addDef(RegNo: Top)
1986 .addFrameIndex(Idx: FrameIndex)
1987 .addImm(Val: Imm)
1988 .addImm(Val: 0);
1989 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
1990
1991 const auto *MMO = *I.memoperands_begin();
1992 MIB = BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(), MCID: MCIDStoreAddr)
1993 .addUse(RegNo: Top)
1994 .addUse(RegNo: VAList)
1995 .addImm(Val: OffsetBytes / PtrSize)
1996 .addMemOperand(MMO: MF.getMachineMemOperand(
1997 PtrInfo: MMO->getPointerInfo().getWithOffset(O: OffsetBytes),
1998 F: MachineMemOperand::MOStore, Size: PtrSize, BaseAlignment: MMO->getBaseAlign()));
1999 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2000
2001 OffsetBytes += PtrSize;
2002 };
2003
2004 // void* stack at offset 0
2005 PushAddress(FuncInfo->getVarArgsStackIndex(), 0);
2006
2007 // void* gr_top at offset 8 (4 on ILP32)
2008 const unsigned GPRSize = FuncInfo->getVarArgsGPRSize();
2009 PushAddress(FuncInfo->getVarArgsGPRIndex(), GPRSize);
2010
2011 // void* vr_top at offset 16 (8 on ILP32)
2012 const unsigned FPRSize = FuncInfo->getVarArgsFPRSize();
2013 PushAddress(FuncInfo->getVarArgsFPRIndex(), FPRSize);
2014
2015 // Helper function to store a 4-byte integer constant to VAList at offset
2016 // OffsetBytes, and increment OffsetBytes by 4.
2017 const auto PushIntConstant = [&](const int32_t Value) {
2018 constexpr int IntSize = 4;
2019 const Register Temp = MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
2020 auto MIB =
2021 BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AArch64::MOVi32imm))
2022 .addDef(RegNo: Temp)
2023 .addImm(Val: Value);
2024 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2025
2026 const auto *MMO = *I.memoperands_begin();
2027 MIB = BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AArch64::STRWui))
2028 .addUse(RegNo: Temp)
2029 .addUse(RegNo: VAList)
2030 .addImm(Val: OffsetBytes / IntSize)
2031 .addMemOperand(MMO: MF.getMachineMemOperand(
2032 PtrInfo: MMO->getPointerInfo().getWithOffset(O: OffsetBytes),
2033 F: MachineMemOperand::MOStore, Size: IntSize, BaseAlignment: MMO->getBaseAlign()));
2034 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2035 OffsetBytes += IntSize;
2036 };
2037
2038 // int gr_offs at offset 24 (12 on ILP32)
2039 PushIntConstant(-static_cast<int32_t>(GPRSize));
2040
2041 // int vr_offs at offset 28 (16 on ILP32)
2042 PushIntConstant(-static_cast<int32_t>(FPRSize));
2043
2044 assert(OffsetBytes == (STI.isTargetILP32() ? 20 : 32) && "Unexpected offset");
2045
2046 I.eraseFromParent();
2047 return true;
2048}
2049
2050bool AArch64InstructionSelector::selectVaStartDarwin(
2051 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
2052 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
2053 Register ListReg = I.getOperand(i: 0).getReg();
2054
2055 Register ArgsAddrReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
2056
2057 int FrameIdx = FuncInfo->getVarArgsStackIndex();
2058 if (MF.getSubtarget<AArch64Subtarget>().isCallingConvWin64(
2059 CC: MF.getFunction().getCallingConv(), IsVarArg: MF.getFunction().isVarArg())) {
2060 FrameIdx = FuncInfo->getVarArgsGPRSize() > 0
2061 ? FuncInfo->getVarArgsGPRIndex()
2062 : FuncInfo->getVarArgsStackIndex();
2063 }
2064
2065 auto MIB =
2066 BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AArch64::ADDXri))
2067 .addDef(RegNo: ArgsAddrReg)
2068 .addFrameIndex(Idx: FrameIdx)
2069 .addImm(Val: 0)
2070 .addImm(Val: 0);
2071
2072 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2073
2074 MIB = BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AArch64::STRXui))
2075 .addUse(RegNo: ArgsAddrReg)
2076 .addUse(RegNo: ListReg)
2077 .addImm(Val: 0)
2078 .addMemOperand(MMO: *I.memoperands_begin());
2079
2080 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2081 I.eraseFromParent();
2082 return true;
2083}
2084
2085void AArch64InstructionSelector::materializeLargeCMVal(
2086 MachineInstr &I, const Value *V, unsigned OpFlags) {
2087 MachineBasicBlock &MBB = *I.getParent();
2088 MachineFunction &MF = *MBB.getParent();
2089 MachineRegisterInfo &MRI = MF.getRegInfo();
2090
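// Materialize the 64-bit value piecewise: a MOVZ for bits [15:0] followed by
// MOVKs for bits [31:16], [47:32] and [63:48], each symbol operand carrying
// the matching MO_Gn target flags.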
2091 auto MovZ = MIB.buildInstr(Opc: AArch64::MOVZXi, DstOps: {&AArch64::GPR64RegClass}, SrcOps: {});
2092 MovZ->addOperand(MF, Op: I.getOperand(i: 1));
2093 MovZ->getOperand(i: 1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
2094 AArch64II::MO_NC);
2095 MovZ->addOperand(MF, Op: MachineOperand::CreateImm(Val: 0));
2096 constrainSelectedInstRegOperands(I&: *MovZ, TII, TRI, RBI);
2097
2098 auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
2099 Register ForceDstReg) {
2100 Register DstReg = ForceDstReg
2101 ? ForceDstReg
2102 : MRI.createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
2103 auto MovI = MIB.buildInstr(Opcode: AArch64::MOVKXi).addDef(RegNo: DstReg).addUse(RegNo: SrcReg);
2104 if (auto *GV = dyn_cast<GlobalValue>(Val: V)) {
2105 MovI->addOperand(MF, Op: MachineOperand::CreateGA(
2106 GV, Offset: MovZ->getOperand(i: 1).getOffset(), TargetFlags: Flags));
2107 } else {
2108 MovI->addOperand(
2109 MF, Op: MachineOperand::CreateBA(BA: cast<BlockAddress>(Val: V),
2110 Offset: MovZ->getOperand(i: 1).getOffset(), TargetFlags: Flags));
2111 }
2112 MovI->addOperand(MF, Op: MachineOperand::CreateImm(Val: Offset));
2113 constrainSelectedInstRegOperands(I&: *MovI, TII, TRI, RBI);
2114 return DstReg;
2115 };
2116 Register DstReg = BuildMovK(MovZ.getReg(Idx: 0),
2117 AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
2118 DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
2119 BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(i: 0).getReg());
2120}
2121
2122bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
2123 MachineBasicBlock &MBB = *I.getParent();
2124 MachineFunction &MF = *MBB.getParent();
2125 MachineRegisterInfo &MRI = MF.getRegInfo();
2126
2127 switch (I.getOpcode()) {
2128 case TargetOpcode::G_STORE: {
2129 bool Changed = contractCrossBankCopyIntoStore(I, MRI);
2130 MachineOperand &SrcOp = I.getOperand(i: 0);
2131 if (MRI.getType(Reg: SrcOp.getReg()).isPointer()) {
2132 // Allow matching with imported patterns for stores of pointers. Unlike
2133 // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy
2134 // and constrain.
2135 auto Copy = MIB.buildCopy(Res: LLT::scalar(SizeInBits: 64), Op: SrcOp);
2136 Register NewSrc = Copy.getReg(Idx: 0);
2137 SrcOp.setReg(NewSrc);
2138 RBI.constrainGenericRegister(Reg: NewSrc, RC: AArch64::GPR64RegClass, MRI);
2139 Changed = true;
2140 }
2141 return Changed;
2142 }
2143 case TargetOpcode::G_PTR_ADD: {
2144 // If Checked Pointer Arithmetic (FEAT_CPA) is present, preserve the pointer
2145 // arithmetic semantics instead of falling back to regular arithmetic.
2146 const auto &TL = STI.getTargetLowering();
2147 if (TL->shouldPreservePtrArith(F: MF.getFunction(), PtrVT: EVT()))
2148 return false;
2149 return convertPtrAddToAdd(I, MRI);
2150 }
2151 case TargetOpcode::G_LOAD: {
2152 // For scalar loads of pointers, we try to convert the dest type from p0
2153 // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
2154 // conversion, this should be ok because all users should have been
2155 // selected already, so the type doesn't matter for them.
2156 Register DstReg = I.getOperand(i: 0).getReg();
2157 const LLT DstTy = MRI.getType(Reg: DstReg);
2158 if (!DstTy.isPointer())
2159 return false;
2160 MRI.setType(VReg: DstReg, Ty: LLT::scalar(SizeInBits: 64));
2161 return true;
2162 }
2163 case AArch64::G_DUP: {
2164 // Convert the type from p0 to s64 to help selection.
2165 LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
2166 if (!DstTy.isPointerVector())
2167 return false;
2168 auto NewSrc = MIB.buildCopy(Res: LLT::scalar(SizeInBits: 64), Op: I.getOperand(i: 1).getReg());
2169 MRI.setType(VReg: I.getOperand(i: 0).getReg(),
2170 Ty: DstTy.changeElementType(NewEltTy: LLT::scalar(SizeInBits: 64)));
2171 MRI.setRegClass(Reg: NewSrc.getReg(Idx: 0), RC: &AArch64::GPR64RegClass);
2172 I.getOperand(i: 1).setReg(NewSrc.getReg(Idx: 0));
2173 return true;
2174 }
2175 case AArch64::G_INSERT_VECTOR_ELT: {
2176 // Convert the type from p0 to s64 to help selection.
2177 LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
2178 LLT SrcVecTy = MRI.getType(Reg: I.getOperand(i: 1).getReg());
2179 if (!SrcVecTy.isPointerVector())
2180 return false;
2181 auto NewSrc = MIB.buildCopy(Res: LLT::scalar(SizeInBits: 64), Op: I.getOperand(i: 2).getReg());
2182 MRI.setType(VReg: I.getOperand(i: 1).getReg(),
2183 Ty: DstTy.changeElementType(NewEltTy: LLT::scalar(SizeInBits: 64)));
2184 MRI.setType(VReg: I.getOperand(i: 0).getReg(),
2185 Ty: DstTy.changeElementType(NewEltTy: LLT::scalar(SizeInBits: 64)));
2186 MRI.setRegClass(Reg: NewSrc.getReg(Idx: 0), RC: &AArch64::GPR64RegClass);
2187 I.getOperand(i: 2).setReg(NewSrc.getReg(Idx: 0));
2188 return true;
2189 }
2190 case TargetOpcode::G_UITOFP:
2191 case TargetOpcode::G_SITOFP: {
// If both source and destination regbanks are FPR, then convert the opcode
// to G_SITOF/G_UITOF so that the importer can select it to an fpr variant.
2194 // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank
2195 // copy.
2196 Register SrcReg = I.getOperand(i: 1).getReg();
2197 LLT SrcTy = MRI.getType(Reg: SrcReg);
2198 LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
2199 if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits())
2200 return false;
2201
2202 if (RBI.getRegBank(Reg: SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) {
2203 if (I.getOpcode() == TargetOpcode::G_SITOFP)
2204 I.setDesc(TII.get(Opcode: AArch64::G_SITOF));
2205 else
2206 I.setDesc(TII.get(Opcode: AArch64::G_UITOF));
2207 return true;
2208 }
2209 return false;
2210 }
2211 default:
2212 return false;
2213 }
2214}
2215
2216/// This lowering tries to look for G_PTR_ADD instructions and then converts
2217/// them to a standard G_ADD with a COPY on the source.
2218///
2219/// The motivation behind this is to expose the add semantics to the imported
/// tablegen patterns. We shouldn't need to check for uses being loads/stores,
/// because the selector works bottom up, uses before defs. By the time we end
/// up trying to select a G_PTR_ADD, we should already have attempted to fold
/// it into addressing modes, so that attempt must have failed.
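/// A minimal sketch of the rewrite (illustrative MIR):
///   %dst:_(p0) = G_PTR_ADD %base:_(p0), %off:_(s64)
/// becomes
///   %cast:_(s64) = G_PTRTOINT %base:_(p0)
///   %dst:_(s64)  = G_ADD %cast:_(s64), %off:_(s64)
/// or a G_SUB when %off is a 0-x negation (see the tail of the function).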
2224bool AArch64InstructionSelector::convertPtrAddToAdd(
2225 MachineInstr &I, MachineRegisterInfo &MRI) {
2226 assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
2227 Register DstReg = I.getOperand(i: 0).getReg();
2228 Register AddOp1Reg = I.getOperand(i: 1).getReg();
2229 const LLT PtrTy = MRI.getType(Reg: DstReg);
2230 if (PtrTy.getAddressSpace() != 0)
2231 return false;
2232
2233 const LLT CastPtrTy =
2234 PtrTy.isVector() ? LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64) : LLT::scalar(SizeInBits: 64);
2235 auto PtrToInt = MIB.buildPtrToInt(Dst: CastPtrTy, Src: AddOp1Reg);
2236 // Set regbanks on the registers.
2237 if (PtrTy.isVector())
2238 MRI.setRegBank(Reg: PtrToInt.getReg(Idx: 0), RegBank: RBI.getRegBank(ID: AArch64::FPRRegBankID));
2239 else
2240 MRI.setRegBank(Reg: PtrToInt.getReg(Idx: 0), RegBank: RBI.getRegBank(ID: AArch64::GPRRegBankID));
2241
2242 // Now turn the %dst(p0) = G_PTR_ADD %base, off into:
2243 // %dst(intty) = G_ADD %intbase, off
2244 I.setDesc(TII.get(Opcode: TargetOpcode::G_ADD));
2245 MRI.setType(VReg: DstReg, Ty: CastPtrTy);
2246 I.getOperand(i: 1).setReg(PtrToInt.getReg(Idx: 0));
2247 if (!select(I&: *PtrToInt)) {
2248 LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
2249 return false;
2250 }
2251
2252 // Also take the opportunity here to try to do some optimization.
2253 // Try to convert this into a G_SUB if the offset is a 0-x negate idiom.
2254 Register NegatedReg;
2255 if (!mi_match(R: I.getOperand(i: 2).getReg(), MRI, P: m_Neg(Src: m_Reg(R&: NegatedReg))))
2256 return true;
2257 I.getOperand(i: 2).setReg(NegatedReg);
2258 I.setDesc(TII.get(Opcode: TargetOpcode::G_SUB));
2259 return true;
2260}
2261
2262bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
2263 MachineRegisterInfo &MRI) {
2264 // We try to match the immediate variant of LSL, which is actually an alias
2265 // for a special case of UBFM. Otherwise, we fall back to the imported
2266 // selector which will match the register variant.
2267 assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
2268 const auto &MO = I.getOperand(i: 2);
2269 auto VRegAndVal = getIConstantVRegVal(VReg: MO.getReg(), MRI);
2270 if (!VRegAndVal)
2271 return false;
2272
2273 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
2274 if (DstTy.isVector())
2275 return false;
2276 bool Is64Bit = DstTy.getSizeInBits() == 64;
2277 auto Imm1Fn = Is64Bit ? selectShiftA_64(Root: MO) : selectShiftA_32(Root: MO);
2278 auto Imm2Fn = Is64Bit ? selectShiftB_64(Root: MO) : selectShiftB_32(Root: MO);
2279
2280 if (!Imm1Fn || !Imm2Fn)
2281 return false;
2282
2283 auto NewI =
2284 MIB.buildInstr(Opc: Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri,
2285 DstOps: {I.getOperand(i: 0).getReg()}, SrcOps: {I.getOperand(i: 1).getReg()});
2286
2287 for (auto &RenderFn : *Imm1Fn)
2288 RenderFn(NewI);
2289 for (auto &RenderFn : *Imm2Fn)
2290 RenderFn(NewI);
2291
2292 I.eraseFromParent();
2293 return constrainSelectedInstRegOperands(I&: *NewI, TII, TRI, RBI);
2294}
2295
2296bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
2297 MachineInstr &I, MachineRegisterInfo &MRI) {
2298 assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
2299 // If we're storing a scalar, it doesn't matter what register bank that
2300 // scalar is on. All that matters is the size.
2301 //
2302 // So, if we see something like this (with a 32-bit scalar as an example):
2303 //
2304 // %x:gpr(s32) = ... something ...
2305 // %y:fpr(s32) = COPY %x:gpr(s32)
2306 // G_STORE %y:fpr(s32)
2307 //
2308 // We can fix this up into something like this:
2309 //
2310 // G_STORE %x:gpr(s32)
2311 //
2312 // And then continue the selection process normally.
2313 Register DefDstReg = getSrcRegIgnoringCopies(Reg: I.getOperand(i: 0).getReg(), MRI);
2314 if (!DefDstReg.isValid())
2315 return false;
2316 LLT DefDstTy = MRI.getType(Reg: DefDstReg);
2317 Register StoreSrcReg = I.getOperand(i: 0).getReg();
2318 LLT StoreSrcTy = MRI.getType(Reg: StoreSrcReg);
2319
2320 // If we get something strange like a physical register, then we shouldn't
2321 // go any further.
2322 if (!DefDstTy.isValid())
2323 return false;
2324
2325 // Are the source and dst types the same size?
2326 if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
2327 return false;
2328
2329 if (RBI.getRegBank(Reg: StoreSrcReg, MRI, TRI) ==
2330 RBI.getRegBank(Reg: DefDstReg, MRI, TRI))
2331 return false;
2332
2333 // We have a cross-bank copy, which is entering a store. Let's fold it.
2334 I.getOperand(i: 0).setReg(DefDstReg);
2335 return true;
2336}
2337
2338bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
2339 assert(I.getParent() && "Instruction should be in a basic block!");
2340 assert(I.getParent()->getParent() && "Instruction should be in a function!");
2341
2342 MachineBasicBlock &MBB = *I.getParent();
2343 MachineFunction &MF = *MBB.getParent();
2344 MachineRegisterInfo &MRI = MF.getRegInfo();
2345
2346 switch (I.getOpcode()) {
2347 case AArch64::G_DUP: {
2348 // Before selecting a DUP instruction, check if it is better selected as a
2349 // MOV or load from a constant pool.
2350 Register Src = I.getOperand(i: 1).getReg();
2351 auto ValAndVReg = getAnyConstantVRegValWithLookThrough(VReg: Src, MRI);
2352 if (!ValAndVReg)
2353 return false;
2354 LLVMContext &Ctx = MF.getFunction().getContext();
2355 Register Dst = I.getOperand(i: 0).getReg();
2356 auto *CV = ConstantDataVector::getSplat(
2357 NumElts: MRI.getType(Reg: Dst).getNumElements(),
2358 Elt: ConstantInt::get(
2359 Ty: Type::getIntNTy(C&: Ctx, N: MRI.getType(Reg: Dst).getScalarSizeInBits()),
2360 V: ValAndVReg->Value.trunc(width: MRI.getType(Reg: Dst).getScalarSizeInBits())));
2361 if (!emitConstantVector(Dst, CV, MIRBuilder&: MIB, MRI))
2362 return false;
2363 I.eraseFromParent();
2364 return true;
2365 }
2366 case TargetOpcode::G_SEXT:
2367 // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV
2368 // over a normal extend.
2369 if (selectUSMovFromExtend(I, MRI))
2370 return true;
2371 return false;
2372 case TargetOpcode::G_BR:
2373 return false;
2374 case TargetOpcode::G_SHL:
2375 return earlySelectSHL(I, MRI);
2376 case TargetOpcode::G_CONSTANT: {
2377 bool IsZero = false;
2378 if (I.getOperand(i: 1).isCImm())
2379 IsZero = I.getOperand(i: 1).getCImm()->isZero();
2380 else if (I.getOperand(i: 1).isImm())
2381 IsZero = I.getOperand(i: 1).getImm() == 0;
2382
2383 if (!IsZero)
2384 return false;
2385
2386 Register DefReg = I.getOperand(i: 0).getReg();
2387 LLT Ty = MRI.getType(Reg: DefReg);
2388 if (Ty.getSizeInBits() == 64) {
2389 I.getOperand(i: 1).ChangeToRegister(Reg: AArch64::XZR, isDef: false);
2390 RBI.constrainGenericRegister(Reg: DefReg, RC: AArch64::GPR64RegClass, MRI);
2391 } else if (Ty.getSizeInBits() <= 32) {
2392 I.getOperand(i: 1).ChangeToRegister(Reg: AArch64::WZR, isDef: false);
2393 RBI.constrainGenericRegister(Reg: DefReg, RC: AArch64::GPR32RegClass, MRI);
2394 } else
2395 return false;
2396
2397 I.setDesc(TII.get(Opcode: TargetOpcode::COPY));
2398 return true;
2399 }
2400
2401 case TargetOpcode::G_ADD: {
2402 // Check if this is being fed by a G_ICMP on either side.
2403 //
2404 // (cmp pred, x, y) + z
2405 //
2406 // In the above case, when the cmp is true, we increment z by 1. So, we can
2407 // fold the add into the cset for the cmp by using cinc.
2408 //
2409 // FIXME: This would probably be a lot nicer in PostLegalizerLowering.
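// Roughly: %c = G_ICMP pred, %x, %y ; %r = G_ADD %z, %c
// becomes a flag-setting compare followed by CSINC %r, %z, %z, inv(pred),
// i.e. a 'cinc %r, %z, pred' that adds one exactly when the predicate holds.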
2410 Register AddDst = I.getOperand(i: 0).getReg();
2411 Register AddLHS = I.getOperand(i: 1).getReg();
2412 Register AddRHS = I.getOperand(i: 2).getReg();
2413 // Only handle scalars.
2414 LLT Ty = MRI.getType(Reg: AddLHS);
2415 if (Ty.isVector())
2416 return false;
2417 // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64
2418 // bits.
2419 unsigned Size = Ty.getSizeInBits();
2420 if (Size != 32 && Size != 64)
2421 return false;
2422 auto MatchCmp = [&](Register Reg) -> MachineInstr * {
2423 if (!MRI.hasOneNonDBGUse(RegNo: Reg))
2424 return nullptr;
2425 // If the LHS of the add is 32 bits, then we want to fold a 32-bit
2426 // compare.
2427 if (Size == 32)
2428 return getOpcodeDef(Opcode: TargetOpcode::G_ICMP, Reg, MRI);
2429 // We model scalar compares using 32-bit destinations right now.
2430 // If it's a 64-bit compare, it'll have 64-bit sources.
2431 Register ZExt;
2432 if (!mi_match(R: Reg, MRI,
2433 P: m_OneNonDBGUse(SP: m_GZExt(Src: m_OneNonDBGUse(SP: m_Reg(R&: ZExt))))))
2434 return nullptr;
2435 auto *Cmp = getOpcodeDef(Opcode: TargetOpcode::G_ICMP, Reg: ZExt, MRI);
2436 if (!Cmp ||
2437 MRI.getType(Reg: Cmp->getOperand(i: 2).getReg()).getSizeInBits() != 64)
2438 return nullptr;
2439 return Cmp;
2440 };
2441 // Try to match
2442 // z + (cmp pred, x, y)
2443 MachineInstr *Cmp = MatchCmp(AddRHS);
2444 if (!Cmp) {
2445 // (cmp pred, x, y) + z
2446 std::swap(a&: AddLHS, b&: AddRHS);
2447 Cmp = MatchCmp(AddRHS);
2448 if (!Cmp)
2449 return false;
2450 }
2451 auto &PredOp = Cmp->getOperand(i: 1);
2452 MIB.setInstrAndDebugLoc(I);
2453 emitIntegerCompare(/*LHS=*/Cmp->getOperand(i: 2),
2454 /*RHS=*/Cmp->getOperand(i: 3), Predicate&: PredOp, MIRBuilder&: MIB);
2455 auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
2456 const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(
2457 P: CmpInst::getInversePredicate(pred: Pred), RHS: Cmp->getOperand(i: 3).getReg(), MRI: &MRI);
2458 emitCSINC(/*Dst=*/AddDst, /*Src =*/Src1: AddLHS, /*Src2=*/AddLHS, Pred: InvCC, MIRBuilder&: MIB);
2459 I.eraseFromParent();
2460 return true;
2461 }
2462 case TargetOpcode::G_OR: {
2463 // Look for operations that take the lower `Width=Size-ShiftImm` bits of
2464 // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via
2465 // shifting and masking that we can replace with a BFI (encoded as a BFM).
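// E.g. for Size = 32 and ShiftImm = 8 we match
//   %dst = G_OR (G_SHL %shiftsrc, 8), (G_AND %masksrc, 0xff)
// and emit BFMWri with immr = 24, imms = 23 (i.e. BFI %masksrc, %shiftsrc,
// #8, #24), inserting the low 24 bits of %shiftsrc into bits [31:8] of
// %masksrc.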
2466 Register Dst = I.getOperand(i: 0).getReg();
2467 LLT Ty = MRI.getType(Reg: Dst);
2468
2469 if (!Ty.isScalar())
2470 return false;
2471
2472 unsigned Size = Ty.getSizeInBits();
2473 if (Size != 32 && Size != 64)
2474 return false;
2475
2476 Register ShiftSrc;
2477 int64_t ShiftImm;
2478 Register MaskSrc;
2479 int64_t MaskImm;
2480 if (!mi_match(
2481 R: Dst, MRI,
2482 P: m_GOr(L: m_OneNonDBGUse(SP: m_GShl(L: m_Reg(R&: ShiftSrc), R: m_ICst(Cst&: ShiftImm))),
2483 R: m_OneNonDBGUse(SP: m_GAnd(L: m_Reg(R&: MaskSrc), R: m_ICst(Cst&: MaskImm))))))
2484 return false;
2485
2486 if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm))
2487 return false;
2488
2489 int64_t Immr = Size - ShiftImm;
2490 int64_t Imms = Size - ShiftImm - 1;
2491 unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri;
2492 emitInstr(Opcode: Opc, DstOps: {Dst}, SrcOps: {MaskSrc, ShiftSrc, Immr, Imms}, MIRBuilder&: MIB);
2493 I.eraseFromParent();
2494 return true;
2495 }
2496 case TargetOpcode::G_FENCE: {
2497 if (I.getOperand(i: 1).getImm() == 0)
2498 BuildMI(BB&: MBB, I, MIMD: MIMetadata(I), MCID: TII.get(Opcode: TargetOpcode::MEMBARRIER));
2499 else
2500 BuildMI(BB&: MBB, I, MIMD: MIMetadata(I), MCID: TII.get(Opcode: AArch64::DMB))
2501 .addImm(Val: I.getOperand(i: 0).getImm() == 4 ? 0x9 : 0xb);
2502 I.eraseFromParent();
2503 return true;
2504 }
2505 default:
2506 return false;
2507 }
2508}
2509
2510bool AArch64InstructionSelector::select(MachineInstr &I) {
2511 assert(I.getParent() && "Instruction should be in a basic block!");
2512 assert(I.getParent()->getParent() && "Instruction should be in a function!");
2513
2514 MachineBasicBlock &MBB = *I.getParent();
2515 MachineFunction &MF = *MBB.getParent();
2516 MachineRegisterInfo &MRI = MF.getRegInfo();
2517
2518 const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>();
2519 if (Subtarget->requiresStrictAlign()) {
2520 // We don't support this feature yet.
2521 LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
2522 return false;
2523 }
2524
2525 MIB.setInstrAndDebugLoc(I);
2526
2527 unsigned Opcode = I.getOpcode();
2528 // G_PHI requires same handling as PHI
2529 if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
2530 // Certain non-generic instructions also need some special handling.
2531
2532 if (Opcode == TargetOpcode::LOAD_STACK_GUARD)
2533 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2534
2535 if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
2536 const Register DefReg = I.getOperand(i: 0).getReg();
2537 const LLT DefTy = MRI.getType(Reg: DefReg);
2538
2539 const RegClassOrRegBank &RegClassOrBank =
2540 MRI.getRegClassOrRegBank(Reg: DefReg);
2541
2542 const TargetRegisterClass *DefRC =
2543 dyn_cast<const TargetRegisterClass *>(Val: RegClassOrBank);
2544 if (!DefRC) {
2545 if (!DefTy.isValid()) {
2546 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
2547 return false;
2548 }
2549 const RegisterBank &RB = *cast<const RegisterBank *>(Val: RegClassOrBank);
2550 DefRC = getRegClassForTypeOnBank(Ty: DefTy, RB);
2551 if (!DefRC) {
2552 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
2553 return false;
2554 }
2555 }
2556
2557 I.setDesc(TII.get(Opcode: TargetOpcode::PHI));
2558
2559 return RBI.constrainGenericRegister(Reg: DefReg, RC: *DefRC, MRI);
2560 }
2561
2562 if (I.isCopy())
2563 return selectCopy(I, TII, MRI, TRI, RBI);
2564
2565 if (I.isDebugInstr())
2566 return selectDebugInstr(I, MRI, RBI);
2567
2568 return true;
2569 }
2570
2571
2572 if (I.getNumOperands() != I.getNumExplicitOperands()) {
2573 LLVM_DEBUG(
2574 dbgs() << "Generic instruction has unexpected implicit operands\n");
2575 return false;
2576 }
2577
2578 // Try to do some lowering before we start instruction selecting. These
2579 // lowerings are purely transformations on the input G_MIR and so selection
2580 // must continue after any modification of the instruction.
2581 if (preISelLower(I)) {
2582 Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
2583 }
2584
// There may be patterns that the importer can't handle optimally but still
// selects to a suboptimal sequence, so our custom C++ selection code later
// never gets a chance to work on them. Therefore, we have an early selection
// attempt here to give priority to certain selection routines over the
// imported ones.
2590 if (earlySelect(I))
2591 return true;
2592
2593 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
2594 return true;
2595
2596 LLT Ty =
2597 I.getOperand(i: 0).isReg() ? MRI.getType(Reg: I.getOperand(i: 0).getReg()) : LLT{};
2598
2599 switch (Opcode) {
2600 case TargetOpcode::G_SBFX:
2601 case TargetOpcode::G_UBFX: {
2602 static const unsigned OpcTable[2][2] = {
2603 {AArch64::UBFMWri, AArch64::UBFMXri},
2604 {AArch64::SBFMWri, AArch64::SBFMXri}};
2605 bool IsSigned = Opcode == TargetOpcode::G_SBFX;
2606 unsigned Size = Ty.getSizeInBits();
2607 unsigned Opc = OpcTable[IsSigned][Size == 64];
2608 auto Cst1 =
2609 getIConstantVRegValWithLookThrough(VReg: I.getOperand(i: 2).getReg(), MRI);
2610 assert(Cst1 && "Should have gotten a constant for src 1?");
2611 auto Cst2 =
2612 getIConstantVRegValWithLookThrough(VReg: I.getOperand(i: 3).getReg(), MRI);
2613 assert(Cst2 && "Should have gotten a constant for src 2?");
2614 auto LSB = Cst1->Value.getZExtValue();
2615 auto Width = Cst2->Value.getZExtValue();
2616 auto BitfieldInst =
2617 MIB.buildInstr(Opc, DstOps: {I.getOperand(i: 0)}, SrcOps: {I.getOperand(i: 1)})
2618 .addImm(Val: LSB)
2619 .addImm(Val: LSB + Width - 1);
2620 I.eraseFromParent();
2621 return constrainSelectedInstRegOperands(I&: *BitfieldInst, TII, TRI, RBI);
2622 }
2623 case TargetOpcode::G_BRCOND:
2624 return selectCompareBranch(I, MF, MRI);
2625
2626 case TargetOpcode::G_BRINDIRECT: {
2627 const Function &Fn = MF.getFunction();
2628 if (std::optional<uint16_t> BADisc =
2629 STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(ParentFn: Fn)) {
2630 auto MI = MIB.buildInstr(Opc: AArch64::BRA, DstOps: {}, SrcOps: {I.getOperand(i: 0).getReg()});
2631 MI.addImm(Val: AArch64PACKey::IA);
2632 MI.addImm(Val: *BADisc);
2633 MI.addReg(/*AddrDisc=*/RegNo: AArch64::XZR);
2634 I.eraseFromParent();
2635 return constrainSelectedInstRegOperands(I&: *MI, TII, TRI, RBI);
2636 }
2637 I.setDesc(TII.get(Opcode: AArch64::BR));
2638 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2639 }
2640
2641 case TargetOpcode::G_BRJT:
2642 return selectBrJT(I, MRI);
2643
2644 case AArch64::G_ADD_LOW: {
// This op may have been separated from its ADRP companion by the localizer
// or some other code motion pass. Given that many CPUs will try to
// macro-fuse these operations anyway, select this into a MOVaddr pseudo
// which will later be expanded into an ADRP + ADD pair after scheduling.
2649 MachineInstr *BaseMI = MRI.getVRegDef(Reg: I.getOperand(i: 1).getReg());
2650 if (BaseMI->getOpcode() != AArch64::ADRP) {
2651 I.setDesc(TII.get(Opcode: AArch64::ADDXri));
2652 I.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2653 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2654 }
2655 assert(TM.getCodeModel() == CodeModel::Small &&
2656 "Expected small code model");
2657 auto Op1 = BaseMI->getOperand(i: 1);
2658 auto Op2 = I.getOperand(i: 2);
2659 auto MovAddr = MIB.buildInstr(Opc: AArch64::MOVaddr, DstOps: {I.getOperand(i: 0)}, SrcOps: {})
2660 .addGlobalAddress(GV: Op1.getGlobal(), Offset: Op1.getOffset(),
2661 TargetFlags: Op1.getTargetFlags())
2662 .addGlobalAddress(GV: Op2.getGlobal(), Offset: Op2.getOffset(),
2663 TargetFlags: Op2.getTargetFlags());
2664 I.eraseFromParent();
2665 return constrainSelectedInstRegOperands(I&: *MovAddr, TII, TRI, RBI);
2666 }
2667
2668 case TargetOpcode::G_FCONSTANT:
2669 case TargetOpcode::G_CONSTANT: {
2670 const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
2671
2672 const LLT s8 = LLT::scalar(SizeInBits: 8);
2673 const LLT s16 = LLT::scalar(SizeInBits: 16);
2674 const LLT s32 = LLT::scalar(SizeInBits: 32);
2675 const LLT s64 = LLT::scalar(SizeInBits: 64);
2676 const LLT s128 = LLT::scalar(SizeInBits: 128);
2677 const LLT p0 = LLT::pointer(AddressSpace: 0, SizeInBits: 64);
2678
2679 const Register DefReg = I.getOperand(i: 0).getReg();
2680 const LLT DefTy = MRI.getType(Reg: DefReg);
2681 const unsigned DefSize = DefTy.getSizeInBits();
2682 const RegisterBank &RB = *RBI.getRegBank(Reg: DefReg, MRI, TRI);
2683
2684 // FIXME: Redundant check, but even less readable when factored out.
2685 if (isFP) {
2686 if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) {
2687 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2688 << " constant, expected: " << s16 << " or " << s32
2689 << " or " << s64 << " or " << s128 << '\n');
2690 return false;
2691 }
2692
2693 if (RB.getID() != AArch64::FPRRegBankID) {
2694 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2695 << " constant on bank: " << RB
2696 << ", expected: FPR\n");
2697 return false;
2698 }
2699
// The 0.0 case is covered by tablegen for all sizes except FP128. Reject it
// here so we can be sure tablegen works correctly and isn't rescued by this
// code; the FP128 0.0 case is handled by the constant pool path below.
2704 if (DefSize != 128 && I.getOperand(i: 1).getFPImm()->isExactlyValue(V: 0.0))
2705 return false;
2706 } else {
2707 // s32 and s64 are covered by tablegen.
2708 if (Ty != p0 && Ty != s8 && Ty != s16) {
2709 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2710 << " constant, expected: " << s32 << ", " << s64
2711 << ", or " << p0 << '\n');
2712 return false;
2713 }
2714
2715 if (RB.getID() != AArch64::GPRRegBankID) {
2716 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2717 << " constant on bank: " << RB
2718 << ", expected: GPR\n");
2719 return false;
2720 }
2721 }
2722
2723 if (isFP) {
2724 const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(Ty: DefTy, RB);
// For 16-bit and 128-bit values, and for 32/64-bit values whose FP immediate
// is not legal, emit a constant pool load.
2726 switch (DefSize) {
2727 default:
2728 llvm_unreachable("Unexpected destination size for G_FCONSTANT?");
2729 case 32:
2730 case 64: {
2731 bool OptForSize = shouldOptForSize(MF: &MF);
2732 const auto &TLI = MF.getSubtarget().getTargetLowering();
2733 // If TLI says that this fpimm is illegal, then we'll expand to a
2734 // constant pool load.
2735 if (TLI->isFPImmLegal(I.getOperand(i: 1).getFPImm()->getValueAPF(),
2736 EVT::getFloatingPointVT(BitWidth: DefSize), ForCodeSize: OptForSize))
2737 break;
2738 [[fallthrough]];
2739 }
2740 case 16:
2741 case 128: {
2742 auto *FPImm = I.getOperand(i: 1).getFPImm();
2743 auto *LoadMI = emitLoadFromConstantPool(CPVal: FPImm, MIRBuilder&: MIB);
2744 if (!LoadMI) {
2745 LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
2746 return false;
2747 }
2748 MIB.buildCopy(Res: {DefReg}, Op: {LoadMI->getOperand(i: 0).getReg()});
2749 I.eraseFromParent();
2750 return RBI.constrainGenericRegister(Reg: DefReg, RC: FPRRC, MRI);
2751 }
2752 }
2753
2754 assert((DefSize == 32 || DefSize == 64) && "Unexpected const def size");
2755 // Either emit a FMOV, or emit a copy to emit a normal mov.
2756 const Register DefGPRReg = MRI.createVirtualRegister(
2757 RegClass: DefSize == 32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass);
2758 MachineOperand &RegOp = I.getOperand(i: 0);
2759 RegOp.setReg(DefGPRReg);
2760 MIB.setInsertPt(MBB&: MIB.getMBB(), II: std::next(x: I.getIterator()));
2761 MIB.buildCopy(Res: {DefReg}, Op: {DefGPRReg});
2762
2763 if (!RBI.constrainGenericRegister(Reg: DefReg, RC: FPRRC, MRI)) {
2764 LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
2765 return false;
2766 }
2767
2768 MachineOperand &ImmOp = I.getOperand(i: 1);
2769 // FIXME: Is going through int64_t always correct?
2770 ImmOp.ChangeToImmediate(
2771 ImmVal: ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
2772 } else if (I.getOperand(i: 1).isCImm()) {
2773 uint64_t Val = I.getOperand(i: 1).getCImm()->getZExtValue();
2774 I.getOperand(i: 1).ChangeToImmediate(ImmVal: Val);
2775 } else if (I.getOperand(i: 1).isImm()) {
2776 uint64_t Val = I.getOperand(i: 1).getImm();
2777 I.getOperand(i: 1).ChangeToImmediate(ImmVal: Val);
2778 }
2779
2780 const unsigned MovOpc =
2781 DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
2782 I.setDesc(TII.get(Opcode: MovOpc));
2783 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2784 return true;
2785 }
2786 case TargetOpcode::G_EXTRACT: {
2787 Register DstReg = I.getOperand(i: 0).getReg();
2788 Register SrcReg = I.getOperand(i: 1).getReg();
2789 LLT SrcTy = MRI.getType(Reg: SrcReg);
2790 LLT DstTy = MRI.getType(Reg: DstReg);
2791 (void)DstTy;
2792 unsigned SrcSize = SrcTy.getSizeInBits();
2793
2794 if (SrcTy.getSizeInBits() > 64) {
2795 // This should be an extract of an s128, which is like a vector extract.
2796 if (SrcTy.getSizeInBits() != 128)
2797 return false;
2798 // Only support extracting 64 bits from an s128 at the moment.
2799 if (DstTy.getSizeInBits() != 64)
2800 return false;
2801
2802 unsigned Offset = I.getOperand(i: 2).getImm();
2803 if (Offset % 64 != 0)
2804 return false;
2805
2806 // Check we have the right regbank always.
2807 const RegisterBank &SrcRB = *RBI.getRegBank(Reg: SrcReg, MRI, TRI);
2808 const RegisterBank &DstRB = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
2809 assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!");
2810
2811 if (SrcRB.getID() == AArch64::GPRRegBankID) {
2812 auto NewI =
2813 MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {DstReg}, SrcOps: {})
2814 .addUse(RegNo: SrcReg, Flags: {},
2815 SubReg: Offset == 0 ? AArch64::sube64 : AArch64::subo64);
2816 constrainOperandRegClass(MF, TRI, MRI, TII, RBI, InsertPt&: *NewI,
2817 RegClass: AArch64::GPR64RegClass, RegMO&: NewI->getOperand(i: 0));
2818 I.eraseFromParent();
2819 return true;
2820 }
2821
2822 // Emit the same code as a vector extract.
2823 // Offset must be a multiple of 64.
2824 unsigned LaneIdx = Offset / 64;
2825 MachineInstr *Extract = emitExtractVectorElt(
2826 DstReg, DstRB, ScalarTy: LLT::scalar(SizeInBits: 64), VecReg: SrcReg, LaneIdx, MIRBuilder&: MIB);
2827 if (!Extract)
2828 return false;
2829 I.eraseFromParent();
2830 return true;
2831 }
2832
2833 I.setDesc(TII.get(Opcode: SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
2834 MachineInstrBuilder(MF, I).addImm(Val: I.getOperand(i: 2).getImm() +
2835 Ty.getSizeInBits() - 1);
2836
2837 if (SrcSize < 64) {
2838 assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
2839 "unexpected G_EXTRACT types");
2840 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2841 }
2842
2843 DstReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
2844 MIB.setInsertPt(MBB&: MIB.getMBB(), II: std::next(x: I.getIterator()));
2845 MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {I.getOperand(i: 0).getReg()}, SrcOps: {})
2846 .addReg(RegNo: DstReg, Flags: {}, SubReg: AArch64::sub_32);
2847 RBI.constrainGenericRegister(Reg: I.getOperand(i: 0).getReg(),
2848 RC: AArch64::GPR32RegClass, MRI);
2849 I.getOperand(i: 0).setReg(DstReg);
2850
2851 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2852 }
2853
2854 case TargetOpcode::G_INSERT: {
2855 LLT SrcTy = MRI.getType(Reg: I.getOperand(i: 2).getReg());
2856 LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
2857 unsigned DstSize = DstTy.getSizeInBits();
2858 // Larger inserts are vectors, same-size ones should be something else by
2859 // now (split up or turned into COPYs).
2860 if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
2861 return false;
2862
2863 I.setDesc(TII.get(Opcode: DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
2864 unsigned LSB = I.getOperand(i: 3).getImm();
2865 unsigned Width = MRI.getType(Reg: I.getOperand(i: 2).getReg()).getSizeInBits();
2866 I.getOperand(i: 3).setImm((DstSize - LSB) % DstSize);
2867 MachineInstrBuilder(MF, I).addImm(Val: Width - 1);
2868
2869 if (DstSize < 64) {
2870 assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
2871 "unexpected G_INSERT types");
2872 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2873 }
2874
2875 Register SrcReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
2876 BuildMI(BB&: MBB, I: I.getIterator(), MIMD: I.getDebugLoc(),
2877 MCID: TII.get(Opcode: AArch64::SUBREG_TO_REG))
2878 .addDef(RegNo: SrcReg)
2879 .addImm(Val: 0)
2880 .addUse(RegNo: I.getOperand(i: 2).getReg())
2881 .addImm(Val: AArch64::sub_32);
2882 RBI.constrainGenericRegister(Reg: I.getOperand(i: 2).getReg(),
2883 RC: AArch64::GPR32RegClass, MRI);
2884 I.getOperand(i: 2).setReg(SrcReg);
2885
2886 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2887 }
2888 case TargetOpcode::G_FRAME_INDEX: {
2889 // allocas and G_FRAME_INDEX are only supported in addrspace(0).
2890 if (Ty != LLT::pointer(AddressSpace: 0, SizeInBits: 64)) {
2891 LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
2892 << ", expected: " << LLT::pointer(0, 64) << '\n');
2893 return false;
2894 }
2895 I.setDesc(TII.get(Opcode: AArch64::ADDXri));
2896
2897 // MOs for a #0 shifted immediate.
2898 I.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2899 I.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2900
2901 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2902 }
2903
2904 case TargetOpcode::G_GLOBAL_VALUE: {
2905 const GlobalValue *GV = nullptr;
2906 unsigned OpFlags;
2907 if (I.getOperand(i: 1).isSymbol()) {
2908 OpFlags = I.getOperand(i: 1).getTargetFlags();
2909 // Currently only used by "RtLibUseGOT".
2910 assert(OpFlags == AArch64II::MO_GOT);
2911 } else {
2912 GV = I.getOperand(i: 1).getGlobal();
2913 if (GV->isThreadLocal()) {
2914 // We don't support instructions with emulated TLS variables yet.
2915 if (TM.useEmulatedTLS())
2916 return false;
2917 return selectTLSGlobalValue(I, MRI);
2918 }
2919 OpFlags = STI.ClassifyGlobalReference(GV, TM);
2920 }
2921
2922 if (OpFlags & AArch64II::MO_GOT) {
2923 bool IsGOTSigned = MF.getInfo<AArch64FunctionInfo>()->hasELFSignedGOT();
2924 I.setDesc(TII.get(Opcode: IsGOTSigned ? AArch64::LOADgotAUTH : AArch64::LOADgot));
2925 I.getOperand(i: 1).setTargetFlags(OpFlags);
2926 I.addImplicitDefUseOperands(MF);
2927 } else if (TM.getCodeModel() == CodeModel::Large &&
2928 !TM.isPositionIndependent()) {
2929 // Materialize the global using movz/movk instructions.
2930 materializeLargeCMVal(I, V: GV, OpFlags);
2931 I.eraseFromParent();
2932 return true;
2933 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2934 I.setDesc(TII.get(Opcode: AArch64::ADR));
2935 I.getOperand(i: 1).setTargetFlags(OpFlags);
2936 } else {
2937 I.setDesc(TII.get(Opcode: AArch64::MOVaddr));
2938 I.getOperand(i: 1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
2939 MachineInstrBuilder MIB(MF, I);
2940 MIB.addGlobalAddress(GV, Offset: I.getOperand(i: 1).getOffset(),
2941 TargetFlags: OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
2942 }
2943 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2944 }
2945
2946 case TargetOpcode::G_PTRAUTH_GLOBAL_VALUE:
2947 return selectPtrAuthGlobalValue(I, MRI);
2948
2949 case TargetOpcode::G_ZEXTLOAD:
2950 case TargetOpcode::G_LOAD:
2951 case TargetOpcode::G_STORE: {
2952 GLoadStore &LdSt = cast<GLoadStore>(Val&: I);
2953 bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
2954 LLT PtrTy = MRI.getType(Reg: LdSt.getPointerReg());
2955
2956 // Can only handle AddressSpace 0, 64-bit pointers.
2957 if (PtrTy != LLT::pointer(AddressSpace: 0, SizeInBits: 64)) {
2958 return false;
2959 }
2960
2961 uint64_t MemSizeInBytes = LdSt.getMemSize().getValue();
2962 unsigned MemSizeInBits = LdSt.getMemSizeInBits().getValue();
2963 AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
2964
2965 // Need special instructions for atomics that affect ordering.
2966 if (isStrongerThanMonotonic(AO: Order)) {
2967 assert(!isa<GZExtLoad>(LdSt));
2968 assert(MemSizeInBytes <= 8 &&
2969 "128-bit atomics should already be custom-legalized");
2970
2971 if (isa<GLoad>(Val: LdSt)) {
2972 static constexpr unsigned LDAPROpcodes[] = {
2973 AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX};
2974 static constexpr unsigned LDAROpcodes[] = {
2975 AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX};
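 // Prefer LDAPR (RCpc load-acquire) when the subtarget has it for acquire
 // loads; sequentially consistent loads still need the stronger LDAR.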
2976 ArrayRef<unsigned> Opcodes =
2977 STI.hasRCPC() && Order != AtomicOrdering::SequentiallyConsistent
2978 ? LDAPROpcodes
2979 : LDAROpcodes;
2980 I.setDesc(TII.get(Opcode: Opcodes[Log2_32(Value: MemSizeInBytes)]));
2981 } else {
2982 static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
2983 AArch64::STLRW, AArch64::STLRX};
2984 Register ValReg = LdSt.getReg(Idx: 0);
2985 if (MRI.getType(Reg: ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) {
2986 // Emit a subreg copy of 32 bits.
2987 Register NewVal = MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
2988 MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {NewVal}, SrcOps: {})
2989 .addReg(RegNo: I.getOperand(i: 0).getReg(), Flags: {}, SubReg: AArch64::sub_32);
2990 I.getOperand(i: 0).setReg(NewVal);
2991 }
2992 I.setDesc(TII.get(Opcode: Opcodes[Log2_32(Value: MemSizeInBytes)]));
2993 }
2994 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2995 return true;
2996 }
2997
2998#ifndef NDEBUG
2999 const Register PtrReg = LdSt.getPointerReg();
3000 const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
3001 // Check that the pointer register is valid.
3002 assert(PtrRB.getID() == AArch64::GPRRegBankID &&
3003 "Load/Store pointer operand isn't a GPR");
3004 assert(MRI.getType(PtrReg).isPointer() &&
3005 "Load/Store pointer operand isn't a pointer");
3006#endif
3007
3008 const Register ValReg = LdSt.getReg(Idx: 0);
3009 const RegisterBank &RB = *RBI.getRegBank(Reg: ValReg, MRI, TRI);
3010 LLT ValTy = MRI.getType(Reg: ValReg);
3011
3012 // The code below doesn't support truncating stores, so if the value is
3013 // wider than the memory size, narrow it with a subregister copy first.
3014 if (isa<GStore>(Val: LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
3015 unsigned SubReg;
3016 LLT MemTy = LdSt.getMMO().getMemoryType();
3017 auto *RC = getRegClassForTypeOnBank(Ty: MemTy, RB);
3018 if (!getSubRegForClass(RC, TRI, SubReg))
3019 return false;
3020
3021 // Generate a subreg copy.
3022 auto Copy = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {MemTy}, SrcOps: {})
3023 .addReg(RegNo: ValReg, Flags: {}, SubReg)
3024 .getReg(Idx: 0);
3025 RBI.constrainGenericRegister(Reg: Copy, RC: *RC, MRI);
3026 LdSt.getOperand(i: 0).setReg(Copy);
3027 } else if (isa<GLoad>(Val: LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
3028 // If this is an any-extending load from the FPR bank, split it into a
3029 // regular load + extend.
3030 if (RB.getID() == AArch64::FPRRegBankID) {
3031 unsigned SubReg;
3032 LLT MemTy = LdSt.getMMO().getMemoryType();
3033 auto *RC = getRegClassForTypeOnBank(Ty: MemTy, RB);
3034 if (!getSubRegForClass(RC, TRI, SubReg))
3035 return false;
3036 Register OldDst = LdSt.getReg(Idx: 0);
3037 Register NewDst =
3038 MRI.createGenericVirtualRegister(Ty: LdSt.getMMO().getMemoryType());
3039 LdSt.getOperand(i: 0).setReg(NewDst);
3040 MRI.setRegBank(Reg: NewDst, RegBank: RB);
3041 // Generate a SUBREG_TO_REG to extend it.
3042 MIB.setInsertPt(MBB&: MIB.getMBB(), II: std::next(x: LdSt.getIterator()));
3043 MIB.buildInstr(Opc: AArch64::SUBREG_TO_REG, DstOps: {OldDst}, SrcOps: {})
3044 .addImm(Val: 0)
3045 .addUse(RegNo: NewDst)
3046 .addImm(Val: SubReg);
3047 auto SubRegRC = getRegClassForTypeOnBank(Ty: MRI.getType(Reg: OldDst), RB);
3048 RBI.constrainGenericRegister(Reg: OldDst, RC: *SubRegRC, MRI);
3049 MIB.setInstr(LdSt);
3050 ValTy = MemTy; // This is no longer an extending load.
3051 }
3052 }
3053
3054 // Helper lambda for partially selecting I. Either returns the original
3055 // instruction with an updated opcode, or a new instruction.
3056 auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
3057 bool IsStore = isa<GStore>(Val: I);
3058 const unsigned NewOpc =
3059 selectLoadStoreUIOp(GenericOpc: I.getOpcode(), RegBankID: RB.getID(), OpSize: MemSizeInBits);
3060 if (NewOpc == I.getOpcode())
3061 return nullptr;
3062 // Check if we can fold anything into the addressing mode.
3063 auto AddrModeFns =
3064 selectAddrModeIndexed(Root&: I.getOperand(i: 1), Size: MemSizeInBytes);
3065 if (!AddrModeFns) {
3066 // Can't fold anything. Use the original instruction.
3067 I.setDesc(TII.get(Opcode: NewOpc));
3068 I.addOperand(Op: MachineOperand::CreateImm(Val: 0));
3069 return &I;
3070 }
3071
3072 // Folded something. Create a new instruction and return it.
3073 auto NewInst = MIB.buildInstr(Opc: NewOpc, DstOps: {}, SrcOps: {}, Flags: I.getFlags());
3074 Register CurValReg = I.getOperand(i: 0).getReg();
3075 IsStore ? NewInst.addUse(RegNo: CurValReg) : NewInst.addDef(RegNo: CurValReg);
3076 NewInst.cloneMemRefs(OtherMI: I);
3077 for (auto &Fn : *AddrModeFns)
3078 Fn(NewInst);
3079 I.eraseFromParent();
3080 return &*NewInst;
3081 };
3082
3083 MachineInstr *LoadStore = SelectLoadStoreAddressingMode();
3084 if (!LoadStore)
3085 return false;
3086
3087 // If we're storing a 0, use WZR/XZR.
3088 if (Opcode == TargetOpcode::G_STORE) {
3089 auto CVal = getIConstantVRegValWithLookThrough(
3090 VReg: LoadStore->getOperand(i: 0).getReg(), MRI);
3091 if (CVal && CVal->Value == 0) {
3092 switch (LoadStore->getOpcode()) {
3093 case AArch64::STRWui:
3094 case AArch64::STRHHui:
3095 case AArch64::STRBBui:
3096 LoadStore->getOperand(i: 0).setReg(AArch64::WZR);
3097 break;
3098 case AArch64::STRXui:
3099 LoadStore->getOperand(i: 0).setReg(AArch64::XZR);
3100 break;
3101 }
3102 }
3103 }
3104
3105 if (IsZExtLoad || (Opcode == TargetOpcode::G_LOAD &&
3106 ValTy == LLT::scalar(SizeInBits: 64) && MemSizeInBits == 32)) {
3107 // The any/zextload from a smaller type to i32 should be handled by the
3108 // importer.
3109 if (MRI.getType(Reg: LoadStore->getOperand(i: 0).getReg()).getSizeInBits() != 64)
3110 return false;
3111 // If we have an extending load then change the load's type to be a
3112 // narrower reg and zero_extend with SUBREG_TO_REG.
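 // A 32-bit load to a W register already zeroes bits [63:32] of the
 // corresponding X register, so the SUBREG_TO_REG below is sufficient and no
 // explicit extend instruction is needed. Illustrative MIR (hypothetical
 // vregs; the actual load opcode depends on the addressing mode chosen
 // above):
 //   %ld:gpr32 = LDRWui %ptr, 0 :: (load (s32))
 //   %dst:gpr64all = SUBREG_TO_REG 0, %ld, %subreg.sub_32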
3113 Register LdReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
3114 Register DstReg = LoadStore->getOperand(i: 0).getReg();
3115 LoadStore->getOperand(i: 0).setReg(LdReg);
3116
3117 MIB.setInsertPt(MBB&: MIB.getMBB(), II: std::next(x: LoadStore->getIterator()));
3118 MIB.buildInstr(Opc: AArch64::SUBREG_TO_REG, DstOps: {DstReg}, SrcOps: {})
3119 .addImm(Val: 0)
3120 .addUse(RegNo: LdReg)
3121 .addImm(Val: AArch64::sub_32);
3122 constrainSelectedInstRegOperands(I&: *LoadStore, TII, TRI, RBI);
3123 return RBI.constrainGenericRegister(Reg: DstReg, RC: AArch64::GPR64allRegClass,
3124 MRI);
3125 }
3126 return constrainSelectedInstRegOperands(I&: *LoadStore, TII, TRI, RBI);
3127 }
3128
3129 case TargetOpcode::G_INDEXED_ZEXTLOAD:
3130 case TargetOpcode::G_INDEXED_SEXTLOAD:
3131 return selectIndexedExtLoad(I, MRI);
3132 case TargetOpcode::G_INDEXED_LOAD:
3133 return selectIndexedLoad(I, MRI);
3134 case TargetOpcode::G_INDEXED_STORE:
3135 return selectIndexedStore(I&: cast<GIndexedStore>(Val&: I), MRI);
3136
3137 case TargetOpcode::G_LSHR:
3138 case TargetOpcode::G_ASHR:
3139 if (MRI.getType(Reg: I.getOperand(i: 0).getReg()).isVector())
3140 return selectVectorAshrLshr(I, MRI);
3141 [[fallthrough]];
3142 case TargetOpcode::G_SHL:
3143 if (Opcode == TargetOpcode::G_SHL &&
3144 MRI.getType(Reg: I.getOperand(i: 0).getReg()).isVector())
3145 return selectVectorSHL(I, MRI);
3146
3147 // These shifts were legalized to have 64-bit shift amounts because we
3148 // want to take advantage of the selection patterns that assume the
3149 // immediates are s64s. However, selectBinaryOp will assume both operands
3150 // have the same bit size.
3151 {
3152 Register SrcReg = I.getOperand(i: 1).getReg();
3153 Register ShiftReg = I.getOperand(i: 2).getReg();
3154 const LLT ShiftTy = MRI.getType(Reg: ShiftReg);
3155 const LLT SrcTy = MRI.getType(Reg: SrcReg);
3156 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
3157 ShiftTy.getSizeInBits() == 64) {
3158 assert(!ShiftTy.isVector() && "unexpected vector shift ty");
3159 // Insert a subregister copy to implement a 64->32 trunc
3160 auto Trunc = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {SrcTy}, SrcOps: {})
3161 .addReg(RegNo: ShiftReg, Flags: {}, SubReg: AArch64::sub_32);
3162 MRI.setRegBank(Reg: Trunc.getReg(Idx: 0), RegBank: RBI.getRegBank(ID: AArch64::GPRRegBankID));
3163 I.getOperand(i: 2).setReg(Trunc.getReg(Idx: 0));
3164 }
3165 }
3166 [[fallthrough]];
3167 case TargetOpcode::G_OR: {
3168 // Reject the various things we don't support yet.
3169 if (unsupportedBinOp(I, RBI, MRI, TRI))
3170 return false;
3171
3172 const unsigned OpSize = Ty.getSizeInBits();
3173
3174 const Register DefReg = I.getOperand(i: 0).getReg();
3175 const RegisterBank &RB = *RBI.getRegBank(Reg: DefReg, MRI, TRI);
3176
3177 const unsigned NewOpc = selectBinaryOp(GenericOpc: I.getOpcode(), RegBankID: RB.getID(), OpSize);
3178 if (NewOpc == I.getOpcode())
3179 return false;
3180
3181 I.setDesc(TII.get(Opcode: NewOpc));
3182 // FIXME: Should the type always be reset in setDesc?
3183
3184 // Now that we selected an opcode, we need to constrain the register
3185 // operands to use appropriate classes.
3186 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3187 }
3188
3189 case TargetOpcode::G_PTR_ADD: {
3190 emitADD(DefReg: I.getOperand(i: 0).getReg(), LHS&: I.getOperand(i: 1), RHS&: I.getOperand(i: 2), MIRBuilder&: MIB);
3191 I.eraseFromParent();
3192 return true;
3193 }
3194
3195 case TargetOpcode::G_SADDE:
3196 case TargetOpcode::G_UADDE:
3197 case TargetOpcode::G_SSUBE:
3198 case TargetOpcode::G_USUBE:
3199 case TargetOpcode::G_SADDO:
3200 case TargetOpcode::G_UADDO:
3201 case TargetOpcode::G_SSUBO:
3202 case TargetOpcode::G_USUBO:
3203 return selectOverflowOp(I, MRI);
3204
3205 case TargetOpcode::G_PTRMASK: {
3206 Register MaskReg = I.getOperand(i: 2).getReg();
3207 std::optional<int64_t> MaskVal = getIConstantVRegSExtVal(VReg: MaskReg, MRI);
3208 // TODO: Implement arbitrary cases
3209 if (!MaskVal || !isShiftedMask_64(Value: *MaskVal))
3210 return false;
3211
3212 uint64_t Mask = *MaskVal;
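 // A contiguous (possibly shifted) mask is representable as an AArch64
 // logical immediate, so the whole G_PTRMASK becomes a single ANDXri with
 // the N:immr:imms encoding produced by encodeLogicalImmediate.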
3213 I.setDesc(TII.get(Opcode: AArch64::ANDXri));
3214 I.getOperand(i: 2).ChangeToImmediate(
3215 ImmVal: AArch64_AM::encodeLogicalImmediate(imm: Mask, regSize: 64));
3216
3217 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3218 }
3219 case TargetOpcode::G_PTRTOINT:
3220 case TargetOpcode::G_TRUNC: {
3221 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
3222 const LLT SrcTy = MRI.getType(Reg: I.getOperand(i: 1).getReg());
3223
3224 const Register DstReg = I.getOperand(i: 0).getReg();
3225 const Register SrcReg = I.getOperand(i: 1).getReg();
3226
3227 const RegisterBank &DstRB = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
3228 const RegisterBank &SrcRB = *RBI.getRegBank(Reg: SrcReg, MRI, TRI);
3229
3230 if (DstRB.getID() != SrcRB.getID()) {
3231 LLVM_DEBUG(
3232 dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
3233 return false;
3234 }
3235
3236 if (DstRB.getID() == AArch64::GPRRegBankID) {
3237 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(Ty: DstTy, RB: DstRB);
3238 if (!DstRC)
3239 return false;
3240
3241 const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(Ty: SrcTy, RB: SrcRB);
3242 if (!SrcRC)
3243 return false;
3244
3245 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI) ||
3246 !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI)) {
3247 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
3248 return false;
3249 }
3250
3251 if (DstRC == SrcRC) {
3252 // Nothing to be done
3253 } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(SizeInBits: 32) &&
3254 SrcTy == LLT::scalar(SizeInBits: 64)) {
3255 llvm_unreachable("TableGen can import this case");
3256 return false;
3257 } else if (DstRC == &AArch64::GPR32RegClass &&
3258 SrcRC == &AArch64::GPR64RegClass) {
3259 I.getOperand(i: 1).setSubReg(AArch64::sub_32);
3260 } else {
3261 LLVM_DEBUG(
3262 dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
3263 return false;
3264 }
3265
3266 I.setDesc(TII.get(Opcode: TargetOpcode::COPY));
3267 return true;
3268 } else if (DstRB.getID() == AArch64::FPRRegBankID) {
3269 if (DstTy == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16) &&
3270 SrcTy == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32)) {
3271 I.setDesc(TII.get(Opcode: AArch64::XTNv4i16));
3272 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3273 return true;
3274 }
3275
3276 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
3277 MachineInstr *Extract = emitExtractVectorElt(
3278 DstReg, DstRB, ScalarTy: LLT::scalar(SizeInBits: DstTy.getSizeInBits()), VecReg: SrcReg, LaneIdx: 0, MIRBuilder&: MIB);
3279 if (!Extract)
3280 return false;
3281 I.eraseFromParent();
3282 return true;
3283 }
3284
3285 // We might have a vector G_PTRTOINT, in which case just emit a COPY.
3286 if (Opcode == TargetOpcode::G_PTRTOINT) {
3287 assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
3288 I.setDesc(TII.get(Opcode: TargetOpcode::COPY));
3289 return selectCopy(I, TII, MRI, TRI, RBI);
3290 }
3291 }
3292
3293 return false;
3294 }
3295
3296 case TargetOpcode::G_ANYEXT: {
3297 if (selectUSMovFromExtend(I, MRI))
3298 return true;
3299
3300 const Register DstReg = I.getOperand(i: 0).getReg();
3301 const Register SrcReg = I.getOperand(i: 1).getReg();
3302
3303 const RegisterBank &RBDst = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
3304 if (RBDst.getID() != AArch64::GPRRegBankID) {
3305 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
3306 << ", expected: GPR\n");
3307 return false;
3308 }
3309
3310 const RegisterBank &RBSrc = *RBI.getRegBank(Reg: SrcReg, MRI, TRI);
3311 if (RBSrc.getID() != AArch64::GPRRegBankID) {
3312 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
3313 << ", expected: GPR\n");
3314 return false;
3315 }
3316
3317 const unsigned DstSize = MRI.getType(Reg: DstReg).getSizeInBits();
3318
3319 if (DstSize == 0) {
3320 LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
3321 return false;
3322 }
3323
3324 if (DstSize != 64 && DstSize > 32) {
3325 LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
3326 << ", expected: 32 or 64\n");
3327 return false;
3328 }
3329 // At this point G_ANYEXT is just like a plain COPY, but we may need
3330 // to explicitly form the 64-bit value first.
3331 if (DstSize > 32) {
3332 Register ExtSrc = MRI.createVirtualRegister(RegClass: &AArch64::GPR64allRegClass);
3333 BuildMI(BB&: MBB, I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AArch64::SUBREG_TO_REG))
3334 .addDef(RegNo: ExtSrc)
3335 .addImm(Val: 0)
3336 .addUse(RegNo: SrcReg)
3337 .addImm(Val: AArch64::sub_32);
3338 I.getOperand(i: 1).setReg(ExtSrc);
3339 }
3340 return selectCopy(I, TII, MRI, TRI, RBI);
3341 }
3342
3343 case TargetOpcode::G_ZEXT:
3344 case TargetOpcode::G_SEXT_INREG:
3345 case TargetOpcode::G_SEXT: {
3346 if (selectUSMovFromExtend(I, MRI))
3347 return true;
3348
3349 unsigned Opcode = I.getOpcode();
3350 const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
3351 const Register DefReg = I.getOperand(i: 0).getReg();
3352 Register SrcReg = I.getOperand(i: 1).getReg();
3353 const LLT DstTy = MRI.getType(Reg: DefReg);
3354 const LLT SrcTy = MRI.getType(Reg: SrcReg);
3355 unsigned DstSize = DstTy.getSizeInBits();
3356 unsigned SrcSize = SrcTy.getSizeInBits();
3357
3358 // SEXT_INREG has the same src reg size as the dst; the size of the value
3359 // to be extended is encoded in the imm.
3360 if (Opcode == TargetOpcode::G_SEXT_INREG)
3361 SrcSize = I.getOperand(i: 2).getImm();
3362
3363 if (DstTy.isVector())
3364 return false; // Should be handled by imported patterns.
3365
3366 assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
3367 AArch64::GPRRegBankID &&
3368 "Unexpected ext regbank");
3369
3370 MachineInstr *ExtI;
3371
3372 // First check if we're extending the result of a load with a dest type
3373 // smaller than 32 bits; in that case this zext is redundant. GPR32 is the
3374 // smallest GPR register on AArch64, and all loads which are smaller
3375 // automatically zero-extend the upper bits. E.g.
3376 // %v(s8) = G_LOAD %p, :: (load 1)
3377 // %v2(s32) = G_ZEXT %v(s8)
3378 if (!IsSigned) {
3379 auto *LoadMI = getOpcodeDef(Opcode: TargetOpcode::G_LOAD, Reg: SrcReg, MRI);
3380 bool IsGPR =
3381 RBI.getRegBank(Reg: SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
3382 if (LoadMI && IsGPR) {
3383 const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
3384 unsigned BytesLoaded = MemOp->getSize().getValue();
3385 if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
3386 return selectCopy(I, TII, MRI, TRI, RBI);
3387 }
3388
3389 // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs)
3390 // + SUBREG_TO_REG.
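 // Illustrative expansion (vreg numbers are hypothetical):
 //   %tmp:gpr32 = ORRWrs $wzr, %src, 0      ; 32-bit mov, zeroes bits 63:32
 //   %dst:gpr64 = SUBREG_TO_REG 0, %tmp, %subreg.sub_32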
3391 if (IsGPR && SrcSize == 32 && DstSize == 64) {
3392 Register SubregToRegSrc =
3393 MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
3394 const Register ZReg = AArch64::WZR;
3395 MIB.buildInstr(Opc: AArch64::ORRWrs, DstOps: {SubregToRegSrc}, SrcOps: {ZReg, SrcReg})
3396 .addImm(Val: 0);
3397
3398 MIB.buildInstr(Opc: AArch64::SUBREG_TO_REG, DstOps: {DefReg}, SrcOps: {})
3399 .addImm(Val: 0)
3400 .addUse(RegNo: SubregToRegSrc)
3401 .addImm(Val: AArch64::sub_32);
3402
3403 if (!RBI.constrainGenericRegister(Reg: DefReg, RC: AArch64::GPR64RegClass,
3404 MRI)) {
3405 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
3406 return false;
3407 }
3408
3409 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: AArch64::GPR32RegClass,
3410 MRI)) {
3411 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
3412 return false;
3413 }
3414
3415 I.eraseFromParent();
3416 return true;
3417 }
3418 }
3419
3420 if (DstSize == 64) {
3421 if (Opcode != TargetOpcode::G_SEXT_INREG) {
3422 // FIXME: Can we avoid manually doing this?
3423 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: AArch64::GPR32RegClass,
3424 MRI)) {
3425 LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
3426 << " operand\n");
3427 return false;
3428 }
3429 SrcReg = MIB.buildInstr(Opc: AArch64::SUBREG_TO_REG,
3430 DstOps: {&AArch64::GPR64RegClass}, SrcOps: {})
3431 .addImm(Val: 0)
3432 .addUse(RegNo: SrcReg)
3433 .addImm(Val: AArch64::sub_32)
3434 .getReg(Idx: 0);
3435 }
3436
3437 ExtI = MIB.buildInstr(Opc: IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
3438 DstOps: {DefReg}, SrcOps: {SrcReg})
3439 .addImm(Val: 0)
3440 .addImm(Val: SrcSize - 1);
3441 } else if (DstSize <= 32) {
3442 ExtI = MIB.buildInstr(Opc: IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
3443 DstOps: {DefReg}, SrcOps: {SrcReg})
3444 .addImm(Val: 0)
3445 .addImm(Val: SrcSize - 1);
3446 } else {
3447 return false;
3448 }
3449
3450 constrainSelectedInstRegOperands(I&: *ExtI, TII, TRI, RBI);
3451 I.eraseFromParent();
3452 return true;
3453 }
3454
3455 case TargetOpcode::G_FREEZE:
3456 return selectCopy(I, TII, MRI, TRI, RBI);
3457
3458 case TargetOpcode::G_INTTOPTR:
3459 // The importer is currently unable to import pointer types since they
3460 // didn't exist in SelectionDAG.
3461 return selectCopy(I, TII, MRI, TRI, RBI);
3462
3463 case TargetOpcode::G_BITCAST:
3464 // Imported SelectionDAG rules can handle every bitcast except those that
3465 // bitcast from a type to the same type. Ideally, these shouldn't occur
3466 // but we might not run an optimizer that deletes them. The other exception
3467 // is bitcasts involving pointer types, as SelectionDAG has no knowledge
3468 // of them.
3469 return selectCopy(I, TII, MRI, TRI, RBI);
3470
3471 case TargetOpcode::G_SELECT: {
3472 auto &Sel = cast<GSelect>(Val&: I);
3473 const Register CondReg = Sel.getCondReg();
3474 const Register TReg = Sel.getTrueReg();
3475 const Register FReg = Sel.getFalseReg();
3476
3477 if (tryOptSelect(Sel))
3478 return true;
3479
3480 // Make sure to use an unused vreg instead of wzr, so that the peephole
3481 // optimizer can still optimize these later.
3482 Register DeadVReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
3483 auto TstMI = MIB.buildInstr(Opc: AArch64::ANDSWri, DstOps: {DeadVReg}, SrcOps: {CondReg})
3484 .addImm(Val: AArch64_AM::encodeLogicalImmediate(imm: 1, regSize: 32));
3485 constrainSelectedInstRegOperands(I&: *TstMI, TII, TRI, RBI);
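 // The ANDS with #1 sets Z iff the low bit of the condition is clear, so
 // selecting on AArch64CC::NE below picks the true value exactly when the
 // i1 condition is set.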
3486 if (!emitSelect(Dst: Sel.getReg(Idx: 0), True: TReg, False: FReg, CC: AArch64CC::NE, MIB))
3487 return false;
3488 Sel.eraseFromParent();
3489 return true;
3490 }
3491 case TargetOpcode::G_ICMP: {
3492 if (Ty.isVector())
3493 return false;
3494
3495 if (Ty != LLT::scalar(SizeInBits: 32)) {
3496 LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
3497 << ", expected: " << LLT::scalar(32) << '\n');
3498 return false;
3499 }
3500
3501 auto &PredOp = I.getOperand(i: 1);
3502 emitIntegerCompare(LHS&: I.getOperand(i: 2), RHS&: I.getOperand(i: 3), Predicate&: PredOp, MIRBuilder&: MIB);
3503 auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
3504 const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(
3505 P: CmpInst::getInversePredicate(pred: Pred), RHS: I.getOperand(i: 3).getReg(), MRI: &MRI);
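 // Materialize the i1 result as a CSINC with both sources WZR and the
 // *inverted* condition: "csinc wd, wzr, wzr, inv(cc)" is the expansion of
 // "cset wd, cc", so the destination is 1 when the original predicate holds
 // and 0 otherwise.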
3506 emitCSINC(/*Dst=*/I.getOperand(i: 0).getReg(), /*Src1=*/AArch64::WZR,
3507 /*Src2=*/AArch64::WZR, Pred: InvCC, MIRBuilder&: MIB);
3508 I.eraseFromParent();
3509 return true;
3510 }
3511
3512 case TargetOpcode::G_FCMP: {
3513 CmpInst::Predicate Pred =
3514 static_cast<CmpInst::Predicate>(I.getOperand(i: 1).getPredicate());
3515 if (!emitFPCompare(LHS: I.getOperand(i: 2).getReg(), RHS: I.getOperand(i: 3).getReg(), MIRBuilder&: MIB,
3516 Pred) ||
3517 !emitCSetForFCmp(Dst: I.getOperand(i: 0).getReg(), Pred, MIRBuilder&: MIB))
3518 return false;
3519 I.eraseFromParent();
3520 return true;
3521 }
3522 case TargetOpcode::G_VASTART:
3523 return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
3524 : selectVaStartAAPCS(I, MF, MRI);
3525 case TargetOpcode::G_INTRINSIC:
3526 return selectIntrinsic(I, MRI);
3527 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3528 return selectIntrinsicWithSideEffects(I, MRI);
3529 case TargetOpcode::G_IMPLICIT_DEF: {
3530 I.setDesc(TII.get(Opcode: TargetOpcode::IMPLICIT_DEF));
3531 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
3532 const Register DstReg = I.getOperand(i: 0).getReg();
3533 const RegisterBank &DstRB = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
3534 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(Ty: DstTy, RB: DstRB);
3535 RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI);
3536 return true;
3537 }
3538 case TargetOpcode::G_BLOCK_ADDR: {
3539 Function *BAFn = I.getOperand(i: 1).getBlockAddress()->getFunction();
3540 if (std::optional<uint16_t> BADisc =
3541 STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(ParentFn: *BAFn)) {
3542 MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {AArch64::X16}, SrcOps: {});
3543 MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {AArch64::X17}, SrcOps: {});
3544 MIB.buildInstr(Opcode: AArch64::MOVaddrPAC)
3545 .addBlockAddress(BA: I.getOperand(i: 1).getBlockAddress())
3546 .addImm(Val: AArch64PACKey::IA)
3547 .addReg(/*AddrDisc=*/RegNo: AArch64::XZR)
3548 .addImm(Val: *BADisc)
3549 .constrainAllUses(TII, TRI, RBI);
3550 MIB.buildCopy(Res: I.getOperand(i: 0).getReg(), Op: Register(AArch64::X16));
3551 RBI.constrainGenericRegister(Reg: I.getOperand(i: 0).getReg(),
3552 RC: AArch64::GPR64RegClass, MRI);
3553 I.eraseFromParent();
3554 return true;
3555 }
3556 if (TM.getCodeModel() == CodeModel::Large && !TM.isPositionIndependent()) {
3557 materializeLargeCMVal(I, V: I.getOperand(i: 1).getBlockAddress(), OpFlags: 0);
3558 I.eraseFromParent();
3559 return true;
3560 } else {
3561 I.setDesc(TII.get(Opcode: AArch64::MOVaddrBA));
3562 auto MovMI = BuildMI(BB&: MBB, I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AArch64::MOVaddrBA),
3563 DestReg: I.getOperand(i: 0).getReg())
3564 .addBlockAddress(BA: I.getOperand(i: 1).getBlockAddress(),
3565 /* Offset */ 0, TargetFlags: AArch64II::MO_PAGE)
3566 .addBlockAddress(
3567 BA: I.getOperand(i: 1).getBlockAddress(), /* Offset */ 0,
3568 TargetFlags: AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3569 I.eraseFromParent();
3570 return constrainSelectedInstRegOperands(I&: *MovMI, TII, TRI, RBI);
3571 }
3572 }
3573 case AArch64::G_DUP: {
3574 // When the scalar operand of G_DUP is an s8/s16 gpr, it can't be selected
3575 // by the imported patterns, so do it manually here. Avoiding the generation
3576 // of an s16 gpr is difficult because, at RegBankSelect, adding an anyextend
3577 // to fix this may end up pessimizing the fpr case. Manual selection is the
3578 // most robust solution for now.
3579 if (RBI.getRegBank(Reg: I.getOperand(i: 1).getReg(), MRI, TRI)->getID() !=
3580 AArch64::GPRRegBankID)
3581 return false; // We expect the fpr regbank case to be imported.
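 // DUPv*gpr broadcasts a W-register scalar into every lane; only the 8- and
 // 16-bit element forms need this manual path, wider elements are handled by
 // the imported patterns.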
3582 LLT VecTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
3583 if (VecTy == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 8))
3584 I.setDesc(TII.get(Opcode: AArch64::DUPv8i8gpr));
3585 else if (VecTy == LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 8))
3586 I.setDesc(TII.get(Opcode: AArch64::DUPv16i8gpr));
3587 else if (VecTy == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16))
3588 I.setDesc(TII.get(Opcode: AArch64::DUPv4i16gpr));
3589 else if (VecTy == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16))
3590 I.setDesc(TII.get(Opcode: AArch64::DUPv8i16gpr));
3591 else
3592 return false;
3593 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3594 }
3595 case TargetOpcode::G_BUILD_VECTOR:
3596 return selectBuildVector(I, MRI);
3597 case TargetOpcode::G_MERGE_VALUES:
3598 return selectMergeValues(I, MRI);
3599 case TargetOpcode::G_UNMERGE_VALUES:
3600 return selectUnmergeValues(I, MRI);
3601 case TargetOpcode::G_SHUFFLE_VECTOR:
3602 return selectShuffleVector(I, MRI);
3603 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3604 return selectExtractElt(I, MRI);
3605 case TargetOpcode::G_CONCAT_VECTORS:
3606 return selectConcatVectors(I, MRI);
3607 case TargetOpcode::G_JUMP_TABLE:
3608 return selectJumpTable(I, MRI);
3609 case TargetOpcode::G_MEMCPY:
3610 case TargetOpcode::G_MEMCPY_INLINE:
3611 case TargetOpcode::G_MEMMOVE:
3612 case TargetOpcode::G_MEMSET:
3613 assert(STI.hasMOPS() && "Shouldn't get here without +mops feature");
3614 return selectMOPS(I, MRI);
3615 }
3616
3617 return false;
3618}
3619
3620bool AArch64InstructionSelector::selectAndRestoreState(MachineInstr &I) {
3621 MachineIRBuilderState OldMIBState = MIB.getState();
3622 bool Success = select(I);
3623 MIB.setState(OldMIBState);
3624 return Success;
3625}
3626
3627bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI,
3628 MachineRegisterInfo &MRI) {
3629 unsigned Mopcode;
3630 switch (GI.getOpcode()) {
3631 case TargetOpcode::G_MEMCPY:
3632 case TargetOpcode::G_MEMCPY_INLINE:
3633 Mopcode = AArch64::MOPSMemoryCopyPseudo;
3634 break;
3635 case TargetOpcode::G_MEMMOVE:
3636 Mopcode = AArch64::MOPSMemoryMovePseudo;
3637 break;
3638 case TargetOpcode::G_MEMSET:
3639 // For tagged memset, see llvm.aarch64.mops.memset.tag.
3640 Mopcode = AArch64::MOPSMemorySetPseudo;
3641 break;
3642 }
3643
3644 auto &DstPtr = GI.getOperand(i: 0);
3645 auto &SrcOrVal = GI.getOperand(i: 1);
3646 auto &Size = GI.getOperand(i: 2);
3647
3648 // Create copies of the registers that can be clobbered.
3649 const Register DstPtrCopy = MRI.cloneVirtualRegister(VReg: DstPtr.getReg());
3650 const Register SrcValCopy = MRI.cloneVirtualRegister(VReg: SrcOrVal.getReg());
3651 const Register SizeCopy = MRI.cloneVirtualRegister(VReg: Size.getReg());
3652
3653 const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo;
3654 const auto &SrcValRegClass =
3655 IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass;
3656
3657 // Constrain to specific registers
3658 RBI.constrainGenericRegister(Reg: DstPtrCopy, RC: AArch64::GPR64commonRegClass, MRI);
3659 RBI.constrainGenericRegister(Reg: SrcValCopy, RC: SrcValRegClass, MRI);
3660 RBI.constrainGenericRegister(Reg: SizeCopy, RC: AArch64::GPR64RegClass, MRI);
3661
3662 MIB.buildCopy(Res: DstPtrCopy, Op: DstPtr);
3663 MIB.buildCopy(Res: SrcValCopy, Op: SrcOrVal);
3664 MIB.buildCopy(Res: SizeCopy, Op: Size);
3665
3666 // The new instruction uses the copied registers because it must update
3667 // them. The defs are not used since they don't exist in G_MEM*, but they
3668 // are still tied to the corresponding uses.
3669 // Note: the operand order differs from that of G_MEMSET/G_MEMCPY/G_MEMMOVE.
3670 Register DefDstPtr = MRI.createVirtualRegister(RegClass: &AArch64::GPR64commonRegClass);
3671 Register DefSize = MRI.createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
3672 if (IsSet) {
3673 MIB.buildInstr(Opc: Mopcode, DstOps: {DefDstPtr, DefSize},
3674 SrcOps: {DstPtrCopy, SizeCopy, SrcValCopy});
3675 } else {
3676 Register DefSrcPtr = MRI.createVirtualRegister(RegClass: &SrcValRegClass);
3677 MIB.buildInstr(Opc: Mopcode, DstOps: {DefDstPtr, DefSrcPtr, DefSize},
3678 SrcOps: {DstPtrCopy, SrcValCopy, SizeCopy});
3679 }
3680
3681 GI.eraseFromParent();
3682 return true;
3683}
3684
3685bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
3686 MachineRegisterInfo &MRI) {
3687 assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
3688 Register JTAddr = I.getOperand(i: 0).getReg();
3689 unsigned JTI = I.getOperand(i: 1).getIndex();
3690 Register Index = I.getOperand(i: 2).getReg();
3691
3692 MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(Idx: JTI, Size: 4, PCRelSym: nullptr);
3693
3694 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
3695 // sequence later, to guarantee the integrity of the intermediate values.
3696 if (MF->getFunction().hasFnAttribute(Kind: "aarch64-jump-table-hardening")) {
3697 CodeModel::Model CM = TM.getCodeModel();
3698 if (STI.isTargetMachO()) {
3699 if (CM != CodeModel::Small && CM != CodeModel::Large)
3700 report_fatal_error(reason: "Unsupported code-model for hardened jump-table");
3701 } else {
3702 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
3703 assert(STI.isTargetELF() &&
3704 "jump table hardening only supported on MachO/ELF");
3705 if (CM != CodeModel::Small)
3706 report_fatal_error(reason: "Unsupported code-model for hardened jump-table");
3707 }
3708
3709 MIB.buildCopy(Res: {AArch64::X16}, Op: I.getOperand(i: 2).getReg());
3710 MIB.buildInstr(Opcode: AArch64::BR_JumpTable)
3711 .addJumpTableIndex(Idx: I.getOperand(i: 1).getIndex());
3712 I.eraseFromParent();
3713 return true;
3714 }
3715
3716 Register TargetReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
3717 Register ScratchReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR64spRegClass);
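 // JumpTableDest32 is a pseudo expanded after register allocation: it loads
 // the 4-byte, table-relative entry selected by Index and adds it to the
 // table base to form the branch target, using ScratchReg for the
 // intermediate arithmetic.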
3718
3719 auto JumpTableInst = MIB.buildInstr(Opc: AArch64::JumpTableDest32,
3720 DstOps: {TargetReg, ScratchReg}, SrcOps: {JTAddr, Index})
3721 .addJumpTableIndex(Idx: JTI);
3722 // Save the jump table info.
3723 MIB.buildInstr(Opc: TargetOpcode::JUMP_TABLE_DEBUG_INFO, DstOps: {},
3724 SrcOps: {static_cast<int64_t>(JTI)});
3725 // Build the indirect branch.
3726 MIB.buildInstr(Opc: AArch64::BR, DstOps: {}, SrcOps: {TargetReg});
3727 I.eraseFromParent();
3728 return constrainSelectedInstRegOperands(I&: *JumpTableInst, TII, TRI, RBI);
3729}
3730
3731bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I,
3732 MachineRegisterInfo &MRI) {
3733 assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
3734 assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
3735
3736 Register DstReg = I.getOperand(i: 0).getReg();
3737 unsigned JTI = I.getOperand(i: 1).getIndex();
3738 // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
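 // i.e. roughly (label name is illustrative):
 //   adrp xN, .LJTI0_0
 //   add  xN, xN, :lo12:.LJTI0_0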
3739 auto MovMI =
3740 MIB.buildInstr(Opc: AArch64::MOVaddrJT, DstOps: {DstReg}, SrcOps: {})
3741 .addJumpTableIndex(Idx: JTI, TargetFlags: AArch64II::MO_PAGE)
3742 .addJumpTableIndex(Idx: JTI, TargetFlags: AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3743 I.eraseFromParent();
3744 return constrainSelectedInstRegOperands(I&: *MovMI, TII, TRI, RBI);
3745}
3746
3747bool AArch64InstructionSelector::selectTLSGlobalValue(
3748 MachineInstr &I, MachineRegisterInfo &MRI) {
3749 if (!STI.isTargetMachO())
3750 return false;
3751 MachineFunction &MF = *I.getParent()->getParent();
3752 MF.getFrameInfo().setAdjustsStack(true);
3753
3754 const auto &GlobalOp = I.getOperand(i: 1);
3755 assert(GlobalOp.getOffset() == 0 &&
3756 "Shouldn't have an offset on TLS globals!");
3757 const GlobalValue &GV = *GlobalOp.getGlobal();
3758
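 // Darwin TLV access: load the thread-local variable descriptor address via
 // the GOT, load the accessor function pointer stored at offset 0 of the
 // descriptor, then call it with X0 pointing at the descriptor. The accessor
 // returns the variable's address in X0.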
3759 auto LoadGOT =
3760 MIB.buildInstr(Opc: AArch64::LOADgot, DstOps: {&AArch64::GPR64commonRegClass}, SrcOps: {})
3761 .addGlobalAddress(GV: &GV, Offset: 0, TargetFlags: AArch64II::MO_TLS);
3762
3763 auto Load = MIB.buildInstr(Opc: AArch64::LDRXui, DstOps: {&AArch64::GPR64commonRegClass},
3764 SrcOps: {LoadGOT.getReg(Idx: 0)})
3765 .addImm(Val: 0);
3766
3767 MIB.buildCopy(Res: Register(AArch64::X0), Op: LoadGOT.getReg(Idx: 0));
3768 // TLS calls preserve all registers except those that absolutely must be
3769 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3770 // silly).
3771 unsigned Opcode = getBLRCallOpcode(MF);
3772
3773 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
3774 if (MF.getFunction().hasFnAttribute(Kind: "ptrauth-calls")) {
3775 assert(Opcode == AArch64::BLR);
3776 Opcode = AArch64::BLRAAZ;
3777 }
3778
3779 MIB.buildInstr(Opc: Opcode, DstOps: {}, SrcOps: {Load})
3780 .addUse(RegNo: AArch64::X0, Flags: RegState::Implicit)
3781 .addDef(RegNo: AArch64::X0, Flags: RegState::Implicit)
3782 .addRegMask(Mask: TRI.getTLSCallPreservedMask());
3783
3784 MIB.buildCopy(Res: I.getOperand(i: 0).getReg(), Op: Register(AArch64::X0));
3785 RBI.constrainGenericRegister(Reg: I.getOperand(i: 0).getReg(), RC: AArch64::GPR64RegClass,
3786 MRI);
3787 I.eraseFromParent();
3788 return true;
3789}
3790
3791MachineInstr *AArch64InstructionSelector::emitScalarToVector(
3792 unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
3793 MachineIRBuilder &MIRBuilder) const {
3794 auto Undef = MIRBuilder.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {DstRC}, SrcOps: {});
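 // Place the scalar into lane 0 of an otherwise-undefined vector register:
 //   %undef:dstrc = IMPLICIT_DEF
 //   %vec:dstrc   = INSERT_SUBREG %undef, %scalar, <subreg>
 // where the subregister index is chosen from EltSize below.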
3795
3796 auto BuildFn = [&](unsigned SubregIndex) {
3797 auto Ins =
3798 MIRBuilder
3799 .buildInstr(Opc: TargetOpcode::INSERT_SUBREG, DstOps: {DstRC}, SrcOps: {Undef, Scalar})
3800 .addImm(Val: SubregIndex);
3801 constrainSelectedInstRegOperands(I&: *Undef, TII, TRI, RBI);
3802 constrainSelectedInstRegOperands(I&: *Ins, TII, TRI, RBI);
3803 return &*Ins;
3804 };
3805
3806 switch (EltSize) {
3807 case 8:
3808 return BuildFn(AArch64::bsub);
3809 case 16:
3810 return BuildFn(AArch64::hsub);
3811 case 32:
3812 return BuildFn(AArch64::ssub);
3813 case 64:
3814 return BuildFn(AArch64::dsub);
3815 default:
3816 return nullptr;
3817 }
3818}
3819
3820MachineInstr *
3821AArch64InstructionSelector::emitNarrowVector(Register DstReg, Register SrcReg,
3822 MachineIRBuilder &MIB,
3823 MachineRegisterInfo &MRI) const {
3824 LLT DstTy = MRI.getType(Reg: DstReg);
3825 const TargetRegisterClass *RC =
3826 getRegClassForTypeOnBank(Ty: DstTy, RB: *RBI.getRegBank(Reg: SrcReg, MRI, TRI));
3827 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
3828 LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
3829 return nullptr;
3830 }
3831 unsigned SubReg = 0;
3832 if (!getSubRegForClass(RC, TRI, SubReg))
3833 return nullptr;
3834 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
3835 LLVM_DEBUG(dbgs() << "Unsupported destination size! ("
3836 << DstTy.getSizeInBits() << ")\n");
3837 return nullptr;
3838 }
3839 auto Copy = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {DstReg}, SrcOps: {})
3840 .addReg(RegNo: SrcReg, Flags: {}, SubReg);
3841 RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI);
3842 return Copy;
3843}
3844
3845bool AArch64InstructionSelector::selectMergeValues(
3846 MachineInstr &I, MachineRegisterInfo &MRI) {
3847 assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
3848 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
3849 const LLT SrcTy = MRI.getType(Reg: I.getOperand(i: 1).getReg());
3850 assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
3851 const RegisterBank &RB = *RBI.getRegBank(Reg: I.getOperand(i: 1).getReg(), MRI, TRI);
3852
3853 if (I.getNumOperands() != 3)
3854 return false;
3855
3856 // Merging 2 s64s into an s128.
3857 if (DstTy == LLT::scalar(SizeInBits: 128)) {
3858 if (SrcTy.getSizeInBits() != 64)
3859 return false;
3860 Register DstReg = I.getOperand(i: 0).getReg();
3861 Register Src1Reg = I.getOperand(i: 1).getReg();
3862 Register Src2Reg = I.getOperand(i: 2).getReg();
3863 auto Tmp = MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {DstTy}, SrcOps: {});
3864 MachineInstr *InsMI = emitLaneInsert(DstReg: std::nullopt, SrcReg: Tmp.getReg(Idx: 0), EltReg: Src1Reg,
3865 /* LaneIdx */ 0, RB, MIRBuilder&: MIB);
3866 if (!InsMI)
3867 return false;
3868 MachineInstr *Ins2MI = emitLaneInsert(DstReg, SrcReg: InsMI->getOperand(i: 0).getReg(),
3869 EltReg: Src2Reg, /* LaneIdx */ 1, RB, MIRBuilder&: MIB);
3870 if (!Ins2MI)
3871 return false;
3872 constrainSelectedInstRegOperands(I&: *InsMI, TII, TRI, RBI);
3873 constrainSelectedInstRegOperands(I&: *Ins2MI, TII, TRI, RBI);
3874 I.eraseFromParent();
3875 return true;
3876 }
3877
3878 if (RB.getID() != AArch64::GPRRegBankID)
3879 return false;
3880
3881 if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
3882 return false;
3883
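 // Merge two s32s into an s64 on the GPR bank: widen each input with a
 // SUBREG_TO_REG, then BFM with immr = 32, imms = 31 inserts the low 32 bits
 // of the second value into bits [63:32] of the first.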
3884 auto *DstRC = &AArch64::GPR64RegClass;
3885 Register SubToRegDef = MRI.createVirtualRegister(RegClass: DstRC);
3886 MachineInstr &SubRegMI = *BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(),
3887 MCID: TII.get(Opcode: TargetOpcode::SUBREG_TO_REG))
3888 .addDef(RegNo: SubToRegDef)
3889 .addImm(Val: 0)
3890 .addUse(RegNo: I.getOperand(i: 1).getReg())
3891 .addImm(Val: AArch64::sub_32);
3892 Register SubToRegDef2 = MRI.createVirtualRegister(RegClass: DstRC);
3893 // Need to anyext the second scalar before we can use bfm
3894 MachineInstr &SubRegMI2 = *BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(),
3895 MCID: TII.get(Opcode: TargetOpcode::SUBREG_TO_REG))
3896 .addDef(RegNo: SubToRegDef2)
3897 .addImm(Val: 0)
3898 .addUse(RegNo: I.getOperand(i: 2).getReg())
3899 .addImm(Val: AArch64::sub_32);
3900 MachineInstr &BFM =
3901 *BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AArch64::BFMXri))
3902 .addDef(RegNo: I.getOperand(i: 0).getReg())
3903 .addUse(RegNo: SubToRegDef)
3904 .addUse(RegNo: SubToRegDef2)
3905 .addImm(Val: 32)
3906 .addImm(Val: 31);
3907 constrainSelectedInstRegOperands(I&: SubRegMI, TII, TRI, RBI);
3908 constrainSelectedInstRegOperands(I&: SubRegMI2, TII, TRI, RBI);
3909 constrainSelectedInstRegOperands(I&: BFM, TII, TRI, RBI);
3910 I.eraseFromParent();
3911 return true;
3912}
3913
3914static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
3915 const unsigned EltSize) {
3916 // Choose a lane copy opcode and subregister based on the size of the
3917 // vector's elements.
3918 switch (EltSize) {
3919 case 8:
3920 CopyOpc = AArch64::DUPi8;
3921 ExtractSubReg = AArch64::bsub;
3922 break;
3923 case 16:
3924 CopyOpc = AArch64::DUPi16;
3925 ExtractSubReg = AArch64::hsub;
3926 break;
3927 case 32:
3928 CopyOpc = AArch64::DUPi32;
3929 ExtractSubReg = AArch64::ssub;
3930 break;
3931 case 64:
3932 CopyOpc = AArch64::DUPi64;
3933 ExtractSubReg = AArch64::dsub;
3934 break;
3935 default:
3936 // Unknown size, bail out.
3937 LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
3938 return false;
3939 }
3940 return true;
3941}
3942
3943MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
3944 std::optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
3945 Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
3946 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
3947 unsigned CopyOpc = 0;
3948 unsigned ExtractSubReg = 0;
3949 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, EltSize: ScalarTy.getSizeInBits())) {
3950 LLVM_DEBUG(
3951 dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
3952 return nullptr;
3953 }
3954
3955 const TargetRegisterClass *DstRC =
3956 getRegClassForTypeOnBank(Ty: ScalarTy, RB: DstRB, GetAllRegSet: true);
3957 if (!DstRC) {
3958 LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
3959 return nullptr;
3960 }
3961
3962 const RegisterBank &VecRB = *RBI.getRegBank(Reg: VecReg, MRI, TRI);
3963 const LLT &VecTy = MRI.getType(Reg: VecReg);
3964 const TargetRegisterClass *VecRC =
3965 getRegClassForTypeOnBank(Ty: VecTy, RB: VecRB, GetAllRegSet: true);
3966 if (!VecRC) {
3967 LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
3968 return nullptr;
3969 }
3970
3971 // The register that we're going to copy into.
3972 Register InsertReg = VecReg;
3973 if (!DstReg)
3974 DstReg = MRI.createVirtualRegister(RegClass: DstRC);
3975 // If the lane index is 0, we just use a subregister COPY.
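 // Lane 0 occupies the low bits of the vector register, so a plain
 // bsub/hsub/ssub/dsub subregister copy is enough and no lane-crossing DUP
 // is needed.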
3976 if (LaneIdx == 0) {
3977 auto Copy = MIRBuilder.buildInstr(Opc: TargetOpcode::COPY, DstOps: {*DstReg}, SrcOps: {})
3978 .addReg(RegNo: VecReg, Flags: {}, SubReg: ExtractSubReg);
3979 RBI.constrainGenericRegister(Reg: *DstReg, RC: *DstRC, MRI);
3980 return &*Copy;
3981 }
3982
3983 // Lane copies require 128-bit wide registers. If we're dealing with an
3984 // unpacked vector, then we need to move up to that width. Insert an implicit
3985 // def and a subregister insert to get us there.
3986 if (VecTy.getSizeInBits() != 128) {
3987 MachineInstr *ScalarToVector = emitScalarToVector(
3988 EltSize: VecTy.getSizeInBits(), DstRC: &AArch64::FPR128RegClass, Scalar: VecReg, MIRBuilder);
3989 if (!ScalarToVector)
3990 return nullptr;
3991 InsertReg = ScalarToVector->getOperand(i: 0).getReg();
3992 }
3993
3994 MachineInstr *LaneCopyMI =
3995 MIRBuilder.buildInstr(Opc: CopyOpc, DstOps: {*DstReg}, SrcOps: {InsertReg}).addImm(Val: LaneIdx);
3996 constrainSelectedInstRegOperands(I&: *LaneCopyMI, TII, TRI, RBI);
3997
3998 // Make sure that we actually constrain the initial copy.
3999 RBI.constrainGenericRegister(Reg: *DstReg, RC: *DstRC, MRI);
4000 return LaneCopyMI;
4001}
4002
4003bool AArch64InstructionSelector::selectExtractElt(
4004 MachineInstr &I, MachineRegisterInfo &MRI) {
4005 assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
4006 "unexpected opcode!");
4007 Register DstReg = I.getOperand(i: 0).getReg();
4008 const LLT NarrowTy = MRI.getType(Reg: DstReg);
4009 const Register SrcReg = I.getOperand(i: 1).getReg();
4010 const LLT WideTy = MRI.getType(Reg: SrcReg);
4011 (void)WideTy;
4012 assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
4013 "source register size too small!");
4014 assert(!NarrowTy.isVector() && "cannot extract vector into vector!");
4015
4016 // Need the lane index to determine the correct copy opcode.
4017 MachineOperand &LaneIdxOp = I.getOperand(i: 2);
4018 assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");
4019
4020 if (RBI.getRegBank(Reg: DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
4021 LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
4022 return false;
4023 }
4024
4025 // Find the index to extract from.
4026 auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: LaneIdxOp.getReg(), MRI);
4027 if (!VRegAndVal)
4028 return false;
4029 unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
4030
4031
4032 const RegisterBank &DstRB = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
4033 MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, ScalarTy: NarrowTy, VecReg: SrcReg,
4034 LaneIdx, MIRBuilder&: MIB);
4035 if (!Extract)
4036 return false;
4037
4038 I.eraseFromParent();
4039 return true;
4040}
4041
4042bool AArch64InstructionSelector::selectSplitVectorUnmerge(
4043 MachineInstr &I, MachineRegisterInfo &MRI) {
4044 unsigned NumElts = I.getNumOperands() - 1;
4045 Register SrcReg = I.getOperand(i: NumElts).getReg();
4046 const LLT NarrowTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
4047 const LLT SrcTy = MRI.getType(Reg: SrcReg);
4048
4049 assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
4050 if (SrcTy.getSizeInBits() > 128) {
4051 LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
4052 return false;
4053 }
4054
4055 // We implement a split vector operation by treating the sub-vectors as
4056 // scalars and extracting them.
4057 const RegisterBank &DstRB =
4058 *RBI.getRegBank(Reg: I.getOperand(i: 0).getReg(), MRI, TRI);
4059 for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
4060 Register Dst = I.getOperand(i: OpIdx).getReg();
4061 MachineInstr *Extract =
4062 emitExtractVectorElt(DstReg: Dst, DstRB, ScalarTy: NarrowTy, VecReg: SrcReg, LaneIdx: OpIdx, MIRBuilder&: MIB);
4063 if (!Extract)
4064 return false;
4065 }
4066 I.eraseFromParent();
4067 return true;
4068}
4069
4070bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
4071 MachineRegisterInfo &MRI) {
4072 assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
4073 "unexpected opcode");
4074
4075 // TODO: Handle unmerging into GPRs and from scalars to scalars.
4076 if (RBI.getRegBank(Reg: I.getOperand(i: 0).getReg(), MRI, TRI)->getID() !=
4077 AArch64::FPRRegBankID ||
4078 RBI.getRegBank(Reg: I.getOperand(i: 1).getReg(), MRI, TRI)->getID() !=
4079 AArch64::FPRRegBankID) {
4080 LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
4081 "currently unsupported.\n");
4082 return false;
4083 }
4084
4085 // The last operand is the vector source register, and every other operand is
4086 // a register to unpack into.
4087 unsigned NumElts = I.getNumOperands() - 1;
4088 Register SrcReg = I.getOperand(i: NumElts).getReg();
4089 const LLT NarrowTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
4090 const LLT WideTy = MRI.getType(Reg: SrcReg);
4091
4092 assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
4093 "source register size too small!");
4094
4095 if (!NarrowTy.isScalar())
4096 return selectSplitVectorUnmerge(I, MRI);
4097
4098 // Choose a lane copy opcode and subregister based on the size of the
4099 // vector's elements.
4100 unsigned CopyOpc = 0;
4101 unsigned ExtractSubReg = 0;
4102 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, EltSize: NarrowTy.getSizeInBits()))
4103 return false;
4104
4105 // Set up for the lane copies.
4106 MachineBasicBlock &MBB = *I.getParent();
4107
4108 // Stores the registers we'll be copying from.
4109 SmallVector<Register, 4> InsertRegs;
4110
4111 // We'll use the first register twice, so we only need NumElts-1 registers.
4112 unsigned NumInsertRegs = NumElts - 1;
4113
4114 // If our elements fit into exactly 128 bits, then we can copy from the source
4115 // directly. Otherwise, we need to do a bit of setup with some subregister
4116 // inserts.
4117 if (NarrowTy.getSizeInBits() * NumElts == 128) {
4118 InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
4119 } else {
4120 // No. We have to perform subregister inserts. For each insert, create an
4121 // implicit def and a subregister insert, and save the register we create.
4122 // For scalar sources, treat them as a pseudo-vector of NarrowTy elements.
4123 unsigned EltSize = WideTy.isVector() ? WideTy.getScalarSizeInBits()
4124 : NarrowTy.getSizeInBits();
4125 const TargetRegisterClass *RC = getRegClassForTypeOnBank(
4126 Ty: LLT::fixed_vector(NumElements: NumElts, ScalarSizeInBits: EltSize), RB: *RBI.getRegBank(Reg: SrcReg, MRI, TRI));
4127 unsigned SubReg = 0;
4128 bool Found = getSubRegForClass(RC, TRI, SubReg);
4129 (void)Found;
4130 assert(Found && "expected to find last operand's subreg idx");
4131 for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
4132 Register ImpDefReg = MRI.createVirtualRegister(RegClass: &AArch64::FPR128RegClass);
4133 MachineInstr &ImpDefMI =
4134 *BuildMI(BB&: MBB, I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: TargetOpcode::IMPLICIT_DEF),
4135 DestReg: ImpDefReg);
4136
4137 // Now, create the subregister insert from SrcReg.
4138 Register InsertReg = MRI.createVirtualRegister(RegClass: &AArch64::FPR128RegClass);
4139 MachineInstr &InsMI =
4140 *BuildMI(BB&: MBB, I, MIMD: I.getDebugLoc(),
4141 MCID: TII.get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: InsertReg)
4142 .addUse(RegNo: ImpDefReg)
4143 .addUse(RegNo: SrcReg)
4144 .addImm(Val: SubReg);
4145
4146 constrainSelectedInstRegOperands(I&: ImpDefMI, TII, TRI, RBI);
4147 constrainSelectedInstRegOperands(I&: InsMI, TII, TRI, RBI);
4148
4149 // Save the register so that we can copy from it after.
4150 InsertRegs.push_back(Elt: InsertReg);
4151 }
4152 }
4153
4154 // Now that we've created any necessary subregister inserts, we can
4155 // create the copies.
4156 //
4157 // Perform the first copy separately as a subregister copy.
4158 Register CopyTo = I.getOperand(i: 0).getReg();
4159 auto FirstCopy = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {CopyTo}, SrcOps: {})
4160 .addReg(RegNo: InsertRegs[0], Flags: {}, SubReg: ExtractSubReg);
4161 constrainSelectedInstRegOperands(I&: *FirstCopy, TII, TRI, RBI);
4162
4163 // Now, perform the remaining copies as vector lane copies.
4164 unsigned LaneIdx = 1;
4165 for (Register InsReg : InsertRegs) {
4166 Register CopyTo = I.getOperand(i: LaneIdx).getReg();
4167 MachineInstr &CopyInst =
4168 *BuildMI(BB&: MBB, I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: CopyOpc), DestReg: CopyTo)
4169 .addUse(RegNo: InsReg)
4170 .addImm(Val: LaneIdx);
4171 constrainSelectedInstRegOperands(I&: CopyInst, TII, TRI, RBI);
4172 ++LaneIdx;
4173 }
4174
4175 // Separately constrain the first copy's destination. Because of the
4176 // limitation in constrainOperandRegClass, we can't guarantee that this will
4177 // actually be constrained. So, do it ourselves using the second operand.
4178 const TargetRegisterClass *RC =
4179 MRI.getRegClassOrNull(Reg: I.getOperand(i: 1).getReg());
4180 if (!RC) {
4181 LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
4182 return false;
4183 }
4184
4185 RBI.constrainGenericRegister(Reg: CopyTo, RC: *RC, MRI);
4186 I.eraseFromParent();
4187 return true;
4188}
4189
4190bool AArch64InstructionSelector::selectConcatVectors(
4191 MachineInstr &I, MachineRegisterInfo &MRI) {
4192 assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
4193 "Unexpected opcode");
4194 Register Dst = I.getOperand(i: 0).getReg();
4195 Register Op1 = I.getOperand(i: 1).getReg();
4196 Register Op2 = I.getOperand(i: 2).getReg();
4197 MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIRBuilder&: MIB);
4198 if (!ConcatMI)
4199 return false;
4200 I.eraseFromParent();
4201 return true;
4202}
4203
4204unsigned
4205AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
4206 MachineFunction &MF) const {
4207 Type *CPTy = CPVal->getType();
4208 Align Alignment = MF.getDataLayout().getPrefTypeAlign(Ty: CPTy);
4209
4210 MachineConstantPool *MCP = MF.getConstantPool();
4211 return MCP->getConstantPoolIndex(C: CPVal, Alignment);
4212}
4213
4214MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
4215 const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
4216 const TargetRegisterClass *RC;
4217 unsigned Opc;
4218 bool IsTiny = TM.getCodeModel() == CodeModel::Tiny;
4219 unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(Ty: CPVal->getType());
4220 switch (Size) {
4221 case 16:
4222 RC = &AArch64::FPR128RegClass;
4223 Opc = IsTiny ? AArch64::LDRQl : AArch64::LDRQui;
4224 break;
4225 case 8:
4226 RC = &AArch64::FPR64RegClass;
4227 Opc = IsTiny ? AArch64::LDRDl : AArch64::LDRDui;
4228 break;
4229 case 4:
4230 RC = &AArch64::FPR32RegClass;
4231 Opc = IsTiny ? AArch64::LDRSl : AArch64::LDRSui;
4232 break;
4233 case 2:
4234 RC = &AArch64::FPR16RegClass;
4235 Opc = AArch64::LDRHui;
4236 break;
4237 default:
4238 LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
4239 << *CPVal->getType());
4240 return nullptr;
4241 }
4242
4243 MachineInstr *LoadMI = nullptr;
4244 auto &MF = MIRBuilder.getMF();
4245 unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
4246 if (IsTiny && (Size == 16 || Size == 8 || Size == 4)) {
4247 // Use load(literal) for tiny code model.
4248 LoadMI = &*MIRBuilder.buildInstr(Opc, DstOps: {RC}, SrcOps: {}).addConstantPoolIndex(Idx: CPIdx);
4249 } else {
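 // For the other code models, materialize the constant-pool address with
 // ADRP plus a page-offset load, e.g. (illustrative label name):
 //   adrp xN, .LCPI0_0
 //   ldr  qM, [xN, :lo12:.LCPI0_0]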
4250 auto Adrp =
4251 MIRBuilder.buildInstr(Opc: AArch64::ADRP, DstOps: {&AArch64::GPR64RegClass}, SrcOps: {})
4252 .addConstantPoolIndex(Idx: CPIdx, Offset: 0, TargetFlags: AArch64II::MO_PAGE);
4253
4254 LoadMI = &*MIRBuilder.buildInstr(Opc, DstOps: {RC}, SrcOps: {Adrp})
4255 .addConstantPoolIndex(
4256 Idx: CPIdx, Offset: 0, TargetFlags: AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4257
4258 constrainSelectedInstRegOperands(I&: *Adrp, TII, TRI, RBI);
4259 }
4260
4261 MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
4262 LoadMI->addMemOperand(MF, MO: MF.getMachineMemOperand(PtrInfo,
4263 F: MachineMemOperand::MOLoad,
4264 Size, BaseAlignment: Align(Size)));
4265 constrainSelectedInstRegOperands(I&: *LoadMI, TII, TRI, RBI);
4266 return LoadMI;
4267}
4268
4269 /// Return an <Opcode, SubregIndex> pair to do a vector elt insert of a given
4270/// size and RB.
4271static std::pair<unsigned, unsigned>
4272getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
4273 unsigned Opc, SubregIdx;
4274 if (RB.getID() == AArch64::GPRRegBankID) {
4275 if (EltSize == 8) {
4276 Opc = AArch64::INSvi8gpr;
4277 SubregIdx = AArch64::bsub;
4278 } else if (EltSize == 16) {
4279 Opc = AArch64::INSvi16gpr;
4280 SubregIdx = AArch64::ssub;
4281 } else if (EltSize == 32) {
4282 Opc = AArch64::INSvi32gpr;
4283 SubregIdx = AArch64::ssub;
4284 } else if (EltSize == 64) {
4285 Opc = AArch64::INSvi64gpr;
4286 SubregIdx = AArch64::dsub;
4287 } else {
4288 llvm_unreachable("invalid elt size!");
4289 }
4290 } else {
4291 if (EltSize == 8) {
4292 Opc = AArch64::INSvi8lane;
4293 SubregIdx = AArch64::bsub;
4294 } else if (EltSize == 16) {
4295 Opc = AArch64::INSvi16lane;
4296 SubregIdx = AArch64::hsub;
4297 } else if (EltSize == 32) {
4298 Opc = AArch64::INSvi32lane;
4299 SubregIdx = AArch64::ssub;
4300 } else if (EltSize == 64) {
4301 Opc = AArch64::INSvi64lane;
4302 SubregIdx = AArch64::dsub;
4303 } else {
4304 llvm_unreachable("invalid elt size!");
4305 }
4306 }
4307 return std::make_pair(x&: Opc, y&: SubregIdx);
4308}
4309
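/// Build an already-selected instruction with the given destination and source
/// operands, apply any complex renderer functions (e.g. operands produced by
/// an addressing-mode selector), and constrain its register operands.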
4310MachineInstr *AArch64InstructionSelector::emitInstr(
4311 unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
4312 std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder,
4313 const ComplexRendererFns &RenderFns) const {
4314 assert(Opcode && "Expected an opcode?");
4315 assert(!isPreISelGenericOpcode(Opcode) &&
4316 "Function should only be used to produce selected instructions!");
4317 auto MI = MIRBuilder.buildInstr(Opc: Opcode, DstOps, SrcOps);
4318 if (RenderFns)
4319 for (auto &Fn : *RenderFns)
4320 Fn(MI);
4321 constrainSelectedInstRegOperands(I&: *MI, TII, TRI, RBI);
4322 return &*MI;
4323}
4324
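/// Emit an add/sub style instruction, picking the cheapest form that fits
/// \p RHS. \p AddrModeAndSizeToOpcode is indexed as: [0] immediate form,
/// [1] shifted-register form, [2] register-register form, [3] immediate form
/// of the opposite operation (used for negated immediates), [4] extended-
/// register form; each entry holds {64-bit opcode, 32-bit opcode}.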
4325MachineInstr *AArch64InstructionSelector::emitAddSub(
4326 const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
4327 Register Dst, MachineOperand &LHS, MachineOperand &RHS,
4328 MachineIRBuilder &MIRBuilder) const {
4329 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4330 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4331 auto Ty = MRI.getType(Reg: LHS.getReg());
4332 assert(!Ty.isVector() && "Expected a scalar or pointer?");
4333 unsigned Size = Ty.getSizeInBits();
4334 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only");
4335 bool Is32Bit = Size == 32;
4336
4337 // INSTRri form with positive arithmetic immediate.
4338 if (auto Fns = selectArithImmed(Root&: RHS))
4339 return emitInstr(Opcode: AddrModeAndSizeToOpcode[0][Is32Bit], DstOps: {Dst}, SrcOps: {LHS},
4340 MIRBuilder, RenderFns: Fns);
4341
4342 // INSTRri form with negative arithmetic immediate.
4343 if (auto Fns = selectNegArithImmed(Root&: RHS))
4344 return emitInstr(Opcode: AddrModeAndSizeToOpcode[3][Is32Bit], DstOps: {Dst}, SrcOps: {LHS},
4345 MIRBuilder, RenderFns: Fns);
4346
4347 // INSTRrx form.
4348 if (auto Fns = selectArithExtendedRegister(Root&: RHS))
4349 return emitInstr(Opcode: AddrModeAndSizeToOpcode[4][Is32Bit], DstOps: {Dst}, SrcOps: {LHS},
4350 MIRBuilder, RenderFns: Fns);
4351
4352 // INSTRrs form.
4353 if (auto Fns = selectShiftedRegister(Root&: RHS))
4354 return emitInstr(Opcode: AddrModeAndSizeToOpcode[1][Is32Bit], DstOps: {Dst}, SrcOps: {LHS},
4355 MIRBuilder, RenderFns: Fns);
4356 return emitInstr(Opcode: AddrModeAndSizeToOpcode[2][Is32Bit], DstOps: {Dst}, SrcOps: {LHS, RHS},
4357 MIRBuilder);
4358}
4359
4360MachineInstr *
4361AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
4362 MachineOperand &RHS,
4363 MachineIRBuilder &MIRBuilder) const {
4364 const std::array<std::array<unsigned, 2>, 5> OpcTable{
4365 ._M_elems: {{AArch64::ADDXri, AArch64::ADDWri},
4366 {AArch64::ADDXrs, AArch64::ADDWrs},
4367 {AArch64::ADDXrr, AArch64::ADDWrr},
4368 {AArch64::SUBXri, AArch64::SUBWri},
4369 {AArch64::ADDXrx, AArch64::ADDWrx}}};
4370 return emitAddSub(AddrModeAndSizeToOpcode: OpcTable, Dst: DefReg, LHS, RHS, MIRBuilder);
4371}
4372
4373MachineInstr *
4374AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS,
4375 MachineOperand &RHS,
4376 MachineIRBuilder &MIRBuilder) const {
4377 const std::array<std::array<unsigned, 2>, 5> OpcTable{
4378 ._M_elems: {{AArch64::ADDSXri, AArch64::ADDSWri},
4379 {AArch64::ADDSXrs, AArch64::ADDSWrs},
4380 {AArch64::ADDSXrr, AArch64::ADDSWrr},
4381 {AArch64::SUBSXri, AArch64::SUBSWri},
4382 {AArch64::ADDSXrx, AArch64::ADDSWrx}}};
4383 return emitAddSub(AddrModeAndSizeToOpcode: OpcTable, Dst, LHS, RHS, MIRBuilder);
4384}
4385
4386MachineInstr *
4387AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS,
4388 MachineOperand &RHS,
4389 MachineIRBuilder &MIRBuilder) const {
4390 const std::array<std::array<unsigned, 2>, 5> OpcTable{
4391 ._M_elems: {{AArch64::SUBSXri, AArch64::SUBSWri},
4392 {AArch64::SUBSXrs, AArch64::SUBSWrs},
4393 {AArch64::SUBSXrr, AArch64::SUBSWrr},
4394 {AArch64::ADDSXri, AArch64::ADDSWri},
4395 {AArch64::SUBSXrx, AArch64::SUBSWrx}}};
4396 return emitAddSub(AddrModeAndSizeToOpcode: OpcTable, Dst, LHS, RHS, MIRBuilder);
4397}
4398
4399MachineInstr *
4400AArch64InstructionSelector::emitADCS(Register Dst, MachineOperand &LHS,
4401 MachineOperand &RHS,
4402 MachineIRBuilder &MIRBuilder) const {
4403 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4404 MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4405 bool Is32Bit = (MRI->getType(Reg: LHS.getReg()).getSizeInBits() == 32);
4406 static const unsigned OpcTable[2] = {AArch64::ADCSXr, AArch64::ADCSWr};
4407 return emitInstr(Opcode: OpcTable[Is32Bit], DstOps: {Dst}, SrcOps: {LHS, RHS}, MIRBuilder);
4408}
4409
4410MachineInstr *
4411AArch64InstructionSelector::emitSBCS(Register Dst, MachineOperand &LHS,
4412 MachineOperand &RHS,
4413 MachineIRBuilder &MIRBuilder) const {
4414 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4415 MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4416 bool Is32Bit = (MRI->getType(Reg: LHS.getReg()).getSizeInBits() == 32);
4417 static const unsigned OpcTable[2] = {AArch64::SBCSXr, AArch64::SBCSWr};
4418 return emitInstr(Opcode: OpcTable[Is32Bit], DstOps: {Dst}, SrcOps: {LHS, RHS}, MIRBuilder);
4419}
4420
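/// Emit a SUBS whose result is discarded, i.e. a compare of \p LHS and \p RHS
/// that only sets NZCV.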
4421MachineInstr *
4422AArch64InstructionSelector::emitCMP(MachineOperand &LHS, MachineOperand &RHS,
4423 MachineIRBuilder &MIRBuilder) const {
4424 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4425 bool Is32Bit = MRI.getType(Reg: LHS.getReg()).getSizeInBits() == 32;
4426 auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
4427 return emitSUBS(Dst: MRI.createVirtualRegister(RegClass: RC), LHS, RHS, MIRBuilder);
4428}
4429
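/// Emit an ADDS whose result is discarded, i.e. a CMN of \p LHS and \p RHS
/// that only sets NZCV.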
4430MachineInstr *
4431AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
4432 MachineIRBuilder &MIRBuilder) const {
4433 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4434 bool Is32Bit = (MRI.getType(Reg: LHS.getReg()).getSizeInBits() == 32);
4435 auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
4436 return emitADDS(Dst: MRI.createVirtualRegister(RegClass: RC), LHS, RHS, MIRBuilder);
4437}
4438
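/// Emit an ANDS whose result is only used for its NZCV flags (a TST), folding
/// a logical immediate or shifted-register form of \p RHS when possible.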
4439MachineInstr *
4440AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
4441 MachineIRBuilder &MIRBuilder) const {
4442 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4443 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4444 LLT Ty = MRI.getType(Reg: LHS.getReg());
4445 unsigned RegSize = Ty.getSizeInBits();
4446 bool Is32Bit = (RegSize == 32);
4447 const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
4448 {AArch64::ANDSXrs, AArch64::ANDSWrs},
4449 {AArch64::ANDSXrr, AArch64::ANDSWrr}};
4450 // ANDS needs a logical immediate for its immediate form. Check if we can
4451 // fold one in.
4452 if (auto ValAndVReg = getIConstantVRegValWithLookThrough(VReg: RHS.getReg(), MRI)) {
4453 int64_t Imm = ValAndVReg->Value.getSExtValue();
4454
4455 if (AArch64_AM::isLogicalImmediate(imm: Imm, regSize: RegSize)) {
4456 auto TstMI = MIRBuilder.buildInstr(Opc: OpcTable[0][Is32Bit], DstOps: {Ty}, SrcOps: {LHS});
4457 TstMI.addImm(Val: AArch64_AM::encodeLogicalImmediate(imm: Imm, regSize: RegSize));
4458 constrainSelectedInstRegOperands(I&: *TstMI, TII, TRI, RBI);
4459 return &*TstMI;
4460 }
4461 }
4462
4463 if (auto Fns = selectLogicalShiftedRegister(Root&: RHS))
4464 return emitInstr(Opcode: OpcTable[1][Is32Bit], DstOps: {Ty}, SrcOps: {LHS}, MIRBuilder, RenderFns: Fns);
4465 return emitInstr(Opcode: OpcTable[2][Is32Bit], DstOps: {Ty}, SrcOps: {LHS, RHS}, MIRBuilder);
4466}
4467
4468MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
4469 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4470 MachineIRBuilder &MIRBuilder) const {
4471 assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
4472 assert(Predicate.isPredicate() && "Expected predicate?");
4473 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4474 LLT CmpTy = MRI.getType(Reg: LHS.getReg());
4475 assert(!CmpTy.isVector() && "Expected scalar or pointer");
4476 unsigned Size = CmpTy.getSizeInBits();
4477 (void)Size;
4478 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
4479 // Fold the compare into a cmn or tst if possible.
4480 if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
4481 return FoldCmp;
4482 return emitCMP(LHS, RHS, MIRBuilder);
4483}
4484
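/// Materialize the boolean result of a floating-point compare into the 32-bit
/// register \p Dst using CSINC against WZR. Predicates that need two AArch64
/// condition codes emit two CSINCs and OR their results together.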
4485MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
4486 Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const {
4487 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4488#ifndef NDEBUG
4489 LLT Ty = MRI.getType(Dst);
4490 assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
4491 "Expected a 32-bit scalar register?");
4492#endif
4493 const Register ZReg = AArch64::WZR;
4494 AArch64CC::CondCode CC1, CC2;
4495 changeFCMPPredToAArch64CC(P: Pred, CondCode&: CC1, CondCode2&: CC2);
4496 auto InvCC1 = AArch64CC::getInvertedCondCode(Code: CC1);
4497 if (CC2 == AArch64CC::AL)
4498 return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, Pred: InvCC1,
4499 MIRBuilder);
4500 const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
4501 Register Def1Reg = MRI.createVirtualRegister(RegClass: RC);
4502 Register Def2Reg = MRI.createVirtualRegister(RegClass: RC);
4503 auto InvCC2 = AArch64CC::getInvertedCondCode(Code: CC2);
4504 emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, Pred: InvCC1, MIRBuilder);
4505 emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, Pred: InvCC2, MIRBuilder);
4506 auto OrMI = MIRBuilder.buildInstr(Opc: AArch64::ORRWrr, DstOps: {Dst}, SrcOps: {Def1Reg, Def2Reg});
4507 constrainSelectedInstRegOperands(I&: *OrMI, TII, TRI, RBI);
4508 return &*OrMI;
4509}
4510
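/// Emit an FCMP of \p LHS and \p RHS, using the compare-against-zero form when
/// the RHS (or, for equality predicates, the LHS) is +0.0. Returns nullptr for
/// vector types.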
4511MachineInstr *AArch64InstructionSelector::emitFPCompare(
4512 Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
4513 std::optional<CmpInst::Predicate> Pred) const {
4514 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4515 LLT Ty = MRI.getType(Reg: LHS);
4516 if (Ty.isVector())
4517 return nullptr;
4518 unsigned OpSize = Ty.getSizeInBits();
4519 assert(OpSize == 16 || OpSize == 32 || OpSize == 64);
4520
4521 // If this is a compare against +0.0, then we don't have
4522 // to explicitly materialize a constant.
4523 const ConstantFP *FPImm = getConstantFPVRegVal(VReg: RHS, MRI);
4524 bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
4525
4526 auto IsEqualityPred = [](CmpInst::Predicate P) {
4527 return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE ||
4528 P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE;
4529 };
4530 if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) {
4531 // Try commuting the operands.
4532 const ConstantFP *LHSImm = getConstantFPVRegVal(VReg: LHS, MRI);
4533 if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) {
4534 ShouldUseImm = true;
4535 std::swap(a&: LHS, b&: RHS);
4536 }
4537 }
4538 unsigned CmpOpcTbl[2][3] = {
4539 {AArch64::FCMPHrr, AArch64::FCMPSrr, AArch64::FCMPDrr},
4540 {AArch64::FCMPHri, AArch64::FCMPSri, AArch64::FCMPDri}};
4541 unsigned CmpOpc =
4542 CmpOpcTbl[ShouldUseImm][OpSize == 16 ? 0 : (OpSize == 32 ? 1 : 2)];
4543
4544 // Partially build the compare. Decide if we need to add a use for the
4545 // third operand based off whether or not we're comparing against 0.0.
4546 auto CmpMI = MIRBuilder.buildInstr(Opcode: CmpOpc).addUse(RegNo: LHS);
4547 CmpMI.setMIFlags(MachineInstr::NoFPExcept);
4548 if (!ShouldUseImm)
4549 CmpMI.addUse(RegNo: RHS);
4550 constrainSelectedInstRegOperands(I&: *CmpMI, TII, TRI, RBI);
4551 return &*CmpMI;
4552}
4553
4554MachineInstr *AArch64InstructionSelector::emitVectorConcat(
4555 std::optional<Register> Dst, Register Op1, Register Op2,
4556 MachineIRBuilder &MIRBuilder) const {
4557 // We implement a vector concat by:
4558 // 1. Use scalar_to_vector to insert the lower vector into the larger dest
4559 // 2. Insert the upper vector into the destination's upper element
4560 // TODO: some of this code is common with G_BUILD_VECTOR handling.
4561 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4562
4563 const LLT Op1Ty = MRI.getType(Reg: Op1);
4564 const LLT Op2Ty = MRI.getType(Reg: Op2);
4565
4566 if (Op1Ty != Op2Ty) {
4567 LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
4568 return nullptr;
4569 }
4570 assert(Op1Ty.isVector() && "Expected a vector for vector concat");
4571
4572 if (Op1Ty.getSizeInBits() >= 128) {
4573 LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
4574 return nullptr;
4575 }
4576
4577 // At the moment we just support 64 bit vector concats.
4578 if (Op1Ty.getSizeInBits() != 64) {
4579 LLVM_DEBUG(dbgs() << "Vector concat only supported for 64b vectors");
4580 return nullptr;
4581 }
4582
4583 const LLT ScalarTy = LLT::scalar(SizeInBits: Op1Ty.getSizeInBits());
4584 const RegisterBank &FPRBank = *RBI.getRegBank(Reg: Op1, MRI, TRI);
4585 const TargetRegisterClass *DstRC =
4586 getRegClassForTypeOnBank(Ty: Op1Ty.multiplyElements(Factor: 2), RB: FPRBank);
4587
4588 MachineInstr *WidenedOp1 =
4589 emitScalarToVector(EltSize: ScalarTy.getSizeInBits(), DstRC, Scalar: Op1, MIRBuilder);
4590 MachineInstr *WidenedOp2 =
4591 emitScalarToVector(EltSize: ScalarTy.getSizeInBits(), DstRC, Scalar: Op2, MIRBuilder);
4592 if (!WidenedOp1 || !WidenedOp2) {
4593 LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
4594 return nullptr;
4595 }
4596
4597 // Now do the insert of the upper element.
4598 unsigned InsertOpc, InsSubRegIdx;
4599 std::tie(args&: InsertOpc, args&: InsSubRegIdx) =
4600 getInsertVecEltOpInfo(RB: FPRBank, EltSize: ScalarTy.getSizeInBits());
4601
4602 if (!Dst)
4603 Dst = MRI.createVirtualRegister(RegClass: DstRC);
4604 auto InsElt =
4605 MIRBuilder
4606 .buildInstr(Opc: InsertOpc, DstOps: {*Dst}, SrcOps: {WidenedOp1->getOperand(i: 0).getReg()})
4607 .addImm(Val: 1) /* Lane index */
4608 .addUse(RegNo: WidenedOp2->getOperand(i: 0).getReg())
4609 .addImm(Val: 0);
4610 constrainSelectedInstRegOperands(I&: *InsElt, TII, TRI, RBI);
4611 return &*InsElt;
4612}
4613
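/// Emit a CSINC \p Dst, \p Src1, \p Src2, \p Pred, choosing the W or X form
/// from the size of \p Dst.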
4614MachineInstr *
4615AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1,
4616 Register Src2, AArch64CC::CondCode Pred,
4617 MachineIRBuilder &MIRBuilder) const {
4618 auto &MRI = *MIRBuilder.getMRI();
4619 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg: Dst);
4620 // If we used a register class, then this won't necessarily have an LLT.
4621 // Compute the size based off whether or not we have a class or bank.
4622 unsigned Size;
4623 if (const auto *RC = dyn_cast<const TargetRegisterClass *>(Val: RegClassOrBank))
4624 Size = TRI.getRegSizeInBits(RC: *RC);
4625 else
4626 Size = MRI.getType(Reg: Dst).getSizeInBits();
4627 // Some opcodes use s1.
4628 assert(Size <= 64 && "Expected 64 bits or less only!");
4629 static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr};
4630 unsigned Opc = OpcTable[Size == 64];
4631 auto CSINC = MIRBuilder.buildInstr(Opc, DstOps: {Dst}, SrcOps: {Src1, Src2}).addImm(Val: Pred);
4632 constrainSelectedInstRegOperands(I&: *CSINC, TII, TRI, RBI);
4633 return &*CSINC;
4634}
4635
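/// Emit a flag-setting instruction that puts the incoming carry for \p I into
/// NZCV (negated for subtractions, where borrow is carry == 0). Returns
/// nullptr when the directly preceding instruction already produces the
/// required carry and can simply be selected in place.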
4636MachineInstr *AArch64InstructionSelector::emitCarryIn(MachineInstr &I,
4637 Register CarryReg) {
4638 MachineRegisterInfo *MRI = MIB.getMRI();
4639 unsigned Opcode = I.getOpcode();
4640
4641 // If the instruction is a SUB, we need to negate the carry,
4642 // because borrowing is indicated by carry-flag == 0.
4643 bool NeedsNegatedCarry =
4644 (Opcode == TargetOpcode::G_USUBE || Opcode == TargetOpcode::G_SSUBE);
4645
4646 // If the previous instruction will already produce the correct carry, do not
4647 // emit a carry generating instruction. E.g. for G_UADDE/G_USUBE sequences
4648 // generated during legalization of wide add/sub. This optimization depends on
4649 // these sequences not being interrupted by other instructions.
4650 // We have to select the previous instruction before the carry-using
4651 // instruction is deleted by the calling function, otherwise the previous
4652 // instruction might become dead and would get deleted.
4653 MachineInstr *SrcMI = MRI->getVRegDef(Reg: CarryReg);
4654 if (SrcMI == I.getPrevNode()) {
4655 if (auto *CarrySrcMI = dyn_cast<GAddSubCarryOut>(Val: SrcMI)) {
4656 bool ProducesNegatedCarry = CarrySrcMI->isSub();
4657 if (NeedsNegatedCarry == ProducesNegatedCarry &&
4658 CarrySrcMI->isUnsigned() &&
4659 CarrySrcMI->getCarryOutReg() == CarryReg &&
4660 selectAndRestoreState(I&: *SrcMI))
4661 return nullptr;
4662 }
4663 }
4664
4665 Register DeadReg = MRI->createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
4666
4667 if (NeedsNegatedCarry) {
4668 // (0 - Carry) sets !C in NZCV when Carry == 1
4669 Register ZReg = AArch64::WZR;
4670 return emitInstr(Opcode: AArch64::SUBSWrr, DstOps: {DeadReg}, SrcOps: {ZReg, CarryReg}, MIRBuilder&: MIB);
4671 }
4672
4673 // (Carry - 1) sets !C in NZCV when Carry == 0
4674 auto Fns = select12BitValueWithLeftShift(Immed: 1);
4675 return emitInstr(Opcode: AArch64::SUBSWri, DstOps: {DeadReg}, SrcOps: {CarryReg}, MIRBuilder&: MIB, RenderFns: Fns);
4676}
4677
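/// Select an overflow-producing add/sub (G_[US]ADDO, G_[US]ADDE, G_[US]SUBO,
/// G_[US]SUBE). The carry-out is only materialized into a register (via
/// CSINC) if it has non-debug uses.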
4678bool AArch64InstructionSelector::selectOverflowOp(MachineInstr &I,
4679 MachineRegisterInfo &MRI) {
4680 auto &CarryMI = cast<GAddSubCarryOut>(Val&: I);
4681
4682 if (auto *CarryInMI = dyn_cast<GAddSubCarryInOut>(Val: &I)) {
4683 // Set NZCV carry according to carry-in VReg
4684 emitCarryIn(I, CarryReg: CarryInMI->getCarryInReg());
4685 }
4686
4687 // Emit the operation and get the correct condition code.
4688 auto OpAndCC = emitOverflowOp(Opcode: I.getOpcode(), Dst: CarryMI.getDstReg(),
4689 LHS&: CarryMI.getLHS(), RHS&: CarryMI.getRHS(), MIRBuilder&: MIB);
4690
4691 Register CarryOutReg = CarryMI.getCarryOutReg();
4692
4693 // Don't convert carry-out to VReg if it is never used
4694 if (!MRI.use_nodbg_empty(RegNo: CarryOutReg)) {
4695 // Now, put the overflow result in the register given by the first operand
4696 // to the overflow op. CSINC increments the result when the predicate is
4697 // false, so to get the increment when it's true, we need to use the
4698 // inverse. In this case, we want to increment when carry is set.
4699 Register ZReg = AArch64::WZR;
4700 emitCSINC(/*Dst=*/CarryOutReg, /*Src1=*/ZReg, /*Src2=*/ZReg,
4701 Pred: getInvertedCondCode(Code: OpAndCC.second), MIRBuilder&: MIB);
4702 }
4703
4704 I.eraseFromParent();
4705 return true;
4706}
4707
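/// Emit the flag-setting operation for an overflow opcode and return it
/// together with the AArch64 condition code that tests its overflow/carry
/// result.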
4708std::pair<MachineInstr *, AArch64CC::CondCode>
4709AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
4710 MachineOperand &LHS,
4711 MachineOperand &RHS,
4712 MachineIRBuilder &MIRBuilder) const {
4713 switch (Opcode) {
4714 default:
4715 llvm_unreachable("Unexpected opcode!");
4716 case TargetOpcode::G_SADDO:
4717 return std::make_pair(x: emitADDS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::VS);
4718 case TargetOpcode::G_UADDO:
4719 return std::make_pair(x: emitADDS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::HS);
4720 case TargetOpcode::G_SSUBO:
4721 return std::make_pair(x: emitSUBS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::VS);
4722 case TargetOpcode::G_USUBO:
4723 return std::make_pair(x: emitSUBS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::LO);
4724 case TargetOpcode::G_SADDE:
4725 return std::make_pair(x: emitADCS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::VS);
4726 case TargetOpcode::G_UADDE:
4727 return std::make_pair(x: emitADCS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::HS);
4728 case TargetOpcode::G_SSUBE:
4729 return std::make_pair(x: emitSBCS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::VS);
4730 case TargetOpcode::G_USUBE:
4731 return std::make_pair(x: emitSBCS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::LO);
4732 }
4733}
4734
4735/// Returns true if @p Val is a tree of AND/OR/CMP operations that can be
4736/// expressed as a conjunction.
4737/// \param CanNegate Set to true if we can negate the whole sub-tree just by
4738/// changing the conditions on the CMP tests.
4739/// (this means we can call emitConjunctionRec() with
4740/// Negate==true on this sub-tree)
4741/// \param MustBeFirst Set to true if this subtree needs to be negated and we
4742/// cannot do the negation naturally. We are required to
4743/// emit the subtree first in this case.
4744/// \param WillNegate Is true if we are called when the result of this
4745/// subexpression must be negated. This happens when the
4746/// outer expression is an OR. We can use this fact to know
4747/// that we have a double negation (or (or ...) ...) that
4748/// can be implemented for free.
4749static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst,
4750 bool WillNegate, MachineRegisterInfo &MRI,
4751 unsigned Depth = 0) {
4752 if (!MRI.hasOneNonDBGUse(RegNo: Val))
4753 return false;
4754 MachineInstr *ValDef = MRI.getVRegDef(Reg: Val);
4755 unsigned Opcode = ValDef->getOpcode();
4756 if (isa<GAnyCmp>(Val: ValDef)) {
4757 CanNegate = true;
4758 MustBeFirst = false;
4759 return true;
4760 }
4761 // Protect against exponential runtime and stack overflow.
4762 if (Depth > 6)
4763 return false;
4764 if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) {
4765 bool IsOR = Opcode == TargetOpcode::G_OR;
4766 Register O0 = ValDef->getOperand(i: 1).getReg();
4767 Register O1 = ValDef->getOperand(i: 2).getReg();
4768 bool CanNegateL;
4769 bool MustBeFirstL;
4770 if (!canEmitConjunction(Val: O0, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR, MRI, Depth: Depth + 1))
4771 return false;
4772 bool CanNegateR;
4773 bool MustBeFirstR;
4774 if (!canEmitConjunction(Val: O1, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR, MRI, Depth: Depth + 1))
4775 return false;
4776
4777 if (MustBeFirstL && MustBeFirstR)
4778 return false;
4779
4780 if (IsOR) {
4781 // For an OR expression we need to be able to naturally negate at least
4782 // one side or we cannot do the transformation at all.
4783 if (!CanNegateL && !CanNegateR)
4784 return false;
4785 // If the result of the OR will be negated and we can naturally negate
4786 // the leaves, then this sub-tree as a whole negates naturally.
4787 CanNegate = WillNegate && CanNegateL && CanNegateR;
4788 // If we cannot naturally negate the whole sub-tree, then this must be
4789 // emitted first.
4790 MustBeFirst = !CanNegate;
4791 } else {
4792 assert(Opcode == TargetOpcode::G_AND && "Must be G_AND");
4793 // We cannot naturally negate an AND operation.
4794 CanNegate = false;
4795 MustBeFirst = MustBeFirstL || MustBeFirstR;
4796 }
4797 return true;
4798 }
4799 return false;
4800}
4801
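/// Emit a conditional compare (CCMP/CCMN/FCCMP) of \p LHS against \p RHS that
/// is only performed when \p Predicate holds; when it does not, NZCV is set to
/// the value that satisfies the inverse of \p OutCC. Small integer constants
/// on the RHS use the immediate CCMP/CCMN forms.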
4802MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
4803 Register LHS, Register RHS, CmpInst::Predicate CC,
4804 AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC,
4805 MachineIRBuilder &MIB) const {
4806 auto &MRI = *MIB.getMRI();
4807 LLT OpTy = MRI.getType(Reg: LHS);
4808 unsigned CCmpOpc;
4809 std::optional<ValueAndVReg> C;
4810 if (CmpInst::isIntPredicate(P: CC)) {
4811 assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
4812 C = getIConstantVRegValWithLookThrough(VReg: RHS, MRI);
4813 if (!C || C->Value.sgt(RHS: 31) || C->Value.slt(RHS: -31))
4814 CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
4815 else if (C->Value.ule(RHS: 31))
4816 CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi;
4817 else
4818 CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMNWi : AArch64::CCMNXi;
4819 } else {
4820 assert(OpTy.getSizeInBits() == 16 || OpTy.getSizeInBits() == 32 ||
4821 OpTy.getSizeInBits() == 64);
4822 switch (OpTy.getSizeInBits()) {
4823 case 16:
4824 assert(STI.hasFullFP16() && "Expected Full FP16 for fp16 comparisons");
4825 CCmpOpc = AArch64::FCCMPHrr;
4826 break;
4827 case 32:
4828 CCmpOpc = AArch64::FCCMPSrr;
4829 break;
4830 case 64:
4831 CCmpOpc = AArch64::FCCMPDrr;
4832 break;
4833 default:
4834 return nullptr;
4835 }
4836 }
4837 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(Code: OutCC);
4838 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: InvOutCC);
4839 auto CCmp =
4840 MIB.buildInstr(Opc: CCmpOpc, DstOps: {}, SrcOps: {LHS});
4841 if (CCmpOpc == AArch64::CCMPWi || CCmpOpc == AArch64::CCMPXi)
4842 CCmp.addImm(Val: C->Value.getZExtValue());
4843 else if (CCmpOpc == AArch64::CCMNWi || CCmpOpc == AArch64::CCMNXi)
4844 CCmp.addImm(Val: C->Value.abs().getZExtValue());
4845 else
4846 CCmp.addReg(RegNo: RHS);
4847 CCmp.addImm(Val: NZCV).addImm(Val: Predicate);
4848 constrainSelectedInstRegOperands(I&: *CCmp, TII, TRI, RBI);
4849 return &*CCmp;
4850}
4851
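/// Emit a conjunction/disjunction tree rooted at \p Val as an initial compare
/// followed by a chain of conditional compares. \p CCOp/\p Predicate describe
/// the conditional compare guarding this sub-tree (CCOp is empty for the first
/// compare in the chain), \p Negate requests negation of the leaf condition,
/// and \p OutCC receives the condition code to test after the sequence.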
4852MachineInstr *AArch64InstructionSelector::emitConjunctionRec(
4853 Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp,
4854 AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const {
4855 // We're at a tree leaf, produce a conditional comparison operation.
4856 auto &MRI = *MIB.getMRI();
4857 MachineInstr *ValDef = MRI.getVRegDef(Reg: Val);
4858 unsigned Opcode = ValDef->getOpcode();
4859 if (auto *Cmp = dyn_cast<GAnyCmp>(Val: ValDef)) {
4860 Register LHS = Cmp->getLHSReg();
4861 Register RHS = Cmp->getRHSReg();
4862 CmpInst::Predicate CC = Cmp->getCond();
4863 if (Negate)
4864 CC = CmpInst::getInversePredicate(pred: CC);
4865 if (isa<GICmp>(Val: Cmp)) {
4866 OutCC = changeICMPPredToAArch64CC(P: CC, RHS, MRI: MIB.getMRI());
4867 } else {
4868 // Handle special FP cases.
4869 AArch64CC::CondCode ExtraCC;
4870 changeFPCCToANDAArch64CC(CC, CondCode&: OutCC, CondCode2&: ExtraCC);
4871 // Some floating point conditions can't be tested with a single condition
4872 // code. Construct an additional comparison in this case.
4873 if (ExtraCC != AArch64CC::AL) {
4874 MachineInstr *ExtraCmp;
4875 if (!CCOp)
4876 ExtraCmp = emitFPCompare(LHS, RHS, MIRBuilder&: MIB, Pred: CC);
4877 else
4878 ExtraCmp =
4879 emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC: ExtraCC, MIB);
4880 CCOp = ExtraCmp->getOperand(i: 0).getReg();
4881 Predicate = ExtraCC;
4882 }
4883 }
4884
4885 // Produce a normal comparison if we are first in the chain
4886 if (!CCOp) {
4887 if (isa<GICmp>(Val: Cmp))
4888 return emitCMP(LHS&: Cmp->getOperand(i: 2), RHS&: Cmp->getOperand(i: 3), MIRBuilder&: MIB);
4889 return emitFPCompare(LHS: Cmp->getOperand(i: 2).getReg(),
4890 RHS: Cmp->getOperand(i: 3).getReg(), MIRBuilder&: MIB);
4891 }
4892 // Otherwise produce a ccmp.
4893 return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB);
4894 }
4895 assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree");
4896
4897 bool IsOR = Opcode == TargetOpcode::G_OR;
4898
4899 Register LHS = ValDef->getOperand(i: 1).getReg();
4900 bool CanNegateL;
4901 bool MustBeFirstL;
4902 bool ValidL = canEmitConjunction(Val: LHS, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR, MRI);
4903 assert(ValidL && "Valid conjunction/disjunction tree");
4904 (void)ValidL;
4905
4906 Register RHS = ValDef->getOperand(i: 2).getReg();
4907 bool CanNegateR;
4908 bool MustBeFirstR;
4909 bool ValidR = canEmitConjunction(Val: RHS, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR, MRI);
4910 assert(ValidR && "Valid conjunction/disjunction tree");
4911 (void)ValidR;
4912
4913 // Swap sub-tree that must come first to the right side.
4914 if (MustBeFirstL) {
4915 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
4916 std::swap(a&: LHS, b&: RHS);
4917 std::swap(a&: CanNegateL, b&: CanNegateR);
4918 std::swap(a&: MustBeFirstL, b&: MustBeFirstR);
4919 }
4920
4921 bool NegateR;
4922 bool NegateAfterR;
4923 bool NegateL;
4924 bool NegateAfterAll;
4925 if (Opcode == TargetOpcode::G_OR) {
4926 // Swap the sub-tree that we can negate naturally to the left.
4927 if (!CanNegateL) {
4928 assert(CanNegateR && "at least one side must be negatable");
4929 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
4930 assert(!Negate);
4931 std::swap(a&: LHS, b&: RHS);
4932 NegateR = false;
4933 NegateAfterR = true;
4934 } else {
4935 // Negate the right sub-tree if possible, otherwise negate its result.
4936 NegateR = CanNegateR;
4937 NegateAfterR = !CanNegateR;
4938 }
4939 NegateL = true;
4940 NegateAfterAll = !Negate;
4941 } else {
4942 assert(Opcode == TargetOpcode::G_AND &&
4943 "Valid conjunction/disjunction tree");
4944 assert(!Negate && "Valid conjunction/disjunction tree");
4945
4946 NegateL = false;
4947 NegateR = false;
4948 NegateAfterR = false;
4949 NegateAfterAll = false;
4950 }
4951
4952 // Emit sub-trees.
4953 AArch64CC::CondCode RHSCC;
4954 MachineInstr *CmpR =
4955 emitConjunctionRec(Val: RHS, OutCC&: RHSCC, Negate: NegateR, CCOp, Predicate, MIB);
4956 if (NegateAfterR)
4957 RHSCC = AArch64CC::getInvertedCondCode(Code: RHSCC);
4958 MachineInstr *CmpL = emitConjunctionRec(
4959 Val: LHS, OutCC, Negate: NegateL, CCOp: CmpR->getOperand(i: 0).getReg(), Predicate: RHSCC, MIB);
4960 if (NegateAfterAll)
4961 OutCC = AArch64CC::getInvertedCondCode(Code: OutCC);
4962 return CmpL;
4963}
4964
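/// Emit \p Val as a conjunction: a compare followed by CCMP/FCCMP instructions
/// whose combined result is tested by \p OutCC. Returns nullptr if the
/// expression tree cannot be emitted this way.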
4965MachineInstr *AArch64InstructionSelector::emitConjunction(
4966 Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const {
4967 bool DummyCanNegate;
4968 bool DummyMustBeFirst;
4969 if (!canEmitConjunction(Val, CanNegate&: DummyCanNegate, MustBeFirst&: DummyMustBeFirst, WillNegate: false,
4970 MRI&: *MIB.getMRI()))
4971 return nullptr;
4972 return emitConjunctionRec(Val, OutCC, Negate: false, CCOp: Register(), Predicate: AArch64CC::AL, MIB);
4973}
4974
4975bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI,
4976 MachineInstr &CondMI) {
4977 AArch64CC::CondCode AArch64CC;
4978 MachineInstr *ConjMI = emitConjunction(Val: SelI.getCondReg(), OutCC&: AArch64CC, MIB);
4979 if (!ConjMI)
4980 return false;
4981
4982 emitSelect(Dst: SelI.getReg(Idx: 0), True: SelI.getTrueReg(), False: SelI.getFalseReg(), CC: AArch64CC, MIB);
4983 SelI.eraseFromParent();
4984 return true;
4985}
4986
4987bool AArch64InstructionSelector::tryOptSelect(GSelect &I) {
4988 MachineRegisterInfo &MRI = *MIB.getMRI();
4989 // We want to recognize this pattern:
4990 //
4991 // $z = G_FCMP pred, $x, $y
4992 // ...
4993 // $w = G_SELECT $z, $a, $b
4994 //
4995 // Where the value of $z is *only* ever used by the G_SELECT (possibly with
4996 // some copies/truncs in between.)
4997 //
4998 // If we see this, then we can emit something like this:
4999 //
5000 // fcmp $x, $y
5001 // fcsel $w, $a, $b, pred
5002 //
5003 // Rather than emitting both of the rather long sequences in the standard
5004 // G_FCMP/G_SELECT select methods.
5005
5006 // First, check if the condition is defined by a compare.
5007 MachineInstr *CondDef = MRI.getVRegDef(Reg: I.getOperand(i: 1).getReg());
5008
5009 // We can only fold if all of the defs have one use.
5010 Register CondDefReg = CondDef->getOperand(i: 0).getReg();
5011 if (!MRI.hasOneNonDBGUse(RegNo: CondDefReg)) {
5012 // Unless it's another select.
5013 for (const MachineInstr &UI : MRI.use_nodbg_instructions(Reg: CondDefReg)) {
5014 if (CondDef == &UI)
5015 continue;
5016 if (UI.getOpcode() != TargetOpcode::G_SELECT)
5017 return false;
5018 }
5019 }
5020
5021 // Is the condition defined by a compare?
5022 unsigned CondOpc = CondDef->getOpcode();
5023 if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) {
5024 if (tryOptSelectConjunction(SelI&: I, CondMI&: *CondDef))
5025 return true;
5026 return false;
5027 }
5028
5029 AArch64CC::CondCode CondCode;
5030 if (CondOpc == TargetOpcode::G_ICMP) {
5031 auto &PredOp = CondDef->getOperand(i: 1);
5032 emitIntegerCompare(LHS&: CondDef->getOperand(i: 2), RHS&: CondDef->getOperand(i: 3), Predicate&: PredOp,
5033 MIRBuilder&: MIB);
5034 auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
5035 CondCode =
5036 changeICMPPredToAArch64CC(P: Pred, RHS: CondDef->getOperand(i: 3).getReg(), MRI: &MRI);
5037 } else {
5038 // Get the condition code for the select.
5039 auto Pred =
5040 static_cast<CmpInst::Predicate>(CondDef->getOperand(i: 1).getPredicate());
5041 AArch64CC::CondCode CondCode2;
5042 changeFCMPPredToAArch64CC(P: Pred, CondCode, CondCode2);
5043
5044 // changeFCMPPredToAArch64CC sets CondCode2 to a condition other than AL
5045 // when two instructions are required to emit the comparison, so bail out.
5046 // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
5047 // unnecessary.
5048 if (CondCode2 != AArch64CC::AL)
5049 return false;
5050
5051 if (!emitFPCompare(LHS: CondDef->getOperand(i: 2).getReg(),
5052 RHS: CondDef->getOperand(i: 3).getReg(), MIRBuilder&: MIB)) {
5053 LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
5054 return false;
5055 }
5056 }
5057
5058 // Emit the select.
5059 emitSelect(Dst: I.getOperand(i: 0).getReg(), True: I.getOperand(i: 2).getReg(),
5060 False: I.getOperand(i: 3).getReg(), CC: CondCode, MIB);
5061 I.eraseFromParent();
5062 return true;
5063}
5064
5065MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
5066 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
5067 MachineIRBuilder &MIRBuilder) const {
5068 assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
5069 "Unexpected MachineOperand");
5070 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5071 // We want to find this sort of thing:
5072 // x = G_SUB 0, y
5073 // G_ICMP z, x
5074 //
5075 // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
5076 // e.g:
5077 //
5078 // cmn z, y
5079
5080 // Check if the RHS or LHS of the G_ICMP is defined by a SUB
5081 MachineInstr *LHSDef = getDefIgnoringCopies(Reg: LHS.getReg(), MRI);
5082 MachineInstr *RHSDef = getDefIgnoringCopies(Reg: RHS.getReg(), MRI);
5083 auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
5084
5085 // Given this:
5086 //
5087 // x = G_SUB 0, y
5088 // G_ICMP z, x
5089 //
5090 // Produce this:
5091 //
5092 // cmn z, y
5093 if (isCMN(MaybeSub: RHSDef, Pred: P, MRI))
5094 return emitCMN(LHS, RHS&: RHSDef->getOperand(i: 2), MIRBuilder);
5095
5096 // Same idea here, but with the LHS of the compare instead:
5097 //
5098 // Given this:
5099 //
5100 // x = G_SUB 0, y
5101 // G_ICMP x, z
5102 //
5103 // Produce this:
5104 //
5105 // cmn y, z
5106 //
5107 // But be careful! We need to swap the predicate!
5108 if (isCMN(MaybeSub: LHSDef, Pred: P, MRI)) {
5109 if (!CmpInst::isEquality(pred: P)) {
5110 P = CmpInst::getSwappedPredicate(pred: P);
5111 Predicate = MachineOperand::CreatePredicate(Pred: P);
5112 }
5113 return emitCMN(LHS&: LHSDef->getOperand(i: 2), RHS, MIRBuilder);
5114 }
5115
5116 // Given this:
5117 //
5118 // z = G_AND x, y
5119 // G_ICMP z, 0
5120 //
5121 // Produce this if the compare is signed or an equality compare:
5122 //
5123 // tst x, y
5124 if (!CmpInst::isUnsigned(predicate: P) && LHSDef &&
5125 LHSDef->getOpcode() == TargetOpcode::G_AND) {
5126 // Make sure that the RHS is 0.
5127 auto ValAndVReg = getIConstantVRegValWithLookThrough(VReg: RHS.getReg(), MRI);
5128 if (!ValAndVReg || ValAndVReg->Value != 0)
5129 return nullptr;
5130
5131 return emitTST(LHS&: LHSDef->getOperand(i: 1),
5132 RHS&: LHSDef->getOperand(i: 2), MIRBuilder);
5133 }
5134
5135 return nullptr;
5136}
5137
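/// Select G_SHUFFLE_VECTOR by loading the shuffle mask as a vector of byte
/// indices from the constant pool and emitting a TBL: TBL1 over a concat of
/// the sources for 64-bit results, TBL2 over a Q-register tuple for 128-bit
/// results.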
5138bool AArch64InstructionSelector::selectShuffleVector(
5139 MachineInstr &I, MachineRegisterInfo &MRI) {
5140 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
5141 Register Src1Reg = I.getOperand(i: 1).getReg();
5142 Register Src2Reg = I.getOperand(i: 2).getReg();
5143 ArrayRef<int> Mask = I.getOperand(i: 3).getShuffleMask();
5144
5145 MachineBasicBlock &MBB = *I.getParent();
5146 MachineFunction &MF = *MBB.getParent();
5147 LLVMContext &Ctx = MF.getFunction().getContext();
5148
5149 unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
5150
5151 SmallVector<Constant *, 64> CstIdxs;
5152 for (int Val : Mask) {
5153 // For now, we'll just assume any undef indexes to be 0. This should be
5154 // optimized in the future, e.g. to select DUP etc.
5155 Val = Val < 0 ? 0 : Val;
5156 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
5157 unsigned Offset = Byte + Val * BytesPerElt;
5158 CstIdxs.emplace_back(Args: ConstantInt::get(Ty: Type::getInt8Ty(C&: Ctx), V: Offset));
5159 }
5160 }
5161
5162 // Use a constant pool to load the index vector for TBL.
5163 Constant *CPVal = ConstantVector::get(V: CstIdxs);
5164 MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder&: MIB);
5165 if (!IndexLoad) {
5166 LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
5167 return false;
5168 }
5169
5170 if (DstTy.getSizeInBits() != 128) {
5171 assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
5172 // This case can be done with TBL1.
5173 MachineInstr *Concat =
5174 emitVectorConcat(Dst: std::nullopt, Op1: Src1Reg, Op2: Src2Reg, MIRBuilder&: MIB);
5175 if (!Concat) {
5176 LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
5177 return false;
5178 }
5179
5180 // The constant pool load will be 64 bits, so convert it to an FPR128 reg.
5181 IndexLoad = emitScalarToVector(EltSize: 64, DstRC: &AArch64::FPR128RegClass,
5182 Scalar: IndexLoad->getOperand(i: 0).getReg(), MIRBuilder&: MIB);
5183
5184 auto TBL1 = MIB.buildInstr(
5185 Opc: AArch64::TBLv16i8One, DstOps: {&AArch64::FPR128RegClass},
5186 SrcOps: {Concat->getOperand(i: 0).getReg(), IndexLoad->getOperand(i: 0).getReg()});
5187 constrainSelectedInstRegOperands(I&: *TBL1, TII, TRI, RBI);
5188
5189 auto Copy =
5190 MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {I.getOperand(i: 0).getReg()}, SrcOps: {})
5191 .addReg(RegNo: TBL1.getReg(Idx: 0), Flags: {}, SubReg: AArch64::dsub);
5192 RBI.constrainGenericRegister(Reg: Copy.getReg(Idx: 0), RC: AArch64::FPR64RegClass, MRI);
5193 I.eraseFromParent();
5194 return true;
5195 }
5196
5197 // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
5198 // Q registers for regalloc.
5199 SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg};
5200 auto RegSeq = createQTuple(Regs, MIB);
5201 auto TBL2 = MIB.buildInstr(Opc: AArch64::TBLv16i8Two, DstOps: {I.getOperand(i: 0)},
5202 SrcOps: {RegSeq, IndexLoad->getOperand(i: 0)});
5203 constrainSelectedInstRegOperands(I&: *TBL2, TII, TRI, RBI);
5204 I.eraseFromParent();
5205 return true;
5206}
5207
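/// Emit an INS of \p EltReg into lane \p LaneIdx of \p SrcReg, creating a
/// fresh FPR128 destination if \p DstReg is not provided. FPR-bank elements
/// are first widened to a vector register via emitScalarToVector.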
5208MachineInstr *AArch64InstructionSelector::emitLaneInsert(
5209 std::optional<Register> DstReg, Register SrcReg, Register EltReg,
5210 unsigned LaneIdx, const RegisterBank &RB,
5211 MachineIRBuilder &MIRBuilder) const {
5212 MachineInstr *InsElt = nullptr;
5213 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5214 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5215
5216 // Create a register to define with the insert if one wasn't passed in.
5217 if (!DstReg)
5218 DstReg = MRI.createVirtualRegister(RegClass: DstRC);
5219
5220 unsigned EltSize = MRI.getType(Reg: EltReg).getSizeInBits();
5221 unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;
5222
5223 if (RB.getID() == AArch64::FPRRegBankID) {
5224 auto InsSub = emitScalarToVector(EltSize, DstRC, Scalar: EltReg, MIRBuilder);
5225 InsElt = MIRBuilder.buildInstr(Opc, DstOps: {*DstReg}, SrcOps: {SrcReg})
5226 .addImm(Val: LaneIdx)
5227 .addUse(RegNo: InsSub->getOperand(i: 0).getReg())
5228 .addImm(Val: 0);
5229 } else {
5230 InsElt = MIRBuilder.buildInstr(Opc, DstOps: {*DstReg}, SrcOps: {SrcReg})
5231 .addImm(Val: LaneIdx)
5232 .addUse(RegNo: EltReg);
5233 }
5234
5235 constrainSelectedInstRegOperands(I&: *InsElt, TII, TRI, RBI);
5236 return InsElt;
5237}
5238
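/// Select a G_SEXT/G_ZEXT/G_ANYEXT of a G_EXTRACT_VECTOR_ELT with a constant
/// lane index as a single SMOV/UMOV lane copy (plus a SUBREG_TO_REG for
/// 64-bit unsigned results).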
5239bool AArch64InstructionSelector::selectUSMovFromExtend(
5240 MachineInstr &MI, MachineRegisterInfo &MRI) {
5241 if (MI.getOpcode() != TargetOpcode::G_SEXT &&
5242 MI.getOpcode() != TargetOpcode::G_ZEXT &&
5243 MI.getOpcode() != TargetOpcode::G_ANYEXT)
5244 return false;
5245 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT;
5246 const Register DefReg = MI.getOperand(i: 0).getReg();
5247 const LLT DstTy = MRI.getType(Reg: DefReg);
5248 unsigned DstSize = DstTy.getSizeInBits();
5249
5250 if (DstSize != 32 && DstSize != 64)
5251 return false;
5252
5253 MachineInstr *Extract = getOpcodeDef(Opcode: TargetOpcode::G_EXTRACT_VECTOR_ELT,
5254 Reg: MI.getOperand(i: 1).getReg(), MRI);
5255 int64_t Lane;
5256 if (!Extract || !mi_match(R: Extract->getOperand(i: 2).getReg(), MRI, P: m_ICst(Cst&: Lane)))
5257 return false;
5258 Register Src0 = Extract->getOperand(i: 1).getReg();
5259
5260 const LLT VecTy = MRI.getType(Reg: Src0);
5261 if (VecTy.isScalableVector())
5262 return false;
5263
5264 if (VecTy.getSizeInBits() != 128) {
5265 const MachineInstr *ScalarToVector = emitScalarToVector(
5266 EltSize: VecTy.getSizeInBits(), DstRC: &AArch64::FPR128RegClass, Scalar: Src0, MIRBuilder&: MIB);
5267 assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!");
5268 Src0 = ScalarToVector->getOperand(i: 0).getReg();
5269 }
5270
5271 unsigned Opcode;
5272 if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32)
5273 Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32;
5274 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16)
5275 Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16;
5276 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8)
5277 Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8;
5278 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16)
5279 Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16;
5280 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8)
5281 Opcode = IsSigned ? AArch64::SMOVvi8to32 : AArch64::UMOVvi8;
5282 else
5283 llvm_unreachable("Unexpected type combo for S/UMov!");
5284
5285 // We may need to generate one of these, depending on the type and sign of the
5286 // input:
5287 // DstReg = SMOV Src0, Lane;
5288 // NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32;
5289 MachineInstr *ExtI = nullptr;
5290 if (DstSize == 64 && !IsSigned) {
5291 Register NewReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
5292 MIB.buildInstr(Opc: Opcode, DstOps: {NewReg}, SrcOps: {Src0}).addImm(Val: Lane);
5293 ExtI = MIB.buildInstr(Opc: AArch64::SUBREG_TO_REG, DstOps: {DefReg}, SrcOps: {})
5294 .addImm(Val: 0)
5295 .addUse(RegNo: NewReg)
5296 .addImm(Val: AArch64::sub_32);
5297 RBI.constrainGenericRegister(Reg: DefReg, RC: AArch64::GPR64RegClass, MRI);
5298 } else
5299 ExtI = MIB.buildInstr(Opc: Opcode, DstOps: {DefReg}, SrcOps: {Src0}).addImm(Val: Lane);
5300
5301 constrainSelectedInstRegOperands(I&: *ExtI, TII, TRI, RBI);
5302 MI.eraseFromParent();
5303 return true;
5304}
5305
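/// The tryAdvSIMDModImm* helpers below try to materialize a splatted vector
/// constant with a single MOVI/MVNI/FMOV (vector, immediate) of the matching
/// element width, returning nullptr when the bit pattern does not fit the
/// corresponding encoding.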
5306MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm8(
5307 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5308 unsigned int Op;
5309 if (DstSize == 128) {
5310 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5311 return nullptr;
5312 Op = AArch64::MOVIv16b_ns;
5313 } else {
5314 Op = AArch64::MOVIv8b_ns;
5315 }
5316
5317 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5318
5319 if (AArch64_AM::isAdvSIMDModImmType9(Imm: Val)) {
5320 Val = AArch64_AM::encodeAdvSIMDModImmType9(Imm: Val);
5321 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val);
5322 constrainSelectedInstRegOperands(I&: *Mov, TII, TRI, RBI);
5323 return &*Mov;
5324 }
5325 return nullptr;
5326}
5327
5328MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm16(
5329 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5330 bool Inv) {
5331
5332 unsigned int Op;
5333 if (DstSize == 128) {
5334 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5335 return nullptr;
5336 Op = Inv ? AArch64::MVNIv8i16 : AArch64::MOVIv8i16;
5337 } else {
5338 Op = Inv ? AArch64::MVNIv4i16 : AArch64::MOVIv4i16;
5339 }
5340
5341 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5342 uint64_t Shift;
5343
5344 if (AArch64_AM::isAdvSIMDModImmType5(Imm: Val)) {
5345 Val = AArch64_AM::encodeAdvSIMDModImmType5(Imm: Val);
5346 Shift = 0;
5347 } else if (AArch64_AM::isAdvSIMDModImmType6(Imm: Val)) {
5348 Val = AArch64_AM::encodeAdvSIMDModImmType6(Imm: Val);
5349 Shift = 8;
5350 } else
5351 return nullptr;
5352
5353 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val).addImm(Val: Shift);
5354 constrainSelectedInstRegOperands(I&: *Mov, TII, TRI, RBI);
5355 return &*Mov;
5356}
5357
5358MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm32(
5359 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5360 bool Inv) {
5361
5362 unsigned int Op;
5363 if (DstSize == 128) {
5364 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5365 return nullptr;
5366 Op = Inv ? AArch64::MVNIv4i32 : AArch64::MOVIv4i32;
5367 } else {
5368 Op = Inv ? AArch64::MVNIv2i32 : AArch64::MOVIv2i32;
5369 }
5370
5371 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5372 uint64_t Shift;
5373
5374 if ((AArch64_AM::isAdvSIMDModImmType1(Imm: Val))) {
5375 Val = AArch64_AM::encodeAdvSIMDModImmType1(Imm: Val);
5376 Shift = 0;
5377 } else if ((AArch64_AM::isAdvSIMDModImmType2(Imm: Val))) {
5378 Val = AArch64_AM::encodeAdvSIMDModImmType2(Imm: Val);
5379 Shift = 8;
5380 } else if ((AArch64_AM::isAdvSIMDModImmType3(Imm: Val))) {
5381 Val = AArch64_AM::encodeAdvSIMDModImmType3(Imm: Val);
5382 Shift = 16;
5383 } else if ((AArch64_AM::isAdvSIMDModImmType4(Imm: Val))) {
5384 Val = AArch64_AM::encodeAdvSIMDModImmType4(Imm: Val);
5385 Shift = 24;
5386 } else
5387 return nullptr;
5388
5389 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val).addImm(Val: Shift);
5390 constrainSelectedInstRegOperands(I&: *Mov, TII, TRI, RBI);
5391 return &*Mov;
5392}
5393
5394MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm64(
5395 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5396
5397 unsigned int Op;
5398 if (DstSize == 128) {
5399 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5400 return nullptr;
5401 Op = AArch64::MOVIv2d_ns;
5402 } else {
5403 Op = AArch64::MOVID;
5404 }
5405
5406 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5407 if (AArch64_AM::isAdvSIMDModImmType10(Imm: Val)) {
5408 Val = AArch64_AM::encodeAdvSIMDModImmType10(Imm: Val);
5409 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val);
5410 constrainSelectedInstRegOperands(I&: *Mov, TII, TRI, RBI);
5411 return &*Mov;
5412 }
5413 return nullptr;
5414}
5415
5416MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm321s(
5417 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5418 bool Inv) {
5419
5420 unsigned int Op;
5421 if (DstSize == 128) {
5422 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5423 return nullptr;
5424 Op = Inv ? AArch64::MVNIv4s_msl : AArch64::MOVIv4s_msl;
5425 } else {
5426 Op = Inv ? AArch64::MVNIv2s_msl : AArch64::MOVIv2s_msl;
5427 }
5428
5429 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5430 uint64_t Shift;
5431
5432 if (AArch64_AM::isAdvSIMDModImmType7(Imm: Val)) {
5433 Val = AArch64_AM::encodeAdvSIMDModImmType7(Imm: Val);
5434 Shift = 264;
5435 } else if (AArch64_AM::isAdvSIMDModImmType8(Imm: Val)) {
5436 Val = AArch64_AM::encodeAdvSIMDModImmType8(Imm: Val);
5437 Shift = 272;
5438 } else
5439 return nullptr;
5440
5441 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val).addImm(Val: Shift);
5442 constrainSelectedInstRegOperands(I&: *Mov, TII, TRI, RBI);
5443 return &*Mov;
5444}
5445
5446MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImmFP(
5447 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5448
5449 unsigned int Op;
5450 bool IsWide = false;
5451 if (DstSize == 128) {
5452 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5453 return nullptr;
5454 Op = AArch64::FMOVv4f32_ns;
5455 IsWide = true;
5456 } else {
5457 Op = AArch64::FMOVv2f32_ns;
5458 }
5459
5460 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5461
5462 if (AArch64_AM::isAdvSIMDModImmType11(Imm: Val)) {
5463 Val = AArch64_AM::encodeAdvSIMDModImmType11(Imm: Val);
5464 } else if (IsWide && AArch64_AM::isAdvSIMDModImmType12(Imm: Val)) {
5465 Val = AArch64_AM::encodeAdvSIMDModImmType12(Imm: Val);
5466 Op = AArch64::FMOVv2f64_ns;
5467 } else
5468 return nullptr;
5469
5470 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val);
5471 constrainSelectedInstRegOperands(I&: *Mov, TII, TRI, RBI);
5472 return &*Mov;
5473}
5474
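/// Select a pre/post-indexed extending load. The load is emitted with a
/// possibly narrower destination type, and the result is then placed into the
/// destination register via SUBREG_TO_REG or a plain copy.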
5475bool AArch64InstructionSelector::selectIndexedExtLoad(
5476 MachineInstr &MI, MachineRegisterInfo &MRI) {
5477 auto &ExtLd = cast<GIndexedAnyExtLoad>(Val&: MI);
5478 Register Dst = ExtLd.getDstReg();
5479 Register WriteBack = ExtLd.getWritebackReg();
5480 Register Base = ExtLd.getBaseReg();
5481 Register Offset = ExtLd.getOffsetReg();
5482 LLT Ty = MRI.getType(Reg: Dst);
5483 assert(Ty.getSizeInBits() <= 64); // Only for scalar GPRs.
5484 unsigned MemSizeBits = ExtLd.getMMO().getMemoryType().getSizeInBits();
5485 bool IsPre = ExtLd.isPre();
5486 bool IsSExt = isa<GIndexedSExtLoad>(Val: ExtLd);
5487 unsigned InsertIntoSubReg = 0;
5488 bool IsDst64 = Ty.getSizeInBits() == 64;
5489
5490 // Sign-extending loads must go to the GPR bank, but we can handle any-ext
5491 // and zero-ext loads into FPR, as long as the type is scalar.
5492 bool IsFPR = RBI.getRegBank(Reg: Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID;
5493 if ((IsSExt && IsFPR) || Ty.isVector())
5494 return false;
5495
5496 unsigned Opc = 0;
5497 LLT NewLdDstTy;
5498 LLT s32 = LLT::scalar(SizeInBits: 32);
5499 LLT s64 = LLT::scalar(SizeInBits: 64);
5500
5501 if (MemSizeBits == 8) {
5502 if (IsSExt) {
5503 if (IsDst64)
5504 Opc = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
5505 else
5506 Opc = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
5507 NewLdDstTy = IsDst64 ? s64 : s32;
5508 } else if (IsFPR) {
5509 Opc = IsPre ? AArch64::LDRBpre : AArch64::LDRBpost;
5510 InsertIntoSubReg = AArch64::bsub;
5511 NewLdDstTy = LLT::scalar(SizeInBits: MemSizeBits);
5512 } else {
5513 Opc = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
5514 InsertIntoSubReg = IsDst64 ? AArch64::sub_32 : 0;
5515 NewLdDstTy = s32;
5516 }
5517 } else if (MemSizeBits == 16) {
5518 if (IsSExt) {
5519 if (IsDst64)
5520 Opc = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
5521 else
5522 Opc = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
5523 NewLdDstTy = IsDst64 ? s64 : s32;
5524 } else if (IsFPR) {
5525 Opc = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
5526 InsertIntoSubReg = AArch64::hsub;
5527 NewLdDstTy = LLT::scalar(SizeInBits: MemSizeBits);
5528 } else {
5529 Opc = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
5530 InsertIntoSubReg = IsDst64 ? AArch64::sub_32 : 0;
5531 NewLdDstTy = s32;
5532 }
5533 } else if (MemSizeBits == 32) {
5534 if (IsSExt) {
5535 Opc = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
5536 NewLdDstTy = s64;
5537 } else if (IsFPR) {
5538 Opc = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
5539 InsertIntoSubReg = AArch64::ssub;
5540 NewLdDstTy = LLT::scalar(SizeInBits: MemSizeBits);
5541 } else {
5542 Opc = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
5543 InsertIntoSubReg = IsDst64 ? AArch64::sub_32 : 0;
5544 NewLdDstTy = s32;
5545 }
5546 } else {
5547 llvm_unreachable("Unexpected size for indexed load");
5548 }
5549
5550 auto Cst = getIConstantVRegVal(VReg: Offset, MRI);
5551 if (!Cst)
5552 return false; // Shouldn't happen, but just in case.
5553
5554 auto LdMI = MIB.buildInstr(Opc, DstOps: {WriteBack, NewLdDstTy}, SrcOps: {Base})
5555 .addImm(Val: Cst->getSExtValue());
5556 LdMI.cloneMemRefs(OtherMI: ExtLd);
5557 constrainSelectedInstRegOperands(I&: *LdMI, TII, TRI, RBI);
5558 // Make sure to select the load with the MemTy as the dest type, and then
5559 // insert into a larger reg if needed.
5560 if (InsertIntoSubReg) {
5561 // Generate a SUBREG_TO_REG.
5562 auto SubToReg = MIB.buildInstr(Opc: TargetOpcode::SUBREG_TO_REG, DstOps: {Dst}, SrcOps: {})
5563 .addImm(Val: 0)
5564 .addUse(RegNo: LdMI.getReg(Idx: 1))
5565 .addImm(Val: InsertIntoSubReg);
5566 RBI.constrainGenericRegister(
5567 Reg: SubToReg.getReg(Idx: 0),
5568 RC: *getRegClassForTypeOnBank(Ty: MRI.getType(Reg: Dst),
5569 RB: *RBI.getRegBank(Reg: Dst, MRI, TRI)),
5570 MRI);
5571 } else {
5572 auto Copy = MIB.buildCopy(Res: Dst, Op: LdMI.getReg(Idx: 1));
5573 selectCopy(I&: *Copy, TII, MRI, TRI, RBI);
5574 }
5575 MI.eraseFromParent();
5576
5577 return true;
5578}
5579
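/// Select a pre/post-indexed load, dispatching to selectIndexedExtLoad when
/// the memory size is smaller than the destination type.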
5580bool AArch64InstructionSelector::selectIndexedLoad(MachineInstr &MI,
5581 MachineRegisterInfo &MRI) {
5582 auto &Ld = cast<GIndexedLoad>(Val&: MI);
5583 Register Dst = Ld.getDstReg();
5584 Register WriteBack = Ld.getWritebackReg();
5585 Register Base = Ld.getBaseReg();
5586 Register Offset = Ld.getOffsetReg();
5587 assert(MRI.getType(Dst).getSizeInBits() <= 128 &&
5588 "Unexpected type for indexed load");
5589 unsigned MemSize = Ld.getMMO().getMemoryType().getSizeInBytes();
5590
5591 if (MemSize < MRI.getType(Reg: Dst).getSizeInBytes())
5592 return selectIndexedExtLoad(MI, MRI);
5593
5594 unsigned Opc = 0;
5595 if (Ld.isPre()) {
5596 static constexpr unsigned GPROpcodes[] = {
5597 AArch64::LDRBBpre, AArch64::LDRHHpre, AArch64::LDRWpre,
5598 AArch64::LDRXpre};
5599 static constexpr unsigned FPROpcodes[] = {
5600 AArch64::LDRBpre, AArch64::LDRHpre, AArch64::LDRSpre, AArch64::LDRDpre,
5601 AArch64::LDRQpre};
5602 Opc = (RBI.getRegBank(Reg: Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5603 ? FPROpcodes[Log2_32(Value: MemSize)]
5604 : GPROpcodes[Log2_32(Value: MemSize)];
5606 } else {
5607 static constexpr unsigned GPROpcodes[] = {
5608 AArch64::LDRBBpost, AArch64::LDRHHpost, AArch64::LDRWpost,
5609 AArch64::LDRXpost};
5610 static constexpr unsigned FPROpcodes[] = {
5611 AArch64::LDRBpost, AArch64::LDRHpost, AArch64::LDRSpost,
5612 AArch64::LDRDpost, AArch64::LDRQpost};
5613 Opc = (RBI.getRegBank(Reg: Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5614 ? FPROpcodes[Log2_32(Value: MemSize)]
5615 : GPROpcodes[Log2_32(Value: MemSize)];
5617 }
5618 auto Cst = getIConstantVRegVal(VReg: Offset, MRI);
5619 if (!Cst)
5620 return false; // Shouldn't happen, but just in case.
5621 auto LdMI =
5622 MIB.buildInstr(Opc, DstOps: {WriteBack, Dst}, SrcOps: {Base}).addImm(Val: Cst->getSExtValue());
5623 LdMI.cloneMemRefs(OtherMI: Ld);
5624 constrainSelectedInstRegOperands(I&: *LdMI, TII, TRI, RBI);
5625 MI.eraseFromParent();
5626 return true;
5627}
5628
5629bool AArch64InstructionSelector::selectIndexedStore(GIndexedStore &I,
5630 MachineRegisterInfo &MRI) {
5631 Register Dst = I.getWritebackReg();
5632 Register Val = I.getValueReg();
5633 Register Base = I.getBaseReg();
5634 Register Offset = I.getOffsetReg();
5635 assert(MRI.getType(Val).getSizeInBits() <= 128 &&
5636 "Unexpected type for indexed store");
5637
5638 LocationSize MemSize = I.getMMO().getSize();
5639 unsigned MemSizeInBytes = MemSize.getValue();
5640
5641 assert(MemSizeInBytes && MemSizeInBytes <= 16 &&
5642 "Unexpected indexed store size");
5643 unsigned MemSizeLog2 = Log2_32(Value: MemSizeInBytes);
5644
5645 unsigned Opc = 0;
5646 if (I.isPre()) {
5647 static constexpr unsigned GPROpcodes[] = {
5648 AArch64::STRBBpre, AArch64::STRHHpre, AArch64::STRWpre,
5649 AArch64::STRXpre};
5650 static constexpr unsigned FPROpcodes[] = {
5651 AArch64::STRBpre, AArch64::STRHpre, AArch64::STRSpre, AArch64::STRDpre,
5652 AArch64::STRQpre};
5653
5654 if (RBI.getRegBank(Reg: Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5655 Opc = FPROpcodes[MemSizeLog2];
5656 else
5657 Opc = GPROpcodes[MemSizeLog2];
5658 } else {
5659 static constexpr unsigned GPROpcodes[] = {
5660 AArch64::STRBBpost, AArch64::STRHHpost, AArch64::STRWpost,
5661 AArch64::STRXpost};
5662 static constexpr unsigned FPROpcodes[] = {
5663 AArch64::STRBpost, AArch64::STRHpost, AArch64::STRSpost,
5664 AArch64::STRDpost, AArch64::STRQpost};
5665
5666 if (RBI.getRegBank(Reg: Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5667 Opc = FPROpcodes[MemSizeLog2];
5668 else
5669 Opc = GPROpcodes[MemSizeLog2];
5670 }
5671
5672 auto Cst = getIConstantVRegVal(VReg: Offset, MRI);
5673 if (!Cst)
5674 return false; // Shouldn't happen, but just in case.
5675 auto Str =
5676 MIB.buildInstr(Opc, DstOps: {Dst}, SrcOps: {Val, Base}).addImm(Val: Cst->getSExtValue());
5677 Str.cloneMemRefs(OtherMI: I);
5678 constrainSelectedInstRegOperands(I&: *Str, TII, TRI, RBI);
5679 I.eraseFromParent();
5680 return true;
5681}
5682
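/// Materialize the vector constant \p CV into \p Dst. Zero vectors use
/// MOVIv2d_ns; splats are tried against the MOVI/MVNI/FMOV immediate forms
/// (also through an FNEG of an encodable constant); anything else falls back
/// to a constant pool load.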
5683MachineInstr *
5684AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
5685 MachineIRBuilder &MIRBuilder,
5686 MachineRegisterInfo &MRI) {
5687 LLT DstTy = MRI.getType(Reg: Dst);
5688 unsigned DstSize = DstTy.getSizeInBits();
5689 assert((DstSize == 64 || DstSize == 128) &&
5690 "Unexpected vector constant size");
5691
5692 if (CV->isNullValue()) {
5693 if (DstSize == 128) {
5694 auto Mov =
5695 MIRBuilder.buildInstr(Opc: AArch64::MOVIv2d_ns, DstOps: {Dst}, SrcOps: {}).addImm(Val: 0);
5696 constrainSelectedInstRegOperands(I&: *Mov, TII, TRI, RBI);
5697 return &*Mov;
5698 }
5699
5700 if (DstSize == 64) {
5701 auto Mov =
5702 MIRBuilder
5703 .buildInstr(Opc: AArch64::MOVIv2d_ns, DstOps: {&AArch64::FPR128RegClass}, SrcOps: {})
5704 .addImm(Val: 0);
5705 auto Copy = MIRBuilder.buildInstr(Opc: TargetOpcode::COPY, DstOps: {Dst}, SrcOps: {})
5706 .addReg(RegNo: Mov.getReg(Idx: 0), Flags: {}, SubReg: AArch64::dsub);
5707 RBI.constrainGenericRegister(Reg: Dst, RC: AArch64::FPR64RegClass, MRI);
5708 return &*Copy;
5709 }
5710 }
5711
5712 if (Constant *SplatValue = CV->getSplatValue()) {
5713 APInt SplatValueAsInt =
5714 isa<ConstantFP>(Val: SplatValue)
5715 ? cast<ConstantFP>(Val: SplatValue)->getValueAPF().bitcastToAPInt()
5716 : SplatValue->getUniqueInteger();
5717 APInt DefBits = APInt::getSplat(
5718 NewLen: DstSize, V: SplatValueAsInt.trunc(width: DstTy.getScalarSizeInBits()));
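// Worked example (illustrative, not from the original source): a <4 x s16>
// splat of 0x1234 in a 64-bit vector yields
// DefBits == APInt(64, 0x1234123412341234).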
5719 auto TryMOVIWithBits = [&](APInt DefBits) -> MachineInstr * {
5720 MachineInstr *NewOp;
5721 bool Inv = false;
5722 if ((NewOp = tryAdvSIMDModImm64(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder)) ||
5723 (NewOp =
5724 tryAdvSIMDModImm32(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
5725 (NewOp =
5726 tryAdvSIMDModImm321s(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
5727 (NewOp =
5728 tryAdvSIMDModImm16(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
5729 (NewOp = tryAdvSIMDModImm8(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder)) ||
5730 (NewOp = tryAdvSIMDModImmFP(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder)))
5731 return NewOp;
5732
5733 DefBits = ~DefBits;
5734 Inv = true;
5735 if ((NewOp =
5736 tryAdvSIMDModImm32(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
5737 (NewOp =
5738 tryAdvSIMDModImm321s(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
5739 (NewOp = tryAdvSIMDModImm16(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)))
5740 return NewOp;
5741 return nullptr;
5742 };
5743
5744 if (auto *NewOp = TryMOVIWithBits(DefBits))
5745 return NewOp;
5746
5747 // See if an fneg of the constant can be materialized with a MOVI, etc.
5748 auto TryWithFNeg = [&](APInt DefBits, int NumBits,
5749 unsigned NegOpc) -> MachineInstr * {
5750 // FNegate each sub-element of the constant
5751 APInt Neg = APInt::getHighBitsSet(numBits: NumBits, hiBitsSet: 1).zext(width: DstSize);
5752 APInt NegBits(DstSize, 0);
5753 unsigned NumElts = DstSize / NumBits;
5754 for (unsigned i = 0; i < NumElts; i++)
5755 NegBits |= Neg << (NumBits * i);
5756 NegBits = DefBits ^ NegBits;
5757
5758 // Try to create the new constants with MOVI, and if so generate a fneg
5759 // for it.
5760 if (auto *NewOp = TryMOVIWithBits(NegBits)) {
5761 Register NewDst = MRI.createVirtualRegister(
5762 RegClass: DstSize == 64 ? &AArch64::FPR64RegClass : &AArch64::FPR128RegClass);
5763 NewOp->getOperand(i: 0).setReg(NewDst);
5764 return MIRBuilder.buildInstr(Opc: NegOpc, DstOps: {Dst}, SrcOps: {NewDst});
5765 }
5766 return nullptr;
5767 };
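// Illustrative sketch (assumption, not in the original source): for 32-bit
// sub-elements of a 64-bit constant, Neg accumulates to 0x8000000080000000,
// so NegBits is DefBits with each element's sign bit flipped; if that value
// is MOVI-encodable, the constant is rebuilt as a MOVI followed by
// FNEGv2f32.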
5768 MachineInstr *R;
5769 if ((R = TryWithFNeg(DefBits, 32,
5770 DstSize == 64 ? AArch64::FNEGv2f32
5771 : AArch64::FNEGv4f32)) ||
5772 (R = TryWithFNeg(DefBits, 64,
5773 DstSize == 64 ? AArch64::FNEGDr
5774 : AArch64::FNEGv2f64)) ||
5775 (STI.hasFullFP16() &&
5776 (R = TryWithFNeg(DefBits, 16,
5777 DstSize == 64 ? AArch64::FNEGv4f16
5778 : AArch64::FNEGv8f16))))
5779 return R;
5780 }
5781
5782 auto *CPLoad = emitLoadFromConstantPool(CPVal: CV, MIRBuilder);
5783 if (!CPLoad) {
5784 LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!\n");
5785 return nullptr;
5786 }
5787
5788 auto Copy = MIRBuilder.buildCopy(Res: Dst, Op: CPLoad->getOperand(i: 0));
5789 RBI.constrainGenericRegister(
5790 Reg: Dst, RC: *MRI.getRegClass(Reg: CPLoad->getOperand(i: 0).getReg()), MRI);
5791 return &*Copy;
5792}
5793
5794bool AArch64InstructionSelector::tryOptConstantBuildVec(
5795 MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) {
5796 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5797 unsigned DstSize = DstTy.getSizeInBits();
5798 assert(DstSize <= 128 && "Unexpected build_vec type!");
5799 if (DstSize < 32)
5800 return false;
5801 // Check if we're building a constant vector, in which case we want to
5802 // generate a constant pool load instead of a vector insert sequence.
5803 SmallVector<Constant *, 16> Csts;
5804 for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
5805 // Try to find G_CONSTANT or G_FCONSTANT
5806 auto *OpMI =
5807 getOpcodeDef(Opcode: TargetOpcode::G_CONSTANT, Reg: I.getOperand(i: Idx).getReg(), MRI);
5808 if (OpMI)
5809 Csts.emplace_back(
5810 Args: const_cast<ConstantInt *>(OpMI->getOperand(i: 1).getCImm()));
5811 else if ((OpMI = getOpcodeDef(Opcode: TargetOpcode::G_FCONSTANT,
5812 Reg: I.getOperand(i: Idx).getReg(), MRI)))
5813 Csts.emplace_back(
5814 Args: const_cast<ConstantFP *>(OpMI->getOperand(i: 1).getFPImm()));
5815 else
5816 return false;
5817 }
5818 Constant *CV = ConstantVector::get(V: Csts);
5819 if (!emitConstantVector(Dst: I.getOperand(i: 0).getReg(), CV, MIRBuilder&: MIB, MRI))
5820 return false;
5821 I.eraseFromParent();
5822 return true;
5823}
5824
5825bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg(
5826 MachineInstr &I, MachineRegisterInfo &MRI) {
5827 // Given:
5828 // %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef
5829 //
5830 // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt.
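//
// For illustration only (assumed MIR, not from the original source), a
// <4 x s32> build vector whose lanes 1-3 are undef becomes roughly:
//   %vec:fpr128 = SUBREG_TO_REG 0, %elt:fpr32, %subreg.ssub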
5831 Register Dst = I.getOperand(i: 0).getReg();
5832 Register EltReg = I.getOperand(i: 1).getReg();
5833 LLT EltTy = MRI.getType(Reg: EltReg);
5834 // If the element isn't on the same register bank as the destination
5835 // vector, then this can't be a SUBREG_TO_REG.
5836 const RegisterBank &EltRB = *RBI.getRegBank(Reg: EltReg, MRI, TRI);
5837 const RegisterBank &DstRB = *RBI.getRegBank(Reg: Dst, MRI, TRI);
5838 if (EltRB != DstRB)
5839 return false;
5840 if (any_of(Range: drop_begin(RangeOrContainer: I.operands(), N: 2), P: [&MRI](const MachineOperand &Op) {
5841 return !getOpcodeDef(Opcode: TargetOpcode::G_IMPLICIT_DEF, Reg: Op.getReg(), MRI);
5842 }))
5843 return false;
5844 unsigned SubReg;
5845 const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(Ty: EltTy, RB: EltRB);
5846 if (!EltRC)
5847 return false;
5848 const TargetRegisterClass *DstRC =
5849 getRegClassForTypeOnBank(Ty: MRI.getType(Reg: Dst), RB: DstRB);
5850 if (!DstRC)
5851 return false;
5852 if (!getSubRegForClass(RC: EltRC, TRI, SubReg))
5853 return false;
5854 auto SubregToReg = MIB.buildInstr(Opc: AArch64::SUBREG_TO_REG, DstOps: {Dst}, SrcOps: {})
5855 .addImm(Val: 0)
5856 .addUse(RegNo: EltReg)
5857 .addImm(Val: SubReg);
5858 I.eraseFromParent();
5859 constrainSelectedInstRegOperands(I&: *SubregToReg, TII, TRI, RBI);
5860 return RBI.constrainGenericRegister(Reg: Dst, RC: *DstRC, MRI);
5861}
5862
5863bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
5864 MachineRegisterInfo &MRI) {
5865 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5866 // Until we port more of the optimized selections, for now just use a vector
5867 // insert sequence.
5868 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
5869 const LLT EltTy = MRI.getType(Reg: I.getOperand(i: 1).getReg());
5870 unsigned EltSize = EltTy.getSizeInBits();
5871
5872 if (tryOptConstantBuildVec(I, DstTy, MRI))
5873 return true;
5874 if (tryOptBuildVecToSubregToReg(I, MRI))
5875 return true;
5876
5877 if (EltSize != 8 && EltSize != 16 && EltSize != 32 && EltSize != 64)
5878 return false; // Don't support all element types yet.
5879 const RegisterBank &RB = *RBI.getRegBank(Reg: I.getOperand(i: 1).getReg(), MRI, TRI);
5880
5881 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5882 MachineInstr *ScalarToVec =
5883 emitScalarToVector(EltSize: DstTy.getElementType().getSizeInBits(), DstRC,
5884 Scalar: I.getOperand(i: 1).getReg(), MIRBuilder&: MIB);
5885 if (!ScalarToVec)
5886 return false;
5887
5888 Register DstVec = ScalarToVec->getOperand(i: 0).getReg();
5889 unsigned DstSize = DstTy.getSizeInBits();
5890
5891 // Keep track of the last MI we inserted. Later on, we might be able to save
5892 // a copy using it.
5893 MachineInstr *PrevMI = ScalarToVec;
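// Illustrative note (not from the original source): for a <4 x s32> build
// vector with GPR sources, the loop below emits one lane insert (e.g. an
// INSvi32gpr) per non-undef operand, filling lanes 1 through 3 of the
// growing vector.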
5894 for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
5895 // Note that if we don't do a subregister copy, we can end up making an
5896 // extra register.
5897 Register OpReg = I.getOperand(i).getReg();
5898 // Do not emit inserts for undefs
5899 if (!getOpcodeDef<GImplicitDef>(Reg: OpReg, MRI)) {
5900 PrevMI = &*emitLaneInsert(DstReg: std::nullopt, SrcReg: DstVec, EltReg: OpReg, LaneIdx: i - 1, RB, MIRBuilder&: MIB);
5901 DstVec = PrevMI->getOperand(i: 0).getReg();
5902 }
5903 }
5904
5905 // If DstTy's size in bits is less than 128, then emit a subregister copy
5906 // from DstVec to the last register we've defined.
5907 if (DstSize < 128) {
5908 // Force this to be FPR using the destination vector.
5909 const TargetRegisterClass *RC =
5910 getRegClassForTypeOnBank(Ty: DstTy, RB: *RBI.getRegBank(Reg: DstVec, MRI, TRI));
5911 if (!RC)
5912 return false;
5913 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
5914 LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
5915 return false;
5916 }
5917
5918 unsigned SubReg = 0;
5919 if (!getSubRegForClass(RC, TRI, SubReg))
5920 return false;
5921 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
5922 LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
5923 << ")\n");
5924 return false;
5925 }
5926
5927 Register Reg = MRI.createVirtualRegister(RegClass: RC);
5928 Register DstReg = I.getOperand(i: 0).getReg();
5929
5930 MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {DstReg}, SrcOps: {}).addReg(RegNo: DstVec, Flags: {}, SubReg);
5931 MachineOperand &RegOp = I.getOperand(i: 1);
5932 RegOp.setReg(Reg);
5933 RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI);
5934 } else {
5935 // We either have a vector with all elements (except the first one) undef or
5936 // at least one non-undef non-first element. In the first case, we need to
5937 // constrain the output register ourselves as we may have generated an
5938 // INSERT_SUBREG operation which is a generic operation for which the
5939 // output regclass cannot be automatically chosen.
5940 //
5941 // In the second case, there is no need to do this as it may generate an
5942 // instruction like INSvi32gpr where the regclass can be automatically
5943 // chosen.
5944 //
5945 // Also, we save a copy by re-using the destination register on the final
5946 // insert.
5947 PrevMI->getOperand(i: 0).setReg(I.getOperand(i: 0).getReg());
5948 constrainSelectedInstRegOperands(I&: *PrevMI, TII, TRI, RBI);
5949
5950 Register DstReg = PrevMI->getOperand(i: 0).getReg();
5951 if (PrevMI == ScalarToVec && DstReg.isVirtual()) {
5952 const TargetRegisterClass *RC =
5953 getRegClassForTypeOnBank(Ty: DstTy, RB: *RBI.getRegBank(Reg: DstVec, MRI, TRI));
5954 RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI);
5955 }
5956 }
5957
5958 I.eraseFromParent();
5959 return true;
5960}
5961
5962bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
5963 unsigned NumVecs,
5964 MachineInstr &I) {
5965 assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
5966 assert(Opc && "Expected an opcode?");
5967 assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
5968 auto &MRI = *MIB.getMRI();
5969 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
5970 unsigned Size = Ty.getSizeInBits();
5971 assert((Size == 64 || Size == 128) &&
5972 "Destination must be 64 bits or 128 bits?");
5973 unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0;
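// Illustrative example (assumed, not from the original source): an ld2 of
// two <4 x s32> vectors selects a Q-register tuple destination, and the
// loop below copies the individual results back out of qsub0 and qsub1.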
5974 auto Ptr = I.getOperand(i: I.getNumOperands() - 1).getReg();
5975 assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?");
5976 auto Load = MIB.buildInstr(Opc, DstOps: {Ty}, SrcOps: {Ptr});
5977 Load.cloneMemRefs(OtherMI: I);
5978 constrainSelectedInstRegOperands(I&: *Load, TII, TRI, RBI);
5979 Register SelectedLoadDst = Load->getOperand(i: 0).getReg();
5980 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
5981 auto Vec = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {I.getOperand(i: Idx)}, SrcOps: {})
5982 .addReg(RegNo: SelectedLoadDst, Flags: {}, SubReg: SubReg + Idx);
5983 // Emit the subreg copies and immediately select them.
5984 // FIXME: We should refactor our copy code into an emitCopy helper and
5985 // clean up uses of this pattern elsewhere in the selector.
5986 selectCopy(I&: *Vec, TII, MRI, TRI, RBI);
5987 }
5988 return true;
5989}
5990
5991bool AArch64InstructionSelector::selectVectorLoadLaneIntrinsic(
5992 unsigned Opc, unsigned NumVecs, MachineInstr &I) {
5993 assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
5994 assert(Opc && "Expected an opcode?");
5995 assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
5996 auto &MRI = *MIB.getMRI();
5997 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
5998 bool Narrow = Ty.getSizeInBits() == 64;
5999
6000 auto FirstSrcRegIt = I.operands_begin() + NumVecs + 1;
6001 SmallVector<Register, 4> Regs(NumVecs);
6002 std::transform(first: FirstSrcRegIt, last: FirstSrcRegIt + NumVecs, result: Regs.begin(),
6003 unary_op: [](auto MO) { return MO.getReg(); });
6004
6005 if (Narrow) {
6006 transform(Range&: Regs, d_first: Regs.begin(), F: [this](Register Reg) {
6007 return emitScalarToVector(EltSize: 64, DstRC: &AArch64::FPR128RegClass, Scalar: Reg, MIRBuilder&: MIB)
6008 ->getOperand(i: 0)
6009 .getReg();
6010 });
6011 Ty = Ty.multiplyElements(Factor: 2);
6012 }
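// Illustrative note (not from the original source): when the requested
// vectors are only 64 bits wide, each source is first widened into an
// FPR128 (e.g. <2 x s32> is treated as <4 x s32>) so a Q-register tuple
// can be formed; the results are narrowed back after the load below.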
6013
6014 Register Tuple = createQTuple(Regs, MIB);
6015 auto LaneNo = getIConstantVRegVal(VReg: (FirstSrcRegIt + NumVecs)->getReg(), MRI);
6016 if (!LaneNo)
6017 return false;
6018
6019 Register Ptr = (FirstSrcRegIt + NumVecs + 1)->getReg();
6020 auto Load = MIB.buildInstr(Opc, DstOps: {Ty}, SrcOps: {})
6021 .addReg(RegNo: Tuple)
6022 .addImm(Val: LaneNo->getZExtValue())
6023 .addReg(RegNo: Ptr);
6024 Load.cloneMemRefs(OtherMI: I);
6025 constrainSelectedInstRegOperands(I&: *Load, TII, TRI, RBI);
6026 Register SelectedLoadDst = Load->getOperand(i: 0).getReg();
6027 unsigned SubReg = AArch64::qsub0;
6028 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
6029 auto Vec = MIB.buildInstr(Opc: TargetOpcode::COPY,
6030 DstOps: {Narrow ? DstOp(&AArch64::FPR128RegClass)
6031 : DstOp(I.getOperand(i: Idx).getReg())},
6032 SrcOps: {})
6033 .addReg(RegNo: SelectedLoadDst, Flags: {}, SubReg: SubReg + Idx);
6034 Register WideReg = Vec.getReg(Idx: 0);
6035 // Emit the subreg copies and immediately select them.
6036 selectCopy(I&: *Vec, TII, MRI, TRI, RBI);
6037 if (Narrow &&
6038 !emitNarrowVector(DstReg: I.getOperand(i: Idx).getReg(), SrcReg: WideReg, MIB, MRI))
6039 return false;
6040 }
6041 return true;
6042}
6043
6044void AArch64InstructionSelector::selectVectorStoreIntrinsic(MachineInstr &I,
6045 unsigned NumVecs,
6046 unsigned Opc) {
6047 MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
6048 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6049 Register Ptr = I.getOperand(i: 1 + NumVecs).getReg();
6050
6051 SmallVector<Register, 2> Regs(NumVecs);
6052 std::transform(first: I.operands_begin() + 1, last: I.operands_begin() + 1 + NumVecs,
6053 result: Regs.begin(), unary_op: [](auto MO) { return MO.getReg(); });
6054
6055 Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB)
6056 : createDTuple(Regs, MIB);
6057 auto Store = MIB.buildInstr(Opc, DstOps: {}, SrcOps: {Tuple, Ptr});
6058 Store.cloneMemRefs(OtherMI: I);
6059 constrainSelectedInstRegOperands(I&: *Store, TII, TRI, RBI);
6060}
6061
6062bool AArch64InstructionSelector::selectVectorStoreLaneIntrinsic(
6063 MachineInstr &I, unsigned NumVecs, unsigned Opc) {
6064 MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
6065 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6066 bool Narrow = Ty.getSizeInBits() == 64;
6067
6068 SmallVector<Register, 2> Regs(NumVecs);
6069 std::transform(first: I.operands_begin() + 1, last: I.operands_begin() + 1 + NumVecs,
6070 result: Regs.begin(), unary_op: [](auto MO) { return MO.getReg(); });
6071
6072 if (Narrow)
6073 transform(Range&: Regs, d_first: Regs.begin(), F: [this](Register Reg) {
6074 return emitScalarToVector(EltSize: 64, DstRC: &AArch64::FPR128RegClass, Scalar: Reg, MIRBuilder&: MIB)
6075 ->getOperand(i: 0)
6076 .getReg();
6077 });
6078
6079 Register Tuple = createQTuple(Regs, MIB);
6080
6081 auto LaneNo = getIConstantVRegVal(VReg: I.getOperand(i: 1 + NumVecs).getReg(), MRI);
6082 if (!LaneNo)
6083 return false;
6084 Register Ptr = I.getOperand(i: 1 + NumVecs + 1).getReg();
6085 auto Store = MIB.buildInstr(Opc, DstOps: {}, SrcOps: {})
6086 .addReg(RegNo: Tuple)
6087 .addImm(Val: LaneNo->getZExtValue())
6088 .addReg(RegNo: Ptr);
6089 Store.cloneMemRefs(OtherMI: I);
6090 constrainSelectedInstRegOperands(I&: *Store, TII, TRI, RBI);
6091 return true;
6092}
6093
6094bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
6095 MachineInstr &I, MachineRegisterInfo &MRI) {
6096 // Find the intrinsic ID.
6097 unsigned IntrinID = cast<GIntrinsic>(Val&: I).getIntrinsicID();
6098
6099 const LLT S8 = LLT::scalar(SizeInBits: 8);
6100 const LLT S16 = LLT::scalar(SizeInBits: 16);
6101 const LLT S32 = LLT::scalar(SizeInBits: 32);
6102 const LLT S64 = LLT::scalar(SizeInBits: 64);
6103 const LLT P0 = LLT::pointer(AddressSpace: 0, SizeInBits: 64);
6104 // Select the instruction.
6105 switch (IntrinID) {
6106 default:
6107 return false;
6108 case Intrinsic::aarch64_ldxp:
6109 case Intrinsic::aarch64_ldaxp: {
6110 auto NewI = MIB.buildInstr(
6111 Opc: IntrinID == Intrinsic::aarch64_ldxp ? AArch64::LDXPX : AArch64::LDAXPX,
6112 DstOps: {I.getOperand(i: 0).getReg(), I.getOperand(i: 1).getReg()},
6113 SrcOps: {I.getOperand(i: 3)});
6114 NewI.cloneMemRefs(OtherMI: I);
6115 constrainSelectedInstRegOperands(I&: *NewI, TII, TRI, RBI);
6116 break;
6117 }
6118 case Intrinsic::aarch64_neon_ld1x2: {
6119 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6120 unsigned Opc = 0;
6121 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6122 Opc = AArch64::LD1Twov8b;
6123 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6124 Opc = AArch64::LD1Twov16b;
6125 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6126 Opc = AArch64::LD1Twov4h;
6127 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6128 Opc = AArch64::LD1Twov8h;
6129 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6130 Opc = AArch64::LD1Twov2s;
6131 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6132 Opc = AArch64::LD1Twov4s;
6133 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6134 Opc = AArch64::LD1Twov2d;
6135 else if (Ty == S64 || Ty == P0)
6136 Opc = AArch64::LD1Twov1d;
6137 else
6138 llvm_unreachable("Unexpected type for ld1x2!");
6139 selectVectorLoadIntrinsic(Opc, NumVecs: 2, I);
6140 break;
6141 }
6142 case Intrinsic::aarch64_neon_ld1x3: {
6143 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6144 unsigned Opc = 0;
6145 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6146 Opc = AArch64::LD1Threev8b;
6147 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6148 Opc = AArch64::LD1Threev16b;
6149 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6150 Opc = AArch64::LD1Threev4h;
6151 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6152 Opc = AArch64::LD1Threev8h;
6153 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6154 Opc = AArch64::LD1Threev2s;
6155 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6156 Opc = AArch64::LD1Threev4s;
6157 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6158 Opc = AArch64::LD1Threev2d;
6159 else if (Ty == S64 || Ty == P0)
6160 Opc = AArch64::LD1Threev1d;
6161 else
6162 llvm_unreachable("Unexpected type for ld1x3!");
6163 selectVectorLoadIntrinsic(Opc, NumVecs: 3, I);
6164 break;
6165 }
6166 case Intrinsic::aarch64_neon_ld1x4: {
6167 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6168 unsigned Opc = 0;
6169 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6170 Opc = AArch64::LD1Fourv8b;
6171 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6172 Opc = AArch64::LD1Fourv16b;
6173 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6174 Opc = AArch64::LD1Fourv4h;
6175 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6176 Opc = AArch64::LD1Fourv8h;
6177 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6178 Opc = AArch64::LD1Fourv2s;
6179 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6180 Opc = AArch64::LD1Fourv4s;
6181 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6182 Opc = AArch64::LD1Fourv2d;
6183 else if (Ty == S64 || Ty == P0)
6184 Opc = AArch64::LD1Fourv1d;
6185 else
6186 llvm_unreachable("Unexpected type for ld1x4!");
6187 selectVectorLoadIntrinsic(Opc, NumVecs: 4, I);
6188 break;
6189 }
6190 case Intrinsic::aarch64_neon_ld2: {
6191 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6192 unsigned Opc = 0;
6193 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6194 Opc = AArch64::LD2Twov8b;
6195 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6196 Opc = AArch64::LD2Twov16b;
6197 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6198 Opc = AArch64::LD2Twov4h;
6199 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6200 Opc = AArch64::LD2Twov8h;
6201 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6202 Opc = AArch64::LD2Twov2s;
6203 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6204 Opc = AArch64::LD2Twov4s;
6205 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6206 Opc = AArch64::LD2Twov2d;
6207 else if (Ty == S64 || Ty == P0)
6208 Opc = AArch64::LD1Twov1d;
6209 else
6210 llvm_unreachable("Unexpected type for ld2!");
6211 selectVectorLoadIntrinsic(Opc, NumVecs: 2, I);
6212 break;
6213 }
6214 case Intrinsic::aarch64_neon_ld2lane: {
6215 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6216 unsigned Opc;
6217 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6218 Opc = AArch64::LD2i8;
6219 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6220 Opc = AArch64::LD2i16;
6221 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6222 Opc = AArch64::LD2i32;
6223 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6224 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6225 Opc = AArch64::LD2i64;
6226 else
6227 llvm_unreachable("Unexpected type for ld2lane!");
6228 if (!selectVectorLoadLaneIntrinsic(Opc, NumVecs: 2, I))
6229 return false;
6230 break;
6231 }
6232 case Intrinsic::aarch64_neon_ld2r: {
6233 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6234 unsigned Opc = 0;
6235 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6236 Opc = AArch64::LD2Rv8b;
6237 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6238 Opc = AArch64::LD2Rv16b;
6239 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6240 Opc = AArch64::LD2Rv4h;
6241 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6242 Opc = AArch64::LD2Rv8h;
6243 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6244 Opc = AArch64::LD2Rv2s;
6245 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6246 Opc = AArch64::LD2Rv4s;
6247 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6248 Opc = AArch64::LD2Rv2d;
6249 else if (Ty == S64 || Ty == P0)
6250 Opc = AArch64::LD2Rv1d;
6251 else
6252 llvm_unreachable("Unexpected type for ld2r!");
6253 selectVectorLoadIntrinsic(Opc, NumVecs: 2, I);
6254 break;
6255 }
6256 case Intrinsic::aarch64_neon_ld3: {
6257 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6258 unsigned Opc = 0;
6259 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6260 Opc = AArch64::LD3Threev8b;
6261 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6262 Opc = AArch64::LD3Threev16b;
6263 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6264 Opc = AArch64::LD3Threev4h;
6265 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6266 Opc = AArch64::LD3Threev8h;
6267 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6268 Opc = AArch64::LD3Threev2s;
6269 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6270 Opc = AArch64::LD3Threev4s;
6271 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6272 Opc = AArch64::LD3Threev2d;
6273 else if (Ty == S64 || Ty == P0)
6274 Opc = AArch64::LD1Threev1d;
6275 else
6276 llvm_unreachable("Unexpected type for ld3!");
6277 selectVectorLoadIntrinsic(Opc, NumVecs: 3, I);
6278 break;
6279 }
6280 case Intrinsic::aarch64_neon_ld3lane: {
6281 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6282 unsigned Opc;
6283 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6284 Opc = AArch64::LD3i8;
6285 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6286 Opc = AArch64::LD3i16;
6287 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6288 Opc = AArch64::LD3i32;
6289 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6290 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6291 Opc = AArch64::LD3i64;
6292 else
6293 llvm_unreachable("Unexpected type for ld3lane!");
6294 if (!selectVectorLoadLaneIntrinsic(Opc, NumVecs: 3, I))
6295 return false;
6296 break;
6297 }
6298 case Intrinsic::aarch64_neon_ld3r: {
6299 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6300 unsigned Opc = 0;
6301 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6302 Opc = AArch64::LD3Rv8b;
6303 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6304 Opc = AArch64::LD3Rv16b;
6305 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6306 Opc = AArch64::LD3Rv4h;
6307 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6308 Opc = AArch64::LD3Rv8h;
6309 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6310 Opc = AArch64::LD3Rv2s;
6311 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6312 Opc = AArch64::LD3Rv4s;
6313 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6314 Opc = AArch64::LD3Rv2d;
6315 else if (Ty == S64 || Ty == P0)
6316 Opc = AArch64::LD3Rv1d;
6317 else
6318 llvm_unreachable("Unexpected type for ld3r!");
6319 selectVectorLoadIntrinsic(Opc, NumVecs: 3, I);
6320 break;
6321 }
6322 case Intrinsic::aarch64_neon_ld4: {
6323 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6324 unsigned Opc = 0;
6325 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6326 Opc = AArch64::LD4Fourv8b;
6327 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6328 Opc = AArch64::LD4Fourv16b;
6329 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6330 Opc = AArch64::LD4Fourv4h;
6331 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6332 Opc = AArch64::LD4Fourv8h;
6333 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6334 Opc = AArch64::LD4Fourv2s;
6335 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6336 Opc = AArch64::LD4Fourv4s;
6337 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6338 Opc = AArch64::LD4Fourv2d;
6339 else if (Ty == S64 || Ty == P0)
6340 Opc = AArch64::LD1Fourv1d;
6341 else
6342 llvm_unreachable("Unexpected type for ld4!");
6343 selectVectorLoadIntrinsic(Opc, NumVecs: 4, I);
6344 break;
6345 }
6346 case Intrinsic::aarch64_neon_ld4lane: {
6347 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6348 unsigned Opc;
6349 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6350 Opc = AArch64::LD4i8;
6351 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6352 Opc = AArch64::LD4i16;
6353 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6354 Opc = AArch64::LD4i32;
6355 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6356 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6357 Opc = AArch64::LD4i64;
6358 else
6359 llvm_unreachable("Unexpected type for ld4lane!");
6360 if (!selectVectorLoadLaneIntrinsic(Opc, NumVecs: 4, I))
6361 return false;
6362 break;
6363 }
6364 case Intrinsic::aarch64_neon_ld4r: {
6365 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6366 unsigned Opc = 0;
6367 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6368 Opc = AArch64::LD4Rv8b;
6369 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6370 Opc = AArch64::LD4Rv16b;
6371 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6372 Opc = AArch64::LD4Rv4h;
6373 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6374 Opc = AArch64::LD4Rv8h;
6375 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6376 Opc = AArch64::LD4Rv2s;
6377 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6378 Opc = AArch64::LD4Rv4s;
6379 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6380 Opc = AArch64::LD4Rv2d;
6381 else if (Ty == S64 || Ty == P0)
6382 Opc = AArch64::LD4Rv1d;
6383 else
6384 llvm_unreachable("Unexpected type for ld4r!");
6385 selectVectorLoadIntrinsic(Opc, NumVecs: 4, I);
6386 break;
6387 }
6388 case Intrinsic::aarch64_neon_st1x2: {
6389 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6390 unsigned Opc;
6391 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6392 Opc = AArch64::ST1Twov8b;
6393 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6394 Opc = AArch64::ST1Twov16b;
6395 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6396 Opc = AArch64::ST1Twov4h;
6397 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6398 Opc = AArch64::ST1Twov8h;
6399 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6400 Opc = AArch64::ST1Twov2s;
6401 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6402 Opc = AArch64::ST1Twov4s;
6403 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6404 Opc = AArch64::ST1Twov2d;
6405 else if (Ty == S64 || Ty == P0)
6406 Opc = AArch64::ST1Twov1d;
6407 else
6408 llvm_unreachable("Unexpected type for st1x2!");
6409 selectVectorStoreIntrinsic(I, NumVecs: 2, Opc);
6410 break;
6411 }
6412 case Intrinsic::aarch64_neon_st1x3: {
6413 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6414 unsigned Opc;
6415 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6416 Opc = AArch64::ST1Threev8b;
6417 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6418 Opc = AArch64::ST1Threev16b;
6419 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6420 Opc = AArch64::ST1Threev4h;
6421 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6422 Opc = AArch64::ST1Threev8h;
6423 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6424 Opc = AArch64::ST1Threev2s;
6425 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6426 Opc = AArch64::ST1Threev4s;
6427 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6428 Opc = AArch64::ST1Threev2d;
6429 else if (Ty == S64 || Ty == P0)
6430 Opc = AArch64::ST1Threev1d;
6431 else
6432 llvm_unreachable("Unexpected type for st1x3!");
6433 selectVectorStoreIntrinsic(I, NumVecs: 3, Opc);
6434 break;
6435 }
6436 case Intrinsic::aarch64_neon_st1x4: {
6437 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6438 unsigned Opc;
6439 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6440 Opc = AArch64::ST1Fourv8b;
6441 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6442 Opc = AArch64::ST1Fourv16b;
6443 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6444 Opc = AArch64::ST1Fourv4h;
6445 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6446 Opc = AArch64::ST1Fourv8h;
6447 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6448 Opc = AArch64::ST1Fourv2s;
6449 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6450 Opc = AArch64::ST1Fourv4s;
6451 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6452 Opc = AArch64::ST1Fourv2d;
6453 else if (Ty == S64 || Ty == P0)
6454 Opc = AArch64::ST1Fourv1d;
6455 else
6456 llvm_unreachable("Unexpected type for st1x4!");
6457 selectVectorStoreIntrinsic(I, NumVecs: 4, Opc);
6458 break;
6459 }
6460 case Intrinsic::aarch64_neon_st2: {
6461 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6462 unsigned Opc;
6463 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6464 Opc = AArch64::ST2Twov8b;
6465 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6466 Opc = AArch64::ST2Twov16b;
6467 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6468 Opc = AArch64::ST2Twov4h;
6469 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6470 Opc = AArch64::ST2Twov8h;
6471 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6472 Opc = AArch64::ST2Twov2s;
6473 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6474 Opc = AArch64::ST2Twov4s;
6475 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6476 Opc = AArch64::ST2Twov2d;
6477 else if (Ty == S64 || Ty == P0)
6478 Opc = AArch64::ST1Twov1d;
6479 else
6480 llvm_unreachable("Unexpected type for st2!");
6481 selectVectorStoreIntrinsic(I, NumVecs: 2, Opc);
6482 break;
6483 }
6484 case Intrinsic::aarch64_neon_st3: {
6485 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6486 unsigned Opc;
6487 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6488 Opc = AArch64::ST3Threev8b;
6489 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6490 Opc = AArch64::ST3Threev16b;
6491 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6492 Opc = AArch64::ST3Threev4h;
6493 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6494 Opc = AArch64::ST3Threev8h;
6495 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6496 Opc = AArch64::ST3Threev2s;
6497 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6498 Opc = AArch64::ST3Threev4s;
6499 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6500 Opc = AArch64::ST3Threev2d;
6501 else if (Ty == S64 || Ty == P0)
6502 Opc = AArch64::ST1Threev1d;
6503 else
6504 llvm_unreachable("Unexpected type for st3!");
6505 selectVectorStoreIntrinsic(I, NumVecs: 3, Opc);
6506 break;
6507 }
6508 case Intrinsic::aarch64_neon_st4: {
6509 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6510 unsigned Opc;
6511 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6512 Opc = AArch64::ST4Fourv8b;
6513 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6514 Opc = AArch64::ST4Fourv16b;
6515 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6516 Opc = AArch64::ST4Fourv4h;
6517 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6518 Opc = AArch64::ST4Fourv8h;
6519 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6520 Opc = AArch64::ST4Fourv2s;
6521 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6522 Opc = AArch64::ST4Fourv4s;
6523 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6524 Opc = AArch64::ST4Fourv2d;
6525 else if (Ty == S64 || Ty == P0)
6526 Opc = AArch64::ST1Fourv1d;
6527 else
6528 llvm_unreachable("Unexpected type for st4!");
6529 selectVectorStoreIntrinsic(I, NumVecs: 4, Opc);
6530 break;
6531 }
6532 case Intrinsic::aarch64_neon_st2lane: {
6533 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6534 unsigned Opc;
6535 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6536 Opc = AArch64::ST2i8;
6537 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6538 Opc = AArch64::ST2i16;
6539 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6540 Opc = AArch64::ST2i32;
6541 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6542 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6543 Opc = AArch64::ST2i64;
6544 else
6545 llvm_unreachable("Unexpected type for st2lane!");
6546 if (!selectVectorStoreLaneIntrinsic(I, NumVecs: 2, Opc))
6547 return false;
6548 break;
6549 }
6550 case Intrinsic::aarch64_neon_st3lane: {
6551 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6552 unsigned Opc;
6553 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6554 Opc = AArch64::ST3i8;
6555 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6556 Opc = AArch64::ST3i16;
6557 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6558 Opc = AArch64::ST3i32;
6559 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6560 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6561 Opc = AArch64::ST3i64;
6562 else
6563 llvm_unreachable("Unexpected type for st3lane!");
6564 if (!selectVectorStoreLaneIntrinsic(I, NumVecs: 3, Opc))
6565 return false;
6566 break;
6567 }
6568 case Intrinsic::aarch64_neon_st4lane: {
6569 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6570 unsigned Opc;
6571 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6572 Opc = AArch64::ST4i8;
6573 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6574 Opc = AArch64::ST4i16;
6575 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6576 Opc = AArch64::ST4i32;
6577 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6578 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6579 Opc = AArch64::ST4i64;
6580 else
6581 llvm_unreachable("Unexpected type for st4lane!");
6582 if (!selectVectorStoreLaneIntrinsic(I, NumVecs: 4, Opc))
6583 return false;
6584 break;
6585 }
6586 case Intrinsic::aarch64_mops_memset_tag: {
6587 // Transform
6588 // %dst:gpr(p0) = \
6589 // G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag),
6590 // \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64)
6591 // where %dst is updated, into
6592 // (%Rd:GPR64common, %Rn:GPR64) = \
6593 // MOPSMemorySetTaggingPseudo \
6594 // %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64
6595 // where Rd and Rn are tied.
6596 // It is expected that %val has been extended to s64 in legalization.
6597 // Note that the order of the size/value operands are swapped.
6598
6599 Register DstDef = I.getOperand(i: 0).getReg();
6600 // I.getOperand(1) is the intrinsic function
6601 Register DstUse = I.getOperand(i: 2).getReg();
6602 Register ValUse = I.getOperand(i: 3).getReg();
6603 Register SizeUse = I.getOperand(i: 4).getReg();
6604
6605 // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one.
6606 // Therefore an additional virtual register is required for the updated size
6607 // operand. This value is not accessible via the semantics of the intrinsic.
6608 Register SizeDef = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
6609
6610 auto Memset = MIB.buildInstr(Opc: AArch64::MOPSMemorySetTaggingPseudo,
6611 DstOps: {DstDef, SizeDef}, SrcOps: {DstUse, SizeUse, ValUse});
6612 Memset.cloneMemRefs(OtherMI: I);
6613 constrainSelectedInstRegOperands(I&: *Memset, TII, TRI, RBI);
6614 break;
6615 }
6616 case Intrinsic::ptrauth_resign_load_relative: {
6617 Register DstReg = I.getOperand(i: 0).getReg();
6618 Register ValReg = I.getOperand(i: 2).getReg();
6619 uint64_t AUTKey = I.getOperand(i: 3).getImm();
6620 Register AUTDisc = I.getOperand(i: 4).getReg();
6621 uint64_t PACKey = I.getOperand(i: 5).getImm();
6622 Register PACDisc = I.getOperand(i: 6).getReg();
6623 int64_t Addend = I.getOperand(i: 7).getImm();
6624
6625 Register AUTAddrDisc = AUTDisc;
6626 uint16_t AUTConstDiscC = 0;
6627 std::tie(args&: AUTConstDiscC, args&: AUTAddrDisc) =
6628 extractPtrauthBlendDiscriminators(Disc: AUTDisc, MRI);
6629
6630 Register PACAddrDisc = PACDisc;
6631 uint16_t PACConstDiscC = 0;
6632 std::tie(args&: PACConstDiscC, args&: PACAddrDisc) =
6633 extractPtrauthBlendDiscriminators(Disc: PACDisc, MRI);
6634
6635 MIB.buildCopy(Res: {AArch64::X16}, Op: {ValReg});
6636
6637 MIB.buildInstr(Opcode: AArch64::AUTRELLOADPAC)
6638 .addImm(Val: AUTKey)
6639 .addImm(Val: AUTConstDiscC)
6640 .addUse(RegNo: AUTAddrDisc)
6641 .addImm(Val: PACKey)
6642 .addImm(Val: PACConstDiscC)
6643 .addUse(RegNo: PACAddrDisc)
6644 .addImm(Val: Addend)
6645 .constrainAllUses(TII, TRI, RBI);
6646 MIB.buildCopy(Res: {DstReg}, Op: Register(AArch64::X16));
6647
6648 RBI.constrainGenericRegister(Reg: DstReg, RC: AArch64::GPR64RegClass, MRI);
6649 I.eraseFromParent();
6650 return true;
6651 }
6652 }
6653
6654 I.eraseFromParent();
6655 return true;
6656}
6657
6658bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
6659 MachineRegisterInfo &MRI) {
6660 unsigned IntrinID = cast<GIntrinsic>(Val&: I).getIntrinsicID();
6661
6662 switch (IntrinID) {
6663 default:
6664 break;
6665 case Intrinsic::ptrauth_resign: {
6666 Register DstReg = I.getOperand(i: 0).getReg();
6667 Register ValReg = I.getOperand(i: 2).getReg();
6668 uint64_t AUTKey = I.getOperand(i: 3).getImm();
6669 Register AUTDisc = I.getOperand(i: 4).getReg();
6670 uint64_t PACKey = I.getOperand(i: 5).getImm();
6671 Register PACDisc = I.getOperand(i: 6).getReg();
6672
6673 Register AUTAddrDisc = AUTDisc;
6674 uint16_t AUTConstDiscC = 0;
6675 std::tie(args&: AUTConstDiscC, args&: AUTAddrDisc) =
6676 extractPtrauthBlendDiscriminators(Disc: AUTDisc, MRI);
6677
6678 Register PACAddrDisc = PACDisc;
6679 uint16_t PACConstDiscC = 0;
6680 std::tie(args&: PACConstDiscC, args&: PACAddrDisc) =
6681 extractPtrauthBlendDiscriminators(Disc: PACDisc, MRI);
6682
6683 MIB.buildCopy(Res: {AArch64::X16}, Op: {ValReg});
6684 MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {AArch64::X17}, SrcOps: {});
6685 MIB.buildInstr(Opcode: AArch64::AUTPAC)
6686 .addImm(Val: AUTKey)
6687 .addImm(Val: AUTConstDiscC)
6688 .addUse(RegNo: AUTAddrDisc)
6689 .addImm(Val: PACKey)
6690 .addImm(Val: PACConstDiscC)
6691 .addUse(RegNo: PACAddrDisc)
6692 .constrainAllUses(TII, TRI, RBI);
6693 MIB.buildCopy(Res: {DstReg}, Op: Register(AArch64::X16));
6694
6695 RBI.constrainGenericRegister(Reg: DstReg, RC: AArch64::GPR64RegClass, MRI);
6696 I.eraseFromParent();
6697 return true;
6698 }
6699 case Intrinsic::ptrauth_auth: {
6700 Register DstReg = I.getOperand(i: 0).getReg();
6701 Register ValReg = I.getOperand(i: 2).getReg();
6702 uint64_t AUTKey = I.getOperand(i: 3).getImm();
6703 Register AUTDisc = I.getOperand(i: 4).getReg();
6704
6705 Register AUTAddrDisc = AUTDisc;
6706 uint16_t AUTConstDiscC = 0;
6707 std::tie(args&: AUTConstDiscC, args&: AUTAddrDisc) =
6708 extractPtrauthBlendDiscriminators(Disc: AUTDisc, MRI);
6709
6710 if (STI.isX16X17Safer()) {
6711 MIB.buildCopy(Res: {AArch64::X16}, Op: {ValReg});
6712 MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {AArch64::X17}, SrcOps: {});
6713 MIB.buildInstr(Opcode: AArch64::AUTx16x17)
6714 .addImm(Val: AUTKey)
6715 .addImm(Val: AUTConstDiscC)
6716 .addUse(RegNo: AUTAddrDisc)
6717 .constrainAllUses(TII, TRI, RBI);
6718 MIB.buildCopy(Res: {DstReg}, Op: Register(AArch64::X16));
6719 } else {
6720 Register ScratchReg =
6721 MRI.createVirtualRegister(RegClass: &AArch64::GPR64commonRegClass);
6722 MIB.buildInstr(Opcode: AArch64::AUTxMxN)
6723 .addDef(RegNo: DstReg)
6724 .addDef(RegNo: ScratchReg)
6725 .addUse(RegNo: ValReg)
6726 .addImm(Val: AUTKey)
6727 .addImm(Val: AUTConstDiscC)
6728 .addUse(RegNo: AUTAddrDisc)
6729 .constrainAllUses(TII, TRI, RBI);
6730 }
6731
6732 RBI.constrainGenericRegister(Reg: DstReg, RC: AArch64::GPR64RegClass, MRI);
6733 I.eraseFromParent();
6734 return true;
6735 }
6736 case Intrinsic::frameaddress:
6737 case Intrinsic::returnaddress: {
6738 MachineFunction &MF = *I.getParent()->getParent();
6739 MachineFrameInfo &MFI = MF.getFrameInfo();
6740
6741 unsigned Depth = I.getOperand(i: 2).getImm();
6742 Register DstReg = I.getOperand(i: 0).getReg();
6743 RBI.constrainGenericRegister(Reg: DstReg, RC: AArch64::GPR64RegClass, MRI);
6744
6745 if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
6746 if (!MFReturnAddr) {
6747 // Insert the copy from LR/X30 into the entry block, before it can be
6748 // clobbered by anything.
6749 MFI.setReturnAddressIsTaken(true);
6750 MFReturnAddr = getFunctionLiveInPhysReg(
6751 MF, TII, PhysReg: AArch64::LR, RC: AArch64::GPR64RegClass, DL: I.getDebugLoc());
6752 }
6753
6754 if (STI.hasPAuth()) {
6755 MIB.buildInstr(Opc: AArch64::XPACI, DstOps: {DstReg}, SrcOps: {MFReturnAddr});
6756 } else {
6757 MIB.buildCopy(Res: {Register(AArch64::LR)}, Op: {MFReturnAddr});
6758 MIB.buildInstr(Opcode: AArch64::XPACLRI);
6759 MIB.buildCopy(Res: {DstReg}, Op: {Register(AArch64::LR)});
6760 }
6761
6762 I.eraseFromParent();
6763 return true;
6764 }
6765
6766 MFI.setFrameAddressIsTaken(true);
6767 Register FrameAddr(AArch64::FP);
6768 while (Depth--) {
6769 Register NextFrame = MRI.createVirtualRegister(RegClass: &AArch64::GPR64spRegClass);
6770 auto Ldr =
6771 MIB.buildInstr(Opc: AArch64::LDRXui, DstOps: {NextFrame}, SrcOps: {FrameAddr}).addImm(Val: 0);
6772 constrainSelectedInstRegOperands(I&: *Ldr, TII, TRI, RBI);
6773 FrameAddr = NextFrame;
6774 }
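// Illustrative note (not from the original source): each iteration above
// follows one saved-FP link, so e.g. a depth of 2 emits two chained
// "LDRXui ..., 0" loads before the final copy or XPAC sequence below.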
6775
6776 if (IntrinID == Intrinsic::frameaddress)
6777 MIB.buildCopy(Res: {DstReg}, Op: {FrameAddr});
6778 else {
6779 MFI.setReturnAddressIsTaken(true);
6780
6781 if (STI.hasPAuth()) {
6782 Register TmpReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
6783 MIB.buildInstr(Opc: AArch64::LDRXui, DstOps: {TmpReg}, SrcOps: {FrameAddr}).addImm(Val: 1);
6784 MIB.buildInstr(Opc: AArch64::XPACI, DstOps: {DstReg}, SrcOps: {TmpReg});
6785 } else {
6786 MIB.buildInstr(Opc: AArch64::LDRXui, DstOps: {Register(AArch64::LR)}, SrcOps: {FrameAddr})
6787 .addImm(Val: 1);
6788 MIB.buildInstr(Opcode: AArch64::XPACLRI);
6789 MIB.buildCopy(Res: {DstReg}, Op: {Register(AArch64::LR)});
6790 }
6791 }
6792
6793 I.eraseFromParent();
6794 return true;
6795 }
6796 case Intrinsic::aarch64_neon_tbl2:
6797 SelectTable(I, MRI, NumVecs: 2, Opc1: AArch64::TBLv8i8Two, Opc2: AArch64::TBLv16i8Two, isExt: false);
6798 return true;
6799 case Intrinsic::aarch64_neon_tbl3:
6800 SelectTable(I, MRI, NumVecs: 3, Opc1: AArch64::TBLv8i8Three, Opc2: AArch64::TBLv16i8Three,
6801 isExt: false);
6802 return true;
6803 case Intrinsic::aarch64_neon_tbl4:
6804 SelectTable(I, MRI, NumVecs: 4, Opc1: AArch64::TBLv8i8Four, Opc2: AArch64::TBLv16i8Four, isExt: false);
6805 return true;
6806 case Intrinsic::aarch64_neon_tbx2:
6807 SelectTable(I, MRI, NumVecs: 2, Opc1: AArch64::TBXv8i8Two, Opc2: AArch64::TBXv16i8Two, isExt: true);
6808 return true;
6809 case Intrinsic::aarch64_neon_tbx3:
6810 SelectTable(I, MRI, NumVecs: 3, Opc1: AArch64::TBXv8i8Three, Opc2: AArch64::TBXv16i8Three, isExt: true);
6811 return true;
6812 case Intrinsic::aarch64_neon_tbx4:
6813 SelectTable(I, MRI, NumVecs: 4, Opc1: AArch64::TBXv8i8Four, Opc2: AArch64::TBXv16i8Four, isExt: true);
6814 return true;
6815 case Intrinsic::swift_async_context_addr:
6816 auto Sub = MIB.buildInstr(Opc: AArch64::SUBXri, DstOps: {I.getOperand(i: 0).getReg()},
6817 SrcOps: {Register(AArch64::FP)})
6818 .addImm(Val: 8)
6819 .addImm(Val: 0);
6820 constrainSelectedInstRegOperands(I&: *Sub, TII, TRI, RBI);
6821
6822 MF->getFrameInfo().setFrameAddressIsTaken(true);
6823 MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
6824 I.eraseFromParent();
6825 return true;
6826 }
6827 return false;
6828}
6829
6830// G_PTRAUTH_GLOBAL_VALUE lowering
6831//
6832// We have 3 lowering alternatives to choose from:
6833// - MOVaddrPAC: similar to MOVaddr, with added PAC.
6834// If the GV doesn't need a GOT load (i.e., is locally defined)
6835// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
6836//
6837// - LOADgotPAC: similar to LOADgot, with added PAC.
6838// If the GV needs a GOT load, materialize the pointer using the usual
6839 // GOT adrp+ldr, +pac. Pointers in the GOT are assumed to be not signed; the
6840 // GOT section is assumed to be read-only (for example, via the relro
6841 // mechanism). See LowerMOVaddrPAC.
6842//
6843// - LOADauthptrstatic: similar to LOADgot, but use a
6844// special stub slot instead of a GOT slot.
6845// Load a signed pointer for symbol 'sym' from a stub slot named
6846// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
6847// resolving. This usually lowers to adrp+ldr, but also emits an entry into
6848// .data with an
6849// @AUTH relocation. See LowerLOADauthptrstatic.
6850//
6851 // All 3 are pseudos that are expanded late into longer sequences: this lets us
6852// provide integrity guarantees on the to-be-signed intermediate values.
6853//
6854// LOADauthptrstatic is undesirable because it requires a large section filled
6855// with often similarly-signed pointers, making it a good harvesting target.
6856// Thus, it's only used for ptrauth references to extern_weak to avoid null
6857// checks.
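//
// For illustration only (assumed behaviour, not part of the original
// comment): a signed reference to a locally-defined global typically
// classifies without MO_GOT and takes the MOVaddrPAC path, a regular
// external symbol that needs a GOT load takes LOADgotPAC, and only
// extern_weak symbols fall back to LOADauthptrstatic.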
6858
6859bool AArch64InstructionSelector::selectPtrAuthGlobalValue(
6860 MachineInstr &I, MachineRegisterInfo &MRI) const {
6861 Register DefReg = I.getOperand(i: 0).getReg();
6862 Register Addr = I.getOperand(i: 1).getReg();
6863 uint64_t Key = I.getOperand(i: 2).getImm();
6864 Register AddrDisc = I.getOperand(i: 3).getReg();
6865 uint64_t Disc = I.getOperand(i: 4).getImm();
6866 int64_t Offset = 0;
6867
6868 if (Key > AArch64PACKey::LAST)
6869 report_fatal_error(reason: "key in ptrauth global out of range [0, " +
6870 Twine((int)AArch64PACKey::LAST) + "]");
6871
6872 // Blend only works if the integer discriminator is 16 bits wide.
6873 if (!isUInt<16>(x: Disc))
6874 report_fatal_error(
6875 reason: "constant discriminator in ptrauth global out of range [0, 0xffff]");
6876
6877 // Choosing between 3 lowering alternatives is target-specific.
6878 if (!STI.isTargetELF() && !STI.isTargetMachO())
6879 report_fatal_error(reason: "ptrauth global lowering only supported on MachO/ELF");
6880
6881 if (!MRI.hasOneDef(RegNo: Addr))
6882 return false;
6883
6884 // First match any offset we take from the real global.
6885 const MachineInstr *DefMI = &*MRI.def_instr_begin(RegNo: Addr);
6886 if (DefMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6887 Register OffsetReg = DefMI->getOperand(i: 2).getReg();
6888 if (!MRI.hasOneDef(RegNo: OffsetReg))
6889 return false;
6890 const MachineInstr &OffsetMI = *MRI.def_instr_begin(RegNo: OffsetReg);
6891 if (OffsetMI.getOpcode() != TargetOpcode::G_CONSTANT)
6892 return false;
6893
6894 Addr = DefMI->getOperand(i: 1).getReg();
6895 if (!MRI.hasOneDef(RegNo: Addr))
6896 return false;
6897
6898 DefMI = &*MRI.def_instr_begin(RegNo: Addr);
6899 Offset = OffsetMI.getOperand(i: 1).getCImm()->getSExtValue();
6900 }
6901
6902 // We should be left with a genuine unauthenticated GlobalValue.
6903 const GlobalValue *GV;
6904 if (DefMI->getOpcode() == TargetOpcode::G_GLOBAL_VALUE) {
6905 GV = DefMI->getOperand(i: 1).getGlobal();
6906 Offset += DefMI->getOperand(i: 1).getOffset();
6907 } else if (DefMI->getOpcode() == AArch64::G_ADD_LOW) {
6908 GV = DefMI->getOperand(i: 2).getGlobal();
6909 Offset += DefMI->getOperand(i: 2).getOffset();
6910 } else {
6911 return false;
6912 }
6913
6914 MachineIRBuilder MIB(I);
6915
6916 // Classify the reference to determine whether it needs a GOT load.
6917 unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
6918 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
6919 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
6920 "unsupported non-GOT op flags on ptrauth global reference");
6921 assert((!GV->hasExternalWeakLinkage() || NeedsGOTLoad) &&
6922 "unsupported non-GOT reference to weak ptrauth global");
6923
6924 std::optional<APInt> AddrDiscVal = getIConstantVRegVal(VReg: AddrDisc, MRI);
6925 bool HasAddrDisc = !AddrDiscVal || *AddrDiscVal != 0;
6926
6927 // Non-extern_weak:
6928 // - No GOT load needed -> MOVaddrPAC
6929 // - GOT load for non-extern_weak -> LOADgotPAC
6930 // Note that we disallow extern_weak refs to avoid null checks later.
6931 if (!GV->hasExternalWeakLinkage()) {
6932 MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {AArch64::X16}, SrcOps: {});
6933 MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {AArch64::X17}, SrcOps: {});
6934 MIB.buildInstr(Opcode: NeedsGOTLoad ? AArch64::LOADgotPAC : AArch64::MOVaddrPAC)
6935 .addGlobalAddress(GV, Offset)
6936 .addImm(Val: Key)
6937 .addReg(RegNo: HasAddrDisc ? AddrDisc : AArch64::XZR)
6938 .addImm(Val: Disc)
6939 .constrainAllUses(TII, TRI, RBI);
6940 MIB.buildCopy(Res: DefReg, Op: Register(AArch64::X16));
6941 RBI.constrainGenericRegister(Reg: DefReg, RC: AArch64::GPR64RegClass, MRI);
6942 I.eraseFromParent();
6943 return true;
6944 }
6945
6946 // extern_weak -> LOADauthptrstatic
6947
6948 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
6949 // offset alone as a pointer if the symbol wasn't available, which would
6950 // probably break null checks in users. Ptrauth complicates things further:
6951 // error out.
6952 if (Offset != 0)
6953 report_fatal_error(
6954 reason: "unsupported non-zero offset in weak ptrauth global reference");
6955
6956 if (HasAddrDisc)
6957 report_fatal_error(reason: "unsupported weak addr-div ptrauth global");
6958
6959 MIB.buildInstr(Opc: AArch64::LOADauthptrstatic, DstOps: {DefReg}, SrcOps: {})
6960 .addGlobalAddress(GV, Offset)
6961 .addImm(Val: Key)
6962 .addImm(Val: Disc);
6963 RBI.constrainGenericRegister(Reg: DefReg, RC: AArch64::GPR64RegClass, MRI);
6964
6965 I.eraseFromParent();
6966 return true;
6967}
6968
6969void AArch64InstructionSelector::SelectTable(MachineInstr &I,
6970 MachineRegisterInfo &MRI,
6971 unsigned NumVec, unsigned Opc1,
6972 unsigned Opc2, bool isExt) {
6973 Register DstReg = I.getOperand(i: 0).getReg();
6974 unsigned Opc = MRI.getType(Reg: DstReg) == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 8) ? Opc1 : Opc2;
6975
6976 // Create the REG_SEQUENCE
6977 SmallVector<Register, 4> Regs;
6978 for (unsigned i = 0; i < NumVec; i++)
6979 Regs.push_back(Elt: I.getOperand(i: i + 2 + isExt).getReg());
6980 Register RegSeq = createQTuple(Regs, MIB);
6981
6982 Register IdxReg = I.getOperand(i: 2 + NumVec + isExt).getReg();
6983 MachineInstrBuilder Instr;
6984 if (isExt) {
6985 Register Reg = I.getOperand(i: 2).getReg();
6986 Instr = MIB.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {Reg, RegSeq, IdxReg});
6987 } else
6988 Instr = MIB.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {RegSeq, IdxReg});
6989 constrainSelectedInstRegOperands(I&: *Instr, TII, TRI, RBI);
6990 I.eraseFromParent();
6991}
6992
6993InstructionSelector::ComplexRendererFns
6994AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
6995 auto MaybeImmed = getImmedFromMO(Root);
6996 if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
6997 return std::nullopt;
6998 uint64_t Enc = (32 - *MaybeImmed) & 0x1f;
6999 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Enc); }}};
7000}
7001
7002InstructionSelector::ComplexRendererFns
7003AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
7004 auto MaybeImmed = getImmedFromMO(Root);
7005 if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
7006 return std::nullopt;
7007 uint64_t Enc = 31 - *MaybeImmed;
7008 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Enc); }}};
7009}
7010
7011InstructionSelector::ComplexRendererFns
7012AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
7013 auto MaybeImmed = getImmedFromMO(Root);
7014 if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
7015 return std::nullopt;
7016 uint64_t Enc = (64 - *MaybeImmed) & 0x3f;
7017 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Enc); }}};
7018}
7019
7020InstructionSelector::ComplexRendererFns
7021AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
7022 auto MaybeImmed = getImmedFromMO(Root);
7023 if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
7024 return std::nullopt;
7025 uint64_t Enc = 63 - *MaybeImmed;
7026 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Enc); }}};
7027}
7028
7029/// Helper to select an immediate value that can be represented as a 12-bit
7030/// value shifted left by either 0 or 12. If it is possible to do so, return
7031/// the immediate and shift value. If not, return std::nullopt.
7032///
7033/// Used by selectArithImmed and selectNegArithImmed.
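///
/// For example, 0x123 and 0x123000 are both representable (as {0x123, LSL #0}
/// and {0x123, LSL #12} respectively), while 0x123001 is not.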
7034InstructionSelector::ComplexRendererFns
7035AArch64InstructionSelector::select12BitValueWithLeftShift(
7036 uint64_t Immed) const {
7037 unsigned ShiftAmt;
7038 if (Immed >> 12 == 0) {
7039 ShiftAmt = 0;
7040 } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
7041 ShiftAmt = 12;
7042 Immed = Immed >> 12;
7043 } else
7044 return std::nullopt;
7045
7046 unsigned ShVal = AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: ShiftAmt);
7047 return {{
7048 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Immed); },
7049 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ShVal); },
7050 }};
7051}
7052
7053/// SelectArithImmed - Select an immediate value that can be represented as
7054/// a 12-bit value shifted left by either 0 or 12. If so, return true with
7055/// Val set to the 12-bit value and Shift set to the shifter operand.
7056InstructionSelector::ComplexRendererFns
7057AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
  // This function is called from the addsub_shifted_imm ComplexPattern,
  // which lists [imm] as the list of opcodes it's interested in. However,
  // we still need to check whether the operand is actually an immediate
  // here because the ComplexPattern opcode list is only used in
  // root-level opcode matching.
7063 auto MaybeImmed = getImmedFromMO(Root);
7064 if (MaybeImmed == std::nullopt)
7065 return std::nullopt;
7066 return select12BitValueWithLeftShift(Immed: *MaybeImmed);
7067}
7068
7069/// SelectNegArithImmed - As above, but negates the value before trying to
7070/// select it.
7071InstructionSelector::ComplexRendererFns
7072AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
7073 // We need a register here, because we need to know if we have a 64 or 32
7074 // bit immediate.
7075 if (!Root.isReg())
7076 return std::nullopt;
7077 auto MaybeImmed = getImmedFromMO(Root);
7078 if (MaybeImmed == std::nullopt)
7079 return std::nullopt;
7080 uint64_t Immed = *MaybeImmed;
7081
7082 // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
7083 // have the opposite effect on the C flag, so this pattern mustn't match under
7084 // those circumstances.
7085 if (Immed == 0)
7086 return std::nullopt;
7087
  // Check whether the root is a 32-bit or a 64-bit value, so that we negate
  // with the right width.
7090 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7091 if (MRI.getType(Reg: Root.getReg()).getSizeInBits() == 32)
7092 Immed = ~((uint32_t)Immed) + 1;
7093 else
7094 Immed = ~Immed + 1ULL;
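  // E.g. a 32-bit compare against -4096 negates to 0x1000, which
  // select12BitValueWithLeftShift() below encodes as {1, LSL #12}.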
7095
7096 if (Immed & 0xFFFFFFFFFF000000ULL)
7097 return std::nullopt;
7098
7099 Immed &= 0xFFFFFFULL;
7100 return select12BitValueWithLeftShift(Immed);
7101}
7102
/// Checks whether folding MI into a load/store addressing mode is known to be
/// beneficial or harmful.
7105///
7106/// Returns:
7107/// - true if folding MI would be beneficial.
7108/// - false if folding MI would be bad.
7109/// - std::nullopt if it is not sure whether folding MI is beneficial.
7110///
7111/// \p MI can be the offset operand of G_PTR_ADD, e.g. G_SHL in the example:
7112///
7113/// %13:gpr(s64) = G_CONSTANT i64 1
7114/// %8:gpr(s64) = G_SHL %6, %13(s64)
7115/// %9:gpr(p0) = G_PTR_ADD %0, %8(s64)
7116/// %12:gpr(s32) = G_LOAD %9(p0) :: (load (s16))
7117std::optional<bool> AArch64InstructionSelector::isWorthFoldingIntoAddrMode(
7118 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
7119 if (MI.getOpcode() == AArch64::G_SHL) {
    // Address operands with shifts are free, except on subtargets with
    // AddrLSLSlow14, where some shift amounts are slow.
    if (const auto ValAndVReg = getIConstantVRegValWithLookThrough(
            MI.getOperand(2).getReg(), MRI)) {
      const APInt ShiftVal = ValAndVReg->Value;
7125
7126 // Don't fold if we know this will be slow.
7127 return !(STI.hasAddrLSLSlow14() && (ShiftVal == 1 || ShiftVal == 4));
7128 }
7129 }
7130 return std::nullopt;
7131}
7132
7133/// Return true if it is worth folding MI into an extended register. That is,
7134/// if it's safe to pull it into the addressing mode of a load or store as a
7135/// shift.
7136/// \p IsAddrOperand whether the def of MI is used as an address operand
7137/// (e.g. feeding into an LDR/STR).
7138bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
7139 const MachineInstr &MI, const MachineRegisterInfo &MRI,
7140 bool IsAddrOperand) const {
7141
7142 // Always fold if there is one use, or if we're optimizing for size.
7143 Register DefReg = MI.getOperand(i: 0).getReg();
7144 if (MRI.hasOneNonDBGUse(RegNo: DefReg) ||
7145 MI.getParent()->getParent()->getFunction().hasOptSize())
7146 return true;
7147
7148 if (IsAddrOperand) {
7149 // If we are already sure that folding MI is good or bad, return the result.
7150 if (const auto Worth = isWorthFoldingIntoAddrMode(MI, MRI))
7151 return *Worth;
7152
7153 // Fold G_PTR_ADD if its offset operand can be folded
7154 if (MI.getOpcode() == AArch64::G_PTR_ADD) {
7155 MachineInstr *OffsetInst =
7156 getDefIgnoringCopies(Reg: MI.getOperand(i: 2).getReg(), MRI);
7157
7158 // Note, we already know G_PTR_ADD is used by at least two instructions.
7159 // If we are also sure about whether folding is beneficial or not,
7160 // return the result.
7161 if (const auto Worth = isWorthFoldingIntoAddrMode(MI: *OffsetInst, MRI))
7162 return *Worth;
7163 }
7164 }
7165
7166 // FIXME: Consider checking HasALULSLFast as appropriate.
7167
7168 // We have a fastpath, so folding a shift in and potentially computing it
7169 // many times may be beneficial. Check if this is only used in memory ops.
7170 // If it is, then we should fold.
7171 return all_of(Range: MRI.use_nodbg_instructions(Reg: DefReg),
7172 P: [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
7173}
7174
7175static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
7176 switch (Type) {
7177 case AArch64_AM::SXTB:
7178 case AArch64_AM::SXTH:
7179 case AArch64_AM::SXTW:
7180 return true;
7181 default:
7182 return false;
7183 }
7184}
7185
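/// Try to fold a shifted (and optionally extended) offset computation into a
/// register-offset addressing mode, e.g. [base, xN, lsl #shift] or
/// [base, wN, sxtw #shift]. \p SizeInBytes is the access size, which fixes
/// the only legal shift amount; \p WantsExt is true when selecting the
/// extended (32-bit offset) form and false for the plain X-register form.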
7186InstructionSelector::ComplexRendererFns
7187AArch64InstructionSelector::selectExtendedSHL(
7188 MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
7189 unsigned SizeInBytes, bool WantsExt) const {
7190 assert(Base.isReg() && "Expected base to be a register operand");
7191 assert(Offset.isReg() && "Expected offset to be a register operand");
7192
7193 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7194 MachineInstr *OffsetInst = MRI.getVRegDef(Reg: Offset.getReg());
7195
7196 unsigned OffsetOpc = OffsetInst->getOpcode();
7197 bool LookedThroughZExt = false;
7198 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) {
7199 // Try to look through a ZEXT.
7200 if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt)
7201 return std::nullopt;
7202
7203 OffsetInst = MRI.getVRegDef(Reg: OffsetInst->getOperand(i: 1).getReg());
7204 OffsetOpc = OffsetInst->getOpcode();
7205 LookedThroughZExt = true;
7206
7207 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
7208 return std::nullopt;
7209 }
7210 // Make sure that the memory op is a valid size.
7211 int64_t LegalShiftVal = Log2_32(Value: SizeInBytes);
7212 if (LegalShiftVal == 0)
7213 return std::nullopt;
7214 if (!isWorthFoldingIntoExtendedReg(MI: *OffsetInst, MRI, IsAddrOperand: true))
7215 return std::nullopt;
7216
7217 // Now, try to find the specific G_CONSTANT. Start by assuming that the
7218 // register we will offset is the LHS, and the register containing the
7219 // constant is the RHS.
7220 Register OffsetReg = OffsetInst->getOperand(i: 1).getReg();
7221 Register ConstantReg = OffsetInst->getOperand(i: 2).getReg();
7222 auto ValAndVReg = getIConstantVRegValWithLookThrough(VReg: ConstantReg, MRI);
7223 if (!ValAndVReg) {
7224 // We didn't get a constant on the RHS. If the opcode is a shift, then
7225 // we're done.
7226 if (OffsetOpc == TargetOpcode::G_SHL)
7227 return std::nullopt;
7228
7229 // If we have a G_MUL, we can use either register. Try looking at the RHS.
7230 std::swap(a&: OffsetReg, b&: ConstantReg);
7231 ValAndVReg = getIConstantVRegValWithLookThrough(VReg: ConstantReg, MRI);
7232 if (!ValAndVReg)
7233 return std::nullopt;
7234 }
7235
7236 // The value must fit into 3 bits, and must be positive. Make sure that is
7237 // true.
7238 int64_t ImmVal = ValAndVReg->Value.getSExtValue();
7239
7240 // Since we're going to pull this into a shift, the constant value must be
7241 // a power of 2. If we got a multiply, then we need to check this.
7242 if (OffsetOpc == TargetOpcode::G_MUL) {
7243 if (!llvm::has_single_bit<uint32_t>(Value: ImmVal))
7244 return std::nullopt;
7245
7246 // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
7247 ImmVal = Log2_32(Value: ImmVal);
7248 }
7249
7250 if ((ImmVal & 0x7) != ImmVal)
7251 return std::nullopt;
7252
7253 // We are only allowed to shift by LegalShiftVal. This shift value is built
7254 // into the instruction, so we can't just use whatever we want.
7255 if (ImmVal != LegalShiftVal)
7256 return std::nullopt;
7257
7258 unsigned SignExtend = 0;
7259 if (WantsExt) {
7260 // Check if the offset is defined by an extend, unless we looked through a
7261 // G_ZEXT earlier.
7262 if (!LookedThroughZExt) {
7263 MachineInstr *ExtInst = getDefIgnoringCopies(Reg: OffsetReg, MRI);
7264 auto Ext = getExtendTypeForInst(MI&: *ExtInst, MRI, IsLoadStore: true);
7265 if (Ext == AArch64_AM::InvalidShiftExtend)
7266 return std::nullopt;
7267
7268 SignExtend = isSignExtendShiftType(Type: Ext) ? 1 : 0;
7269 // We only support SXTW for signed extension here.
7270 if (SignExtend && Ext != AArch64_AM::SXTW)
7271 return std::nullopt;
7272 OffsetReg = ExtInst->getOperand(i: 1).getReg();
7273 }
7274
7275 // Need a 32-bit wide register here.
7276 MachineIRBuilder MIB(*MRI.getVRegDef(Reg: Root.getReg()));
7277 OffsetReg = moveScalarRegClass(Reg: OffsetReg, RC: AArch64::GPR32RegClass, MIB);
7278 }
7279
7280 // We can use the LHS of the GEP as the base, and the LHS of the shift as an
7281 // offset. Signify that we are shifting by setting the shift flag to 1.
7282 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: Base.getReg()); },
7283 [=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: OffsetReg); },
7284 [=](MachineInstrBuilder &MIB) {
7285 // Need to add both immediates here to make sure that they are both
7286 // added to the instruction.
7287 MIB.addImm(Val: SignExtend);
7288 MIB.addImm(Val: 1);
7289 }}};
7290}
7291
7292/// This is used for computing addresses like this:
7293///
7294/// ldr x1, [x2, x3, lsl #3]
7295///
7296/// Where x2 is the base register, and x3 is an offset register. The shift-left
7297/// is a constant value specific to this load instruction. That is, we'll never
7298/// see anything other than a 3 here (which corresponds to the size of the
7299/// element being loaded.)
7300InstructionSelector::ComplexRendererFns
7301AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
7302 MachineOperand &Root, unsigned SizeInBytes) const {
7303 if (!Root.isReg())
7304 return std::nullopt;
7305 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7306
7307 // We want to find something like this:
7308 //
7309 // val = G_CONSTANT LegalShiftVal
7310 // shift = G_SHL off_reg val
7311 // ptr = G_PTR_ADD base_reg shift
7312 // x = G_LOAD ptr
7313 //
7314 // And fold it into this addressing mode:
7315 //
7316 // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
7317
7318 // Check if we can find the G_PTR_ADD.
7319 MachineInstr *PtrAdd =
7320 getOpcodeDef(Opcode: TargetOpcode::G_PTR_ADD, Reg: Root.getReg(), MRI);
7321 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(MI: *PtrAdd, MRI, IsAddrOperand: true))
7322 return std::nullopt;
7323
7324 // Now, try to match an opcode which will match our specific offset.
7325 // We want a G_SHL or a G_MUL.
7326 MachineInstr *OffsetInst =
7327 getDefIgnoringCopies(Reg: PtrAdd->getOperand(i: 2).getReg(), MRI);
7328 return selectExtendedSHL(Root, Base&: PtrAdd->getOperand(i: 1),
7329 Offset&: OffsetInst->getOperand(i: 0), SizeInBytes,
7330 /*WantsExt=*/false);
7331}
7332
7333/// This is used for computing addresses like this:
7334///
7335/// ldr x1, [x2, x3]
7336///
7337/// Where x2 is the base register, and x3 is an offset register.
7338///
/// When it is possible and profitable to fold a G_PTR_ADD into the address
/// calculation, this will do so. Otherwise, it will return std::nullopt.
7341InstructionSelector::ComplexRendererFns
7342AArch64InstructionSelector::selectAddrModeRegisterOffset(
7343 MachineOperand &Root) const {
7344 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7345
7346 // We need a GEP.
7347 MachineInstr *Gep = MRI.getVRegDef(Reg: Root.getReg());
7348 if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
7349 return std::nullopt;
7350
7351 // If this is used more than once, let's not bother folding.
7352 // TODO: Check if they are memory ops. If they are, then we can still fold
7353 // without having to recompute anything.
7354 if (!MRI.hasOneNonDBGUse(RegNo: Gep->getOperand(i: 0).getReg()))
7355 return std::nullopt;
7356
7357 // Base is the GEP's LHS, offset is its RHS.
7358 return {{[=](MachineInstrBuilder &MIB) {
7359 MIB.addUse(RegNo: Gep->getOperand(i: 1).getReg());
7360 },
7361 [=](MachineInstrBuilder &MIB) {
7362 MIB.addUse(RegNo: Gep->getOperand(i: 2).getReg());
7363 },
7364 [=](MachineInstrBuilder &MIB) {
7365 // Need to add both immediates here to make sure that they are both
7366 // added to the instruction.
7367 MIB.addImm(Val: 0);
7368 MIB.addImm(Val: 0);
7369 }}};
7370}
7371
/// This is intended to be equivalent to selectAddrModeXRO in
/// AArch64ISelDAGToDAG. It's used for selecting X register offset loads.
7374InstructionSelector::ComplexRendererFns
7375AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
7376 unsigned SizeInBytes) const {
7377 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7378 if (!Root.isReg())
7379 return std::nullopt;
7380 MachineInstr *PtrAdd =
7381 getOpcodeDef(Opcode: TargetOpcode::G_PTR_ADD, Reg: Root.getReg(), MRI);
7382 if (!PtrAdd)
7383 return std::nullopt;
7384
  // Check for an immediate which cannot be encoded in the [base + imm]
  // addressing mode, and can't be encoded in an add/sub. If this happens,
  // we'll end up with code like:
  //
  // mov x0, wide
  // add x1, base, x0
  // ldr x2, [x1, x0]
7392 //
7393 // In this situation, we can use the [base, xreg] addressing mode to save an
7394 // add/sub:
7395 //
7396 // mov x0, wide
7397 // ldr x2, [base, x0]
7398 auto ValAndVReg =
7399 getIConstantVRegValWithLookThrough(VReg: PtrAdd->getOperand(i: 2).getReg(), MRI);
7400 if (ValAndVReg) {
7401 unsigned Scale = Log2_32(Value: SizeInBytes);
7402 int64_t ImmOff = ValAndVReg->Value.getSExtValue();
7403
7404 // Skip immediates that can be selected in the load/store addressing
7405 // mode.
7406 if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
7407 ImmOff < (0x1000 << Scale))
7408 return std::nullopt;
7409
7410 // Helper lambda to decide whether or not it is preferable to emit an add.
7411 auto isPreferredADD = [](int64_t ImmOff) {
7412 // Constants in [0x0, 0xfff] can be encoded in an add.
7413 if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
7414 return true;
7415
7416 // Can it be encoded in an add lsl #12?
7417 if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
7418 return false;
7419
7420 // It can be encoded in an add lsl #12, but we may not want to. If it is
7421 // possible to select this as a single movz, then prefer that. A single
7422 // movz is faster than an add with a shift.
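      // E.g. 0x10000 is a single "movz ..., #0x1, lsl #16", so we return
      // false for it, while 0x123000 needs "add ..., #0x123, lsl #12" and
      // returns true.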
7423 return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
7424 (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
7425 };
7426
7427 // If the immediate can be encoded in a single add/sub, then bail out.
7428 if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
7429 return std::nullopt;
7430 }
7431
7432 // Try to fold shifts into the addressing mode.
7433 auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
7434 if (AddrModeFns)
7435 return AddrModeFns;
7436
7437 // If that doesn't work, see if it's possible to fold in registers from
7438 // a GEP.
7439 return selectAddrModeRegisterOffset(Root);
7440}
7441
7442/// This is used for computing addresses like this:
7443///
7444/// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
7445///
7446/// Where we have a 64-bit base register, a 32-bit offset register, and an
7447/// extend (which may or may not be signed).
7448InstructionSelector::ComplexRendererFns
7449AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
7450 unsigned SizeInBytes) const {
7451 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7452
7453 MachineInstr *PtrAdd =
7454 getOpcodeDef(Opcode: TargetOpcode::G_PTR_ADD, Reg: Root.getReg(), MRI);
7455 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(MI: *PtrAdd, MRI, IsAddrOperand: true))
7456 return std::nullopt;
7457
7458 MachineOperand &LHS = PtrAdd->getOperand(i: 1);
7459 MachineOperand &RHS = PtrAdd->getOperand(i: 2);
7460 MachineInstr *OffsetInst = getDefIgnoringCopies(Reg: RHS.getReg(), MRI);
7461
7462 // The first case is the same as selectAddrModeXRO, except we need an extend.
7463 // In this case, we try to find a shift and extend, and fold them into the
7464 // addressing mode.
7465 //
7466 // E.g.
7467 //
7468 // off_reg = G_Z/S/ANYEXT ext_reg
7469 // val = G_CONSTANT LegalShiftVal
7470 // shift = G_SHL off_reg val
7471 // ptr = G_PTR_ADD base_reg shift
7472 // x = G_LOAD ptr
7473 //
7474 // In this case we can get a load like this:
7475 //
7476 // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
7477 auto ExtendedShl = selectExtendedSHL(Root, Base&: LHS, Offset&: OffsetInst->getOperand(i: 0),
7478 SizeInBytes, /*WantsExt=*/true);
7479 if (ExtendedShl)
7480 return ExtendedShl;
7481
  // There was no shift. We can still try to fold a G_Z/S/ANYEXT on its own
  // though.
7483 //
7484 // e.g.
7485 // ldr something, [base_reg, ext_reg, sxtw]
7486 if (!isWorthFoldingIntoExtendedReg(MI: *OffsetInst, MRI, IsAddrOperand: true))
7487 return std::nullopt;
7488
7489 // Check if this is an extend. We'll get an extend type if it is.
7490 AArch64_AM::ShiftExtendType Ext =
7491 getExtendTypeForInst(MI&: *OffsetInst, MRI, /*IsLoadStore=*/true);
7492 if (Ext == AArch64_AM::InvalidShiftExtend)
7493 return std::nullopt;
7494
7495 // Need a 32-bit wide register.
7496 MachineIRBuilder MIB(*PtrAdd);
7497 Register ExtReg = moveScalarRegClass(Reg: OffsetInst->getOperand(i: 1).getReg(),
7498 RC: AArch64::GPR32RegClass, MIB);
7499 unsigned SignExtend = Ext == AArch64_AM::SXTW;
7500
7501 // Base is LHS, offset is ExtReg.
7502 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: LHS.getReg()); },
7503 [=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: ExtReg); },
7504 [=](MachineInstrBuilder &MIB) {
7505 MIB.addImm(Val: SignExtend);
7506 MIB.addImm(Val: 0);
7507 }}};
7508}
7509
7510/// Select a "register plus unscaled signed 9-bit immediate" address. This
7511/// should only match when there is an offset that is not valid for a scaled
7512/// immediate addressing mode. The "Size" argument is the size in bytes of the
7513/// memory reference, which is needed here to know what is valid for a scaled
7514/// immediate.
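///
/// For example, an offset of #-3 on a 32-bit load cannot use the scaled
/// [reg, #imm] form, but does fit the unscaled signed 9-bit range [-256, 255]
/// checked below.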
7515InstructionSelector::ComplexRendererFns
7516AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
7517 unsigned Size) const {
7518 MachineRegisterInfo &MRI =
7519 Root.getParent()->getParent()->getParent()->getRegInfo();
7520
7521 if (!Root.isReg())
7522 return std::nullopt;
7523
7524 if (!isBaseWithConstantOffset(Root, MRI))
7525 return std::nullopt;
7526
7527 MachineInstr *RootDef = MRI.getVRegDef(Reg: Root.getReg());
7528
7529 MachineOperand &OffImm = RootDef->getOperand(i: 2);
7530 if (!OffImm.isReg())
7531 return std::nullopt;
7532 MachineInstr *RHS = MRI.getVRegDef(Reg: OffImm.getReg());
7533 if (RHS->getOpcode() != TargetOpcode::G_CONSTANT)
7534 return std::nullopt;
7535 int64_t RHSC;
7536 MachineOperand &RHSOp1 = RHS->getOperand(i: 1);
7537 if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
7538 return std::nullopt;
7539 RHSC = RHSOp1.getCImm()->getSExtValue();
7540
7541 if (RHSC >= -256 && RHSC < 256) {
7542 MachineOperand &Base = RootDef->getOperand(i: 1);
7543 return {{
7544 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Base); },
7545 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: RHSC); },
7546 }};
7547 }
7548 return std::nullopt;
7549}
7550
7551InstructionSelector::ComplexRendererFns
7552AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
7553 unsigned Size,
7554 MachineRegisterInfo &MRI) const {
7555 if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
7556 return std::nullopt;
7557 MachineInstr &Adrp = *MRI.getVRegDef(Reg: RootDef.getOperand(i: 1).getReg());
7558 if (Adrp.getOpcode() != AArch64::ADRP)
7559 return std::nullopt;
7560
7561 // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
7562 auto Offset = Adrp.getOperand(i: 1).getOffset();
7563 if (Offset % Size != 0)
7564 return std::nullopt;
7565
7566 auto GV = Adrp.getOperand(i: 1).getGlobal();
7567 if (GV->isThreadLocal())
7568 return std::nullopt;
7569
7570 auto &MF = *RootDef.getParent()->getParent();
7571 if (GV->getPointerAlignment(DL: MF.getDataLayout()) < Size)
7572 return std::nullopt;
7573
7574 unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM: MF.getTarget());
7575 MachineIRBuilder MIRBuilder(RootDef);
7576 Register AdrpReg = Adrp.getOperand(i: 0).getReg();
7577 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: AdrpReg); },
7578 [=](MachineInstrBuilder &MIB) {
7579 MIB.addGlobalAddress(GV, Offset,
7580 TargetFlags: OpFlags | AArch64II::MO_PAGEOFF |
7581 AArch64II::MO_NC);
7582 }}};
7583}
7584
7585/// Select a "register plus scaled unsigned 12-bit immediate" address. The
7586/// "Size" argument is the size in bytes of the memory reference, which
7587/// determines the scale.
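///
/// For example, with Size == 8 a byte offset of 32 is rendered as an
/// immediate of 4 (32 >> 3), matching "ldr x0, [x1, #32]".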
7588InstructionSelector::ComplexRendererFns
7589AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
7590 unsigned Size) const {
7591 MachineFunction &MF = *Root.getParent()->getParent()->getParent();
7592 MachineRegisterInfo &MRI = MF.getRegInfo();
7593
7594 if (!Root.isReg())
7595 return std::nullopt;
7596
7597 MachineInstr *RootDef = MRI.getVRegDef(Reg: Root.getReg());
7598 if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
7599 return {{
7600 [=](MachineInstrBuilder &MIB) { MIB.add(MO: RootDef->getOperand(i: 1)); },
7601 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); },
7602 }};
7603 }
7604
7605 CodeModel::Model CM = MF.getTarget().getCodeModel();
7606 // Check if we can fold in the ADD of small code model ADRP + ADD address.
7607 // HACK: ld64 on Darwin doesn't support relocations on PRFM, so we can't fold
7608 // globals into the offset.
7609 MachineInstr *RootParent = Root.getParent();
7610 if (CM == CodeModel::Small &&
7611 !(RootParent->getOpcode() == AArch64::G_AARCH64_PREFETCH &&
7612 STI.isTargetDarwin())) {
7613 auto OpFns = tryFoldAddLowIntoImm(RootDef&: *RootDef, Size, MRI);
7614 if (OpFns)
7615 return OpFns;
7616 }
7617
7618 if (isBaseWithConstantOffset(Root, MRI)) {
7619 MachineOperand &LHS = RootDef->getOperand(i: 1);
7620 MachineOperand &RHS = RootDef->getOperand(i: 2);
7621 MachineInstr *LHSDef = MRI.getVRegDef(Reg: LHS.getReg());
7622 MachineInstr *RHSDef = MRI.getVRegDef(Reg: RHS.getReg());
7623
7624 int64_t RHSC = (int64_t)RHSDef->getOperand(i: 1).getCImm()->getZExtValue();
7625 unsigned Scale = Log2_32(Value: Size);
7626 if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
7627 if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
7628 return {{
7629 [=](MachineInstrBuilder &MIB) { MIB.add(MO: LHSDef->getOperand(i: 1)); },
7630 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: RHSC >> Scale); },
7631 }};
7632
7633 return {{
7634 [=](MachineInstrBuilder &MIB) { MIB.add(MO: LHS); },
7635 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: RHSC >> Scale); },
7636 }};
7637 }
7638 }
7639
7640 // Before falling back to our general case, check if the unscaled
7641 // instructions can handle this. If so, that's preferable.
7642 if (selectAddrModeUnscaled(Root, Size))
7643 return std::nullopt;
7644
7645 return {{
7646 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); },
7647 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); },
7648 }};
7649}
7650
7651/// Given a shift instruction, return the correct shift type for that
7652/// instruction.
7653static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
7654 switch (MI.getOpcode()) {
7655 default:
7656 return AArch64_AM::InvalidShiftExtend;
7657 case TargetOpcode::G_SHL:
7658 return AArch64_AM::LSL;
7659 case TargetOpcode::G_LSHR:
7660 return AArch64_AM::LSR;
7661 case TargetOpcode::G_ASHR:
7662 return AArch64_AM::ASR;
7663 case TargetOpcode::G_ROTR:
7664 return AArch64_AM::ROR;
7665 }
7666}
7667
7668/// Select a "shifted register" operand. If the value is not shifted, set the
7669/// shift operand to a default value of "lsl 0".
7670InstructionSelector::ComplexRendererFns
7671AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
7672 bool AllowROR) const {
7673 if (!Root.isReg())
7674 return std::nullopt;
7675 MachineRegisterInfo &MRI =
7676 Root.getParent()->getParent()->getParent()->getRegInfo();
7677
7678 // Check if the operand is defined by an instruction which corresponds to
7679 // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
7680 MachineInstr *ShiftInst = MRI.getVRegDef(Reg: Root.getReg());
7681 AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(MI&: *ShiftInst);
7682 if (ShType == AArch64_AM::InvalidShiftExtend)
7683 return std::nullopt;
7684 if (ShType == AArch64_AM::ROR && !AllowROR)
7685 return std::nullopt;
7686 if (!isWorthFoldingIntoExtendedReg(MI: *ShiftInst, MRI, IsAddrOperand: false))
7687 return std::nullopt;
7688
7689 // Need an immediate on the RHS.
7690 MachineOperand &ShiftRHS = ShiftInst->getOperand(i: 2);
7691 auto Immed = getImmedFromMO(Root: ShiftRHS);
7692 if (!Immed)
7693 return std::nullopt;
7694
7695 // We have something that we can fold. Fold in the shift's LHS and RHS into
7696 // the instruction.
7697 MachineOperand &ShiftLHS = ShiftInst->getOperand(i: 1);
7698 Register ShiftReg = ShiftLHS.getReg();
7699
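  // The shift immediate is encoded modulo the register width, so only the
  // low log2(NumBits) bits of the constant are meaningful.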
7700 unsigned NumBits = MRI.getType(Reg: ShiftReg).getSizeInBits();
7701 unsigned Val = *Immed & (NumBits - 1);
7702 unsigned ShiftVal = AArch64_AM::getShifterImm(ST: ShType, Imm: Val);
7703
7704 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: ShiftReg); },
7705 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ShiftVal); }}};
7706}
7707
7708AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
7709 MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
7710 unsigned Opc = MI.getOpcode();
7711
7712 // Handle explicit extend instructions first.
7713 if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
7714 unsigned Size;
7715 if (Opc == TargetOpcode::G_SEXT)
7716 Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
7717 else
7718 Size = MI.getOperand(i: 2).getImm();
7719 assert(Size != 64 && "Extend from 64 bits?");
7720 switch (Size) {
7721 case 8:
7722 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
7723 case 16:
7724 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
7725 case 32:
7726 return AArch64_AM::SXTW;
7727 default:
7728 return AArch64_AM::InvalidShiftExtend;
7729 }
7730 }
7731
7732 if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
7733 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
7734 assert(Size != 64 && "Extend from 64 bits?");
7735 switch (Size) {
7736 case 8:
7737 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
7738 case 16:
7739 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
7740 case 32:
7741 return AArch64_AM::UXTW;
7742 default:
7743 return AArch64_AM::InvalidShiftExtend;
7744 }
7745 }
7746
7747 // Don't have an explicit extend. Try to handle a G_AND with a constant mask
7748 // on the RHS.
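  // E.g. "%v = G_AND %x, 0xFFFF" acts as a UXTH of %x.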
7749 if (Opc != TargetOpcode::G_AND)
7750 return AArch64_AM::InvalidShiftExtend;
7751
7752 std::optional<uint64_t> MaybeAndMask = getImmedFromMO(Root: MI.getOperand(i: 2));
7753 if (!MaybeAndMask)
7754 return AArch64_AM::InvalidShiftExtend;
7755 uint64_t AndMask = *MaybeAndMask;
7756 switch (AndMask) {
7757 default:
7758 return AArch64_AM::InvalidShiftExtend;
7759 case 0xFF:
7760 return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
7761 case 0xFFFF:
7762 return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
7763 case 0xFFFFFFFF:
7764 return AArch64_AM::UXTW;
7765 }
7766}
7767
7768Register AArch64InstructionSelector::moveScalarRegClass(
7769 Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
7770 MachineRegisterInfo &MRI = *MIB.getMRI();
7771 auto Ty = MRI.getType(Reg);
7772 assert(!Ty.isVector() && "Expected scalars only!");
7773 if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
7774 return Reg;
7775
7776 // Create a copy and immediately select it.
7777 // FIXME: We should have an emitCopy function?
7778 auto Copy = MIB.buildCopy(Res: {&RC}, Op: {Reg});
7779 selectCopy(I&: *Copy, TII, MRI, TRI, RBI);
7780 return Copy.getReg(Idx: 0);
7781}
7782
7783/// Select an "extended register" operand. This operand folds in an extend
7784/// followed by an optional left shift.
7785InstructionSelector::ComplexRendererFns
7786AArch64InstructionSelector::selectArithExtendedRegister(
7787 MachineOperand &Root) const {
7788 if (!Root.isReg())
7789 return std::nullopt;
7790 MachineRegisterInfo &MRI =
7791 Root.getParent()->getParent()->getParent()->getRegInfo();
7792
7793 uint64_t ShiftVal = 0;
7794 Register ExtReg;
7795 AArch64_AM::ShiftExtendType Ext;
7796 MachineInstr *RootDef = getDefIgnoringCopies(Reg: Root.getReg(), MRI);
7797 if (!RootDef)
7798 return std::nullopt;
7799
7800 if (!isWorthFoldingIntoExtendedReg(MI: *RootDef, MRI, IsAddrOperand: false))
7801 return std::nullopt;
7802
7803 // Check if we can fold a shift and an extend.
7804 if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
7805 // Look for a constant on the RHS of the shift.
7806 MachineOperand &RHS = RootDef->getOperand(i: 2);
7807 std::optional<uint64_t> MaybeShiftVal = getImmedFromMO(Root: RHS);
7808 if (!MaybeShiftVal)
7809 return std::nullopt;
7810 ShiftVal = *MaybeShiftVal;
7811 if (ShiftVal > 4)
7812 return std::nullopt;
7813 // Look for a valid extend instruction on the LHS of the shift.
7814 MachineOperand &LHS = RootDef->getOperand(i: 1);
7815 MachineInstr *ExtDef = getDefIgnoringCopies(Reg: LHS.getReg(), MRI);
7816 if (!ExtDef)
7817 return std::nullopt;
7818 Ext = getExtendTypeForInst(MI&: *ExtDef, MRI);
7819 if (Ext == AArch64_AM::InvalidShiftExtend)
7820 return std::nullopt;
7821 ExtReg = ExtDef->getOperand(i: 1).getReg();
7822 } else {
7823 // Didn't get a shift. Try just folding an extend.
7824 Ext = getExtendTypeForInst(MI&: *RootDef, MRI);
7825 if (Ext == AArch64_AM::InvalidShiftExtend)
7826 return std::nullopt;
7827 ExtReg = RootDef->getOperand(i: 1).getReg();
7828
7829 // If we have a 32 bit instruction which zeroes out the high half of a
7830 // register, we get an implicit zero extend for free. Check if we have one.
7831 // FIXME: We actually emit the extend right now even though we don't have
7832 // to.
7833 if (Ext == AArch64_AM::UXTW && MRI.getType(Reg: ExtReg).getSizeInBits() == 32) {
7834 MachineInstr *ExtInst = MRI.getVRegDef(Reg: ExtReg);
7835 if (isDef32(MI: *ExtInst))
7836 return std::nullopt;
7837 }
7838 }
7839
7840 // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
7841 // copy.
7842 MachineIRBuilder MIB(*RootDef);
7843 ExtReg = moveScalarRegClass(Reg: ExtReg, RC: AArch64::GPR32RegClass, MIB);
7844
7845 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: ExtReg); },
7846 [=](MachineInstrBuilder &MIB) {
7847 MIB.addImm(Val: getArithExtendImm(ET: Ext, Imm: ShiftVal));
7848 }}};
7849}
7850
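/// Match the high half of a 128-bit source vector, looking through
/// little-endian G_BITCASTs: either the second (high) result of a two-way
/// G_UNMERGE_VALUES, or a G_EXTRACT_VECTOR_ELT of lane 1 from a v2s64.
/// Renders the full source vector register.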
7851InstructionSelector::ComplexRendererFns
7852AArch64InstructionSelector::selectExtractHigh(MachineOperand &Root) const {
7853 if (!Root.isReg())
7854 return std::nullopt;
7855 MachineRegisterInfo &MRI =
7856 Root.getParent()->getParent()->getParent()->getRegInfo();
7857
7858 auto Extract = getDefSrcRegIgnoringCopies(Reg: Root.getReg(), MRI);
7859 while (Extract && Extract->MI->getOpcode() == TargetOpcode::G_BITCAST &&
7860 STI.isLittleEndian())
7861 Extract =
7862 getDefSrcRegIgnoringCopies(Reg: Extract->MI->getOperand(i: 1).getReg(), MRI);
7863 if (!Extract)
7864 return std::nullopt;
7865
7866 if (Extract->MI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
7867 if (Extract->Reg == Extract->MI->getOperand(i: 1).getReg()) {
7868 Register ExtReg = Extract->MI->getOperand(i: 2).getReg();
7869 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: ExtReg); }}};
7870 }
7871 }
7872 if (Extract->MI->getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT) {
7873 LLT SrcTy = MRI.getType(Reg: Extract->MI->getOperand(i: 1).getReg());
7874 auto LaneIdx = getIConstantVRegValWithLookThrough(
7875 VReg: Extract->MI->getOperand(i: 2).getReg(), MRI);
7876 if (LaneIdx && SrcTy == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64) &&
7877 LaneIdx->Value.getSExtValue() == 1) {
7878 Register ExtReg = Extract->MI->getOperand(i: 1).getReg();
7879 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: ExtReg); }}};
7880 }
7881 }
7882
7883 return std::nullopt;
7884}
7885
7886void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
7887 const MachineInstr &MI,
7888 int OpIdx) const {
7889 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
7890 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7891 "Expected G_CONSTANT");
7892 std::optional<int64_t> CstVal =
7893 getIConstantVRegSExtVal(VReg: MI.getOperand(i: 0).getReg(), MRI);
7894 assert(CstVal && "Expected constant value");
7895 MIB.addImm(Val: *CstVal);
7896}
7897
7898void AArch64InstructionSelector::renderLogicalImm32(
7899 MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
7900 assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7901 "Expected G_CONSTANT");
7902 uint64_t CstVal = I.getOperand(i: 1).getCImm()->getZExtValue();
7903 uint64_t Enc = AArch64_AM::encodeLogicalImmediate(imm: CstVal, regSize: 32);
7904 MIB.addImm(Val: Enc);
7905}
7906
7907void AArch64InstructionSelector::renderLogicalImm64(
7908 MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
7909 assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7910 "Expected G_CONSTANT");
7911 uint64_t CstVal = I.getOperand(i: 1).getCImm()->getZExtValue();
7912 uint64_t Enc = AArch64_AM::encodeLogicalImmediate(imm: CstVal, regSize: 64);
7913 MIB.addImm(Val: Enc);
7914}
7915
7916void AArch64InstructionSelector::renderUbsanTrap(MachineInstrBuilder &MIB,
7917 const MachineInstr &MI,
7918 int OpIdx) const {
7919 assert(MI.getOpcode() == TargetOpcode::G_UBSANTRAP && OpIdx == 0 &&
7920 "Expected G_UBSANTRAP");
7921 MIB.addImm(Val: MI.getOperand(i: 0).getImm() | ('U' << 8));
7922}
7923
7924void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
7925 const MachineInstr &MI,
7926 int OpIdx) const {
7927 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7928 "Expected G_FCONSTANT");
7929 MIB.addImm(
7930 Val: AArch64_AM::getFP16Imm(FPImm: MI.getOperand(i: 1).getFPImm()->getValueAPF()));
7931}
7932
7933void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
7934 const MachineInstr &MI,
7935 int OpIdx) const {
7936 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7937 "Expected G_FCONSTANT");
7938 MIB.addImm(
7939 Val: AArch64_AM::getFP32Imm(FPImm: MI.getOperand(i: 1).getFPImm()->getValueAPF()));
7940}
7941
7942void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
7943 const MachineInstr &MI,
7944 int OpIdx) const {
7945 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7946 "Expected G_FCONSTANT");
7947 MIB.addImm(
7948 Val: AArch64_AM::getFP64Imm(FPImm: MI.getOperand(i: 1).getFPImm()->getValueAPF()));
7949}
7950
7951void AArch64InstructionSelector::renderFPImm32SIMDModImmType4(
7952 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7953 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7954 "Expected G_FCONSTANT");
7955 MIB.addImm(Val: AArch64_AM::encodeAdvSIMDModImmType4(Imm: MI.getOperand(i: 1)
7956 .getFPImm()
7957 ->getValueAPF()
7958 .bitcastToAPInt()
7959 .getZExtValue()));
7960}
7961
7962bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
7963 const MachineInstr &MI, unsigned NumBytes) const {
7964 if (!MI.mayLoadOrStore())
7965 return false;
7966 assert(MI.hasOneMemOperand() &&
7967 "Expected load/store to have only one mem op!");
7968 return (*MI.memoperands_begin())->getSize() == NumBytes;
7969}
7970
7971bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
7972 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
7973 if (MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits() != 32)
7974 return false;
7975
7976 // Only return true if we know the operation will zero-out the high half of
7977 // the 64-bit register. Truncates can be subregister copies, which don't
7978 // zero out the high bits. Copies and other copy-like instructions can be
7979 // fed by truncates, or could be lowered as subregister copies.
7980 switch (MI.getOpcode()) {
7981 default:
7982 return true;
7983 case TargetOpcode::COPY:
7984 case TargetOpcode::G_BITCAST:
7985 case TargetOpcode::G_TRUNC:
7986 case TargetOpcode::G_PHI:
7987 return false;
7988 }
7989}
7990
7991
7992// Perform fixups on the given PHI instruction's operands to force them all
7993// to be the same as the destination regbank.
7994static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
7995 const AArch64RegisterBankInfo &RBI) {
7996 assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
7997 Register DstReg = MI.getOperand(i: 0).getReg();
7998 const RegisterBank *DstRB = MRI.getRegBankOrNull(Reg: DstReg);
7999 assert(DstRB && "Expected PHI dst to have regbank assigned");
8000 MachineIRBuilder MIB(MI);
8001
8002 // Go through each operand and ensure it has the same regbank.
8003 for (MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI.operands())) {
8004 if (!MO.isReg())
8005 continue;
8006 Register OpReg = MO.getReg();
8007 const RegisterBank *RB = MRI.getRegBankOrNull(Reg: OpReg);
8008 if (RB != DstRB) {
8009 // Insert a cross-bank copy.
8010 auto *OpDef = MRI.getVRegDef(Reg: OpReg);
8011 const LLT &Ty = MRI.getType(Reg: OpReg);
8012 MachineBasicBlock &OpDefBB = *OpDef->getParent();
8013
8014 // Any instruction we insert must appear after all PHIs in the block
8015 // for the block to be valid MIR.
8016 MachineBasicBlock::iterator InsertPt = std::next(x: OpDef->getIterator());
8017 if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
8018 InsertPt = OpDefBB.getFirstNonPHI();
8019 MIB.setInsertPt(MBB&: *OpDef->getParent(), II: InsertPt);
8020 auto Copy = MIB.buildCopy(Res: Ty, Op: OpReg);
8021 MRI.setRegBank(Reg: Copy.getReg(Idx: 0), RegBank: *DstRB);
8022 MO.setReg(Copy.getReg(Idx: 0));
8023 }
8024 }
8025}
8026
8027void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
  // We're looking for PHIs; build a list first so we don't invalidate
  // iterators while inserting fixups.
8029 MachineRegisterInfo &MRI = MF.getRegInfo();
8030 SmallVector<MachineInstr *, 32> Phis;
8031 for (auto &BB : MF) {
8032 for (auto &MI : BB) {
8033 if (MI.getOpcode() == TargetOpcode::G_PHI)
8034 Phis.emplace_back(Args: &MI);
8035 }
8036 }
8037
8038 for (auto *MI : Phis) {
8039 // We need to do some work here if the operand types are < 16 bit and they
8040 // are split across fpr/gpr banks. Since all types <32b on gpr
8041 // end up being assigned gpr32 regclasses, we can end up with PHIs here
8042 // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
    // be selecting heterogeneous regbanks for operands if possible, but we
8044 // still need to be able to deal with it here.
8045 //
8046 // To fix this, if we have a gpr-bank operand < 32b in size and at least
8047 // one other operand is on the fpr bank, then we add cross-bank copies
8048 // to homogenize the operand banks. For simplicity the bank that we choose
8049 // to settle on is whatever bank the def operand has. For example:
8050 //
8051 // %endbb:
8052 // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
8053 // =>
8054 // %bb2:
8055 // ...
8056 // %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
8057 // ...
8058 // %endbb:
8059 // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
8060 bool HasGPROp = false, HasFPROp = false;
8061 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI->operands())) {
8062 if (!MO.isReg())
8063 continue;
8064 const LLT &Ty = MRI.getType(Reg: MO.getReg());
8065 if (!Ty.isValid() || !Ty.isScalar())
8066 break;
8067 if (Ty.getSizeInBits() >= 32)
8068 break;
8069 const RegisterBank *RB = MRI.getRegBankOrNull(Reg: MO.getReg());
      // If for some reason we don't have a regbank yet, don't try anything.
8071 if (!RB)
8072 break;
8073
8074 if (RB->getID() == AArch64::GPRRegBankID)
8075 HasGPROp = true;
8076 else
8077 HasFPROp = true;
8078 }
    // We have heterogeneous regbanks, so we need to fix them up.
8080 if (HasGPROp && HasFPROp)
8081 fixupPHIOpBanks(MI&: *MI, MRI, RBI);
8082 }
8083}
8084
8085namespace llvm {
8086InstructionSelector *
8087createAArch64InstructionSelector(const AArch64TargetMachine &TM,
8088 const AArch64Subtarget &Subtarget,
8089 const AArch64RegisterBankInfo &RBI) {
8090 return new AArch64InstructionSelector(TM, Subtarget, RBI);
8091}
8092}
8093