1//===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AArch64.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AArch64GlobalISelUtils.h"
15#include "AArch64InstrInfo.h"
16#include "AArch64MachineFunctionInfo.h"
17#include "AArch64RegisterBankInfo.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
20#include "AArch64TargetMachine.h"
21#include "MCTargetDesc/AArch64AddressingModes.h"
22#include "MCTargetDesc/AArch64MCTargetDesc.h"
23#include "llvm/BinaryFormat/Dwarf.h"
24#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
25#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
26#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
27#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
28#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
29#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
30#include "llvm/CodeGen/GlobalISel/Utils.h"
31#include "llvm/CodeGen/MachineBasicBlock.h"
32#include "llvm/CodeGen/MachineConstantPool.h"
33#include "llvm/CodeGen/MachineFrameInfo.h"
34#include "llvm/CodeGen/MachineFunction.h"
35#include "llvm/CodeGen/MachineInstr.h"
36#include "llvm/CodeGen/MachineInstrBuilder.h"
37#include "llvm/CodeGen/MachineMemOperand.h"
38#include "llvm/CodeGen/MachineOperand.h"
39#include "llvm/CodeGen/MachineRegisterInfo.h"
40#include "llvm/CodeGen/TargetOpcodes.h"
41#include "llvm/CodeGen/TargetRegisterInfo.h"
42#include "llvm/IR/Constants.h"
43#include "llvm/IR/DerivedTypes.h"
44#include "llvm/IR/Instructions.h"
45#include "llvm/IR/IntrinsicsAArch64.h"
46#include "llvm/IR/Type.h"
47#include "llvm/Pass.h"
48#include "llvm/Support/Debug.h"
49#include "llvm/Support/raw_ostream.h"
50#include <optional>
51
52#define DEBUG_TYPE "aarch64-isel"
53
54using namespace llvm;
55using namespace MIPatternMatch;
56using namespace AArch64GISelUtils;
57
58namespace llvm {
59class BlockFrequencyInfo;
60class ProfileSummaryInfo;
61}
62
63namespace {
64
65#define GET_GLOBALISEL_PREDICATE_BITSET
66#include "AArch64GenGlobalISel.inc"
67#undef GET_GLOBALISEL_PREDICATE_BITSET
68
69
/// AArch64-specific GlobalISel instruction selector.
///
/// Pairs the tblgen-erated pattern matcher (selectImpl) with hand-written
/// C++ lowering and selection routines for operations the TableGen importer
/// cannot express yet (see the \todo in the file header).
class AArch64InstructionSelector : public InstructionSelector {
public:
  AArch64InstructionSelector(const AArch64TargetMachine &TM,
                             const AArch64Subtarget &STI,
                             const AArch64RegisterBankInfo &RBI);

  /// Select target instructions for the (legalized, regbank-assigned)
  /// generic instruction \p I. Returns true on success.
  bool select(MachineInstr &I) override;
  static const char *getName() { return DEBUG_TYPE; }

  /// Per-function setup: resets cached per-function state before selection.
  void setupMF(MachineFunction &MF, GISelValueTracking *VT,
               CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI,
               BlockFrequencyInfo *BFI) override {
    InstructionSelector::setupMF(mf&: MF, vt: VT, covinfo: CoverageInfo, psi: PSI, bfi: BFI);
    MIB.setMF(MF);

    // hasFnAttribute() is expensive to call on every BRCOND selection, so
    // cache it here for each run of the selector.
    ProduceNonFlagSettingCondBr =
        !MF.getFunction().hasFnAttribute(Kind: Attribute::SpeculativeLoadHardening);
    MFReturnAddr = Register();

    processPHIs(MF);
  }

private:
  /// tblgen-erated 'select' implementation, used as the initial selector for
  /// the patterns that don't require complex C++.
  bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;

  // A lowering phase that runs before any selection attempts.
  // Returns true if the instruction was modified.
  bool preISelLower(MachineInstr &I);

  // An early selection function that runs before the selectImpl() call.
  bool earlySelect(MachineInstr &I);

  /// Save state that is shared between select calls, call select on \p I and
  /// then restore the saved state. This can be used to recursively call select
  /// within a select call.
  bool selectAndRestoreState(MachineInstr &I);

  // Do some preprocessing of G_PHIs before we begin selection.
  void processPHIs(MachineFunction &MF);

  bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);

  /// Eliminate same-sized cross-bank copies into stores before selectImpl().
  bool contractCrossBankCopyIntoStore(MachineInstr &I,
                                      MachineRegisterInfo &MRI);

  bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);

  // G_VASTART lowering for the AAPCS and Darwin ABIs, respectively.
  bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
                          MachineRegisterInfo &MRI) const;
  bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI) const;

  ///@{
  /// Helper functions for selectCompareBranch.
  bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
                                    MachineIRBuilder &MIB) const;
  bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
                                    MachineIRBuilder &MIB) const;
  bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
                                    MachineIRBuilder &MIB) const;
  bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
                                  MachineBasicBlock *DstMBB,
                                  MachineIRBuilder &MIB) const;
  ///@}

  bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI);

  // Selection for vector shift operations.
  bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);

  // Helper to generate an equivalent of scalar_to_vector into a new register,
  // returned via 'Dst'.
  MachineInstr *emitScalarToVector(unsigned EltSize,
                                   const TargetRegisterClass *DstRC,
                                   Register Scalar,
                                   MachineIRBuilder &MIRBuilder) const;
  /// Helper to narrow vector that was widened by emitScalarToVector.
  /// Copy lowest part of 128-bit or 64-bit vector to 64-bit or 32-bit
  /// vector, correspondingly.
  MachineInstr *emitNarrowVector(Register DstReg, Register SrcReg,
                                 MachineIRBuilder &MIRBuilder,
                                 MachineRegisterInfo &MRI) const;

  /// Emit a lane insert into \p DstReg, or a new vector register if
  /// std::nullopt is provided.
  ///
  /// The lane inserted into is defined by \p LaneIdx. The vector source
  /// register is given by \p SrcReg. The register containing the element is
  /// given by \p EltReg.
  MachineInstr *emitLaneInsert(std::optional<Register> DstReg, Register SrcReg,
                               Register EltReg, unsigned LaneIdx,
                               const RegisterBank &RB,
                               MachineIRBuilder &MIRBuilder) const;

  /// Emit a sequence of instructions representing a constant \p CV for a
  /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
  ///
  /// \returns the last instruction in the sequence on success, and nullptr
  /// otherwise.
  MachineInstr *emitConstantVector(Register Dst, Constant *CV,
                                   MachineIRBuilder &MIRBuilder,
                                   MachineRegisterInfo &MRI);

  // Attempts to materialize \p Bits into \p Dst using one of the AdvSIMD
  // modified-immediate encodings. \p Inv presumably requests the inverted
  // (MVNI-style) encoding where one exists -- TODO confirm in implementation.
  MachineInstr *tryAdvSIMDModImm8(Register Dst, unsigned DstSize, APInt Bits,
                                  MachineIRBuilder &MIRBuilder);

  MachineInstr *tryAdvSIMDModImm16(Register Dst, unsigned DstSize, APInt Bits,
                                   MachineIRBuilder &MIRBuilder, bool Inv);

  MachineInstr *tryAdvSIMDModImm32(Register Dst, unsigned DstSize, APInt Bits,
                                   MachineIRBuilder &MIRBuilder, bool Inv);
  MachineInstr *tryAdvSIMDModImm64(Register Dst, unsigned DstSize, APInt Bits,
                                   MachineIRBuilder &MIRBuilder);
  MachineInstr *tryAdvSIMDModImm321s(Register Dst, unsigned DstSize, APInt Bits,
                                     MachineIRBuilder &MIRBuilder, bool Inv);
  MachineInstr *tryAdvSIMDModImmFP(Register Dst, unsigned DstSize, APInt Bits,
                                   MachineIRBuilder &MIRBuilder);

  bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
                              MachineRegisterInfo &MRI);
  /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a
  /// SUBREG_TO_REG.
  bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI);
  bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);

  bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);

  /// Helper function to select vector load intrinsics like
  /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
  /// \p Opc is the opcode that the selected instruction should use.
  /// \p NumVecs is the number of vector destinations for the instruction.
  /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
  bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
                                 MachineInstr &I);
  bool selectVectorLoadLaneIntrinsic(unsigned Opc, unsigned NumVecs,
                                     MachineInstr &I);
  // Store counterparts of the vector load intrinsic helpers above.
  void selectVectorStoreIntrinsic(MachineInstr &I, unsigned NumVecs,
                                  unsigned Opc);
  bool selectVectorStoreLaneIntrinsic(MachineInstr &I, unsigned NumVecs,
                                      unsigned Opc);
  bool selectIntrinsicWithSideEffects(MachineInstr &I,
                                      MachineRegisterInfo &MRI);
  bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectPtrAuthGlobalValue(MachineInstr &I,
                                MachineRegisterInfo &MRI) const;
  bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
  void SelectTable(MachineInstr &I, MachineRegisterInfo &MRI, unsigned NumVecs,
                   unsigned Opc1, unsigned Opc2, bool isExt);

  // Selection of pre/post-indexed load/store forms.
  bool selectIndexedExtLoad(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectIndexedLoad(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectIndexedStore(GIndexedStore &I, MachineRegisterInfo &MRI);

  // Constant-pool materialization helpers.
  unsigned emitConstantPoolEntry(const Constant *CPVal,
                                 MachineFunction &MF) const;
  MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
                                         MachineIRBuilder &MIRBuilder) const;

  // Emit a vector concat operation.
  MachineInstr *emitVectorConcat(std::optional<Register> Dst, Register Op1,
                                 Register Op2,
                                 MachineIRBuilder &MIRBuilder) const;

  // Emit an integer compare between LHS and RHS, which checks for Predicate.
  MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                   MachineOperand &Predicate,
                                   MachineIRBuilder &MIRBuilder) const;

  /// Emit a floating point comparison between \p LHS and \p RHS.
  /// \p Pred if given is the intended predicate to use.
  MachineInstr *
  emitFPCompare(Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
                std::optional<CmpInst::Predicate> = std::nullopt) const;

  // Generic instruction emitter; \p RenderFns, if given, supplies extra
  // operand renderers to apply after the listed source operands.
  MachineInstr *
  emitInstr(unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
            std::initializer_list<llvm::SrcOp> SrcOps,
            MachineIRBuilder &MIRBuilder,
            const ComplexRendererFns &RenderFns = std::nullopt) const;
  /// Helper function to emit an add or sub instruction.
  ///
  /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants above
  /// in a specific order.
  ///
  /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
  ///
  /// \code
  ///   const std::array<std::array<unsigned, 2>, 4> Table {
  ///    {{AArch64::ADDXri, AArch64::ADDWri},
  ///     {AArch64::ADDXrs, AArch64::ADDWrs},
  ///     {AArch64::ADDXrr, AArch64::ADDWrr},
  ///     {AArch64::SUBXri, AArch64::SUBWri},
  ///     {AArch64::ADDXrx, AArch64::ADDWrx}}};
  /// \endcode
  ///
  /// Each row in the table corresponds to a different addressing mode. Each
  /// column corresponds to a different register size.
  ///
  /// \attention Rows must be structured as follows:
  ///   - Row 0: The ri opcode variants
  ///   - Row 1: The rs opcode variants
  ///   - Row 2: The rr opcode variants
  ///   - Row 3: The ri opcode variants for negative immediates
  ///   - Row 4: The rx opcode variants
  ///
  /// \attention Columns must be structured as follows:
  ///   - Column 0: The 64-bit opcode variants
  ///   - Column 1: The 32-bit opcode variants
  ///
  /// \p Dst is the destination register of the binop to emit.
  /// \p LHS is the left-hand operand of the binop to emit.
  /// \p RHS is the right-hand operand of the binop to emit.
  MachineInstr *emitAddSub(
      const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
      Register Dst, MachineOperand &LHS, MachineOperand &RHS,
      MachineIRBuilder &MIRBuilder) const;
  // Emitters for specific arithmetic/compare instruction forms, built on
  // emitAddSub/emitInstr above.
  MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
                        MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSBCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitCMP(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
                           AArch64CC::CondCode CC,
                           MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitExtractVectorElt(std::optional<Register> DstReg,
                                     const RegisterBank &DstRB, LLT ScalarTy,
                                     Register VecReg, unsigned LaneIdx,
                                     MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2,
                          AArch64CC::CondCode Pred,
                          MachineIRBuilder &MIRBuilder) const;
  /// Emit a CSet for a FP compare.
  ///
  /// \p Dst is expected to be a 32-bit scalar register.
  MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
                                MachineIRBuilder &MIRBuilder) const;

  /// Emit an instruction that sets NZCV to the carry-in expected by \p I.
  /// Might elide the instruction if the previous instruction already sets NZCV
  /// correctly.
  MachineInstr *emitCarryIn(MachineInstr &I, Register CarryReg);

  /// Emit the overflow op for \p Opcode.
  ///
  /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
  /// G_USUBO, etc.
  std::pair<MachineInstr *, AArch64CC::CondCode>
  emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
                 MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;

  bool selectOverflowOp(MachineInstr &I, MachineRegisterInfo &MRI);

  /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
  /// In some cases this is even possible with OR operations in the expression.
  MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC,
                                MachineIRBuilder &MIB) const;
  MachineInstr *emitConditionalComparison(Register LHS, Register RHS,
                                          CmpInst::Predicate CC,
                                          AArch64CC::CondCode Predicate,
                                          AArch64CC::CondCode OutCC,
                                          MachineIRBuilder &MIB) const;
  MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC,
                                   bool Negate, Register CCOp,
                                   AArch64CC::CondCode Predicate,
                                   MachineIRBuilder &MIB) const;

  /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
  /// \p IsNegative is true if the test should be "not zero".
  /// This will also optimize the test bit instruction when possible.
  MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
                            MachineBasicBlock *DstMBB,
                            MachineIRBuilder &MIB) const;

  /// Emit a CB(N)Z instruction which branches to \p DestMBB.
  MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
                        MachineBasicBlock *DestMBB,
                        MachineIRBuilder &MIB) const;

  // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
  // We use these manually instead of using the importer since it doesn't
  // support SDNodeXForm.
  ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;

  ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
  ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
  ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;

  ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
                                            unsigned Size) const;

  // Size-specialized wrappers over selectAddrModeUnscaled; \p Size is the
  // access size in bytes.
  ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, Size: 1);
  }
  ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, Size: 2);
  }
  ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, Size: 4);
  }
  ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, Size: 8);
  }
  ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, Size: 16);
  }

  /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used
  /// from complex pattern matchers like selectAddrModeIndexed().
  ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
                                          MachineRegisterInfo &MRI) const;

  ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
                                           unsigned Size) const;
  template <int Width>
  ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
    return selectAddrModeIndexed(Root, Size: Width / 8);
  }

  std::optional<bool>
  isWorthFoldingIntoAddrMode(const MachineInstr &MI,
                             const MachineRegisterInfo &MRI) const;

  bool isWorthFoldingIntoExtendedReg(const MachineInstr &MI,
                                     const MachineRegisterInfo &MRI,
                                     bool IsAddrOperand) const;
  ComplexRendererFns
  selectAddrModeShiftedExtendXReg(MachineOperand &Root,
                                  unsigned SizeInBytes) const;

  /// Returns a \p ComplexRendererFns which contains a base, offset, and whether
  /// or not a shift + extend should be folded into an addressing mode. Returns
  /// None when this is not profitable or possible.
  ComplexRendererFns
  selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
                    MachineOperand &Offset, unsigned SizeInBytes,
                    bool WantsExt) const;
  ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
                                       unsigned SizeInBytes) const;
  template <int Width>
  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
    return selectAddrModeXRO(Root, SizeInBytes: Width / 8);
  }

  ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
                                       unsigned SizeInBytes) const;
  template <int Width>
  ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
    return selectAddrModeWRO(Root, SizeInBytes: Width / 8);
  }

  ComplexRendererFns selectShiftedRegister(MachineOperand &Root,
                                           bool AllowROR = false) const;

  ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
    return selectShiftedRegister(Root);
  }

  // Logical (as opposed to arithmetic) shifted-register forms additionally
  // permit ROR shifts.
  ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
    return selectShiftedRegister(Root, AllowROR: true);
  }

  /// Given an extend instruction, determine the correct shift-extend type for
  /// that instruction.
  ///
  /// If the instruction is going to be used in a load or store, pass
  /// \p IsLoadStore = true.
  AArch64_AM::ShiftExtendType
  getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
                       bool IsLoadStore = false) const;

  /// Move \p Reg to \p RC if \p Reg is not already on \p RC.
  ///
  /// \returns Either \p Reg if no change was necessary, or the new register
  /// created by moving \p Reg.
  ///
  /// Note: This uses emitCopy right now.
  Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
                              MachineIRBuilder &MIB) const;

  ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;

  ComplexRendererFns selectExtractHigh(MachineOperand &Root) const;

  ComplexRendererFns selectCVTFixedPointVec(MachineOperand &Root) const;
  ComplexRendererFns
  selectCVTFixedPointVecBase(const MachineOperand &Root) const;
  void renderFixedPointXForm(MachineInstrBuilder &MIB, const MachineInstr &MI,
                             int OpIdx = -1) const;

  // Custom operand renderers referenced from the tblgen-erated match tables:
  // each appends a rendered operand for \p MI's operand \p OpIdx to \p MIB.
  void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
                      int OpIdx = -1) const;
  void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
                          int OpIdx = -1) const;
  void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
                          int OpIdx = -1) const;
  void renderUbsanTrap(MachineInstrBuilder &MIB, const MachineInstr &MI,
                       int OpIdx) const;
  void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB,
                                    const MachineInstr &MI,
                                    int OpIdx = -1) const;

  // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
  void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);

  // Optimization methods.
  bool tryOptSelect(GSelect &Sel);
  bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI);
  MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                      MachineOperand &Predicate,
                                      MachineIRBuilder &MIRBuilder) const;

  /// Return true if \p MI is a load or store of \p NumBytes bytes.
  bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;

  /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
  /// register zeroed out. In other words, the result of MI has been explicitly
  /// zero extended.
  bool isDef32(const MachineInstr &MI) const;

  // Cached target objects; valid for the lifetime of the selector.
  const AArch64TargetMachine &TM;
  const AArch64Subtarget &STI;
  const AArch64InstrInfo &TII;
  const AArch64RegisterInfo &TRI;
  const AArch64RegisterBankInfo &RBI;

  // Per-function cache of the SpeculativeLoadHardening attribute (see
  // setupMF).
  bool ProduceNonFlagSettingCondBr = false;

  // Some cached values used during selection.
  // We use LR as a live-in register, and we keep track of it here as it can be
  // clobbered by calls.
  Register MFReturnAddr;

  // Builder reused across select calls; rebound to the function in setupMF.
  MachineIRBuilder MIB;

#define GET_GLOBALISEL_PREDICATES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_DECL

// We declare the temporaries used by selectImpl() in the class to minimize the
// cost of constructing placeholder values.
#define GET_GLOBALISEL_TEMPORARIES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_DECL
};
552
553} // end anonymous namespace
554
555#define GET_GLOBALISEL_IMPL
556#include "AArch64GenGlobalISel.inc"
557#undef GET_GLOBALISEL_IMPL
558
// Constructor: cache the subtarget's instruction/register info and initialize
// the tblgen-erated predicate and temporary members via the generated
// AArch64GenGlobalISel.inc fragments (which must appear, in this order, in
// the middle of the init list).
AArch64InstructionSelector::AArch64InstructionSelector(
    const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
    const AArch64RegisterBankInfo &RBI)
    : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()),
      RBI(RBI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}
572
573// FIXME: This should be target-independent, inferred from the types declared
574// for each class in the bank.
575//
576/// Given a register bank, and a type, return the smallest register class that
577/// can represent that combination.
578static const TargetRegisterClass *
579getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
580 bool GetAllRegSet = false) {
581 if (RB.getID() == AArch64::GPRRegBankID) {
582 if (Ty.getSizeInBits() <= 32)
583 return GetAllRegSet ? &AArch64::GPR32allRegClass
584 : &AArch64::GPR32RegClass;
585 if (Ty.getSizeInBits() == 64)
586 return GetAllRegSet ? &AArch64::GPR64allRegClass
587 : &AArch64::GPR64RegClass;
588 if (Ty.getSizeInBits() == 128)
589 return &AArch64::XSeqPairsClassRegClass;
590 return nullptr;
591 }
592
593 if (RB.getID() == AArch64::FPRRegBankID) {
594 switch (Ty.getSizeInBits()) {
595 case 8:
596 return &AArch64::FPR8RegClass;
597 case 16:
598 return &AArch64::FPR16RegClass;
599 case 32:
600 return &AArch64::FPR32RegClass;
601 case 64:
602 return &AArch64::FPR64RegClass;
603 case 128:
604 return &AArch64::FPR128RegClass;
605 }
606 return nullptr;
607 }
608
609 return nullptr;
610}
611
612/// Given a register bank, and size in bits, return the smallest register class
613/// that can represent that combination.
614static const TargetRegisterClass *
615getMinClassForRegBank(const RegisterBank &RB, TypeSize SizeInBits,
616 bool GetAllRegSet = false) {
617 if (SizeInBits.isScalable()) {
618 assert(RB.getID() == AArch64::FPRRegBankID &&
619 "Expected FPR regbank for scalable type size");
620 return &AArch64::ZPRRegClass;
621 }
622
623 unsigned RegBankID = RB.getID();
624
625 if (RegBankID == AArch64::GPRRegBankID) {
626 assert(!SizeInBits.isScalable() && "Unexpected scalable register size");
627 if (SizeInBits <= 32)
628 return GetAllRegSet ? &AArch64::GPR32allRegClass
629 : &AArch64::GPR32RegClass;
630 if (SizeInBits == 64)
631 return GetAllRegSet ? &AArch64::GPR64allRegClass
632 : &AArch64::GPR64RegClass;
633 if (SizeInBits == 128)
634 return &AArch64::XSeqPairsClassRegClass;
635 }
636
637 if (RegBankID == AArch64::FPRRegBankID) {
638 if (SizeInBits.isScalable()) {
639 assert(SizeInBits == TypeSize::getScalable(128) &&
640 "Unexpected scalable register size");
641 return &AArch64::ZPRRegClass;
642 }
643
644 switch (SizeInBits) {
645 default:
646 return nullptr;
647 case 8:
648 return &AArch64::FPR8RegClass;
649 case 16:
650 return &AArch64::FPR16RegClass;
651 case 32:
652 return &AArch64::FPR32RegClass;
653 case 64:
654 return &AArch64::FPR64RegClass;
655 case 128:
656 return &AArch64::FPR128RegClass;
657 }
658 }
659
660 return nullptr;
661}
662
663/// Returns the correct subregister to use for a given register class.
664static bool getSubRegForClass(const TargetRegisterClass *RC,
665 const TargetRegisterInfo &TRI, unsigned &SubReg) {
666 switch (TRI.getRegSizeInBits(RC: *RC)) {
667 case 8:
668 SubReg = AArch64::bsub;
669 break;
670 case 16:
671 SubReg = AArch64::hsub;
672 break;
673 case 32:
674 if (RC != &AArch64::FPR32RegClass)
675 SubReg = AArch64::sub_32;
676 else
677 SubReg = AArch64::ssub;
678 break;
679 case 64:
680 SubReg = AArch64::dsub;
681 break;
682 default:
683 LLVM_DEBUG(
684 dbgs() << "Couldn't find appropriate subregister for register class.");
685 return false;
686 }
687
688 return true;
689}
690
691/// Returns the minimum size the given register bank can hold.
692static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
693 switch (RB.getID()) {
694 case AArch64::GPRRegBankID:
695 return 32;
696 case AArch64::FPRRegBankID:
697 return 8;
698 default:
699 llvm_unreachable("Tried to get minimum size for unknown register bank.");
700 }
701}
702
703/// Create a REG_SEQUENCE instruction using the registers in \p Regs.
704/// Helper function for functions like createDTuple and createQTuple.
705///
706/// \p RegClassIDs - The list of register class IDs available for some tuple of
707/// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
708/// expected to contain between 2 and 4 tuple classes.
709///
710/// \p SubRegs - The list of subregister classes associated with each register
711/// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
712/// subregister class. The index of each subregister class is expected to
713/// correspond with the index of each register class.
714///
715/// \returns Either the destination register of REG_SEQUENCE instruction that
716/// was created, or the 0th element of \p Regs if \p Regs contains a single
717/// element.
718static Register createTuple(ArrayRef<Register> Regs,
719 const unsigned RegClassIDs[],
720 const unsigned SubRegs[], MachineIRBuilder &MIB) {
721 unsigned NumRegs = Regs.size();
722 if (NumRegs == 1)
723 return Regs[0];
724 assert(NumRegs >= 2 && NumRegs <= 4 &&
725 "Only support between two and 4 registers in a tuple!");
726 const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
727 auto *DesiredClass = TRI->getRegClass(i: RegClassIDs[NumRegs - 2]);
728 auto RegSequence =
729 MIB.buildInstr(Opc: TargetOpcode::REG_SEQUENCE, DstOps: {DesiredClass}, SrcOps: {});
730 for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
731 RegSequence.addUse(RegNo: Regs[I]);
732 RegSequence.addImm(Val: SubRegs[I]);
733 }
734 return RegSequence.getReg(Idx: 0);
735}
736
737/// Create a tuple of D-registers using the registers in \p Regs.
738static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
739 static const unsigned RegClassIDs[] = {
740 AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
741 static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
742 AArch64::dsub2, AArch64::dsub3};
743 return createTuple(Regs, RegClassIDs, SubRegs, MIB);
744}
745
746/// Create a tuple of Q-registers using the registers in \p Regs.
747static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
748 static const unsigned RegClassIDs[] = {
749 AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
750 static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
751 AArch64::qsub2, AArch64::qsub3};
752 return createTuple(Regs, RegClassIDs, SubRegs, MIB);
753}
754
755static std::optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
756 auto &MI = *Root.getParent();
757 auto &MBB = *MI.getParent();
758 auto &MF = *MBB.getParent();
759 auto &MRI = MF.getRegInfo();
760 uint64_t Immed;
761 if (Root.isImm())
762 Immed = Root.getImm();
763 else if (Root.isCImm())
764 Immed = Root.getCImm()->getZExtValue();
765 else if (Root.isReg()) {
766 auto ValAndVReg =
767 getIConstantVRegValWithLookThrough(VReg: Root.getReg(), MRI, LookThroughInstrs: true);
768 if (!ValAndVReg)
769 return std::nullopt;
770 Immed = ValAndVReg->Value.getSExtValue();
771 } else
772 return std::nullopt;
773 return Immed;
774}
775
776/// Check whether \p I is a currently unsupported binary operation:
777/// - it has an unsized type
778/// - an operand is not a vreg
779/// - all operands are not in the same bank
780/// These are checks that should someday live in the verifier, but right now,
781/// these are mostly limitations of the aarch64 selector.
782static bool unsupportedBinOp(const MachineInstr &I,
783 const AArch64RegisterBankInfo &RBI,
784 const MachineRegisterInfo &MRI,
785 const AArch64RegisterInfo &TRI) {
786 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
787 if (!Ty.isValid()) {
788 LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
789 return true;
790 }
791
792 const RegisterBank *PrevOpBank = nullptr;
793 for (auto &MO : I.operands()) {
794 // FIXME: Support non-register operands.
795 if (!MO.isReg()) {
796 LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
797 return true;
798 }
799
800 // FIXME: Can generic operations have physical registers operands? If
801 // so, this will need to be taught about that, and we'll need to get the
802 // bank out of the minimal class for the register.
803 // Either way, this needs to be documented (and possibly verified).
804 if (!MO.getReg().isVirtual()) {
805 LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
806 return true;
807 }
808
809 const RegisterBank *OpBank = RBI.getRegBank(Reg: MO.getReg(), MRI, TRI);
810 if (!OpBank) {
811 LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
812 return true;
813 }
814
815 if (PrevOpBank && OpBank != PrevOpBank) {
816 LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
817 return true;
818 }
819 PrevOpBank = OpBank;
820 }
821 return false;
822}
823
/// Select the AArch64 opcode for the basic binary operation \p GenericOpc
/// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
/// and of size \p OpSize.
/// \returns \p GenericOpc if the combination is unsupported.
static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
                               unsigned OpSize) {
  switch (RegBankID) {
  case AArch64::GPRRegBankID:
    if (OpSize == 32) {
      // 32-bit GPR: only the variable-shift forms are mapped here.
      switch (GenericOpc) {
      case TargetOpcode::G_SHL:
        return AArch64::LSLVWr;
      case TargetOpcode::G_LSHR:
        return AArch64::LSRVWr;
      case TargetOpcode::G_ASHR:
        return AArch64::ASRVWr;
      default:
        return GenericOpc;
      }
    } else if (OpSize == 64) {
      // 64-bit GPR: variable shifts, plus pointer addition (lowered to a
      // plain 64-bit integer add).
      switch (GenericOpc) {
      case TargetOpcode::G_PTR_ADD:
        return AArch64::ADDXrr;
      case TargetOpcode::G_SHL:
        return AArch64::LSLVXr;
      case TargetOpcode::G_LSHR:
        return AArch64::LSRVXr;
      case TargetOpcode::G_ASHR:
        return AArch64::ASRVXr;
      default:
        return GenericOpc;
      }
    }
    break;
  case AArch64::FPRRegBankID:
    switch (OpSize) {
    case 32:
      // Scalar single-precision arithmetic.
      switch (GenericOpc) {
      case TargetOpcode::G_FADD:
        return AArch64::FADDSrr;
      case TargetOpcode::G_FSUB:
        return AArch64::FSUBSrr;
      case TargetOpcode::G_FMUL:
        return AArch64::FMULSrr;
      case TargetOpcode::G_FDIV:
        return AArch64::FDIVSrr;
      default:
        return GenericOpc;
      }
    case 64:
      // Scalar double-precision arithmetic; G_OR additionally maps to the
      // 64-bit vector ORR (a bitwise OR of the whole D register).
      switch (GenericOpc) {
      case TargetOpcode::G_FADD:
        return AArch64::FADDDrr;
      case TargetOpcode::G_FSUB:
        return AArch64::FSUBDrr;
      case TargetOpcode::G_FMUL:
        return AArch64::FMULDrr;
      case TargetOpcode::G_FDIV:
        return AArch64::FDIVDrr;
      case TargetOpcode::G_OR:
        return AArch64::ORRv8i8;
      default:
        return GenericOpc;
      }
    }
    break;
  }
  // Unknown bank/size combination: signal "unsupported" to the caller.
  return GenericOpc;
}
893
894/// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
895/// appropriate for the (value) register bank \p RegBankID and of memory access
896/// size \p OpSize. This returns the variant with the base+unsigned-immediate
897/// addressing mode (e.g., LDRXui).
898/// \returns \p GenericOpc if the combination is unsupported.
899static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
900 unsigned OpSize) {
901 const bool isStore = GenericOpc == TargetOpcode::G_STORE;
902 switch (RegBankID) {
903 case AArch64::GPRRegBankID:
904 switch (OpSize) {
905 case 8:
906 return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
907 case 16:
908 return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
909 case 32:
910 return isStore ? AArch64::STRWui : AArch64::LDRWui;
911 case 64:
912 return isStore ? AArch64::STRXui : AArch64::LDRXui;
913 }
914 break;
915 case AArch64::FPRRegBankID:
916 switch (OpSize) {
917 case 8:
918 return isStore ? AArch64::STRBui : AArch64::LDRBui;
919 case 16:
920 return isStore ? AArch64::STRHui : AArch64::LDRHui;
921 case 32:
922 return isStore ? AArch64::STRSui : AArch64::LDRSui;
923 case 64:
924 return isStore ? AArch64::STRDui : AArch64::LDRDui;
925 case 128:
926 return isStore ? AArch64::STRQui : AArch64::LDRQui;
927 }
928 break;
929 }
930 return GenericOpc;
931}
932
/// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
/// to \p *To.
///
/// E.g "To = COPY SrcReg:SubReg"
static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
                       const RegisterBankInfo &RBI, Register SrcReg,
                       const TargetRegisterClass *To, unsigned SubReg) {
  assert(SrcReg.isValid() && "Expected a valid source register?");
  assert(To && "Destination register class cannot be null");
  assert(SubReg && "Expected a valid subregister");

  // Emit "newreg = COPY SrcReg:SubReg" just before I, then rewire I's source
  // operand (operand 1) to read the new register instead of SrcReg.
  MachineIRBuilder MIB(I);
  auto SubRegCopy =
      MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {To}, SrcOps: {}).addReg(RegNo: SrcReg, Flags: {}, SubReg);
  MachineOperand &RegOp = I.getOperand(i: 1);
  RegOp.setReg(SubRegCopy.getReg(Idx: 0));

  // It's possible that the destination register won't be constrained. Make
  // sure that happens. (Physical registers are already fully constrained.)
  if (!I.getOperand(i: 0).getReg().isPhysical())
    RBI.constrainGenericRegister(Reg: I.getOperand(i: 0).getReg(), RC: *To, MRI);

  return true;
}
957
958/// Helper function to get the source and destination register classes for a
959/// copy. Returns a std::pair containing the source register class for the
960/// copy, and the destination register class for the copy. If a register class
961/// cannot be determined, then it will be nullptr.
962static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
963getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
964 MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
965 const RegisterBankInfo &RBI) {
966 Register DstReg = I.getOperand(i: 0).getReg();
967 Register SrcReg = I.getOperand(i: 1).getReg();
968 const RegisterBank &DstRegBank = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
969 const RegisterBank &SrcRegBank = *RBI.getRegBank(Reg: SrcReg, MRI, TRI);
970
971 TypeSize DstSize = RBI.getSizeInBits(Reg: DstReg, MRI, TRI);
972 TypeSize SrcSize = RBI.getSizeInBits(Reg: SrcReg, MRI, TRI);
973
974 // Special casing for cross-bank copies of s1s. We can technically represent
975 // a 1-bit value with any size of register. The minimum size for a GPR is 32
976 // bits. So, we need to put the FPR on 32 bits as well.
977 //
978 // FIXME: I'm not sure if this case holds true outside of copies. If it does,
979 // then we can pull it into the helpers that get the appropriate class for a
980 // register bank. Or make a new helper that carries along some constraint
981 // information.
982 if (SrcRegBank != DstRegBank &&
983 (DstSize == TypeSize::getFixed(ExactSize: 1) && SrcSize == TypeSize::getFixed(ExactSize: 1)))
984 SrcSize = DstSize = TypeSize::getFixed(ExactSize: 32);
985
986 return {getMinClassForRegBank(RB: SrcRegBank, SizeInBits: SrcSize, GetAllRegSet: true),
987 getMinClassForRegBank(RB: DstRegBank, SizeInBits: DstSize, GetAllRegSet: true)};
988}
989
// FIXME: We need some sort of API in RBI/TRI to allow generic code to
// constrain operands of simple instructions given a TargetRegisterClass
// and LLT
/// Constrain every virtual register operand of a debug instruction (e.g.
/// DBG_VALUE) to a concrete register class so it survives selection.
/// Always returns true; operands that cannot be constrained are left alone.
static bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI,
                             const RegisterBankInfo &RBI) {
  for (MachineOperand &MO : I.operands()) {
    // Only virtual register operands need (or can take) a class constraint.
    if (!MO.isReg())
      continue;
    Register Reg = MO.getReg();
    if (!Reg)
      continue;
    if (Reg.isPhysical())
      continue;
    LLT Ty = MRI.getType(Reg);
    // The register carries either a class (use it directly) or a bank (map
    // bank + type to a class).
    const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
    const TargetRegisterClass *RC =
        dyn_cast<const TargetRegisterClass *>(Val: RegClassOrBank);
    if (!RC) {
      const RegisterBank &RB = *cast<const RegisterBank *>(Val: RegClassOrBank);
      RC = getRegClassForTypeOnBank(Ty, RB);
      if (!RC) {
        // No class for this size/bank combination: give up on the remaining
        // operands rather than crash — debug info is best-effort.
        LLVM_DEBUG(
            dbgs() << "Warning: DBG_VALUE operand has unexpected size/bank\n");
        break;
      }
    }
    RBI.constrainGenericRegister(Reg, RC: *RC, MRI);
  }

  return true;
}
1021
/// Select a COPY (or a G_ZEXT being reduced to a copy), fixing up subregister
/// copies and promotions where the source and destination sizes differ.
/// Returns false if the copy cannot be selected.
static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
                       MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
                       const RegisterBankInfo &RBI) {
  Register DstReg = I.getOperand(i: 0).getReg();
  Register SrcReg = I.getOperand(i: 1).getReg();
  const RegisterBank &DstRegBank = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
  const RegisterBank &SrcRegBank = *RBI.getRegBank(Reg: SrcReg, MRI, TRI);

  // Find the correct register classes for the source and destination registers.
  const TargetRegisterClass *SrcRC;
  const TargetRegisterClass *DstRC;
  std::tie(args&: SrcRC, args&: DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);

  if (!DstRC) {
    LLVM_DEBUG(dbgs() << "Unexpected dest size "
                      << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
    return false;
  }

  // Is this a copy? If so, then we may need to insert a subregister copy.
  if (I.isCopy()) {
    // Yes. Check if there's anything to fix up.
    if (!SrcRC) {
      LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
      return false;
    }

    const TypeSize SrcSize = TRI.getRegSizeInBits(RC: *SrcRC);
    const TypeSize DstSize = TRI.getRegSizeInBits(RC: *DstRC);
    unsigned SubReg;

    // If the source bank doesn't support a subregister copy small enough,
    // then we first need to copy to the destination bank.
    // E.g. FPR -> small GPR: copy the whole value cross-bank first, then
    // take the destination-sized subregister of that temporary.
    if (getMinSizeForRegBank(RB: SrcRegBank) > DstSize) {
      const TargetRegisterClass *DstTempRC =
          getMinClassForRegBank(RB: DstRegBank, SizeInBits: SrcSize, /* GetAllRegSet */ true);
      getSubRegForClass(RC: DstRC, TRI, SubReg);

      MachineIRBuilder MIB(I);
      auto Copy = MIB.buildCopy(Res: {DstTempRC}, Op: {SrcReg});
      copySubReg(I, MRI, RBI, SrcReg: Copy.getReg(Idx: 0), To: DstRC, SubReg);
    } else if (SrcSize > DstSize) {
      // If the source register is bigger than the destination we need to
      // perform a subregister copy.
      const TargetRegisterClass *SubRegRC =
          getMinClassForRegBank(RB: SrcRegBank, SizeInBits: DstSize, /* GetAllRegSet */ true);
      getSubRegForClass(RC: SubRegRC, TRI, SubReg);
      copySubReg(I, MRI, RBI, SrcReg, To: DstRC, SubReg);
    } else if (DstSize > SrcSize) {
      // If the destination register is bigger than the source we need to do
      // a promotion using SUBREG_TO_REG.
      const TargetRegisterClass *PromotionRC =
          getMinClassForRegBank(RB: SrcRegBank, SizeInBits: DstSize, /* GetAllRegSet */ true);
      getSubRegForClass(RC: SrcRC, TRI, SubReg);

      // Wrap the source in a SUBREG_TO_REG of the wider class, then make I
      // read the promoted register instead.
      Register PromoteReg = MRI.createVirtualRegister(RegClass: PromotionRC);
      BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(),
              MCID: TII.get(Opcode: AArch64::SUBREG_TO_REG), DestReg: PromoteReg)
          .addImm(Val: 0)
          .addUse(RegNo: SrcReg)
          .addImm(Val: SubReg);
      MachineOperand &RegOp = I.getOperand(i: 1);
      RegOp.setReg(PromoteReg);
    }

    // If the destination is a physical register, then there's nothing to
    // change, so we're done.
    if (DstReg.isPhysical())
      return true;
  }

  // No need to constrain SrcReg. It will get constrained when we hit another
  // of its use or its defs. Copies do not have constraints.
  if (!RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI)) {
    LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
                      << " operand\n");
    return false;
  }

  // If this a GPR ZEXT that we want to just reduce down into a copy.
  // The sizes will be mismatched with the source < 32b but that's ok.
  if (I.getOpcode() == TargetOpcode::G_ZEXT) {
    I.setDesc(TII.get(Opcode: AArch64::COPY));
    assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
    // Re-run selection on the instruction, now that it is a plain COPY.
    return selectCopy(I, TII, MRI, TRI, RBI);
  }

  I.setDesc(TII.get(Opcode: AArch64::COPY));
  return true;
}
1111
/// Emit a conditional-select for \p Dst = \p CC ? \p True : \p False.
/// Emits FCSEL for FPR operands, otherwise CSEL — attempting to fold a
/// negate/not/increment of one operand into CSNEG/CSINV/CSINC, or to use
/// constant-operand forms (e.g. select of 0/1 -> CSINC wzr, wzr, cc).
/// Returns the emitted instruction, or nullptr for unsupported (vector) types.
MachineInstr *
AArch64InstructionSelector::emitSelect(Register Dst, Register True,
                                       Register False, AArch64CC::CondCode CC,
                                       MachineIRBuilder &MIB) const {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
             RBI.getRegBank(True, MRI, TRI)->getID() &&
         "Expected both select operands to have the same regbank?");
  LLT Ty = MRI.getType(Reg: True);
  if (Ty.isVector())
    return nullptr;
  const unsigned Size = Ty.getSizeInBits();
  assert((Size == 32 || Size == 64) &&
         "Expected 32 bit or 64 bit select only?");
  const bool Is32Bit = Size == 32;
  // Non-GPR (i.e. FPR) operands select with FCSEL; no foldings apply.
  if (RBI.getRegBank(Reg: True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
    unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
    auto FCSel = MIB.buildInstr(Opc, DstOps: {Dst}, SrcOps: {True, False}).addImm(Val: CC);
    constrainSelectedInstRegOperands(I&: *FCSel, TII, TRI, RBI);
    return &*FCSel;
  }

  // By default, we'll try and emit a CSEL.
  unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
  bool Optimized = false;
  // NOTE: mutates Opc/CC and the Reg/OtherReg references on success, so at
  // most one of the folding attempts below may fire (guarded by Optimized).
  auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
                                 &Optimized](Register &Reg, Register &OtherReg,
                                             bool Invert) {
    if (Optimized)
      return false;

    // Attempt to fold:
    //
    // %sub = G_SUB 0, %x
    // %select = G_SELECT cc, %reg, %sub
    //
    // Into:
    // %select = CSNEG %reg, %x, cc
    Register MatchReg;
    if (mi_match(R: Reg, MRI, P: m_Neg(Src: m_Reg(R&: MatchReg)))) {
      Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
      Reg = MatchReg;
      if (Invert) {
        CC = AArch64CC::getInvertedCondCode(Code: CC);
        std::swap(a&: Reg, b&: OtherReg);
      }
      return true;
    }

    // Attempt to fold:
    //
    // %xor = G_XOR %x, -1
    // %select = G_SELECT cc, %reg, %xor
    //
    // Into:
    // %select = CSINV %reg, %x, cc
    if (mi_match(R: Reg, MRI, P: m_Not(Src: m_Reg(R&: MatchReg)))) {
      Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
      Reg = MatchReg;
      if (Invert) {
        CC = AArch64CC::getInvertedCondCode(Code: CC);
        std::swap(a&: Reg, b&: OtherReg);
      }
      return true;
    }

    // Attempt to fold:
    //
    // %add = G_ADD %x, 1
    // %select = G_SELECT cc, %reg, %add
    //
    // Into:
    // %select = CSINC %reg, %x, cc
    if (mi_match(R: Reg, MRI,
                 P: m_any_of(preds: m_GAdd(L: m_Reg(R&: MatchReg), R: m_SpecificICst(RequestedValue: 1)),
                          preds: m_GPtrAdd(L: m_Reg(R&: MatchReg), R: m_SpecificICst(RequestedValue: 1))))) {
      Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
      Reg = MatchReg;
      if (Invert) {
        CC = AArch64CC::getInvertedCondCode(Code: CC);
        std::swap(a&: Reg, b&: OtherReg);
      }
      return true;
    }

    return false;
  };

  // Helper lambda which tries to use CSINC/CSINV for the instruction when its
  // true/false values are constants.
  // FIXME: All of these patterns already exist in tablegen. We should be
  // able to import these.
  auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
                          &Optimized]() {
    if (Optimized)
      return false;
    auto TrueCst = getIConstantVRegValWithLookThrough(VReg: True, MRI);
    auto FalseCst = getIConstantVRegValWithLookThrough(VReg: False, MRI);
    if (!TrueCst && !FalseCst)
      return false;

    Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
    if (TrueCst && FalseCst) {
      int64_t T = TrueCst->Value.getSExtValue();
      int64_t F = FalseCst->Value.getSExtValue();

      if (T == 0 && F == 1) {
        // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        True = ZReg;
        False = ZReg;
        return true;
      }

      if (T == 0 && F == -1) {
        // G_SELECT cc 0, -1 -> CSINV zreg, zreg cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        True = ZReg;
        False = ZReg;
        return true;
      }
    }

    if (TrueCst) {
      int64_t T = TrueCst->Value.getSExtValue();
      if (T == 1) {
        // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        True = False;
        False = ZReg;
        CC = AArch64CC::getInvertedCondCode(Code: CC);
        return true;
      }

      if (T == -1) {
        // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        True = False;
        False = ZReg;
        CC = AArch64CC::getInvertedCondCode(Code: CC);
        return true;
      }
    }

    if (FalseCst) {
      int64_t F = FalseCst->Value.getSExtValue();
      if (F == 1) {
        // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        False = ZReg;
        return true;
      }

      if (F == -1) {
        // G_SELECT cc, t, -1 -> CSINC t, zreg, cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        False = ZReg;
        return true;
      }
    }
    return false;
  };

  // Try each folding in turn; the Optimized guard ensures only the first
  // successful one takes effect.
  Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
  Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
  Optimized |= TryOptSelectCst();
  auto SelectInst = MIB.buildInstr(Opc, DstOps: {Dst}, SrcOps: {True, False}).addImm(Val: CC);
  constrainSelectedInstRegOperands(I&: *SelectInst, TII, TRI, RBI);
  return &*SelectInst;
}
1282
1283static AArch64CC::CondCode
1284changeICMPPredToAArch64CC(CmpInst::Predicate P, Register RHS = {},
1285 MachineRegisterInfo *MRI = nullptr) {
1286 switch (P) {
1287 default:
1288 llvm_unreachable("Unknown condition code!");
1289 case CmpInst::ICMP_NE:
1290 return AArch64CC::NE;
1291 case CmpInst::ICMP_EQ:
1292 return AArch64CC::EQ;
1293 case CmpInst::ICMP_SGT:
1294 return AArch64CC::GT;
1295 case CmpInst::ICMP_SGE:
1296 if (RHS && MRI) {
1297 auto ValAndVReg = getIConstantVRegValWithLookThrough(VReg: RHS, MRI: *MRI);
1298 if (ValAndVReg && ValAndVReg->Value == 0)
1299 return AArch64CC::PL;
1300 }
1301 return AArch64CC::GE;
1302 case CmpInst::ICMP_SLT:
1303 if (RHS && MRI) {
1304 auto ValAndVReg = getIConstantVRegValWithLookThrough(VReg: RHS, MRI: *MRI);
1305 if (ValAndVReg && ValAndVReg->Value == 0)
1306 return AArch64CC::MI;
1307 }
1308 return AArch64CC::LT;
1309 case CmpInst::ICMP_SLE:
1310 return AArch64CC::LE;
1311 case CmpInst::ICMP_UGT:
1312 return AArch64CC::HI;
1313 case CmpInst::ICMP_UGE:
1314 return AArch64CC::HS;
1315 case CmpInst::ICMP_ULT:
1316 return AArch64CC::LO;
1317 case CmpInst::ICMP_ULE:
1318 return AArch64CC::LS;
1319 }
1320}
1321
/// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC.
/// When two codes are needed, the result is true if EITHER holds (they are
/// OR'ed); \p CondCode2 is AArch64CC::AL when a single code suffices.
static void changeFPCCToORAArch64CC(CmpInst::Predicate CC,
                                    AArch64CC::CondCode &CondCode,
                                    AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    llvm_unreachable("Unknown FP condition!");
  case CmpInst::FCMP_OEQ:
    CondCode = AArch64CC::EQ;
    break;
  case CmpInst::FCMP_OGT:
    CondCode = AArch64CC::GT;
    break;
  case CmpInst::FCMP_OGE:
    CondCode = AArch64CC::GE;
    break;
  case CmpInst::FCMP_OLT:
    CondCode = AArch64CC::MI;
    break;
  case CmpInst::FCMP_OLE:
    CondCode = AArch64CC::LS;
    break;
  case CmpInst::FCMP_ONE:
    // one == (olt || ogt): needs two branches.
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GT;
    break;
  case CmpInst::FCMP_ORD:
    CondCode = AArch64CC::VC;
    break;
  case CmpInst::FCMP_UNO:
    CondCode = AArch64CC::VS;
    break;
  case CmpInst::FCMP_UEQ:
    // ueq == (oeq || uno): needs two branches.
    CondCode = AArch64CC::EQ;
    CondCode2 = AArch64CC::VS;
    break;
  case CmpInst::FCMP_UGT:
    CondCode = AArch64CC::HI;
    break;
  case CmpInst::FCMP_UGE:
    CondCode = AArch64CC::PL;
    break;
  case CmpInst::FCMP_ULT:
    CondCode = AArch64CC::LT;
    break;
  case CmpInst::FCMP_ULE:
    CondCode = AArch64CC::LE;
    break;
  case CmpInst::FCMP_UNE:
    CondCode = AArch64CC::NE;
    break;
  }
}
1376
1377/// Convert an IR fp condition code to an AArch64 CC.
1378/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1379/// should be AND'ed instead of OR'ed.
1380static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC,
1381 AArch64CC::CondCode &CondCode,
1382 AArch64CC::CondCode &CondCode2) {
1383 CondCode2 = AArch64CC::AL;
1384 switch (CC) {
1385 default:
1386 changeFPCCToORAArch64CC(CC, CondCode, CondCode2);
1387 assert(CondCode2 == AArch64CC::AL);
1388 break;
1389 case CmpInst::FCMP_ONE:
1390 // (a one b)
1391 // == ((a olt b) || (a ogt b))
1392 // == ((a ord b) && (a une b))
1393 CondCode = AArch64CC::VC;
1394 CondCode2 = AArch64CC::NE;
1395 break;
1396 case CmpInst::FCMP_UEQ:
1397 // (a ueq b)
1398 // == ((a uno b) || (a oeq b))
1399 // == ((a ule b) && (a uge b))
1400 CondCode = AArch64CC::PL;
1401 CondCode2 = AArch64CC::LE;
1402 break;
1403 }
1404}
1405
/// Return a register which can be used as a bit to test in a TB(N)Z.
/// Walks def chains (extends, truncs, and shifts/and/xor with constants)
/// starting at \p Reg, updating \p Bit to the corresponding bit position in
/// the returned register and flipping \p Invert when walking through an XOR
/// that toggles the tested bit.
static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
                              MachineRegisterInfo &MRI) {
  assert(Reg.isValid() && "Expected valid register!");
  bool HasZext = false;
  while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
    unsigned Opc = MI->getOpcode();

    // Only fold defs whose value has a single (non-debug) user; otherwise the
    // def must be preserved anyway and folding gains nothing.
    if (!MI->getOperand(i: 0).isReg() ||
        !MRI.hasOneNonDBGUse(RegNo: MI->getOperand(i: 0).getReg()))
      break;

    // (tbz (any_ext x), b) -> (tbz x, b) and
    // (tbz (zext x), b) -> (tbz x, b) if we don't use the extended bits.
    //
    // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
    // on the truncated x is the same as the bit number on x.
    if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
        Opc == TargetOpcode::G_TRUNC) {
      if (Opc == TargetOpcode::G_ZEXT)
        HasZext = true;

      Register NextReg = MI->getOperand(i: 1).getReg();
      // Did we find something worth folding?
      if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(RegNo: NextReg))
        break;
      TypeSize InSize = MRI.getType(Reg: NextReg).getSizeInBits();
      // The bit must exist in the narrower source for the fold to be valid.
      if (Bit >= InSize)
        break;

      // NextReg is worth folding. Keep looking.
      Reg = NextReg;
      continue;
    }

    // Attempt to find a suitable operation with a constant on one side.
    std::optional<uint64_t> C;
    Register TestReg;
    switch (Opc) {
    default:
      break;
    case TargetOpcode::G_AND:
    case TargetOpcode::G_XOR: {
      TestReg = MI->getOperand(i: 1).getReg();
      Register ConstantReg = MI->getOperand(i: 2).getReg();
      auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: ConstantReg, MRI);
      if (!VRegAndVal) {
        // AND commutes, check the other side for a constant.
        // FIXME: Can we canonicalize the constant so that it's always on the
        // same side at some point earlier?
        std::swap(a&: ConstantReg, b&: TestReg);
        VRegAndVal = getIConstantVRegValWithLookThrough(VReg: ConstantReg, MRI);
      }
      if (VRegAndVal) {
        // Under a zext the constant's high bits are known zero, so take the
        // zero-extended value; otherwise sign-extend as usual.
        if (HasZext)
          C = VRegAndVal->Value.getZExtValue();
        else
          C = VRegAndVal->Value.getSExtValue();
      }
      break;
    }
    case TargetOpcode::G_ASHR:
    case TargetOpcode::G_LSHR:
    case TargetOpcode::G_SHL: {
      TestReg = MI->getOperand(i: 1).getReg();
      auto VRegAndVal =
          getIConstantVRegValWithLookThrough(VReg: MI->getOperand(i: 2).getReg(), MRI);
      if (VRegAndVal)
        C = VRegAndVal->Value.getSExtValue();
      break;
    }
    }

    // Didn't find a constant or viable register. Bail out of the loop.
    if (!C || !TestReg.isValid())
      break;

    // We found a suitable instruction with a constant. Check to see if we can
    // walk through the instruction.
    Register NextReg;
    unsigned TestRegSize = MRI.getType(Reg: TestReg).getSizeInBits();
    switch (Opc) {
    default:
      break;
    case TargetOpcode::G_AND:
      // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
      if ((*C >> Bit) & 1)
        NextReg = TestReg;
      break;
    case TargetOpcode::G_SHL:
      // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
      // the type of the register.
      if (*C <= Bit && (Bit - *C) < TestRegSize) {
        NextReg = TestReg;
        Bit = Bit - *C;
      }
      break;
    case TargetOpcode::G_ASHR:
      // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits
      // in x
      NextReg = TestReg;
      Bit = Bit + *C;
      if (Bit >= TestRegSize)
        Bit = TestRegSize - 1;
      break;
    case TargetOpcode::G_LSHR:
      // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
      if ((Bit + *C) < TestRegSize) {
        NextReg = TestReg;
        Bit = Bit + *C;
      }
      break;
    case TargetOpcode::G_XOR:
      // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
      // appropriate.
      //
      // e.g. If x' = xor x, c, and the b-th bit is set in c then
      //
      // tbz x', b -> tbnz x, b
      //
      // Because x' only has the b-th bit set if x does not.
      if ((*C >> Bit) & 1)
        Invert = !Invert;
      NextReg = TestReg;
      break;
    }

    // Check if we found anything worth folding.
    if (!NextReg.isValid())
      return Reg;
    Reg = NextReg;
  }

  return Reg;
}
1541
1542MachineInstr *AArch64InstructionSelector::emitTestBit(
1543 Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
1544 MachineIRBuilder &MIB) const {
1545 assert(TestReg.isValid());
1546 assert(ProduceNonFlagSettingCondBr &&
1547 "Cannot emit TB(N)Z with speculation tracking!");
1548 MachineRegisterInfo &MRI = *MIB.getMRI();
1549
1550 // Attempt to optimize the test bit by walking over instructions.
1551 TestReg = getTestBitReg(Reg: TestReg, Bit, Invert&: IsNegative, MRI);
1552 LLT Ty = MRI.getType(Reg: TestReg);
1553 unsigned Size = Ty.getSizeInBits();
1554 assert(!Ty.isVector() && "Expected a scalar!");
1555 assert(Bit < 64 && "Bit is too large!");
1556
1557 // When the test register is a 64-bit register, we have to narrow to make
1558 // TBNZW work.
1559 bool UseWReg = Bit < 32;
1560 unsigned NecessarySize = UseWReg ? 32 : 64;
1561 if (Size != NecessarySize)
1562 TestReg = moveScalarRegClass(
1563 Reg: TestReg, RC: UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass,
1564 MIB);
1565
1566 static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
1567 {AArch64::TBZW, AArch64::TBNZW}};
1568 unsigned Opc = OpcTable[UseWReg][IsNegative];
1569 auto TestBitMI =
1570 MIB.buildInstr(Opcode: Opc).addReg(RegNo: TestReg).addImm(Val: Bit).addMBB(MBB: DstMBB);
1571 constrainSelectedInstRegOperands(I&: *TestBitMI, TII, TRI, RBI);
1572 return &*TestBitMI;
1573}
1574
1575bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
1576 MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
1577 MachineIRBuilder &MIB) const {
1578 assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
1579 // Given something like this:
1580 //
1581 // %x = ...Something...
1582 // %one = G_CONSTANT i64 1
1583 // %zero = G_CONSTANT i64 0
1584 // %and = G_AND %x, %one
1585 // %cmp = G_ICMP intpred(ne), %and, %zero
1586 // %cmp_trunc = G_TRUNC %cmp
1587 // G_BRCOND %cmp_trunc, %bb.3
1588 //
1589 // We want to try and fold the AND into the G_BRCOND and produce either a
1590 // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
1591 //
1592 // In this case, we'd get
1593 //
1594 // TBNZ %x %bb.3
1595 //
1596
1597 // Check if the AND has a constant on its RHS which we can use as a mask.
1598 // If it's a power of 2, then it's the same as checking a specific bit.
1599 // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set)
1600 auto MaybeBit = getIConstantVRegValWithLookThrough(
1601 VReg: AndInst.getOperand(i: 2).getReg(), MRI: *MIB.getMRI());
1602 if (!MaybeBit)
1603 return false;
1604
1605 int32_t Bit = MaybeBit->Value.exactLogBase2();
1606 if (Bit < 0)
1607 return false;
1608
1609 Register TestReg = AndInst.getOperand(i: 1).getReg();
1610
1611 // Emit a TB(N)Z.
1612 emitTestBit(TestReg, Bit, IsNegative: Invert, DstMBB, MIB);
1613 return true;
1614}
1615
1616MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
1617 bool IsNegative,
1618 MachineBasicBlock *DestMBB,
1619 MachineIRBuilder &MIB) const {
1620 assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
1621 MachineRegisterInfo &MRI = *MIB.getMRI();
1622 assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
1623 AArch64::GPRRegBankID &&
1624 "Expected GPRs only?");
1625 auto Ty = MRI.getType(Reg: CompareReg);
1626 unsigned Width = Ty.getSizeInBits();
1627 assert(!Ty.isVector() && "Expected scalar only?");
1628 assert(Width <= 64 && "Expected width to be at most 64?");
1629 static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
1630 {AArch64::CBNZW, AArch64::CBNZX}};
1631 unsigned Opc = OpcTable[IsNegative][Width == 64];
1632 auto BranchMI = MIB.buildInstr(Opc, DstOps: {}, SrcOps: {CompareReg}).addMBB(MBB: DestMBB);
1633 constrainSelectedInstRegOperands(I&: *BranchMI, TII, TRI, RBI);
1634 return &*BranchMI;
1635}
1636
1637bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
1638 MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const {
1639 assert(FCmp.getOpcode() == TargetOpcode::G_FCMP);
1640 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1641 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
1642 // totally clean. Some of them require two branches to implement.
1643 auto Pred = (CmpInst::Predicate)FCmp.getOperand(i: 1).getPredicate();
1644 emitFPCompare(LHS: FCmp.getOperand(i: 2).getReg(), RHS: FCmp.getOperand(i: 3).getReg(), MIRBuilder&: MIB,
1645 Pred);
1646 AArch64CC::CondCode CC1, CC2;
1647 changeFCMPPredToAArch64CC(P: Pred, CondCode&: CC1, CondCode2&: CC2);
1648 MachineBasicBlock *DestMBB = I.getOperand(i: 1).getMBB();
1649 MIB.buildInstr(Opc: AArch64::Bcc, DstOps: {}, SrcOps: {}).addImm(Val: CC1).addMBB(MBB: DestMBB);
1650 if (CC2 != AArch64CC::AL)
1651 MIB.buildInstr(Opc: AArch64::Bcc, DstOps: {}, SrcOps: {}).addImm(Val: CC2).addMBB(MBB: DestMBB);
1652 I.eraseFromParent();
1653 return true;
1654}
1655
/// Attempt to select a G_BRCOND fed by a G_ICMP as a single TB(N)Z or
/// CB(N)Z instead of a full compare + Bcc sequence.
///
/// \param I    The G_BRCOND instruction.
/// \param ICmp The G_ICMP defining \p I's condition register.
/// \returns true if \p I was erased and an optimized branch was emitted;
///          false if the caller must fall back to compare + Bcc.
bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
    MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
  assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
  assert(I.getOpcode() == TargetOpcode::G_BRCOND);
  // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z.
  //
  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
  // instructions will not be produced, as they are conditional branch
  // instructions that do not set flags.
  if (!ProduceNonFlagSettingCondBr)
    return false;

  MachineRegisterInfo &MRI = *MIB.getMRI();
  MachineBasicBlock *DestMBB = I.getOperand(i: 1).getMBB();
  auto Pred =
      static_cast<CmpInst::Predicate>(ICmp.getOperand(i: 1).getPredicate());
  Register LHS = ICmp.getOperand(i: 2).getReg();
  Register RHS = ICmp.getOperand(i: 3).getReg();

  // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
  auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: RHS, MRI);
  MachineInstr *AndInst = getOpcodeDef(Opcode: TargetOpcode::G_AND, Reg: LHS, MRI);

  // When we can emit a TB(N)Z, prefer that.
  //
  // Handle non-commutative condition codes first.
  // Note that we don't want to do this when we have a G_AND because it can
  // become a tst. The tst will make the test bit in the TB(N)Z redundant.
  if (VRegAndVal && !AndInst) {
    int64_t C = VRegAndVal->Value.getSExtValue();

    // When we have a greater-than comparison, we can just test if the msb is
    // zero.
    if (C == -1 && Pred == CmpInst::ICMP_SGT) {
      uint64_t Bit = MRI.getType(Reg: LHS).getSizeInBits() - 1;
      emitTestBit(TestReg: LHS, Bit, /*IsNegative = */ false, DstMBB: DestMBB, MIB);
      I.eraseFromParent();
      return true;
    }

    // When we have a less than comparison, we can just test if the msb is not
    // zero.
    if (C == 0 && Pred == CmpInst::ICMP_SLT) {
      uint64_t Bit = MRI.getType(Reg: LHS).getSizeInBits() - 1;
      emitTestBit(TestReg: LHS, Bit, /*IsNegative = */ true, DstMBB: DestMBB, MIB);
      I.eraseFromParent();
      return true;
    }

    // Inversely, if we have a signed greater-than-or-equal comparison to zero,
    // we can test if the msb is zero.
    if (C == 0 && Pred == CmpInst::ICMP_SGE) {
      uint64_t Bit = MRI.getType(Reg: LHS).getSizeInBits() - 1;
      emitTestBit(TestReg: LHS, Bit, /*IsNegative = */ false, DstMBB: DestMBB, MIB);
      I.eraseFromParent();
      return true;
    }
  }

  // Attempt to handle commutative condition codes. Right now, that's only
  // eq/ne.
  if (ICmpInst::isEquality(P: Pred)) {
    // The constant may have been on the LHS; equality is symmetric, so swap
    // the operands and retry the constant/AND lookups.
    if (!VRegAndVal) {
      std::swap(a&: RHS, b&: LHS);
      VRegAndVal = getIConstantVRegValWithLookThrough(VReg: RHS, MRI);
      AndInst = getOpcodeDef(Opcode: TargetOpcode::G_AND, Reg: LHS, MRI);
    }

    if (VRegAndVal && VRegAndVal->Value == 0) {
      // If there's a G_AND feeding into this branch, try to fold it away by
      // emitting a TB(N)Z instead.
      //
      // Note: If we have LT, then it *is* possible to fold, but it wouldn't be
      // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding
      // would be redundant.
      if (AndInst &&
          tryOptAndIntoCompareBranch(
              AndInst&: *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DstMBB: DestMBB, MIB)) {
        I.eraseFromParent();
        return true;
      }

      // Otherwise, try to emit a CB(N)Z instead.
      auto LHSTy = MRI.getType(Reg: LHS);
      if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
        emitCBZ(CompareReg: LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
        I.eraseFromParent();
        return true;
      }
    }
  }

  return false;
}
1750
1751bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
1752 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1753 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1754 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1755 if (tryOptCompareBranchFedByICmp(I, ICmp, MIB))
1756 return true;
1757
1758 // Couldn't optimize. Emit a compare + a Bcc.
1759 MachineBasicBlock *DestMBB = I.getOperand(i: 1).getMBB();
1760 auto &PredOp = ICmp.getOperand(i: 1);
1761 emitIntegerCompare(LHS&: ICmp.getOperand(i: 2), RHS&: ICmp.getOperand(i: 3), Predicate&: PredOp, MIRBuilder&: MIB);
1762 const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
1763 P: static_cast<CmpInst::Predicate>(PredOp.getPredicate()),
1764 RHS: ICmp.getOperand(i: 3).getReg(), MRI: MIB.getMRI());
1765 MIB.buildInstr(Opc: AArch64::Bcc, DstOps: {}, SrcOps: {}).addImm(Val: CC).addMBB(MBB: DestMBB);
1766 I.eraseFromParent();
1767 return true;
1768}
1769
/// Select a G_BRCOND by inspecting the instruction that defines its
/// condition register: G_FCMP and G_ICMP feeders get dedicated lowering;
/// any other condition is branched on directly.
bool AArch64InstructionSelector::selectCompareBranch(
    MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
  Register CondReg = I.getOperand(i: 0).getReg();
  MachineInstr *CCMI = MRI.getVRegDef(Reg: CondReg);
  // Try to select the G_BRCOND using whatever is feeding the condition if
  // possible.
  unsigned CCMIOpc = CCMI->getOpcode();
  if (CCMIOpc == TargetOpcode::G_FCMP)
    return selectCompareBranchFedByFCmp(I, FCmp&: *CCMI, MIB);
  if (CCMIOpc == TargetOpcode::G_ICMP)
    return selectCompareBranchFedByICmp(I, ICmp&: *CCMI, MIB);

  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
  // instructions will not be produced, as they are conditional branch
  // instructions that do not set flags.
  if (ProduceNonFlagSettingCondBr) {
    // Branch on bit 0 of the condition register (TBNZ-style test).
    emitTestBit(TestReg: CondReg, /*Bit = */ 0, /*IsNegative = */ true,
                DstMBB: I.getOperand(i: 1).getMBB(), MIB);
    I.eraseFromParent();
    return true;
  }

  // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead.
  auto TstMI =
      MIB.buildInstr(Opc: AArch64::ANDSWri, DstOps: {LLT::scalar(SizeInBits: 32)}, SrcOps: {CondReg}).addImm(Val: 1);
  constrainSelectedInstRegOperands(I&: *TstMI, TII, TRI, RBI);
  // Branch when the flag-setting AND produced a non-zero result.
  auto Bcc = MIB.buildInstr(Opcode: AArch64::Bcc)
                 .addImm(Val: AArch64CC::NE)
                 .addMBB(MBB: I.getOperand(i: 1).getMBB());
  I.eraseFromParent();
  constrainSelectedInstRegOperands(I&: *Bcc, TII, TRI, RBI);
  return true;
}
1803
1804/// Returns the element immediate value of a vector shift operand if found.
1805/// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR.
1806static std::optional<int64_t> getVectorShiftImm(Register Reg,
1807 MachineRegisterInfo &MRI) {
1808 assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
1809 MachineInstr *OpMI = MRI.getVRegDef(Reg);
1810 return getAArch64VectorSplatScalar(MI: *OpMI, MRI);
1811}
1812
1813/// Matches and returns the shift immediate value for a SHL instruction given
1814/// a shift operand.
1815static std::optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg,
1816 MachineRegisterInfo &MRI) {
1817 std::optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
1818 if (!ShiftImm)
1819 return std::nullopt;
1820 // Check the immediate is in range for a SHL.
1821 int64_t Imm = *ShiftImm;
1822 if (Imm < 0)
1823 return std::nullopt;
1824 switch (SrcTy.getElementType().getSizeInBits()) {
1825 default:
1826 LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift");
1827 return std::nullopt;
1828 case 8:
1829 if (Imm > 7)
1830 return std::nullopt;
1831 break;
1832 case 16:
1833 if (Imm > 15)
1834 return std::nullopt;
1835 break;
1836 case 32:
1837 if (Imm > 31)
1838 return std::nullopt;
1839 break;
1840 case 64:
1841 if (Imm > 63)
1842 return std::nullopt;
1843 break;
1844 }
1845 return Imm;
1846}
1847
1848bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I,
1849 MachineRegisterInfo &MRI) {
1850 assert(I.getOpcode() == TargetOpcode::G_SHL);
1851 Register DstReg = I.getOperand(i: 0).getReg();
1852 const LLT Ty = MRI.getType(Reg: DstReg);
1853 Register Src1Reg = I.getOperand(i: 1).getReg();
1854 Register Src2Reg = I.getOperand(i: 2).getReg();
1855
1856 if (!Ty.isVector())
1857 return false;
1858
1859 // Check if we have a vector of constants on RHS that we can select as the
1860 // immediate form.
1861 std::optional<int64_t> ImmVal = getVectorSHLImm(SrcTy: Ty, Reg: Src2Reg, MRI);
1862
1863 unsigned Opc = 0;
1864 if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64)) {
1865 Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
1866 } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32)) {
1867 Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
1868 } else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32)) {
1869 Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
1870 } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16)) {
1871 Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
1872 } else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16)) {
1873 Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
1874 } else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 8)) {
1875 Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
1876 } else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 8)) {
1877 Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
1878 } else {
1879 LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
1880 return false;
1881 }
1882
1883 auto Shl = MIB.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {Src1Reg});
1884 if (ImmVal)
1885 Shl.addImm(Val: *ImmVal);
1886 else
1887 Shl.addUse(RegNo: Src2Reg);
1888 constrainSelectedInstRegOperands(I&: *Shl, TII, TRI, RBI);
1889 I.eraseFromParent();
1890 return true;
1891}
1892
/// Select a vector G_ASHR/G_LSHR.
///
/// AArch64 has no right-shift-by-register vector instruction; instead the
/// shift-left-by-register instructions (SSHL/USHL) treat negative amounts as
/// right shifts, so this emits a NEG of the shift amount followed by
/// SSHL (arithmetic) or USHL (logical).
bool AArch64InstructionSelector::selectVectorAshrLshr(
    MachineInstr &I, MachineRegisterInfo &MRI) {
  assert(I.getOpcode() == TargetOpcode::G_ASHR ||
         I.getOpcode() == TargetOpcode::G_LSHR);
  Register DstReg = I.getOperand(i: 0).getReg();
  const LLT Ty = MRI.getType(Reg: DstReg);
  Register Src1Reg = I.getOperand(i: 1).getReg();
  Register Src2Reg = I.getOperand(i: 2).getReg();

  if (!Ty.isVector())
    return false;

  bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR;

  // We expect the immediate case to be lowered in the PostLegalCombiner to
  // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents.

  // There is not a shift right register instruction, but the shift left
  // register instruction takes a signed value, where negative numbers specify a
  // right shift.

  unsigned Opc = 0;
  unsigned NegOpc = 0;
  // The negated shift amount lives on the FPR bank, like all NEON operands.
  const TargetRegisterClass *RC =
      getRegClassForTypeOnBank(Ty, RB: RBI.getRegBank(ID: AArch64::FPRRegBankID));
  if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64)) {
    Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
    NegOpc = AArch64::NEGv2i64;
  } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32)) {
    Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
    NegOpc = AArch64::NEGv4i32;
  } else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32)) {
    Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
    NegOpc = AArch64::NEGv2i32;
  } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16)) {
    Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
    NegOpc = AArch64::NEGv4i16;
  } else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16)) {
    Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
    NegOpc = AArch64::NEGv8i16;
  } else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 8)) {
    Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
    NegOpc = AArch64::NEGv16i8;
  } else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 8)) {
    Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
    NegOpc = AArch64::NEGv8i8;
  } else {
    LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type");
    return false;
  }

  auto Neg = MIB.buildInstr(Opc: NegOpc, DstOps: {RC}, SrcOps: {Src2Reg});
  constrainSelectedInstRegOperands(I&: *Neg, TII, TRI, RBI);
  auto SShl = MIB.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {Src1Reg, Neg});
  constrainSelectedInstRegOperands(I&: *SShl, TII, TRI, RBI);
  I.eraseFromParent();
  return true;
}
1951
/// Lower G_VASTART for the standard AAPCS va_list (the 5-field struct shown
/// below): store the three save-area addresses and the two (negative)
/// register-save offsets into the caller-provided va_list object.
///
/// Returns false for Win64 calling conventions, which use a different
/// va_list representation and are handled elsewhere.
bool AArch64InstructionSelector::selectVaStartAAPCS(
    MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {

  if (STI.isCallingConvWin64(CC: MF.getFunction().getCallingConv(),
                             IsVarArg: MF.getFunction().isVarArg()))
    return false;

  // The layout of the va_list struct is specified in the AArch64 Procedure Call
  // Standard, section 10.1.5.

  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  // ILP32 uses 32-bit pointers, which halves the pointer-field sizes/offsets.
  const unsigned PtrSize = STI.isTargetILP32() ? 4 : 8;
  const auto *PtrRegClass =
      STI.isTargetILP32() ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;

  const MCInstrDesc &MCIDAddAddr =
      TII.get(Opcode: STI.isTargetILP32() ? AArch64::ADDWri : AArch64::ADDXri);
  const MCInstrDesc &MCIDStoreAddr =
      TII.get(Opcode: STI.isTargetILP32() ? AArch64::STRWui : AArch64::STRXui);

  /*
   * typedef struct va_list {
   *  void * stack; // next stack param
   *  void * gr_top; // end of GP arg reg save area
   *  void * vr_top; // end of FP/SIMD arg reg save area
   *  int gr_offs; // offset from gr_top to next GP register arg
   *  int vr_offs; // offset from vr_top to next FP/SIMD register arg
   * } va_list;
   */
  const auto VAList = I.getOperand(i: 0).getReg();

  // Our current offset in bytes from the va_list struct (VAList).
  unsigned OffsetBytes = 0;

  // Helper function to store (FrameIndex + Imm) to VAList at offset OffsetBytes
  // and increment OffsetBytes by PtrSize.
  const auto PushAddress = [&](const int FrameIndex, const int64_t Imm) {
    const Register Top = MRI.createVirtualRegister(RegClass: PtrRegClass);
    auto MIB = BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(), MCID: MCIDAddAddr)
                   .addDef(RegNo: Top)
                   .addFrameIndex(Idx: FrameIndex)
                   .addImm(Val: Imm)
                   .addImm(Val: 0);
    constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);

    // Derive the store's memory operand from G_VASTART's, adjusted to the
    // field currently being written.
    const auto *MMO = *I.memoperands_begin();
    MIB = BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(), MCID: MCIDStoreAddr)
              .addUse(RegNo: Top)
              .addUse(RegNo: VAList)
              .addImm(Val: OffsetBytes / PtrSize)
              .addMemOperand(MMO: MF.getMachineMemOperand(
                  PtrInfo: MMO->getPointerInfo().getWithOffset(O: OffsetBytes),
                  F: MachineMemOperand::MOStore, Size: PtrSize, BaseAlignment: MMO->getBaseAlign()))
;
    constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);

    OffsetBytes += PtrSize;
  };

  // void* stack at offset 0
  PushAddress(FuncInfo->getVarArgsStackIndex(), 0);

  // void* gr_top at offset 8 (4 on ILP32)
  const unsigned GPRSize = FuncInfo->getVarArgsGPRSize();
  PushAddress(FuncInfo->getVarArgsGPRIndex(), GPRSize);

  // void* vr_top at offset 16 (8 on ILP32)
  const unsigned FPRSize = FuncInfo->getVarArgsFPRSize();
  PushAddress(FuncInfo->getVarArgsFPRIndex(), FPRSize);

  // Helper function to store a 4-byte integer constant to VAList at offset
  // OffsetBytes, and increment OffsetBytes by 4.
  const auto PushIntConstant = [&](const int32_t Value) {
    constexpr int IntSize = 4;
    const Register Temp = MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
    auto MIB =
        BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AArch64::MOVi32imm))
            .addDef(RegNo: Temp)
            .addImm(Val: Value);
    constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);

    const auto *MMO = *I.memoperands_begin();
    MIB = BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AArch64::STRWui))
              .addUse(RegNo: Temp)
              .addUse(RegNo: VAList)
              .addImm(Val: OffsetBytes / IntSize)
              .addMemOperand(MMO: MF.getMachineMemOperand(
                  PtrInfo: MMO->getPointerInfo().getWithOffset(O: OffsetBytes),
                  F: MachineMemOperand::MOStore, Size: IntSize, BaseAlignment: MMO->getBaseAlign()));
    constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
    OffsetBytes += IntSize;
  };

  // int gr_offs at offset 24 (12 on ILP32)
  PushIntConstant(-static_cast<int32_t>(GPRSize));

  // int vr_offs at offset 28 (16 on ILP32)
  PushIntConstant(-static_cast<int32_t>(FPRSize));

  // All five fields must have been written exactly once.
  assert(OffsetBytes == (STI.isTargetILP32() ? 20 : 32) && "Unexpected offset");

  I.eraseFromParent();
  return true;
}
2055
2056bool AArch64InstructionSelector::selectVaStartDarwin(
2057 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
2058 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
2059 Register ListReg = I.getOperand(i: 0).getReg();
2060
2061 Register ArgsAddrReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
2062
2063 int FrameIdx = FuncInfo->getVarArgsStackIndex();
2064 if (MF.getSubtarget<AArch64Subtarget>().isCallingConvWin64(
2065 CC: MF.getFunction().getCallingConv(), IsVarArg: MF.getFunction().isVarArg())) {
2066 FrameIdx = FuncInfo->getVarArgsGPRSize() > 0
2067 ? FuncInfo->getVarArgsGPRIndex()
2068 : FuncInfo->getVarArgsStackIndex();
2069 }
2070
2071 auto MIB =
2072 BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AArch64::ADDXri))
2073 .addDef(RegNo: ArgsAddrReg)
2074 .addFrameIndex(Idx: FrameIdx)
2075 .addImm(Val: 0)
2076 .addImm(Val: 0);
2077
2078 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2079
2080 MIB = BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AArch64::STRXui))
2081 .addUse(RegNo: ArgsAddrReg)
2082 .addUse(RegNo: ListReg)
2083 .addImm(Val: 0)
2084 .addMemOperand(MMO: *I.memoperands_begin());
2085
2086 constrainSelectedInstRegOperands(I&: *MIB, TII, TRI, RBI);
2087 I.eraseFromParent();
2088 return true;
2089}
2090
/// Materialize a large-code-model global or block address into
/// I.getOperand(0) using a MOVZ of the low 16 bits (G0) followed by three
/// MOVKs for the G1/G2/G3 chunks (shifts 16, 32 and 48).
///
/// \param I       The instruction whose operand 1 holds the address and whose
///                operand 0 receives the final value.
/// \param V       The GlobalValue or BlockAddress being materialized.
/// \param OpFlags Target flags (e.g. GOT/linkage modifiers) OR'd into each
///                chunk's operand flags.
void AArch64InstructionSelector::materializeLargeCMVal(
    MachineInstr &I, const Value *V, unsigned OpFlags) {
  MachineBasicBlock &MBB = *I.getParent();
  MachineFunction &MF = *MBB.getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  // MOVZ dst, #:abs_g0_nc:V — copy I's address operand and retag it as the
  // low 16-bit chunk.
  auto MovZ = MIB.buildInstr(Opc: AArch64::MOVZXi, DstOps: {&AArch64::GPR64RegClass}, SrcOps: {});
  MovZ->addOperand(MF, Op: I.getOperand(i: 1));
  MovZ->getOperand(i: 1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
                                      AArch64II::MO_NC);
  MovZ->addOperand(MF, Op: MachineOperand::CreateImm(Val: 0));
  constrainSelectedInstRegOperands(I&: *MovZ, TII, TRI, RBI);

  // Emit one MOVK inserting the 16-bit chunk selected by Flags at bit
  // position Offset. ForceDstReg pins the result register (used for the
  // final MOVK so it defines I's original destination).
  auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
                       Register ForceDstReg) {
    Register DstReg = ForceDstReg
                          ? ForceDstReg
                          : MRI.createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
    auto MovI = MIB.buildInstr(Opcode: AArch64::MOVKXi).addDef(RegNo: DstReg).addUse(RegNo: SrcReg);
    if (auto *GV = dyn_cast<GlobalValue>(Val: V)) {
      MovI->addOperand(MF, Op: MachineOperand::CreateGA(
                               GV, Offset: MovZ->getOperand(i: 1).getOffset(), TargetFlags: Flags));
    } else {
      MovI->addOperand(
          MF, Op: MachineOperand::CreateBA(BA: cast<BlockAddress>(Val: V),
                                       Offset: MovZ->getOperand(i: 1).getOffset(), TargetFlags: Flags));
    }
    MovI->addOperand(MF, Op: MachineOperand::CreateImm(Val: Offset));
    constrainSelectedInstRegOperands(I&: *MovI, TII, TRI, RBI);
    return DstReg;
  };
  // Chain the three MOVKs; the last one writes I's destination register.
  Register DstReg = BuildMovK(MovZ.getReg(Idx: 0),
                              AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
  DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
  BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(i: 0).getReg());
}
2127
/// Rewrite certain generic instructions before selection so that the
/// (mostly imported) TableGen patterns can match them — primarily by
/// retyping pointer-typed values as same-sized integers.
///
/// \returns true if \p I was modified.
bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
  MachineBasicBlock &MBB = *I.getParent();
  MachineFunction &MF = *MBB.getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  switch (I.getOpcode()) {
  case TargetOpcode::G_CONSTANT: {
    Register DefReg = I.getOperand(i: 0).getReg();
    const LLT DefTy = MRI.getType(Reg: DefReg);
    if (!DefTy.isPointer())
      return false;
    const unsigned PtrSize = DefTy.getSizeInBits();
    if (PtrSize != 32 && PtrSize != 64)
      return false;
    // Convert pointer typed constants to integers so TableGen can select.
    MRI.setType(VReg: DefReg, Ty: LLT::scalar(SizeInBits: PtrSize));
    return true;
  }
  case TargetOpcode::G_STORE: {
    bool Changed = contractCrossBankCopyIntoStore(I, MRI);
    MachineOperand &SrcOp = I.getOperand(i: 0);
    if (MRI.getType(Reg: SrcOp.getReg()).isPointer()) {
      // Allow matching with imported patterns for stores of pointers. Unlike
      // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy
      // and constrain.
      auto Copy = MIB.buildCopy(Res: LLT::scalar(SizeInBits: 64), Op: SrcOp);
      Register NewSrc = Copy.getReg(Idx: 0);
      SrcOp.setReg(NewSrc);
      RBI.constrainGenericRegister(Reg: NewSrc, RC: AArch64::GPR64RegClass, MRI);
      Changed = true;
    }
    return Changed;
  }
  case TargetOpcode::G_PTR_ADD: {
    // If Checked Pointer Arithmetic (FEAT_CPA) is present, preserve the pointer
    // arithmetic semantics instead of falling back to regular arithmetic.
    const auto &TL = STI.getTargetLowering();
    if (TL->shouldPreservePtrArith(F: MF.getFunction(), PtrVT: EVT()))
      return false;
    return convertPtrAddToAdd(I, MRI);
  }
  case TargetOpcode::G_LOAD: {
    // For scalar loads of pointers, we try to convert the dest type from p0
    // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
    // conversion, this should be ok because all users should have been
    // selected already, so the type doesn't matter for them.
    Register DstReg = I.getOperand(i: 0).getReg();
    const LLT DstTy = MRI.getType(Reg: DstReg);
    if (!DstTy.isPointer())
      return false;
    MRI.setType(VReg: DstReg, Ty: LLT::scalar(SizeInBits: 64));
    return true;
  }
  case AArch64::G_DUP: {
    // Convert the type from p0 to s64 to help selection.
    LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
    if (!DstTy.isPointerVector())
      return false;
    // The scalar source is retyped via a copy constrained to GPR64; the
    // vector destination becomes a vector of s64.
    auto NewSrc = MIB.buildCopy(Res: LLT::scalar(SizeInBits: 64), Op: I.getOperand(i: 1).getReg());
    MRI.setType(VReg: I.getOperand(i: 0).getReg(),
                Ty: DstTy.changeElementType(NewEltTy: LLT::scalar(SizeInBits: 64)));
    MRI.setRegClass(Reg: NewSrc.getReg(Idx: 0), RC: &AArch64::GPR64RegClass);
    I.getOperand(i: 1).setReg(NewSrc.getReg(Idx: 0));
    return true;
  }
  case AArch64::G_INSERT_VECTOR_ELT: {
    // Convert the type from p0 to s64 to help selection.
    LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
    LLT SrcVecTy = MRI.getType(Reg: I.getOperand(i: 1).getReg());
    if (!SrcVecTy.isPointerVector())
      return false;
    // Retype the inserted scalar (operand 2) through a GPR64 copy, and both
    // the source and destination vectors to vectors of s64.
    auto NewSrc = MIB.buildCopy(Res: LLT::scalar(SizeInBits: 64), Op: I.getOperand(i: 2).getReg());
    MRI.setType(VReg: I.getOperand(i: 1).getReg(),
                Ty: DstTy.changeElementType(NewEltTy: LLT::scalar(SizeInBits: 64)));
    MRI.setType(VReg: I.getOperand(i: 0).getReg(),
                Ty: DstTy.changeElementType(NewEltTy: LLT::scalar(SizeInBits: 64)));
    MRI.setRegClass(Reg: NewSrc.getReg(Idx: 0), RC: &AArch64::GPR64RegClass);
    I.getOperand(i: 2).setReg(NewSrc.getReg(Idx: 0));
    return true;
  }
  case TargetOpcode::G_UITOFP:
  case TargetOpcode::G_SITOFP: {
    // If both source and destination regbanks are FPR, then convert the opcode
    // to G_SITOF so that the importer can select it to an fpr variant.
    // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank
    // copy.
    Register SrcReg = I.getOperand(i: 1).getReg();
    LLT SrcTy = MRI.getType(Reg: SrcReg);
    LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
    if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits())
      return false;

    if (RBI.getRegBank(Reg: SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) {
      if (I.getOpcode() == TargetOpcode::G_SITOFP)
        I.setDesc(TII.get(Opcode: AArch64::G_SITOF));
      else
        I.setDesc(TII.get(Opcode: AArch64::G_UITOF));
      return true;
    }
    return false;
  }
  default:
    return false;
  }
}
2233
/// This lowering tries to look for G_PTR_ADD instructions and then converts
/// them to a standard G_ADD with a COPY on the source.
///
/// The motivation behind this is to expose the add semantics to the imported
/// tablegen patterns. We shouldn't need to check for uses being loads/stores,
/// because the selector works bottom up, uses before defs. By the time we
/// end up trying to select a G_PTR_ADD, we should have already attempted to
/// fold this into addressing modes and were therefore unsuccessful.
///
/// \returns true if \p I was rewritten (to G_ADD, or further to G_SUB when
/// the offset is a negation); false if the conversion does not apply or the
/// intermediate G_PTRTOINT failed to select.
bool AArch64InstructionSelector::convertPtrAddToAdd(
    MachineInstr &I, MachineRegisterInfo &MRI) {
  assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
  Register DstReg = I.getOperand(i: 0).getReg();
  Register AddOp1Reg = I.getOperand(i: 1).getReg();
  const LLT PtrTy = MRI.getType(Reg: DstReg);
  // Only handle the default address space.
  if (PtrTy.getAddressSpace() != 0)
    return false;

  const LLT CastPtrTy =
      PtrTy.isVector() ? LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64) : LLT::scalar(SizeInBits: 64);
  auto PtrToInt = MIB.buildPtrToInt(Dst: CastPtrTy, Src: AddOp1Reg);
  // Set regbanks on the registers.
  if (PtrTy.isVector())
    MRI.setRegBank(Reg: PtrToInt.getReg(Idx: 0), RegBank: RBI.getRegBank(ID: AArch64::FPRRegBankID));
  else
    MRI.setRegBank(Reg: PtrToInt.getReg(Idx: 0), RegBank: RBI.getRegBank(ID: AArch64::GPRRegBankID));

  // Now turn the %dst(p0) = G_PTR_ADD %base, off into:
  // %dst(intty) = G_ADD %intbase, off
  I.setDesc(TII.get(Opcode: TargetOpcode::G_ADD));
  MRI.setType(VReg: DstReg, Ty: CastPtrTy);
  I.getOperand(i: 1).setReg(PtrToInt.getReg(Idx: 0));
  // Select the freshly-built G_PTRTOINT immediately, since selection has
  // already moved past this point in the block.
  if (!select(I&: *PtrToInt)) {
    LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
    return false;
  }

  // Also take the opportunity here to try to do some optimization.
  // Try to convert this into a G_SUB if the offset is a 0-x negate idiom.
  Register NegatedReg;
  if (!mi_match(R: I.getOperand(i: 2).getReg(), MRI, P: m_Neg(Src: m_Reg(R&: NegatedReg))))
    return true;
  I.getOperand(i: 2).setReg(NegatedReg);
  I.setDesc(TII.get(Opcode: TargetOpcode::G_SUB));
  return true;
}
2279
2280bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
2281 MachineRegisterInfo &MRI) {
2282 // We try to match the immediate variant of LSL, which is actually an alias
2283 // for a special case of UBFM. Otherwise, we fall back to the imported
2284 // selector which will match the register variant.
2285 assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
2286 const auto &MO = I.getOperand(i: 2);
2287 auto VRegAndVal = getIConstantVRegVal(VReg: MO.getReg(), MRI);
2288 if (!VRegAndVal)
2289 return false;
2290
2291 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
2292 if (DstTy.isVector())
2293 return false;
2294 bool Is64Bit = DstTy.getSizeInBits() == 64;
2295 auto Imm1Fn = Is64Bit ? selectShiftA_64(Root: MO) : selectShiftA_32(Root: MO);
2296 auto Imm2Fn = Is64Bit ? selectShiftB_64(Root: MO) : selectShiftB_32(Root: MO);
2297
2298 if (!Imm1Fn || !Imm2Fn)
2299 return false;
2300
2301 auto NewI =
2302 MIB.buildInstr(Opc: Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri,
2303 DstOps: {I.getOperand(i: 0).getReg()}, SrcOps: {I.getOperand(i: 1).getReg()});
2304
2305 for (auto &RenderFn : *Imm1Fn)
2306 RenderFn(NewI);
2307 for (auto &RenderFn : *Imm2Fn)
2308 RenderFn(NewI);
2309
2310 I.eraseFromParent();
2311 constrainSelectedInstRegOperands(I&: *NewI, TII, TRI, RBI);
2312 return true;
2313}
2314
2315bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
2316 MachineInstr &I, MachineRegisterInfo &MRI) {
2317 assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
2318 // If we're storing a scalar, it doesn't matter what register bank that
2319 // scalar is on. All that matters is the size.
2320 //
2321 // So, if we see something like this (with a 32-bit scalar as an example):
2322 //
2323 // %x:gpr(s32) = ... something ...
2324 // %y:fpr(s32) = COPY %x:gpr(s32)
2325 // G_STORE %y:fpr(s32)
2326 //
2327 // We can fix this up into something like this:
2328 //
2329 // G_STORE %x:gpr(s32)
2330 //
2331 // And then continue the selection process normally.
2332 Register DefDstReg = getSrcRegIgnoringCopies(Reg: I.getOperand(i: 0).getReg(), MRI);
2333 if (!DefDstReg.isValid())
2334 return false;
2335 LLT DefDstTy = MRI.getType(Reg: DefDstReg);
2336 Register StoreSrcReg = I.getOperand(i: 0).getReg();
2337 LLT StoreSrcTy = MRI.getType(Reg: StoreSrcReg);
2338
2339 // If we get something strange like a physical register, then we shouldn't
2340 // go any further.
2341 if (!DefDstTy.isValid())
2342 return false;
2343
2344 // Are the source and dst types the same size?
2345 if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
2346 return false;
2347
2348 if (RBI.getRegBank(Reg: StoreSrcReg, MRI, TRI) ==
2349 RBI.getRegBank(Reg: DefDstReg, MRI, TRI))
2350 return false;
2351
2352 // We have a cross-bank copy, which is entering a store. Let's fold it.
2353 I.getOperand(i: 0).setReg(DefDstReg);
2354 return true;
2355}
2356
2357bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
2358 assert(I.getParent() && "Instruction should be in a basic block!");
2359 assert(I.getParent()->getParent() && "Instruction should be in a function!");
2360
2361 MachineBasicBlock &MBB = *I.getParent();
2362 MachineFunction &MF = *MBB.getParent();
2363 MachineRegisterInfo &MRI = MF.getRegInfo();
2364
2365 switch (I.getOpcode()) {
2366 case AArch64::G_DUP: {
2367 // Before selecting a DUP instruction, check if it is better selected as a
2368 // MOV or load from a constant pool.
2369 Register Src = I.getOperand(i: 1).getReg();
2370 auto ValAndVReg = getAnyConstantVRegValWithLookThrough(
2371 VReg: Src, MRI, /*LookThroughInstrs=*/true, /*LookThroughAnyExt=*/true);
2372 if (!ValAndVReg)
2373 return false;
2374 LLVMContext &Ctx = MF.getFunction().getContext();
2375 Register Dst = I.getOperand(i: 0).getReg();
2376 auto *CV = ConstantDataVector::getSplat(
2377 NumElts: MRI.getType(Reg: Dst).getNumElements(),
2378 Elt: ConstantInt::get(
2379 Ty: Type::getIntNTy(C&: Ctx, N: MRI.getType(Reg: Dst).getScalarSizeInBits()),
2380 V: ValAndVReg->Value.trunc(width: MRI.getType(Reg: Dst).getScalarSizeInBits())));
2381 if (!emitConstantVector(Dst, CV, MIRBuilder&: MIB, MRI))
2382 return false;
2383 I.eraseFromParent();
2384 return true;
2385 }
2386 case TargetOpcode::G_SEXT:
2387 // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV
2388 // over a normal extend.
2389 if (selectUSMovFromExtend(I, MRI))
2390 return true;
2391 return false;
2392 case TargetOpcode::G_BR:
2393 return false;
2394 case TargetOpcode::G_SHL:
2395 return earlySelectSHL(I, MRI);
2396 case TargetOpcode::G_CONSTANT: {
2397 bool IsZero = false;
2398 if (I.getOperand(i: 1).isCImm())
2399 IsZero = I.getOperand(i: 1).getCImm()->isZero();
2400 else if (I.getOperand(i: 1).isImm())
2401 IsZero = I.getOperand(i: 1).getImm() == 0;
2402
2403 if (!IsZero)
2404 return false;
2405
2406 Register DefReg = I.getOperand(i: 0).getReg();
2407 LLT Ty = MRI.getType(Reg: DefReg);
2408 if (Ty.getSizeInBits() == 64) {
2409 I.getOperand(i: 1).ChangeToRegister(Reg: AArch64::XZR, isDef: false);
2410 RBI.constrainGenericRegister(Reg: DefReg, RC: AArch64::GPR64RegClass, MRI);
2411 } else if (Ty.getSizeInBits() <= 32) {
2412 I.getOperand(i: 1).ChangeToRegister(Reg: AArch64::WZR, isDef: false);
2413 RBI.constrainGenericRegister(Reg: DefReg, RC: AArch64::GPR32RegClass, MRI);
2414 } else
2415 return false;
2416
2417 I.setDesc(TII.get(Opcode: TargetOpcode::COPY));
2418 return true;
2419 }
2420
2421 case TargetOpcode::G_ADD: {
2422 // Check if this is being fed by a G_ICMP on either side.
2423 //
2424 // (cmp pred, x, y) + z
2425 //
2426 // In the above case, when the cmp is true, we increment z by 1. So, we can
2427 // fold the add into the cset for the cmp by using cinc.
2428 //
2429 // FIXME: This would probably be a lot nicer in PostLegalizerLowering.
2430 Register AddDst = I.getOperand(i: 0).getReg();
2431 Register AddLHS = I.getOperand(i: 1).getReg();
2432 Register AddRHS = I.getOperand(i: 2).getReg();
2433 // Only handle scalars.
2434 LLT Ty = MRI.getType(Reg: AddLHS);
2435 if (Ty.isVector())
2436 return false;
2437 // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64
2438 // bits.
2439 unsigned Size = Ty.getSizeInBits();
2440 if (Size != 32 && Size != 64)
2441 return false;
2442 auto MatchCmp = [&](Register Reg) -> MachineInstr * {
2443 if (!MRI.hasOneNonDBGUse(RegNo: Reg))
2444 return nullptr;
2445 // If the LHS of the add is 32 bits, then we want to fold a 32-bit
2446 // compare.
2447 if (Size == 32)
2448 return getOpcodeDef(Opcode: TargetOpcode::G_ICMP, Reg, MRI);
2449 // We model scalar compares using 32-bit destinations right now.
2450 // If it's a 64-bit compare, it'll have 64-bit sources.
2451 Register ZExt;
2452 if (!mi_match(R: Reg, MRI,
2453 P: m_OneNonDBGUse(SP: m_GZExt(Src: m_OneNonDBGUse(SP: m_Reg(R&: ZExt))))))
2454 return nullptr;
2455 auto *Cmp = getOpcodeDef(Opcode: TargetOpcode::G_ICMP, Reg: ZExt, MRI);
2456 if (!Cmp ||
2457 MRI.getType(Reg: Cmp->getOperand(i: 2).getReg()).getSizeInBits() != 64)
2458 return nullptr;
2459 return Cmp;
2460 };
2461 // Try to match
2462 // z + (cmp pred, x, y)
2463 MachineInstr *Cmp = MatchCmp(AddRHS);
2464 if (!Cmp) {
2465 // (cmp pred, x, y) + z
2466 std::swap(a&: AddLHS, b&: AddRHS);
2467 Cmp = MatchCmp(AddRHS);
2468 if (!Cmp)
2469 return false;
2470 }
2471 auto &PredOp = Cmp->getOperand(i: 1);
2472 MIB.setInstrAndDebugLoc(I);
2473 emitIntegerCompare(/*LHS=*/Cmp->getOperand(i: 2),
2474 /*RHS=*/Cmp->getOperand(i: 3), Predicate&: PredOp, MIRBuilder&: MIB);
2475 auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
2476 const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(
2477 P: CmpInst::getInversePredicate(pred: Pred), RHS: Cmp->getOperand(i: 3).getReg(), MRI: &MRI);
2478 emitCSINC(/*Dst=*/AddDst, /*Src =*/Src1: AddLHS, /*Src2=*/AddLHS, Pred: InvCC, MIRBuilder&: MIB);
2479 I.eraseFromParent();
2480 return true;
2481 }
2482 case TargetOpcode::G_OR: {
2483 // Look for operations that take the lower `Width=Size-ShiftImm` bits of
2484 // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via
2485 // shifting and masking that we can replace with a BFI (encoded as a BFM).
2486 Register Dst = I.getOperand(i: 0).getReg();
2487 LLT Ty = MRI.getType(Reg: Dst);
2488
2489 if (!Ty.isScalar())
2490 return false;
2491
2492 unsigned Size = Ty.getSizeInBits();
2493 if (Size != 32 && Size != 64)
2494 return false;
2495
2496 Register ShiftSrc;
2497 int64_t ShiftImm;
2498 Register MaskSrc;
2499 int64_t MaskImm;
2500 if (!mi_match(
2501 R: Dst, MRI,
2502 P: m_GOr(L: m_OneNonDBGUse(SP: m_GShl(L: m_Reg(R&: ShiftSrc), R: m_ICst(Cst&: ShiftImm))),
2503 R: m_OneNonDBGUse(SP: m_GAnd(L: m_Reg(R&: MaskSrc), R: m_ICst(Cst&: MaskImm))))))
2504 return false;
2505
2506 if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm))
2507 return false;
2508
2509 int64_t Immr = Size - ShiftImm;
2510 int64_t Imms = Size - ShiftImm - 1;
2511 unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri;
2512 emitInstr(Opcode: Opc, DstOps: {Dst}, SrcOps: {MaskSrc, ShiftSrc, Immr, Imms}, MIRBuilder&: MIB);
2513 I.eraseFromParent();
2514 return true;
2515 }
2516 case TargetOpcode::G_FENCE: {
2517 if (I.getOperand(i: 1).getImm() == 0)
2518 BuildMI(BB&: MBB, I, MIMD: MIMetadata(I), MCID: TII.get(Opcode: TargetOpcode::MEMBARRIER));
2519 else
2520 BuildMI(BB&: MBB, I, MIMD: MIMetadata(I), MCID: TII.get(Opcode: AArch64::DMB))
2521 .addImm(Val: I.getOperand(i: 0).getImm() == 4 ? 0x9 : 0xb);
2522 I.eraseFromParent();
2523 return true;
2524 }
2525 default:
2526 return false;
2527 }
2528}
2529
2530bool AArch64InstructionSelector::select(MachineInstr &I) {
2531 assert(I.getParent() && "Instruction should be in a basic block!");
2532 assert(I.getParent()->getParent() && "Instruction should be in a function!");
2533
2534 MachineBasicBlock &MBB = *I.getParent();
2535 MachineFunction &MF = *MBB.getParent();
2536 MachineRegisterInfo &MRI = MF.getRegInfo();
2537
2538 const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>();
2539 if (Subtarget->requiresStrictAlign()) {
2540 // We don't support this feature yet.
2541 LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
2542 return false;
2543 }
2544
2545 MIB.setInstrAndDebugLoc(I);
2546
2547 unsigned Opcode = I.getOpcode();
2548 // G_PHI requires same handling as PHI
2549 if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
2550 // Certain non-generic instructions also need some special handling.
2551
2552 if (Opcode == TargetOpcode::LOAD_STACK_GUARD) {
2553 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2554 return true;
2555 }
2556
2557 if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
2558 const Register DefReg = I.getOperand(i: 0).getReg();
2559 const LLT DefTy = MRI.getType(Reg: DefReg);
2560
2561 const RegClassOrRegBank &RegClassOrBank =
2562 MRI.getRegClassOrRegBank(Reg: DefReg);
2563
2564 const TargetRegisterClass *DefRC =
2565 dyn_cast<const TargetRegisterClass *>(Val: RegClassOrBank);
2566 if (!DefRC) {
2567 if (!DefTy.isValid()) {
2568 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
2569 return false;
2570 }
2571 const RegisterBank &RB = *cast<const RegisterBank *>(Val: RegClassOrBank);
2572 DefRC = getRegClassForTypeOnBank(Ty: DefTy, RB);
2573 if (!DefRC) {
2574 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
2575 return false;
2576 }
2577 }
2578
2579 I.setDesc(TII.get(Opcode: TargetOpcode::PHI));
2580
2581 return RBI.constrainGenericRegister(Reg: DefReg, RC: *DefRC, MRI);
2582 }
2583
2584 if (I.isCopy())
2585 return selectCopy(I, TII, MRI, TRI, RBI);
2586
2587 if (I.isDebugInstr())
2588 return selectDebugInstr(I, MRI, RBI);
2589
2590 return true;
2591 }
2592
2593
2594 if (I.getNumOperands() != I.getNumExplicitOperands()) {
2595 LLVM_DEBUG(
2596 dbgs() << "Generic instruction has unexpected implicit operands\n");
2597 return false;
2598 }
2599
2600 // Try to do some lowering before we start instruction selecting. These
2601 // lowerings are purely transformations on the input G_MIR and so selection
2602 // must continue after any modification of the instruction.
2603 if (preISelLower(I)) {
2604 Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
2605 }
2606
2607 // There may be patterns where the importer can't deal with them optimally,
2608 // but does select it to a suboptimal sequence so our custom C++ selection
2609 // code later never has a chance to work on it. Therefore, we have an early
2610 // selection attempt here to give priority to certain selection routines
2611 // over the imported ones.
2612 if (earlySelect(I))
2613 return true;
2614
2615 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
2616 return true;
2617
2618 LLT Ty =
2619 I.getOperand(i: 0).isReg() ? MRI.getType(Reg: I.getOperand(i: 0).getReg()) : LLT{};
2620
2621 switch (Opcode) {
2622 case TargetOpcode::G_SBFX:
2623 case TargetOpcode::G_UBFX: {
2624 static const unsigned OpcTable[2][2] = {
2625 {AArch64::UBFMWri, AArch64::UBFMXri},
2626 {AArch64::SBFMWri, AArch64::SBFMXri}};
2627 bool IsSigned = Opcode == TargetOpcode::G_SBFX;
2628 unsigned Size = Ty.getSizeInBits();
2629 unsigned Opc = OpcTable[IsSigned][Size == 64];
2630 auto Cst1 =
2631 getIConstantVRegValWithLookThrough(VReg: I.getOperand(i: 2).getReg(), MRI);
2632 assert(Cst1 && "Should have gotten a constant for src 1?");
2633 auto Cst2 =
2634 getIConstantVRegValWithLookThrough(VReg: I.getOperand(i: 3).getReg(), MRI);
2635 assert(Cst2 && "Should have gotten a constant for src 2?");
2636 auto LSB = Cst1->Value.getZExtValue();
2637 auto Width = Cst2->Value.getZExtValue();
2638 auto BitfieldInst =
2639 MIB.buildInstr(Opc, DstOps: {I.getOperand(i: 0)}, SrcOps: {I.getOperand(i: 1)})
2640 .addImm(Val: LSB)
2641 .addImm(Val: LSB + Width - 1);
2642 I.eraseFromParent();
2643 constrainSelectedInstRegOperands(I&: *BitfieldInst, TII, TRI, RBI);
2644 return true;
2645 }
2646 case TargetOpcode::G_BRCOND:
2647 return selectCompareBranch(I, MF, MRI);
2648
2649 case TargetOpcode::G_BRINDIRECT: {
2650 const Function &Fn = MF.getFunction();
2651 if (std::optional<uint16_t> BADisc =
2652 STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(ParentFn: Fn)) {
2653 auto MI = MIB.buildInstr(Opc: AArch64::BRA, DstOps: {}, SrcOps: {I.getOperand(i: 0).getReg()});
2654 MI.addImm(Val: AArch64PACKey::IA);
2655 MI.addImm(Val: *BADisc);
2656 MI.addReg(/*AddrDisc=*/RegNo: AArch64::XZR);
2657 I.eraseFromParent();
2658 constrainSelectedInstRegOperands(I&: *MI, TII, TRI, RBI);
2659 return true;
2660 }
2661 I.setDesc(TII.get(Opcode: AArch64::BR));
2662 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2663 return true;
2664 }
2665
2666 case TargetOpcode::G_BRJT:
2667 return selectBrJT(I, MRI);
2668
2669 case AArch64::G_ADD_LOW: {
2670 // This op may have been separated from it's ADRP companion by the localizer
2671 // or some other code motion pass. Given that many CPUs will try to
2672 // macro fuse these operations anyway, select this into a MOVaddr pseudo
2673 // which will later be expanded into an ADRP+ADD pair after scheduling.
2674 MachineInstr *BaseMI = MRI.getVRegDef(Reg: I.getOperand(i: 1).getReg());
2675 if (BaseMI->getOpcode() != AArch64::ADRP) {
2676 I.setDesc(TII.get(Opcode: AArch64::ADDXri));
2677 I.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2678 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2679 return true;
2680 }
2681 assert(TM.getCodeModel() == CodeModel::Small &&
2682 "Expected small code model");
2683 auto Op1 = BaseMI->getOperand(i: 1);
2684 auto Op2 = I.getOperand(i: 2);
2685 auto MovAddr = MIB.buildInstr(Opc: AArch64::MOVaddr, DstOps: {I.getOperand(i: 0)}, SrcOps: {})
2686 .addGlobalAddress(GV: Op1.getGlobal(), Offset: Op1.getOffset(),
2687 TargetFlags: Op1.getTargetFlags())
2688 .addGlobalAddress(GV: Op2.getGlobal(), Offset: Op2.getOffset(),
2689 TargetFlags: Op2.getTargetFlags());
2690 I.eraseFromParent();
2691 constrainSelectedInstRegOperands(I&: *MovAddr, TII, TRI, RBI);
2692 return true;
2693 }
2694
2695 case TargetOpcode::G_FCONSTANT: {
2696 const Register DefReg = I.getOperand(i: 0).getReg();
2697 const LLT DefTy = MRI.getType(Reg: DefReg);
2698 const unsigned DefSize = DefTy.getSizeInBits();
2699 const RegisterBank &RB = *RBI.getRegBank(Reg: DefReg, MRI, TRI);
2700
2701 const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(Ty: DefTy, RB);
2702 // For 16, 64, and 128b values, emit a constant pool load.
2703 switch (DefSize) {
2704 default:
2705 llvm_unreachable("Unexpected destination size for G_FCONSTANT?");
2706 case 32:
2707 case 64: {
2708 bool OptForSize = shouldOptForSize(MF: &MF);
2709 const auto &TLI = MF.getSubtarget().getTargetLowering();
2710 // If TLI says that this fpimm is illegal, then we'll expand to a
2711 // constant pool load.
2712 if (TLI->isFPImmLegal(I.getOperand(i: 1).getFPImm()->getValueAPF(),
2713 EVT::getFloatingPointVT(BitWidth: DefSize), ForCodeSize: OptForSize))
2714 break;
2715 [[fallthrough]];
2716 }
2717 case 16:
2718 case 128: {
2719 auto *FPImm = I.getOperand(i: 1).getFPImm();
2720 auto *LoadMI = emitLoadFromConstantPool(CPVal: FPImm, MIRBuilder&: MIB);
2721 if (!LoadMI) {
2722 LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
2723 return false;
2724 }
2725 MIB.buildCopy(Res: {DefReg}, Op: {LoadMI->getOperand(i: 0).getReg()});
2726 I.eraseFromParent();
2727 return RBI.constrainGenericRegister(Reg: DefReg, RC: FPRRC, MRI);
2728 }
2729 }
2730
2731 assert((DefSize == 32 || DefSize == 64) && "Unexpected const def size");
2732 // Either emit a FMOV, or emit a copy to emit a normal mov.
2733 const Register DefGPRReg = MRI.createVirtualRegister(
2734 RegClass: DefSize == 32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass);
2735 MachineOperand &RegOp = I.getOperand(i: 0);
2736 RegOp.setReg(DefGPRReg);
2737 MIB.setInsertPt(MBB&: MIB.getMBB(), II: std::next(x: I.getIterator()));
2738 MIB.buildCopy(Res: {DefReg}, Op: {DefGPRReg});
2739
2740 if (!RBI.constrainGenericRegister(Reg: DefReg, RC: FPRRC, MRI)) {
2741 LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
2742 return false;
2743 }
2744
2745 MachineOperand &ImmOp = I.getOperand(i: 1);
2746 ImmOp.ChangeToImmediate(
2747 ImmVal: ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
2748
2749 const unsigned MovOpc =
2750 DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
2751 I.setDesc(TII.get(Opcode: MovOpc));
2752 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2753 return true;
2754 }
2755 case TargetOpcode::G_EXTRACT: {
2756 Register DstReg = I.getOperand(i: 0).getReg();
2757 Register SrcReg = I.getOperand(i: 1).getReg();
2758 LLT SrcTy = MRI.getType(Reg: SrcReg);
2759 LLT DstTy = MRI.getType(Reg: DstReg);
2760 (void)DstTy;
2761 unsigned SrcSize = SrcTy.getSizeInBits();
2762
2763 if (SrcTy.getSizeInBits() > 64) {
2764 // This should be an extract of an s128, which is like a vector extract.
2765 if (SrcTy.getSizeInBits() != 128)
2766 return false;
2767 // Only support extracting 64 bits from an s128 at the moment.
2768 if (DstTy.getSizeInBits() != 64)
2769 return false;
2770
2771 unsigned Offset = I.getOperand(i: 2).getImm();
2772 if (Offset % 64 != 0)
2773 return false;
2774
2775 // Check we have the right regbank always.
2776 const RegisterBank &SrcRB = *RBI.getRegBank(Reg: SrcReg, MRI, TRI);
2777 const RegisterBank &DstRB = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
2778 assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!");
2779
2780 if (SrcRB.getID() == AArch64::GPRRegBankID) {
2781 auto NewI =
2782 MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {DstReg}, SrcOps: {})
2783 .addUse(RegNo: SrcReg, Flags: {},
2784 SubReg: Offset == 0 ? AArch64::sube64 : AArch64::subo64);
2785 constrainOperandRegClass(MF, TRI, MRI, TII, RBI, InsertPt&: *NewI,
2786 RegClass: AArch64::GPR64RegClass, RegMO&: NewI->getOperand(i: 0));
2787 I.eraseFromParent();
2788 return true;
2789 }
2790
2791 // Emit the same code as a vector extract.
2792 // Offset must be a multiple of 64.
2793 unsigned LaneIdx = Offset / 64;
2794 MachineInstr *Extract = emitExtractVectorElt(
2795 DstReg, DstRB, ScalarTy: LLT::scalar(SizeInBits: 64), VecReg: SrcReg, LaneIdx, MIRBuilder&: MIB);
2796 if (!Extract)
2797 return false;
2798 I.eraseFromParent();
2799 return true;
2800 }
2801
2802 I.setDesc(TII.get(Opcode: SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
2803 MachineInstrBuilder(MF, I).addImm(Val: I.getOperand(i: 2).getImm() +
2804 Ty.getSizeInBits() - 1);
2805
2806 if (SrcSize < 64) {
2807 assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
2808 "unexpected G_EXTRACT types");
2809 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2810 return true;
2811 }
2812
2813 DstReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
2814 MIB.setInsertPt(MBB&: MIB.getMBB(), II: std::next(x: I.getIterator()));
2815 MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {I.getOperand(i: 0).getReg()}, SrcOps: {})
2816 .addReg(RegNo: DstReg, Flags: {}, SubReg: AArch64::sub_32);
2817 RBI.constrainGenericRegister(Reg: I.getOperand(i: 0).getReg(),
2818 RC: AArch64::GPR32RegClass, MRI);
2819 I.getOperand(i: 0).setReg(DstReg);
2820
2821 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2822 return true;
2823 }
2824
2825 case TargetOpcode::G_INSERT: {
2826 LLT SrcTy = MRI.getType(Reg: I.getOperand(i: 2).getReg());
2827 LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
2828 unsigned DstSize = DstTy.getSizeInBits();
2829 // Larger inserts are vectors, same-size ones should be something else by
2830 // now (split up or turned into COPYs).
2831 if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
2832 return false;
2833
2834 I.setDesc(TII.get(Opcode: DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
2835 unsigned LSB = I.getOperand(i: 3).getImm();
2836 unsigned Width = MRI.getType(Reg: I.getOperand(i: 2).getReg()).getSizeInBits();
2837 I.getOperand(i: 3).setImm((DstSize - LSB) % DstSize);
2838 MachineInstrBuilder(MF, I).addImm(Val: Width - 1);
2839
2840 if (DstSize < 64) {
2841 assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
2842 "unexpected G_INSERT types");
2843 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2844 return true;
2845 }
2846
2847 Register SrcReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
2848 BuildMI(BB&: MBB, I: I.getIterator(), MIMD: I.getDebugLoc(),
2849 MCID: TII.get(Opcode: AArch64::SUBREG_TO_REG))
2850 .addDef(RegNo: SrcReg)
2851 .addUse(RegNo: I.getOperand(i: 2).getReg())
2852 .addImm(Val: AArch64::sub_32);
2853 RBI.constrainGenericRegister(Reg: I.getOperand(i: 2).getReg(),
2854 RC: AArch64::GPR32RegClass, MRI);
2855 I.getOperand(i: 2).setReg(SrcReg);
2856
2857 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2858 return true;
2859 }
2860 case TargetOpcode::G_FRAME_INDEX: {
2861 // allocas and G_FRAME_INDEX are only supported in addrspace(0).
2862 if (Ty != LLT::pointer(AddressSpace: 0, SizeInBits: 64)) {
2863 LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
2864 << ", expected: " << LLT::pointer(0, 64) << '\n');
2865 return false;
2866 }
2867 I.setDesc(TII.get(Opcode: AArch64::ADDXri));
2868
2869 // MOs for a #0 shifted immediate.
2870 I.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2871 I.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2872
2873 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2874 return true;
2875 }
2876
2877 case TargetOpcode::G_GLOBAL_VALUE: {
2878 const GlobalValue *GV = nullptr;
2879 unsigned OpFlags;
2880 if (I.getOperand(i: 1).isSymbol()) {
2881 OpFlags = I.getOperand(i: 1).getTargetFlags();
2882 // Currently only used by "RtLibUseGOT".
2883 assert(OpFlags == AArch64II::MO_GOT);
2884 } else {
2885 GV = I.getOperand(i: 1).getGlobal();
2886 if (GV->isThreadLocal()) {
2887 // We don't support instructions with emulated TLS variables yet
2888 if (TM.useEmulatedTLS())
2889 return false;
2890 return selectTLSGlobalValue(I, MRI);
2891 }
2892 OpFlags = STI.ClassifyGlobalReference(GV, TM);
2893 }
2894
2895 if (OpFlags & AArch64II::MO_GOT) {
2896 bool IsGOTSigned = MF.getInfo<AArch64FunctionInfo>()->hasELFSignedGOT();
2897 I.setDesc(TII.get(Opcode: IsGOTSigned ? AArch64::LOADgotAUTH : AArch64::LOADgot));
2898 I.getOperand(i: 1).setTargetFlags(OpFlags);
2899 I.addImplicitDefUseOperands(MF);
2900 } else if (TM.getCodeModel() == CodeModel::Large &&
2901 !TM.isPositionIndependent()) {
2902 // Materialize the global using movz/movk instructions.
2903 materializeLargeCMVal(I, V: GV, OpFlags);
2904 I.eraseFromParent();
2905 return true;
2906 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2907 I.setDesc(TII.get(Opcode: AArch64::ADR));
2908 I.getOperand(i: 1).setTargetFlags(OpFlags);
2909 } else {
2910 I.setDesc(TII.get(Opcode: AArch64::MOVaddr));
2911 I.getOperand(i: 1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
2912 MachineInstrBuilder MIB(MF, I);
2913 MIB.addGlobalAddress(GV, Offset: I.getOperand(i: 1).getOffset(),
2914 TargetFlags: OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
2915 }
2916 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2917 return true;
2918 }
2919
2920 case TargetOpcode::G_PTRAUTH_GLOBAL_VALUE:
2921 return selectPtrAuthGlobalValue(I, MRI);
2922
2923 case TargetOpcode::G_ZEXTLOAD:
2924 case TargetOpcode::G_LOAD:
2925 case TargetOpcode::G_STORE: {
2926 GLoadStore &LdSt = cast<GLoadStore>(Val&: I);
2927 bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
2928 LLT PtrTy = MRI.getType(Reg: LdSt.getPointerReg());
2929
2930 // Can only handle AddressSpace 0, 64-bit pointers.
2931 if (PtrTy != LLT::pointer(AddressSpace: 0, SizeInBits: 64)) {
2932 return false;
2933 }
2934
2935 uint64_t MemSizeInBytes = LdSt.getMemSize().getValue();
2936 unsigned MemSizeInBits = LdSt.getMemSizeInBits().getValue();
2937 AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
2938
2939 // Need special instructions for atomics that affect ordering.
2940 if (isStrongerThanMonotonic(AO: Order)) {
2941 assert(!isa<GZExtLoad>(LdSt));
2942 assert(MemSizeInBytes <= 8 &&
2943 "128-bit atomics should already be custom-legalized");
2944
2945 if (isa<GLoad>(Val: LdSt)) {
2946 static constexpr unsigned LDAPROpcodes[] = {
2947 AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX};
2948 static constexpr unsigned LDAROpcodes[] = {
2949 AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX};
2950 ArrayRef<unsigned> Opcodes =
2951 STI.hasRCPC() && Order != AtomicOrdering::SequentiallyConsistent
2952 ? LDAPROpcodes
2953 : LDAROpcodes;
2954 I.setDesc(TII.get(Opcode: Opcodes[Log2_32(Value: MemSizeInBytes)]));
2955 } else {
2956 static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
2957 AArch64::STLRW, AArch64::STLRX};
2958 Register ValReg = LdSt.getReg(Idx: 0);
2959 if (MRI.getType(Reg: ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) {
2960 // Emit a subreg copy of 32 bits.
2961 Register NewVal = MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
2962 MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {NewVal}, SrcOps: {})
2963 .addReg(RegNo: I.getOperand(i: 0).getReg(), Flags: {}, SubReg: AArch64::sub_32);
2964 I.getOperand(i: 0).setReg(NewVal);
2965 }
2966 I.setDesc(TII.get(Opcode: Opcodes[Log2_32(Value: MemSizeInBytes)]));
2967 }
2968 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2969 return true;
2970 }
2971
2972#ifndef NDEBUG
2973 const Register PtrReg = LdSt.getPointerReg();
2974 const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
2975 // Check that the pointer register is valid.
2976 assert(PtrRB.getID() == AArch64::GPRRegBankID &&
2977 "Load/Store pointer operand isn't a GPR");
2978 assert(MRI.getType(PtrReg).isPointer() &&
2979 "Load/Store pointer operand isn't a pointer");
2980#endif
2981
2982 const Register ValReg = LdSt.getReg(Idx: 0);
2983 const RegisterBank &RB = *RBI.getRegBank(Reg: ValReg, MRI, TRI);
2984 LLT ValTy = MRI.getType(Reg: ValReg);
2985
2986 // The code below doesn't support truncating stores, so we need to split it
2987 // again.
2988 if (isa<GStore>(Val: LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
2989 unsigned SubReg;
2990 LLT MemTy = LdSt.getMMO().getMemoryType();
2991 auto *RC = getRegClassForTypeOnBank(Ty: MemTy, RB);
2992 if (!getSubRegForClass(RC, TRI, SubReg))
2993 return false;
2994
2995 // Generate a subreg copy.
2996 auto Copy = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {MemTy}, SrcOps: {})
2997 .addReg(RegNo: ValReg, Flags: {}, SubReg)
2998 .getReg(Idx: 0);
2999 RBI.constrainGenericRegister(Reg: Copy, RC: *RC, MRI);
3000 LdSt.getOperand(i: 0).setReg(Copy);
3001 } else if (isa<GLoad>(Val: LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
3002 // If this is an any-extending load from the FPR bank, split it into a regular
3003 // load + extend.
3004 if (RB.getID() == AArch64::FPRRegBankID) {
3005 unsigned SubReg;
3006 LLT MemTy = LdSt.getMMO().getMemoryType();
3007 auto *RC = getRegClassForTypeOnBank(Ty: MemTy, RB);
3008 if (!getSubRegForClass(RC, TRI, SubReg))
3009 return false;
3010 Register OldDst = LdSt.getReg(Idx: 0);
3011 Register NewDst =
3012 MRI.createGenericVirtualRegister(Ty: LdSt.getMMO().getMemoryType());
3013 LdSt.getOperand(i: 0).setReg(NewDst);
3014 MRI.setRegBank(Reg: NewDst, RegBank: RB);
3015 // Generate a SUBREG_TO_REG to extend it.
3016 MIB.setInsertPt(MBB&: MIB.getMBB(), II: std::next(x: LdSt.getIterator()));
3017 MIB.buildInstr(Opc: AArch64::SUBREG_TO_REG, DstOps: {OldDst}, SrcOps: {})
3018 .addUse(RegNo: NewDst)
3019 .addImm(Val: SubReg);
3020 auto SubRegRC = getRegClassForTypeOnBank(Ty: MRI.getType(Reg: OldDst), RB);
3021 RBI.constrainGenericRegister(Reg: OldDst, RC: *SubRegRC, MRI);
3022 MIB.setInstr(LdSt);
3023 ValTy = MemTy; // This is no longer an extending load.
3024 }
3025 }
3026
3027 // Helper lambda for partially selecting I. Either returns the original
3028 // instruction with an updated opcode, or a new instruction.
3029 auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
3030 bool IsStore = isa<GStore>(Val: I);
3031 const unsigned NewOpc =
3032 selectLoadStoreUIOp(GenericOpc: I.getOpcode(), RegBankID: RB.getID(), OpSize: MemSizeInBits);
3033 if (NewOpc == I.getOpcode())
3034 return nullptr;
3035 // Check if we can fold anything into the addressing mode.
3036 auto AddrModeFns =
3037 selectAddrModeIndexed(Root&: I.getOperand(i: 1), Size: MemSizeInBytes);
3038 if (!AddrModeFns) {
3039 // Can't fold anything. Use the original instruction.
3040 I.setDesc(TII.get(Opcode: NewOpc));
3041 I.addOperand(Op: MachineOperand::CreateImm(Val: 0));
3042 return &I;
3043 }
3044
3045 // Folded something. Create a new instruction and return it.
3046 auto NewInst = MIB.buildInstr(Opc: NewOpc, DstOps: {}, SrcOps: {}, Flags: I.getFlags());
3047 Register CurValReg = I.getOperand(i: 0).getReg();
3048 IsStore ? NewInst.addUse(RegNo: CurValReg) : NewInst.addDef(RegNo: CurValReg);
3049 NewInst.cloneMemRefs(OtherMI: I);
3050 for (auto &Fn : *AddrModeFns)
3051 Fn(NewInst);
3052 I.eraseFromParent();
3053 return &*NewInst;
3054 };
3055
3056 MachineInstr *LoadStore = SelectLoadStoreAddressingMode();
3057 if (!LoadStore)
3058 return false;
3059
3060 // If we're storing a 0, use WZR/XZR.
3061 if (Opcode == TargetOpcode::G_STORE) {
3062 auto CVal = getIConstantVRegValWithLookThrough(
3063 VReg: LoadStore->getOperand(i: 0).getReg(), MRI);
3064 if (CVal && CVal->Value == 0) {
3065 switch (LoadStore->getOpcode()) {
3066 case AArch64::STRWui:
3067 case AArch64::STRHHui:
3068 case AArch64::STRBBui:
3069 LoadStore->getOperand(i: 0).setReg(AArch64::WZR);
3070 break;
3071 case AArch64::STRXui:
3072 LoadStore->getOperand(i: 0).setReg(AArch64::XZR);
3073 break;
3074 }
3075 }
3076 }
3077
3078 if (IsZExtLoad || (Opcode == TargetOpcode::G_LOAD &&
3079 ValTy == LLT::scalar(SizeInBits: 64) && MemSizeInBits == 32)) {
3080 // The any/zextload from a smaller type to i32 should be handled by the
3081 // importer.
3082 if (MRI.getType(Reg: LoadStore->getOperand(i: 0).getReg()).getSizeInBits() != 64)
3083 return false;
3084 // If we have an extending load then change the load's type to be a
3085 // narrower reg and zero_extend with SUBREG_TO_REG.
3086 Register LdReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
3087 Register DstReg = LoadStore->getOperand(i: 0).getReg();
3088 LoadStore->getOperand(i: 0).setReg(LdReg);
3089
3090 MIB.setInsertPt(MBB&: MIB.getMBB(), II: std::next(x: LoadStore->getIterator()));
3091 MIB.buildInstr(Opc: AArch64::SUBREG_TO_REG, DstOps: {DstReg}, SrcOps: {})
3092 .addUse(RegNo: LdReg)
3093 .addImm(Val: AArch64::sub_32);
3094 constrainSelectedInstRegOperands(I&: *LoadStore, TII, TRI, RBI);
3095 return RBI.constrainGenericRegister(Reg: DstReg, RC: AArch64::GPR64allRegClass,
3096 MRI);
3097 }
3098 constrainSelectedInstRegOperands(I&: *LoadStore, TII, TRI, RBI);
3099 return true;
3100 }
3101
3102 case TargetOpcode::G_INDEXED_ZEXTLOAD:
3103 case TargetOpcode::G_INDEXED_SEXTLOAD:
3104 return selectIndexedExtLoad(I, MRI);
3105 case TargetOpcode::G_INDEXED_LOAD:
3106 return selectIndexedLoad(I, MRI);
3107 case TargetOpcode::G_INDEXED_STORE:
3108 return selectIndexedStore(I&: cast<GIndexedStore>(Val&: I), MRI);
3109
3110 case TargetOpcode::G_LSHR:
3111 case TargetOpcode::G_ASHR:
3112 if (MRI.getType(Reg: I.getOperand(i: 0).getReg()).isVector())
3113 return selectVectorAshrLshr(I, MRI);
3114 [[fallthrough]];
3115 case TargetOpcode::G_SHL:
3116 if (Opcode == TargetOpcode::G_SHL &&
3117 MRI.getType(Reg: I.getOperand(i: 0).getReg()).isVector())
3118 return selectVectorSHL(I, MRI);
3119
3120 // These shifts were legalized to have 64 bit shift amounts because we
3121 // want to take advantage of the selection patterns that assume the
3122 // immediates are s64s, however, selectBinaryOp will assume both operands
3123 // will have the same bit size.
3124 {
3125 Register SrcReg = I.getOperand(i: 1).getReg();
3126 Register ShiftReg = I.getOperand(i: 2).getReg();
3127 const LLT ShiftTy = MRI.getType(Reg: ShiftReg);
3128 const LLT SrcTy = MRI.getType(Reg: SrcReg);
3129 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
3130 ShiftTy.getSizeInBits() == 64) {
3131 assert(!ShiftTy.isVector() && "unexpected vector shift ty");
3132 // Insert a subregister copy to implement a 64->32 trunc
3133 auto Trunc = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {SrcTy}, SrcOps: {})
3134 .addReg(RegNo: ShiftReg, Flags: {}, SubReg: AArch64::sub_32);
3135 MRI.setRegBank(Reg: Trunc.getReg(Idx: 0), RegBank: RBI.getRegBank(ID: AArch64::GPRRegBankID));
3136 I.getOperand(i: 2).setReg(Trunc.getReg(Idx: 0));
3137 }
3138 }
3139 [[fallthrough]];
3140 case TargetOpcode::G_OR: {
3141 // Reject the various things we don't support yet.
3142 if (unsupportedBinOp(I, RBI, MRI, TRI))
3143 return false;
3144
3145 const unsigned OpSize = Ty.getSizeInBits();
3146
3147 const Register DefReg = I.getOperand(i: 0).getReg();
3148 const RegisterBank &RB = *RBI.getRegBank(Reg: DefReg, MRI, TRI);
3149
3150 const unsigned NewOpc = selectBinaryOp(GenericOpc: I.getOpcode(), RegBankID: RB.getID(), OpSize);
3151 if (NewOpc == I.getOpcode())
3152 return false;
3153
3154 I.setDesc(TII.get(Opcode: NewOpc));
3155 // FIXME: Should the type be always reset in setDesc?
3156
3157 // Now that we selected an opcode, we need to constrain the register
3158 // operands to use appropriate classes.
3159 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3160 return true;
3161 }
3162
3163 case TargetOpcode::G_PTR_ADD: {
3164 emitADD(DefReg: I.getOperand(i: 0).getReg(), LHS&: I.getOperand(i: 1), RHS&: I.getOperand(i: 2), MIRBuilder&: MIB);
3165 I.eraseFromParent();
3166 return true;
3167 }
3168
3169 case TargetOpcode::G_SADDE:
3170 case TargetOpcode::G_UADDE:
3171 case TargetOpcode::G_SSUBE:
3172 case TargetOpcode::G_USUBE:
3173 case TargetOpcode::G_SADDO:
3174 case TargetOpcode::G_UADDO:
3175 case TargetOpcode::G_SSUBO:
3176 case TargetOpcode::G_USUBO:
3177 return selectOverflowOp(I, MRI);
3178
3179 case TargetOpcode::G_PTRMASK: {
3180 Register MaskReg = I.getOperand(i: 2).getReg();
3181 std::optional<int64_t> MaskVal = getIConstantVRegSExtVal(VReg: MaskReg, MRI);
3182 // TODO: Implement arbitrary cases
3183 if (!MaskVal || !isShiftedMask_64(Value: *MaskVal))
3184 return false;
3185
3186 uint64_t Mask = *MaskVal;
3187 I.setDesc(TII.get(Opcode: AArch64::ANDXri));
3188 I.getOperand(i: 2).ChangeToImmediate(
3189 ImmVal: AArch64_AM::encodeLogicalImmediate(imm: Mask, regSize: 64));
3190
3191 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3192 return true;
3193 }
3194 case TargetOpcode::G_PTRTOINT:
3195 case TargetOpcode::G_TRUNC: {
3196 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
3197 const LLT SrcTy = MRI.getType(Reg: I.getOperand(i: 1).getReg());
3198
3199 const Register DstReg = I.getOperand(i: 0).getReg();
3200 const Register SrcReg = I.getOperand(i: 1).getReg();
3201
3202 const RegisterBank &DstRB = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
3203 const RegisterBank &SrcRB = *RBI.getRegBank(Reg: SrcReg, MRI, TRI);
3204
3205 if (DstRB.getID() != SrcRB.getID()) {
3206 LLVM_DEBUG(
3207 dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
3208 return false;
3209 }
3210
3211 if (DstRB.getID() == AArch64::GPRRegBankID) {
3212 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(Ty: DstTy, RB: DstRB);
3213 if (!DstRC)
3214 return false;
3215
3216 const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(Ty: SrcTy, RB: SrcRB);
3217 if (!SrcRC)
3218 return false;
3219
3220 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI) ||
3221 !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI)) {
3222 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
3223 return false;
3224 }
3225
3226 if (DstRC == SrcRC) {
3227 // Nothing to be done
3228 } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(SizeInBits: 32) &&
3229 SrcTy == LLT::scalar(SizeInBits: 64)) {
3230 llvm_unreachable("TableGen can import this case");
3231 return false;
3232 } else if (DstRC == &AArch64::GPR32RegClass &&
3233 SrcRC == &AArch64::GPR64RegClass) {
3234 I.getOperand(i: 1).setSubReg(AArch64::sub_32);
3235 } else {
3236 LLVM_DEBUG(
3237 dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
3238 return false;
3239 }
3240
3241 I.setDesc(TII.get(Opcode: TargetOpcode::COPY));
3242 return true;
3243 } else if (DstRB.getID() == AArch64::FPRRegBankID) {
3244 if (DstTy == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16) &&
3245 SrcTy == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32)) {
3246 I.setDesc(TII.get(Opcode: AArch64::XTNv4i16));
3247 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3248 return true;
3249 }
3250
3251 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
3252 MachineInstr *Extract = emitExtractVectorElt(
3253 DstReg, DstRB, ScalarTy: LLT::scalar(SizeInBits: DstTy.getSizeInBits()), VecReg: SrcReg, LaneIdx: 0, MIRBuilder&: MIB);
3254 if (!Extract)
3255 return false;
3256 I.eraseFromParent();
3257 return true;
3258 }
3259
3260 // We might have a vector G_PTRTOINT, in which case just emit a COPY.
3261 if (Opcode == TargetOpcode::G_PTRTOINT) {
3262 assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
3263 I.setDesc(TII.get(Opcode: TargetOpcode::COPY));
3264 return selectCopy(I, TII, MRI, TRI, RBI);
3265 }
3266 }
3267
3268 return false;
3269 }
3270
3271 case TargetOpcode::G_ANYEXT: {
3272 if (selectUSMovFromExtend(I, MRI))
3273 return true;
3274
3275 const Register DstReg = I.getOperand(i: 0).getReg();
3276 const Register SrcReg = I.getOperand(i: 1).getReg();
3277
3278 const RegisterBank &RBDst = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
3279 if (RBDst.getID() != AArch64::GPRRegBankID) {
3280 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
3281 << ", expected: GPR\n");
3282 return false;
3283 }
3284
3285 const RegisterBank &RBSrc = *RBI.getRegBank(Reg: SrcReg, MRI, TRI);
3286 if (RBSrc.getID() != AArch64::GPRRegBankID) {
3287 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
3288 << ", expected: GPR\n");
3289 return false;
3290 }
3291
3292 const unsigned DstSize = MRI.getType(Reg: DstReg).getSizeInBits();
3293
3294 if (DstSize == 0) {
3295 LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
3296 return false;
3297 }
3298
3299 if (DstSize != 64 && DstSize > 32) {
3300 LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
3301 << ", expected: 32 or 64\n");
3302 return false;
3303 }
3304 // At this point G_ANYEXT is just like a plain COPY, but we need
3305 // to explicitly form the 64-bit value if any.
3306 if (DstSize > 32) {
3307 Register ExtSrc = MRI.createVirtualRegister(RegClass: &AArch64::GPR64allRegClass);
3308 BuildMI(BB&: MBB, I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AArch64::SUBREG_TO_REG))
3309 .addDef(RegNo: ExtSrc)
3310 .addUse(RegNo: SrcReg)
3311 .addImm(Val: AArch64::sub_32);
3312 I.getOperand(i: 1).setReg(ExtSrc);
3313 }
3314 return selectCopy(I, TII, MRI, TRI, RBI);
3315 }
3316
3317 case TargetOpcode::G_ZEXT:
3318 case TargetOpcode::G_SEXT_INREG:
3319 case TargetOpcode::G_SEXT: {
3320 if (selectUSMovFromExtend(I, MRI))
3321 return true;
3322
3323 unsigned Opcode = I.getOpcode();
3324 const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
3325 const Register DefReg = I.getOperand(i: 0).getReg();
3326 Register SrcReg = I.getOperand(i: 1).getReg();
3327 const LLT DstTy = MRI.getType(Reg: DefReg);
3328 const LLT SrcTy = MRI.getType(Reg: SrcReg);
3329 unsigned DstSize = DstTy.getSizeInBits();
3330 unsigned SrcSize = SrcTy.getSizeInBits();
3331
3332 // SEXT_INREG has the same src reg size as dst, the size of the value to be
3333 // extended is encoded in the imm.
3334 if (Opcode == TargetOpcode::G_SEXT_INREG)
3335 SrcSize = I.getOperand(i: 2).getImm();
3336
3337 if (DstTy.isVector())
3338 return false; // Should be handled by imported patterns.
3339
3340 assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
3341 AArch64::GPRRegBankID &&
3342 "Unexpected ext regbank");
3343
3344 MachineInstr *ExtI;
3345
3346 // First check if we're extending the result of a load which has a dest type
3347 // smaller than 32 bits, then this zext is redundant. GPR32 is the smallest
3348 // GPR register on AArch64 and all loads which are smaller automatically
3349 // zero-extend the upper bits. E.g.
3350 // %v(s8) = G_LOAD %p, :: (load 1)
3351 // %v2(s32) = G_ZEXT %v(s8)
3352 if (!IsSigned) {
3353 auto *LoadMI = getOpcodeDef(Opcode: TargetOpcode::G_LOAD, Reg: SrcReg, MRI);
3354 bool IsGPR =
3355 RBI.getRegBank(Reg: SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
3356 if (LoadMI && IsGPR) {
3357 const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
3358 unsigned BytesLoaded = MemOp->getSize().getValue();
3359 if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
3360 return selectCopy(I, TII, MRI, TRI, RBI);
3361 }
3362
3363 // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs)
3364 // + SUBREG_TO_REG.
3365 if (IsGPR && SrcSize == 32 && DstSize == 64) {
3366 Register SubregToRegSrc =
3367 MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
3368 const Register ZReg = AArch64::WZR;
3369 MIB.buildInstr(Opc: AArch64::ORRWrs, DstOps: {SubregToRegSrc}, SrcOps: {ZReg, SrcReg})
3370 .addImm(Val: 0);
3371
3372 MIB.buildInstr(Opc: AArch64::SUBREG_TO_REG, DstOps: {DefReg}, SrcOps: {})
3373 .addUse(RegNo: SubregToRegSrc)
3374 .addImm(Val: AArch64::sub_32);
3375
3376 if (!RBI.constrainGenericRegister(Reg: DefReg, RC: AArch64::GPR64RegClass,
3377 MRI)) {
3378 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
3379 return false;
3380 }
3381
3382 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: AArch64::GPR32RegClass,
3383 MRI)) {
3384 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
3385 return false;
3386 }
3387
3388 I.eraseFromParent();
3389 return true;
3390 }
3391 }
3392
3393 if (DstSize == 64) {
3394 if (Opcode != TargetOpcode::G_SEXT_INREG) {
3395 // FIXME: Can we avoid manually doing this?
3396 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: AArch64::GPR32RegClass,
3397 MRI)) {
3398 LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
3399 << " operand\n");
3400 return false;
3401 }
3402 SrcReg = MIB.buildInstr(Opc: AArch64::SUBREG_TO_REG,
3403 DstOps: {&AArch64::GPR64RegClass}, SrcOps: {})
3404 .addUse(RegNo: SrcReg)
3405 .addImm(Val: AArch64::sub_32)
3406 .getReg(Idx: 0);
3407 }
3408
3409 ExtI = MIB.buildInstr(Opc: IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
3410 DstOps: {DefReg}, SrcOps: {SrcReg})
3411 .addImm(Val: 0)
3412 .addImm(Val: SrcSize - 1);
3413 } else if (DstSize <= 32) {
3414 ExtI = MIB.buildInstr(Opc: IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
3415 DstOps: {DefReg}, SrcOps: {SrcReg})
3416 .addImm(Val: 0)
3417 .addImm(Val: SrcSize - 1);
3418 } else {
3419 return false;
3420 }
3421
3422 constrainSelectedInstRegOperands(I&: *ExtI, TII, TRI, RBI);
3423 I.eraseFromParent();
3424 return true;
3425 }
3426
3427 case TargetOpcode::G_FREEZE:
3428 return selectCopy(I, TII, MRI, TRI, RBI);
3429
3430 case TargetOpcode::G_INTTOPTR:
3431 // The importer is currently unable to import pointer types since they
3432 // didn't exist in SelectionDAG.
3433 return selectCopy(I, TII, MRI, TRI, RBI);
3434
3435 case TargetOpcode::G_BITCAST:
3436 // Imported SelectionDAG rules can handle every bitcast except those that
3437 // bitcast from a type to the same type. Ideally, these shouldn't occur
3438 // but we might not run an optimizer that deletes them. The other exception
3439 // is bitcasts involving pointer types, as SelectionDAG has no knowledge
3440 // of them.
3441 return selectCopy(I, TII, MRI, TRI, RBI);
3442
3443 case TargetOpcode::G_SELECT: {
3444 auto &Sel = cast<GSelect>(Val&: I);
3445 const Register CondReg = Sel.getCondReg();
3446 const Register TReg = Sel.getTrueReg();
3447 const Register FReg = Sel.getFalseReg();
3448
3449 if (tryOptSelect(Sel))
3450 return true;
3451
3452 // Make sure to use an unused vreg instead of wzr, so that the peephole
3453 // optimizations will be able to optimize these.
3454 Register DeadVReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
3455 auto TstMI = MIB.buildInstr(Opc: AArch64::ANDSWri, DstOps: {DeadVReg}, SrcOps: {CondReg})
3456 .addImm(Val: AArch64_AM::encodeLogicalImmediate(imm: 1, regSize: 32));
3457 constrainSelectedInstRegOperands(I&: *TstMI, TII, TRI, RBI);
3458 if (!emitSelect(Dst: Sel.getReg(Idx: 0), True: TReg, False: FReg, CC: AArch64CC::NE, MIB))
3459 return false;
3460 Sel.eraseFromParent();
3461 return true;
3462 }
3463 case TargetOpcode::G_ICMP: {
3464 if (Ty.isVector())
3465 return false;
3466
3467 if (Ty != LLT::scalar(SizeInBits: 32)) {
3468 LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
3469 << ", expected: " << LLT::scalar(32) << '\n');
3470 return false;
3471 }
3472
3473 auto &PredOp = I.getOperand(i: 1);
3474 emitIntegerCompare(LHS&: I.getOperand(i: 2), RHS&: I.getOperand(i: 3), Predicate&: PredOp, MIRBuilder&: MIB);
3475 auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
3476 const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(
3477 P: CmpInst::getInversePredicate(pred: Pred), RHS: I.getOperand(i: 3).getReg(), MRI: &MRI);
3478 emitCSINC(/*Dst=*/I.getOperand(i: 0).getReg(), /*Src1=*/AArch64::WZR,
3479 /*Src2=*/AArch64::WZR, Pred: InvCC, MIRBuilder&: MIB);
3480 I.eraseFromParent();
3481 return true;
3482 }
3483
3484 case TargetOpcode::G_FCMP: {
3485 CmpInst::Predicate Pred =
3486 static_cast<CmpInst::Predicate>(I.getOperand(i: 1).getPredicate());
3487 if (!emitFPCompare(LHS: I.getOperand(i: 2).getReg(), RHS: I.getOperand(i: 3).getReg(), MIRBuilder&: MIB,
3488 Pred) ||
3489 !emitCSetForFCmp(Dst: I.getOperand(i: 0).getReg(), Pred, MIRBuilder&: MIB))
3490 return false;
3491 I.eraseFromParent();
3492 return true;
3493 }
3494 case TargetOpcode::G_VASTART:
3495 return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
3496 : selectVaStartAAPCS(I, MF, MRI);
3497 case TargetOpcode::G_INTRINSIC:
3498 return selectIntrinsic(I, MRI);
3499 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3500 return selectIntrinsicWithSideEffects(I, MRI);
3501 case TargetOpcode::G_IMPLICIT_DEF: {
3502 I.setDesc(TII.get(Opcode: TargetOpcode::IMPLICIT_DEF));
3503 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
3504 const Register DstReg = I.getOperand(i: 0).getReg();
3505 const RegisterBank &DstRB = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
3506 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(Ty: DstTy, RB: DstRB);
3507 RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI);
3508 return true;
3509 }
3510 case TargetOpcode::G_BLOCK_ADDR: {
3511 Function *BAFn = I.getOperand(i: 1).getBlockAddress()->getFunction();
3512 if (std::optional<uint16_t> BADisc =
3513 STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(ParentFn: *BAFn)) {
3514 MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {AArch64::X16}, SrcOps: {});
3515 MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {AArch64::X17}, SrcOps: {});
3516 MIB.buildInstr(Opcode: AArch64::MOVaddrPAC)
3517 .addBlockAddress(BA: I.getOperand(i: 1).getBlockAddress())
3518 .addImm(Val: AArch64PACKey::IA)
3519 .addReg(/*AddrDisc=*/RegNo: AArch64::XZR)
3520 .addImm(Val: *BADisc)
3521 .constrainAllUses(TII, TRI, RBI);
3522 MIB.buildCopy(Res: I.getOperand(i: 0).getReg(), Op: Register(AArch64::X16));
3523 RBI.constrainGenericRegister(Reg: I.getOperand(i: 0).getReg(),
3524 RC: AArch64::GPR64RegClass, MRI);
3525 I.eraseFromParent();
3526 return true;
3527 }
3528 if (TM.getCodeModel() == CodeModel::Large && !TM.isPositionIndependent()) {
3529 materializeLargeCMVal(I, V: I.getOperand(i: 1).getBlockAddress(), OpFlags: 0);
3530 I.eraseFromParent();
3531 return true;
3532 } else {
3533 I.setDesc(TII.get(Opcode: AArch64::MOVaddrBA));
3534 auto MovMI = BuildMI(BB&: MBB, I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AArch64::MOVaddrBA),
3535 DestReg: I.getOperand(i: 0).getReg())
3536 .addBlockAddress(BA: I.getOperand(i: 1).getBlockAddress(),
3537 /* Offset */ 0, TargetFlags: AArch64II::MO_PAGE)
3538 .addBlockAddress(
3539 BA: I.getOperand(i: 1).getBlockAddress(), /* Offset */ 0,
3540 TargetFlags: AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3541 I.eraseFromParent();
3542 constrainSelectedInstRegOperands(I&: *MovMI, TII, TRI, RBI);
3543 return true;
3544 }
3545 }
3546 case AArch64::G_DUP: {
3547 // When the scalar of G_DUP is an s8/s16 gpr, they can't be selected by
3548 // imported patterns. Do it manually here. Avoiding generating s16 gpr is
3549 // difficult because at RBS we may end up pessimizing the fpr case if we
3550 // decided to add an anyextend to fix this. Manual selection is the most
3551 // robust solution for now.
3552 if (RBI.getRegBank(Reg: I.getOperand(i: 1).getReg(), MRI, TRI)->getID() !=
3553 AArch64::GPRRegBankID)
3554 return false; // We expect the fpr regbank case to be imported.
3555 LLT VecTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
3556 if (VecTy == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 8))
3557 I.setDesc(TII.get(Opcode: AArch64::DUPv8i8gpr));
3558 else if (VecTy == LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 8))
3559 I.setDesc(TII.get(Opcode: AArch64::DUPv16i8gpr));
3560 else if (VecTy == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16))
3561 I.setDesc(TII.get(Opcode: AArch64::DUPv4i16gpr));
3562 else if (VecTy == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16))
3563 I.setDesc(TII.get(Opcode: AArch64::DUPv8i16gpr));
3564 else
3565 return false;
3566 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3567 return true;
3568 }
3569 case TargetOpcode::G_BUILD_VECTOR:
3570 return selectBuildVector(I, MRI);
3571 case TargetOpcode::G_MERGE_VALUES:
3572 return selectMergeValues(I, MRI);
3573 case TargetOpcode::G_UNMERGE_VALUES:
3574 return selectUnmergeValues(I, MRI);
3575 case TargetOpcode::G_SHUFFLE_VECTOR:
3576 return selectShuffleVector(I, MRI);
3577 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3578 return selectExtractElt(I, MRI);
3579 case TargetOpcode::G_CONCAT_VECTORS:
3580 return selectConcatVectors(I, MRI);
3581 case TargetOpcode::G_JUMP_TABLE:
3582 return selectJumpTable(I, MRI);
3583 case TargetOpcode::G_MEMCPY:
3584 case TargetOpcode::G_MEMCPY_INLINE:
3585 case TargetOpcode::G_MEMMOVE:
3586 case TargetOpcode::G_MEMSET:
3587 assert(STI.hasMOPS() && "Shouldn't get here without +mops feature");
3588 return selectMOPS(I, MRI);
3589 }
3590
3591 return false;
3592}
3593
3594bool AArch64InstructionSelector::selectAndRestoreState(MachineInstr &I) {
3595 MachineIRBuilderState OldMIBState = MIB.getState();
3596 bool Success = select(I);
3597 MIB.setState(OldMIBState);
3598 return Success;
3599}
3600
3601bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI,
3602 MachineRegisterInfo &MRI) {
3603 unsigned Mopcode;
3604 switch (GI.getOpcode()) {
3605 case TargetOpcode::G_MEMCPY:
3606 case TargetOpcode::G_MEMCPY_INLINE:
3607 Mopcode = AArch64::MOPSMemoryCopyPseudo;
3608 break;
3609 case TargetOpcode::G_MEMMOVE:
3610 Mopcode = AArch64::MOPSMemoryMovePseudo;
3611 break;
3612 case TargetOpcode::G_MEMSET:
3613 // For tagged memset see llvm.aarch64.mops.memset.tag
3614 Mopcode = AArch64::MOPSMemorySetPseudo;
3615 break;
3616 }
3617
3618 auto &DstPtr = GI.getOperand(i: 0);
3619 auto &SrcOrVal = GI.getOperand(i: 1);
3620 auto &Size = GI.getOperand(i: 2);
3621
3622 // Create copies of the registers that can be clobbered.
3623 const Register DstPtrCopy = MRI.cloneVirtualRegister(VReg: DstPtr.getReg());
3624 const Register SrcValCopy = MRI.cloneVirtualRegister(VReg: SrcOrVal.getReg());
3625 const Register SizeCopy = MRI.cloneVirtualRegister(VReg: Size.getReg());
3626
3627 const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo;
3628 const auto &SrcValRegClass =
3629 IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass;
3630
3631 // Constrain to specific registers
3632 RBI.constrainGenericRegister(Reg: DstPtrCopy, RC: AArch64::GPR64commonRegClass, MRI);
3633 RBI.constrainGenericRegister(Reg: SrcValCopy, RC: SrcValRegClass, MRI);
3634 RBI.constrainGenericRegister(Reg: SizeCopy, RC: AArch64::GPR64RegClass, MRI);
3635
3636 MIB.buildCopy(Res: DstPtrCopy, Op: DstPtr);
3637 MIB.buildCopy(Res: SrcValCopy, Op: SrcOrVal);
3638 MIB.buildCopy(Res: SizeCopy, Op: Size);
3639
3640 // New instruction uses the copied registers because it must update them.
3641 // The defs are not used since they don't exist in G_MEM*. They are still
3642 // tied.
3643 // Note: order of operands is different from G_MEMSET, G_MEMCPY, G_MEMMOVE
3644 Register DefDstPtr = MRI.createVirtualRegister(RegClass: &AArch64::GPR64commonRegClass);
3645 Register DefSize = MRI.createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
3646 if (IsSet) {
3647 MIB.buildInstr(Opc: Mopcode, DstOps: {DefDstPtr, DefSize},
3648 SrcOps: {DstPtrCopy, SizeCopy, SrcValCopy});
3649 } else {
3650 Register DefSrcPtr = MRI.createVirtualRegister(RegClass: &SrcValRegClass);
3651 MIB.buildInstr(Opc: Mopcode, DstOps: {DefDstPtr, DefSrcPtr, DefSize},
3652 SrcOps: {DstPtrCopy, SrcValCopy, SizeCopy});
3653 }
3654
3655 GI.eraseFromParent();
3656 return true;
3657}
3658
/// Select G_BRJT (jump-table dispatch). Emits either the hardened
/// BR_JumpTable pseudo (expanded late to protect intermediate values) or a
/// JumpTableDest32 + BR sequence.
bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
                                            MachineRegisterInfo &MRI) {
  assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
  Register JTAddr = I.getOperand(i: 0).getReg();
  unsigned JTI = I.getOperand(i: 1).getIndex();
  Register Index = I.getOperand(i: 2).getReg();

  // Record that each jump-table entry is 4 bytes wide (no PC-relative symbol).
  MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(Idx: JTI, Size: 4, PCRelSym: nullptr);

  // With aarch64-jump-table-hardening, we only expand the jump table dispatch
  // sequence later, to guarantee the integrity of the intermediate values.
  if (MF->getFunction().hasFnAttribute(Kind: "aarch64-jump-table-hardening")) {
    CodeModel::Model CM = TM.getCodeModel();
    if (STI.isTargetMachO()) {
      if (CM != CodeModel::Small && CM != CodeModel::Large)
        report_fatal_error(reason: "Unsupported code-model for hardened jump-table");
    } else {
      // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
      assert(STI.isTargetELF() &&
             "jump table hardening only supported on MachO/ELF");
      if (CM != CodeModel::Small)
        report_fatal_error(reason: "Unsupported code-model for hardened jump-table");
    }

    // The BR_JumpTable pseudo takes the index in the fixed register X16.
    MIB.buildCopy(Res: {AArch64::X16}, Op: I.getOperand(i: 2).getReg());
    MIB.buildInstr(Opcode: AArch64::BR_JumpTable)
        .addJumpTableIndex(Idx: I.getOperand(i: 1).getIndex());
    I.eraseFromParent();
    return true;
  }

  // Non-hardened path: compute the destination address into TargetReg
  // (ScratchReg is a scratch output of the pseudo), then branch indirectly.
  Register TargetReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
  Register ScratchReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR64spRegClass);

  auto JumpTableInst = MIB.buildInstr(Opc: AArch64::JumpTableDest32,
                                      DstOps: {TargetReg, ScratchReg}, SrcOps: {JTAddr, Index})
                           .addJumpTableIndex(Idx: JTI);
  // Save the jump table info.
  MIB.buildInstr(Opc: TargetOpcode::JUMP_TABLE_DEBUG_INFO, DstOps: {},
                 SrcOps: {static_cast<int64_t>(JTI)});
  // Build the indirect branch.
  MIB.buildInstr(Opc: AArch64::BR, DstOps: {}, SrcOps: {TargetReg});
  I.eraseFromParent();
  constrainSelectedInstRegOperands(I&: *JumpTableInst, TII, TRI, RBI);
  return true;
}
3705
3706bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I,
3707 MachineRegisterInfo &MRI) {
3708 assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
3709 assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
3710
3711 Register DstReg = I.getOperand(i: 0).getReg();
3712 unsigned JTI = I.getOperand(i: 1).getIndex();
3713 // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
3714 auto MovMI =
3715 MIB.buildInstr(Opc: AArch64::MOVaddrJT, DstOps: {DstReg}, SrcOps: {})
3716 .addJumpTableIndex(Idx: JTI, TargetFlags: AArch64II::MO_PAGE)
3717 .addJumpTableIndex(Idx: JTI, TargetFlags: AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3718 I.eraseFromParent();
3719 constrainSelectedInstRegOperands(I&: *MovMI, TII, TRI, RBI);
3720 return true;
3721}
3722
/// Select a TLS global value (MachO only): load the TLV descriptor from the
/// GOT, load the thunk pointer from it, and call the thunk with the
/// descriptor address in X0; the result comes back in X0.
bool AArch64InstructionSelector::selectTLSGlobalValue(
    MachineInstr &I, MachineRegisterInfo &MRI) {
  if (!STI.isTargetMachO())
    return false;
  MachineFunction &MF = *I.getParent()->getParent();
  // The sequence below contains a call, so the frame must allow stack
  // adjustment.
  MF.getFrameInfo().setAdjustsStack(true);

  const auto &GlobalOp = I.getOperand(i: 1);
  assert(GlobalOp.getOffset() == 0 &&
         "Shouldn't have an offset on TLS globals!");
  const GlobalValue &GV = *GlobalOp.getGlobal();

  // Load the address of the TLV descriptor from the GOT.
  auto LoadGOT =
      MIB.buildInstr(Opc: AArch64::LOADgot, DstOps: {&AArch64::GPR64commonRegClass}, SrcOps: {})
          .addGlobalAddress(GV: &GV, Offset: 0, TargetFlags: AArch64II::MO_TLS);

  // Load the thunk function pointer stored at offset 0 of the descriptor.
  auto Load = MIB.buildInstr(Opc: AArch64::LDRXui, DstOps: {&AArch64::GPR64commonRegClass},
                             SrcOps: {LoadGOT.getReg(Idx: 0)})
                  .addImm(Val: 0);

  // The thunk takes the descriptor address as its argument in X0.
  MIB.buildCopy(Res: Register(AArch64::X0), Op: LoadGOT.getReg(Idx: 0));
  // TLS calls preserve all registers except those that absolutely must be
  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
  // silly).
  unsigned Opcode = getBLRCallOpcode(MF);

  // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
  if (MF.getFunction().hasFnAttribute(Kind: "ptrauth-calls")) {
    assert(Opcode == AArch64::BLR);
    Opcode = AArch64::BLRAAZ;
  }

  // Indirect call through the thunk pointer; X0 is both an implicit use
  // (argument) and an implicit def (return value).
  MIB.buildInstr(Opc: Opcode, DstOps: {}, SrcOps: {Load})
      .addUse(RegNo: AArch64::X0, Flags: RegState::Implicit)
      .addDef(RegNo: AArch64::X0, Flags: RegState::Implicit)
      .addRegMask(Mask: TRI.getTLSCallPreservedMask());

  MIB.buildCopy(Res: I.getOperand(i: 0).getReg(), Op: Register(AArch64::X0));
  RBI.constrainGenericRegister(Reg: I.getOperand(i: 0).getReg(), RC: AArch64::GPR64RegClass,
                               MRI);
  I.eraseFromParent();
  return true;
}
3766
3767MachineInstr *AArch64InstructionSelector::emitScalarToVector(
3768 unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
3769 MachineIRBuilder &MIRBuilder) const {
3770 auto Undef = MIRBuilder.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {DstRC}, SrcOps: {});
3771
3772 auto BuildFn = [&](unsigned SubregIndex) {
3773 auto Ins =
3774 MIRBuilder
3775 .buildInstr(Opc: TargetOpcode::INSERT_SUBREG, DstOps: {DstRC}, SrcOps: {Undef, Scalar})
3776 .addImm(Val: SubregIndex);
3777 constrainSelectedInstRegOperands(I&: *Undef, TII, TRI, RBI);
3778 constrainSelectedInstRegOperands(I&: *Ins, TII, TRI, RBI);
3779 return &*Ins;
3780 };
3781
3782 switch (EltSize) {
3783 case 8:
3784 return BuildFn(AArch64::bsub);
3785 case 16:
3786 return BuildFn(AArch64::hsub);
3787 case 32:
3788 return BuildFn(AArch64::ssub);
3789 case 64:
3790 return BuildFn(AArch64::dsub);
3791 default:
3792 return nullptr;
3793 }
3794}
3795
3796MachineInstr *
3797AArch64InstructionSelector::emitNarrowVector(Register DstReg, Register SrcReg,
3798 MachineIRBuilder &MIB,
3799 MachineRegisterInfo &MRI) const {
3800 LLT DstTy = MRI.getType(Reg: DstReg);
3801 const TargetRegisterClass *RC =
3802 getRegClassForTypeOnBank(Ty: DstTy, RB: *RBI.getRegBank(Reg: SrcReg, MRI, TRI));
3803 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
3804 LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
3805 return nullptr;
3806 }
3807 unsigned SubReg = 0;
3808 if (!getSubRegForClass(RC, TRI, SubReg))
3809 return nullptr;
3810 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
3811 LLVM_DEBUG(dbgs() << "Unsupported destination size! ("
3812 << DstTy.getSizeInBits() << "\n");
3813 return nullptr;
3814 }
3815 auto Copy = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {DstReg}, SrcOps: {})
3816 .addReg(RegNo: SrcReg, Flags: {}, SubReg);
3817 RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI);
3818 return Copy;
3819}
3820
/// Select G_MERGE_VALUES for two scalar sources: either two s64s into an
/// s128 via lane inserts, or two GPR s32s into an s64 via SUBREG_TO_REG +
/// BFMXri.
bool AArch64InstructionSelector::selectMergeValues(
    MachineInstr &I, MachineRegisterInfo &MRI) {
  assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
  const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
  const LLT SrcTy = MRI.getType(Reg: I.getOperand(i: 1).getReg());
  assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
  const RegisterBank &RB = *RBI.getRegBank(Reg: I.getOperand(i: 1).getReg(), MRI, TRI);

  // Only the two-source form (dst + 2 srcs) is handled here.
  if (I.getNumOperands() != 3)
    return false;

  // Merging 2 s64s into an s128.
  if (DstTy == LLT::scalar(SizeInBits: 128)) {
    if (SrcTy.getSizeInBits() != 64)
      return false;
    Register DstReg = I.getOperand(i: 0).getReg();
    Register Src1Reg = I.getOperand(i: 1).getReg();
    Register Src2Reg = I.getOperand(i: 2).getReg();
    // Insert the first source into lane 0 of an undef value, then the second
    // into lane 1 of the result.
    auto Tmp = MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {DstTy}, SrcOps: {});
    MachineInstr *InsMI = emitLaneInsert(DstReg: std::nullopt, SrcReg: Tmp.getReg(Idx: 0), EltReg: Src1Reg,
                                         /* LaneIdx */ 0, RB, MIRBuilder&: MIB);
    if (!InsMI)
      return false;
    MachineInstr *Ins2MI = emitLaneInsert(DstReg, SrcReg: InsMI->getOperand(i: 0).getReg(),
                                          EltReg: Src2Reg, /* LaneIdx */ 1, RB, MIRBuilder&: MIB);
    if (!Ins2MI)
      return false;
    constrainSelectedInstRegOperands(I&: *InsMI, TII, TRI, RBI);
    constrainSelectedInstRegOperands(I&: *Ins2MI, TII, TRI, RBI);
    I.eraseFromParent();
    return true;
  }

  // The remaining path only handles GPR s32 -> s64 merges.
  if (RB.getID() != AArch64::GPRRegBankID)
    return false;

  if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
    return false;

  // Widen both 32-bit sources to 64 bits with SUBREG_TO_REG, then combine
  // them with a bitfield move placing the second source in the high half.
  auto *DstRC = &AArch64::GPR64RegClass;
  Register SubToRegDef = MRI.createVirtualRegister(RegClass: DstRC);
  MachineInstr &SubRegMI = *BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(),
                                    MCID: TII.get(Opcode: TargetOpcode::SUBREG_TO_REG))
                                .addDef(RegNo: SubToRegDef)
                                .addUse(RegNo: I.getOperand(i: 1).getReg())
                                .addImm(Val: AArch64::sub_32);
  Register SubToRegDef2 = MRI.createVirtualRegister(RegClass: DstRC);
  // Need to anyext the second scalar before we can use bfm
  MachineInstr &SubRegMI2 = *BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(),
                                     MCID: TII.get(Opcode: TargetOpcode::SUBREG_TO_REG))
                                 .addDef(RegNo: SubToRegDef2)
                                 .addUse(RegNo: I.getOperand(i: 2).getReg())
                                 .addImm(Val: AArch64::sub_32);
  MachineInstr &BFM =
      *BuildMI(BB&: *I.getParent(), I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: AArch64::BFMXri))
           .addDef(RegNo: I.getOperand(i: 0).getReg())
           .addUse(RegNo: SubToRegDef)
           .addUse(RegNo: SubToRegDef2)
           .addImm(Val: 32)
           .addImm(Val: 31);
  constrainSelectedInstRegOperands(I&: SubRegMI, TII, TRI, RBI);
  constrainSelectedInstRegOperands(I&: SubRegMI2, TII, TRI, RBI);
  constrainSelectedInstRegOperands(I&: BFM, TII, TRI, RBI);
  I.eraseFromParent();
  return true;
}
3887
3888static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
3889 const unsigned EltSize) {
3890 // Choose a lane copy opcode and subregister based off of the size of the
3891 // vector's elements.
3892 switch (EltSize) {
3893 case 8:
3894 CopyOpc = AArch64::DUPi8;
3895 ExtractSubReg = AArch64::bsub;
3896 break;
3897 case 16:
3898 CopyOpc = AArch64::DUPi16;
3899 ExtractSubReg = AArch64::hsub;
3900 break;
3901 case 32:
3902 CopyOpc = AArch64::DUPi32;
3903 ExtractSubReg = AArch64::ssub;
3904 break;
3905 case 64:
3906 CopyOpc = AArch64::DUPi64;
3907 ExtractSubReg = AArch64::dsub;
3908 break;
3909 default:
3910 // Unknown size, bail out.
3911 LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
3912 return false;
3913 }
3914 return true;
3915}
3916
/// Extract lane \p LaneIdx of \p VecReg into \p DstReg (creating a fresh
/// destination if none is given). Lane 0 becomes a subregister copy; other
/// lanes use a DUP lane copy, widening sub-128-bit vectors first.
MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
    std::optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
    Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  unsigned CopyOpc = 0;
  unsigned ExtractSubReg = 0;
  // Choose the DUP opcode and subregister from the element size.
  if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, EltSize: ScalarTy.getSizeInBits())) {
    LLVM_DEBUG(
        dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
    return nullptr;
  }

  const TargetRegisterClass *DstRC =
      getRegClassForTypeOnBank(Ty: ScalarTy, RB: DstRB, GetAllRegSet: true);
  if (!DstRC) {
    LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
    return nullptr;
  }

  const RegisterBank &VecRB = *RBI.getRegBank(Reg: VecReg, MRI, TRI);
  const LLT &VecTy = MRI.getType(Reg: VecReg);
  const TargetRegisterClass *VecRC =
      getRegClassForTypeOnBank(Ty: VecTy, RB: VecRB, GetAllRegSet: true);
  if (!VecRC) {
    LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
    return nullptr;
  }

  // The register that we're going to copy into.
  Register InsertReg = VecReg;
  if (!DstReg)
    DstReg = MRI.createVirtualRegister(RegClass: DstRC);
  // If the lane index is 0, we just use a subregister COPY.
  if (LaneIdx == 0) {
    auto Copy = MIRBuilder.buildInstr(Opc: TargetOpcode::COPY, DstOps: {*DstReg}, SrcOps: {})
                    .addReg(RegNo: VecReg, Flags: {}, SubReg: ExtractSubReg);
    RBI.constrainGenericRegister(Reg: *DstReg, RC: *DstRC, MRI);
    return &*Copy;
  }

  // Lane copies require 128-bit wide registers. If we're dealing with an
  // unpacked vector, then we need to move up to that width. Insert an implicit
  // def and a subregister insert to get us there.
  if (VecTy.getSizeInBits() != 128) {
    MachineInstr *ScalarToVector = emitScalarToVector(
        EltSize: VecTy.getSizeInBits(), DstRC: &AArch64::FPR128RegClass, Scalar: VecReg, MIRBuilder);
    if (!ScalarToVector)
      return nullptr;
    InsertReg = ScalarToVector->getOperand(i: 0).getReg();
  }

  MachineInstr *LaneCopyMI =
      MIRBuilder.buildInstr(Opc: CopyOpc, DstOps: {*DstReg}, SrcOps: {InsertReg}).addImm(Val: LaneIdx);
  constrainSelectedInstRegOperands(I&: *LaneCopyMI, TII, TRI, RBI);

  // Make sure that we actually constrain the initial copy.
  RBI.constrainGenericRegister(Reg: *DstReg, RC: *DstRC, MRI);
  return LaneCopyMI;
}
3976
3977bool AArch64InstructionSelector::selectExtractElt(
3978 MachineInstr &I, MachineRegisterInfo &MRI) {
3979 assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
3980 "unexpected opcode!");
3981 Register DstReg = I.getOperand(i: 0).getReg();
3982 const LLT NarrowTy = MRI.getType(Reg: DstReg);
3983 const Register SrcReg = I.getOperand(i: 1).getReg();
3984 const LLT WideTy = MRI.getType(Reg: SrcReg);
3985 (void)WideTy;
3986 assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
3987 "source register size too small!");
3988 assert(!NarrowTy.isVector() && "cannot extract vector into vector!");
3989
3990 // Need the lane index to determine the correct copy opcode.
3991 MachineOperand &LaneIdxOp = I.getOperand(i: 2);
3992 assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");
3993
3994 if (RBI.getRegBank(Reg: DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
3995 LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
3996 return false;
3997 }
3998
3999 // Find the index to extract from.
4000 auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: LaneIdxOp.getReg(), MRI);
4001 if (!VRegAndVal)
4002 return false;
4003 unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
4004
4005
4006 const RegisterBank &DstRB = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
4007 MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, ScalarTy: NarrowTy, VecReg: SrcReg,
4008 LaneIdx, MIRBuilder&: MIB);
4009 if (!Extract)
4010 return false;
4011
4012 I.eraseFromParent();
4013 return true;
4014}
4015
4016bool AArch64InstructionSelector::selectSplitVectorUnmerge(
4017 MachineInstr &I, MachineRegisterInfo &MRI) {
4018 unsigned NumElts = I.getNumOperands() - 1;
4019 Register SrcReg = I.getOperand(i: NumElts).getReg();
4020 const LLT NarrowTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
4021 const LLT SrcTy = MRI.getType(Reg: SrcReg);
4022
4023 assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
4024 if (SrcTy.getSizeInBits() > 128) {
4025 LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
4026 return false;
4027 }
4028
4029 // We implement a split vector operation by treating the sub-vectors as
4030 // scalars and extracting them.
4031 const RegisterBank &DstRB =
4032 *RBI.getRegBank(Reg: I.getOperand(i: 0).getReg(), MRI, TRI);
4033 for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
4034 Register Dst = I.getOperand(i: OpIdx).getReg();
4035 MachineInstr *Extract =
4036 emitExtractVectorElt(DstReg: Dst, DstRB, ScalarTy: NarrowTy, VecReg: SrcReg, LaneIdx: OpIdx, MIRBuilder&: MIB);
4037 if (!Extract)
4038 return false;
4039 }
4040 I.eraseFromParent();
4041 return true;
4042}
4043
4044bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
4045 MachineRegisterInfo &MRI) {
4046 assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
4047 "unexpected opcode");
4048
4049 // TODO: Handle unmerging into GPRs and from scalars to scalars.
4050 if (RBI.getRegBank(Reg: I.getOperand(i: 0).getReg(), MRI, TRI)->getID() !=
4051 AArch64::FPRRegBankID ||
4052 RBI.getRegBank(Reg: I.getOperand(i: 1).getReg(), MRI, TRI)->getID() !=
4053 AArch64::FPRRegBankID) {
4054 LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
4055 "currently unsupported.\n");
4056 return false;
4057 }
4058
4059 // The last operand is the vector source register, and every other operand is
4060 // a register to unpack into.
4061 unsigned NumElts = I.getNumOperands() - 1;
4062 Register SrcReg = I.getOperand(i: NumElts).getReg();
4063 const LLT NarrowTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
4064 const LLT WideTy = MRI.getType(Reg: SrcReg);
4065
4066 assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
4067 "source register size too small!");
4068
4069 if (!NarrowTy.isScalar())
4070 return selectSplitVectorUnmerge(I, MRI);
4071
4072 // Choose a lane copy opcode and subregister based off of the size of the
4073 // vector's elements.
4074 unsigned CopyOpc = 0;
4075 unsigned ExtractSubReg = 0;
4076 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, EltSize: NarrowTy.getSizeInBits()))
4077 return false;
4078
4079 // Set up for the lane copies.
4080 MachineBasicBlock &MBB = *I.getParent();
4081
4082 // Stores the registers we'll be copying from.
4083 SmallVector<Register, 4> InsertRegs;
4084
4085 // We'll use the first register twice, so we only need NumElts-1 registers.
4086 unsigned NumInsertRegs = NumElts - 1;
4087
4088 // If our elements fit into exactly 128 bits, then we can copy from the source
4089 // directly. Otherwise, we need to do a bit of setup with some subregister
4090 // inserts.
4091 if (NarrowTy.getSizeInBits() * NumElts == 128) {
4092 InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
4093 } else {
4094 // No. We have to perform subregister inserts. For each insert, create an
4095 // implicit def and a subregister insert, and save the register we create.
4096 // For scalar sources, treat as a pseudo-vector of NarrowTy elements.
4097 unsigned EltSize = WideTy.isVector() ? WideTy.getScalarSizeInBits()
4098 : NarrowTy.getSizeInBits();
4099 const TargetRegisterClass *RC = getRegClassForTypeOnBank(
4100 Ty: LLT::fixed_vector(NumElements: NumElts, ScalarSizeInBits: EltSize), RB: *RBI.getRegBank(Reg: SrcReg, MRI, TRI));
4101 unsigned SubReg = 0;
4102 bool Found = getSubRegForClass(RC, TRI, SubReg);
4103 (void)Found;
4104 assert(Found && "expected to find last operand's subeg idx");
4105 for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
4106 Register ImpDefReg = MRI.createVirtualRegister(RegClass: &AArch64::FPR128RegClass);
4107 MachineInstr &ImpDefMI =
4108 *BuildMI(BB&: MBB, I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: TargetOpcode::IMPLICIT_DEF),
4109 DestReg: ImpDefReg);
4110
4111 // Now, create the subregister insert from SrcReg.
4112 Register InsertReg = MRI.createVirtualRegister(RegClass: &AArch64::FPR128RegClass);
4113 MachineInstr &InsMI =
4114 *BuildMI(BB&: MBB, I, MIMD: I.getDebugLoc(),
4115 MCID: TII.get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: InsertReg)
4116 .addUse(RegNo: ImpDefReg)
4117 .addUse(RegNo: SrcReg)
4118 .addImm(Val: SubReg);
4119
4120 constrainSelectedInstRegOperands(I&: ImpDefMI, TII, TRI, RBI);
4121 constrainSelectedInstRegOperands(I&: InsMI, TII, TRI, RBI);
4122
4123 // Save the register so that we can copy from it after.
4124 InsertRegs.push_back(Elt: InsertReg);
4125 }
4126 }
4127
4128 // Now that we've created any necessary subregister inserts, we can
4129 // create the copies.
4130 //
4131 // Perform the first copy separately as a subregister copy.
4132 Register CopyTo = I.getOperand(i: 0).getReg();
4133 auto FirstCopy = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {CopyTo}, SrcOps: {})
4134 .addReg(RegNo: InsertRegs[0], Flags: {}, SubReg: ExtractSubReg);
4135 constrainSelectedInstRegOperands(I&: *FirstCopy, TII, TRI, RBI);
4136
4137 // Now, perform the remaining copies as vector lane copies.
4138 unsigned LaneIdx = 1;
4139 for (Register InsReg : InsertRegs) {
4140 Register CopyTo = I.getOperand(i: LaneIdx).getReg();
4141 MachineInstr &CopyInst =
4142 *BuildMI(BB&: MBB, I, MIMD: I.getDebugLoc(), MCID: TII.get(Opcode: CopyOpc), DestReg: CopyTo)
4143 .addUse(RegNo: InsReg)
4144 .addImm(Val: LaneIdx);
4145 constrainSelectedInstRegOperands(I&: CopyInst, TII, TRI, RBI);
4146 ++LaneIdx;
4147 }
4148
4149 // Separately constrain the first copy's destination. Because of the
4150 // limitation in constrainOperandRegClass, we can't guarantee that this will
4151 // actually be constrained. So, do it ourselves using the second operand.
4152 const TargetRegisterClass *RC =
4153 MRI.getRegClassOrNull(Reg: I.getOperand(i: 1).getReg());
4154 if (!RC) {
4155 LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
4156 return false;
4157 }
4158
4159 RBI.constrainGenericRegister(Reg: CopyTo, RC: *RC, MRI);
4160 I.eraseFromParent();
4161 return true;
4162}
4163
4164bool AArch64InstructionSelector::selectConcatVectors(
4165 MachineInstr &I, MachineRegisterInfo &MRI) {
4166 assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
4167 "Unexpected opcode");
4168 Register Dst = I.getOperand(i: 0).getReg();
4169 Register Op1 = I.getOperand(i: 1).getReg();
4170 Register Op2 = I.getOperand(i: 2).getReg();
4171 MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIRBuilder&: MIB);
4172 if (!ConcatMI)
4173 return false;
4174 I.eraseFromParent();
4175 return true;
4176}
4177
4178unsigned
4179AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
4180 MachineFunction &MF) const {
4181 Type *CPTy = CPVal->getType();
4182 Align Alignment = MF.getDataLayout().getPrefTypeAlign(Ty: CPTy);
4183
4184 MachineConstantPool *MCP = MF.getConstantPool();
4185 return MCP->getConstantPoolIndex(C: CPVal, Alignment);
4186}
4187
/// Materialize \p CPVal by emitting a load from the function's constant pool.
/// Returns the load instruction, or nullptr if the constant's store size has
/// no matching FPR load form.
MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
    const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
  const TargetRegisterClass *RC;
  unsigned Opc;
  bool IsTiny = TM.getCodeModel() == CodeModel::Tiny;
  unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(Ty: CPVal->getType());
  // Pick the FPR register class and load opcode for the constant's size. The
  // tiny code model can use the pc-relative LDR*l (load-literal) forms.
  switch (Size) {
  case 16:
    RC = &AArch64::FPR128RegClass;
    Opc = IsTiny ? AArch64::LDRQl : AArch64::LDRQui;
    break;
  case 8:
    RC = &AArch64::FPR64RegClass;
    Opc = IsTiny ? AArch64::LDRDl : AArch64::LDRDui;
    break;
  case 4:
    RC = &AArch64::FPR32RegClass;
    Opc = IsTiny ? AArch64::LDRSl : AArch64::LDRSui;
    break;
  case 2:
    RC = &AArch64::FPR16RegClass;
    // Note: no load-literal form is used for 2 bytes, even under Tiny.
    Opc = AArch64::LDRHui;
    break;
  default:
    LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
                      << *CPVal->getType());
    return nullptr;
  }

  MachineInstr *LoadMI = nullptr;
  auto &MF = MIRBuilder.getMF();
  unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
  if (IsTiny && (Size == 16 || Size == 8 || Size == 4)) {
    // Use load(literal) for tiny code model.
    LoadMI = &*MIRBuilder.buildInstr(Opc, DstOps: {RC}, SrcOps: {}).addConstantPoolIndex(Idx: CPIdx);
  } else {
    // Otherwise form the address with ADRP (page) plus a low-12-bit page
    // offset folded into the load.
    auto Adrp =
        MIRBuilder.buildInstr(Opc: AArch64::ADRP, DstOps: {&AArch64::GPR64RegClass}, SrcOps: {})
            .addConstantPoolIndex(Idx: CPIdx, Offset: 0, TargetFlags: AArch64II::MO_PAGE);

    LoadMI = &*MIRBuilder.buildInstr(Opc, DstOps: {RC}, SrcOps: {Adrp})
                   .addConstantPoolIndex(
                       Idx: CPIdx, Offset: 0, TargetFlags: AArch64II::MO_PAGEOFF | AArch64II::MO_NC);

    constrainSelectedInstRegOperands(I&: *Adrp, TII, TRI, RBI);
  }

  // Attach a memory operand so later passes know this is a constant-pool load.
  MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
  LoadMI->addMemOperand(MF, MO: MF.getMachineMemOperand(PtrInfo,
                                                      F: MachineMemOperand::MOLoad,
                                                      Size, BaseAlignment: Align(Size)));
  constrainSelectedInstRegOperands(I&: *LoadMI, TII, TRI, RBI);
  return LoadMI;
}
4242
4243/// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given
4244/// size and RB.
4245static std::pair<unsigned, unsigned>
4246getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
4247 unsigned Opc, SubregIdx;
4248 if (RB.getID() == AArch64::GPRRegBankID) {
4249 if (EltSize == 8) {
4250 Opc = AArch64::INSvi8gpr;
4251 SubregIdx = AArch64::bsub;
4252 } else if (EltSize == 16) {
4253 Opc = AArch64::INSvi16gpr;
4254 SubregIdx = AArch64::ssub;
4255 } else if (EltSize == 32) {
4256 Opc = AArch64::INSvi32gpr;
4257 SubregIdx = AArch64::ssub;
4258 } else if (EltSize == 64) {
4259 Opc = AArch64::INSvi64gpr;
4260 SubregIdx = AArch64::dsub;
4261 } else {
4262 llvm_unreachable("invalid elt size!");
4263 }
4264 } else {
4265 if (EltSize == 8) {
4266 Opc = AArch64::INSvi8lane;
4267 SubregIdx = AArch64::bsub;
4268 } else if (EltSize == 16) {
4269 Opc = AArch64::INSvi16lane;
4270 SubregIdx = AArch64::hsub;
4271 } else if (EltSize == 32) {
4272 Opc = AArch64::INSvi32lane;
4273 SubregIdx = AArch64::ssub;
4274 } else if (EltSize == 64) {
4275 Opc = AArch64::INSvi64lane;
4276 SubregIdx = AArch64::dsub;
4277 } else {
4278 llvm_unreachable("invalid elt size!");
4279 }
4280 }
4281 return std::make_pair(x&: Opc, y&: SubregIdx);
4282}
4283
4284MachineInstr *AArch64InstructionSelector::emitInstr(
4285 unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
4286 std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder,
4287 const ComplexRendererFns &RenderFns) const {
4288 assert(Opcode && "Expected an opcode?");
4289 assert(!isPreISelGenericOpcode(Opcode) &&
4290 "Function should only be used to produce selected instructions!");
4291 auto MI = MIRBuilder.buildInstr(Opc: Opcode, DstOps, SrcOps);
4292 if (RenderFns)
4293 for (auto &Fn : *RenderFns)
4294 Fn(MI);
4295 constrainSelectedInstRegOperands(I&: *MI, TII, TRI, RBI);
4296 return &*MI;
4297}
4298
/// Shared emitter for the ADD/SUB instruction family.
/// \p AddrModeAndSizeToOpcode is a 5x2 opcode table: rows are addressing
/// modes {0: immediate, 1: shifted register, 2: register-register,
/// 3: negated immediate, 4: extended register}; columns are
/// {0: 64-bit, 1: 32-bit} forms. Addressing modes are tried in a fixed
/// preference order before falling back to the plain rr form.
MachineInstr *AArch64InstructionSelector::emitAddSub(
    const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
    Register Dst, MachineOperand &LHS, MachineOperand &RHS,
    MachineIRBuilder &MIRBuilder) const {
  MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
  assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
  auto Ty = MRI.getType(Reg: LHS.getReg());
  assert(!Ty.isVector() && "Expected a scalar or pointer?");
  unsigned Size = Ty.getSizeInBits();
  assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only");
  bool Is32Bit = Size == 32;

  // INSTRri form with positive arithmetic immediate.
  if (auto Fns = selectArithImmed(Root&: RHS))
    return emitInstr(Opcode: AddrModeAndSizeToOpcode[0][Is32Bit], DstOps: {Dst}, SrcOps: {LHS},
                     MIRBuilder, RenderFns: Fns);

  // INSTRri form with negative arithmetic immediate.
  if (auto Fns = selectNegArithImmed(Root&: RHS))
    return emitInstr(Opcode: AddrModeAndSizeToOpcode[3][Is32Bit], DstOps: {Dst}, SrcOps: {LHS},
                     MIRBuilder, RenderFns: Fns);

  // INSTRrx form.
  if (auto Fns = selectArithExtendedRegister(Root&: RHS))
    return emitInstr(Opcode: AddrModeAndSizeToOpcode[4][Is32Bit], DstOps: {Dst}, SrcOps: {LHS},
                     MIRBuilder, RenderFns: Fns);

  // INSTRrs form.
  if (auto Fns = selectShiftedRegister(Root&: RHS))
    return emitInstr(Opcode: AddrModeAndSizeToOpcode[1][Is32Bit], DstOps: {Dst}, SrcOps: {LHS},
                     MIRBuilder, RenderFns: Fns);
  // Fall back to the plain register-register form.
  return emitInstr(Opcode: AddrModeAndSizeToOpcode[2][Is32Bit], DstOps: {Dst}, SrcOps: {LHS, RHS},
                   MIRBuilder);
}
4333
4334MachineInstr *
4335AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
4336 MachineOperand &RHS,
4337 MachineIRBuilder &MIRBuilder) const {
4338 const std::array<std::array<unsigned, 2>, 5> OpcTable{
4339 ._M_elems: {{AArch64::ADDXri, AArch64::ADDWri},
4340 {AArch64::ADDXrs, AArch64::ADDWrs},
4341 {AArch64::ADDXrr, AArch64::ADDWrr},
4342 {AArch64::SUBXri, AArch64::SUBWri},
4343 {AArch64::ADDXrx, AArch64::ADDWrx}}};
4344 return emitAddSub(AddrModeAndSizeToOpcode: OpcTable, Dst: DefReg, LHS, RHS, MIRBuilder);
4345}
4346
4347MachineInstr *
4348AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS,
4349 MachineOperand &RHS,
4350 MachineIRBuilder &MIRBuilder) const {
4351 const std::array<std::array<unsigned, 2>, 5> OpcTable{
4352 ._M_elems: {{AArch64::ADDSXri, AArch64::ADDSWri},
4353 {AArch64::ADDSXrs, AArch64::ADDSWrs},
4354 {AArch64::ADDSXrr, AArch64::ADDSWrr},
4355 {AArch64::SUBSXri, AArch64::SUBSWri},
4356 {AArch64::ADDSXrx, AArch64::ADDSWrx}}};
4357 return emitAddSub(AddrModeAndSizeToOpcode: OpcTable, Dst, LHS, RHS, MIRBuilder);
4358}
4359
4360MachineInstr *
4361AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS,
4362 MachineOperand &RHS,
4363 MachineIRBuilder &MIRBuilder) const {
4364 const std::array<std::array<unsigned, 2>, 5> OpcTable{
4365 ._M_elems: {{AArch64::SUBSXri, AArch64::SUBSWri},
4366 {AArch64::SUBSXrs, AArch64::SUBSWrs},
4367 {AArch64::SUBSXrr, AArch64::SUBSWrr},
4368 {AArch64::ADDSXri, AArch64::ADDSWri},
4369 {AArch64::SUBSXrx, AArch64::SUBSWrx}}};
4370 return emitAddSub(AddrModeAndSizeToOpcode: OpcTable, Dst, LHS, RHS, MIRBuilder);
4371}
4372
4373MachineInstr *
4374AArch64InstructionSelector::emitADCS(Register Dst, MachineOperand &LHS,
4375 MachineOperand &RHS,
4376 MachineIRBuilder &MIRBuilder) const {
4377 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4378 MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4379 bool Is32Bit = (MRI->getType(Reg: LHS.getReg()).getSizeInBits() == 32);
4380 static const unsigned OpcTable[2] = {AArch64::ADCSXr, AArch64::ADCSWr};
4381 return emitInstr(Opcode: OpcTable[Is32Bit], DstOps: {Dst}, SrcOps: {LHS, RHS}, MIRBuilder);
4382}
4383
4384MachineInstr *
4385AArch64InstructionSelector::emitSBCS(Register Dst, MachineOperand &LHS,
4386 MachineOperand &RHS,
4387 MachineIRBuilder &MIRBuilder) const {
4388 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4389 MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4390 bool Is32Bit = (MRI->getType(Reg: LHS.getReg()).getSizeInBits() == 32);
4391 static const unsigned OpcTable[2] = {AArch64::SBCSXr, AArch64::SBCSWr};
4392 return emitInstr(Opcode: OpcTable[Is32Bit], DstOps: {Dst}, SrcOps: {LHS, RHS}, MIRBuilder);
4393}
4394
4395MachineInstr *
4396AArch64InstructionSelector::emitCMP(MachineOperand &LHS, MachineOperand &RHS,
4397 MachineIRBuilder &MIRBuilder) const {
4398 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4399 bool Is32Bit = MRI.getType(Reg: LHS.getReg()).getSizeInBits() == 32;
4400 auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
4401 return emitSUBS(Dst: MRI.createVirtualRegister(RegClass: RC), LHS, RHS, MIRBuilder);
4402}
4403
4404MachineInstr *
4405AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
4406 MachineIRBuilder &MIRBuilder) const {
4407 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4408 bool Is32Bit = (MRI.getType(Reg: LHS.getReg()).getSizeInBits() == 32);
4409 auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
4410 return emitADDS(Dst: MRI.createVirtualRegister(RegClass: RC), LHS, RHS, MIRBuilder);
4411}
4412
/// Emit a TST (flags-only AND) of \p LHS and \p RHS, folding a logical
/// immediate or a shifted register into the instruction when possible.
MachineInstr *
AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
                                    MachineIRBuilder &MIRBuilder) const {
  assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
  MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
  LLT Ty = MRI.getType(Reg: LHS.getReg());
  unsigned RegSize = Ty.getSizeInBits();
  bool Is32Bit = (RegSize == 32);
  // Rows: {0: logical immediate, 1: shifted register, 2: register-register};
  // columns: {0: 64-bit, 1: 32-bit} forms.
  const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
                                   {AArch64::ANDSXrs, AArch64::ANDSWrs},
                                   {AArch64::ANDSXrr, AArch64::ANDSWrr}};
  // ANDS needs a logical immediate for its immediate form. Check if we can
  // fold one in.
  if (auto ValAndVReg = getIConstantVRegValWithLookThrough(VReg: RHS.getReg(), MRI)) {
    int64_t Imm = ValAndVReg->Value.getSExtValue();

    if (AArch64_AM::isLogicalImmediate(imm: Imm, regSize: RegSize)) {
      auto TstMI = MIRBuilder.buildInstr(Opc: OpcTable[0][Is32Bit], DstOps: {Ty}, SrcOps: {LHS});
      TstMI.addImm(Val: AArch64_AM::encodeLogicalImmediate(imm: Imm, regSize: RegSize));
      constrainSelectedInstRegOperands(I&: *TstMI, TII, TRI, RBI);
      return &*TstMI;
    }
  }

  // Next preference: fold a shifted register; otherwise plain ANDS rr.
  if (auto Fns = selectLogicalShiftedRegister(Root&: RHS))
    return emitInstr(Opcode: OpcTable[1][Is32Bit], DstOps: {Ty}, SrcOps: {LHS}, MIRBuilder, RenderFns: Fns);
  return emitInstr(Opcode: OpcTable[2][Is32Bit], DstOps: {Ty}, SrcOps: {LHS, RHS}, MIRBuilder);
}
4441
4442MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
4443 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4444 MachineIRBuilder &MIRBuilder) const {
4445 assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
4446 assert(Predicate.isPredicate() && "Expected predicate?");
4447 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4448 LLT CmpTy = MRI.getType(Reg: LHS.getReg());
4449 assert(!CmpTy.isVector() && "Expected scalar or pointer");
4450 unsigned Size = CmpTy.getSizeInBits();
4451 (void)Size;
4452 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
4453 // Fold the compare into a cmn or tst if possible.
4454 if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
4455 return FoldCmp;
4456 return emitCMP(LHS, RHS, MIRBuilder);
4457}
4458
/// Materialize the boolean result of a floating-point compare into the 32-bit
/// register \p Dst, assuming NZCV was already set by the compare. Some FP
/// predicates require two AArch64 condition codes; in that case two CSINCs
/// are emitted and ORed together.
MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
    Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const {
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
#ifndef NDEBUG
  LLT Ty = MRI.getType(Dst);
  assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
         "Expected a 32-bit scalar register?");
#endif
  const Register ZReg = AArch64::WZR;
  AArch64CC::CondCode CC1, CC2;
  changeFCMPPredToAArch64CC(P: Pred, CondCode&: CC1, CondCode2&: CC2);
  // CSINC Wd, WZR, WZR, cc produces (cc ? 0 : 1), so invert the condition to
  // materialize cc as a 0/1 value.
  auto InvCC1 = AArch64CC::getInvertedCondCode(Code: CC1);
  if (CC2 == AArch64CC::AL)
    // Single-condition predicate: one CSINC is enough.
    return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, Pred: InvCC1,
                     MIRBuilder);
  // Two-condition predicate: materialize each condition separately and OR the
  // two 0/1 results into Dst.
  const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
  Register Def1Reg = MRI.createVirtualRegister(RegClass: RC);
  Register Def2Reg = MRI.createVirtualRegister(RegClass: RC);
  auto InvCC2 = AArch64CC::getInvertedCondCode(Code: CC2);
  emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, Pred: InvCC1, MIRBuilder);
  emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, Pred: InvCC2, MIRBuilder);
  auto OrMI = MIRBuilder.buildInstr(Opc: AArch64::ORRWrr, DstOps: {Dst}, SrcOps: {Def1Reg, Def2Reg});
  constrainSelectedInstRegOperands(I&: *OrMI, TII, TRI, RBI);
  return &*OrMI;
}
4484
/// Emit an FCMP of \p LHS against \p RHS. A compare against +0.0 uses the
/// one-operand immediate FCMP form; for equality predicates (given via
/// \p Pred) the operands may be commuted so a +0.0 on the LHS can also use
/// that form. Returns nullptr for vector types.
MachineInstr *AArch64InstructionSelector::emitFPCompare(
    Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
    std::optional<CmpInst::Predicate> Pred) const {
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  LLT Ty = MRI.getType(Reg: LHS);
  if (Ty.isVector())
    return nullptr;
  unsigned OpSize = Ty.getSizeInBits();
  assert(OpSize == 16 || OpSize == 32 || OpSize == 64);

  // If this is a compare against +0.0, then we don't have
  // to explicitly materialize a constant.
  const ConstantFP *FPImm = getConstantFPVRegVal(VReg: RHS, MRI);
  bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());

  // Equality predicates are symmetric, so commuting the operands is legal.
  auto IsEqualityPred = [](CmpInst::Predicate P) {
    return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE ||
           P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE;
  };
  if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) {
    // Try commuting the operands.
    const ConstantFP *LHSImm = getConstantFPVRegVal(VReg: LHS, MRI);
    if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) {
      ShouldUseImm = true;
      std::swap(a&: LHS, b&: RHS);
    }
  }
  // Row 0: register-register forms; row 1: compare-against-zero (immediate)
  // forms. Columns: half, single, double.
  unsigned CmpOpcTbl[2][3] = {
      {AArch64::FCMPHrr, AArch64::FCMPSrr, AArch64::FCMPDrr},
      {AArch64::FCMPHri, AArch64::FCMPSri, AArch64::FCMPDri}};
  unsigned CmpOpc =
      CmpOpcTbl[ShouldUseImm][OpSize == 16 ? 0 : (OpSize == 32 ? 1 : 2)];

  // Partially build the compare. Decide if we need to add a use for the
  // third operand based off whether or not we're comparing against 0.0.
  auto CmpMI = MIRBuilder.buildInstr(Opcode: CmpOpc).addUse(RegNo: LHS);
  CmpMI.setMIFlags(MachineInstr::NoFPExcept);
  if (!ShouldUseImm)
    CmpMI.addUse(RegNo: RHS);
  constrainSelectedInstRegOperands(I&: *CmpMI, TII, TRI, RBI);
  return &*CmpMI;
}
4527
4528MachineInstr *AArch64InstructionSelector::emitVectorConcat(
4529 std::optional<Register> Dst, Register Op1, Register Op2,
4530 MachineIRBuilder &MIRBuilder) const {
4531 // We implement a vector concat by:
4532 // 1. Use scalar_to_vector to insert the lower vector into the larger dest
4533 // 2. Insert the upper vector into the destination's upper element
4534 // TODO: some of this code is common with G_BUILD_VECTOR handling.
4535 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4536
4537 const LLT Op1Ty = MRI.getType(Reg: Op1);
4538 const LLT Op2Ty = MRI.getType(Reg: Op2);
4539
4540 if (Op1Ty != Op2Ty) {
4541 LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
4542 return nullptr;
4543 }
4544 assert(Op1Ty.isVector() && "Expected a vector for vector concat");
4545
4546 if (Op1Ty.getSizeInBits() >= 128) {
4547 LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
4548 return nullptr;
4549 }
4550
4551 // At the moment we just support 64 bit vector concats.
4552 if (Op1Ty.getSizeInBits() != 64) {
4553 LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors");
4554 return nullptr;
4555 }
4556
4557 const LLT ScalarTy = LLT::scalar(SizeInBits: Op1Ty.getSizeInBits());
4558 const RegisterBank &FPRBank = *RBI.getRegBank(Reg: Op1, MRI, TRI);
4559 const TargetRegisterClass *DstRC =
4560 getRegClassForTypeOnBank(Ty: Op1Ty.multiplyElements(Factor: 2), RB: FPRBank);
4561
4562 MachineInstr *WidenedOp1 =
4563 emitScalarToVector(EltSize: ScalarTy.getSizeInBits(), DstRC, Scalar: Op1, MIRBuilder);
4564 MachineInstr *WidenedOp2 =
4565 emitScalarToVector(EltSize: ScalarTy.getSizeInBits(), DstRC, Scalar: Op2, MIRBuilder);
4566 if (!WidenedOp1 || !WidenedOp2) {
4567 LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
4568 return nullptr;
4569 }
4570
4571 // Now do the insert of the upper element.
4572 unsigned InsertOpc, InsSubRegIdx;
4573 std::tie(args&: InsertOpc, args&: InsSubRegIdx) =
4574 getInsertVecEltOpInfo(RB: FPRBank, EltSize: ScalarTy.getSizeInBits());
4575
4576 if (!Dst)
4577 Dst = MRI.createVirtualRegister(RegClass: DstRC);
4578 auto InsElt =
4579 MIRBuilder
4580 .buildInstr(Opc: InsertOpc, DstOps: {*Dst}, SrcOps: {WidenedOp1->getOperand(i: 0).getReg()})
4581 .addImm(Val: 1) /* Lane index */
4582 .addUse(RegNo: WidenedOp2->getOperand(i: 0).getReg())
4583 .addImm(Val: 0);
4584 constrainSelectedInstRegOperands(I&: *InsElt, TII, TRI, RBI);
4585 return &*InsElt;
4586}
4587
/// Emit a CSINC \p Dst, \p Src1, \p Src2, \p Pred, choosing the W or X form
/// from \p Dst's register class (or its LLT when no class is assigned yet).
MachineInstr *
AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1,
                                      Register Src2, AArch64CC::CondCode Pred,
                                      MachineIRBuilder &MIRBuilder) const {
  auto &MRI = *MIRBuilder.getMRI();
  const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg: Dst);
  // If we used a register class, then this won't necessarily have an LLT.
  // Compute the size based off whether or not we have a class or bank.
  unsigned Size;
  if (const auto *RC = dyn_cast<const TargetRegisterClass *>(Val: RegClassOrBank))
    Size = TRI.getRegSizeInBits(RC: *RC);
  else
    Size = MRI.getType(Reg: Dst).getSizeInBits();
  // Some opcodes use s1.
  assert(Size <= 64 && "Expected 64 bits or less only!");
  // Anything smaller than 64 bits uses the 32-bit W form.
  static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr};
  unsigned Opc = OpcTable[Size == 64];
  auto CSINC = MIRBuilder.buildInstr(Opc, DstOps: {Dst}, SrcOps: {Src1, Src2}).addImm(Val: Pred);
  constrainSelectedInstRegOperands(I&: *CSINC, TII, TRI, RBI);
  return &*CSINC;
}
4609
/// Set up NZCV so a following ADCS/SBCS consumes the carry held in
/// \p CarryReg. Returns the flag-setting instruction that was emitted, or
/// nullptr when the immediately preceding instruction already leaves the
/// correct carry in NZCV (in which case nothing is emitted).
MachineInstr *AArch64InstructionSelector::emitCarryIn(MachineInstr &I,
                                                      Register CarryReg) {
  MachineRegisterInfo *MRI = MIB.getMRI();
  unsigned Opcode = I.getOpcode();

  // If the instruction is a SUB, we need to negate the carry,
  // because borrowing is indicated by carry-flag == 0.
  bool NeedsNegatedCarry =
      (Opcode == TargetOpcode::G_USUBE || Opcode == TargetOpcode::G_SSUBE);

  // If the previous instruction will already produce the correct carry, do not
  // emit a carry generating instruction. E.g. for G_UADDE/G_USUBE sequences
  // generated during legalization of wide add/sub. This optimization depends on
  // these sequences not being interrupted by other instructions.
  // We have to select the previous instruction before the carry-using
  // instruction is deleted by the calling function, otherwise the previous
  // instruction might become dead and would get deleted.
  MachineInstr *SrcMI = MRI->getVRegDef(Reg: CarryReg);
  if (SrcMI == I.getPrevNode()) {
    if (auto *CarrySrcMI = dyn_cast<GAddSubCarryOut>(Val: SrcMI)) {
      bool ProducesNegatedCarry = CarrySrcMI->isSub();
      if (NeedsNegatedCarry == ProducesNegatedCarry &&
          CarrySrcMI->isUnsigned() &&
          CarrySrcMI->getCarryOutReg() == CarryReg &&
          selectAndRestoreState(I&: *SrcMI))
        return nullptr;
    }
  }

  // The arithmetic result of the flag-setting op below is unused; it exists
  // only to set NZCV.
  Register DeadReg = MRI->createVirtualRegister(RegClass: &AArch64::GPR32RegClass);

  if (NeedsNegatedCarry) {
    // (0 - Carry) sets !C in NZCV when Carry == 1
    Register ZReg = AArch64::WZR;
    return emitInstr(Opcode: AArch64::SUBSWrr, DstOps: {DeadReg}, SrcOps: {ZReg, CarryReg}, MIRBuilder&: MIB);
  }

  // (Carry - 1) sets !C in NZCV when Carry == 0
  auto Fns = select12BitValueWithLeftShift(Immed: 1);
  return emitInstr(Opcode: AArch64::SUBSWri, DstOps: {DeadReg}, SrcOps: {CarryReg}, MIRBuilder&: MIB, RenderFns: Fns);
}
4651
/// Select an overflow-producing add/sub (G_[SU]ADD[OE]/G_[SU]SUB[OE]): emit
/// the flag-setting arithmetic and, when the carry-out result is used,
/// materialize it with a CSINC.
bool AArch64InstructionSelector::selectOverflowOp(MachineInstr &I,
                                                  MachineRegisterInfo &MRI) {
  auto &CarryMI = cast<GAddSubCarryOut>(Val&: I);

  if (auto *CarryInMI = dyn_cast<GAddSubCarryInOut>(Val: &I)) {
    // Set NZCV carry according to carry-in VReg
    emitCarryIn(I, CarryReg: CarryInMI->getCarryInReg());
  }

  // Emit the operation and get the correct condition code.
  auto OpAndCC = emitOverflowOp(Opcode: I.getOpcode(), Dst: CarryMI.getDstReg(),
                                LHS&: CarryMI.getLHS(), RHS&: CarryMI.getRHS(), MIRBuilder&: MIB);

  Register CarryOutReg = CarryMI.getCarryOutReg();

  // Don't convert carry-out to VReg if it is never used
  if (!MRI.use_nodbg_empty(RegNo: CarryOutReg)) {
    // Now, put the overflow result in the register given by the first operand
    // to the overflow op. CSINC increments the result when the predicate is
    // false, so to get the increment when it's true, we need to use the
    // inverse. In this case, we want to increment when carry is set.
    Register ZReg = AArch64::WZR;
    emitCSINC(/*Dst=*/CarryOutReg, /*Src1=*/ZReg, /*Src2=*/ZReg,
              Pred: getInvertedCondCode(Code: OpAndCC.second), MIRBuilder&: MIB);
  }

  I.eraseFromParent();
  return true;
}
4681
4682std::pair<MachineInstr *, AArch64CC::CondCode>
4683AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
4684 MachineOperand &LHS,
4685 MachineOperand &RHS,
4686 MachineIRBuilder &MIRBuilder) const {
4687 switch (Opcode) {
4688 default:
4689 llvm_unreachable("Unexpected opcode!");
4690 case TargetOpcode::G_SADDO:
4691 return std::make_pair(x: emitADDS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::VS);
4692 case TargetOpcode::G_UADDO:
4693 return std::make_pair(x: emitADDS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::HS);
4694 case TargetOpcode::G_SSUBO:
4695 return std::make_pair(x: emitSUBS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::VS);
4696 case TargetOpcode::G_USUBO:
4697 return std::make_pair(x: emitSUBS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::LO);
4698 case TargetOpcode::G_SADDE:
4699 return std::make_pair(x: emitADCS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::VS);
4700 case TargetOpcode::G_UADDE:
4701 return std::make_pair(x: emitADCS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::HS);
4702 case TargetOpcode::G_SSUBE:
4703 return std::make_pair(x: emitSBCS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::VS);
4704 case TargetOpcode::G_USUBE:
4705 return std::make_pair(x: emitSBCS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::LO);
4706 }
4707}
4708
4709/// Returns true if @p Val is a tree of AND/OR/CMP operations that can be
4710/// expressed as a conjunction.
4711/// \param CanNegate Set to true if we can negate the whole sub-tree just by
4712/// changing the conditions on the CMP tests.
4713/// (this means we can call emitConjunctionRec() with
4714/// Negate==true on this sub-tree)
4715/// \param MustBeFirst Set to true if this subtree needs to be negated and we
4716/// cannot do the negation naturally. We are required to
4717/// emit the subtree first in this case.
4718/// \param WillNegate Is true if are called when the result of this
4719/// subexpression must be negated. This happens when the
4720/// outer expression is an OR. We can use this fact to know
4721/// that we have a double negation (or (or ...) ...) that
4722/// can be implemented for free.
static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst,
                               bool WillNegate, MachineRegisterInfo &MRI,
                               unsigned Depth = 0) {
  // Only fold values with a single use; otherwise the compare would have to
  // be duplicated.
  if (!MRI.hasOneNonDBGUse(RegNo: Val))
    return false;
  MachineInstr *ValDef = MRI.getVRegDef(Reg: Val);
  unsigned Opcode = ValDef->getOpcode();
  // A compare leaf: it can always be negated by flipping its predicate, and
  // it imposes no ordering constraint.
  if (isa<GAnyCmp>(Val: ValDef)) {
    CanNegate = true;
    MustBeFirst = false;
    return true;
  }
  // Protect against exponential runtime and stack overflow.
  if (Depth > 6)
    return false;
  if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) {
    bool IsOR = Opcode == TargetOpcode::G_OR;
    // Recurse into both operands of the logical op.
    Register O0 = ValDef->getOperand(i: 1).getReg();
    Register O1 = ValDef->getOperand(i: 2).getReg();
    bool CanNegateL;
    bool MustBeFirstL;
    if (!canEmitConjunction(Val: O0, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR, MRI, Depth: Depth + 1))
      return false;
    bool CanNegateR;
    bool MustBeFirstR;
    if (!canEmitConjunction(Val: O1, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR, MRI, Depth: Depth + 1))
      return false;

    // At most one subtree may demand to be emitted first.
    if (MustBeFirstL && MustBeFirstR)
      return false;

    if (IsOR) {
      // For an OR expression we need to be able to naturally negate at least
      // one side or we cannot do the transformation at all.
      if (!CanNegateL && !CanNegateR)
        return false;
      // If the result of the OR will be negated and we can naturally negate
      // the leaves, then this sub-tree as a whole negates naturally.
      CanNegate = WillNegate && CanNegateL && CanNegateR;
      // If we cannot naturally negate the whole sub-tree, then this must be
      // emitted first.
      MustBeFirst = !CanNegate;
    } else {
      assert(Opcode == TargetOpcode::G_AND && "Must be G_AND");
      // We cannot naturally negate an AND operation.
      CanNegate = false;
      MustBeFirst = MustBeFirstL || MustBeFirstR;
    }
    return true;
  }
  // Any other producing instruction cannot join a conjunction.
  return false;
}
4775
/// Emit a conditional comparison (CCMP/CCMN/FCCMP) implementing "LHS <CC> RHS"
/// that only performs the compare when the preceding flag-setting instruction
/// satisfies \p Predicate; otherwise NZCV is written directly with flags that
/// make \p OutCC evaluate to false.
MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
    Register LHS, Register RHS, CmpInst::Predicate CC,
    AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC,
    MachineIRBuilder &MIB) const {
  auto &MRI = *MIB.getMRI();
  LLT OpTy = MRI.getType(Reg: LHS);
  unsigned CCmpOpc;
  std::optional<ValueAndVReg> C;
  if (CmpInst::isIntPredicate(P: CC)) {
    assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
    // The immediate form holds a 5-bit unsigned value: constants in [0, 31]
    // use CCMP #imm, constants in [-31, -1] use CCMN #imm (compare-negative),
    // and anything else (or a non-constant RHS) uses the register form.
    C = getIConstantVRegValWithLookThrough(VReg: RHS, MRI);
    if (!C || C->Value.sgt(RHS: 31) || C->Value.slt(RHS: -31))
      CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
    else if (C->Value.ule(RHS: 31))
      CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi;
    else
      CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMNWi : AArch64::CCMNXi;
  } else {
    // Floating-point conditional compare; the opcode is keyed off the
    // operand size (f16 requires the FullFP16 subtarget feature).
    assert(OpTy.getSizeInBits() == 16 || OpTy.getSizeInBits() == 32 ||
           OpTy.getSizeInBits() == 64);
    switch (OpTy.getSizeInBits()) {
    case 16:
      assert(STI.hasFullFP16() && "Expected Full FP16 for fp16 comparisons");
      CCmpOpc = AArch64::FCCMPHrr;
      break;
    case 32:
      CCmpOpc = AArch64::FCCMPSrr;
      break;
    case 64:
      CCmpOpc = AArch64::FCCMPDrr;
      break;
    default:
      return nullptr;
    }
  }
  // When Predicate does not hold, the instruction writes NZCV directly; pick
  // flags satisfying the *inverse* of OutCC so the overall test comes out
  // false in that case.
  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(Code: OutCC);
  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: InvOutCC);
  auto CCmp =
      MIB.buildInstr(Opc: CCmpOpc, DstOps: {}, SrcOps: {LHS});
  if (CCmpOpc == AArch64::CCMPWi || CCmpOpc == AArch64::CCMPXi)
    CCmp.addImm(Val: C->Value.getZExtValue());
  else if (CCmpOpc == AArch64::CCMNWi || CCmpOpc == AArch64::CCMNXi)
    // CCMN compares against the negated immediate, so encode |C|.
    CCmp.addImm(Val: C->Value.abs().getZExtValue());
  else
    CCmp.addReg(RegNo: RHS);
  CCmp.addImm(Val: NZCV).addImm(Val: Predicate);
  constrainSelectedInstRegOperands(I&: *CCmp, TII, TRI, RBI);
  return &*CCmp;
}
4825
/// Recursively emit the conjunction/disjunction tree rooted at \p Val as an
/// initial compare followed by a chain of conditional compares. \p OutCC
/// receives the condition code that tests the tree's result; \p Negate
/// requests the logical negation of the tree. \p CCOp (invalid register when
/// this node is first in the chain) and \p Predicate identify the
/// flag-producing instruction this chain extends.
MachineInstr *AArch64InstructionSelector::emitConjunctionRec(
    Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp,
    AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const {
  // We're at a tree leaf, produce a conditional comparison operation.
  auto &MRI = *MIB.getMRI();
  MachineInstr *ValDef = MRI.getVRegDef(Reg: Val);
  unsigned Opcode = ValDef->getOpcode();
  if (auto *Cmp = dyn_cast<GAnyCmp>(Val: ValDef)) {
    Register LHS = Cmp->getLHSReg();
    Register RHS = Cmp->getRHSReg();
    CmpInst::Predicate CC = Cmp->getCond();
    // Negation is free at a leaf: just invert the predicate.
    if (Negate)
      CC = CmpInst::getInversePredicate(pred: CC);
    if (isa<GICmp>(Val: Cmp)) {
      OutCC = changeICMPPredToAArch64CC(P: CC, RHS, MRI: MIB.getMRI());
    } else {
      // Handle special FP cases.
      AArch64CC::CondCode ExtraCC;
      changeFPCCToANDAArch64CC(CC, CondCode&: OutCC, CondCode2&: ExtraCC);
      // Some floating point conditions can't be tested with a single condition
      // code. Construct an additional comparison in this case.
      if (ExtraCC != AArch64CC::AL) {
        MachineInstr *ExtraCmp;
        if (!CCOp)
          ExtraCmp = emitFPCompare(LHS, RHS, MIRBuilder&: MIB, Pred: CC);
        else
          ExtraCmp =
              emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC: ExtraCC, MIB);
        // The extra compare becomes the flag producer the final compare below
        // is predicated on.
        CCOp = ExtraCmp->getOperand(i: 0).getReg();
        Predicate = ExtraCC;
      }
    }

    // Produce a normal comparison if we are first in the chain
    if (!CCOp) {
      if (isa<GICmp>(Val: Cmp))
        return emitCMP(LHS&: Cmp->getOperand(i: 2), RHS&: Cmp->getOperand(i: 3), MIRBuilder&: MIB);
      return emitFPCompare(LHS: Cmp->getOperand(i: 2).getReg(),
                           RHS: Cmp->getOperand(i: 3).getReg(), MIRBuilder&: MIB);
    }
    // Otherwise produce a ccmp.
    return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB);
  }
  assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree");

  bool IsOR = Opcode == TargetOpcode::G_OR;

  // Re-analyze each operand; canEmitConjunction already approved the whole
  // tree, so these calls must succeed.
  Register LHS = ValDef->getOperand(i: 1).getReg();
  bool CanNegateL;
  bool MustBeFirstL;
  bool ValidL = canEmitConjunction(Val: LHS, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR, MRI);
  assert(ValidL && "Valid conjunction/disjunction tree");
  (void)ValidL;

  Register RHS = ValDef->getOperand(i: 2).getReg();
  bool CanNegateR;
  bool MustBeFirstR;
  bool ValidR = canEmitConjunction(Val: RHS, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR, MRI);
  assert(ValidR && "Valid conjunction/disjunction tree");
  (void)ValidR;

  // Swap sub-tree that must come first to the right side.
  if (MustBeFirstL) {
    assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
    std::swap(a&: LHS, b&: RHS);
    std::swap(a&: CanNegateL, b&: CanNegateR);
    std::swap(a&: MustBeFirstL, b&: MustBeFirstR);
  }

  bool NegateR;
  bool NegateAfterR;
  bool NegateL;
  bool NegateAfterAll;
  if (Opcode == TargetOpcode::G_OR) {
    // An OR is emitted as an AND of the negated operands, with the final
    // condition inverted unless the caller already asked for the negation
    // (De Morgan). Each side is negated either natively or after emission.
    // Swap the sub-tree that we can negate naturally to the left.
    if (!CanNegateL) {
      assert(CanNegateR && "at least one side must be negatable");
      assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
      assert(!Negate);
      std::swap(a&: LHS, b&: RHS);
      NegateR = false;
      NegateAfterR = true;
    } else {
      // Negate the left sub-tree if possible, otherwise negate the result.
      NegateR = CanNegateR;
      NegateAfterR = !CanNegateR;
    }
    NegateL = true;
    NegateAfterAll = !Negate;
  } else {
    assert(Opcode == TargetOpcode::G_AND &&
           "Valid conjunction/disjunction tree");
    assert(!Negate && "Valid conjunction/disjunction tree");

    NegateL = false;
    NegateR = false;
    NegateAfterR = false;
    NegateAfterAll = false;
  }

  // Emit sub-trees. The right side is emitted first, then the left side is
  // emitted predicated on the right side's condition code.
  AArch64CC::CondCode RHSCC;
  MachineInstr *CmpR =
      emitConjunctionRec(Val: RHS, OutCC&: RHSCC, Negate: NegateR, CCOp, Predicate, MIB);
  if (NegateAfterR)
    RHSCC = AArch64CC::getInvertedCondCode(Code: RHSCC);
  MachineInstr *CmpL = emitConjunctionRec(
      Val: LHS, OutCC, Negate: NegateL, CCOp: CmpR->getOperand(i: 0).getReg(), Predicate: RHSCC, MIB);
  if (NegateAfterAll)
    OutCC = AArch64CC::getInvertedCondCode(Code: OutCC);
  return CmpL;
}
4938
4939MachineInstr *AArch64InstructionSelector::emitConjunction(
4940 Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const {
4941 bool DummyCanNegate;
4942 bool DummyMustBeFirst;
4943 if (!canEmitConjunction(Val, CanNegate&: DummyCanNegate, MustBeFirst&: DummyMustBeFirst, WillNegate: false,
4944 MRI&: *MIB.getMRI()))
4945 return nullptr;
4946 return emitConjunctionRec(Val, OutCC, Negate: false, CCOp: Register(), Predicate: AArch64CC::AL, MIB);
4947}
4948
4949bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI,
4950 MachineInstr &CondMI) {
4951 AArch64CC::CondCode AArch64CC;
4952 MachineInstr *ConjMI = emitConjunction(Val: SelI.getCondReg(), OutCC&: AArch64CC, MIB);
4953 if (!ConjMI)
4954 return false;
4955
4956 emitSelect(Dst: SelI.getReg(Idx: 0), True: SelI.getTrueReg(), False: SelI.getFalseReg(), CC: AArch64CC, MIB);
4957 SelI.eraseFromParent();
4958 return true;
4959}
4960
/// Try to fold the compare defining a G_SELECT's condition into the select
/// lowering, avoiding materialization of the boolean condition value.
/// Returns true and erases \p I on success.
bool AArch64InstructionSelector::tryOptSelect(GSelect &I) {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  // We want to recognize this pattern:
  //
  // $z = G_FCMP pred, $x, $y
  // ...
  // $w = G_SELECT $z, $a, $b
  //
  // Where the value of $z is *only* ever used by the G_SELECT (possibly with
  // some copies/truncs in between.)
  //
  // If we see this, then we can emit something like this:
  //
  // fcmp $x, $y
  // fcsel $w, $a, $b, pred
  //
  // Rather than emitting both of the rather long sequences in the standard
  // G_FCMP/G_SELECT select methods.

  // First, check if the condition is defined by a compare.
  MachineInstr *CondDef = MRI.getVRegDef(Reg: I.getOperand(i: 1).getReg());

  // We can only fold if all of the defs have one use.
  Register CondDefReg = CondDef->getOperand(i: 0).getReg();
  if (!MRI.hasOneNonDBGUse(RegNo: CondDefReg)) {
    // Unless it's another select.
    // Every additional user must itself be a G_SELECT; any other kind of user
    // still needs the boolean value and defeats the fold.
    for (const MachineInstr &UI : MRI.use_nodbg_instructions(Reg: CondDefReg)) {
      if (CondDef == &UI)
        continue;
      if (UI.getOpcode() != TargetOpcode::G_SELECT)
        return false;
    }
  }

  // Is the condition defined by a compare?
  unsigned CondOpc = CondDef->getOpcode();
  if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) {
    // Not a plain compare; it may still be an AND/OR tree of compares that
    // can be lowered as a conditional-compare chain.
    if (tryOptSelectConjunction(SelI&: I, CondMI&: *CondDef))
      return true;
    return false;
  }

  AArch64CC::CondCode CondCode;
  if (CondOpc == TargetOpcode::G_ICMP) {
    // Integer compare: emit it and select on the corresponding AArch64
    // condition code.
    auto &PredOp = CondDef->getOperand(i: 1);
    emitIntegerCompare(LHS&: CondDef->getOperand(i: 2), RHS&: CondDef->getOperand(i: 3), Predicate&: PredOp,
                       MIRBuilder&: MIB);
    auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
    CondCode =
        changeICMPPredToAArch64CC(P: Pred, RHS: CondDef->getOperand(i: 3).getReg(), MRI: &MRI);
  } else {
    // Get the condition code for the select.
    auto Pred =
        static_cast<CmpInst::Predicate>(CondDef->getOperand(i: 1).getPredicate());
    AArch64CC::CondCode CondCode2;
    changeFCMPPredToAArch64CC(P: Pred, CondCode, CondCode2);

    // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two
    // instructions to emit the comparison.
    // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
    // unnecessary.
    if (CondCode2 != AArch64CC::AL)
      return false;

    if (!emitFPCompare(LHS: CondDef->getOperand(i: 2).getReg(),
                       RHS: CondDef->getOperand(i: 3).getReg(), MIRBuilder&: MIB)) {
      LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
      return false;
    }
  }

  // Emit the select.
  emitSelect(Dst: I.getOperand(i: 0).getReg(), True: I.getOperand(i: 2).getReg(),
             False: I.getOperand(i: 3).getReg(), CC: CondCode, MIB);
  I.eraseFromParent();
  return true;
}
5038
/// Try to fold the instruction defining one of an integer compare's operands
/// into the compare itself, producing a CMN (compare-negative) or TST
/// (test-bits) instead of a plain CMP. Returns the new flag-setting
/// instruction, or nullptr when no fold applies. May rewrite \p Predicate
/// when the operands are conceptually swapped.
MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
    MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
    MachineIRBuilder &MIRBuilder) const {
  assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
         "Unexpected MachineOperand");
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  // We want to find this sort of thing:
  // x = G_SUB 0, y
  // G_ICMP z, x
  //
  // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
  // e.g:
  //
  // cmn z, y

  // Check if the RHS or LHS of the G_ICMP is defined by a SUB
  MachineInstr *LHSDef = getDefIgnoringCopies(Reg: LHS.getReg(), MRI);
  MachineInstr *RHSDef = getDefIgnoringCopies(Reg: RHS.getReg(), MRI);
  auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());

  // Given this:
  //
  // x = G_SUB 0, y
  // G_ICMP z, x
  //
  // Produce this:
  //
  // cmn z, y
  if (isCMN(MaybeSub: RHSDef, Pred: P, MRI))
    return emitCMN(LHS, RHS&: RHSDef->getOperand(i: 2), MIRBuilder);

  // Same idea here, but with the LHS of the compare instead:
  //
  // Given this:
  //
  // x = G_SUB 0, y
  // G_ICMP x, z
  //
  // Produce this:
  //
  // cmn y, z
  //
  // But be careful! We need to swap the predicate!
  if (isCMN(MaybeSub: LHSDef, Pred: P, MRI)) {
    // Equality predicates are symmetric, so only non-equality predicates need
    // the swap.
    if (!CmpInst::isEquality(pred: P)) {
      P = CmpInst::getSwappedPredicate(pred: P);
      Predicate = MachineOperand::CreatePredicate(Pred: P);
    }
    return emitCMN(LHS&: LHSDef->getOperand(i: 2), RHS, MIRBuilder);
  }

  // Given this:
  //
  // z = G_AND x, y
  // G_ICMP z, 0
  //
  // Produce this if the compare is signed:
  //
  // tst x, y
  if (!CmpInst::isUnsigned(Pred: P) && LHSDef &&
      LHSDef->getOpcode() == TargetOpcode::G_AND) {
    // Make sure that the RHS is 0.
    auto ValAndVReg = getIConstantVRegValWithLookThrough(VReg: RHS.getReg(), MRI);
    if (!ValAndVReg || ValAndVReg->Value != 0)
      return nullptr;

    return emitTST(LHS&: LHSDef->getOperand(i: 1),
                   RHS&: LHSDef->getOperand(i: 2), MIRBuilder);
  }

  // No fold found; the caller emits a regular compare.
  return nullptr;
}
5111
/// Select G_SHUFFLE_VECTOR by materializing the byte-index vector in the
/// constant pool and emitting a TBL1 (64-bit result) or TBL2 (128-bit result)
/// table lookup.
bool AArch64InstructionSelector::selectShuffleVector(
    MachineInstr &I, MachineRegisterInfo &MRI) {
  const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
  Register Src1Reg = I.getOperand(i: 1).getReg();
  Register Src2Reg = I.getOperand(i: 2).getReg();
  ArrayRef<int> Mask = I.getOperand(i: 3).getShuffleMask();

  MachineBasicBlock &MBB = *I.getParent();
  MachineFunction &MF = *MBB.getParent();
  LLVMContext &Ctx = MF.getFunction().getContext();

  // TBL operates on bytes, so expand each mask element into BytesPerElt
  // consecutive byte indices.
  unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;

  SmallVector<Constant *, 64> CstIdxs;
  for (int Val : Mask) {
    // For now, any undef indexes we'll just assume to be 0. This should be
    // optimized in future, e.g. to select DUP etc.
    Val = Val < 0 ? 0 : Val;
    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
      unsigned Offset = Byte + Val * BytesPerElt;
      CstIdxs.emplace_back(Args: ConstantInt::get(Ty: Type::getInt8Ty(C&: Ctx), V: Offset));
    }
  }

  // Use a constant pool to load the index vector for TBL.
  Constant *CPVal = ConstantVector::get(V: CstIdxs);
  MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder&: MIB);
  if (!IndexLoad) {
    LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
    return false;
  }

  if (DstTy.getSizeInBits() != 128) {
    assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
    // This case can be done with TBL1.
    // Concatenate the two 64-bit sources into one 128-bit table register.
    MachineInstr *Concat =
        emitVectorConcat(Dst: std::nullopt, Op1: Src1Reg, Op2: Src2Reg, MIRBuilder&: MIB);
    if (!Concat) {
      LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
      return false;
    }

    // The constant pool load will be 64 bits, so need to convert to FPR128 reg.
    IndexLoad = emitScalarToVector(EltSize: 64, DstRC: &AArch64::FPR128RegClass,
                                   Scalar: IndexLoad->getOperand(i: 0).getReg(), MIRBuilder&: MIB);

    auto TBL1 = MIB.buildInstr(
        Opc: AArch64::TBLv16i8One, DstOps: {&AArch64::FPR128RegClass},
        SrcOps: {Concat->getOperand(i: 0).getReg(), IndexLoad->getOperand(i: 0).getReg()});
    constrainSelectedInstRegOperands(I&: *TBL1, TII, TRI, RBI);

    // The TBL produced a 128-bit value; copy out the low 64 bits (dsub) as
    // the final result.
    auto Copy =
        MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {I.getOperand(i: 0).getReg()}, SrcOps: {})
            .addReg(RegNo: TBL1.getReg(Idx: 0), Flags: {}, SubReg: AArch64::dsub);
    RBI.constrainGenericRegister(Reg: Copy.getReg(Idx: 0), RC: AArch64::FPR64RegClass, MRI);
    I.eraseFromParent();
    return true;
  }

  // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
  // Q registers for regalloc.
  SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg};
  auto RegSeq = createQTuple(Regs, MIB);
  auto TBL2 = MIB.buildInstr(Opc: AArch64::TBLv16i8Two, DstOps: {I.getOperand(i: 0)},
                             SrcOps: {RegSeq, IndexLoad->getOperand(i: 0)});
  constrainSelectedInstRegOperands(I&: *TBL2, TII, TRI, RBI);
  I.eraseFromParent();
  return true;
}
5181
5182MachineInstr *AArch64InstructionSelector::emitLaneInsert(
5183 std::optional<Register> DstReg, Register SrcReg, Register EltReg,
5184 unsigned LaneIdx, const RegisterBank &RB,
5185 MachineIRBuilder &MIRBuilder) const {
5186 MachineInstr *InsElt = nullptr;
5187 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5188 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5189
5190 // Create a register to define with the insert if one wasn't passed in.
5191 if (!DstReg)
5192 DstReg = MRI.createVirtualRegister(RegClass: DstRC);
5193
5194 unsigned EltSize = MRI.getType(Reg: EltReg).getSizeInBits();
5195 unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;
5196
5197 if (RB.getID() == AArch64::FPRRegBankID) {
5198 auto InsSub = emitScalarToVector(EltSize, DstRC, Scalar: EltReg, MIRBuilder);
5199 InsElt = MIRBuilder.buildInstr(Opc, DstOps: {*DstReg}, SrcOps: {SrcReg})
5200 .addImm(Val: LaneIdx)
5201 .addUse(RegNo: InsSub->getOperand(i: 0).getReg())
5202 .addImm(Val: 0);
5203 } else {
5204 InsElt = MIRBuilder.buildInstr(Opc, DstOps: {*DstReg}, SrcOps: {SrcReg})
5205 .addImm(Val: LaneIdx)
5206 .addUse(RegNo: EltReg);
5207 }
5208
5209 constrainSelectedInstRegOperands(I&: *InsElt, TII, TRI, RBI);
5210 return InsElt;
5211}
5212
/// Select [SZ]EXT/ANYEXT of G_EXTRACT_VECTOR_ELT into a single SMOV/UMOV lane
/// move, avoiding a separate extend of the extracted scalar. Returns true and
/// erases \p MI on success.
bool AArch64InstructionSelector::selectUSMovFromExtend(
    MachineInstr &MI, MachineRegisterInfo &MRI) {
  if (MI.getOpcode() != TargetOpcode::G_SEXT &&
      MI.getOpcode() != TargetOpcode::G_ZEXT &&
      MI.getOpcode() != TargetOpcode::G_ANYEXT)
    return false;
  // G_ZEXT and G_ANYEXT are both handled with the (zeroing) UMOV forms.
  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT;
  const Register DefReg = MI.getOperand(i: 0).getReg();
  const LLT DstTy = MRI.getType(Reg: DefReg);
  unsigned DstSize = DstTy.getSizeInBits();

  if (DstSize != 32 && DstSize != 64)
    return false;

  // The extend's source must be an extract of a constant lane.
  MachineInstr *Extract = getOpcodeDef(Opcode: TargetOpcode::G_EXTRACT_VECTOR_ELT,
                                       Reg: MI.getOperand(i: 1).getReg(), MRI);
  int64_t Lane;
  if (!Extract || !mi_match(R: Extract->getOperand(i: 2).getReg(), MRI, P: m_ICst(Cst&: Lane)))
    return false;
  Register Src0 = Extract->getOperand(i: 1).getReg();

  const LLT VecTy = MRI.getType(Reg: Src0);
  if (VecTy.isScalableVector())
    return false;

  // SMOV/UMOV read from a 128-bit vector register; widen narrower sources.
  if (VecTy.getSizeInBits() != 128) {
    const MachineInstr *ScalarToVector = emitScalarToVector(
        EltSize: VecTy.getSizeInBits(), DstRC: &AArch64::FPR128RegClass, Scalar: Src0, MIRBuilder&: MIB);
    assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!");
    Src0 = ScalarToVector->getOperand(i: 0).getReg();
  }

  // Pick the opcode from (destination size, vector element size, signedness).
  unsigned Opcode;
  if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32)
    Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32;
  else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16)
    Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16;
  else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8)
    Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8;
  else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16)
    Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16;
  else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8)
    Opcode = IsSigned ? AArch64::SMOVvi8to32 : AArch64::UMOVvi8;
  else
    llvm_unreachable("Unexpected type combo for S/UMov!");

  // We may need to generate one of these, depending on the type and sign of the
  // input:
  //  DstReg = SMOV Src0, Lane;
  //  NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32;
  MachineInstr *ExtI = nullptr;
  if (DstSize == 64 && !IsSigned) {
    // Unsigned 64-bit result: UMOV writes a 32-bit register, then the value
    // is placed into the low 32 bits of the 64-bit destination.
    Register NewReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR32RegClass);
    MIB.buildInstr(Opc: Opcode, DstOps: {NewReg}, SrcOps: {Src0}).addImm(Val: Lane);
    ExtI = MIB.buildInstr(Opc: AArch64::SUBREG_TO_REG, DstOps: {DefReg}, SrcOps: {})
               .addUse(RegNo: NewReg)
               .addImm(Val: AArch64::sub_32);
    RBI.constrainGenericRegister(Reg: DefReg, RC: AArch64::GPR64RegClass, MRI);
  } else
    ExtI = MIB.buildInstr(Opc: Opcode, DstOps: {DefReg}, SrcOps: {Src0}).addImm(Val: Lane);

  constrainSelectedInstRegOperands(I&: *ExtI, TII, TRI, RBI);
  MI.eraseFromParent();
  return true;
}
5278
5279MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm8(
5280 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5281 unsigned int Op;
5282 if (DstSize == 128) {
5283 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5284 return nullptr;
5285 Op = AArch64::MOVIv16b_ns;
5286 } else {
5287 Op = AArch64::MOVIv8b_ns;
5288 }
5289
5290 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5291
5292 if (AArch64_AM::isAdvSIMDModImmType9(Imm: Val)) {
5293 Val = AArch64_AM::encodeAdvSIMDModImmType9(Imm: Val);
5294 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val);
5295 constrainSelectedInstRegOperands(I&: *Mov, TII, TRI, RBI);
5296 return &*Mov;
5297 }
5298 return nullptr;
5299}
5300
5301MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm16(
5302 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5303 bool Inv) {
5304
5305 unsigned int Op;
5306 if (DstSize == 128) {
5307 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5308 return nullptr;
5309 Op = Inv ? AArch64::MVNIv8i16 : AArch64::MOVIv8i16;
5310 } else {
5311 Op = Inv ? AArch64::MVNIv4i16 : AArch64::MOVIv4i16;
5312 }
5313
5314 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5315 uint64_t Shift;
5316
5317 if (AArch64_AM::isAdvSIMDModImmType5(Imm: Val)) {
5318 Val = AArch64_AM::encodeAdvSIMDModImmType5(Imm: Val);
5319 Shift = 0;
5320 } else if (AArch64_AM::isAdvSIMDModImmType6(Imm: Val)) {
5321 Val = AArch64_AM::encodeAdvSIMDModImmType6(Imm: Val);
5322 Shift = 8;
5323 } else
5324 return nullptr;
5325
5326 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val).addImm(Val: Shift);
5327 constrainSelectedInstRegOperands(I&: *Mov, TII, TRI, RBI);
5328 return &*Mov;
5329}
5330
5331MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm32(
5332 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5333 bool Inv) {
5334
5335 unsigned int Op;
5336 if (DstSize == 128) {
5337 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5338 return nullptr;
5339 Op = Inv ? AArch64::MVNIv4i32 : AArch64::MOVIv4i32;
5340 } else {
5341 Op = Inv ? AArch64::MVNIv2i32 : AArch64::MOVIv2i32;
5342 }
5343
5344 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5345 uint64_t Shift;
5346
5347 if ((AArch64_AM::isAdvSIMDModImmType1(Imm: Val))) {
5348 Val = AArch64_AM::encodeAdvSIMDModImmType1(Imm: Val);
5349 Shift = 0;
5350 } else if ((AArch64_AM::isAdvSIMDModImmType2(Imm: Val))) {
5351 Val = AArch64_AM::encodeAdvSIMDModImmType2(Imm: Val);
5352 Shift = 8;
5353 } else if ((AArch64_AM::isAdvSIMDModImmType3(Imm: Val))) {
5354 Val = AArch64_AM::encodeAdvSIMDModImmType3(Imm: Val);
5355 Shift = 16;
5356 } else if ((AArch64_AM::isAdvSIMDModImmType4(Imm: Val))) {
5357 Val = AArch64_AM::encodeAdvSIMDModImmType4(Imm: Val);
5358 Shift = 24;
5359 } else
5360 return nullptr;
5361
5362 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val).addImm(Val: Shift);
5363 constrainSelectedInstRegOperands(I&: *Mov, TII, TRI, RBI);
5364 return &*Mov;
5365}
5366
5367MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm64(
5368 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5369
5370 unsigned int Op;
5371 if (DstSize == 128) {
5372 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5373 return nullptr;
5374 Op = AArch64::MOVIv2d_ns;
5375 } else {
5376 Op = AArch64::MOVID;
5377 }
5378
5379 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5380 if (AArch64_AM::isAdvSIMDModImmType10(Imm: Val)) {
5381 Val = AArch64_AM::encodeAdvSIMDModImmType10(Imm: Val);
5382 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val);
5383 constrainSelectedInstRegOperands(I&: *Mov, TII, TRI, RBI);
5384 return &*Mov;
5385 }
5386 return nullptr;
5387}
5388
5389MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm321s(
5390 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5391 bool Inv) {
5392
5393 unsigned int Op;
5394 if (DstSize == 128) {
5395 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5396 return nullptr;
5397 Op = Inv ? AArch64::MVNIv4s_msl : AArch64::MOVIv4s_msl;
5398 } else {
5399 Op = Inv ? AArch64::MVNIv2s_msl : AArch64::MOVIv2s_msl;
5400 }
5401
5402 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5403 uint64_t Shift;
5404
5405 if (AArch64_AM::isAdvSIMDModImmType7(Imm: Val)) {
5406 Val = AArch64_AM::encodeAdvSIMDModImmType7(Imm: Val);
5407 Shift = 264;
5408 } else if (AArch64_AM::isAdvSIMDModImmType8(Imm: Val)) {
5409 Val = AArch64_AM::encodeAdvSIMDModImmType8(Imm: Val);
5410 Shift = 272;
5411 } else
5412 return nullptr;
5413
5414 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val).addImm(Val: Shift);
5415 constrainSelectedInstRegOperands(I&: *Mov, TII, TRI, RBI);
5416 return &*Mov;
5417}
5418
5419MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImmFP(
5420 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5421
5422 unsigned int Op;
5423 bool IsWide = false;
5424 if (DstSize == 128) {
5425 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5426 return nullptr;
5427 Op = AArch64::FMOVv4f32_ns;
5428 IsWide = true;
5429 } else {
5430 Op = AArch64::FMOVv2f32_ns;
5431 }
5432
5433 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5434
5435 if (AArch64_AM::isAdvSIMDModImmType11(Imm: Val)) {
5436 Val = AArch64_AM::encodeAdvSIMDModImmType11(Imm: Val);
5437 } else if (IsWide && AArch64_AM::isAdvSIMDModImmType12(Imm: Val)) {
5438 Val = AArch64_AM::encodeAdvSIMDModImmType12(Imm: Val);
5439 Op = AArch64::FMOVv2f64_ns;
5440 } else
5441 return nullptr;
5442
5443 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val);
5444 constrainSelectedInstRegOperands(I&: *Mov, TII, TRI, RBI);
5445 return &*Mov;
5446}
5447
/// Select a pre/post-indexed extending load (G_INDEXED_*EXTLOAD) into the
/// matching LDR*pre/LDR*post instruction, inserting the narrow result into a
/// wider register when needed. Returns true and erases \p MI on success.
bool AArch64InstructionSelector::selectIndexedExtLoad(
    MachineInstr &MI, MachineRegisterInfo &MRI) {
  auto &ExtLd = cast<GIndexedAnyExtLoad>(Val&: MI);
  Register Dst = ExtLd.getDstReg();
  Register WriteBack = ExtLd.getWritebackReg();
  Register Base = ExtLd.getBaseReg();
  Register Offset = ExtLd.getOffsetReg();
  LLT Ty = MRI.getType(Reg: Dst);
  assert(Ty.getSizeInBits() <= 64); // Only for scalar GPRs.
  unsigned MemSizeBits = ExtLd.getMMO().getMemoryType().getSizeInBits();
  bool IsPre = ExtLd.isPre();
  bool IsSExt = isa<GIndexedSExtLoad>(Val: ExtLd);
  // Sub-register index to insert the loaded value into Dst with, or 0 when
  // the load result can be copied directly.
  unsigned InsertIntoSubReg = 0;
  bool IsDst64 = Ty.getSizeInBits() == 64;

  // ZExt/SExt should be on gpr but can handle extload and zextload of fpr, so
  // long as they are scalar.
  bool IsFPR = RBI.getRegBank(Reg: Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID;
  if ((IsSExt && IsFPR) || Ty.isVector())
    return false;

  unsigned Opc = 0;
  LLT NewLdDstTy;
  LLT s32 = LLT::scalar(SizeInBits: 32);
  LLT s64 = LLT::scalar(SizeInBits: 64);

  // Pick the opcode, the load's natural result type, and the sub-register to
  // insert into, keyed on (memory size, signedness, register bank).
  if (MemSizeBits == 8) {
    if (IsSExt) {
      if (IsDst64)
        Opc = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
      else
        Opc = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
      NewLdDstTy = IsDst64 ? s64 : s32;
    } else if (IsFPR) {
      Opc = IsPre ? AArch64::LDRBpre : AArch64::LDRBpost;
      InsertIntoSubReg = AArch64::bsub;
      NewLdDstTy = LLT::scalar(SizeInBits: MemSizeBits);
    } else {
      Opc = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
      InsertIntoSubReg = IsDst64 ? AArch64::sub_32 : 0;
      NewLdDstTy = s32;
    }
  } else if (MemSizeBits == 16) {
    if (IsSExt) {
      if (IsDst64)
        Opc = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
      else
        Opc = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
      NewLdDstTy = IsDst64 ? s64 : s32;
    } else if (IsFPR) {
      Opc = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
      InsertIntoSubReg = AArch64::hsub;
      NewLdDstTy = LLT::scalar(SizeInBits: MemSizeBits);
    } else {
      Opc = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
      InsertIntoSubReg = IsDst64 ? AArch64::sub_32 : 0;
      NewLdDstTy = s32;
    }
  } else if (MemSizeBits == 32) {
    if (IsSExt) {
      Opc = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
      NewLdDstTy = s64;
    } else if (IsFPR) {
      Opc = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
      InsertIntoSubReg = AArch64::ssub;
      NewLdDstTy = LLT::scalar(SizeInBits: MemSizeBits);
    } else {
      Opc = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
      InsertIntoSubReg = IsDst64 ? AArch64::sub_32 : 0;
      NewLdDstTy = s32;
    }
  } else {
    llvm_unreachable("Unexpected size for indexed load");
  }

  // The immediate pre/post-indexed forms require a constant offset.
  auto Cst = getIConstantVRegVal(VReg: Offset, MRI);
  if (!Cst)
    return false; // Shouldn't happen, but just in case.

  auto LdMI = MIB.buildInstr(Opc, DstOps: {WriteBack, NewLdDstTy}, SrcOps: {Base})
                  .addImm(Val: Cst->getSExtValue());
  LdMI.cloneMemRefs(OtherMI: ExtLd);
  constrainSelectedInstRegOperands(I&: *LdMI, TII, TRI, RBI);
  // Make sure to select the load with the MemTy as the dest type, and then
  // insert into a larger reg if needed.
  if (InsertIntoSubReg) {
    // Generate a SUBREG_TO_REG.
    auto SubToReg = MIB.buildInstr(Opc: TargetOpcode::SUBREG_TO_REG, DstOps: {Dst}, SrcOps: {})
                        .addUse(RegNo: LdMI.getReg(Idx: 1))
                        .addImm(Val: InsertIntoSubReg);
    RBI.constrainGenericRegister(
        Reg: SubToReg.getReg(Idx: 0),
        RC: *getRegClassForTypeOnBank(Ty: MRI.getType(Reg: Dst),
                                   RB: *RBI.getRegBank(Reg: Dst, MRI, TRI)),
        MRI);
  } else {
    // Load result already has the destination's width; a plain copy suffices.
    auto Copy = MIB.buildCopy(Res: Dst, Op: LdMI.getReg(Idx: 1));
    selectCopy(I&: *Copy, TII, MRI, TRI, RBI);
  }
  MI.eraseFromParent();

  return true;
}
5551
5552bool AArch64InstructionSelector::selectIndexedLoad(MachineInstr &MI,
5553 MachineRegisterInfo &MRI) {
5554 auto &Ld = cast<GIndexedLoad>(Val&: MI);
5555 Register Dst = Ld.getDstReg();
5556 Register WriteBack = Ld.getWritebackReg();
5557 Register Base = Ld.getBaseReg();
5558 Register Offset = Ld.getOffsetReg();
5559 assert(MRI.getType(Dst).getSizeInBits() <= 128 &&
5560 "Unexpected type for indexed load");
5561 unsigned MemSize = Ld.getMMO().getMemoryType().getSizeInBytes();
5562
5563 if (MemSize < MRI.getType(Reg: Dst).getSizeInBytes())
5564 return selectIndexedExtLoad(MI, MRI);
5565
5566 unsigned Opc = 0;
5567 if (Ld.isPre()) {
5568 static constexpr unsigned GPROpcodes[] = {
5569 AArch64::LDRBBpre, AArch64::LDRHHpre, AArch64::LDRWpre,
5570 AArch64::LDRXpre};
5571 static constexpr unsigned FPROpcodes[] = {
5572 AArch64::LDRBpre, AArch64::LDRHpre, AArch64::LDRSpre, AArch64::LDRDpre,
5573 AArch64::LDRQpre};
5574 Opc = (RBI.getRegBank(Reg: Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5575 ? FPROpcodes[Log2_32(Value: MemSize)]
5576 : GPROpcodes[Log2_32(Value: MemSize)];
5577 ;
5578 } else {
5579 static constexpr unsigned GPROpcodes[] = {
5580 AArch64::LDRBBpost, AArch64::LDRHHpost, AArch64::LDRWpost,
5581 AArch64::LDRXpost};
5582 static constexpr unsigned FPROpcodes[] = {
5583 AArch64::LDRBpost, AArch64::LDRHpost, AArch64::LDRSpost,
5584 AArch64::LDRDpost, AArch64::LDRQpost};
5585 Opc = (RBI.getRegBank(Reg: Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5586 ? FPROpcodes[Log2_32(Value: MemSize)]
5587 : GPROpcodes[Log2_32(Value: MemSize)];
5588 ;
5589 }
5590 auto Cst = getIConstantVRegVal(VReg: Offset, MRI);
5591 if (!Cst)
5592 return false; // Shouldn't happen, but just in case.
5593 auto LdMI =
5594 MIB.buildInstr(Opc, DstOps: {WriteBack, Dst}, SrcOps: {Base}).addImm(Val: Cst->getSExtValue());
5595 LdMI.cloneMemRefs(OtherMI: Ld);
5596 constrainSelectedInstRegOperands(I&: *LdMI, TII, TRI, RBI);
5597 MI.eraseFromParent();
5598 return true;
5599}
5600
5601bool AArch64InstructionSelector::selectIndexedStore(GIndexedStore &I,
5602 MachineRegisterInfo &MRI) {
5603 Register Dst = I.getWritebackReg();
5604 Register Val = I.getValueReg();
5605 Register Base = I.getBaseReg();
5606 Register Offset = I.getOffsetReg();
5607 assert(MRI.getType(Val).getSizeInBits() <= 128 &&
5608 "Unexpected type for indexed store");
5609
5610 LocationSize MemSize = I.getMMO().getSize();
5611 unsigned MemSizeInBytes = MemSize.getValue();
5612
5613 assert(MemSizeInBytes && MemSizeInBytes <= 16 &&
5614 "Unexpected indexed store size");
5615 unsigned MemSizeLog2 = Log2_32(Value: MemSizeInBytes);
5616
5617 unsigned Opc = 0;
5618 if (I.isPre()) {
5619 static constexpr unsigned GPROpcodes[] = {
5620 AArch64::STRBBpre, AArch64::STRHHpre, AArch64::STRWpre,
5621 AArch64::STRXpre};
5622 static constexpr unsigned FPROpcodes[] = {
5623 AArch64::STRBpre, AArch64::STRHpre, AArch64::STRSpre, AArch64::STRDpre,
5624 AArch64::STRQpre};
5625
5626 if (RBI.getRegBank(Reg: Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5627 Opc = FPROpcodes[MemSizeLog2];
5628 else
5629 Opc = GPROpcodes[MemSizeLog2];
5630 } else {
5631 static constexpr unsigned GPROpcodes[] = {
5632 AArch64::STRBBpost, AArch64::STRHHpost, AArch64::STRWpost,
5633 AArch64::STRXpost};
5634 static constexpr unsigned FPROpcodes[] = {
5635 AArch64::STRBpost, AArch64::STRHpost, AArch64::STRSpost,
5636 AArch64::STRDpost, AArch64::STRQpost};
5637
5638 if (RBI.getRegBank(Reg: Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5639 Opc = FPROpcodes[MemSizeLog2];
5640 else
5641 Opc = GPROpcodes[MemSizeLog2];
5642 }
5643
5644 auto Cst = getIConstantVRegVal(VReg: Offset, MRI);
5645 if (!Cst)
5646 return false; // Shouldn't happen, but just in case.
5647 auto Str =
5648 MIB.buildInstr(Opc, DstOps: {Dst}, SrcOps: {Val, Base}).addImm(Val: Cst->getSExtValue());
5649 Str.cloneMemRefs(OtherMI: I);
5650 constrainSelectedInstRegOperands(I&: *Str, TII, TRI, RBI);
5651 I.eraseFromParent();
5652 return true;
5653}
5654
// Materialize the vector constant CV into Dst (a 64- or 128-bit vector).
// Tries, in order: a zeroing MOVI, the AdvSIMD modified-immediate encodings
// of the splat bit pattern (direct and inverted), the same encodings for an
// FNEG'd variant of the constant, and finally a constant-pool load. Returns
// the instruction defining Dst, or nullptr on failure.
MachineInstr *
AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
                                               MachineIRBuilder &MIRBuilder,
                                               MachineRegisterInfo &MRI) {
  LLT DstTy = MRI.getType(Reg: Dst);
  unsigned DstSize = DstTy.getSizeInBits();
  assert((DstSize == 64 || DstSize == 128) &&
         "Unexpected vector constant size");

  // All-zeros: a single MOVI (2x64-bit element form, immediate 0) suffices.
  if (CV->isNullValue()) {
    if (DstSize == 128) {
      auto Mov =
          MIRBuilder.buildInstr(Opc: AArch64::MOVIv2d_ns, DstOps: {Dst}, SrcOps: {}).addImm(Val: 0);
      constrainSelectedInstRegOperands(I&: *Mov, TII, TRI, RBI);
      return &*Mov;
    }

    if (DstSize == 64) {
      // MOVIv2d_ns writes a full 128-bit register: zero a Q register and
      // copy out the low 64 bits via dsub.
      auto Mov =
          MIRBuilder
              .buildInstr(Opc: AArch64::MOVIv2d_ns, DstOps: {&AArch64::FPR128RegClass}, SrcOps: {})
              .addImm(Val: 0);
      auto Copy = MIRBuilder.buildInstr(Opc: TargetOpcode::COPY, DstOps: {Dst}, SrcOps: {})
                      .addReg(RegNo: Mov.getReg(Idx: 0), Flags: {}, SubReg: AArch64::dsub);
      RBI.constrainGenericRegister(Reg: Dst, RC: AArch64::FPR64RegClass, MRI);
      return &*Copy;
    }
  }

  // Splat constants may fit an AdvSIMD modified-immediate encoding.
  if (Constant *SplatValue = CV->getSplatValue()) {
    // View the splat element as raw bits (bitcast FP elements to integers).
    APInt SplatValueAsInt =
        isa<ConstantFP>(Val: SplatValue)
            ? cast<ConstantFP>(Val: SplatValue)->getValueAPF().bitcastToAPInt()
            : SplatValue->getUniqueInteger();
    // Replicate the element bits across the full vector width.
    APInt DefBits = APInt::getSplat(
        NewLen: DstSize, V: SplatValueAsInt.trunc(width: DstTy.getScalarSizeInBits()));
    // Try each modified-immediate flavour for the given bit pattern, then
    // retry with the bits inverted (the MVNI-style encodings).
    auto TryMOVIWithBits = [&](APInt DefBits) -> MachineInstr * {
      MachineInstr *NewOp;
      bool Inv = false;
      if ((NewOp = tryAdvSIMDModImm64(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder)) ||
          (NewOp =
               tryAdvSIMDModImm32(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
          (NewOp =
               tryAdvSIMDModImm321s(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
          (NewOp =
               tryAdvSIMDModImm16(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
          (NewOp = tryAdvSIMDModImm8(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder)) ||
          (NewOp = tryAdvSIMDModImmFP(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder)))
        return NewOp;

      // Inverted pattern: only the 32/321s/16 forms take the Inv flag.
      DefBits = ~DefBits;
      Inv = true;
      if ((NewOp =
               tryAdvSIMDModImm32(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
          (NewOp =
               tryAdvSIMDModImm321s(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
          (NewOp = tryAdvSIMDModImm16(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)))
        return NewOp;
      return nullptr;
    };

    if (auto *NewOp = TryMOVIWithBits(DefBits))
      return NewOp;

    // See if a fneg of the constant can be materialized with a MOVI, etc
    auto TryWithFNeg = [&](APInt DefBits, int NumBits,
                           unsigned NegOpc) -> MachineInstr * {
      // FNegate each sub-element of the constant
      // (i.e. flip the sign bit of every NumBits-wide lane).
      APInt Neg = APInt::getHighBitsSet(numBits: NumBits, hiBitsSet: 1).zext(width: DstSize);
      APInt NegBits(DstSize, 0);
      unsigned NumElts = DstSize / NumBits;
      for (unsigned i = 0; i < NumElts; i++)
        NegBits |= Neg << (NumBits * i);
      NegBits = DefBits ^ NegBits;

      // Try to create the new constants with MOVI, and if so generate a fneg
      // for it.
      if (auto *NewOp = TryMOVIWithBits(NegBits)) {
        // Redirect the MOVI into a fresh register and FNEG that into Dst.
        Register NewDst = MRI.createVirtualRegister(
            RegClass: DstSize == 64 ? &AArch64::FPR64RegClass : &AArch64::FPR128RegClass);
        NewOp->getOperand(i: 0).setReg(NewDst);
        return MIRBuilder.buildInstr(Opc: NegOpc, DstOps: {Dst}, SrcOps: {NewDst});
      }
      return nullptr;
    };
    MachineInstr *R;
    if ((R = TryWithFNeg(DefBits, 32,
                         DstSize == 64 ? AArch64::FNEGv2f32
                                       : AArch64::FNEGv4f32)) ||
        (R = TryWithFNeg(DefBits, 64,
                         DstSize == 64 ? AArch64::FNEGDr
                                       : AArch64::FNEGv2f64)) ||
        (STI.hasFullFP16() &&
         (R = TryWithFNeg(DefBits, 16,
                          DstSize == 64 ? AArch64::FNEGv4f16
                                        : AArch64::FNEGv8f16))))
      return R;
  }

  // No immediate form matched; fall back to a constant-pool load.
  auto *CPLoad = emitLoadFromConstantPool(CPVal: CV, MIRBuilder);
  if (!CPLoad) {
    LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!");
    return nullptr;
  }

  auto Copy = MIRBuilder.buildCopy(Res: Dst, Op: CPLoad->getOperand(i: 0));
  RBI.constrainGenericRegister(
      Reg: Dst, RC: *MRI.getRegClass(Reg: CPLoad->getOperand(i: 0).getReg()), MRI);
  return &*Copy;
}
5765
5766bool AArch64InstructionSelector::tryOptConstantBuildVec(
5767 MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) {
5768 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5769 unsigned DstSize = DstTy.getSizeInBits();
5770 assert(DstSize <= 128 && "Unexpected build_vec type!");
5771 if (DstSize < 32)
5772 return false;
5773 // Check if we're building a constant vector, in which case we want to
5774 // generate a constant pool load instead of a vector insert sequence.
5775 SmallVector<Constant *, 16> Csts;
5776 for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
5777 Register OpReg = I.getOperand(i: Idx).getReg();
5778 if (auto AnyConst = getAnyConstantVRegValWithLookThrough(
5779 VReg: OpReg, MRI, /*LookThroughInstrs=*/true,
5780 /*LookThroughAnyExt=*/true)) {
5781 MachineInstr *DefMI = MRI.getVRegDef(Reg: AnyConst->VReg);
5782
5783 if (DefMI->getOpcode() == TargetOpcode::G_CONSTANT) {
5784 Csts.emplace_back(
5785 Args: ConstantInt::get(Context&: MIB.getMF().getFunction().getContext(),
5786 V: std::move(AnyConst->Value)));
5787 continue;
5788 }
5789
5790 if (DefMI->getOpcode() == TargetOpcode::G_FCONSTANT) {
5791 Csts.emplace_back(
5792 Args: const_cast<ConstantFP *>(DefMI->getOperand(i: 1).getFPImm()));
5793 continue;
5794 }
5795 }
5796 return false;
5797 }
5798 Constant *CV = ConstantVector::get(V: Csts);
5799 if (!emitConstantVector(Dst: I.getOperand(i: 0).getReg(), CV, MIRBuilder&: MIB, MRI))
5800 return false;
5801 I.eraseFromParent();
5802 return true;
5803}
5804
5805bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg(
5806 MachineInstr &I, MachineRegisterInfo &MRI) {
5807 // Given:
5808 // %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef
5809 //
5810 // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt.
5811 Register Dst = I.getOperand(i: 0).getReg();
5812 Register EltReg = I.getOperand(i: 1).getReg();
5813 LLT EltTy = MRI.getType(Reg: EltReg);
5814 // If the index isn't on the same bank as its elements, then this can't be a
5815 // SUBREG_TO_REG.
5816 const RegisterBank &EltRB = *RBI.getRegBank(Reg: EltReg, MRI, TRI);
5817 const RegisterBank &DstRB = *RBI.getRegBank(Reg: Dst, MRI, TRI);
5818 if (EltRB != DstRB)
5819 return false;
5820 if (any_of(Range: drop_begin(RangeOrContainer: I.operands(), N: 2), P: [&MRI](const MachineOperand &Op) {
5821 return !getOpcodeDef(Opcode: TargetOpcode::G_IMPLICIT_DEF, Reg: Op.getReg(), MRI);
5822 }))
5823 return false;
5824 unsigned SubReg;
5825 const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(Ty: EltTy, RB: EltRB);
5826 if (!EltRC)
5827 return false;
5828 const TargetRegisterClass *DstRC =
5829 getRegClassForTypeOnBank(Ty: MRI.getType(Reg: Dst), RB: DstRB);
5830 if (!DstRC)
5831 return false;
5832 if (!getSubRegForClass(RC: EltRC, TRI, SubReg))
5833 return false;
5834 auto SubregToReg = MIB.buildInstr(Opc: AArch64::SUBREG_TO_REG, DstOps: {Dst}, SrcOps: {})
5835 .addUse(RegNo: EltReg)
5836 .addImm(Val: SubReg);
5837 I.eraseFromParent();
5838 constrainSelectedInstRegOperands(I&: *SubregToReg, TII, TRI, RBI);
5839 return RBI.constrainGenericRegister(Reg: Dst, RC: *DstRC, MRI);
5840}
5841
// Select a G_BUILD_VECTOR: try the constant-vector and single-element
// fast paths first, otherwise build the vector with a scalar-to-vector
// move followed by one lane insert per non-undef element.
bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
                                                   MachineRegisterInfo &MRI) {
  assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
  // Until we port more of the optimized selections, for now just use a vector
  // insert sequence.
  const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
  const LLT EltTy = MRI.getType(Reg: I.getOperand(i: 1).getReg());
  unsigned EltSize = EltTy.getSizeInBits();

  // Fast paths: an all-constant vector, or one real element plus undefs.
  if (tryOptConstantBuildVec(I, DstTy, MRI))
    return true;
  if (tryOptBuildVecToSubregToReg(I, MRI))
    return true;

  if (EltSize != 8 && EltSize != 16 && EltSize != 32 && EltSize != 64)
    return false; // Don't support all element types yet.
  const RegisterBank &RB = *RBI.getRegBank(Reg: I.getOperand(i: 1).getReg(), MRI, TRI);

  // Move the first element into a 128-bit vector register.
  const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
  MachineInstr *ScalarToVec =
      emitScalarToVector(EltSize: DstTy.getElementType().getSizeInBits(), DstRC,
                         Scalar: I.getOperand(i: 1).getReg(), MIRBuilder&: MIB);
  if (!ScalarToVec)
    return false;

  Register DstVec = ScalarToVec->getOperand(i: 0).getReg();
  unsigned DstSize = DstTy.getSizeInBits();

  // Keep track of the last MI we inserted. Later on, we might be able to save
  // a copy using it.
  MachineInstr *PrevMI = ScalarToVec;
  // Insert each remaining element into lane i-1 of the running vector.
  for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
    // Note that if we don't do a subregister copy, we can end up making an
    // extra register.
    Register OpReg = I.getOperand(i).getReg();
    // Do not emit inserts for undefs
    if (!getOpcodeDef<GImplicitDef>(Reg: OpReg, MRI)) {
      PrevMI = &*emitLaneInsert(DstReg: std::nullopt, SrcReg: DstVec, EltReg: OpReg, LaneIdx: i - 1, RB, MIRBuilder&: MIB);
      DstVec = PrevMI->getOperand(i: 0).getReg();
    }
  }

  // If DstTy's size in bits is less than 128, then emit a subregister copy
  // from DstVec to the last register we've defined.
  if (DstSize < 128) {
    // Force this to be FPR using the destination vector.
    const TargetRegisterClass *RC =
        getRegClassForTypeOnBank(Ty: DstTy, RB: *RBI.getRegBank(Reg: DstVec, MRI, TRI));
    if (!RC)
      return false;
    if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
      LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
      return false;
    }

    unsigned SubReg = 0;
    if (!getSubRegForClass(RC, TRI, SubReg))
      return false;
    if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
      LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
                        << "\n");
      return false;
    }

    Register Reg = MRI.createVirtualRegister(RegClass: RC);
    Register DstReg = I.getOperand(i: 0).getReg();

    // Copy the relevant low lanes of the built vector into the destination.
    MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {DstReg}, SrcOps: {}).addReg(RegNo: DstVec, Flags: {}, SubReg);
    // Repoint operand 1 at a fresh register before erasing I — presumably so
    // erasing I doesn't disturb uses of the original element register; TODO
    // confirm the exact motivation.
    MachineOperand &RegOp = I.getOperand(i: 1);
    RegOp.setReg(Reg);
    RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI);
  } else {
    // We either have a vector with all elements (except the first one) undef or
    // at least one non-undef non-first element. In the first case, we need to
    // constrain the output register ourselves as we may have generated an
    // INSERT_SUBREG operation which is a generic operation for which the
    // output regclass cannot be automatically chosen.
    //
    // In the second case, there is no need to do this as it may generate an
    // instruction like INSvi32gpr where the regclass can be automatically
    // chosen.
    //
    // Also, we save a copy by re-using the destination register on the final
    // insert.
    PrevMI->getOperand(i: 0).setReg(I.getOperand(i: 0).getReg());
    constrainSelectedInstRegOperands(I&: *PrevMI, TII, TRI, RBI);

    Register DstReg = PrevMI->getOperand(i: 0).getReg();
    if (PrevMI == ScalarToVec && DstReg.isVirtual()) {
      const TargetRegisterClass *RC =
          getRegClassForTypeOnBank(Ty: DstTy, RB: *RBI.getRegBank(Reg: DstVec, MRI, TRI));
      RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI);
    }
  }

  I.eraseFromParent();
  return true;
}
5940
5941bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
5942 unsigned NumVecs,
5943 MachineInstr &I) {
5944 assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
5945 assert(Opc && "Expected an opcode?");
5946 assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
5947 auto &MRI = *MIB.getMRI();
5948 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
5949 unsigned Size = Ty.getSizeInBits();
5950 assert((Size == 64 || Size == 128) &&
5951 "Destination must be 64 bits or 128 bits?");
5952 unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0;
5953 auto Ptr = I.getOperand(i: I.getNumOperands() - 1).getReg();
5954 assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?");
5955 auto Load = MIB.buildInstr(Opc, DstOps: {Ty}, SrcOps: {Ptr});
5956 Load.cloneMemRefs(OtherMI: I);
5957 constrainSelectedInstRegOperands(I&: *Load, TII, TRI, RBI);
5958 Register SelectedLoadDst = Load->getOperand(i: 0).getReg();
5959 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
5960 auto Vec = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {I.getOperand(i: Idx)}, SrcOps: {})
5961 .addReg(RegNo: SelectedLoadDst, Flags: {}, SubReg: SubReg + Idx);
5962 // Emit the subreg copies and immediately select them.
5963 // FIXME: We should refactor our copy code into an emitCopy helper and
5964 // clean up uses of this pattern elsewhere in the selector.
5965 selectCopy(I&: *Vec, TII, MRI, TRI, RBI);
5966 }
5967 return true;
5968}
5969
// Select a NEON ldN-lane intrinsic: form a Q-register tuple from the source
// vectors, emit the lane load, then unpack the results. 64-bit vectors are
// widened to 128 bits for the tuple and narrowed back afterwards.
bool AArch64InstructionSelector::selectVectorLoadLaneIntrinsic(
    unsigned Opc, unsigned NumVecs, MachineInstr &I) {
  assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
  assert(Opc && "Expected an opcode?");
  assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
  auto &MRI = *MIB.getMRI();
  LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
  bool Narrow = Ty.getSizeInBits() == 64;

  // Source vectors start after the NumVecs defs and the intrinsic ID; the
  // lane index and pointer operands follow them.
  auto FirstSrcRegIt = I.operands_begin() + NumVecs + 1;
  SmallVector<Register, 4> Regs(NumVecs);
  std::transform(first: FirstSrcRegIt, last: FirstSrcRegIt + NumVecs, result: Regs.begin(),
                 unary_op: [](auto MO) { return MO.getReg(); });

  if (Narrow) {
    // The lane instructions operate on 128-bit registers; widen each source
    // and adjust the result type to match.
    transform(Range&: Regs, d_first: Regs.begin(), F: [this](Register Reg) {
      return emitScalarToVector(EltSize: 64, DstRC: &AArch64::FPR128RegClass, Scalar: Reg, MIRBuilder&: MIB)
          ->getOperand(i: 0)
          .getReg();
    });
    Ty = Ty.multiplyElements(Factor: 2);
  }

  Register Tuple = createQTuple(Regs, MIB);
  // The lane index must be a compile-time constant.
  auto LaneNo = getIConstantVRegVal(VReg: (FirstSrcRegIt + NumVecs)->getReg(), MRI);
  if (!LaneNo)
    return false;

  Register Ptr = (FirstSrcRegIt + NumVecs + 1)->getReg();
  auto Load = MIB.buildInstr(Opc, DstOps: {Ty}, SrcOps: {})
                  .addReg(RegNo: Tuple)
                  .addImm(Val: LaneNo->getZExtValue())
                  .addReg(RegNo: Ptr);
  Load.cloneMemRefs(OtherMI: I);
  constrainSelectedInstRegOperands(I&: *Load, TII, TRI, RBI);
  Register SelectedLoadDst = Load->getOperand(i: 0).getReg();
  unsigned SubReg = AArch64::qsub0;
  // Unpack each tuple element. In the narrow case, copy into a temporary
  // 128-bit register first and extract the low half into the destination.
  for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
    auto Vec = MIB.buildInstr(Opc: TargetOpcode::COPY,
                              DstOps: {Narrow ? DstOp(&AArch64::FPR128RegClass)
                                      : DstOp(I.getOperand(i: Idx).getReg())},
                              SrcOps: {})
                   .addReg(RegNo: SelectedLoadDst, Flags: {}, SubReg: SubReg + Idx);
    Register WideReg = Vec.getReg(Idx: 0);
    // Emit the subreg copies and immediately select them.
    selectCopy(I&: *Vec, TII, MRI, TRI, RBI);
    if (Narrow &&
        !emitNarrowVector(DstReg: I.getOperand(i: Idx).getReg(), SrcReg: WideReg, MIB, MRI))
      return false;
  }
  return true;
}
6022
6023void AArch64InstructionSelector::selectVectorStoreIntrinsic(MachineInstr &I,
6024 unsigned NumVecs,
6025 unsigned Opc) {
6026 MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
6027 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6028 Register Ptr = I.getOperand(i: 1 + NumVecs).getReg();
6029
6030 SmallVector<Register, 2> Regs(NumVecs);
6031 std::transform(first: I.operands_begin() + 1, last: I.operands_begin() + 1 + NumVecs,
6032 result: Regs.begin(), unary_op: [](auto MO) { return MO.getReg(); });
6033
6034 Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB)
6035 : createDTuple(Regs, MIB);
6036 auto Store = MIB.buildInstr(Opc, DstOps: {}, SrcOps: {Tuple, Ptr});
6037 Store.cloneMemRefs(OtherMI: I);
6038 constrainSelectedInstRegOperands(I&: *Store, TII, TRI, RBI);
6039}
6040
6041bool AArch64InstructionSelector::selectVectorStoreLaneIntrinsic(
6042 MachineInstr &I, unsigned NumVecs, unsigned Opc) {
6043 MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
6044 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6045 bool Narrow = Ty.getSizeInBits() == 64;
6046
6047 SmallVector<Register, 2> Regs(NumVecs);
6048 std::transform(first: I.operands_begin() + 1, last: I.operands_begin() + 1 + NumVecs,
6049 result: Regs.begin(), unary_op: [](auto MO) { return MO.getReg(); });
6050
6051 if (Narrow)
6052 transform(Range&: Regs, d_first: Regs.begin(), F: [this](Register Reg) {
6053 return emitScalarToVector(EltSize: 64, DstRC: &AArch64::FPR128RegClass, Scalar: Reg, MIRBuilder&: MIB)
6054 ->getOperand(i: 0)
6055 .getReg();
6056 });
6057
6058 Register Tuple = createQTuple(Regs, MIB);
6059
6060 auto LaneNo = getIConstantVRegVal(VReg: I.getOperand(i: 1 + NumVecs).getReg(), MRI);
6061 if (!LaneNo)
6062 return false;
6063 Register Ptr = I.getOperand(i: 1 + NumVecs + 1).getReg();
6064 auto Store = MIB.buildInstr(Opc, DstOps: {}, SrcOps: {})
6065 .addReg(RegNo: Tuple)
6066 .addImm(Val: LaneNo->getZExtValue())
6067 .addReg(RegNo: Ptr);
6068 Store.cloneMemRefs(OtherMI: I);
6069 constrainSelectedInstRegOperands(I&: *Store, TII, TRI, RBI);
6070 return true;
6071}
6072
6073bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
6074 MachineInstr &I, MachineRegisterInfo &MRI) {
6075 // Find the intrinsic ID.
6076 unsigned IntrinID = cast<GIntrinsic>(Val&: I).getIntrinsicID();
6077
6078 const LLT S8 = LLT::scalar(SizeInBits: 8);
6079 const LLT S16 = LLT::scalar(SizeInBits: 16);
6080 const LLT S32 = LLT::scalar(SizeInBits: 32);
6081 const LLT S64 = LLT::scalar(SizeInBits: 64);
6082 const LLT P0 = LLT::pointer(AddressSpace: 0, SizeInBits: 64);
6083 // Select the instruction.
6084 switch (IntrinID) {
6085 default:
6086 return false;
6087 case Intrinsic::aarch64_ldxp:
6088 case Intrinsic::aarch64_ldaxp: {
6089 auto NewI = MIB.buildInstr(
6090 Opc: IntrinID == Intrinsic::aarch64_ldxp ? AArch64::LDXPX : AArch64::LDAXPX,
6091 DstOps: {I.getOperand(i: 0).getReg(), I.getOperand(i: 1).getReg()},
6092 SrcOps: {I.getOperand(i: 3)});
6093 NewI.cloneMemRefs(OtherMI: I);
6094 constrainSelectedInstRegOperands(I&: *NewI, TII, TRI, RBI);
6095 break;
6096 }
6097 case Intrinsic::aarch64_neon_ld1x2: {
6098 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6099 unsigned Opc = 0;
6100 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6101 Opc = AArch64::LD1Twov8b;
6102 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6103 Opc = AArch64::LD1Twov16b;
6104 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6105 Opc = AArch64::LD1Twov4h;
6106 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6107 Opc = AArch64::LD1Twov8h;
6108 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6109 Opc = AArch64::LD1Twov2s;
6110 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6111 Opc = AArch64::LD1Twov4s;
6112 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6113 Opc = AArch64::LD1Twov2d;
6114 else if (Ty == S64 || Ty == P0)
6115 Opc = AArch64::LD1Twov1d;
6116 else
6117 llvm_unreachable("Unexpected type for ld1x2!");
6118 selectVectorLoadIntrinsic(Opc, NumVecs: 2, I);
6119 break;
6120 }
6121 case Intrinsic::aarch64_neon_ld1x3: {
6122 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6123 unsigned Opc = 0;
6124 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6125 Opc = AArch64::LD1Threev8b;
6126 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6127 Opc = AArch64::LD1Threev16b;
6128 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6129 Opc = AArch64::LD1Threev4h;
6130 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6131 Opc = AArch64::LD1Threev8h;
6132 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6133 Opc = AArch64::LD1Threev2s;
6134 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6135 Opc = AArch64::LD1Threev4s;
6136 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6137 Opc = AArch64::LD1Threev2d;
6138 else if (Ty == S64 || Ty == P0)
6139 Opc = AArch64::LD1Threev1d;
6140 else
6141 llvm_unreachable("Unexpected type for ld1x3!");
6142 selectVectorLoadIntrinsic(Opc, NumVecs: 3, I);
6143 break;
6144 }
6145 case Intrinsic::aarch64_neon_ld1x4: {
6146 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6147 unsigned Opc = 0;
6148 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6149 Opc = AArch64::LD1Fourv8b;
6150 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6151 Opc = AArch64::LD1Fourv16b;
6152 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6153 Opc = AArch64::LD1Fourv4h;
6154 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6155 Opc = AArch64::LD1Fourv8h;
6156 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6157 Opc = AArch64::LD1Fourv2s;
6158 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6159 Opc = AArch64::LD1Fourv4s;
6160 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6161 Opc = AArch64::LD1Fourv2d;
6162 else if (Ty == S64 || Ty == P0)
6163 Opc = AArch64::LD1Fourv1d;
6164 else
6165 llvm_unreachable("Unexpected type for ld1x4!");
6166 selectVectorLoadIntrinsic(Opc, NumVecs: 4, I);
6167 break;
6168 }
6169 case Intrinsic::aarch64_neon_ld2: {
6170 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6171 unsigned Opc = 0;
6172 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6173 Opc = AArch64::LD2Twov8b;
6174 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6175 Opc = AArch64::LD2Twov16b;
6176 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6177 Opc = AArch64::LD2Twov4h;
6178 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6179 Opc = AArch64::LD2Twov8h;
6180 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6181 Opc = AArch64::LD2Twov2s;
6182 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6183 Opc = AArch64::LD2Twov4s;
6184 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6185 Opc = AArch64::LD2Twov2d;
6186 else if (Ty == S64 || Ty == P0)
6187 Opc = AArch64::LD1Twov1d;
6188 else
6189 llvm_unreachable("Unexpected type for ld2!");
6190 selectVectorLoadIntrinsic(Opc, NumVecs: 2, I);
6191 break;
6192 }
6193 case Intrinsic::aarch64_neon_ld2lane: {
6194 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6195 unsigned Opc;
6196 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6197 Opc = AArch64::LD2i8;
6198 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6199 Opc = AArch64::LD2i16;
6200 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6201 Opc = AArch64::LD2i32;
6202 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6203 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6204 Opc = AArch64::LD2i64;
6205 else
6206 llvm_unreachable("Unexpected type for st2lane!");
6207 if (!selectVectorLoadLaneIntrinsic(Opc, NumVecs: 2, I))
6208 return false;
6209 break;
6210 }
6211 case Intrinsic::aarch64_neon_ld2r: {
6212 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6213 unsigned Opc = 0;
6214 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6215 Opc = AArch64::LD2Rv8b;
6216 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6217 Opc = AArch64::LD2Rv16b;
6218 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6219 Opc = AArch64::LD2Rv4h;
6220 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6221 Opc = AArch64::LD2Rv8h;
6222 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6223 Opc = AArch64::LD2Rv2s;
6224 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6225 Opc = AArch64::LD2Rv4s;
6226 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6227 Opc = AArch64::LD2Rv2d;
6228 else if (Ty == S64 || Ty == P0)
6229 Opc = AArch64::LD2Rv1d;
6230 else
6231 llvm_unreachable("Unexpected type for ld2r!");
6232 selectVectorLoadIntrinsic(Opc, NumVecs: 2, I);
6233 break;
6234 }
6235 case Intrinsic::aarch64_neon_ld3: {
6236 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6237 unsigned Opc = 0;
6238 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6239 Opc = AArch64::LD3Threev8b;
6240 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6241 Opc = AArch64::LD3Threev16b;
6242 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6243 Opc = AArch64::LD3Threev4h;
6244 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6245 Opc = AArch64::LD3Threev8h;
6246 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6247 Opc = AArch64::LD3Threev2s;
6248 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6249 Opc = AArch64::LD3Threev4s;
6250 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6251 Opc = AArch64::LD3Threev2d;
6252 else if (Ty == S64 || Ty == P0)
6253 Opc = AArch64::LD1Threev1d;
6254 else
6255 llvm_unreachable("Unexpected type for ld3!");
6256 selectVectorLoadIntrinsic(Opc, NumVecs: 3, I);
6257 break;
6258 }
6259 case Intrinsic::aarch64_neon_ld3lane: {
6260 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6261 unsigned Opc;
6262 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6263 Opc = AArch64::LD3i8;
6264 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6265 Opc = AArch64::LD3i16;
6266 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6267 Opc = AArch64::LD3i32;
6268 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6269 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6270 Opc = AArch64::LD3i64;
6271 else
6272 llvm_unreachable("Unexpected type for st3lane!");
6273 if (!selectVectorLoadLaneIntrinsic(Opc, NumVecs: 3, I))
6274 return false;
6275 break;
6276 }
6277 case Intrinsic::aarch64_neon_ld3r: {
6278 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6279 unsigned Opc = 0;
6280 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6281 Opc = AArch64::LD3Rv8b;
6282 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6283 Opc = AArch64::LD3Rv16b;
6284 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6285 Opc = AArch64::LD3Rv4h;
6286 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6287 Opc = AArch64::LD3Rv8h;
6288 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6289 Opc = AArch64::LD3Rv2s;
6290 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6291 Opc = AArch64::LD3Rv4s;
6292 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6293 Opc = AArch64::LD3Rv2d;
6294 else if (Ty == S64 || Ty == P0)
6295 Opc = AArch64::LD3Rv1d;
6296 else
6297 llvm_unreachable("Unexpected type for ld3r!");
6298 selectVectorLoadIntrinsic(Opc, NumVecs: 3, I);
6299 break;
6300 }
6301 case Intrinsic::aarch64_neon_ld4: {
6302 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6303 unsigned Opc = 0;
6304 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6305 Opc = AArch64::LD4Fourv8b;
6306 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6307 Opc = AArch64::LD4Fourv16b;
6308 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6309 Opc = AArch64::LD4Fourv4h;
6310 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6311 Opc = AArch64::LD4Fourv8h;
6312 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6313 Opc = AArch64::LD4Fourv2s;
6314 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6315 Opc = AArch64::LD4Fourv4s;
6316 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6317 Opc = AArch64::LD4Fourv2d;
6318 else if (Ty == S64 || Ty == P0)
6319 Opc = AArch64::LD1Fourv1d;
6320 else
6321 llvm_unreachable("Unexpected type for ld4!");
6322 selectVectorLoadIntrinsic(Opc, NumVecs: 4, I);
6323 break;
6324 }
6325 case Intrinsic::aarch64_neon_ld4lane: {
6326 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6327 unsigned Opc;
6328 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6329 Opc = AArch64::LD4i8;
6330 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6331 Opc = AArch64::LD4i16;
6332 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6333 Opc = AArch64::LD4i32;
6334 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6335 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6336 Opc = AArch64::LD4i64;
6337 else
6338 llvm_unreachable("Unexpected type for st4lane!");
6339 if (!selectVectorLoadLaneIntrinsic(Opc, NumVecs: 4, I))
6340 return false;
6341 break;
6342 }
  case Intrinsic::aarch64_neon_ld4r: {
    // Map the destination vector type (operand 0) to the matching LD4R
    // opcode, then emit it via the common 4-vector load helper.
    LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
    unsigned Opc = 0;
    if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
      Opc = AArch64::LD4Rv8b;
    else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
      Opc = AArch64::LD4Rv16b;
    else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
      Opc = AArch64::LD4Rv4h;
    else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
      Opc = AArch64::LD4Rv8h;
    else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
      Opc = AArch64::LD4Rv2s;
    else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
      Opc = AArch64::LD4Rv4s;
    else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
      Opc = AArch64::LD4Rv2d;
    else if (Ty == S64 || Ty == P0)
      // Scalar 64-bit (or pointer) results use the single-d-register form.
      Opc = AArch64::LD4Rv1d;
    else
      llvm_unreachable("Unexpected type for ld4r!");
    selectVectorLoadIntrinsic(Opc, NumVecs: 4, I);
    break;
  }
  case Intrinsic::aarch64_neon_st1x2: {
    // For stores, operand 1 is the first value to be stored; its type
    // selects among the D-form, Q-form, and scalar ST1 two-register opcodes.
    LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
    unsigned Opc;
    if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
      Opc = AArch64::ST1Twov8b;
    else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
      Opc = AArch64::ST1Twov16b;
    else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
      Opc = AArch64::ST1Twov4h;
    else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
      Opc = AArch64::ST1Twov8h;
    else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
      Opc = AArch64::ST1Twov2s;
    else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
      Opc = AArch64::ST1Twov4s;
    else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
      Opc = AArch64::ST1Twov2d;
    else if (Ty == S64 || Ty == P0)
      // Scalar 64-bit (or pointer) elements use the single-d-register form.
      Opc = AArch64::ST1Twov1d;
    else
      llvm_unreachable("Unexpected type for st1x2!");
    selectVectorStoreIntrinsic(I, NumVecs: 2, Opc);
    break;
  }
  case Intrinsic::aarch64_neon_st1x3: {
    // Same selection scheme as st1x2, but with the three-register ST1 forms.
    LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
    unsigned Opc;
    if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
      Opc = AArch64::ST1Threev8b;
    else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
      Opc = AArch64::ST1Threev16b;
    else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
      Opc = AArch64::ST1Threev4h;
    else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
      Opc = AArch64::ST1Threev8h;
    else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
      Opc = AArch64::ST1Threev2s;
    else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
      Opc = AArch64::ST1Threev4s;
    else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
      Opc = AArch64::ST1Threev2d;
    else if (Ty == S64 || Ty == P0)
      Opc = AArch64::ST1Threev1d;
    else
      llvm_unreachable("Unexpected type for st1x3!");
    selectVectorStoreIntrinsic(I, NumVecs: 3, Opc);
    break;
  }
  case Intrinsic::aarch64_neon_st1x4: {
    // Same selection scheme as st1x2, but with the four-register ST1 forms.
    LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
    unsigned Opc;
    if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
      Opc = AArch64::ST1Fourv8b;
    else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
      Opc = AArch64::ST1Fourv16b;
    else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
      Opc = AArch64::ST1Fourv4h;
    else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
      Opc = AArch64::ST1Fourv8h;
    else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
      Opc = AArch64::ST1Fourv2s;
    else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
      Opc = AArch64::ST1Fourv4s;
    else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
      Opc = AArch64::ST1Fourv2d;
    else if (Ty == S64 || Ty == P0)
      Opc = AArch64::ST1Fourv1d;
    else
      llvm_unreachable("Unexpected type for st1x4!");
    selectVectorStoreIntrinsic(I, NumVecs: 4, Opc);
    break;
  }
  case Intrinsic::aarch64_neon_st2: {
    // Operand 1 is the first value to store; its type picks the ST2 opcode.
    LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
    unsigned Opc;
    if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
      Opc = AArch64::ST2Twov8b;
    else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
      Opc = AArch64::ST2Twov16b;
    else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
      Opc = AArch64::ST2Twov4h;
    else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
      Opc = AArch64::ST2Twov8h;
    else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
      Opc = AArch64::ST2Twov2s;
    else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
      Opc = AArch64::ST2Twov4s;
    else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
      Opc = AArch64::ST2Twov2d;
    else if (Ty == S64 || Ty == P0)
      // Deliberate: scalar 64-bit elements fall back to the ST1 two-register
      // store form (there is no single-d-register ST2 opcode used here).
      Opc = AArch64::ST1Twov1d;
    else
      llvm_unreachable("Unexpected type for st2!");
    selectVectorStoreIntrinsic(I, NumVecs: 2, Opc);
    break;
  }
  case Intrinsic::aarch64_neon_st3: {
    // Same scheme as st2, with the three-register opcodes.
    LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
    unsigned Opc;
    if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
      Opc = AArch64::ST3Threev8b;
    else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
      Opc = AArch64::ST3Threev16b;
    else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
      Opc = AArch64::ST3Threev4h;
    else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
      Opc = AArch64::ST3Threev8h;
    else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
      Opc = AArch64::ST3Threev2s;
    else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
      Opc = AArch64::ST3Threev4s;
    else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
      Opc = AArch64::ST3Threev2d;
    else if (Ty == S64 || Ty == P0)
      // Deliberate ST1 fallback for scalar 64-bit elements (see st2 above).
      Opc = AArch64::ST1Threev1d;
    else
      llvm_unreachable("Unexpected type for st3!");
    selectVectorStoreIntrinsic(I, NumVecs: 3, Opc);
    break;
  }
  case Intrinsic::aarch64_neon_st4: {
    // Same scheme as st2, with the four-register opcodes.
    LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
    unsigned Opc;
    if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
      Opc = AArch64::ST4Fourv8b;
    else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
      Opc = AArch64::ST4Fourv16b;
    else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
      Opc = AArch64::ST4Fourv4h;
    else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
      Opc = AArch64::ST4Fourv8h;
    else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
      Opc = AArch64::ST4Fourv2s;
    else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
      Opc = AArch64::ST4Fourv4s;
    else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
      Opc = AArch64::ST4Fourv2d;
    else if (Ty == S64 || Ty == P0)
      // Deliberate ST1 fallback for scalar 64-bit elements (see st2 above).
      Opc = AArch64::ST1Fourv1d;
    else
      llvm_unreachable("Unexpected type for st4!");
    selectVectorStoreIntrinsic(I, NumVecs: 4, Opc);
    break;
  }
  case Intrinsic::aarch64_neon_st2lane: {
    // Lane stores only depend on the element size, so D- and Q-form vector
    // types with the same element type share an opcode.
    LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
    unsigned Opc;
    if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
      Opc = AArch64::ST2i8;
    else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
      Opc = AArch64::ST2i16;
    else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
      Opc = AArch64::ST2i32;
    else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
             Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
      Opc = AArch64::ST2i64;
    else
      llvm_unreachable("Unexpected type for st2lane!");
    if (!selectVectorStoreLaneIntrinsic(I, NumVecs: 2, Opc))
      return false;
    break;
  }
  case Intrinsic::aarch64_neon_st3lane: {
    // Same scheme as st2lane, with the three-register lane-store opcodes.
    LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
    unsigned Opc;
    if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
      Opc = AArch64::ST3i8;
    else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
      Opc = AArch64::ST3i16;
    else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
      Opc = AArch64::ST3i32;
    else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
             Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
      Opc = AArch64::ST3i64;
    else
      llvm_unreachable("Unexpected type for st3lane!");
    if (!selectVectorStoreLaneIntrinsic(I, NumVecs: 3, Opc))
      return false;
    break;
  }
  case Intrinsic::aarch64_neon_st4lane: {
    // Same scheme as st2lane, with the four-register lane-store opcodes.
    LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
    unsigned Opc;
    if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
      Opc = AArch64::ST4i8;
    else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
      Opc = AArch64::ST4i16;
    else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
      Opc = AArch64::ST4i32;
    else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
             Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
      Opc = AArch64::ST4i64;
    else
      llvm_unreachable("Unexpected type for st4lane!");
    if (!selectVectorStoreLaneIntrinsic(I, NumVecs: 4, Opc))
      return false;
    break;
  }
  case Intrinsic::aarch64_mops_memset_tag: {
    // Transform
    //   %dst:gpr(p0) = \
    //       G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag),
    //       \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64)
    // where %dst is updated, into
    //   %Rd:GPR64common, %Rn:GPR64) = \
    //       MOPSMemorySetTaggingPseudo \
    //       %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64
    // where Rd and Rn are tied.
    // It is expected that %val has been extended to s64 in legalization.
    // Note that the order of the size/value operands are swapped.

    Register DstDef = I.getOperand(i: 0).getReg();
    // I.getOperand(1) is the intrinsic function
    Register DstUse = I.getOperand(i: 2).getReg();
    Register ValUse = I.getOperand(i: 3).getReg();
    Register SizeUse = I.getOperand(i: 4).getReg();

    // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one.
    // Therefore an additional virtual register is required for the updated size
    // operand. This value is not accessible via the semantics of the intrinsic.
    Register SizeDef = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));

    // Note the swapped operand order (size before value) per the pseudo's
    // operand list, as described above.
    auto Memset = MIB.buildInstr(Opc: AArch64::MOPSMemorySetTaggingPseudo,
                                 DstOps: {DstDef, SizeDef}, SrcOps: {DstUse, SizeUse, ValUse});
    // Carry over the intrinsic's memory operands so aliasing info survives.
    Memset.cloneMemRefs(OtherMI: I);
    constrainSelectedInstRegOperands(I&: *Memset, TII, TRI, RBI);
    break;
  }
  case Intrinsic::ptrauth_resign_load_relative: {
    // Lower to the AUTRELLOADPAC pseudo. The input value is passed in X16
    // and the result is read back from X16 (see the copies below).
    Register DstReg = I.getOperand(i: 0).getReg();
    Register ValReg = I.getOperand(i: 2).getReg();
    uint64_t AUTKey = I.getOperand(i: 3).getImm();
    Register AUTDisc = I.getOperand(i: 4).getReg();
    uint64_t PACKey = I.getOperand(i: 5).getImm();
    Register PACDisc = I.getOperand(i: 6).getReg();
    int64_t Addend = I.getOperand(i: 7).getImm();

    // Split each discriminator into a constant (integer) part and an
    // address part, if it was built with a ptrauth.blend.
    Register AUTAddrDisc = AUTDisc;
    uint16_t AUTConstDiscC = 0;
    std::tie(args&: AUTConstDiscC, args&: AUTAddrDisc) =
        extractPtrauthBlendDiscriminators(Disc: AUTDisc, MRI);

    Register PACAddrDisc = PACDisc;
    uint16_t PACConstDiscC = 0;
    std::tie(args&: PACConstDiscC, args&: PACAddrDisc) =
        extractPtrauthBlendDiscriminators(Disc: PACDisc, MRI);

    MIB.buildCopy(Res: {AArch64::X16}, Op: {ValReg});

    MIB.buildInstr(Opcode: AArch64::AUTRELLOADPAC)
        .addImm(Val: AUTKey)
        .addImm(Val: AUTConstDiscC)
        .addUse(RegNo: AUTAddrDisc)
        .addImm(Val: PACKey)
        .addImm(Val: PACConstDiscC)
        .addUse(RegNo: PACAddrDisc)
        .addImm(Val: Addend)
        .constrainAllUses(TII, TRI, RBI);
    MIB.buildCopy(Res: {DstReg}, Op: Register(AArch64::X16));

    RBI.constrainGenericRegister(Reg: DstReg, RC: AArch64::GPR64RegClass, MRI);
    I.eraseFromParent();
    return true;
  }
6631 }
6632
6633 I.eraseFromParent();
6634 return true;
6635}
6636
// Manually select a subset of intrinsics that need special lowering
// (pointer authentication, frame/return address, NEON table lookups,
// Swift async context). Returns false for anything not handled here so
// that the remaining selection paths can try.
bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
                                                 MachineRegisterInfo &MRI) {
  unsigned IntrinID = cast<GIntrinsic>(Val&: I).getIntrinsicID();

  switch (IntrinID) {
  default:
    break;
  case Intrinsic::ptrauth_resign: {
    // Re-sign a signed pointer with a different key/discriminator via the
    // AUTPAC pseudo. The value travels through X16; X17 is clobbered by the
    // expansion (hence the IMPLICIT_DEF below).
    Register DstReg = I.getOperand(i: 0).getReg();
    Register ValReg = I.getOperand(i: 2).getReg();
    uint64_t AUTKey = I.getOperand(i: 3).getImm();
    Register AUTDisc = I.getOperand(i: 4).getReg();
    uint64_t PACKey = I.getOperand(i: 5).getImm();
    Register PACDisc = I.getOperand(i: 6).getReg();

    // Split each discriminator into constant and address parts if it was
    // produced by a ptrauth.blend.
    Register AUTAddrDisc = AUTDisc;
    uint16_t AUTConstDiscC = 0;
    std::tie(args&: AUTConstDiscC, args&: AUTAddrDisc) =
        extractPtrauthBlendDiscriminators(Disc: AUTDisc, MRI);

    Register PACAddrDisc = PACDisc;
    uint16_t PACConstDiscC = 0;
    std::tie(args&: PACConstDiscC, args&: PACAddrDisc) =
        extractPtrauthBlendDiscriminators(Disc: PACDisc, MRI);

    MIB.buildCopy(Res: {AArch64::X16}, Op: {ValReg});
    MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {AArch64::X17}, SrcOps: {});
    MIB.buildInstr(Opcode: AArch64::AUTPAC)
        .addImm(Val: AUTKey)
        .addImm(Val: AUTConstDiscC)
        .addUse(RegNo: AUTAddrDisc)
        .addImm(Val: PACKey)
        .addImm(Val: PACConstDiscC)
        .addUse(RegNo: PACAddrDisc)
        .constrainAllUses(TII, TRI, RBI);
    MIB.buildCopy(Res: {DstReg}, Op: Register(AArch64::X16));

    RBI.constrainGenericRegister(Reg: DstReg, RC: AArch64::GPR64RegClass, MRI);
    I.eraseFromParent();
    return true;
  }
  case Intrinsic::ptrauth_auth: {
    // Authenticate a signed pointer. Two lowerings: a fixed X16/X17
    // sequence when the subtarget prefers those registers, or the
    // virtual-register AUTxMxN pseudo otherwise.
    Register DstReg = I.getOperand(i: 0).getReg();
    Register ValReg = I.getOperand(i: 2).getReg();
    uint64_t AUTKey = I.getOperand(i: 3).getImm();
    Register AUTDisc = I.getOperand(i: 4).getReg();

    Register AUTAddrDisc = AUTDisc;
    uint16_t AUTConstDiscC = 0;
    std::tie(args&: AUTConstDiscC, args&: AUTAddrDisc) =
        extractPtrauthBlendDiscriminators(Disc: AUTDisc, MRI);

    if (STI.isX16X17Safer()) {
      MIB.buildCopy(Res: {AArch64::X16}, Op: {ValReg});
      MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {AArch64::X17}, SrcOps: {});
      MIB.buildInstr(Opcode: AArch64::AUTx16x17)
          .addImm(Val: AUTKey)
          .addImm(Val: AUTConstDiscC)
          .addUse(RegNo: AUTAddrDisc)
          .constrainAllUses(TII, TRI, RBI);
      MIB.buildCopy(Res: {DstReg}, Op: Register(AArch64::X16));
    } else {
      // AUTxMxN additionally defines a scratch register.
      Register ScratchReg =
          MRI.createVirtualRegister(RegClass: &AArch64::GPR64commonRegClass);
      MIB.buildInstr(Opcode: AArch64::AUTxMxN)
          .addDef(RegNo: DstReg)
          .addDef(RegNo: ScratchReg)
          .addUse(RegNo: ValReg)
          .addImm(Val: AUTKey)
          .addImm(Val: AUTConstDiscC)
          .addUse(RegNo: AUTAddrDisc)
          .constrainAllUses(TII, TRI, RBI);
    }

    RBI.constrainGenericRegister(Reg: DstReg, RC: AArch64::GPR64RegClass, MRI);
    I.eraseFromParent();
    return true;
  }
  case Intrinsic::frameaddress:
  case Intrinsic::returnaddress: {
    MachineFunction &MF = *I.getParent()->getParent();
    MachineFrameInfo &MFI = MF.getFrameInfo();

    unsigned Depth = I.getOperand(i: 2).getImm();
    Register DstReg = I.getOperand(i: 0).getReg();
    RBI.constrainGenericRegister(Reg: DstReg, RC: AArch64::GPR64RegClass, MRI);

    if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
      if (!MFReturnAddr) {
        // Insert the copy from LR/X30 into the entry block, before it can be
        // clobbered by anything.
        MFI.setReturnAddressIsTaken(true);
        MFReturnAddr = getFunctionLiveInPhysReg(
            MF, TII, PhysReg: AArch64::LR, RC: AArch64::GPR64RegClass, DL: I.getDebugLoc());
      }

      // Strip any pointer-authentication bits from the return address:
      // XPACI on PAuth targets, otherwise the XPACLRI sequence through LR.
      if (STI.hasPAuth()) {
        MIB.buildInstr(Opc: AArch64::XPACI, DstOps: {DstReg}, SrcOps: {MFReturnAddr});
      } else {
        MIB.buildCopy(Res: {Register(AArch64::LR)}, Op: {MFReturnAddr});
        MIB.buildInstr(Opcode: AArch64::XPACLRI);
        MIB.buildCopy(Res: {DstReg}, Op: {Register(AArch64::LR)});
      }

      I.eraseFromParent();
      return true;
    }

    // For Depth > 0, walk the frame chain: each parent frame pointer is
    // loaded from [FP, #0].
    MFI.setFrameAddressIsTaken(true);
    Register FrameAddr(AArch64::FP);
    while (Depth--) {
      Register NextFrame = MRI.createVirtualRegister(RegClass: &AArch64::GPR64spRegClass);
      auto Ldr =
          MIB.buildInstr(Opc: AArch64::LDRXui, DstOps: {NextFrame}, SrcOps: {FrameAddr}).addImm(Val: 0);
      constrainSelectedInstRegOperands(I&: *Ldr, TII, TRI, RBI);
      FrameAddr = NextFrame;
    }

    if (IntrinID == Intrinsic::frameaddress)
      MIB.buildCopy(Res: {DstReg}, Op: {FrameAddr});
    else {
      MFI.setReturnAddressIsTaken(true);

      // The saved return address lives at [FP, #8] (offset 1 in
      // 8-byte LDRXui units); strip PAC bits as above.
      if (STI.hasPAuth()) {
        Register TmpReg = MRI.createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
        MIB.buildInstr(Opc: AArch64::LDRXui, DstOps: {TmpReg}, SrcOps: {FrameAddr}).addImm(Val: 1);
        MIB.buildInstr(Opc: AArch64::XPACI, DstOps: {DstReg}, SrcOps: {TmpReg});
      } else {
        MIB.buildInstr(Opc: AArch64::LDRXui, DstOps: {Register(AArch64::LR)}, SrcOps: {FrameAddr})
            .addImm(Val: 1);
        MIB.buildInstr(Opcode: AArch64::XPACLRI);
        MIB.buildCopy(Res: {DstReg}, Op: {Register(AArch64::LR)});
      }
    }

    I.eraseFromParent();
    return true;
  }
  // NEON table lookups: TBL* take only table + index; TBX* additionally
  // take a pass-through operand (isExt = true).
  case Intrinsic::aarch64_neon_tbl2:
    SelectTable(I, MRI, NumVecs: 2, Opc1: AArch64::TBLv8i8Two, Opc2: AArch64::TBLv16i8Two, isExt: false);
    return true;
  case Intrinsic::aarch64_neon_tbl3:
    SelectTable(I, MRI, NumVecs: 3, Opc1: AArch64::TBLv8i8Three, Opc2: AArch64::TBLv16i8Three,
                isExt: false);
    return true;
  case Intrinsic::aarch64_neon_tbl4:
    SelectTable(I, MRI, NumVecs: 4, Opc1: AArch64::TBLv8i8Four, Opc2: AArch64::TBLv16i8Four, isExt: false);
    return true;
  case Intrinsic::aarch64_neon_tbx2:
    SelectTable(I, MRI, NumVecs: 2, Opc1: AArch64::TBXv8i8Two, Opc2: AArch64::TBXv16i8Two, isExt: true);
    return true;
  case Intrinsic::aarch64_neon_tbx3:
    SelectTable(I, MRI, NumVecs: 3, Opc1: AArch64::TBXv8i8Three, Opc2: AArch64::TBXv16i8Three, isExt: true);
    return true;
  case Intrinsic::aarch64_neon_tbx4:
    SelectTable(I, MRI, NumVecs: 4, Opc1: AArch64::TBXv8i8Four, Opc2: AArch64::TBXv16i8Four, isExt: true);
    return true;
  case Intrinsic::swift_async_context_addr:
    // The async context slot is addressed as FP - 8.
    auto Sub = MIB.buildInstr(Opc: AArch64::SUBXri, DstOps: {I.getOperand(i: 0).getReg()},
                              SrcOps: {Register(AArch64::FP)})
                   .addImm(Val: 8)
                   .addImm(Val: 0);
    constrainSelectedInstRegOperands(I&: *Sub, TII, TRI, RBI);

    MF->getFrameInfo().setFrameAddressIsTaken(true);
    MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
    I.eraseFromParent();
    return true;
  }
  return false;
}
6808
6809// G_PTRAUTH_GLOBAL_VALUE lowering
6810//
6811// We have 3 lowering alternatives to choose from:
6812// - MOVaddrPAC: similar to MOVaddr, with added PAC.
6813// If the GV doesn't need a GOT load (i.e., is locally defined)
6814// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
6815//
6816// - LOADgotPAC: similar to LOADgot, with added PAC.
6817// If the GV needs a GOT load, materialize the pointer using the usual
6818// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
6819// section is assumed to be read-only (for example, via relro mechanism). See
6820// LowerMOVaddrPAC.
6821//
6822// - LOADauthptrstatic: similar to LOADgot, but use a
6823// special stub slot instead of a GOT slot.
6824// Load a signed pointer for symbol 'sym' from a stub slot named
6825// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
6826// resolving. This usually lowers to adrp+ldr, but also emits an entry into
6827// .data with an
6828// @AUTH relocation. See LowerLOADauthptrstatic.
6829//
6830// All 3 are pseudos that are expand late to longer sequences: this lets us
6831// provide integrity guarantees on the to-be-signed intermediate values.
6832//
6833// LOADauthptrstatic is undesirable because it requires a large section filled
6834// with often similarly-signed pointers, making it a good harvesting target.
6835// Thus, it's only used for ptrauth references to extern_weak to avoid null
6836// checks.
6837
// Select G_PTRAUTH_GLOBAL_VALUE into one of MOVaddrPAC / LOADgotPAC /
// LOADauthptrstatic — see the block comment above for the trade-offs.
// Returns false (leaving the instruction alone) when the address operand
// does not reduce to a global plus constant offset.
bool AArch64InstructionSelector::selectPtrAuthGlobalValue(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  Register DefReg = I.getOperand(i: 0).getReg();
  Register Addr = I.getOperand(i: 1).getReg();
  uint64_t Key = I.getOperand(i: 2).getImm();
  Register AddrDisc = I.getOperand(i: 3).getReg();
  uint64_t Disc = I.getOperand(i: 4).getImm();
  int64_t Offset = 0;

  if (Key > AArch64PACKey::LAST)
    report_fatal_error(reason: "key in ptrauth global out of range [0, " +
                       Twine((int)AArch64PACKey::LAST) + "]");

  // Blend only works if the integer discriminator is 16-bit wide.
  if (!isUInt<16>(x: Disc))
    report_fatal_error(
        reason: "constant discriminator in ptrauth global out of range [0, 0xffff]");

  // Choosing between 3 lowering alternatives is target-specific.
  if (!STI.isTargetELF() && !STI.isTargetMachO())
    report_fatal_error(reason: "ptrauth global lowering only supported on MachO/ELF");

  if (!MRI.hasOneDef(RegNo: Addr))
    return false;

  // First match any offset we take from the real global.
  // Peel a G_PTR_ADD of a G_CONSTANT off the address, accumulating the
  // constant into Offset.
  const MachineInstr *DefMI = &*MRI.def_instr_begin(RegNo: Addr);
  if (DefMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
    Register OffsetReg = DefMI->getOperand(i: 2).getReg();
    if (!MRI.hasOneDef(RegNo: OffsetReg))
      return false;
    const MachineInstr &OffsetMI = *MRI.def_instr_begin(RegNo: OffsetReg);
    if (OffsetMI.getOpcode() != TargetOpcode::G_CONSTANT)
      return false;

    Addr = DefMI->getOperand(i: 1).getReg();
    if (!MRI.hasOneDef(RegNo: Addr))
      return false;

    DefMI = &*MRI.def_instr_begin(RegNo: Addr);
    Offset = OffsetMI.getOperand(i: 1).getCImm()->getSExtValue();
  }

  // We should be left with a genuine unauthenticated GlobalValue.
  const GlobalValue *GV;
  if (DefMI->getOpcode() == TargetOpcode::G_GLOBAL_VALUE) {
    GV = DefMI->getOperand(i: 1).getGlobal();
    Offset += DefMI->getOperand(i: 1).getOffset();
  } else if (DefMI->getOpcode() == AArch64::G_ADD_LOW) {
    GV = DefMI->getOperand(i: 2).getGlobal();
    Offset += DefMI->getOperand(i: 2).getOffset();
  } else {
    return false;
  }

  MachineIRBuilder MIB(I);

  // Classify the reference to determine whether it needs a GOT load.
  unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
  const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
  assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
         "unsupported non-GOT op flags on ptrauth global reference");
  assert((!GV->hasExternalWeakLinkage() || NeedsGOTLoad) &&
         "unsupported non-GOT reference to weak ptrauth global");

  // A constant-zero address discriminator means "no address diversity";
  // anything else (including non-constant) counts as an address disc.
  std::optional<APInt> AddrDiscVal = getIConstantVRegVal(VReg: AddrDisc, MRI);
  bool HasAddrDisc = !AddrDiscVal || *AddrDiscVal != 0;

  // Non-extern_weak:
  // - No GOT load needed -> MOVaddrPAC
  // - GOT load for non-extern_weak -> LOADgotPAC
  // Note that we disallow extern_weak refs to avoid null checks later.
  if (!GV->hasExternalWeakLinkage()) {
    // Both pseudos produce their result in X16 and clobber X17.
    MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {AArch64::X16}, SrcOps: {});
    MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {AArch64::X17}, SrcOps: {});
    MIB.buildInstr(Opcode: NeedsGOTLoad ? AArch64::LOADgotPAC : AArch64::MOVaddrPAC)
        .addGlobalAddress(GV, Offset)
        .addImm(Val: Key)
        .addReg(RegNo: HasAddrDisc ? AddrDisc : AArch64::XZR)
        .addImm(Val: Disc)
        .constrainAllUses(TII, TRI, RBI);
    MIB.buildCopy(Res: DefReg, Op: Register(AArch64::X16));
    RBI.constrainGenericRegister(Reg: DefReg, RC: AArch64::GPR64RegClass, MRI);
    I.eraseFromParent();
    return true;
  }

  // extern_weak -> LOADauthptrstatic

  // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
  // offset alone as a pointer if the symbol wasn't available, which would
  // probably break null checks in users. Ptrauth complicates things further:
  // error out.
  if (Offset != 0)
    report_fatal_error(
        reason: "unsupported non-zero offset in weak ptrauth global reference");

  if (HasAddrDisc)
    report_fatal_error(reason: "unsupported weak addr-div ptrauth global");

  MIB.buildInstr(Opc: AArch64::LOADauthptrstatic, DstOps: {DefReg}, SrcOps: {})
      .addGlobalAddress(GV, Offset)
      .addImm(Val: Key)
      .addImm(Val: Disc);
  RBI.constrainGenericRegister(Reg: DefReg, RC: AArch64::GPR64RegClass, MRI);

  I.eraseFromParent();
  return true;
}
6947
6948void AArch64InstructionSelector::SelectTable(MachineInstr &I,
6949 MachineRegisterInfo &MRI,
6950 unsigned NumVec, unsigned Opc1,
6951 unsigned Opc2, bool isExt) {
6952 Register DstReg = I.getOperand(i: 0).getReg();
6953 unsigned Opc = MRI.getType(Reg: DstReg) == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 8) ? Opc1 : Opc2;
6954
6955 // Create the REG_SEQUENCE
6956 SmallVector<Register, 4> Regs;
6957 for (unsigned i = 0; i < NumVec; i++)
6958 Regs.push_back(Elt: I.getOperand(i: i + 2 + isExt).getReg());
6959 Register RegSeq = createQTuple(Regs, MIB);
6960
6961 Register IdxReg = I.getOperand(i: 2 + NumVec + isExt).getReg();
6962 MachineInstrBuilder Instr;
6963 if (isExt) {
6964 Register Reg = I.getOperand(i: 2).getReg();
6965 Instr = MIB.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {Reg, RegSeq, IdxReg});
6966 } else
6967 Instr = MIB.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {RegSeq, IdxReg});
6968 constrainSelectedInstRegOperands(I&: *Instr, TII, TRI, RBI);
6969 I.eraseFromParent();
6970}
6971
6972InstructionSelector::ComplexRendererFns
6973AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
6974 auto MaybeImmed = getImmedFromMO(Root);
6975 if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
6976 return std::nullopt;
6977 uint64_t Enc = (32 - *MaybeImmed) & 0x1f;
6978 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Enc); }}};
6979}
6980
6981InstructionSelector::ComplexRendererFns
6982AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
6983 auto MaybeImmed = getImmedFromMO(Root);
6984 if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
6985 return std::nullopt;
6986 uint64_t Enc = 31 - *MaybeImmed;
6987 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Enc); }}};
6988}
6989
6990InstructionSelector::ComplexRendererFns
6991AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
6992 auto MaybeImmed = getImmedFromMO(Root);
6993 if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
6994 return std::nullopt;
6995 uint64_t Enc = (64 - *MaybeImmed) & 0x3f;
6996 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Enc); }}};
6997}
6998
6999InstructionSelector::ComplexRendererFns
7000AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
7001 auto MaybeImmed = getImmedFromMO(Root);
7002 if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
7003 return std::nullopt;
7004 uint64_t Enc = 63 - *MaybeImmed;
7005 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Enc); }}};
7006}
7007
7008/// Helper to select an immediate value that can be represented as a 12-bit
7009/// value shifted left by either 0 or 12. If it is possible to do so, return
7010/// the immediate and shift value. If not, return std::nullopt.
7011///
7012/// Used by selectArithImmed and selectNegArithImmed.
7013InstructionSelector::ComplexRendererFns
7014AArch64InstructionSelector::select12BitValueWithLeftShift(
7015 uint64_t Immed) const {
7016 unsigned ShiftAmt;
7017 if (Immed >> 12 == 0) {
7018 ShiftAmt = 0;
7019 } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
7020 ShiftAmt = 12;
7021 Immed = Immed >> 12;
7022 } else
7023 return std::nullopt;
7024
7025 unsigned ShVal = AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: ShiftAmt);
7026 return {{
7027 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Immed); },
7028 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ShVal); },
7029 }};
7030}
7031
7032/// SelectArithImmed - Select an immediate value that can be represented as
7033/// a 12-bit value shifted left by either 0 or 12. If so, return true with
7034/// Val set to the 12-bit value and Shift set to the shifter operand.
7035InstructionSelector::ComplexRendererFns
7036AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
7037 // This function is called from the addsub_shifted_imm ComplexPattern,
7038 // which lists [imm] as the list of opcode it's interested in, however
7039 // we still need to check whether the operand is actually an immediate
7040 // here because the ComplexPattern opcode list is only used in
7041 // root-level opcode matching.
7042 auto MaybeImmed = getImmedFromMO(Root);
7043 if (MaybeImmed == std::nullopt)
7044 return std::nullopt;
7045 return select12BitValueWithLeftShift(Immed: *MaybeImmed);
7046}
7047
7048/// SelectNegArithImmed - As above, but negates the value before trying to
7049/// select it.
7050InstructionSelector::ComplexRendererFns
7051AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
7052 // We need a register here, because we need to know if we have a 64 or 32
7053 // bit immediate.
7054 if (!Root.isReg())
7055 return std::nullopt;
7056 auto MaybeImmed = getImmedFromMO(Root);
7057 if (MaybeImmed == std::nullopt)
7058 return std::nullopt;
7059 uint64_t Immed = *MaybeImmed;
7060
7061 // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
7062 // have the opposite effect on the C flag, so this pattern mustn't match under
7063 // those circumstances.
7064 if (Immed == 0)
7065 return std::nullopt;
7066
7067 // Check if we're dealing with a 32-bit type on the root or a 64-bit type on
7068 // the root.
7069 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7070 if (MRI.getType(Reg: Root.getReg()).getSizeInBits() == 32)
7071 Immed = ~((uint32_t)Immed) + 1;
7072 else
7073 Immed = ~Immed + 1ULL;
7074
7075 if (Immed & 0xFFFFFFFFFF000000ULL)
7076 return std::nullopt;
7077
7078 Immed &= 0xFFFFFFULL;
7079 return select12BitValueWithLeftShift(Immed);
7080}
7081
7082/// Checks if we are sure that folding MI into load/store addressing mode is
7083/// beneficial or not.
7084///
7085/// Returns:
7086/// - true if folding MI would be beneficial.
7087/// - false if folding MI would be bad.
7088/// - std::nullopt if it is not sure whether folding MI is beneficial.
7089///
7090/// \p MI can be the offset operand of G_PTR_ADD, e.g. G_SHL in the example:
7091///
7092/// %13:gpr(s64) = G_CONSTANT i64 1
7093/// %8:gpr(s64) = G_SHL %6, %13(s64)
7094/// %9:gpr(p0) = G_PTR_ADD %0, %8(s64)
7095/// %12:gpr(s32) = G_LOAD %9(p0) :: (load (s16))
7096std::optional<bool> AArch64InstructionSelector::isWorthFoldingIntoAddrMode(
7097 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
7098 if (MI.getOpcode() == AArch64::G_SHL) {
7099 // Address operands with shifts are free, except for running on subtargets
7100 // with AddrLSLSlow14.
7101 if (const auto ValAndVeg = getIConstantVRegValWithLookThrough(
7102 VReg: MI.getOperand(i: 2).getReg(), MRI)) {
7103 const APInt ShiftVal = ValAndVeg->Value;
7104
7105 // Don't fold if we know this will be slow.
7106 return !(STI.hasAddrLSLSlow14() && (ShiftVal == 1 || ShiftVal == 4));
7107 }
7108 }
7109 return std::nullopt;
7110}
7111
7112/// Return true if it is worth folding MI into an extended register. That is,
7113/// if it's safe to pull it into the addressing mode of a load or store as a
7114/// shift.
7115/// \p IsAddrOperand whether the def of MI is used as an address operand
7116/// (e.g. feeding into an LDR/STR).
7117bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
7118 const MachineInstr &MI, const MachineRegisterInfo &MRI,
7119 bool IsAddrOperand) const {
7120
7121 // Always fold if there is one use, or if we're optimizing for size.
7122 Register DefReg = MI.getOperand(i: 0).getReg();
7123 if (MRI.hasOneNonDBGUse(RegNo: DefReg) ||
7124 MI.getParent()->getParent()->getFunction().hasOptSize())
7125 return true;
7126
7127 if (IsAddrOperand) {
7128 // If we are already sure that folding MI is good or bad, return the result.
7129 if (const auto Worth = isWorthFoldingIntoAddrMode(MI, MRI))
7130 return *Worth;
7131
7132 // Fold G_PTR_ADD if its offset operand can be folded
7133 if (MI.getOpcode() == AArch64::G_PTR_ADD) {
7134 MachineInstr *OffsetInst =
7135 getDefIgnoringCopies(Reg: MI.getOperand(i: 2).getReg(), MRI);
7136
7137 // Note, we already know G_PTR_ADD is used by at least two instructions.
7138 // If we are also sure about whether folding is beneficial or not,
7139 // return the result.
7140 if (const auto Worth = isWorthFoldingIntoAddrMode(MI: *OffsetInst, MRI))
7141 return *Worth;
7142 }
7143 }
7144
7145 // FIXME: Consider checking HasALULSLFast as appropriate.
7146
7147 // We have a fastpath, so folding a shift in and potentially computing it
7148 // many times may be beneficial. Check if this is only used in memory ops.
7149 // If it is, then we should fold.
7150 return all_of(Range: MRI.use_nodbg_instructions(Reg: DefReg),
7151 P: [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
7152}
7153
/// Match an offset operand of the form (G_SHL/G_MUL reg, imm) -- optionally
/// reached through a G_ZEXT when \p WantsExt is true -- whose effective shift
/// amount equals log2(\p SizeInBytes), and render the operands for a
/// register-offset load/store addressing mode:
/// {Base, OffsetReg, SignExtend-imm, Shift-imm=1}.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectExtendedSHL(
    MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
    unsigned SizeInBytes, bool WantsExt) const {
  assert(Base.isReg() && "Expected base to be a register operand");
  assert(Offset.isReg() && "Expected offset to be a register operand");

  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
  MachineInstr *OffsetInst = MRI.getVRegDef(Reg: Offset.getReg());

  unsigned OffsetOpc = OffsetInst->getOpcode();
  bool LookedThroughZExt = false;
  if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) {
    // Try to look through a ZEXT.
    // Only allowed when the caller wants an extend folded in (WantsExt).
    if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt)
      return std::nullopt;

    OffsetInst = MRI.getVRegDef(Reg: OffsetInst->getOperand(i: 1).getReg());
    OffsetOpc = OffsetInst->getOpcode();
    LookedThroughZExt = true;

    // After the ZEXT we still require a shift or multiply.
    if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
      return std::nullopt;
  }
  // Make sure that the memory op is a valid size.
  // SizeInBytes == 1 gives LegalShiftVal == 0; a zero shift has no scaled
  // addressing form, so bail out.
  int64_t LegalShiftVal = Log2_32(Value: SizeInBytes);
  if (LegalShiftVal == 0)
    return std::nullopt;
  if (!isWorthFoldingIntoExtendedReg(MI: *OffsetInst, MRI, IsAddrOperand: true))
    return std::nullopt;

  // Now, try to find the specific G_CONSTANT. Start by assuming that the
  // register we will offset is the LHS, and the register containing the
  // constant is the RHS.
  Register OffsetReg = OffsetInst->getOperand(i: 1).getReg();
  Register ConstantReg = OffsetInst->getOperand(i: 2).getReg();
  auto ValAndVReg = getIConstantVRegValWithLookThrough(VReg: ConstantReg, MRI);
  if (!ValAndVReg) {
    // We didn't get a constant on the RHS. If the opcode is a shift, then
    // we're done.
    if (OffsetOpc == TargetOpcode::G_SHL)
      return std::nullopt;

    // If we have a G_MUL, we can use either register. Try looking at the RHS.
    std::swap(a&: OffsetReg, b&: ConstantReg);
    ValAndVReg = getIConstantVRegValWithLookThrough(VReg: ConstantReg, MRI);
    if (!ValAndVReg)
      return std::nullopt;
  }

  // The value must fit into 3 bits, and must be positive. Make sure that is
  // true.
  int64_t ImmVal = ValAndVReg->Value.getSExtValue();

  // Since we're going to pull this into a shift, the constant value must be
  // a power of 2. If we got a multiply, then we need to check this.
  if (OffsetOpc == TargetOpcode::G_MUL) {
    if (!llvm::has_single_bit<uint32_t>(Value: ImmVal))
      return std::nullopt;

    // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
    ImmVal = Log2_32(Value: ImmVal);
  }

  // Reject shift amounts that don't fit in the 3-bit field.
  if ((ImmVal & 0x7) != ImmVal)
    return std::nullopt;

  // We are only allowed to shift by LegalShiftVal. This shift value is built
  // into the instruction, so we can't just use whatever we want.
  if (ImmVal != LegalShiftVal)
    return std::nullopt;

  unsigned SignExtend = 0;
  if (WantsExt) {
    // Check if the offset is defined by an extend, unless we looked through a
    // G_ZEXT earlier.
    if (!LookedThroughZExt) {
      MachineInstr *ExtInst = getDefIgnoringCopies(Reg: OffsetReg, MRI);
      auto Ext = getExtendTypeForInst(MI&: *ExtInst, MRI, IsLoadStore: true);
      if (Ext == AArch64_AM::InvalidShiftExtend)
        return std::nullopt;

      SignExtend = AArch64_AM::isSignExtendShiftType(Type: Ext) ? 1 : 0;
      // We only support SXTW for signed extension here.
      if (SignExtend && Ext != AArch64_AM::SXTW)
        return std::nullopt;
      OffsetReg = ExtInst->getOperand(i: 1).getReg();
    }

    // Need a 32-bit wide register here.
    MachineIRBuilder MIB(*MRI.getVRegDef(Reg: Root.getReg()));
    OffsetReg = moveScalarRegClass(Reg: OffsetReg, RC: AArch64::GPR32RegClass, MIB);
  }

  // We can use the LHS of the GEP as the base, and the LHS of the shift as an
  // offset. Signify that we are shifting by setting the shift flag to 1.
  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: Base.getReg()); },
           [=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: OffsetReg); },
           [=](MachineInstrBuilder &MIB) {
             // Need to add both immediates here to make sure that they are both
             // added to the instruction.
             MIB.addImm(Val: SignExtend);
             MIB.addImm(Val: 1);
           }}};
}
7259
7260/// This is used for computing addresses like this:
7261///
7262/// ldr x1, [x2, x3, lsl #3]
7263///
7264/// Where x2 is the base register, and x3 is an offset register. The shift-left
7265/// is a constant value specific to this load instruction. That is, we'll never
7266/// see anything other than a 3 here (which corresponds to the size of the
7267/// element being loaded.)
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
    MachineOperand &Root, unsigned SizeInBytes) const {
  if (!Root.isReg())
    return std::nullopt;
  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();

  // We want to find something like this:
  //
  // val = G_CONSTANT LegalShiftVal
  // shift = G_SHL off_reg val
  // ptr = G_PTR_ADD base_reg shift
  // x = G_LOAD ptr
  //
  // And fold it into this addressing mode:
  //
  // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]

  // Check if we can find the G_PTR_ADD.
  MachineInstr *PtrAdd =
      getOpcodeDef(Opcode: TargetOpcode::G_PTR_ADD, Reg: Root.getReg(), MRI);
  if (!PtrAdd || !isWorthFoldingIntoExtendedReg(MI: *PtrAdd, MRI, IsAddrOperand: true))
    return std::nullopt;

  // Now, try to match an opcode which will match our specific offset.
  // We want a G_SHL or a G_MUL.
  MachineInstr *OffsetInst =
      getDefIgnoringCopies(Reg: PtrAdd->getOperand(i: 2).getReg(), MRI);
  // Pass the offset def's result operand so selectExtendedSHL can inspect
  // the shift/mul itself; extends are not wanted in this (XReg) form.
  return selectExtendedSHL(Root, Base&: PtrAdd->getOperand(i: 1),
                           Offset&: OffsetInst->getOperand(i: 0), SizeInBytes,
                           /*WantsExt=*/false);
}
7300
7301/// This is used for computing addresses like this:
7302///
7303/// ldr x1, [x2, x3]
7304///
7305/// Where x2 is the base register, and x3 is an offset register.
7306///
7307/// When possible (or profitable) to fold a G_PTR_ADD into the address
7308/// calculation, this will do so. Otherwise, it will return std::nullopt.
7309InstructionSelector::ComplexRendererFns
7310AArch64InstructionSelector::selectAddrModeRegisterOffset(
7311 MachineOperand &Root) const {
7312 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7313
7314 // We need a GEP.
7315 MachineInstr *Gep = MRI.getVRegDef(Reg: Root.getReg());
7316 if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
7317 return std::nullopt;
7318
7319 // If this is used more than once, let's not bother folding.
7320 // TODO: Check if they are memory ops. If they are, then we can still fold
7321 // without having to recompute anything.
7322 if (!MRI.hasOneNonDBGUse(RegNo: Gep->getOperand(i: 0).getReg()))
7323 return std::nullopt;
7324
7325 // Base is the GEP's LHS, offset is its RHS.
7326 return {{[=](MachineInstrBuilder &MIB) {
7327 MIB.addUse(RegNo: Gep->getOperand(i: 1).getReg());
7328 },
7329 [=](MachineInstrBuilder &MIB) {
7330 MIB.addUse(RegNo: Gep->getOperand(i: 2).getReg());
7331 },
7332 [=](MachineInstrBuilder &MIB) {
7333 // Need to add both immediates here to make sure that they are both
7334 // added to the instruction.
7335 MIB.addImm(Val: 0);
7336 MIB.addImm(Val: 0);
7337 }}};
7338}
7339
7340/// This is intended to be equivalent to selectAddrModeXRO in
7341/// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
                                              unsigned SizeInBytes) const {
  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
  if (!Root.isReg())
    return std::nullopt;
  // The address must come from a G_PTR_ADD for there to be anything to fold.
  MachineInstr *PtrAdd =
      getOpcodeDef(Opcode: TargetOpcode::G_PTR_ADD, Reg: Root.getReg(), MRI);
  if (!PtrAdd)
    return std::nullopt;

  // Check for an immediates which cannot be encoded in the [base + imm]
  // addressing mode, and can't be encoded in an add/sub. If this happens, we'll
  // end up with code like:
  //
  // mov x0, wide
  // add x1 base, x0
  // ldr x2, [x1, x0]
  //
  // In this situation, we can use the [base, xreg] addressing mode to save an
  // add/sub:
  //
  // mov x0, wide
  // ldr x2, [base, x0]
  auto ValAndVReg =
      getIConstantVRegValWithLookThrough(VReg: PtrAdd->getOperand(i: 2).getReg(), MRI);
  if (ValAndVReg) {
    unsigned Scale = Log2_32(Value: SizeInBytes);
    int64_t ImmOff = ValAndVReg->Value.getSExtValue();

    // Skip immediates that can be selected in the load/store addressing
    // mode.
    // (Scaled unsigned 12-bit offsets: multiple of the access size, in
    // [0, 4096 << Scale).)
    if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
        ImmOff < (0x1000 << Scale))
      return std::nullopt;

    // Helper lambda to decide whether or not it is preferable to emit an add.
    auto isPreferredADD = [](int64_t ImmOff) {
      // Constants in [0x0, 0xfff] can be encoded in an add.
      if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
        return true;

      // Can it be encoded in an add lsl #12?
      if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
        return false;

      // It can be encoded in an add lsl #12, but we may not want to. If it is
      // possible to select this as a single movz, then prefer that. A single
      // movz is faster than an add with a shift.
      return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
             (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
    };

    // If the immediate can be encoded in a single add/sub, then bail out.
    if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
      return std::nullopt;
  }

  // Try to fold shifts into the addressing mode.
  auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
  if (AddrModeFns)
    return AddrModeFns;

  // If that doesn't work, see if it's possible to fold in registers from
  // a GEP.
  return selectAddrModeRegisterOffset(Root);
}
7409
7410/// This is used for computing addresses like this:
7411///
7412/// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
7413///
7414/// Where we have a 64-bit base register, a 32-bit offset register, and an
7415/// extend (which may or may not be signed).
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
                                              unsigned SizeInBytes) const {
  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();

  // The address must come from a G_PTR_ADD worth folding into the access.
  MachineInstr *PtrAdd =
      getOpcodeDef(Opcode: TargetOpcode::G_PTR_ADD, Reg: Root.getReg(), MRI);
  if (!PtrAdd || !isWorthFoldingIntoExtendedReg(MI: *PtrAdd, MRI, IsAddrOperand: true))
    return std::nullopt;

  MachineOperand &LHS = PtrAdd->getOperand(i: 1);
  MachineOperand &RHS = PtrAdd->getOperand(i: 2);
  MachineInstr *OffsetInst = getDefIgnoringCopies(Reg: RHS.getReg(), MRI);

  // The first case is the same as selectAddrModeXRO, except we need an extend.
  // In this case, we try to find a shift and extend, and fold them into the
  // addressing mode.
  //
  // E.g.
  //
  // off_reg = G_Z/S/ANYEXT ext_reg
  // val = G_CONSTANT LegalShiftVal
  // shift = G_SHL off_reg val
  // ptr = G_PTR_ADD base_reg shift
  // x = G_LOAD ptr
  //
  // In this case we can get a load like this:
  //
  // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
  auto ExtendedShl = selectExtendedSHL(Root, Base&: LHS, Offset&: OffsetInst->getOperand(i: 0),
                                       SizeInBytes, /*WantsExt=*/true);
  if (ExtendedShl)
    return ExtendedShl;

  // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though.
  //
  // e.g.
  // ldr something, [base_reg, ext_reg, sxtw]
  if (!isWorthFoldingIntoExtendedReg(MI: *OffsetInst, MRI, IsAddrOperand: true))
    return std::nullopt;

  // Check if this is an extend. We'll get an extend type if it is.
  AArch64_AM::ShiftExtendType Ext =
      getExtendTypeForInst(MI&: *OffsetInst, MRI, /*IsLoadStore=*/true);
  if (Ext == AArch64_AM::InvalidShiftExtend)
    return std::nullopt;

  // Need a 32-bit wide register.
  // The extend's source is narrowed/copied into GPR32 for the W-offset form.
  MachineIRBuilder MIB(*PtrAdd);
  Register ExtReg = moveScalarRegClass(Reg: OffsetInst->getOperand(i: 1).getReg(),
                                       RC: AArch64::GPR32RegClass, MIB);
  unsigned SignExtend = Ext == AArch64_AM::SXTW;

  // Base is LHS, offset is ExtReg.
  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: LHS.getReg()); },
           [=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: ExtReg); },
           [=](MachineInstrBuilder &MIB) {
             // Extend flag, then shift amount (0 -- no shift in this form).
             MIB.addImm(Val: SignExtend);
             MIB.addImm(Val: 0);
           }}};
}
7477
7478/// Select a "register plus unscaled signed 9-bit immediate" address. This
7479/// should only match when there is an offset that is not valid for a scaled
7480/// immediate addressing mode. The "Size" argument is the size in bytes of the
7481/// memory reference, which is needed here to know what is valid for a scaled
7482/// immediate.
7483InstructionSelector::ComplexRendererFns
7484AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
7485 unsigned Size) const {
7486 MachineRegisterInfo &MRI =
7487 Root.getParent()->getParent()->getParent()->getRegInfo();
7488
7489 if (!Root.isReg())
7490 return std::nullopt;
7491
7492 if (!isBaseWithConstantOffset(Root, MRI))
7493 return std::nullopt;
7494
7495 MachineInstr *RootDef = MRI.getVRegDef(Reg: Root.getReg());
7496
7497 MachineOperand &OffImm = RootDef->getOperand(i: 2);
7498 if (!OffImm.isReg())
7499 return std::nullopt;
7500 MachineInstr *RHS = MRI.getVRegDef(Reg: OffImm.getReg());
7501 if (RHS->getOpcode() != TargetOpcode::G_CONSTANT)
7502 return std::nullopt;
7503 int64_t RHSC;
7504 MachineOperand &RHSOp1 = RHS->getOperand(i: 1);
7505 if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
7506 return std::nullopt;
7507 RHSC = RHSOp1.getCImm()->getSExtValue();
7508
7509 if (RHSC >= -256 && RHSC < 256) {
7510 MachineOperand &Base = RootDef->getOperand(i: 1);
7511 return {{
7512 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Base); },
7513 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: RHSC); },
7514 }};
7515 }
7516 return std::nullopt;
7517}
7518
7519InstructionSelector::ComplexRendererFns
7520AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
7521 unsigned Size,
7522 MachineRegisterInfo &MRI) const {
7523 if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
7524 return std::nullopt;
7525 MachineInstr &Adrp = *MRI.getVRegDef(Reg: RootDef.getOperand(i: 1).getReg());
7526 if (Adrp.getOpcode() != AArch64::ADRP)
7527 return std::nullopt;
7528
7529 // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
7530 auto Offset = Adrp.getOperand(i: 1).getOffset();
7531 if (Offset % Size != 0)
7532 return std::nullopt;
7533
7534 auto GV = Adrp.getOperand(i: 1).getGlobal();
7535 if (GV->isThreadLocal())
7536 return std::nullopt;
7537
7538 auto &MF = *RootDef.getParent()->getParent();
7539 if (GV->getPointerAlignment(DL: MF.getDataLayout()) < Size)
7540 return std::nullopt;
7541
7542 unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM: MF.getTarget());
7543 MachineIRBuilder MIRBuilder(RootDef);
7544 Register AdrpReg = Adrp.getOperand(i: 0).getReg();
7545 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: AdrpReg); },
7546 [=](MachineInstrBuilder &MIB) {
7547 MIB.addGlobalAddress(GV, Offset,
7548 TargetFlags: OpFlags | AArch64II::MO_PAGEOFF |
7549 AArch64II::MO_NC);
7550 }}};
7551}
7552
7553/// Select a "register plus scaled unsigned 12-bit immediate" address. The
7554/// "Size" argument is the size in bytes of the memory reference, which
7555/// determines the scale.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
                                                  unsigned Size) const {
  MachineFunction &MF = *Root.getParent()->getParent()->getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  if (!Root.isReg())
    return std::nullopt;

  // A frame index is rendered directly with a zero offset.
  MachineInstr *RootDef = MRI.getVRegDef(Reg: Root.getReg());
  if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.add(MO: RootDef->getOperand(i: 1)); },
        [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); },
    }};
  }

  CodeModel::Model CM = MF.getTarget().getCodeModel();
  // Check if we can fold in the ADD of small code model ADRP + ADD address.
  // HACK: ld64 on Darwin doesn't support relocations on PRFM, so we can't fold
  // globals into the offset.
  MachineInstr *RootParent = Root.getParent();
  if (CM == CodeModel::Small &&
      !(RootParent->getOpcode() == AArch64::G_AARCH64_PREFETCH &&
        STI.isTargetDarwin())) {
    auto OpFns = tryFoldAddLowIntoImm(RootDef&: *RootDef, Size, MRI);
    if (OpFns)
      return OpFns;
  }

  // Base + constant offset: fold the offset when it is a non-negative
  // multiple of the access size that fits the scaled 12-bit field.
  if (isBaseWithConstantOffset(Root, MRI)) {
    MachineOperand &LHS = RootDef->getOperand(i: 1);
    MachineOperand &RHS = RootDef->getOperand(i: 2);
    MachineInstr *LHSDef = MRI.getVRegDef(Reg: LHS.getReg());
    MachineInstr *RHSDef = MRI.getVRegDef(Reg: RHS.getReg());

    int64_t RHSC = (int64_t)RHSDef->getOperand(i: 1).getCImm()->getZExtValue();
    unsigned Scale = Log2_32(Value: Size);
    if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
      // If the base is a frame index, render it instead of the vreg.
      if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
        return {{
            [=](MachineInstrBuilder &MIB) { MIB.add(MO: LHSDef->getOperand(i: 1)); },
            [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: RHSC >> Scale); },
        }};

      return {{
          [=](MachineInstrBuilder &MIB) { MIB.add(MO: LHS); },
          [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: RHSC >> Scale); },
      }};
    }
  }

  // Before falling back to our general case, check if the unscaled
  // instructions can handle this. If so, that's preferable.
  // (Returning std::nullopt here lets the unscaled pattern match instead.)
  if (selectAddrModeUnscaled(Root, Size))
    return std::nullopt;

  // General case: the root itself as base with a zero offset.
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); },
  }};
}
7618
7619/// Given a shift instruction, return the correct shift type for that
7620/// instruction.
7621static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
7622 switch (MI.getOpcode()) {
7623 default:
7624 return AArch64_AM::InvalidShiftExtend;
7625 case TargetOpcode::G_SHL:
7626 return AArch64_AM::LSL;
7627 case TargetOpcode::G_LSHR:
7628 return AArch64_AM::LSR;
7629 case TargetOpcode::G_ASHR:
7630 return AArch64_AM::ASR;
7631 case TargetOpcode::G_ROTR:
7632 return AArch64_AM::ROR;
7633 }
7634}
7635
7636/// Select a "shifted register" operand. If the value is not shifted, set the
7637/// shift operand to a default value of "lsl 0".
7638InstructionSelector::ComplexRendererFns
7639AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
7640 bool AllowROR) const {
7641 if (!Root.isReg())
7642 return std::nullopt;
7643 MachineRegisterInfo &MRI =
7644 Root.getParent()->getParent()->getParent()->getRegInfo();
7645
7646 // Check if the operand is defined by an instruction which corresponds to
7647 // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
7648 MachineInstr *ShiftInst = MRI.getVRegDef(Reg: Root.getReg());
7649 AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(MI&: *ShiftInst);
7650 if (ShType == AArch64_AM::InvalidShiftExtend)
7651 return std::nullopt;
7652 if (ShType == AArch64_AM::ROR && !AllowROR)
7653 return std::nullopt;
7654 if (!isWorthFoldingIntoExtendedReg(MI: *ShiftInst, MRI, IsAddrOperand: false))
7655 return std::nullopt;
7656
7657 // Need an immediate on the RHS.
7658 MachineOperand &ShiftRHS = ShiftInst->getOperand(i: 2);
7659 auto Immed = getImmedFromMO(Root: ShiftRHS);
7660 if (!Immed)
7661 return std::nullopt;
7662
7663 // We have something that we can fold. Fold in the shift's LHS and RHS into
7664 // the instruction.
7665 MachineOperand &ShiftLHS = ShiftInst->getOperand(i: 1);
7666 Register ShiftReg = ShiftLHS.getReg();
7667
7668 unsigned NumBits = MRI.getType(Reg: ShiftReg).getSizeInBits();
7669 unsigned Val = *Immed & (NumBits - 1);
7670 unsigned ShiftVal = AArch64_AM::getShifterImm(ST: ShType, Imm: Val);
7671
7672 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: ShiftReg); },
7673 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ShiftVal); }}};
7674}
7675
/// Classify \p MI as an extend usable by an extended-register operand.
///
/// Handles explicit G_SEXT/G_SEXT_INREG/G_ZEXT/G_ANYEXT instructions, and a
/// G_AND with an 0xFF/0xFFFF/0xFFFFFFFF mask (an implicit zero-extend).
/// When \p IsLoadStore is true, 8/16-bit extends are rejected because the
/// load/store addressing modes only take 32-bit (W-register) extends.
AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
    MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
  unsigned Opc = MI.getOpcode();

  // Handle explicit extend instructions first.
  if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
    unsigned Size;
    if (Opc == TargetOpcode::G_SEXT)
      // G_SEXT: the source width comes from the source register's type.
      Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
    else
      // G_SEXT_INREG: the width is carried as an immediate operand.
      Size = MI.getOperand(i: 2).getImm();
    assert(Size != 64 && "Extend from 64 bits?");
    switch (Size) {
    case 8:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
    case 16:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
    case 32:
      return AArch64_AM::SXTW;
    default:
      return AArch64_AM::InvalidShiftExtend;
    }
  }

  if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
    unsigned Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
    assert(Size != 64 && "Extend from 64 bits?");
    switch (Size) {
    case 8:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
    case 16:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
    case 32:
      return AArch64_AM::UXTW;
    default:
      return AArch64_AM::InvalidShiftExtend;
    }
  }

  // Don't have an explicit extend. Try to handle a G_AND with a constant mask
  // on the RHS.
  if (Opc != TargetOpcode::G_AND)
    return AArch64_AM::InvalidShiftExtend;

  std::optional<uint64_t> MaybeAndMask = getImmedFromMO(Root: MI.getOperand(i: 2));
  if (!MaybeAndMask)
    return AArch64_AM::InvalidShiftExtend;
  uint64_t AndMask = *MaybeAndMask;
  // Only all-ones masks at byte/halfword/word width act as zero-extends.
  switch (AndMask) {
  default:
    return AArch64_AM::InvalidShiftExtend;
  case 0xFF:
    return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
  case 0xFFFF:
    return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
  case 0xFFFFFFFF:
    return AArch64_AM::UXTW;
  }
}
7735
/// Return a register in register class \p RC holding the value of \p Reg.
/// If \p Reg's type already has the same size as \p RC's registers, \p Reg
/// is returned unchanged; otherwise a COPY into \p RC is emitted via \p MIB
/// and immediately selected. Scalars only.
Register AArch64InstructionSelector::moveScalarRegClass(
    Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  auto Ty = MRI.getType(Reg);
  assert(!Ty.isVector() && "Expected scalars only!");
  if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
    return Reg;

  // Create a copy and immediately select it.
  // FIXME: We should have an emitCopy function?
  auto Copy = MIB.buildCopy(Res: {&RC}, Op: {Reg});
  selectCopy(I&: *Copy, TII, MRI, TRI, RBI);
  return Copy.getReg(Idx: 0);
}
7750
7751/// Select an "extended register" operand. This operand folds in an extend
7752/// followed by an optional left shift.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectArithExtendedRegister(
    MachineOperand &Root) const {
  if (!Root.isReg())
    return std::nullopt;
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  uint64_t ShiftVal = 0;
  Register ExtReg;
  AArch64_AM::ShiftExtendType Ext;
  MachineInstr *RootDef = getDefIgnoringCopies(Reg: Root.getReg(), MRI);
  if (!RootDef)
    return std::nullopt;

  if (!isWorthFoldingIntoExtendedReg(MI: *RootDef, MRI, IsAddrOperand: false))
    return std::nullopt;

  // Check if we can fold a shift and an extend.
  // Pattern: (shl (ext reg), imm) with imm <= 4.
  if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
    // Look for a constant on the RHS of the shift.
    MachineOperand &RHS = RootDef->getOperand(i: 2);
    std::optional<uint64_t> MaybeShiftVal = getImmedFromMO(Root: RHS);
    if (!MaybeShiftVal)
      return std::nullopt;
    ShiftVal = *MaybeShiftVal;
    // The arith extended-register form only encodes shifts of 0..4.
    if (ShiftVal > 4)
      return std::nullopt;
    // Look for a valid extend instruction on the LHS of the shift.
    MachineOperand &LHS = RootDef->getOperand(i: 1);
    MachineInstr *ExtDef = getDefIgnoringCopies(Reg: LHS.getReg(), MRI);
    if (!ExtDef)
      return std::nullopt;
    Ext = getExtendTypeForInst(MI&: *ExtDef, MRI);
    if (Ext == AArch64_AM::InvalidShiftExtend)
      return std::nullopt;
    ExtReg = ExtDef->getOperand(i: 1).getReg();
  } else {
    // Didn't get a shift. Try just folding an extend.
    Ext = getExtendTypeForInst(MI&: *RootDef, MRI);
    if (Ext == AArch64_AM::InvalidShiftExtend)
      return std::nullopt;
    ExtReg = RootDef->getOperand(i: 1).getReg();

    // If we have a 32 bit instruction which zeroes out the high half of a
    // register, we get an implicit zero extend for free. Check if we have one.
    // FIXME: We actually emit the extend right now even though we don't have
    // to.
    if (Ext == AArch64_AM::UXTW && MRI.getType(Reg: ExtReg).getSizeInBits() == 32) {
      MachineInstr *ExtInst = MRI.getVRegDef(Reg: ExtReg);
      if (isDef32(MI: *ExtInst))
        return std::nullopt;
    }
  }

  // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
  // copy.
  MachineIRBuilder MIB(*RootDef);
  ExtReg = moveScalarRegClass(Reg: ExtReg, RC: AArch64::GPR32RegClass, MIB);

  // Render the register plus the combined (extend kind, shift amount)
  // immediate.
  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: ExtReg); },
           [=](MachineInstrBuilder &MIB) {
             MIB.addImm(Val: getArithExtendImm(ET: Ext, Imm: ShiftVal));
           }}};
}
7818
/// Match the high half of a 128-bit vector: either the second result of a
/// G_UNMERGE_VALUES, or lane 1 of a <2 x s64> G_EXTRACT_VECTOR_ELT, looking
/// through bitcasts on little-endian targets.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectExtractHigh(MachineOperand &Root) const {
  if (!Root.isReg())
    return std::nullopt;
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  auto Extract = getDefSrcRegIgnoringCopies(Reg: Root.getReg(), MRI);
  // Skip over bitcasts; on little-endian, a bitcast doesn't move lanes.
  while (Extract && Extract->MI->getOpcode() == TargetOpcode::G_BITCAST &&
         STI.isLittleEndian())
    Extract =
        getDefSrcRegIgnoringCopies(Reg: Extract->MI->getOperand(i: 1).getReg(), MRI);
  if (!Extract)
    return std::nullopt;

  if (Extract->MI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
    // Operand 1 is the *low* result; matching it means Root is NOT the low
    // half, so render the high-half source (operand 2).
    if (Extract->Reg == Extract->MI->getOperand(i: 1).getReg()) {
      Register ExtReg = Extract->MI->getOperand(i: 2).getReg();
      return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: ExtReg); }}};
    }
  }
  if (Extract->MI->getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT) {
    // Lane 1 of a <2 x s64> vector is its high 64 bits.
    LLT SrcTy = MRI.getType(Reg: Extract->MI->getOperand(i: 1).getReg());
    auto LaneIdx = getIConstantVRegValWithLookThrough(
        VReg: Extract->MI->getOperand(i: 2).getReg(), MRI);
    if (LaneIdx && SrcTy == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64) &&
        LaneIdx->Value.getSExtValue() == 1) {
      Register ExtReg = Extract->MI->getOperand(i: 1).getReg();
      return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: ExtReg); }}};
    }
  }

  return std::nullopt;
}
7853
7854InstructionSelector::ComplexRendererFns
7855AArch64InstructionSelector::selectCVTFixedPointVecBase(
7856 const MachineOperand &Root) const {
7857 if (!Root.isReg())
7858 return std::nullopt;
7859 const MachineRegisterInfo &MRI =
7860 Root.getParent()->getParent()->getParent()->getRegInfo();
7861
7862 MachineInstr *Dup = getDefIgnoringCopies(Reg: Root.getReg(), MRI);
7863 if (Dup->getOpcode() != AArch64::G_DUP)
7864 return std::nullopt;
7865 std::optional<ValueAndVReg> CstVal =
7866 getAnyConstantVRegValWithLookThrough(VReg: Dup->getOperand(i: 1).getReg(), MRI);
7867 if (!CstVal)
7868 return std::nullopt;
7869
7870 unsigned RegWidth = MRI.getType(Reg: Root.getReg()).getScalarSizeInBits();
7871 APFloat FVal(0.0);
7872 switch (RegWidth) {
7873 case 16:
7874 FVal = APFloat(APFloat::IEEEhalf(), CstVal->Value);
7875 break;
7876 case 32:
7877 FVal = APFloat(APFloat::IEEEsingle(), CstVal->Value);
7878 break;
7879 case 64:
7880 FVal = APFloat(APFloat::IEEEdouble(), CstVal->Value);
7881 break;
7882 default:
7883 return std::nullopt;
7884 };
7885 if (unsigned FBits = CheckFixedPointOperandConstant(FVal, RegWidth,
7886 /*isReciprocal*/ false))
7887 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: FBits); }}};
7888
7889 return std::nullopt;
7890}
7891
/// Non-const-operand entry point: forwards to selectCVTFixedPointVecBase.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectCVTFixedPointVec(MachineOperand &Root) const {
  return selectCVTFixedPointVecBase(Root);
}
7896
/// Custom renderer: re-runs the fixed-point match on \p MI's operand 2 and
/// emits the resulting fbits immediate onto \p MIB.
void AArch64InstructionSelector::renderFixedPointXForm(MachineInstrBuilder &MIB,
                                                       const MachineInstr &MI,
                                                       int OpIdx) const {
  // FIXME: This is only needed to satisfy the type checking in tablegen, and
  // should be able to reuse the Renderers already calculated by
  // selectCVTFixedPointVecBase.
  InstructionSelector::ComplexRendererFns Renderer =
      selectCVTFixedPointVecBase(Root: MI.getOperand(i: 2));
  assert((Renderer && Renderer->size() == 1) &&
         "Expected selectCVTFixedPointVec to provide a function\n");
  // Invoke the single renderer to add the immediate.
  (Renderer->front())(MIB);
}
7909
7910void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
7911 const MachineInstr &MI,
7912 int OpIdx) const {
7913 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
7914 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7915 "Expected G_CONSTANT");
7916 std::optional<int64_t> CstVal =
7917 getIConstantVRegSExtVal(VReg: MI.getOperand(i: 0).getReg(), MRI);
7918 assert(CstVal && "Expected constant value");
7919 MIB.addImm(Val: *CstVal);
7920}
7921
7922void AArch64InstructionSelector::renderLogicalImm32(
7923 MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
7924 assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7925 "Expected G_CONSTANT");
7926 uint64_t CstVal = I.getOperand(i: 1).getCImm()->getZExtValue();
7927 uint64_t Enc = AArch64_AM::encodeLogicalImmediate(imm: CstVal, regSize: 32);
7928 MIB.addImm(Val: Enc);
7929}
7930
7931void AArch64InstructionSelector::renderLogicalImm64(
7932 MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
7933 assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7934 "Expected G_CONSTANT");
7935 uint64_t CstVal = I.getOperand(i: 1).getCImm()->getZExtValue();
7936 uint64_t Enc = AArch64_AM::encodeLogicalImmediate(imm: CstVal, regSize: 64);
7937 MIB.addImm(Val: Enc);
7938}
7939
7940void AArch64InstructionSelector::renderUbsanTrap(MachineInstrBuilder &MIB,
7941 const MachineInstr &MI,
7942 int OpIdx) const {
7943 assert(MI.getOpcode() == TargetOpcode::G_UBSANTRAP && OpIdx == 0 &&
7944 "Expected G_UBSANTRAP");
7945 MIB.addImm(Val: MI.getOperand(i: 0).getImm() | ('U' << 8));
7946}
7947
7948void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
7949 const MachineInstr &MI,
7950 int OpIdx) const {
7951 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7952 "Expected G_FCONSTANT");
7953 MIB.addImm(
7954 Val: AArch64_AM::getFP16Imm(FPImm: MI.getOperand(i: 1).getFPImm()->getValueAPF()));
7955}
7956
7957void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
7958 const MachineInstr &MI,
7959 int OpIdx) const {
7960 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7961 "Expected G_FCONSTANT");
7962 MIB.addImm(
7963 Val: AArch64_AM::getFP32Imm(FPImm: MI.getOperand(i: 1).getFPImm()->getValueAPF()));
7964}
7965
7966void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
7967 const MachineInstr &MI,
7968 int OpIdx) const {
7969 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7970 "Expected G_FCONSTANT");
7971 MIB.addImm(
7972 Val: AArch64_AM::getFP64Imm(FPImm: MI.getOperand(i: 1).getFPImm()->getValueAPF()));
7973}
7974
7975void AArch64InstructionSelector::renderFPImm32SIMDModImmType4(
7976 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7977 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7978 "Expected G_FCONSTANT");
7979 MIB.addImm(Val: AArch64_AM::encodeAdvSIMDModImmType4(Imm: MI.getOperand(i: 1)
7980 .getFPImm()
7981 ->getValueAPF()
7982 .bitcastToAPInt()
7983 .getZExtValue()));
7984}
7985
7986bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
7987 const MachineInstr &MI, unsigned NumBytes) const {
7988 if (!MI.mayLoadOrStore())
7989 return false;
7990 assert(MI.hasOneMemOperand() &&
7991 "Expected load/store to have only one mem op!");
7992 return (*MI.memoperands_begin())->getSize() == NumBytes;
7993}
7994
7995bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
7996 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
7997 if (MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits() != 32)
7998 return false;
7999
8000 // Only return true if we know the operation will zero-out the high half of
8001 // the 64-bit register. Truncates can be subregister copies, which don't
8002 // zero out the high bits. Copies and other copy-like instructions can be
8003 // fed by truncates, or could be lowered as subregister copies.
8004 switch (MI.getOpcode()) {
8005 default:
8006 return true;
8007 case TargetOpcode::COPY:
8008 case TargetOpcode::G_BITCAST:
8009 case TargetOpcode::G_TRUNC:
8010 case TargetOpcode::G_PHI:
8011 return false;
8012 }
8013}
8014
8015
8016// Perform fixups on the given PHI instruction's operands to force them all
8017// to be the same as the destination regbank.
8018static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
8019 const AArch64RegisterBankInfo &RBI) {
8020 assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
8021 Register DstReg = MI.getOperand(i: 0).getReg();
8022 const RegisterBank *DstRB = MRI.getRegBankOrNull(Reg: DstReg);
8023 assert(DstRB && "Expected PHI dst to have regbank assigned");
8024 MachineIRBuilder MIB(MI);
8025
8026 // Go through each operand and ensure it has the same regbank.
8027 for (MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI.operands())) {
8028 if (!MO.isReg())
8029 continue;
8030 Register OpReg = MO.getReg();
8031 const RegisterBank *RB = MRI.getRegBankOrNull(Reg: OpReg);
8032 if (RB != DstRB) {
8033 // Insert a cross-bank copy.
8034 auto *OpDef = MRI.getVRegDef(Reg: OpReg);
8035 const LLT &Ty = MRI.getType(Reg: OpReg);
8036 MachineBasicBlock &OpDefBB = *OpDef->getParent();
8037
8038 // Any instruction we insert must appear after all PHIs in the block
8039 // for the block to be valid MIR.
8040 MachineBasicBlock::iterator InsertPt = std::next(x: OpDef->getIterator());
8041 if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
8042 InsertPt = OpDefBB.getFirstNonPHI();
8043 MIB.setInsertPt(MBB&: *OpDef->getParent(), II: InsertPt);
8044 auto Copy = MIB.buildCopy(Res: Ty, Op: OpReg);
8045 MRI.setRegBank(Reg: Copy.getReg(Idx: 0), RegBank: *DstRB);
8046 MO.setReg(Copy.getReg(Idx: 0));
8047 }
8048 }
8049}
8050
8051void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
8052 // We're looking for PHIs, build a list so we don't invalidate iterators.
8053 MachineRegisterInfo &MRI = MF.getRegInfo();
8054 SmallVector<MachineInstr *, 32> Phis;
8055 for (auto &BB : MF) {
8056 for (auto &MI : BB) {
8057 if (MI.getOpcode() == TargetOpcode::G_PHI)
8058 Phis.emplace_back(Args: &MI);
8059 }
8060 }
8061
8062 for (auto *MI : Phis) {
8063 // We need to do some work here if the operand types are < 16 bit and they
8064 // are split across fpr/gpr banks. Since all types <32b on gpr
8065 // end up being assigned gpr32 regclasses, we can end up with PHIs here
8066 // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
8067 // be selecting heterogenous regbanks for operands if possible, but we
8068 // still need to be able to deal with it here.
8069 //
8070 // To fix this, if we have a gpr-bank operand < 32b in size and at least
8071 // one other operand is on the fpr bank, then we add cross-bank copies
8072 // to homogenize the operand banks. For simplicity the bank that we choose
8073 // to settle on is whatever bank the def operand has. For example:
8074 //
8075 // %endbb:
8076 // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
8077 // =>
8078 // %bb2:
8079 // ...
8080 // %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
8081 // ...
8082 // %endbb:
8083 // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
8084 bool HasGPROp = false, HasFPROp = false;
8085 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI->operands())) {
8086 if (!MO.isReg())
8087 continue;
8088 const LLT &Ty = MRI.getType(Reg: MO.getReg());
8089 if (!Ty.isValid() || !Ty.isScalar())
8090 break;
8091 if (Ty.getSizeInBits() >= 32)
8092 break;
8093 const RegisterBank *RB = MRI.getRegBankOrNull(Reg: MO.getReg());
8094 // If for some reason we don't have a regbank yet. Don't try anything.
8095 if (!RB)
8096 break;
8097
8098 if (RB->getID() == AArch64::GPRRegBankID)
8099 HasGPROp = true;
8100 else
8101 HasFPROp = true;
8102 }
8103 // We have heterogenous regbanks, need to fixup.
8104 if (HasGPROp && HasFPROp)
8105 fixupPHIOpBanks(MI&: *MI, MRI, RBI);
8106 }
8107}
8108
8109namespace llvm {
8110InstructionSelector *
8111createAArch64InstructionSelector(const AArch64TargetMachine &TM,
8112 const AArch64Subtarget &Subtarget,
8113 const AArch64RegisterBankInfo &RBI) {
8114 return new AArch64InstructionSelector(TM, Subtarget, RBI);
8115}
8116}
8117