//===- ARMTargetTransformInfo.h - ARM specific TTI --------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file defines a TargetTransformInfoImplBase conforming object specific
/// to the ARM target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_ARM_ARMTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_ARM_ARMTARGETTRANSFORMINFO_H

#include "ARM.h"
#include "ARMSubtarget.h"
#include "ARMTargetMachine.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Function.h"
#include "llvm/TargetParser/SubtargetFeature.h"
#include <optional>

namespace llvm {

class APInt;
class ARMTargetLowering;
class Instruction;
class Loop;
class SCEV;
class ScalarEvolution;
class Type;
class Value;

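// Controls how aggressively loops are converted to use MVE tail predication;
// these values back the -tail-predication command-line option, with the
// "Force" variants overriding the usual profitability checks.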
namespace TailPredication {
enum Mode {
  Disabled = 0,
  EnabledNoReductions,
  Enabled,
  ForceEnabledNoReductions,
  ForceEnabled
};
} // namespace TailPredication

// Controls the conversion of memcpy into a tail-predicated loop.
namespace TPLoop {
enum MemTransfer { ForceDisabled = 0, ForceEnabled, Allow };
} // namespace TPLoop

class ARMTTIImpl final : public BasicTTIImplBase<ARMTTIImpl> {
  using BaseT = BasicTTIImplBase<ARMTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const ARMSubtarget *ST;
  const ARMTargetLowering *TLI;

  // Currently the following features are excluded from InlineFeaturesAllowed:
  // ModeThumb, FeatureNoARM, ModeSoftFloat.
  // Depending on whether they are set or unset, different
  // instructions/registers are available. For example, inlining a callee with
  // -thumb-mode into a caller with +thumb-mode may cause the assembler to
  // fail if the callee uses ARM-only instructions, e.g. in inline asm.
  const FeatureBitset InlineFeaturesAllowed = {
      ARM::Feature8MSecExt,
      ARM::FeatureAClass,
      ARM::FeatureAES,
      ARM::FeatureAcquireRelease,
      ARM::FeatureAvoidMOVsShOp,
      ARM::FeatureAvoidMULS,
      ARM::FeatureAvoidPartialCPSR,
      ARM::FeatureBF16,
      ARM::FeatureCRC,
      ARM::FeatureCheapPredicableCPSR,
      ARM::FeatureCheckVLDnAlign,
      ARM::FeatureCrypto,
      ARM::FeatureD32,
      ARM::FeatureDB,
      ARM::FeatureDFB,
      ARM::FeatureDSP,
      ARM::FeatureDontWidenVMOVS,
      ARM::FeatureDotProd,
      ARM::FeatureExecuteOnly,
      ARM::FeatureExpandMLx,
      ARM::FeatureFP16,
      ARM::FeatureFP16FML,
      ARM::FeatureFP64,
      ARM::FeatureFPAO,
      ARM::FeatureFPARMv8,
      ARM::FeatureFPARMv8_D16,
      ARM::FeatureFPARMv8_D16_SP,
      ARM::FeatureFPARMv8_SP,
      ARM::FeatureFPRegs,
      ARM::FeatureFPRegs16,
      ARM::FeatureFPRegs64,
      ARM::FeatureFullFP16,
      ARM::FeatureFuseAES,
      ARM::FeatureFuseLiterals,
      ARM::FeatureHWDivARM,
      ARM::FeatureHWDivThumb,
      ARM::FeatureHasNoBranchPredictor,
      ARM::FeatureHasRetAddrStack,
      ARM::FeatureHasSlowFPVFMx,
      ARM::FeatureHasSlowFPVMLx,
      ARM::FeatureHasVMLxHazards,
      ARM::FeatureLOB,
      ARM::FeatureLongCalls,
      ARM::FeatureMClass,
      ARM::FeatureMP,
      ARM::FeatureMVEVectorCostFactor1,
      ARM::FeatureMVEVectorCostFactor2,
      ARM::FeatureMVEVectorCostFactor4,
      ARM::FeatureMatMulInt8,
      ARM::FeatureMuxedUnits,
      ARM::FeatureNEON,
      ARM::FeatureNEONForFP,
      ARM::FeatureNEONForFPMovs,
      ARM::FeatureNoMovt,
      ARM::FeatureNoNegativeImmediates,
      ARM::FeatureNoPostRASched,
      ARM::FeaturePerfMon,
      ARM::FeaturePref32BitThumb,
      ARM::FeaturePrefISHSTBarrier,
      ARM::FeaturePreferBranchAlign32,
      ARM::FeaturePreferBranchAlign64,
      ARM::FeaturePreferVMOVSR,
      ARM::FeatureProfUnpredicate,
      ARM::FeatureRAS,
      ARM::FeatureRClass,
      ARM::FeatureReserveR9,
      ARM::FeatureSB,
      ARM::FeatureSHA2,
      ARM::FeatureSlowFPBrcc,
      ARM::FeatureSlowLoadDSubreg,
      ARM::FeatureSlowOddRegister,
      ARM::FeatureSlowVDUP32,
      ARM::FeatureSlowVGETLNi32,
      ARM::FeatureSplatVFPToNeon,
      ARM::FeatureStrictAlign,
      ARM::FeatureThumb2,
      ARM::FeatureTrustZone,
      ARM::FeatureUseMIPipeliner,
      ARM::FeatureUseMISched,
      ARM::FeatureUseWideStrideVFP,
      ARM::FeatureV7Clrex,
      ARM::FeatureVFP2,
      ARM::FeatureVFP2_SP,
      ARM::FeatureVFP3,
      ARM::FeatureVFP3_D16,
      ARM::FeatureVFP3_D16_SP,
      ARM::FeatureVFP3_SP,
      ARM::FeatureVFP4,
      ARM::FeatureVFP4_D16,
      ARM::FeatureVFP4_D16_SP,
      ARM::FeatureVFP4_SP,
      ARM::FeatureVMLxForwarding,
      ARM::FeatureVirtualization,
      ARM::FeatureZCZeroing,
      ARM::HasMVEFloatOps,
      ARM::HasMVEIntegerOps,
      ARM::HasV5TEOps,
      ARM::HasV5TOps,
      ARM::HasV6KOps,
      ARM::HasV6MOps,
      ARM::HasV6Ops,
      ARM::HasV6T2Ops,
      ARM::HasV7Ops,
      ARM::HasV8MBaselineOps,
      ARM::HasV8MMainlineOps,
      ARM::HasV8Ops,
      ARM::HasV8_1MMainlineOps,
      ARM::HasV8_1aOps,
      ARM::HasV8_2aOps,
      ARM::HasV8_3aOps,
      ARM::HasV8_4aOps,
      ARM::HasV8_5aOps,
      ARM::HasV8_6aOps,
      ARM::HasV8_7aOps,
      ARM::HasV8_8aOps,
      ARM::HasV8_9aOps,
      ARM::HasV9_0aOps,
      ARM::HasV9_1aOps,
      ARM::HasV9_2aOps,
      ARM::HasV9_3aOps,
      ARM::HasV9_4aOps,
      ARM::HasV9_5aOps,
      ARM::HasV9_6aOps,
      ARM::HasV9_7aOps};

  const ARMSubtarget *getST() const { return ST; }
  const ARMTargetLowering *getTLI() const { return TLI; }

public:
  explicit ARMTTIImpl(const ARMBaseTargetMachine *TM, const Function &F)
      : BaseT(TM, F.getDataLayout()), ST(TM->getSubtargetImpl(F)),
        TLI(ST->getTargetLowering()) {}

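  // Inlining is allowed when the caller and callee agree on every subtarget
  // feature outside InlineFeaturesAllowed, and the callee's features from the
  // allowed list form a subset of the caller's (see the implementation in
  // ARMTargetTransformInfo.cpp).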
  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const override;

  bool enableInterleavedAccessVectorization() const override { return true; }

  TTI::AddressingModeKind
  getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const override;

  /// Floating-point computation using ARMv8 AArch32 Advanced
  /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD
  /// and Arm MVE are IEEE-754 compliant.
  bool isFPVectorizationPotentiallyUnsafe() const override {
    return !ST->isTargetDarwin() && !ST->hasMVEFloatOps();
  }

  std::optional<Instruction *>
  instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override;
  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) const override;

  /// \name Scalar TTI Implementations
  /// @{

  InstructionCost getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
                                        const APInt &Imm,
                                        Type *Ty) const override;

  using BaseT::getIntImmCost;
  InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
                                TTI::TargetCostKind CostKind) const override;

  InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind,
                                    Instruction *Inst = nullptr) const override;

  /// @}

  /// \name Vector TTI Implementations
  /// @{

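  // Vector register counts: 16 quadword Q registers with NEON, 8 Q registers
  // (Q0-Q7) with MVE, and none otherwise. For scalars, Thumb1-only targets can
  // freely use only the low registers r0-r7; everything else has r0-r12.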
  unsigned getNumberOfRegisters(unsigned ClassID) const override {
    bool Vector = (ClassID == 1);
    if (Vector) {
      if (ST->hasNEON())
        return 16;
      if (ST->hasMVEIntegerOps())
        return 8;
      return 0;
    }

    if (ST->isThumb1Only())
      return 8;
    return 13;
  }

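  // Both NEON and MVE operate on 128-bit Q registers; scalar GPRs are 32 bits
  // wide, and there are no scalable vector registers on ARM.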
  TypeSize
  getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override {
    switch (K) {
    case TargetTransformInfo::RGK_Scalar:
      return TypeSize::getFixed(32);
    case TargetTransformInfo::RGK_FixedWidthVector:
      if (ST->hasNEON())
        return TypeSize::getFixed(128);
      if (ST->hasMVEIntegerOps())
        return TypeSize::getFixed(128);
      return TypeSize::getFixed(0);
    case TargetTransformInfo::RGK_ScalableVector:
      return TypeSize::getScalable(0);
    }
    llvm_unreachable("Unsupported register kind");
  }

  unsigned getMaxInterleaveFactor(ElementCount VF) const override {
    return ST->getMaxInterleaveFactor();
  }

  bool isProfitableLSRChainElement(Instruction *I) const override;

  bool
  isLegalMaskedLoad(Type *DataTy, Align Alignment, unsigned AddressSpace,
                    TTI::MaskKind MaskKind =
                        TTI::MaskKind::VariableOrConstantMask) const override;

  bool
  isLegalMaskedStore(Type *DataTy, Align Alignment, unsigned AddressSpace,
                     TTI::MaskKind MaskKind =
                         TTI::MaskKind::VariableOrConstantMask) const override {
    return isLegalMaskedLoad(DataTy, Alignment, AddressSpace, MaskKind);
  }

  bool forceScalarizeMaskedGather(VectorType *VTy,
                                  Align Alignment) const override {
    // For MVE, a custom lowering pass will already have legalised any gathers
    // that can be lowered to MVE intrinsics; everything else should be
    // expanded. That pass runs before the masked intrinsic lowering pass.
    return true;
  }

  bool forceScalarizeMaskedScatter(VectorType *VTy,
                                   Align Alignment) const override {
    return forceScalarizeMaskedGather(VTy, Alignment);
  }

  bool isLegalMaskedGather(Type *Ty, Align Alignment) const override;

  bool isLegalMaskedScatter(Type *Ty, Align Alignment) const override {
    return isLegalMaskedGather(Ty, Alignment);
  }

  InstructionCost getMemcpyCost(const Instruction *I) const override;

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override {
    return ST->getMaxInlineSizeThreshold();
  }

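  /// Given a memcpy/memset/memmove intrinsic call, return the number of memory
  /// operations that would be performed (as determined by
  /// findOptimalMemOpLowering), or -1 if a library call would be used instead.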
  int getNumMemOps(const IntrinsicInst *I) const;

  InstructionCost
  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                 const Instruction *CxtI = nullptr) const override;

  bool preferInLoopReduction(RecurKind Kind, Type *Ty) const override;

  bool preferPredicatedReductionSelect() const override;

  bool shouldExpandReduction(const IntrinsicInst *II) const override {
    return false;
  }

  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr) const override;

  InstructionCost
  getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                   TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
                   const Instruction *I = nullptr) const override;

  InstructionCost getCmpSelInstrCost(
      unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
      TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      const Instruction *I = nullptr) const override;

  using BaseT::getVectorInstrCost;
  InstructionCost
  getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind,
                     unsigned Index, const Value *Op0, const Value *Op1,
                     TTI::VectorInstrContext VIC =
                         TTI::VectorInstrContext::None) const override;

  InstructionCost
  getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr,
                            TTI::TargetCostKind CostKind) const override;

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = {},
      const Instruction *CxtI = nullptr) const override;

  InstructionCost getMemoryOpCost(
      unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
      TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
      const Instruction *I = nullptr) const override;

  InstructionCost
  getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA,
                           TTI::TargetCostKind CostKind) const override;

  InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
                                        TTI::TargetCostKind CostKind) const;

  InstructionCost getInterleavedMemoryOpCost(
      unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
      Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
      bool UseMaskForCond = false, bool UseMaskForGaps = false) const override;

  InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA,
                                         TTI::TargetCostKind CostKind) const;

  InstructionCost
  getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                             std::optional<FastMathFlags> FMF,
                             TTI::TargetCostKind CostKind) const override;
  InstructionCost
  getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy,
                           VectorType *ValTy, std::optional<FastMathFlags> FMF,
                           TTI::TargetCostKind CostKind) const override;
  InstructionCost
  getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy,
                         VectorType *ValTy,
                         TTI::TargetCostKind CostKind) const override;

  InstructionCost
  getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
                         TTI::TargetCostKind CostKind) const override;

  InstructionCost
  getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                        TTI::TargetCostKind CostKind) const override;

  /// getScalingFactorCost - Return the cost of the scaling factor used in the
  /// addressing mode represented by AM.
  /// If the AM is supported, the return value must be >= 0.
  /// If the AM is not supported, the return value is an invalid cost.
  InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                       StackOffset BaseOffset, bool HasBaseReg,
                                       int64_t Scale,
                                       unsigned AddrSpace) const override;

  bool maybeLoweredToCall(Instruction &I) const;
  bool isLoweredToCall(const Function *F) const override;
  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                AssumptionCache &AC, TargetLibraryInfo *LibInfo,
                                HardwareLoopInfo &HWLoopInfo) const override;
  bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override;
  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) const override;

  TailFoldingStyle
  getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) const override;

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) const override;
  bool shouldBuildLookupTablesForConstant(Constant *C) const override {
    // In the ROPI and RWPI relocation models we can't have pointers to global
    // variables or functions in constant data, so don't convert switches to
    // lookup tables if any of the values would need relocation.
    if (ST->isROPI() || ST->isRWPI())
      return !C->needsDynamicRelocation();

    return true;
  }

  bool hasArmWideBranch(bool Thumb) const override;

  bool isProfitableToSinkOperands(Instruction *I,
                                  SmallVectorImpl<Use *> &Ops) const override;

  unsigned getNumBytesToPadGlobalArray(unsigned Size,
                                       Type *ArrayType) const override;

  /// @}
};

/// isVREVMask - Check if a vector shuffle mask corresponds to a VREV
/// instruction with the specified block size. (The order of the elements
/// within each block of the vector is reversed.)
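/// For example, with 8-bit elements and BlockSize == 32 (i.e. VREV32.8), the
/// mask for an 8-element shuffle is <3,2,1,0,7,6,5,4>: the four bytes inside
/// each 32-bit block appear in reverse order.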
inline bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
         "Only possible block sizes for VREV are: 16, 32, 64");

  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz != 8 && EltSz != 16 && EltSz != 32)
    return false;

  unsigned BlockElts = M[0] + 1;
  // If the first shuffle index is UNDEF, be optimistic.
  if (M[0] < 0)
    BlockElts = BlockSize / EltSz;

  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
    return false;

  for (unsigned i = 0, e = M.size(); i < e; ++i) {
    if (M[i] < 0)
      continue; // ignore UNDEF indices
    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
      return false;
  }

  return true;
}

} // end namespace llvm

#endif // LLVM_LIB_TARGET_ARM_ARMTARGETTRANSFORMINFO_H