//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file a TargetTransformInfoImplBase conforming object specific to the
/// AMDGPU target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//
16
17#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
18#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
19
20#include "AMDGPU.h"
21#include "llvm/CodeGen/BasicTTIImpl.h"
22#include "llvm/Support/AMDGPUAddrSpace.h"
23#include <optional>
24
25namespace llvm {
26
// Forward declarations to keep this header light; full definitions are only
// needed in the corresponding .cpp file.
class AMDGPUTargetMachine;
class GCNSubtarget;
class InstCombiner;
class Loop;
class ScalarEvolution;
class SITargetLowering;
class Type;
class Value;
35
36class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
37 using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
38 using TTI = TargetTransformInfo;
39
40 friend BaseT;
41
42 Triple TargetTriple;
43
44 const TargetSubtargetInfo *ST;
45 const TargetLoweringBase *TLI;
46
47 const TargetSubtargetInfo *getST() const { return ST; }
48 const TargetLoweringBase *getTLI() const { return TLI; }
49
50public:
51 explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
52
53 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
54 TTI::UnrollingPreferences &UP,
55 OptimizationRemarkEmitter *ORE) const override;
56
57 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
58 TTI::PeelingPreferences &PP) const override;
59
60 uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override;
61};
62
63class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
64 using BaseT = BasicTTIImplBase<GCNTTIImpl>;
65 using TTI = TargetTransformInfo;
66
67 friend BaseT;
68
69 const GCNSubtarget *ST;
70 const SITargetLowering *TLI;
71 AMDGPUTTIImpl CommonTTI;
72 bool IsGraphics;
73 bool HasFP32Denormals;
74 bool HasFP64FP16Denormals;
75 static constexpr bool InlinerVectorBonusPercent = 0;
76
77 static const FeatureBitset InlineFeatureIgnoreList;
78
79 const GCNSubtarget *getST() const { return ST; }
80 const SITargetLowering *getTLI() const { return TLI; }
81
82 static inline int getFullRateInstrCost() {
83 return TargetTransformInfo::TCC_Basic;
84 }
85
86 static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) {
87 return CostKind == TTI::TCK_CodeSize ? 2
88 : 2 * TargetTransformInfo::TCC_Basic;
89 }
90
91 // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
92 // should be 2 or 4.
93 static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) {
94 return CostKind == TTI::TCK_CodeSize ? 2
95 : 4 * TargetTransformInfo::TCC_Basic;
96 }
97
98 int getTransInstrCost(TTI::TargetCostKind CostKind) const;
99
100 // On some parts, normal fp64 operations are half rate, and others
101 // quarter. This also applies to some integer operations.
102 int get64BitInstrCost(TTI::TargetCostKind CostKind) const;
103
104 std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const;
105
106 /// \returns true if V might be divergent even when all of its operands
107 /// are uniform.
108 bool isSourceOfDivergence(const Value *V) const;
109
110 /// Returns true for the target specific set of operations which produce
111 /// uniform result even taking non-uniform arguments.
112 bool isAlwaysUniform(const Value *V) const;
113
114public:
115 explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
116
117 bool hasBranchDivergence(const Function *F = nullptr) const override;
118
119 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
120 TTI::UnrollingPreferences &UP,
121 OptimizationRemarkEmitter *ORE) const override;
122
123 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
124 TTI::PeelingPreferences &PP) const override;
125
126 TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override {
127 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
128 return TTI::PSK_FastHardware;
129 }
130
131 unsigned getNumberOfRegisters(unsigned RCID) const override;
132 TypeSize
133 getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override;
134 unsigned getMinVectorRegisterBitWidth() const override;
135 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override;
136 unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
137 unsigned ChainSizeInBytes,
138 VectorType *VecTy) const override;
139 unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
140 unsigned ChainSizeInBytes,
141 VectorType *VecTy) const override;
142 unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override;
143
144 bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
145 unsigned AddrSpace) const;
146 bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
147 unsigned AddrSpace) const override;
148 bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
149 unsigned AddrSpace) const override;
150
151 uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override;
152 Type *getMemcpyLoopLoweringType(
153 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
154 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
155 std::optional<uint32_t> AtomicElementSize) const override;
156
157 void getMemcpyLoopResidualLoweringType(
158 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
159 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
160 Align SrcAlign, Align DestAlign,
161 std::optional<uint32_t> AtomicCpySize) const override;
162 unsigned getMaxInterleaveFactor(ElementCount VF) const override;
163
164 bool getTgtMemIntrinsic(IntrinsicInst *Inst,
165 MemIntrinsicInfo &Info) const override;
166
167 InstructionCost getArithmeticInstrCost(
168 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
169 TTI::OperandValueInfo Op1Info = {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
170 TTI::OperandValueInfo Op2Info = {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
171 ArrayRef<const Value *> Args = {},
172 const Instruction *CxtI = nullptr) const override;
173
174 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
175 const Instruction *I = nullptr) const override;
176
177 bool isInlineAsmSourceOfDivergence(const CallInst *CI,
178 ArrayRef<unsigned> Indices = {}) const;
179
180 using BaseT::getVectorInstrCost;
181 InstructionCost
182 getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind,
183 unsigned Index, const Value *Op0, const Value *Op1,
184 TTI::VectorInstrContext VIC =
185 TTI::VectorInstrContext::None) const override;
186
187 bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
188
189 bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override {
190 // Address space casts must cast between different address spaces.
191 if (FromAS == ToAS)
192 return false;
193
194 // Casts between any aliasing address spaces are valid.
195 return AMDGPU::addrspacesMayAlias(AS1: FromAS, AS2: ToAS);
196 }
197
198 bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const override {
199 return AMDGPU::addrspacesMayAlias(AS1: AS0, AS2: AS1);
200 }
201
202 unsigned getFlatAddressSpace() const override {
203 // Don't bother running InferAddressSpaces pass on graphics shaders which
204 // don't use flat addressing.
205 if (IsGraphics)
206 return -1;
207 return AMDGPUAS::FLAT_ADDRESS;
208 }
209
210 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
211 Intrinsic::ID IID) const override;
212
213 bool
214 canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const override {
215 return AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
216 AS != AMDGPUAS::PRIVATE_ADDRESS;
217 }
218
219 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
220 Value *NewV) const override;
221
222 bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
223 const Value *Op1, InstCombiner &IC) const;
224
225 bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II,
226 unsigned LaneAgIdx) const;
227
228 std::optional<Instruction *>
229 instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override;
230
231 Value *simplifyAMDGCNLaneIntrinsicDemanded(InstCombiner &IC,
232 IntrinsicInst &II,
233 const APInt &DemandedElts,
234 APInt &UndefElts) const;
235
236 Instruction *hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
237 IntrinsicInst &II) const;
238
239 std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
240 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
241 APInt &UndefElts2, APInt &UndefElts3,
242 std::function<void(Instruction *, unsigned, APInt, APInt &)>
243 SimplifyAndSetOp) const override;
244
245 InstructionCost getVectorSplitCost() const { return 0; }
246
247 InstructionCost
248 getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
249 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
250 VectorType *SubTp, ArrayRef<const Value *> Args = {},
251 const Instruction *CxtI = nullptr) const override;
252
253 bool isProfitableToSinkOperands(Instruction *I,
254 SmallVectorImpl<Use *> &Ops) const override;
255
256 bool areInlineCompatible(const Function *Caller,
257 const Function *Callee) const override;
258
259 int getInliningLastCallToStaticBonus() const override;
260 unsigned getInliningThresholdMultiplier() const override { return 11; }
261 unsigned adjustInliningThreshold(const CallBase *CB) const override;
262 unsigned getCallerAllocaCost(const CallBase *CB,
263 const AllocaInst *AI) const override;
264
265 int getInlinerVectorBonusPercent() const override {
266 return InlinerVectorBonusPercent;
267 }
268
269 InstructionCost
270 getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
271 std::optional<FastMathFlags> FMF,
272 TTI::TargetCostKind CostKind) const override;
273
274 InstructionCost getPartialReductionCost(
275 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
276 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
277 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
278 TTI::TargetCostKind CostKind,
279 std::optional<FastMathFlags> FMF) const override {
280 return InstructionCost::getInvalid();
281 }
282
283 InstructionCost
284 getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
285 TTI::TargetCostKind CostKind) const override;
286 InstructionCost
287 getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
288 TTI::TargetCostKind CostKind) const override;
289
290 /// Data cache line size for LoopDataPrefetch pass. Has no use before GFX12.
291 unsigned getCacheLineSize() const override { return 128; }
292
293 /// How much before a load we should place the prefetch instruction.
294 /// This is currently measured in number of IR instructions.
295 unsigned getPrefetchDistance() const override;
296
297 /// \return if target want to issue a prefetch in address space \p AS.
298 bool shouldPrefetchAddressSpace(unsigned AS) const override;
299 void collectKernelLaunchBounds(
300 const Function &F,
301 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override;
302
303 enum class KnownIEEEMode { Unknown, On, Off };
304
305 /// Return KnownIEEEMode::On if we know if the use context can assume
306 /// "amdgpu-ieee"="true" and KnownIEEEMode::Off if we can assume
307 /// "amdgpu-ieee"="false".
308 KnownIEEEMode fpenvIEEEMode(const Instruction &I) const;
309
310 /// Account for loads of i8 vector types to have reduced cost. For
311 /// example the cost of load 4 i8s values is one is the cost of loading
312 /// a single i32 value.
313 InstructionCost getMemoryOpCost(
314 unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
315 TTI::TargetCostKind CostKind,
316 TTI::OperandValueInfo OpInfo = {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
317 const Instruction *I = nullptr) const override;
318
319 /// When counting parts on AMD GPUs, account for i8s being grouped
320 /// together under a single i32 value. Otherwise fall back to base
321 /// implementation.
322 unsigned getNumberOfParts(Type *Tp) const override;
323
324 ValueUniformity getValueUniformity(const Value *V) const override;
325
326 InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
327 StackOffset BaseOffset, bool HasBaseReg,
328 int64_t Scale,
329 unsigned AddrSpace) const override;
330
331 bool isLSRCostLess(const TTI::LSRCost &A,
332 const TTI::LSRCost &B) const override;
333 bool isNumRegsMajorCostOfLSR() const override;
334 bool shouldDropLSRSolutionIfLessProfitable() const override;
335
336 bool isUniform(const Instruction *I,
337 const SmallBitVector &UniformArgs) const override;
338};
339
340} // end namespace llvm
341
342#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
343