1//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
/// \file
/// This file declares a TargetTransformInfoImplBase-conforming object specific
/// to the AMDGPU target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target-independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
18#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
19
20#include "AMDGPU.h"
21#include "llvm/CodeGen/BasicTTIImpl.h"
22#include "llvm/Support/AMDGPUAddrSpace.h"
23#include <optional>
24
25namespace llvm {
26
// Forward declarations of types that this header only names through pointers
// or references, so their full definitions are not needed here.
class AMDGPUTargetMachine;
class GCNSubtarget;
class InstCombiner;
class Loop;
class ScalarEvolution;
class SITargetLowering;
class Type;
class Value;
35
36class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
37 using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
38 using TTI = TargetTransformInfo;
39
40 friend BaseT;
41
42 Triple TargetTriple;
43
44 const TargetSubtargetInfo *ST;
45 const TargetLoweringBase *TLI;
46
47 const TargetSubtargetInfo *getST() const { return ST; }
48 const TargetLoweringBase *getTLI() const { return TLI; }
49
50public:
51 explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
52
53 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
54 TTI::UnrollingPreferences &UP,
55 OptimizationRemarkEmitter *ORE) const override;
56
57 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
58 TTI::PeelingPreferences &PP) const override;
59
60 uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override;
61};
62
63class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
64 using BaseT = BasicTTIImplBase<GCNTTIImpl>;
65 using TTI = TargetTransformInfo;
66
67 friend BaseT;
68
69 const GCNSubtarget *ST;
70 const SITargetLowering *TLI;
71 AMDGPUTTIImpl CommonTTI;
72 bool IsGraphics;
73 bool HasFP32Denormals;
74 bool HasFP64FP16Denormals;
75 static constexpr bool InlinerVectorBonusPercent = 0;
76
77 static const FeatureBitset InlineFeatureIgnoreList;
78
79 const GCNSubtarget *getST() const { return ST; }
80 const SITargetLowering *getTLI() const { return TLI; }
81
82 static inline int getFullRateInstrCost() {
83 return TargetTransformInfo::TCC_Basic;
84 }
85
86 static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) {
87 return CostKind == TTI::TCK_CodeSize ? 2
88 : 2 * TargetTransformInfo::TCC_Basic;
89 }
90
91 // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
92 // should be 2 or 4.
93 static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) {
94 return CostKind == TTI::TCK_CodeSize ? 2
95 : 4 * TargetTransformInfo::TCC_Basic;
96 }
97
98 int getTransInstrCost(TTI::TargetCostKind CostKind) const;
99
100 // On some parts, normal fp64 operations are half rate, and others
101 // quarter. This also applies to some integer operations.
102 int get64BitInstrCost(TTI::TargetCostKind CostKind) const;
103
104 std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const;
105
106 /// \returns true if V might be divergent even when all of its operands
107 /// are uniform.
108 bool isSourceOfDivergence(const Value *V) const;
109
110 /// Returns true for the target specific set of operations which produce
111 /// uniform result even taking non-uniform arguments.
112 bool isAlwaysUniform(const Value *V) const;
113
114public:
115 explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
116
117 bool hasBranchDivergence(const Function *F = nullptr) const override;
118
119 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
120 TTI::UnrollingPreferences &UP,
121 OptimizationRemarkEmitter *ORE) const override;
122
123 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
124 TTI::PeelingPreferences &PP) const override;
125
126 TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override {
127 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
128 return TTI::PSK_FastHardware;
129 }
130
131 unsigned getNumberOfRegisters(unsigned RCID) const override;
132 TypeSize
133 getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override;
134 unsigned getMinVectorRegisterBitWidth() const override;
135 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override;
136 bool preferSLPInstCountCheck() const override;
137 unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
138 unsigned ChainSizeInBytes,
139 VectorType *VecTy) const override;
140 unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
141 unsigned ChainSizeInBytes,
142 VectorType *VecTy) const override;
143 unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override;
144
145 bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
146 unsigned AddrSpace) const;
147 bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
148 unsigned AddrSpace) const override;
149 bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
150 unsigned AddrSpace) const override;
151
152 uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override;
153 Type *getMemcpyLoopLoweringType(
154 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
155 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
156 std::optional<uint32_t> AtomicElementSize) const override;
157
158 void getMemcpyLoopResidualLoweringType(
159 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
160 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
161 Align SrcAlign, Align DestAlign,
162 std::optional<uint32_t> AtomicCpySize) const override;
163 unsigned getMaxInterleaveFactor(ElementCount VF,
164 bool HasUnorderedReductions) const override;
165
166 bool getTgtMemIntrinsic(IntrinsicInst *Inst,
167 MemIntrinsicInfo &Info) const override;
168
169 InstructionCost getArithmeticInstrCost(
170 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
171 TTI::OperandValueInfo Op1Info = {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
172 TTI::OperandValueInfo Op2Info = {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
173 ArrayRef<const Value *> Args = {},
174 const Instruction *CxtI = nullptr) const override;
175
176 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
177 const Instruction *I = nullptr) const override;
178
179 bool isInlineAsmSourceOfDivergence(const CallInst *CI,
180 ArrayRef<unsigned> Indices = {}) const;
181
182 using BaseT::getVectorInstrCost;
183 InstructionCost
184 getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind,
185 unsigned Index, const Value *Op0, const Value *Op1,
186 TTI::VectorInstrContext VIC =
187 TTI::VectorInstrContext::None) const override;
188
189 bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
190
191 bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override {
192 // Address space casts must cast between different address spaces.
193 if (FromAS == ToAS)
194 return false;
195
196 // Casts between any aliasing address spaces are valid.
197 return AMDGPU::addrspacesMayAlias(AS1: FromAS, AS2: ToAS);
198 }
199
200 bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const override {
201 return AMDGPU::addrspacesMayAlias(AS1: AS0, AS2: AS1);
202 }
203
204 unsigned getFlatAddressSpace() const override {
205 // Don't bother running InferAddressSpaces pass on graphics shaders which
206 // don't use flat addressing.
207 if (IsGraphics)
208 return -1;
209 return AMDGPUAS::FLAT_ADDRESS;
210 }
211
212 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
213 Intrinsic::ID IID) const override;
214
215 bool
216 canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const override {
217 return AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
218 AS != AMDGPUAS::PRIVATE_ADDRESS;
219 }
220
221 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
222 Value *NewV) const override;
223
224 bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
225 const Value *Op1, InstCombiner &IC) const;
226
227 bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II,
228 unsigned LaneAgIdx) const;
229
230 std::optional<Instruction *>
231 instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override;
232
233 Value *simplifyAMDGCNLaneIntrinsicDemanded(InstCombiner &IC,
234 IntrinsicInst &II,
235 const APInt &DemandedElts,
236 APInt &UndefElts) const;
237
238 Instruction *hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
239 IntrinsicInst &II) const;
240
241 std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
242 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
243 APInt &UndefElts2, APInt &UndefElts3,
244 std::function<void(Instruction *, unsigned, APInt, APInt &)>
245 SimplifyAndSetOp) const override;
246
247 InstructionCost getVectorSplitCost() const { return 0; }
248
249 InstructionCost
250 getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
251 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
252 VectorType *SubTp, ArrayRef<const Value *> Args = {},
253 const Instruction *CxtI = nullptr) const override;
254
255 bool isProfitableToSinkOperands(Instruction *I,
256 SmallVectorImpl<Use *> &Ops) const override;
257
258 bool areInlineCompatible(const Function *Caller,
259 const Function *Callee) const override;
260
261 int getInliningLastCallToStaticBonus() const override;
262 unsigned getInliningThresholdMultiplier() const override { return 11; }
263 unsigned adjustInliningThreshold(const CallBase *CB) const override;
264 unsigned getCallerAllocaCost(const CallBase *CB,
265 const AllocaInst *AI) const override;
266
267 int getInlinerVectorBonusPercent() const override {
268 return InlinerVectorBonusPercent;
269 }
270
271 InstructionCost
272 getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
273 std::optional<FastMathFlags> FMF,
274 TTI::TargetCostKind CostKind) const override;
275
276 InstructionCost getPartialReductionCost(
277 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
278 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
279 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
280 TTI::TargetCostKind CostKind,
281 std::optional<FastMathFlags> FMF) const override {
282 return InstructionCost::getInvalid();
283 }
284
285 InstructionCost
286 getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
287 TTI::TargetCostKind CostKind) const override;
288 InstructionCost
289 getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
290 TTI::TargetCostKind CostKind) const override;
291
292 /// Data cache line size for LoopDataPrefetch pass. Has no use before GFX12.
293 unsigned getCacheLineSize() const override { return 128; }
294
295 /// How much before a load we should place the prefetch instruction.
296 /// This is currently measured in number of IR instructions.
297 unsigned getPrefetchDistance() const override;
298
299 /// \return if target want to issue a prefetch in address space \p AS.
300 bool shouldPrefetchAddressSpace(unsigned AS) const override;
301 void collectKernelLaunchBounds(
302 const Function &F,
303 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override;
304
305 enum class KnownIEEEMode { Unknown, On, Off };
306
307 /// Return KnownIEEEMode::On if we know if the use context can assume
308 /// "amdgpu-ieee"="true" and KnownIEEEMode::Off if we can assume
309 /// "amdgpu-ieee"="false".
310 KnownIEEEMode fpenvIEEEMode(const Instruction &I) const;
311
312 /// Account for loads of i8 vector types to have reduced cost. For
313 /// example the cost of load 4 i8s values is one is the cost of loading
314 /// a single i32 value.
315 InstructionCost getMemoryOpCost(
316 unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
317 TTI::TargetCostKind CostKind,
318 TTI::OperandValueInfo OpInfo = {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
319 const Instruction *I = nullptr) const override;
320
321 /// When counting parts on AMD GPUs, account for i8s being grouped
322 /// together under a single i32 value. Otherwise fall back to base
323 /// implementation.
324 unsigned getNumberOfParts(Type *Tp) const override;
325
326 ValueUniformity getValueUniformity(const Value *V) const override;
327
328 InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
329 StackOffset BaseOffset, bool HasBaseReg,
330 int64_t Scale,
331 unsigned AddrSpace) const override;
332
333 bool isLSRCostLess(const TTI::LSRCost &A,
334 const TTI::LSRCost &B) const override;
335 bool isNumRegsMajorCostOfLSR() const override;
336 bool shouldDropLSRSolutionIfLessProfitable() const override;
337
338 bool isUniform(const Instruction *I,
339 const SmallBitVector &UniformArgs) const override;
340};
341
342} // end namespace llvm
343
344#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
345