//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file declares a TargetTransformInfoImplBase conforming object specific
/// to the AMDGPU target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target-independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H

#include "AMDGPU.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
#include <optional>

namespace llvm {

class AMDGPUTargetMachine;
class GCNSubtarget;
class InstCombiner;
class Loop;
class ScalarEvolution;
class SITargetLowering;
class Type;
class Value;

class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
  using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  Triple TargetTriple;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) const override;

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) const override;

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override;
};

class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
  using BaseT = BasicTTIImplBase<GCNTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const GCNSubtarget *ST;
  const SITargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;
  bool IsGraphics;
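  // Cached per-function denormal-mode flags; whether FP32 and FP64/FP16
  // denormals are enabled feeds into the modeled cost of some floating-point
  // operations.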
  bool HasFP32Denormals;
  bool HasFP64FP16Denormals;
  static constexpr bool InlinerVectorBonusPercent = 0;

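  // Subtarget features that are ignored when deciding whether a caller and
  // callee are compatible in areInlineCompatible().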
  static const FeatureBitset InlineFeatureIgnoreList;

  const GCNSubtarget *getST() const { return ST; }
  const SITargetLowering *getTLI() const { return TLI; }

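  // The helpers below model the throughput classes of GCN instructions:
  // full-rate instructions cost one TCC_Basic, half-rate instructions twice
  // that, and quarter-rate instructions four times that. For TCK_CodeSize the
  // slower classes return a small fixed size cost instead.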
  static inline int getFullRateInstrCost() {
    return TargetTransformInfo::TCC_Basic;
  }

  static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 2 * TargetTransformInfo::TCC_Basic;
  }

  // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
  // should be 2 or 4.
  static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 4 * TargetTransformInfo::TCC_Basic;
  }

  // On some parts, normal fp64 operations are half rate, and others
  // quarter. This also applies to some integer operations.
  int get64BitInstrCost(TTI::TargetCostKind CostKind) const;

  std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const;

  /// \returns true if V might be divergent even when all of its operands
  /// are uniform.
  bool isSourceOfDivergence(const Value *V) const;

  /// Returns true for the target-specific set of operations that produce a
  /// uniform result even when taking non-uniform arguments.
  bool isAlwaysUniform(const Value *V) const;

public:
  explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  bool hasBranchDivergence(const Function *F = nullptr) const override;

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) const override;

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) const override;

  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override {
    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
    return TTI::PSK_FastHardware;
  }

  unsigned getNumberOfRegisters(unsigned RCID) const override;
  TypeSize
  getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override;
  unsigned getMinVectorRegisterBitWidth() const override;
  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override;
  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                               unsigned ChainSizeInBytes,
                               VectorType *VecTy) const override;
  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const override;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override;

  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
                                   unsigned AddrSpace) const override;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const override;

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override;
  Type *getMemcpyLoopLoweringType(
      LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
      unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
      std::optional<uint32_t> AtomicElementSize) const override;

  void getMemcpyLoopResidualLoweringType(
      SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
      unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
      Align SrcAlign, Align DestAlign,
      std::optional<uint32_t> AtomicCpySize) const override;
  unsigned getMaxInterleaveFactor(ElementCount VF) const override;

  bool getTgtMemIntrinsic(IntrinsicInst *Inst,
                          MemIntrinsicInfo &Info) const override;

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = {},
      const Instruction *CxtI = nullptr) const override;

  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr) const override;

  bool isInlineAsmSourceOfDivergence(const CallInst *CI,
                                     ArrayRef<unsigned> Indices = {}) const;

  using BaseT::getVectorInstrCost;
  InstructionCost
  getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind,
                     unsigned Index, const Value *Op0, const Value *Op1,
                     TTI::VectorInstrContext VIC =
                         TTI::VectorInstrContext::None) const override;

  bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;

  bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override {
    // Address space casts must cast between different address spaces.
    if (FromAS == ToAS)
      return false;

    // Casts between any aliasing address spaces are valid.
    return AMDGPU::addrspacesMayAlias(FromAS, ToAS);
  }

  bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const override {
    return AMDGPU::addrspacesMayAlias(AS0, AS1);
  }

  unsigned getFlatAddressSpace() const override {
    // Don't bother running InferAddressSpaces pass on graphics shaders which
    // don't use flat addressing.
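    // Returning -1 (~0u) reports that there is no flat address space to
    // optimize, so the pass leaves the function alone.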
    if (IsGraphics)
      return -1;
    return AMDGPUAS::FLAT_ADDRESS;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const override;

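  // LDS (local), GDS (region) and scratch (private) memory cannot be
  // initialized by the module loader, so globals placed there may only have
  // undef initializers.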
  bool
  canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const override {
    return AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
           AS != AMDGPUAS::PRIVATE_ADDRESS;
  }

  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const override;

  bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
                                 const Value *Op1, InstCombiner &IC) const;

  bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II,
                                   unsigned LaneAgIdx) const;

  std::optional<Instruction *>
  instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override;

  Value *simplifyAMDGCNLaneIntrinsicDemanded(InstCombiner &IC,
                                             IntrinsicInst &II,
                                             const APInt &DemandedElts,
                                             APInt &UndefElts) const;

  Instruction *hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
                                                IntrinsicInst &II) const;

  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) const override;

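  // Vector splits are modeled as free: vector values already occupy
  // consecutive 32-bit registers, so splitting does not require any extra
  // instructions.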
  InstructionCost getVectorSplitCost() const { return 0; }

  InstructionCost
  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                 const Instruction *CxtI = nullptr) const override;

  bool isProfitableToSinkOperands(Instruction *I,
                                  SmallVectorImpl<Use *> &Ops) const override;

  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const override;

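  // Calls are expensive on AMDGPU, so inlining is favored much more
  // aggressively than on CPU targets.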
  int getInliningLastCallToStaticBonus() const override;
  unsigned getInliningThresholdMultiplier() const override { return 11; }
  unsigned adjustInliningThreshold(const CallBase *CB) const override;
  unsigned getCallerAllocaCost(const CallBase *CB,
                               const AllocaInst *AI) const override;

  int getInlinerVectorBonusPercent() const override {
    return InlinerVectorBonusPercent;
  }

  InstructionCost
  getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                             std::optional<FastMathFlags> FMF,
                             TTI::TargetCostKind CostKind) const override;

  InstructionCost
  getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                        TTI::TargetCostKind CostKind) const override;
  InstructionCost
  getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
                         TTI::TargetCostKind CostKind) const override;

  /// Data cache line size for the LoopDataPrefetch pass. Not used before
  /// GFX12.
  unsigned getCacheLineSize() const override { return 128; }

  /// How far ahead of a load we should place the prefetch instruction.
  /// This is currently measured in number of IR instructions.
  unsigned getPrefetchDistance() const override;

  /// \return true if the target wants to issue a prefetch in address space
  /// \p AS.
  bool shouldPrefetchAddressSpace(unsigned AS) const override;
  void collectKernelLaunchBounds(
      const Function &F,
      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override;

  enum class KnownIEEEMode { Unknown, On, Off };

  /// Return KnownIEEEMode::On if we know the use context can assume
  /// "amdgpu-ieee"="true", and KnownIEEEMode::Off if we can assume
  /// "amdgpu-ieee"="false".
  KnownIEEEMode fpenvIEEEMode(const Instruction &I) const;

  /// Account for loads of i8 vector types having reduced cost. For example,
  /// the cost of loading 4 i8 values is the cost of loading a single i32
  /// value.
  InstructionCost getMemoryOpCost(
      unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
      TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
      const Instruction *I = nullptr) const override;

  /// When counting parts on AMD GPUs, account for i8s being grouped together
  /// under a single i32 value. Otherwise fall back to the base implementation.
  unsigned getNumberOfParts(Type *Tp) const override;

  InstructionUniformity getInstructionUniformity(const Value *V) const override;
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H