//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file describes a TargetTransformInfoImplBase conforming object
/// specific to the AMDGPU target machine. It uses the target's detailed
/// information to provide more precise answers to certain TTI queries, while
/// letting the target-independent and default TTI implementations handle the
/// rest.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H

#include "AMDGPU.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
#include <optional>

namespace llvm {

class AMDGPUTargetMachine;
class GCNSubtarget;
class InstCombiner;
class Loop;
class ScalarEvolution;
class SITargetLowering;
class Type;
class Value;

class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
  using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  Triple TargetTriple;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) const override;

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) const override;

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override;
};

class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
  using BaseT = BasicTTIImplBase<GCNTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const GCNSubtarget *ST;
  const SITargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;
  bool IsGraphics;
  bool HasFP32Denormals;
  bool HasFP64FP16Denormals;
  static constexpr bool InlinerVectorBonusPercent = 0;

  static const FeatureBitset InlineFeatureIgnoreList;

  const GCNSubtarget *getST() const { return ST; }
  const SITargetLowering *getTLI() const { return TLI; }

  static inline int getFullRateInstrCost() {
    return TargetTransformInfo::TCC_Basic;
  }

  static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 2 * TargetTransformInfo::TCC_Basic;
  }

  // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
  // should be 2 or 4.
  static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 4 * TargetTransformInfo::TCC_Basic;
  }
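  // Illustrative note (a sketch of the intent, not a contract): with
  // TCC_Basic == 1, these helpers return roughly 1, 2 and 4 for full-,
  // half- and quarter-rate instructions under throughput-style cost kinds,
  // while TCK_CodeSize flattens both slower rates to 2.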

  // On some subtargets, normal fp64 operations are half rate, and on others
  // quarter rate. This also applies to some integer operations.
  int get64BitInstrCost(TTI::TargetCostKind CostKind) const;

  std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const;

public:
  explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  bool hasBranchDivergence(const Function *F = nullptr) const override;

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) const override;

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) const override;

  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override {
    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
    return TTI::PSK_FastHardware;
  }

  unsigned getNumberOfRegisters(unsigned RCID) const override;
  TypeSize
  getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override;
  unsigned getMinVectorRegisterBitWidth() const override;
  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override;
  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                               unsigned ChainSizeInBytes,
                               VectorType *VecTy) const override;
  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const override;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override;

  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
                                   unsigned AddrSpace) const override;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const override;

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override;
  Type *getMemcpyLoopLoweringType(
      LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
      unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
      std::optional<uint32_t> AtomicElementSize) const override;

  void getMemcpyLoopResidualLoweringType(
      SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
      unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
      Align SrcAlign, Align DestAlign,
      std::optional<uint32_t> AtomicCpySize) const override;
  unsigned getMaxInterleaveFactor(ElementCount VF) const override;

  bool getTgtMemIntrinsic(IntrinsicInst *Inst,
                          MemIntrinsicInfo &Info) const override;

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = {},
      const Instruction *CxtI = nullptr) const override;

  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr) const override;

  bool isInlineAsmSourceOfDivergence(const CallInst *CI,
                                     ArrayRef<unsigned> Indices = {}) const;

  using BaseT::getVectorInstrCost;
  InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                     TTI::TargetCostKind CostKind,
                                     unsigned Index, const Value *Op0,
                                     const Value *Op1) const override;

  bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
  bool isSourceOfDivergence(const Value *V) const override;
  bool isAlwaysUniform(const Value *V) const override;

  bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override {
    // Address space casts must cast between different address spaces.
    if (FromAS == ToAS)
      return false;

    // Casts between any aliasing address spaces are valid.
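    // For example (assuming the usual AMDGPU aliasing matrix): a cast
    // between FLAT_ADDRESS and LOCAL_ADDRESS is valid because flat pointers
    // may alias LDS, while a LOCAL_ADDRESS <-> PRIVATE_ADDRESS cast is not,
    // since those two spaces never alias.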
    return AMDGPU::addrspacesMayAlias(FromAS, ToAS);
  }

  bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const override {
    return AMDGPU::addrspacesMayAlias(AS0, AS1);
  }

  unsigned getFlatAddressSpace() const override {
    // Don't bother running InferAddressSpaces pass on graphics shaders which
    // don't use flat addressing.
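    // Returning -1 here matches InferAddressSpaces' notion of an
    // uninitialized/absent flat address space, so the pass bails out early
    // (a note on current pass behavior, not an API guarantee).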
    if (IsGraphics)
      return -1;
    return AMDGPUAS::FLAT_ADDRESS;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const override;

  bool
  canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const override {
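    // LDS (LOCAL), GDS (REGION) and scratch (PRIVATE) allocations are
    // per-workgroup or per-workitem resources that are not materialized from
    // the module image, so a non-undef initializer cannot be honored there.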
    return AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
           AS != AMDGPUAS::PRIVATE_ADDRESS;
  }

  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const override;

  bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
                                 const Value *Op1, InstCombiner &IC) const;

  bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II,
                                   unsigned LaneArgIdx) const;

  std::optional<Instruction *>
  instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override;

  Value *simplifyAMDGCNLaneIntrinsicDemanded(InstCombiner &IC,
                                             IntrinsicInst &II,
                                             const APInt &DemandedElts,
                                             APInt &UndefElts) const;

  Instruction *hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
                                                IntrinsicInst &II) const;

  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) const override;

  InstructionCost getVectorSplitCost() const { return 0; }

  InstructionCost
  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                 const Instruction *CxtI = nullptr) const override;

  bool isProfitableToSinkOperands(Instruction *I,
                                  SmallVectorImpl<Use *> &Ops) const override;

  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const override;

  int getInliningLastCallToStaticBonus() const override;
  unsigned getInliningThresholdMultiplier() const override { return 11; }
  unsigned adjustInliningThreshold(const CallBase *CB) const override;
  unsigned getCallerAllocaCost(const CallBase *CB,
                               const AllocaInst *AI) const override;

  int getInlinerVectorBonusPercent() const override {
    return InlinerVectorBonusPercent;
  }

  InstructionCost
  getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                             std::optional<FastMathFlags> FMF,
                             TTI::TargetCostKind CostKind) const override;

  InstructionCost
  getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                        TTI::TargetCostKind CostKind) const override;
  InstructionCost
  getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
                         TTI::TargetCostKind CostKind) const override;

  /// Data cache line size for the LoopDataPrefetch pass. Not used before
  /// GFX12.
  unsigned getCacheLineSize() const override { return 128; }

  /// How far ahead of a load we should place the prefetch instruction.
  /// This is currently measured in number of IR instructions.
  unsigned getPrefetchDistance() const override;

  /// \return true if the target wants to issue a prefetch in address space
  /// \p AS.
  bool shouldPrefetchAddressSpace(unsigned AS) const override;
  void collectKernelLaunchBounds(
      const Function &F,
      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override;

  enum class KnownIEEEMode { Unknown, On, Off };

  /// Return KnownIEEEMode::On if we know the use context can assume
  /// "amdgpu-ieee"="true", and KnownIEEEMode::Off if we can assume
  /// "amdgpu-ieee"="false".
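  ///
  /// Hypothetical caller sketch (illustration only; "GCNTTI" is a
  /// GCNTTIImpl reference assumed by the example):
  /// \code
  ///   switch (GCNTTI.fpenvIEEEMode(I)) {
  ///   case KnownIEEEMode::On:  /* may assume IEEE FP semantics */ break;
  ///   case KnownIEEEMode::Off: /* may assume "amdgpu-ieee"="false" */ break;
  ///   case KnownIEEEMode::Unknown: /* stay conservative */ break;
  ///   }
  /// \endcode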
  KnownIEEEMode fpenvIEEEMode(const Instruction &I) const;

  /// Account for loads of i8 vector types having reduced cost. For example,
  /// the cost of loading 4 i8 values is the cost of loading a single i32
  /// value.
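  /// For illustration only (exact numbers depend on the subtarget and cost
  /// kind): a load of <4 x i8> is costed like one i32 load, and a load of
  /// <8 x i8> like two i32 loads.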
  InstructionCost getMemoryOpCost(
      unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
      TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
      const Instruction *I = nullptr) const override;

  /// When counting parts on AMD GPUs, account for i8s being grouped
  /// together under a single i32 value. Otherwise fall back to base
  /// implementation.
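  /// For example, <4 x i8> is counted as a single 32-bit part rather than
  /// four 8-bit parts (illustrative; the exact grouping follows the i8
  /// handling described above).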
  unsigned getNumberOfParts(Type *Tp) const override;
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H