//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file declares a TargetTransformInfoImplBase-conforming object specific
/// to the AMDGPU target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target-independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H

#include "AMDGPU.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
#include <optional>

namespace llvm {

class AMDGPUTargetMachine;
class GCNSubtarget;
class InstCombiner;
class Loop;
class ScalarEvolution;
class SITargetLowering;
class Type;
class Value;

class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
  using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  Triple TargetTriple;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) const override;

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) const override;

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override;
};

class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
  using BaseT = BasicTTIImplBase<GCNTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const GCNSubtarget *ST;
  const SITargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;
  bool IsGraphics;
  bool HasFP32Denormals;
  bool HasFP64FP16Denormals;
  static constexpr bool InlinerVectorBonusPercent = 0;

  static const FeatureBitset InlineFeatureIgnoreList;

  const GCNSubtarget *getST() const { return ST; }
  const SITargetLowering *getTLI() const { return TLI; }

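  // For illustration (derived from the helpers below, with TCC_Basic == 1):
  // under a throughput cost kind a full-rate instruction costs 1, a half-rate
  // instruction 2, and a quarter-rate instruction 4, while TCK_CodeSize
  // queries cap the half- and quarter-rate helpers at 2.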
  static inline int getFullRateInstrCost() {
    return TargetTransformInfo::TCC_Basic;
  }

  static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 2 * TargetTransformInfo::TCC_Basic;
  }

  // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
  // should be 2 or 4.
  static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 4 * TargetTransformInfo::TCC_Basic;
  }

  // On some parts, normal fp64 operations are half rate, and others
  // quarter. This also applies to some integer operations.
  int get64BitInstrCost(TTI::TargetCostKind CostKind) const;

  std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const;

public:
  explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  bool hasBranchDivergence(const Function *F = nullptr) const override;

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) const override;

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) const override;

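  // AMDGPU has scalar and vector bit-count instructions (S_BCNT*/V_BCNT), so
  // population count is always reported as fast hardware.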
  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override {
    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
    return TTI::PSK_FastHardware;
  }

  unsigned getNumberOfRegisters(unsigned RCID) const override;
  TypeSize
  getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override;
  unsigned getMinVectorRegisterBitWidth() const override;
  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override;
  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                               unsigned ChainSizeInBytes,
                               VectorType *VecTy) const override;
  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const override;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override;

  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
                                   unsigned AddrSpace) const override;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const override;

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override;
  Type *getMemcpyLoopLoweringType(
      LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
      unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
      std::optional<uint32_t> AtomicElementSize) const override;

  void getMemcpyLoopResidualLoweringType(
      SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
      unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
      Align SrcAlign, Align DestAlign,
      std::optional<uint32_t> AtomicCpySize) const override;
  unsigned getMaxInterleaveFactor(ElementCount VF) const override;

  bool getTgtMemIntrinsic(IntrinsicInst *Inst,
                          MemIntrinsicInfo &Info) const override;

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = {},
      const Instruction *CxtI = nullptr) const override;

  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr) const override;

  bool isInlineAsmSourceOfDivergence(const CallInst *CI,
                                     ArrayRef<unsigned> Indices = {}) const;

  using BaseT::getVectorInstrCost;
  InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                     TTI::TargetCostKind CostKind,
                                     unsigned Index, const Value *Op0,
                                     const Value *Op1) const override;

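  // Divergence hooks for UniformityAnalysis. For example, workitem-id
  // intrinsics and atomic results are treated as sources of divergence, while
  // values such as llvm.amdgcn.readfirstlane are always uniform.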
  bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
  bool isSourceOfDivergence(const Value *V) const override;
  bool isAlwaysUniform(const Value *V) const override;

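  // For example (following the AMDGPU aliasing rules): a cast between
  // GLOBAL_ADDRESS and FLAT_ADDRESS is valid because the two spaces may
  // alias, whereas a cast between GLOBAL_ADDRESS and LOCAL_ADDRESS is not.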
  bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override {
    // Address space casts must cast between different address spaces.
    if (FromAS == ToAS)
      return false;

    // Casts between any aliasing address spaces are valid.
    return AMDGPU::addrspacesMayAlias(FromAS, ToAS);
  }

  bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const override {
    return AMDGPU::addrspacesMayAlias(AS0, AS1);
  }

  unsigned getFlatAddressSpace() const override {
    // Don't bother running InferAddressSpaces pass on graphics shaders which
    // don't use flat addressing.
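    // Returning -1 signals that there is no flat address space to optimize.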
    if (IsGraphics)
      return -1;
    return AMDGPUAS::FLAT_ADDRESS;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const override;

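  // LDS (LOCAL_ADDRESS), GDS (REGION_ADDRESS) and scratch (PRIVATE_ADDRESS)
  // variables cannot carry meaningful initializers, so only the remaining
  // address spaces report support for non-undef global initializers.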
  bool
  canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const override {
    return AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
           AS != AMDGPUAS::PRIVATE_ADDRESS;
  }

  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const override;

  bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
                                 const Value *Op1, InstCombiner &IC) const;

  bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II,
                                   unsigned LaneAgIdx) const;

  std::optional<Instruction *>
  instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override;

  Value *simplifyAMDGCNLaneIntrinsicDemanded(InstCombiner &IC,
                                             IntrinsicInst &II,
                                             const APInt &DemandedElts,
                                             APInt &UndefElts) const;

  Instruction *hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
                                                IntrinsicInst &II) const;

  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) const override;

  InstructionCost getVectorSplitCost() const { return 0; }

  InstructionCost
  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                 const Instruction *CxtI = nullptr) const override;

  bool isProfitableToSinkOperands(Instruction *I,
                                  SmallVectorImpl<Use *> &Ops) const override;

  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const override;

  int getInliningLastCallToStaticBonus() const override;
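  // Calls are comparatively expensive to lower on AMDGPU, so (heuristically)
  // scale the generic inlining threshold up rather aggressively.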
  unsigned getInliningThresholdMultiplier() const override { return 11; }
  unsigned adjustInliningThreshold(const CallBase *CB) const override;
  unsigned getCallerAllocaCost(const CallBase *CB,
                               const AllocaInst *AI) const override;

  int getInlinerVectorBonusPercent() const override {
    return InlinerVectorBonusPercent;
  }

  InstructionCost
  getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                             std::optional<FastMathFlags> FMF,
                             TTI::TargetCostKind CostKind) const override;

  InstructionCost
  getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                        TTI::TargetCostKind CostKind) const override;
  InstructionCost
  getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
                         TTI::TargetCostKind CostKind) const override;

  /// Data cache line size for the LoopDataPrefetch pass. Not used before
  /// GFX12.
  unsigned getCacheLineSize() const override { return 128; }

  /// How far ahead of a load we should place the prefetch instruction.
  /// This is currently measured in number of IR instructions.
  unsigned getPrefetchDistance() const override;

  /// \return true if the target wants to issue a prefetch in address space
  /// \p AS.
  bool shouldPrefetchAddressSpace(unsigned AS) const override;
  void collectKernelLaunchBounds(
      const Function &F,
      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override;

  enum class KnownIEEEMode { Unknown, On, Off };

  /// Return KnownIEEEMode::On if the use context of \p I can be assumed to
  /// have "amdgpu-ieee"="true", KnownIEEEMode::Off if it can be assumed to
  /// have "amdgpu-ieee"="false", and KnownIEEEMode::Unknown otherwise.
  KnownIEEEMode fpenvIEEEMode(const Instruction &I) const;

  /// Account for loads of i8 vector types having reduced cost. For example,
  /// the cost of loading four i8 values is the same as the cost of loading a
  /// single i32 value.
  InstructionCost getMemoryOpCost(
      unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
      TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
      const Instruction *I = nullptr) const override;
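  // Rough usage sketch only (TM, F and Ctx are assumed to be in scope, and
  // the exact value depends on the subtarget): a <4 x i8> load is expected to
  // be priced like a single i32 load, e.g.
  //   GCNTTIImpl GTTI(TM, F);
  //   InstructionCost C = GTTI.getMemoryOpCost(
  //       Instruction::Load, FixedVectorType::get(Type::getInt8Ty(Ctx), 4),
  //       Align(4), AMDGPUAS::GLOBAL_ADDRESS,
  //       TargetTransformInfo::TCK_RecipThroughput);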

  /// When counting parts on AMD GPUs, account for i8s being grouped together
  /// under a single i32 value. Otherwise fall back to the base implementation.
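  /// For illustration of the intent (not a guaranteed value): an <8 x i8>
  /// vector would be counted as two parts, i.e. two packed i32 values.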
  unsigned getNumberOfParts(Type *Tp) const override;
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H