1 | //===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
/// This file provides a TargetTransformInfo::Concept conforming object
/// specific to the AMDGPU target machine. It uses the target's detailed
/// information to provide more precise answers to certain TTI queries, while
/// letting the target-independent and default TTI implementations handle the
/// rest.
14 | // |
15 | //===----------------------------------------------------------------------===// |
16 | |
17 | #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H |
18 | #define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H |
19 | |
20 | #include "AMDGPU.h" |
21 | #include "llvm/CodeGen/BasicTTIImpl.h" |
22 | #include <optional> |
23 | |
24 | namespace llvm { |
25 | |
26 | class AMDGPUTargetMachine; |
27 | class GCNSubtarget; |
28 | class InstCombiner; |
29 | class Loop; |
30 | class ScalarEvolution; |
31 | class SITargetLowering; |
32 | class Type; |
33 | class Value; |
34 | |
35 | class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> { |
36 | using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>; |
37 | using TTI = TargetTransformInfo; |
38 | |
39 | friend BaseT; |
40 | |
41 | Triple TargetTriple; |
42 | |
43 | const TargetSubtargetInfo *ST; |
44 | const TargetLoweringBase *TLI; |
45 | |
46 | const TargetSubtargetInfo *getST() const { return ST; } |
47 | const TargetLoweringBase *getTLI() const { return TLI; } |
48 | |
49 | public: |
50 | explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F); |
51 | |
  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE);
55 | |
56 | void getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
57 | TTI::PeelingPreferences &PP); |
58 | |
59 | int64_t getMaxMemIntrinsicInlineSizeThreshold() const; |
60 | }; |
61 | |
62 | class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> { |
63 | using BaseT = BasicTTIImplBase<GCNTTIImpl>; |
64 | using TTI = TargetTransformInfo; |
65 | |
66 | friend BaseT; |
67 | |
68 | const GCNSubtarget *ST; |
69 | const SITargetLowering *TLI; |
70 | AMDGPUTTIImpl CommonTTI; |
71 | bool IsGraphics; |
72 | bool HasFP32Denormals; |
73 | bool HasFP64FP16Denormals; |
  static constexpr int InlinerVectorBonusPercent = 0;
75 | |
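  /// Subtarget features that are permitted to differ between caller and
  /// callee when deciding whether two functions are inline compatible.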
76 | static const FeatureBitset InlineFeatureIgnoreList; |
77 | |
78 | const GCNSubtarget *getST() const { return ST; } |
79 | const SITargetLowering *getTLI() const { return TLI; } |
80 | |
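  // Arithmetic costs below are modeled relative to a "full rate" ALU
  // instruction: half-rate and quarter-rate ops are charged 2x and 4x the
  // basic cost, respectively, except when modeling code size, where both are
  // charged a cost of 2.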
81 | static inline int getFullRateInstrCost() { |
82 | return TargetTransformInfo::TCC_Basic; |
83 | } |
84 | |
85 | static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) { |
86 | return CostKind == TTI::TCK_CodeSize ? 2 |
87 | : 2 * TargetTransformInfo::TCC_Basic; |
88 | } |
89 | |
  // TODO: The encoded size is usually 8 bytes, but the instruction takes 4x as
  // many cycles as a full-rate one. Maybe the cost should be 2 or 4.
92 | static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) { |
93 | return CostKind == TTI::TCK_CodeSize ? 2 |
94 | : 4 * TargetTransformInfo::TCC_Basic; |
95 | } |
96 | |
  // On some subtargets, normal fp64 operations are half rate; on others,
  // quarter rate. This also applies to some integer operations.
99 | int get64BitInstrCost(TTI::TargetCostKind CostKind) const; |
100 | |
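  /// Return the legalized type of \p Ty together with an estimate of the cost
  /// of legalizing it.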
101 | std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const; |
102 | |
103 | public: |
104 | explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F); |
105 | |
106 | bool hasBranchDivergence(const Function *F = nullptr) const; |
107 | |
  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE);
111 | |
112 | void getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
113 | TTI::PeelingPreferences &PP); |
114 | |
115 | TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { |
    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
117 | return TTI::PSK_FastHardware; |
118 | } |
119 | |
120 | unsigned getNumberOfRegisters(unsigned RCID) const; |
121 | TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const; |
122 | unsigned getMinVectorRegisterBitWidth() const; |
123 | unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const; |
124 | unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, |
125 | unsigned ChainSizeInBytes, |
126 | VectorType *VecTy) const; |
127 | unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, |
128 | unsigned ChainSizeInBytes, |
129 | VectorType *VecTy) const; |
130 | unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; |
131 | |
132 | bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, |
133 | unsigned AddrSpace) const; |
134 | bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, |
135 | unsigned AddrSpace) const; |
136 | bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, |
137 | unsigned AddrSpace) const; |
138 | |
139 | int64_t getMaxMemIntrinsicInlineSizeThreshold() const; |
140 | Type *getMemcpyLoopLoweringType( |
      LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
      unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
143 | std::optional<uint32_t> AtomicElementSize) const; |
144 | |
145 | void getMemcpyLoopResidualLoweringType( |
146 | SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context, |
147 | unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, |
148 | unsigned SrcAlign, unsigned DestAlign, |
149 | std::optional<uint32_t> AtomicCpySize) const; |
150 | unsigned getMaxInterleaveFactor(ElementCount VF); |
151 | |
152 | bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const; |
153 | |
154 | InstructionCost getArithmeticInstrCost( |
155 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
158 | ArrayRef<const Value *> Args = std::nullopt, |
159 | const Instruction *CxtI = nullptr); |
160 | |
161 | InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, |
162 | const Instruction *I = nullptr); |
163 | |
164 | bool isInlineAsmSourceOfDivergence(const CallInst *CI, |
165 | ArrayRef<unsigned> Indices = {}) const; |
166 | |
167 | using BaseT::getVectorInstrCost; |
168 | InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, |
169 | TTI::TargetCostKind CostKind, |
170 | unsigned Index, Value *Op0, Value *Op1); |
171 | |
172 | bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const; |
173 | bool isSourceOfDivergence(const Value *V) const; |
174 | bool isAlwaysUniform(const Value *V) const; |
175 | |
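  /// A cast into the flat address space is valid from any of the specific
  /// (global, constant, constant-32bit, local, or private) address spaces; in
  /// addition, the constant and constant-32bit address spaces are
  /// interchangeable.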
176 | bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { |
177 | if (ToAS == AMDGPUAS::FLAT_ADDRESS) { |
178 | switch (FromAS) { |
179 | case AMDGPUAS::GLOBAL_ADDRESS: |
180 | case AMDGPUAS::CONSTANT_ADDRESS: |
181 | case AMDGPUAS::CONSTANT_ADDRESS_32BIT: |
182 | case AMDGPUAS::LOCAL_ADDRESS: |
183 | case AMDGPUAS::PRIVATE_ADDRESS: |
184 | return true; |
185 | default: |
186 | break; |
187 | } |
188 | return false; |
189 | } |
190 | if ((FromAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && |
191 | ToAS == AMDGPUAS::CONSTANT_ADDRESS) || |
192 | (FromAS == AMDGPUAS::CONSTANT_ADDRESS && |
193 | ToAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)) |
194 | return true; |
195 | return false; |
196 | } |
197 | |
198 | bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const { |
    return AMDGPU::addrspacesMayAlias(AS0, AS1);
200 | } |
201 | |
202 | unsigned getFlatAddressSpace() const { |
203 | // Don't bother running InferAddressSpaces pass on graphics shaders which |
204 | // don't use flat addressing. |
205 | if (IsGraphics) |
206 | return -1; |
207 | return AMDGPUAS::FLAT_ADDRESS; |
208 | } |
209 | |
210 | bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, |
211 | Intrinsic::ID IID) const; |
212 | |
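  /// Globals in the LDS (local), GDS (region), and scratch (private) address
  /// spaces cannot carry meaningful initializers; all other address spaces
  /// can.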
213 | bool canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const { |
214 | return AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS && |
215 | AS != AMDGPUAS::PRIVATE_ADDRESS; |
216 | } |
217 | |
218 | Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, |
219 | Value *NewV) const; |
220 | |
221 | bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0, |
222 | const Value *Op1, InstCombiner &IC) const; |
223 | std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, |
224 | IntrinsicInst &II) const; |
225 | std::optional<Value *> simplifyDemandedVectorEltsIntrinsic( |
226 | InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, |
227 | APInt &UndefElts2, APInt &UndefElts3, |
228 | std::function<void(Instruction *, unsigned, APInt, APInt &)> |
229 | SimplifyAndSetOp) const; |
230 | |
231 | InstructionCost getVectorSplitCost() { return 0; } |
232 | |
233 | InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, |
234 | ArrayRef<int> Mask, |
235 | TTI::TargetCostKind CostKind, int Index, |
236 | VectorType *SubTp, |
237 | ArrayRef<const Value *> Args = std::nullopt, |
238 | const Instruction *CxtI = nullptr); |
239 | |
240 | bool areInlineCompatible(const Function *Caller, |
241 | const Function *Callee) const; |
242 | |
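  // Calls and stack traffic are comparatively expensive on AMDGPU, so the
  // inliner is biased strongly toward inlining.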
243 | unsigned getInliningThresholdMultiplier() const { return 11; } |
244 | unsigned adjustInliningThreshold(const CallBase *CB) const; |
245 | unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const; |
246 | |
247 | int getInlinerVectorBonusPercent() const { return InlinerVectorBonusPercent; } |
248 | |
249 | InstructionCost getArithmeticReductionCost( |
250 | unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF, |
251 | TTI::TargetCostKind CostKind); |
252 | |
253 | InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
254 | TTI::TargetCostKind CostKind); |
255 | InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, |
256 | FastMathFlags FMF, |
257 | TTI::TargetCostKind CostKind); |
258 | |
  /// Data cache line size for the LoopDataPrefetch pass. Not used on targets
  /// before GFX12.
260 | unsigned getCacheLineSize() const override { return 128; } |
261 | |
  /// How far ahead of a load the prefetch instruction should be placed,
  /// currently measured in number of IR instructions.
264 | unsigned getPrefetchDistance() const override; |
265 | |
  /// \return true if the target wants to issue a prefetch in address space
  /// \p AS.
267 | bool shouldPrefetchAddressSpace(unsigned AS) const override; |
268 | }; |
269 | |
270 | } // end namespace llvm |
271 | |
272 | #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H |
273 | |