1//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
/// \file
/// This file defines a TargetTransformInfo::Concept conforming object specific
/// to the AMDGPU target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
18#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
19
20#include "AMDGPU.h"
21#include "llvm/CodeGen/BasicTTIImpl.h"
22#include <optional>
23
24namespace llvm {
25
// Forward declarations: each of these types is used in this header only
// through a pointer or a reference, so a complete definition is not needed.
class AMDGPUTargetMachine;
class GCNSubtarget;
class InstCombiner;
class Loop;
class ScalarEvolution;
class SITargetLowering;
class Type;
class Value;
34
/// TargetTransformInfo implementation built on BasicTTIImplBase (the class
/// passes itself as the template argument). GCNTTIImpl, declared later in this
/// header, holds one instance of this class by value as its CommonTTI member.
///
/// Every hook except the two private accessors is only declared here and is
/// defined out of line.
class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
  using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
  using TTI = TargetTransformInfo;

  // Grants the base class access to the private members below (presumably so
  // it can call getST()/getTLI() -- confirm against BasicTTIImplBase).
  friend BaseT;

  Triple TargetTriple;

  // Held through the generic base types (TargetSubtargetInfo /
  // TargetLoweringBase) rather than AMDGPU-specific subclasses.
  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  /// \return the stored subtarget pointer.
  const TargetSubtargetInfo *getST() const { return ST; }
  /// \return the stored target-lowering pointer.
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  /// Constructs the TTI object for function \p F on target machine \p TM.
  /// Defined out of line.
  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  /// Fills in the unrolling preferences \p UP for loop \p L. Defined out of
  /// line.
  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE);

  /// Fills in the peeling preferences \p PP for loop \p L. Defined out of
  /// line.
  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);

  /// Defined out of line.
  int64_t getMaxMemIntrinsicInlineSizeThreshold() const;
};
61
62class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
63 using BaseT = BasicTTIImplBase<GCNTTIImpl>;
64 using TTI = TargetTransformInfo;
65
66 friend BaseT;
67
68 const GCNSubtarget *ST;
69 const SITargetLowering *TLI;
70 AMDGPUTTIImpl CommonTTI;
71 bool IsGraphics;
72 bool HasFP32Denormals;
73 bool HasFP64FP16Denormals;
74 static constexpr bool InlinerVectorBonusPercent = 0;
75
76 static const FeatureBitset InlineFeatureIgnoreList;
77
78 const GCNSubtarget *getST() const { return ST; }
79 const SITargetLowering *getTLI() const { return TLI; }
80
81 static inline int getFullRateInstrCost() {
82 return TargetTransformInfo::TCC_Basic;
83 }
84
85 static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) {
86 return CostKind == TTI::TCK_CodeSize ? 2
87 : 2 * TargetTransformInfo::TCC_Basic;
88 }
89
90 // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
91 // should be 2 or 4.
92 static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) {
93 return CostKind == TTI::TCK_CodeSize ? 2
94 : 4 * TargetTransformInfo::TCC_Basic;
95 }
96
97 // On some parts, normal fp64 operations are half rate, and others
98 // quarter. This also applies to some integer operations.
99 int get64BitInstrCost(TTI::TargetCostKind CostKind) const;
100
101 std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const;
102
103public:
104 explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
105
106 bool hasBranchDivergence(const Function *F = nullptr) const;
107
108 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
109 TTI::UnrollingPreferences &UP,
110 OptimizationRemarkEmitter *ORE);
111
112 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
113 TTI::PeelingPreferences &PP);
114
115 TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
116 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
117 return TTI::PSK_FastHardware;
118 }
119
120 unsigned getNumberOfRegisters(unsigned RCID) const;
121 TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
122 unsigned getMinVectorRegisterBitWidth() const;
123 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
124 unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
125 unsigned ChainSizeInBytes,
126 VectorType *VecTy) const;
127 unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
128 unsigned ChainSizeInBytes,
129 VectorType *VecTy) const;
130 unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
131
132 bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
133 unsigned AddrSpace) const;
134 bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
135 unsigned AddrSpace) const;
136 bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
137 unsigned AddrSpace) const;
138
139 int64_t getMaxMemIntrinsicInlineSizeThreshold() const;
140 Type *getMemcpyLoopLoweringType(
141 LLVMContext & Context, Value * Length, unsigned SrcAddrSpace,
142 unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
143 std::optional<uint32_t> AtomicElementSize) const;
144
145 void getMemcpyLoopResidualLoweringType(
146 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
147 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
148 unsigned SrcAlign, unsigned DestAlign,
149 std::optional<uint32_t> AtomicCpySize) const;
150 unsigned getMaxInterleaveFactor(ElementCount VF);
151
152 bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
153
154 InstructionCost getArithmeticInstrCost(
155 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
156 TTI::OperandValueInfo Op1Info = {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
157 TTI::OperandValueInfo Op2Info = {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
158 ArrayRef<const Value *> Args = std::nullopt,
159 const Instruction *CxtI = nullptr);
160
161 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
162 const Instruction *I = nullptr);
163
164 bool isInlineAsmSourceOfDivergence(const CallInst *CI,
165 ArrayRef<unsigned> Indices = {}) const;
166
167 using BaseT::getVectorInstrCost;
168 InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
169 TTI::TargetCostKind CostKind,
170 unsigned Index, Value *Op0, Value *Op1);
171
172 bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
173 bool isSourceOfDivergence(const Value *V) const;
174 bool isAlwaysUniform(const Value *V) const;
175
176 bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
177 if (ToAS == AMDGPUAS::FLAT_ADDRESS) {
178 switch (FromAS) {
179 case AMDGPUAS::GLOBAL_ADDRESS:
180 case AMDGPUAS::CONSTANT_ADDRESS:
181 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
182 case AMDGPUAS::LOCAL_ADDRESS:
183 case AMDGPUAS::PRIVATE_ADDRESS:
184 return true;
185 default:
186 break;
187 }
188 return false;
189 }
190 if ((FromAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
191 ToAS == AMDGPUAS::CONSTANT_ADDRESS) ||
192 (FromAS == AMDGPUAS::CONSTANT_ADDRESS &&
193 ToAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT))
194 return true;
195 return false;
196 }
197
198 bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
199 return AMDGPU::addrspacesMayAlias(AS1: AS0, AS2: AS1);
200 }
201
202 unsigned getFlatAddressSpace() const {
203 // Don't bother running InferAddressSpaces pass on graphics shaders which
204 // don't use flat addressing.
205 if (IsGraphics)
206 return -1;
207 return AMDGPUAS::FLAT_ADDRESS;
208 }
209
210 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
211 Intrinsic::ID IID) const;
212
213 bool canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const {
214 return AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
215 AS != AMDGPUAS::PRIVATE_ADDRESS;
216 }
217
218 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
219 Value *NewV) const;
220
221 bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
222 const Value *Op1, InstCombiner &IC) const;
223 std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
224 IntrinsicInst &II) const;
225 std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
226 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
227 APInt &UndefElts2, APInt &UndefElts3,
228 std::function<void(Instruction *, unsigned, APInt, APInt &)>
229 SimplifyAndSetOp) const;
230
231 InstructionCost getVectorSplitCost() { return 0; }
232
233 InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
234 ArrayRef<int> Mask,
235 TTI::TargetCostKind CostKind, int Index,
236 VectorType *SubTp,
237 ArrayRef<const Value *> Args = std::nullopt,
238 const Instruction *CxtI = nullptr);
239
240 bool areInlineCompatible(const Function *Caller,
241 const Function *Callee) const;
242
243 unsigned getInliningThresholdMultiplier() const { return 11; }
244 unsigned adjustInliningThreshold(const CallBase *CB) const;
245 unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const;
246
247 int getInlinerVectorBonusPercent() const { return InlinerVectorBonusPercent; }
248
249 InstructionCost getArithmeticReductionCost(
250 unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF,
251 TTI::TargetCostKind CostKind);
252
253 InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
254 TTI::TargetCostKind CostKind);
255 InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
256 FastMathFlags FMF,
257 TTI::TargetCostKind CostKind);
258
259 /// Data cache line size for LoopDataPrefetch pass. Has no use before GFX12.
260 unsigned getCacheLineSize() const override { return 128; }
261
262 /// How much before a load we should place the prefetch instruction.
263 /// This is currently measured in number of IR instructions.
264 unsigned getPrefetchDistance() const override;
265
266 /// \return if target want to issue a prefetch in address space \p AS.
267 bool shouldPrefetchAddressSpace(unsigned AS) const override;
268};
269
270} // end namespace llvm
271
272#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
273