//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file declares a TargetTransformInfoImplBase conforming object specific
/// to the AMDGPU target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target-independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H

#include "AMDGPU.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
#include <optional>

namespace llvm {

class AMDGPUTargetMachine;
class GCNSubtarget;
class InstCombiner;
class Loop;
class ScalarEvolution;
class SITargetLowering;
class Type;
class Value;

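/// Common, subtarget-independent portion of the AMDGPU TTI implementation; the
/// GCN-specific implementation below embeds an instance of it (see the
/// CommonTTI member of GCNTTIImpl).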
class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
  using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  Triple TargetTriple;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) const override;

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) const override;

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override;
};

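/// TTI implementation for GCN (amdgcn) subtargets. It refines the generic
/// answers using GCNSubtarget and SITargetLowering, and delegates the common
/// queries to the embedded AMDGPUTTIImpl.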
class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
  using BaseT = BasicTTIImplBase<GCNTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const GCNSubtarget *ST;
  const SITargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;
  bool IsGraphics;
  bool HasFP32Denormals;
  bool HasFP64FP16Denormals;
  static constexpr int InlinerVectorBonusPercent = 0;

  static const FeatureBitset InlineFeatureIgnoreList;

  const GCNSubtarget *getST() const { return ST; }
  const SITargetLowering *getTLI() const { return TLI; }

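  // Instruction costs are expressed relative to a full-rate instruction:
  // half- and quarter-rate instructions are modeled as 2x and 4x the basic
  // cost, respectively (except when only code size is being estimated).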
  static inline int getFullRateInstrCost() {
    return TargetTransformInfo::TCC_Basic;
  }

  static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 2 * TargetTransformInfo::TCC_Basic;
  }

  // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
  // should be 2 or 4.
  static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 4 * TargetTransformInfo::TCC_Basic;
  }

  // On some subtargets, normal fp64 operations are half rate, while on others
  // they are quarter rate. This also applies to some integer operations.
  int get64BitInstrCost(TTI::TargetCostKind CostKind) const;

  std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const;

public:
  explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

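  /// GCN executes threads in wavefronts, so branch conditions can diverge
  /// between the lanes of a wave.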
  bool hasBranchDivergence(const Function *F = nullptr) const override;

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) const override;

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) const override;

  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override {
    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
    return TTI::PSK_FastHardware;
  }

  unsigned getNumberOfRegisters(unsigned RCID) const override;
  TypeSize
  getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override;
  unsigned getMinVectorRegisterBitWidth() const override;
  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override;
  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                               unsigned ChainSizeInBytes,
                               VectorType *VecTy) const override;
  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const override;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override;

  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
                                   unsigned AddrSpace) const override;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const override;

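  // Hooks used when expanding memcpy/memmove intrinsics inline: the size
  // threshold below which expansion is done, and the access types to use for
  // the main load/store loop and for the residual bytes that remain after it.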
  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override;
  Type *getMemcpyLoopLoweringType(
      LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
      unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
      std::optional<uint32_t> AtomicElementSize) const override;

  void getMemcpyLoopResidualLoweringType(
      SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
      unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
      Align SrcAlign, Align DestAlign,
      std::optional<uint32_t> AtomicCpySize) const override;
  unsigned getMaxInterleaveFactor(ElementCount VF) const override;

  bool getTgtMemIntrinsic(IntrinsicInst *Inst,
                          MemIntrinsicInfo &Info) const override;

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = {},
      const Instruction *CxtI = nullptr) const override;

  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr) const override;

  bool isInlineAsmSourceOfDivergence(const CallInst *CI,
                                     ArrayRef<unsigned> Indices = {}) const;

  using BaseT::getVectorInstrCost;
  InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                     TTI::TargetCostKind CostKind,
                                     unsigned Index, const Value *Op0,
                                     const Value *Op1) const override;

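  // Divergence analysis hooks: report which values are known to differ
  // between the lanes of a wavefront and which are guaranteed to be uniform.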
  bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
  bool isSourceOfDivergence(const Value *V) const override;
  bool isAlwaysUniform(const Value *V) const override;

  bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override {
    // Address space casts must cast between different address spaces.
    if (FromAS == ToAS)
      return false;

    // Casts between any aliasing address spaces are valid.
    return AMDGPU::addrspacesMayAlias(FromAS, ToAS);
  }

  bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const override {
    return AMDGPU::addrspacesMayAlias(AS0, AS1);
  }

  unsigned getFlatAddressSpace() const override {
    // Don't bother running InferAddressSpaces pass on graphics shaders which
    // don't use flat addressing.
    if (IsGraphics)
      return -1;
    return AMDGPUAS::FLAT_ADDRESS;
  }

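  /// Collect the operand indices of intrinsic \p IID that are flat pointers
  /// and are therefore candidates for rewriting to a more specific address
  /// space by the InferAddressSpaces pass.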
  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const override;

  bool
  canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const override {
    return AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
           AS != AMDGPUAS::PRIVATE_ADDRESS;
  }

  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const override;

  bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
                                 const Value *Op1, InstCombiner &IC) const;

  bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II,
                                   unsigned LaneArgIdx) const;

  std::optional<Instruction *>
  instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override;

  Value *simplifyAMDGCNLaneIntrinsicDemanded(InstCombiner &IC,
                                             IntrinsicInst &II,
                                             const APInt &DemandedElts,
                                             APInt &UndefElts) const;

  Instruction *hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
                                                IntrinsicInst &II) const;

  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) const override;

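  /// Splitting an illegal vector type into legal pieces is modeled as free.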
  InstructionCost getVectorSplitCost() const { return 0; }

  InstructionCost
  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                 const Instruction *CxtI = nullptr) const override;

  bool isProfitableToSinkOperands(Instruction *I,
                                  SmallVectorImpl<Use *> &Ops) const override;

  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const override;

  int getInliningLastCallToStaticBonus() const override;
  unsigned getInliningThresholdMultiplier() const override { return 11; }
  unsigned adjustInliningThreshold(const CallBase *CB) const override;
  unsigned getCallerAllocaCost(const CallBase *CB,
                               const AllocaInst *AI) const override;

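  /// No extra inlining bonus is given for calls containing vector code (see
  /// InlinerVectorBonusPercent above).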
  int getInlinerVectorBonusPercent() const override {
    return InlinerVectorBonusPercent;
  }

  InstructionCost
  getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                             std::optional<FastMathFlags> FMF,
                             TTI::TargetCostKind CostKind) const override;

  InstructionCost
  getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                        TTI::TargetCostKind CostKind) const override;
  InstructionCost
  getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
                         TTI::TargetCostKind CostKind) const override;

  /// Data cache line size for the LoopDataPrefetch pass; unused before GFX12.
  unsigned getCacheLineSize() const override { return 128; }

  /// How far ahead of a load we should place the prefetch instruction.
  /// This is currently measured in number of IR instructions.
  unsigned getPrefetchDistance() const override;

  /// \return true if the target wants to prefetch in address space \p AS.
  bool shouldPrefetchAddressSpace(unsigned AS) const override;
  void collectKernelLaunchBounds(
      const Function &F,
      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override;

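  /// Tri-state answer for queries about the effective "amdgpu-ieee"
  /// floating-point environment of an instruction's context.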
  enum class KnownIEEEMode { Unknown, On, Off };

  /// Return KnownIEEEMode::On if we know the use context can assume
  /// "amdgpu-ieee"="true", and KnownIEEEMode::Off if we can assume
  /// "amdgpu-ieee"="false".
  KnownIEEEMode fpenvIEEEMode(const Instruction &I) const;

  /// Account for loads of i8 vector types having reduced cost. For example,
  /// the cost of loading 4 i8 values is the same as the cost of loading a
  /// single i32 value.
  InstructionCost getMemoryOpCost(
      unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
      TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
      const Instruction *I = nullptr) const override;

  /// When counting parts on AMD GPUs, account for i8s being grouped together
  /// under a single i32 value. Otherwise fall back to the base implementation.
  unsigned getNumberOfParts(Type *Tp) const override;
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H