//===-- NVPTXTargetTransformInfo.h - NVPTX specific TTI ---------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file defines a TargetTransformInfoImplBase conforming object specific
/// to the NVPTX target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_NVPTX_NVPTXTARGETTRANSFORMINFO_H

#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/TargetLowering.h"
#include <optional>

namespace llvm {

class NVPTXTTIImpl final : public BasicTTIImplBase<NVPTXTTIImpl> {
  typedef BasicTTIImplBase<NVPTXTTIImpl> BaseT;
  typedef TargetTransformInfo TTI;
  friend BaseT;

  const NVPTXSubtarget *ST;
  const NVPTXTargetLowering *TLI;

  const NVPTXSubtarget *getST() const { return ST; }
  const NVPTXTargetLowering *getTLI() const { return TLI; }

public:
  explicit NVPTXTTIImpl(const NVPTXTargetMachine *TM, const Function &F)
      : BaseT(TM, F.getDataLayout()), ST(TM->getSubtargetImpl()),
        TLI(ST->getTargetLowering()) {}

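  // PTX programs are executed by many threads in parallel, and control flow is
  // not guaranteed to be uniform across them, so report branch divergence.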
  bool hasBranchDivergence(const Function *F = nullptr) const override {
    return true;
  }

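  // Reports values whose contents may differ between threads (e.g. thread-ID
  // reads); implemented in NVPTXTargetTransformInfo.cpp.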
  bool isSourceOfDivergence(const Value *V) const override;

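  // The generic address space can alias every other NVPTX address space, so it
  // acts as the "flat" address space for address-space inference.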
  unsigned getFlatAddressSpace() const override {
    return AddressSpace::ADDRESS_SPACE_GENERIC;
  }

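  // PTX variables in shared, local, and param memory cannot carry meaningful
  // initializers, so only other address spaces may have non-undef ones.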
  bool
  canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const override {
    return AS != AddressSpace::ADDRESS_SPACE_SHARED &&
           AS != AddressSpace::ADDRESS_SPACE_LOCAL && AS != ADDRESS_SPACE_PARAM;
  }

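  // Target-specific InstCombine folds for NVPTX intrinsics; implemented in
  // NVPTXTargetTransformInfo.cpp.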
  std::optional<Instruction *>
  instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override;

  // Loads and stores can be vectorized if the alignment is at least as big as
  // the load/store we want to vectorize.
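  // For example, a 16-byte chain (such as <4 x float>) requires an alignment
  // of at least 16 bytes.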
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
                                   unsigned AddrSpace) const override {
    return Alignment >= ChainSizeInBytes;
  }
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const override {
    return isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment, AddrSpace);
  }

  // NVPTX has infinite registers of all kinds, but the actual machine doesn't.
  // We conservatively return 1 here which is just enough to enable the
  // vectorizers but disables heuristics based on the number of registers.
  // FIXME: Return a more reasonable number, while keeping an eye on
  // LoopVectorizer's unrolling heuristics.
  unsigned getNumberOfRegisters(unsigned ClassID) const override { return 1; }

  // Only <2 x half> should be vectorized, so always return 32 for the vector
  // register size.
  TypeSize
  getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override {
    return TypeSize::getFixed(32);
  }
  unsigned getMinVectorRegisterBitWidth() const override { return 32; }

  // We don't want to prevent inlining because of target-cpu and -features
  // attributes that were added to newer versions of LLVM/Clang: There are
  // no incompatible functions in PTX, ptxas will throw errors in such cases.
  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const override {
    return true;
  }

  // Increase the inlining cost threshold by a factor of 11, reflecting that
  // calls are particularly expensive in NVPTX.
  unsigned getInliningThresholdMultiplier() const override { return 11; }

  InstructionCost
  getInstructionCost(const User *U, ArrayRef<const Value *> Operands,
                     TTI::TargetCostKind CostKind) const override;

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = {},
      const Instruction *CxtI = nullptr) const override;

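  // Estimate the cost of materializing small NVPTX vectors: inserting only
  // constant elements is free, a 2 x 16-bit vector can be built with a single
  // mov, and a v4i8 is assembled from zero-extended bytes via PRMT.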
  InstructionCost getScalarizationOverhead(
      VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract,
      TTI::TargetCostKind CostKind, bool ForPoisonSrc = true,
      ArrayRef<Value *> VL = {}) const override {
    if (!InTy->getElementCount().isFixed())
      return InstructionCost::getInvalid();

    auto VT = getTLI()->getValueType(DL, InTy);
    auto NumElements = InTy->getElementCount().getFixedValue();
    InstructionCost Cost = 0;
    if (Insert && !VL.empty()) {
      bool AllConstant = all_of(seq(NumElements), [&](int Idx) {
        return !DemandedElts[Idx] || isa<Constant>(VL[Idx]);
      });
      if (AllConstant) {
        Cost += TTI::TCC_Free;
        Insert = false;
      }
    }
    if (Insert && Isv2x16VT(VT)) {
      // Can be built in a single mov
      Cost += 1;
      Insert = false;
    }
    if (Insert && VT == MVT::v4i8) {
      Cost += 3; // 3 x PRMT
      for (auto Idx : seq(NumElements))
        if (DemandedElts[Idx])
          Cost += 1; // zext operand to i32
      Insert = false;
    }
    return Cost + BaseT::getScalarizationOverhead(InTy, DemandedElts, Insert,
                                                  Extract, CostKind,
                                                  ForPoisonSrc, VL);
  }

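  // Loop unrolling and peeling heuristics are implemented in
  // NVPTXTargetTransformInfo.cpp.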
  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) const override;

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) const override;

  bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) const override {
    // Volatile loads/stores are only supported for shared and global address
    // spaces, or for generic AS that maps to them.
    if (!(AddrSpace == llvm::ADDRESS_SPACE_GENERIC ||
          AddrSpace == llvm::ADDRESS_SPACE_GLOBAL ||
          AddrSpace == llvm::ADDRESS_SPACE_SHARED))
      return false;

    switch (I->getOpcode()) {
    default:
      return false;
    case Instruction::Load:
    case Instruction::Store:
      return true;
    }
  }

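  // Together with rewriteIntrinsicWithAddressSpace below, this lets the
  // InferAddressSpaces pass rewrite NVPTX intrinsics whose pointer operands
  // use the generic (flat) address space.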
  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const override;

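  // Widest register width the load/store vectorizer may use for the given
  // address space; implemented in NVPTXTargetTransformInfo.cpp.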
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override;

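  // Remaining address-space inference hooks: rewrite an intrinsic when one of
  // its pointer operands changes address space, and report the address space a
  // value can safely be assumed to be in.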
  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const override;
  unsigned getAssumedAddrSpace(const Value *V) const override;

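  // Report kernel launch-bound attributes (e.g. maximum threads per block) as
  // (name, value) pairs.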
  void collectKernelLaunchBounds(
      const Function &F,
      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override;
};

} // end namespace llvm

#endif