1 | //===- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass ---------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | /// \file |
8 | /// This file implements a TargetTransformInfo analysis pass specific to the |
9 | /// Hexagon target machine. It uses the target's detailed information to provide |
10 | /// more precise answers to certain TTI queries, while letting the target |
11 | /// independent and default TTI implementations handle the rest. |
12 | /// |
13 | //===----------------------------------------------------------------------===// |
14 | |
15 | #include "HexagonTargetTransformInfo.h" |
16 | #include "HexagonSubtarget.h" |
17 | #include "llvm/Analysis/TargetTransformInfo.h" |
18 | #include "llvm/CodeGen/ValueTypes.h" |
19 | #include "llvm/IR/InstrTypes.h" |
20 | #include "llvm/IR/Instructions.h" |
21 | #include "llvm/IR/User.h" |
22 | #include "llvm/Support/Casting.h" |
23 | #include "llvm/Support/CommandLine.h" |
24 | #include "llvm/Transforms/Utils/LoopPeel.h" |
25 | #include "llvm/Transforms/Utils/UnrollLoop.h" |
26 | |
27 | using namespace llvm; |
28 | |
29 | #define DEBUG_TYPE "hexagontti" |
30 | |
31 | static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx" , cl::init(Val: false), |
32 | cl::Hidden, cl::desc("Enable loop vectorizer for HVX" )); |
33 | |
34 | static cl::opt<bool> EnableV68FloatAutoHVX( |
35 | "force-hvx-float" , cl::Hidden, |
36 | cl::desc("Enable auto-vectorization of floatint point types on v68." )); |
37 | |
38 | static cl::opt<bool> EmitLookupTables("hexagon-emit-lookup-tables" , |
39 | cl::init(Val: true), cl::Hidden, |
40 | cl::desc("Control lookup table emission on Hexagon target" )); |
41 | |
42 | static cl::opt<bool> HexagonMaskedVMem("hexagon-masked-vmem" , cl::init(Val: true), |
43 | cl::Hidden, cl::desc("Enable masked loads/stores for HVX" )); |
44 | |
45 | // Constant "cost factor" to make floating point operations more expensive |
46 | // in terms of vectorization cost. This isn't the best way, but it should |
47 | // do. Ultimately, the cost should use cycles. |
48 | static const unsigned FloatFactor = 4; |
49 | |
50 | bool HexagonTTIImpl::useHVX() const { |
51 | return ST.useHVXOps() && HexagonAutoHVX; |
52 | } |
53 | |
54 | bool HexagonTTIImpl::isHVXVectorType(Type *Ty) const { |
55 | auto *VecTy = dyn_cast<VectorType>(Val: Ty); |
56 | if (!VecTy) |
57 | return false; |
58 | if (!ST.isTypeForHVX(VecTy)) |
59 | return false; |
60 | if (ST.useHVXV69Ops() || !VecTy->getElementType()->isFloatingPointTy()) |
61 | return true; |
62 | return ST.useHVXV68Ops() && EnableV68FloatAutoHVX; |
63 | } |
64 | |
65 | unsigned HexagonTTIImpl::getTypeNumElements(Type *Ty) const { |
66 | if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) |
67 | return VTy->getNumElements(); |
68 | assert((Ty->isIntegerTy() || Ty->isFloatingPointTy()) && |
69 | "Expecting scalar type" ); |
70 | return 1; |
71 | } |
72 | |
73 | TargetTransformInfo::PopcntSupportKind |
74 | HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const { |
75 | // Return fast hardware support as every input < 64 bits will be promoted |
76 | // to 64 bits. |
77 | return TargetTransformInfo::PSK_FastHardware; |
78 | } |
79 | |
80 | // The Hexagon target can unroll loops with run-time trip counts. |
81 | void HexagonTTIImpl::(Loop *L, ScalarEvolution &SE, |
82 | TTI::UnrollingPreferences &UP, |
83 | OptimizationRemarkEmitter *ORE) { |
84 | UP.Runtime = UP.Partial = true; |
85 | } |
86 | |
87 | void HexagonTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
88 | TTI::PeelingPreferences &PP) { |
89 | BaseT::getPeelingPreferences(L, SE, PP); |
90 | // Only try to peel innermost loops with small runtime trip counts. |
91 | if (L && L->isInnermost() && canPeel(L) && |
92 | SE.getSmallConstantTripCount(L) == 0 && |
93 | SE.getSmallConstantMaxTripCount(L) > 0 && |
94 | SE.getSmallConstantMaxTripCount(L) <= 5) { |
95 | PP.PeelCount = 2; |
96 | } |
97 | } |
98 | |
99 | TTI::AddressingModeKind |
100 | HexagonTTIImpl::getPreferredAddressingMode(const Loop *L, |
101 | ScalarEvolution *SE) const { |
102 | return TTI::AMK_PostIndexed; |
103 | } |
104 | |
105 | /// --- Vector TTI begin --- |
106 | |
107 | unsigned HexagonTTIImpl::getNumberOfRegisters(bool Vector) const { |
108 | if (Vector) |
109 | return useHVX() ? 32 : 0; |
110 | return 32; |
111 | } |
112 | |
113 | unsigned HexagonTTIImpl::getMaxInterleaveFactor(ElementCount VF) { |
114 | return useHVX() ? 2 : 1; |
115 | } |
116 | |
117 | TypeSize |
118 | HexagonTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
119 | switch (K) { |
120 | case TargetTransformInfo::RGK_Scalar: |
121 | return TypeSize::getFixed(ExactSize: 32); |
122 | case TargetTransformInfo::RGK_FixedWidthVector: |
123 | return TypeSize::getFixed(ExactSize: getMinVectorRegisterBitWidth()); |
124 | case TargetTransformInfo::RGK_ScalableVector: |
125 | return TypeSize::getScalable(MinimumSize: 0); |
126 | } |
127 | |
128 | llvm_unreachable("Unsupported register kind" ); |
129 | } |
130 | |
131 | unsigned HexagonTTIImpl::getMinVectorRegisterBitWidth() const { |
132 | return useHVX() ? ST.getVectorLength()*8 : 32; |
133 | } |
134 | |
135 | ElementCount HexagonTTIImpl::getMinimumVF(unsigned ElemWidth, |
136 | bool IsScalable) const { |
137 | assert(!IsScalable && "Scalable VFs are not supported for Hexagon" ); |
138 | return ElementCount::getFixed(MinVal: (8 * ST.getVectorLength()) / ElemWidth); |
139 | } |
140 | |
141 | InstructionCost HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy, |
142 | ArrayRef<Type *> Tys, |
143 | TTI::TargetCostKind CostKind) { |
144 | return BaseT::getCallInstrCost(F, RetTy, Tys, CostKind); |
145 | } |
146 | |
147 | InstructionCost |
148 | HexagonTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
149 | TTI::TargetCostKind CostKind) { |
150 | if (ICA.getID() == Intrinsic::bswap) { |
151 | std::pair<InstructionCost, MVT> LT = |
152 | getTypeLegalizationCost(Ty: ICA.getReturnType()); |
153 | return LT.first + 2; |
154 | } |
155 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
156 | } |
157 | |
158 | InstructionCost HexagonTTIImpl::getAddressComputationCost(Type *Tp, |
159 | ScalarEvolution *SE, |
160 | const SCEV *S) { |
161 | return 0; |
162 | } |
163 | |
// Cost of a load or store. Stores and scalar loads use the generic model;
// vector loads get Hexagon-specific costing: whole-register HVX loads are
// counted per register, while under-aligned or non-HVX vector loads pay
// extra for assembling the vector from smaller pieces.
InstructionCost HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                                MaybeAlign Alignment,
                                                unsigned AddressSpace,
                                                TTI::TargetCostKind CostKind,
                                                TTI::OperandValueInfo OpInfo,
                                                const Instruction *I) {
  assert(Opcode == Instruction::Load || Opcode == Instruction::Store);
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  // Only loads are costed specially below; stores use the base model.
  if (Opcode == Instruction::Store)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);

  if (Src->isVectorTy()) {
    VectorType *VecTy = cast<VectorType>(Val: Src);
    unsigned VecWidth = VecTy->getPrimitiveSizeInBits().getFixedValue();
    if (isHVXVectorType(Ty: VecTy)) {
      unsigned RegWidth =
          getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedValue();
      assert(RegWidth && "Non-zero vector register width expected");
      // Cost of HVX loads: one per full vector register covered.
      if (VecWidth % RegWidth == 0)
        return VecWidth / RegWidth;
      // Cost of constructing HVX vector from scalar loads.
      // Clamp the alignment to the register size; wider alignment does not
      // reduce the number of pieces needed.
      const Align RegAlign(RegWidth / 8);
      if (!Alignment || *Alignment > RegAlign)
        Alignment = RegAlign;
      assert(Alignment);
      unsigned AlignWidth = 8 * Alignment->value();
      unsigned NumLoads = alignTo(Value: VecWidth, Align: AlignWidth) / AlignWidth;
      // Each piece costs ~3: load plus insertion into the vector.
      return 3 * NumLoads;
    }

    // Non-HVX vectors.
    // Add extra cost for floating point types.
    unsigned Cost =
        VecTy->getElementType()->isFloatingPointTy() ? FloatFactor : 1;

    // At this point unspecified alignment is considered as Align(1).
    const Align BoundAlignment = std::min(a: Alignment.valueOrOne(), b: Align(8));
    unsigned AlignWidth = 8 * BoundAlignment.value();
    unsigned NumLoads = alignTo(Value: VecWidth, Align: AlignWidth) / AlignWidth;
    // 4- or 8-byte aligned pieces load directly.
    if (Alignment == Align(4) || Alignment == Align(8))
      return Cost * NumLoads;
    // Loads of less than 32 bits will need extra inserts to compose a vector.
    assert(BoundAlignment <= Align(8));
    unsigned LogA = Log2(A: BoundAlignment);
    return (3 - LogA) * Cost * NumLoads;
  }

  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
                                OpInfo, I);
}
220 | |
221 | InstructionCost |
222 | HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, |
223 | Align Alignment, unsigned AddressSpace, |
224 | TTI::TargetCostKind CostKind) { |
225 | return BaseT::getMaskedMemoryOpCost(Opcode, DataTy: Src, Alignment, AddressSpace, |
226 | CostKind); |
227 | } |
228 | |
229 | InstructionCost HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, |
230 | ArrayRef<int> Mask, |
231 | TTI::TargetCostKind CostKind, |
232 | int Index, Type *SubTp, |
233 | ArrayRef<const Value *> Args, |
234 | const Instruction *CxtI) { |
235 | return 1; |
236 | } |
237 | |
238 | InstructionCost HexagonTTIImpl::getGatherScatterOpCost( |
239 | unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, |
240 | Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { |
241 | return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, |
242 | Alignment, CostKind, I); |
243 | } |
244 | |
245 | InstructionCost HexagonTTIImpl::getInterleavedMemoryOpCost( |
246 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, |
247 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, |
248 | bool UseMaskForCond, bool UseMaskForGaps) { |
249 | if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps) |
250 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
251 | Alignment, AddressSpace, |
252 | CostKind, |
253 | UseMaskForCond, UseMaskForGaps); |
254 | return getMemoryOpCost(Opcode, Src: VecTy, Alignment: MaybeAlign(Alignment), AddressSpace, |
255 | CostKind); |
256 | } |
257 | |
258 | InstructionCost HexagonTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, |
259 | Type *CondTy, |
260 | CmpInst::Predicate VecPred, |
261 | TTI::TargetCostKind CostKind, |
262 | const Instruction *I) { |
263 | if (ValTy->isVectorTy() && CostKind == TTI::TCK_RecipThroughput) { |
264 | if (!isHVXVectorType(Ty: ValTy) && ValTy->isFPOrFPVectorTy()) |
265 | return InstructionCost::getMax(); |
266 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
267 | if (Opcode == Instruction::FCmp) |
268 | return LT.first + FloatFactor * getTypeNumElements(Ty: ValTy); |
269 | } |
270 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); |
271 | } |
272 | |
273 | InstructionCost HexagonTTIImpl::getArithmeticInstrCost( |
274 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
275 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, |
276 | ArrayRef<const Value *> Args, |
277 | const Instruction *CxtI) { |
278 | // TODO: Handle more cost kinds. |
279 | if (CostKind != TTI::TCK_RecipThroughput) |
280 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, |
281 | Opd2Info: Op2Info, Args, CxtI); |
282 | |
283 | if (Ty->isVectorTy()) { |
284 | if (!isHVXVectorType(Ty) && Ty->isFPOrFPVectorTy()) |
285 | return InstructionCost::getMax(); |
286 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
287 | if (LT.second.isFloatingPoint()) |
288 | return LT.first + FloatFactor * getTypeNumElements(Ty); |
289 | } |
290 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info, |
291 | Args, CxtI); |
292 | } |
293 | |
294 | InstructionCost HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy, |
295 | Type *SrcTy, |
296 | TTI::CastContextHint CCH, |
297 | TTI::TargetCostKind CostKind, |
298 | const Instruction *I) { |
299 | auto isNonHVXFP = [this] (Type *Ty) { |
300 | return Ty->isVectorTy() && !isHVXVectorType(Ty) && Ty->isFPOrFPVectorTy(); |
301 | }; |
302 | if (isNonHVXFP(SrcTy) || isNonHVXFP(DstTy)) |
303 | return InstructionCost::getMax(); |
304 | |
305 | if (SrcTy->isFPOrFPVectorTy() || DstTy->isFPOrFPVectorTy()) { |
306 | unsigned SrcN = SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(Ty: SrcTy) : 0; |
307 | unsigned DstN = DstTy->isFPOrFPVectorTy() ? getTypeNumElements(Ty: DstTy) : 0; |
308 | |
309 | std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: SrcTy); |
310 | std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Ty: DstTy); |
311 | InstructionCost Cost = |
312 | std::max(a: SrcLT.first, b: DstLT.first) + FloatFactor * (SrcN + DstN); |
313 | // TODO: Allow non-throughput costs that aren't binary. |
314 | if (CostKind != TTI::TCK_RecipThroughput) |
315 | return Cost == 0 ? 0 : 1; |
316 | return Cost; |
317 | } |
318 | return 1; |
319 | } |
320 | |
321 | InstructionCost HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, |
322 | TTI::TargetCostKind CostKind, |
323 | unsigned Index, Value *Op0, |
324 | Value *Op1) { |
325 | Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType() |
326 | : Val; |
327 | if (Opcode == Instruction::InsertElement) { |
328 | // Need two rotations for non-zero index. |
329 | unsigned Cost = (Index != 0) ? 2 : 0; |
330 | if (ElemTy->isIntegerTy(Bitwidth: 32)) |
331 | return Cost; |
332 | // If it's not a 32-bit value, there will need to be an extract. |
333 | return Cost + getVectorInstrCost(Opcode: Instruction::ExtractElement, Val, CostKind, |
334 | Index, Op0, Op1); |
335 | } |
336 | |
337 | if (Opcode == Instruction::ExtractElement) |
338 | return 2; |
339 | |
340 | return 1; |
341 | } |
342 | |
343 | bool HexagonTTIImpl::isLegalMaskedStore(Type *DataType, Align /*Alignment*/) { |
344 | // This function is called from scalarize-masked-mem-intrin, which runs |
345 | // in pre-isel. Use ST directly instead of calling isHVXVectorType. |
346 | return HexagonMaskedVMem && ST.isTypeForHVX(VecTy: DataType); |
347 | } |
348 | |
349 | bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/) { |
350 | // This function is called from scalarize-masked-mem-intrin, which runs |
351 | // in pre-isel. Use ST directly instead of calling isHVXVectorType. |
352 | return HexagonMaskedVMem && ST.isTypeForHVX(VecTy: DataType); |
353 | } |
354 | |
355 | /// --- Vector TTI end --- |
356 | |
357 | unsigned HexagonTTIImpl::getPrefetchDistance() const { |
358 | return ST.getL1PrefetchDistance(); |
359 | } |
360 | |
361 | unsigned HexagonTTIImpl::getCacheLineSize() const { |
362 | return ST.getL1CacheLineSize(); |
363 | } |
364 | |
365 | InstructionCost |
366 | HexagonTTIImpl::getInstructionCost(const User *U, |
367 | ArrayRef<const Value *> Operands, |
368 | TTI::TargetCostKind CostKind) { |
369 | auto isCastFoldedIntoLoad = [this](const CastInst *CI) -> bool { |
370 | if (!CI->isIntegerCast()) |
371 | return false; |
372 | // Only extensions from an integer type shorter than 32-bit to i32 |
373 | // can be folded into the load. |
374 | const DataLayout &DL = getDataLayout(); |
375 | unsigned SBW = DL.getTypeSizeInBits(Ty: CI->getSrcTy()); |
376 | unsigned DBW = DL.getTypeSizeInBits(Ty: CI->getDestTy()); |
377 | if (DBW != 32 || SBW >= DBW) |
378 | return false; |
379 | |
380 | const LoadInst *LI = dyn_cast<const LoadInst>(Val: CI->getOperand(i_nocapture: 0)); |
381 | // Technically, this code could allow multiple uses of the load, and |
382 | // check if all the uses are the same extension operation, but this |
383 | // should be sufficient for most cases. |
384 | return LI && LI->hasOneUse(); |
385 | }; |
386 | |
387 | if (const CastInst *CI = dyn_cast<const CastInst>(Val: U)) |
388 | if (isCastFoldedIntoLoad(CI)) |
389 | return TargetTransformInfo::TCC_Free; |
390 | return BaseT::getInstructionCost(U, Operands, CostKind); |
391 | } |
392 | |
393 | bool HexagonTTIImpl::shouldBuildLookupTables() const { |
394 | return EmitLookupTables; |
395 | } |
396 | |