//===- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// Hexagon target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "HexagonTargetTransformInfo.h"
#include "HexagonSubtarget.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/User.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/LoopPeel.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"

using namespace llvm;

#define DEBUG_TYPE "hexagontti"

static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(false),
    cl::Hidden, cl::desc("Enable loop vectorizer for HVX"));

static cl::opt<bool> EnableV68FloatAutoHVX(
    "force-hvx-float", cl::Hidden,
    cl::desc("Enable auto-vectorization of floating point types on v68."));

static cl::opt<bool> EmitLookupTables("hexagon-emit-lookup-tables",
    cl::init(true), cl::Hidden,
    cl::desc("Control lookup table emission on Hexagon target"));

static cl::opt<bool> HexagonMaskedVMem("hexagon-masked-vmem", cl::init(true),
    cl::Hidden, cl::desc("Enable masked loads/stores for HVX"));

// Constant "cost factor" to make floating point operations more expensive
// in terms of vectorization cost. This isn't the best way, but it should
// do. Ultimately, the cost should use cycles.
static const unsigned FloatFactor = 4;

bool HexagonTTIImpl::useHVX() const {
  return ST.useHVXOps() && HexagonAutoHVX;
}

bool HexagonTTIImpl::isHVXVectorType(Type *Ty) const {
  auto *VecTy = dyn_cast<VectorType>(Ty);
  if (!VecTy)
    return false;
  if (!ST.isTypeForHVX(VecTy))
    return false;
  if (ST.useHVXV69Ops() || !VecTy->getElementType()->isFloatingPointTy())
    return true;
  return ST.useHVXV68Ops() && EnableV68FloatAutoHVX;
}

unsigned HexagonTTIImpl::getTypeNumElements(Type *Ty) const {
  if (auto *VTy = dyn_cast<FixedVectorType>(Ty))
    return VTy->getNumElements();
  assert((Ty->isIntegerTy() || Ty->isFloatingPointTy()) &&
         "Expecting scalar type");
  return 1;
}

TargetTransformInfo::PopcntSupportKind
HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const {
  // Return fast hardware support as every input < 64 bits will be promoted
  // to 64 bits.
  return TargetTransformInfo::PSK_FastHardware;
}

// The Hexagon target can unroll loops with run-time trip counts.
void HexagonTTIImpl::getUnrollingPreferences(
    Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
    OptimizationRemarkEmitter *ORE) const {
  UP.Runtime = UP.Partial = true;
}

void HexagonTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::PeelingPreferences &PP) const {
  BaseT::getPeelingPreferences(L, SE, PP);
  // Only try to peel innermost loops with small runtime trip counts.
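  // A zero result from getSmallConstantTripCount means the exact trip count
  // is not known at compile time; peel only when a small upper bound (at
  // most five iterations) is known.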
  if (L && L->isInnermost() && canPeel(L) &&
      SE.getSmallConstantTripCount(L) == 0 &&
      SE.getSmallConstantMaxTripCount(L) > 0 &&
      SE.getSmallConstantMaxTripCount(L) <= 5) {
    PP.PeelCount = 2;
  }
}

TTI::AddressingModeKind
HexagonTTIImpl::getPreferredAddressingMode(const Loop *L,
                                           ScalarEvolution *SE) const {
  return TTI::AMK_PostIndexed;
}

/// --- Vector TTI begin ---

unsigned HexagonTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
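  // Hexagon has 32 general-purpose scalar registers; with HVX enabled there
  // are also 32 HVX vector registers.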
  bool Vector = ClassID == 1;
  if (Vector)
    return useHVX() ? 32 : 0;
  return 32;
}

unsigned HexagonTTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
  return useHVX() ? 2 : 1;
}

TypeSize
HexagonTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(getMinVectorRegisterBitWidth());
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned HexagonTTIImpl::getMinVectorRegisterBitWidth() const {
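  // getVectorLength() returns the HVX register size in bytes (64 or 128
  // depending on the configured HVX mode), so multiply by 8 for bits.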
  return useHVX() ? ST.getVectorLength() * 8 : 32;
}

ElementCount HexagonTTIImpl::getMinimumVF(unsigned ElemWidth,
                                          bool IsScalable) const {
  assert(!IsScalable && "Scalable VFs are not supported for Hexagon");
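  // The minimum VF fills exactly one HVX register:
  // (vector length in bits) / (element width in bits).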
  return ElementCount::getFixed((8 * ST.getVectorLength()) / ElemWidth);
}

InstructionCost
HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys,
                                 TTI::TargetCostKind CostKind) const {
  return BaseT::getCallInstrCost(F, RetTy, Tys, CostKind);
}

InstructionCost
HexagonTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) const {
  if (ICA.getID() == Intrinsic::bswap) {
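    // bswap is cheap on Hexagon: charge the type legalization cost plus a
    // small constant for the swap itself.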
    std::pair<InstructionCost, MVT> LT =
        getTypeLegalizationCost(ICA.getReturnType());
    return LT.first + 2;
  }
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

InstructionCost HexagonTTIImpl::getAddressComputationCost(Type *Tp,
                                                          ScalarEvolution *SE,
                                                          const SCEV *S) const {
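  // Address computations fold into Hexagon's addressing modes, so treat
  // them as free.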
  return 0;
}

InstructionCost HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                                Align Alignment,
                                                unsigned AddressSpace,
                                                TTI::TargetCostKind CostKind,
                                                TTI::OperandValueInfo OpInfo,
                                                const Instruction *I) const {
  assert(Opcode == Instruction::Load || Opcode == Instruction::Store);
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  if (Opcode == Instruction::Store)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);

  if (Src->isVectorTy()) {
    VectorType *VecTy = cast<VectorType>(Src);
    unsigned VecWidth = VecTy->getPrimitiveSizeInBits().getFixedValue();
    if (isHVXVectorType(VecTy)) {
      unsigned RegWidth =
          getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedValue();
      assert(RegWidth && "Non-zero vector register width expected");
      // Cost of HVX loads.
      if (VecWidth % RegWidth == 0)
        return VecWidth / RegWidth;
      // Cost of constructing HVX vector from scalar loads.
      const Align RegAlign(RegWidth / 8);
      if (Alignment > RegAlign)
        Alignment = RegAlign;
      unsigned AlignWidth = 8 * Alignment.value();
      unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth;
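      // Each partial load costs roughly three operations (the load itself
      // plus combining it into the vector), hence the factor of 3.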
      return 3 * NumLoads;
    }

    // Non-HVX vectors.
    // Add extra cost for floating point types.
    unsigned Cost =
        VecTy->getElementType()->isFloatingPointTy() ? FloatFactor : 1;

    // At this point unspecified alignment is considered as Align(1).
    const Align BoundAlignment = std::min(Alignment, Align(8));
    unsigned AlignWidth = 8 * BoundAlignment.value();
    unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth;
    if (Alignment == Align(4) || Alignment == Align(8))
      return Cost * NumLoads;
    // Loads of less than 32 bits will need extra inserts to compose a vector.
    assert(BoundAlignment <= Align(8));
    unsigned LogA = Log2(BoundAlignment);
    return (3 - LogA) * Cost * NumLoads;
  }

  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
                                OpInfo, I);
}

InstructionCost
HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                      Align Alignment, unsigned AddressSpace,
                                      TTI::TargetCostKind CostKind) const {
  return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                      CostKind);
}

InstructionCost
HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
                               VectorType *SrcTy, ArrayRef<int> Mask,
                               TTI::TargetCostKind CostKind, int Index,
                               VectorType *SubTp, ArrayRef<const Value *> Args,
                               const Instruction *CxtI) const {
  return 1;
}

InstructionCost HexagonTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
  return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                       Alignment, CostKind, I);
}

InstructionCost HexagonTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) const {
  if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);
  return getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
}

InstructionCost HexagonTTIImpl::getCmpSelInstrCost(
    unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
    TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
    TTI::OperandValueInfo Op2Info, const Instruction *I) const {
  if (ValTy->isVectorTy() && CostKind == TTI::TCK_RecipThroughput) {
    if (!isHVXVectorType(ValTy) && ValTy->isFPOrFPVectorTy())
      return InstructionCost::getMax();
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
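    // FP compares pay the FloatFactor penalty per element on top of the
    // legalization cost.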
    if (Opcode == Instruction::FCmp)
      return LT.first + FloatFactor * getTypeNumElements(ValTy);
  }
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                   Op1Info, Op2Info, I);
}

InstructionCost HexagonTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {
  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Args, CxtI);

  if (Ty->isVectorTy()) {
    if (!isHVXVectorType(Ty) && Ty->isFPOrFPVectorTy())
      return InstructionCost::getMax();
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
    if (LT.second.isFloatingPoint())
      return LT.first + FloatFactor * getTypeNumElements(Ty);
  }
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtI);
}

InstructionCost HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy,
                                                 Type *SrcTy,
                                                 TTI::CastContextHint CCH,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) const {
  auto isNonHVXFP = [this](Type *Ty) {
    return Ty->isVectorTy() && !isHVXVectorType(Ty) && Ty->isFPOrFPVectorTy();
  };
  if (isNonHVXFP(SrcTy) || isNonHVXFP(DstTy))
    return InstructionCost::getMax();

  if (SrcTy->isFPOrFPVectorTy() || DstTy->isFPOrFPVectorTy()) {
    unsigned SrcN = SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(SrcTy) : 0;
    unsigned DstN = DstTy->isFPOrFPVectorTy() ? getTypeNumElements(DstTy) : 0;

    std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcTy);
    std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(DstTy);
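    // Take the larger of the two legalization costs and add the FP penalty
    // for every floating-point element on either side of the cast.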
    InstructionCost Cost =
        std::max(SrcLT.first, DstLT.first) + FloatFactor * (SrcN + DstN);
    // TODO: Allow non-throughput costs that aren't binary.
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost == 0 ? 0 : 1;
    return Cost;
  }
  return 1;
}

InstructionCost HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                   TTI::TargetCostKind CostKind,
                                                   unsigned Index,
                                                   const Value *Op0,
                                                   const Value *Op1) const {
  Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType()
                                   : Val;
  if (Opcode == Instruction::InsertElement) {
    // Need two rotations for non-zero index.
    unsigned Cost = (Index != 0) ? 2 : 0;
    if (ElemTy->isIntegerTy(32))
      return Cost;
    // If it's not a 32-bit value, there will need to be an extract.
    return Cost + getVectorInstrCost(Instruction::ExtractElement, Val, CostKind,
                                     Index, Op0, Op1);
  }

  if (Opcode == Instruction::ExtractElement)
    return 2;

  return 1;
}

bool HexagonTTIImpl::isLegalMaskedStore(Type *DataType, Align /*Alignment*/,
                                        unsigned /*AddressSpace*/) const {
  // This function is called from scalarize-masked-mem-intrin, which runs
  // in pre-isel. Use ST directly instead of calling isHVXVectorType.
  return HexagonMaskedVMem && ST.isTypeForHVX(DataType);
}

bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/,
                                       unsigned /*AddressSpace*/) const {
  // This function is called from scalarize-masked-mem-intrin, which runs
  // in pre-isel. Use ST directly instead of calling isHVXVectorType.
  return HexagonMaskedVMem && ST.isTypeForHVX(DataType);
}

/// --- Vector TTI end ---

unsigned HexagonTTIImpl::getPrefetchDistance() const {
  return ST.getL1PrefetchDistance();
}

unsigned HexagonTTIImpl::getCacheLineSize() const {
  return ST.getL1CacheLineSize();
}

InstructionCost
HexagonTTIImpl::getInstructionCost(const User *U,
                                   ArrayRef<const Value *> Operands,
                                   TTI::TargetCostKind CostKind) const {
  auto isCastFoldedIntoLoad = [this](const CastInst *CI) -> bool {
    if (!CI->isIntegerCast())
      return false;
    // Only extensions from an integer type shorter than 32-bit to i32
    // can be folded into the load.
    const DataLayout &DL = getDataLayout();
    unsigned SBW = DL.getTypeSizeInBits(CI->getSrcTy());
    unsigned DBW = DL.getTypeSizeInBits(CI->getDestTy());
    if (DBW != 32 || SBW >= DBW)
      return false;

    const LoadInst *LI = dyn_cast<const LoadInst>(CI->getOperand(0));
    // Technically, this code could allow multiple uses of the load, and
    // check if all the uses are the same extension operation, but this
    // should be sufficient for most cases.
    return LI && LI->hasOneUse();
  };

  if (const CastInst *CI = dyn_cast<const CastInst>(U))
    if (isCastFoldedIntoLoad(CI))
      return TargetTransformInfo::TCC_Free;
  return BaseT::getInstructionCost(U, Operands, CostKind);
}

bool HexagonTTIImpl::shouldBuildLookupTables() const {
  return EmitLookupTables;
}