1//===- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7/// \file
8/// This file implements a TargetTransformInfo analysis pass specific to the
9/// Hexagon target machine. It uses the target's detailed information to provide
10/// more precise answers to certain TTI queries, while letting the target
11/// independent and default TTI implementations handle the rest.
12///
13//===----------------------------------------------------------------------===//
14
15#include "HexagonTargetTransformInfo.h"
16#include "HexagonSubtarget.h"
17#include "llvm/Analysis/TargetTransformInfo.h"
18#include "llvm/CodeGen/ValueTypes.h"
19#include "llvm/IR/InstrTypes.h"
20#include "llvm/IR/Instructions.h"
21#include "llvm/IR/User.h"
22#include "llvm/Support/Casting.h"
23#include "llvm/Support/CommandLine.h"
24#include "llvm/Transforms/Utils/LoopPeel.h"
25#include "llvm/Transforms/Utils/UnrollLoop.h"
26
27using namespace llvm;
28
29#define DEBUG_TYPE "hexagontti"
30
31static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(Val: false),
32 cl::Hidden, cl::desc("Enable loop vectorizer for HVX"));
33
34cl::opt<bool> HexagonAllowScatterGatherHVX(
35 "hexagon-allow-scatter-gather-hvx", cl::init(Val: false), cl::Hidden,
36 cl::desc("Allow auto-generation of HVX scatter-gather"));
37
38static cl::opt<bool> EnableV68FloatAutoHVX(
39 "force-hvx-float", cl::Hidden,
40 cl::desc("Enable auto-vectorization of floatint point types on v68."));
41
42static cl::opt<bool> EmitLookupTables("hexagon-emit-lookup-tables",
43 cl::init(Val: true), cl::Hidden,
44 cl::desc("Control lookup table emission on Hexagon target"));
45
46static cl::opt<bool> HexagonMaskedVMem("hexagon-masked-vmem", cl::init(Val: true),
47 cl::Hidden, cl::desc("Enable masked loads/stores for HVX"));
48
// Fixed multiplier applied to floating-point operations so that they look
// more expensive to the vectorization cost model. This is a crude stand-in;
// ideally the cost would be expressed in cycles.
static constexpr unsigned FloatFactor = 4;
53
54bool HexagonTTIImpl::useHVX() const {
55 return ST.useHVXOps() && HexagonAutoHVX;
56}
57
58bool HexagonTTIImpl::isHVXVectorType(Type *Ty) const {
59 auto *VecTy = dyn_cast<VectorType>(Val: Ty);
60 if (!VecTy)
61 return false;
62 if (!ST.isTypeForHVX(VecTy))
63 return false;
64 if (ST.useHVXV69Ops() || !VecTy->getElementType()->isFloatingPointTy())
65 return true;
66 return ST.useHVXV68Ops() && EnableV68FloatAutoHVX;
67}
68
69unsigned HexagonTTIImpl::getTypeNumElements(Type *Ty) const {
70 if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty))
71 return VTy->getNumElements();
72 assert((Ty->isIntegerTy() || Ty->isFloatingPointTy()) &&
73 "Expecting scalar type");
74 return 1;
75}
76
77TargetTransformInfo::PopcntSupportKind
78HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const {
79 // Return fast hardware support as every input < 64 bits will be promoted
80 // to 64 bits.
81 return TargetTransformInfo::PSK_FastHardware;
82}
83
84// The Hexagon target can unroll loops with run-time trip counts.
85void HexagonTTIImpl::getUnrollingPreferences(
86 Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
87 OptimizationRemarkEmitter *ORE) const {
88 UP.Runtime = UP.Partial = true;
89}
90
91void HexagonTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
92 TTI::PeelingPreferences &PP) const {
93 BaseT::getPeelingPreferences(L, SE, PP);
94 // Only try to peel innermost loops with small runtime trip counts.
95 if (L && L->isInnermost() && canPeel(L) &&
96 SE.getSmallConstantTripCount(L) == 0 &&
97 SE.getSmallConstantMaxTripCount(L) > 0 &&
98 SE.getSmallConstantMaxTripCount(L) <= 5) {
99 PP.PeelCount = 2;
100 }
101}
102
103TTI::AddressingModeKind
104HexagonTTIImpl::getPreferredAddressingMode(const Loop *L,
105 ScalarEvolution *SE) const {
106 return TTI::AMK_PostIndexed;
107}
108
109/// --- Vector TTI begin ---
110
111unsigned HexagonTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
112 bool Vector = ClassID == 1;
113 if (Vector)
114 return useHVX() ? 32 : 0;
115 return 32;
116}
117
118unsigned
119HexagonTTIImpl::getMaxInterleaveFactor(ElementCount VF,
120 bool HasUnorderedReductions) const {
121 return useHVX() ? 2 : 1;
122}
123
124TypeSize
125HexagonTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
126 switch (K) {
127 case TargetTransformInfo::RGK_Scalar:
128 return TypeSize::getFixed(ExactSize: 32);
129 case TargetTransformInfo::RGK_FixedWidthVector:
130 return TypeSize::getFixed(ExactSize: getMinVectorRegisterBitWidth());
131 case TargetTransformInfo::RGK_ScalableVector:
132 return TypeSize::getScalable(MinimumSize: 0);
133 }
134
135 llvm_unreachable("Unsupported register kind");
136}
137
138unsigned HexagonTTIImpl::getMinVectorRegisterBitWidth() const {
139 return useHVX() ? ST.getVectorLength()*8 : 32;
140}
141
142ElementCount HexagonTTIImpl::getMinimumVF(unsigned ElemWidth,
143 bool IsScalable) const {
144 assert(!IsScalable && "Scalable VFs are not supported for Hexagon");
145 return ElementCount::getFixed(MinVal: (8 * ST.getVectorLength()) / ElemWidth);
146}
147
148InstructionCost
149HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys,
150 TTI::TargetCostKind CostKind) const {
151 return BaseT::getCallInstrCost(F, RetTy, Tys, CostKind);
152}
153
154InstructionCost
155HexagonTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
156 TTI::TargetCostKind CostKind) const {
157 if (ICA.getID() == Intrinsic::bswap) {
158 std::pair<InstructionCost, MVT> LT =
159 getTypeLegalizationCost(Ty: ICA.getReturnType());
160 return LT.first + 2;
161 }
162 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
163}
164
165InstructionCost
166HexagonTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
167 const SCEV *S,
168 TTI::TargetCostKind CostKind) const {
169 return 0;
170}
171
172InstructionCost HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
173 Align Alignment,
174 unsigned AddressSpace,
175 TTI::TargetCostKind CostKind,
176 TTI::OperandValueInfo OpInfo,
177 const Instruction *I) const {
178 assert(Opcode == Instruction::Load || Opcode == Instruction::Store);
179
180 // FIXME: Load latency isn't handled here
181 if (Opcode == Instruction::Load && CostKind == TTI::TCK_Latency)
182 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
183 CostKind, OpInfo, I);
184
185 // TODO: Handle other cost kinds.
186 if (CostKind != TTI::TCK_RecipThroughput)
187 return 1;
188
189 if (Opcode == Instruction::Store)
190 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
191 CostKind, OpInfo, I);
192
193 if (Src->isVectorTy()) {
194 VectorType *VecTy = cast<VectorType>(Val: Src);
195 unsigned VecWidth = VecTy->getPrimitiveSizeInBits().getFixedValue();
196 if (isHVXVectorType(Ty: VecTy)) {
197 unsigned RegWidth =
198 getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
199 .getFixedValue();
200 assert(RegWidth && "Non-zero vector register width expected");
201 // Cost of HVX loads.
202 if (VecWidth % RegWidth == 0)
203 return VecWidth / RegWidth;
204 // Cost of constructing HVX vector from scalar loads
205 const Align RegAlign(RegWidth / 8);
206 if (Alignment > RegAlign)
207 Alignment = RegAlign;
208 unsigned AlignWidth = 8 * Alignment.value();
209 unsigned NumLoads = alignTo(Value: VecWidth, Align: AlignWidth) / AlignWidth;
210 return 3 * NumLoads;
211 }
212
213 // Non-HVX vectors.
214 // Add extra cost for floating point types.
215 unsigned Cost =
216 VecTy->getElementType()->isFloatingPointTy() ? FloatFactor : 1;
217
218 // At this point unspecified alignment is considered as Align(1).
219 const Align BoundAlignment = std::min(a: Alignment, b: Align(8));
220 unsigned AlignWidth = 8 * BoundAlignment.value();
221 unsigned NumLoads = alignTo(Value: VecWidth, Align: AlignWidth) / AlignWidth;
222 if (Alignment == Align(4) || Alignment == Align(8))
223 return Cost * NumLoads;
224 // Loads of less than 32 bits will need extra inserts to compose a vector.
225 assert(BoundAlignment <= Align(8));
226 unsigned LogA = Log2(A: BoundAlignment);
227 return (3 - LogA) * Cost * NumLoads;
228 }
229
230 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
231 OpInfo, I);
232}
233
234InstructionCost
235HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
236 VectorType *SrcTy, ArrayRef<int> Mask,
237 TTI::TargetCostKind CostKind, int Index,
238 VectorType *SubTp, ArrayRef<const Value *> Args,
239 const Instruction *CxtI) const {
240 return 1;
241}
242
243InstructionCost HexagonTTIImpl::getInterleavedMemoryOpCost(
244 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
245 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
246 bool UseMaskForCond, bool UseMaskForGaps) const {
247 if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps)
248 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
249 Alignment, AddressSpace,
250 CostKind,
251 UseMaskForCond, UseMaskForGaps);
252 return getMemoryOpCost(Opcode, Src: VecTy, Alignment, AddressSpace, CostKind);
253}
254
255InstructionCost HexagonTTIImpl::getCmpSelInstrCost(
256 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
257 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
258 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
259 if (ValTy->isVectorTy() && CostKind == TTI::TCK_RecipThroughput) {
260 if (!isHVXVectorType(Ty: ValTy) && ValTy->isFPOrFPVectorTy())
261 return InstructionCost::getMax();
262 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
263 if (Opcode == Instruction::FCmp)
264 return LT.first + FloatFactor * getTypeNumElements(Ty: ValTy);
265 }
266 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
267 Op1Info, Op2Info, I);
268}
269
270InstructionCost HexagonTTIImpl::getArithmeticInstrCost(
271 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
272 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
273 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
274 // TODO: Handle more cost kinds.
275 if (CostKind != TTI::TCK_RecipThroughput)
276 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
277 Opd2Info: Op2Info, Args, CxtI);
278
279 if (Ty->isVectorTy()) {
280 if (!isHVXVectorType(Ty) && Ty->isFPOrFPVectorTy())
281 return InstructionCost::getMax();
282 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
283 if (LT.second.isFloatingPoint())
284 return LT.first + FloatFactor * getTypeNumElements(Ty);
285 }
286 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info,
287 Args, CxtI);
288}
289
290InstructionCost HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy,
291 Type *SrcTy,
292 TTI::CastContextHint CCH,
293 TTI::TargetCostKind CostKind,
294 const Instruction *I) const {
295 auto isNonHVXFP = [this] (Type *Ty) {
296 return Ty->isVectorTy() && !isHVXVectorType(Ty) && Ty->isFPOrFPVectorTy();
297 };
298 if (isNonHVXFP(SrcTy) || isNonHVXFP(DstTy))
299 return InstructionCost::getMax();
300
301 if (SrcTy->isFPOrFPVectorTy() || DstTy->isFPOrFPVectorTy()) {
302 unsigned SrcN = SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(Ty: SrcTy) : 0;
303 unsigned DstN = DstTy->isFPOrFPVectorTy() ? getTypeNumElements(Ty: DstTy) : 0;
304
305 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: SrcTy);
306 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Ty: DstTy);
307 InstructionCost Cost =
308 std::max(a: SrcLT.first, b: DstLT.first) + FloatFactor * (SrcN + DstN);
309 // TODO: Allow non-throughput costs that aren't binary.
310 if (CostKind != TTI::TCK_RecipThroughput)
311 return Cost == 0 ? 0 : 1;
312 return Cost;
313 }
314 return 1;
315}
316
317InstructionCost HexagonTTIImpl::getVectorInstrCost(
318 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
319 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
320 Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType()
321 : Val;
322 if (Opcode == Instruction::InsertElement) {
323 // Need two rotations for non-zero index.
324 unsigned Cost = (Index != 0) ? 2 : 0;
325 if (ElemTy->isIntegerTy(BitWidth: 32))
326 return Cost;
327 // If it's not a 32-bit value, there will need to be an extract.
328 return Cost + getVectorInstrCost(Opcode: Instruction::ExtractElement, Val, CostKind,
329 Index, Op0, Op1, VIC);
330 }
331
332 if (Opcode == Instruction::ExtractElement)
333 return 2;
334
335 return 1;
336}
337
338bool HexagonTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
339 switch (II->getIntrinsicID()) {
340 case Intrinsic::vector_reduce_add:
341 return false;
342 }
343 return true;
344}
345
346bool HexagonTTIImpl::isLegalMaskedStore(Type *DataType, Align /*Alignment*/,
347 unsigned /*AddressSpace*/,
348 TTI::MaskKind /*MaskKind*/) const {
349 // This function is called from scalarize-masked-mem-intrin, which runs
350 // in pre-isel. Use ST directly instead of calling isHVXVectorType.
351 return HexagonMaskedVMem && ST.isTypeForHVX(VecTy: DataType);
352}
353
354bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/,
355 unsigned /*AddressSpace*/,
356 TTI::MaskKind /*MaskKind*/) const {
357 // This function is called from scalarize-masked-mem-intrin, which runs
358 // in pre-isel. Use ST directly instead of calling isHVXVectorType.
359 return HexagonMaskedVMem && ST.isTypeForHVX(VecTy: DataType);
360}
361
362bool HexagonTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const {
363 // For now assume we can not deal with all HVX datatypes.
364 if (!Ty->isVectorTy() || !ST.isTypeForHVX(VecTy: Ty) ||
365 !HexagonAllowScatterGatherHVX)
366 return false;
367 // This must be in sync with HexagonVectorCombine pass.
368 switch (Ty->getScalarSizeInBits()) {
369 case 8:
370 return (getTypeNumElements(Ty) == 128);
371 case 16:
372 if (getTypeNumElements(Ty) == 64 || getTypeNumElements(Ty) == 32)
373 return (Alignment >= 2);
374 break;
375 case 32:
376 if (getTypeNumElements(Ty) == 32)
377 return (Alignment >= 4);
378 break;
379 default:
380 break;
381 }
382 return false;
383}
384
385bool HexagonTTIImpl::isLegalMaskedScatter(Type *Ty, Align Alignment) const {
386 if (!Ty->isVectorTy() || !ST.isTypeForHVX(VecTy: Ty) ||
387 !HexagonAllowScatterGatherHVX)
388 return false;
389 // This must be in sync with HexagonVectorCombine pass.
390 switch (Ty->getScalarSizeInBits()) {
391 case 8:
392 return (getTypeNumElements(Ty) == 128);
393 case 16:
394 if (getTypeNumElements(Ty) == 64)
395 return (Alignment >= 2);
396 break;
397 case 32:
398 if (getTypeNumElements(Ty) == 32)
399 return (Alignment >= 4);
400 break;
401 default:
402 break;
403 }
404 return false;
405}
406
407bool HexagonTTIImpl::forceScalarizeMaskedGather(VectorType *VTy,
408 Align Alignment) const {
409 return !isLegalMaskedGather(Ty: VTy, Alignment);
410}
411
412bool HexagonTTIImpl::forceScalarizeMaskedScatter(VectorType *VTy,
413 Align Alignment) const {
414 return !isLegalMaskedScatter(Ty: VTy, Alignment);
415}
416
417/// --- Vector TTI end ---
418
419unsigned HexagonTTIImpl::getPrefetchDistance() const {
420 return ST.getL1PrefetchDistance();
421}
422
423unsigned HexagonTTIImpl::getCacheLineSize() const {
424 return ST.getL1CacheLineSize();
425}
426
427InstructionCost
428HexagonTTIImpl::getInstructionCost(const User *U,
429 ArrayRef<const Value *> Operands,
430 TTI::TargetCostKind CostKind) const {
431 auto isCastFoldedIntoLoad = [this](const CastInst *CI) -> bool {
432 if (!CI->isIntegerCast())
433 return false;
434 // Only extensions from an integer type shorter than 32-bit to i32
435 // can be folded into the load.
436 const DataLayout &DL = getDataLayout();
437 unsigned SBW = DL.getTypeSizeInBits(Ty: CI->getSrcTy());
438 unsigned DBW = DL.getTypeSizeInBits(Ty: CI->getDestTy());
439 if (DBW != 32 || SBW >= DBW)
440 return false;
441
442 const LoadInst *LI = dyn_cast<const LoadInst>(Val: CI->getOperand(i_nocapture: 0));
443 // Technically, this code could allow multiple uses of the load, and
444 // check if all the uses are the same extension operation, but this
445 // should be sufficient for most cases.
446 return LI && LI->hasOneUse();
447 };
448
449 if (const CastInst *CI = dyn_cast<const CastInst>(Val: U))
450 if (isCastFoldedIntoLoad(CI))
451 return TargetTransformInfo::TCC_Free;
452 return BaseT::getInstructionCost(U, Operands, CostKind);
453}
454
455bool HexagonTTIImpl::shouldBuildLookupTables() const {
456 return EmitLookupTables;
457}
458