//===- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// Hexagon target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "HexagonTargetTransformInfo.h"
#include "HexagonSubtarget.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/User.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/LoopPeel.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"

using namespace llvm;

#define DEBUG_TYPE "hexagontti"

static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(false),
    cl::Hidden, cl::desc("Enable loop vectorizer for HVX"));

cl::opt<bool> HexagonAllowScatterGatherHVX(
    "hexagon-allow-scatter-gather-hvx", cl::init(false), cl::Hidden,
    cl::desc("Allow auto-generation of HVX scatter-gather"));

static cl::opt<bool> EnableV68FloatAutoHVX(
    "force-hvx-float", cl::Hidden,
    cl::desc("Enable auto-vectorization of floating point types on v68."));

static cl::opt<bool> EmitLookupTables("hexagon-emit-lookup-tables",
    cl::init(true), cl::Hidden,
    cl::desc("Control lookup table emission on Hexagon target"));

static cl::opt<bool> HexagonMaskedVMem("hexagon-masked-vmem", cl::init(true),
    cl::Hidden, cl::desc("Enable masked loads/stores for HVX"));

// Constant "cost factor" to make floating point operations more expensive
// in terms of vectorization cost. This isn't the best way, but it should
// do. Ultimately, the cost should use cycles.
static const unsigned FloatFactor = 4;

bool HexagonTTIImpl::useHVX() const {
  return ST.useHVXOps() && HexagonAutoHVX;
}

bool HexagonTTIImpl::isHVXVectorType(Type *Ty) const {
  auto *VecTy = dyn_cast<VectorType>(Ty);
  if (!VecTy)
    return false;
  if (!ST.isTypeForHVX(VecTy))
    return false;
  if (ST.useHVXV69Ops() || !VecTy->getElementType()->isFloatingPointTy())
    return true;
  return ST.useHVXV68Ops() && EnableV68FloatAutoHVX;
}

unsigned HexagonTTIImpl::getTypeNumElements(Type *Ty) const {
  if (auto *VTy = dyn_cast<FixedVectorType>(Ty))
    return VTy->getNumElements();
  assert((Ty->isIntegerTy() || Ty->isFloatingPointTy()) &&
         "Expecting scalar type");
  return 1;
}

TargetTransformInfo::PopcntSupportKind
HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const {
  // Return fast hardware support as every input < 64 bits will be promoted
  // to 64 bits.
  return TargetTransformInfo::PSK_FastHardware;
}

// The Hexagon target can unroll loops with run-time trip counts.
void HexagonTTIImpl::getUnrollingPreferences(
    Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
    OptimizationRemarkEmitter *ORE) const {
  UP.Runtime = UP.Partial = true;
}

void HexagonTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::PeelingPreferences &PP) const {
  BaseT::getPeelingPreferences(L, SE, PP);
  // Only try to peel innermost loops with small runtime trip counts.
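  // A zero result from getSmallConstantTripCount means the exact trip count
  // is not a compile-time constant; a small non-zero max trip count still
  // bounds how many iterations the loop can execute.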
  if (L && L->isInnermost() && canPeel(L) &&
      SE.getSmallConstantTripCount(L) == 0 &&
      SE.getSmallConstantMaxTripCount(L) > 0 &&
      SE.getSmallConstantMaxTripCount(L) <= 5) {
    PP.PeelCount = 2;
  }
}

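// Hexagon has post-increment addressing modes on its loads and stores, so
// post-indexed addressing is the preferred form for loop-carried pointers.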
TTI::AddressingModeKind
HexagonTTIImpl::getPreferredAddressingMode(const Loop *L,
                                           ScalarEvolution *SE) const {
  return TTI::AMK_PostIndexed;
}

/// --- Vector TTI begin ---

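// Hexagon provides 32 scalar registers and, when HVX is in use, 32 HVX
// vector registers; ClassID 1 is the vector register class here.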
unsigned HexagonTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = ClassID == 1;
  if (Vector)
    return useHVX() ? 32 : 0;
  return 32;
}

unsigned HexagonTTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
  return useHVX() ? 2 : 1;
}

TypeSize
HexagonTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(getMinVectorRegisterBitWidth());
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned HexagonTTIImpl::getMinVectorRegisterBitWidth() const {
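  // ST.getVectorLength() is in bytes, so the 64-byte and 128-byte HVX modes
  // yield 512-bit and 1024-bit vector registers respectively.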
  return useHVX() ? ST.getVectorLength() * 8 : 32;
}

ElementCount HexagonTTIImpl::getMinimumVF(unsigned ElemWidth,
                                          bool IsScalable) const {
  assert(!IsScalable && "Scalable VFs are not supported for Hexagon");
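  // e.g. a 128-byte HVX vector with 32-bit elements gives a minimum VF of 32.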
  return ElementCount::getFixed((8 * ST.getVectorLength()) / ElemWidth);
}

InstructionCost
HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys,
                                 TTI::TargetCostKind CostKind) const {
  return BaseT::getCallInstrCost(F, RetTy, Tys, CostKind);
}

InstructionCost
HexagonTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) const {
  if (ICA.getID() == Intrinsic::bswap) {
    std::pair<InstructionCost, MVT> LT =
        getTypeLegalizationCost(ICA.getReturnType());
    return LT.first + 2;
  }
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

InstructionCost
HexagonTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
                                          const SCEV *S,
                                          TTI::TargetCostKind CostKind) const {
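  // Address computations fold into Hexagon's addressing modes, so model
  // them as free.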
  return 0;
}

InstructionCost HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                                Align Alignment,
                                                unsigned AddressSpace,
                                                TTI::TargetCostKind CostKind,
                                                TTI::OperandValueInfo OpInfo,
                                                const Instruction *I) const {
  assert(Opcode == Instruction::Load || Opcode == Instruction::Store);
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  if (Opcode == Instruction::Store)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);

  if (Src->isVectorTy()) {
    VectorType *VecTy = cast<VectorType>(Src);
    unsigned VecWidth = VecTy->getPrimitiveSizeInBits().getFixedValue();
    if (isHVXVectorType(VecTy)) {
      unsigned RegWidth =
          getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedValue();
      assert(RegWidth && "Non-zero vector register width expected");
      // Cost of HVX loads.
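      // A type that fills N whole registers costs N vector loads, e.g. a
      // vector twice the register width costs 2.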
      if (VecWidth % RegWidth == 0)
        return VecWidth / RegWidth;
      // Cost of constructing HVX vector from scalar loads
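      // The factor of 3 below is a heuristic per-piece charge for the extra
      // operations needed to assemble the full vector from partial loads.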
      const Align RegAlign(RegWidth / 8);
      if (Alignment > RegAlign)
        Alignment = RegAlign;
      unsigned AlignWidth = 8 * Alignment.value();
      unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth;
      return 3 * NumLoads;
    }

    // Non-HVX vectors.
    // Add extra cost for floating point types.
    unsigned Cost =
        VecTy->getElementType()->isFloatingPointTy() ? FloatFactor : 1;

    // At this point unspecified alignment is considered as Align(1).
    const Align BoundAlignment = std::min(Alignment, Align(8));
    unsigned AlignWidth = 8 * BoundAlignment.value();
    unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth;
    if (Alignment == Align(4) || Alignment == Align(8))
      return Cost * NumLoads;
    // Loads of less than 32 bits will need extra inserts to compose a vector.
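    // e.g. at Align(1) each byte-wide piece is charged 3x, and at Align(2)
    // each halfword piece 2x, per the (3 - LogA) scaling below.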
    assert(BoundAlignment <= Align(8));
    unsigned LogA = Log2(BoundAlignment);
    return (3 - LogA) * Cost * NumLoads;
  }

  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
                                OpInfo, I);
}

InstructionCost
HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
                               VectorType *SrcTy, ArrayRef<int> Mask,
                               TTI::TargetCostKind CostKind, int Index,
                               VectorType *SubTp, ArrayRef<const Value *> Args,
                               const Instruction *CxtI) const {
  return 1;
}

InstructionCost HexagonTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) const {
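  // When the access uses every member of the interleave group and needs no
  // masking, charge only the cost of the wide vector memory operation.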
  if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);
  return getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
}

InstructionCost HexagonTTIImpl::getCmpSelInstrCost(
    unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
    TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
    TTI::OperandValueInfo Op2Info, const Instruction *I) const {
  if (ValTy->isVectorTy() && CostKind == TTI::TCK_RecipThroughput) {
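    // Non-HVX floating-point vectors would have to be scalarized; return
    // the maximum cost to steer the vectorizer away from them.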
    if (!isHVXVectorType(ValTy) && ValTy->isFPOrFPVectorTy())
      return InstructionCost::getMax();
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
    if (Opcode == Instruction::FCmp)
      return LT.first + FloatFactor * getTypeNumElements(ValTy);
  }
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                   Op1Info, Op2Info, I);
}

InstructionCost HexagonTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {
  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Args, CxtI);

  if (Ty->isVectorTy()) {
    if (!isHVXVectorType(Ty) && Ty->isFPOrFPVectorTy())
      return InstructionCost::getMax();
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
    if (LT.second.isFloatingPoint())
      return LT.first + FloatFactor * getTypeNumElements(Ty);
  }
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtI);
}

InstructionCost HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy,
                                                 Type *SrcTy,
                                                 TTI::CastContextHint CCH,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) const {
  auto isNonHVXFP = [this](Type *Ty) {
    return Ty->isVectorTy() && !isHVXVectorType(Ty) && Ty->isFPOrFPVectorTy();
  };
  if (isNonHVXFP(SrcTy) || isNonHVXFP(DstTy))
    return InstructionCost::getMax();

  if (SrcTy->isFPOrFPVectorTy() || DstTy->isFPOrFPVectorTy()) {
    unsigned SrcN = SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(SrcTy) : 0;
    unsigned DstN = DstTy->isFPOrFPVectorTy() ? getTypeNumElements(DstTy) : 0;

    std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcTy);
    std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(DstTy);
    InstructionCost Cost =
        std::max(SrcLT.first, DstLT.first) + FloatFactor * (SrcN + DstN);
    // TODO: Allow non-throughput costs that aren't binary.
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost == 0 ? 0 : 1;
    return Cost;
  }
  return 1;
}

InstructionCost HexagonTTIImpl::getVectorInstrCost(
    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
    const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
  Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType()
                                   : Val;
  if (Opcode == Instruction::InsertElement) {
    // Need two rotations for non-zero index.
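    // (rotate the target lane into position 0, insert, then rotate back).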
    unsigned Cost = (Index != 0) ? 2 : 0;
    if (ElemTy->isIntegerTy(32))
      return Cost;
    // If it's not a 32-bit value, there will need to be an extract.
    return Cost + getVectorInstrCost(Instruction::ExtractElement, Val, CostKind,
                                     Index, Op0, Op1, VIC);
  }

  if (Opcode == Instruction::ExtractElement)
    return 2;

  return 1;
}

bool HexagonTTIImpl::isLegalMaskedStore(Type *DataType, Align /*Alignment*/,
                                        unsigned /*AddressSpace*/,
                                        TTI::MaskKind /*MaskKind*/) const {
  // This function is called from scalarize-masked-mem-intrin, which runs
  // in pre-isel. Use ST directly instead of calling isHVXVectorType.
  return HexagonMaskedVMem && ST.isTypeForHVX(DataType);
}

bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/,
                                       unsigned /*AddressSpace*/,
                                       TTI::MaskKind /*MaskKind*/) const {
  // This function is called from scalarize-masked-mem-intrin, which runs
  // in pre-isel. Use ST directly instead of calling isHVXVectorType.
  return HexagonMaskedVMem && ST.isTypeForHVX(DataType);
}

bool HexagonTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const {
  // For now, assume we cannot deal with all HVX data types.
  if (!Ty->isVectorTy() || !ST.isTypeForHVX(Ty) ||
      !HexagonAllowScatterGatherHVX)
    return false;
  // This must be in sync with HexagonVectorCombine pass.
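  // The accepted shapes are whole 128-byte HVX vectors (128 x i8, 64 x i16,
  // 32 x i32) plus the half-vector 32 x i16 case.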
  switch (Ty->getScalarSizeInBits()) {
  case 8:
    return (getTypeNumElements(Ty) == 128);
  case 16:
    if (getTypeNumElements(Ty) == 64 || getTypeNumElements(Ty) == 32)
      return (Alignment >= 2);
    break;
  case 32:
    if (getTypeNumElements(Ty) == 32)
      return (Alignment >= 4);
    break;
  default:
    break;
  }
  return false;
}

bool HexagonTTIImpl::isLegalMaskedScatter(Type *Ty, Align Alignment) const {
  if (!Ty->isVectorTy() || !ST.isTypeForHVX(Ty) ||
      !HexagonAllowScatterGatherHVX)
    return false;
  // This must be in sync with HexagonVectorCombine pass.
  switch (Ty->getScalarSizeInBits()) {
  case 8:
    return (getTypeNumElements(Ty) == 128);
  case 16:
    if (getTypeNumElements(Ty) == 64)
      return (Alignment >= 2);
    break;
  case 32:
    if (getTypeNumElements(Ty) == 32)
      return (Alignment >= 4);
    break;
  default:
    break;
  }
  return false;
}

bool HexagonTTIImpl::forceScalarizeMaskedGather(VectorType *VTy,
                                                Align Alignment) const {
  return !isLegalMaskedGather(VTy, Alignment);
}

bool HexagonTTIImpl::forceScalarizeMaskedScatter(VectorType *VTy,
                                                 Align Alignment) const {
  return !isLegalMaskedScatter(VTy, Alignment);
}

/// --- Vector TTI end ---

unsigned HexagonTTIImpl::getPrefetchDistance() const {
  return ST.getL1PrefetchDistance();
}

unsigned HexagonTTIImpl::getCacheLineSize() const {
  return ST.getL1CacheLineSize();
}

InstructionCost
HexagonTTIImpl::getInstructionCost(const User *U,
                                   ArrayRef<const Value *> Operands,
                                   TTI::TargetCostKind CostKind) const {
  auto isCastFoldedIntoLoad = [this](const CastInst *CI) -> bool {
    if (!CI->isIntegerCast())
      return false;
    // Only extensions from an integer type shorter than 32-bit to i32
    // can be folded into the load.
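    // (Hexagon's extending loads, e.g. memub/memh, widen to 32 bits as
    // part of the memory access itself.)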
    const DataLayout &DL = getDataLayout();
    unsigned SBW = DL.getTypeSizeInBits(CI->getSrcTy());
    unsigned DBW = DL.getTypeSizeInBits(CI->getDestTy());
    if (DBW != 32 || SBW >= DBW)
      return false;

    const LoadInst *LI = dyn_cast<const LoadInst>(CI->getOperand(0));
    // Technically, this code could allow multiple uses of the load, and
    // check if all the uses are the same extension operation, but this
    // should be sufficient for most cases.
    return LI && LI->hasOneUse();
  };

  if (const CastInst *CI = dyn_cast<const CastInst>(U))
    if (isCastFoldedIntoLoad(CI))
      return TargetTransformInfo::TCC_Free;
  return BaseT::getInstructionCost(U, Operands, CostKind);
}

bool HexagonTTIImpl::shouldBuildLookupTables() const {
  return EmitLookupTables;
}