1//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUTargetTransformInfo.h"
18#include "AMDGPUSubtarget.h"
19#include "AMDGPUTargetMachine.h"
20#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21#include "SIModeRegisterDefaults.h"
22#include "llvm/ADT/SmallBitVector.h"
23#include "llvm/Analysis/InlineCost.h"
24#include "llvm/Analysis/LoopInfo.h"
25#include "llvm/Analysis/ValueTracking.h"
26#include "llvm/CodeGen/Analysis.h"
27#include "llvm/IR/Function.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include "llvm/IR/PatternMatch.h"
31#include "llvm/Support/KnownBits.h"
32#include <optional>
33
34using namespace llvm;
35
36#define DEBUG_TYPE "AMDGPUtti"
37
38static cl::opt<unsigned> UnrollThresholdPrivate(
39 "amdgpu-unroll-threshold-private",
40 cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
41 cl::init(Val: 2700), cl::Hidden);
42
43static cl::opt<unsigned> UnrollThresholdLocal(
44 "amdgpu-unroll-threshold-local",
45 cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
46 cl::init(Val: 1000), cl::Hidden);
47
48static cl::opt<unsigned> UnrollThresholdIf(
49 "amdgpu-unroll-threshold-if",
50 cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
51 cl::init(Val: 200), cl::Hidden);
52
53static cl::opt<bool> UnrollRuntimeLocal(
54 "amdgpu-unroll-runtime-local",
55 cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
56 cl::init(Val: true), cl::Hidden);
57
58static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
59 "amdgpu-unroll-max-block-to-analyze",
60 cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
61 cl::init(Val: 32), cl::Hidden);
62
63static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
64 cl::Hidden, cl::init(Val: 4000),
65 cl::desc("Cost of alloca argument"));
66
67// If the amount of scratch memory to eliminate exceeds our ability to allocate
68// it into registers we gain nothing by aggressively inlining functions for that
69// heuristic.
70static cl::opt<unsigned>
71 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
72 cl::init(Val: 256),
73 cl::desc("Maximum alloca size to use for inline cost"));
74
75// Inliner constraint to achieve reasonable compilation time.
76static cl::opt<size_t> InlineMaxBB(
77 "amdgpu-inline-max-bb", cl::Hidden, cl::init(Val: 1100),
78 cl::desc("Maximum number of BBs allowed in a function after inlining"
79 " (compile time constraint)"));
80
81// This default unroll factor is based on microbenchmarks on gfx1030.
82static cl::opt<unsigned> MemcpyLoopUnroll(
83 "amdgpu-memcpy-loop-unroll",
84 cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
85 "operations when lowering statically-sized memcpy, memmove, or"
86 "memset as a loop"),
87 cl::init(Val: 16), cl::Hidden);
88
89static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
90 unsigned Depth = 0) {
91 const Instruction *I = dyn_cast<Instruction>(Val: Cond);
92 if (!I)
93 return false;
94
95 if (!L->contains(Inst: I))
96 return false;
97 for (const Value *V : I->operand_values()) {
98 if (const PHINode *PHI = dyn_cast<PHINode>(Val: V)) {
99 if (llvm::none_of(Range: L->getSubLoops(), P: [PHI](const Loop* SubLoop) {
100 return SubLoop->contains(Inst: PHI); }))
101 return true;
102 } else if (Depth < 10 && dependsOnLocalPhi(L, Cond: V, Depth: Depth+1))
103 return true;
104 }
105 return false;
106}
107
108AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
109 : BaseT(TM, F.getDataLayout()),
110 TargetTriple(TM->getTargetTriple()),
111 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
112 TLI(ST->getTargetLowering()) {}
113
114void AMDGPUTTIImpl::getUnrollingPreferences(
115 Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
116 OptimizationRemarkEmitter *ORE) const {
117 const Function &F = *L->getHeader()->getParent();
118 UP.Threshold =
119 F.getFnAttributeAsParsedInteger(Kind: "amdgpu-unroll-threshold", Default: 300);
120 UP.MaxCount = std::numeric_limits<unsigned>::max();
121 UP.Partial = true;
122
123 // Conditional branch in a loop back edge needs 3 additional exec
124 // manipulations in average.
125 UP.BEInsns += 3;
126
127 // We want to run unroll even for the loops which have been vectorized.
128 UP.UnrollVectorizedLoop = true;
129
130 // Enable runtime unrolling for loops whose trip count is not known at
131 // compile time.
132 UP.Runtime = true;
133
134 // Maximum alloca size than can fit registers. Reserve 16 registers.
135 const unsigned MaxAlloca = (256 - 16) * 4;
136 unsigned ThresholdPrivate = UnrollThresholdPrivate;
137 unsigned ThresholdLocal = UnrollThresholdLocal;
138
139 // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
140 // provided threshold value as the default for Threshold
141 if (MDNode *LoopUnrollThreshold =
142 findOptionMDForLoop(TheLoop: L, Name: "amdgpu.loop.unroll.threshold")) {
143 if (LoopUnrollThreshold->getNumOperands() == 2) {
144 ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
145 MD: LoopUnrollThreshold->getOperand(I: 1));
146 if (MetaThresholdValue) {
147 // We will also use the supplied value for PartialThreshold for now.
148 // We may introduce additional metadata if it becomes necessary in the
149 // future.
150 UP.Threshold = MetaThresholdValue->getSExtValue();
151 UP.PartialThreshold = UP.Threshold;
152 ThresholdPrivate = std::min(a: ThresholdPrivate, b: UP.Threshold);
153 ThresholdLocal = std::min(a: ThresholdLocal, b: UP.Threshold);
154 }
155 }
156 }
157
158 unsigned MaxBoost = std::max(a: ThresholdPrivate, b: ThresholdLocal);
159 for (const BasicBlock *BB : L->getBlocks()) {
160 const DataLayout &DL = BB->getDataLayout();
161 unsigned LocalGEPsSeen = 0;
162
163 if (llvm::any_of(Range: L->getSubLoops(), P: [BB](const Loop* SubLoop) {
164 return SubLoop->contains(BB); }))
165 continue; // Block belongs to an inner loop.
166
167 for (const Instruction &I : *BB) {
168 // Unroll a loop which contains an "if" statement whose condition
169 // defined by a PHI belonging to the loop. This may help to eliminate
170 // if region and potentially even PHI itself, saving on both divergence
171 // and registers used for the PHI.
172 // Add a small bonus for each of such "if" statements.
173 if (const CondBrInst *Br = dyn_cast<CondBrInst>(Val: &I)) {
174 if (UP.Threshold < MaxBoost) {
175 BasicBlock *Succ0 = Br->getSuccessor(i: 0);
176 BasicBlock *Succ1 = Br->getSuccessor(i: 1);
177 if ((L->contains(BB: Succ0) && L->isLoopExiting(BB: Succ0)) ||
178 (L->contains(BB: Succ1) && L->isLoopExiting(BB: Succ1)))
179 continue;
180 if (dependsOnLocalPhi(L, Cond: Br->getCondition())) {
181 UP.Threshold += UnrollThresholdIf;
182 LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
183 << " for loop:\n"
184 << *L << " due to " << *Br << '\n');
185 if (UP.Threshold >= MaxBoost)
186 return;
187 }
188 }
189 continue;
190 }
191
192 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: &I);
193 if (!GEP)
194 continue;
195
196 unsigned AS = GEP->getAddressSpace();
197 unsigned Threshold = 0;
198 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
199 Threshold = ThresholdPrivate;
200 else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
201 Threshold = ThresholdLocal;
202 else
203 continue;
204
205 if (UP.Threshold >= Threshold)
206 continue;
207
208 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
209 const Value *Ptr = GEP->getPointerOperand();
210 const AllocaInst *Alloca =
211 dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: Ptr));
212 if (!Alloca || !Alloca->isStaticAlloca())
213 continue;
214 auto AllocaSize = Alloca->getAllocationSize(DL);
215 if (!AllocaSize || AllocaSize->getFixedValue() > MaxAlloca)
216 continue;
217 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
218 AS == AMDGPUAS::REGION_ADDRESS) {
219 LocalGEPsSeen++;
220 // Inhibit unroll for local memory if we have seen addressing not to
221 // a variable, most likely we will be unable to combine it.
222 // Do not unroll too deep inner loops for local memory to give a chance
223 // to unroll an outer loop for a more important reason.
224 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
225 (!isa<GlobalVariable>(Val: GEP->getPointerOperand()) &&
226 !isa<Argument>(Val: GEP->getPointerOperand())))
227 continue;
228 LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
229 << *L << " due to LDS use.\n");
230 UP.Runtime = UnrollRuntimeLocal;
231 }
232
233 // Check if GEP depends on a value defined by this loop itself.
234 bool HasLoopDef = false;
235 for (const Value *Op : GEP->operands()) {
236 const Instruction *Inst = dyn_cast<Instruction>(Val: Op);
237 if (!Inst || L->isLoopInvariant(V: Op))
238 continue;
239
240 if (llvm::any_of(Range: L->getSubLoops(), P: [Inst](const Loop* SubLoop) {
241 return SubLoop->contains(Inst); }))
242 continue;
243 HasLoopDef = true;
244 break;
245 }
246 if (!HasLoopDef)
247 continue;
248
249 // We want to do whatever we can to limit the number of alloca
250 // instructions that make it through to the code generator. allocas
251 // require us to use indirect addressing, which is slow and prone to
252 // compiler bugs. If this loop does an address calculation on an
253 // alloca ptr, then we want to use a higher than normal loop unroll
254 // threshold. This will give SROA a better chance to eliminate these
255 // allocas.
256 //
257 // We also want to have more unrolling for local memory to let ds
258 // instructions with different offsets combine.
259 //
260 // Don't use the maximum allowed value here as it will make some
261 // programs way too big.
262 UP.Threshold = Threshold;
263 LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
264 << " for loop:\n"
265 << *L << " due to " << *GEP << '\n');
266 if (UP.Threshold >= MaxBoost)
267 return;
268 }
269
270 // If we got a GEP in a small BB from inner loop then increase max trip
271 // count to analyze for better estimation cost in unroll
272 if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
273 UP.MaxIterationsCountToAnalyze = 32;
274 }
275}
276
277void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
278 TTI::PeelingPreferences &PP) const {
279 BaseT::getPeelingPreferences(L, SE, PP);
280}
281
282uint64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
283 return 1024;
284}
285
286GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
287 : BaseT(TM, F.getDataLayout()),
288 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
289 TLI(ST->getTargetLowering()), CommonTTI(TM, F),
290 IsGraphics(AMDGPU::isGraphics(CC: F.getCallingConv())) {
291 SIModeRegisterDefaults Mode(F, *ST);
292 HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
293 HasFP64FP16Denormals =
294 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
295}
296
297bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
298 return !F || !ST->isSingleLaneExecution(Kernel: *F);
299}
300
301unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
302 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
303 // registers. See getRegisterClassForType for the implementation.
304 // In this case vector registers are not vector in terms of
305 // VGPRs, but those which can hold multiple values.
306
307 // This is really the number of registers to fill when vectorizing /
308 // interleaving loops, so we lie to avoid trying to use all registers.
309 return 4;
310}
311
312TypeSize
313GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
314 switch (K) {
315 case TargetTransformInfo::RGK_Scalar:
316 return TypeSize::getFixed(ExactSize: 32);
317 case TargetTransformInfo::RGK_FixedWidthVector:
318 return TypeSize::getFixed(ExactSize: (ST->hasPackedFP64Ops() || ST->hasPackedU64Ops())
319 ? 128
320 : ST->hasPackedFP32Ops() ? 64
321 : 32);
322 case TargetTransformInfo::RGK_ScalableVector:
323 return TypeSize::getScalable(MinimumSize: 0);
324 }
325 llvm_unreachable("Unsupported register kind");
326}
327
328unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
329 return 32;
330}
331
332unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
333 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
334 return 32 * 4 / ElemWidth;
335 // For a given width return the max 0number of elements that can be combined
336 // into a wider bit value:
337 return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
338 : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
339 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
340 : (ElemWidth == 64 &&
341 (ST->hasPackedFP64Ops() || ST->hasPackedU64Ops()))
342 ? 2
343 : 1;
344}
345
346bool GCNTTIImpl::preferSLPInstCountCheck() const {
347 // The integer inst-count heuristic causes regressions on gfx94x and gfx950
348 // because 2-element vector trees that pass the scalar/vector instruction
349 // count comparison still widen scalar moves (e.g. v_mov_b32 to v_mov_b64)
350 // after codegen, increasing register pressure and throughput cost without
351 // reducing the total instruction count.
352 return !ST->hasGFX940Insts() && !ST->hasGFX950Insts();
353}
354
355unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
356 unsigned ChainSizeInBytes,
357 VectorType *VecTy) const {
358 unsigned VecRegBitWidth = VF * LoadSize;
359 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
360 // TODO: Support element-size less than 32bit?
361 return 128 / LoadSize;
362
363 return VF;
364}
365
366unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
367 unsigned ChainSizeInBytes,
368 VectorType *VecTy) const {
369 unsigned VecRegBitWidth = VF * StoreSize;
370 if (VecRegBitWidth > 128)
371 return 128 / StoreSize;
372
373 return VF;
374}
375
376unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
377 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
378 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
379 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
380 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
381 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
382 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
383 return 512;
384 }
385
386 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
387 return 8 * ST->getMaxPrivateElementSize();
388
389 // Common to flat, global, local and region. Assume for unknown addrspace.
390 return 128;
391}
392
393bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
394 Align Alignment,
395 unsigned AddrSpace) const {
396 // We allow vectorization of flat stores, even though we may need to decompose
397 // them later if they may access private memory. We don't have enough context
398 // here, and legalization can handle it.
399 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
400 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
401 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
402 }
403 return true;
404}
405
406bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
407 Align Alignment,
408 unsigned AddrSpace) const {
409 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
410}
411
412bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
413 Align Alignment,
414 unsigned AddrSpace) const {
415 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
416}
417
418uint64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
419 return 1024;
420}
421
422Type *GCNTTIImpl::getMemcpyLoopLoweringType(
423 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
424 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
425 std::optional<uint32_t> AtomicElementSize) const {
426
427 if (AtomicElementSize)
428 return Type::getIntNTy(C&: Context, N: *AtomicElementSize * 8);
429
430 // 16-byte accesses achieve the highest copy throughput.
431 // If the operation has a fixed known length that is large enough, it is
432 // worthwhile to return an even wider type and let legalization lower it into
433 // multiple accesses, effectively unrolling the memcpy loop.
434 // We also rely on legalization to decompose into smaller accesses for
435 // subtargets and address spaces where it is necessary.
436 //
437 // Don't unroll if Length is not a constant, since unrolling leads to worse
438 // performance for length values that are smaller or slightly larger than the
439 // total size of the type returned here. Mitigating that would require a more
440 // complex lowering for variable-length memcpy and memmove.
441 unsigned I32EltsInVector = 4;
442 if (MemcpyLoopUnroll > 0 && isa<ConstantInt>(Val: Length))
443 return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context),
444 NumElts: MemcpyLoopUnroll * I32EltsInVector);
445
446 return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: I32EltsInVector);
447}
448
449void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
450 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
451 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
452 Align SrcAlign, Align DestAlign,
453 std::optional<uint32_t> AtomicCpySize) const {
454
455 if (AtomicCpySize)
456 BaseT::getMemcpyLoopResidualLoweringType(
457 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
458 DestAlign, AtomicCpySize);
459
460 Type *I32x4Ty = FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: 4);
461 while (RemainingBytes >= 16) {
462 OpsOut.push_back(Elt: I32x4Ty);
463 RemainingBytes -= 16;
464 }
465
466 Type *I64Ty = Type::getInt64Ty(C&: Context);
467 while (RemainingBytes >= 8) {
468 OpsOut.push_back(Elt: I64Ty);
469 RemainingBytes -= 8;
470 }
471
472 Type *I32Ty = Type::getInt32Ty(C&: Context);
473 while (RemainingBytes >= 4) {
474 OpsOut.push_back(Elt: I32Ty);
475 RemainingBytes -= 4;
476 }
477
478 Type *I16Ty = Type::getInt16Ty(C&: Context);
479 while (RemainingBytes >= 2) {
480 OpsOut.push_back(Elt: I16Ty);
481 RemainingBytes -= 2;
482 }
483
484 Type *I8Ty = Type::getInt8Ty(C&: Context);
485 while (RemainingBytes) {
486 OpsOut.push_back(Elt: I8Ty);
487 --RemainingBytes;
488 }
489}
490
491unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF,
492 bool HasUnorderedReductions) const {
493 // Disable unrolling if the loop is not vectorized.
494 // TODO: Enable this again.
495 if (VF.isScalar())
496 return 1;
497
498 return 8;
499}
500
501bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
502 MemIntrinsicInfo &Info) const {
503 switch (Inst->getIntrinsicID()) {
504 case Intrinsic::amdgcn_ds_ordered_add:
505 case Intrinsic::amdgcn_ds_ordered_swap: {
506 auto *Ordering = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: 2));
507 auto *Volatile = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: 4));
508 if (!Ordering || !Volatile)
509 return false; // Invalid.
510
511 unsigned OrderingVal = Ordering->getZExtValue();
512 if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
513 return false;
514
515 Info.PtrVal = Inst->getArgOperand(i: 0);
516 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
517 Info.ReadMem = true;
518 Info.WriteMem = true;
519 Info.IsVolatile = !Volatile->isZero();
520 return true;
521 }
522 default:
523 return false;
524 }
525}
526
527InstructionCost GCNTTIImpl::getArithmeticInstrCost(
528 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
529 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
530 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
531
532 // Legalize the type.
533 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
534 int ISD = TLI->InstructionOpcodeToISD(Opcode);
535
536 // Because we don't have any legal vector operations, but the legal types, we
537 // need to account for split vectors.
538 unsigned NElts = LT.second.isVector() ?
539 LT.second.getVectorNumElements() : 1;
540
541 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
542
543 switch (ISD) {
544 case ISD::SHL:
545 case ISD::SRL:
546 case ISD::SRA:
547 if (SLT == MVT::i64)
548 return get64BitInstrCost(CostKind) * LT.first * NElts;
549
550 if (ST->has16BitInsts() && SLT == MVT::i16)
551 NElts = (NElts + 1) / 2;
552
553 // i32
554 return getFullRateInstrCost() * LT.first * NElts;
555 case ISD::ADD:
556 case ISD::SUB:
557 if (SLT == MVT::i64 && ST->hasPackedU64Ops())
558 NElts = (NElts + 1) / 2;
559 [[fallthrough]];
560 case ISD::AND:
561 case ISD::OR:
562 case ISD::XOR:
563 if (SLT == MVT::i64) {
564 // and, or and xor are typically split into 2 VALU instructions.
565 return 2 * getFullRateInstrCost() * LT.first * NElts;
566 }
567
568 if (ST->has16BitInsts() && SLT == MVT::i16)
569 NElts = (NElts + 1) / 2;
570
571 return LT.first * NElts * getFullRateInstrCost();
572 case ISD::MUL: {
573 const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
574 if (SLT == MVT::i64) {
575 const int FullRateCost = getFullRateInstrCost();
576 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
577 }
578
579 if (ST->has16BitInsts() && SLT == MVT::i16)
580 NElts = (NElts + 1) / 2;
581
582 // i32
583 return QuarterRateCost * NElts * LT.first;
584 }
585 case ISD::FMUL:
586 // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
587 // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
588 // fused operation.
589 if (CxtI && CxtI->hasOneUse())
590 if (const auto *FAdd = dyn_cast<BinaryOperator>(Val: *CxtI->user_begin())) {
591 const int OPC = TLI->InstructionOpcodeToISD(Opcode: FAdd->getOpcode());
592 if (OPC == ISD::FADD || OPC == ISD::FSUB) {
593 if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
594 return TargetTransformInfo::TCC_Free;
595 if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
596 return TargetTransformInfo::TCC_Free;
597
598 // Estimate all types may be fused with contract/unsafe flags
599 const TargetOptions &Options = TLI->getTargetMachine().Options;
600 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
601 (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
602 return TargetTransformInfo::TCC_Free;
603 }
604 }
605 [[fallthrough]];
606 case ISD::FADD:
607 case ISD::FSUB:
608 if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
609 NElts = (NElts + 1) / 2;
610 if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
611 NElts = (NElts + 1) / 2;
612 if (SLT == MVT::f64) {
613 if (ST->hasPackedFP64Ops())
614 NElts = (NElts + 1) / 2;
615 return LT.first * NElts * get64BitInstrCost(CostKind);
616 }
617
618 if (ST->has16BitInsts() && SLT == MVT::f16)
619 NElts = (NElts + 1) / 2;
620
621 if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)
622 return LT.first * NElts * getFullRateInstrCost();
623 break;
624 case ISD::FDIV:
625 case ISD::FREM:
626 // FIXME: frem should be handled separately. The fdiv in it is most of it,
627 // but the current lowering is also not entirely correct.
628 if (SLT == MVT::f64) {
629 int Cost = 7 * get64BitInstrCost(CostKind) +
630 getQuarterRateInstrCost(CostKind) +
631 3 * getHalfRateInstrCost(CostKind);
632 // Add cost of workaround.
633 if (!ST->hasUsableDivScaleConditionOutput())
634 Cost += 3 * getFullRateInstrCost();
635
636 return LT.first * Cost * NElts;
637 }
638
639 if (!Args.empty() && match(V: Args[0], P: PatternMatch::m_FPOne())) {
640 // TODO: This is more complicated, unsafe flags etc.
641 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
642 (SLT == MVT::f16 && ST->has16BitInsts())) {
643 return LT.first * getTransInstrCost(CostKind) * NElts;
644 }
645 }
646
647 if (SLT == MVT::f16 && ST->has16BitInsts()) {
648 // 2 x v_cvt_f32_f16
649 // f32 rcp
650 // f32 fmul
651 // v_cvt_f16_f32
652 // f16 div_fixup
653 int Cost = 4 * getFullRateInstrCost() + 2 * getTransInstrCost(CostKind);
654 return LT.first * Cost * NElts;
655 }
656
657 if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {
658 // Fast unsafe fdiv lowering:
659 // f32 rcp
660 // f32 fmul
661 int Cost = getTransInstrCost(CostKind) + getFullRateInstrCost();
662 return LT.first * Cost * NElts;
663 }
664
665 if (SLT == MVT::f32 || SLT == MVT::f16) {
666 // 4 more v_cvt_* insts without f16 insts support
667 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
668 1 * getTransInstrCost(CostKind);
669
670 if (!HasFP32Denormals) {
671 // FP mode switches.
672 Cost += 2 * getFullRateInstrCost();
673 }
674
675 return LT.first * NElts * Cost;
676 }
677 break;
678 case ISD::FNEG:
679 // Use the backend' estimation. If fneg is not free each element will cost
680 // one additional instruction.
681 return TLI->isFNegFree(VT: SLT) ? 0 : NElts;
682 default:
683 break;
684 }
685
686 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info,
687 Args, CxtI);
688}
689
690// Return true if there's a potential benefit from using v2f16/v2i16
691// instructions for an intrinsic, even if it requires nontrivial legalization.
692static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
693 switch (ID) {
694 case Intrinsic::fma:
695 case Intrinsic::fmuladd:
696 case Intrinsic::copysign:
697 case Intrinsic::minimumnum:
698 case Intrinsic::maximumnum:
699 case Intrinsic::canonicalize:
700 // There's a small benefit to using vector ops in the legalized code.
701 case Intrinsic::round:
702 case Intrinsic::uadd_sat:
703 case Intrinsic::usub_sat:
704 case Intrinsic::sadd_sat:
705 case Intrinsic::ssub_sat:
706 case Intrinsic::abs:
707 return true;
708 default:
709 return false;
710 }
711}
712
713InstructionCost
714GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
715 TTI::TargetCostKind CostKind) const {
716 switch (ICA.getID()) {
717 case Intrinsic::fabs:
718 // Free source modifier in the common case.
719 return 0;
720 case Intrinsic::amdgcn_workitem_id_x:
721 case Intrinsic::amdgcn_workitem_id_y:
722 case Intrinsic::amdgcn_workitem_id_z:
723 // TODO: If hasPackedTID, or if the calling context is not an entry point
724 // there may be a bit instruction.
725 return 0;
726 case Intrinsic::amdgcn_workgroup_id_x:
727 case Intrinsic::amdgcn_workgroup_id_y:
728 case Intrinsic::amdgcn_workgroup_id_z:
729 case Intrinsic::amdgcn_lds_kernel_id:
730 case Intrinsic::amdgcn_dispatch_ptr:
731 case Intrinsic::amdgcn_dispatch_id:
732 case Intrinsic::amdgcn_implicitarg_ptr:
733 case Intrinsic::amdgcn_queue_ptr:
734 // Read from an argument register.
735 return 0;
736 default:
737 break;
738 }
739
740 Type *RetTy = ICA.getReturnType();
741
742 Intrinsic::ID IID = ICA.getID();
743 switch (IID) {
744 case Intrinsic::exp:
745 case Intrinsic::exp2:
746 case Intrinsic::exp10: {
747 // Legalize the type.
748 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);
749 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
750 unsigned NElts =
751 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
752
753 if (SLT == MVT::f64) {
754 unsigned NumOps = 20;
755 if (IID == Intrinsic::exp)
756 ++NumOps;
757 else if (IID == Intrinsic::exp10)
758 NumOps += 3;
759
760 return LT.first * NElts * NumOps * get64BitInstrCost(CostKind);
761 }
762
763 if (SLT == MVT::f32) {
764 unsigned NumFullRateOps = 0;
765 // v_exp_f32 (transcendental).
766 unsigned NumTransOps = 1;
767
768 if (!ICA.getFlags().approxFunc() && IID != Intrinsic::exp2) {
769 // Non-AFN exp/exp10: range reduction + v_exp_f32 + ldexp +
770 // overflow/underflow checks (lowerFEXP). Denorm is also handled.
771 // FMA preamble: ~13 full-rate ops; non-FMA: ~17.
772 NumFullRateOps = ST->hasFastFMAF32() ? 13 : 17;
773 } else {
774 if (IID == Intrinsic::exp) {
775 // lowerFEXPUnsafe: fmul (base conversion) + v_exp_f32.
776 NumFullRateOps = 1;
777 } else if (IID == Intrinsic::exp10) {
778 // lowerFEXP10Unsafe: 3 fmul + 2 v_exp_f32 (double-exp2).
779 NumFullRateOps = 3;
780 NumTransOps = 2;
781 }
782 // Denorm scaling adds setcc + select + fadd + select + fmul.
783 if (HasFP32Denormals)
784 NumFullRateOps += 5;
785 }
786
787 InstructionCost Cost = NumFullRateOps * getFullRateInstrCost() +
788 NumTransOps * getTransInstrCost(CostKind);
789 return LT.first * NElts * Cost;
790 }
791
792 break;
793 }
794 case Intrinsic::log:
795 case Intrinsic::log2:
796 case Intrinsic::log10: {
797 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);
798 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
799 unsigned NElts =
800 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
801
802 if (SLT == MVT::f32) {
803 unsigned NumFullRateOps = 0;
804
805 if (IID == Intrinsic::log2) {
806 // LowerFLOG2: just v_log_f32.
807 } else if (ICA.getFlags().approxFunc()) {
808 // LowerFLOGUnsafe: v_log_f32 + fmul (base conversion).
809 NumFullRateOps = 1;
810 } else {
811 // LowerFLOGCommon non-AFN: v_log_f32 + extended-precision
812 // multiply + finite check.
813 NumFullRateOps = ST->hasFastFMAF32() ? 8 : 11;
814 }
815
816 if (HasFP32Denormals)
817 NumFullRateOps += 5;
818
819 InstructionCost Cost =
820 NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
821 return LT.first * NElts * Cost;
822 }
823
824 break;
825 }
826 case Intrinsic::sin:
827 case Intrinsic::cos: {
828 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);
829 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
830 unsigned NElts =
831 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
832
833 if (SLT == MVT::f32) {
834 // LowerTrig: fmul(1/2pi) + v_sin/v_cos.
835 unsigned NumFullRateOps = ST->hasTrigReducedRange() ? 2 : 1;
836
837 InstructionCost Cost =
838 NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
839 return LT.first * NElts * Cost;
840 }
841
842 break;
843 }
844 case Intrinsic::sqrt: {
845 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);
846 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
847 unsigned NElts =
848 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
849
850 if (SLT == MVT::f32) {
851 unsigned NumFullRateOps = 0;
852
853 if (!ICA.getFlags().approxFunc()) {
854 // lowerFSQRTF32 non-AFN: v_sqrt_f32 + refinement + scale fixup.
855 NumFullRateOps = HasFP32Denormals ? 17 : 16;
856 }
857
858 InstructionCost Cost =
859 NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
860 return LT.first * NElts * Cost;
861 }
862
863 break;
864 }
865 default:
866 break;
867 }
868
869 if (!intrinsicHasPackedVectorBenefit(ID: ICA.getID()))
870 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
871
872 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);
873 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
874 unsigned NElts = LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
875
876 if ((ST->hasVOP3PInsts() &&
877 (SLT == MVT::f16 || SLT == MVT::i16 ||
878 (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
879 (ST->hasPackedFP64Ops() && SLT == MVT::f64) ||
880 (ST->hasPackedU64Ops() && SLT == MVT::i64)) {
881 NElts = (NElts + 1) / 2;
882 } else if (SLT == MVT::f32) {
883 bool HasPk2FP32Op = ST->hasPackedFP32Ops() &&
884 IID != Intrinsic::minimumnum &&
885 IID != Intrinsic::maximumnum;
886 NElts = HasPk2FP32Op ? (NElts + 1) / 2 : NElts;
887 }
888
889 // TODO: Get more refined intrinsic costs?
890 unsigned InstRate = getQuarterRateInstrCost(CostKind);
891
892 switch (ICA.getID()) {
893 case Intrinsic::fma:
894 case Intrinsic::fmuladd:
895 if (SLT == MVT::f64) {
896 InstRate = get64BitInstrCost(CostKind);
897 break;
898 }
899
900 if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
901 InstRate = getFullRateInstrCost();
902 else {
903 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
904 : getQuarterRateInstrCost(CostKind);
905 }
906 break;
907 case Intrinsic::copysign:
908 return NElts * getFullRateInstrCost();
909 case Intrinsic::minimumnum:
910 case Intrinsic::maximumnum: {
    // Instruction + 2 canonicalizes. For cases that need type promotion, the
    // promotion takes the place of the canonicalize.
913 unsigned NumOps = 3;
914 if (const IntrinsicInst *II = ICA.getInst()) {
915 // Directly legal with ieee=0
916 // TODO: Not directly legal with strictfp
917 if (fpenvIEEEMode(I: *II) == KnownIEEEMode::Off)
918 NumOps = 1;
919 }
920
921 unsigned BaseRate =
922 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
923 InstRate = BaseRate * NumOps;
924 break;
925 }
926 case Intrinsic::canonicalize: {
927 InstRate =
928 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
929 break;
930 }
931 case Intrinsic::uadd_sat:
932 case Intrinsic::usub_sat:
933 case Intrinsic::sadd_sat:
934 case Intrinsic::ssub_sat: {
935 if (SLT == MVT::i16 || SLT == MVT::i32)
936 InstRate = getFullRateInstrCost();
937
938 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
939 if (any_of(Range: ValidSatTys, P: equal_to(Arg&: LT.second)))
940 NElts = 1;
941 break;
942 }
943 case Intrinsic::abs:
944 // Expansion takes 2 instructions for VALU
945 if (SLT == MVT::i16 || SLT == MVT::i32)
946 InstRate = 2 * getFullRateInstrCost();
947 break;
948 default:
949 break;
950 }
951
952 return LT.first * NElts * InstRate;
953}
954
955InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
956 TTI::TargetCostKind CostKind,
957 const Instruction *I) const {
958 assert((I == nullptr || I->getOpcode() == Opcode) &&
959 "Opcode should reflect passed instruction.");
960 const bool SCost =
961 (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
962 const int CBrCost = SCost ? 5 : 7;
963 switch (Opcode) {
964 case Instruction::UncondBr:
965 // Branch instruction takes about 4 slots on gfx900.
966 return SCost ? 1 : 4;
967 case Instruction::CondBr:
968 // Suppose conditional branch takes additional 3 exec manipulations
969 // instructions in average.
970 return CBrCost;
971 case Instruction::Switch: {
972 const auto *SI = dyn_cast_or_null<SwitchInst>(Val: I);
973 // Each case (including default) takes 1 cmp + 1 cbr instructions in
974 // average.
975 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
976 }
977 case Instruction::Ret:
978 return SCost ? 1 : 10;
979 }
980 return BaseT::getCFInstrCost(Opcode, CostKind, I);
981}
982
983InstructionCost
984GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
985 std::optional<FastMathFlags> FMF,
986 TTI::TargetCostKind CostKind) const {
987 if (TTI::requiresOrderedReduction(FMF))
988 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
989
990 EVT OrigTy = TLI->getValueType(DL, Ty);
991
992 // Computes cost on targets that have packed math instructions(which support
993 // 16-bit types only).
994 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
995 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
996
997 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
998 return LT.first * getFullRateInstrCost();
999}
1000
1001InstructionCost
1002GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1003 FastMathFlags FMF,
1004 TTI::TargetCostKind CostKind) const {
1005 EVT OrigTy = TLI->getValueType(DL, Ty);
1006
1007 // Computes cost on targets that have packed math instructions(which support
1008 // 16-bit types only).
1009 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
1010 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1011
1012 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1013 return LT.first * getHalfRateInstrCost(CostKind);
1014}
1015
1016InstructionCost GCNTTIImpl::getVectorInstrCost(
1017 unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
1018 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
1019 switch (Opcode) {
1020 case Instruction::ExtractElement:
1021 case Instruction::InsertElement: {
1022 unsigned EltSize
1023 = DL.getTypeSizeInBits(Ty: cast<VectorType>(Val: ValTy)->getElementType());
1024 // Dynamic indexing isn't free and is best avoided.
1025 if (Index == ~0u)
1026 return 2;
1027 if (EltSize < 32) {
1028 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
1029 return 0;
1030 // Extract element sequences of consecutive i8 values that match a
1031 // register size are free most likely. It is not possible to know
1032 // if this extract is part of a consecutive sequence so this may
1033 // apply more generally.
1034 if (Opcode == Instruction::ExtractElement && EltSize == 8) {
1035 if (auto *FVTy = dyn_cast<FixedVectorType>(Val: ValTy)) {
1036 unsigned NumElts = FVTy->getNumElements();
1037 if (NumElts >= 4 && isPowerOf2_32(Value: NumElts))
1038 return 0;
1039 }
1040 }
1041 return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1,
1042 VIC);
1043 }
1044
1045 // Extracts are just reads of a subregister, so are free. Inserts are
1046 // considered free because we don't want to have any cost for scalarizing
1047 // operations, and we don't have to copy into a different register class.
1048 return 0;
1049 }
1050 default:
1051 return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1,
1052 VIC);
1053 }
1054}
1055
1056/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
1057/// this is analyzing the collective result of all output registers. Otherwise,
1058/// this is only querying a specific result index if this returns multiple
1059/// registers in a struct.
1060bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
1061 const CallInst *CI, ArrayRef<unsigned> Indices) const {
1062 // TODO: Handle complex extract indices
1063 if (Indices.size() > 1)
1064 return true;
1065
1066 const DataLayout &DL = CI->getDataLayout();
1067 const SIRegisterInfo *TRI = ST->getRegisterInfo();
1068 TargetLowering::AsmOperandInfoVector TargetConstraints =
1069 TLI->ParseConstraints(DL, TRI: ST->getRegisterInfo(), Call: *CI);
1070
1071 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
1072
1073 int OutputIdx = 0;
1074 for (auto &TC : TargetConstraints) {
1075 if (TC.Type != InlineAsm::isOutput)
1076 continue;
1077
1078 // Skip outputs we don't care about.
1079 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
1080 continue;
1081
1082 TLI->ComputeConstraintToUse(OpInfo&: TC, Op: SDValue());
1083
1084 const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
1085 TRI, Constraint: TC.ConstraintCode, VT: TC.ConstraintVT).second;
1086
1087 // For AGPR constraints null is returned on subtargets without AGPRs, so
1088 // assume divergent for null.
1089 if (!RC || !TRI->isSGPRClass(RC))
1090 return true;
1091 }
1092
1093 return false;
1094}
1095
1096bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
1097 const IntrinsicInst *ReadReg) const {
1098 Metadata *MD =
1099 cast<MetadataAsValue>(Val: ReadReg->getArgOperand(i: 0))->getMetadata();
1100 StringRef RegName =
1101 cast<MDString>(Val: cast<MDNode>(Val: MD)->getOperand(I: 0))->getString();
1102
1103 // Special case registers that look like VCC.
1104 MVT VT = MVT::getVT(Ty: ReadReg->getType());
1105 if (VT == MVT::i1)
1106 return true;
1107
1108 // Special case scalar registers that start with 'v'.
1109 if (RegName.starts_with(Prefix: "vcc") || RegName.empty())
1110 return false;
1111
1112 // VGPR or AGPR is divergent. There aren't any specially named vector
1113 // registers.
1114 return RegName[0] == 'v' || RegName[0] == 'a';
1115}
1116
1117/// \returns true if the result of the value could potentially be
1118/// different across workitems in a wavefront.
1119bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
1120 if (const Argument *A = dyn_cast<Argument>(Val: V))
1121 return !AMDGPU::isArgPassedInSGPR(Arg: A);
1122
1123 // Loads from the private and flat address spaces are divergent, because
1124 // threads can execute the load instruction with the same inputs and get
1125 // different results.
1126 //
1127 // All other loads are not divergent, because if threads issue loads with the
1128 // same arguments, they will always get the same result.
1129 if (const LoadInst *Load = dyn_cast<LoadInst>(Val: V))
1130 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
1131 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
1132
1133 // Atomics are divergent because they are executed sequentially: when an
1134 // atomic operation refers to the same address in each thread, then each
1135 // thread after the first sees the value written by the previous thread as
1136 // original value.
1137 if (isa<AtomicRMWInst, AtomicCmpXchgInst>(Val: V))
1138 return true;
1139
1140 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V)) {
1141 Intrinsic::ID IID = Intrinsic->getIntrinsicID();
1142 switch (IID) {
1143 case Intrinsic::read_register:
1144 return isReadRegisterSourceOfDivergence(ReadReg: Intrinsic);
1145 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1146 unsigned SrcAS =
1147 Intrinsic->getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace();
1148 unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
1149 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
1150 DstAS == AMDGPUAS::FLAT_ADDRESS &&
1151 ST->hasGloballyAddressableScratch();
1152 }
1153 case Intrinsic::amdgcn_workitem_id_y:
1154 case Intrinsic::amdgcn_workitem_id_z: {
1155 const Function *F = Intrinsic->getFunction();
1156 bool HasUniformYZ =
1157 ST->hasWavefrontsEvenlySplittingXDim(F: *F, /*RequitezUniformYZ=*/REquiresUniformYZ: true);
1158 std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
1159 F: *F, Dim: IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
1160 return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
1161 }
1162 default:
1163 return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: IID);
1164 }
1165 }
1166
1167 // Assume all function calls are a source of divergence.
1168 if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
1169 if (CI->isInlineAsm())
1170 return isInlineAsmSourceOfDivergence(CI);
1171 return true;
1172 }
1173
1174 // Assume all function calls are a source of divergence.
1175 if (isa<InvokeInst>(Val: V))
1176 return true;
1177
1178 // If the target supports globally addressable scratch, the mapping from
1179 // scratch memory to the flat aperture changes therefore an address space cast
1180 // is no longer uniform.
1181 if (auto *CastI = dyn_cast<AddrSpaceCastInst>(Val: V)) {
1182 return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
1183 CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
1184 ST->hasGloballyAddressableScratch();
1185 }
1186
1187 return false;
1188}
1189
1190bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
1191 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V))
1192 return AMDGPU::isIntrinsicAlwaysUniform(IntrID: Intrinsic->getIntrinsicID());
1193
1194 if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
1195 if (CI->isInlineAsm())
1196 return !isInlineAsmSourceOfDivergence(CI);
1197 return false;
1198 }
1199
1200 // In most cases TID / wavefrontsize is uniform.
1201 //
1202 // However, if a kernel has uneven dimesions we can have a value of
1203 // workitem-id-x divided by the wavefrontsize non-uniform. For example
1204 // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
1205 // packed into a same wave which gives 1 and 0 after the division by 64
1206 // respectively.
1207 //
1208 // The X dimension doesn't reset within a wave if either both the Y
1209 // and Z dimensions are of length 1, or if the X dimension's required
1210 // size is a power of 2. Note, however, if the X dimension's maximum
1211 // size is a power of 2 < the wavefront size, division by the wavefront
1212 // size is guaranteed to yield 0, so this is also a no-reset case.
1213 bool XDimDoesntResetWithinWaves = false;
1214 if (auto *I = dyn_cast<Instruction>(Val: V)) {
1215 const Function *F = I->getFunction();
1216 XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(F: *F);
1217 }
1218 using namespace llvm::PatternMatch;
1219 uint64_t C;
1220 if (match(V, P: m_LShr(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1221 R: m_ConstantInt(V&: C))) ||
1222 match(V, P: m_AShr(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1223 R: m_ConstantInt(V&: C)))) {
1224 return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
1225 }
1226
1227 Value *Mask;
1228 if (match(V, P: m_c_And(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1229 R: m_Value(V&: Mask)))) {
1230 return computeKnownBits(V: Mask, DL).countMinTrailingZeros() >=
1231 ST->getWavefrontSizeLog2() &&
1232 XDimDoesntResetWithinWaves;
1233 }
1234
1235 const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(Val: V);
1236 if (!ExtValue)
1237 return false;
1238
1239 const CallInst *CI = dyn_cast<CallInst>(Val: ExtValue->getOperand(i_nocapture: 0));
1240 if (!CI)
1241 return false;
1242
1243 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: CI)) {
1244 switch (Intrinsic->getIntrinsicID()) {
1245 default:
1246 return false;
1247 case Intrinsic::amdgcn_if:
1248 case Intrinsic::amdgcn_else: {
1249 ArrayRef<unsigned> Indices = ExtValue->getIndices();
1250 return Indices.size() == 1 && Indices[0] == 1;
1251 }
1252 }
1253 }
1254
1255 // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1256 // divergent for the overall struct return. We need to override it in the
1257 // case we're extracting an SGPR component here.
1258 if (CI->isInlineAsm())
1259 return !isInlineAsmSourceOfDivergence(CI, Indices: ExtValue->getIndices());
1260
1261 return false;
1262}
1263
1264bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1265 Intrinsic::ID IID) const {
1266 switch (IID) {
1267 case Intrinsic::amdgcn_is_shared:
1268 case Intrinsic::amdgcn_is_private:
1269 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1270 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1271 case Intrinsic::amdgcn_load_to_lds:
1272 case Intrinsic::amdgcn_make_buffer_rsrc:
1273 OpIndexes.push_back(Elt: 0);
1274 return true;
1275 default:
1276 return false;
1277 }
1278}
1279
1280Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
1281 Value *OldV,
1282 Value *NewV) const {
1283 auto IntrID = II->getIntrinsicID();
1284 switch (IntrID) {
1285 case Intrinsic::amdgcn_is_shared:
1286 case Intrinsic::amdgcn_is_private: {
1287 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1288 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
1289 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1290 LLVMContext &Ctx = NewV->getType()->getContext();
1291 ConstantInt *NewVal = (TrueAS == NewAS) ?
1292 ConstantInt::getTrue(Context&: Ctx) : ConstantInt::getFalse(Context&: Ctx);
1293 return NewVal;
1294 }
1295 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1296 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1297 Type *DestTy = II->getType();
1298 Type *SrcTy = NewV->getType();
1299 unsigned NewAS = SrcTy->getPointerAddressSpace();
1300 if (!AMDGPU::isExtendedGlobalAddrSpace(AS: NewAS))
1301 return nullptr;
1302 Module *M = II->getModule();
1303 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1304 M, id: II->getIntrinsicID(), OverloadTys: {DestTy, SrcTy, DestTy});
1305 II->setArgOperand(i: 0, v: NewV);
1306 II->setCalledFunction(NewDecl);
1307 return II;
1308 }
1309 case Intrinsic::amdgcn_load_to_lds: {
1310 Type *SrcTy = NewV->getType();
1311 Module *M = II->getModule();
1312 Function *NewDecl =
1313 Intrinsic::getOrInsertDeclaration(M, id: II->getIntrinsicID(), OverloadTys: {SrcTy});
1314 II->setArgOperand(i: 0, v: NewV);
1315 II->setCalledFunction(NewDecl);
1316 return II;
1317 }
1318 case Intrinsic::amdgcn_make_buffer_rsrc: {
1319 Type *SrcTy = NewV->getType();
1320 Type *DstTy = II->getType();
1321 Module *M = II->getModule();
1322 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1323 M, id: II->getIntrinsicID(), OverloadTys: {DstTy, SrcTy});
1324 II->setArgOperand(i: 0, v: NewV);
1325 II->setCalledFunction(NewDecl);
1326 return II;
1327 }
1328 default:
1329 return nullptr;
1330 }
1331}
1332
1333InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1334 VectorType *DstTy, VectorType *SrcTy,
1335 ArrayRef<int> Mask,
1336 TTI::TargetCostKind CostKind,
1337 int Index, VectorType *SubTp,
1338 ArrayRef<const Value *> Args,
1339 const Instruction *CxtI) const {
1340 if (!isa<FixedVectorType>(Val: SrcTy))
1341 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1342 SubTp);
1343
1344 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp);
1345
1346 unsigned ScalarSize = DL.getTypeSizeInBits(Ty: SrcTy->getElementType());
1347 if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1348 (ScalarSize == 16 || ScalarSize == 8)) {
1349 // Larger vector widths may require additional instructions, but are
1350 // typically cheaper than scalarized versions.
1351 //
1352 // We assume that shuffling at a register granularity can be done for free.
1353 // This is not true for vectors fed into memory instructions, but it is
1354 // effectively true for all other shuffling. The emphasis of the logic here
1355 // is to assist generic transform in cleaning up / canonicalizing those
1356 // shuffles.
1357
1358 // With op_sel VOP3P instructions freely can access the low half or high
1359 // half of a register, so any swizzle of two elements is free.
1360 if (auto *SrcVecTy = dyn_cast<FixedVectorType>(Val: SrcTy)) {
1361 unsigned NumSrcElts = SrcVecTy->getNumElements();
1362 if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
1363 (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse ||
1364 Kind == TTI::SK_PermuteSingleSrc))
1365 return 0;
1366 }
1367
1368 unsigned EltsPerReg = 32 / ScalarSize;
1369 switch (Kind) {
1370 case TTI::SK_Broadcast:
1371 // A single v_perm_b32 can be re-used for all destination registers.
1372 return 1;
1373 case TTI::SK_Reverse:
1374 // One instruction per register.
1375 if (auto *DstVecTy = dyn_cast<FixedVectorType>(Val: DstTy))
1376 return divideCeil(Numerator: DstVecTy->getNumElements(), Denominator: EltsPerReg);
1377 return InstructionCost::getInvalid();
1378 case TTI::SK_ExtractSubvector:
1379 if (Index % EltsPerReg == 0)
1380 return 0; // Shuffling at register granularity
1381 if (auto *DstVecTy = dyn_cast<FixedVectorType>(Val: DstTy))
1382 return divideCeil(Numerator: DstVecTy->getNumElements(), Denominator: EltsPerReg);
1383 return InstructionCost::getInvalid();
1384 case TTI::SK_InsertSubvector: {
1385 auto *DstVecTy = dyn_cast<FixedVectorType>(Val: DstTy);
1386 if (!DstVecTy)
1387 return InstructionCost::getInvalid();
1388 unsigned NumDstElts = DstVecTy->getNumElements();
1389 unsigned NumInsertElts = cast<FixedVectorType>(Val: SubTp)->getNumElements();
1390 unsigned EndIndex = Index + NumInsertElts;
1391 unsigned BeginSubIdx = Index % EltsPerReg;
1392 unsigned EndSubIdx = EndIndex % EltsPerReg;
1393 unsigned Cost = 0;
1394
1395 if (BeginSubIdx != 0) {
1396 // Need to shift the inserted vector into place. The cost is the number
1397 // of destination registers overlapped by the inserted vector.
1398 Cost = divideCeil(Numerator: EndIndex, Denominator: EltsPerReg) - (Index / EltsPerReg);
1399 }
1400
1401 // If the last register overlap is partial, there may be three source
1402 // registers feeding into it; that takes an extra instruction.
1403 if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
1404 Cost += 1;
1405
1406 return Cost;
1407 }
1408 case TTI::SK_Splice: {
1409 auto *DstVecTy = dyn_cast<FixedVectorType>(Val: DstTy);
1410 if (!DstVecTy)
1411 return InstructionCost::getInvalid();
1412 unsigned NumElts = DstVecTy->getNumElements();
1413 assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements());
1414 // Determine the sub-region of the result vector that requires
1415 // sub-register shuffles / mixing.
1416 unsigned EltsFromLHS = NumElts - Index;
1417 bool LHSIsAligned = (Index % EltsPerReg) == 0;
1418 bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
1419 if (LHSIsAligned && RHSIsAligned)
1420 return 0;
1421 if (LHSIsAligned && !RHSIsAligned)
1422 return divideCeil(Numerator: NumElts, Denominator: EltsPerReg) - (EltsFromLHS / EltsPerReg);
1423 if (!LHSIsAligned && RHSIsAligned)
1424 return divideCeil(Numerator: EltsFromLHS, Denominator: EltsPerReg);
1425 return divideCeil(Numerator: NumElts, Denominator: EltsPerReg);
1426 }
1427 default:
1428 break;
1429 }
1430
1431 if (!Mask.empty()) {
1432 unsigned NumSrcElts = cast<FixedVectorType>(Val: SrcTy)->getNumElements();
1433
1434 // Generically estimate the cost by assuming that each destination
1435 // register is derived from sources via v_perm_b32 instructions if it
1436 // can't be copied as-is.
1437 //
1438 // For each destination register, derive the cost of obtaining it based
1439 // on the number of source registers that feed into it.
1440 unsigned Cost = 0;
1441 for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
1442 SmallVector<int, 4> Regs;
1443 bool Aligned = true;
1444 for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
1445 int SrcIdx = Mask[DstIdx + I];
1446 if (SrcIdx == -1)
1447 continue;
1448 int Reg;
1449 if (SrcIdx < (int)NumSrcElts) {
1450 Reg = SrcIdx / EltsPerReg;
1451 if (SrcIdx % EltsPerReg != I)
1452 Aligned = false;
1453 } else {
1454 Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
1455 if ((SrcIdx - NumSrcElts) % EltsPerReg != I)
1456 Aligned = false;
1457 }
1458 if (!llvm::is_contained(Range&: Regs, Element: Reg))
1459 Regs.push_back(Elt: Reg);
1460 }
1461 if (Regs.size() >= 2)
1462 Cost += Regs.size() - 1;
1463 else if (!Aligned)
1464 Cost += 1;
1465 }
1466 return Cost;
1467 }
1468 }
1469
1470 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1471 SubTp);
1472}
1473
1474/// Whether it is profitable to sink the operands of an
1475/// Instruction I to the basic block of I.
1476/// This helps using several modifiers (like abs and neg) more often.
1477bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
1478 SmallVectorImpl<Use *> &Ops) const {
1479 using namespace PatternMatch;
1480
1481 for (auto &Op : I->operands()) {
1482 // Ensure we are not already sinking this operand.
1483 if (any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op.get(); }))
1484 continue;
1485
1486 if (match(V: &Op, P: m_FAbs(Op0: m_Value())) || match(V: &Op, P: m_FNeg(X: m_Value()))) {
1487 Ops.push_back(Elt: &Op);
1488 continue;
1489 }
1490
1491 // Check for zero-cost multiple use InsertElement/ExtractElement
1492 // instructions
1493 if (Instruction *OpInst = dyn_cast<Instruction>(Val: Op.get())) {
1494 if (OpInst->getType()->isVectorTy() && OpInst->getNumOperands() > 1) {
1495 Instruction *VecOpInst = dyn_cast<Instruction>(Val: OpInst->getOperand(i: 0));
1496 if (VecOpInst && VecOpInst->hasOneUse())
1497 continue;
1498
1499 if (getVectorInstrCost(Opcode: OpInst->getOpcode(), ValTy: OpInst->getType(),
1500 CostKind: TTI::TCK_RecipThroughput, Index: 0,
1501 Op0: OpInst->getOperand(i: 0),
1502 Op1: OpInst->getOperand(i: 1)) == 0) {
1503 Ops.push_back(Elt: &Op);
1504 continue;
1505 }
1506 }
1507 }
1508
1509 if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Val: Op.get())) {
1510
1511 unsigned EltSize = DL.getTypeSizeInBits(
1512 Ty: cast<VectorType>(Val: Shuffle->getType())->getElementType());
1513
1514 // For i32 (or greater) shufflevectors, these will be lowered into a
1515 // series of insert / extract elements, which will be coalesced away.
1516 if (EltSize < 16 || !ST->has16BitInsts())
1517 continue;
1518
1519 int NumSubElts, SubIndex;
1520 if (Shuffle->changesLength()) {
1521 if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
1522 Ops.push_back(Elt: &Op);
1523 continue;
1524 }
1525
1526 if ((Shuffle->isExtractSubvectorMask(Index&: SubIndex) ||
1527 Shuffle->isInsertSubvectorMask(NumSubElts, Index&: SubIndex)) &&
1528 !(SubIndex & 0x1)) {
1529 Ops.push_back(Elt: &Op);
1530 continue;
1531 }
1532 }
1533
1534 if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
1535 Shuffle->isSingleSource()) {
1536 Ops.push_back(Elt: &Op);
1537 continue;
1538 }
1539 }
1540 }
1541
1542 return !Ops.empty();
1543}
1544
1545bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1546 const Function *Callee) const {
1547 const TargetMachine &TM = getTLI()->getTargetMachine();
1548 const GCNSubtarget *CallerST
1549 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1550 const GCNSubtarget *CalleeST
1551 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1552
1553 if (!BaseT::areInlineCompatible(Caller, Callee))
1554 return false;
1555
1556 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1557 // no way to support merge for backend defined attributes.
1558 SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
1559 SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
1560 if (!CallerMode.isInlineCompatible(CalleeMode))
1561 return false;
1562
1563 if (Callee->hasFnAttribute(Kind: Attribute::AlwaysInline) ||
1564 Callee->hasFnAttribute(Kind: Attribute::InlineHint))
1565 return true;
1566
1567 // Hack to make compile times reasonable.
1568 if (InlineMaxBB) {
1569 // Single BB does not increase total BB amount.
1570 if (Callee->size() == 1)
1571 return true;
1572 size_t BBSize = Caller->size() + Callee->size() - 1;
1573 return BBSize <= InlineMaxBB;
1574 }
1575
1576 return true;
1577}
1578
1579static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
1580 const SITargetLowering *TLI,
1581 const GCNTTIImpl *TTIImpl) {
1582 const int NrOfSGPRUntilSpill = 26;
1583 const int NrOfVGPRUntilSpill = 32;
1584
1585 const DataLayout &DL = TTIImpl->getDataLayout();
1586
1587 unsigned adjustThreshold = 0;
1588 int SGPRsInUse = 0;
1589 int VGPRsInUse = 0;
1590 for (const Use &A : CB->args()) {
1591 SmallVector<EVT, 4> ValueVTs;
1592 ComputeValueVTs(TLI: *TLI, DL, Ty: A.get()->getType(), ValueVTs);
1593 for (auto ArgVT : ValueVTs) {
1594 unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1595 Context&: CB->getContext(), CC: CB->getCallingConv(), VT: ArgVT);
1596 if (AMDGPU::isArgPassedInSGPR(CB, ArgNo: CB->getArgOperandNo(U: &A)))
1597 SGPRsInUse += CCRegNum;
1598 else
1599 VGPRsInUse += CCRegNum;
1600 }
1601 }
1602
1603 // The cost of passing function arguments through the stack:
1604 // 1 instruction to put a function argument on the stack in the caller.
1605 // 1 instruction to take a function argument from the stack in callee.
1606 // 1 instruction is explicitly take care of data dependencies in callee
1607 // function.
1608 InstructionCost ArgStackCost(1);
1609 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1610 Opcode: Instruction::Store, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align(4),
1611 AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency);
1612 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1613 Opcode: Instruction::Load, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align(4),
1614 AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency);
1615
1616 // The penalty cost is computed relative to the cost of instructions and does
1617 // not model any storage costs.
1618 adjustThreshold += std::max(a: 0, b: SGPRsInUse - NrOfSGPRUntilSpill) *
1619 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1620 adjustThreshold += std::max(a: 0, b: VGPRsInUse - NrOfVGPRUntilSpill) *
1621 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1622 return adjustThreshold;
1623}
1624
1625static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1626 const DataLayout &DL) {
1627 // If we have a pointer to a private array passed into a function
1628 // it will not be optimized out, leaving scratch usage.
1629 // This function calculates the total size in bytes of the memory that would
1630 // end in scratch if the call was not inlined.
1631 unsigned AllocaSize = 0;
1632 SmallPtrSet<const AllocaInst *, 8> AIVisited;
1633 for (Value *PtrArg : CB->args()) {
1634 PointerType *Ty = dyn_cast<PointerType>(Val: PtrArg->getType());
1635 if (!Ty)
1636 continue;
1637
1638 unsigned AddrSpace = Ty->getAddressSpace();
1639 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1640 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1641 continue;
1642
1643 const AllocaInst *AI = dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: PtrArg));
1644 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(Ptr: AI).second)
1645 continue;
1646
1647 if (auto Size = AI->getAllocationSize(DL))
1648 AllocaSize += Size->getFixedValue();
1649 }
1650 return AllocaSize;
1651}
1652
1653int GCNTTIImpl::getInliningLastCallToStaticBonus() const {
1654 return BaseT::getInliningLastCallToStaticBonus() *
1655 getInliningThresholdMultiplier();
1656}
1657
1658unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1659 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, TTIImpl: this);
1660
1661 // Private object passed as arguments may end up in scratch usage if the call
1662 // is not inlined. Increase the inline threshold to promote inlining.
1663 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1664 if (AllocaSize > 0)
1665 Threshold += ArgAllocaCost;
1666 return Threshold;
1667}
1668
1669unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
1670 const AllocaInst *AI) const {
1671
1672 // Below the cutoff, assume that the private memory objects would be
1673 // optimized
1674 auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1675 if (AllocaSize <= ArgAllocaCutoff)
1676 return 0;
1677
1678 // Above the cutoff, we give a cost to each private memory object
1679 // depending its size. If the array can be optimized by SROA this cost is not
1680 // added to the total-cost in the inliner cost analysis.
1681 //
1682 // We choose the total cost of the alloca such that their sum cancels the
1683 // bonus given in the threshold (ArgAllocaCost).
1684 //
1685 // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1686 //
1687 // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1688 // the single-bb bonus and the vector-bonus.
1689 //
1690 // We compensate the first two multipliers, by repeating logic from the
1691 // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1692 static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
1693 unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1694
1695 bool SingleBB = none_of(Range&: *CB->getCalledFunction(), P: [](const BasicBlock &BB) {
1696 return BB.getTerminator()->getNumSuccessors() > 1;
1697 });
1698 if (SingleBB) {
1699 Threshold += Threshold / 2;
1700 }
1701
1702 auto ArgAllocaSize = AI->getAllocationSize(DL);
1703 if (!ArgAllocaSize)
1704 return 0;
1705
1706 // Attribute the bonus proportionally to the alloca size
1707 unsigned AllocaThresholdBonus =
1708 (Threshold * ArgAllocaSize->getFixedValue()) / AllocaSize;
1709
1710 return AllocaThresholdBonus;
1711}
1712
1713void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1714 TTI::UnrollingPreferences &UP,
1715 OptimizationRemarkEmitter *ORE) const {
1716 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1717}
1718
1719void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1720 TTI::PeelingPreferences &PP) const {
1721 CommonTTI.getPeelingPreferences(L, SE, PP);
1722}
1723
1724int GCNTTIImpl::getTransInstrCost(TTI::TargetCostKind CostKind) const {
1725 return getQuarterRateInstrCost(CostKind);
1726}
1727
1728int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1729 return ST->hasFullRate64Ops()
1730 ? getFullRateInstrCost()
1731 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1732 : getQuarterRateInstrCost(CostKind);
1733}
1734
1735std::pair<InstructionCost, MVT>
1736GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1737 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1738 auto Size = DL.getTypeSizeInBits(Ty);
1739 // Maximum load or store can handle 8 dwords for scalar and 4 for
1740 // vector ALU. Let's assume anything above 8 dwords is expensive
1741 // even if legal.
1742 if (Size <= 256)
1743 return Cost;
1744
1745 Cost.first += (Size + 255) / 256;
1746 return Cost;
1747}
1748
1749unsigned GCNTTIImpl::getPrefetchDistance() const {
1750 return ST->hasPrefetch() ? 128 : 0;
1751}
1752
1753bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
1754 return AMDGPU::isFlatGlobalAddrSpace(AS);
1755}
1756
1757void GCNTTIImpl::collectKernelLaunchBounds(
1758 const Function &F,
1759 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
1760 SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
1761 LB.push_back(Elt: {"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1762 LB.push_back(Elt: {"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1763 LB.push_back(Elt: {"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1764 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1765 ST->getFlatWorkGroupSizes(F);
1766 LB.push_back(Elt: {"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1767 LB.push_back(Elt: {"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1768 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
1769 LB.push_back(Elt: {"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1770 LB.push_back(Elt: {"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1771}
1772
1773GCNTTIImpl::KnownIEEEMode
1774GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
1775 if (!ST->hasFeature(Feature: AMDGPU::FeatureDX10ClampAndIEEEMode))
1776 return KnownIEEEMode::On; // Only mode on gfx1170+
1777
1778 const Function *F = I.getFunction();
1779 if (!F)
1780 return KnownIEEEMode::Unknown;
1781
1782 Attribute IEEEAttr = F->getFnAttribute(Kind: "amdgpu-ieee");
1783 if (IEEEAttr.isValid())
1784 return IEEEAttr.getValueAsBool() ? KnownIEEEMode::On : KnownIEEEMode::Off;
1785
1786 return AMDGPU::isShader(CC: F->getCallingConv()) ? KnownIEEEMode::Off
1787 : KnownIEEEMode::On;
1788}
1789
1790InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1791 Align Alignment,
1792 unsigned AddressSpace,
1793 TTI::TargetCostKind CostKind,
1794 TTI::OperandValueInfo OpInfo,
1795 const Instruction *I) const {
1796 if (VectorType *VecTy = dyn_cast<VectorType>(Val: Src)) {
1797 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1798 CostKind != TTI::TCK_Latency &&
1799 VecTy->getElementType()->isIntegerTy(BitWidth: 8)) {
1800 return divideCeil(Numerator: DL.getTypeSizeInBits(Ty: VecTy) - 1,
1801 Denominator: getLoadStoreVecRegBitWidth(AddrSpace: AddressSpace));
1802 }
1803 }
1804 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1805 OpInfo, I);
1806}
1807
1808unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
1809 if (VectorType *VecTy = dyn_cast<VectorType>(Val: Tp)) {
1810 if (VecTy->getElementType()->isIntegerTy(BitWidth: 8)) {
1811 unsigned ElementCount = VecTy->getElementCount().getFixedValue();
1812 return divideCeil(Numerator: ElementCount - 1, Denominator: 4);
1813 }
1814 }
1815 return BaseT::getNumberOfParts(Tp);
1816}
1817
1818ValueUniformity GCNTTIImpl::getValueUniformity(const Value *V) const {
1819 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V)) {
1820 switch (Intrinsic->getIntrinsicID()) {
1821 case Intrinsic::amdgcn_wave_shuffle:
1822 return ValueUniformity::Custom;
1823 default:
1824 break;
1825 }
1826 }
1827
1828 if (isAlwaysUniform(V))
1829 return ValueUniformity::AlwaysUniform;
1830
1831 if (isSourceOfDivergence(V))
1832 return ValueUniformity::NeverUniform;
1833
1834 return ValueUniformity::Default;
1835}
1836
1837InstructionCost GCNTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
1838 StackOffset BaseOffset,
1839 bool HasBaseReg, int64_t Scale,
1840 unsigned AddrSpace) const {
1841 if (HasBaseReg && Scale != 0) {
1842 // gfx1250+ can fold base+scale*index when scale matches the memory access
1843 // size (scale_offset bit). Supported for flat/global/constant/scratch
1844 // (VMEM, max 128 bits) and constant_32bit (SMRD, capped to 128 bits here).
1845 if (getST()->hasScaleOffset() && Ty && Ty->isSized() &&
1846 (AMDGPU::isExtendedGlobalAddrSpace(AS: AddrSpace) ||
1847 AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
1848 AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)) {
1849 TypeSize StoreSize = getDataLayout().getTypeStoreSize(Ty);
1850 if (TypeSize::isKnownLE(LHS: StoreSize, RHS: TypeSize::getFixed(ExactSize: 16)) &&
1851 static_cast<int64_t>(StoreSize.getFixedValue()) == Scale)
1852 return 0;
1853 }
1854 return 1;
1855 }
1856 return BaseT::getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg, Scale,
1857 AddrSpace);
1858}
1859
1860bool GCNTTIImpl::isLSRCostLess(const TTI::LSRCost &A,
1861 const TTI::LSRCost &B) const {
1862 // Favor lower per-iteration work over preheader/setup costs.
1863 // AMDGPU lacks rich addressing modes, so ScaleCost is folded into the
1864 // effective instruction count (base+scale*index requires a separate ADD).
1865 unsigned EffInsnsA = A.Insns + A.ScaleCost;
1866 unsigned EffInsnsB = B.Insns + B.ScaleCost;
1867
1868 return std::tie(args&: EffInsnsA, args: A.NumIVMuls, args: A.AddRecCost, args: A.NumBaseAdds,
1869 args: A.SetupCost, args: A.ImmCost, args: A.NumRegs) <
1870 std::tie(args&: EffInsnsB, args: B.NumIVMuls, args: B.AddRecCost, args: B.NumBaseAdds,
1871 args: B.SetupCost, args: B.ImmCost, args: B.NumRegs);
1872}
1873
1874bool GCNTTIImpl::isNumRegsMajorCostOfLSR() const {
1875 // isLSRCostLess de-prioritizes register count; keep consistent.
1876 return false;
1877}
1878
1879bool GCNTTIImpl::shouldDropLSRSolutionIfLessProfitable() const {
1880 // Prefer the baseline when LSR cannot clearly reduce per-iteration work.
1881 return true;
1882}
1883
1884bool GCNTTIImpl::isUniform(const Instruction *I,
1885 const SmallBitVector &UniformArgs) const {
1886 const IntrinsicInst *Intrinsic = cast<IntrinsicInst>(Val: I);
1887 switch (Intrinsic->getIntrinsicID()) {
1888 case Intrinsic::amdgcn_wave_shuffle:
1889 // wave_shuffle(Value, Index): result is uniform when either Value or Index
1890 // is uniform.
1891 return UniformArgs[0] || UniformArgs[1];
1892 default:
1893 llvm_unreachable("unexpected intrinsic in isUniform");
1894 }
1895}
1896