1//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUTargetTransformInfo.h"
18#include "AMDGPUSubtarget.h"
19#include "AMDGPUTargetMachine.h"
20#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21#include "SIModeRegisterDefaults.h"
22#include "llvm/ADT/SmallBitVector.h"
23#include "llvm/Analysis/InlineCost.h"
24#include "llvm/Analysis/LoopInfo.h"
25#include "llvm/Analysis/ValueTracking.h"
26#include "llvm/CodeGen/Analysis.h"
27#include "llvm/IR/Function.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include "llvm/IR/PatternMatch.h"
31#include "llvm/Support/KnownBits.h"
32#include "llvm/Transforms/Utils/UnrollLoop.h"
33#include <optional>
34
35using namespace llvm;
36
37#define DEBUG_TYPE "AMDGPUtti"
38
39static cl::opt<unsigned> UnrollThresholdPrivate(
40 "amdgpu-unroll-threshold-private",
41 cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
42 cl::init(Val: 2700), cl::Hidden);
43
44static cl::opt<unsigned> UnrollThresholdLocal(
45 "amdgpu-unroll-threshold-local",
46 cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
47 cl::init(Val: 1000), cl::Hidden);
48
49static cl::opt<unsigned> UnrollThresholdIf(
50 "amdgpu-unroll-threshold-if",
51 cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
52 cl::init(Val: 200), cl::Hidden);
53
54static cl::opt<bool> UnrollRuntimeLocal(
55 "amdgpu-unroll-runtime-local",
56 cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
57 cl::init(Val: true), cl::Hidden);
58
59static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
60 "amdgpu-unroll-max-block-to-analyze",
61 cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
62 cl::init(Val: 32), cl::Hidden);
63
64static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
65 cl::Hidden, cl::init(Val: 4000),
66 cl::desc("Cost of alloca argument"));
67
68// If the amount of scratch memory to eliminate exceeds our ability to allocate
69// it into registers we gain nothing by aggressively inlining functions for that
70// heuristic.
71static cl::opt<unsigned>
72 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
73 cl::init(Val: 256),
74 cl::desc("Maximum alloca size to use for inline cost"));
75
76// Inliner constraint to achieve reasonable compilation time.
77static cl::opt<size_t> InlineMaxBB(
78 "amdgpu-inline-max-bb", cl::Hidden, cl::init(Val: 1100),
79 cl::desc("Maximum number of BBs allowed in a function after inlining"
80 " (compile time constraint)"));
81
82// This default unroll factor is based on microbenchmarks on gfx1030.
83static cl::opt<unsigned> MemcpyLoopUnroll(
84 "amdgpu-memcpy-loop-unroll",
85 cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
86 "operations when lowering statically-sized memcpy, memmove, or"
87 "memset as a loop"),
88 cl::init(Val: 16), cl::Hidden);
89
90static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
91 unsigned Depth = 0) {
92 const Instruction *I = dyn_cast<Instruction>(Val: Cond);
93 if (!I)
94 return false;
95
96 for (const Value *V : I->operand_values()) {
97 if (!L->contains(Inst: I))
98 continue;
99 if (const PHINode *PHI = dyn_cast<PHINode>(Val: V)) {
100 if (llvm::none_of(Range: L->getSubLoops(), P: [PHI](const Loop* SubLoop) {
101 return SubLoop->contains(Inst: PHI); }))
102 return true;
103 } else if (Depth < 10 && dependsOnLocalPhi(L, Cond: V, Depth: Depth+1))
104 return true;
105 }
106 return false;
107}
108
// Construct the common AMDGPU TTI implementation for function \p F, caching
// the target triple, subtarget, and lowering info used by the queries below.
// NOTE(review): the static_cast assumes getSubtargetImpl(F) always yields a
// GCNSubtarget for this target machine — confirm against the TM definition.
AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getDataLayout()),
      TargetTriple(TM->getTargetTriple()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()) {}
114
115void AMDGPUTTIImpl::getUnrollingPreferences(
116 Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
117 OptimizationRemarkEmitter *ORE) const {
118 const Function &F = *L->getHeader()->getParent();
119 UP.Threshold =
120 F.getFnAttributeAsParsedInteger(Kind: "amdgpu-unroll-threshold", Default: 300);
121 UP.MaxCount = std::numeric_limits<unsigned>::max();
122 UP.Partial = true;
123
124 // Conditional branch in a loop back edge needs 3 additional exec
125 // manipulations in average.
126 UP.BEInsns += 3;
127
128 // We want to run unroll even for the loops which have been vectorized.
129 UP.UnrollVectorizedLoop = true;
130
131 // TODO: Do we want runtime unrolling?
132
133 // Maximum alloca size than can fit registers. Reserve 16 registers.
134 const unsigned MaxAlloca = (256 - 16) * 4;
135 unsigned ThresholdPrivate = UnrollThresholdPrivate;
136 unsigned ThresholdLocal = UnrollThresholdLocal;
137
138 // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
139 // provided threshold value as the default for Threshold
140 if (MDNode *LoopUnrollThreshold =
141 findOptionMDForLoop(TheLoop: L, Name: "amdgpu.loop.unroll.threshold")) {
142 if (LoopUnrollThreshold->getNumOperands() == 2) {
143 ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
144 MD: LoopUnrollThreshold->getOperand(I: 1));
145 if (MetaThresholdValue) {
146 // We will also use the supplied value for PartialThreshold for now.
147 // We may introduce additional metadata if it becomes necessary in the
148 // future.
149 UP.Threshold = MetaThresholdValue->getSExtValue();
150 UP.PartialThreshold = UP.Threshold;
151 ThresholdPrivate = std::min(a: ThresholdPrivate, b: UP.Threshold);
152 ThresholdLocal = std::min(a: ThresholdLocal, b: UP.Threshold);
153 }
154 }
155 }
156
157 unsigned MaxBoost = std::max(a: ThresholdPrivate, b: ThresholdLocal);
158 for (const BasicBlock *BB : L->getBlocks()) {
159 const DataLayout &DL = BB->getDataLayout();
160 unsigned LocalGEPsSeen = 0;
161
162 if (llvm::any_of(Range: L->getSubLoops(), P: [BB](const Loop* SubLoop) {
163 return SubLoop->contains(BB); }))
164 continue; // Block belongs to an inner loop.
165
166 for (const Instruction &I : *BB) {
167 // Unroll a loop which contains an "if" statement whose condition
168 // defined by a PHI belonging to the loop. This may help to eliminate
169 // if region and potentially even PHI itself, saving on both divergence
170 // and registers used for the PHI.
171 // Add a small bonus for each of such "if" statements.
172 if (const CondBrInst *Br = dyn_cast<CondBrInst>(Val: &I)) {
173 if (UP.Threshold < MaxBoost) {
174 BasicBlock *Succ0 = Br->getSuccessor(i: 0);
175 BasicBlock *Succ1 = Br->getSuccessor(i: 1);
176 if ((L->contains(BB: Succ0) && L->isLoopExiting(BB: Succ0)) ||
177 (L->contains(BB: Succ1) && L->isLoopExiting(BB: Succ1)))
178 continue;
179 if (dependsOnLocalPhi(L, Cond: Br->getCondition())) {
180 UP.Threshold += UnrollThresholdIf;
181 LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
182 << " for loop:\n"
183 << *L << " due to " << *Br << '\n');
184 if (UP.Threshold >= MaxBoost)
185 return;
186 }
187 }
188 continue;
189 }
190
191 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: &I);
192 if (!GEP)
193 continue;
194
195 unsigned AS = GEP->getAddressSpace();
196 unsigned Threshold = 0;
197 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
198 Threshold = ThresholdPrivate;
199 else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
200 Threshold = ThresholdLocal;
201 else
202 continue;
203
204 if (UP.Threshold >= Threshold)
205 continue;
206
207 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
208 const Value *Ptr = GEP->getPointerOperand();
209 const AllocaInst *Alloca =
210 dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: Ptr));
211 if (!Alloca || !Alloca->isStaticAlloca())
212 continue;
213 auto AllocaSize = Alloca->getAllocationSize(DL);
214 if (!AllocaSize || AllocaSize->getFixedValue() > MaxAlloca)
215 continue;
216 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
217 AS == AMDGPUAS::REGION_ADDRESS) {
218 LocalGEPsSeen++;
219 // Inhibit unroll for local memory if we have seen addressing not to
220 // a variable, most likely we will be unable to combine it.
221 // Do not unroll too deep inner loops for local memory to give a chance
222 // to unroll an outer loop for a more important reason.
223 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
224 (!isa<GlobalVariable>(Val: GEP->getPointerOperand()) &&
225 !isa<Argument>(Val: GEP->getPointerOperand())))
226 continue;
227 LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
228 << *L << " due to LDS use.\n");
229 UP.Runtime = UnrollRuntimeLocal;
230 }
231
232 // Check if GEP depends on a value defined by this loop itself.
233 bool HasLoopDef = false;
234 for (const Value *Op : GEP->operands()) {
235 const Instruction *Inst = dyn_cast<Instruction>(Val: Op);
236 if (!Inst || L->isLoopInvariant(V: Op))
237 continue;
238
239 if (llvm::any_of(Range: L->getSubLoops(), P: [Inst](const Loop* SubLoop) {
240 return SubLoop->contains(Inst); }))
241 continue;
242 HasLoopDef = true;
243 break;
244 }
245 if (!HasLoopDef)
246 continue;
247
248 // We want to do whatever we can to limit the number of alloca
249 // instructions that make it through to the code generator. allocas
250 // require us to use indirect addressing, which is slow and prone to
251 // compiler bugs. If this loop does an address calculation on an
252 // alloca ptr, then we want to use a higher than normal loop unroll
253 // threshold. This will give SROA a better chance to eliminate these
254 // allocas.
255 //
256 // We also want to have more unrolling for local memory to let ds
257 // instructions with different offsets combine.
258 //
259 // Don't use the maximum allowed value here as it will make some
260 // programs way too big.
261 UP.Threshold = Threshold;
262 LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
263 << " for loop:\n"
264 << *L << " due to " << *GEP << '\n');
265 if (UP.Threshold >= MaxBoost)
266 return;
267 }
268
269 // If we got a GEP in a small BB from inner loop then increase max trip
270 // count to analyze for better estimation cost in unroll
271 if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
272 UP.MaxIterationsCountToAnalyze = 32;
273 }
274 // If a user provided an explicit unroll pragma (with or without count),
275 // override expensive trip count checks
276 UnrollPragmaInfo PInfo(L);
277 if (PInfo.PragmaEnableUnroll || PInfo.PragmaCount > 0)
278 UP.AllowExpensiveTripCount = true;
279}
280
// No AMDGPU-specific loop-peeling tuning; defer entirely to the generic
// implementation.
void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) const {
  BaseT::getPeelingPreferences(L, SE, PP);
}
285
286uint64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
287 return 1024;
288}
289
// Subtarget features whose mismatch between caller and callee should not
// block inlining — presumably consulted by the inline-compatibility check;
// confirm at the use site.
const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
    AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureUseFlatForGlobal,
    AMDGPU::FeatureUnalignedScratchAccess, AMDGPU::FeatureUnalignedAccessMode,

    AMDGPU::FeatureAutoWaitcntBeforeBarrier,

    // Property of the kernel/environment which can't actually differ.
    AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
    AMDGPU::FeatureTrapHandler,

    // The default assumption needs to be ecc is enabled, but no directly
    // exposed operations depend on it, so it can be safely inlined.
    AMDGPU::FeatureSRAMECC,

    // Perf-tuning features
    AMDGPU::FeatureFastFMAF32, AMDGPU::FeatureHalfRate64Ops};
308
// Construct the GCN TTI implementation for function \p F: caches the
// subtarget, its lowering info, the common AMDGPU TTI, and whether F uses a
// graphics calling convention. Also snapshots the FP denormal modes, which
// the FP cost modelling below consults.
GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getDataLayout()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()), CommonTTI(TM, F),
      IsGraphics(AMDGPU::isGraphics(CC: F.getCallingConv())) {
  // Any mode other than preserve-sign counts as "denormals enabled".
  SIModeRegisterDefaults Mode(F, *ST);
  HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
  HasFP64FP16Denormals =
      Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
}
319
320bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
321 return !F || !ST->isSingleLaneExecution(Kernel: *F);
322}
323
324unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
325 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
326 // registers. See getRegisterClassForType for the implementation.
327 // In this case vector registers are not vector in terms of
328 // VGPRs, but those which can hold multiple values.
329
330 // This is really the number of registers to fill when vectorizing /
331 // interleaving loops, so we lie to avoid trying to use all registers.
332 return 4;
333}
334
335TypeSize
336GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
337 switch (K) {
338 case TargetTransformInfo::RGK_Scalar:
339 return TypeSize::getFixed(ExactSize: 32);
340 case TargetTransformInfo::RGK_FixedWidthVector:
341 return TypeSize::getFixed(ExactSize: ST->hasPackedFP32Ops() ? 64 : 32);
342 case TargetTransformInfo::RGK_ScalableVector:
343 return TypeSize::getScalable(MinimumSize: 0);
344 }
345 llvm_unreachable("Unsupported register kind");
346}
347
348unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
349 return 32;
350}
351
352unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
353 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
354 return 32 * 4 / ElemWidth;
355 // For a given width return the max 0number of elements that can be combined
356 // into a wider bit value:
357 return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
358 : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
359 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
360 : 1;
361}
362
363unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
364 unsigned ChainSizeInBytes,
365 VectorType *VecTy) const {
366 unsigned VecRegBitWidth = VF * LoadSize;
367 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
368 // TODO: Support element-size less than 32bit?
369 return 128 / LoadSize;
370
371 return VF;
372}
373
374unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
375 unsigned ChainSizeInBytes,
376 VectorType *VecTy) const {
377 unsigned VecRegBitWidth = VF * StoreSize;
378 if (VecRegBitWidth > 128)
379 return 128 / StoreSize;
380
381 return VF;
382}
383
384unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
385 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
386 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
387 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
388 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
389 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
390 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
391 return 512;
392 }
393
394 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
395 return 8 * ST->getMaxPrivateElementSize();
396
397 // Common to flat, global, local and region. Assume for unknown addrspace.
398 return 128;
399}
400
401bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
402 Align Alignment,
403 unsigned AddrSpace) const {
404 // We allow vectorization of flat stores, even though we may need to decompose
405 // them later if they may access private memory. We don't have enough context
406 // here, and legalization can handle it.
407 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
408 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
409 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
410 }
411 return true;
412}
413
// Load chains follow the same legality rule as generic memory chains (see
// isLegalToVectorizeMemChain above).
bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
419
// Store chains follow the same legality rule as generic memory chains (see
// isLegalToVectorizeMemChain above).
bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
425
426uint64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
427 return 1024;
428}
429
430Type *GCNTTIImpl::getMemcpyLoopLoweringType(
431 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
432 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
433 std::optional<uint32_t> AtomicElementSize) const {
434
435 if (AtomicElementSize)
436 return Type::getIntNTy(C&: Context, N: *AtomicElementSize * 8);
437
438 // 16-byte accesses achieve the highest copy throughput.
439 // If the operation has a fixed known length that is large enough, it is
440 // worthwhile to return an even wider type and let legalization lower it into
441 // multiple accesses, effectively unrolling the memcpy loop.
442 // We also rely on legalization to decompose into smaller accesses for
443 // subtargets and address spaces where it is necessary.
444 //
445 // Don't unroll if Length is not a constant, since unrolling leads to worse
446 // performance for length values that are smaller or slightly larger than the
447 // total size of the type returned here. Mitigating that would require a more
448 // complex lowering for variable-length memcpy and memmove.
449 unsigned I32EltsInVector = 4;
450 if (MemcpyLoopUnroll > 0 && isa<ConstantInt>(Val: Length))
451 return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context),
452 NumElts: MemcpyLoopUnroll * I32EltsInVector);
453
454 return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: I32EltsInVector);
455}
456
457void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
458 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
459 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
460 Align SrcAlign, Align DestAlign,
461 std::optional<uint32_t> AtomicCpySize) const {
462
463 if (AtomicCpySize)
464 BaseT::getMemcpyLoopResidualLoweringType(
465 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
466 DestAlign, AtomicCpySize);
467
468 Type *I32x4Ty = FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: 4);
469 while (RemainingBytes >= 16) {
470 OpsOut.push_back(Elt: I32x4Ty);
471 RemainingBytes -= 16;
472 }
473
474 Type *I64Ty = Type::getInt64Ty(C&: Context);
475 while (RemainingBytes >= 8) {
476 OpsOut.push_back(Elt: I64Ty);
477 RemainingBytes -= 8;
478 }
479
480 Type *I32Ty = Type::getInt32Ty(C&: Context);
481 while (RemainingBytes >= 4) {
482 OpsOut.push_back(Elt: I32Ty);
483 RemainingBytes -= 4;
484 }
485
486 Type *I16Ty = Type::getInt16Ty(C&: Context);
487 while (RemainingBytes >= 2) {
488 OpsOut.push_back(Elt: I16Ty);
489 RemainingBytes -= 2;
490 }
491
492 Type *I8Ty = Type::getInt8Ty(C&: Context);
493 while (RemainingBytes) {
494 OpsOut.push_back(Elt: I8Ty);
495 --RemainingBytes;
496 }
497}
498
499unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
500 // Disable unrolling if the loop is not vectorized.
501 // TODO: Enable this again.
502 if (VF.isScalar())
503 return 1;
504
505 return 8;
506}
507
508bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
509 MemIntrinsicInfo &Info) const {
510 switch (Inst->getIntrinsicID()) {
511 case Intrinsic::amdgcn_ds_ordered_add:
512 case Intrinsic::amdgcn_ds_ordered_swap: {
513 auto *Ordering = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: 2));
514 auto *Volatile = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: 4));
515 if (!Ordering || !Volatile)
516 return false; // Invalid.
517
518 unsigned OrderingVal = Ordering->getZExtValue();
519 if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
520 return false;
521
522 Info.PtrVal = Inst->getArgOperand(i: 0);
523 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
524 Info.ReadMem = true;
525 Info.WriteMem = true;
526 Info.IsVolatile = !Volatile->isZero();
527 return true;
528 }
529 default:
530 return false;
531 }
532}
533
/// AMDGPU-specific arithmetic cost model. Costs are derived from instruction
/// issue rates (full/half/quarter/64-bit rate helpers) and scaled by the
/// legalization split count (LT.first) and the number of registers the
/// legalized vector occupies (NElts).
InstructionCost GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but the legal types, we
  // need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    // 64-bit shifts run at the 64-bit rate.
    if (SLT == MVT::i64)
      return get64BitInstrCost(CostKind) * LT.first * NElts;

    // Two 16-bit ops can share one packed register.
    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
    if (SLT == MVT::i64) {
      // 64-bit multiply: four quarter-rate partial products plus full-rate
      // carry handling.
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FMUL:
    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
    // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
    // fused operation.
    if (CxtI && CxtI->hasOneUse())
      if (const auto *FAdd = dyn_cast<BinaryOperator>(Val: *CxtI->user_begin())) {
        const int OPC = TLI->InstructionOpcodeToISD(Opcode: FAdd->getOpcode());
        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
            return TargetTransformInfo::TCC_Free;
          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
            return TargetTransformInfo::TCC_Free;

          // Estimate all types may be fused with contract/unsafe flags
          const TargetOptions &Options = TLI->getTargetMachine().Options;
          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
            return TargetTransformInfo::TCC_Free;
        }
      }
    [[fallthrough]];
  case ISD::FADD:
  case ISD::FSUB:
    // Packed f32/bf16 ops halve the effective register count.
    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
      NElts = (NElts + 1) / 2;
    if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
      NElts = (NElts + 1) / 2;
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost(CostKind);

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 7 * get64BitInstrCost(CostKind) +
                 getQuarterRateInstrCost(CostKind) +
                 3 * getHalfRateInstrCost(CostKind);
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    // Reciprocal (1.0 / x) can lower to a single transcendental op.
    if (!Args.empty() && match(V: Args[0], P: PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getTransInstrCost(CostKind) * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost = 4 * getFullRateInstrCost() + 2 * getTransInstrCost(CostKind);
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {
      // Fast unsafe fdiv lowering:
      // f32 rcp
      // f32 fmul
      int Cost = getTransInstrCost(CostKind) + getFullRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      // 4 more v_cvt_* insts without f16 insts support
      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
                 1 * getTransInstrCost(CostKind);

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  case ISD::FNEG:
    // Use the backend' estimation. If fneg is not free each element will cost
    // one additional instruction.
    return TLI->isFNegFree(VT: SLT) ? 0 : NElts;
  default:
    break;
  }

  // Anything not handled above defers to the generic cost model.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info,
                                       Args, CxtI);
}
690
691// Return true if there's a potential benefit from using v2f16/v2i16
692// instructions for an intrinsic, even if it requires nontrivial legalization.
693static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
694 switch (ID) {
695 case Intrinsic::fma:
696 case Intrinsic::fmuladd:
697 case Intrinsic::copysign:
698 case Intrinsic::minimumnum:
699 case Intrinsic::maximumnum:
700 case Intrinsic::canonicalize:
701 // There's a small benefit to using vector ops in the legalized code.
702 case Intrinsic::round:
703 case Intrinsic::uadd_sat:
704 case Intrinsic::usub_sat:
705 case Intrinsic::sadd_sat:
706 case Intrinsic::ssub_sat:
707 case Intrinsic::abs:
708 return true;
709 default:
710 return false;
711 }
712}
713
714InstructionCost
715GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
716 TTI::TargetCostKind CostKind) const {
717 switch (ICA.getID()) {
718 case Intrinsic::fabs:
719 // Free source modifier in the common case.
720 return 0;
721 case Intrinsic::amdgcn_workitem_id_x:
722 case Intrinsic::amdgcn_workitem_id_y:
723 case Intrinsic::amdgcn_workitem_id_z:
724 // TODO: If hasPackedTID, or if the calling context is not an entry point
725 // there may be a bit instruction.
726 return 0;
727 case Intrinsic::amdgcn_workgroup_id_x:
728 case Intrinsic::amdgcn_workgroup_id_y:
729 case Intrinsic::amdgcn_workgroup_id_z:
730 case Intrinsic::amdgcn_lds_kernel_id:
731 case Intrinsic::amdgcn_dispatch_ptr:
732 case Intrinsic::amdgcn_dispatch_id:
733 case Intrinsic::amdgcn_implicitarg_ptr:
734 case Intrinsic::amdgcn_queue_ptr:
735 // Read from an argument register.
736 return 0;
737 default:
738 break;
739 }
740
741 Type *RetTy = ICA.getReturnType();
742
743 Intrinsic::ID IID = ICA.getID();
744 switch (IID) {
745 case Intrinsic::exp:
746 case Intrinsic::exp2:
747 case Intrinsic::exp10: {
748 // Legalize the type.
749 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);
750 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
751 unsigned NElts =
752 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
753
754 if (SLT == MVT::f64) {
755 unsigned NumOps = 20;
756 if (IID == Intrinsic::exp)
757 ++NumOps;
758 else if (IID == Intrinsic::exp10)
759 NumOps += 3;
760
761 return LT.first * NElts * NumOps * get64BitInstrCost(CostKind);
762 }
763
764 if (SLT == MVT::f32) {
765 unsigned NumFullRateOps = 0;
766 // v_exp_f32 (transcendental).
767 unsigned NumTransOps = 1;
768
769 if (!ICA.getFlags().approxFunc() && IID != Intrinsic::exp2) {
770 // Non-AFN exp/exp10: range reduction + v_exp_f32 + ldexp +
771 // overflow/underflow checks (lowerFEXP). Denorm is also handled.
772 // FMA preamble: ~13 full-rate ops; non-FMA: ~17.
773 NumFullRateOps = ST->hasFastFMAF32() ? 13 : 17;
774 } else {
775 if (IID == Intrinsic::exp) {
776 // lowerFEXPUnsafe: fmul (base conversion) + v_exp_f32.
777 NumFullRateOps = 1;
778 } else if (IID == Intrinsic::exp10) {
779 // lowerFEXP10Unsafe: 3 fmul + 2 v_exp_f32 (double-exp2).
780 NumFullRateOps = 3;
781 NumTransOps = 2;
782 }
783 // Denorm scaling adds setcc + select + fadd + select + fmul.
784 if (HasFP32Denormals)
785 NumFullRateOps += 5;
786 }
787
788 InstructionCost Cost = NumFullRateOps * getFullRateInstrCost() +
789 NumTransOps * getTransInstrCost(CostKind);
790 return LT.first * NElts * Cost;
791 }
792
793 break;
794 }
795 case Intrinsic::log:
796 case Intrinsic::log2:
797 case Intrinsic::log10: {
798 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);
799 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
800 unsigned NElts =
801 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
802
803 if (SLT == MVT::f32) {
804 unsigned NumFullRateOps = 0;
805
806 if (IID == Intrinsic::log2) {
807 // LowerFLOG2: just v_log_f32.
808 } else if (ICA.getFlags().approxFunc()) {
809 // LowerFLOGUnsafe: v_log_f32 + fmul (base conversion).
810 NumFullRateOps = 1;
811 } else {
812 // LowerFLOGCommon non-AFN: v_log_f32 + extended-precision
813 // multiply + finite check.
814 NumFullRateOps = ST->hasFastFMAF32() ? 8 : 11;
815 }
816
817 if (HasFP32Denormals)
818 NumFullRateOps += 5;
819
820 InstructionCost Cost =
821 NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
822 return LT.first * NElts * Cost;
823 }
824
825 break;
826 }
827 case Intrinsic::sin:
828 case Intrinsic::cos: {
829 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);
830 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
831 unsigned NElts =
832 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
833
834 if (SLT == MVT::f32) {
835 // LowerTrig: fmul(1/2pi) + v_sin/v_cos.
836 unsigned NumFullRateOps = ST->hasTrigReducedRange() ? 2 : 1;
837
838 InstructionCost Cost =
839 NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
840 return LT.first * NElts * Cost;
841 }
842
843 break;
844 }
845 case Intrinsic::sqrt: {
846 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);
847 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
848 unsigned NElts =
849 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
850
851 if (SLT == MVT::f32) {
852 unsigned NumFullRateOps = 0;
853
854 if (!ICA.getFlags().approxFunc()) {
855 // lowerFSQRTF32 non-AFN: v_sqrt_f32 + refinement + scale fixup.
856 NumFullRateOps = HasFP32Denormals ? 17 : 16;
857 }
858
859 InstructionCost Cost =
860 NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
861 return LT.first * NElts * Cost;
862 }
863
864 break;
865 }
866 default:
867 break;
868 }
869
870 if (!intrinsicHasPackedVectorBenefit(ID: ICA.getID()))
871 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
872
873 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);
874 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
875 unsigned NElts = LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
876
877 if ((ST->hasVOP3PInsts() &&
878 (SLT == MVT::f16 || SLT == MVT::i16 ||
879 (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
880 (ST->hasPackedFP32Ops() && SLT == MVT::f32))
881 NElts = (NElts + 1) / 2;
882
883 // TODO: Get more refined intrinsic costs?
884 unsigned InstRate = getQuarterRateInstrCost(CostKind);
885
886 switch (ICA.getID()) {
887 case Intrinsic::fma:
888 case Intrinsic::fmuladd:
889 if (SLT == MVT::f64) {
890 InstRate = get64BitInstrCost(CostKind);
891 break;
892 }
893
894 if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
895 InstRate = getFullRateInstrCost();
896 else {
897 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
898 : getQuarterRateInstrCost(CostKind);
899 }
900 break;
901 case Intrinsic::copysign:
902 return NElts * getFullRateInstrCost();
903 case Intrinsic::minimumnum:
904 case Intrinsic::maximumnum: {
    // Instruction + 2 canonicalizes. For cases that need type promotion, the
    // promotion takes the place of the canonicalize.
907 unsigned NumOps = 3;
908 if (const IntrinsicInst *II = ICA.getInst()) {
909 // Directly legal with ieee=0
910 // TODO: Not directly legal with strictfp
911 if (fpenvIEEEMode(I: *II) == KnownIEEEMode::Off)
912 NumOps = 1;
913 }
914
915 unsigned BaseRate =
916 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
917 InstRate = BaseRate * NumOps;
918 break;
919 }
920 case Intrinsic::canonicalize: {
921 InstRate =
922 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
923 break;
924 }
925 case Intrinsic::uadd_sat:
926 case Intrinsic::usub_sat:
927 case Intrinsic::sadd_sat:
928 case Intrinsic::ssub_sat: {
929 if (SLT == MVT::i16 || SLT == MVT::i32)
930 InstRate = getFullRateInstrCost();
931
932 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
933 if (any_of(Range: ValidSatTys, P: equal_to(Arg&: LT.second)))
934 NElts = 1;
935 break;
936 }
937 case Intrinsic::abs:
938 // Expansion takes 2 instructions for VALU
939 if (SLT == MVT::i16 || SLT == MVT::i32)
940 InstRate = 2 * getFullRateInstrCost();
941 break;
942 default:
943 break;
944 }
945
946 return LT.first * NElts * InstRate;
947}
948
949InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
950 TTI::TargetCostKind CostKind,
951 const Instruction *I) const {
952 assert((I == nullptr || I->getOpcode() == Opcode) &&
953 "Opcode should reflect passed instruction.");
954 const bool SCost =
955 (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
956 const int CBrCost = SCost ? 5 : 7;
957 switch (Opcode) {
958 case Instruction::UncondBr:
959 // Branch instruction takes about 4 slots on gfx900.
960 return SCost ? 1 : 4;
961 case Instruction::CondBr:
962 // Suppose conditional branch takes additional 3 exec manipulations
963 // instructions in average.
964 return CBrCost;
965 case Instruction::Switch: {
966 const auto *SI = dyn_cast_or_null<SwitchInst>(Val: I);
967 // Each case (including default) takes 1 cmp + 1 cbr instructions in
968 // average.
969 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
970 }
971 case Instruction::Ret:
972 return SCost ? 1 : 10;
973 }
974 return BaseT::getCFInstrCost(Opcode, CostKind, I);
975}
976
977InstructionCost
978GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
979 std::optional<FastMathFlags> FMF,
980 TTI::TargetCostKind CostKind) const {
981 if (TTI::requiresOrderedReduction(FMF))
982 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
983
984 EVT OrigTy = TLI->getValueType(DL, Ty);
985
986 // Computes cost on targets that have packed math instructions(which support
987 // 16-bit types only).
988 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
989 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
990
991 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
992 return LT.first * getFullRateInstrCost();
993}
994
995InstructionCost
996GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
997 FastMathFlags FMF,
998 TTI::TargetCostKind CostKind) const {
999 EVT OrigTy = TLI->getValueType(DL, Ty);
1000
1001 // Computes cost on targets that have packed math instructions(which support
1002 // 16-bit types only).
1003 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
1004 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1005
1006 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1007 return LT.first * getHalfRateInstrCost(CostKind);
1008}
1009
1010InstructionCost GCNTTIImpl::getVectorInstrCost(
1011 unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
1012 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
1013 switch (Opcode) {
1014 case Instruction::ExtractElement:
1015 case Instruction::InsertElement: {
1016 unsigned EltSize
1017 = DL.getTypeSizeInBits(Ty: cast<VectorType>(Val: ValTy)->getElementType());
1018 if (EltSize < 32) {
1019 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
1020 return 0;
1021 return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1,
1022 VIC);
1023 }
1024
1025 // Extracts are just reads of a subregister, so are free. Inserts are
1026 // considered free because we don't want to have any cost for scalarizing
1027 // operations, and we don't have to copy into a different register class.
1028
1029 // Dynamic indexing isn't free and is best avoided.
1030 return Index == ~0u ? 2 : 0;
1031 }
1032 default:
1033 return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1,
1034 VIC);
1035 }
1036}
1037
/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
/// this is analyzing the collective result of all output registers. Otherwise,
/// this is only querying a specific result index if this returns multiple
/// registers in a struct.
bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
  const CallInst *CI, ArrayRef<unsigned> Indices) const {
  // TODO: Handle complex extract indices
  if (Indices.size() > 1)
    return true;

  const DataLayout &DL = CI->getDataLayout();
  const SIRegisterInfo *TRI = ST->getRegisterInfo();
  TargetLowering::AsmOperandInfoVector TargetConstraints =
      TLI->ParseConstraints(DL, TRI: ST->getRegisterInfo(), Call: *CI);

  // -1 means "consider every output"; otherwise only the requested index.
  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];

  int OutputIdx = 0;
  for (auto &TC : TargetConstraints) {
    if (TC.Type != InlineAsm::isOutput)
      continue;

    // Skip outputs we don't care about.
    // Note: OutputIdx only advances on output constraints, and only when a
    // specific index was requested.
    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
      continue;

    TLI->ComputeConstraintToUse(OpInfo&: TC, Op: SDValue());

    const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
        TRI, Constraint: TC.ConstraintCode, VT: TC.ConstraintVT).second;

    // For AGPR constraints null is returned on subtargets without AGPRs, so
    // assume divergent for null.
    if (!RC || !TRI->isSGPRClass(RC))
      return true;
  }

  // Every inspected output lives in an SGPR class, so the result is uniform.
  return false;
}
1077
1078bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
1079 const IntrinsicInst *ReadReg) const {
1080 Metadata *MD =
1081 cast<MetadataAsValue>(Val: ReadReg->getArgOperand(i: 0))->getMetadata();
1082 StringRef RegName =
1083 cast<MDString>(Val: cast<MDNode>(Val: MD)->getOperand(I: 0))->getString();
1084
1085 // Special case registers that look like VCC.
1086 MVT VT = MVT::getVT(Ty: ReadReg->getType());
1087 if (VT == MVT::i1)
1088 return true;
1089
1090 // Special case scalar registers that start with 'v'.
1091 if (RegName.starts_with(Prefix: "vcc") || RegName.empty())
1092 return false;
1093
1094 // VGPR or AGPR is divergent. There aren't any specially named vector
1095 // registers.
1096 return RegName[0] == 'v' || RegName[0] == 'a';
1097}
1098
1099/// \returns true if the result of the value could potentially be
1100/// different across workitems in a wavefront.
1101bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
1102 if (const Argument *A = dyn_cast<Argument>(Val: V))
1103 return !AMDGPU::isArgPassedInSGPR(Arg: A);
1104
1105 // Loads from the private and flat address spaces are divergent, because
1106 // threads can execute the load instruction with the same inputs and get
1107 // different results.
1108 //
1109 // All other loads are not divergent, because if threads issue loads with the
1110 // same arguments, they will always get the same result.
1111 if (const LoadInst *Load = dyn_cast<LoadInst>(Val: V))
1112 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
1113 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
1114
1115 // Atomics are divergent because they are executed sequentially: when an
1116 // atomic operation refers to the same address in each thread, then each
1117 // thread after the first sees the value written by the previous thread as
1118 // original value.
1119 if (isa<AtomicRMWInst, AtomicCmpXchgInst>(Val: V))
1120 return true;
1121
1122 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V)) {
1123 Intrinsic::ID IID = Intrinsic->getIntrinsicID();
1124 switch (IID) {
1125 case Intrinsic::read_register:
1126 return isReadRegisterSourceOfDivergence(ReadReg: Intrinsic);
1127 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1128 unsigned SrcAS =
1129 Intrinsic->getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace();
1130 unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
1131 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
1132 DstAS == AMDGPUAS::FLAT_ADDRESS &&
1133 ST->hasGloballyAddressableScratch();
1134 }
1135 case Intrinsic::amdgcn_workitem_id_y:
1136 case Intrinsic::amdgcn_workitem_id_z: {
1137 const Function *F = Intrinsic->getFunction();
1138 bool HasUniformYZ =
1139 ST->hasWavefrontsEvenlySplittingXDim(F: *F, /*RequitezUniformYZ=*/REquiresUniformYZ: true);
1140 std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
1141 F: *F, Dim: IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
1142 return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
1143 }
1144 default:
1145 return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: IID);
1146 }
1147 }
1148
1149 // Assume all function calls are a source of divergence.
1150 if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
1151 if (CI->isInlineAsm())
1152 return isInlineAsmSourceOfDivergence(CI);
1153 return true;
1154 }
1155
1156 // Assume all function calls are a source of divergence.
1157 if (isa<InvokeInst>(Val: V))
1158 return true;
1159
1160 // If the target supports globally addressable scratch, the mapping from
1161 // scratch memory to the flat aperture changes therefore an address space cast
1162 // is no longer uniform.
1163 if (auto *CastI = dyn_cast<AddrSpaceCastInst>(Val: V)) {
1164 return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
1165 CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
1166 ST->hasGloballyAddressableScratch();
1167 }
1168
1169 return false;
1170}
1171
bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  // Intrinsics on the target's always-uniform list produce wave-uniform
  // values regardless of their operands.
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V))
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID: Intrinsic->getIntrinsicID());

  // Inline asm whose outputs are all SGPRs is uniform; other calls are not
  // known to be.
  if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
    if (CI->isInlineAsm())
      return !isInlineAsmSourceOfDivergence(CI);
    return false;
  }

  // In most cases TID / wavefrontsize is uniform.
  //
  // However, if a kernel has uneven dimensions we can have a value of
  // workitem-id-x divided by the wavefrontsize non-uniform. For example
  // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
  // packed into a same wave which gives 1 and 0 after the division by 64
  // respectively.
  //
  // The X dimension doesn't reset within a wave if either both the Y
  // and Z dimensions are of length 1, or if the X dimension's required
  // size is a power of 2. Note, however, if the X dimension's maximum
  // size is a power of 2 < the wavefront size, division by the wavefront
  // size is guaranteed to yield 0, so this is also a no-reset case.
  bool XDimDoesntResetWithinWaves = false;
  if (auto *I = dyn_cast<Instruction>(Val: V)) {
    const Function *F = I->getFunction();
    XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(F: *F);
  }
  using namespace llvm::PatternMatch;
  uint64_t C;
  // workitem-id-x >> C is uniform when C covers at least the wavefront-size
  // bits (and X doesn't reset mid-wave).
  if (match(V, P: m_LShr(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       R: m_ConstantInt(V&: C))) ||
      match(V, P: m_AShr(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       R: m_ConstantInt(V&: C)))) {
    return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
  }

  // Likewise, masking away at least the low wavefront-size bits of
  // workitem-id-x yields a wave-uniform value.
  Value *Mask;
  if (match(V, P: m_c_And(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                        R: m_Value(V&: Mask)))) {
    return computeKnownBits(V: Mask, DL).countMinTrailingZeros() >=
               ST->getWavefrontSizeLog2() &&
           XDimDoesntResetWithinWaves;
  }

  // Remaining special cases are extractvalue from certain calls.
  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(Val: V);
  if (!ExtValue)
    return false;

  const CallInst *CI = dyn_cast<CallInst>(Val: ExtValue->getOperand(i_nocapture: 0));
  if (!CI)
    return false;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: CI)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else: {
      // Struct element 1 of amdgcn.if/else is uniform; element 0 is not.
      ArrayRef<unsigned> Indices = ExtValue->getIndices();
      return Indices.size() == 1 && Indices[0] == 1;
    }
    }
  }

  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
  // divergent for the overall struct return. We need to override it in the
  // case we're extracting an SGPR component here.
  if (CI->isInlineAsm())
    return !isInlineAsmSourceOfDivergence(CI, Indices: ExtValue->getIndices());

  return false;
}
1245
1246bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1247 Intrinsic::ID IID) const {
1248 switch (IID) {
1249 case Intrinsic::amdgcn_is_shared:
1250 case Intrinsic::amdgcn_is_private:
1251 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1252 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1253 case Intrinsic::amdgcn_load_to_lds:
1254 case Intrinsic::amdgcn_make_buffer_rsrc:
1255 OpIndexes.push_back(Elt: 0);
1256 return true;
1257 default:
1258 return false;
1259 }
1260}
1261
// Rewrite \p II to operate on \p NewV (a pointer in a more specific address
// space) instead of \p OldV. Returns the replacement value, or nullptr if the
// intrinsic cannot be rewritten for the new address space.
Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                    Value *OldV,
                                                    Value *NewV) const {
  auto IntrID = II->getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    // Once the address space is known the query folds to a constant: true iff
    // the new pointer already lives in the space the intrinsic tests for.
    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    LLVMContext &Ctx = NewV->getType()->getContext();
    ConstantInt *NewVal = (TrueAS == NewAS) ?
      ConstantInt::getTrue(Context&: Ctx) : ConstantInt::getFalse(Context&: Ctx);
    return NewVal;
  }
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num: {
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    unsigned NewAS = SrcTy->getPointerAddressSpace();
    // Only global-like address spaces are supported for these atomics.
    if (!AMDGPU::isExtendedGlobalAddrSpace(AS: NewAS))
      return nullptr;
    Module *M = II->getModule();
    // Re-point the call at the declaration overloaded on the new pointer
    // type, then swap in the new pointer operand.
    Function *NewDecl = Intrinsic::getOrInsertDeclaration(
        M, id: II->getIntrinsicID(), OverloadTys: {DestTy, SrcTy, DestTy});
    II->setArgOperand(i: 0, v: NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  case Intrinsic::amdgcn_load_to_lds: {
    // Same rewrite as above; this intrinsic is overloaded only on the source
    // pointer type.
    Type *SrcTy = NewV->getType();
    Module *M = II->getModule();
    Function *NewDecl =
        Intrinsic::getOrInsertDeclaration(M, id: II->getIntrinsicID(), OverloadTys: {SrcTy});
    II->setArgOperand(i: 0, v: NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  case Intrinsic::amdgcn_make_buffer_rsrc: {
    // Overloaded on both the result descriptor type and the base pointer
    // type.
    Type *SrcTy = NewV->getType();
    Type *DstTy = II->getType();
    Module *M = II->getModule();
    Function *NewDecl = Intrinsic::getOrInsertDeclaration(
        M, id: II->getIntrinsicID(), OverloadTys: {DstTy, SrcTy});
    II->setArgOperand(i: 0, v: NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  default:
    return nullptr;
  }
}
1314
// Cost model for vector shuffles, specialized for sub-dword element types
// where several elements share one 32-bit register.
InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *DstTy, VectorType *SrcTy,
                                           ArrayRef<int> Mask,
                                           TTI::TargetCostKind CostKind,
                                           int Index, VectorType *SubTp,
                                           ArrayRef<const Value *> Args,
                                           const Instruction *CxtI) const {
  if (!isa<FixedVectorType>(Val: SrcTy))
    return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
                                 SubTp);

  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp);

  unsigned ScalarSize = DL.getTypeSizeInBits(Ty: SrcTy->getElementType());
  if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      (ScalarSize == 16 || ScalarSize == 8)) {
    // Larger vector widths may require additional instructions, but are
    // typically cheaper than scalarized versions.
    //
    // We assume that shuffling at a register granularity can be done for free.
    // This is not true for vectors fed into memory instructions, but it is
    // effectively true for all other shuffling. The emphasis of the logic here
    // is to assist generic transform in cleaning up / canonicalizing those
    // shuffles.

    // With op_sel VOP3P instructions freely can access the low half or high
    // half of a register, so any swizzle of two elements is free.
    if (auto *SrcVecTy = dyn_cast<FixedVectorType>(Val: SrcTy)) {
      unsigned NumSrcElts = SrcVecTy->getNumElements();
      if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
          (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse ||
           Kind == TTI::SK_PermuteSingleSrc))
        return 0;
    }

    // Number of sub-dword elements packed into one 32-bit register.
    unsigned EltsPerReg = 32 / ScalarSize;
    switch (Kind) {
    case TTI::SK_Broadcast:
      // A single v_perm_b32 can be re-used for all destination registers.
      return 1;
    case TTI::SK_Reverse:
      // One instruction per register.
      if (auto *DstVecTy = dyn_cast<FixedVectorType>(Val: DstTy))
        return divideCeil(Numerator: DstVecTy->getNumElements(), Denominator: EltsPerReg);
      return InstructionCost::getInvalid();
    case TTI::SK_ExtractSubvector:
      if (Index % EltsPerReg == 0)
        return 0; // Shuffling at register granularity
      // Misaligned extract: one instruction per destination register.
      if (auto *DstVecTy = dyn_cast<FixedVectorType>(Val: DstTy))
        return divideCeil(Numerator: DstVecTy->getNumElements(), Denominator: EltsPerReg);
      return InstructionCost::getInvalid();
    case TTI::SK_InsertSubvector: {
      auto *DstVecTy = dyn_cast<FixedVectorType>(Val: DstTy);
      if (!DstVecTy)
        return InstructionCost::getInvalid();
      unsigned NumDstElts = DstVecTy->getNumElements();
      unsigned NumInsertElts = cast<FixedVectorType>(Val: SubTp)->getNumElements();
      unsigned EndIndex = Index + NumInsertElts;
      // Offsets of the first/one-past-last inserted element within their
      // registers.
      unsigned BeginSubIdx = Index % EltsPerReg;
      unsigned EndSubIdx = EndIndex % EltsPerReg;
      unsigned Cost = 0;

      if (BeginSubIdx != 0) {
        // Need to shift the inserted vector into place. The cost is the number
        // of destination registers overlapped by the inserted vector.
        Cost = divideCeil(Numerator: EndIndex, Denominator: EltsPerReg) - (Index / EltsPerReg);
      }

      // If the last register overlap is partial, there may be three source
      // registers feeding into it; that takes an extra instruction.
      if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
        Cost += 1;

      return Cost;
    }
    case TTI::SK_Splice: {
      auto *DstVecTy = dyn_cast<FixedVectorType>(Val: DstTy);
      if (!DstVecTy)
        return InstructionCost::getInvalid();
      unsigned NumElts = DstVecTy->getNumElements();
      assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements());
      // Determine the sub-region of the result vector that requires
      // sub-register shuffles / mixing.
      unsigned EltsFromLHS = NumElts - Index;
      bool LHSIsAligned = (Index % EltsPerReg) == 0;
      bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
      if (LHSIsAligned && RHSIsAligned)
        return 0;
      if (LHSIsAligned && !RHSIsAligned)
        return divideCeil(Numerator: NumElts, Denominator: EltsPerReg) - (EltsFromLHS / EltsPerReg);
      if (!LHSIsAligned && RHSIsAligned)
        return divideCeil(Numerator: EltsFromLHS, Denominator: EltsPerReg);
      return divideCeil(Numerator: NumElts, Denominator: EltsPerReg);
    }
    default:
      break;
    }

    if (!Mask.empty()) {
      unsigned NumSrcElts = cast<FixedVectorType>(Val: SrcTy)->getNumElements();

      // Generically estimate the cost by assuming that each destination
      // register is derived from sources via v_perm_b32 instructions if it
      // can't be copied as-is.
      //
      // For each destination register, derive the cost of obtaining it based
      // on the number of source registers that feed into it.
      unsigned Cost = 0;
      for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
        // Collect the distinct source registers feeding this destination
        // register, and whether every lane comes from its matching position.
        SmallVector<int, 4> Regs;
        bool Aligned = true;
        for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
          int SrcIdx = Mask[DstIdx + I];
          if (SrcIdx == -1)
            continue;
          int Reg;
          if (SrcIdx < (int)NumSrcElts) {
            Reg = SrcIdx / EltsPerReg;
            if (SrcIdx % EltsPerReg != I)
              Aligned = false;
          } else {
            // Element of the second shuffle source; number its registers
            // after the first source's.
            Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
            if ((SrcIdx - NumSrcElts) % EltsPerReg != I)
              Aligned = false;
          }
          if (!llvm::is_contained(Range&: Regs, Element: Reg))
            Regs.push_back(Elt: Reg);
        }
        // N contributing registers need N-1 combines; a single misaligned
        // register still needs one shuffle instruction.
        if (Regs.size() >= 2)
          Cost += Regs.size() - 1;
        else if (!Aligned)
          Cost += 1;
      }
      return Cost;
    }
  }

  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
                               SubTp);
}
1455
/// Whether it is profitable to sink the operands of an
/// Instruction I to the basic block of I.
/// This helps using several modifiers (like abs and neg) more often.
bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
                                            SmallVectorImpl<Use *> &Ops) const {
  using namespace PatternMatch;

  for (auto &Op : I->operands()) {
    // Ensure we are not already sinking this operand.
    if (any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op.get(); }))
      continue;

    // fabs/fneg fold into source modifiers, so sinking them next to the user
    // is profitable.
    if (match(V: &Op, P: m_FAbs(Op0: m_Value())) || match(V: &Op, P: m_FNeg(X: m_Value()))) {
      Ops.push_back(Elt: &Op);
      continue;
    }

    // Check for zero-cost multiple use InsertElement/ExtractElement
    // instructions
    if (Instruction *OpInst = dyn_cast<Instruction>(Val: Op.get())) {
      if (OpInst->getType()->isVectorTy() && OpInst->getNumOperands() > 1) {
        // Don't sink if the vector operand itself has another use; sinking
        // would keep both values alive.
        Instruction *VecOpInst = dyn_cast<Instruction>(Val: OpInst->getOperand(i: 0));
        if (VecOpInst && VecOpInst->hasOneUse())
          continue;

        // Sink only when the insert/extract is modeled as free (cost 0).
        if (getVectorInstrCost(Opcode: OpInst->getOpcode(), ValTy: OpInst->getType(),
                               CostKind: TTI::TCK_RecipThroughput, Index: 0,
                               Op0: OpInst->getOperand(i: 0),
                               Op1: OpInst->getOperand(i: 1)) == 0) {
          Ops.push_back(Elt: &Op);
          continue;
        }
      }
    }

    if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Val: Op.get())) {

      unsigned EltSize = DL.getTypeSizeInBits(
          Ty: cast<VectorType>(Val: Shuffle->getType())->getElementType());

      // For i32 (or greater) shufflevectors, these will be lowered into a
      // series of insert / extract elements, which will be coalesced away.
      if (EltSize < 16 || !ST->has16BitInsts())
        continue;

      int NumSubElts, SubIndex;
      if (Shuffle->changesLength()) {
        // Sink widening shuffles that merely pad with undef.
        if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
          Ops.push_back(Elt: &Op);
          continue;
        }

        // Sink even-offset subvector extracts/inserts (register-aligned for
        // 16-bit elements).
        if ((Shuffle->isExtractSubvectorMask(Index&: SubIndex) ||
             Shuffle->isInsertSubvectorMask(NumSubElts, Index&: SubIndex)) &&
            !(SubIndex & 0x1)) {
          Ops.push_back(Elt: &Op);
          continue;
        }
      }

      // Reverses, splats of element 0, and single-source permutes are cheap
      // enough to sink.
      if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
          Shuffle->isSingleSource()) {
        Ops.push_back(Elt: &Op);
        continue;
      }
    }
  }

  return !Ops.empty();
}
1526
1527bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1528 const Function *Callee) const {
1529 const TargetMachine &TM = getTLI()->getTargetMachine();
1530 const GCNSubtarget *CallerST
1531 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1532 const GCNSubtarget *CalleeST
1533 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1534
1535 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1536 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1537
1538 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1539 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1540 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1541 return false;
1542
1543 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1544 // no way to support merge for backend defined attributes.
1545 SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
1546 SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
1547 if (!CallerMode.isInlineCompatible(CalleeMode))
1548 return false;
1549
1550 if (Callee->hasFnAttribute(Kind: Attribute::AlwaysInline) ||
1551 Callee->hasFnAttribute(Kind: Attribute::InlineHint))
1552 return true;
1553
1554 // Hack to make compile times reasonable.
1555 if (InlineMaxBB) {
1556 // Single BB does not increase total BB amount.
1557 if (Callee->size() == 1)
1558 return true;
1559 size_t BBSize = Caller->size() + Callee->size() - 1;
1560 return BBSize <= InlineMaxBB;
1561 }
1562
1563 return true;
1564}
1565
// Estimate how much to raise the inlining threshold for \p CB based on how
// many argument registers the call consumes: arguments beyond the register
// budget would spill to the stack, which inlining avoids.
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
                                                   const SITargetLowering *TLI,
                                                   const GCNTTIImpl *TTIImpl) {
  // Register budgets before argument passing starts spilling to the stack.
  const int NrOfSGPRUntilSpill = 26;
  const int NrOfVGPRUntilSpill = 32;

  const DataLayout &DL = TTIImpl->getDataLayout();

  unsigned adjustThreshold = 0;
  int SGPRsInUse = 0;
  int VGPRsInUse = 0;
  // Count the SGPRs/VGPRs each argument occupies after type legalization.
  for (const Use &A : CB->args()) {
    SmallVector<EVT, 4> ValueVTs;
    ComputeValueVTs(TLI: *TLI, DL, Ty: A.get()->getType(), ValueVTs);
    for (auto ArgVT : ValueVTs) {
      unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
          Context&: CB->getContext(), CC: CB->getCallingConv(), VT: ArgVT);
      if (AMDGPU::isArgPassedInSGPR(CB, ArgNo: CB->getArgOperandNo(U: &A)))
        SGPRsInUse += CCRegNum;
      else
        VGPRsInUse += CCRegNum;
    }
  }

  // The cost of passing function arguments through the stack:
  // 1 instruction to put a function argument on the stack in the caller.
  // 1 instruction to take a function argument from the stack in callee.
  // 1 instruction is explicitly take care of data dependencies in callee
  // function.
  InstructionCost ArgStackCost(1);
  // NOTE: const_cast is needed because getMemoryOpCost is non-const on this
  // path while TTIImpl is held by const pointer here.
  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
      Opcode: Instruction::Store, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align(4),
      AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency);
  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
      Opcode: Instruction::Load, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align(4),
      AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency);

  // The penalty cost is computed relative to the cost of instructions and does
  // not model any storage costs.
  adjustThreshold += std::max(a: 0, b: SGPRsInUse - NrOfSGPRUntilSpill) *
                     ArgStackCost.getValue() * InlineConstants::getInstrCost();
  adjustThreshold += std::max(a: 0, b: VGPRsInUse - NrOfVGPRUntilSpill) *
                     ArgStackCost.getValue() * InlineConstants::getInstrCost();
  return adjustThreshold;
}
1611
1612static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1613 const DataLayout &DL) {
1614 // If we have a pointer to a private array passed into a function
1615 // it will not be optimized out, leaving scratch usage.
1616 // This function calculates the total size in bytes of the memory that would
1617 // end in scratch if the call was not inlined.
1618 unsigned AllocaSize = 0;
1619 SmallPtrSet<const AllocaInst *, 8> AIVisited;
1620 for (Value *PtrArg : CB->args()) {
1621 PointerType *Ty = dyn_cast<PointerType>(Val: PtrArg->getType());
1622 if (!Ty)
1623 continue;
1624
1625 unsigned AddrSpace = Ty->getAddressSpace();
1626 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1627 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1628 continue;
1629
1630 const AllocaInst *AI = dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: PtrArg));
1631 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(Ptr: AI).second)
1632 continue;
1633
1634 if (auto Size = AI->getAllocationSize(DL))
1635 AllocaSize += Size->getFixedValue();
1636 }
1637 return AllocaSize;
1638}
1639
1640int GCNTTIImpl::getInliningLastCallToStaticBonus() const {
1641 return BaseT::getInliningLastCallToStaticBonus() *
1642 getInliningThresholdMultiplier();
1643}
1644
1645unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1646 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, TTIImpl: this);
1647
1648 // Private object passed as arguments may end up in scratch usage if the call
1649 // is not inlined. Increase the inline threshold to promote inlining.
1650 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1651 if (AllocaSize > 0)
1652 Threshold += ArgAllocaCost;
1653 return Threshold;
1654}
1655
// Cost charged per private-memory alloca passed to \p CB, sized so the sum of
// all such costs cancels the ArgAllocaCost bonus added to the threshold.
unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
                                         const AllocaInst *AI) const {

  // Below the cutoff, assume that the private memory objects would be
  // optimized
  auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
  if (AllocaSize <= ArgAllocaCutoff)
    return 0;

  // Above the cutoff, we give a cost to each private memory object
  // depending its size. If the array can be optimized by SROA this cost is not
  // added to the total-cost in the inliner cost analysis.
  //
  // We choose the total cost of the alloca such that their sum cancels the
  // bonus given in the threshold (ArgAllocaCost).
  //
  // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
  //
  // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
  // the single-bb bonus and the vector-bonus.
  //
  // We compensate the first two multipliers, by repeating logic from the
  // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
  static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
  unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();

  // Mirror the inliner's single-basic-block bonus (+50%).
  bool SingleBB = none_of(Range&: *CB->getCalledFunction(), P: [](const BasicBlock &BB) {
    return BB.getTerminator()->getNumSuccessors() > 1;
  });
  if (SingleBB) {
    Threshold += Threshold / 2;
  }

  auto ArgAllocaSize = AI->getAllocationSize(DL);
  if (!ArgAllocaSize)
    return 0;

  // Attribute the bonus proportionally to the alloca size
  unsigned AllocaThresholdBonus =
      (Threshold * ArgAllocaSize->getFixedValue()) / AllocaSize;

  return AllocaThresholdBonus;
}
1699
// Delegate unrolling preferences to the common AMDGPU TTI implementation.
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) const {
  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
}
1705
// Delegate peeling preferences to the common AMDGPU TTI implementation.
void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) const {
  CommonTTI.getPeelingPreferences(L, SE, PP);
}
1710
// Transcendental-unit instructions are modeled at quarter rate.
int GCNTTIImpl::getTransInstrCost(TTI::TargetCostKind CostKind) const {
  return getQuarterRateInstrCost(CostKind);
}
1714
1715int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1716 return ST->hasFullRate64Ops()
1717 ? getFullRateInstrCost()
1718 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1719 : getQuarterRateInstrCost(CostKind);
1720}
1721
1722std::pair<InstructionCost, MVT>
1723GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1724 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1725 auto Size = DL.getTypeSizeInBits(Ty);
1726 // Maximum load or store can handle 8 dwords for scalar and 4 for
1727 // vector ALU. Let's assume anything above 8 dwords is expensive
1728 // even if legal.
1729 if (Size <= 256)
1730 return Cost;
1731
1732 Cost.first += (Size + 255) / 256;
1733 return Cost;
1734}
1735
1736unsigned GCNTTIImpl::getPrefetchDistance() const {
1737 return ST->hasPrefetch() ? 128 : 0;
1738}
1739
// Only prefetch flat/global address spaces; other spaces (e.g. private,
// local) are not candidates.
bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
  return AMDGPU::isFlatGlobalAddrSpace(AS);
}
1743
1744void GCNTTIImpl::collectKernelLaunchBounds(
1745 const Function &F,
1746 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
1747 SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
1748 LB.push_back(Elt: {"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1749 LB.push_back(Elt: {"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1750 LB.push_back(Elt: {"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1751 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1752 ST->getFlatWorkGroupSizes(F);
1753 LB.push_back(Elt: {"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1754 LB.push_back(Elt: {"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1755 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
1756 LB.push_back(Elt: {"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1757 LB.push_back(Elt: {"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1758}
1759
1760GCNTTIImpl::KnownIEEEMode
1761GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
1762 if (!ST->hasFeature(Feature: AMDGPU::FeatureDX10ClampAndIEEEMode))
1763 return KnownIEEEMode::On; // Only mode on gfx1170+
1764
1765 const Function *F = I.getFunction();
1766 if (!F)
1767 return KnownIEEEMode::Unknown;
1768
1769 Attribute IEEEAttr = F->getFnAttribute(Kind: "amdgpu-ieee");
1770 if (IEEEAttr.isValid())
1771 return IEEEAttr.getValueAsBool() ? KnownIEEEMode::On : KnownIEEEMode::Off;
1772
1773 return AMDGPU::isShader(CC: F->getCallingConv()) ? KnownIEEEMode::Off
1774 : KnownIEEEMode::On;
1775}
1776
1777InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1778 Align Alignment,
1779 unsigned AddressSpace,
1780 TTI::TargetCostKind CostKind,
1781 TTI::OperandValueInfo OpInfo,
1782 const Instruction *I) const {
1783 if (VectorType *VecTy = dyn_cast<VectorType>(Val: Src)) {
1784 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1785 VecTy->getElementType()->isIntegerTy(Bitwidth: 8)) {
1786 return divideCeil(Numerator: DL.getTypeSizeInBits(Ty: VecTy) - 1,
1787 Denominator: getLoadStoreVecRegBitWidth(AddrSpace: AddressSpace));
1788 }
1789 }
1790 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1791 OpInfo, I);
1792}
1793
1794unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
1795 if (VectorType *VecTy = dyn_cast<VectorType>(Val: Tp)) {
1796 if (VecTy->getElementType()->isIntegerTy(Bitwidth: 8)) {
1797 unsigned ElementCount = VecTy->getElementCount().getFixedValue();
1798 return divideCeil(Numerator: ElementCount - 1, Denominator: 4);
1799 }
1800 }
1801 return BaseT::getNumberOfParts(Tp);
1802}
1803
1804ValueUniformity GCNTTIImpl::getValueUniformity(const Value *V) const {
1805 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V)) {
1806 switch (Intrinsic->getIntrinsicID()) {
1807 case Intrinsic::amdgcn_wave_shuffle:
1808 return ValueUniformity::Custom;
1809 default:
1810 break;
1811 }
1812 }
1813
1814 if (isAlwaysUniform(V))
1815 return ValueUniformity::AlwaysUniform;
1816
1817 if (isSourceOfDivergence(V))
1818 return ValueUniformity::NeverUniform;
1819
1820 return ValueUniformity::Default;
1821}
1822
1823InstructionCost GCNTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
1824 StackOffset BaseOffset,
1825 bool HasBaseReg, int64_t Scale,
1826 unsigned AddrSpace) const {
1827 if (HasBaseReg && Scale != 0) {
1828 // gfx1250+ can fold base+scale*index when scale matches the memory access
1829 // size (scale_offset bit). Supported for flat/global/constant/scratch
1830 // (VMEM, max 128 bits) and constant_32bit (SMRD, capped to 128 bits here).
1831 if (getST()->hasScaleOffset() && Ty && Ty->isSized() &&
1832 (AMDGPU::isExtendedGlobalAddrSpace(AS: AddrSpace) ||
1833 AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
1834 AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)) {
1835 TypeSize StoreSize = getDataLayout().getTypeStoreSize(Ty);
1836 if (TypeSize::isKnownLE(LHS: StoreSize, RHS: TypeSize::getFixed(ExactSize: 16)) &&
1837 static_cast<int64_t>(StoreSize.getFixedValue()) == Scale)
1838 return 0;
1839 }
1840 return 1;
1841 }
1842 return BaseT::getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg, Scale,
1843 AddrSpace);
1844}
1845
1846bool GCNTTIImpl::isLSRCostLess(const TTI::LSRCost &A,
1847 const TTI::LSRCost &B) const {
1848 // Favor lower per-iteration work over preheader/setup costs.
1849 // AMDGPU lacks rich addressing modes, so ScaleCost is folded into the
1850 // effective instruction count (base+scale*index requires a separate ADD).
1851 unsigned EffInsnsA = A.Insns + A.ScaleCost;
1852 unsigned EffInsnsB = B.Insns + B.ScaleCost;
1853
1854 return std::tie(args&: EffInsnsA, args: A.NumIVMuls, args: A.AddRecCost, args: A.NumBaseAdds,
1855 args: A.SetupCost, args: A.ImmCost, args: A.NumRegs) <
1856 std::tie(args&: EffInsnsB, args: B.NumIVMuls, args: B.AddRecCost, args: B.NumBaseAdds,
1857 args: B.SetupCost, args: B.ImmCost, args: B.NumRegs);
1858}
1859
// Register count is the last tiebreaker in isLSRCostLess above, so report
// that it is not the major cost; keep the two queries consistent.
bool GCNTTIImpl::isNumRegsMajorCostOfLSR() const {
  // isLSRCostLess de-prioritizes register count; keep consistent.
  return false;
}
1864
// Tell LSR to discard its solution when it is not strictly better than the
// baseline formula set.
bool GCNTTIImpl::shouldDropLSRSolutionIfLessProfitable() const {
  // Prefer the baseline when LSR cannot clearly reduce per-iteration work.
  return true;
}
1869
1870bool GCNTTIImpl::isUniform(const Instruction *I,
1871 const SmallBitVector &UniformArgs) const {
1872 const IntrinsicInst *Intrinsic = cast<IntrinsicInst>(Val: I);
1873 switch (Intrinsic->getIntrinsicID()) {
1874 case Intrinsic::amdgcn_wave_shuffle:
1875 // wave_shuffle(Value, Index): result is uniform when either Value or Index
1876 // is uniform.
1877 return UniformArgs[0] || UniformArgs[1];
1878 default:
1879 llvm_unreachable("unexpected intrinsic in isUniform");
1880 }
1881}
1882