1//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUTargetTransformInfo.h"
18#include "AMDGPUTargetMachine.h"
19#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20#include "SIModeRegisterDefaults.h"
21#include "llvm/Analysis/InlineCost.h"
22#include "llvm/Analysis/LoopInfo.h"
23#include "llvm/Analysis/ValueTracking.h"
24#include "llvm/CodeGen/Analysis.h"
25#include "llvm/IR/Function.h"
26#include "llvm/IR/IRBuilder.h"
27#include "llvm/IR/IntrinsicsAMDGPU.h"
28#include "llvm/IR/PatternMatch.h"
29#include "llvm/Support/KnownBits.h"
30#include "llvm/Transforms/Utils/UnrollLoop.h"
31#include <optional>
32
33using namespace llvm;
34
35#define DEBUG_TYPE "AMDGPUtti"
36
37static cl::opt<unsigned> UnrollThresholdPrivate(
38 "amdgpu-unroll-threshold-private",
39 cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
40 cl::init(Val: 2700), cl::Hidden);
41
42static cl::opt<unsigned> UnrollThresholdLocal(
43 "amdgpu-unroll-threshold-local",
44 cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
45 cl::init(Val: 1000), cl::Hidden);
46
47static cl::opt<unsigned> UnrollThresholdIf(
48 "amdgpu-unroll-threshold-if",
49 cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
50 cl::init(Val: 200), cl::Hidden);
51
52static cl::opt<bool> UnrollRuntimeLocal(
53 "amdgpu-unroll-runtime-local",
54 cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
55 cl::init(Val: true), cl::Hidden);
56
57static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
58 "amdgpu-unroll-max-block-to-analyze",
59 cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
60 cl::init(Val: 32), cl::Hidden);
61
62static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
63 cl::Hidden, cl::init(Val: 4000),
64 cl::desc("Cost of alloca argument"));
65
66// If the amount of scratch memory to eliminate exceeds our ability to allocate
67// it into registers we gain nothing by aggressively inlining functions for that
68// heuristic.
69static cl::opt<unsigned>
70 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
71 cl::init(Val: 256),
72 cl::desc("Maximum alloca size to use for inline cost"));
73
74// Inliner constraint to achieve reasonable compilation time.
75static cl::opt<size_t> InlineMaxBB(
76 "amdgpu-inline-max-bb", cl::Hidden, cl::init(Val: 1100),
77 cl::desc("Maximum number of BBs allowed in a function after inlining"
78 " (compile time constraint)"));
79
80// This default unroll factor is based on microbenchmarks on gfx1030.
81static cl::opt<unsigned> MemcpyLoopUnroll(
82 "amdgpu-memcpy-loop-unroll",
83 cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
84 "operations when lowering statically-sized memcpy, memmove, or"
85 "memset as a loop"),
86 cl::init(Val: 16), cl::Hidden);
87
88static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
89 unsigned Depth = 0) {
90 const Instruction *I = dyn_cast<Instruction>(Val: Cond);
91 if (!I)
92 return false;
93
94 for (const Value *V : I->operand_values()) {
95 if (!L->contains(Inst: I))
96 continue;
97 if (const PHINode *PHI = dyn_cast<PHINode>(Val: V)) {
98 if (llvm::none_of(Range: L->getSubLoops(), P: [PHI](const Loop* SubLoop) {
99 return SubLoop->contains(Inst: PHI); }))
100 return true;
101 } else if (Depth < 10 && dependsOnLocalPhi(L, Cond: V, Depth: Depth+1))
102 return true;
103 }
104 return false;
105}
106
// Construct the common AMDGPU TTI implementation for function \p F, caching
// the target triple plus the function's subtarget and its target-lowering
// info for use by the queries below.
AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getDataLayout()),
      TargetTriple(TM->getTargetTriple()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()) {}
112
113void AMDGPUTTIImpl::getUnrollingPreferences(
114 Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
115 OptimizationRemarkEmitter *ORE) const {
116 const Function &F = *L->getHeader()->getParent();
117 UP.Threshold =
118 F.getFnAttributeAsParsedInteger(Kind: "amdgpu-unroll-threshold", Default: 300);
119 UP.MaxCount = std::numeric_limits<unsigned>::max();
120 UP.Partial = true;
121
122 // Conditional branch in a loop back edge needs 3 additional exec
123 // manipulations in average.
124 UP.BEInsns += 3;
125
126 // We want to run unroll even for the loops which have been vectorized.
127 UP.UnrollVectorizedLoop = true;
128
129 // TODO: Do we want runtime unrolling?
130
131 // Maximum alloca size than can fit registers. Reserve 16 registers.
132 const unsigned MaxAlloca = (256 - 16) * 4;
133 unsigned ThresholdPrivate = UnrollThresholdPrivate;
134 unsigned ThresholdLocal = UnrollThresholdLocal;
135
136 // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
137 // provided threshold value as the default for Threshold
138 if (MDNode *LoopUnrollThreshold =
139 findOptionMDForLoop(TheLoop: L, Name: "amdgpu.loop.unroll.threshold")) {
140 if (LoopUnrollThreshold->getNumOperands() == 2) {
141 ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
142 MD: LoopUnrollThreshold->getOperand(I: 1));
143 if (MetaThresholdValue) {
144 // We will also use the supplied value for PartialThreshold for now.
145 // We may introduce additional metadata if it becomes necessary in the
146 // future.
147 UP.Threshold = MetaThresholdValue->getSExtValue();
148 UP.PartialThreshold = UP.Threshold;
149 ThresholdPrivate = std::min(a: ThresholdPrivate, b: UP.Threshold);
150 ThresholdLocal = std::min(a: ThresholdLocal, b: UP.Threshold);
151 }
152 }
153 }
154
155 unsigned MaxBoost = std::max(a: ThresholdPrivate, b: ThresholdLocal);
156 for (const BasicBlock *BB : L->getBlocks()) {
157 const DataLayout &DL = BB->getDataLayout();
158 unsigned LocalGEPsSeen = 0;
159
160 if (llvm::any_of(Range: L->getSubLoops(), P: [BB](const Loop* SubLoop) {
161 return SubLoop->contains(BB); }))
162 continue; // Block belongs to an inner loop.
163
164 for (const Instruction &I : *BB) {
165 // Unroll a loop which contains an "if" statement whose condition
166 // defined by a PHI belonging to the loop. This may help to eliminate
167 // if region and potentially even PHI itself, saving on both divergence
168 // and registers used for the PHI.
169 // Add a small bonus for each of such "if" statements.
170 if (const CondBrInst *Br = dyn_cast<CondBrInst>(Val: &I)) {
171 if (UP.Threshold < MaxBoost) {
172 BasicBlock *Succ0 = Br->getSuccessor(i: 0);
173 BasicBlock *Succ1 = Br->getSuccessor(i: 1);
174 if ((L->contains(BB: Succ0) && L->isLoopExiting(BB: Succ0)) ||
175 (L->contains(BB: Succ1) && L->isLoopExiting(BB: Succ1)))
176 continue;
177 if (dependsOnLocalPhi(L, Cond: Br->getCondition())) {
178 UP.Threshold += UnrollThresholdIf;
179 LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
180 << " for loop:\n"
181 << *L << " due to " << *Br << '\n');
182 if (UP.Threshold >= MaxBoost)
183 return;
184 }
185 }
186 continue;
187 }
188
189 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: &I);
190 if (!GEP)
191 continue;
192
193 unsigned AS = GEP->getAddressSpace();
194 unsigned Threshold = 0;
195 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
196 Threshold = ThresholdPrivate;
197 else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
198 Threshold = ThresholdLocal;
199 else
200 continue;
201
202 if (UP.Threshold >= Threshold)
203 continue;
204
205 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
206 const Value *Ptr = GEP->getPointerOperand();
207 const AllocaInst *Alloca =
208 dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: Ptr));
209 if (!Alloca || !Alloca->isStaticAlloca())
210 continue;
211 auto AllocaSize = Alloca->getAllocationSize(DL);
212 if (!AllocaSize || AllocaSize->getFixedValue() > MaxAlloca)
213 continue;
214 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
215 AS == AMDGPUAS::REGION_ADDRESS) {
216 LocalGEPsSeen++;
217 // Inhibit unroll for local memory if we have seen addressing not to
218 // a variable, most likely we will be unable to combine it.
219 // Do not unroll too deep inner loops for local memory to give a chance
220 // to unroll an outer loop for a more important reason.
221 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
222 (!isa<GlobalVariable>(Val: GEP->getPointerOperand()) &&
223 !isa<Argument>(Val: GEP->getPointerOperand())))
224 continue;
225 LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
226 << *L << " due to LDS use.\n");
227 UP.Runtime = UnrollRuntimeLocal;
228 }
229
230 // Check if GEP depends on a value defined by this loop itself.
231 bool HasLoopDef = false;
232 for (const Value *Op : GEP->operands()) {
233 const Instruction *Inst = dyn_cast<Instruction>(Val: Op);
234 if (!Inst || L->isLoopInvariant(V: Op))
235 continue;
236
237 if (llvm::any_of(Range: L->getSubLoops(), P: [Inst](const Loop* SubLoop) {
238 return SubLoop->contains(Inst); }))
239 continue;
240 HasLoopDef = true;
241 break;
242 }
243 if (!HasLoopDef)
244 continue;
245
246 // We want to do whatever we can to limit the number of alloca
247 // instructions that make it through to the code generator. allocas
248 // require us to use indirect addressing, which is slow and prone to
249 // compiler bugs. If this loop does an address calculation on an
250 // alloca ptr, then we want to use a higher than normal loop unroll
251 // threshold. This will give SROA a better chance to eliminate these
252 // allocas.
253 //
254 // We also want to have more unrolling for local memory to let ds
255 // instructions with different offsets combine.
256 //
257 // Don't use the maximum allowed value here as it will make some
258 // programs way too big.
259 UP.Threshold = Threshold;
260 LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
261 << " for loop:\n"
262 << *L << " due to " << *GEP << '\n');
263 if (UP.Threshold >= MaxBoost)
264 return;
265 }
266
267 // If we got a GEP in a small BB from inner loop then increase max trip
268 // count to analyze for better estimation cost in unroll
269 if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
270 UP.MaxIterationsCountToAnalyze = 32;
271 }
272 // If a user provided an explicit unroll pragma (with or without count),
273 // override expensive trip count checks
274 UnrollPragmaInfo PInfo(L);
275 if (PInfo.PragmaEnableUnroll || PInfo.PragmaCount > 0)
276 UP.AllowExpensiveTripCount = true;
277}
278
// Peeling preferences: AMDGPU adds no target-specific peeling heuristics and
// defers entirely to the generic implementation.
void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) const {
  BaseT::getPeelingPreferences(L, SE, PP);
}
283
284uint64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
285 return 1024;
286}
287
// Subtarget features that may safely differ between caller and callee when
// deciding whether inlining is legal; mismatches on these do not block
// inlining.
const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
    AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureUseFlatForGlobal,
    AMDGPU::FeatureUnalignedScratchAccess, AMDGPU::FeatureUnalignedAccessMode,

    AMDGPU::FeatureAutoWaitcntBeforeBarrier,

    // Property of the kernel/environment which can't actually differ.
    AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
    AMDGPU::FeatureTrapHandler,

    // The default assumption needs to be ecc is enabled, but no directly
    // exposed operations depend on it, so it can be safely inlined.
    AMDGPU::FeatureSRAMECC,

    // Perf-tuning features
    AMDGPU::FeatureFastFMAF32, AMDGPU::FeatureHalfRate64Ops};
306
// Construct the GCN TTI implementation for function \p F. Caches the
// subtarget/lowering info, records whether \p F uses a graphics calling
// convention, and snapshots the function's FP denormal mode defaults (FP32
// and FP64/FP16) — both flags feed the arithmetic cost model below.
GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getDataLayout()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()), CommonTTI(TM, F),
      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
  SIModeRegisterDefaults Mode(F, *ST);
  // "Denormals enabled" here means anything other than preserve-sign flushing.
  HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
  HasFP64FP16Denormals =
      Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
}
317
318bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
319 return !F || !ST->isSingleLaneExecution(Kernel: *F);
320}
321
322unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
323 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
324 // registers. See getRegisterClassForType for the implementation.
325 // In this case vector registers are not vector in terms of
326 // VGPRs, but those which can hold multiple values.
327
328 // This is really the number of registers to fill when vectorizing /
329 // interleaving loops, so we lie to avoid trying to use all registers.
330 return 4;
331}
332
333TypeSize
334GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
335 switch (K) {
336 case TargetTransformInfo::RGK_Scalar:
337 return TypeSize::getFixed(ExactSize: 32);
338 case TargetTransformInfo::RGK_FixedWidthVector:
339 return TypeSize::getFixed(ExactSize: ST->hasPackedFP32Ops() ? 64 : 32);
340 case TargetTransformInfo::RGK_ScalableVector:
341 return TypeSize::getScalable(MinimumSize: 0);
342 }
343 llvm_unreachable("Unsupported register kind");
344}
345
346unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
347 return 32;
348}
349
350unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
351 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
352 return 32 * 4 / ElemWidth;
353 // For a given width return the max 0number of elements that can be combined
354 // into a wider bit value:
355 return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
356 : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
357 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
358 : 1;
359}
360
361unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
362 unsigned ChainSizeInBytes,
363 VectorType *VecTy) const {
364 unsigned VecRegBitWidth = VF * LoadSize;
365 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
366 // TODO: Support element-size less than 32bit?
367 return 128 / LoadSize;
368
369 return VF;
370}
371
372unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
373 unsigned ChainSizeInBytes,
374 VectorType *VecTy) const {
375 unsigned VecRegBitWidth = VF * StoreSize;
376 if (VecRegBitWidth > 128)
377 return 128 / StoreSize;
378
379 return VF;
380}
381
382unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
383 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
384 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
385 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
386 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
387 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
388 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
389 return 512;
390 }
391
392 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
393 return 8 * ST->getMaxPrivateElementSize();
394
395 // Common to flat, global, local and region. Assume for unknown addrspace.
396 return 128;
397}
398
399bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
400 Align Alignment,
401 unsigned AddrSpace) const {
402 // We allow vectorization of flat stores, even though we may need to decompose
403 // them later if they may access private memory. We don't have enough context
404 // here, and legalization can handle it.
405 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
406 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
407 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
408 }
409 return true;
410}
411
// Load chains follow the same legality rules as generic memory chains.
bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
417
// Store chains follow the same legality rules as generic memory chains.
bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
423
424uint64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
425 return 1024;
426}
427
428Type *GCNTTIImpl::getMemcpyLoopLoweringType(
429 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
430 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
431 std::optional<uint32_t> AtomicElementSize) const {
432
433 if (AtomicElementSize)
434 return Type::getIntNTy(C&: Context, N: *AtomicElementSize * 8);
435
436 // 16-byte accesses achieve the highest copy throughput.
437 // If the operation has a fixed known length that is large enough, it is
438 // worthwhile to return an even wider type and let legalization lower it into
439 // multiple accesses, effectively unrolling the memcpy loop.
440 // We also rely on legalization to decompose into smaller accesses for
441 // subtargets and address spaces where it is necessary.
442 //
443 // Don't unroll if Length is not a constant, since unrolling leads to worse
444 // performance for length values that are smaller or slightly larger than the
445 // total size of the type returned here. Mitigating that would require a more
446 // complex lowering for variable-length memcpy and memmove.
447 unsigned I32EltsInVector = 4;
448 if (MemcpyLoopUnroll > 0 && isa<ConstantInt>(Val: Length))
449 return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context),
450 NumElts: MemcpyLoopUnroll * I32EltsInVector);
451
452 return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: I32EltsInVector);
453}
454
455void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
456 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
457 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
458 Align SrcAlign, Align DestAlign,
459 std::optional<uint32_t> AtomicCpySize) const {
460
461 if (AtomicCpySize)
462 BaseT::getMemcpyLoopResidualLoweringType(
463 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
464 DestAlign, AtomicCpySize);
465
466 Type *I32x4Ty = FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: 4);
467 while (RemainingBytes >= 16) {
468 OpsOut.push_back(Elt: I32x4Ty);
469 RemainingBytes -= 16;
470 }
471
472 Type *I64Ty = Type::getInt64Ty(C&: Context);
473 while (RemainingBytes >= 8) {
474 OpsOut.push_back(Elt: I64Ty);
475 RemainingBytes -= 8;
476 }
477
478 Type *I32Ty = Type::getInt32Ty(C&: Context);
479 while (RemainingBytes >= 4) {
480 OpsOut.push_back(Elt: I32Ty);
481 RemainingBytes -= 4;
482 }
483
484 Type *I16Ty = Type::getInt16Ty(C&: Context);
485 while (RemainingBytes >= 2) {
486 OpsOut.push_back(Elt: I16Ty);
487 RemainingBytes -= 2;
488 }
489
490 Type *I8Ty = Type::getInt8Ty(C&: Context);
491 while (RemainingBytes) {
492 OpsOut.push_back(Elt: I8Ty);
493 --RemainingBytes;
494 }
495}
496
497unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
498 // Disable unrolling if the loop is not vectorized.
499 // TODO: Enable this again.
500 if (VF.isScalar())
501 return 1;
502
503 return 8;
504}
505
506bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
507 MemIntrinsicInfo &Info) const {
508 switch (Inst->getIntrinsicID()) {
509 case Intrinsic::amdgcn_ds_ordered_add:
510 case Intrinsic::amdgcn_ds_ordered_swap: {
511 auto *Ordering = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: 2));
512 auto *Volatile = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: 4));
513 if (!Ordering || !Volatile)
514 return false; // Invalid.
515
516 unsigned OrderingVal = Ordering->getZExtValue();
517 if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
518 return false;
519
520 Info.PtrVal = Inst->getArgOperand(i: 0);
521 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
522 Info.ReadMem = true;
523 Info.WriteMem = true;
524 Info.IsVolatile = !Volatile->isZero();
525 return true;
526 }
527 default:
528 return false;
529 }
530}
531
// Cost model for scalar/vector arithmetic on GCN. Costs are expressed as
// instruction-rate units (full/half/quarter/64-bit rate helpers) scaled by
// the legalization split factor (LT.first) and the number of legal-type
// elements (NElts). Packed 16-bit (and where supported, packed f32/bf16)
// operations halve NElts since two elements share one instruction.
InstructionCost GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but the legal types, we
  // need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    // 64-bit shifts are expanded into multi-instruction sequences.
    if (SLT == MVT::i64)
      return get64BitInstrCost(CostKind) * LT.first * NElts;

    // Packed 16-bit shifts handle two elements per instruction.
    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
    if (SLT == MVT::i64) {
      // 64-bit multiply expands to 4 quarter-rate multiplies plus 4
      // full-rate add/carry instructions.
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FMUL:
    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
    // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
    // fused operation.
    if (CxtI && CxtI->hasOneUse())
      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
            return TargetTransformInfo::TCC_Free;
          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
            return TargetTransformInfo::TCC_Free;

          // Estimate all types may be fused with contract/unsafe flags
          const TargetOptions &Options = TLI->getTargetMachine().Options;
          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
            return TargetTransformInfo::TCC_Free;
        }
      }
    [[fallthrough]];
  case ISD::FADD:
  case ISD::FSUB:
    // Packed f32/bf16 halve the element count where supported.
    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
      NElts = (NElts + 1) / 2;
    if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
      NElts = (NElts + 1) / 2;
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost(CostKind);

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 7 * get64BitInstrCost(CostKind) +
                 getQuarterRateInstrCost(CostKind) +
                 3 * getHalfRateInstrCost(CostKind);
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    // Reciprocal (1.0 / x) is cheaper: a single rcp where legal.
    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost =
          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {
      // Fast unsafe fdiv lowering:
      // f32 rcp
      // f32 fmul
      int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      // 4 more v_cvt_* insts without f16 insts support
      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
                 1 * getQuarterRateInstrCost(CostKind);

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  case ISD::FNEG:
    // Use the backend' estimation. If fneg is not free each element will cost
    // one additional instruction.
    return TLI->isFNegFree(SLT) ? 0 : NElts;
  default:
    break;
  }

  // Anything not modeled above defers to the generic implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtI);
}
689
690// Return true if there's a potential benefit from using v2f16/v2i16
691// instructions for an intrinsic, even if it requires nontrivial legalization.
692static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
693 switch (ID) {
694 case Intrinsic::fma:
695 case Intrinsic::fmuladd:
696 case Intrinsic::copysign:
697 case Intrinsic::minimumnum:
698 case Intrinsic::maximumnum:
699 case Intrinsic::canonicalize:
700 // There's a small benefit to using vector ops in the legalized code.
701 case Intrinsic::round:
702 case Intrinsic::uadd_sat:
703 case Intrinsic::usub_sat:
704 case Intrinsic::sadd_sat:
705 case Intrinsic::ssub_sat:
706 case Intrinsic::abs:
707 return true;
708 default:
709 return false;
710 }
711}
712
713InstructionCost
714GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
715 TTI::TargetCostKind CostKind) const {
716 switch (ICA.getID()) {
717 case Intrinsic::fabs:
718 // Free source modifier in the common case.
719 return 0;
720 case Intrinsic::amdgcn_workitem_id_x:
721 case Intrinsic::amdgcn_workitem_id_y:
722 case Intrinsic::amdgcn_workitem_id_z:
723 // TODO: If hasPackedTID, or if the calling context is not an entry point
724 // there may be a bit instruction.
725 return 0;
726 case Intrinsic::amdgcn_workgroup_id_x:
727 case Intrinsic::amdgcn_workgroup_id_y:
728 case Intrinsic::amdgcn_workgroup_id_z:
729 case Intrinsic::amdgcn_lds_kernel_id:
730 case Intrinsic::amdgcn_dispatch_ptr:
731 case Intrinsic::amdgcn_dispatch_id:
732 case Intrinsic::amdgcn_implicitarg_ptr:
733 case Intrinsic::amdgcn_queue_ptr:
734 // Read from an argument register.
735 return 0;
736 default:
737 break;
738 }
739
740 Type *RetTy = ICA.getReturnType();
741
742 Intrinsic::ID IID = ICA.getID();
743 switch (IID) {
744 case Intrinsic::exp:
745 case Intrinsic::exp2:
746 case Intrinsic::exp10: {
747 // Legalize the type.
748 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);
749 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
750 unsigned NElts =
751 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
752
753 if (SLT == MVT::f64) {
754 unsigned NumOps = 20;
755 if (IID == Intrinsic::exp)
756 ++NumOps;
757 else if (IID == Intrinsic::exp10)
758 NumOps += 3;
759
760 return LT.first * NElts * NumOps * get64BitInstrCost(CostKind);
761 }
762
763 if (SLT == MVT::f32) {
764 unsigned NumFullRateOps = 0;
765 // v_exp_f32 (quarter rate).
766 unsigned NumQuarterRateOps = 1;
767
768 if (!ICA.getFlags().approxFunc() && IID != Intrinsic::exp2) {
769 // Non-AFN exp/exp10: range reduction + v_exp_f32 + ldexp +
770 // overflow/underflow checks (lowerFEXP). Denorm is also handled.
771 // FMA preamble: ~13 full-rate ops; non-FMA: ~17.
772 NumFullRateOps = ST->hasFastFMAF32() ? 13 : 17;
773 } else {
774 if (IID == Intrinsic::exp) {
775 // lowerFEXPUnsafe: fmul (base conversion) + v_exp_f32.
776 NumFullRateOps = 1;
777 } else if (IID == Intrinsic::exp10) {
778 // lowerFEXP10Unsafe: 3 fmul + 2 v_exp_f32 (double-exp2).
779 NumFullRateOps = 3;
780 NumQuarterRateOps = 2;
781 }
782 // Denorm scaling adds setcc + select + fadd + select + fmul.
783 if (HasFP32Denormals)
784 NumFullRateOps += 5;
785 }
786
787 InstructionCost Cost =
788 NumFullRateOps * getFullRateInstrCost() +
789 NumQuarterRateOps * getQuarterRateInstrCost(CostKind);
790 return LT.first * NElts * Cost;
791 }
792
793 break;
794 }
795 default:
796 break;
797 }
798
799 if (!intrinsicHasPackedVectorBenefit(ID: ICA.getID()))
800 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
801
802 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);
803 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
804 unsigned NElts = LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
805
806 if ((ST->hasVOP3PInsts() &&
807 (SLT == MVT::f16 || SLT == MVT::i16 ||
808 (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
809 (ST->hasPackedFP32Ops() && SLT == MVT::f32))
810 NElts = (NElts + 1) / 2;
811
812 // TODO: Get more refined intrinsic costs?
813 unsigned InstRate = getQuarterRateInstrCost(CostKind);
814
815 switch (ICA.getID()) {
816 case Intrinsic::fma:
817 case Intrinsic::fmuladd:
818 if (SLT == MVT::f64) {
819 InstRate = get64BitInstrCost(CostKind);
820 break;
821 }
822
823 if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
824 InstRate = getFullRateInstrCost();
825 else {
826 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
827 : getQuarterRateInstrCost(CostKind);
828 }
829 break;
830 case Intrinsic::copysign:
831 return NElts * getFullRateInstrCost();
832 case Intrinsic::minimumnum:
833 case Intrinsic::maximumnum: {
834 // Instruction + 2 canonicalizes. For cases that need type promotion, we the
835 // promotion takes the place of the canonicalize.
836 unsigned NumOps = 3;
837 if (const IntrinsicInst *II = ICA.getInst()) {
838 // Directly legal with ieee=0
839 // TODO: Not directly legal with strictfp
840 if (fpenvIEEEMode(I: *II) == KnownIEEEMode::Off)
841 NumOps = 1;
842 }
843
844 unsigned BaseRate =
845 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
846 InstRate = BaseRate * NumOps;
847 break;
848 }
849 case Intrinsic::canonicalize: {
850 InstRate =
851 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
852 break;
853 }
854 case Intrinsic::uadd_sat:
855 case Intrinsic::usub_sat:
856 case Intrinsic::sadd_sat:
857 case Intrinsic::ssub_sat: {
858 if (SLT == MVT::i16 || SLT == MVT::i32)
859 InstRate = getFullRateInstrCost();
860
861 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
862 if (any_of(Range: ValidSatTys, P: equal_to(Arg&: LT.second)))
863 NElts = 1;
864 break;
865 }
866 case Intrinsic::abs:
867 // Expansion takes 2 instructions for VALU
868 if (SLT == MVT::i16 || SLT == MVT::i32)
869 InstRate = 2 * getFullRateInstrCost();
870 break;
871 default:
872 break;
873 }
874
875 return LT.first * NElts * InstRate;
876}
877
878InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
879 TTI::TargetCostKind CostKind,
880 const Instruction *I) const {
881 assert((I == nullptr || I->getOpcode() == Opcode) &&
882 "Opcode should reflect passed instruction.");
883 const bool SCost =
884 (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
885 const int CBrCost = SCost ? 5 : 7;
886 switch (Opcode) {
887 case Instruction::UncondBr:
888 // Branch instruction takes about 4 slots on gfx900.
889 return SCost ? 1 : 4;
890 case Instruction::CondBr:
891 // Suppose conditional branch takes additional 3 exec manipulations
892 // instructions in average.
893 return CBrCost;
894 case Instruction::Switch: {
895 const auto *SI = dyn_cast_or_null<SwitchInst>(Val: I);
896 // Each case (including default) takes 1 cmp + 1 cbr instructions in
897 // average.
898 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
899 }
900 case Instruction::Ret:
901 return SCost ? 1 : 10;
902 }
903 return BaseT::getCFInstrCost(Opcode, CostKind, I);
904}
905
906InstructionCost
907GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
908 std::optional<FastMathFlags> FMF,
909 TTI::TargetCostKind CostKind) const {
910 if (TTI::requiresOrderedReduction(FMF))
911 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
912
913 EVT OrigTy = TLI->getValueType(DL, Ty);
914
915 // Computes cost on targets that have packed math instructions(which support
916 // 16-bit types only).
917 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
918 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
919
920 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
921 return LT.first * getFullRateInstrCost();
922}
923
924InstructionCost
925GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
926 FastMathFlags FMF,
927 TTI::TargetCostKind CostKind) const {
928 EVT OrigTy = TLI->getValueType(DL, Ty);
929
930 // Computes cost on targets that have packed math instructions(which support
931 // 16-bit types only).
932 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
933 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
934
935 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
936 return LT.first * getHalfRateInstrCost(CostKind);
937}
938
939InstructionCost GCNTTIImpl::getVectorInstrCost(
940 unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
941 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
942 switch (Opcode) {
943 case Instruction::ExtractElement:
944 case Instruction::InsertElement: {
945 unsigned EltSize
946 = DL.getTypeSizeInBits(Ty: cast<VectorType>(Val: ValTy)->getElementType());
947 if (EltSize < 32) {
948 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
949 return 0;
950 return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1,
951 VIC);
952 }
953
954 // Extracts are just reads of a subregister, so are free. Inserts are
955 // considered free because we don't want to have any cost for scalarizing
956 // operations, and we don't have to copy into a different register class.
957
958 // Dynamic indexing isn't free and is best avoided.
959 return Index == ~0u ? 2 : 0;
960 }
961 default:
962 return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1,
963 VIC);
964 }
965}
966
967/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
968/// this is analyzing the collective result of all output registers. Otherwise,
969/// this is only querying a specific result index if this returns multiple
970/// registers in a struct.
971bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
972 const CallInst *CI, ArrayRef<unsigned> Indices) const {
973 // TODO: Handle complex extract indices
974 if (Indices.size() > 1)
975 return true;
976
977 const DataLayout &DL = CI->getDataLayout();
978 const SIRegisterInfo *TRI = ST->getRegisterInfo();
979 TargetLowering::AsmOperandInfoVector TargetConstraints =
980 TLI->ParseConstraints(DL, TRI: ST->getRegisterInfo(), Call: *CI);
981
982 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
983
984 int OutputIdx = 0;
985 for (auto &TC : TargetConstraints) {
986 if (TC.Type != InlineAsm::isOutput)
987 continue;
988
989 // Skip outputs we don't care about.
990 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
991 continue;
992
993 TLI->ComputeConstraintToUse(OpInfo&: TC, Op: SDValue());
994
995 const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
996 TRI, Constraint: TC.ConstraintCode, VT: TC.ConstraintVT).second;
997
998 // For AGPR constraints null is returned on subtargets without AGPRs, so
999 // assume divergent for null.
1000 if (!RC || !TRI->isSGPRClass(RC))
1001 return true;
1002 }
1003
1004 return false;
1005}
1006
1007bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
1008 const IntrinsicInst *ReadReg) const {
1009 Metadata *MD =
1010 cast<MetadataAsValue>(Val: ReadReg->getArgOperand(i: 0))->getMetadata();
1011 StringRef RegName =
1012 cast<MDString>(Val: cast<MDNode>(Val: MD)->getOperand(I: 0))->getString();
1013
1014 // Special case registers that look like VCC.
1015 MVT VT = MVT::getVT(Ty: ReadReg->getType());
1016 if (VT == MVT::i1)
1017 return true;
1018
1019 // Special case scalar registers that start with 'v'.
1020 if (RegName.starts_with(Prefix: "vcc") || RegName.empty())
1021 return false;
1022
1023 // VGPR or AGPR is divergent. There aren't any specially named vector
1024 // registers.
1025 return RegName[0] == 'v' || RegName[0] == 'a';
1026}
1027
1028/// \returns true if the result of the value could potentially be
1029/// different across workitems in a wavefront.
1030bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
1031 if (const Argument *A = dyn_cast<Argument>(Val: V))
1032 return !AMDGPU::isArgPassedInSGPR(Arg: A);
1033
1034 // Loads from the private and flat address spaces are divergent, because
1035 // threads can execute the load instruction with the same inputs and get
1036 // different results.
1037 //
1038 // All other loads are not divergent, because if threads issue loads with the
1039 // same arguments, they will always get the same result.
1040 if (const LoadInst *Load = dyn_cast<LoadInst>(Val: V))
1041 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
1042 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
1043
1044 // Atomics are divergent because they are executed sequentially: when an
1045 // atomic operation refers to the same address in each thread, then each
1046 // thread after the first sees the value written by the previous thread as
1047 // original value.
1048 if (isa<AtomicRMWInst, AtomicCmpXchgInst>(Val: V))
1049 return true;
1050
1051 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V)) {
1052 Intrinsic::ID IID = Intrinsic->getIntrinsicID();
1053 switch (IID) {
1054 case Intrinsic::read_register:
1055 return isReadRegisterSourceOfDivergence(ReadReg: Intrinsic);
1056 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1057 unsigned SrcAS =
1058 Intrinsic->getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace();
1059 unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
1060 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
1061 DstAS == AMDGPUAS::FLAT_ADDRESS &&
1062 ST->hasGloballyAddressableScratch();
1063 }
1064 case Intrinsic::amdgcn_workitem_id_y:
1065 case Intrinsic::amdgcn_workitem_id_z: {
1066 const Function *F = Intrinsic->getFunction();
1067 bool HasUniformYZ =
1068 ST->hasWavefrontsEvenlySplittingXDim(F: *F, /*RequitezUniformYZ=*/REquiresUniformYZ: true);
1069 std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
1070 F: *F, Dim: IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
1071 return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
1072 }
1073 default:
1074 return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: IID);
1075 }
1076 }
1077
1078 // Assume all function calls are a source of divergence.
1079 if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
1080 if (CI->isInlineAsm())
1081 return isInlineAsmSourceOfDivergence(CI);
1082 return true;
1083 }
1084
1085 // Assume all function calls are a source of divergence.
1086 if (isa<InvokeInst>(Val: V))
1087 return true;
1088
1089 // If the target supports globally addressable scratch, the mapping from
1090 // scratch memory to the flat aperture changes therefore an address space cast
1091 // is no longer uniform.
1092 if (auto *CastI = dyn_cast<AddrSpaceCastInst>(Val: V)) {
1093 return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
1094 CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
1095 ST->hasGloballyAddressableScratch();
1096 }
1097
1098 return false;
1099}
1100
// \returns true when \p V is provably identical across all lanes of a wave.
bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  // Intrinsics the target knows to produce wave-uniform results.
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V))
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID: Intrinsic->getIntrinsicID());

  // Inline asm writing only SGPR outputs is uniform; any other call is not
  // known to be.
  if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
    if (CI->isInlineAsm())
      return !isInlineAsmSourceOfDivergence(CI);
    return false;
  }

  // In most cases TID / wavefrontsize is uniform.
  //
  // However, if a kernel has uneven dimensions we can have a value of
  // workitem-id-x divided by the wavefrontsize non-uniform. For example
  // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
  // packed into a same wave which gives 1 and 0 after the division by 64
  // respectively.
  //
  // The X dimension doesn't reset within a wave if either both the Y
  // and Z dimensions are of length 1, or if the X dimension's required
  // size is a power of 2. Note, however, if the X dimension's maximum
  // size is a power of 2 < the wavefront size, division by the wavefront
  // size is guaranteed to yield 0, so this is also a no-reset case.
  bool XDimDoesntResetWithinWaves = false;
  if (auto *I = dyn_cast<Instruction>(Val: V)) {
    const Function *F = I->getFunction();
    XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(F: *F);
  }
  using namespace llvm::PatternMatch;
  uint64_t C;
  // workitem-id-x >> C is uniform when the shift covers at least
  // log2(wavefront size) bits and X doesn't reset mid-wave.
  if (match(V, P: m_LShr(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       R: m_ConstantInt(V&: C))) ||
      match(V, P: m_AShr(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       R: m_ConstantInt(V&: C)))) {
    return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
  }

  // Likewise for a mask known to clear at least the low
  // log2(wavefront size) bits.
  Value *Mask;
  if (match(V, P: m_c_And(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                        R: m_Value(V&: Mask)))) {
    return computeKnownBits(V: Mask, DL).countMinTrailingZeros() >=
               ST->getWavefrontSizeLog2() &&
           XDimDoesntResetWithinWaves;
  }

  // Remaining cases: extractvalue from amdgcn.if/else or from inline asm.
  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(Val: V);
  if (!ExtValue)
    return false;

  const CallInst *CI = dyn_cast<CallInst>(Val: ExtValue->getOperand(i_nocapture: 0));
  if (!CI)
    return false;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: CI)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else: {
      // Extracting element 1 of the amdgcn.if/else result is treated as
      // uniform.
      ArrayRef<unsigned> Indices = ExtValue->getIndices();
      return Indices.size() == 1 && Indices[0] == 1;
    }
    }
  }

  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
  // divergent for the overall struct return. We need to override it in the
  // case we're extracting an SGPR component here.
  if (CI->isInlineAsm())
    return !isInlineAsmSourceOfDivergence(CI, Indices: ExtValue->getIndices());

  return false;
}
1174
1175bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1176 Intrinsic::ID IID) const {
1177 switch (IID) {
1178 case Intrinsic::amdgcn_is_shared:
1179 case Intrinsic::amdgcn_is_private:
1180 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1181 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1182 case Intrinsic::amdgcn_load_to_lds:
1183 case Intrinsic::amdgcn_make_buffer_rsrc:
1184 OpIndexes.push_back(Elt: 0);
1185 return true;
1186 default:
1187 return false;
1188 }
1189}
1190
1191Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
1192 Value *OldV,
1193 Value *NewV) const {
1194 auto IntrID = II->getIntrinsicID();
1195 switch (IntrID) {
1196 case Intrinsic::amdgcn_is_shared:
1197 case Intrinsic::amdgcn_is_private: {
1198 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1199 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
1200 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1201 LLVMContext &Ctx = NewV->getType()->getContext();
1202 ConstantInt *NewVal = (TrueAS == NewAS) ?
1203 ConstantInt::getTrue(Context&: Ctx) : ConstantInt::getFalse(Context&: Ctx);
1204 return NewVal;
1205 }
1206 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1207 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1208 Type *DestTy = II->getType();
1209 Type *SrcTy = NewV->getType();
1210 unsigned NewAS = SrcTy->getPointerAddressSpace();
1211 if (!AMDGPU::isExtendedGlobalAddrSpace(AS: NewAS))
1212 return nullptr;
1213 Module *M = II->getModule();
1214 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1215 M, id: II->getIntrinsicID(), Tys: {DestTy, SrcTy, DestTy});
1216 II->setArgOperand(i: 0, v: NewV);
1217 II->setCalledFunction(NewDecl);
1218 return II;
1219 }
1220 case Intrinsic::amdgcn_load_to_lds: {
1221 Type *SrcTy = NewV->getType();
1222 Module *M = II->getModule();
1223 Function *NewDecl =
1224 Intrinsic::getOrInsertDeclaration(M, id: II->getIntrinsicID(), Tys: {SrcTy});
1225 II->setArgOperand(i: 0, v: NewV);
1226 II->setCalledFunction(NewDecl);
1227 return II;
1228 }
1229 case Intrinsic::amdgcn_make_buffer_rsrc: {
1230 Type *SrcTy = NewV->getType();
1231 Type *DstTy = II->getType();
1232 Module *M = II->getModule();
1233 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1234 M, id: II->getIntrinsicID(), Tys: {DstTy, SrcTy});
1235 II->setArgOperand(i: 0, v: NewV);
1236 II->setCalledFunction(NewDecl);
1237 return II;
1238 }
1239 default:
1240 return nullptr;
1241 }
1242}
1243
// Cost model for vector shuffles; sub-dword element shuffles are modeled in
// terms of 32-bit registers (v_perm_b32-style merges).
InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *DstTy, VectorType *SrcTy,
                                           ArrayRef<int> Mask,
                                           TTI::TargetCostKind CostKind,
                                           int Index, VectorType *SubTp,
                                           ArrayRef<const Value *> Args,
                                           const Instruction *CxtI) const {
  // Only fixed-width vectors are costed here; anything else falls back.
  if (!isa<FixedVectorType>(Val: SrcTy))
    return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
                                 SubTp);

  // Refine the shuffle kind using the concrete mask when possible.
  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp);

  unsigned ScalarSize = DL.getTypeSizeInBits(Ty: SrcTy->getElementType());
  if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      (ScalarSize == 16 || ScalarSize == 8)) {
    // Larger vector widths may require additional instructions, but are
    // typically cheaper than scalarized versions.
    //
    // We assume that shuffling at a register granularity can be done for free.
    // This is not true for vectors fed into memory instructions, but it is
    // effectively true for all other shuffling. The emphasis of the logic here
    // is to assist generic transform in cleaning up / canonicalizing those
    // shuffles.

    // With op_sel VOP3P instructions freely can access the low half or high
    // half of a register, so any swizzle of two elements is free.
    if (auto *SrcVecTy = dyn_cast<FixedVectorType>(Val: SrcTy)) {
      unsigned NumSrcElts = SrcVecTy->getNumElements();
      if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
          (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse ||
           Kind == TTI::SK_PermuteSingleSrc))
        return 0;
    }

    // Number of sub-dword elements packed into one 32-bit register.
    unsigned EltsPerReg = 32 / ScalarSize;
    switch (Kind) {
    case TTI::SK_Broadcast:
      // A single v_perm_b32 can be re-used for all destination registers.
      return 1;
    case TTI::SK_Reverse:
      // One instruction per register.
      if (auto *DstVecTy = dyn_cast<FixedVectorType>(Val: DstTy))
        return divideCeil(Numerator: DstVecTy->getNumElements(), Denominator: EltsPerReg);
      return InstructionCost::getInvalid();
    case TTI::SK_ExtractSubvector:
      if (Index % EltsPerReg == 0)
        return 0; // Shuffling at register granularity
      // Misaligned extract: one instruction per destination register.
      if (auto *DstVecTy = dyn_cast<FixedVectorType>(Val: DstTy))
        return divideCeil(Numerator: DstVecTy->getNumElements(), Denominator: EltsPerReg);
      return InstructionCost::getInvalid();
    case TTI::SK_InsertSubvector: {
      auto *DstVecTy = dyn_cast<FixedVectorType>(Val: DstTy);
      if (!DstVecTy)
        return InstructionCost::getInvalid();
      unsigned NumDstElts = DstVecTy->getNumElements();
      unsigned NumInsertElts = cast<FixedVectorType>(Val: SubTp)->getNumElements();
      unsigned EndIndex = Index + NumInsertElts;
      unsigned BeginSubIdx = Index % EltsPerReg;
      unsigned EndSubIdx = EndIndex % EltsPerReg;
      unsigned Cost = 0;

      if (BeginSubIdx != 0) {
        // Need to shift the inserted vector into place. The cost is the number
        // of destination registers overlapped by the inserted vector.
        Cost = divideCeil(Numerator: EndIndex, Denominator: EltsPerReg) - (Index / EltsPerReg);
      }

      // If the last register overlap is partial, there may be three source
      // registers feeding into it; that takes an extra instruction.
      if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
        Cost += 1;

      return Cost;
    }
    case TTI::SK_Splice: {
      auto *DstVecTy = dyn_cast<FixedVectorType>(Val: DstTy);
      if (!DstVecTy)
        return InstructionCost::getInvalid();
      unsigned NumElts = DstVecTy->getNumElements();
      assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements());
      // Determine the sub-region of the result vector that requires
      // sub-register shuffles / mixing.
      unsigned EltsFromLHS = NumElts - Index;
      bool LHSIsAligned = (Index % EltsPerReg) == 0;
      bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
      if (LHSIsAligned && RHSIsAligned)
        return 0;
      if (LHSIsAligned && !RHSIsAligned)
        return divideCeil(Numerator: NumElts, Denominator: EltsPerReg) - (EltsFromLHS / EltsPerReg);
      if (!LHSIsAligned && RHSIsAligned)
        return divideCeil(Numerator: EltsFromLHS, Denominator: EltsPerReg);
      return divideCeil(Numerator: NumElts, Denominator: EltsPerReg);
    }
    default:
      break;
    }

    if (!Mask.empty()) {
      unsigned NumSrcElts = cast<FixedVectorType>(Val: SrcTy)->getNumElements();

      // Generically estimate the cost by assuming that each destination
      // register is derived from sources via v_perm_b32 instructions if it
      // can't be copied as-is.
      //
      // For each destination register, derive the cost of obtaining it based
      // on the number of source registers that feed into it.
      unsigned Cost = 0;
      for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
        // Distinct source registers feeding this destination register.
        SmallVector<int, 4> Regs;
        bool Aligned = true;
        for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
          int SrcIdx = Mask[DstIdx + I];
          if (SrcIdx == -1)
            continue;
          int Reg;
          if (SrcIdx < (int)NumSrcElts) {
            Reg = SrcIdx / EltsPerReg;
            if (SrcIdx % EltsPerReg != I)
              Aligned = false;
          } else {
            // Element comes from the second source vector.
            Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
            if ((SrcIdx - NumSrcElts) % EltsPerReg != I)
              Aligned = false;
          }
          if (!llvm::is_contained(Range&: Regs, Element: Reg))
            Regs.push_back(Elt: Reg);
        }
        // N distinct sources need N-1 merging instructions; a single
        // misaligned source still needs one instruction.
        if (Regs.size() >= 2)
          Cost += Regs.size() - 1;
        else if (!Aligned)
          Cost += 1;
      }
      return Cost;
    }
  }

  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
                               SubTp);
}
1384
/// Whether it is profitable to sink the operands of an
/// Instruction I to the basic block of I.
/// This helps using several modifiers (like abs and neg) more often.
bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
                                            SmallVectorImpl<Use *> &Ops) const {
  using namespace PatternMatch;

  for (auto &Op : I->operands()) {
    // Ensure we are not already sinking this operand.
    if (any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op.get(); }))
      continue;

    // fabs/fneg feeding I can become source modifiers when sunk next to it.
    if (match(V: &Op, P: m_FAbs(Op0: m_Value())) || match(V: &Op, P: m_FNeg(X: m_Value()))) {
      Ops.push_back(Elt: &Op);
      continue;
    }

    // Check for zero-cost multiple use InsertElement/ExtractElement
    // instructions
    if (Instruction *OpInst = dyn_cast<Instruction>(Val: Op.get())) {
      if (OpInst->getType()->isVectorTy() && OpInst->getNumOperands() > 1) {
        // Don't sink if the vector operand itself has a single use; leave it
        // where it is.
        Instruction *VecOpInst = dyn_cast<Instruction>(Val: OpInst->getOperand(i: 0));
        if (VecOpInst && VecOpInst->hasOneUse())
          continue;

        // Only sink operations this cost model considers free.
        if (getVectorInstrCost(Opcode: OpInst->getOpcode(), ValTy: OpInst->getType(),
                               CostKind: TTI::TCK_RecipThroughput, Index: 0,
                               Op0: OpInst->getOperand(i: 0),
                               Op1: OpInst->getOperand(i: 1)) == 0) {
          Ops.push_back(Elt: &Op);
          continue;
        }
      }
    }

    if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Val: Op.get())) {

      unsigned EltSize = DL.getTypeSizeInBits(
          Ty: cast<VectorType>(Val: Shuffle->getType())->getElementType());

      // For i32 (or greater) shufflevectors, these will be lowered into a
      // series of insert / extract elements, which will be coalesced away.
      if (EltSize < 16 || !ST->has16BitInsts())
        continue;

      int NumSubElts, SubIndex;
      if (Shuffle->changesLength()) {
        // Identity-with-padding shuffles are sinkable as-is.
        if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
          Ops.push_back(Elt: &Op);
          continue;
        }

        // Even-index sub-vector extracts/inserts are sinkable.
        if ((Shuffle->isExtractSubvectorMask(Index&: SubIndex) ||
             Shuffle->isInsertSubvectorMask(NumSubElts, Index&: SubIndex)) &&
            !(SubIndex & 0x1)) {
          Ops.push_back(Elt: &Op);
          continue;
        }
      }

      // Simple length-preserving shuffle shapes are also sinkable.
      if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
          Shuffle->isSingleSource()) {
        Ops.push_back(Elt: &Op);
        continue;
      }
    }
  }

  return !Ops.empty();
}
1455
1456bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1457 const Function *Callee) const {
1458 const TargetMachine &TM = getTLI()->getTargetMachine();
1459 const GCNSubtarget *CallerST
1460 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1461 const GCNSubtarget *CalleeST
1462 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1463
1464 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1465 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1466
1467 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1468 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1469 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1470 return false;
1471
1472 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1473 // no way to support merge for backend defined attributes.
1474 SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
1475 SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
1476 if (!CallerMode.isInlineCompatible(CalleeMode))
1477 return false;
1478
1479 if (Callee->hasFnAttribute(Kind: Attribute::AlwaysInline) ||
1480 Callee->hasFnAttribute(Kind: Attribute::InlineHint))
1481 return true;
1482
1483 // Hack to make compile times reasonable.
1484 if (InlineMaxBB) {
1485 // Single BB does not increase total BB amount.
1486 if (Callee->size() == 1)
1487 return true;
1488 size_t BBSize = Caller->size() + Callee->size() - 1;
1489 return BBSize <= InlineMaxBB;
1490 }
1491
1492 return true;
1493}
1494
1495static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
1496 const SITargetLowering *TLI,
1497 const GCNTTIImpl *TTIImpl) {
1498 const int NrOfSGPRUntilSpill = 26;
1499 const int NrOfVGPRUntilSpill = 32;
1500
1501 const DataLayout &DL = TTIImpl->getDataLayout();
1502
1503 unsigned adjustThreshold = 0;
1504 int SGPRsInUse = 0;
1505 int VGPRsInUse = 0;
1506 for (const Use &A : CB->args()) {
1507 SmallVector<EVT, 4> ValueVTs;
1508 ComputeValueVTs(TLI: *TLI, DL, Ty: A.get()->getType(), ValueVTs);
1509 for (auto ArgVT : ValueVTs) {
1510 unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1511 Context&: CB->getContext(), CC: CB->getCallingConv(), VT: ArgVT);
1512 if (AMDGPU::isArgPassedInSGPR(CB, ArgNo: CB->getArgOperandNo(U: &A)))
1513 SGPRsInUse += CCRegNum;
1514 else
1515 VGPRsInUse += CCRegNum;
1516 }
1517 }
1518
1519 // The cost of passing function arguments through the stack:
1520 // 1 instruction to put a function argument on the stack in the caller.
1521 // 1 instruction to take a function argument from the stack in callee.
1522 // 1 instruction is explicitly take care of data dependencies in callee
1523 // function.
1524 InstructionCost ArgStackCost(1);
1525 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1526 Opcode: Instruction::Store, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align(4),
1527 AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency);
1528 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1529 Opcode: Instruction::Load, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align(4),
1530 AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency);
1531
1532 // The penalty cost is computed relative to the cost of instructions and does
1533 // not model any storage costs.
1534 adjustThreshold += std::max(a: 0, b: SGPRsInUse - NrOfSGPRUntilSpill) *
1535 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1536 adjustThreshold += std::max(a: 0, b: VGPRsInUse - NrOfVGPRUntilSpill) *
1537 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1538 return adjustThreshold;
1539}
1540
1541static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1542 const DataLayout &DL) {
1543 // If we have a pointer to a private array passed into a function
1544 // it will not be optimized out, leaving scratch usage.
1545 // This function calculates the total size in bytes of the memory that would
1546 // end in scratch if the call was not inlined.
1547 unsigned AllocaSize = 0;
1548 SmallPtrSet<const AllocaInst *, 8> AIVisited;
1549 for (Value *PtrArg : CB->args()) {
1550 PointerType *Ty = dyn_cast<PointerType>(Val: PtrArg->getType());
1551 if (!Ty)
1552 continue;
1553
1554 unsigned AddrSpace = Ty->getAddressSpace();
1555 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1556 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1557 continue;
1558
1559 const AllocaInst *AI = dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: PtrArg));
1560 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(Ptr: AI).second)
1561 continue;
1562
1563 if (auto Size = AI->getAllocationSize(DL))
1564 AllocaSize += Size->getFixedValue();
1565 }
1566 return AllocaSize;
1567}
1568
1569int GCNTTIImpl::getInliningLastCallToStaticBonus() const {
1570 return BaseT::getInliningLastCallToStaticBonus() *
1571 getInliningThresholdMultiplier();
1572}
1573
1574unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1575 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, TTIImpl: this);
1576
1577 // Private object passed as arguments may end up in scratch usage if the call
1578 // is not inlined. Increase the inline threshold to promote inlining.
1579 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1580 if (AllocaSize > 0)
1581 Threshold += ArgAllocaCost;
1582 return Threshold;
1583}
1584
1585unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
1586 const AllocaInst *AI) const {
1587
1588 // Below the cutoff, assume that the private memory objects would be
1589 // optimized
1590 auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1591 if (AllocaSize <= ArgAllocaCutoff)
1592 return 0;
1593
1594 // Above the cutoff, we give a cost to each private memory object
1595 // depending its size. If the array can be optimized by SROA this cost is not
1596 // added to the total-cost in the inliner cost analysis.
1597 //
1598 // We choose the total cost of the alloca such that their sum cancels the
1599 // bonus given in the threshold (ArgAllocaCost).
1600 //
1601 // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1602 //
1603 // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1604 // the single-bb bonus and the vector-bonus.
1605 //
1606 // We compensate the first two multipliers, by repeating logic from the
1607 // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1608 static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
1609 unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1610
1611 bool SingleBB = none_of(Range&: *CB->getCalledFunction(), P: [](const BasicBlock &BB) {
1612 return BB.getTerminator()->getNumSuccessors() > 1;
1613 });
1614 if (SingleBB) {
1615 Threshold += Threshold / 2;
1616 }
1617
1618 auto ArgAllocaSize = AI->getAllocationSize(DL);
1619 if (!ArgAllocaSize)
1620 return 0;
1621
1622 // Attribute the bonus proportionally to the alloca size
1623 unsigned AllocaThresholdBonus =
1624 (Threshold * ArgAllocaSize->getFixedValue()) / AllocaSize;
1625
1626 return AllocaThresholdBonus;
1627}
1628
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) const {
  // Delegate to the generation-independent AMDGPU implementation.
  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
}
1634
void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) const {
  // Delegate to the generation-independent AMDGPU implementation.
  CommonTTI.getPeelingPreferences(L, SE, PP);
}
1639
1640int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1641 return ST->hasFullRate64Ops()
1642 ? getFullRateInstrCost()
1643 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1644 : getQuarterRateInstrCost(CostKind);
1645}
1646
1647std::pair<InstructionCost, MVT>
1648GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1649 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1650 auto Size = DL.getTypeSizeInBits(Ty);
1651 // Maximum load or store can handle 8 dwords for scalar and 4 for
1652 // vector ALU. Let's assume anything above 8 dwords is expensive
1653 // even if legal.
1654 if (Size <= 256)
1655 return Cost;
1656
1657 Cost.first += (Size + 255) / 256;
1658 return Cost;
1659}
1660
1661unsigned GCNTTIImpl::getPrefetchDistance() const {
1662 return ST->hasPrefetch() ? 128 : 0;
1663}
1664
bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
  // Prefetching is limited to flat/global address spaces.
  return AMDGPU::isFlatGlobalAddrSpace(AS);
}
1668
1669void GCNTTIImpl::collectKernelLaunchBounds(
1670 const Function &F,
1671 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
1672 SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
1673 LB.push_back(Elt: {"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1674 LB.push_back(Elt: {"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1675 LB.push_back(Elt: {"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1676 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1677 ST->getFlatWorkGroupSizes(F);
1678 LB.push_back(Elt: {"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1679 LB.push_back(Elt: {"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1680 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
1681 LB.push_back(Elt: {"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1682 LB.push_back(Elt: {"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1683}
1684
1685GCNTTIImpl::KnownIEEEMode
1686GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
1687 if (!ST->hasFeature(Feature: AMDGPU::FeatureDX10ClampAndIEEEMode))
1688 return KnownIEEEMode::On; // Only mode on gfx1170+
1689
1690 const Function *F = I.getFunction();
1691 if (!F)
1692 return KnownIEEEMode::Unknown;
1693
1694 Attribute IEEEAttr = F->getFnAttribute(Kind: "amdgpu-ieee");
1695 if (IEEEAttr.isValid())
1696 return IEEEAttr.getValueAsBool() ? KnownIEEEMode::On : KnownIEEEMode::Off;
1697
1698 return AMDGPU::isShader(CC: F->getCallingConv()) ? KnownIEEEMode::Off
1699 : KnownIEEEMode::On;
1700}
1701
1702InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1703 Align Alignment,
1704 unsigned AddressSpace,
1705 TTI::TargetCostKind CostKind,
1706 TTI::OperandValueInfo OpInfo,
1707 const Instruction *I) const {
1708 if (VectorType *VecTy = dyn_cast<VectorType>(Val: Src)) {
1709 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1710 VecTy->getElementType()->isIntegerTy(Bitwidth: 8)) {
1711 return divideCeil(Numerator: DL.getTypeSizeInBits(Ty: VecTy) - 1,
1712 Denominator: getLoadStoreVecRegBitWidth(AddrSpace: AddressSpace));
1713 }
1714 }
1715 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1716 OpInfo, I);
1717}
1718
1719unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
1720 if (VectorType *VecTy = dyn_cast<VectorType>(Val: Tp)) {
1721 if (VecTy->getElementType()->isIntegerTy(Bitwidth: 8)) {
1722 unsigned ElementCount = VecTy->getElementCount().getFixedValue();
1723 return divideCeil(Numerator: ElementCount - 1, Denominator: 4);
1724 }
1725 }
1726 return BaseT::getNumberOfParts(Tp);
1727}
1728
1729InstructionUniformity
1730GCNTTIImpl::getInstructionUniformity(const Value *V) const {
1731 if (isAlwaysUniform(V))
1732 return InstructionUniformity::AlwaysUniform;
1733
1734 if (isSourceOfDivergence(V))
1735 return InstructionUniformity::NeverUniform;
1736
1737 return InstructionUniformity::Default;
1738}
1739