1//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUTargetTransformInfo.h"
18#include "AMDGPUTargetMachine.h"
19#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20#include "SIModeRegisterDefaults.h"
21#include "llvm/Analysis/InlineCost.h"
22#include "llvm/Analysis/LoopInfo.h"
23#include "llvm/Analysis/ValueTracking.h"
24#include "llvm/CodeGen/Analysis.h"
25#include "llvm/IR/IRBuilder.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27#include "llvm/IR/PatternMatch.h"
28#include "llvm/Support/KnownBits.h"
29#include <optional>
30
31using namespace llvm;
32
33#define DEBUG_TYPE "AMDGPUtti"
34
// Boosted unroll threshold used when a loop indexes private (scratch) memory;
// aggressive unrolling helps SROA eliminate the alloca.
static cl::opt<unsigned> UnrollThresholdPrivate(
    "amdgpu-unroll-threshold-private",
    cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
    cl::init(Val: 2700), cl::Hidden);

// Boosted unroll threshold used when a loop addresses local (LDS) or region
// memory.
static cl::opt<unsigned> UnrollThresholdLocal(
    "amdgpu-unroll-threshold-local",
    cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
    cl::init(Val: 1000), cl::Hidden);

// Additive threshold bonus granted per loop-local "if" whose condition depends
// on a PHI of the loop (see getUnrollingPreferences).
static cl::opt<unsigned> UnrollThresholdIf(
    "amdgpu-unroll-threshold-if",
    cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
    cl::init(Val: 200), cl::Hidden);

// Whether LDS-using loops may be runtime-unrolled (trip count unknown at
// compile time).
static cl::opt<bool> UnrollRuntimeLocal(
    "amdgpu-unroll-runtime-local",
    cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
    cl::init(Val: true), cl::Hidden);

static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
    "amdgpu-unroll-max-block-to-analyze",
    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
    cl::init(Val: 32), cl::Hidden);

static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
                                       cl::Hidden, cl::init(Val: 4000),
                                       cl::desc("Cost of alloca argument"));

// If the amount of scratch memory to eliminate exceeds our ability to allocate
// it into registers we gain nothing by aggressively inlining functions for that
// heuristic.
static cl::opt<unsigned>
    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
                    cl::init(Val: 256),
                    cl::desc("Maximum alloca size to use for inline cost"));

// Inliner constraint to achieve reasonable compilation time.
static cl::opt<size_t> InlineMaxBB(
    "amdgpu-inline-max-bb", cl::Hidden, cl::init(Val: 1100),
    cl::desc("Maximum number of BBs allowed in a function after inlining"
             " (compile time constraint)"));

// This default unroll factor is based on microbenchmarks on gfx1030.
static cl::opt<unsigned> MemcpyLoopUnroll(
    "amdgpu-memcpy-loop-unroll",
    cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
             "operations when lowering memcpy as a loop"),
    cl::init(Val: 16), cl::Hidden);
84
85static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
86 unsigned Depth = 0) {
87 const Instruction *I = dyn_cast<Instruction>(Val: Cond);
88 if (!I)
89 return false;
90
91 for (const Value *V : I->operand_values()) {
92 if (!L->contains(Inst: I))
93 continue;
94 if (const PHINode *PHI = dyn_cast<PHINode>(Val: V)) {
95 if (llvm::none_of(Range: L->getSubLoops(), P: [PHI](const Loop* SubLoop) {
96 return SubLoop->contains(Inst: PHI); }))
97 return true;
98 } else if (Depth < 10 && dependsOnLocalPhi(L, Cond: V, Depth: Depth+1))
99 return true;
100 }
101 return false;
102}
103
// Construct the common (non-GCN-specific) AMDGPU TTI for function \p F.
// NOTE(review): the subtarget is cast to GCNSubtarget unconditionally —
// presumably this is only constructed for GCN target machines; confirm
// against callers.
AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getDataLayout()),
      TargetTriple(TM->getTargetTriple()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()) {}
109
// Tune loop unrolling for AMDGPU. The base threshold comes from the
// "amdgpu-unroll-threshold" function attribute (default 300) and is then
// boosted, capped by MaxBoost, for loops whose unrolling is likely to
// eliminate private (scratch) or local (LDS) addressing, or divergent "if"
// regions controlled by loop PHIs.
void AMDGPUTTIImpl::getUnrollingPreferences(
    Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
    OptimizationRemarkEmitter *ORE) const {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold =
      F.getFnAttributeAsParsedInteger(Kind: "amdgpu-unroll-threshold", Default: 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // Conditional branch in a loop back edge needs 3 additional exec
  // manipulations in average.
  UP.BEInsns += 3;

  // We want to run unroll even for the loops which have been vectorized.
  UP.UnrollVectorizedLoop = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size than can fit registers. Reserve 16 registers.
  // (256 VGPRs minus 16 reserved, 4 bytes each.)
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;

  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
  // provided threshold value as the default for Threshold
  if (MDNode *LoopUnrollThreshold =
          findOptionMDForLoop(TheLoop: L, Name: "amdgpu.loop.unroll.threshold")) {
    if (LoopUnrollThreshold->getNumOperands() == 2) {
      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
          MD: LoopUnrollThreshold->getOperand(I: 1));
      if (MetaThresholdValue) {
        // We will also use the supplied value for PartialThreshold for now.
        // We may introduce additional metadata if it becomes necessary in the
        // future.
        UP.Threshold = MetaThresholdValue->getSExtValue();
        UP.PartialThreshold = UP.Threshold;
        ThresholdPrivate = std::min(a: ThresholdPrivate, b: UP.Threshold);
        ThresholdLocal = std::min(a: ThresholdLocal, b: UP.Threshold);
      }
    }
  }

  // Upper bound for all per-branch / per-GEP threshold bonuses below.
  unsigned MaxBoost = std::max(a: ThresholdPrivate, b: ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(Range: L->getSubLoops(), P: [BB](const Loop* SubLoop) {
          return SubLoop->contains(BB); }))
      continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition
      // defined by a PHI belonging to the loop. This may help to eliminate
      // if region and potentially even PHI itself, saving on both divergence
      // and registers used for the PHI.
      // Add a small bonus for each of such "if" statements.
      if (const BranchInst *Br = dyn_cast<BranchInst>(Val: &I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(i: 0);
          BasicBlock *Succ1 = Br->getSuccessor(i: 1);
          // Skip branches that can exit the loop; unrolling does not remove
          // those.
          if ((L->contains(BB: Succ0) && L->isLoopExiting(BB: Succ0)) ||
              (L->contains(BB: Succ1) && L->isLoopExiting(BB: Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Cond: Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: &I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        // Only boost for accesses into a static alloca small enough to be
        // promoted to registers (see MaxAlloca above).
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: Ptr));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unroll for local memory if we have seen addressing not to
        // a variable, most likely we will be unable to combine it.
        // Do not unroll too deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(Val: GEP->getPointerOperand()) &&
             !isa<Argument>(Val: GEP->getPointerOperand())))
          continue;
        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
                          << *L << " due to LDS use.\n");
        UP.Runtime = UnrollRuntimeLocal;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Val: Op);
        if (!Inst || L->isLoopInvariant(V: Op))
          continue;

        if (llvm::any_of(Range: L->getSubLoops(), P: [Inst](const Loop* SubLoop) {
              return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }

    // If we got a GEP in a small BB from inner loop then increase max trip
    // count to analyze for better estimation cost in unroll
    if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
      UP.MaxIterationsCountToAnalyze = 32;
  }
}
271
// No AMDGPU-specific peeling heuristics; defer entirely to the generic
// implementation.
void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) const {
  BaseT::getPeelingPreferences(L, SE, PP);
}
276
// Memory intrinsics (memcpy/memmove/memset) up to this many bytes may be
// expanded inline instead of being lowered to a call.
uint64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
  return 1024;
}
280
// Subtarget features whose mismatch between caller and callee should not
// block inlining — as the category comments below explain, they do not change
// the semantics visible to inlined code.
const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
    AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
    AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
    AMDGPU::FeatureUnalignedAccessMode,

    AMDGPU::FeatureAutoWaitcntBeforeBarrier,

    // Property of the kernel/environment which can't actually differ.
    AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
    AMDGPU::FeatureTrapHandler,

    // The default assumption needs to be ecc is enabled, but no directly
    // exposed operations depend on it, so it can be safely inlined.
    AMDGPU::FeatureSRAMECC,

    // Perf-tuning features
    AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
300
// Construct the GCN-specific TTI for function \p F.
GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getDataLayout()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()), CommonTTI(TM, F),
      IsGraphics(AMDGPU::isGraphics(CC: F.getCallingConv())) {
  // Cache the function's denormal-flushing modes: several cost decisions
  // (free fmul fusion, rcp-based fdiv) are only taken when denormals are
  // flushed (preserve-sign).
  SIModeRegisterDefaults Mode(F, *ST);
  HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
  HasFP64FP16Denormals =
      Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
}
311
// Branches are potentially divergent unless the function is known to execute
// with a single lane. A null \p F conservatively reports divergence.
bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
  return !F || !ST->isSingleLaneExecution(Kernel: *F);
}
315
// \returns the register budget reported to the vectorizer / interleaver for
// register class pseudo-id \p RCID.
unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
  // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
  // registers. See getRegisterClassForType for the implementation.
  // In this case vector registers are not vector in terms of
  // VGPRs, but those which can hold multiple values.

  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return 4;
}
326
// Register width reported per register kind.
TypeSize
GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ExactSize: 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    // Subtargets with packed-FP32 ops can operate on 64 bits (2 x f32) at a
    // time.
    return TypeSize::getFixed(ExactSize: ST->hasPackedFP32Ops() ? 64 : 32);
  case TargetTransformInfo::RGK_ScalableVector:
    // No scalable vectors on AMDGPU.
    return TypeSize::getScalable(MinimumSize: 0);
  }
  llvm_unreachable("Unsupported register kind");
}
339
// Smallest vector register width worth targeting; matches the 32-bit scalar
// register width reported above.
unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}
343
// Maximum vectorization factor for an operation on \p ElemWidth-bit elements.
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  // Loads and stores may be combined up to a 128-bit (4 x 32-bit) access.
  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
    return 32 * 4 / ElemWidth;
  // For a given width return the max number of elements that can be combined
  // into a wider bit value:
  return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
       : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
       : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
       : 1;
}
354
355unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
356 unsigned ChainSizeInBytes,
357 VectorType *VecTy) const {
358 unsigned VecRegBitWidth = VF * LoadSize;
359 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
360 // TODO: Support element-size less than 32bit?
361 return 128 / LoadSize;
362
363 return VF;
364}
365
366unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
367 unsigned ChainSizeInBytes,
368 VectorType *VecTy) const {
369 unsigned VecRegBitWidth = VF * StoreSize;
370 if (VecRegBitWidth > 128)
371 return 128 / StoreSize;
372
373 return VF;
374}
375
376unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
377 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
378 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
379 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
380 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
381 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
382 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
383 return 512;
384 }
385
386 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
387 return 8 * ST->getMaxPrivateElementSize();
388
389 // Common to flat, global, local and region. Assume for unknown addrspace.
390 return 128;
391}
392
393bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
394 Align Alignment,
395 unsigned AddrSpace) const {
396 // We allow vectorization of flat stores, even though we may need to decompose
397 // them later if they may access private memory. We don't have enough context
398 // here, and legalization can handle it.
399 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
400 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
401 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
402 }
403 return true;
404}
405
// Loads follow the same legality rule as generic memory chains; see
// isLegalToVectorizeMemChain.
bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
411
// Stores follow the same legality rule as generic memory chains; see
// isLegalToVectorizeMemChain.
bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
417
// Memory intrinsics up to this many bytes may be expanded inline instead of
// being lowered to a call (mirrors the common AMDGPUTTIImpl value).
uint64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
  return 1024;
}
421
422Type *GCNTTIImpl::getMemcpyLoopLoweringType(
423 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
424 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
425 std::optional<uint32_t> AtomicElementSize) const {
426
427 if (AtomicElementSize)
428 return Type::getIntNTy(C&: Context, N: *AtomicElementSize * 8);
429
430 // 16-byte accesses achieve the highest copy throughput.
431 // If the operation has a fixed known length that is large enough, it is
432 // worthwhile to return an even wider type and let legalization lower it into
433 // multiple accesses, effectively unrolling the memcpy loop.
434 // We also rely on legalization to decompose into smaller accesses for
435 // subtargets and address spaces where it is necessary.
436 //
437 // Don't unroll if Length is not a constant, since unrolling leads to worse
438 // performance for length values that are smaller or slightly larger than the
439 // total size of the type returned here. Mitigating that would require a more
440 // complex lowering for variable-length memcpy and memmove.
441 unsigned I32EltsInVector = 4;
442 if (MemcpyLoopUnroll > 0 && isa<ConstantInt>(Val: Length))
443 return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context),
444 NumElts: MemcpyLoopUnroll * I32EltsInVector);
445
446 return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: I32EltsInVector);
447}
448
449void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
450 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
451 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
452 Align SrcAlign, Align DestAlign,
453 std::optional<uint32_t> AtomicCpySize) const {
454
455 if (AtomicCpySize)
456 BaseT::getMemcpyLoopResidualLoweringType(
457 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
458 DestAlign, AtomicCpySize);
459
460 Type *I32x4Ty = FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: 4);
461 while (RemainingBytes >= 16) {
462 OpsOut.push_back(Elt: I32x4Ty);
463 RemainingBytes -= 16;
464 }
465
466 Type *I64Ty = Type::getInt64Ty(C&: Context);
467 while (RemainingBytes >= 8) {
468 OpsOut.push_back(Elt: I64Ty);
469 RemainingBytes -= 8;
470 }
471
472 Type *I32Ty = Type::getInt32Ty(C&: Context);
473 while (RemainingBytes >= 4) {
474 OpsOut.push_back(Elt: I32Ty);
475 RemainingBytes -= 4;
476 }
477
478 Type *I16Ty = Type::getInt16Ty(C&: Context);
479 while (RemainingBytes >= 2) {
480 OpsOut.push_back(Elt: I16Ty);
481 RemainingBytes -= 2;
482 }
483
484 Type *I8Ty = Type::getInt8Ty(C&: Context);
485 while (RemainingBytes) {
486 OpsOut.push_back(Elt: I8Ty);
487 --RemainingBytes;
488 }
489}
490
491unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
492 // Disable unrolling if the loop is not vectorized.
493 // TODO: Enable this again.
494 if (VF.isScalar())
495 return 1;
496
497 return 8;
498}
499
// Describe target intrinsics that behave like memory operations so generic
// TTI clients can reason about their memory effects. Returns true and fills
// \p Info when \p Inst is recognized.
bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // Operand 2 carries the atomic ordering, operand 4 the volatile flag;
    // both must be compile-time constants.
    auto *Ordering = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: 2));
    auto *Volatile = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: 4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    // Reject ordering values outside the AtomicOrdering enum range.
    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(i: 0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isZero();
    return true;
  }
  default:
    return false;
  }
}
525
// Cost model for scalar/vector arithmetic. Costs are expressed in units of
// the subtarget's instruction issue rates (full/half/quarter rate, plus a
// separate rate for 64-bit ops); unhandled opcodes defer to the generic
// implementation.
InstructionCost GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but the legal types, we
  // need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost(CostKind) * LT.first * NElts;

    // Packed 16-bit ops process two elements per instruction.
    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      // 64-bit multiply expands to 4 quarter-rate multiplies plus 4 full-rate
      // add/carry instructions.
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FMUL:
    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
    // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
    // fused operation.
    if (CxtI && CxtI->hasOneUse())
      if (const auto *FAdd = dyn_cast<BinaryOperator>(Val: *CxtI->user_begin())) {
        const int OPC = TLI->InstructionOpcodeToISD(Opcode: FAdd->getOpcode());
        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
          // Fusion into mad/mac is only value-safe when denormals are flushed.
          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
            return TargetTransformInfo::TCC_Free;
          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
            return TargetTransformInfo::TCC_Free;

          // Estimate all types may be fused with contract/unsafe flags
          const TargetOptions &Options = TLI->getTargetMachine().Options;
          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
              Options.UnsafeFPMath ||
              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
            return TargetTransformInfo::TCC_Free;
        }
      }
    [[fallthrough]];
  case ISD::FADD:
  case ISD::FSUB:
    // Packed f32 ops process two elements per instruction.
    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
      NElts = (NElts + 1) / 2;
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost(CostKind);

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 7 * get64BitInstrCost(CostKind) +
                 getQuarterRateInstrCost(CostKind) +
                 3 * getHalfRateInstrCost(CostKind);
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(V: Args[0], P: PatternMatch::m_FPOne())) {
      // Reciprocal (1.0 / x) lowers to a single rcp instruction.
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost =
          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) ||
                            TLI->getTargetMachine().Options.UnsafeFPMath)) {
      // Fast unsafe fdiv lowering:
      // f32 rcp
      // f32 fmul
      int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      // 4 more v_cvt_* insts without f16 insts support
      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
                 1 * getQuarterRateInstrCost(CostKind);

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  case ISD::FNEG:
    // Use the backend' estimation. If fneg is not free each element will cost
    // one additional instruction.
    return TLI->isFNegFree(VT: SLT) ? 0 : NElts;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info,
                                       Args, CxtI);
}
683
684// Return true if there's a potential benefit from using v2f16/v2i16
685// instructions for an intrinsic, even if it requires nontrivial legalization.
686static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
687 switch (ID) {
688 case Intrinsic::fma:
689 case Intrinsic::fmuladd:
690 case Intrinsic::copysign:
691 case Intrinsic::minimumnum:
692 case Intrinsic::maximumnum:
693 case Intrinsic::canonicalize:
694 // There's a small benefit to using vector ops in the legalized code.
695 case Intrinsic::round:
696 case Intrinsic::uadd_sat:
697 case Intrinsic::usub_sat:
698 case Intrinsic::sadd_sat:
699 case Intrinsic::ssub_sat:
700 case Intrinsic::abs:
701 return true;
702 default:
703 return false;
704 }
705}
706
// Cost model for intrinsic calls. AMDGPU ID/pointer-query intrinsics are free
// (they read pre-loaded argument registers); intrinsics that benefit from
// packed 16-bit (or packed f32) math are costed below; everything else goes
// to the generic implementation.
InstructionCost
GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) const {
  switch (ICA.getID()) {
  case Intrinsic::fabs:
    // Free source modifier in the common case.
    return 0;
  case Intrinsic::amdgcn_workitem_id_x:
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::amdgcn_workitem_id_z:
    // TODO: If hasPackedTID, or if the calling context is not an entry point
    // there may be a bit instruction.
    return 0;
  case Intrinsic::amdgcn_workgroup_id_x:
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::amdgcn_lds_kernel_id:
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_dispatch_id:
  case Intrinsic::amdgcn_implicitarg_ptr:
  case Intrinsic::amdgcn_queue_ptr:
    // Read from an argument register.
    return 0;
  default:
    break;
  }

  if (!intrinsicHasPackedVectorBenefit(ID: ICA.getID()))
    return BaseT::getIntrinsicInstrCost(ICA, CostKind);

  Type *RetTy = ICA.getReturnType();

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);

  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  // Packed ops process two elements per instruction.
  if ((ST->hasVOP3PInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) ||
      (ST->hasPackedFP32Ops() && SLT == MVT::f32))
    NElts = (NElts + 1) / 2;

  // TODO: Get more refined intrinsic costs?
  unsigned InstRate = getQuarterRateInstrCost(CostKind);

  switch (ICA.getID()) {
  case Intrinsic::fma:
  case Intrinsic::fmuladd:
    if (SLT == MVT::f64) {
      InstRate = get64BitInstrCost(CostKind);
      break;
    }

    if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
      InstRate = getFullRateInstrCost();
    else {
      InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
                                     : getQuarterRateInstrCost(CostKind);
    }
    break;
  case Intrinsic::copysign:
    return NElts * getFullRateInstrCost();
  case Intrinsic::minimumnum:
  case Intrinsic::maximumnum: {
    // Instruction + 2 canonicalizes. For cases that need type promotion, the
    // promotion takes the place of the canonicalize.
    unsigned NumOps = 3;
    if (const IntrinsicInst *II = ICA.getInst()) {
      // Directly legal with ieee=0
      // TODO: Not directly legal with strictfp
      if (fpenvIEEEMode(I: *II) == KnownIEEEMode::Off)
        NumOps = 1;
    }

    unsigned BaseRate =
        SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
    InstRate = BaseRate * NumOps;
    break;
  }
  case Intrinsic::canonicalize: {
    InstRate =
        SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
    break;
  }
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat: {
    if (SLT == MVT::i16 || SLT == MVT::i32)
      InstRate = getFullRateInstrCost();

    // Legal packed saturating types are handled as a single operation.
    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
    if (any_of(Range: ValidSatTys, P: [&LT](MVT M) { return M == LT.second; }))
      NElts = 1;
    break;
  }
  case Intrinsic::abs:
    // Expansion takes 2 instructions for VALU
    if (SLT == MVT::i16 || SLT == MVT::i32)
      InstRate = 2 * getFullRateInstrCost();
    break;
  default:
    break;
  }

  return LT.first * NElts * InstRate;
}
816
// Control-flow instruction costs, in approximate gfx900 instruction slots.
// Size-oriented cost kinds (CodeSize/SizeAndLatency) use the smaller SCost
// values.
InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) const {
  assert((I == nullptr || I->getOpcode() == Opcode) &&
         "Opcode should reflect passed instruction.");
  const bool SCost =
      (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
  const int CBrCost = SCost ? 5 : 7;
  switch (Opcode) {
  case Instruction::Br: {
    // Branch instruction takes about 4 slots on gfx900.
    const auto *BI = dyn_cast_or_null<BranchInst>(Val: I);
    if (BI && BI->isUnconditional())
      return SCost ? 1 : 4;
    // Suppose conditional branch takes additional 3 exec manipulations
    // instructions in average.
    return CBrCost;
  }
  case Instruction::Switch: {
    const auto *SI = dyn_cast_or_null<SwitchInst>(Val: I);
    // Each case (including default) takes 1 cmp + 1 cbr instructions in
    // average. Without the instruction, assume 4 cases.
    return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
  }
  case Instruction::Ret:
    return SCost ? 1 : 10;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}
846
847InstructionCost
848GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
849 std::optional<FastMathFlags> FMF,
850 TTI::TargetCostKind CostKind) const {
851 if (TTI::requiresOrderedReduction(FMF))
852 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
853
854 EVT OrigTy = TLI->getValueType(DL, Ty);
855
856 // Computes cost on targets that have packed math instructions(which support
857 // 16-bit types only).
858 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
859 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
860
861 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
862 return LT.first * getFullRateInstrCost();
863}
864
865InstructionCost
866GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
867 FastMathFlags FMF,
868 TTI::TargetCostKind CostKind) const {
869 EVT OrigTy = TLI->getValueType(DL, Ty);
870
871 // Computes cost on targets that have packed math instructions(which support
872 // 16-bit types only).
873 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
874 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
875
876 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
877 return LT.first * getHalfRateInstrCost(CostKind);
878}
879
880InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
881 TTI::TargetCostKind CostKind,
882 unsigned Index, const Value *Op0,
883 const Value *Op1) const {
884 switch (Opcode) {
885 case Instruction::ExtractElement:
886 case Instruction::InsertElement: {
887 unsigned EltSize
888 = DL.getTypeSizeInBits(Ty: cast<VectorType>(Val: ValTy)->getElementType());
889 if (EltSize < 32) {
890 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
891 return 0;
892 return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0,
893 Op1);
894 }
895
896 // Extracts are just reads of a subregister, so are free. Inserts are
897 // considered free because we don't want to have any cost for scalarizing
898 // operations, and we don't have to copy into a different register class.
899
900 // Dynamic indexing isn't free and is best avoided.
901 return Index == ~0u ? 2 : 0;
902 }
903 default:
904 return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1);
905 }
906}
907
908/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
909/// this is analyzing the collective result of all output registers. Otherwise,
910/// this is only querying a specific result index if this returns multiple
911/// registers in a struct.
bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
    const CallInst *CI, ArrayRef<unsigned> Indices) const {
  // TODO: Handle complex extract indices
  if (Indices.size() > 1)
    return true;

  const DataLayout &DL = CI->getDataLayout();
  const SIRegisterInfo *TRI = ST->getRegisterInfo();
  TargetLowering::AsmOperandInfoVector TargetConstraints =
      TLI->ParseConstraints(DL, TRI: ST->getRegisterInfo(), Call: *CI);

  // -1 means "consider every output collectively"; otherwise only the single
  // requested struct-result index is inspected.
  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];

  int OutputIdx = 0;
  for (auto &TC : TargetConstraints) {
    if (TC.Type != InlineAsm::isOutput)
      continue;

    // Skip outputs we don't care about.
    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
      continue;

    // Resolve which constraint will actually be used before querying its
    // register class.
    TLI->ComputeConstraintToUse(OpInfo&: TC, Op: SDValue());

    const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
        TRI, Constraint: TC.ConstraintCode, VT: TC.ConstraintVT).second;

    // For AGPR constraints null is returned on subtargets without AGPRs, so
    // assume divergent for null.
    if (!RC || !TRI->isSGPRClass(RC))
      return true;
  }

  // Every inspected output lives in an SGPR class, so the result is uniform.
  return false;
}
947
948bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
949 const IntrinsicInst *ReadReg) const {
950 Metadata *MD =
951 cast<MetadataAsValue>(Val: ReadReg->getArgOperand(i: 0))->getMetadata();
952 StringRef RegName =
953 cast<MDString>(Val: cast<MDNode>(Val: MD)->getOperand(I: 0))->getString();
954
955 // Special case registers that look like VCC.
956 MVT VT = MVT::getVT(Ty: ReadReg->getType());
957 if (VT == MVT::i1)
958 return true;
959
960 // Special case scalar registers that start with 'v'.
961 if (RegName.starts_with(Prefix: "vcc") || RegName.empty())
962 return false;
963
964 // VGPR or AGPR is divergent. There aren't any specially named vector
965 // registers.
966 return RegName[0] == 'v' || RegName[0] == 'a';
967}
968
969/// \returns true if the result of the value could potentially be
970/// different across workitems in a wavefront.
971bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
972 if (const Argument *A = dyn_cast<Argument>(Val: V))
973 return !AMDGPU::isArgPassedInSGPR(Arg: A);
974
975 // Loads from the private and flat address spaces are divergent, because
976 // threads can execute the load instruction with the same inputs and get
977 // different results.
978 //
979 // All other loads are not divergent, because if threads issue loads with the
980 // same arguments, they will always get the same result.
981 if (const LoadInst *Load = dyn_cast<LoadInst>(Val: V))
982 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
983 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
984
985 // Atomics are divergent because they are executed sequentially: when an
986 // atomic operation refers to the same address in each thread, then each
987 // thread after the first sees the value written by the previous thread as
988 // original value.
989 if (isa<AtomicRMWInst, AtomicCmpXchgInst>(Val: V))
990 return true;
991
992 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V)) {
993 if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
994 return isReadRegisterSourceOfDivergence(ReadReg: Intrinsic);
995
996 return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: Intrinsic->getIntrinsicID());
997 }
998
999 // Assume all function calls are a source of divergence.
1000 if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
1001 if (CI->isInlineAsm())
1002 return isInlineAsmSourceOfDivergence(CI);
1003 return true;
1004 }
1005
1006 // Assume all function calls are a source of divergence.
1007 if (isa<InvokeInst>(Val: V))
1008 return true;
1009
1010 return false;
1011}
1012
bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  // Intrinsics the target declares to always produce a uniform result.
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V))
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID: Intrinsic->getIntrinsicID());

  // Inline asm whose outputs are all SGPRs is uniform; any other call is not
  // known to be.
  if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
    if (CI->isInlineAsm())
      return !isInlineAsmSourceOfDivergence(CI);
    return false;
  }

  // In most cases TID / wavefrontsize is uniform.
  //
  // However, if a kernel has uneven dimensions we can have a value of
  // workitem-id-x divided by the wavefrontsize non-uniform. For example
  // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
  // packed into a same wave which gives 1 and 0 after the division by 64
  // respectively.
  //
  // FIXME: limit it to 1D kernels only, although it should be possible
  // to perform this optimization if the size of the X dimension is a power
  // of 2; we just do not currently have infrastructure to query it.
  using namespace llvm::PatternMatch;
  uint64_t C;
  if (match(V, P: m_LShr(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       R: m_ConstantInt(V&: C))) ||
      match(V, P: m_AShr(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       R: m_ConstantInt(V&: C)))) {
    const Function *F = cast<Instruction>(Val: V)->getFunction();
    // Uniform only when shifting away at least log2(wavesize) bits and the
    // kernel is effectively 1D (max workitem id is 0 in Y and Z).
    return C >= ST->getWavefrontSizeLog2() &&
           ST->getMaxWorkitemID(Kernel: *F, Dimension: 1) == 0 && ST->getMaxWorkitemID(Kernel: *F, Dimension: 2) == 0;
  }

  Value *Mask;
  if (match(V, P: m_c_And(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                        R: m_Value(V&: Mask)))) {
    const Function *F = cast<Instruction>(Val: V)->getFunction();
    const DataLayout &DL = F->getDataLayout();
    // Masking off all the low wavefront-size bits is uniform under the same
    // 1D-kernel condition as the shift case above.
    return computeKnownBits(V: Mask, DL).countMinTrailingZeros() >=
               ST->getWavefrontSizeLog2() &&
           ST->getMaxWorkitemID(Kernel: *F, Dimension: 1) == 0 && ST->getMaxWorkitemID(Kernel: *F, Dimension: 2) == 0;
  }

  // The remaining cases look through extractvalue from a call result.
  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(Val: V);
  if (!ExtValue)
    return false;

  const CallInst *CI = dyn_cast<CallInst>(Val: ExtValue->getOperand(i_nocapture: 0));
  if (!CI)
    return false;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: CI)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else: {
      // Element 1 of the amdgcn.if/else result struct is uniform.
      ArrayRef<unsigned> Indices = ExtValue->getIndices();
      return Indices.size() == 1 && Indices[0] == 1;
    }
    }
  }

  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
  // divergent for the overall struct return. We need to override it in the
  // case we're extracting an SGPR component here.
  if (CI->isInlineAsm())
    return !isInlineAsmSourceOfDivergence(CI, Indices: ExtValue->getIndices());

  return false;
}
1083
1084bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1085 Intrinsic::ID IID) const {
1086 switch (IID) {
1087 case Intrinsic::amdgcn_is_shared:
1088 case Intrinsic::amdgcn_is_private:
1089 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1090 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1091 case Intrinsic::amdgcn_load_to_lds:
1092 case Intrinsic::amdgcn_make_buffer_rsrc:
1093 OpIndexes.push_back(Elt: 0);
1094 return true;
1095 default:
1096 return false;
1097 }
1098}
1099
Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                    Value *OldV,
                                                    Value *NewV) const {
  auto IntrID = II->getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    // Once the pointer's address space is known, the query folds to a
    // constant true/false.
    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    LLVMContext &Ctx = NewV->getType()->getContext();
    ConstantInt *NewVal = (TrueAS == NewAS) ?
      ConstantInt::getTrue(Context&: Ctx) : ConstantInt::getFalse(Context&: Ctx);
    return NewVal;
  }
  case Intrinsic::ptrmask: {
    unsigned OldAS = OldV->getType()->getPointerAddressSpace();
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    Value *MaskOp = II->getArgOperand(i: 1);
    Type *MaskTy = MaskOp->getType();

    // Whether the 64-bit mask must be narrowed to match a 32-bit pointer.
    bool DoTruncate = false;

    const GCNTargetMachine &TM =
        static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
    if (!TM.isNoopAddrSpaceCast(SrcAS: OldAS, DestAS: NewAS)) {
      // All valid 64-bit to 32-bit casts work by chopping off the high
      // bits. Any masking only clearing the low bits will also apply in the new
      // address space.
      if (DL.getPointerSizeInBits(AS: OldAS) != 64 ||
          DL.getPointerSizeInBits(AS: NewAS) != 32)
        return nullptr;

      // TODO: Do we need to thread more context in here?
      KnownBits Known = computeKnownBits(V: MaskOp, DL, AC: nullptr, CxtI: II);
      if (Known.countMinLeadingOnes() < 32)
        return nullptr;

      DoTruncate = true;
    }

    // Rebuild the ptrmask with the new pointer type (and narrowed mask if
    // required).
    IRBuilder<> B(II);
    if (DoTruncate) {
      MaskTy = B.getInt32Ty();
      MaskOp = B.CreateTrunc(V: MaskOp, DestTy: MaskTy);
    }

    return B.CreateIntrinsic(ID: Intrinsic::ptrmask, Types: {NewV->getType(), MaskTy},
                             Args: {NewV, MaskOp});
  }
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num: {
    // Retarget the flat atomic at the new (global-like) address space by
    // re-declaring the intrinsic with the new pointer type in place.
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    unsigned NewAS = SrcTy->getPointerAddressSpace();
    if (!AMDGPU::isExtendedGlobalAddrSpace(AS: NewAS))
      return nullptr;
    Module *M = II->getModule();
    Function *NewDecl = Intrinsic::getOrInsertDeclaration(
        M, id: II->getIntrinsicID(), Tys: {DestTy, SrcTy, DestTy});
    II->setArgOperand(i: 0, v: NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  case Intrinsic::amdgcn_load_to_lds: {
    // Same in-place re-declaration, keyed only on the source pointer type.
    Type *SrcTy = NewV->getType();
    Module *M = II->getModule();
    Function *NewDecl =
        Intrinsic::getOrInsertDeclaration(M, id: II->getIntrinsicID(), Tys: {SrcTy});
    II->setArgOperand(i: 0, v: NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  case Intrinsic::amdgcn_make_buffer_rsrc: {
    // Re-declare with the new base pointer type; the result type is kept.
    Type *SrcTy = NewV->getType();
    Type *DstTy = II->getType();
    Module *M = II->getModule();
    Function *NewDecl = Intrinsic::getOrInsertDeclaration(
        M, id: II->getIntrinsicID(), Tys: {DstTy, SrcTy});
    II->setArgOperand(i: 0, v: NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  default:
    // nullptr signals that this intrinsic cannot be rewritten.
    return nullptr;
  }
}
1187
InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *DstTy, VectorType *SrcTy,
                                           ArrayRef<int> Mask,
                                           TTI::TargetCostKind CostKind,
                                           int Index, VectorType *SubTp,
                                           ArrayRef<const Value *> Args,
                                           const Instruction *CxtI) const {
  // Scalable vectors are not handled here.
  if (!isa<FixedVectorType>(Val: SrcTy))
    return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
                                 SubTp);

  // Refine the shuffle kind from the concrete mask if possible.
  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp);

  // The special-cased costing below only applies to sub-dword (8/16-bit)
  // elements on VI+.
  unsigned ScalarSize = DL.getTypeSizeInBits(Ty: SrcTy->getElementType());
  if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      (ScalarSize == 16 || ScalarSize == 8)) {
    // Larger vector widths may require additional instructions, but are
    // typically cheaper than scalarized versions.
    unsigned NumVectorElts = cast<FixedVectorType>(Val: SrcTy)->getNumElements();
    // Only elements actually selected by the mask (!= -1) contribute cost.
    unsigned RequestedElts =
        count_if(Range&: Mask, P: [](int MaskElt) { return MaskElt != -1; });
    // How many sub-dword elements fit in one 32-bit register.
    unsigned EltsPerReg = 32 / ScalarSize;
    if (RequestedElts == 0)
      return 0;
    switch (Kind) {
    case TTI::SK_Broadcast:
    case TTI::SK_Reverse:
    case TTI::SK_PermuteSingleSrc: {
      // With op_sel VOP3P instructions freely can access the low half or high
      // half of a register, so any swizzle of two elements is free.
      if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
        return 0;
      // One permute instruction per 32-bit register touched, plus the cost
      // of materializing each distinct permute mask.
      unsigned NumPerms = alignTo(Value: RequestedElts, Align: EltsPerReg) / EltsPerReg;
      // SK_Broadcast just reuses the same mask
      unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
      return NumPerms + NumPermMasks;
    }
    case TTI::SK_ExtractSubvector:
    case TTI::SK_InsertSubvector: {
      // Even aligned accesses are free
      if (!(Index % 2))
        return 0;
      // Insert/extract subvectors only require shifts / extract code to get the
      // relevant bits
      return alignTo(Value: RequestedElts, Align: EltsPerReg) / EltsPerReg;
    }
    case TTI::SK_PermuteTwoSrc:
    case TTI::SK_Splice:
    case TTI::SK_Select: {
      unsigned NumPerms = alignTo(Value: RequestedElts, Align: EltsPerReg) / EltsPerReg;
      // SK_Select just reuses the same mask
      unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
      return NumPerms + NumPermMasks;
    }

    default:
      break;
    }
  }

  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
                               SubTp);
}
1251
1252/// Whether it is profitable to sink the operands of an
1253/// Instruction I to the basic block of I.
1254/// This helps using several modifiers (like abs and neg) more often.
1255bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
1256 SmallVectorImpl<Use *> &Ops) const {
1257 using namespace PatternMatch;
1258
1259 for (auto &Op : I->operands()) {
1260 // Ensure we are not already sinking this operand.
1261 if (any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op.get(); }))
1262 continue;
1263
1264 if (match(V: &Op, P: m_FAbs(Op0: m_Value())) || match(V: &Op, P: m_FNeg(X: m_Value())))
1265 Ops.push_back(Elt: &Op);
1266 }
1267
1268 return !Ops.empty();
1269}
1270
bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const GCNSubtarget *CallerST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
  const GCNSubtarget *CalleeST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));

  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();

  // Ignoring the features in InlineFeatureIgnoreList, the callee's features
  // must be a subset of the caller's.
  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
  // no way to support merge for backend defined attributes.
  SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
  SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
  if (!CallerMode.isInlineCompatible(CalleeMode))
    return false;

  // Explicit inlining hints bypass the size heuristics below.
  if (Callee->hasFnAttribute(Kind: Attribute::AlwaysInline) ||
      Callee->hasFnAttribute(Kind: Attribute::InlineHint))
    return true;

  // Hack to make compile times reasonable.
  if (InlineMaxBB) {
    // Single BB does not increase total BB amount.
    if (Callee->size() == 1)
      return true;
    // Allow inlining only while the combined block count stays within the
    // cap (the callee's entry merges into the call site, hence -1).
    size_t BBSize = Caller->size() + Callee->size() - 1;
    return BBSize <= InlineMaxBB;
  }

  return true;
}
1309
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
                                                   const SITargetLowering *TLI,
                                                   const GCNTTIImpl *TTIImpl) {
  // Register budgets for argument passing before spilling to the stack
  // begins (heuristic constants).
  const int NrOfSGPRUntilSpill = 26;
  const int NrOfVGPRUntilSpill = 32;

  const DataLayout &DL = TTIImpl->getDataLayout();

  unsigned adjustThreshold = 0;
  int SGPRsInUse = 0;
  int VGPRsInUse = 0;
  // Count the SGPRs/VGPRs the call's arguments occupy, per legalized value
  // type of each argument.
  for (const Use &A : CB->args()) {
    SmallVector<EVT, 4> ValueVTs;
    ComputeValueVTs(TLI: *TLI, DL, Ty: A.get()->getType(), ValueVTs);
    for (auto ArgVT : ValueVTs) {
      unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
          Context&: CB->getContext(), CC: CB->getCallingConv(), VT: ArgVT);
      if (AMDGPU::isArgPassedInSGPR(CB, ArgNo: CB->getArgOperandNo(U: &A)))
        SGPRsInUse += CCRegNum;
      else
        VGPRsInUse += CCRegNum;
    }
  }

  // The cost of passing function arguments through the stack:
  // 1 instruction to put a function argument on the stack in the caller.
  // 1 instruction to take a function argument from the stack in callee.
  // 1 instruction is explicitly take care of data dependencies in callee
  // function.
  InstructionCost ArgStackCost(1);
  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
      Opcode: Instruction::Store, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align(4),
      AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency);
  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
      Opcode: Instruction::Load, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align(4),
      AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency);

  // The penalty cost is computed relative to the cost of instructions and does
  // not model any storage costs. Only registers beyond the spill budget
  // contribute.
  adjustThreshold += std::max(a: 0, b: SGPRsInUse - NrOfSGPRUntilSpill) *
                     ArgStackCost.getValue() * InlineConstants::getInstrCost();
  adjustThreshold += std::max(a: 0, b: VGPRsInUse - NrOfVGPRUntilSpill) *
                     ArgStackCost.getValue() * InlineConstants::getInstrCost();
  return adjustThreshold;
}
1355
1356static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1357 const DataLayout &DL) {
1358 // If we have a pointer to a private array passed into a function
1359 // it will not be optimized out, leaving scratch usage.
1360 // This function calculates the total size in bytes of the memory that would
1361 // end in scratch if the call was not inlined.
1362 unsigned AllocaSize = 0;
1363 SmallPtrSet<const AllocaInst *, 8> AIVisited;
1364 for (Value *PtrArg : CB->args()) {
1365 PointerType *Ty = dyn_cast<PointerType>(Val: PtrArg->getType());
1366 if (!Ty)
1367 continue;
1368
1369 unsigned AddrSpace = Ty->getAddressSpace();
1370 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1371 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1372 continue;
1373
1374 const AllocaInst *AI = dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: PtrArg));
1375 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(Ptr: AI).second)
1376 continue;
1377
1378 AllocaSize += DL.getTypeAllocSize(Ty: AI->getAllocatedType());
1379 }
1380 return AllocaSize;
1381}
1382
1383int GCNTTIImpl::getInliningLastCallToStaticBonus() const {
1384 return BaseT::getInliningLastCallToStaticBonus() *
1385 getInliningThresholdMultiplier();
1386}
1387
1388unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1389 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, TTIImpl: this);
1390
1391 // Private object passed as arguments may end up in scratch usage if the call
1392 // is not inlined. Increase the inline threshold to promote inlining.
1393 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1394 if (AllocaSize > 0)
1395 Threshold += ArgAllocaCost;
1396 return Threshold;
1397}
1398
unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
                                         const AllocaInst *AI) const {

  // Below the cutoff, assume that the private memory objects would be
  // optimized
  auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
  if (AllocaSize <= ArgAllocaCutoff)
    return 0;

  // Above the cutoff, we give a cost to each private memory object
  // depending its size. If the array can be optimized by SROA this cost is not
  // added to the total-cost in the inliner cost analysis.
  //
  // We choose the total cost of the alloca such that their sum cancels the
  // bonus given in the threshold (ArgAllocaCost).
  //
  // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
  //
  // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
  // the single-bb bonus and the vector-bonus.
  //
  // We compensate the first two multipliers, by repeating logic from the
  // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
  static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
  unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();

  // Mirror the inliner's single-basic-block bonus: a callee with no
  // multi-successor terminators gets a 1.5x threshold.
  bool SingleBB = none_of(Range&: *CB->getCalledFunction(), P: [](const BasicBlock &BB) {
    return BB.getTerminator()->getNumSuccessors() > 1;
  });
  if (SingleBB) {
    Threshold += Threshold / 2;
  }

  auto ArgAllocaSize = DL.getTypeAllocSize(Ty: AI->getAllocatedType());

  // Attribute the bonus proportionally to the alloca size
  // (AllocaSize > ArgAllocaCutoff >= 0 here, so the division is safe).
  unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;

  return AllocaThresholdBonus;
}
1439
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) const {
  // Delegate to the common AMDGPU implementation.
  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
}
1445
void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) const {
  // Delegate to the common AMDGPU implementation.
  CommonTTI.getPeelingPreferences(L, SE, PP);
}
1450
1451int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1452 return ST->hasFullRate64Ops()
1453 ? getFullRateInstrCost()
1454 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1455 : getQuarterRateInstrCost(CostKind);
1456}
1457
1458std::pair<InstructionCost, MVT>
1459GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1460 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1461 auto Size = DL.getTypeSizeInBits(Ty);
1462 // Maximum load or store can handle 8 dwords for scalar and 4 for
1463 // vector ALU. Let's assume anything above 8 dwords is expensive
1464 // even if legal.
1465 if (Size <= 256)
1466 return Cost;
1467
1468 Cost.first += (Size + 255) / 256;
1469 return Cost;
1470}
1471
1472unsigned GCNTTIImpl::getPrefetchDistance() const {
1473 return ST->hasPrefetch() ? 128 : 0;
1474}
1475
1476bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
1477 return AMDGPU::isFlatGlobalAddrSpace(AS);
1478}
1479
1480void GCNTTIImpl::collectKernelLaunchBounds(
1481 const Function &F,
1482 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
1483 SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
1484 LB.push_back(Elt: {"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1485 LB.push_back(Elt: {"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1486 LB.push_back(Elt: {"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1487 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1488 ST->getFlatWorkGroupSizes(F);
1489 LB.push_back(Elt: {"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1490 LB.push_back(Elt: {"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1491 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
1492 LB.push_back(Elt: {"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1493 LB.push_back(Elt: {"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1494}
1495
1496GCNTTIImpl::KnownIEEEMode
1497GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
1498 if (!ST->hasIEEEMode()) // Only mode on gfx12
1499 return KnownIEEEMode::On;
1500
1501 const Function *F = I.getFunction();
1502 if (!F)
1503 return KnownIEEEMode::Unknown;
1504
1505 Attribute IEEEAttr = F->getFnAttribute(Kind: "amdgpu-ieee");
1506 if (IEEEAttr.isValid())
1507 return IEEEAttr.getValueAsBool() ? KnownIEEEMode::On : KnownIEEEMode::Off;
1508
1509 return AMDGPU::isShader(CC: F->getCallingConv()) ? KnownIEEEMode::Off
1510 : KnownIEEEMode::On;
1511}
1512
1513InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1514 Align Alignment,
1515 unsigned AddressSpace,
1516 TTI::TargetCostKind CostKind,
1517 TTI::OperandValueInfo OpInfo,
1518 const Instruction *I) const {
1519 if (VectorType *VecTy = dyn_cast<VectorType>(Val: Src)) {
1520 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1521 VecTy->getElementType()->isIntegerTy(Bitwidth: 8)) {
1522 return divideCeil(Numerator: DL.getTypeSizeInBits(Ty: VecTy) - 1,
1523 Denominator: getLoadStoreVecRegBitWidth(AddrSpace: AddressSpace));
1524 }
1525 }
1526 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1527 OpInfo, I);
1528}
1529
1530unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
1531 if (VectorType *VecTy = dyn_cast<VectorType>(Val: Tp)) {
1532 if (VecTy->getElementType()->isIntegerTy(Bitwidth: 8)) {
1533 unsigned ElementCount = VecTy->getElementCount().getFixedValue();
1534 return divideCeil(Numerator: ElementCount - 1, Denominator: 4);
1535 }
1536 }
1537 return BaseT::getNumberOfParts(Tp);
1538}
1539