//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/KnownBits.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(
    "amdgpu-unroll-threshold-private",
    cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
    cl::init(2700), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
    "amdgpu-unroll-threshold-local",
    cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
    cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
    "amdgpu-unroll-threshold-if",
    cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
    cl::init(200), cl::Hidden);

static cl::opt<bool> UnrollRuntimeLocal(
    "amdgpu-unroll-runtime-local",
    cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
    cl::init(true), cl::Hidden);

static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
    "amdgpu-unroll-max-block-to-analyze",
    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
    cl::init(32), cl::Hidden);

static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
                                       cl::Hidden, cl::init(4000),
                                       cl::desc("Cost of alloca argument"));

// If the amount of scratch memory to eliminate exceeds our ability to allocate
// it into registers we gain nothing by aggressively inlining functions for that
// heuristic.
static cl::opt<unsigned>
    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
                    cl::init(256),
                    cl::desc("Maximum alloca size to use for inline cost"));

// Inliner constraint to achieve reasonable compilation time.
static cl::opt<size_t> InlineMaxBB(
    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
    cl::desc("Maximum number of BBs allowed in a function after inlining"
             " (compile time constraint)"));

// This default unroll factor is based on microbenchmarks on gfx1030.
static cl::opt<unsigned> MemcpyLoopUnroll(
    "amdgpu-memcpy-loop-unroll",
    cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
             "operations when lowering memcpy as a loop"),
    cl::init(16), cl::Hidden);
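// With the default factor of 16 and a constant copy length,
// getMemcpyLoopLoweringType below returns <64 x i32> (16 unrolled 4x32-bit
// operations), i.e. 256 bytes copied per loop iteration.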
85
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop *SubLoop) {
            return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth + 1))
      return true;
  }
  return false;
}
104
105AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
106 : BaseT(TM, F.getDataLayout()),
107 TargetTriple(TM->getTargetTriple()),
108 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
109 TLI(ST->getTargetLowering()) {}
110
111void AMDGPUTTIImpl::getUnrollingPreferences(
112 Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
113 OptimizationRemarkEmitter *ORE) const {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold =
      F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // A conditional branch in a loop back edge needs 3 additional exec
  // manipulations on average.
  UP.BEInsns += 3;
123
124 // We want to run unroll even for the loops which have been vectorized.
125 UP.UnrollVectorizedLoop = true;
126
127 // TODO: Do we want runtime unrolling?
128
  // Maximum alloca size that can fit in registers. Reserve 16 registers.
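  // That is (256 - 16) registers of 4 bytes each, i.e. 960 bytes.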
130 const unsigned MaxAlloca = (256 - 16) * 4;
131 unsigned ThresholdPrivate = UnrollThresholdPrivate;
132 unsigned ThresholdLocal = UnrollThresholdLocal;
133
  // If this loop has the amdgpu.loop.unroll.threshold metadata, we will use
  // the provided threshold value as the default for Threshold.
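  // The metadata is expected in the usual loop-metadata form, for example:
  //   br ..., !llvm.loop !0
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"amdgpu.loop.unroll.threshold", i32 100}
  // i.e. an option node with two operands: the name string and the value.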
  if (MDNode *LoopUnrollThreshold =
          findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
    if (LoopUnrollThreshold->getNumOperands() == 2) {
      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
          LoopUnrollThreshold->getOperand(1));
      if (MetaThresholdValue) {
        // We will also use the supplied value for PartialThreshold for now.
        // We may introduce additional metadata if it becomes necessary in the
        // future.
        UP.Threshold = MetaThresholdValue->getSExtValue();
        UP.PartialThreshold = UP.Threshold;
        ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
        ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
      }
    }
  }
152
153 unsigned MaxBoost = std::max(a: ThresholdPrivate, b: ThresholdLocal);
154 for (const BasicBlock *BB : L->getBlocks()) {
155 const DataLayout &DL = BB->getDataLayout();
156 unsigned LocalGEPsSeen = 0;
157
158 if (llvm::any_of(Range: L->getSubLoops(), P: [BB](const Loop* SubLoop) {
159 return SubLoop->contains(BB); }))
160 continue; // Block belongs to an inner loop.
161
162 for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate the
      // if region and potentially even the PHI itself, saving on both
      // divergence and the registers used for the PHI.
      // Add a small bonus for each such "if" statement.
168 if (const BranchInst *Br = dyn_cast<BranchInst>(Val: &I)) {
169 if (UP.Threshold < MaxBoost && Br->isConditional()) {
170 BasicBlock *Succ0 = Br->getSuccessor(i: 0);
171 BasicBlock *Succ1 = Br->getSuccessor(i: 1);
172 if ((L->contains(BB: Succ0) && L->isLoopExiting(BB: Succ0)) ||
173 (L->contains(BB: Succ1) && L->isLoopExiting(BB: Succ1)))
174 continue;
175 if (dependsOnLocalPhi(L, Cond: Br->getCondition())) {
176 UP.Threshold += UnrollThresholdIf;
177 LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
178 << " for loop:\n"
179 << *L << " due to " << *Br << '\n');
180 if (UP.Threshold >= MaxBoost)
181 return;
182 }
183 }
184 continue;
185 }
186
187 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: &I);
188 if (!GEP)
189 continue;
190
191 unsigned AS = GEP->getAddressSpace();
192 unsigned Threshold = 0;
193 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
194 Threshold = ThresholdPrivate;
195 else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
196 Threshold = ThresholdLocal;
197 else
198 continue;
199
200 if (UP.Threshold >= Threshold)
201 continue;
202
203 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
204 const Value *Ptr = GEP->getPointerOperand();
205 const AllocaInst *Alloca =
206 dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: Ptr));
207 if (!Alloca || !Alloca->isStaticAlloca())
208 continue;
209 auto AllocaSize = Alloca->getAllocationSize(DL);
210 if (!AllocaSize || AllocaSize->getFixedValue() > MaxAlloca)
211 continue;
212 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
213 AS == AMDGPUAS::REGION_ADDRESS) {
214 LocalGEPsSeen++;
        // Inhibit unrolling for local memory if we have seen addressing that
        // is not to a variable; most likely we will be unable to combine it.
        // Do not unroll too-deep inner loops for local memory, to give an
        // outer loop a chance to be unrolled for a more important reason.
219 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2)
220 continue;
221
222 const Value *V = getUnderlyingObject(V: GEP->getPointerOperand());
223 if (!isa<GlobalVariable>(Val: V) && !isa<Argument>(Val: V))
224 continue;
225
226 LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
227 << *L << " due to LDS use.\n");
228 UP.Runtime = UnrollRuntimeLocal;
229 }
230
231 // Check if GEP depends on a value defined by this loop itself.
232 bool HasLoopDef = false;
233 for (const Value *Op : GEP->operands()) {
234 const Instruction *Inst = dyn_cast<Instruction>(Val: Op);
235 if (!Inst || L->isLoopInvariant(V: Op))
236 continue;
237
238 if (llvm::any_of(Range: L->getSubLoops(), P: [Inst](const Loop* SubLoop) {
239 return SubLoop->contains(Inst); }))
240 continue;
241 HasLoopDef = true;
242 break;
243 }
244 if (!HasLoopDef)
245 continue;
246
      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. Allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
254 //
255 // We also want to have more unrolling for local memory to let ds
256 // instructions with different offsets combine.
257 //
258 // Don't use the maximum allowed value here as it will make some
259 // programs way too big.
260 UP.Threshold = Threshold;
261 LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
262 << " for loop:\n"
263 << *L << " due to " << *GEP << '\n');
264 if (UP.Threshold >= MaxBoost)
265 return;
266 }
267
    // If we got a GEP in a small BB from an inner loop, then increase the max
    // trip count to analyze for a better cost estimation in unroll.
270 if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
271 UP.MaxIterationsCountToAnalyze = 32;
272 }
273}
274
275void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
276 TTI::PeelingPreferences &PP) const {
277 BaseT::getPeelingPreferences(L, SE, PP);
278}
279
280uint64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
281 return 1024;
282}
283
284const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
285 // Codegen control options which don't matter.
286 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
287 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureUseFlatForGlobal,
288 AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
289 AMDGPU::FeatureUnalignedAccessMode,
290
291 AMDGPU::FeatureAutoWaitcntBeforeBarrier,
292
293 // Property of the kernel/environment which can't actually differ.
294 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
295 AMDGPU::FeatureTrapHandler,
296
    // The default assumption needs to be that ECC is enabled, but no directly
    // exposed operations depend on it, so it can be safely inlined.
299 AMDGPU::FeatureSRAMECC,
300
301 // Perf-tuning features
302 AMDGPU::FeatureFastFMAF32, AMDGPU::FeatureHalfRate64Ops};
303
304GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
305 : BaseT(TM, F.getDataLayout()),
306 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
307 TLI(ST->getTargetLowering()), CommonTTI(TM, F),
308 IsGraphics(AMDGPU::isGraphics(CC: F.getCallingConv())) {
309 SIModeRegisterDefaults Mode(F, *ST);
310 HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
311 HasFP64FP16Denormals =
312 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
313}
314
315bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
316 return !F || !ST->isSingleLaneExecution(Kernel: *F);
317}
318
319unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
320 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
321 // registers. See getRegisterClassForType for the implementation.
322 // In this case vector registers are not vector in terms of
323 // VGPRs, but those which can hold multiple values.
324
325 // This is really the number of registers to fill when vectorizing /
326 // interleaving loops, so we lie to avoid trying to use all registers.
327 return 4;
328}
329
330TypeSize
331GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
332 switch (K) {
333 case TargetTransformInfo::RGK_Scalar:
334 return TypeSize::getFixed(ExactSize: 32);
335 case TargetTransformInfo::RGK_FixedWidthVector:
336 return TypeSize::getFixed(ExactSize: ST->hasPackedFP32Ops() ? 64 : 32);
337 case TargetTransformInfo::RGK_ScalableVector:
338 return TypeSize::getScalable(MinimumSize: 0);
339 }
340 llvm_unreachable("Unsupported register kind");
341}
342
343unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344 return 32;
345}
346
347unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
348 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
349 return 32 * 4 / ElemWidth;
  // For a given width return the max number of elements that can be combined
  // into a wider bit value:
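  // For example, with 16-bit instructions four 8-bit or two 16-bit elements
  // can be packed into a 32-bit value, and with packed FP32 ops two 32-bit
  // elements can be paired.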
352 return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
353 : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
354 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
355 : 1;
356}
357
358unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
359 unsigned ChainSizeInBytes,
360 VectorType *VecTy) const {
361 unsigned VecRegBitWidth = VF * LoadSize;
362 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
363 // TODO: Support element-size less than 32bit?
364 return 128 / LoadSize;
365
366 return VF;
367}
368
369unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
370 unsigned ChainSizeInBytes,
371 VectorType *VecTy) const {
372 unsigned VecRegBitWidth = VF * StoreSize;
373 if (VecRegBitWidth > 128)
374 return 128 / StoreSize;
375
376 return VF;
377}
378
379unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
380 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
381 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
382 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
383 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
384 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
385 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
386 return 512;
387 }
388
389 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
390 return 8 * ST->getMaxPrivateElementSize();
391
392 // Common to flat, global, local and region. Assume for unknown addrspace.
393 return 128;
394}
395
396bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
397 Align Alignment,
398 unsigned AddrSpace) const {
399 // We allow vectorization of flat stores, even though we may need to decompose
400 // them later if they may access private memory. We don't have enough context
401 // here, and legalization can handle it.
402 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
403 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
404 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
405 }
406 return true;
407}
408
409bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
410 Align Alignment,
411 unsigned AddrSpace) const {
412 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
413}
414
415bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
416 Align Alignment,
417 unsigned AddrSpace) const {
418 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
419}
420
421uint64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
422 return 1024;
423}
424
425Type *GCNTTIImpl::getMemcpyLoopLoweringType(
426 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
427 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
428 std::optional<uint32_t> AtomicElementSize) const {
429
430 if (AtomicElementSize)
431 return Type::getIntNTy(C&: Context, N: *AtomicElementSize * 8);
432
433 // 16-byte accesses achieve the highest copy throughput.
434 // If the operation has a fixed known length that is large enough, it is
435 // worthwhile to return an even wider type and let legalization lower it into
436 // multiple accesses, effectively unrolling the memcpy loop.
437 // We also rely on legalization to decompose into smaller accesses for
438 // subtargets and address spaces where it is necessary.
439 //
440 // Don't unroll if Length is not a constant, since unrolling leads to worse
441 // performance for length values that are smaller or slightly larger than the
442 // total size of the type returned here. Mitigating that would require a more
443 // complex lowering for variable-length memcpy and memmove.
444 unsigned I32EltsInVector = 4;
445 if (MemcpyLoopUnroll > 0 && isa<ConstantInt>(Val: Length))
446 return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context),
447 NumElts: MemcpyLoopUnroll * I32EltsInVector);
448
449 return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: I32EltsInVector);
450}
451
452void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
453 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
454 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
455 Align SrcAlign, Align DestAlign,
456 std::optional<uint32_t> AtomicCpySize) const {
457
458 if (AtomicCpySize)
459 BaseT::getMemcpyLoopResidualLoweringType(
460 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
461 DestAlign, AtomicCpySize);
462
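  // The residue is decomposed greedily into 16-, 8-, 4-, 2- and 1-byte
  // operations below; for example, 23 remaining bytes become <4 x i32>, i32,
  // i16 and i8.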
463 Type *I32x4Ty = FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: 4);
464 while (RemainingBytes >= 16) {
465 OpsOut.push_back(Elt: I32x4Ty);
466 RemainingBytes -= 16;
467 }
468
469 Type *I64Ty = Type::getInt64Ty(C&: Context);
470 while (RemainingBytes >= 8) {
471 OpsOut.push_back(Elt: I64Ty);
472 RemainingBytes -= 8;
473 }
474
475 Type *I32Ty = Type::getInt32Ty(C&: Context);
476 while (RemainingBytes >= 4) {
477 OpsOut.push_back(Elt: I32Ty);
478 RemainingBytes -= 4;
479 }
480
481 Type *I16Ty = Type::getInt16Ty(C&: Context);
482 while (RemainingBytes >= 2) {
483 OpsOut.push_back(Elt: I16Ty);
484 RemainingBytes -= 2;
485 }
486
487 Type *I8Ty = Type::getInt8Ty(C&: Context);
488 while (RemainingBytes) {
489 OpsOut.push_back(Elt: I8Ty);
490 --RemainingBytes;
491 }
492}
493
494unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
495 // Disable unrolling if the loop is not vectorized.
496 // TODO: Enable this again.
497 if (VF.isScalar())
498 return 1;
499
500 return 8;
501}
502
503bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
504 MemIntrinsicInfo &Info) const {
505 switch (Inst->getIntrinsicID()) {
506 case Intrinsic::amdgcn_ds_ordered_add:
507 case Intrinsic::amdgcn_ds_ordered_swap: {
508 auto *Ordering = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: 2));
509 auto *Volatile = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: 4));
510 if (!Ordering || !Volatile)
511 return false; // Invalid.
512
513 unsigned OrderingVal = Ordering->getZExtValue();
514 if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
515 return false;
516
517 Info.PtrVal = Inst->getArgOperand(i: 0);
518 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
519 Info.ReadMem = true;
520 Info.WriteMem = true;
521 Info.IsVolatile = !Volatile->isZero();
522 return true;
523 }
524 default:
525 return false;
526 }
527}
528
529InstructionCost GCNTTIImpl::getArithmeticInstrCost(
530 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
531 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
532 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
533
534 // Legalize the type.
535 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
536 int ISD = TLI->InstructionOpcodeToISD(Opcode);
537
  // Because we don't have any legal vector operations, only legal vector
  // types, we need to account for split vectors.
540 unsigned NElts = LT.second.isVector() ?
541 LT.second.getVectorNumElements() : 1;
542
543 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
544
545 switch (ISD) {
546 case ISD::SHL:
547 case ISD::SRL:
548 case ISD::SRA:
549 if (SLT == MVT::i64)
550 return get64BitInstrCost(CostKind) * LT.first * NElts;
551
552 if (ST->has16BitInsts() && SLT == MVT::i16)
553 NElts = (NElts + 1) / 2;
554
555 // i32
556 return getFullRateInstrCost() * LT.first * NElts;
557 case ISD::ADD:
558 case ISD::SUB:
559 case ISD::AND:
560 case ISD::OR:
561 case ISD::XOR:
562 if (SLT == MVT::i64) {
563 // and, or and xor are typically split into 2 VALU instructions.
564 return 2 * getFullRateInstrCost() * LT.first * NElts;
565 }
566
567 if (ST->has16BitInsts() && SLT == MVT::i16)
568 NElts = (NElts + 1) / 2;
569
570 return LT.first * NElts * getFullRateInstrCost();
571 case ISD::MUL: {
572 const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
573 if (SLT == MVT::i64) {
574 const int FullRateCost = getFullRateInstrCost();
575 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
576 }
577
578 if (ST->has16BitInsts() && SLT == MVT::i16)
579 NElts = (NElts + 1) / 2;
580
581 // i32
582 return QuarterRateCost * NElts * LT.first;
583 }
584 case ISD::FMUL:
    // Check for a possible fusion of {fadd|fsub}(a, fmul(b, c)) and return a
    // zero cost for the fmul(b, c), assuming the fadd|fsub will get the
    // estimated cost of the whole fused operation.
588 if (CxtI && CxtI->hasOneUse())
589 if (const auto *FAdd = dyn_cast<BinaryOperator>(Val: *CxtI->user_begin())) {
590 const int OPC = TLI->InstructionOpcodeToISD(Opcode: FAdd->getOpcode());
591 if (OPC == ISD::FADD || OPC == ISD::FSUB) {
592 if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
593 return TargetTransformInfo::TCC_Free;
594 if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
595 return TargetTransformInfo::TCC_Free;
596
597 // Estimate all types may be fused with contract/unsafe flags
598 const TargetOptions &Options = TLI->getTargetMachine().Options;
599 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
600 (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
601 return TargetTransformInfo::TCC_Free;
602 }
603 }
604 [[fallthrough]];
605 case ISD::FADD:
606 case ISD::FSUB:
607 if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
608 NElts = (NElts + 1) / 2;
609 if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
610 NElts = (NElts + 1) / 2;
611 if (SLT == MVT::f64)
612 return LT.first * NElts * get64BitInstrCost(CostKind);
613
614 if (ST->has16BitInsts() && SLT == MVT::f16)
615 NElts = (NElts + 1) / 2;
616
617 if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)
618 return LT.first * NElts * getFullRateInstrCost();
619 break;
620 case ISD::FDIV:
621 case ISD::FREM:
622 // FIXME: frem should be handled separately. The fdiv in it is most of it,
623 // but the current lowering is also not entirely correct.
624 if (SLT == MVT::f64) {
625 int Cost = 7 * get64BitInstrCost(CostKind) +
626 getQuarterRateInstrCost(CostKind) +
627 3 * getHalfRateInstrCost(CostKind);
628 // Add cost of workaround.
629 if (!ST->hasUsableDivScaleConditionOutput())
630 Cost += 3 * getFullRateInstrCost();
631
632 return LT.first * Cost * NElts;
633 }
634
635 if (!Args.empty() && match(V: Args[0], P: PatternMatch::m_FPOne())) {
636 // TODO: This is more complicated, unsafe flags etc.
637 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
638 (SLT == MVT::f16 && ST->has16BitInsts())) {
639 return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
640 }
641 }
642
643 if (SLT == MVT::f16 && ST->has16BitInsts()) {
644 // 2 x v_cvt_f32_f16
645 // f32 rcp
646 // f32 fmul
647 // v_cvt_f16_f32
648 // f16 div_fixup
649 int Cost =
650 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
651 return LT.first * Cost * NElts;
652 }
653
654 if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {
655 // Fast unsafe fdiv lowering:
656 // f32 rcp
657 // f32 fmul
658 int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
659 return LT.first * Cost * NElts;
660 }
661
662 if (SLT == MVT::f32 || SLT == MVT::f16) {
663 // 4 more v_cvt_* insts without f16 insts support
664 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
665 1 * getQuarterRateInstrCost(CostKind);
666
667 if (!HasFP32Denormals) {
668 // FP mode switches.
669 Cost += 2 * getFullRateInstrCost();
670 }
671
672 return LT.first * NElts * Cost;
673 }
674 break;
675 case ISD::FNEG:
    // Use the backend's estimation. If fneg is not free, each element will
    // cost one additional instruction.
678 return TLI->isFNegFree(VT: SLT) ? 0 : NElts;
679 default:
680 break;
681 }
682
683 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info,
684 Args, CxtI);
685}
686
687// Return true if there's a potential benefit from using v2f16/v2i16
688// instructions for an intrinsic, even if it requires nontrivial legalization.
689static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
690 switch (ID) {
691 case Intrinsic::fma:
692 case Intrinsic::fmuladd:
693 case Intrinsic::copysign:
694 case Intrinsic::minimumnum:
695 case Intrinsic::maximumnum:
696 case Intrinsic::canonicalize:
697 // There's a small benefit to using vector ops in the legalized code.
698 case Intrinsic::round:
699 case Intrinsic::uadd_sat:
700 case Intrinsic::usub_sat:
701 case Intrinsic::sadd_sat:
702 case Intrinsic::ssub_sat:
703 case Intrinsic::abs:
704 return true;
705 default:
706 return false;
707 }
708}
709
710InstructionCost
711GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
712 TTI::TargetCostKind CostKind) const {
713 switch (ICA.getID()) {
714 case Intrinsic::fabs:
715 // Free source modifier in the common case.
716 return 0;
717 case Intrinsic::amdgcn_workitem_id_x:
718 case Intrinsic::amdgcn_workitem_id_y:
719 case Intrinsic::amdgcn_workitem_id_z:
720 // TODO: If hasPackedTID, or if the calling context is not an entry point
721 // there may be a bit instruction.
722 return 0;
723 case Intrinsic::amdgcn_workgroup_id_x:
724 case Intrinsic::amdgcn_workgroup_id_y:
725 case Intrinsic::amdgcn_workgroup_id_z:
726 case Intrinsic::amdgcn_lds_kernel_id:
727 case Intrinsic::amdgcn_dispatch_ptr:
728 case Intrinsic::amdgcn_dispatch_id:
729 case Intrinsic::amdgcn_implicitarg_ptr:
730 case Intrinsic::amdgcn_queue_ptr:
731 // Read from an argument register.
732 return 0;
733 default:
734 break;
735 }
736
737 if (!intrinsicHasPackedVectorBenefit(ID: ICA.getID()))
738 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
739
740 Type *RetTy = ICA.getReturnType();
741
742 // Legalize the type.
743 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);
744
745 unsigned NElts = LT.second.isVector() ?
746 LT.second.getVectorNumElements() : 1;
747
748 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
749
750 if ((ST->hasVOP3PInsts() &&
751 (SLT == MVT::f16 || SLT == MVT::i16 ||
752 (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
753 (ST->hasPackedFP32Ops() && SLT == MVT::f32))
754 NElts = (NElts + 1) / 2;
755
756 // TODO: Get more refined intrinsic costs?
757 unsigned InstRate = getQuarterRateInstrCost(CostKind);
758
759 switch (ICA.getID()) {
760 case Intrinsic::fma:
761 case Intrinsic::fmuladd:
762 if (SLT == MVT::f64) {
763 InstRate = get64BitInstrCost(CostKind);
764 break;
765 }
766
767 if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
768 InstRate = getFullRateInstrCost();
769 else {
770 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
771 : getQuarterRateInstrCost(CostKind);
772 }
773 break;
774 case Intrinsic::copysign:
775 return NElts * getFullRateInstrCost();
776 case Intrinsic::minimumnum:
777 case Intrinsic::maximumnum: {
    // Instruction + 2 canonicalizes. For cases that need type promotion, the
    // promotion takes the place of the canonicalize.
780 unsigned NumOps = 3;
781 if (const IntrinsicInst *II = ICA.getInst()) {
782 // Directly legal with ieee=0
783 // TODO: Not directly legal with strictfp
784 if (fpenvIEEEMode(I: *II) == KnownIEEEMode::Off)
785 NumOps = 1;
786 }
787
788 unsigned BaseRate =
789 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
790 InstRate = BaseRate * NumOps;
791 break;
792 }
793 case Intrinsic::canonicalize: {
794 InstRate =
795 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
796 break;
797 }
798 case Intrinsic::uadd_sat:
799 case Intrinsic::usub_sat:
800 case Intrinsic::sadd_sat:
801 case Intrinsic::ssub_sat: {
802 if (SLT == MVT::i16 || SLT == MVT::i32)
803 InstRate = getFullRateInstrCost();
804
805 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
    if (any_of(ValidSatTys, equal_to(LT.second)))
807 NElts = 1;
808 break;
809 }
810 case Intrinsic::abs:
811 // Expansion takes 2 instructions for VALU
812 if (SLT == MVT::i16 || SLT == MVT::i32)
813 InstRate = 2 * getFullRateInstrCost();
814 break;
815 default:
816 break;
817 }
818
819 return LT.first * NElts * InstRate;
820}
821
822InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
823 TTI::TargetCostKind CostKind,
824 const Instruction *I) const {
825 assert((I == nullptr || I->getOpcode() == Opcode) &&
826 "Opcode should reflect passed instruction.");
827 const bool SCost =
828 (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
829 const int CBrCost = SCost ? 5 : 7;
830 switch (Opcode) {
831 case Instruction::Br: {
832 // Branch instruction takes about 4 slots on gfx900.
833 const auto *BI = dyn_cast_or_null<BranchInst>(Val: I);
834 if (BI && BI->isUnconditional())
835 return SCost ? 1 : 4;
    // Assume a conditional branch takes an additional 3 exec-manipulation
    // instructions on average.
838 return CBrCost;
839 }
840 case Instruction::Switch: {
841 const auto *SI = dyn_cast_or_null<SwitchInst>(Val: I);
    // Each case (including the default) takes 1 cmp + 1 cbr instruction on
    // average.
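    // For example, with a throughput/latency cost kind a 3-case switch is
    // modeled as (3 + 1) * (7 + 1) = 32.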
844 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
845 }
846 case Instruction::Ret:
847 return SCost ? 1 : 10;
848 }
849 return BaseT::getCFInstrCost(Opcode, CostKind, I);
850}
851
852InstructionCost
853GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
854 std::optional<FastMathFlags> FMF,
855 TTI::TargetCostKind CostKind) const {
856 if (TTI::requiresOrderedReduction(FMF))
857 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
858
859 EVT OrigTy = TLI->getValueType(DL, Ty);
860
  // Computes the cost on targets that have packed math instructions (which
  // support 16-bit types only).
863 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
864 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
865
866 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
867 return LT.first * getFullRateInstrCost();
868}
869
870InstructionCost
871GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
872 FastMathFlags FMF,
873 TTI::TargetCostKind CostKind) const {
874 EVT OrigTy = TLI->getValueType(DL, Ty);
875
  // Computes the cost on targets that have packed math instructions (which
  // support 16-bit types only).
878 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
879 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
880
881 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
882 return LT.first * getHalfRateInstrCost(CostKind);
883}
884
885InstructionCost GCNTTIImpl::getVectorInstrCost(
886 unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
887 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
888 switch (Opcode) {
889 case Instruction::ExtractElement:
890 case Instruction::InsertElement: {
891 unsigned EltSize
892 = DL.getTypeSizeInBits(Ty: cast<VectorType>(Val: ValTy)->getElementType());
893 if (EltSize < 32) {
894 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
895 return 0;
896 return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1,
897 VIC);
898 }
899
900 // Extracts are just reads of a subregister, so are free. Inserts are
901 // considered free because we don't want to have any cost for scalarizing
902 // operations, and we don't have to copy into a different register class.
903
904 // Dynamic indexing isn't free and is best avoided.
905 return Index == ~0u ? 2 : 0;
906 }
907 default:
908 return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1,
909 VIC);
910 }
911}
912
913/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
914/// this is analyzing the collective result of all output registers. Otherwise,
915/// this is only querying a specific result index if this returns multiple
916/// registers in a struct.
917bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
918 const CallInst *CI, ArrayRef<unsigned> Indices) const {
919 // TODO: Handle complex extract indices
920 if (Indices.size() > 1)
921 return true;
922
923 const DataLayout &DL = CI->getDataLayout();
924 const SIRegisterInfo *TRI = ST->getRegisterInfo();
925 TargetLowering::AsmOperandInfoVector TargetConstraints =
926 TLI->ParseConstraints(DL, TRI: ST->getRegisterInfo(), Call: *CI);
927
928 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
929
930 int OutputIdx = 0;
931 for (auto &TC : TargetConstraints) {
932 if (TC.Type != InlineAsm::isOutput)
933 continue;
934
935 // Skip outputs we don't care about.
936 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
937 continue;
938
939 TLI->ComputeConstraintToUse(OpInfo&: TC, Op: SDValue());
940
941 const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
942 TRI, Constraint: TC.ConstraintCode, VT: TC.ConstraintVT).second;
943
944 // For AGPR constraints null is returned on subtargets without AGPRs, so
945 // assume divergent for null.
946 if (!RC || !TRI->isSGPRClass(RC))
947 return true;
948 }
949
950 return false;
951}
952
953bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
954 const IntrinsicInst *ReadReg) const {
955 Metadata *MD =
956 cast<MetadataAsValue>(Val: ReadReg->getArgOperand(i: 0))->getMetadata();
957 StringRef RegName =
958 cast<MDString>(Val: cast<MDNode>(Val: MD)->getOperand(I: 0))->getString();
959
960 // Special case registers that look like VCC.
961 MVT VT = MVT::getVT(Ty: ReadReg->getType());
962 if (VT == MVT::i1)
963 return true;
964
965 // Special case scalar registers that start with 'v'.
966 if (RegName.starts_with(Prefix: "vcc") || RegName.empty())
967 return false;
968
969 // VGPR or AGPR is divergent. There aren't any specially named vector
970 // registers.
971 return RegName[0] == 'v' || RegName[0] == 'a';
972}
973
974/// \returns true if the result of the value could potentially be
975/// different across workitems in a wavefront.
976bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
977 if (const Argument *A = dyn_cast<Argument>(Val: V))
978 return !AMDGPU::isArgPassedInSGPR(Arg: A);
979
980 // Loads from the private and flat address spaces are divergent, because
981 // threads can execute the load instruction with the same inputs and get
982 // different results.
983 //
984 // All other loads are not divergent, because if threads issue loads with the
985 // same arguments, they will always get the same result.
986 if (const LoadInst *Load = dyn_cast<LoadInst>(Val: V))
987 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
988 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
989
  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // the original value.
994 if (isa<AtomicRMWInst, AtomicCmpXchgInst>(Val: V))
995 return true;
996
997 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V)) {
998 Intrinsic::ID IID = Intrinsic->getIntrinsicID();
999 switch (IID) {
1000 case Intrinsic::read_register:
1001 return isReadRegisterSourceOfDivergence(ReadReg: Intrinsic);
1002 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1003 unsigned SrcAS =
1004 Intrinsic->getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace();
1005 unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
1006 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
1007 DstAS == AMDGPUAS::FLAT_ADDRESS &&
1008 ST->hasGloballyAddressableScratch();
1009 }
1010 case Intrinsic::amdgcn_workitem_id_y:
1011 case Intrinsic::amdgcn_workitem_id_z: {
1012 const Function *F = Intrinsic->getFunction();
      bool HasUniformYZ =
          ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequiresUniformYZ=*/true);
      std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
          *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
1017 return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
1018 }
1019 default:
1020 return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: IID);
1021 }
1022 }
1023
1024 // Assume all function calls are a source of divergence.
1025 if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
1026 if (CI->isInlineAsm())
1027 return isInlineAsmSourceOfDivergence(CI);
1028 return true;
1029 }
1030
1031 // Assume all function calls are a source of divergence.
1032 if (isa<InvokeInst>(Val: V))
1033 return true;
1034
  // If the target supports globally addressable scratch, the mapping from
  // scratch memory to the flat aperture changes, and therefore an address
  // space cast is no longer uniform.
1038 if (auto *CastI = dyn_cast<AddrSpaceCastInst>(Val: V)) {
1039 return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
1040 CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
1041 ST->hasGloballyAddressableScratch();
1042 }
1043
1044 return false;
1045}
1046
1047bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
1048 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V))
1049 return AMDGPU::isIntrinsicAlwaysUniform(IntrID: Intrinsic->getIntrinsicID());
1050
1051 if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
1052 if (CI->isInlineAsm())
1053 return !isInlineAsmSourceOfDivergence(CI);
1054 return false;
1055 }
1056
1057 // In most cases TID / wavefrontsize is uniform.
1058 //
  // However, if a kernel has uneven dimensions, the value of workitem-id-x
  // divided by the wavefrontsize can be non-uniform. For example, dimensions
  // (65, 2) will have workitems with addresses (64, 0) and (0, 1) packed into
  // the same wave, which gives 1 and 0 respectively after the division by 64.
1065 // The X dimension doesn't reset within a wave if either both the Y
1066 // and Z dimensions are of length 1, or if the X dimension's required
1067 // size is a power of 2. Note, however, if the X dimension's maximum
1068 // size is a power of 2 < the wavefront size, division by the wavefront
1069 // size is guaranteed to yield 0, so this is also a no-reset case.
1070 bool XDimDoesntResetWithinWaves = false;
1071 if (auto *I = dyn_cast<Instruction>(Val: V)) {
1072 const Function *F = I->getFunction();
1073 XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(F: *F);
1074 }
1075 using namespace llvm::PatternMatch;
1076 uint64_t C;
1077 if (match(V, P: m_LShr(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1078 R: m_ConstantInt(V&: C))) ||
1079 match(V, P: m_AShr(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1080 R: m_ConstantInt(V&: C)))) {
1081 return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
1082 }
1083
1084 Value *Mask;
1085 if (match(V, P: m_c_And(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1086 R: m_Value(V&: Mask)))) {
1087 return computeKnownBits(V: Mask, DL).countMinTrailingZeros() >=
1088 ST->getWavefrontSizeLog2() &&
1089 XDimDoesntResetWithinWaves;
1090 }
1091
1092 const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(Val: V);
1093 if (!ExtValue)
1094 return false;
1095
1096 const CallInst *CI = dyn_cast<CallInst>(Val: ExtValue->getOperand(i_nocapture: 0));
1097 if (!CI)
1098 return false;
1099
1100 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: CI)) {
1101 switch (Intrinsic->getIntrinsicID()) {
1102 default:
1103 return false;
1104 case Intrinsic::amdgcn_if:
1105 case Intrinsic::amdgcn_else: {
1106 ArrayRef<unsigned> Indices = ExtValue->getIndices();
1107 return Indices.size() == 1 && Indices[0] == 1;
1108 }
1109 }
1110 }
1111
1112 // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1113 // divergent for the overall struct return. We need to override it in the
1114 // case we're extracting an SGPR component here.
1115 if (CI->isInlineAsm())
1116 return !isInlineAsmSourceOfDivergence(CI, Indices: ExtValue->getIndices());
1117
1118 return false;
1119}
1120
1121bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1122 Intrinsic::ID IID) const {
1123 switch (IID) {
1124 case Intrinsic::amdgcn_is_shared:
1125 case Intrinsic::amdgcn_is_private:
1126 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1127 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1128 case Intrinsic::amdgcn_load_to_lds:
1129 case Intrinsic::amdgcn_make_buffer_rsrc:
1130 OpIndexes.push_back(Elt: 0);
1131 return true;
1132 default:
1133 return false;
1134 }
1135}
1136
1137Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
1138 Value *OldV,
1139 Value *NewV) const {
1140 auto IntrID = II->getIntrinsicID();
1141 switch (IntrID) {
1142 case Intrinsic::amdgcn_is_shared:
1143 case Intrinsic::amdgcn_is_private: {
1144 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1145 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
1146 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1147 LLVMContext &Ctx = NewV->getType()->getContext();
1148 ConstantInt *NewVal = (TrueAS == NewAS) ?
1149 ConstantInt::getTrue(Context&: Ctx) : ConstantInt::getFalse(Context&: Ctx);
1150 return NewVal;
1151 }
1152 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1153 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1154 Type *DestTy = II->getType();
1155 Type *SrcTy = NewV->getType();
1156 unsigned NewAS = SrcTy->getPointerAddressSpace();
1157 if (!AMDGPU::isExtendedGlobalAddrSpace(AS: NewAS))
1158 return nullptr;
1159 Module *M = II->getModule();
1160 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1161 M, id: II->getIntrinsicID(), Tys: {DestTy, SrcTy, DestTy});
1162 II->setArgOperand(i: 0, v: NewV);
1163 II->setCalledFunction(NewDecl);
1164 return II;
1165 }
1166 case Intrinsic::amdgcn_load_to_lds: {
1167 Type *SrcTy = NewV->getType();
1168 Module *M = II->getModule();
1169 Function *NewDecl =
1170 Intrinsic::getOrInsertDeclaration(M, id: II->getIntrinsicID(), Tys: {SrcTy});
1171 II->setArgOperand(i: 0, v: NewV);
1172 II->setCalledFunction(NewDecl);
1173 return II;
1174 }
1175 case Intrinsic::amdgcn_make_buffer_rsrc: {
1176 Type *SrcTy = NewV->getType();
1177 Type *DstTy = II->getType();
1178 Module *M = II->getModule();
1179 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1180 M, id: II->getIntrinsicID(), Tys: {DstTy, SrcTy});
1181 II->setArgOperand(i: 0, v: NewV);
1182 II->setCalledFunction(NewDecl);
1183 return II;
1184 }
1185 default:
1186 return nullptr;
1187 }
1188}
1189
1190InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1191 VectorType *DstTy, VectorType *SrcTy,
1192 ArrayRef<int> Mask,
1193 TTI::TargetCostKind CostKind,
1194 int Index, VectorType *SubTp,
1195 ArrayRef<const Value *> Args,
1196 const Instruction *CxtI) const {
1197 if (!isa<FixedVectorType>(Val: SrcTy))
1198 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1199 SubTp);
1200
1201 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp);
1202
1203 unsigned ScalarSize = DL.getTypeSizeInBits(Ty: SrcTy->getElementType());
1204 if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1205 (ScalarSize == 16 || ScalarSize == 8)) {
1206 // Larger vector widths may require additional instructions, but are
1207 // typically cheaper than scalarized versions.
1208 //
1209 // We assume that shuffling at a register granularity can be done for free.
1210 // This is not true for vectors fed into memory instructions, but it is
1211 // effectively true for all other shuffling. The emphasis of the logic here
1212 // is to assist generic transform in cleaning up / canonicalizing those
1213 // shuffles.
1214
1215 // With op_sel VOP3P instructions freely can access the low half or high
1216 // half of a register, so any swizzle of two elements is free.
1217 if (auto *SrcVecTy = dyn_cast<FixedVectorType>(Val: SrcTy)) {
1218 unsigned NumSrcElts = SrcVecTy->getNumElements();
1219 if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
1220 (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse ||
1221 Kind == TTI::SK_PermuteSingleSrc))
1222 return 0;
1223 }
1224
1225 unsigned EltsPerReg = 32 / ScalarSize;
1226 switch (Kind) {
1227 case TTI::SK_Broadcast:
1228 // A single v_perm_b32 can be re-used for all destination registers.
1229 return 1;
1230 case TTI::SK_Reverse:
1231 // One instruction per register.
1232 if (auto *DstVecTy = dyn_cast<FixedVectorType>(Val: DstTy))
1233 return divideCeil(Numerator: DstVecTy->getNumElements(), Denominator: EltsPerReg);
1234 return InstructionCost::getInvalid();
1235 case TTI::SK_ExtractSubvector:
1236 if (Index % EltsPerReg == 0)
1237 return 0; // Shuffling at register granularity
1238 if (auto *DstVecTy = dyn_cast<FixedVectorType>(Val: DstTy))
1239 return divideCeil(Numerator: DstVecTy->getNumElements(), Denominator: EltsPerReg);
1240 return InstructionCost::getInvalid();
1241 case TTI::SK_InsertSubvector: {
1242 auto *DstVecTy = dyn_cast<FixedVectorType>(Val: DstTy);
1243 if (!DstVecTy)
1244 return InstructionCost::getInvalid();
1245 unsigned NumDstElts = DstVecTy->getNumElements();
1246 unsigned NumInsertElts = cast<FixedVectorType>(Val: SubTp)->getNumElements();
1247 unsigned EndIndex = Index + NumInsertElts;
1248 unsigned BeginSubIdx = Index % EltsPerReg;
1249 unsigned EndSubIdx = EndIndex % EltsPerReg;
1250 unsigned Cost = 0;
1251
1252 if (BeginSubIdx != 0) {
1253 // Need to shift the inserted vector into place. The cost is the number
1254 // of destination registers overlapped by the inserted vector.
1255 Cost = divideCeil(Numerator: EndIndex, Denominator: EltsPerReg) - (Index / EltsPerReg);
1256 }
1257
1258 // If the last register overlap is partial, there may be three source
1259 // registers feeding into it; that takes an extra instruction.
1260 if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
1261 Cost += 1;
1262
1263 return Cost;
1264 }
1265 case TTI::SK_Splice: {
1266 auto *DstVecTy = dyn_cast<FixedVectorType>(Val: DstTy);
1267 if (!DstVecTy)
1268 return InstructionCost::getInvalid();
1269 unsigned NumElts = DstVecTy->getNumElements();
1270 assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements());
1271 // Determine the sub-region of the result vector that requires
1272 // sub-register shuffles / mixing.
1273 unsigned EltsFromLHS = NumElts - Index;
1274 bool LHSIsAligned = (Index % EltsPerReg) == 0;
1275 bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
1276 if (LHSIsAligned && RHSIsAligned)
1277 return 0;
1278 if (LHSIsAligned && !RHSIsAligned)
1279 return divideCeil(Numerator: NumElts, Denominator: EltsPerReg) - (EltsFromLHS / EltsPerReg);
1280 if (!LHSIsAligned && RHSIsAligned)
1281 return divideCeil(Numerator: EltsFromLHS, Denominator: EltsPerReg);
1282 return divideCeil(Numerator: NumElts, Denominator: EltsPerReg);
1283 }
1284 default:
1285 break;
1286 }
1287
1288 if (!Mask.empty()) {
1289 unsigned NumSrcElts = cast<FixedVectorType>(Val: SrcTy)->getNumElements();
1290
1291 // Generically estimate the cost by assuming that each destination
1292 // register is derived from sources via v_perm_b32 instructions if it
1293 // can't be copied as-is.
1294 //
1295 // For each destination register, derive the cost of obtaining it based
1296 // on the number of source registers that feed into it.
1297 unsigned Cost = 0;
1298 for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
1299 SmallVector<int, 4> Regs;
1300 bool Aligned = true;
1301 for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
1302 int SrcIdx = Mask[DstIdx + I];
1303 if (SrcIdx == -1)
1304 continue;
1305 int Reg;
1306 if (SrcIdx < (int)NumSrcElts) {
1307 Reg = SrcIdx / EltsPerReg;
1308 if (SrcIdx % EltsPerReg != I)
1309 Aligned = false;
1310 } else {
1311 Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
1312 if ((SrcIdx - NumSrcElts) % EltsPerReg != I)
1313 Aligned = false;
1314 }
1315 if (!llvm::is_contained(Range&: Regs, Element: Reg))
1316 Regs.push_back(Elt: Reg);
1317 }
1318 if (Regs.size() >= 2)
1319 Cost += Regs.size() - 1;
1320 else if (!Aligned)
1321 Cost += 1;
1322 }
1323 return Cost;
1324 }
1325 }
1326
1327 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1328 SubTp);
1329}
1330
1331/// Whether it is profitable to sink the operands of an
1332/// Instruction I to the basic block of I.
1333/// This helps using several modifiers (like abs and neg) more often.
1334bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
1335 SmallVectorImpl<Use *> &Ops) const {
1336 using namespace PatternMatch;
1337
1338 for (auto &Op : I->operands()) {
1339 // Ensure we are not already sinking this operand.
1340 if (any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op.get(); }))
1341 continue;
1342
1343 if (match(V: &Op, P: m_FAbs(Op0: m_Value())) || match(V: &Op, P: m_FNeg(X: m_Value())))
1344 Ops.push_back(Elt: &Op);
1345 }
1346
1347 return !Ops.empty();
1348}
1349
1350bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1351 const Function *Callee) const {
1352 const TargetMachine &TM = getTLI()->getTargetMachine();
1353 const GCNSubtarget *CallerST
1354 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1355 const GCNSubtarget *CalleeST
1356 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1357
1358 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1359 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1360
1361 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1362 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1363 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1364 return false;
1365
1366 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1367 // no way to support merge for backend defined attributes.
1368 SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
1369 SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
1370 if (!CallerMode.isInlineCompatible(CalleeMode))
1371 return false;
1372
1373 if (Callee->hasFnAttribute(Kind: Attribute::AlwaysInline) ||
1374 Callee->hasFnAttribute(Kind: Attribute::InlineHint))
1375 return true;
1376
1377 // Hack to make compile times reasonable.
1378 if (InlineMaxBB) {
1379 // Single BB does not increase total BB amount.
1380 if (Callee->size() == 1)
1381 return true;
1382 size_t BBSize = Caller->size() + Callee->size() - 1;
1383 return BBSize <= InlineMaxBB;
1384 }
1385
1386 return true;
1387}
1388
1389static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
1390 const SITargetLowering *TLI,
1391 const GCNTTIImpl *TTIImpl) {
1392 const int NrOfSGPRUntilSpill = 26;
1393 const int NrOfVGPRUntilSpill = 32;
1394
1395 const DataLayout &DL = TTIImpl->getDataLayout();
1396
1397 unsigned adjustThreshold = 0;
1398 int SGPRsInUse = 0;
1399 int VGPRsInUse = 0;
1400 for (const Use &A : CB->args()) {
1401 SmallVector<EVT, 4> ValueVTs;
1402 ComputeValueVTs(TLI: *TLI, DL, Ty: A.get()->getType(), ValueVTs);
1403 for (auto ArgVT : ValueVTs) {
1404 unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1405 Context&: CB->getContext(), CC: CB->getCallingConv(), VT: ArgVT);
1406 if (AMDGPU::isArgPassedInSGPR(CB, ArgNo: CB->getArgOperandNo(U: &A)))
1407 SGPRsInUse += CCRegNum;
1408 else
1409 VGPRsInUse += CCRegNum;
1410 }
1411 }
1412
  // The cost of passing function arguments through the stack:
  // 1 instruction to put a function argument on the stack in the caller.
  // 1 instruction to take a function argument from the stack in the callee.
  // 1 instruction to explicitly take care of data dependencies in the callee
  // function.
1418 InstructionCost ArgStackCost(1);
1419 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1420 Opcode: Instruction::Store, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align(4),
1421 AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency);
1422 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1423 Opcode: Instruction::Load, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align(4),
1424 AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency);
1425
1426 // The penalty cost is computed relative to the cost of instructions and does
1427 // not model any storage costs.
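  // For example, an argument list needing 30 SGPRs exceeds NrOfSGPRUntilSpill
  // by 4, so the threshold grows by
  // 4 * ArgStackCost * InlineConstants::getInstrCost().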
1428 adjustThreshold += std::max(a: 0, b: SGPRsInUse - NrOfSGPRUntilSpill) *
1429 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1430 adjustThreshold += std::max(a: 0, b: VGPRsInUse - NrOfVGPRUntilSpill) *
1431 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1432 return adjustThreshold;
1433}
1434
1435static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1436 const DataLayout &DL) {
  // If we have a pointer to a private array passed into a function, it will
  // not be optimized out, leaving scratch usage.
  // This function calculates the total size in bytes of the memory that would
  // end up in scratch if the call were not inlined.
1441 unsigned AllocaSize = 0;
1442 SmallPtrSet<const AllocaInst *, 8> AIVisited;
1443 for (Value *PtrArg : CB->args()) {
1444 PointerType *Ty = dyn_cast<PointerType>(Val: PtrArg->getType());
1445 if (!Ty)
1446 continue;
1447
1448 unsigned AddrSpace = Ty->getAddressSpace();
1449 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1450 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1451 continue;
1452
1453 const AllocaInst *AI = dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: PtrArg));
1454 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(Ptr: AI).second)
1455 continue;
1456
1457 if (auto Size = AI->getAllocationSize(DL))
1458 AllocaSize += Size->getFixedValue();
1459 }
1460 return AllocaSize;
1461}
1462
1463int GCNTTIImpl::getInliningLastCallToStaticBonus() const {
1464 return BaseT::getInliningLastCallToStaticBonus() *
1465 getInliningThresholdMultiplier();
1466}
1467
1468unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1469 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, TTIImpl: this);
1470
  // Private objects passed as arguments may end up in scratch usage if the
  // call is not inlined. Increase the inline threshold to promote inlining.
1473 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1474 if (AllocaSize > 0)
1475 Threshold += ArgAllocaCost;
1476 return Threshold;
1477}
1478
1479unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
1480 const AllocaInst *AI) const {
1481
  // Below the cutoff, assume that the private memory objects would be
  // optimized away.
1484 auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1485 if (AllocaSize <= ArgAllocaCutoff)
1486 return 0;
1487
  // Above the cutoff, we give a cost to each private memory object depending
  // on its size. If the array can be optimized by SROA this cost is not added
  // to the total-cost in the inliner cost analysis.
1491 //
1492 // We choose the total cost of the alloca such that their sum cancels the
1493 // bonus given in the threshold (ArgAllocaCost).
1494 //
1495 // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1496 //
1497 // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1498 // the single-bb bonus and the vector-bonus.
1499 //
1500 // We compensate the first two multipliers, by repeating logic from the
1501 // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1502 static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
1503 unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1504
1505 bool SingleBB = none_of(Range&: *CB->getCalledFunction(), P: [](const BasicBlock &BB) {
1506 return BB.getTerminator()->getNumSuccessors() > 1;
1507 });
1508 if (SingleBB) {
1509 Threshold += Threshold / 2;
1510 }
1511
1512 auto ArgAllocaSize = AI->getAllocationSize(DL);
1513 if (!ArgAllocaSize)
1514 return 0;
1515
1516 // Attribute the bonus proportionally to the alloca size
1517 unsigned AllocaThresholdBonus =
1518 (Threshold * ArgAllocaSize->getFixedValue()) / AllocaSize;
1519
1520 return AllocaThresholdBonus;
1521}
1522
1523void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1524 TTI::UnrollingPreferences &UP,
1525 OptimizationRemarkEmitter *ORE) const {
1526 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1527}
1528
1529void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1530 TTI::PeelingPreferences &PP) const {
1531 CommonTTI.getPeelingPreferences(L, SE, PP);
1532}
1533
1534int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1535 return ST->hasFullRate64Ops()
1536 ? getFullRateInstrCost()
1537 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1538 : getQuarterRateInstrCost(CostKind);
1539}
1540
1541std::pair<InstructionCost, MVT>
1542GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1543 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1544 auto Size = DL.getTypeSizeInBits(Ty);
  // The maximum load or store can handle 8 dwords for scalar and 4 for the
  // vector ALU. Let's assume anything above 8 dwords is expensive even if
  // legal.
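  // For example, a 512-bit (16-dword) type adds (512 + 255) / 256 = 2 to the
  // cost computed by the base implementation.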
1548 if (Size <= 256)
1549 return Cost;
1550
1551 Cost.first += (Size + 255) / 256;
1552 return Cost;
1553}
1554
1555unsigned GCNTTIImpl::getPrefetchDistance() const {
1556 return ST->hasPrefetch() ? 128 : 0;
1557}
1558
1559bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
1560 return AMDGPU::isFlatGlobalAddrSpace(AS);
1561}
1562
1563void GCNTTIImpl::collectKernelLaunchBounds(
1564 const Function &F,
1565 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
1566 SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
1567 LB.push_back(Elt: {"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1568 LB.push_back(Elt: {"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1569 LB.push_back(Elt: {"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1570 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1571 ST->getFlatWorkGroupSizes(F);
1572 LB.push_back(Elt: {"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1573 LB.push_back(Elt: {"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1574 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
1575 LB.push_back(Elt: {"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1576 LB.push_back(Elt: {"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1577}
1578
1579GCNTTIImpl::KnownIEEEMode
1580GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
1581 if (!ST->hasIEEEMode()) // Only mode on gfx12
1582 return KnownIEEEMode::On;
1583
1584 const Function *F = I.getFunction();
1585 if (!F)
1586 return KnownIEEEMode::Unknown;
1587
1588 Attribute IEEEAttr = F->getFnAttribute(Kind: "amdgpu-ieee");
1589 if (IEEEAttr.isValid())
1590 return IEEEAttr.getValueAsBool() ? KnownIEEEMode::On : KnownIEEEMode::Off;
1591
1592 return AMDGPU::isShader(CC: F->getCallingConv()) ? KnownIEEEMode::Off
1593 : KnownIEEEMode::On;
1594}
1595
1596InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1597 Align Alignment,
1598 unsigned AddressSpace,
1599 TTI::TargetCostKind CostKind,
1600 TTI::OperandValueInfo OpInfo,
1601 const Instruction *I) const {
1602 if (VectorType *VecTy = dyn_cast<VectorType>(Val: Src)) {
1603 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1604 VecTy->getElementType()->isIntegerTy(Bitwidth: 8)) {
1605 return divideCeil(Numerator: DL.getTypeSizeInBits(Ty: VecTy) - 1,
1606 Denominator: getLoadStoreVecRegBitWidth(AddrSpace: AddressSpace));
1607 }
1608 }
1609 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1610 OpInfo, I);
1611}
1612
1613unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
1614 if (VectorType *VecTy = dyn_cast<VectorType>(Val: Tp)) {
1615 if (VecTy->getElementType()->isIntegerTy(Bitwidth: 8)) {
1616 unsigned ElementCount = VecTy->getElementCount().getFixedValue();
1617 return divideCeil(Numerator: ElementCount - 1, Denominator: 4);
1618 }
1619 }
1620 return BaseT::getNumberOfParts(Tp);
1621}
1622
1623InstructionUniformity
1624GCNTTIImpl::getInstructionUniformity(const Value *V) const {
1625 if (isAlwaysUniform(V))
1626 return InstructionUniformity::AlwaysUniform;
1627
1628 if (isSourceOfDivergence(V))
1629 return InstructionUniformity::NeverUniform;
1630
1631 return InstructionUniformity::Default;
1632}
1633