1 | //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // \file |
10 | // This file implements a TargetTransformInfo analysis pass specific to the |
11 | // AMDGPU target machine. It uses the target's detailed information to provide |
12 | // more precise answers to certain TTI queries, while letting the target |
13 | // independent and default TTI implementations handle the rest. |
14 | // |
15 | //===----------------------------------------------------------------------===// |
16 | |
17 | #include "AMDGPUTargetTransformInfo.h" |
18 | #include "AMDGPUTargetMachine.h" |
19 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
20 | #include "SIModeRegisterDefaults.h" |
21 | #include "llvm/Analysis/InlineCost.h" |
22 | #include "llvm/Analysis/LoopInfo.h" |
23 | #include "llvm/Analysis/ValueTracking.h" |
24 | #include "llvm/CodeGen/Analysis.h" |
25 | #include "llvm/IR/IRBuilder.h" |
26 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
27 | #include "llvm/IR/PatternMatch.h" |
28 | #include "llvm/Support/KnownBits.h" |
29 | #include <optional> |
30 | |
31 | using namespace llvm; |
32 | |
33 | #define DEBUG_TYPE "AMDGPUtti" |
34 | |
static cl::opt<unsigned> UnrollThresholdPrivate(
    "amdgpu-unroll-threshold-private",
    cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
    cl::init(2700), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
    "amdgpu-unroll-threshold-local",
    cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
    cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
    "amdgpu-unroll-threshold-if",
    cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
    cl::init(200), cl::Hidden);

static cl::opt<bool> UnrollRuntimeLocal(
    "amdgpu-unroll-runtime-local",
    cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
    cl::init(true), cl::Hidden);

static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
    "amdgpu-unroll-max-block-to-analyze",
    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
    cl::init(32), cl::Hidden);

static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
                                       cl::Hidden, cl::init(4000),
                                       cl::desc("Cost of alloca argument"));

// If the amount of scratch memory to eliminate exceeds our ability to allocate
// it into registers we gain nothing by aggressively inlining functions for that
// heuristic.
static cl::opt<unsigned>
    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
                    cl::init(256),
                    cl::desc("Maximum alloca size to use for inline cost"));

// Inliner constraint to achieve reasonable compilation time.
static cl::opt<size_t> InlineMaxBB(
    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
    cl::desc("Maximum number of BBs allowed in a function after inlining"
             " (compile time constraint)"));
77 | |
78 | static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, |
79 | unsigned Depth = 0) { |
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
             return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth + 1))
92 | return true; |
93 | } |
94 | return false; |
95 | } |
96 | |
97 | AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) |
98 | : BaseT(TM, F.getDataLayout()), |
99 | TargetTriple(TM->getTargetTriple()), |
100 | ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))), |
101 | TLI(ST->getTargetLowering()) {} |
102 | |
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP,
                                            OptimizationRemarkEmitter *ORE) {
106 | const Function &F = *L->getHeader()->getParent(); |
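  // The base threshold defaults to 300 and can be overridden per function via
  // the "amdgpu-unroll-threshold" string attribute, e.g.
  // "amdgpu-unroll-threshold"="500".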
  UP.Threshold =
      F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
109 | UP.MaxCount = std::numeric_limits<unsigned>::max(); |
110 | UP.Partial = true; |
111 | |
  // A conditional branch in a loop back edge needs 3 additional exec
  // manipulations on average.
114 | UP.BEInsns += 3; |
115 | |
116 | // We want to run unroll even for the loops which have been vectorized. |
117 | UP.UnrollVectorizedLoop = true; |
118 | |
119 | // TODO: Do we want runtime unrolling? |
120 | |
  // Maximum alloca size that can fit in registers. Reserve 16 registers.
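  // Each of the 256 addressable VGPRs holds 4 bytes per lane, so this is
  // (256 - 16) * 4 = 960 bytes of private memory that can plausibly be
  // promoted to registers.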
122 | const unsigned MaxAlloca = (256 - 16) * 4; |
123 | unsigned ThresholdPrivate = UnrollThresholdPrivate; |
124 | unsigned ThresholdLocal = UnrollThresholdLocal; |
125 | |
  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
  // provided threshold value as the default for Threshold.
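  // The metadata uses the usual !llvm.loop operand form, e.g. (illustrative):
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"amdgpu.loop.unroll.threshold", i32 100}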
  if (MDNode *LoopUnrollThreshold =
          findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
    if (LoopUnrollThreshold->getNumOperands() == 2) {
      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
          LoopUnrollThreshold->getOperand(1));
133 | if (MetaThresholdValue) { |
134 | // We will also use the supplied value for PartialThreshold for now. |
135 | // We may introduce additional metadata if it becomes necessary in the |
136 | // future. |
137 | UP.Threshold = MetaThresholdValue->getSExtValue(); |
138 | UP.PartialThreshold = UP.Threshold; |
        ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
        ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
141 | } |
142 | } |
143 | } |
144 | |
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
            return SubLoop->contains(BB); }))
152 | continue; // Block belongs to an inner loop. |
153 | |
154 | for (const Instruction &I : *BB) { |
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate the
      // if region and potentially even the PHI itself, saving on both
      // divergence and the registers used for the PHI.
      // Add a small bonus for each such "if" statement.
160 | if (const BranchInst *Br = dyn_cast<BranchInst>(Val: &I)) { |
161 | if (UP.Threshold < MaxBoost && Br->isConditional()) { |
162 | BasicBlock *Succ0 = Br->getSuccessor(i: 0); |
163 | BasicBlock *Succ1 = Br->getSuccessor(i: 1); |
164 | if ((L->contains(BB: Succ0) && L->isLoopExiting(BB: Succ0)) || |
165 | (L->contains(BB: Succ1) && L->isLoopExiting(BB: Succ1))) |
166 | continue; |
167 | if (dependsOnLocalPhi(L, Cond: Br->getCondition())) { |
168 | UP.Threshold += UnrollThresholdIf; |
169 | LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold |
170 | << " for loop:\n" |
171 | << *L << " due to " << *Br << '\n'); |
172 | if (UP.Threshold >= MaxBoost) |
173 | return; |
174 | } |
175 | } |
176 | continue; |
177 | } |
178 | |
179 | const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: &I); |
180 | if (!GEP) |
181 | continue; |
182 | |
183 | unsigned AS = GEP->getAddressSpace(); |
184 | unsigned Threshold = 0; |
185 | if (AS == AMDGPUAS::PRIVATE_ADDRESS) |
186 | Threshold = ThresholdPrivate; |
187 | else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) |
188 | Threshold = ThresholdLocal; |
189 | else |
190 | continue; |
191 | |
192 | if (UP.Threshold >= Threshold) |
193 | continue; |
194 | |
195 | if (AS == AMDGPUAS::PRIVATE_ADDRESS) { |
196 | const Value *Ptr = GEP->getPointerOperand(); |
197 | const AllocaInst *Alloca = |
198 | dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: Ptr)); |
199 | if (!Alloca || !Alloca->isStaticAlloca()) |
200 | continue; |
201 | Type *Ty = Alloca->getAllocatedType(); |
202 | unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0; |
203 | if (AllocaSize > MaxAlloca) |
204 | continue; |
205 | } else if (AS == AMDGPUAS::LOCAL_ADDRESS || |
206 | AS == AMDGPUAS::REGION_ADDRESS) { |
207 | LocalGEPsSeen++; |
        // Inhibit unrolling for local memory if we have seen addressing not to
        // a variable; most likely we will be unable to combine it.
        // Do not unroll too-deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
212 | if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 || |
213 | (!isa<GlobalVariable>(Val: GEP->getPointerOperand()) && |
214 | !isa<Argument>(Val: GEP->getPointerOperand()))) |
215 | continue; |
216 | LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n" |
217 | << *L << " due to LDS use.\n" ); |
218 | UP.Runtime = UnrollRuntimeLocal; |
219 | } |
220 | |
221 | // Check if GEP depends on a value defined by this loop itself. |
222 | bool HasLoopDef = false; |
223 | for (const Value *Op : GEP->operands()) { |
224 | const Instruction *Inst = dyn_cast<Instruction>(Val: Op); |
225 | if (!Inst || L->isLoopInvariant(V: Op)) |
226 | continue; |
227 | |
228 | if (llvm::any_of(Range: L->getSubLoops(), P: [Inst](const Loop* SubLoop) { |
229 | return SubLoop->contains(Inst); })) |
230 | continue; |
231 | HasLoopDef = true; |
232 | break; |
233 | } |
234 | if (!HasLoopDef) |
235 | continue; |
236 | |
237 | // We want to do whatever we can to limit the number of alloca |
238 | // instructions that make it through to the code generator. allocas |
239 | // require us to use indirect addressing, which is slow and prone to |
240 | // compiler bugs. If this loop does an address calculation on an |
241 | // alloca ptr, then we want to use a higher than normal loop unroll |
242 | // threshold. This will give SROA a better chance to eliminate these |
243 | // allocas. |
244 | // |
245 | // We also want to have more unrolling for local memory to let ds |
246 | // instructions with different offsets combine. |
247 | // |
248 | // Don't use the maximum allowed value here as it will make some |
249 | // programs way too big. |
250 | UP.Threshold = Threshold; |
251 | LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold |
252 | << " for loop:\n" |
253 | << *L << " due to " << *GEP << '\n'); |
254 | if (UP.Threshold >= MaxBoost) |
255 | return; |
256 | } |
257 | |
    // If we got a GEP in a small BB from an inner loop then increase the max
    // trip count to analyze for a better cost estimation in the unroller.
260 | if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze) |
261 | UP.MaxIterationsCountToAnalyze = 32; |
262 | } |
263 | } |
264 | |
265 | void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
266 | TTI::PeelingPreferences &PP) { |
267 | BaseT::getPeelingPreferences(L, SE, PP); |
268 | } |
269 | |
270 | int64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const { |
271 | return 1024; |
272 | } |
273 | |
274 | const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = { |
275 | // Codegen control options which don't matter. |
276 | AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler, |
277 | AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal, |
278 | AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess, |
279 | AMDGPU::FeatureUnalignedAccessMode, |
280 | |
281 | AMDGPU::FeatureAutoWaitcntBeforeBarrier, |
282 | |
283 | // Property of the kernel/environment which can't actually differ. |
284 | AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK, |
285 | AMDGPU::FeatureTrapHandler, |
286 | |
287 | // The default assumption needs to be ecc is enabled, but no directly |
288 | // exposed operations depend on it, so it can be safely inlined. |
289 | AMDGPU::FeatureSRAMECC, |
290 | |
291 | // Perf-tuning features |
292 | AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops}; |
293 | |
294 | GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) |
295 | : BaseT(TM, F.getDataLayout()), |
296 | ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))), |
297 | TLI(ST->getTargetLowering()), CommonTTI(TM, F), |
298 | IsGraphics(AMDGPU::isGraphics(CC: F.getCallingConv())) { |
299 | SIModeRegisterDefaults Mode(F, *ST); |
300 | HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign(); |
301 | HasFP64FP16Denormals = |
302 | Mode.FP64FP16Denormals != DenormalMode::getPreserveSign(); |
303 | } |
304 | |
305 | bool GCNTTIImpl::hasBranchDivergence(const Function *F) const { |
306 | return !F || !ST->isSingleLaneExecution(Kernel: *F); |
307 | } |
308 | |
309 | unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const { |
  // NB: RCID is not a real register class ID. In fact it is 0 or 1 for scalar
  // or vector registers. See getRegisterClassForType for the implementation.
  // In this case "vector registers" are not vector in terms of VGPRs, but
  // those which can hold multiple values.
314 | |
315 | // This is really the number of registers to fill when vectorizing / |
316 | // interleaving loops, so we lie to avoid trying to use all registers. |
317 | return 4; |
318 | } |
319 | |
320 | TypeSize |
321 | GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
322 | switch (K) { |
323 | case TargetTransformInfo::RGK_Scalar: |
324 | return TypeSize::getFixed(ExactSize: 32); |
325 | case TargetTransformInfo::RGK_FixedWidthVector: |
326 | return TypeSize::getFixed(ExactSize: ST->hasPackedFP32Ops() ? 64 : 32); |
327 | case TargetTransformInfo::RGK_ScalableVector: |
328 | return TypeSize::getScalable(MinimumSize: 0); |
329 | } |
330 | llvm_unreachable("Unsupported register kind" ); |
331 | } |
332 | |
333 | unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const { |
334 | return 32; |
335 | } |
336 | |
337 | unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { |
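  // Loads and stores are vectorized up to 128 bits (e.g. 4 x i32 or 8 x i16);
  // other operations only benefit from packing two 16-bit elements (or two
  // 32-bit elements on subtargets with packed FP32 instructions).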
338 | if (Opcode == Instruction::Load || Opcode == Instruction::Store) |
339 | return 32 * 4 / ElemWidth; |
340 | return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 |
341 | : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2 |
342 | : 1; |
343 | } |
344 | |
345 | unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize, |
346 | unsigned ChainSizeInBytes, |
347 | VectorType *VecTy) const { |
348 | unsigned VecRegBitWidth = VF * LoadSize; |
349 | if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32) |
350 | // TODO: Support element-size less than 32bit? |
351 | return 128 / LoadSize; |
352 | |
353 | return VF; |
354 | } |
355 | |
356 | unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize, |
357 | unsigned ChainSizeInBytes, |
358 | VectorType *VecTy) const { |
359 | unsigned VecRegBitWidth = VF * StoreSize; |
360 | if (VecRegBitWidth > 128) |
361 | return 128 / StoreSize; |
362 | |
363 | return VF; |
364 | } |
365 | |
366 | unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { |
367 | if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS || |
368 | AddrSpace == AMDGPUAS::CONSTANT_ADDRESS || |
369 | AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT || |
370 | AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER || |
371 | AddrSpace == AMDGPUAS::BUFFER_RESOURCE || |
372 | AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) { |
373 | return 512; |
374 | } |
375 | |
376 | if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) |
377 | return 8 * ST->getMaxPrivateElementSize(); |
378 | |
379 | // Common to flat, global, local and region. Assume for unknown addrspace. |
380 | return 128; |
381 | } |
382 | |
383 | bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, |
384 | Align Alignment, |
385 | unsigned AddrSpace) const { |
386 | // We allow vectorization of flat stores, even though we may need to decompose |
387 | // them later if they may access private memory. We don't have enough context |
388 | // here, and legalization can handle it. |
389 | if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { |
390 | return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) && |
391 | ChainSizeInBytes <= ST->getMaxPrivateElementSize(); |
392 | } |
393 | return true; |
394 | } |
395 | |
396 | bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, |
397 | Align Alignment, |
398 | unsigned AddrSpace) const { |
399 | return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); |
400 | } |
401 | |
402 | bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, |
403 | Align Alignment, |
404 | unsigned AddrSpace) const { |
405 | return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); |
406 | } |
407 | |
408 | int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const { |
409 | return 1024; |
410 | } |
411 | |
412 | // FIXME: Really we would like to issue multiple 128-bit loads and stores per |
413 | // iteration. Should we report a larger size and let it legalize? |
414 | // |
415 | // FIXME: Should we use narrower types for local/region, or account for when |
416 | // unaligned access is legal? |
417 | // |
418 | // FIXME: This could use fine tuning and microbenchmarks. |
419 | Type *GCNTTIImpl::getMemcpyLoopLoweringType( |
420 | LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, |
421 | unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, |
422 | std::optional<uint32_t> AtomicElementSize) const { |
423 | |
424 | if (AtomicElementSize) |
425 | return Type::getIntNTy(C&: Context, N: *AtomicElementSize * 8); |
426 | |
427 | unsigned MinAlign = std::min(a: SrcAlign, b: DestAlign); |
428 | |
429 | // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the |
430 | // hardware into byte accesses. If you assume all alignments are equally |
431 | // probable, it's more efficient on average to use short accesses for this |
432 | // case. |
433 | if (MinAlign == 2) |
434 | return Type::getInt16Ty(C&: Context); |
435 | |
436 | // Not all subtargets have 128-bit DS instructions, and we currently don't |
437 | // form them by default. |
438 | if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS || |
439 | SrcAddrSpace == AMDGPUAS::REGION_ADDRESS || |
440 | DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS || |
441 | DestAddrSpace == AMDGPUAS::REGION_ADDRESS) { |
442 | return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: 2); |
443 | } |
444 | |
445 | // Global memory works best with 16-byte accesses. Private memory will also |
446 | // hit this, although they'll be decomposed. |
447 | return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: 4); |
448 | } |
449 | |
450 | void GCNTTIImpl::getMemcpyLoopResidualLoweringType( |
451 | SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context, |
452 | unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, |
453 | unsigned SrcAlign, unsigned DestAlign, |
454 | std::optional<uint32_t> AtomicCpySize) const { |
455 | assert(RemainingBytes < 16); |
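  // For example, 11 residual bytes with 4-byte (or greater) alignment are
  // lowered as i64 + i16 + i8, and with 2-byte alignment as 5 x i16 + i8.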
456 | |
457 | if (AtomicCpySize) |
458 | BaseT::getMemcpyLoopResidualLoweringType( |
459 | OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign, |
460 | DestAlign, AtomicCpySize); |
461 | |
462 | unsigned MinAlign = std::min(a: SrcAlign, b: DestAlign); |
463 | |
464 | if (MinAlign != 2) { |
465 | Type *I64Ty = Type::getInt64Ty(C&: Context); |
466 | while (RemainingBytes >= 8) { |
467 | OpsOut.push_back(Elt: I64Ty); |
468 | RemainingBytes -= 8; |
469 | } |
470 | |
471 | Type *I32Ty = Type::getInt32Ty(C&: Context); |
472 | while (RemainingBytes >= 4) { |
473 | OpsOut.push_back(Elt: I32Ty); |
474 | RemainingBytes -= 4; |
475 | } |
476 | } |
477 | |
478 | Type *I16Ty = Type::getInt16Ty(C&: Context); |
479 | while (RemainingBytes >= 2) { |
480 | OpsOut.push_back(Elt: I16Ty); |
481 | RemainingBytes -= 2; |
482 | } |
483 | |
484 | Type *I8Ty = Type::getInt8Ty(C&: Context); |
485 | while (RemainingBytes) { |
486 | OpsOut.push_back(Elt: I8Ty); |
487 | --RemainingBytes; |
488 | } |
489 | } |
490 | |
491 | unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) { |
492 | // Disable unrolling if the loop is not vectorized. |
493 | // TODO: Enable this again. |
494 | if (VF.isScalar()) |
495 | return 1; |
496 | |
497 | return 8; |
498 | } |
499 | |
500 | bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, |
501 | MemIntrinsicInfo &Info) const { |
502 | switch (Inst->getIntrinsicID()) { |
503 | case Intrinsic::amdgcn_ds_ordered_add: |
504 | case Intrinsic::amdgcn_ds_ordered_swap: { |
505 | auto *Ordering = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: 2)); |
506 | auto *Volatile = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: 4)); |
507 | if (!Ordering || !Volatile) |
508 | return false; // Invalid. |
509 | |
510 | unsigned OrderingVal = Ordering->getZExtValue(); |
511 | if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent)) |
512 | return false; |
513 | |
514 | Info.PtrVal = Inst->getArgOperand(i: 0); |
515 | Info.Ordering = static_cast<AtomicOrdering>(OrderingVal); |
516 | Info.ReadMem = true; |
517 | Info.WriteMem = true; |
518 | Info.IsVolatile = !Volatile->isZero(); |
519 | return true; |
520 | } |
521 | default: |
522 | return false; |
523 | } |
524 | } |
525 | |
526 | InstructionCost GCNTTIImpl::getArithmeticInstrCost( |
527 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
528 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, |
529 | ArrayRef<const Value *> Args, |
530 | const Instruction *CxtI) { |
531 | |
532 | // Legalize the type. |
533 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
534 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
535 | |
536 | // Because we don't have any legal vector operations, but the legal types, we |
537 | // need to account for split vectors. |
538 | unsigned NElts = LT.second.isVector() ? |
539 | LT.second.getVectorNumElements() : 1; |
540 | |
541 | MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy; |
542 | |
543 | switch (ISD) { |
544 | case ISD::SHL: |
545 | case ISD::SRL: |
546 | case ISD::SRA: |
547 | if (SLT == MVT::i64) |
548 | return get64BitInstrCost(CostKind) * LT.first * NElts; |
549 | |
550 | if (ST->has16BitInsts() && SLT == MVT::i16) |
551 | NElts = (NElts + 1) / 2; |
552 | |
553 | // i32 |
554 | return getFullRateInstrCost() * LT.first * NElts; |
555 | case ISD::ADD: |
556 | case ISD::SUB: |
557 | case ISD::AND: |
558 | case ISD::OR: |
559 | case ISD::XOR: |
560 | if (SLT == MVT::i64) { |
561 | // and, or and xor are typically split into 2 VALU instructions. |
562 | return 2 * getFullRateInstrCost() * LT.first * NElts; |
563 | } |
564 | |
565 | if (ST->has16BitInsts() && SLT == MVT::i16) |
566 | NElts = (NElts + 1) / 2; |
567 | |
568 | return LT.first * NElts * getFullRateInstrCost(); |
569 | case ISD::MUL: { |
570 | const int QuarterRateCost = getQuarterRateInstrCost(CostKind); |
571 | if (SLT == MVT::i64) { |
572 | const int FullRateCost = getFullRateInstrCost(); |
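      // A 64-bit multiply is expanded into 32-bit multiplies (quarter rate)
      // plus carry-propagating adds (full rate); the constants below roughly
      // model that expansion.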
573 | return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts; |
574 | } |
575 | |
576 | if (ST->has16BitInsts() && SLT == MVT::i16) |
577 | NElts = (NElts + 1) / 2; |
578 | |
579 | // i32 |
580 | return QuarterRateCost * NElts * LT.first; |
581 | } |
582 | case ISD::FMUL: |
    // Check for a possible fusion of {fadd|fsub}(a, fmul(b, c)) and return a
    // zero cost for the fmul(b, c), assuming the fadd|fsub will be charged the
    // cost of the whole fused operation.
586 | if (CxtI && CxtI->hasOneUse()) |
587 | if (const auto *FAdd = dyn_cast<BinaryOperator>(Val: *CxtI->user_begin())) { |
588 | const int OPC = TLI->InstructionOpcodeToISD(Opcode: FAdd->getOpcode()); |
589 | if (OPC == ISD::FADD || OPC == ISD::FSUB) { |
590 | if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals) |
591 | return TargetTransformInfo::TCC_Free; |
592 | if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals) |
593 | return TargetTransformInfo::TCC_Free; |
594 | |
595 | // Estimate all types may be fused with contract/unsafe flags |
596 | const TargetOptions &Options = TLI->getTargetMachine().Options; |
597 | if (Options.AllowFPOpFusion == FPOpFusion::Fast || |
598 | Options.UnsafeFPMath || |
599 | (FAdd->hasAllowContract() && CxtI->hasAllowContract())) |
600 | return TargetTransformInfo::TCC_Free; |
601 | } |
602 | } |
603 | [[fallthrough]]; |
604 | case ISD::FADD: |
605 | case ISD::FSUB: |
606 | if (ST->hasPackedFP32Ops() && SLT == MVT::f32) |
607 | NElts = (NElts + 1) / 2; |
608 | if (SLT == MVT::f64) |
609 | return LT.first * NElts * get64BitInstrCost(CostKind); |
610 | |
611 | if (ST->has16BitInsts() && SLT == MVT::f16) |
612 | NElts = (NElts + 1) / 2; |
613 | |
614 | if (SLT == MVT::f32 || SLT == MVT::f16) |
615 | return LT.first * NElts * getFullRateInstrCost(); |
616 | break; |
617 | case ISD::FDIV: |
618 | case ISD::FREM: |
619 | // FIXME: frem should be handled separately. The fdiv in it is most of it, |
620 | // but the current lowering is also not entirely correct. |
621 | if (SLT == MVT::f64) { |
622 | int Cost = 7 * get64BitInstrCost(CostKind) + |
623 | getQuarterRateInstrCost(CostKind) + |
624 | 3 * getHalfRateInstrCost(CostKind); |
625 | // Add cost of workaround. |
626 | if (!ST->hasUsableDivScaleConditionOutput()) |
627 | Cost += 3 * getFullRateInstrCost(); |
628 | |
629 | return LT.first * Cost * NElts; |
630 | } |
631 | |
632 | if (!Args.empty() && match(V: Args[0], P: PatternMatch::m_FPOne())) { |
633 | // TODO: This is more complicated, unsafe flags etc. |
634 | if ((SLT == MVT::f32 && !HasFP32Denormals) || |
635 | (SLT == MVT::f16 && ST->has16BitInsts())) { |
636 | return LT.first * getQuarterRateInstrCost(CostKind) * NElts; |
637 | } |
638 | } |
639 | |
640 | if (SLT == MVT::f16 && ST->has16BitInsts()) { |
641 | // 2 x v_cvt_f32_f16 |
642 | // f32 rcp |
643 | // f32 fmul |
644 | // v_cvt_f16_f32 |
645 | // f16 div_fixup |
646 | int Cost = |
647 | 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind); |
648 | return LT.first * Cost * NElts; |
649 | } |
650 | |
651 | if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) || |
652 | TLI->getTargetMachine().Options.UnsafeFPMath)) { |
653 | // Fast unsafe fdiv lowering: |
654 | // f32 rcp |
655 | // f32 fmul |
656 | int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost(); |
657 | return LT.first * Cost * NElts; |
658 | } |
659 | |
660 | if (SLT == MVT::f32 || SLT == MVT::f16) { |
661 | // 4 more v_cvt_* insts without f16 insts support |
662 | int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() + |
663 | 1 * getQuarterRateInstrCost(CostKind); |
664 | |
665 | if (!HasFP32Denormals) { |
666 | // FP mode switches. |
667 | Cost += 2 * getFullRateInstrCost(); |
668 | } |
669 | |
670 | return LT.first * NElts * Cost; |
671 | } |
672 | break; |
673 | case ISD::FNEG: |
    // Use the backend's estimation. If fneg is not free, each element will
    // cost one additional instruction.
    return TLI->isFNegFree(SLT) ? 0 : NElts;
677 | default: |
678 | break; |
679 | } |
680 | |
681 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info, |
682 | Args, CxtI); |
683 | } |
684 | |
685 | // Return true if there's a potential benefit from using v2f16/v2i16 |
686 | // instructions for an intrinsic, even if it requires nontrivial legalization. |
687 | static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) { |
688 | switch (ID) { |
689 | case Intrinsic::fma: // TODO: fmuladd |
690 | // There's a small benefit to using vector ops in the legalized code. |
691 | case Intrinsic::round: |
692 | case Intrinsic::uadd_sat: |
693 | case Intrinsic::usub_sat: |
694 | case Intrinsic::sadd_sat: |
695 | case Intrinsic::ssub_sat: |
696 | return true; |
697 | default: |
698 | return false; |
699 | } |
700 | } |
701 | |
702 | InstructionCost |
703 | GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
704 | TTI::TargetCostKind CostKind) { |
705 | if (ICA.getID() == Intrinsic::fabs) |
706 | return 0; |
707 | |
708 | if (!intrinsicHasPackedVectorBenefit(ID: ICA.getID())) |
709 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
710 | |
711 | Type *RetTy = ICA.getReturnType(); |
712 | |
713 | // Legalize the type. |
714 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy); |
715 | |
716 | unsigned NElts = LT.second.isVector() ? |
717 | LT.second.getVectorNumElements() : 1; |
718 | |
719 | MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy; |
720 | |
721 | if (SLT == MVT::f64) |
722 | return LT.first * NElts * get64BitInstrCost(CostKind); |
723 | |
724 | if ((ST->has16BitInsts() && SLT == MVT::f16) || |
725 | (ST->hasPackedFP32Ops() && SLT == MVT::f32)) |
726 | NElts = (NElts + 1) / 2; |
727 | |
728 | // TODO: Get more refined intrinsic costs? |
729 | unsigned InstRate = getQuarterRateInstrCost(CostKind); |
730 | |
731 | switch (ICA.getID()) { |
732 | case Intrinsic::fma: |
733 | InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind) |
734 | : getQuarterRateInstrCost(CostKind); |
735 | break; |
736 | case Intrinsic::uadd_sat: |
737 | case Intrinsic::usub_sat: |
738 | case Intrinsic::sadd_sat: |
739 | case Intrinsic::ssub_sat: |
    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      NElts = 1;
743 | break; |
744 | } |
745 | |
746 | return LT.first * NElts * InstRate; |
747 | } |
748 | |
749 | InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode, |
750 | TTI::TargetCostKind CostKind, |
751 | const Instruction *I) { |
752 | assert((I == nullptr || I->getOpcode() == Opcode) && |
753 | "Opcode should reflect passed instruction." ); |
754 | const bool SCost = |
755 | (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency); |
756 | const int CBrCost = SCost ? 5 : 7; |
757 | switch (Opcode) { |
758 | case Instruction::Br: { |
759 | // Branch instruction takes about 4 slots on gfx900. |
760 | auto BI = dyn_cast_or_null<BranchInst>(Val: I); |
761 | if (BI && BI->isUnconditional()) |
762 | return SCost ? 1 : 4; |
    // Assume a conditional branch takes an additional 3 exec-manipulation
    // instructions on average.
765 | return CBrCost; |
766 | } |
767 | case Instruction::Switch: { |
768 | auto SI = dyn_cast_or_null<SwitchInst>(Val: I); |
    // Each case (including default) takes 1 cmp + 1 cbr instruction on
    // average.
771 | return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1); |
772 | } |
773 | case Instruction::Ret: |
774 | return SCost ? 1 : 10; |
775 | } |
776 | return BaseT::getCFInstrCost(Opcode, CostKind, I); |
777 | } |
778 | |
779 | InstructionCost |
780 | GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, |
781 | std::optional<FastMathFlags> FMF, |
782 | TTI::TargetCostKind CostKind) { |
783 | if (TTI::requiresOrderedReduction(FMF)) |
784 | return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); |
785 | |
786 | EVT OrigTy = TLI->getValueType(DL, Ty); |
787 | |
  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
790 | if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16) |
791 | return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); |
792 | |
793 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
794 | return LT.first * getFullRateInstrCost(); |
795 | } |
796 | |
797 | InstructionCost |
798 | GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, |
799 | FastMathFlags FMF, |
800 | TTI::TargetCostKind CostKind) { |
801 | EVT OrigTy = TLI->getValueType(DL, Ty); |
802 | |
  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
805 | if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16) |
806 | return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); |
807 | |
808 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
809 | return LT.first * getHalfRateInstrCost(CostKind); |
810 | } |
811 | |
812 | InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, |
813 | TTI::TargetCostKind CostKind, |
814 | unsigned Index, Value *Op0, |
815 | Value *Op1) { |
816 | switch (Opcode) { |
817 | case Instruction::ExtractElement: |
818 | case Instruction::InsertElement: { |
819 | unsigned EltSize |
820 | = DL.getTypeSizeInBits(Ty: cast<VectorType>(Val: ValTy)->getElementType()); |
821 | if (EltSize < 32) { |
822 | if (EltSize == 16 && Index == 0 && ST->has16BitInsts()) |
823 | return 0; |
824 | return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, |
825 | Op1); |
826 | } |
827 | |
828 | // Extracts are just reads of a subregister, so are free. Inserts are |
829 | // considered free because we don't want to have any cost for scalarizing |
830 | // operations, and we don't have to copy into a different register class. |
831 | |
832 | // Dynamic indexing isn't free and is best avoided. |
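    // (In this interface an Index of ~0u denotes an unknown, dynamic index.)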
833 | return Index == ~0u ? 2 : 0; |
834 | } |
835 | default: |
836 | return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1); |
837 | } |
838 | } |
839 | |
840 | /// Analyze if the results of inline asm are divergent. If \p Indices is empty, |
841 | /// this is analyzing the collective result of all output registers. Otherwise, |
842 | /// this is only querying a specific result index if this returns multiple |
843 | /// registers in a struct. |
844 | bool GCNTTIImpl::isInlineAsmSourceOfDivergence( |
845 | const CallInst *CI, ArrayRef<unsigned> Indices) const { |
846 | // TODO: Handle complex extract indices |
847 | if (Indices.size() > 1) |
848 | return true; |
849 | |
850 | const DataLayout &DL = CI->getDataLayout(); |
851 | const SIRegisterInfo *TRI = ST->getRegisterInfo(); |
852 | TargetLowering::AsmOperandInfoVector TargetConstraints = |
853 | TLI->ParseConstraints(DL, TRI: ST->getRegisterInfo(), Call: *CI); |
854 | |
855 | const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0]; |
856 | |
857 | int OutputIdx = 0; |
858 | for (auto &TC : TargetConstraints) { |
859 | if (TC.Type != InlineAsm::isOutput) |
860 | continue; |
861 | |
862 | // Skip outputs we don't care about. |
863 | if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++) |
864 | continue; |
865 | |
866 | TLI->ComputeConstraintToUse(OpInfo&: TC, Op: SDValue()); |
867 | |
868 | const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint( |
869 | TRI, Constraint: TC.ConstraintCode, VT: TC.ConstraintVT).second; |
870 | |
871 | // For AGPR constraints null is returned on subtargets without AGPRs, so |
872 | // assume divergent for null. |
873 | if (!RC || !TRI->isSGPRClass(RC)) |
874 | return true; |
875 | } |
876 | |
877 | return false; |
878 | } |
879 | |
880 | bool GCNTTIImpl::isReadRegisterSourceOfDivergence( |
881 | const IntrinsicInst *ReadReg) const { |
882 | Metadata *MD = |
883 | cast<MetadataAsValue>(Val: ReadReg->getArgOperand(i: 0))->getMetadata(); |
884 | StringRef RegName = |
885 | cast<MDString>(Val: cast<MDNode>(Val: MD)->getOperand(I: 0))->getString(); |
886 | |
887 | // Special case registers that look like VCC. |
888 | MVT VT = MVT::getVT(Ty: ReadReg->getType()); |
889 | if (VT == MVT::i1) |
890 | return true; |
891 | |
892 | // Special case scalar registers that start with 'v'. |
893 | if (RegName.starts_with(Prefix: "vcc" ) || RegName.empty()) |
894 | return false; |
895 | |
896 | // VGPR or AGPR is divergent. There aren't any specially named vector |
897 | // registers. |
898 | return RegName[0] == 'v' || RegName[0] == 'a'; |
899 | } |
900 | |
901 | /// \returns true if the result of the value could potentially be |
902 | /// different across workitems in a wavefront. |
903 | bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const { |
904 | if (const Argument *A = dyn_cast<Argument>(Val: V)) |
905 | return !AMDGPU::isArgPassedInSGPR(Arg: A); |
906 | |
907 | // Loads from the private and flat address spaces are divergent, because |
908 | // threads can execute the load instruction with the same inputs and get |
909 | // different results. |
910 | // |
911 | // All other loads are not divergent, because if threads issue loads with the |
912 | // same arguments, they will always get the same result. |
913 | if (const LoadInst *Load = dyn_cast<LoadInst>(Val: V)) |
914 | return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS || |
915 | Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS; |
916 | |
917 | // Atomics are divergent because they are executed sequentially: when an |
918 | // atomic operation refers to the same address in each thread, then each |
919 | // thread after the first sees the value written by the previous thread as |
920 | // original value. |
921 | if (isa<AtomicRMWInst>(Val: V) || isa<AtomicCmpXchgInst>(Val: V)) |
922 | return true; |
923 | |
924 | if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V)) { |
925 | if (Intrinsic->getIntrinsicID() == Intrinsic::read_register) |
926 | return isReadRegisterSourceOfDivergence(ReadReg: Intrinsic); |
927 | |
928 | return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: Intrinsic->getIntrinsicID()); |
929 | } |
930 | |
931 | // Assume all function calls are a source of divergence. |
932 | if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) { |
933 | if (CI->isInlineAsm()) |
934 | return isInlineAsmSourceOfDivergence(CI); |
935 | return true; |
936 | } |
937 | |
938 | // Assume all function calls are a source of divergence. |
939 | if (isa<InvokeInst>(Val: V)) |
940 | return true; |
941 | |
942 | return false; |
943 | } |
944 | |
945 | bool GCNTTIImpl::isAlwaysUniform(const Value *V) const { |
946 | if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V)) |
947 | return AMDGPU::isIntrinsicAlwaysUniform(IntrID: Intrinsic->getIntrinsicID()); |
948 | |
949 | if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) { |
950 | if (CI->isInlineAsm()) |
951 | return !isInlineAsmSourceOfDivergence(CI); |
952 | return false; |
953 | } |
954 | |
955 | // In most cases TID / wavefrontsize is uniform. |
956 | // |
  // However, if a kernel has uneven dimensions we can have a value of
  // workitem-id-x divided by the wavefrontsize non-uniform. For example
  // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
  // packed into the same wave which gives 1 and 0 after the division by 64
  // respectively.
  //
  // FIXME: limit it to 1D kernels only, although it should be possible
  // to perform this optimization if the size of the X dimension is a power
  // of 2; we just do not currently have the infrastructure to query it.
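  // For a wave64 1D kernel the patterns below match e.g.
  // (workitem.id.x >> 6) and (workitem.id.x & -64), both uniform per wave.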
966 | using namespace llvm::PatternMatch; |
967 | uint64_t C; |
968 | if (match(V, P: m_LShr(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(), |
969 | R: m_ConstantInt(V&: C))) || |
970 | match(V, P: m_AShr(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(), |
971 | R: m_ConstantInt(V&: C)))) { |
972 | const Function *F = cast<Instruction>(Val: V)->getFunction(); |
973 | return C >= ST->getWavefrontSizeLog2() && |
974 | ST->getMaxWorkitemID(Kernel: *F, Dimension: 1) == 0 && ST->getMaxWorkitemID(Kernel: *F, Dimension: 2) == 0; |
975 | } |
976 | |
977 | Value *Mask; |
978 | if (match(V, P: m_c_And(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(), |
979 | R: m_Value(V&: Mask)))) { |
980 | const Function *F = cast<Instruction>(Val: V)->getFunction(); |
981 | const DataLayout &DL = F->getDataLayout(); |
982 | return computeKnownBits(V: Mask, DL).countMinTrailingZeros() >= |
983 | ST->getWavefrontSizeLog2() && |
984 | ST->getMaxWorkitemID(Kernel: *F, Dimension: 1) == 0 && ST->getMaxWorkitemID(Kernel: *F, Dimension: 2) == 0; |
985 | } |
986 | |
987 | const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(Val: V); |
988 | if (!ExtValue) |
989 | return false; |
990 | |
991 | const CallInst *CI = dyn_cast<CallInst>(Val: ExtValue->getOperand(i_nocapture: 0)); |
992 | if (!CI) |
993 | return false; |
994 | |
995 | if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: CI)) { |
996 | switch (Intrinsic->getIntrinsicID()) { |
997 | default: |
998 | return false; |
999 | case Intrinsic::amdgcn_if: |
1000 | case Intrinsic::amdgcn_else: { |
1001 | ArrayRef<unsigned> Indices = ExtValue->getIndices(); |
1002 | return Indices.size() == 1 && Indices[0] == 1; |
1003 | } |
1004 | } |
1005 | } |
1006 | |
1007 | // If we have inline asm returning mixed SGPR and VGPR results, we inferred |
1008 | // divergent for the overall struct return. We need to override it in the |
1009 | // case we're extracting an SGPR component here. |
1010 | if (CI->isInlineAsm()) |
1011 | return !isInlineAsmSourceOfDivergence(CI, Indices: ExtValue->getIndices()); |
1012 | |
1013 | return false; |
1014 | } |
1015 | |
1016 | bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, |
1017 | Intrinsic::ID IID) const { |
1018 | switch (IID) { |
1019 | case Intrinsic::amdgcn_is_shared: |
1020 | case Intrinsic::amdgcn_is_private: |
1021 | case Intrinsic::amdgcn_flat_atomic_fadd: |
1022 | case Intrinsic::amdgcn_flat_atomic_fmax: |
1023 | case Intrinsic::amdgcn_flat_atomic_fmin: |
1024 | case Intrinsic::amdgcn_flat_atomic_fmax_num: |
1025 | case Intrinsic::amdgcn_flat_atomic_fmin_num: |
1026 | OpIndexes.push_back(Elt: 0); |
1027 | return true; |
1028 | default: |
1029 | return false; |
1030 | } |
1031 | } |
1032 | |
1033 | Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, |
1034 | Value *OldV, |
1035 | Value *NewV) const { |
1036 | auto IntrID = II->getIntrinsicID(); |
1037 | switch (IntrID) { |
1038 | case Intrinsic::amdgcn_is_shared: |
1039 | case Intrinsic::amdgcn_is_private: { |
1040 | unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ? |
1041 | AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS; |
1042 | unsigned NewAS = NewV->getType()->getPointerAddressSpace(); |
1043 | LLVMContext &Ctx = NewV->getType()->getContext(); |
1044 | ConstantInt *NewVal = (TrueAS == NewAS) ? |
1045 | ConstantInt::getTrue(Context&: Ctx) : ConstantInt::getFalse(Context&: Ctx); |
1046 | return NewVal; |
1047 | } |
1048 | case Intrinsic::ptrmask: { |
1049 | unsigned OldAS = OldV->getType()->getPointerAddressSpace(); |
1050 | unsigned NewAS = NewV->getType()->getPointerAddressSpace(); |
1051 | Value *MaskOp = II->getArgOperand(i: 1); |
1052 | Type *MaskTy = MaskOp->getType(); |
1053 | |
1054 | bool DoTruncate = false; |
1055 | |
1056 | const GCNTargetMachine &TM = |
1057 | static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine()); |
1058 | if (!TM.isNoopAddrSpaceCast(SrcAS: OldAS, DestAS: NewAS)) { |
      // All valid 64-bit to 32-bit casts work by chopping off the high
      // bits. Any masking that only clears the low bits will also apply in the
      // new address space.
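      // For example, an alignment mask such as 0xFFFFFFFFFFFFFFF0 has all of
      // its high 32 bits set and can safely be truncated to a 32-bit mask.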
1062 | if (DL.getPointerSizeInBits(AS: OldAS) != 64 || |
1063 | DL.getPointerSizeInBits(AS: NewAS) != 32) |
1064 | return nullptr; |
1065 | |
1066 | // TODO: Do we need to thread more context in here? |
1067 | KnownBits Known = computeKnownBits(V: MaskOp, DL, Depth: 0, AC: nullptr, CxtI: II); |
1068 | if (Known.countMinLeadingOnes() < 32) |
1069 | return nullptr; |
1070 | |
1071 | DoTruncate = true; |
1072 | } |
1073 | |
1074 | IRBuilder<> B(II); |
1075 | if (DoTruncate) { |
1076 | MaskTy = B.getInt32Ty(); |
1077 | MaskOp = B.CreateTrunc(V: MaskOp, DestTy: MaskTy); |
1078 | } |
1079 | |
1080 | return B.CreateIntrinsic(ID: Intrinsic::ptrmask, Types: {NewV->getType(), MaskTy}, |
1081 | Args: {NewV, MaskOp}); |
1082 | } |
1083 | case Intrinsic::amdgcn_flat_atomic_fadd: |
1084 | case Intrinsic::amdgcn_flat_atomic_fmax: |
1085 | case Intrinsic::amdgcn_flat_atomic_fmin: |
1086 | case Intrinsic::amdgcn_flat_atomic_fmax_num: |
1087 | case Intrinsic::amdgcn_flat_atomic_fmin_num: { |
1088 | Type *DestTy = II->getType(); |
1089 | Type *SrcTy = NewV->getType(); |
1090 | unsigned NewAS = SrcTy->getPointerAddressSpace(); |
1091 | if (!AMDGPU::isExtendedGlobalAddrSpace(AS: NewAS)) |
1092 | return nullptr; |
1093 | Module *M = II->getModule(); |
1094 | Function *NewDecl = Intrinsic::getDeclaration(M, id: II->getIntrinsicID(), |
1095 | Tys: {DestTy, SrcTy, DestTy}); |
1096 | II->setArgOperand(i: 0, v: NewV); |
1097 | II->setCalledFunction(NewDecl); |
1098 | return II; |
1099 | } |
1100 | default: |
1101 | return nullptr; |
1102 | } |
1103 | } |
1104 | |
1105 | InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, |
1106 | VectorType *VT, ArrayRef<int> Mask, |
1107 | TTI::TargetCostKind CostKind, |
1108 | int Index, VectorType *SubTp, |
1109 | ArrayRef<const Value *> Args, |
1110 | const Instruction *CxtI) { |
1111 | if (!isa<FixedVectorType>(Val: VT)) |
1112 | return BaseT::getShuffleCost(Kind, Tp: VT, Mask, CostKind, Index, SubTp); |
1113 | |
1114 | Kind = improveShuffleKindFromMask(Kind, Mask, Ty: VT, Index, SubTy&: SubTp); |
1115 | |
1116 | // Larger vector widths may require additional instructions, but are |
1117 | // typically cheaper than scalarized versions. |
1118 | unsigned NumVectorElts = cast<FixedVectorType>(Val: VT)->getNumElements(); |
1119 | if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && |
1120 | DL.getTypeSizeInBits(Ty: VT->getElementType()) == 16) { |
1121 | bool HasVOP3P = ST->hasVOP3PInsts(); |
1122 | unsigned RequestedElts = |
1123 | count_if(Range&: Mask, P: [](int MaskElt) { return MaskElt != -1; }); |
1124 | if (RequestedElts == 0) |
1125 | return 0; |
1126 | switch (Kind) { |
1127 | case TTI::SK_Broadcast: |
1128 | case TTI::SK_Reverse: |
1129 | case TTI::SK_PermuteSingleSrc: { |
1130 | // With op_sel VOP3P instructions freely can access the low half or high |
1131 | // half of a register, so any swizzle of two elements is free. |
1132 | if (HasVOP3P && NumVectorElts == 2) |
1133 | return 0; |
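      // Otherwise each pair of 16-bit elements is typically handled with a
      // permute instruction (e.g. v_perm_b32), which also needs a mask
      // constant.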
1134 | unsigned NumPerms = alignTo(Value: RequestedElts, Align: 2) / 2; |
1135 | // SK_Broadcast just reuses the same mask |
1136 | unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms; |
1137 | return NumPerms + NumPermMasks; |
1138 | } |
1139 | case TTI::SK_ExtractSubvector: |
1140 | case TTI::SK_InsertSubvector: { |
1141 | // Even aligned accesses are free |
1142 | if (!(Index % 2)) |
1143 | return 0; |
1144 | // Insert/extract subvectors only require shifts / extract code to get the |
1145 | // relevant bits |
1146 | return alignTo(Value: RequestedElts, Align: 2) / 2; |
1147 | } |
1148 | case TTI::SK_PermuteTwoSrc: |
1149 | case TTI::SK_Splice: |
1150 | case TTI::SK_Select: { |
1151 | unsigned NumPerms = alignTo(Value: RequestedElts, Align: 2) / 2; |
1152 | // SK_Select just reuses the same mask |
1153 | unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms; |
1154 | return NumPerms + NumPermMasks; |
1155 | } |
1156 | |
1157 | default: |
1158 | break; |
1159 | } |
1160 | } |
1161 | |
1162 | return BaseT::getShuffleCost(Kind, Tp: VT, Mask, CostKind, Index, SubTp); |
1163 | } |
1164 | |
1165 | bool GCNTTIImpl::areInlineCompatible(const Function *Caller, |
1166 | const Function *Callee) const { |
1167 | const TargetMachine &TM = getTLI()->getTargetMachine(); |
1168 | const GCNSubtarget *CallerST |
1169 | = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller)); |
1170 | const GCNSubtarget *CalleeST |
1171 | = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee)); |
1172 | |
1173 | const FeatureBitset &CallerBits = CallerST->getFeatureBits(); |
1174 | const FeatureBitset &CalleeBits = CalleeST->getFeatureBits(); |
1175 | |
1176 | FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; |
1177 | FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; |
1178 | if ((RealCallerBits & RealCalleeBits) != RealCalleeBits) |
1179 | return false; |
1180 | |
1181 | // FIXME: dx10_clamp can just take the caller setting, but there seems to be |
1182 | // no way to support merge for backend defined attributes. |
1183 | SIModeRegisterDefaults CallerMode(*Caller, *CallerST); |
1184 | SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST); |
1185 | if (!CallerMode.isInlineCompatible(CalleeMode)) |
1186 | return false; |
1187 | |
1188 | if (Callee->hasFnAttribute(Kind: Attribute::AlwaysInline) || |
1189 | Callee->hasFnAttribute(Kind: Attribute::InlineHint)) |
1190 | return true; |
1191 | |
1192 | // Hack to make compile times reasonable. |
1193 | if (InlineMaxBB) { |
1194 | // Single BB does not increase total BB amount. |
1195 | if (Callee->size() == 1) |
1196 | return true; |
1197 | size_t BBSize = Caller->size() + Callee->size() - 1; |
1198 | return BBSize <= InlineMaxBB; |
1199 | } |
1200 | |
1201 | return true; |
1202 | } |
1203 | |
1204 | static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, |
1205 | const SITargetLowering *TLI, |
1206 | const GCNTTIImpl *TTIImpl) { |
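  // Rough number of SGPR/VGPR registers available for passing arguments before
  // further arguments have to go through the stack (a heuristic, not an exact
  // ABI limit).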
1207 | const int NrOfSGPRUntilSpill = 26; |
1208 | const int NrOfVGPRUntilSpill = 32; |
1209 | |
1210 | const DataLayout &DL = TTIImpl->getDataLayout(); |
1211 | |
1212 | unsigned adjustThreshold = 0; |
1213 | int SGPRsInUse = 0; |
1214 | int VGPRsInUse = 0; |
1215 | for (const Use &A : CB->args()) { |
1216 | SmallVector<EVT, 4> ValueVTs; |
1217 | ComputeValueVTs(TLI: *TLI, DL, Ty: A.get()->getType(), ValueVTs); |
1218 | for (auto ArgVT : ValueVTs) { |
1219 | unsigned CCRegNum = TLI->getNumRegistersForCallingConv( |
1220 | Context&: CB->getContext(), CC: CB->getCallingConv(), VT: ArgVT); |
1221 | if (AMDGPU::isArgPassedInSGPR(CB, ArgNo: CB->getArgOperandNo(U: &A))) |
1222 | SGPRsInUse += CCRegNum; |
1223 | else |
1224 | VGPRsInUse += CCRegNum; |
1225 | } |
1226 | } |
1227 | |
  // The cost of passing a function argument through the stack:
  // 1 instruction to put the argument on the stack in the caller.
  // 1 instruction to take the argument from the stack in the callee.
  // 1 instruction to explicitly take care of data dependencies in the callee
  // function.
1233 | InstructionCost ArgStackCost(1); |
1234 | ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost( |
1235 | Opcode: Instruction::Store, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align(4), |
1236 | AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency); |
1237 | ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost( |
1238 | Opcode: Instruction::Load, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align(4), |
1239 | AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency); |
1240 | |
1241 | // The penalty cost is computed relative to the cost of instructions and does |
1242 | // not model any storage costs. |
1243 | adjustThreshold += std::max(a: 0, b: SGPRsInUse - NrOfSGPRUntilSpill) * |
1244 | *ArgStackCost.getValue() * InlineConstants::getInstrCost(); |
1245 | adjustThreshold += std::max(a: 0, b: VGPRsInUse - NrOfVGPRUntilSpill) * |
1246 | *ArgStackCost.getValue() * InlineConstants::getInstrCost(); |
1247 | return adjustThreshold; |
1248 | } |
1249 | |
1250 | static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, |
1251 | const DataLayout &DL) { |
1252 | // If we have a pointer to a private array passed into a function |
1253 | // it will not be optimized out, leaving scratch usage. |
1254 | // This function calculates the total size in bytes of the memory that would |
1255 | // end in scratch if the call was not inlined. |
1256 | unsigned AllocaSize = 0; |
1257 | SmallPtrSet<const AllocaInst *, 8> AIVisited; |
1258 | for (Value *PtrArg : CB->args()) { |
1259 | PointerType *Ty = dyn_cast<PointerType>(Val: PtrArg->getType()); |
1260 | if (!Ty) |
1261 | continue; |
1262 | |
1263 | unsigned AddrSpace = Ty->getAddressSpace(); |
1264 | if (AddrSpace != AMDGPUAS::FLAT_ADDRESS && |
1265 | AddrSpace != AMDGPUAS::PRIVATE_ADDRESS) |
1266 | continue; |
1267 | |
1268 | const AllocaInst *AI = dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: PtrArg)); |
1269 | if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(Ptr: AI).second) |
1270 | continue; |
1271 | |
1272 | AllocaSize += DL.getTypeAllocSize(Ty: AI->getAllocatedType()); |
1273 | } |
1274 | return AllocaSize; |
1275 | } |
1276 | |
1277 | unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const { |
1278 | unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, TTIImpl: this); |
1279 | |
1280 | // Private object passed as arguments may end up in scratch usage if the call |
1281 | // is not inlined. Increase the inline threshold to promote inlining. |
1282 | unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL); |
1283 | if (AllocaSize > 0) |
1284 | Threshold += ArgAllocaCost; |
1285 | return Threshold; |
1286 | } |
1287 | |
1288 | unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB, |
1289 | const AllocaInst *AI) const { |
1290 | |
  // Below the cutoff, assume that the private memory objects would be
  // optimized away.
1293 | auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL); |
1294 | if (AllocaSize <= ArgAllocaCutoff) |
1295 | return 0; |
1296 | |
  // Above the cutoff, we give a cost to each private memory object
  // depending on its size. If the array can be optimized by SROA this cost is
  // not added to the total cost in the inliner cost analysis.
1300 | // |
1301 | // We choose the total cost of the alloca such that their sum cancels the |
1302 | // bonus given in the threshold (ArgAllocaCost). |
1303 | // |
1304 | // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost |
1305 | // |
1306 | // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier, |
1307 | // the single-bb bonus and the vector-bonus. |
1308 | // |
1309 | // We compensate the first two multipliers, by repeating logic from the |
1310 | // inliner-cost in here. The vector-bonus is 0 on AMDGPU. |
1311 | static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0" ); |
1312 | unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier(); |
1313 | |
1314 | bool SingleBB = none_of(Range&: *CB->getCalledFunction(), P: [](const BasicBlock &BB) { |
1315 | return BB.getTerminator()->getNumSuccessors() > 1; |
1316 | }); |
1317 | if (SingleBB) { |
1318 | Threshold += Threshold / 2; |
1319 | } |
1320 | |
1321 | auto ArgAllocaSize = DL.getTypeAllocSize(Ty: AI->getAllocatedType()); |
1322 | |
1323 | // Attribute the bonus proportionally to the alloca size |
1324 | unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize; |
1325 | |
1326 | return AllocaThresholdBonus; |
1327 | } |
1328 | |
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) {
1332 | CommonTTI.getUnrollingPreferences(L, SE, UP, ORE); |
1333 | } |
1334 | |
1335 | void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
1336 | TTI::PeelingPreferences &PP) { |
1337 | CommonTTI.getPeelingPreferences(L, SE, PP); |
1338 | } |
1339 | |
1340 | int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const { |
1341 | return ST->hasFullRate64Ops() |
1342 | ? getFullRateInstrCost() |
1343 | : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind) |
1344 | : getQuarterRateInstrCost(CostKind); |
1345 | } |
1346 | |
1347 | std::pair<InstructionCost, MVT> |
1348 | GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const { |
1349 | std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty); |
1350 | auto Size = DL.getTypeSizeInBits(Ty); |
1351 | // Maximum load or store can handle 8 dwords for scalar and 4 for |
1352 | // vector ALU. Let's assume anything above 8 dwords is expensive |
1353 | // even if legal. |
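  // For example, a 1024-bit type gets an extra cost of (1024 + 255) / 256 = 4
  // below.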
1354 | if (Size <= 256) |
1355 | return Cost; |
1356 | |
1357 | Cost.first += (Size + 255) / 256; |
1358 | return Cost; |
1359 | } |
1360 | |
1361 | unsigned GCNTTIImpl::getPrefetchDistance() const { |
1362 | return ST->hasPrefetch() ? 128 : 0; |
1363 | } |
1364 | |
1365 | bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const { |
1366 | return AMDGPU::isFlatGlobalAddrSpace(AS); |
1367 | } |
1368 | |