1//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
/// \file
/// This file contains both the AMDGPU target machine and the CodeGen pass
/// builder. The AMDGPU target machine holds all of the hardware-specific
/// information needed to emit code for SI+ GPUs in the legacy pass manager
/// pipeline. The CodeGen pass builder handles the pass pipeline for the new
/// pass manager.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUTargetMachine.h"
18#include "AMDGPU.h"
19#include "AMDGPUAliasAnalysis.h"
20#include "AMDGPUArgumentUsageInfo.h"
21#include "AMDGPUBarrierLatency.h"
22#include "AMDGPUCtorDtorLowering.h"
23#include "AMDGPUExportClustering.h"
24#include "AMDGPUExportKernelRuntimeHandles.h"
25#include "AMDGPUHazardLatency.h"
26#include "AMDGPUIGroupLP.h"
27#include "AMDGPUISelDAGToDAG.h"
28#include "AMDGPULowerVGPREncoding.h"
29#include "AMDGPUMacroFusion.h"
30#include "AMDGPUPerfHintAnalysis.h"
31#include "AMDGPUPreloadKernArgProlog.h"
32#include "AMDGPUPrepareAGPRAlloc.h"
33#include "AMDGPURemoveIncompatibleFunctions.h"
34#include "AMDGPUReserveWWMRegs.h"
35#include "AMDGPUResourceUsageAnalysis.h"
36#include "AMDGPUSplitModule.h"
37#include "AMDGPUTargetObjectFile.h"
38#include "AMDGPUTargetTransformInfo.h"
39#include "AMDGPUUnifyDivergentExitNodes.h"
40#include "AMDGPUWaitSGPRHazards.h"
41#include "GCNDPPCombine.h"
42#include "GCNIterativeScheduler.h"
43#include "GCNNSAReassign.h"
44#include "GCNPreRALongBranchReg.h"
45#include "GCNPreRAOptimizations.h"
46#include "GCNRewritePartialRegUses.h"
47#include "GCNSchedStrategy.h"
48#include "GCNVOPDUtils.h"
49#include "R600.h"
50#include "R600TargetMachine.h"
51#include "SIFixSGPRCopies.h"
52#include "SIFixVGPRCopies.h"
53#include "SIFoldOperands.h"
54#include "SIFormMemoryClauses.h"
55#include "SILoadStoreOptimizer.h"
56#include "SILowerControlFlow.h"
57#include "SILowerSGPRSpills.h"
58#include "SILowerWWMCopies.h"
59#include "SIMachineFunctionInfo.h"
60#include "SIMachineScheduler.h"
61#include "SIOptimizeExecMasking.h"
62#include "SIOptimizeExecMaskingPreRA.h"
63#include "SIOptimizeVGPRLiveRange.h"
64#include "SIPeepholeSDWA.h"
65#include "SIPostRABundler.h"
66#include "SIPreAllocateWWMRegs.h"
67#include "SIShrinkInstructions.h"
68#include "SIWholeQuadMode.h"
69#include "TargetInfo/AMDGPUTargetInfo.h"
70#include "Utils/AMDGPUBaseInfo.h"
71#include "llvm/Analysis/CGSCCPassManager.h"
72#include "llvm/Analysis/CallGraphSCCPass.h"
73#include "llvm/Analysis/KernelInfo.h"
74#include "llvm/Analysis/UniformityAnalysis.h"
75#include "llvm/CodeGen/AtomicExpand.h"
76#include "llvm/CodeGen/BranchRelaxation.h"
77#include "llvm/CodeGen/DeadMachineInstructionElim.h"
78#include "llvm/CodeGen/EarlyIfConversion.h"
79#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
80#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
81#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
82#include "llvm/CodeGen/GlobalISel/Legalizer.h"
83#include "llvm/CodeGen/GlobalISel/Localizer.h"
84#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
85#include "llvm/CodeGen/MIRParser/MIParser.h"
86#include "llvm/CodeGen/MachineCSE.h"
87#include "llvm/CodeGen/MachineLICM.h"
88#include "llvm/CodeGen/MachineScheduler.h"
89#include "llvm/CodeGen/Passes.h"
90#include "llvm/CodeGen/PostRAHazardRecognizer.h"
91#include "llvm/CodeGen/RegAllocRegistry.h"
92#include "llvm/CodeGen/TargetPassConfig.h"
93#include "llvm/IR/IntrinsicsAMDGPU.h"
94#include "llvm/IR/PassManager.h"
95#include "llvm/IR/PatternMatch.h"
96#include "llvm/InitializePasses.h"
97#include "llvm/MC/TargetRegistry.h"
98#include "llvm/Passes/CodeGenPassBuilder.h"
99#include "llvm/Passes/PassBuilder.h"
100#include "llvm/Support/Compiler.h"
101#include "llvm/Support/FormatVariadic.h"
102#include "llvm/Transforms/HipStdPar/HipStdPar.h"
103#include "llvm/Transforms/IPO.h"
104#include "llvm/Transforms/IPO/AlwaysInliner.h"
105#include "llvm/Transforms/IPO/ExpandVariadics.h"
106#include "llvm/Transforms/IPO/GlobalDCE.h"
107#include "llvm/Transforms/IPO/Internalize.h"
108#include "llvm/Transforms/Scalar.h"
109#include "llvm/Transforms/Scalar/EarlyCSE.h"
110#include "llvm/Transforms/Scalar/FlattenCFG.h"
111#include "llvm/Transforms/Scalar/GVN.h"
112#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
113#include "llvm/Transforms/Scalar/LICM.h"
114#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
115#include "llvm/Transforms/Scalar/LoopPassManager.h"
116#include "llvm/Transforms/Scalar/NaryReassociate.h"
117#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
118#include "llvm/Transforms/Scalar/Sink.h"
119#include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h"
120#include "llvm/Transforms/Scalar/StructurizeCFG.h"
121#include "llvm/Transforms/Utils.h"
122#include "llvm/Transforms/Utils/FixIrreducible.h"
123#include "llvm/Transforms/Utils/LCSSA.h"
124#include "llvm/Transforms/Utils/LowerSwitch.h"
125#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
126#include "llvm/Transforms/Utils/UnifyLoopExits.h"
127#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
128#include <optional>
129
130using namespace llvm;
131using namespace llvm::PatternMatch;
132
133namespace {
134//===----------------------------------------------------------------------===//
135// AMDGPU CodeGen Pass Builder interface.
136//===----------------------------------------------------------------------===//
137
138class AMDGPUCodeGenPassBuilder
139 : public CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine> {
140 using Base = CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine>;
141
142public:
143 AMDGPUCodeGenPassBuilder(GCNTargetMachine &TM,
144 const CGPassBuilderOption &Opts,
145 PassInstrumentationCallbacks *PIC);
146
147 void addIRPasses(PassManagerWrapper &PMW) const;
148 void addCodeGenPrepare(PassManagerWrapper &PMW) const;
149 void addPreISel(PassManagerWrapper &PMW) const;
  void addILPOpts(PassManagerWrapper &PMW) const;
151 void addAsmPrinter(PassManagerWrapper &PMW, CreateMCStreamer) const;
152 Error addInstSelector(PassManagerWrapper &PMW) const;
153 void addPreRewrite(PassManagerWrapper &PMW) const;
154 void addMachineSSAOptimization(PassManagerWrapper &PMW) const;
155 void addPostRegAlloc(PassManagerWrapper &PMW) const;
  void addPreEmitPass(PassManagerWrapper &PMW) const;
157 void addPreEmitRegAlloc(PassManagerWrapper &PMW) const;
158 Error addRegAssignmentFast(PassManagerWrapper &PMW) const;
159 Error addRegAssignmentOptimized(PassManagerWrapper &PMW) const;
160 void addPreRegAlloc(PassManagerWrapper &PMW) const;
161 Error addFastRegAlloc(PassManagerWrapper &PMW) const;
162 void addOptimizedRegAlloc(PassManagerWrapper &PMW) const;
163 void addPreSched2(PassManagerWrapper &PMW) const;
164 void addPostBBSections(PassManagerWrapper &PMW) const;
165
  /// Check if a pass is enabled given the \p Opt option. If the option is
  /// explicitly set, it always overrides the default. Otherwise the default is
  /// used, provided the pass is meant to run at an optimization level of at
  /// least \p Level.
169 bool isPassEnabled(const cl::opt<bool> &Opt,
170 CodeGenOptLevel Level = CodeGenOptLevel::Default) const;
171 void addEarlyCSEOrGVNPass(PassManagerWrapper &PMW) const;
172 void addStraightLineScalarOptimizationPasses(PassManagerWrapper &PMW) const;
173};
174
175class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
176public:
177 SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
178 : RegisterRegAllocBase(N, D, C) {}
179};
180
181class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
182public:
183 VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
184 : RegisterRegAllocBase(N, D, C) {}
185};
186
187class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
188public:
189 WWMRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
190 : RegisterRegAllocBase(N, D, C) {}
191};
192
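// Register-class filter predicates used to split register allocation into
// separate runs: one over SGPRs, one over WWM-flagged VGPRs, and one over
// VGPRs in general. They also back the "sgpr", "wwm" and "vgpr" regalloc
// filter names registered with the pass builder below.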
193static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
194 const MachineRegisterInfo &MRI,
195 const Register Reg) {
196 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
197 return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
198}
199
200static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
201 const MachineRegisterInfo &MRI,
202 const Register Reg) {
203 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
204 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
205}
206
207static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI,
208 const MachineRegisterInfo &MRI,
209 const Register Reg) {
210 const SIMachineFunctionInfo *MFI =
211 MRI.getMF().getInfo<SIMachineFunctionInfo>();
212 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
213 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) &&
214 MFI->checkFlag(Reg, Flag: AMDGPU::VirtRegFlag::WWM_REG);
215}
216
217/// -{sgpr|wwm|vgpr}-regalloc=... command line option.
218static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
219
220/// A dummy default pass factory indicates whether the register allocator is
221/// overridden on the command line.
222static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
223static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
224static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;
225
226static SGPRRegisterRegAlloc
227defaultSGPRRegAlloc("default",
228 "pick SGPR register allocator based on -O option",
229 useDefaultRegisterAllocator);
230
231static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
232 RegisterPassParser<SGPRRegisterRegAlloc>>
233SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(Val: &useDefaultRegisterAllocator),
234 cl::desc("Register allocator to use for SGPRs"));
235
236static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
237 RegisterPassParser<VGPRRegisterRegAlloc>>
238VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(Val: &useDefaultRegisterAllocator),
239 cl::desc("Register allocator to use for VGPRs"));
240
241static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
242 RegisterPassParser<WWMRegisterRegAlloc>>
243 WWMRegAlloc("wwm-regalloc", cl::Hidden,
244 cl::init(Val: &useDefaultRegisterAllocator),
245 cl::desc("Register allocator to use for WWM registers"));
246
247static void initializeDefaultSGPRRegisterAllocatorOnce() {
248 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
249
250 if (!Ctor) {
251 Ctor = SGPRRegAlloc;
252 SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
253 }
254}
255
256static void initializeDefaultVGPRRegisterAllocatorOnce() {
257 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
258
259 if (!Ctor) {
260 Ctor = VGPRRegAlloc;
261 VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
262 }
263}
264
265static void initializeDefaultWWMRegisterAllocatorOnce() {
266 RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
267
268 if (!Ctor) {
269 Ctor = WWMRegAlloc;
270 WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
271 }
272}
273
274static FunctionPass *createBasicSGPRRegisterAllocator() {
275 return createBasicRegisterAllocator(F: onlyAllocateSGPRs);
276}
277
278static FunctionPass *createGreedySGPRRegisterAllocator() {
279 return createGreedyRegisterAllocator(F: onlyAllocateSGPRs);
280}
281
282static FunctionPass *createFastSGPRRegisterAllocator() {
283 return createFastRegisterAllocator(F: onlyAllocateSGPRs, ClearVirtRegs: false);
284}
285
286static FunctionPass *createBasicVGPRRegisterAllocator() {
287 return createBasicRegisterAllocator(F: onlyAllocateVGPRs);
288}
289
290static FunctionPass *createGreedyVGPRRegisterAllocator() {
291 return createGreedyRegisterAllocator(F: onlyAllocateVGPRs);
292}
293
294static FunctionPass *createFastVGPRRegisterAllocator() {
295 return createFastRegisterAllocator(F: onlyAllocateVGPRs, ClearVirtRegs: true);
296}
297
298static FunctionPass *createBasicWWMRegisterAllocator() {
299 return createBasicRegisterAllocator(F: onlyAllocateWWMRegs);
300}
301
302static FunctionPass *createGreedyWWMRegisterAllocator() {
303 return createGreedyRegisterAllocator(F: onlyAllocateWWMRegs);
304}
305
306static FunctionPass *createFastWWMRegisterAllocator() {
307 return createFastRegisterAllocator(F: onlyAllocateWWMRegs, ClearVirtRegs: false);
308}
309
310static SGPRRegisterRegAlloc basicRegAllocSGPR(
311 "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
312static SGPRRegisterRegAlloc greedyRegAllocSGPR(
313 "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
314
315static SGPRRegisterRegAlloc fastRegAllocSGPR(
316 "fast", "fast register allocator", createFastSGPRRegisterAllocator);
317
318
319static VGPRRegisterRegAlloc basicRegAllocVGPR(
320 "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
321static VGPRRegisterRegAlloc greedyRegAllocVGPR(
322 "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
323
324static VGPRRegisterRegAlloc fastRegAllocVGPR(
325 "fast", "fast register allocator", createFastVGPRRegisterAllocator);
326static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
327 "basic register allocator",
328 createBasicWWMRegisterAllocator);
329static WWMRegisterRegAlloc
330 greedyRegAllocWWMReg("greedy", "greedy register allocator",
331 createGreedyWWMRegisterAllocator);
332static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator",
333 createFastWWMRegisterAllocator);
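// The allocator for each register class can be selected independently on the
// command line, e.g.:
//   llc ... -sgpr-regalloc=greedy -wwm-regalloc=basic -vgpr-regalloc=fast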
334
335static bool isLTOPreLink(ThinOrFullLTOPhase Phase) {
336 return Phase == ThinOrFullLTOPhase::FullLTOPreLink ||
337 Phase == ThinOrFullLTOPhase::ThinLTOPreLink;
338}
339} // anonymous namespace
340
341static cl::opt<bool>
342EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
343 cl::desc("Run early if-conversion"),
344 cl::init(Val: false));
345
346static cl::opt<bool>
347OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
348 cl::desc("Run pre-RA exec mask optimizations"),
349 cl::init(Val: true));
350
351static cl::opt<bool>
352 LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
353 cl::desc("Lower GPU ctor / dtors to globals on the device."),
354 cl::init(Val: true), cl::Hidden);
355
356// Option to disable vectorizer for tests.
357static cl::opt<bool> EnableLoadStoreVectorizer(
358 "amdgpu-load-store-vectorizer",
359 cl::desc("Enable load store vectorizer"),
360 cl::init(Val: true),
361 cl::Hidden);
362
363// Option to control global loads scalarization
364static cl::opt<bool> ScalarizeGlobal(
365 "amdgpu-scalarize-global-loads",
366 cl::desc("Enable global load scalarization"),
367 cl::init(Val: true),
368 cl::Hidden);
369
370// Option to run internalize pass.
371static cl::opt<bool> InternalizeSymbols(
372 "amdgpu-internalize-symbols",
373 cl::desc("Enable elimination of non-kernel functions and unused globals"),
374 cl::init(Val: false),
375 cl::Hidden);
376
377// Option to inline all early.
378static cl::opt<bool> EarlyInlineAll(
379 "amdgpu-early-inline-all",
380 cl::desc("Inline all functions early"),
381 cl::init(Val: false),
382 cl::Hidden);
383
static cl::opt<bool> RemoveIncompatibleFunctions(
    "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
    cl::desc("Enable removal of functions when they "
             "use features not supported by the target GPU"),
    cl::init(true));
389
390static cl::opt<bool> EnableSDWAPeephole(
391 "amdgpu-sdwa-peephole",
392 cl::desc("Enable SDWA peepholer"),
393 cl::init(Val: true));
394
395static cl::opt<bool> EnableDPPCombine(
396 "amdgpu-dpp-combine",
397 cl::desc("Enable DPP combiner"),
398 cl::init(Val: true));
399
400// Enable address space based alias analysis
401static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
402 cl::desc("Enable AMDGPU Alias Analysis"),
403 cl::init(Val: true));
404
// Enable libcall simplifications
406static cl::opt<bool> EnableLibCallSimplify(
407 "amdgpu-simplify-libcall",
408 cl::desc("Enable amdgpu library simplifications"),
409 cl::init(Val: true),
410 cl::Hidden);
411
412static cl::opt<bool> EnableLowerKernelArguments(
413 "amdgpu-ir-lower-kernel-arguments",
414 cl::desc("Lower kernel argument loads in IR pass"),
415 cl::init(Val: true),
416 cl::Hidden);
417
418static cl::opt<bool> EnableRegReassign(
419 "amdgpu-reassign-regs",
420 cl::desc("Enable register reassign optimizations on gfx10+"),
421 cl::init(Val: true),
422 cl::Hidden);
423
424static cl::opt<bool> OptVGPRLiveRange(
425 "amdgpu-opt-vgpr-liverange",
426 cl::desc("Enable VGPR liverange optimizations for if-else structure"),
427 cl::init(Val: true), cl::Hidden);
428
429static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
430 "amdgpu-atomic-optimizer-strategy",
431 cl::desc("Select DPP or Iterative strategy for scan"),
432 cl::init(Val: ScanOptions::Iterative),
433 cl::values(
434 clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
435 clEnumValN(ScanOptions::Iterative, "Iterative",
436 "Use Iterative approach for scan"),
437 clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
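// For example, -amdgpu-atomic-optimizer-strategy=DPP selects the DPP-based
// scan; the default is the Iterative strategy, and None disables the
// optimizer entirely.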
438
439// Enable Mode register optimization
440static cl::opt<bool> EnableSIModeRegisterPass(
441 "amdgpu-mode-register",
442 cl::desc("Enable mode register pass"),
443 cl::init(Val: true),
444 cl::Hidden);
445
446// Enable GFX11+ s_delay_alu insertion
447static cl::opt<bool>
448 EnableInsertDelayAlu("amdgpu-enable-delay-alu",
449 cl::desc("Enable s_delay_alu insertion"),
450 cl::init(Val: true), cl::Hidden);
451
452// Enable GFX11+ VOPD
453static cl::opt<bool>
454 EnableVOPD("amdgpu-enable-vopd",
455 cl::desc("Enable VOPD, dual issue of VALU in wave32"),
456 cl::init(Val: true), cl::Hidden);
457
// This option is used in lit tests to prevent dead-code elimination of the
// patterns being inspected.
459static cl::opt<bool>
460EnableDCEInRA("amdgpu-dce-in-ra",
461 cl::init(Val: true), cl::Hidden,
462 cl::desc("Enable machine DCE inside regalloc"));
463
464static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
465 cl::desc("Adjust wave priority"),
466 cl::init(Val: false), cl::Hidden);
467
468static cl::opt<bool> EnableScalarIRPasses(
469 "amdgpu-scalar-ir-passes",
470 cl::desc("Enable scalar IR passes"),
471 cl::init(Val: true),
472 cl::Hidden);
473
474static cl::opt<bool> EnableLowerExecSync(
475 "amdgpu-enable-lower-exec-sync",
476 cl::desc("Enable lowering of execution synchronization."), cl::init(Val: true),
477 cl::Hidden);
478
479static cl::opt<bool>
480 EnableSwLowerLDS("amdgpu-enable-sw-lower-lds",
481 cl::desc("Enable lowering of lds to global memory pass "
482 "and asan instrument resulting IR."),
483 cl::init(Val: true), cl::Hidden);
484
485static cl::opt<bool, true> EnableLowerModuleLDS(
486 "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
487 cl::location(L&: AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(Val: true),
488 cl::Hidden);
489
490static cl::opt<bool> EnablePreRAOptimizations(
491 "amdgpu-enable-pre-ra-optimizations",
492 cl::desc("Enable Pre-RA optimizations pass"), cl::init(Val: true),
493 cl::Hidden);
494
495static cl::opt<bool> EnablePromoteKernelArguments(
496 "amdgpu-enable-promote-kernel-arguments",
497 cl::desc("Enable promotion of flat kernel pointer arguments to global"),
498 cl::Hidden, cl::init(Val: true));
499
500static cl::opt<bool> EnableImageIntrinsicOptimizer(
501 "amdgpu-enable-image-intrinsic-optimizer",
502 cl::desc("Enable image intrinsic optimizer pass"), cl::init(Val: true),
503 cl::Hidden);
504
505static cl::opt<bool>
506 EnableLoopPrefetch("amdgpu-loop-prefetch",
507 cl::desc("Enable loop data prefetch on AMDGPU"),
508 cl::Hidden, cl::init(Val: false));
509
510static cl::opt<std::string>
511 AMDGPUSchedStrategy("amdgpu-sched-strategy",
512 cl::desc("Select custom AMDGPU scheduling strategy."),
513 cl::Hidden, cl::init(Val: ""));
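// Recognized values mirror GCNTargetMachine::createMachineScheduler below:
// "max-ilp", "max-memory-clause", "iterative-ilp", "iterative-minreg" and
// "iterative-maxocc"; any other (or empty) value falls back to the default
// max-occupancy scheduler.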
514
515static cl::opt<bool> EnableRewritePartialRegUses(
516 "amdgpu-enable-rewrite-partial-reg-uses",
517 cl::desc("Enable rewrite partial reg uses pass"), cl::init(Val: true),
518 cl::Hidden);
519
520static cl::opt<bool> EnableHipStdPar(
521 "amdgpu-enable-hipstdpar",
522 cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(Val: false),
523 cl::Hidden);
524
525static cl::opt<bool>
526 EnableAMDGPUAttributor("amdgpu-attributor-enable",
527 cl::desc("Enable AMDGPUAttributorPass"),
528 cl::init(Val: true), cl::Hidden);
529
530static cl::opt<bool> NewRegBankSelect(
531 "new-reg-bank-select",
532 cl::desc("Run amdgpu-regbankselect and amdgpu-regbanklegalize instead of "
533 "regbankselect"),
534 cl::init(Val: false), cl::Hidden);
535
536static cl::opt<bool> HasClosedWorldAssumption(
537 "amdgpu-link-time-closed-world",
538 cl::desc("Whether has closed-world assumption at link time"),
539 cl::init(Val: false), cl::Hidden);
540
541static cl::opt<bool> EnableUniformIntrinsicCombine(
542 "amdgpu-enable-uniform-intrinsic-combine",
543 cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
544 cl::init(Val: true), cl::Hidden);
545
546extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
547 // Register the target
548 RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
549 RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
550
551 PassRegistry *PR = PassRegistry::getPassRegistry();
552 initializeR600ClauseMergePassPass(*PR);
553 initializeR600ControlFlowFinalizerPass(*PR);
554 initializeR600PacketizerPass(*PR);
555 initializeR600ExpandSpecialInstrsPassPass(*PR);
556 initializeR600VectorRegMergerPass(*PR);
557 initializeR600EmitClauseMarkersPass(*PR);
558 initializeR600MachineCFGStructurizerPass(*PR);
559 initializeGlobalISel(*PR);
560 initializeAMDGPUAsmPrinterPass(*PR);
561 initializeAMDGPUDAGToDAGISelLegacyPass(*PR);
562 initializeAMDGPUPrepareAGPRAllocLegacyPass(*PR);
563 initializeGCNDPPCombineLegacyPass(*PR);
564 initializeSILowerI1CopiesLegacyPass(*PR);
565 initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
566 initializeAMDGPURegBankSelectPass(*PR);
567 initializeAMDGPURegBankLegalizePass(*PR);
568 initializeSILowerWWMCopiesLegacyPass(*PR);
569 initializeAMDGPUMarkLastScratchLoadLegacyPass(*PR);
570 initializeSILowerSGPRSpillsLegacyPass(*PR);
571 initializeSIFixSGPRCopiesLegacyPass(*PR);
572 initializeSIFixVGPRCopiesLegacyPass(*PR);
573 initializeSIFoldOperandsLegacyPass(*PR);
574 initializeSIPeepholeSDWALegacyPass(*PR);
575 initializeSIShrinkInstructionsLegacyPass(*PR);
576 initializeSIOptimizeExecMaskingPreRALegacyPass(*PR);
577 initializeSIOptimizeVGPRLiveRangeLegacyPass(*PR);
578 initializeSILoadStoreOptimizerLegacyPass(*PR);
579 initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
580 initializeAMDGPUAlwaysInlinePass(*PR);
581 initializeAMDGPULowerExecSyncLegacyPass(*PR);
582 initializeAMDGPUSwLowerLDSLegacyPass(*PR);
583 initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
584 initializeAMDGPUArgumentUsageInfoWrapperLegacyPass(*PR);
585 initializeAMDGPUAtomicOptimizerPass(*PR);
586 initializeAMDGPULowerKernelArgumentsPass(*PR);
587 initializeAMDGPUPromoteKernelArgumentsPass(*PR);
588 initializeAMDGPULowerKernelAttributesPass(*PR);
589 initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR);
590 initializeAMDGPUPostLegalizerCombinerPass(*PR);
591 initializeAMDGPUPreLegalizerCombinerPass(*PR);
592 initializeAMDGPURegBankCombinerPass(*PR);
593 initializeAMDGPUPromoteAllocaPass(*PR);
594 initializeAMDGPUCodeGenPreparePass(*PR);
595 initializeAMDGPULateCodeGenPrepareLegacyPass(*PR);
596 initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);
597 initializeAMDGPULowerModuleLDSLegacyPass(*PR);
598 initializeAMDGPULowerBufferFatPointersPass(*PR);
599 initializeAMDGPULowerIntrinsicsLegacyPass(*PR);
600 initializeAMDGPUReserveWWMRegsLegacyPass(*PR);
601 initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR);
602 initializeAMDGPURewriteOutArgumentsPass(*PR);
603 initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
604 initializeSIAnnotateControlFlowLegacyPass(*PR);
605 initializeAMDGPUInsertDelayAluLegacyPass(*PR);
606 initializeAMDGPULowerVGPREncodingLegacyPass(*PR);
607 initializeSIInsertHardClausesLegacyPass(*PR);
608 initializeSIInsertWaitcntsLegacyPass(*PR);
609 initializeSIModeRegisterLegacyPass(*PR);
610 initializeSIWholeQuadModeLegacyPass(*PR);
611 initializeSILowerControlFlowLegacyPass(*PR);
612 initializeSIPreEmitPeepholeLegacyPass(*PR);
613 initializeSILateBranchLoweringLegacyPass(*PR);
614 initializeSIMemoryLegalizerLegacyPass(*PR);
615 initializeSIOptimizeExecMaskingLegacyPass(*PR);
616 initializeSIPreAllocateWWMRegsLegacyPass(*PR);
617 initializeSIFormMemoryClausesLegacyPass(*PR);
618 initializeSIPostRABundlerLegacyPass(*PR);
619 initializeGCNCreateVOPDLegacyPass(*PR);
620 initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
621 initializeAMDGPUAAWrapperPassPass(*PR);
622 initializeAMDGPUExternalAAWrapperPass(*PR);
623 initializeAMDGPUImageIntrinsicOptimizerPass(*PR);
624 initializeAMDGPUPrintfRuntimeBindingPass(*PR);
625 initializeAMDGPUResourceUsageAnalysisWrapperPassPass(*PR);
626 initializeGCNNSAReassignLegacyPass(*PR);
627 initializeGCNPreRAOptimizationsLegacyPass(*PR);
628 initializeGCNPreRALongBranchRegLegacyPass(*PR);
629 initializeGCNRewritePartialRegUsesLegacyPass(*PR);
630 initializeGCNRegPressurePrinterPass(*PR);
631 initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
632 initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
633 initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR);
634 initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR);
635}
636
637static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
638 return std::make_unique<AMDGPUTargetObjectFile>();
639}
640
641static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
642 return new SIScheduleDAGMI(C);
643}
644
645static ScheduleDAGInstrs *
646createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
647 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
648 ScheduleDAGMILive *DAG =
649 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(args&: C));
650 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
651 if (ST.shouldClusterStores())
652 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
653 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
654 DAG->addMutation(Mutation: createAMDGPUMacroFusionDAGMutation());
655 DAG->addMutation(Mutation: createAMDGPUExportClusteringDAGMutation());
656 DAG->addMutation(Mutation: createAMDGPUBarrierLatencyDAGMutation(MF: C->MF));
657 DAG->addMutation(Mutation: createAMDGPUHazardLatencyDAGMutation(MF: C->MF));
658 return DAG;
659}
660
661static ScheduleDAGInstrs *
662createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
663 ScheduleDAGMILive *DAG =
664 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(args&: C));
665 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
666 return DAG;
667}
668
669static ScheduleDAGInstrs *
670createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
671 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
672 ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
673 C, std::make_unique<GCNMaxMemoryClauseSchedStrategy>(args&: C));
674 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
675 if (ST.shouldClusterStores())
676 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
677 DAG->addMutation(Mutation: createAMDGPUExportClusteringDAGMutation());
678 DAG->addMutation(Mutation: createAMDGPUBarrierLatencyDAGMutation(MF: C->MF));
679 DAG->addMutation(Mutation: createAMDGPUHazardLatencyDAGMutation(MF: C->MF));
680 return DAG;
681}
682
683static ScheduleDAGInstrs *
684createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
685 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
686 auto *DAG = new GCNIterativeScheduler(
687 C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
688 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
689 if (ST.shouldClusterStores())
690 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
691 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
692 return DAG;
693}
694
695static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
696 auto *DAG = new GCNIterativeScheduler(
697 C, GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
698 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
699 return DAG;
700}
701
702static ScheduleDAGInstrs *
703createIterativeILPMachineScheduler(MachineSchedContext *C) {
704 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
705 auto *DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);
706 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
707 if (ST.shouldClusterStores())
708 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
709 DAG->addMutation(Mutation: createAMDGPUMacroFusionDAGMutation());
710 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
711 return DAG;
712}
713
714static MachineSchedRegistry
715SISchedRegistry("si", "Run SI's custom scheduler",
716 createSIMachineScheduler);
717
718static MachineSchedRegistry
719GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
720 "Run GCN scheduler to maximize occupancy",
721 createGCNMaxOccupancyMachineScheduler);
722
723static MachineSchedRegistry
724 GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
725 createGCNMaxILPMachineScheduler);
726
727static MachineSchedRegistry GCNMaxMemoryClauseSchedRegistry(
728 "gcn-max-memory-clause", "Run GCN scheduler to maximize memory clause",
729 createGCNMaxMemoryClauseMachineScheduler);
730
731static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
732 "gcn-iterative-max-occupancy-experimental",
733 "Run GCN scheduler to maximize occupancy (experimental)",
734 createIterativeGCNMaxOccupancyMachineScheduler);
735
736static MachineSchedRegistry GCNMinRegSchedRegistry(
737 "gcn-iterative-minreg",
738 "Run GCN iterative scheduler for minimal register usage (experimental)",
739 createMinRegScheduler);
740
741static MachineSchedRegistry GCNILPSchedRegistry(
742 "gcn-iterative-ilp",
743 "Run GCN iterative scheduler for ILP scheduling (experimental)",
744 createIterativeILPMachineScheduler);
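// These registrations make the schedulers selectable by name, e.g. with
// llc -misched=gcn-max-occupancy or -misched=gcn-iterative-ilp.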
745
746LLVM_READNONE
747static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
748 if (!GPU.empty())
749 return GPU;
750
751 // Need to default to a target with flat support for HSA.
752 if (TT.isAMDGCN())
753 return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
754
755 return "r600";
756}
757
758static Reloc::Model getEffectiveRelocModel() {
759 // The AMDGPU toolchain only supports generating shared objects, so we
760 // must always use PIC.
761 return Reloc::PIC_;
762}
763
764AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
765 StringRef CPU, StringRef FS,
766 const TargetOptions &Options,
767 std::optional<Reloc::Model> RM,
768 std::optional<CodeModel::Model> CM,
769 CodeGenOptLevel OptLevel)
770 : CodeGenTargetMachineImpl(
771 T, TT.computeDataLayout(), TT, getGPUOrDefault(TT, GPU: CPU), FS, Options,
772 getEffectiveRelocModel(), getEffectiveCodeModel(CM, Default: CodeModel::Small),
773 OptLevel),
774 TLOF(createTLOF(TT: getTargetTriple())) {
775 initAsmInfo();
776 if (TT.isAMDGCN()) {
777 if (getMCSubtargetInfo()->checkFeatures(FS: "+wavefrontsize64"))
778 MRI.reset(p: llvm::createGCNMCRegisterInfo(DwarfFlavour: AMDGPUDwarfFlavour::Wave64));
779 else if (getMCSubtargetInfo()->checkFeatures(FS: "+wavefrontsize32"))
780 MRI.reset(p: llvm::createGCNMCRegisterInfo(DwarfFlavour: AMDGPUDwarfFlavour::Wave32));
781 }
782}
783
784bool AMDGPUTargetMachine::EnableFunctionCalls = false;
785bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
786
787AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
788
789StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
790 Attribute GPUAttr = F.getFnAttribute(Kind: "target-cpu");
791 return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
792}
793
794StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
795 Attribute FSAttr = F.getFnAttribute(Kind: "target-features");
796
797 return FSAttr.isValid() ? FSAttr.getValueAsString()
798 : getTargetFeatureString();
799}
800
801llvm::ScheduleDAGInstrs *
802AMDGPUTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
803 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
804 ScheduleDAGMILive *DAG = createSchedLive(C);
805 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
806 if (ST.shouldClusterStores())
807 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
808 return DAG;
809}
810
811/// Predicate for Internalize pass.
812static bool mustPreserveGV(const GlobalValue &GV) {
813 if (const Function *F = dyn_cast<Function>(Val: &GV))
814 return F->isDeclaration() || F->getName().starts_with(Prefix: "__asan_") ||
815 F->getName().starts_with(Prefix: "__sanitizer_") ||
816 AMDGPU::isEntryFunctionCC(CC: F->getCallingConv());
817
818 GV.removeDeadConstantUsers();
819 return !GV.use_empty();
820}
821
822void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
823 if (EnableAMDGPUAliasAnalysis)
824 AAM.registerFunctionAnalysis<AMDGPUAA>();
825}
826
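// Parse the scan-strategy parameter of the atomic optimizer pass as it appears
// in -passes pipeline strings, e.g. "strategy=dpp"; an empty string or
// "strategy=iterative" selects the iterative scan, and "strategy=none"
// disables the optimizer.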
827static Expected<ScanOptions>
828parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
829 if (Params.empty())
830 return ScanOptions::Iterative;
831 Params.consume_front(Prefix: "strategy=");
832 auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
833 .Case(S: "dpp", Value: ScanOptions::DPP)
834 .Cases(CaseStrings: {"iterative", ""}, Value: ScanOptions::Iterative)
835 .Case(S: "none", Value: ScanOptions::None)
836 .Default(Value: std::nullopt);
837 if (Result)
838 return *Result;
839 return make_error<StringError>(Args: "invalid parameter", Args: inconvertibleErrorCode());
840}
841
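// Parse AMDGPUAttributor pass parameters: a ';'-separated list in which only
// "closed-world" is currently recognized.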
842Expected<AMDGPUAttributorOptions>
843parseAMDGPUAttributorPassOptions(StringRef Params) {
844 AMDGPUAttributorOptions Result;
845 while (!Params.empty()) {
846 StringRef ParamName;
847 std::tie(args&: ParamName, args&: Params) = Params.split(Separator: ';');
848 if (ParamName == "closed-world") {
849 Result.IsClosedWorld = true;
850 } else {
851 return make_error<StringError>(
852 Args: formatv(Fmt: "invalid AMDGPUAttributor pass parameter '{0}' ", Vals&: ParamName)
853 .str(),
854 Args: inconvertibleErrorCode());
855 }
856 }
857 return Result;
858}
859
860void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
861
862#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
863#include "llvm/Passes/TargetPassRegistry.inc"
864
865 PB.registerScalarOptimizerLateEPCallback(
866 C: [](FunctionPassManager &FPM, OptimizationLevel Level) {
867 if (Level == OptimizationLevel::O0)
868 return;
869
870 FPM.addPass(Pass: InferAddressSpacesPass());
871 });
872
873 PB.registerVectorizerEndEPCallback(
874 C: [](FunctionPassManager &FPM, OptimizationLevel Level) {
875 if (Level == OptimizationLevel::O0)
876 return;
877
878 FPM.addPass(Pass: InferAddressSpacesPass());
879 });
880
881 PB.registerPipelineEarlySimplificationEPCallback(
882 C: [](ModulePassManager &PM, OptimizationLevel Level,
883 ThinOrFullLTOPhase Phase) {
884 if (!isLTOPreLink(Phase)) {
885 // When we are not using -fgpu-rdc, we can run accelerator code
886 // selection relatively early, but still after linking to prevent
887 // eager removal of potentially reachable symbols.
888 if (EnableHipStdPar) {
889 PM.addPass(Pass: HipStdParMathFixupPass());
890 PM.addPass(Pass: HipStdParAcceleratorCodeSelectionPass());
891 }
892 PM.addPass(Pass: AMDGPUPrintfRuntimeBindingPass());
893 }
894
895 if (Level == OptimizationLevel::O0)
896 return;
897
        // We don't want to run internalization at the per-module stage.
899 if (InternalizeSymbols && !isLTOPreLink(Phase)) {
900 PM.addPass(Pass: InternalizePass(mustPreserveGV));
901 PM.addPass(Pass: GlobalDCEPass());
902 }
903
904 if (EarlyInlineAll && !EnableFunctionCalls)
905 PM.addPass(Pass: AMDGPUAlwaysInlinePass());
906 });
907
908 PB.registerPeepholeEPCallback(
909 C: [](FunctionPassManager &FPM, OptimizationLevel Level) {
910 if (Level == OptimizationLevel::O0)
911 return;
912
913 FPM.addPass(Pass: AMDGPUUseNativeCallsPass());
914 if (EnableLibCallSimplify)
915 FPM.addPass(Pass: AMDGPUSimplifyLibCallsPass());
916
917 if (EnableUniformIntrinsicCombine)
918 FPM.addPass(Pass: AMDGPUUniformIntrinsicCombinePass());
919 });
920
921 PB.registerCGSCCOptimizerLateEPCallback(
922 C: [this](CGSCCPassManager &PM, OptimizationLevel Level) {
923 if (Level == OptimizationLevel::O0)
924 return;
925
926 FunctionPassManager FPM;
927
928 // Add promote kernel arguments pass to the opt pipeline right before
929 // infer address spaces which is needed to do actual address space
930 // rewriting.
931 if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
932 EnablePromoteKernelArguments)
933 FPM.addPass(Pass: AMDGPUPromoteKernelArgumentsPass());
934
935 // Add infer address spaces pass to the opt pipeline after inlining
936 // but before SROA to increase SROA opportunities.
937 FPM.addPass(Pass: InferAddressSpacesPass());
938
939 // This should run after inlining to have any chance of doing
940 // anything, and before other cleanup optimizations.
941 FPM.addPass(Pass: AMDGPULowerKernelAttributesPass());
942
943 if (Level != OptimizationLevel::O0) {
944 // Promote alloca to vector before SROA and loop unroll. If we
945 // manage to eliminate allocas before unroll we may choose to unroll
946 // less.
947 FPM.addPass(Pass: AMDGPUPromoteAllocaToVectorPass(*this));
948 }
949
950 PM.addPass(Pass: createCGSCCToFunctionPassAdaptor(Pass: std::move(FPM)));
951 });
952
953 // FIXME: Why is AMDGPUAttributor not in CGSCC?
954 PB.registerOptimizerLastEPCallback(C: [this](ModulePassManager &MPM,
955 OptimizationLevel Level,
956 ThinOrFullLTOPhase Phase) {
957 if (Level != OptimizationLevel::O0) {
958 if (!isLTOPreLink(Phase)) {
959 if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) {
960 AMDGPUAttributorOptions Opts;
961 MPM.addPass(Pass: AMDGPUAttributorPass(*this, Opts, Phase));
962 }
963 }
964 }
965 });
966
967 PB.registerFullLinkTimeOptimizationLastEPCallback(
968 C: [this](ModulePassManager &PM, OptimizationLevel Level) {
        // When we are using -fgpu-rdc, we can only run accelerator code
        // selection after linking; otherwise we would end up removing
        // potentially reachable symbols that were exported as external in
        // other modules.
973 if (EnableHipStdPar) {
974 PM.addPass(Pass: HipStdParMathFixupPass());
975 PM.addPass(Pass: HipStdParAcceleratorCodeSelectionPass());
976 }
977 // We want to support the -lto-partitions=N option as "best effort".
978 // For that, we need to lower LDS earlier in the pipeline before the
979 // module is partitioned for codegen.
980 if (EnableLowerExecSync)
981 PM.addPass(Pass: AMDGPULowerExecSyncPass());
982 if (EnableSwLowerLDS)
983 PM.addPass(Pass: AMDGPUSwLowerLDSPass(*this));
984 if (EnableLowerModuleLDS)
985 PM.addPass(Pass: AMDGPULowerModuleLDSPass(*this));
986 if (Level != OptimizationLevel::O0) {
987 // We only want to run this with O2 or higher since inliner and SROA
988 // don't run in O1.
989 if (Level != OptimizationLevel::O1) {
990 PM.addPass(
991 Pass: createModuleToFunctionPassAdaptor(Pass: InferAddressSpacesPass()));
992 }
993 // Do we really need internalization in LTO?
994 if (InternalizeSymbols) {
995 PM.addPass(Pass: InternalizePass(mustPreserveGV));
996 PM.addPass(Pass: GlobalDCEPass());
997 }
998 if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) {
999 AMDGPUAttributorOptions Opt;
1000 if (HasClosedWorldAssumption)
1001 Opt.IsClosedWorld = true;
1002 PM.addPass(Pass: AMDGPUAttributorPass(
1003 *this, Opt, ThinOrFullLTOPhase::FullLTOPostLink));
1004 }
1005 }
1006 if (!NoKernelInfoEndLTO) {
1007 FunctionPassManager FPM;
1008 FPM.addPass(Pass: KernelInfoPrinter(this));
1009 PM.addPass(Pass: createModuleToFunctionPassAdaptor(Pass: std::move(FPM)));
1010 }
1011 });
1012
1013 PB.registerRegClassFilterParsingCallback(
1014 C: [](StringRef FilterName) -> RegAllocFilterFunc {
1015 if (FilterName == "sgpr")
1016 return onlyAllocateSGPRs;
1017 if (FilterName == "vgpr")
1018 return onlyAllocateVGPRs;
1019 if (FilterName == "wwm")
1020 return onlyAllocateWWMRegs;
1021 return nullptr;
1022 });
1023}
1024
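// Local, private and region pointers use all-ones (-1) as the null value;
// every other address space uses 0.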
1025int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
1026 return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1027 AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1028 AddrSpace == AMDGPUAS::REGION_ADDRESS)
1029 ? -1
1030 : 0;
1031}
1032
1033bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
1034 unsigned DestAS) const {
1035 return AMDGPU::isFlatGlobalAddrSpace(AS: SrcAS) &&
1036 AMDGPU::isFlatGlobalAddrSpace(AS: DestAS);
1037}
1038
1039unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
1040 if (auto *Arg = dyn_cast<Argument>(Val: V);
1041 Arg &&
1042 AMDGPU::isModuleEntryFunctionCC(CC: Arg->getParent()->getCallingConv()) &&
1043 !Arg->hasByRefAttr())
1044 return AMDGPUAS::GLOBAL_ADDRESS;
1045
1046 const auto *LD = dyn_cast<LoadInst>(Val: V);
1047 if (!LD) // TODO: Handle invariant load like constant.
1048 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
1049
  // The loaded value must be a generic (flat) pointer.
1051 assert(V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
1052
1053 const auto *Ptr = LD->getPointerOperand();
1054 if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
1055 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // A generic pointer loaded from constant memory can be assumed to be a
  // global pointer, since constant memory is only populated on the host side.
  // As implied by the offload programming model, only global pointers can be
  // referenced on the host side.
1060 return AMDGPUAS::GLOBAL_ADDRESS;
1061}
1062
1063std::pair<const Value *, unsigned>
1064AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
1065 if (auto *II = dyn_cast<IntrinsicInst>(Val: V)) {
1066 switch (II->getIntrinsicID()) {
1067 case Intrinsic::amdgcn_is_shared:
1068 return std::pair(II->getArgOperand(i: 0), AMDGPUAS::LOCAL_ADDRESS);
1069 case Intrinsic::amdgcn_is_private:
1070 return std::pair(II->getArgOperand(i: 0), AMDGPUAS::PRIVATE_ADDRESS);
1071 default:
1072 break;
1073 }
1074 return std::pair(nullptr, -1);
1075 }
  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that logical 'and' is commutative
  // and the order of 'is_shared' and 'is_private' is not significant.
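  // A typical shape of the matched IR (illustrative):
  //   %is.shared   = call i1 @llvm.amdgcn.is.shared(ptr %p)
  //   %is.private  = call i1 @llvm.amdgcn.is.private(ptr %p)
  //   %not.shared  = xor i1 %is.shared, true
  //   %not.private = xor i1 %is.private, true
  //   %is.global   = and i1 %not.shared, %not.private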
1079 Value *Ptr;
1080 if (match(
1081 V: const_cast<Value *>(V),
1082 P: m_c_And(L: m_Not(V: m_Intrinsic<Intrinsic::amdgcn_is_shared>(Op0: m_Value(V&: Ptr))),
1083 R: m_Not(V: m_Intrinsic<Intrinsic::amdgcn_is_private>(
1084 Op0: m_Deferred(V: Ptr))))))
1085 return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);
1086
1087 return std::pair(nullptr, -1);
1088}
1089
1090unsigned
1091AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
1092 switch (Kind) {
1093 case PseudoSourceValue::Stack:
1094 case PseudoSourceValue::FixedStack:
1095 return AMDGPUAS::PRIVATE_ADDRESS;
1096 case PseudoSourceValue::ConstantPool:
1097 case PseudoSourceValue::GOT:
1098 case PseudoSourceValue::JumpTable:
1099 case PseudoSourceValue::GlobalValueCallEntry:
1100 case PseudoSourceValue::ExternalSymbolCallEntry:
1101 return AMDGPUAS::CONSTANT_ADDRESS;
1102 }
1103 return AMDGPUAS::FLAT_ADDRESS;
1104}
1105
1106bool AMDGPUTargetMachine::splitModule(
1107 Module &M, unsigned NumParts,
1108 function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
1109 // FIXME(?): Would be better to use an already existing Analysis/PassManager,
1110 // but all current users of this API don't have one ready and would need to
1111 // create one anyway. Let's hide the boilerplate for now to keep it simple.
1112
1113 LoopAnalysisManager LAM;
1114 FunctionAnalysisManager FAM;
1115 CGSCCAnalysisManager CGAM;
1116 ModuleAnalysisManager MAM;
1117
1118 PassBuilder PB(this);
1119 PB.registerModuleAnalyses(MAM);
1120 PB.registerFunctionAnalyses(FAM);
1121 PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
1122
1123 ModulePassManager MPM;
1124 MPM.addPass(Pass: AMDGPUSplitModulePass(NumParts, ModuleCallback));
1125 MPM.run(IR&: M, AM&: MAM);
1126 return true;
1127}
1128
1129//===----------------------------------------------------------------------===//
1130// GCN Target Machine (SI+)
1131//===----------------------------------------------------------------------===//
1132
1133GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
1134 StringRef CPU, StringRef FS,
1135 const TargetOptions &Options,
1136 std::optional<Reloc::Model> RM,
1137 std::optional<CodeModel::Model> CM,
1138 CodeGenOptLevel OL, bool JIT)
1139 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
1140
1141const TargetSubtargetInfo *
1142GCNTargetMachine::getSubtargetImpl(const Function &F) const {
1143 StringRef GPU = getGPUName(F);
1144 StringRef FS = getFeatureString(F);
1145
1146 SmallString<128> SubtargetKey(GPU);
1147 SubtargetKey.append(RHS: FS);
1148
1149 auto &I = SubtargetMap[SubtargetKey];
1150 if (!I) {
1151 // This needs to be done before we create a new subtarget since any
1152 // creation will depend on the TM and the code generation flags on the
1153 // function that reside in TargetOptions.
1154 resetTargetOptions(F);
1155 I = std::make_unique<GCNSubtarget>(args: TargetTriple, args&: GPU, args&: FS, args: *this);
1156 }
1157
1158 I->setScalarizeGlobalBehavior(ScalarizeGlobal);
1159
1160 return I.get();
1161}
1162
1163TargetTransformInfo
1164GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
1165 return TargetTransformInfo(std::make_unique<GCNTTIImpl>(args: this, args: F));
1166}
1167
1168Error GCNTargetMachine::buildCodeGenPipeline(
1169 ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
1170 CodeGenFileType FileType, const CGPassBuilderOption &Opts,
1171 PassInstrumentationCallbacks *PIC) {
1172 AMDGPUCodeGenPassBuilder CGPB(*this, Opts, PIC);
1173 return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
1174}
1175
1176ScheduleDAGInstrs *
1177GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
1178 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1179 if (ST.enableSIScheduler())
1180 return createSIMachineScheduler(C);
1181
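  // A per-function "amdgpu-sched-strategy" attribute overrides the global
  // -amdgpu-sched-strategy command-line option.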
1182 Attribute SchedStrategyAttr =
1183 C->MF->getFunction().getFnAttribute(Kind: "amdgpu-sched-strategy");
1184 StringRef SchedStrategy = SchedStrategyAttr.isValid()
1185 ? SchedStrategyAttr.getValueAsString()
1186 : AMDGPUSchedStrategy;
1187
1188 if (SchedStrategy == "max-ilp")
1189 return createGCNMaxILPMachineScheduler(C);
1190
1191 if (SchedStrategy == "max-memory-clause")
1192 return createGCNMaxMemoryClauseMachineScheduler(C);
1193
1194 if (SchedStrategy == "iterative-ilp")
1195 return createIterativeILPMachineScheduler(C);
1196
1197 if (SchedStrategy == "iterative-minreg")
1198 return createMinRegScheduler(C);
1199
1200 if (SchedStrategy == "iterative-maxocc")
1201 return createIterativeGCNMaxOccupancyMachineScheduler(C);
1202
1203 return createGCNMaxOccupancyMachineScheduler(C);
1204}
1205
1206ScheduleDAGInstrs *
1207GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
1208 ScheduleDAGMI *DAG =
1209 new GCNPostScheduleDAGMILive(C, std::make_unique<PostGenericScheduler>(args&: C),
1210 /*RemoveKillFlags=*/true);
1211 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1212 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
1213 if (ST.shouldClusterStores())
1214 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
1215 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PostRA));
1216 if ((EnableVOPD.getNumOccurrences() ||
1217 getOptLevel() >= CodeGenOptLevel::Less) &&
1218 EnableVOPD)
1219 DAG->addMutation(Mutation: createVOPDPairingMutation());
1220 DAG->addMutation(Mutation: createAMDGPUExportClusteringDAGMutation());
1221 DAG->addMutation(Mutation: createAMDGPUBarrierLatencyDAGMutation(MF: C->MF));
1222 DAG->addMutation(Mutation: createAMDGPUHazardLatencyDAGMutation(MF: C->MF));
1223 return DAG;
1224}
1225//===----------------------------------------------------------------------===//
1226// AMDGPU Legacy Pass Setup
1227//===----------------------------------------------------------------------===//
1228
1229std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
1230 return getStandardCSEConfigForOpt(Level: TM->getOptLevel());
1231}
1232
1233namespace {
1234
1235class GCNPassConfig final : public AMDGPUPassConfig {
1236public:
1237 GCNPassConfig(TargetMachine &TM, PassManagerBase &PM)
1238 : AMDGPUPassConfig(TM, PM) {
1239 substitutePass(StandardID: &PostRASchedulerID, TargetID: &PostMachineSchedulerID);
1240 }
1241
1242 GCNTargetMachine &getGCNTargetMachine() const {
1243 return getTM<GCNTargetMachine>();
1244 }
1245
1246 bool addPreISel() override;
1247 void addMachineSSAOptimization() override;
1248 bool addILPOpts() override;
1249 bool addInstSelector() override;
1250 bool addIRTranslator() override;
1251 void addPreLegalizeMachineIR() override;
1252 bool addLegalizeMachineIR() override;
1253 void addPreRegBankSelect() override;
1254 bool addRegBankSelect() override;
1255 void addPreGlobalInstructionSelect() override;
1256 bool addGlobalInstructionSelect() override;
1257 void addPreRegAlloc() override;
1258 void addFastRegAlloc() override;
1259 void addOptimizedRegAlloc() override;
1260
1261 FunctionPass *createSGPRAllocPass(bool Optimized);
1262 FunctionPass *createVGPRAllocPass(bool Optimized);
1263 FunctionPass *createWWMRegAllocPass(bool Optimized);
1264 FunctionPass *createRegAllocPass(bool Optimized) override;
1265
1266 bool addRegAssignAndRewriteFast() override;
1267 bool addRegAssignAndRewriteOptimized() override;
1268
1269 bool addPreRewrite() override;
1270 void addPostRegAlloc() override;
1271 void addPreSched2() override;
1272 void addPreEmitPass() override;
1273 void addPostBBSections() override;
1274};
1275
1276} // end anonymous namespace
1277
1278AMDGPUPassConfig::AMDGPUPassConfig(TargetMachine &TM, PassManagerBase &PM)
1279 : TargetPassConfig(TM, PM) {
1280 // Exceptions and StackMaps are not supported, so these passes will never do
1281 // anything.
1282 disablePass(PassID: &StackMapLivenessID);
1283 disablePass(PassID: &FuncletLayoutID);
1284 // Garbage collection is not supported.
1285 disablePass(PassID: &GCLoweringID);
1286 disablePass(PassID: &ShadowStackGCLoweringID);
1287}
1288
1289void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
1290 if (getOptLevel() == CodeGenOptLevel::Aggressive)
1291 addPass(P: createGVNPass());
1292 else
1293 addPass(P: createEarlyCSEPass());
1294}
1295
1296void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
1297 if (isPassEnabled(Opt: EnableLoopPrefetch, Level: CodeGenOptLevel::Aggressive))
1298 addPass(P: createLoopDataPrefetchPass());
1299 addPass(P: createSeparateConstOffsetFromGEPPass());
1300 // ReassociateGEPs exposes more opportunities for SLSR. See
1301 // the example in reassociate-geps-and-slsr.ll.
1302 addPass(P: createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN
  // or EarlyCSE can reuse.
1305 addEarlyCSEOrGVNPass();
1306 // Run NaryReassociate after EarlyCSE/GVN to be more effective.
1307 addPass(P: createNaryReassociatePass());
1308 // NaryReassociate on GEPs creates redundant common expressions, so run
1309 // EarlyCSE after it.
1310 addPass(P: createEarlyCSEPass());
1311}
1312
1313void AMDGPUPassConfig::addIRPasses() {
1314 const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
1315
1316 if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN())
1317 addPass(P: createAMDGPURemoveIncompatibleFunctionsPass(&TM));
1318
1319 // There is no reason to run these.
1320 disablePass(PassID: &StackMapLivenessID);
1321 disablePass(PassID: &FuncletLayoutID);
1322 disablePass(PassID: &PatchableFunctionID);
1323
1324 addPass(P: createAMDGPUPrintfRuntimeBinding());
1325 if (LowerCtorDtor)
1326 addPass(P: createAMDGPUCtorDtorLoweringLegacyPass());
1327
1328 if (TM.getTargetTriple().isAMDGCN() &&
1329 isPassEnabled(Opt: EnableImageIntrinsicOptimizer))
1330 addPass(P: createAMDGPUImageIntrinsicOptimizerPass(&TM));
1331
1332 if (EnableUniformIntrinsicCombine)
1333 addPass(P: createAMDGPUUniformIntrinsicCombineLegacyPass());
1334
1335 // This can be disabled by passing ::Disable here or on the command line
1336 // with --expand-variadics-override=disable.
1337 addPass(P: createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
1338
1339 // Function calls are not supported, so make sure we inline everything.
1340 addPass(P: createAMDGPUAlwaysInlinePass());
1341 addPass(P: createAlwaysInlinerLegacyPass());
1342
1343 // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
1344 if (TM.getTargetTriple().getArch() == Triple::r600)
1345 addPass(P: createR600OpenCLImageTypeLoweringPass());
1346
1347 // Make enqueued block runtime handles externally visible.
1348 addPass(P: createAMDGPUExportKernelRuntimeHandlesLegacyPass());
1349
1350 // Lower special LDS accesses.
1351 if (EnableLowerExecSync)
1352 addPass(P: createAMDGPULowerExecSyncLegacyPass());
1353
1354 // Lower LDS accesses to global memory pass if address sanitizer is enabled.
1355 if (EnableSwLowerLDS)
1356 addPass(P: createAMDGPUSwLowerLDSLegacyPass(TM: &TM));
1357
1358 // Runs before PromoteAlloca so the latter can account for function uses
1359 if (EnableLowerModuleLDS) {
1360 addPass(P: createAMDGPULowerModuleLDSLegacyPass(TM: &TM));
1361 }
1362
1363 // Run atomic optimizer before Atomic Expand
1364 if ((TM.getTargetTriple().isAMDGCN()) &&
1365 (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
1366 (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
1367 addPass(P: createAMDGPUAtomicOptimizerPass(ScanStrategy: AMDGPUAtomicOptimizerStrategy));
1368 }
1369
1370 addPass(P: createAtomicExpandLegacyPass());
1371
1372 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1373 addPass(P: createAMDGPUPromoteAlloca());
1374
1375 if (isPassEnabled(Opt: EnableScalarIRPasses))
1376 addStraightLineScalarOptimizationPasses();
1377
1378 if (EnableAMDGPUAliasAnalysis) {
1379 addPass(P: createAMDGPUAAWrapperPass());
1380 addPass(P: createExternalAAWrapperPass(Callback: [](Pass &P, Function &,
1381 AAResults &AAR) {
1382 if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
1383 AAR.addAAResult(AAResult&: WrapperPass->getResult());
1384 }));
1385 }
1386
1387 if (TM.getTargetTriple().isAMDGCN()) {
1388 // TODO: May want to move later or split into an early and late one.
1389 addPass(P: createAMDGPUCodeGenPreparePass());
1390 }
1391
1392 // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
1393 // have expanded.
1394 if (TM.getOptLevel() > CodeGenOptLevel::Less)
1395 addPass(P: createLICMPass());
1396 }
1397
1398 TargetPassConfig::addIRPasses();
1399
1400 // EarlyCSE is not always strong enough to clean up what LSR produces. For
1401 // example, GVN can combine
1402 //
1403 // %0 = add %a, %b
1404 // %1 = add %b, %a
1405 //
1406 // and
1407 //
1408 // %0 = shl nsw %a, 2
1409 // %1 = shl %a, 2
1410 //
1411 // but EarlyCSE can do neither of them.
1412 if (isPassEnabled(Opt: EnableScalarIRPasses))
1413 addEarlyCSEOrGVNPass();
1414}
1415
1416void AMDGPUPassConfig::addCodeGenPrepare() {
1417 if (TM->getTargetTriple().isAMDGCN() &&
1418 TM->getOptLevel() > CodeGenOptLevel::None)
1419 addPass(P: createAMDGPUPreloadKernelArgumentsLegacyPass(TM));
1420
1421 if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments)
1422 addPass(P: createAMDGPULowerKernelArgumentsPass());
1423
1424 TargetPassConfig::addCodeGenPrepare();
1425
1426 if (isPassEnabled(Opt: EnableLoadStoreVectorizer))
1427 addPass(P: createLoadStoreVectorizerPass());
1428
1429 if (TM->getTargetTriple().isAMDGCN()) {
1430 // This lowering has been placed after codegenprepare to take advantage of
1431 // address mode matching (which is why it isn't put with the LDS lowerings).
1432 // It could be placed anywhere before uniformity annotations (an analysis
1433 // that it changes by splitting up fat pointers into their components)
1434 // but has been put before switch lowering and CFG flattening so that those
1435 // passes can run on the more optimized control flow this pass creates in
1436 // many cases.
1437 addPass(P: createAMDGPULowerBufferFatPointersPass());
1438 addPass(P: createAMDGPULowerIntrinsicsLegacyPass());
1439 }
1440
1441 // The LowerSwitch pass may introduce unreachable blocks that can
1442 // cause unexpected behavior for subsequent passes. Placing it
1443 // here works out well because those blocks get cleaned up by
1444 // UnreachableBlockElim, which is inserted next in the pass flow.
1445 addPass(P: createLowerSwitchPass());
1446}
1447
1448bool AMDGPUPassConfig::addPreISel() {
1449 if (TM->getOptLevel() > CodeGenOptLevel::None)
1450 addPass(P: createFlattenCFGPass());
1451 return false;
1452}
1453
1454bool AMDGPUPassConfig::addInstSelector() {
1455 addPass(P: createAMDGPUISelDag(TM&: getAMDGPUTargetMachine(), OptLevel: getOptLevel()));
1456 return false;
1457}
1458
1459bool AMDGPUPassConfig::addGCPasses() {
1460 // Do nothing. GC is not supported.
1461 return false;
1462}
1463
1464//===----------------------------------------------------------------------===//
1465// GCN Legacy Pass Setup
1466//===----------------------------------------------------------------------===//
1467
1468bool GCNPassConfig::addPreISel() {
1469 AMDGPUPassConfig::addPreISel();
1470
1471 if (TM->getOptLevel() > CodeGenOptLevel::None)
1472 addPass(P: createSinkingPass());
1473
1474 if (TM->getOptLevel() > CodeGenOptLevel::None)
1475 addPass(P: createAMDGPULateCodeGenPrepareLegacyPass());
1476
1477 // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
1478 // regions formed by them.
1479 addPass(PassID: &AMDGPUUnifyDivergentExitNodesID);
1480 addPass(P: createFixIrreduciblePass());
1481 addPass(P: createUnifyLoopExitsPass());
1482 addPass(P: createStructurizeCFGPass(SkipUniformRegions: false)); // true -> SkipUniformRegions
1483
1484 addPass(P: createAMDGPUAnnotateUniformValuesLegacy());
1485 addPass(P: createSIAnnotateControlFlowLegacyPass());
1486 // TODO: Move this right after structurizeCFG to avoid extra divergence
1487 // analysis. This depends on stopping SIAnnotateControlFlow from making
1488 // control flow modifications.
1489 addPass(P: createAMDGPURewriteUndefForPHILegacyPass());
1490
1491 // SDAG requires LCSSA; GlobalISel does not. Disable LCSSA for -global-isel
1492 // with -new-reg-bank-select and without any of the fallback options.
1493 if (!getCGPassBuilderOption().EnableGlobalISelOption ||
1494 !isGlobalISelAbortEnabled() || !NewRegBankSelect)
1495 addPass(P: createLCSSAPass());
1496
1497 if (TM->getOptLevel() > CodeGenOptLevel::Less)
1498 addPass(PassID: &AMDGPUPerfHintAnalysisLegacyID);
1499
1500 return false;
1501}
1502
1503void GCNPassConfig::addMachineSSAOptimization() {
1504 TargetPassConfig::addMachineSSAOptimization();
1505
1506 // We want to fold operands after PeepholeOptimizer has run (or as part of
1507 // it), because it will eliminate extra copies making it easier to fold the
1508 // real source operand. We want to eliminate dead instructions after, so that
1509 // we see fewer uses of the copies. We then need to clean up the dead
1510 // instructions leftover after the operands are folded as well.
1511 //
1512 // XXX - Can we get away without running DeadMachineInstructionElim again?
1513 addPass(PassID: &SIFoldOperandsLegacyID);
1514 if (EnableDPPCombine)
1515 addPass(PassID: &GCNDPPCombineLegacyID);
1516 addPass(PassID: &SILoadStoreOptimizerLegacyID);
1517 if (isPassEnabled(Opt: EnableSDWAPeephole)) {
1518 addPass(PassID: &SIPeepholeSDWALegacyID);
1519 addPass(PassID: &EarlyMachineLICMID);
1520 addPass(PassID: &MachineCSELegacyID);
1521 addPass(PassID: &SIFoldOperandsLegacyID);
1522 }
1523 addPass(PassID: &DeadMachineInstructionElimID);
1524 addPass(P: createSIShrinkInstructionsLegacyPass());
1525}
1526
1527bool GCNPassConfig::addILPOpts() {
1528 if (EnableEarlyIfConversion)
1529 addPass(PassID: &EarlyIfConverterLegacyID);
1530
1531 TargetPassConfig::addILPOpts();
1532 return false;
1533}
1534
1535bool GCNPassConfig::addInstSelector() {
1536 AMDGPUPassConfig::addInstSelector();
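  // Clean up illegal VGPR-to-SGPR copies introduced by instruction selection
  // and lower i1 copies into the target's lane-mask representation.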
1537 addPass(PassID: &SIFixSGPRCopiesLegacyID);
1538 addPass(P: createSILowerI1CopiesLegacyPass());
1539 return false;
1540}
1541
1542bool GCNPassConfig::addIRTranslator() {
1543 addPass(P: new IRTranslator(getOptLevel()));
1544 return false;
1545}
1546
1547void GCNPassConfig::addPreLegalizeMachineIR() {
1548 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1549 addPass(P: createAMDGPUPreLegalizeCombiner(IsOptNone));
1550 addPass(P: new Localizer());
1551}
1552
1553bool GCNPassConfig::addLegalizeMachineIR() {
1554 addPass(P: new Legalizer());
1555 return false;
1556}
1557
1558void GCNPassConfig::addPreRegBankSelect() {
1559 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1560 addPass(P: createAMDGPUPostLegalizeCombiner(IsOptNone));
1561 addPass(P: createAMDGPUGlobalISelDivergenceLoweringPass());
1562}
1563
1564bool GCNPassConfig::addRegBankSelect() {
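  // When -new-reg-bank-select is enabled, run the AMDGPU-specific register
  // bank selection and legalization passes; otherwise fall back to the generic
  // RegBankSelect pass.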
1565 if (NewRegBankSelect) {
1566 addPass(P: createAMDGPURegBankSelectPass());
1567 addPass(P: createAMDGPURegBankLegalizePass());
1568 } else {
1569 addPass(P: new RegBankSelect());
1570 }
1571 return false;
1572}
1573
1574void GCNPassConfig::addPreGlobalInstructionSelect() {
1575 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1576 addPass(P: createAMDGPURegBankCombiner(IsOptNone));
1577}
1578
1579bool GCNPassConfig::addGlobalInstructionSelect() {
1580 addPass(P: new InstructionSelect(getOptLevel()));
1581 return false;
1582}
1583
1584void GCNPassConfig::addFastRegAlloc() {
1585 // FIXME: We have to disable the verifier here because PHIElimination and
1586 // TwoAddressInstructions disable it.
1587
1588 // This must be run immediately after phi elimination and before
1589 // TwoAddressInstructions, otherwise the processing of the tied operand of
1590 // SI_ELSE will introduce a copy of the tied operand source after the else.
1591 insertPass(TargetPassID: &PHIEliminationID, InsertedPassID: &SILowerControlFlowLegacyID);
1592
1593 insertPass(TargetPassID: &TwoAddressInstructionPassID, InsertedPassID: &SIWholeQuadModeID);
1594
1595 TargetPassConfig::addFastRegAlloc();
1596}
1597
1598void GCNPassConfig::addPreRegAlloc() {
1599 if (getOptLevel() != CodeGenOptLevel::None)
1600 addPass(PassID: &AMDGPUPrepareAGPRAllocLegacyID);
1601}
1602
1603void GCNPassConfig::addOptimizedRegAlloc() {
1604 if (EnableDCEInRA)
1605 insertPass(TargetPassID: &DetectDeadLanesID, InsertedPassID: &DeadMachineInstructionElimID);
1606
1607 // FIXME: When an instruction inside a bundle has a killed operand, it seems
1608 // that only the BUNDLE instruction appears as the kill of the register in
1609 // LiveVariables. This triggers a verifier failure, so we should fix it and
1610 // then enable the verifier.
1611 if (OptVGPRLiveRange)
1612 insertPass(TargetPassID: &LiveVariablesID, InsertedPassID: &SIOptimizeVGPRLiveRangeLegacyID);
1613
1614 // This must be run immediately after phi elimination and before
1615 // TwoAddressInstructions, otherwise the processing of the tied operand of
1616 // SI_ELSE will introduce a copy of the tied operand source after the else.
1617 insertPass(TargetPassID: &PHIEliminationID, InsertedPassID: &SILowerControlFlowLegacyID);
1618
1619 if (EnableRewritePartialRegUses)
1620 insertPass(TargetPassID: &RenameIndependentSubregsID, InsertedPassID: &GCNRewritePartialRegUsesID);
1621
1622 if (isPassEnabled(Opt: EnablePreRAOptimizations))
1623 insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &GCNPreRAOptimizationsID);
1624
1625 // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
1626 // instructions that cause scheduling barriers.
1627 insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &SIWholeQuadModeID);
1628
1629 if (OptExecMaskPreRA)
1630 insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &SIOptimizeExecMaskingPreRAID);
1631
1632 // This is not an essential optimization and it has a noticeable impact on
1633 // compilation time, so we only enable it from O2.
1634 if (TM->getOptLevel() > CodeGenOptLevel::Less)
1635 insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &SIFormMemoryClausesID);
1636
1637 TargetPassConfig::addOptimizedRegAlloc();
1638}
1639
1640bool GCNPassConfig::addPreRewrite() {
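  // Passes added here run after VGPR allocation but before VirtRegRewriter
  // commits the register assignments (see addRegAssignAndRewriteOptimized).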
1641 if (EnableRegReassign)
1642 addPass(PassID: &GCNNSAReassignID);
1643
1644 addPass(PassID: &AMDGPURewriteAGPRCopyMFMALegacyID);
1645 return true;
1646}
1647
1648FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
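  // Honor an explicit -sgpr-regalloc override if one was given; otherwise use
  // the greedy allocator when optimizing and the fast allocator otherwise, in
  // both cases restricted to SGPRs via onlyAllocateSGPRs.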
1649 // Initialize the global default.
1650 llvm::call_once(flag&: InitializeDefaultSGPRRegisterAllocatorFlag,
1651 F&: initializeDefaultSGPRRegisterAllocatorOnce);
1652
1653 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
1654 if (Ctor != useDefaultRegisterAllocator)
1655 return Ctor();
1656
1657 if (Optimized)
1658 return createGreedyRegisterAllocator(F: onlyAllocateSGPRs);
1659
1660 return createFastRegisterAllocator(F: onlyAllocateSGPRs, ClearVirtRegs: false);
1661}
1662
1663FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
1664 // Initialize the global default.
1665 llvm::call_once(flag&: InitializeDefaultVGPRRegisterAllocatorFlag,
1666 F&: initializeDefaultVGPRRegisterAllocatorOnce);
1667
1668 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
1669 if (Ctor != useDefaultRegisterAllocator)
1670 return Ctor();
1671
1672 if (Optimized)
1673 return createGreedyVGPRRegisterAllocator();
1674
1675 return createFastVGPRRegisterAllocator();
1676}
1677
1678FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
1679 // Initialize the global default.
1680 llvm::call_once(flag&: InitializeDefaultWWMRegisterAllocatorFlag,
1681 F&: initializeDefaultWWMRegisterAllocatorOnce);
1682
1683 RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
1684 if (Ctor != useDefaultRegisterAllocator)
1685 return Ctor();
1686
1687 if (Optimized)
1688 return createGreedyWWMRegisterAllocator();
1689
1690 return createFastWWMRegisterAllocator();
1691}
1692
1693FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
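  // AMDGPU splits register allocation into separate SGPR, WWM and VGPR phases
  // (see addRegAssignAndRewriteFast/Optimized), so this generic entry point
  // should never be reached.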
1694 llvm_unreachable("should not be used");
1695}
1696
1697static const char RegAllocOptNotSupportedMessage[] =
1698 "-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, "
1699 "and -vgpr-regalloc";
1700
1701bool GCNPassConfig::addRegAssignAndRewriteFast() {
1702 if (!usingDefaultRegAlloc())
1703 reportFatalUsageError(reason: RegAllocOptNotSupportedMessage);
1704
1705 addPass(PassID: &GCNPreRALongBranchRegID);
1706
1707 addPass(P: createSGPRAllocPass(Optimized: false));
1708
1709 // Equivalent of PEI for SGPRs.
1710 addPass(PassID: &SILowerSGPRSpillsLegacyID);
1711
1712 // Allocate WWM registers used in whole quad mode operations (for shaders).
1713 addPass(PassID: &SIPreAllocateWWMRegsLegacyID);
1714
1715 // For allocating other wwm register operands.
1716 addPass(P: createWWMRegAllocPass(Optimized: false));
1717
1718 addPass(PassID: &SILowerWWMCopiesLegacyID);
1719 addPass(PassID: &AMDGPUReserveWWMRegsLegacyID);
1720
1721 // For allocating per-thread VGPRs.
1722 addPass(P: createVGPRAllocPass(Optimized: false));
1723
1724 return true;
1725}
1726
1727bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
1728 if (!usingDefaultRegAlloc())
1729 reportFatalUsageError(reason: RegAllocOptNotSupportedMessage);
1730
1731 addPass(PassID: &GCNPreRALongBranchRegID);
1732
1733 addPass(P: createSGPRAllocPass(Optimized: true));
1734
1735 // Commit allocated register changes. This is mostly necessary because too
1736 // many things rely on the use lists of the physical registers, such as the
1737 // verifier. This is only necessary with allocators which use LiveIntervals,
1738 // since FastRegAlloc does the replacements itself.
1739 addPass(P: createVirtRegRewriter(ClearVirtRegs: false));
1740
1741 // At this point SGPR allocation has been done, so run stack slot coloring to
1742 // try to optimize the SGPR spill stack indices before attempting the custom
1743 // SGPR spill lowering.
1744 addPass(PassID: &StackSlotColoringID);
1745
1746 // Equivalent of PEI for SGPRs.
1747 addPass(PassID: &SILowerSGPRSpillsLegacyID);
1748
1749 // Allocate WWM registers used in whole quad mode operations (for shaders).
1750 addPass(PassID: &SIPreAllocateWWMRegsLegacyID);
1751
1752 // For allocating other whole wave mode registers.
1753 addPass(P: createWWMRegAllocPass(Optimized: true));
1754 addPass(PassID: &SILowerWWMCopiesLegacyID);
1755 addPass(P: createVirtRegRewriter(ClearVirtRegs: false));
1756 addPass(PassID: &AMDGPUReserveWWMRegsLegacyID);
1757
1758 // For allocating per-thread VGPRs.
1759 addPass(P: createVGPRAllocPass(Optimized: true));
1760
1761 addPreRewrite();
1762 addPass(PassID: &VirtRegRewriterID);
1763
1764 addPass(PassID: &AMDGPUMarkLastScratchLoadID);
1765
1766 return true;
1767}
1768
1769void GCNPassConfig::addPostRegAlloc() {
1770 addPass(PassID: &SIFixVGPRCopiesID);
1771 if (getOptLevel() > CodeGenOptLevel::None)
1772 addPass(PassID: &SIOptimizeExecMaskingLegacyID);
1773 TargetPassConfig::addPostRegAlloc();
1774}
1775
1776void GCNPassConfig::addPreSched2() {
1777 if (TM->getOptLevel() > CodeGenOptLevel::None)
1778 addPass(P: createSIShrinkInstructionsLegacyPass());
1779 addPass(PassID: &SIPostRABundlerLegacyID);
1780}
1781
1782void GCNPassConfig::addPreEmitPass() {
1783 if (isPassEnabled(Opt: EnableVOPD, Level: CodeGenOptLevel::Less))
1784 addPass(PassID: &GCNCreateVOPDID);
1785 addPass(P: createSIMemoryLegalizerPass());
1786 addPass(P: createSIInsertWaitcntsPass());
1787
1788 addPass(P: createSIModeRegisterPass());
1789
1790 if (getOptLevel() > CodeGenOptLevel::None)
1791 addPass(PassID: &SIInsertHardClausesID);
1792
1793 addPass(PassID: &SILateBranchLoweringPassID);
1794 if (isPassEnabled(Opt: EnableSetWavePriority, Level: CodeGenOptLevel::Less))
1795 addPass(P: createAMDGPUSetWavePriorityPass());
1796 if (getOptLevel() > CodeGenOptLevel::None)
1797 addPass(PassID: &SIPreEmitPeepholeID);
1798 // The hazard recognizer that runs as part of the post-ra scheduler does not
1799 // guarantee to be able to handle all hazards correctly. This is because if
1800 // there are multiple scheduling regions in a basic block, the regions are
1801 // scheduled bottom up, so when we begin to schedule a region we don't know
1802 // what instructions were emitted directly before it.
1803 //
1804 // Here we add a stand-alone hazard recognizer pass which can handle all
1805 // cases.
1806 addPass(PassID: &PostRAHazardRecognizerID);
1807
1808 addPass(PassID: &AMDGPUWaitSGPRHazardsLegacyID);
1809
1810 addPass(PassID: &AMDGPULowerVGPREncodingLegacyID);
1811
1812 if (isPassEnabled(Opt: EnableInsertDelayAlu, Level: CodeGenOptLevel::Less))
1813 addPass(PassID: &AMDGPUInsertDelayAluID);
1814
1815 addPass(PassID: &BranchRelaxationPassID);
1816}
1817
1818void GCNPassConfig::addPostBBSections() {
1819 // We run this later to avoid passes like livedebugvalues and BBSections
1820 // having to deal with the apparent multi-entry functions we may generate.
1821 addPass(P: createAMDGPUPreloadKernArgPrologLegacyPass());
1822}
1823
1824TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
1825 return new GCNPassConfig(*this, PM);
1826}
1827
1828void GCNTargetMachine::registerMachineRegisterInfoCallback(
1829 MachineFunction &MF) const {
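  // Register the SI machine function info as a MachineRegisterInfo delegate so
  // it is notified about virtual registers created in this function.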
1830 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1831 MF.getRegInfo().addDelegate(delegate: MFI);
1832}
1833
1834MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
1835 BumpPtrAllocator &Allocator, const Function &F,
1836 const TargetSubtargetInfo *STI) const {
1837 return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
1838 Allocator, F, STI: static_cast<const GCNSubtarget *>(STI));
1839}
1840
1841yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
1842 return new yaml::SIMachineFunctionInfo();
1843}
1844
1845yaml::MachineFunctionInfo *
1846GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
1847 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1848 return new yaml::SIMachineFunctionInfo(
1849 *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
1850}
1851
1852bool GCNTargetMachine::parseMachineFunctionInfo(
1853 const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
1854 SMDiagnostic &Error, SMRange &SourceRange) const {
1855 const yaml::SIMachineFunctionInfo &YamlMFI =
1856 static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
1857 MachineFunction &MF = PFS.MF;
1858 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1859 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1860
1861 if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
1862 return true;
1863
1864 if (MFI->Occupancy == 0) {
1865 // Fix up the subtarget-dependent default value.
1866 MFI->Occupancy = ST.getOccupancyWithWorkGroupSizes(MF).second;
1867 }
1868
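  // Parse a named register reference from the YAML string, pointing the
  // diagnostic at the register name on failure.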
1869 auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
1870 Register TempReg;
1871 if (parseNamedRegisterReference(PFS, Reg&: TempReg, Src: RegName.Value, Error)) {
1872 SourceRange = RegName.SourceRange;
1873 return true;
1874 }
1875 RegVal = TempReg;
1876
1877 return false;
1878 };
1879
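  // An empty string means the field was not specified in the YAML; treat it as
  // absent rather than as a parse error.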
1880 auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
1881 Register &RegVal) {
1882 return !RegName.Value.empty() && parseRegister(RegName, RegVal);
1883 };
1884
1885 if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
1886 return true;
1887
1888 if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
1889 return true;
1890
1891 if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
1892 MFI->LongBranchReservedReg))
1893 return true;
1894
1895 auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
1896 // Create a diagnostic for the register string literal.
1897 const MemoryBuffer &Buffer =
1898 *PFS.SM->getMemoryBuffer(i: PFS.SM->getMainFileID());
1899 Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
1900 RegName.Value.size(), SourceMgr::DK_Error,
1901 "incorrect register class for field", RegName.Value,
1902 {}, {});
1903 SourceRange = RegName.SourceRange;
1904 return true;
1905 };
1906
1907 if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
1908 parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
1909 parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
1910 return true;
1911
1912 if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
1913 !AMDGPU::SGPR_128RegClass.contains(Reg: MFI->ScratchRSrcReg)) {
1914 return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
1915 }
1916
1917 if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
1918 !AMDGPU::SGPR_32RegClass.contains(Reg: MFI->FrameOffsetReg)) {
1919 return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
1920 }
1921
1922 if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
1923 !AMDGPU::SGPR_32RegClass.contains(Reg: MFI->StackPtrOffsetReg)) {
1924 return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
1925 }
1926
1927 for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
1928 Register ParsedReg;
1929 if (parseRegister(YamlReg, ParsedReg))
1930 return true;
1931
1932 MFI->reserveWWMRegister(Reg: ParsedReg);
1933 }
1934
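  // Propagate the per-virtual-register flags parsed from the MIR onto the
  // machine function info.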
1935 for (const auto &[_, Info] : PFS.VRegInfosNamed) {
1936 MFI->setFlag(Reg: Info->VReg, Flag: Info->Flags);
1937 }
1938 for (const auto &[_, Info] : PFS.VRegInfos) {
1939 MFI->setFlag(Reg: Info->VReg, Flag: Info->Flags);
1940 }
1941
1942 for (const auto &YamlRegStr : YamlMFI.SpillPhysVGPRS) {
1943 Register ParsedReg;
1944 if (parseRegister(YamlRegStr, ParsedReg))
1945 return true;
1946 MFI->SpillPhysVGPRs.push_back(Elt: ParsedReg);
1947 }
1948
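  // Parse a single SIArgument: validate the register class for register
  // arguments, apply the optional mask, and account for the user/system SGPRs
  // the argument consumes.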
1949 auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
1950 const TargetRegisterClass &RC,
1951 ArgDescriptor &Arg, unsigned UserSGPRs,
1952 unsigned SystemSGPRs) {
1953 // Skip parsing if it's not present.
1954 if (!A)
1955 return false;
1956
1957 if (A->IsRegister) {
1958 Register Reg;
1959 if (parseNamedRegisterReference(PFS, Reg, Src: A->RegisterName.Value, Error)) {
1960 SourceRange = A->RegisterName.SourceRange;
1961 return true;
1962 }
1963 if (!RC.contains(Reg))
1964 return diagnoseRegisterClass(A->RegisterName);
1965 Arg = ArgDescriptor::createRegister(Reg);
1966 } else
1967 Arg = ArgDescriptor::createStack(Offset: A->StackOffset);
1968 // Check and apply the optional mask.
1969 if (A->Mask)
1970 Arg = ArgDescriptor::createArg(Arg, Mask: *A->Mask);
1971
1972 MFI->NumUserSGPRs += UserSGPRs;
1973 MFI->NumSystemSGPRs += SystemSGPRs;
1974 return false;
1975 };
1976
1977 if (YamlMFI.ArgInfo &&
1978 (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
1979 AMDGPU::SGPR_128RegClass,
1980 MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
1981 parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
1982 AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
1983 2, 0) ||
1984 parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
1985 MFI->ArgInfo.QueuePtr, 2, 0) ||
1986 parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
1987 AMDGPU::SReg_64RegClass,
1988 MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
1989 parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
1990 AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
1991 2, 0) ||
1992 parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
1993 AMDGPU::SReg_64RegClass,
1994 MFI->ArgInfo.FlatScratchInit, 2, 0) ||
1995 parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
1996 AMDGPU::SGPR_32RegClass,
1997 MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
1998 parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
1999 AMDGPU::SGPR_32RegClass,
2000 MFI->ArgInfo.LDSKernelId, 0, 1) ||
2001 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
2002 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
2003 0, 1) ||
2004 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
2005 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
2006 0, 1) ||
2007 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
2008 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
2009 0, 1) ||
2010 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
2011 AMDGPU::SGPR_32RegClass,
2012 MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
2013 parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
2014 AMDGPU::SGPR_32RegClass,
2015 MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
2016 parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
2017 AMDGPU::SReg_64RegClass,
2018 MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
2019 parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
2020 AMDGPU::SReg_64RegClass,
2021 MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
2022 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
2023 AMDGPU::VGPR_32RegClass,
2024 MFI->ArgInfo.WorkItemIDX, 0, 0) ||
2025 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
2026 AMDGPU::VGPR_32RegClass,
2027 MFI->ArgInfo.WorkItemIDY, 0, 0) ||
2028 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
2029 AMDGPU::VGPR_32RegClass,
2030 MFI->ArgInfo.WorkItemIDZ, 0, 0)))
2031 return true;
2032
2033 // Parse FirstKernArgPreloadReg separately, since it's a Register,
2034 // not an ArgDescriptor.
2035 if (YamlMFI.ArgInfo && YamlMFI.ArgInfo->FirstKernArgPreloadReg) {
2036 const yaml::SIArgument &A = *YamlMFI.ArgInfo->FirstKernArgPreloadReg;
2037
2038 if (!A.IsRegister) {
2039 // For stack arguments, we don't have RegisterName.SourceRange,
2040 // but we should have some location info from the YAML parser
2041 const MemoryBuffer &Buffer =
2042 *PFS.SM->getMemoryBuffer(i: PFS.SM->getMainFileID());
2043 // Create a minimal valid source range
2044 SMLoc Loc = SMLoc::getFromPointer(Ptr: Buffer.getBufferStart());
2045 SMRange Range(Loc, Loc);
2046
2047 Error = SMDiagnostic(
2048 *PFS.SM, Loc, Buffer.getBufferIdentifier(), 1, 0, SourceMgr::DK_Error,
2049 "firstKernArgPreloadReg must be a register, not a stack location", "",
2050 {}, {});
2051
2052 SourceRange = Range;
2053 return true;
2054 }
2055
2056 Register Reg;
2057 if (parseNamedRegisterReference(PFS, Reg, Src: A.RegisterName.Value, Error)) {
2058 SourceRange = A.RegisterName.SourceRange;
2059 return true;
2060 }
2061
2062 if (!AMDGPU::SGPR_32RegClass.contains(Reg))
2063 return diagnoseRegisterClass(A.RegisterName);
2064
2065 MFI->ArgInfo.FirstKernArgPreloadReg = Reg;
2066 MFI->NumUserSGPRs += YamlMFI.NumKernargPreloadSGPRs;
2067 }
2068
2069 if (ST.hasIEEEMode())
2070 MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
2071 if (ST.hasDX10ClampMode())
2072 MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
2073
2074 // FIXME: Move proper support for denormal-fp-math into base MachineFunction
2075 MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
2076 ? DenormalMode::IEEE
2077 : DenormalMode::PreserveSign;
2078 MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
2079 ? DenormalMode::IEEE
2080 : DenormalMode::PreserveSign;
2081
2082 MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
2083 ? DenormalMode::IEEE
2084 : DenormalMode::PreserveSign;
2085 MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
2086 ? DenormalMode::IEEE
2087 : DenormalMode::PreserveSign;
2088
2089 if (YamlMFI.HasInitWholeWave)
2090 MFI->setInitWholeWave();
2091
2092 return false;
2093}
2094
2095//===----------------------------------------------------------------------===//
2096// AMDGPU CodeGen Pass Builder interface.
2097//===----------------------------------------------------------------------===//
2098
2099AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder(
2100 GCNTargetMachine &TM, const CGPassBuilderOption &Opts,
2101 PassInstrumentationCallbacks *PIC)
2102 : CodeGenPassBuilder(TM, Opts, PIC) {
2103 Opt.MISchedPostRA = true;
2104 Opt.RequiresCodeGenSCCOrder = true;
2105 // Exceptions and StackMaps are not supported, so these passes will never do
2106 // anything.
2107 // Garbage collection is not supported.
2108 disablePass<StackMapLivenessPass, FuncletLayoutPass, PatchableFunctionPass,
2109 ShadowStackGCLoweringPass, GCLoweringPass>();
2110}
2111
2112void AMDGPUCodeGenPassBuilder::addIRPasses(PassManagerWrapper &PMW) const {
2113 if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN()) {
2114 flushFPMsToMPM(PMW);
2115 addModulePass(Pass: AMDGPURemoveIncompatibleFunctionsPass(TM), PMW);
2116 }
2117
2118 flushFPMsToMPM(PMW);
2119 addModulePass(Pass: AMDGPUPrintfRuntimeBindingPass(), PMW);
2120 if (LowerCtorDtor)
2121 addModulePass(Pass: AMDGPUCtorDtorLoweringPass(), PMW);
2122
2123 if (isPassEnabled(Opt: EnableImageIntrinsicOptimizer))
2124 addFunctionPass(Pass: AMDGPUImageIntrinsicOptimizerPass(TM), PMW);
2125
2126 if (EnableUniformIntrinsicCombine)
2127 addFunctionPass(Pass: AMDGPUUniformIntrinsicCombinePass(), PMW);
2128 // This can be disabled by passing ::Disable here or on the command line
2129 // with --expand-variadics-override=disable.
2130 flushFPMsToMPM(PMW);
2131 addModulePass(Pass: ExpandVariadicsPass(ExpandVariadicsMode::Lowering), PMW);
2132
2133 addModulePass(Pass: AMDGPUAlwaysInlinePass(), PMW);
2134 addModulePass(Pass: AlwaysInlinerPass(), PMW);
2135
2136 addModulePass(Pass: AMDGPUExportKernelRuntimeHandlesPass(), PMW);
2137
2138 if (EnableLowerExecSync)
2139 addModulePass(Pass: AMDGPULowerExecSyncPass(), PMW);
2140
2141 if (EnableSwLowerLDS)
2142 addModulePass(Pass: AMDGPUSwLowerLDSPass(TM), PMW);
2143
2144 // Runs before PromoteAlloca so the latter can account for function uses
2145 if (EnableLowerModuleLDS)
2146 addModulePass(Pass: AMDGPULowerModuleLDSPass(TM), PMW);
2147
2148 // Run atomic optimizer before Atomic Expand
2149 if (TM.getOptLevel() >= CodeGenOptLevel::Less &&
2150 (AMDGPUAtomicOptimizerStrategy != ScanOptions::None))
2151 addFunctionPass(
2152 Pass: AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy), PMW);
2153
2154 addFunctionPass(Pass: AtomicExpandPass(TM), PMW);
2155
2156 if (TM.getOptLevel() > CodeGenOptLevel::None) {
2157 addFunctionPass(Pass: AMDGPUPromoteAllocaPass(TM), PMW);
2158 if (isPassEnabled(Opt: EnableScalarIRPasses))
2159 addStraightLineScalarOptimizationPasses(PMW);
2160
2161 // TODO: Handle EnableAMDGPUAliasAnalysis
2162
2163 // TODO: May want to move later or split into an early and late one.
2164 addFunctionPass(Pass: AMDGPUCodeGenPreparePass(TM), PMW);
2165
2166 // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
2167 // have expanded.
2168 if (TM.getOptLevel() > CodeGenOptLevel::Less) {
2169 addFunctionPass(Pass: createFunctionToLoopPassAdaptor(Pass: LICMPass(LICMOptions()),
2170 /*UseMemorySSA=*/true),
2171 PMW);
2172 }
2173 }
2174
2175 Base::addIRPasses(PMW);
2176
2177 // EarlyCSE is not always strong enough to clean up what LSR produces. For
2178 // example, GVN can combine
2179 //
2180 // %0 = add %a, %b
2181 // %1 = add %b, %a
2182 //
2183 // and
2184 //
2185 // %0 = shl nsw %a, 2
2186 // %1 = shl %a, 2
2187 //
2188 // but EarlyCSE can do neither of them.
2189 if (isPassEnabled(Opt: EnableScalarIRPasses))
2190 addEarlyCSEOrGVNPass(PMW);
2191}
2192
2193void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(
2194 PassManagerWrapper &PMW) const {
2195 if (TM.getOptLevel() > CodeGenOptLevel::None) {
2196 flushFPMsToMPM(PMW);
2197 addModulePass(Pass: AMDGPUPreloadKernelArgumentsPass(TM), PMW);
2198 }
2199
2200 if (EnableLowerKernelArguments)
2201 addFunctionPass(Pass: AMDGPULowerKernelArgumentsPass(TM), PMW);
2202
2203 Base::addCodeGenPrepare(PMW);
2204
2205 if (isPassEnabled(Opt: EnableLoadStoreVectorizer))
2206 addFunctionPass(Pass: LoadStoreVectorizerPass(), PMW);
2207
2208 // This lowering has been placed after codegenprepare to take advantage of
2209 // address mode matching (which is why it isn't put with the LDS lowerings).
2210 // It could be placed anywhere before uniformity annotations (an analysis
2211 // that it changes by splitting up fat pointers into their components)
2212 // but has been put before switch lowering and CFG flattening so that those
2213 // passes can run on the more optimized control flow this pass creates in
2214 // many cases.
2215 flushFPMsToMPM(PMW);
2216 addModulePass(Pass: AMDGPULowerBufferFatPointersPass(TM), PMW);
2217 flushFPMsToMPM(PMW);
2218 requireCGSCCOrder(PMW);
2219
2220 addModulePass(Pass: AMDGPULowerIntrinsicsPass(TM), PMW);
2221
2222 // The LowerSwitch pass may introduce unreachable blocks that can cause
2223 // unexpected behavior for subsequent passes. Placing it here works out well
2224 // because those blocks get cleaned up by UnreachableBlockElim, which is
2225 // inserted next in the pass flow.
2226 addFunctionPass(Pass: LowerSwitchPass(), PMW);
2227}
2228
2229void AMDGPUCodeGenPassBuilder::addPreISel(PassManagerWrapper &PMW) const {
2230
2231 // Require AMDGPUArgumentUsageAnalysis so that it's available during ISel.
2232 flushFPMsToMPM(PMW);
2233 addModulePass(Pass: RequireAnalysisPass<AMDGPUArgumentUsageAnalysis, Module>(),
2234 PMW);
2235
2236 if (TM.getOptLevel() > CodeGenOptLevel::None) {
2237 addFunctionPass(Pass: FlattenCFGPass(), PMW);
2238 addFunctionPass(Pass: SinkingPass(), PMW);
2239 addFunctionPass(Pass: AMDGPULateCodeGenPreparePass(TM), PMW);
2240 }
2241
2242 // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
2243 // regions formed by them.
2244
2245 addFunctionPass(Pass: AMDGPUUnifyDivergentExitNodesPass(), PMW);
2246 addFunctionPass(Pass: FixIrreduciblePass(), PMW);
2247 addFunctionPass(Pass: UnifyLoopExitsPass(), PMW);
2248 addFunctionPass(Pass: StructurizeCFGPass(/*SkipUniformRegions=*/false), PMW);
2249
2250 addFunctionPass(Pass: AMDGPUAnnotateUniformValuesPass(), PMW);
2251
2252 addFunctionPass(Pass: SIAnnotateControlFlowPass(TM), PMW);
2253
2254 // TODO: Move this right after structurizeCFG to avoid extra divergence
2255 // analysis. This depends on stopping SIAnnotateControlFlow from making
2256 // control flow modifications.
2257 addFunctionPass(Pass: AMDGPURewriteUndefForPHIPass(), PMW);
2258
2259 if (!getCGPassBuilderOption().EnableGlobalISelOption ||
2260 !isGlobalISelAbortEnabled() || !NewRegBankSelect)
2261 addFunctionPass(Pass: LCSSAPass(), PMW);
2262
2263 if (TM.getOptLevel() > CodeGenOptLevel::Less) {
2264 flushFPMsToMPM(PMW);
2265 addModulePass(Pass: AMDGPUPerfHintAnalysisPass(TM), PMW);
2266 }
2267
2268 // FIXME: Why isn't this queried as required from AMDGPUISelDAGToDAG, and why
2269 // isn't this in addInstSelector?
2270 addFunctionPass(Pass: RequireAnalysisPass<UniformityInfoAnalysis, Function>(), PMW,
2271 /*Force=*/true);
2272}
2273
2274void AMDGPUCodeGenPassBuilder::addILPOpts(PassManagerWrapper &PMW) const {
2275 if (EnableEarlyIfConversion)
2276 addMachineFunctionPass(Pass: EarlyIfConverterPass(), PMW);
2277
2278 Base::addILPOpts(PMW);
2279}
2280
2281void AMDGPUCodeGenPassBuilder::addAsmPrinter(PassManagerWrapper &PMW,
2282 CreateMCStreamer) const {
2283 // TODO: Add AsmPrinter.
2284}
2285
2286Error AMDGPUCodeGenPassBuilder::addInstSelector(PassManagerWrapper &PMW) const {
2287 addMachineFunctionPass(Pass: AMDGPUISelDAGToDAGPass(TM), PMW);
2288 addMachineFunctionPass(Pass: SIFixSGPRCopiesPass(), PMW);
2289 addMachineFunctionPass(Pass: SILowerI1CopiesPass(), PMW);
2290 return Error::success();
2291}
2292
2293void AMDGPUCodeGenPassBuilder::addPreRewrite(PassManagerWrapper &PMW) const {
2294 if (EnableRegReassign) {
2295 addMachineFunctionPass(Pass: GCNNSAReassignPass(), PMW);
2296 }
2297
2298 addMachineFunctionPass(Pass: AMDGPURewriteAGPRCopyMFMAPass(), PMW);
2299}
2300
2301void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization(
2302 PassManagerWrapper &PMW) const {
2303 Base::addMachineSSAOptimization(PMW);
2304
2305 addMachineFunctionPass(Pass: SIFoldOperandsPass(), PMW);
2306 if (EnableDPPCombine) {
2307 addMachineFunctionPass(Pass: GCNDPPCombinePass(), PMW);
2308 }
2309 addMachineFunctionPass(Pass: SILoadStoreOptimizerPass(), PMW);
2310 if (isPassEnabled(Opt: EnableSDWAPeephole)) {
2311 addMachineFunctionPass(Pass: SIPeepholeSDWAPass(), PMW);
2312 addMachineFunctionPass(Pass: EarlyMachineLICMPass(), PMW);
2313 addMachineFunctionPass(Pass: MachineCSEPass(), PMW);
2314 addMachineFunctionPass(Pass: SIFoldOperandsPass(), PMW);
2315 }
2316 addMachineFunctionPass(Pass: DeadMachineInstructionElimPass(), PMW);
2317 addMachineFunctionPass(Pass: SIShrinkInstructionsPass(), PMW);
2318}
2319
2320Error AMDGPUCodeGenPassBuilder::addFastRegAlloc(PassManagerWrapper &PMW) const {
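  // As in the legacy pipeline, SILowerControlFlow must run immediately after
  // phi elimination and before TwoAddressInstructions, otherwise the processing
  // of the tied operand of SI_ELSE will introduce a copy of the tied operand
  // source after the else.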
2321 insertPass<PHIEliminationPass>(Pass: SILowerControlFlowPass());
2322
2323 insertPass<TwoAddressInstructionPass>(Pass: SIWholeQuadModePass());
2324
2325 return Base::addFastRegAlloc(PMW);
2326}
2327
2328Error AMDGPUCodeGenPassBuilder::addRegAssignmentFast(
2329 PassManagerWrapper &PMW) const {
2330 // TODO: handle default regalloc override error (with regalloc-npm)
2331
2332 addMachineFunctionPass(Pass: GCNPreRALongBranchRegPass(), PMW);
2333
2334 addMachineFunctionPass(Pass: RegAllocFastPass({onlyAllocateSGPRs, "sgpr", false}),
2335 PMW);
2336
2337 // Equivalent of PEI for SGPRs.
2338 addMachineFunctionPass(Pass: SILowerSGPRSpillsPass(), PMW);
2339
2340 // Allocate WWM registers used in whole quad mode operations (for shaders).
2341 addMachineFunctionPass(Pass: SIPreAllocateWWMRegsPass(), PMW);
2342
2343 // For allocating other wwm register operands.
2344 addMachineFunctionPass(Pass: RegAllocFastPass({onlyAllocateWWMRegs, "wwm", false}),
2345 PMW);
2346
2347 addMachineFunctionPass(Pass: SILowerWWMCopiesPass(), PMW);
2348 addMachineFunctionPass(Pass: AMDGPUReserveWWMRegsPass(), PMW);
2349
2350 // For allocating per-thread VGPRs.
2351 addMachineFunctionPass(Pass: RegAllocFastPass({onlyAllocateVGPRs, "vgpr"}), PMW);
2352
2353 return Error::success();
2354}
2355
2356void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
2357 PassManagerWrapper &PMW) const {
2358 if (EnableDCEInRA)
2359 insertPass<DetectDeadLanesPass>(Pass: DeadMachineInstructionElimPass());
2360
2361 // FIXME: When an instruction inside a bundle has a killed operand, it seems
2362 // that only the BUNDLE instruction appears as the kill of the register in
2363 // LiveVariables. This triggers a verifier failure, so we should fix it and
2364 // then enable the verifier.
2365 if (OptVGPRLiveRange)
2366 insertPass<RequireAnalysisPass<LiveVariablesAnalysis, MachineFunction>>(
2367 Pass: SIOptimizeVGPRLiveRangePass());
2368
2369 // This must be run immediately after phi elimination and before
2370 // TwoAddressInstructions, otherwise the processing of the tied operand of
2371 // SI_ELSE will introduce a copy of the tied operand source after the else.
2372 insertPass<PHIEliminationPass>(Pass: SILowerControlFlowPass());
2373
2374 if (EnableRewritePartialRegUses)
2375 insertPass<RenameIndependentSubregsPass>(Pass: GCNRewritePartialRegUsesPass());
2376
2377 if (isPassEnabled(Opt: EnablePreRAOptimizations))
2378 insertPass<MachineSchedulerPass>(Pass: GCNPreRAOptimizationsPass());
2379
2380 // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
2381 // instructions that cause scheduling barriers.
2382 insertPass<MachineSchedulerPass>(Pass: SIWholeQuadModePass());
2383
2384 if (OptExecMaskPreRA)
2385 insertPass<MachineSchedulerPass>(Pass: SIOptimizeExecMaskingPreRAPass());
2386
2387 // This is not an essential optimization and it has a noticeable impact on
2388 // compilation time, so we only enable it from O2.
2389 if (TM.getOptLevel() > CodeGenOptLevel::Less)
2390 insertPass<MachineSchedulerPass>(Pass: SIFormMemoryClausesPass());
2391
2392 Base::addOptimizedRegAlloc(PMW);
2393}
2394
2395void AMDGPUCodeGenPassBuilder::addPreRegAlloc(PassManagerWrapper &PMW) const {
2396 if (getOptLevel() != CodeGenOptLevel::None)
2397 addMachineFunctionPass(Pass: AMDGPUPrepareAGPRAllocPass(), PMW);
2398}
2399
2400Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
2401 PassManagerWrapper &PMW) const {
2402 // TODO: Check --regalloc-npm option
2403
2404 addMachineFunctionPass(Pass: GCNPreRALongBranchRegPass(), PMW);
2405
2406 addMachineFunctionPass(Pass: RAGreedyPass({onlyAllocateSGPRs, "sgpr"}), PMW);
2407
2408 // Commit allocated register changes. This is mostly necessary because too
2409 // many things rely on the use lists of the physical registers, such as the
2410 // verifier. This is only necessary with allocators which use LiveIntervals,
2411 // since FastRegAlloc does the replacements itself.
2412 addMachineFunctionPass(Pass: VirtRegRewriterPass(false), PMW);
2413
2414 // At this point SGPR allocation has been done, so run stack slot coloring to
2415 // try to optimize the SGPR spill stack indices before attempting the custom
2416 // SGPR spill lowering.
2417 addMachineFunctionPass(Pass: StackSlotColoringPass(), PMW);
2418
2419 // Equivalent of PEI for SGPRs.
2420 addMachineFunctionPass(Pass: SILowerSGPRSpillsPass(), PMW);
2421
2422 // Allocate WWM registers used in whole quad mode operations (for shaders).
2423 addMachineFunctionPass(Pass: SIPreAllocateWWMRegsPass(), PMW);
2424
2425 // For allocating other wwm register operands.
2426 addMachineFunctionPass(Pass: RAGreedyPass({onlyAllocateWWMRegs, "wwm"}), PMW);
2427 addMachineFunctionPass(Pass: SILowerWWMCopiesPass(), PMW);
2428 addMachineFunctionPass(Pass: VirtRegRewriterPass(false), PMW);
2429 addMachineFunctionPass(Pass: AMDGPUReserveWWMRegsPass(), PMW);
2430
2431 // For allocating per-thread VGPRs.
2432 addMachineFunctionPass(Pass: RAGreedyPass({onlyAllocateVGPRs, "vgpr"}), PMW);
2433
2434 addPreRewrite(PMW);
2435 addMachineFunctionPass(Pass: VirtRegRewriterPass(true), PMW);
2436
2437 addMachineFunctionPass(Pass: AMDGPUMarkLastScratchLoadPass(), PMW);
2438 return Error::success();
2439}
2440
2441void AMDGPUCodeGenPassBuilder::addPostRegAlloc(PassManagerWrapper &PMW) const {
2442 addMachineFunctionPass(Pass: SIFixVGPRCopiesPass(), PMW);
2443 if (TM.getOptLevel() > CodeGenOptLevel::None)
2444 addMachineFunctionPass(Pass: SIOptimizeExecMaskingPass(), PMW);
2445 Base::addPostRegAlloc(PMW);
2446}
2447
2448void AMDGPUCodeGenPassBuilder::addPreSched2(PassManagerWrapper &PMW) const {
2449 if (TM.getOptLevel() > CodeGenOptLevel::None)
2450 addMachineFunctionPass(Pass: SIShrinkInstructionsPass(), PMW);
2451 addMachineFunctionPass(Pass: SIPostRABundlerPass(), PMW);
2452}
2453
2454void AMDGPUCodeGenPassBuilder::addPostBBSections(
2455 PassManagerWrapper &PMW) const {
2456 // We run this later to avoid passes like livedebugvalues and BBSections
2457 // having to deal with the apparent multi-entry functions we may generate.
2458 addMachineFunctionPass(Pass: AMDGPUPreloadKernArgPrologPass(), PMW);
2459}
2460
2461void AMDGPUCodeGenPassBuilder::addPreEmitPass(PassManagerWrapper &PMW) const {
2462 if (isPassEnabled(Opt: EnableVOPD, Level: CodeGenOptLevel::Less)) {
2463 addMachineFunctionPass(Pass: GCNCreateVOPDPass(), PMW);
2464 }
2465
2466 addMachineFunctionPass(Pass: SIMemoryLegalizerPass(), PMW);
2467 addMachineFunctionPass(Pass: SIInsertWaitcntsPass(), PMW);
2468
2469 addMachineFunctionPass(Pass: SIModeRegisterPass(), PMW);
2470
2471 if (TM.getOptLevel() > CodeGenOptLevel::None)
2472 addMachineFunctionPass(Pass: SIInsertHardClausesPass(), PMW);
2473
2474 addMachineFunctionPass(Pass: SILateBranchLoweringPass(), PMW);
2475
2476 if (isPassEnabled(Opt: EnableSetWavePriority, Level: CodeGenOptLevel::Less))
2477 addMachineFunctionPass(Pass: AMDGPUSetWavePriorityPass(), PMW);
2478
2479 if (TM.getOptLevel() > CodeGenOptLevel::None)
2480 addMachineFunctionPass(Pass: SIPreEmitPeepholePass(), PMW);
2481
2482 // The hazard recognizer that runs as part of the post-ra scheduler does not
2483 // guarantee to be able to handle all hazards correctly. This is because if
2484 // there are multiple scheduling regions in a basic block, the regions are
2485 // scheduled bottom up, so when we begin to schedule a region we don't know
2486 // what instructions were emitted directly before it.
2487 //
2488 // Here we add a stand-alone hazard recognizer pass which can handle all
2489 // cases.
2490 addMachineFunctionPass(Pass: PostRAHazardRecognizerPass(), PMW);
2491 addMachineFunctionPass(Pass: AMDGPUWaitSGPRHazardsPass(), PMW);
2492 addMachineFunctionPass(Pass: AMDGPULowerVGPREncodingPass(), PMW);
2493
2494 if (isPassEnabled(Opt: EnableInsertDelayAlu, Level: CodeGenOptLevel::Less)) {
2495 addMachineFunctionPass(Pass: AMDGPUInsertDelayAluPass(), PMW);
2496 }
2497
2498 addMachineFunctionPass(Pass: BranchRelaxationPass(), PMW);
2499}
2500
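// Returns the explicit command line value if the option was specified;
// otherwise the pass is disabled below Level and follows the option's default
// value at or above it.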
2501bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt,
2502 CodeGenOptLevel Level) const {
2503 if (Opt.getNumOccurrences())
2504 return Opt;
2505 if (TM.getOptLevel() < Level)
2506 return false;
2507 return Opt;
2508}
2509
2510void AMDGPUCodeGenPassBuilder::addEarlyCSEOrGVNPass(
2511 PassManagerWrapper &PMW) const {
2512 if (TM.getOptLevel() == CodeGenOptLevel::Aggressive)
2513 addFunctionPass(Pass: GVNPass(), PMW);
2514 else
2515 addFunctionPass(Pass: EarlyCSEPass(), PMW);
2516}
2517
2518void AMDGPUCodeGenPassBuilder::addStraightLineScalarOptimizationPasses(
2519 PassManagerWrapper &PMW) const {
2520 if (isPassEnabled(Opt: EnableLoopPrefetch, Level: CodeGenOptLevel::Aggressive))
2521 addFunctionPass(Pass: LoopDataPrefetchPass(), PMW);
2522
2523 addFunctionPass(Pass: SeparateConstOffsetFromGEPPass(), PMW);
2524
2525 // ReassociateGEPs exposes more opportunities for SLSR. See
2526 // the example in reassociate-geps-and-slsr.ll.
2527 addFunctionPass(Pass: StraightLineStrengthReducePass(), PMW);
2528
2529 // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
2530 // EarlyCSE can reuse.
2531 addEarlyCSEOrGVNPass(PMW);
2532
2533 // Run NaryReassociate after EarlyCSE/GVN to be more effective.
2534 addFunctionPass(Pass: NaryReassociatePass(), PMW);
2535
2536 // NaryReassociate on GEPs creates redundant common expressions, so run
2537 // EarlyCSE after it.
2538 addFunctionPass(Pass: EarlyCSEPass(), PMW);
2539}
2540