1//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file contains both AMDGPU target machine and the CodeGen pass builder.
11/// The AMDGPU target machine contains all of the hardware specific information
12/// needed to emit code for SI+ GPUs in the legacy pass manager pipeline. The
/// CodeGen pass builder handles the pass pipeline for the new pass manager.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUTargetMachine.h"
18#include "AMDGPU.h"
19#include "AMDGPUAliasAnalysis.h"
20#include "AMDGPUBarrierLatency.h"
21#include "AMDGPUCoExecSchedStrategy.h"
22#include "AMDGPUCtorDtorLowering.h"
23#include "AMDGPUExportClustering.h"
24#include "AMDGPUExportKernelRuntimeHandles.h"
25#include "AMDGPUHazardLatency.h"
26#include "AMDGPUIGroupLP.h"
27#include "AMDGPUISelDAGToDAG.h"
28#include "AMDGPULowerVGPREncoding.h"
29#include "AMDGPUMacroFusion.h"
30#include "AMDGPUNextUseAnalysis.h"
31#include "AMDGPUPerfHintAnalysis.h"
32#include "AMDGPUPreloadKernArgProlog.h"
33#include "AMDGPUPrepareAGPRAlloc.h"
34#include "AMDGPURemoveIncompatibleFunctions.h"
35#include "AMDGPUReserveWWMRegs.h"
36#include "AMDGPUResourceUsageAnalysis.h"
37#include "AMDGPUSplitModule.h"
38#include "AMDGPUTargetObjectFile.h"
39#include "AMDGPUTargetTransformInfo.h"
40#include "AMDGPUUnifyDivergentExitNodes.h"
41#include "AMDGPUWaitSGPRHazards.h"
42#include "GCNDPPCombine.h"
43#include "GCNIterativeScheduler.h"
44#include "GCNNSAReassign.h"
45#include "GCNPreRALongBranchReg.h"
46#include "GCNPreRAOptimizations.h"
47#include "GCNRewritePartialRegUses.h"
48#include "GCNSchedStrategy.h"
49#include "GCNVOPDUtils.h"
50#include "R600.h"
51#include "R600TargetMachine.h"
52#include "SIFixSGPRCopies.h"
53#include "SIFixVGPRCopies.h"
54#include "SIFoldOperands.h"
55#include "SIFormMemoryClauses.h"
56#include "SILoadStoreOptimizer.h"
57#include "SILowerControlFlow.h"
58#include "SILowerSGPRSpills.h"
59#include "SILowerWWMCopies.h"
60#include "SIMachineFunctionInfo.h"
61#include "SIMachineScheduler.h"
62#include "SIOptimizeExecMasking.h"
63#include "SIOptimizeExecMaskingPreRA.h"
64#include "SIOptimizeVGPRLiveRange.h"
65#include "SIPeepholeSDWA.h"
66#include "SIPostRABundler.h"
67#include "SIPreAllocateWWMRegs.h"
68#include "SIShrinkInstructions.h"
69#include "SIWholeQuadMode.h"
70#include "TargetInfo/AMDGPUTargetInfo.h"
71#include "Utils/AMDGPUBaseInfo.h"
72#include "llvm/Analysis/CGSCCPassManager.h"
73#include "llvm/Analysis/CallGraphSCCPass.h"
74#include "llvm/Analysis/KernelInfo.h"
75#include "llvm/Analysis/UniformityAnalysis.h"
76#include "llvm/CodeGen/AtomicExpand.h"
77#include "llvm/CodeGen/BranchRelaxation.h"
78#include "llvm/CodeGen/DeadMachineInstructionElim.h"
79#include "llvm/CodeGen/EarlyIfConversion.h"
80#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
81#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
82#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
83#include "llvm/CodeGen/GlobalISel/Legalizer.h"
84#include "llvm/CodeGen/GlobalISel/Localizer.h"
85#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
86#include "llvm/CodeGen/MIRParser/MIParser.h"
87#include "llvm/CodeGen/MachineCSE.h"
88#include "llvm/CodeGen/MachineLICM.h"
89#include "llvm/CodeGen/MachineScheduler.h"
90#include "llvm/CodeGen/Passes.h"
91#include "llvm/CodeGen/PostRAHazardRecognizer.h"
92#include "llvm/CodeGen/RegAllocRegistry.h"
93#include "llvm/CodeGen/TargetPassConfig.h"
94#include "llvm/IR/DiagnosticInfo.h"
95#include "llvm/IR/IntrinsicsAMDGPU.h"
96#include "llvm/IR/Module.h"
97#include "llvm/IR/PassManager.h"
98#include "llvm/IR/PatternMatch.h"
99#include "llvm/InitializePasses.h"
100#include "llvm/MC/TargetRegistry.h"
101#include "llvm/Passes/CodeGenPassBuilder.h"
102#include "llvm/Passes/PassBuilder.h"
103#include "llvm/Support/Compiler.h"
104#include "llvm/Support/FormatVariadic.h"
105#include "llvm/Transforms/HipStdPar/HipStdPar.h"
106#include "llvm/Transforms/IPO.h"
107#include "llvm/Transforms/IPO/AlwaysInliner.h"
108#include "llvm/Transforms/IPO/ExpandVariadics.h"
109#include "llvm/Transforms/IPO/GlobalDCE.h"
110#include "llvm/Transforms/IPO/Internalize.h"
111#include "llvm/Transforms/Scalar.h"
112#include "llvm/Transforms/Scalar/EarlyCSE.h"
113#include "llvm/Transforms/Scalar/FlattenCFG.h"
114#include "llvm/Transforms/Scalar/GVN.h"
115#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
116#include "llvm/Transforms/Scalar/LICM.h"
117#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
118#include "llvm/Transforms/Scalar/LoopPassManager.h"
119#include "llvm/Transforms/Scalar/NaryReassociate.h"
120#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
121#include "llvm/Transforms/Scalar/Sink.h"
122#include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h"
123#include "llvm/Transforms/Scalar/StructurizeCFG.h"
124#include "llvm/Transforms/Utils.h"
125#include "llvm/Transforms/Utils/FixIrreducible.h"
126#include "llvm/Transforms/Utils/LCSSA.h"
127#include "llvm/Transforms/Utils/LowerSwitch.h"
128#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
129#include "llvm/Transforms/Utils/UnifyLoopExits.h"
130#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
131#include <optional>
132
133using namespace llvm;
134using namespace llvm::PatternMatch;
135
136namespace {
137//===----------------------------------------------------------------------===//
138// AMDGPU CodeGen Pass Builder interface.
139//===----------------------------------------------------------------------===//
140
141class AMDGPUCodeGenPassBuilder
142 : public CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine> {
143 using Base = CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine>;
144
145public:
146 AMDGPUCodeGenPassBuilder(GCNTargetMachine &TM,
147 const CGPassBuilderOption &Opts,
148 PassInstrumentationCallbacks *PIC);
149
150 void addIRPasses(PassManagerWrapper &PMW) const;
151 void addCodeGenPrepare(PassManagerWrapper &PMW) const;
152 void addPreISel(PassManagerWrapper &PMW) const;
153 void addILPOpts(PassManagerWrapper &PMWM) const;
154 void addAsmPrinterBegin(PassManagerWrapper &PMW) const;
155 void addAsmPrinter(PassManagerWrapper &PMW) const;
156 void addAsmPrinterEnd(PassManagerWrapper &PMW) const;
157 Error addInstSelector(PassManagerWrapper &PMW) const;
158 void addPreRewrite(PassManagerWrapper &PMW) const;
159 void addMachineSSAOptimization(PassManagerWrapper &PMW) const;
160 void addPostRegAlloc(PassManagerWrapper &PMW) const;
161 void addPreEmitPass(PassManagerWrapper &PMWM) const;
162 void addPreEmitRegAlloc(PassManagerWrapper &PMW) const;
163 Error addRegAssignmentFast(PassManagerWrapper &PMW) const;
164 Error addRegAssignmentOptimized(PassManagerWrapper &PMW) const;
165 void addPreRegAlloc(PassManagerWrapper &PMW) const;
166 Error addFastRegAlloc(PassManagerWrapper &PMW) const;
167 Error addOptimizedRegAlloc(PassManagerWrapper &PMW) const;
168 void addPreSched2(PassManagerWrapper &PMW) const;
169 void addPostBBSections(PassManagerWrapper &PMW) const;
170
171private:
172 Error validateRegAllocOptions() const;
173
174public:
175 /// Check if a pass is enabled given \p Opt option. The option always
176 /// overrides defaults if explicitly used. Otherwise its default will be used
177 /// given that a pass shall work at an optimization \p Level minimum.
178 bool isPassEnabled(const cl::opt<bool> &Opt,
179 CodeGenOptLevel Level = CodeGenOptLevel::Default) const;
180 void addEarlyCSEOrGVNPass(PassManagerWrapper &PMW) const;
181 void addStraightLineScalarOptimizationPasses(PassManagerWrapper &PMW) const;
182};
183
184class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
185public:
186 SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
187 : RegisterRegAllocBase(N, D, C) {}
188};
189
190class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
191public:
192 VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
193 : RegisterRegAllocBase(N, D, C) {}
194};
195
196class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
197public:
198 WWMRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
199 : RegisterRegAllocBase(N, D, C) {}
200};
201
202static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
203 const MachineRegisterInfo &MRI,
204 const Register Reg) {
205 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
206 return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
207}
208
209static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
210 const MachineRegisterInfo &MRI,
211 const Register Reg) {
212 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
213 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
214}
215
216static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI,
217 const MachineRegisterInfo &MRI,
218 const Register Reg) {
219 const SIMachineFunctionInfo *MFI =
220 MRI.getMF().getInfo<SIMachineFunctionInfo>();
221 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
222 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) &&
223 MFI->checkFlag(Reg, Flag: AMDGPU::VirtRegFlag::WWM_REG);
224}
225
/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line. It is the initial value of the
/// -{sgpr|wwm|vgpr}-regalloc=... command line options and is also registered
/// as the SGPR "default" entry; it always returns nullptr.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// One-time initialization flags, one per register class group.
/// NOTE(review): presumably used with llvm::call_once together with the
/// initializeDefault*RegisterAllocatorOnce helpers; the call sites are not
/// visible in this part of the file.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;
234
235static SGPRRegisterRegAlloc
236defaultSGPRRegAlloc("default",
237 "pick SGPR register allocator based on -O option",
238 useDefaultRegisterAllocator);
239
240static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
241 RegisterPassParser<SGPRRegisterRegAlloc>>
242SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(Val: &useDefaultRegisterAllocator),
243 cl::desc("Register allocator to use for SGPRs"));
244
245static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
246 RegisterPassParser<VGPRRegisterRegAlloc>>
247VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(Val: &useDefaultRegisterAllocator),
248 cl::desc("Register allocator to use for VGPRs"));
249
250static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
251 RegisterPassParser<WWMRegisterRegAlloc>>
252 WWMRegAlloc("wwm-regalloc", cl::Hidden,
253 cl::init(Val: &useDefaultRegisterAllocator),
254 cl::desc("Register allocator to use for WWM registers"));
255
256// New pass manager register allocator options for AMDGPU
257static cl::opt<RegAllocType, false, RegAllocTypeParser> SGPRRegAllocNPM(
258 "sgpr-regalloc-npm", cl::Hidden, cl::init(Val: RegAllocType::Default),
259 cl::desc("Register allocator for SGPRs (new pass manager)"));
260
261static cl::opt<RegAllocType, false, RegAllocTypeParser> VGPRRegAllocNPM(
262 "vgpr-regalloc-npm", cl::Hidden, cl::init(Val: RegAllocType::Default),
263 cl::desc("Register allocator for VGPRs (new pass manager)"));
264
265static cl::opt<RegAllocType, false, RegAllocTypeParser> WWMRegAllocNPM(
266 "wwm-regalloc-npm", cl::Hidden, cl::init(Val: RegAllocType::Default),
267 cl::desc("Register allocator for WWM registers (new pass manager)"));
268
269/// Check if the given RegAllocType is supported for AMDGPU NPM register
270/// allocation. Only Fast and Greedy are supported; Basic and PBQP are not.
271static Error checkRegAllocSupported(RegAllocType RAType, StringRef RegName) {
272 if (RAType == RegAllocType::Basic || RAType == RegAllocType::PBQP) {
273 return make_error<StringError>(
274 Args: Twine("unsupported register allocator '") +
275 (RAType == RegAllocType::Basic ? "basic" : "pbqp") + "' for " +
276 RegName + " registers",
277 Args: inconvertibleErrorCode());
278 }
279 return Error::success();
280}
281
282Error AMDGPUCodeGenPassBuilder::validateRegAllocOptions() const {
283 // 1. Generic --regalloc-npm is not supported for AMDGPU.
284 if (Opt.RegAlloc != RegAllocType::Unset) {
285 return make_error<StringError>(
286 Args: "-regalloc-npm not supported for amdgcn. Use -sgpr-regalloc-npm, "
287 "-vgpr-regalloc-npm, and -wwm-regalloc-npm",
288 Args: inconvertibleErrorCode());
289 }
290
291 // 2. Legacy PM regalloc options are not compatible with NPM.
292 if (SGPRRegAlloc.getNumOccurrences() > 0 ||
293 VGPRRegAlloc.getNumOccurrences() > 0 ||
294 WWMRegAlloc.getNumOccurrences() > 0) {
295 return make_error<StringError>(
296 Args: "-sgpr-regalloc, -vgpr-regalloc, and -wwm-regalloc are legacy PM "
297 "options. Use -sgpr-regalloc-npm, -vgpr-regalloc-npm, and "
298 "-wwm-regalloc-npm with the new pass manager",
299 Args: inconvertibleErrorCode());
300 }
301
302 // 3. Only Fast and Greedy allocators are supported for AMDGPU.
303 if (auto Err = checkRegAllocSupported(RAType: SGPRRegAllocNPM, RegName: "SGPR"))
304 return Err;
305 if (auto Err = checkRegAllocSupported(RAType: WWMRegAllocNPM, RegName: "WWM"))
306 return Err;
307 if (auto Err = checkRegAllocSupported(RAType: VGPRRegAllocNPM, RegName: "VGPR"))
308 return Err;
309
310 return Error::success();
311}
312
313static void initializeDefaultSGPRRegisterAllocatorOnce() {
314 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
315
316 if (!Ctor) {
317 Ctor = SGPRRegAlloc;
318 SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
319 }
320}
321
322static void initializeDefaultVGPRRegisterAllocatorOnce() {
323 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
324
325 if (!Ctor) {
326 Ctor = VGPRRegAlloc;
327 VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
328 }
329}
330
331static void initializeDefaultWWMRegisterAllocatorOnce() {
332 RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
333
334 if (!Ctor) {
335 Ctor = WWMRegAlloc;
336 WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
337 }
338}
339
340static FunctionPass *createBasicSGPRRegisterAllocator() {
341 return createBasicRegisterAllocator(F: onlyAllocateSGPRs);
342}
343
344static FunctionPass *createGreedySGPRRegisterAllocator() {
345 return createGreedyRegisterAllocator(F: onlyAllocateSGPRs);
346}
347
348static FunctionPass *createFastSGPRRegisterAllocator() {
349 return createFastRegisterAllocator(F: onlyAllocateSGPRs, ClearVirtRegs: false);
350}
351
352static FunctionPass *createBasicVGPRRegisterAllocator() {
353 return createBasicRegisterAllocator(F: onlyAllocateVGPRs);
354}
355
356static FunctionPass *createGreedyVGPRRegisterAllocator() {
357 return createGreedyRegisterAllocator(F: onlyAllocateVGPRs);
358}
359
360static FunctionPass *createFastVGPRRegisterAllocator() {
361 return createFastRegisterAllocator(F: onlyAllocateVGPRs, ClearVirtRegs: true);
362}
363
364static FunctionPass *createBasicWWMRegisterAllocator() {
365 return createBasicRegisterAllocator(F: onlyAllocateWWMRegs);
366}
367
368static FunctionPass *createGreedyWWMRegisterAllocator() {
369 return createGreedyRegisterAllocator(F: onlyAllocateWWMRegs);
370}
371
372static FunctionPass *createFastWWMRegisterAllocator() {
373 return createFastRegisterAllocator(F: onlyAllocateWWMRegs, ClearVirtRegs: false);
374}
375
376static SGPRRegisterRegAlloc basicRegAllocSGPR(
377 "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
378static SGPRRegisterRegAlloc greedyRegAllocSGPR(
379 "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
380
381static SGPRRegisterRegAlloc fastRegAllocSGPR(
382 "fast", "fast register allocator", createFastSGPRRegisterAllocator);
383
384
385static VGPRRegisterRegAlloc basicRegAllocVGPR(
386 "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
387static VGPRRegisterRegAlloc greedyRegAllocVGPR(
388 "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
389
390static VGPRRegisterRegAlloc fastRegAllocVGPR(
391 "fast", "fast register allocator", createFastVGPRRegisterAllocator);
392static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
393 "basic register allocator",
394 createBasicWWMRegisterAllocator);
395static WWMRegisterRegAlloc
396 greedyRegAllocWWMReg("greedy", "greedy register allocator",
397 createGreedyWWMRegisterAllocator);
398static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator",
399 createFastWWMRegisterAllocator);
400
401static bool isLTOPreLink(ThinOrFullLTOPhase Phase) {
402 return Phase == ThinOrFullLTOPhase::FullLTOPreLink ||
403 Phase == ThinOrFullLTOPhase::ThinLTOPreLink;
404}
405} // anonymous namespace
406
407static cl::opt<bool>
408EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
409 cl::desc("Run early if-conversion"),
410 cl::init(Val: false));
411
412static cl::opt<bool>
413OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
414 cl::desc("Run pre-RA exec mask optimizations"),
415 cl::init(Val: true));
416
417static cl::opt<bool>
418 LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
419 cl::desc("Lower GPU ctor / dtors to globals on the device."),
420 cl::init(Val: true), cl::Hidden);
421
422// Option to disable vectorizer for tests.
423static cl::opt<bool> EnableLoadStoreVectorizer(
424 "amdgpu-load-store-vectorizer",
425 cl::desc("Enable load store vectorizer"),
426 cl::init(Val: true),
427 cl::Hidden);
428
429// Option to control global loads scalarization
430static cl::opt<bool> ScalarizeGlobal(
431 "amdgpu-scalarize-global-loads",
432 cl::desc("Enable global load scalarization"),
433 cl::init(Val: true),
434 cl::Hidden);
435
436// Option to run internalize pass.
437static cl::opt<bool> InternalizeSymbols(
438 "amdgpu-internalize-symbols",
439 cl::desc("Enable elimination of non-kernel functions and unused globals"),
440 cl::init(Val: false),
441 cl::Hidden);
442
443// Option to inline all early.
444static cl::opt<bool> EarlyInlineAll(
445 "amdgpu-early-inline-all",
446 cl::desc("Inline all functions early"),
447 cl::init(Val: false),
448 cl::Hidden);
449
450static cl::opt<bool> RemoveIncompatibleFunctions(
451 "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
452 cl::desc("Enable removal of functions when they"
453 "use features not supported by the target GPU"),
454 cl::init(Val: true));
455
456static cl::opt<bool> EnableSDWAPeephole(
457 "amdgpu-sdwa-peephole",
458 cl::desc("Enable SDWA peepholer"),
459 cl::init(Val: true));
460
461static cl::opt<bool> EnableDPPCombine(
462 "amdgpu-dpp-combine",
463 cl::desc("Enable DPP combiner"),
464 cl::init(Val: true));
465
466// Enable address space based alias analysis
467static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
468 cl::desc("Enable AMDGPU Alias Analysis"),
469 cl::init(Val: true));
470
471// Enable lib calls simplifications
472static cl::opt<bool> EnableLibCallSimplify(
473 "amdgpu-simplify-libcall",
474 cl::desc("Enable amdgpu library simplifications"),
475 cl::init(Val: true),
476 cl::Hidden);
477
478static cl::opt<bool> EnableLowerKernelArguments(
479 "amdgpu-ir-lower-kernel-arguments",
480 cl::desc("Lower kernel argument loads in IR pass"),
481 cl::init(Val: true),
482 cl::Hidden);
483
484static cl::opt<bool> EnableRegReassign(
485 "amdgpu-reassign-regs",
486 cl::desc("Enable register reassign optimizations on gfx10+"),
487 cl::init(Val: true),
488 cl::Hidden);
489
490static cl::opt<bool> OptVGPRLiveRange(
491 "amdgpu-opt-vgpr-liverange",
492 cl::desc("Enable VGPR liverange optimizations for if-else structure"),
493 cl::init(Val: true), cl::Hidden);
494
495static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
496 "amdgpu-atomic-optimizer-strategy",
497 cl::desc("Select DPP or Iterative strategy for scan"),
498 cl::init(Val: ScanOptions::Iterative),
499 cl::values(
500 clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
501 clEnumValN(ScanOptions::Iterative, "Iterative",
502 "Use Iterative approach for scan"),
503 clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
504
505// Enable Mode register optimization
506static cl::opt<bool> EnableSIModeRegisterPass(
507 "amdgpu-mode-register",
508 cl::desc("Enable mode register pass"),
509 cl::init(Val: true),
510 cl::Hidden);
511
512// Enable GFX11+ s_delay_alu insertion
513static cl::opt<bool>
514 EnableInsertDelayAlu("amdgpu-enable-delay-alu",
515 cl::desc("Enable s_delay_alu insertion"),
516 cl::init(Val: true), cl::Hidden);
517
518// Enable GFX11+ VOPD
519static cl::opt<bool>
520 EnableVOPD("amdgpu-enable-vopd",
521 cl::desc("Enable VOPD, dual issue of VALU in wave32"),
522 cl::init(Val: true), cl::Hidden);
523
524// Option is used in lit tests to prevent deadcoding of patterns inspected.
525static cl::opt<bool>
526EnableDCEInRA("amdgpu-dce-in-ra",
527 cl::init(Val: true), cl::Hidden,
528 cl::desc("Enable machine DCE inside regalloc"));
529
530static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
531 cl::desc("Adjust wave priority"),
532 cl::init(Val: false), cl::Hidden);
533
534static cl::opt<bool> EnableScalarIRPasses(
535 "amdgpu-scalar-ir-passes",
536 cl::desc("Enable scalar IR passes"),
537 cl::init(Val: true),
538 cl::Hidden);
539
540static cl::opt<bool> EnableLowerExecSync(
541 "amdgpu-enable-lower-exec-sync",
542 cl::desc("Enable lowering of execution synchronization."), cl::init(Val: true),
543 cl::Hidden);
544
545static cl::opt<bool>
546 EnableSwLowerLDS("amdgpu-enable-sw-lower-lds",
547 cl::desc("Enable lowering of lds to global memory pass "
548 "and asan instrument resulting IR."),
549 cl::init(Val: true), cl::Hidden);
550
551static cl::opt<bool, true> EnableObjectLinking(
552 "amdgpu-enable-object-linking",
553 cl::desc("Enable object linking for cross-TU LDS and ABI support"),
554 cl::location(L&: AMDGPUTargetMachine::EnableObjectLinking), cl::init(Val: false),
555 cl::Hidden);
556
557static cl::opt<bool, true> EnableLowerModuleLDS(
558 "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
559 cl::location(L&: AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(Val: true),
560 cl::Hidden);
561
562static cl::opt<bool> EnablePreRAOptimizations(
563 "amdgpu-enable-pre-ra-optimizations",
564 cl::desc("Enable Pre-RA optimizations pass"), cl::init(Val: true),
565 cl::Hidden);
566
567static cl::opt<bool> EnablePromoteKernelArguments(
568 "amdgpu-enable-promote-kernel-arguments",
569 cl::desc("Enable promotion of flat kernel pointer arguments to global"),
570 cl::Hidden, cl::init(Val: true));
571
572static cl::opt<bool> EnableImageIntrinsicOptimizer(
573 "amdgpu-enable-image-intrinsic-optimizer",
574 cl::desc("Enable image intrinsic optimizer pass"), cl::init(Val: true),
575 cl::Hidden);
576
577static cl::opt<bool>
578 EnableLoopPrefetch("amdgpu-loop-prefetch",
579 cl::desc("Enable loop data prefetch on AMDGPU"),
580 cl::Hidden, cl::init(Val: false));
581
582static cl::opt<std::string>
583 AMDGPUSchedStrategy("amdgpu-sched-strategy",
584 cl::desc("Select custom AMDGPU scheduling strategy."),
585 cl::Hidden, cl::init(Val: ""));
586
587// Scheduler selection is consulted both when creating the scheduler and from
588// overrideSchedPolicy(), so keep the attribute and global command line handling
589// in one helper.
590StringRef llvm::AMDGPU::getSchedStrategy(const Function &F) {
591 Attribute SchedStrategyAttr = F.getFnAttribute(Kind: "amdgpu-sched-strategy");
592 if (SchedStrategyAttr.isValid())
593 return SchedStrategyAttr.getValueAsString();
594
595 if (!AMDGPUSchedStrategy.empty())
596 return AMDGPUSchedStrategy;
597
598 return "";
599}
600
601static void
602diagnoseUnsupportedCoExecSchedulerSelection(const Function &F,
603 const GCNSubtarget &ST) {
604 if (ST.hasGFX1250Insts())
605 return;
606
607 F.getContext().diagnose(DI: DiagnosticInfoUnsupported(
608 F, "'amdgpu-sched-strategy'='coexec' is only supported for gfx1250",
609 DiagnosticLocation(), DS_Warning));
610}
611
612static bool useNoopPostScheduler(const Function &F) {
613 Attribute PostSchedStrategyAttr =
614 F.getFnAttribute(Kind: "amdgpu-post-sched-strategy");
615 return PostSchedStrategyAttr.isValid() &&
616 PostSchedStrategyAttr.getValueAsString() == "nop";
617}
618
619static cl::opt<bool> EnableRewritePartialRegUses(
620 "amdgpu-enable-rewrite-partial-reg-uses",
621 cl::desc("Enable rewrite partial reg uses pass"), cl::init(Val: true),
622 cl::Hidden);
623
624static cl::opt<bool> EnableHipStdPar(
625 "amdgpu-enable-hipstdpar",
626 cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(Val: false),
627 cl::Hidden);
628
629static cl::opt<bool>
630 EnableAMDGPUAttributor("amdgpu-attributor-enable",
631 cl::desc("Enable AMDGPUAttributorPass"),
632 cl::init(Val: true), cl::Hidden);
633
634static cl::opt<bool> HasClosedWorldAssumption(
635 "amdgpu-link-time-closed-world",
636 cl::desc("Whether has closed-world assumption at link time"),
637 cl::init(Val: false), cl::Hidden);
638
639static cl::opt<bool> EnableUniformIntrinsicCombine(
640 "amdgpu-enable-uniform-intrinsic-combine",
641 cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
642 cl::init(Val: true), cl::Hidden);
643
644extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
645 // Register the target
646 RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
647 RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
648
649 PassRegistry *PR = PassRegistry::getPassRegistry();
650 initializeR600ClauseMergePassPass(*PR);
651 initializeR600ControlFlowFinalizerPass(*PR);
652 initializeR600PacketizerPass(*PR);
653 initializeR600ExpandSpecialInstrsPassPass(*PR);
654 initializeR600VectorRegMergerPass(*PR);
655 initializeR600EmitClauseMarkersPass(*PR);
656 initializeR600MachineCFGStructurizerPass(*PR);
657 initializeGlobalISel(*PR);
658 initializeAMDGPUAsmPrinterPass(*PR);
659 initializeAMDGPUDAGToDAGISelLegacyPass(*PR);
660 initializeAMDGPUPrepareAGPRAllocLegacyPass(*PR);
661 initializeGCNDPPCombineLegacyPass(*PR);
662 initializeSILowerI1CopiesLegacyPass(*PR);
663 initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
664 initializeAMDGPURegBankSelectPass(*PR);
665 initializeAMDGPURegBankLegalizePass(*PR);
666 initializeSILowerWWMCopiesLegacyPass(*PR);
667 initializeAMDGPUMarkLastScratchLoadLegacyPass(*PR);
668 initializeSILowerSGPRSpillsLegacyPass(*PR);
669 initializeSIFixSGPRCopiesLegacyPass(*PR);
670 initializeSIFixVGPRCopiesLegacyPass(*PR);
671 initializeSIFoldOperandsLegacyPass(*PR);
672 initializeSIPeepholeSDWALegacyPass(*PR);
673 initializeSIShrinkInstructionsLegacyPass(*PR);
674 initializeSIOptimizeExecMaskingPreRALegacyPass(*PR);
675 initializeSIOptimizeVGPRLiveRangeLegacyPass(*PR);
676 initializeAMDGPUNextUseAnalysisLegacyPassPass(*PR);
677 initializeAMDGPUNextUseAnalysisPrinterLegacyPassPass(*PR);
678 initializeSILoadStoreOptimizerLegacyPass(*PR);
679 initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
680 initializeAMDGPUAlwaysInlinePass(*PR);
681 initializeAMDGPULowerExecSyncLegacyPass(*PR);
682 initializeAMDGPUSwLowerLDSLegacyPass(*PR);
683 initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
684 initializeAMDGPUAtomicOptimizerPass(*PR);
685 initializeAMDGPULowerKernelArgumentsPass(*PR);
686 initializeAMDGPUPromoteKernelArgumentsPass(*PR);
687 initializeAMDGPULowerKernelAttributesPass(*PR);
688 initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR);
689 initializeAMDGPUPostLegalizerCombinerPass(*PR);
690 initializeAMDGPUPreLegalizerCombinerPass(*PR);
691 initializeAMDGPURegBankCombinerPass(*PR);
692 initializeAMDGPUPromoteAllocaPass(*PR);
693 initializeAMDGPUCodeGenPreparePass(*PR);
694 initializeAMDGPULateCodeGenPrepareLegacyPass(*PR);
695 initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);
696 initializeAMDGPULowerModuleLDSLegacyPass(*PR);
697 initializeAMDGPULowerBufferFatPointersPass(*PR);
698 initializeAMDGPULowerIntrinsicsLegacyPass(*PR);
699 initializeAMDGPUReserveWWMRegsLegacyPass(*PR);
700 initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR);
701 initializeAMDGPURewriteOutArgumentsPass(*PR);
702 initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
703 initializeSIAnnotateControlFlowLegacyPass(*PR);
704 initializeAMDGPUInsertDelayAluLegacyPass(*PR);
705 initializeAMDGPULowerVGPREncodingLegacyPass(*PR);
706 initializeSIInsertHardClausesLegacyPass(*PR);
707 initializeSIInsertWaitcntsLegacyPass(*PR);
708 initializeSIModeRegisterLegacyPass(*PR);
709 initializeSIWholeQuadModeLegacyPass(*PR);
710 initializeSILowerControlFlowLegacyPass(*PR);
711 initializeSIPreEmitPeepholeLegacyPass(*PR);
712 initializeSILateBranchLoweringLegacyPass(*PR);
713 initializeSIMemoryLegalizerLegacyPass(*PR);
714 initializeSIOptimizeExecMaskingLegacyPass(*PR);
715 initializeSIPreAllocateWWMRegsLegacyPass(*PR);
716 initializeSIFormMemoryClausesLegacyPass(*PR);
717 initializeSIPostRABundlerLegacyPass(*PR);
718 initializeGCNCreateVOPDLegacyPass(*PR);
719 initializeAMDGPUUnifyDivergentExitNodesLegacyPass(*PR);
720 initializeAMDGPUAAWrapperPassPass(*PR);
721 initializeAMDGPUExternalAAWrapperPass(*PR);
722 initializeAMDGPUImageIntrinsicOptimizerPass(*PR);
723 initializeAMDGPUPrintfRuntimeBindingPass(*PR);
724 initializeAMDGPUResourceUsageAnalysisWrapperPassPass(*PR);
725 initializeGCNNSAReassignLegacyPass(*PR);
726 initializeGCNPreRAOptimizationsLegacyPass(*PR);
727 initializeGCNPreRALongBranchRegLegacyPass(*PR);
728 initializeGCNRewritePartialRegUsesLegacyPass(*PR);
729 initializeGCNRegPressurePrinterPass(*PR);
730 initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
731 initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
732 initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR);
733 initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR);
734}
735
736static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
737 return std::make_unique<AMDGPUTargetObjectFile>();
738}
739
740static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
741 return new SIScheduleDAGMI(C);
742}
743
744static ScheduleDAGInstrs *
745createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
746 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
747 ScheduleDAGMILive *DAG =
748 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(args&: C));
749 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
750 if (ST.shouldClusterStores())
751 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
752 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
753 DAG->addMutation(Mutation: createAMDGPUMacroFusionDAGMutation());
754 DAG->addMutation(Mutation: createAMDGPUExportClusteringDAGMutation());
755 DAG->addMutation(Mutation: createAMDGPUBarrierLatencyDAGMutation(MF: C->MF));
756 DAG->addMutation(Mutation: createAMDGPUHazardLatencyDAGMutation(MF: C->MF));
757 return DAG;
758}
759
760static ScheduleDAGInstrs *
761createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
762 ScheduleDAGMILive *DAG =
763 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(args&: C));
764 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
765 return DAG;
766}
767
768static ScheduleDAGInstrs *
769createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
770 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
771 ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
772 C, std::make_unique<GCNMaxMemoryClauseSchedStrategy>(args&: C));
773 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
774 if (ST.shouldClusterStores())
775 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
776 DAG->addMutation(Mutation: createAMDGPUExportClusteringDAGMutation());
777 DAG->addMutation(Mutation: createAMDGPUBarrierLatencyDAGMutation(MF: C->MF));
778 DAG->addMutation(Mutation: createAMDGPUHazardLatencyDAGMutation(MF: C->MF));
779 return DAG;
780}
781
782static ScheduleDAGInstrs *
783createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
784 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
785 auto *DAG = new GCNIterativeScheduler(
786 C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
787 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
788 if (ST.shouldClusterStores())
789 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
790 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
791 return DAG;
792}
793
794static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
795 auto *DAG = new GCNIterativeScheduler(
796 C, GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
797 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
798 return DAG;
799}
800
801static ScheduleDAGInstrs *
802createIterativeILPMachineScheduler(MachineSchedContext *C) {
803 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
804 auto *DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);
805 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
806 if (ST.shouldClusterStores())
807 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
808 DAG->addMutation(Mutation: createAMDGPUMacroFusionDAGMutation());
809 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
810 return DAG;
811}
812
813static MachineSchedRegistry
814SISchedRegistry("si", "Run SI's custom scheduler",
815 createSIMachineScheduler);
816
817static MachineSchedRegistry
818GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
819 "Run GCN scheduler to maximize occupancy",
820 createGCNMaxOccupancyMachineScheduler);
821
822static MachineSchedRegistry
823 GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
824 createGCNMaxILPMachineScheduler);
825
826static MachineSchedRegistry GCNMaxMemoryClauseSchedRegistry(
827 "gcn-max-memory-clause", "Run GCN scheduler to maximize memory clause",
828 createGCNMaxMemoryClauseMachineScheduler);
829
830static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
831 "gcn-iterative-max-occupancy-experimental",
832 "Run GCN scheduler to maximize occupancy (experimental)",
833 createIterativeGCNMaxOccupancyMachineScheduler);
834
835static MachineSchedRegistry GCNMinRegSchedRegistry(
836 "gcn-iterative-minreg",
837 "Run GCN iterative scheduler for minimal register usage (experimental)",
838 createMinRegScheduler);
839
840static MachineSchedRegistry GCNILPSchedRegistry(
841 "gcn-iterative-ilp",
842 "Run GCN iterative scheduler for ILP scheduling (experimental)",
843 createIterativeILPMachineScheduler);
844
845LLVM_READNONE
846static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
847 if (!GPU.empty())
848 return GPU;
849
850 // Need to default to a target with flat support for HSA.
851 if (TT.isAMDGCN())
852 return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
853
854 return "r600";
855}
856
857static Reloc::Model getEffectiveRelocModel() {
858 // The AMDGPU toolchain only supports generating shared objects, so we
859 // must always use PIC.
860 return Reloc::PIC_;
861}
862
863AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
864 StringRef CPU, StringRef FS,
865 const TargetOptions &Options,
866 std::optional<Reloc::Model> RM,
867 std::optional<CodeModel::Model> CM,
868 CodeGenOptLevel OptLevel)
869 : CodeGenTargetMachineImpl(
870 T, TT.computeDataLayout(), TT, getGPUOrDefault(TT, GPU: CPU), FS, Options,
871 getEffectiveRelocModel(), getEffectiveCodeModel(CM, Default: CodeModel::Small),
872 OptLevel),
873 TLOF(createTLOF(TT: getTargetTriple())) {
874 initAsmInfo();
875 if (TT.isAMDGCN()) {
876 if (getMCSubtargetInfo().checkFeatures(FS: "+wavefrontsize64"))
877 MRI.reset(p: llvm::createGCNMCRegisterInfo(DwarfFlavour: AMDGPUDwarfFlavour::Wave64));
878 else if (getMCSubtargetInfo().checkFeatures(FS: "+wavefrontsize32"))
879 MRI.reset(p: llvm::createGCNMCRegisterInfo(DwarfFlavour: AMDGPUDwarfFlavour::Wave32));
880 }
881}
882
883bool AMDGPUTargetMachine::EnableFunctionCalls = false;
884bool AMDGPUTargetMachine::EnableObjectLinking = false;
885bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
886
887AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
888
889StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
890 Attribute GPUAttr = F.getFnAttribute(Kind: "target-cpu");
891 return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
892}
893
894StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
895 Attribute FSAttr = F.getFnAttribute(Kind: "target-features");
896
897 return FSAttr.isValid() ? FSAttr.getValueAsString()
898 : getTargetFeatureString();
899}
900
901llvm::ScheduleDAGInstrs *
902AMDGPUTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
903 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
904 ScheduleDAGMILive *DAG = createSchedLive(C);
905 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
906 if (ST.shouldClusterStores())
907 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
908 return DAG;
909}
910
911/// Predicate for Internalize pass.
912static bool mustPreserveGV(const GlobalValue &GV) {
913 if (const Function *F = dyn_cast<Function>(Val: &GV))
914 return F->isDeclaration() || F->getName().starts_with(Prefix: "__asan_") ||
915 F->getName().starts_with(Prefix: "__sanitizer_") ||
916 AMDGPU::isEntryFunctionCC(CC: F->getCallingConv());
917
918 GV.removeDeadConstantUsers();
919 return !GV.use_empty();
920}
921
922void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
923 if (EnableAMDGPUAliasAnalysis)
924 AAM.registerFunctionAnalysis<AMDGPUAA>();
925}
926
927static Expected<ScanOptions>
928parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
929 if (Params.empty())
930 return ScanOptions::Iterative;
931 Params.consume_front(Prefix: "strategy=");
932 auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
933 .Case(S: "dpp", Value: ScanOptions::DPP)
934 .Cases(CaseStrings: {"iterative", ""}, Value: ScanOptions::Iterative)
935 .Case(S: "none", Value: ScanOptions::None)
936 .Default(Value: std::nullopt);
937 if (Result)
938 return *Result;
939 return make_error<StringError>(Args: "invalid parameter", Args: inconvertibleErrorCode());
940}
941
942Expected<AMDGPUAttributorOptions>
943parseAMDGPUAttributorPassOptions(StringRef Params) {
944 AMDGPUAttributorOptions Result;
945 while (!Params.empty()) {
946 StringRef ParamName;
947 std::tie(args&: ParamName, args&: Params) = Params.split(Separator: ';');
948 if (ParamName == "closed-world") {
949 Result.IsClosedWorld = true;
950 } else {
951 return make_error<StringError>(
952 Args: formatv(Fmt: "invalid AMDGPUAttributor pass parameter '{0}' ", Vals&: ParamName)
953 .str(),
954 Args: inconvertibleErrorCode());
955 }
956 }
957 return Result;
958}
959
960void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
961
962#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
963#include "llvm/Passes/TargetPassRegistry.inc"
964
965 PB.registerPipelineParsingCallback(
966 C: [this](StringRef Name, CGSCCPassManager &PM,
967 ArrayRef<PassBuilder::PipelineElement> Pipeline) {
968 if (Name == "amdgpu-attributor-cgscc" && getTargetTriple().isAMDGCN()) {
969 PM.addPass(Pass: AMDGPUAttributorCGSCCPass(
970 *static_cast<GCNTargetMachine *>(this)));
971 return true;
972 }
973 return false;
974 });
975
976 PB.registerScalarOptimizerLateEPCallback(
977 C: [](FunctionPassManager &FPM, OptimizationLevel Level) {
978 if (Level == OptimizationLevel::O0)
979 return;
980
981 FPM.addPass(Pass: InferAddressSpacesPass());
982 });
983
984 PB.registerVectorizerEndEPCallback(
985 C: [](FunctionPassManager &FPM, OptimizationLevel Level) {
986 if (Level == OptimizationLevel::O0)
987 return;
988
989 FPM.addPass(Pass: InferAddressSpacesPass());
990 });
991
992 PB.registerPipelineEarlySimplificationEPCallback(
993 C: [this](ModulePassManager &PM, OptimizationLevel Level,
994 ThinOrFullLTOPhase Phase) {
995 if (!isLTOPreLink(Phase) && getTargetTriple().isAMDGCN()) {
996 // When we are not using -fgpu-rdc, we can run accelerator code
997 // selection relatively early, but still after linking to prevent
998 // eager removal of potentially reachable symbols.
999 if (EnableHipStdPar) {
1000 PM.addPass(Pass: HipStdParMathFixupPass());
1001 PM.addPass(Pass: HipStdParAcceleratorCodeSelectionPass());
1002 }
1003
1004 PM.addPass(Pass: AMDGPUPrintfRuntimeBindingPass());
1005 }
1006
1007 if (Level == OptimizationLevel::O0)
1008 return;
1009
1010 // We don't want to run internalization at per-module stage.
1011 if (InternalizeSymbols && !isLTOPreLink(Phase)) {
1012 PM.addPass(Pass: InternalizePass(mustPreserveGV));
1013 PM.addPass(Pass: GlobalDCEPass());
1014 }
1015
1016 if (EarlyInlineAll && !EnableFunctionCalls)
1017 PM.addPass(Pass: AMDGPUAlwaysInlinePass());
1018 });
1019
1020 PB.registerPeepholeEPCallback(
1021 C: [](FunctionPassManager &FPM, OptimizationLevel Level) {
1022 if (Level == OptimizationLevel::O0)
1023 return;
1024
1025 FPM.addPass(Pass: AMDGPUUseNativeCallsPass());
1026 if (EnableLibCallSimplify)
1027 FPM.addPass(Pass: AMDGPUSimplifyLibCallsPass());
1028
1029 if (EnableUniformIntrinsicCombine)
1030 FPM.addPass(Pass: AMDGPUUniformIntrinsicCombinePass());
1031 });
1032
1033 PB.registerCGSCCOptimizerLateEPCallback(
1034 C: [this](CGSCCPassManager &PM, OptimizationLevel Level) {
1035 if (Level == OptimizationLevel::O0)
1036 return;
1037
1038 FunctionPassManager FPM;
1039
1040 // Add promote kernel arguments pass to the opt pipeline right before
1041 // infer address spaces which is needed to do actual address space
1042 // rewriting.
1043 if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
1044 EnablePromoteKernelArguments)
1045 FPM.addPass(Pass: AMDGPUPromoteKernelArgumentsPass());
1046
1047 // Add infer address spaces pass to the opt pipeline after inlining
1048 // but before SROA to increase SROA opportunities.
1049 FPM.addPass(Pass: InferAddressSpacesPass());
1050
1051 // This should run after inlining to have any chance of doing
1052 // anything, and before other cleanup optimizations.
1053 FPM.addPass(Pass: AMDGPULowerKernelAttributesPass());
1054
1055 // Promote alloca to vector before SROA and loop unroll. If we
1056 // manage to eliminate allocas before unroll we may choose to unroll
1057 // less.
1058 FPM.addPass(Pass: AMDGPUPromoteAllocaToVectorPass(*this));
1059
1060 PM.addPass(Pass: createCGSCCToFunctionPassAdaptor(Pass: std::move(FPM)));
1061 });
1062
1063 // FIXME: Why is AMDGPUAttributor not in CGSCC?
1064 PB.registerOptimizerLastEPCallback(C: [this](ModulePassManager &MPM,
1065 OptimizationLevel Level,
1066 ThinOrFullLTOPhase Phase) {
1067 if (Level != OptimizationLevel::O0) {
1068 if (!isLTOPreLink(Phase)) {
1069 if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) {
1070 AMDGPUAttributorOptions Opts;
1071 MPM.addPass(Pass: AMDGPUAttributorPass(*this, Opts, Phase));
1072 }
1073 }
1074 }
1075 });
1076
1077 PB.registerFullLinkTimeOptimizationLastEPCallback(
1078 C: [this](ModulePassManager &PM, OptimizationLevel Level) {
1079 // When we are using -fgpu-rdc, we can only run accelerator code
1080 // selection after linking to prevent, otherwise we end up removing
1081 // potentially reachable symbols that were exported as external in other
1082 // modules.
1083 if (EnableHipStdPar) {
1084 PM.addPass(Pass: HipStdParMathFixupPass());
1085 PM.addPass(Pass: HipStdParAcceleratorCodeSelectionPass());
1086 }
1087 // We want to support the -lto-partitions=N option as "best effort".
1088 // For that, we need to lower LDS earlier in the pipeline before the
1089 // module is partitioned for codegen.
1090 if (EnableLowerExecSync)
1091 PM.addPass(Pass: AMDGPULowerExecSyncPass());
1092 if (EnableSwLowerLDS)
1093 PM.addPass(Pass: AMDGPUSwLowerLDSPass());
1094 if (EnableLowerModuleLDS)
1095 PM.addPass(Pass: AMDGPULowerModuleLDSPass(*this));
1096 if (Level != OptimizationLevel::O0) {
1097 // We only want to run this with O2 or higher since inliner and SROA
1098 // don't run in O1.
1099 if (Level != OptimizationLevel::O1) {
1100 PM.addPass(
1101 Pass: createModuleToFunctionPassAdaptor(Pass: InferAddressSpacesPass()));
1102 }
1103 // Do we really need internalization in LTO?
1104 if (InternalizeSymbols) {
1105 PM.addPass(Pass: InternalizePass(mustPreserveGV));
1106 PM.addPass(Pass: GlobalDCEPass());
1107 }
1108 if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) {
1109 AMDGPUAttributorOptions Opt;
1110 if (HasClosedWorldAssumption)
1111 Opt.IsClosedWorld = true;
1112 PM.addPass(Pass: AMDGPUAttributorPass(
1113 *this, Opt, ThinOrFullLTOPhase::FullLTOPostLink));
1114 }
1115 }
1116 if (!NoKernelInfoEndLTO) {
1117 FunctionPassManager FPM;
1118 FPM.addPass(Pass: KernelInfoPrinter(this));
1119 PM.addPass(Pass: createModuleToFunctionPassAdaptor(Pass: std::move(FPM)));
1120 }
1121 });
1122
1123 PB.registerRegClassFilterParsingCallback(
1124 C: [](StringRef FilterName) -> RegAllocFilterFunc {
1125 if (FilterName == "sgpr")
1126 return onlyAllocateSGPRs;
1127 if (FilterName == "vgpr")
1128 return onlyAllocateVGPRs;
1129 if (FilterName == "wwm")
1130 return onlyAllocateWWMRegs;
1131 return nullptr;
1132 });
1133}
1134
1135bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
1136 unsigned DestAS) const {
1137 return AMDGPU::isFlatGlobalAddrSpace(AS: SrcAS) &&
1138 AMDGPU::isFlatGlobalAddrSpace(AS: DestAS);
1139}
1140
1141unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
1142 if (auto *Arg = dyn_cast<Argument>(Val: V);
1143 Arg &&
1144 AMDGPU::isModuleEntryFunctionCC(CC: Arg->getParent()->getCallingConv()) &&
1145 !Arg->hasByRefAttr())
1146 return AMDGPUAS::GLOBAL_ADDRESS;
1147
1148 const auto *LD = dyn_cast<LoadInst>(Val: V);
1149 if (!LD) // TODO: Handle invariant load like constant.
1150 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
1151
1152 // It must be a generic pointer loaded.
1153 assert(V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
1154
1155 const auto *Ptr = LD->getPointerOperand();
1156 if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
1157 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
1158 // For a generic pointer loaded from the constant memory, it could be assumed
1159 // as a global pointer since the constant memory is only populated on the
1160 // host side. As implied by the offload programming model, only global
1161 // pointers could be referenced on the host side.
1162 return AMDGPUAS::GLOBAL_ADDRESS;
1163}
1164
1165std::pair<const Value *, unsigned>
1166AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
1167 if (auto *II = dyn_cast<IntrinsicInst>(Val: V)) {
1168 switch (II->getIntrinsicID()) {
1169 case Intrinsic::amdgcn_is_shared:
1170 return std::pair(II->getArgOperand(i: 0), AMDGPUAS::LOCAL_ADDRESS);
1171 case Intrinsic::amdgcn_is_private:
1172 return std::pair(II->getArgOperand(i: 0), AMDGPUAS::PRIVATE_ADDRESS);
1173 default:
1174 break;
1175 }
1176 return std::pair(nullptr, -1);
1177 }
1178 // Check the global pointer predication based on
1179 // (!is_share(p) && !is_private(p)). Note that logic 'and' is commutative and
1180 // the order of 'is_shared' and 'is_private' is not significant.
1181 Value *Ptr;
1182 if (match(
1183 V: const_cast<Value *>(V),
1184 P: m_c_And(L: m_Not(V: m_Intrinsic<Intrinsic::amdgcn_is_shared>(Op0: m_Value(V&: Ptr))),
1185 R: m_Not(V: m_Intrinsic<Intrinsic::amdgcn_is_private>(
1186 Op0: m_Deferred(V: Ptr))))))
1187 return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);
1188
1189 return std::pair(nullptr, -1);
1190}
1191
1192unsigned
1193AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
1194 switch (Kind) {
1195 case PseudoSourceValue::Stack:
1196 case PseudoSourceValue::FixedStack:
1197 return AMDGPUAS::PRIVATE_ADDRESS;
1198 case PseudoSourceValue::ConstantPool:
1199 case PseudoSourceValue::GOT:
1200 case PseudoSourceValue::JumpTable:
1201 case PseudoSourceValue::GlobalValueCallEntry:
1202 case PseudoSourceValue::ExternalSymbolCallEntry:
1203 return AMDGPUAS::CONSTANT_ADDRESS;
1204 }
1205 return AMDGPUAS::FLAT_ADDRESS;
1206}
1207
1208bool AMDGPUTargetMachine::splitModule(
1209 Module &M, unsigned NumParts,
1210 function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
1211 // FIXME(?): Would be better to use an already existing Analysis/PassManager,
1212 // but all current users of this API don't have one ready and would need to
1213 // create one anyway. Let's hide the boilerplate for now to keep it simple.
1214
1215 LoopAnalysisManager LAM;
1216 FunctionAnalysisManager FAM;
1217 CGSCCAnalysisManager CGAM;
1218 ModuleAnalysisManager MAM;
1219
1220 PassBuilder PB(this);
1221 PB.registerModuleAnalyses(MAM);
1222 PB.registerFunctionAnalyses(FAM);
1223 PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
1224
1225 ModulePassManager MPM;
1226 MPM.addPass(Pass: AMDGPUSplitModulePass(NumParts, ModuleCallback));
1227 MPM.run(IR&: M, AM&: MAM);
1228 return true;
1229}
1230
1231//===----------------------------------------------------------------------===//
1232// GCN Target Machine (SI+)
1233//===----------------------------------------------------------------------===//
1234
1235GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
1236 StringRef CPU, StringRef FS,
1237 const TargetOptions &Options,
1238 std::optional<Reloc::Model> RM,
1239 std::optional<CodeModel::Model> CM,
1240 CodeGenOptLevel OL, bool JIT)
1241 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
1242
/// Named values for an OOB-mode setting. The explicit numbering is part of
/// the definition and must not be reordered.
enum class OOBFlagValue {
  Any = 0,     // numeric value 0
  Relaxed = 1, // numeric value 1
  Strict = 2,  // numeric value 2
};
1248
1249/// Returns the OOB mode encoded by a module flag.
1250/// An absent flag defaults to Any.
1251static OOBFlagValue getOOBFlagValue(const Module &M, StringRef FlagName) {
1252 const auto *Flag =
1253 mdconst::dyn_extract_or_null<ConstantInt>(MD: M.getModuleFlag(Key: FlagName));
1254 if (!Flag)
1255 return OOBFlagValue::Any;
1256 return static_cast<OOBFlagValue>(Flag->getZExtValue());
1257}
1258
1259const TargetSubtargetInfo *
1260GCNTargetMachine::getSubtargetImpl(const Function &F) const {
1261 StringRef GPU = getGPUName(F);
1262 StringRef FS = getFeatureString(F);
1263
1264 const Module &M = *F.getParent();
1265 OOBFlagValue BufOOB = getOOBFlagValue(M, FlagName: AMDGPUOOBMode::BufferFlag);
1266 OOBFlagValue TBufOOB = getOOBFlagValue(M, FlagName: AMDGPUOOBMode::TBufferFlag);
1267 bool BufRelaxed = BufOOB == OOBFlagValue::Relaxed;
1268 bool TBufRelaxed = TBufOOB == OOBFlagValue::Relaxed;
1269 SmallString<128> SubtargetKey(GPU);
1270 SubtargetKey.append(RHS: FS);
1271 if (BufRelaxed)
1272 SubtargetKey.append(RHS: ",buf-oob=1");
1273 if (TBufRelaxed)
1274 SubtargetKey.append(RHS: ",tbuf-oob=1");
1275
1276 auto &I = SubtargetMap[SubtargetKey];
1277 if (!I) {
1278 I = std::make_unique<GCNSubtarget>(args: TargetTriple, args&: GPU, args&: FS, args: *this, args&: BufRelaxed,
1279 args&: TBufRelaxed);
1280 }
1281
1282 I->setScalarizeGlobalBehavior(ScalarizeGlobal);
1283
1284 return I.get();
1285}
1286
1287TargetTransformInfo
1288GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
1289 return TargetTransformInfo(std::make_unique<GCNTTIImpl>(args: this, args: F));
1290}
1291
1292Error GCNTargetMachine::buildCodeGenPipeline(
1293 ModulePassManager &MPM, ModuleAnalysisManager &MAM, raw_pwrite_stream &Out,
1294 raw_pwrite_stream *DwoOut, CodeGenFileType FileType,
1295 const CGPassBuilderOption &Opts, MCContext &Ctx,
1296 PassInstrumentationCallbacks *PIC) {
1297 AMDGPUCodeGenPassBuilder CGPB(*this, Opts, PIC);
1298 return CGPB.buildPipeline(MPM, MAM, Out, DwoOut, FileType, Ctx);
1299}
1300
1301ScheduleDAGInstrs *
1302GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
1303 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1304 if (ST.enableSIScheduler())
1305 return createSIMachineScheduler(C);
1306
1307 StringRef SchedStrategy = AMDGPU::getSchedStrategy(F: C->MF->getFunction());
1308
1309 if (SchedStrategy == "max-ilp")
1310 return createGCNMaxILPMachineScheduler(C);
1311
1312 if (SchedStrategy == "max-memory-clause")
1313 return createGCNMaxMemoryClauseMachineScheduler(C);
1314
1315 if (SchedStrategy == "iterative-ilp")
1316 return createIterativeILPMachineScheduler(C);
1317
1318 if (SchedStrategy == "iterative-minreg")
1319 return createMinRegScheduler(C);
1320
1321 if (SchedStrategy == "iterative-maxocc")
1322 return createIterativeGCNMaxOccupancyMachineScheduler(C);
1323
1324 if (SchedStrategy == "coexec") {
1325 diagnoseUnsupportedCoExecSchedulerSelection(F: C->MF->getFunction(), ST);
1326 return createGCNCoExecMachineScheduler(C);
1327 }
1328
1329 return createGCNMaxOccupancyMachineScheduler(C);
1330}
1331
1332ScheduleDAGInstrs *
1333GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
1334 if (useNoopPostScheduler(F: C->MF->getFunction()))
1335 return createGCNNoopPostMachineScheduler(C);
1336
1337 ScheduleDAGMI *DAG =
1338 new GCNPostScheduleDAGMILive(C, std::make_unique<PostGenericScheduler>(args&: C),
1339 /*RemoveKillFlags=*/true);
1340 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1341 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
1342 if (ST.shouldClusterStores())
1343 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
1344 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PostRA));
1345 if ((EnableVOPD.getNumOccurrences() ||
1346 getOptLevel() >= CodeGenOptLevel::Less) &&
1347 EnableVOPD)
1348 DAG->addMutation(Mutation: createVOPDPairingMutation());
1349 DAG->addMutation(Mutation: createAMDGPUExportClusteringDAGMutation());
1350 DAG->addMutation(Mutation: createAMDGPUBarrierLatencyDAGMutation(MF: C->MF));
1351 DAG->addMutation(Mutation: createAMDGPUHazardLatencyDAGMutation(MF: C->MF));
1352 return DAG;
1353}
1354//===----------------------------------------------------------------------===//
1355// AMDGPU Legacy Pass Setup
1356//===----------------------------------------------------------------------===//
1357
1358std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
1359 return getStandardCSEConfigForOpt(Level: TM->getOptLevel());
1360}
1361
1362namespace {
1363
1364class GCNPassConfig final : public AMDGPUPassConfig {
1365public:
1366 GCNPassConfig(TargetMachine &TM, PassManagerBase &PM)
1367 : AMDGPUPassConfig(TM, PM) {
1368 substitutePass(StandardID: &PostRASchedulerID, TargetID: &PostMachineSchedulerID);
1369 }
1370
1371 GCNTargetMachine &getGCNTargetMachine() const {
1372 return getTM<GCNTargetMachine>();
1373 }
1374
1375 bool addPreISel() override;
1376 void addMachineSSAOptimization() override;
1377 bool addILPOpts() override;
1378 bool addInstSelector() override;
1379 bool addIRTranslator() override;
1380 void addPreLegalizeMachineIR() override;
1381 bool addLegalizeMachineIR() override;
1382 void addPreRegBankSelect() override;
1383 bool addRegBankSelect() override;
1384 void addPreGlobalInstructionSelect() override;
1385 bool addGlobalInstructionSelect() override;
1386 void addPreRegAlloc() override;
1387 void addFastRegAlloc() override;
1388 void addOptimizedRegAlloc() override;
1389
1390 FunctionPass *createSGPRAllocPass(bool Optimized);
1391 FunctionPass *createVGPRAllocPass(bool Optimized);
1392 FunctionPass *createWWMRegAllocPass(bool Optimized);
1393 FunctionPass *createRegAllocPass(bool Optimized) override;
1394
1395 bool addRegAssignAndRewriteFast() override;
1396 bool addRegAssignAndRewriteOptimized() override;
1397
1398 bool addPreRewrite() override;
1399 void addPostRegAlloc() override;
1400 void addPreSched2() override;
1401 void addPreEmitPass() override;
1402 void addPostBBSections() override;
1403};
1404
1405} // end anonymous namespace
1406
1407AMDGPUPassConfig::AMDGPUPassConfig(TargetMachine &TM, PassManagerBase &PM)
1408 : TargetPassConfig(TM, PM) {
1409 // Exceptions and StackMaps are not supported, so these passes will never do
1410 // anything.
1411 disablePass(PassID: &StackMapLivenessID);
1412 disablePass(PassID: &FuncletLayoutID);
1413 // Garbage collection is not supported.
1414 disablePass(PassID: &GCLoweringID);
1415 disablePass(PassID: &ShadowStackGCLoweringID);
1416}
1417
1418void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
1419 if (getOptLevel() == CodeGenOptLevel::Aggressive)
1420 addPass(P: createGVNPass());
1421 else
1422 addPass(P: createEarlyCSEPass());
1423}
1424
1425void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
1426 if (isPassEnabled(Opt: EnableLoopPrefetch, Level: CodeGenOptLevel::Aggressive))
1427 addPass(P: createLoopDataPrefetchPass());
1428 addPass(P: createSeparateConstOffsetFromGEPPass());
1429 // ReassociateGEPs exposes more opportunities for SLSR. See
1430 // the example in reassociate-geps-and-slsr.ll.
1431 addPass(P: createStraightLineStrengthReducePass());
1432 // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
1433 // EarlyCSE can reuse.
1434 addEarlyCSEOrGVNPass();
1435 // Run NaryReassociate after EarlyCSE/GVN to be more effective.
1436 addPass(P: createNaryReassociatePass());
1437 // NaryReassociate on GEPs creates redundant common expressions, so run
1438 // EarlyCSE after it.
1439 addPass(P: createEarlyCSEPass());
1440}
1441
1442void AMDGPUPassConfig::addIRPasses() {
1443 const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
1444
1445 if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN())
1446 addPass(P: createAMDGPURemoveIncompatibleFunctionsPass(&TM));
1447
1448 // There is no reason to run these.
1449 disablePass(PassID: &StackMapLivenessID);
1450 disablePass(PassID: &FuncletLayoutID);
1451 disablePass(PassID: &PatchableFunctionID);
1452
1453 if (TM.getTargetTriple().isAMDGCN())
1454 addPass(P: createAMDGPUPrintfRuntimeBinding());
1455
1456 if (LowerCtorDtor)
1457 addPass(P: createAMDGPUCtorDtorLoweringLegacyPass());
1458
1459 if (TM.getTargetTriple().isAMDGCN() &&
1460 isPassEnabled(Opt: EnableImageIntrinsicOptimizer))
1461 addPass(P: createAMDGPUImageIntrinsicOptimizerPass(&TM));
1462
1463 if (EnableUniformIntrinsicCombine)
1464 addPass(P: createAMDGPUUniformIntrinsicCombineLegacyPass());
1465
1466 // This can be disabled by passing ::Disable here or on the command line
1467 // with --expand-variadics-override=disable.
1468 addPass(P: createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
1469
1470 // Function calls are not supported, so make sure we inline everything.
1471 addPass(P: createAMDGPUAlwaysInlinePass());
1472 addPass(P: createAlwaysInlinerLegacyPass());
1473
1474 // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
1475 if (TM.getTargetTriple().getArch() == Triple::r600)
1476 addPass(P: createR600OpenCLImageTypeLoweringPass());
1477
1478 // Make enqueued block runtime handles externally visible.
1479 addPass(P: createAMDGPUExportKernelRuntimeHandlesLegacyPass());
1480
1481 // Lower special LDS accesses.
1482 if (EnableLowerExecSync)
1483 addPass(P: createAMDGPULowerExecSyncLegacyPass());
1484
1485 // Lower LDS accesses to global memory pass if address sanitizer is enabled.
1486 if (EnableSwLowerLDS)
1487 addPass(P: createAMDGPUSwLowerLDSLegacyPass());
1488
1489 // Runs before PromoteAlloca so the latter can account for function uses
1490 if (EnableLowerModuleLDS) {
1491 addPass(P: createAMDGPULowerModuleLDSLegacyPass(TM: &TM));
1492 }
1493
1494 // Run atomic optimizer before Atomic Expand
1495 if ((TM.getTargetTriple().isAMDGCN()) &&
1496 (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
1497 (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
1498 addPass(P: createAMDGPUAtomicOptimizerPass(ScanStrategy: AMDGPUAtomicOptimizerStrategy));
1499 }
1500
1501 addPass(P: createAtomicExpandLegacyPass());
1502
1503 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1504 addPass(P: createAMDGPUPromoteAlloca());
1505
1506 if (isPassEnabled(Opt: EnableScalarIRPasses))
1507 addStraightLineScalarOptimizationPasses();
1508
1509 if (EnableAMDGPUAliasAnalysis) {
1510 addPass(P: createAMDGPUAAWrapperPass());
1511 addPass(P: createExternalAAWrapperPass(Callback: [](Pass &P, Function &,
1512 AAResults &AAR) {
1513 if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
1514 AAR.addAAResult(AAResult&: WrapperPass->getResult());
1515 }));
1516 }
1517
1518 if (TM.getTargetTriple().isAMDGCN()) {
1519 // TODO: May want to move later or split into an early and late one.
1520 addPass(P: createAMDGPUCodeGenPreparePass());
1521 }
1522
1523 // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
1524 // have expanded.
1525 if (TM.getOptLevel() > CodeGenOptLevel::Less)
1526 addPass(P: createLICMPass());
1527 }
1528
1529 TargetPassConfig::addIRPasses();
1530
1531 // EarlyCSE is not always strong enough to clean up what LSR produces. For
1532 // example, GVN can combine
1533 //
1534 // %0 = add %a, %b
1535 // %1 = add %b, %a
1536 //
1537 // and
1538 //
1539 // %0 = shl nsw %a, 2
1540 // %1 = shl %a, 2
1541 //
1542 // but EarlyCSE can do neither of them.
1543 if (isPassEnabled(Opt: EnableScalarIRPasses))
1544 addEarlyCSEOrGVNPass();
1545}
1546
1547void AMDGPUPassConfig::addCodeGenPrepare() {
1548 if (TM->getTargetTriple().isAMDGCN() &&
1549 TM->getOptLevel() > CodeGenOptLevel::None)
1550 addPass(P: createAMDGPUPreloadKernelArgumentsLegacyPass(TM));
1551
1552 if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments)
1553 addPass(P: createAMDGPULowerKernelArgumentsPass());
1554
1555 TargetPassConfig::addCodeGenPrepare();
1556
1557 if (isPassEnabled(Opt: EnableLoadStoreVectorizer))
1558 addPass(P: createLoadStoreVectorizerPass());
1559
1560 if (TM->getTargetTriple().isAMDGCN()) {
1561 // This lowering has been placed after codegenprepare to take advantage of
1562 // address mode matching (which is why it isn't put with the LDS lowerings).
1563 // It could be placed anywhere before uniformity annotations (an analysis
1564 // that it changes by splitting up fat pointers into their components)
1565 // but has been put before switch lowering and CFG flattening so that those
1566 // passes can run on the more optimized control flow this pass creates in
1567 // many cases.
1568 addPass(P: createAMDGPULowerBufferFatPointersPass());
1569 addPass(P: createAMDGPULowerIntrinsicsLegacyPass());
1570 }
1571
1572 // LowerSwitch pass may introduce unreachable blocks that can
1573 // cause unexpected behavior for subsequent passes. Placing it
1574 // here seems better that these blocks would get cleaned up by
1575 // UnreachableBlockElim inserted next in the pass flow.
1576 addPass(P: createLowerSwitchPass());
1577}
1578
1579bool AMDGPUPassConfig::addPreISel() {
1580 if (TM->getOptLevel() > CodeGenOptLevel::None)
1581 addPass(P: createFlattenCFGPass());
1582 return false;
1583}
1584
1585bool AMDGPUPassConfig::addInstSelector() {
1586 addPass(P: createAMDGPUISelDag(TM&: getAMDGPUTargetMachine(), OptLevel: getOptLevel()));
1587 return false;
1588}
1589
1590bool AMDGPUPassConfig::addGCPasses() {
1591 // Do nothing. GC is not supported.
1592 return false;
1593}
1594
1595//===----------------------------------------------------------------------===//
1596// GCN Legacy Pass Setup
1597//===----------------------------------------------------------------------===//
1598
1599bool GCNPassConfig::addPreISel() {
1600 AMDGPUPassConfig::addPreISel();
1601
1602 if (TM->getOptLevel() > CodeGenOptLevel::None) {
1603 addPass(P: createSinkingPass());
1604 addPass(P: createAMDGPULateCodeGenPrepareLegacyPass());
1605 }
1606
1607 // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
1608 // regions formed by them.
1609 addPass(PassID: &AMDGPUUnifyDivergentExitNodesID);
1610 addPass(P: createFixIrreduciblePass());
1611 addPass(P: createUnifyLoopExitsPass());
1612 addPass(P: createStructurizeCFGPass(SkipUniformRegions: false)); // true -> SkipUniformRegions
1613
1614 addPass(P: createAMDGPUAnnotateUniformValuesLegacy());
1615 addPass(P: createSIAnnotateControlFlowLegacyPass());
1616 // TODO: Move this right after structurizeCFG to avoid extra divergence
1617 // analysis. This depends on stopping SIAnnotateControlFlow from making
1618 // control flow modifications.
1619 addPass(P: createAMDGPURewriteUndefForPHILegacyPass());
1620
1621 // SDAG requires LCSSA, GlobalISel does not. Disable LCSSA for -global-isel
1622 // without any of the fallback options.
1623 if (getCGPassBuilderOption().EnableGlobalISelOption !=
1624 cl::boolOrDefault::BOU_TRUE ||
1625 !isGlobalISelAbortEnabled())
1626 addPass(P: createLCSSAPass());
1627
1628 if (TM->getOptLevel() > CodeGenOptLevel::Less)
1629 addPass(PassID: &AMDGPUPerfHintAnalysisLegacyID);
1630
1631 return false;
1632}
1633
1634void GCNPassConfig::addMachineSSAOptimization() {
1635 TargetPassConfig::addMachineSSAOptimization();
1636
1637 // We want to fold operands after PeepholeOptimizer has run (or as part of
1638 // it), because it will eliminate extra copies making it easier to fold the
1639 // real source operand. We want to eliminate dead instructions after, so that
1640 // we see fewer uses of the copies. We then need to clean up the dead
1641 // instructions leftover after the operands are folded as well.
1642 //
1643 // XXX - Can we get away without running DeadMachineInstructionElim again?
1644 addPass(PassID: &SIFoldOperandsLegacyID);
1645 if (EnableDPPCombine)
1646 addPass(PassID: &GCNDPPCombineLegacyID);
1647 addPass(PassID: &SILoadStoreOptimizerLegacyID);
1648 if (isPassEnabled(Opt: EnableSDWAPeephole)) {
1649 addPass(PassID: &SIPeepholeSDWALegacyID);
1650 addPass(PassID: &EarlyMachineLICMID);
1651 addPass(PassID: &MachineCSELegacyID);
1652 addPass(PassID: &SIFoldOperandsLegacyID);
1653 }
1654 addPass(PassID: &DeadMachineInstructionElimID);
1655 addPass(P: createSIShrinkInstructionsLegacyPass());
1656}
1657
1658bool GCNPassConfig::addILPOpts() {
1659 if (EnableEarlyIfConversion)
1660 addPass(PassID: &EarlyIfConverterLegacyID);
1661
1662 TargetPassConfig::addILPOpts();
1663 return false;
1664}
1665
1666bool GCNPassConfig::addInstSelector() {
1667 AMDGPUPassConfig::addInstSelector();
1668 addPass(PassID: &SIFixSGPRCopiesLegacyID);
1669 addPass(P: createSILowerI1CopiesLegacyPass());
1670 return false;
1671}
1672
1673bool GCNPassConfig::addIRTranslator() {
1674 addPass(P: new IRTranslator(getOptLevel()));
1675 return false;
1676}
1677
1678void GCNPassConfig::addPreLegalizeMachineIR() {
1679 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1680 addPass(P: createAMDGPUPreLegalizeCombiner(IsOptNone));
1681 addPass(P: new Localizer());
1682}
1683
1684bool GCNPassConfig::addLegalizeMachineIR() {
1685 addPass(P: new Legalizer());
1686 return false;
1687}
1688
1689void GCNPassConfig::addPreRegBankSelect() {
1690 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1691 addPass(P: createAMDGPUPostLegalizeCombiner(IsOptNone));
1692 addPass(P: createAMDGPUGlobalISelDivergenceLoweringPass());
1693}
1694
1695bool GCNPassConfig::addRegBankSelect() {
1696 addPass(P: createAMDGPURegBankSelectPass());
1697 addPass(P: createAMDGPURegBankLegalizePass());
1698 return false;
1699}
1700
1701void GCNPassConfig::addPreGlobalInstructionSelect() {
1702 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1703 addPass(P: createAMDGPURegBankCombiner(IsOptNone));
1704}
1705
1706bool GCNPassConfig::addGlobalInstructionSelect() {
1707 addPass(P: new InstructionSelect(getOptLevel()));
1708 return false;
1709}
1710
1711void GCNPassConfig::addFastRegAlloc() {
1712 // FIXME: We have to disable the verifier here because of PHIElimination +
1713 // TwoAddressInstructions disabling it.
1714
1715 // This must be run immediately after phi elimination and before
1716 // TwoAddressInstructions, otherwise the processing of the tied operand of
1717 // SI_ELSE will introduce a copy of the tied operand source after the else.
1718 insertPass(TargetPassID: &PHIEliminationID, InsertedPassID: &SILowerControlFlowLegacyID);
1719
1720 insertPass(TargetPassID: &TwoAddressInstructionPassID, InsertedPassID: &SIWholeQuadModeID);
1721
1722 TargetPassConfig::addFastRegAlloc();
1723}
1724
1725void GCNPassConfig::addPreRegAlloc() {
1726 if (getOptLevel() != CodeGenOptLevel::None)
1727 addPass(PassID: &AMDGPUPrepareAGPRAllocLegacyID);
1728}
1729
1730void GCNPassConfig::addOptimizedRegAlloc() {
1731 if (EnableDCEInRA)
1732 insertPass(TargetPassID: &DetectDeadLanesID, InsertedPassID: &DeadMachineInstructionElimID);
1733
1734 // FIXME: when an instruction has a Killed operand, and the instruction is
1735 // inside a bundle, seems only the BUNDLE instruction appears as the Kills of
1736 // the register in LiveVariables, this would trigger a failure in verifier,
1737 // we should fix it and enable the verifier.
1738 if (OptVGPRLiveRange)
1739 insertPass(TargetPassID: &LiveVariablesID, InsertedPassID: &SIOptimizeVGPRLiveRangeLegacyID);
1740
1741 // This must be run immediately after phi elimination and before
1742 // TwoAddressInstructions, otherwise the processing of the tied operand of
1743 // SI_ELSE will introduce a copy of the tied operand source after the else.
1744 insertPass(TargetPassID: &PHIEliminationID, InsertedPassID: &SILowerControlFlowLegacyID);
1745
1746 if (EnableRewritePartialRegUses)
1747 insertPass(TargetPassID: &RenameIndependentSubregsID, InsertedPassID: &GCNRewritePartialRegUsesID);
1748
1749 if (isPassEnabled(Opt: EnablePreRAOptimizations))
1750 insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &GCNPreRAOptimizationsID);
1751
1752 // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
1753 // instructions that cause scheduling barriers.
1754 insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &SIWholeQuadModeID);
1755
1756 if (OptExecMaskPreRA)
1757 insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &SIOptimizeExecMaskingPreRAID);
1758
1759 // This is not an essential optimization and it has a noticeable impact on
1760 // compilation time, so we only enable it from O2.
1761 if (TM->getOptLevel() > CodeGenOptLevel::Less)
1762 insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &SIFormMemoryClausesID);
1763
1764 TargetPassConfig::addOptimizedRegAlloc();
1765}
1766
1767bool GCNPassConfig::addPreRewrite() {
1768 if (EnableRegReassign)
1769 addPass(PassID: &GCNNSAReassignID);
1770
1771 addPass(PassID: &AMDGPURewriteAGPRCopyMFMALegacyID);
1772 return true;
1773}
1774
1775FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
1776 // Initialize the global default.
1777 llvm::call_once(flag&: InitializeDefaultSGPRRegisterAllocatorFlag,
1778 F&: initializeDefaultSGPRRegisterAllocatorOnce);
1779
1780 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
1781 if (Ctor != useDefaultRegisterAllocator)
1782 return Ctor();
1783
1784 if (Optimized)
1785 return createGreedyRegisterAllocator(F: onlyAllocateSGPRs);
1786
1787 return createFastRegisterAllocator(F: onlyAllocateSGPRs, ClearVirtRegs: false);
1788}
1789
1790FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
1791 // Initialize the global default.
1792 llvm::call_once(flag&: InitializeDefaultVGPRRegisterAllocatorFlag,
1793 F&: initializeDefaultVGPRRegisterAllocatorOnce);
1794
1795 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
1796 if (Ctor != useDefaultRegisterAllocator)
1797 return Ctor();
1798
1799 if (Optimized)
1800 return createGreedyVGPRRegisterAllocator();
1801
1802 return createFastVGPRRegisterAllocator();
1803}
1804
1805FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
1806 // Initialize the global default.
1807 llvm::call_once(flag&: InitializeDefaultWWMRegisterAllocatorFlag,
1808 F&: initializeDefaultWWMRegisterAllocatorOnce);
1809
1810 RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
1811 if (Ctor != useDefaultRegisterAllocator)
1812 return Ctor();
1813
1814 if (Optimized)
1815 return createGreedyWWMRegisterAllocator();
1816
1817 return createFastWWMRegisterAllocator();
1818}
1819
1820FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
1821 llvm_unreachable("should not be used");
1822}
1823
/// Usage error reported when the generic -regalloc option is used instead of
/// the per-register-kind allocator options.
static const char RegAllocOptNotSupportedMessage[] =
    "-regalloc not supported with amdgcn. "
    "Use -sgpr-regalloc, -wwm-regalloc, and -vgpr-regalloc";
1827
1828bool GCNPassConfig::addRegAssignAndRewriteFast() {
1829 if (!usingDefaultRegAlloc())
1830 reportFatalUsageError(reason: RegAllocOptNotSupportedMessage);
1831
1832 addPass(PassID: &GCNPreRALongBranchRegID);
1833
1834 addPass(P: createSGPRAllocPass(Optimized: false));
1835
1836 // Equivalent of PEI for SGPRs.
1837 addPass(PassID: &SILowerSGPRSpillsLegacyID);
1838
1839 // To Allocate wwm registers used in whole quad mode operations (for shaders).
1840 addPass(PassID: &SIPreAllocateWWMRegsLegacyID);
1841
1842 // For allocating other wwm register operands.
1843 addPass(P: createWWMRegAllocPass(Optimized: false));
1844
1845 addPass(PassID: &SILowerWWMCopiesLegacyID);
1846 addPass(PassID: &AMDGPUReserveWWMRegsLegacyID);
1847
1848 // For allocating per-thread VGPRs.
1849 addPass(P: createVGPRAllocPass(Optimized: false));
1850
1851 return true;
1852}
1853
1854bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
1855 if (!usingDefaultRegAlloc())
1856 reportFatalUsageError(reason: RegAllocOptNotSupportedMessage);
1857
1858 addPass(PassID: &GCNPreRALongBranchRegID);
1859
1860 addPass(P: createSGPRAllocPass(Optimized: true));
1861
1862 // Commit allocated register changes. This is mostly necessary because too
1863 // many things rely on the use lists of the physical registers, such as the
1864 // verifier. This is only necessary with allocators which use LiveIntervals,
1865 // since FastRegAlloc does the replacements itself.
1866 addPass(P: createVirtRegRewriter(ClearVirtRegs: false));
1867
1868 // At this point, the sgpr-regalloc has been done and it is good to have the
1869 // stack slot coloring to try to optimize the SGPR spill stack indices before
1870 // attempting the custom SGPR spill lowering.
1871 addPass(PassID: &StackSlotColoringID);
1872
1873 // Equivalent of PEI for SGPRs.
1874 addPass(PassID: &SILowerSGPRSpillsLegacyID);
1875
1876 // To Allocate wwm registers used in whole quad mode operations (for shaders).
1877 addPass(PassID: &SIPreAllocateWWMRegsLegacyID);
1878
1879 // For allocating other whole wave mode registers.
1880 addPass(P: createWWMRegAllocPass(Optimized: true));
1881 addPass(PassID: &SILowerWWMCopiesLegacyID);
1882 addPass(P: createVirtRegRewriter(ClearVirtRegs: false));
1883 addPass(PassID: &AMDGPUReserveWWMRegsLegacyID);
1884
1885 // For allocating per-thread VGPRs.
1886 addPass(P: createVGPRAllocPass(Optimized: true));
1887
1888 addPreRewrite();
1889 addPass(PassID: &VirtRegRewriterID);
1890
1891 addPass(PassID: &AMDGPUMarkLastScratchLoadID);
1892
1893 return true;
1894}
1895
1896void GCNPassConfig::addPostRegAlloc() {
1897 addPass(PassID: &SIFixVGPRCopiesID);
1898 if (getOptLevel() > CodeGenOptLevel::None)
1899 addPass(PassID: &SIOptimizeExecMaskingLegacyID);
1900 TargetPassConfig::addPostRegAlloc();
1901}
1902
1903void GCNPassConfig::addPreSched2() {
1904 if (TM->getOptLevel() > CodeGenOptLevel::None)
1905 addPass(P: createSIShrinkInstructionsLegacyPass());
1906 addPass(PassID: &SIPostRABundlerLegacyID);
1907}
1908
1909void GCNPassConfig::addPreEmitPass() {
1910 if (isPassEnabled(Opt: EnableVOPD, Level: CodeGenOptLevel::Less))
1911 addPass(PassID: &GCNCreateVOPDID);
1912 addPass(P: createSIMemoryLegalizerPass());
1913 addPass(P: createSIInsertWaitcntsPass());
1914
1915 addPass(P: createSIModeRegisterPass());
1916
1917 if (getOptLevel() > CodeGenOptLevel::None)
1918 addPass(PassID: &SIInsertHardClausesID);
1919
1920 addPass(PassID: &SILateBranchLoweringPassID);
1921 if (isPassEnabled(Opt: EnableSetWavePriority, Level: CodeGenOptLevel::Less))
1922 addPass(P: createAMDGPUSetWavePriorityPass());
1923 if (getOptLevel() > CodeGenOptLevel::None)
1924 addPass(PassID: &SIPreEmitPeepholeID);
1925 // The hazard recognizer that runs as part of the post-ra scheduler does not
1926 // guarantee to be able handle all hazards correctly. This is because if there
1927 // are multiple scheduling regions in a basic block, the regions are scheduled
1928 // bottom up, so when we begin to schedule a region we don't know what
1929 // instructions were emitted directly before it.
1930 //
1931 // Here we add a stand-alone hazard recognizer pass which can handle all
1932 // cases.
1933 addPass(PassID: &PostRAHazardRecognizerID);
1934
1935 addPass(PassID: &AMDGPUWaitSGPRHazardsLegacyID);
1936
1937 addPass(PassID: &AMDGPULowerVGPREncodingLegacyID);
1938
1939 if (isPassEnabled(Opt: EnableInsertDelayAlu, Level: CodeGenOptLevel::Less))
1940 addPass(PassID: &AMDGPUInsertDelayAluID);
1941
1942 addPass(PassID: &BranchRelaxationPassID);
1943}
1944
1945void GCNPassConfig::addPostBBSections() {
1946 // We run this later to avoid passes like livedebugvalues and BBSections
1947 // having to deal with the apparent multi-entry functions we may generate.
1948 addPass(P: createAMDGPUPreloadKernArgPrologLegacyPass());
1949}
1950
1951TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
1952 return new GCNPassConfig(*this, PM);
1953}
1954
1955void GCNTargetMachine::registerMachineRegisterInfoCallback(
1956 MachineFunction &MF) const {
1957 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1958 MF.getRegInfo().addDelegate(delegate: MFI);
1959}
1960
1961MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
1962 BumpPtrAllocator &Allocator, const Function &F,
1963 const TargetSubtargetInfo *STI) const {
1964 return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
1965 Allocator, F, STI: static_cast<const GCNSubtarget *>(STI));
1966}
1967
1968yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
1969 return new yaml::SIMachineFunctionInfo();
1970}
1971
1972yaml::MachineFunctionInfo *
1973GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
1974 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1975 return new yaml::SIMachineFunctionInfo(
1976 *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
1977}
1978
1979bool GCNTargetMachine::parseMachineFunctionInfo(
1980 const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
1981 SMDiagnostic &Error, SMRange &SourceRange) const {
1982 const yaml::SIMachineFunctionInfo &YamlMFI =
1983 static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
1984 MachineFunction &MF = PFS.MF;
1985 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1986 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1987
1988 if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
1989 return true;
1990
1991 if (MFI->Occupancy == 0) {
1992 // Fixup the subtarget dependent default value.
1993 MFI->Occupancy = ST.getOccupancyWithWorkGroupSizes(MF).second;
1994 }
1995
1996 auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
1997 Register TempReg;
1998 if (parseNamedRegisterReference(PFS, Reg&: TempReg, Src: RegName.Value, Error)) {
1999 SourceRange = RegName.SourceRange;
2000 return true;
2001 }
2002 RegVal = TempReg;
2003
2004 return false;
2005 };
2006
2007 auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
2008 Register &RegVal) {
2009 return !RegName.Value.empty() && parseRegister(RegName, RegVal);
2010 };
2011
2012 if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
2013 return true;
2014
2015 if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
2016 return true;
2017
2018 if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
2019 MFI->LongBranchReservedReg))
2020 return true;
2021
2022 auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
2023 // Create a diagnostic for a the register string literal.
2024 const MemoryBuffer &Buffer =
2025 *PFS.SM->getMemoryBuffer(i: PFS.SM->getMainFileID());
2026 Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
2027 RegName.Value.size(), SourceMgr::DK_Error,
2028 "incorrect register class for field", RegName.Value,
2029 {}, {});
2030 SourceRange = RegName.SourceRange;
2031 return true;
2032 };
2033
2034 if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
2035 parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
2036 parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
2037 return true;
2038
2039 if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
2040 !AMDGPU::SGPR_128RegClass.contains(Reg: MFI->ScratchRSrcReg)) {
2041 return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
2042 }
2043
2044 if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
2045 !AMDGPU::SGPR_32RegClass.contains(Reg: MFI->FrameOffsetReg)) {
2046 return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
2047 }
2048
2049 if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
2050 !AMDGPU::SGPR_32RegClass.contains(Reg: MFI->StackPtrOffsetReg)) {
2051 return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
2052 }
2053
2054 for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
2055 Register ParsedReg;
2056 if (parseRegister(YamlReg, ParsedReg))
2057 return true;
2058
2059 MFI->reserveWWMRegister(Reg: ParsedReg);
2060 }
2061
2062 for (const auto &[_, Info] : PFS.VRegInfosNamed) {
2063 MFI->setFlag(Reg: Info->VReg, Flag: Info->Flags);
2064 }
2065 for (const auto &[_, Info] : PFS.VRegInfos) {
2066 MFI->setFlag(Reg: Info->VReg, Flag: Info->Flags);
2067 }
2068
2069 for (const auto &YamlRegStr : YamlMFI.SpillPhysVGPRS) {
2070 Register ParsedReg;
2071 if (parseRegister(YamlRegStr, ParsedReg))
2072 return true;
2073 MFI->SpillPhysVGPRs.push_back(Elt: ParsedReg);
2074 }
2075
2076 auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
2077 const TargetRegisterClass &RC,
2078 ArgDescriptor &Arg, unsigned UserSGPRs,
2079 unsigned SystemSGPRs) {
2080 // Skip parsing if it's not present.
2081 if (!A)
2082 return false;
2083
2084 if (A->IsRegister) {
2085 Register Reg;
2086 if (parseNamedRegisterReference(PFS, Reg, Src: A->RegisterName.Value, Error)) {
2087 SourceRange = A->RegisterName.SourceRange;
2088 return true;
2089 }
2090 if (!RC.contains(Reg))
2091 return diagnoseRegisterClass(A->RegisterName);
2092 Arg = ArgDescriptor::createRegister(Reg);
2093 } else
2094 Arg = ArgDescriptor::createStack(Offset: A->StackOffset);
2095 // Check and apply the optional mask.
2096 if (A->Mask)
2097 Arg = ArgDescriptor::createArg(Arg, Mask: *A->Mask);
2098
2099 MFI->NumUserSGPRs += UserSGPRs;
2100 MFI->NumSystemSGPRs += SystemSGPRs;
2101 return false;
2102 };
2103
2104 if (YamlMFI.ArgInfo &&
2105 (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
2106 AMDGPU::SGPR_128RegClass,
2107 MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
2108 parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
2109 AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
2110 2, 0) ||
2111 parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
2112 MFI->ArgInfo.QueuePtr, 2, 0) ||
2113 parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
2114 AMDGPU::SReg_64RegClass,
2115 MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
2116 parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
2117 AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
2118 2, 0) ||
2119 parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
2120 AMDGPU::SReg_64RegClass,
2121 MFI->ArgInfo.FlatScratchInit, 2, 0) ||
2122 parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
2123 AMDGPU::SGPR_32RegClass,
2124 MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
2125 parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
2126 AMDGPU::SGPR_32RegClass,
2127 MFI->ArgInfo.LDSKernelId, 0, 1) ||
2128 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
2129 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
2130 0, 1) ||
2131 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
2132 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
2133 0, 1) ||
2134 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
2135 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
2136 0, 1) ||
2137 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
2138 AMDGPU::SGPR_32RegClass,
2139 MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
2140 parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
2141 AMDGPU::SGPR_32RegClass,
2142 MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
2143 parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
2144 AMDGPU::SReg_64RegClass,
2145 MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
2146 parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
2147 AMDGPU::SReg_64RegClass,
2148 MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
2149 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
2150 AMDGPU::VGPR_32RegClass,
2151 MFI->ArgInfo.WorkItemIDX, 0, 0) ||
2152 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
2153 AMDGPU::VGPR_32RegClass,
2154 MFI->ArgInfo.WorkItemIDY, 0, 0) ||
2155 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
2156 AMDGPU::VGPR_32RegClass,
2157 MFI->ArgInfo.WorkItemIDZ, 0, 0)))
2158 return true;
2159
2160 // Parse FirstKernArgPreloadReg separately, since it's a Register,
2161 // not ArgDescriptor.
2162 if (YamlMFI.ArgInfo && YamlMFI.ArgInfo->FirstKernArgPreloadReg) {
2163 const yaml::SIArgument &A = *YamlMFI.ArgInfo->FirstKernArgPreloadReg;
2164
2165 if (!A.IsRegister) {
2166 // For stack arguments, we don't have RegisterName.SourceRange,
2167 // but we should have some location info from the YAML parser
2168 const MemoryBuffer &Buffer =
2169 *PFS.SM->getMemoryBuffer(i: PFS.SM->getMainFileID());
2170 // Create a minimal valid source range
2171 SMLoc Loc = SMLoc::getFromPointer(Ptr: Buffer.getBufferStart());
2172 SMRange Range(Loc, Loc);
2173
2174 Error = SMDiagnostic(
2175 *PFS.SM, Loc, Buffer.getBufferIdentifier(), 1, 0, SourceMgr::DK_Error,
2176 "firstKernArgPreloadReg must be a register, not a stack location", "",
2177 {}, {});
2178
2179 SourceRange = Range;
2180 return true;
2181 }
2182
2183 Register Reg;
2184 if (parseNamedRegisterReference(PFS, Reg, Src: A.RegisterName.Value, Error)) {
2185 SourceRange = A.RegisterName.SourceRange;
2186 return true;
2187 }
2188
2189 if (!AMDGPU::SGPR_32RegClass.contains(Reg))
2190 return diagnoseRegisterClass(A.RegisterName);
2191
2192 MFI->ArgInfo.FirstKernArgPreloadReg = Reg;
2193 MFI->NumUserSGPRs += YamlMFI.NumKernargPreloadSGPRs;
2194 }
2195
2196 if (ST.hasFeature(Feature: AMDGPU::FeatureDX10ClampAndIEEEMode)) {
2197 MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
2198 MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
2199 }
2200
2201 // FIXME: Move proper support for denormal-fp-math into base MachineFunction
2202 MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
2203 ? DenormalMode::IEEE
2204 : DenormalMode::PreserveSign;
2205 MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
2206 ? DenormalMode::IEEE
2207 : DenormalMode::PreserveSign;
2208
2209 MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
2210 ? DenormalMode::IEEE
2211 : DenormalMode::PreserveSign;
2212 MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
2213 ? DenormalMode::IEEE
2214 : DenormalMode::PreserveSign;
2215
2216 if (YamlMFI.HasInitWholeWave)
2217 MFI->setInitWholeWave();
2218
2219 return false;
2220}
2221
2222//===----------------------------------------------------------------------===//
2223// AMDGPU CodeGen Pass Builder interface.
2224//===----------------------------------------------------------------------===//
2225
2226AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder(
2227 GCNTargetMachine &TM, const CGPassBuilderOption &Opts,
2228 PassInstrumentationCallbacks *PIC)
2229 : CodeGenPassBuilder(TM, Opts, PIC) {
2230 Opt.MISchedPostRA = true;
2231 Opt.RequiresCodeGenSCCOrder = true;
2232 // Exceptions and StackMaps are not supported, so these passes will never do
2233 // anything.
2234 // Garbage collection is not supported.
2235 disablePass<StackMapLivenessPass, FuncletLayoutPass, PatchableFunctionPass,
2236 ShadowStackGCLoweringPass, GCLoweringPass>();
2237}
2238
2239void AMDGPUCodeGenPassBuilder::addIRPasses(PassManagerWrapper &PMW) const {
2240 if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN()) {
2241 flushFPMsToMPM(PMW);
2242 addModulePass(Pass: AMDGPURemoveIncompatibleFunctionsPass(TM), PMW);
2243 }
2244
2245 flushFPMsToMPM(PMW);
2246
2247 if (TM.getTargetTriple().isAMDGCN())
2248 addModulePass(Pass: AMDGPUPrintfRuntimeBindingPass(), PMW);
2249
2250 if (LowerCtorDtor)
2251 addModulePass(Pass: AMDGPUCtorDtorLoweringPass(), PMW);
2252
2253 if (isPassEnabled(Opt: EnableImageIntrinsicOptimizer))
2254 addFunctionPass(Pass: AMDGPUImageIntrinsicOptimizerPass(TM), PMW);
2255
2256 if (EnableUniformIntrinsicCombine)
2257 addFunctionPass(Pass: AMDGPUUniformIntrinsicCombinePass(), PMW);
2258 // This can be disabled by passing ::Disable here or on the command line
2259 // with --expand-variadics-override=disable.
2260 flushFPMsToMPM(PMW);
2261 addModulePass(Pass: ExpandVariadicsPass(ExpandVariadicsMode::Lowering), PMW);
2262
2263 addModulePass(Pass: AMDGPUAlwaysInlinePass(), PMW);
2264 addModulePass(Pass: AlwaysInlinerPass(), PMW);
2265
2266 addModulePass(Pass: AMDGPUExportKernelRuntimeHandlesPass(), PMW);
2267
2268 if (EnableLowerExecSync)
2269 addModulePass(Pass: AMDGPULowerExecSyncPass(), PMW);
2270
2271 if (EnableSwLowerLDS)
2272 addModulePass(Pass: AMDGPUSwLowerLDSPass(), PMW);
2273
2274 // Runs before PromoteAlloca so the latter can account for function uses
2275 if (EnableLowerModuleLDS)
2276 addModulePass(Pass: AMDGPULowerModuleLDSPass(TM), PMW);
2277
2278 // Run atomic optimizer before Atomic Expand
2279 if (TM.getOptLevel() >= CodeGenOptLevel::Less &&
2280 (AMDGPUAtomicOptimizerStrategy != ScanOptions::None))
2281 addFunctionPass(
2282 Pass: AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy), PMW);
2283
2284 addFunctionPass(Pass: AtomicExpandPass(TM), PMW);
2285
2286 if (TM.getOptLevel() > CodeGenOptLevel::None) {
2287 addFunctionPass(Pass: AMDGPUPromoteAllocaPass(TM), PMW);
2288 if (isPassEnabled(Opt: EnableScalarIRPasses))
2289 addStraightLineScalarOptimizationPasses(PMW);
2290
2291 // TODO: Handle EnableAMDGPUAliasAnalysis
2292
2293 // TODO: May want to move later or split into an early and late one.
2294 addFunctionPass(Pass: AMDGPUCodeGenPreparePass(TM), PMW);
2295
2296 // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
2297 // have expanded.
2298 if (TM.getOptLevel() > CodeGenOptLevel::Less) {
2299 addFunctionPass(Pass: createFunctionToLoopPassAdaptor(Pass: LICMPass(LICMOptions()),
2300 /*UseMemorySSA=*/true),
2301 PMW);
2302 }
2303 }
2304
2305 Base::addIRPasses(PMW);
2306
2307 // EarlyCSE is not always strong enough to clean up what LSR produces. For
2308 // example, GVN can combine
2309 //
2310 // %0 = add %a, %b
2311 // %1 = add %b, %a
2312 //
2313 // and
2314 //
2315 // %0 = shl nsw %a, 2
2316 // %1 = shl %a, 2
2317 //
2318 // but EarlyCSE can do neither of them.
2319 if (isPassEnabled(Opt: EnableScalarIRPasses))
2320 addEarlyCSEOrGVNPass(PMW);
2321}
2322
2323void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(
2324 PassManagerWrapper &PMW) const {
2325 if (TM.getOptLevel() > CodeGenOptLevel::None) {
2326 flushFPMsToMPM(PMW);
2327 addModulePass(Pass: AMDGPUPreloadKernelArgumentsPass(TM), PMW);
2328 }
2329
2330 if (EnableLowerKernelArguments)
2331 addFunctionPass(Pass: AMDGPULowerKernelArgumentsPass(TM), PMW);
2332
2333 Base::addCodeGenPrepare(PMW);
2334
2335 if (isPassEnabled(Opt: EnableLoadStoreVectorizer))
2336 addFunctionPass(Pass: LoadStoreVectorizerPass(), PMW);
2337
2338 // This lowering has been placed after codegenprepare to take advantage of
2339 // address mode matching (which is why it isn't put with the LDS lowerings).
2340 // It could be placed anywhere before uniformity annotations (an analysis
2341 // that it changes by splitting up fat pointers into their components)
2342 // but has been put before switch lowering and CFG flattening so that those
2343 // passes can run on the more optimized control flow this pass creates in
2344 // many cases.
2345 flushFPMsToMPM(PMW);
2346 addModulePass(Pass: AMDGPULowerBufferFatPointersPass(TM), PMW);
2347 flushFPMsToMPM(PMW);
2348 requireCGSCCOrder(PMW);
2349
2350 addModulePass(Pass: AMDGPULowerIntrinsicsPass(TM), PMW);
2351
2352 // LowerSwitch pass may introduce unreachable blocks that can cause unexpected
2353 // behavior for subsequent passes. Placing it here seems better that these
2354 // blocks would get cleaned up by UnreachableBlockElim inserted next in the
2355 // pass flow.
2356 addFunctionPass(Pass: LowerSwitchPass(), PMW);
2357}
2358
2359void AMDGPUCodeGenPassBuilder::addPreISel(PassManagerWrapper &PMW) const {
2360
2361 if (TM.getOptLevel() > CodeGenOptLevel::None) {
2362 addFunctionPass(Pass: FlattenCFGPass(), PMW);
2363 addFunctionPass(Pass: SinkingPass(), PMW);
2364 addFunctionPass(Pass: AMDGPULateCodeGenPreparePass(TM), PMW);
2365 }
2366
2367 // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
2368 // regions formed by them.
2369
2370 addFunctionPass(Pass: AMDGPUUnifyDivergentExitNodesPass(), PMW);
2371 addFunctionPass(Pass: FixIrreduciblePass(), PMW);
2372 addFunctionPass(Pass: UnifyLoopExitsPass(), PMW);
2373 addFunctionPass(Pass: StructurizeCFGPass(/*SkipUniformRegions=*/false), PMW);
2374
2375 addFunctionPass(Pass: AMDGPUAnnotateUniformValuesPass(), PMW);
2376
2377 addFunctionPass(Pass: SIAnnotateControlFlowPass(TM), PMW);
2378
2379 // TODO: Move this right after structurizeCFG to avoid extra divergence
2380 // analysis. This depends on stopping SIAnnotateControlFlow from making
2381 // control flow modifications.
2382 addFunctionPass(Pass: AMDGPURewriteUndefForPHIPass(), PMW);
2383
2384 if (getCGPassBuilderOption().EnableGlobalISelOption !=
2385 cl::boolOrDefault::BOU_TRUE ||
2386 !isGlobalISelAbortEnabled())
2387 addFunctionPass(Pass: LCSSAPass(), PMW);
2388
2389 if (TM.getOptLevel() > CodeGenOptLevel::Less) {
2390 flushFPMsToMPM(PMW);
2391 addModulePass(Pass: AMDGPUPerfHintAnalysisPass(TM), PMW);
2392 }
2393
2394 // FIXME: Why isn't this queried as required from AMDGPUISelDAGToDAG, and why
2395 // isn't this in addInstSelector?
2396 addFunctionPass(Pass: RequireAnalysisPass<UniformityInfoAnalysis, Function>(), PMW,
2397 /*Force=*/true);
2398}
2399
2400void AMDGPUCodeGenPassBuilder::addILPOpts(PassManagerWrapper &PMW) const {
2401 if (EnableEarlyIfConversion)
2402 addMachineFunctionPass(Pass: EarlyIfConverterPass(), PMW);
2403
2404 Base::addILPOpts(PMW);
2405}
2406
2407void AMDGPUCodeGenPassBuilder::addAsmPrinterBegin(
2408 PassManagerWrapper &PMW) const {
2409 // TODO: Add AsmPrinterBegin
2410}
2411
2412void AMDGPUCodeGenPassBuilder::addAsmPrinter(PassManagerWrapper &PMW) const {
2413 // TODO: Add AsmPrinter.
2414}
2415
2416void AMDGPUCodeGenPassBuilder::addAsmPrinterEnd(PassManagerWrapper &PMW) const {
2417 // TODO: Add AsmPrinterEnd
2418}
2419
2420Error AMDGPUCodeGenPassBuilder::addInstSelector(PassManagerWrapper &PMW) const {
2421 addMachineFunctionPass(Pass: AMDGPUISelDAGToDAGPass(TM), PMW);
2422 addMachineFunctionPass(Pass: SIFixSGPRCopiesPass(), PMW);
2423 addMachineFunctionPass(Pass: SILowerI1CopiesPass(), PMW);
2424 return Error::success();
2425}
2426
2427void AMDGPUCodeGenPassBuilder::addPreRewrite(PassManagerWrapper &PMW) const {
2428 if (EnableRegReassign) {
2429 addMachineFunctionPass(Pass: GCNNSAReassignPass(), PMW);
2430 }
2431
2432 addMachineFunctionPass(Pass: AMDGPURewriteAGPRCopyMFMAPass(), PMW);
2433}
2434
2435void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization(
2436 PassManagerWrapper &PMW) const {
2437 Base::addMachineSSAOptimization(PMW);
2438
2439 addMachineFunctionPass(Pass: SIFoldOperandsPass(), PMW);
2440 if (EnableDPPCombine) {
2441 addMachineFunctionPass(Pass: GCNDPPCombinePass(), PMW);
2442 }
2443 addMachineFunctionPass(Pass: SILoadStoreOptimizerPass(), PMW);
2444 if (isPassEnabled(Opt: EnableSDWAPeephole)) {
2445 addMachineFunctionPass(Pass: SIPeepholeSDWAPass(), PMW);
2446 addMachineFunctionPass(Pass: EarlyMachineLICMPass(), PMW);
2447 addMachineFunctionPass(Pass: MachineCSEPass(), PMW);
2448 addMachineFunctionPass(Pass: SIFoldOperandsPass(), PMW);
2449 }
2450 addMachineFunctionPass(Pass: DeadMachineInstructionElimPass(), PMW);
2451 addMachineFunctionPass(Pass: SIShrinkInstructionsPass(), PMW);
2452}
2453
2454Error AMDGPUCodeGenPassBuilder::addFastRegAlloc(PassManagerWrapper &PMW) const {
2455 insertPass<PHIEliminationPass>(Pass: SILowerControlFlowPass());
2456
2457 insertPass<TwoAddressInstructionPass>(Pass: SIWholeQuadModePass());
2458
2459 return Base::addFastRegAlloc(PMW);
2460}
2461
2462Error AMDGPUCodeGenPassBuilder::addRegAssignmentFast(
2463 PassManagerWrapper &PMW) const {
2464 if (auto Err = validateRegAllocOptions())
2465 return Err;
2466
2467 addMachineFunctionPass(Pass: GCNPreRALongBranchRegPass(), PMW);
2468
2469 // SGPR allocation - default to fast at -O0.
2470 if (SGPRRegAllocNPM == RegAllocType::Greedy)
2471 addMachineFunctionPass(Pass: RAGreedyPass({onlyAllocateSGPRs, "sgpr"}), PMW);
2472 else
2473 addMachineFunctionPass(Pass: RegAllocFastPass({onlyAllocateSGPRs, "sgpr", false}),
2474 PMW);
2475
2476 // Equivalent of PEI for SGPRs.
2477 addMachineFunctionPass(Pass: SILowerSGPRSpillsPass(), PMW);
2478
2479 // To Allocate wwm registers used in whole quad mode operations (for shaders).
2480 addMachineFunctionPass(Pass: SIPreAllocateWWMRegsPass(), PMW);
2481
2482 // WWM allocation - default to fast at -O0.
2483 if (WWMRegAllocNPM == RegAllocType::Greedy)
2484 addMachineFunctionPass(Pass: RAGreedyPass({onlyAllocateWWMRegs, "wwm"}), PMW);
2485 else
2486 addMachineFunctionPass(
2487 Pass: RegAllocFastPass({onlyAllocateWWMRegs, "wwm", false}), PMW);
2488
2489 addMachineFunctionPass(Pass: SILowerWWMCopiesPass(), PMW);
2490 addMachineFunctionPass(Pass: AMDGPUReserveWWMRegsPass(), PMW);
2491
2492 // VGPR allocation - default to fast at -O0.
2493 if (VGPRRegAllocNPM == RegAllocType::Greedy)
2494 addMachineFunctionPass(Pass: RAGreedyPass({onlyAllocateVGPRs, "vgpr"}), PMW);
2495 else
2496 addMachineFunctionPass(Pass: RegAllocFastPass({onlyAllocateVGPRs, "vgpr"}), PMW);
2497
2498 return Error::success();
2499}
2500
2501Error AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
2502 PassManagerWrapper &PMW) const {
2503 if (EnableDCEInRA)
2504 insertPass<DetectDeadLanesPass>(Pass: DeadMachineInstructionElimPass());
2505
2506 // FIXME: when an instruction has a Killed operand, and the instruction is
2507 // inside a bundle, seems only the BUNDLE instruction appears as the Kills of
2508 // the register in LiveVariables, this would trigger a failure in verifier,
2509 // we should fix it and enable the verifier.
2510 if (OptVGPRLiveRange)
2511 insertPass<RequireAnalysisPass<LiveVariablesAnalysis, MachineFunction>>(
2512 Pass: SIOptimizeVGPRLiveRangePass());
2513
2514 // This must be run immediately after phi elimination and before
2515 // TwoAddressInstructions, otherwise the processing of the tied operand of
2516 // SI_ELSE will introduce a copy of the tied operand source after the else.
2517 insertPass<PHIEliminationPass>(Pass: SILowerControlFlowPass());
2518
2519 if (EnableRewritePartialRegUses)
2520 insertPass<RenameIndependentSubregsPass>(Pass: GCNRewritePartialRegUsesPass());
2521
2522 if (isPassEnabled(Opt: EnablePreRAOptimizations))
2523 insertPass<MachineSchedulerPass>(Pass: GCNPreRAOptimizationsPass());
2524
2525 // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
2526 // instructions that cause scheduling barriers.
2527 insertPass<MachineSchedulerPass>(Pass: SIWholeQuadModePass());
2528
2529 if (OptExecMaskPreRA)
2530 insertPass<MachineSchedulerPass>(Pass: SIOptimizeExecMaskingPreRAPass());
2531
2532 // This is not an essential optimization and it has a noticeable impact on
2533 // compilation time, so we only enable it from O2.
2534 if (TM.getOptLevel() > CodeGenOptLevel::Less)
2535 insertPass<MachineSchedulerPass>(Pass: SIFormMemoryClausesPass());
2536
2537 return Base::addOptimizedRegAlloc(PMW);
2538}
2539
2540void AMDGPUCodeGenPassBuilder::addPreRegAlloc(PassManagerWrapper &PMW) const {
2541 if (getOptLevel() != CodeGenOptLevel::None)
2542 addMachineFunctionPass(Pass: AMDGPUPrepareAGPRAllocPass(), PMW);
2543}
2544
2545Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
2546 PassManagerWrapper &PMW) const {
2547 if (auto Err = validateRegAllocOptions())
2548 return Err;
2549
2550 addMachineFunctionPass(Pass: GCNPreRALongBranchRegPass(), PMW);
2551
2552 // SGPR allocation - default to greedy at -O1 and above.
2553 if (SGPRRegAllocNPM == RegAllocType::Fast)
2554 addMachineFunctionPass(Pass: RegAllocFastPass({onlyAllocateSGPRs, "sgpr", false}),
2555 PMW);
2556 else
2557 addMachineFunctionPass(Pass: RAGreedyPass({onlyAllocateSGPRs, "sgpr"}), PMW);
2558
2559 // Commit allocated register changes. This is mostly necessary because too
2560 // many things rely on the use lists of the physical registers, such as the
2561 // verifier. This is only necessary with allocators which use LiveIntervals,
2562 // since FastRegAlloc does the replacements itself.
2563 addMachineFunctionPass(Pass: VirtRegRewriterPass(false), PMW);
2564
2565 // At this point, the sgpr-regalloc has been done and it is good to have the
2566 // stack slot coloring to try to optimize the SGPR spill stack indices before
2567 // attempting the custom SGPR spill lowering.
2568 addMachineFunctionPass(Pass: StackSlotColoringPass(), PMW);
2569
2570 // Equivalent of PEI for SGPRs.
2571 addMachineFunctionPass(Pass: SILowerSGPRSpillsPass(), PMW);
2572
2573 // To Allocate wwm registers used in whole quad mode operations (for shaders).
2574 addMachineFunctionPass(Pass: SIPreAllocateWWMRegsPass(), PMW);
2575
2576 // WWM allocation - default to greedy at -O1 and above.
2577 if (WWMRegAllocNPM == RegAllocType::Fast)
2578 addMachineFunctionPass(
2579 Pass: RegAllocFastPass({onlyAllocateWWMRegs, "wwm", false}), PMW);
2580 else
2581 addMachineFunctionPass(Pass: RAGreedyPass({onlyAllocateWWMRegs, "wwm"}), PMW);
2582 addMachineFunctionPass(Pass: SILowerWWMCopiesPass(), PMW);
2583 addMachineFunctionPass(Pass: VirtRegRewriterPass(false), PMW);
2584 addMachineFunctionPass(Pass: AMDGPUReserveWWMRegsPass(), PMW);
2585
2586 // VGPR allocation - default to greedy at -O1 and above.
2587 if (VGPRRegAllocNPM == RegAllocType::Fast)
2588 addMachineFunctionPass(Pass: RegAllocFastPass({onlyAllocateVGPRs, "vgpr"}), PMW);
2589 else
2590 addMachineFunctionPass(Pass: RAGreedyPass({onlyAllocateVGPRs, "vgpr"}), PMW);
2591
2592 addPreRewrite(PMW);
2593 addMachineFunctionPass(Pass: VirtRegRewriterPass(true), PMW);
2594
2595 addMachineFunctionPass(Pass: AMDGPUMarkLastScratchLoadPass(), PMW);
2596 return Error::success();
2597}
2598
2599void AMDGPUCodeGenPassBuilder::addPostRegAlloc(PassManagerWrapper &PMW) const {
2600 addMachineFunctionPass(Pass: SIFixVGPRCopiesPass(), PMW);
2601 if (TM.getOptLevel() > CodeGenOptLevel::None)
2602 addMachineFunctionPass(Pass: SIOptimizeExecMaskingPass(), PMW);
2603 Base::addPostRegAlloc(PMW);
2604}
2605
2606void AMDGPUCodeGenPassBuilder::addPreSched2(PassManagerWrapper &PMW) const {
2607 if (TM.getOptLevel() > CodeGenOptLevel::None)
2608 addMachineFunctionPass(Pass: SIShrinkInstructionsPass(), PMW);
2609 addMachineFunctionPass(Pass: SIPostRABundlerPass(), PMW);
2610}
2611
2612void AMDGPUCodeGenPassBuilder::addPostBBSections(
2613 PassManagerWrapper &PMW) const {
2614 // We run this later to avoid passes like livedebugvalues and BBSections
2615 // having to deal with the apparent multi-entry functions we may generate.
2616 addMachineFunctionPass(Pass: AMDGPUPreloadKernArgPrologPass(), PMW);
2617}
2618
2619void AMDGPUCodeGenPassBuilder::addPreEmitPass(PassManagerWrapper &PMW) const {
2620 if (isPassEnabled(Opt: EnableVOPD, Level: CodeGenOptLevel::Less)) {
2621 addMachineFunctionPass(Pass: GCNCreateVOPDPass(), PMW);
2622 }
2623
2624 addMachineFunctionPass(Pass: SIMemoryLegalizerPass(), PMW);
2625 addMachineFunctionPass(Pass: SIInsertWaitcntsPass(), PMW);
2626
2627 addMachineFunctionPass(Pass: SIModeRegisterPass(), PMW);
2628
2629 if (TM.getOptLevel() > CodeGenOptLevel::None)
2630 addMachineFunctionPass(Pass: SIInsertHardClausesPass(), PMW);
2631
2632 addMachineFunctionPass(Pass: SILateBranchLoweringPass(), PMW);
2633
2634 if (isPassEnabled(Opt: EnableSetWavePriority, Level: CodeGenOptLevel::Less))
2635 addMachineFunctionPass(Pass: AMDGPUSetWavePriorityPass(), PMW);
2636
2637 if (TM.getOptLevel() > CodeGenOptLevel::None)
2638 addMachineFunctionPass(Pass: SIPreEmitPeepholePass(), PMW);
2639
2640 // The hazard recognizer that runs as part of the post-ra scheduler does not
2641 // guarantee to be able handle all hazards correctly. This is because if there
2642 // are multiple scheduling regions in a basic block, the regions are scheduled
2643 // bottom up, so when we begin to schedule a region we don't know what
2644 // instructions were emitted directly before it.
2645 //
2646 // Here we add a stand-alone hazard recognizer pass which can handle all
2647 // cases.
2648 addMachineFunctionPass(Pass: PostRAHazardRecognizerPass(), PMW);
2649 addMachineFunctionPass(Pass: AMDGPUWaitSGPRHazardsPass(), PMW);
2650 addMachineFunctionPass(Pass: AMDGPULowerVGPREncodingPass(), PMW);
2651
2652 if (isPassEnabled(Opt: EnableInsertDelayAlu, Level: CodeGenOptLevel::Less)) {
2653 addMachineFunctionPass(Pass: AMDGPUInsertDelayAluPass(), PMW);
2654 }
2655
2656 addMachineFunctionPass(Pass: BranchRelaxationPass(), PMW);
2657}
2658
2659bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt,
2660 CodeGenOptLevel Level) const {
2661 if (Opt.getNumOccurrences())
2662 return Opt;
2663 if (TM.getOptLevel() < Level)
2664 return false;
2665 return Opt;
2666}
2667
2668void AMDGPUCodeGenPassBuilder::addEarlyCSEOrGVNPass(
2669 PassManagerWrapper &PMW) const {
2670 if (TM.getOptLevel() == CodeGenOptLevel::Aggressive)
2671 addFunctionPass(Pass: GVNPass(), PMW);
2672 else
2673 addFunctionPass(Pass: EarlyCSEPass(), PMW);
2674}
2675
2676void AMDGPUCodeGenPassBuilder::addStraightLineScalarOptimizationPasses(
2677 PassManagerWrapper &PMW) const {
2678 if (isPassEnabled(Opt: EnableLoopPrefetch, Level: CodeGenOptLevel::Aggressive))
2679 addFunctionPass(Pass: LoopDataPrefetchPass(), PMW);
2680
2681 addFunctionPass(Pass: SeparateConstOffsetFromGEPPass(), PMW);
2682
2683 // ReassociateGEPs exposes more opportunities for SLSR. See
2684 // the example in reassociate-geps-and-slsr.ll.
2685 addFunctionPass(Pass: StraightLineStrengthReducePass(), PMW);
2686
2687 // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
2688 // EarlyCSE can reuse.
2689 addEarlyCSEOrGVNPass(PMW);
2690
2691 // Run NaryReassociate after EarlyCSE/GVN to be more effective.
2692 addFunctionPass(Pass: NaryReassociatePass(), PMW);
2693
2694 // NaryReassociate on GEPs creates redundant common expressions, so run
2695 // EarlyCSE after it.
2696 addFunctionPass(Pass: EarlyCSEPass(), PMW);
2697}
2698