//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains both the AMDGPU target machine and the CodeGen pass
/// builder. The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for SI+ GPUs in the legacy pass manager
/// pipeline. The CodeGen pass builder handles the pass pipeline for the new
/// pass manager.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUBarrierLatency.h"
#include "AMDGPUCoExecSchedStrategy.h"
#include "AMDGPUCtorDtorLowering.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUExportKernelRuntimeHandles.h"
#include "AMDGPUHazardLatency.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPULowerVGPREncoding.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPUPreloadKernArgProlog.h"
#include "AMDGPUPrepareAGPRAlloc.h"
#include "AMDGPURemoveIncompatibleFunctions.h"
#include "AMDGPUReserveWWMRegs.h"
#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUUnifyDivergentExitNodes.h"
#include "AMDGPUWaitSGPRHazards.h"
#include "GCNDPPCombine.h"
#include "GCNIterativeScheduler.h"
#include "GCNNSAReassign.h"
#include "GCNPreRALongBranchReg.h"
#include "GCNPreRAOptimizations.h"
#include "GCNRewritePartialRegUses.h"
#include "GCNSchedStrategy.h"
#include "GCNVOPDUtils.h"
#include "R600.h"
#include "R600TargetMachine.h"
#include "SIFixSGPRCopies.h"
#include "SIFixVGPRCopies.h"
#include "SIFoldOperands.h"
#include "SIFormMemoryClauses.h"
#include "SILoadStoreOptimizer.h"
#include "SILowerControlFlow.h"
#include "SILowerSGPRSpills.h"
#include "SILowerWWMCopies.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "SIOptimizeExecMasking.h"
#include "SIOptimizeExecMaskingPreRA.h"
#include "SIOptimizeVGPRLiveRange.h"
#include "SIPeepholeSDWA.h"
#include "SIPostRABundler.h"
#include "SIPreAllocateWWMRegs.h"
#include "SIShrinkInstructions.h"
#include "SIWholeQuadMode.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/KernelInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/AtomicExpand.h"
#include "llvm/CodeGen/BranchRelaxation.h"
#include "llvm/CodeGen/DeadMachineInstructionElim.h"
#include "llvm/CodeGen/EarlyIfConversion.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MachineCSE.h"
#include "llvm/CodeGen/MachineLICM.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/PostRAHazardRecognizer.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/CodeGenPassBuilder.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Transforms/HipStdPar/HipStdPar.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/ExpandVariadics.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/EarlyCSE.h"
#include "llvm/Transforms/Scalar/FlattenCFG.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Scalar/LICM.h"
#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Scalar/NaryReassociate.h"
#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
#include "llvm/Transforms/Scalar/Sink.h"
#include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h"
#include "llvm/Transforms/Scalar/StructurizeCFG.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/FixIrreducible.h"
#include "llvm/Transforms/Utils/LCSSA.h"
#include "llvm/Transforms/Utils/LowerSwitch.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include "llvm/Transforms/Utils/UnifyLoopExits.h"
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

namespace {
//===----------------------------------------------------------------------===//
// AMDGPU CodeGen Pass Builder interface.
//===----------------------------------------------------------------------===//

class AMDGPUCodeGenPassBuilder
    : public CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine> {
  using Base = CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine>;

public:
  AMDGPUCodeGenPassBuilder(GCNTargetMachine &TM,
                           const CGPassBuilderOption &Opts,
                           PassInstrumentationCallbacks *PIC);

  void addIRPasses(PassManagerWrapper &PMW) const;
  void addCodeGenPrepare(PassManagerWrapper &PMW) const;
  void addPreISel(PassManagerWrapper &PMW) const;
  void addILPOpts(PassManagerWrapper &PMW) const;
  void addAsmPrinterBegin(PassManagerWrapper &PMW, CreateMCStreamer) const;
  void addAsmPrinter(PassManagerWrapper &PMW, CreateMCStreamer) const;
  void addAsmPrinterEnd(PassManagerWrapper &PMW, CreateMCStreamer) const;
  Error addInstSelector(PassManagerWrapper &PMW) const;
  void addPreRewrite(PassManagerWrapper &PMW) const;
  void addMachineSSAOptimization(PassManagerWrapper &PMW) const;
  void addPostRegAlloc(PassManagerWrapper &PMW) const;
  void addPreEmitPass(PassManagerWrapper &PMW) const;
  void addPreEmitRegAlloc(PassManagerWrapper &PMW) const;
  Error addRegAssignmentFast(PassManagerWrapper &PMW) const;
  Error addRegAssignmentOptimized(PassManagerWrapper &PMW) const;
  void addPreRegAlloc(PassManagerWrapper &PMW) const;
  Error addFastRegAlloc(PassManagerWrapper &PMW) const;
  Error addOptimizedRegAlloc(PassManagerWrapper &PMW) const;
  void addPreSched2(PassManagerWrapper &PMW) const;
  void addPostBBSections(PassManagerWrapper &PMW) const;

private:
  Error validateRegAllocOptions() const;

public:
  /// Check if a pass is enabled given the \p Opt option. The option always
  /// overrides the default if explicitly used. Otherwise, the default is used,
  /// provided that the pass is meant to run at a minimum optimization
  /// \p Level.
  bool isPassEnabled(const cl::opt<bool> &Opt,
                     CodeGenOptLevel Level = CodeGenOptLevel::Default) const;
  void addEarlyCSEOrGVNPass(PassManagerWrapper &PMW) const;
  void addStraightLineScalarOptimizationPasses(PassManagerWrapper &PMW) const;
};

class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
public:
  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
public:
  WWMRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

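// Register-class filter predicates for the register allocator. They let a
// single allocation run be restricted to SGPRs, to all non-SGPR registers, or
// to non-SGPR registers carrying the WWM flag, so that allocation can be
// split into per-class phases.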
static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
                              const MachineRegisterInfo &MRI,
                              const Register Reg) {
  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
}

static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
                              const MachineRegisterInfo &MRI,
                              const Register Reg) {
  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
}

static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI,
                                const MachineRegisterInfo &MRI,
                                const Register Reg) {
  const SIMachineFunctionInfo *MFI =
      MRI.getMF().getInfo<SIMachineFunctionInfo>();
  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) &&
         MFI->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
}

/// -{sgpr|wwm|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;

static SGPRRegisterRegAlloc
defaultSGPRRegAlloc("default",
                    "pick SGPR register allocator based on -O option",
                    useDefaultRegisterAllocator);

static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<SGPRRegisterRegAlloc>>
SGPRRegAlloc("sgpr-regalloc", cl::Hidden,
             cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for SGPRs"));

static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<VGPRRegisterRegAlloc>>
VGPRRegAlloc("vgpr-regalloc", cl::Hidden,
             cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for VGPRs"));

static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<WWMRegisterRegAlloc>>
    WWMRegAlloc("wwm-regalloc", cl::Hidden,
                cl::init(&useDefaultRegisterAllocator),
                cl::desc("Register allocator to use for WWM registers"));

// New pass manager register allocator options for AMDGPU
static cl::opt<RegAllocType, false, RegAllocTypeParser> SGPRRegAllocNPM(
    "sgpr-regalloc-npm", cl::Hidden, cl::init(RegAllocType::Default),
    cl::desc("Register allocator for SGPRs (new pass manager)"));

static cl::opt<RegAllocType, false, RegAllocTypeParser> VGPRRegAllocNPM(
    "vgpr-regalloc-npm", cl::Hidden, cl::init(RegAllocType::Default),
    cl::desc("Register allocator for VGPRs (new pass manager)"));

static cl::opt<RegAllocType, false, RegAllocTypeParser> WWMRegAllocNPM(
    "wwm-regalloc-npm", cl::Hidden, cl::init(RegAllocType::Default),
    cl::desc("Register allocator for WWM registers (new pass manager)"));

/// Check if the given RegAllocType is supported for AMDGPU NPM register
/// allocation. Only Fast and Greedy are supported; Basic and PBQP are not.
static Error checkRegAllocSupported(RegAllocType RAType, StringRef RegName) {
  if (RAType == RegAllocType::Basic || RAType == RegAllocType::PBQP) {
    return make_error<StringError>(
        Twine("unsupported register allocator '") +
            (RAType == RegAllocType::Basic ? "basic" : "pbqp") + "' for " +
            RegName + " registers",
        inconvertibleErrorCode());
  }
  return Error::success();
}

Error AMDGPUCodeGenPassBuilder::validateRegAllocOptions() const {
  // 1. Generic --regalloc-npm is not supported for AMDGPU.
  if (Opt.RegAlloc != RegAllocType::Unset) {
    return make_error<StringError>(
        "-regalloc-npm not supported for amdgcn. Use -sgpr-regalloc-npm, "
        "-vgpr-regalloc-npm, and -wwm-regalloc-npm",
        inconvertibleErrorCode());
  }

  // 2. Legacy PM regalloc options are not compatible with NPM.
  if (SGPRRegAlloc.getNumOccurrences() > 0 ||
      VGPRRegAlloc.getNumOccurrences() > 0 ||
      WWMRegAlloc.getNumOccurrences() > 0) {
    return make_error<StringError>(
        "-sgpr-regalloc, -vgpr-regalloc, and -wwm-regalloc are legacy PM "
        "options. Use -sgpr-regalloc-npm, -vgpr-regalloc-npm, and "
        "-wwm-regalloc-npm with the new pass manager",
        inconvertibleErrorCode());
  }

  // 3. Only Fast and Greedy allocators are supported for AMDGPU.
  if (auto Err = checkRegAllocSupported(SGPRRegAllocNPM, "SGPR"))
    return Err;
  if (auto Err = checkRegAllocSupported(WWMRegAllocNPM, "WWM"))
    return Err;
  if (auto Err = checkRegAllocSupported(VGPRRegAllocNPM, "VGPR"))
    return Err;

  return Error::success();
}

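// Bind the allocator factory selected on the -{sgpr|vgpr|wwm}-regalloc command
// line (or the built-in default) as the default for each register class; each
// initializer is intended to run exactly once via the once_flags above.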
static void initializeDefaultSGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = SGPRRegAlloc;
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
  }
}

static void initializeDefaultVGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = VGPRRegAlloc;
    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
  }
}

static void initializeDefaultWWMRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = WWMRegAlloc;
    WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
  }
}

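// Per-class allocator factories pairing an allocation algorithm (basic,
// greedy, fast) with the matching register-class filter. Only the fast VGPR
// allocator clears virtual registers, since VGPR allocation is the final
// allocation phase.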
static FunctionPass *createBasicSGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createGreedySGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createFastSGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

static FunctionPass *createBasicVGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createGreedyVGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createFastVGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}

static FunctionPass *createBasicWWMRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateWWMRegs);
}

static FunctionPass *createGreedyWWMRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateWWMRegs);
}

static FunctionPass *createFastWWMRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateWWMRegs, false);
}

static SGPRRegisterRegAlloc basicRegAllocSGPR(
    "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
    "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);

static SGPRRegisterRegAlloc fastRegAllocSGPR(
    "fast", "fast register allocator", createFastSGPRRegisterAllocator);

static VGPRRegisterRegAlloc basicRegAllocVGPR(
    "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
static VGPRRegisterRegAlloc greedyRegAllocVGPR(
    "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);

static VGPRRegisterRegAlloc fastRegAllocVGPR(
    "fast", "fast register allocator", createFastVGPRRegisterAllocator);
static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
                                               "basic register allocator",
                                               createBasicWWMRegisterAllocator);
static WWMRegisterRegAlloc
    greedyRegAllocWWMReg("greedy", "greedy register allocator",
                         createGreedyWWMRegisterAllocator);
static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator",
                                              createFastWWMRegisterAllocator);

static bool isLTOPreLink(ThinOrFullLTOPhase Phase) {
  return Phase == ThinOrFullLTOPhase::FullLTOPreLink ||
         Phase == ThinOrFullLTOPhase::ThinLTOPreLink;
}
} // anonymous namespace

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool>
OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                 cl::desc("Run pre-RA exec mask optimizations"),
                 cl::init(true));

static cl::opt<bool>
    LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
                  cl::desc("Lower GPU ctor / dtors to globals on the device."),
                  cl::init(true), cl::Hidden);

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
    "amdgpu-load-store-vectorizer",
    cl::desc("Enable load store vectorizer"),
    cl::init(true),
    cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
    "amdgpu-scalarize-global-loads",
    cl::desc("Enable global load scalarization"),
    cl::init(true),
    cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
    "amdgpu-internalize-symbols",
    cl::desc("Enable elimination of non-kernel functions and unused globals"),
    cl::init(false),
    cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
    "amdgpu-early-inline-all",
    cl::desc("Inline all functions early"),
    cl::init(false),
    cl::Hidden);

static cl::opt<bool> RemoveIncompatibleFunctions(
    "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
    cl::desc("Enable removal of functions when they "
             "use features not supported by the target GPU"),
    cl::init(true));

static cl::opt<bool> EnableSDWAPeephole(
    "amdgpu-sdwa-peephole",
    cl::desc("Enable SDWA peepholer"),
    cl::init(true));

static cl::opt<bool> EnableDPPCombine(
    "amdgpu-dpp-combine",
    cl::desc("Enable DPP combiner"),
    cl::init(true));

// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
    cl::desc("Enable AMDGPU Alias Analysis"),
    cl::init(true));

// Enable library call simplifications
static cl::opt<bool> EnableLibCallSimplify(
    "amdgpu-simplify-libcall",
    cl::desc("Enable amdgpu library simplifications"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
    "amdgpu-ir-lower-kernel-arguments",
    cl::desc("Lower kernel argument loads in IR pass"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableRegReassign(
    "amdgpu-reassign-regs",
    cl::desc("Enable register reassign optimizations on gfx10+"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> OptVGPRLiveRange(
    "amdgpu-opt-vgpr-liverange",
    cl::desc("Enable VGPR liverange optimizations for if-else structure"),
    cl::init(true), cl::Hidden);

static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
    "amdgpu-atomic-optimizer-strategy",
    cl::desc("Select DPP or Iterative strategy for scan"),
    cl::init(ScanOptions::Iterative),
    cl::values(
        clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
        clEnumValN(ScanOptions::Iterative, "Iterative",
                   "Use Iterative approach for scan"),
        clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));

// Enable Mode register optimization
static cl::opt<bool> EnableSIModeRegisterPass(
    "amdgpu-mode-register",
    cl::desc("Enable mode register pass"),
    cl::init(true),
    cl::Hidden);

// Enable GFX11+ s_delay_alu insertion
static cl::opt<bool>
    EnableInsertDelayAlu("amdgpu-enable-delay-alu",
                         cl::desc("Enable s_delay_alu insertion"),
                         cl::init(true), cl::Hidden);

// Enable GFX11+ VOPD
static cl::opt<bool>
    EnableVOPD("amdgpu-enable-vopd",
               cl::desc("Enable VOPD, dual issue of VALU in wave32"),
               cl::init(true), cl::Hidden);

// This option is used in lit tests to prevent dead-coding of the patterns
// being inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
              cl::init(true), cl::Hidden,
              cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
                                           cl::desc("Adjust wave priority"),
                                           cl::init(false), cl::Hidden);

static cl::opt<bool> EnableScalarIRPasses(
    "amdgpu-scalar-ir-passes",
    cl::desc("Enable scalar IR passes"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableLowerExecSync(
    "amdgpu-enable-lower-exec-sync",
    cl::desc("Enable lowering of execution synchronization."), cl::init(true),
    cl::Hidden);

static cl::opt<bool>
    EnableSwLowerLDS("amdgpu-enable-sw-lower-lds",
                     cl::desc("Enable lowering of LDS to global memory and "
                              "ASan instrumentation of the resulting IR."),
                     cl::init(true), cl::Hidden);

static cl::opt<bool, true> EnableLowerModuleLDS(
    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePreRAOptimizations(
    "amdgpu-enable-pre-ra-optimizations",
    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePromoteKernelArguments(
    "amdgpu-enable-promote-kernel-arguments",
    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
    cl::Hidden, cl::init(true));

static cl::opt<bool> EnableImageIntrinsicOptimizer(
    "amdgpu-enable-image-intrinsic-optimizer",
    cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool>
    EnableLoopPrefetch("amdgpu-loop-prefetch",
                       cl::desc("Enable loop data prefetch on AMDGPU"),
                       cl::Hidden, cl::init(false));

static cl::opt<std::string>
    AMDGPUSchedStrategy("amdgpu-sched-strategy",
                        cl::desc("Select custom AMDGPU scheduling strategy."),
                        cl::Hidden, cl::init(""));

// Scheduler selection is consulted both when creating the scheduler and from
// overrideSchedPolicy(), so keep the attribute and global command line
// handling in one helper.
StringRef llvm::AMDGPU::getSchedStrategy(const Function &F) {
  Attribute SchedStrategyAttr = F.getFnAttribute("amdgpu-sched-strategy");
  if (SchedStrategyAttr.isValid())
    return SchedStrategyAttr.getValueAsString();

  if (!AMDGPUSchedStrategy.empty())
    return AMDGPUSchedStrategy;

  return "";
}

static void
diagnoseUnsupportedCoExecSchedulerSelection(const Function &F,
                                            const GCNSubtarget &ST) {
  if (ST.hasGFX1250Insts())
    return;

  F.getContext().diagnose(DiagnosticInfoUnsupported(
      F, "'amdgpu-sched-strategy'='coexec' is only supported for gfx1250",
      DiagnosticLocation(), DS_Warning));
}

static bool useNoopPostScheduler(const Function &F) {
  Attribute PostSchedStrategyAttr =
      F.getFnAttribute("amdgpu-post-sched-strategy");
  return PostSchedStrategyAttr.isValid() &&
         PostSchedStrategyAttr.getValueAsString() == "nop";
}

static cl::opt<bool> EnableRewritePartialRegUses(
    "amdgpu-enable-rewrite-partial-reg-uses",
    cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableHipStdPar(
    "amdgpu-enable-hipstdpar",
    cl::desc("Enable HIP Standard Parallelism Offload support"),
    cl::init(false), cl::Hidden);

static cl::opt<bool>
    EnableAMDGPUAttributor("amdgpu-attributor-enable",
                           cl::desc("Enable AMDGPUAttributorPass"),
                           cl::init(true), cl::Hidden);

static cl::opt<bool> NewRegBankSelect(
    "new-reg-bank-select",
    cl::desc("Run amdgpu-regbankselect and amdgpu-regbanklegalize instead of "
             "regbankselect"),
    cl::init(false), cl::Hidden);

static cl::opt<bool> HasClosedWorldAssumption(
633 "amdgpu-link-time-closed-world",
634 cl::desc("Whether has closed-world assumption at link time"),
635 cl::init(Val: false), cl::Hidden);

static cl::opt<bool> EnableUniformIntrinsicCombine(
    "amdgpu-enable-uniform-intrinsic-combine",
    cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
    cl::init(true), cl::Hidden);

extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeR600EmitClauseMarkersPass(*PR);
  initializeR600MachineCFGStructurizerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUAsmPrinterPass(*PR);
  initializeAMDGPUDAGToDAGISelLegacyPass(*PR);
  initializeAMDGPUPrepareAGPRAllocLegacyPass(*PR);
  initializeGCNDPPCombineLegacyPass(*PR);
  initializeSILowerI1CopiesLegacyPass(*PR);
  initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
  initializeAMDGPURegBankSelectPass(*PR);
  initializeAMDGPURegBankLegalizePass(*PR);
  initializeSILowerWWMCopiesLegacyPass(*PR);
  initializeAMDGPUMarkLastScratchLoadLegacyPass(*PR);
  initializeSILowerSGPRSpillsLegacyPass(*PR);
  initializeSIFixSGPRCopiesLegacyPass(*PR);
  initializeSIFixVGPRCopiesLegacyPass(*PR);
  initializeSIFoldOperandsLegacyPass(*PR);
  initializeSIPeepholeSDWALegacyPass(*PR);
  initializeSIShrinkInstructionsLegacyPass(*PR);
  initializeSIOptimizeExecMaskingPreRALegacyPass(*PR);
  initializeSIOptimizeVGPRLiveRangeLegacyPass(*PR);
  initializeSILoadStoreOptimizerLegacyPass(*PR);
  initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPULowerExecSyncLegacyPass(*PR);
  initializeAMDGPUSwLowerLDSLegacyPass(*PR);
  initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPUPromoteKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR);
  initializeAMDGPUPostLegalizerCombinerPass(*PR);
  initializeAMDGPUPreLegalizerCombinerPass(*PR);
  initializeAMDGPURegBankCombinerPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPULateCodeGenPrepareLegacyPass(*PR);
  initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);
  initializeAMDGPULowerModuleLDSLegacyPass(*PR);
  initializeAMDGPULowerBufferFatPointersPass(*PR);
  initializeAMDGPULowerIntrinsicsLegacyPass(*PR);
  initializeAMDGPUReserveWWMRegsLegacyPass(*PR);
  initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
  initializeSIAnnotateControlFlowLegacyPass(*PR);
  initializeAMDGPUInsertDelayAluLegacyPass(*PR);
  initializeAMDGPULowerVGPREncodingLegacyPass(*PR);
  initializeSIInsertHardClausesLegacyPass(*PR);
  initializeSIInsertWaitcntsLegacyPass(*PR);
  initializeSIModeRegisterLegacyPass(*PR);
  initializeSIWholeQuadModeLegacyPass(*PR);
  initializeSILowerControlFlowLegacyPass(*PR);
  initializeSIPreEmitPeepholeLegacyPass(*PR);
  initializeSILateBranchLoweringLegacyPass(*PR);
  initializeSIMemoryLegalizerLegacyPass(*PR);
  initializeSIOptimizeExecMaskingLegacyPass(*PR);
  initializeSIPreAllocateWWMRegsLegacyPass(*PR);
  initializeSIFormMemoryClausesLegacyPass(*PR);
  initializeSIPostRABundlerLegacyPass(*PR);
  initializeGCNCreateVOPDLegacyPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUExternalAAWrapperPass(*PR);
  initializeAMDGPUImageIntrinsicOptimizerPass(*PR);
  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
  initializeAMDGPUResourceUsageAnalysisWrapperPassPass(*PR);
  initializeGCNNSAReassignLegacyPass(*PR);
  initializeGCNPreRAOptimizationsLegacyPass(*PR);
  initializeGCNPreRALongBranchRegLegacyPass(*PR);
  initializeGCNRewritePartialRegUsesLegacyPass(*PR);
  initializeGCNRegPressurePrinterPass(*PR);
  initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
  initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
  initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR);
  initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

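// Factories for the GCN machine scheduler variants (also selectable with
// -misched=...). Each one layers a subset of the target DAG mutations
// (load/store clustering, IGroupLP, macro fusion, export clustering,
// barrier/hazard latency) on top of the chosen scheduling strategy.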
static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
      C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
  DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF));
  return DAG;
}

static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
      new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  return DAG;
}

static ScheduleDAGInstrs *
createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
      C, std::make_unique<GCNMaxMemoryClauseSchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
  DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF));
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto *DAG = new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  auto *DAG = new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto *DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  return DAG;
}

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
    GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
                           createGCNMaxILPMachineScheduler);

static MachineSchedRegistry GCNMaxMemoryClauseSchedRegistry(
    "gcn-max-memory-clause", "Run GCN scheduler to maximize memory clause",
    createGCNMaxMemoryClauseMachineScheduler);

static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
    "gcn-iterative-max-occupancy-experimental",
    "Run GCN scheduler to maximize occupancy (experimental)",
    createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry GCNMinRegSchedRegistry(
    "gcn-iterative-minreg",
    "Run GCN iterative scheduler for minimal register usage (experimental)",
    createMinRegScheduler);

static MachineSchedRegistry GCNILPSchedRegistry(
    "gcn-iterative-ilp",
    "Run GCN iterative scheduler for ILP scheduling (experimental)",
    createIterativeILPMachineScheduler);

LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.isAMDGCN())
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel() {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         const TargetOptions &Options,
                                         std::optional<Reloc::Model> RM,
                                         std::optional<CodeModel::Model> CM,
                                         CodeGenOptLevel OptLevel)
    : CodeGenTargetMachineImpl(
          T, TT.computeDataLayout(), TT, getGPUOrDefault(TT, CPU), FS, Options,
          getEffectiveRelocModel(), getEffectiveCodeModel(CM, CodeModel::Small),
          OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
  if (TT.isAMDGCN()) {
    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
  }
}

bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.isValid() ? FSAttr.getValueAsString()
                          : getTargetFeatureString();
}

llvm::ScheduleDAGInstrs *
AMDGPUTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = createSchedLive(C);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || F->getName().starts_with("__asan_") ||
           F->getName().starts_with("__sanitizer_") ||
           AMDGPU::isEntryFunctionCC(F->getCallingConv());

  GV.removeDeadConstantUsers();
  return !GV.use_empty();
}

void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  if (EnableAMDGPUAliasAnalysis)
    AAM.registerFunctionAnalysis<AMDGPUAA>();
}

static Expected<ScanOptions>
parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
  if (Params.empty())
    return ScanOptions::Iterative;
  Params.consume_front("strategy=");
  auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
                    .Case("dpp", ScanOptions::DPP)
                    .Cases({"iterative", ""}, ScanOptions::Iterative)
                    .Case("none", ScanOptions::None)
                    .Default(std::nullopt);
  if (Result)
    return *Result;
  return make_error<StringError>("invalid parameter", inconvertibleErrorCode());
}

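/// Parse options for the AMDGPUAttributor pass. Parameters are separated by
/// ';'; the only recognized parameter is "closed-world".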
Expected<AMDGPUAttributorOptions>
parseAMDGPUAttributorPassOptions(StringRef Params) {
  AMDGPUAttributorOptions Result;
  while (!Params.empty()) {
    StringRef ParamName;
    std::tie(ParamName, Params) = Params.split(';');
    if (ParamName == "closed-world") {
      Result.IsClosedWorld = true;
    } else {
      return make_error<StringError>(
          formatv("invalid AMDGPUAttributor pass parameter '{0}' ", ParamName)
              .str(),
          inconvertibleErrorCode());
    }
  }
  return Result;
}

void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {

#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"

  PB.registerPipelineParsingCallback(
      [this](StringRef Name, CGSCCPassManager &PM,
             ArrayRef<PassBuilder::PipelineElement> Pipeline) {
        if (Name == "amdgpu-attributor-cgscc" && getTargetTriple().isAMDGCN()) {
          PM.addPass(AMDGPUAttributorCGSCCPass(
              *static_cast<GCNTargetMachine *>(this)));
          return true;
        }
        return false;
      });

  PB.registerScalarOptimizerLateEPCallback(
      [](FunctionPassManager &FPM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FPM.addPass(InferAddressSpacesPass());
      });

  PB.registerVectorizerEndEPCallback(
      [](FunctionPassManager &FPM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FPM.addPass(InferAddressSpacesPass());
      });

  PB.registerPipelineEarlySimplificationEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level,
             ThinOrFullLTOPhase Phase) {
        if (!isLTOPreLink(Phase) && getTargetTriple().isAMDGCN()) {
          // When we are not using -fgpu-rdc, we can run accelerator code
          // selection relatively early, but still after linking to prevent
          // eager removal of potentially reachable symbols.
          if (EnableHipStdPar) {
            PM.addPass(HipStdParMathFixupPass());
            PM.addPass(HipStdParAcceleratorCodeSelectionPass());
          }

          PM.addPass(AMDGPUPrintfRuntimeBindingPass());
        }

        if (Level == OptimizationLevel::O0)
          return;

        // We don't want to run internalization at the per-module stage.
        if (InternalizeSymbols && !isLTOPreLink(Phase)) {
          PM.addPass(InternalizePass(mustPreserveGV));
          PM.addPass(GlobalDCEPass());
        }

        if (EarlyInlineAll && !EnableFunctionCalls)
          PM.addPass(AMDGPUAlwaysInlinePass());
      });

  PB.registerPeepholeEPCallback(
      [](FunctionPassManager &FPM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FPM.addPass(AMDGPUUseNativeCallsPass());
        if (EnableLibCallSimplify)
          FPM.addPass(AMDGPUSimplifyLibCallsPass());

        if (EnableUniformIntrinsicCombine)
          FPM.addPass(AMDGPUUniformIntrinsicCombinePass());
      });

  PB.registerCGSCCOptimizerLateEPCallback(
      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(AMDGPULowerKernelAttributesPass());

        if (Level != OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
      });

  // FIXME: Why is AMDGPUAttributor not in CGSCC?
  PB.registerOptimizerLastEPCallback([this](ModulePassManager &MPM,
                                            OptimizationLevel Level,
                                            ThinOrFullLTOPhase Phase) {
    if (Level != OptimizationLevel::O0) {
      if (!isLTOPreLink(Phase)) {
        if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) {
          AMDGPUAttributorOptions Opts;
          MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase));
        }
      }
    }
  });

  PB.registerFullLinkTimeOptimizationLastEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        // When we are using -fgpu-rdc, we can only run accelerator code
        // selection after linking; otherwise we would end up removing
        // potentially reachable symbols that were exported as external in
        // other modules.
        if (EnableHipStdPar) {
          PM.addPass(HipStdParMathFixupPass());
          PM.addPass(HipStdParAcceleratorCodeSelectionPass());
        }
        // We want to support the -lto-partitions=N option as "best effort".
        // For that, we need to lower LDS earlier in the pipeline before the
        // module is partitioned for codegen.
        if (EnableLowerExecSync)
          PM.addPass(AMDGPULowerExecSyncPass());
        if (EnableSwLowerLDS)
          PM.addPass(AMDGPUSwLowerLDSPass(*this));
        if (EnableLowerModuleLDS)
          PM.addPass(AMDGPULowerModuleLDSPass(*this));
        if (Level != OptimizationLevel::O0) {
          // We only want to run this with O2 or higher since inliner and SROA
          // don't run in O1.
          if (Level != OptimizationLevel::O1) {
            PM.addPass(
                createModuleToFunctionPassAdaptor(InferAddressSpacesPass()));
          }
          // Do we really need internalization in LTO?
          if (InternalizeSymbols) {
            PM.addPass(InternalizePass(mustPreserveGV));
            PM.addPass(GlobalDCEPass());
          }
          if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) {
            AMDGPUAttributorOptions Opt;
            if (HasClosedWorldAssumption)
              Opt.IsClosedWorld = true;
            PM.addPass(AMDGPUAttributorPass(
                *this, Opt, ThinOrFullLTOPhase::FullLTOPostLink));
          }
        }
        if (!NoKernelInfoEndLTO) {
          FunctionPassManager FPM;
          FPM.addPass(KernelInfoPrinter(this));
          PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
        }
      });

  PB.registerRegClassFilterParsingCallback(
      [](StringRef FilterName) -> RegAllocFilterFunc {
        if (FilterName == "sgpr")
          return onlyAllocateSGPRs;
        if (FilterName == "vgpr")
          return onlyAllocateVGPRs;
        if (FilterName == "wwm")
          return onlyAllocateWWMRegs;
        return nullptr;
      });
}

bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
         AMDGPU::isFlatGlobalAddrSpace(DestAS);
}

unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  if (auto *Arg = dyn_cast<Argument>(V);
      Arg &&
      AMDGPU::isModuleEntryFunctionCC(Arg->getParent()->getCallingConv()) &&
      !Arg->hasByRefAttr())
    return AMDGPUAS::GLOBAL_ADDRESS;

  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD) // TODO: Handle invariant load like constant.
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // It must be a loaded generic (flat) pointer.
  assert(V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // A generic pointer loaded from constant memory can be assumed to be a
  // global pointer: constant memory is only populated on the host side, and
  // the offload programming model implies that only global pointers can be
  // referenced on the host side.
  return AMDGPUAS::GLOBAL_ADDRESS;
}

std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
    case Intrinsic::amdgcn_is_private:
      return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
    default:
      break;
    }
    return std::pair(nullptr, -1);
  }
  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that logical 'and' is commutative
  // and the order of 'is_shared' and 'is_private' is not significant.
  Value *Ptr;
  if (match(
          const_cast<Value *>(V),
          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                      m_Deferred(Ptr))))))
    return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);

  return std::pair(nullptr, -1);
}

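// Stack pseudo-sources live in private (scratch) memory; constant pools, jump
// tables, the GOT and call entries are materialized in constant memory.
// Anything else is conservatively mapped to the flat address space.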
unsigned
AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
  switch (Kind) {
  case PseudoSourceValue::Stack:
  case PseudoSourceValue::FixedStack:
    return AMDGPUAS::PRIVATE_ADDRESS;
  case PseudoSourceValue::ConstantPool:
  case PseudoSourceValue::GOT:
  case PseudoSourceValue::JumpTable:
  case PseudoSourceValue::GlobalValueCallEntry:
  case PseudoSourceValue::ExternalSymbolCallEntry:
    return AMDGPUAS::CONSTANT_ADDRESS;
  }
  return AMDGPUAS::FLAT_ADDRESS;
}

bool AMDGPUTargetMachine::splitModule(
    Module &M, unsigned NumParts,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
  // FIXME(?): Would be better to use an already existing Analysis/PassManager,
  // but all current users of this API don't have one ready and would need to
  // create one anyway. Let's hide the boilerplate for now to keep it simple.

  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  PassBuilder PB(this);
  PB.registerModuleAnalyses(MAM);
  PB.registerFunctionAnalyses(FAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(AMDGPUSplitModulePass(NumParts, ModuleCallback));
  MPM.run(M, MAM);
  return true;
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   const TargetOptions &Options,
                                   std::optional<Reloc::Model> RM,
                                   std::optional<CodeModel::Model> CM,
                                   CodeGenOptLevel OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const TargetSubtargetInfo *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(std::make_unique<GCNTTIImpl>(this, F));
}

Error GCNTargetMachine::buildCodeGenPipeline(
    ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
    CodeGenFileType FileType, const CGPassBuilderOption &Opts, MCContext &Ctx,
    PassInstrumentationCallbacks *PIC) {
  AMDGPUCodeGenPassBuilder CGPB(*this, Opts, PIC);
  return CGPB.buildPipeline(MPM, Out, DwoOut, FileType, Ctx);
}

ScheduleDAGInstrs *
GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);

  StringRef SchedStrategy = AMDGPU::getSchedStrategy(C->MF->getFunction());

  if (SchedStrategy == "max-ilp")
    return createGCNMaxILPMachineScheduler(C);

  if (SchedStrategy == "max-memory-clause")
    return createGCNMaxMemoryClauseMachineScheduler(C);

  if (SchedStrategy == "iterative-ilp")
    return createIterativeILPMachineScheduler(C);

  if (SchedStrategy == "iterative-minreg")
    return createMinRegScheduler(C);

  if (SchedStrategy == "iterative-maxocc")
    return createIterativeGCNMaxOccupancyMachineScheduler(C);

  if (SchedStrategy == "coexec") {
    diagnoseUnsupportedCoExecSchedulerSelection(C->MF->getFunction(), ST);
    return createGCNCoExecMachineScheduler(C);
  }

  return createGCNMaxOccupancyMachineScheduler(C);
}

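// Post-RA scheduling. Functions carrying 'amdgpu-post-sched-strategy'='nop'
// get a no-op scheduler; everything else uses the generic post-RA scheduler
// augmented with the target clustering, IGroupLP, VOPD pairing, and latency
// mutations below.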
ScheduleDAGInstrs *
GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
  if (useNoopPostScheduler(C->MF->getFunction()))
    return createGCNNoopPostMachineScheduler(C);

  ScheduleDAGMI *DAG =
      new GCNPostScheduleDAGMILive(C, std::make_unique<PostGenericScheduler>(C),
                                   /*RemoveKillFlags=*/true);
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
  if ((EnableVOPD.getNumOccurrences() ||
       getOptLevel() >= CodeGenOptLevel::Less) &&
      EnableVOPD)
    DAG->addMutation(createVOPDPairingMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
  DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF));
  return DAG;
}

//===----------------------------------------------------------------------===//
// AMDGPU Legacy Pass Setup
//===----------------------------------------------------------------------===//

std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

namespace {

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(TargetMachine &TM, PassManagerBase &PM)
      : AMDGPUPassConfig(TM, PM) {
    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  void addPreRegBankSelect() override;
  bool addRegBankSelect() override;
  void addPreGlobalInstructionSelect() override;
  bool addGlobalInstructionSelect() override;
  void addPreRegAlloc() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  FunctionPass *createSGPRAllocPass(bool Optimized);
  FunctionPass *createVGPRAllocPass(bool Optimized);
  FunctionPass *createWWMRegAllocPass(bool Optimized);
  FunctionPass *createRegAllocPass(bool Optimized) override;

  bool addRegAssignAndRewriteFast() override;
  bool addRegAssignAndRewriteOptimized() override;

  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
  void addPostBBSections() override;
};

} // end anonymous namespace

AMDGPUPassConfig::AMDGPUPassConfig(TargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  // Garbage collection is not supported.
  disablePass(&GCLoweringID);
  disablePass(&ShadowStackGCLoweringID);
}

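// GVN runs only at -O3 (CodeGenOptLevel::Aggressive); at lower optimization
// levels the cheaper EarlyCSE is sufficient.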
void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOptLevel::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
    addPass(createLoopDataPrefetchPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions that GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN())
    addPass(createAMDGPURemoveIncompatibleFunctionsPass(&TM));

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  if (TM.getTargetTriple().isAMDGCN())
    addPass(createAMDGPUPrintfRuntimeBinding());

  if (LowerCtorDtor)
    addPass(createAMDGPUCtorDtorLoweringLegacyPass());

  if (TM.getTargetTriple().isAMDGCN() &&
      isPassEnabled(EnableImageIntrinsicOptimizer))
    addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));

  if (EnableUniformIntrinsicCombine)
    addPass(createAMDGPUUniformIntrinsicCombineLegacyPass());

  // This can be disabled by passing ::Disable here or on the command line
  // with --expand-variadics-override=disable.
  addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Make enqueued block runtime handles externally visible.
  addPass(createAMDGPUExportKernelRuntimeHandlesLegacyPass());

  // Lower special LDS accesses.
  if (EnableLowerExecSync)
    addPass(createAMDGPULowerExecSyncLegacyPass());

1459 // Lower LDS accesses to global memory pass if address sanitizer is enabled.
1460 if (EnableSwLowerLDS)
1461 addPass(P: createAMDGPUSwLowerLDSLegacyPass(TM: &TM));
1462
1463 // Runs before PromoteAlloca so the latter can account for function uses
1464 if (EnableLowerModuleLDS) {
1465 addPass(P: createAMDGPULowerModuleLDSLegacyPass(TM: &TM));
1466 }

  // Run the atomic optimizer before atomic expansion.
  if ((TM.getTargetTriple().isAMDGCN()) &&
      (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
    addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
  }

  addPass(createAtomicExpandLegacyPass());

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addPass(createAMDGPUPromoteAlloca());

    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
      }));
    }

    if (TM.getTargetTriple().isAMDGCN()) {
      // TODO: May want to move later or split into an early and late one.
      addPass(createAMDGPUCodeGenPreparePass());
    }

    // Try to hoist loop-invariant parts of divisions that AMDGPUCodeGenPrepare
    // may have expanded.
    if (TM.getOptLevel() > CodeGenOptLevel::Less)
      addPass(createLICMPass());
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().isAMDGCN() &&
      TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createAMDGPUPreloadKernelArgumentsLegacyPass(TM));

  if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  TargetPassConfig::addCodeGenPrepare();

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(createLoadStoreVectorizerPass());

  if (TM->getTargetTriple().isAMDGCN()) {
    // This lowering has been placed after codegenprepare to take advantage of
    // address mode matching (which is why it isn't put with the LDS lowerings).
    // It could be placed anywhere before uniformity annotations (an analysis
    // that it changes by splitting up fat pointers into their components),
    // but it has been put before switch lowering and CFG flattening so that
    // those passes can run on the more optimized control flow this pass
    // creates in many cases.
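    // (For illustration: the "buffer fat pointers" are 160-bit addrspace(7)
    // pointers, which this pass splits into a 128-bit addrspace(8) buffer
    // resource and a 32-bit offset.)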
    addPass(createAMDGPULowerBufferFatPointersPass());
    addPass(createAMDGPULowerIntrinsicsLegacyPass());
  }

  // The LowerSwitch pass may introduce unreachable blocks that can cause
  // unexpected behavior for subsequent passes. Placing it here means those
  // blocks get cleaned up by the UnreachableBlockElim pass inserted next in
  // the pass flow.
  addPass(createLowerSwitchPass());
}

bool AMDGPUPassConfig::addPreISel() {
  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// GCN Legacy Pass Setup
//===----------------------------------------------------------------------===//

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createSinkingPass());

  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createAMDGPULateCodeGenPrepareLegacyPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  addPass(createFixIrreduciblePass());
  addPass(createUnifyLoopExitsPass());
  addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions

  addPass(createAMDGPUAnnotateUniformValuesLegacy());
  addPass(createSIAnnotateControlFlowLegacyPass());
  // TODO: Move this right after structurizeCFG to avoid extra divergence
  // analysis. This depends on stopping SIAnnotateControlFlow from making
  // control flow modifications.
  addPass(createAMDGPURewriteUndefForPHILegacyPass());

  // SDAG requires LCSSA, GlobalISel does not. Disable LCSSA for -global-isel
  // with -new-reg-bank-select and without any of the fallback options.
  if (!getCGPassBuilderOption().EnableGlobalISelOption ||
      !isGlobalISelAbortEnabled() || !NewRegBankSelect)
    addPass(createLCSSAPass());

  if (TM->getOptLevel() > CodeGenOptLevel::Less)
    addPass(&AMDGPUPerfHintAnalysisLegacyID);

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsLegacyID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineLegacyID);
  addPass(&SILoadStoreOptimizerLegacyID);
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(&SIPeepholeSDWALegacyID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSELegacyID);
    addPass(&SIFoldOperandsLegacyID);
  }
  addPass(&DeadMachineInstructionElimID);
  addPass(createSIShrinkInstructionsLegacyPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterLegacyID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesLegacyID);
  addPass(createSILowerI1CopiesLegacyPass());
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreLegalizeMachineIR() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
  addPass(new Localizer());
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

void GCNPassConfig::addPreRegBankSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
  addPass(createAMDGPUGlobalISelDivergenceLoweringPass());
}

bool GCNPassConfig::addRegBankSelect() {
  if (NewRegBankSelect) {
    addPass(createAMDGPURegBankSelectPass());
    addPass(createAMDGPURegBankLegalizePass());
  } else {
    addPass(new RegBankSelect());
  }
  return false;
}

void GCNPassConfig::addPreGlobalInstructionSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPURegBankCombiner(IsOptNone));
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect(getOptLevel()));
  return false;
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);

  insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);

  TargetPassConfig::addFastRegAlloc();
}

void GCNPassConfig::addPreRegAlloc() {
  if (getOptLevel() != CodeGenOptLevel::None)
    addPass(&AMDGPUPrepareAGPRAllocLegacyID);
}

void GCNPassConfig::addOptimizedRegAlloc() {
  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  // FIXME: When an instruction has a killed operand and the instruction is
  // inside a bundle, it seems only the BUNDLE instruction appears as the kill
  // of the register in LiveVariables. This would trigger a failure in the
  // verifier; we should fix it and enable the verifier.
  if (OptVGPRLiveRange)
    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeLegacyID);

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);

  if (EnableRewritePartialRegUses)
    insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass(&MachineSchedulerID, &GCNPreRAOptimizationsID);

  // Allow the scheduler to run before SIWholeQuadMode inserts exec
  // manipulation instructions that cause scheduling barriers.
  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);

  if (OptExecMaskPreRA)
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM->getOptLevel() > CodeGenOptLevel::Less)
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  if (EnableRegReassign)
    addPass(&GCNNSAReassignID);

  addPass(&AMDGPURewriteAGPRCopyMFMALegacyID);
  return true;
}

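// Each register class (SGPRs, WWM registers, VGPRs) gets its own allocator
// run. The default allocator for each class can be overridden with the
// -sgpr-regalloc, -wwm-regalloc, and -vgpr-regalloc command-line options;
// an explicit override takes priority in the factories below.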
FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyRegisterAllocator(onlyAllocateSGPRs);

  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
                  initializeDefaultVGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyVGPRRegisterAllocator();

  return createFastVGPRRegisterAllocator();
}

FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultWWMRegisterAllocatorFlag,
                  initializeDefaultWWMRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyWWMRegisterAllocator();

  return createFastWWMRegisterAllocator();
}

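// SGPR, WWM, and VGPR allocation is done by the dedicated passes created
// above, so the combined single-allocator entry point must never be reached.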
FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
    "-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, "
    "and -vgpr-regalloc";

bool GCNPassConfig::addRegAssignAndRewriteFast() {
  if (!usingDefaultRegAlloc())
    reportFatalUsageError(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsLegacyID);

  // To allocate WWM registers used in whole quad mode operations (for
  // shaders).
  addPass(&SIPreAllocateWWMRegsLegacyID);

  // For allocating other WWM register operands.
  addPass(createWWMRegAllocPass(false));

  addPass(&SILowerWWMCopiesLegacyID);
  addPass(&AMDGPUReserveWWMRegsLegacyID);

  // For allocating per-thread VGPRs.
  addPass(createVGPRAllocPass(false));

  return true;
}

bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    reportFatalUsageError(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(createVirtRegRewriter(false));

  // At this point, the sgpr-regalloc has been done and it is good to have the
  // stack slot coloring to try to optimize the SGPR spill stack indices before
  // attempting the custom SGPR spill lowering.
  addPass(&StackSlotColoringID);

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsLegacyID);

  // To allocate WWM registers used in whole quad mode operations (for
  // shaders).
  addPass(&SIPreAllocateWWMRegsLegacyID);

  // For allocating other whole wave mode registers.
  addPass(createWWMRegAllocPass(true));
  addPass(&SILowerWWMCopiesLegacyID);
  addPass(createVirtRegRewriter(false));
  addPass(&AMDGPUReserveWWMRegsLegacyID);

  // For allocating per-thread VGPRs.
  addPass(createVGPRAllocPass(true));

  addPreRewrite();
  addPass(&VirtRegRewriterID);

  addPass(&AMDGPUMarkLastScratchLoadID);

  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIOptimizeExecMaskingLegacyID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createSIShrinkInstructionsLegacyPass());
  addPass(&SIPostRABundlerLegacyID);
}

void GCNPassConfig::addPreEmitPass() {
  if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
    addPass(&GCNCreateVOPDID);
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
    addPass(createAMDGPUSetWavePriorityPass());
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-RA scheduler is not
  // guaranteed to handle all hazards correctly. This is because, if there are
  // multiple scheduling regions in a basic block, the regions are scheduled
  // bottom up, so when we begin to schedule a region we don't know what
  // instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  addPass(&AMDGPUWaitSGPRHazardsLegacyID);

  addPass(&AMDGPULowerVGPREncodingLegacyID);

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
    addPass(&AMDGPUInsertDelayAluID);

  addPass(&BranchRelaxationPassID);
}

void GCNPassConfig::addPostBBSections() {
  // We run this later to avoid passes like livedebugvalues and BBSections
  // having to deal with the apparent multi-entry functions we may generate.
  addPass(createAMDGPUPreloadKernArgPrologLegacyPass());
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

void GCNTargetMachine::registerMachineRegisterInfoCallback(
    MachineFunction &MF) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MF.getRegInfo().addDelegate(MFI);
}

MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
      Allocator, F, static_cast<const GCNSubtarget *>(STI));
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fix up the subtarget-dependent default value.
    MFI->Occupancy = ST.getOccupancyWithWorkGroupSizes(MF).second;
  }

  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };
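  // For illustration, the named-register fields appear in MIR YAML roughly as
  // follows (the register choices here are only examples):
  //   machineFunctionInfo:
  //     scratchRSrcReg:    '$sgpr0_sgpr1_sgpr2_sgpr3'
  //     frameOffsetReg:    '$sgpr33'
  //     stackPtrOffsetReg: '$sgpr32'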

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
                            MFI->LongBranchReservedReg))
    return true;

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         {}, {});
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(ParsedReg);
  }

  for (const auto &[_, Info] : PFS.VRegInfosNamed) {
    MFI->setFlag(Info->VReg, Info->Flags);
  }
  for (const auto &[_, Info] : PFS.VRegInfos) {
    MFI->setFlag(Info->VReg, Info->Flags);
  }

  for (const auto &YamlRegStr : YamlMFI.SpillPhysVGPRS) {
    Register ParsedReg;
    if (parseRegister(YamlRegStr, ParsedReg))
      return true;
    MFI->SpillPhysVGPRs.push_back(ParsedReg);
  }

  auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, *A->Mask);

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };
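  // For illustration, an argument block in MIR YAML looks roughly like this
  // (the register choices here are only examples):
  //   machineFunctionInfo:
  //     argumentInfo:
  //       privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
  //       kernargSegmentPtr:    { reg: '$sgpr4_sgpr5' }
  //       workGroupIDX:         { reg: '$sgpr6' }
  //       workItemIDX:          { reg: '$vgpr0' }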

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.LDSKernelId, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  // Parse FirstKernArgPreloadReg separately, since it's a Register, not an
  // ArgDescriptor.
  if (YamlMFI.ArgInfo && YamlMFI.ArgInfo->FirstKernArgPreloadReg) {
    const yaml::SIArgument &A = *YamlMFI.ArgInfo->FirstKernArgPreloadReg;

    if (!A.IsRegister) {
      // For stack arguments, we don't have RegisterName.SourceRange, but we
      // should have some location info from the YAML parser.
      const MemoryBuffer &Buffer =
          *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
      // Create a minimal valid source range.
      SMLoc Loc = SMLoc::getFromPointer(Buffer.getBufferStart());
      SMRange Range(Loc, Loc);

      Error = SMDiagnostic(
          *PFS.SM, Loc, Buffer.getBufferIdentifier(), 1, 0, SourceMgr::DK_Error,
          "firstKernArgPreloadReg must be a register, not a stack location", "",
          {}, {});

      SourceRange = Range;
      return true;
    }

    Register Reg;
    if (parseNamedRegisterReference(PFS, Reg, A.RegisterName.Value, Error)) {
      SourceRange = A.RegisterName.SourceRange;
      return true;
    }

    if (!AMDGPU::SGPR_32RegClass.contains(Reg))
      return diagnoseRegisterClass(A.RegisterName);

    MFI->ArgInfo.FirstKernArgPreloadReg = Reg;
    MFI->NumUserSGPRs += YamlMFI.NumKernargPreloadSGPRs;
  }

  if (ST.hasFeature(AMDGPU::FeatureDX10ClampAndIEEEMode)) {
    MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
    MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
  }

  // FIXME: Move proper support for denormal-fp-math into base MachineFunction.
  MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
                                      ? DenormalMode::IEEE
                                      : DenormalMode::PreserveSign;
  MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
                                       ? DenormalMode::IEEE
                                       : DenormalMode::PreserveSign;

  MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
                                          ? DenormalMode::IEEE
                                          : DenormalMode::PreserveSign;
  MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
                                           ? DenormalMode::IEEE
                                           : DenormalMode::PreserveSign;
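  // For illustration, these booleans come from MIR YAML roughly of the form
  //   mode: { ieee: true, dx10-clamp: true, fp32-input-denormals: true,
  //           fp32-output-denormals: true }
  // (field names assumed from the serialized SIMode); true selects
  // DenormalMode::IEEE and false selects DenormalMode::PreserveSign.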

  if (YamlMFI.HasInitWholeWave)
    MFI->setInitWholeWave();

  return false;
}

//===----------------------------------------------------------------------===//
// AMDGPU CodeGen Pass Builder interface.
//===----------------------------------------------------------------------===//

AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder(
    GCNTargetMachine &TM, const CGPassBuilderOption &Opts,
    PassInstrumentationCallbacks *PIC)
    : CodeGenPassBuilder(TM, Opts, PIC) {
  Opt.MISchedPostRA = true;
  Opt.RequiresCodeGenSCCOrder = true;
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  // Garbage collection is not supported.
  disablePass<StackMapLivenessPass, FuncletLayoutPass, PatchableFunctionPass,
              ShadowStackGCLoweringPass, GCLoweringPass>();
}

void AMDGPUCodeGenPassBuilder::addIRPasses(PassManagerWrapper &PMW) const {
  if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN()) {
    flushFPMsToMPM(PMW);
    addModulePass(AMDGPURemoveIncompatibleFunctionsPass(TM), PMW);
  }

  flushFPMsToMPM(PMW);

  if (TM.getTargetTriple().isAMDGCN())
    addModulePass(AMDGPUPrintfRuntimeBindingPass(), PMW);

  if (LowerCtorDtor)
    addModulePass(AMDGPUCtorDtorLoweringPass(), PMW);

  if (isPassEnabled(EnableImageIntrinsicOptimizer))
    addFunctionPass(AMDGPUImageIntrinsicOptimizerPass(TM), PMW);

  if (EnableUniformIntrinsicCombine)
    addFunctionPass(AMDGPUUniformIntrinsicCombinePass(), PMW);

  // This can be disabled by passing ::Disable here or on the command line
  // with --expand-variadics-override=disable.
  flushFPMsToMPM(PMW);
  addModulePass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering), PMW);

  addModulePass(AMDGPUAlwaysInlinePass(), PMW);
  addModulePass(AlwaysInlinerPass(), PMW);

  addModulePass(AMDGPUExportKernelRuntimeHandlesPass(), PMW);

  if (EnableLowerExecSync)
    addModulePass(AMDGPULowerExecSyncPass(), PMW);

  if (EnableSwLowerLDS)
    addModulePass(AMDGPUSwLowerLDSPass(TM), PMW);

  // Runs before PromoteAlloca so the latter can account for function uses.
  if (EnableLowerModuleLDS)
    addModulePass(AMDGPULowerModuleLDSPass(TM), PMW);

  // Run the atomic optimizer before atomic expansion.
  if (TM.getOptLevel() >= CodeGenOptLevel::Less &&
      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None))
    addFunctionPass(
        AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy), PMW);

  addFunctionPass(AtomicExpandPass(TM), PMW);

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addFunctionPass(AMDGPUPromoteAllocaPass(TM), PMW);
    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses(PMW);

    // TODO: Handle EnableAMDGPUAliasAnalysis

    // TODO: May want to move later or split into an early and late one.
    addFunctionPass(AMDGPUCodeGenPreparePass(TM), PMW);

    // Try to hoist loop-invariant parts of divisions that AMDGPUCodeGenPrepare
    // may have expanded.
    if (TM.getOptLevel() > CodeGenOptLevel::Less) {
      addFunctionPass(createFunctionToLoopPassAdaptor(LICMPass(LICMOptions()),
                                                      /*UseMemorySSA=*/true),
                      PMW);
    }
  }

  Base::addIRPasses(PMW);

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass(PMW);
}

void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(
    PassManagerWrapper &PMW) const {
  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    flushFPMsToMPM(PMW);
    addModulePass(AMDGPUPreloadKernelArgumentsPass(TM), PMW);
  }

  if (EnableLowerKernelArguments)
    addFunctionPass(AMDGPULowerKernelArgumentsPass(TM), PMW);

  Base::addCodeGenPrepare(PMW);

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addFunctionPass(LoadStoreVectorizerPass(), PMW);

  // This lowering has been placed after codegenprepare to take advantage of
  // address mode matching (which is why it isn't put with the LDS lowerings).
  // It could be placed anywhere before uniformity annotations (an analysis
  // that it changes by splitting up fat pointers into their components),
  // but it has been put before switch lowering and CFG flattening so that
  // those passes can run on the more optimized control flow this pass
  // creates in many cases.
  flushFPMsToMPM(PMW);
  addModulePass(AMDGPULowerBufferFatPointersPass(TM), PMW);
  flushFPMsToMPM(PMW);
  requireCGSCCOrder(PMW);

  addModulePass(AMDGPULowerIntrinsicsPass(TM), PMW);

  // The LowerSwitch pass may introduce unreachable blocks that can cause
  // unexpected behavior for subsequent passes. Placing it here means those
  // blocks get cleaned up by the UnreachableBlockElim pass inserted next in
  // the pass flow.
  addFunctionPass(LowerSwitchPass(), PMW);
}

void AMDGPUCodeGenPassBuilder::addPreISel(PassManagerWrapper &PMW) const {
  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addFunctionPass(FlattenCFGPass(), PMW);
    addFunctionPass(SinkingPass(), PMW);
    addFunctionPass(AMDGPULateCodeGenPreparePass(TM), PMW);
  }

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addFunctionPass(AMDGPUUnifyDivergentExitNodesPass(), PMW);
  addFunctionPass(FixIrreduciblePass(), PMW);
  addFunctionPass(UnifyLoopExitsPass(), PMW);
  addFunctionPass(StructurizeCFGPass(/*SkipUniformRegions=*/false), PMW);

  addFunctionPass(AMDGPUAnnotateUniformValuesPass(), PMW);

  addFunctionPass(SIAnnotateControlFlowPass(TM), PMW);

  // TODO: Move this right after structurizeCFG to avoid extra divergence
  // analysis. This depends on stopping SIAnnotateControlFlow from making
  // control flow modifications.
  addFunctionPass(AMDGPURewriteUndefForPHIPass(), PMW);

  // SDAG requires LCSSA, GlobalISel does not. Disable LCSSA for -global-isel
  // with -new-reg-bank-select and without any of the fallback options.
  if (!getCGPassBuilderOption().EnableGlobalISelOption ||
      !isGlobalISelAbortEnabled() || !NewRegBankSelect)
    addFunctionPass(LCSSAPass(), PMW);

  if (TM.getOptLevel() > CodeGenOptLevel::Less) {
    flushFPMsToMPM(PMW);
    addModulePass(AMDGPUPerfHintAnalysisPass(TM), PMW);
  }

  // FIXME: Why isn't this queried as required from AMDGPUISelDAGToDAG, and why
  // isn't this in addInstSelector?
  addFunctionPass(RequireAnalysisPass<UniformityInfoAnalysis, Function>(), PMW,
                  /*Force=*/true);
}

void AMDGPUCodeGenPassBuilder::addILPOpts(PassManagerWrapper &PMW) const {
  if (EnableEarlyIfConversion)
    addMachineFunctionPass(EarlyIfConverterPass(), PMW);

  Base::addILPOpts(PMW);
}

void AMDGPUCodeGenPassBuilder::addAsmPrinterBegin(
    PassManagerWrapper &PMW, CreateMCStreamer CreateStreamer) const {
  // TODO: Add AsmPrinterBegin.
}

void AMDGPUCodeGenPassBuilder::addAsmPrinter(
    PassManagerWrapper &PMW, CreateMCStreamer CreateStreamer) const {
  // TODO: Add AsmPrinter.
}

void AMDGPUCodeGenPassBuilder::addAsmPrinterEnd(
    PassManagerWrapper &PMW, CreateMCStreamer CreateStreamer) const {
  // TODO: Add AsmPrinterEnd.
}

Error AMDGPUCodeGenPassBuilder::addInstSelector(PassManagerWrapper &PMW) const {
  addMachineFunctionPass(AMDGPUISelDAGToDAGPass(TM), PMW);
  addMachineFunctionPass(SIFixSGPRCopiesPass(), PMW);
  addMachineFunctionPass(SILowerI1CopiesPass(), PMW);
  return Error::success();
}

void AMDGPUCodeGenPassBuilder::addPreRewrite(PassManagerWrapper &PMW) const {
  if (EnableRegReassign) {
    addMachineFunctionPass(GCNNSAReassignPass(), PMW);
  }

  addMachineFunctionPass(AMDGPURewriteAGPRCopyMFMAPass(), PMW);
}

void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization(
    PassManagerWrapper &PMW) const {
  Base::addMachineSSAOptimization(PMW);

  addMachineFunctionPass(SIFoldOperandsPass(), PMW);
  if (EnableDPPCombine) {
    addMachineFunctionPass(GCNDPPCombinePass(), PMW);
  }
  addMachineFunctionPass(SILoadStoreOptimizerPass(), PMW);
  if (isPassEnabled(EnableSDWAPeephole)) {
    addMachineFunctionPass(SIPeepholeSDWAPass(), PMW);
    addMachineFunctionPass(EarlyMachineLICMPass(), PMW);
    addMachineFunctionPass(MachineCSEPass(), PMW);
    addMachineFunctionPass(SIFoldOperandsPass(), PMW);
  }
  addMachineFunctionPass(DeadMachineInstructionElimPass(), PMW);
  addMachineFunctionPass(SIShrinkInstructionsPass(), PMW);
}

Error AMDGPUCodeGenPassBuilder::addFastRegAlloc(PassManagerWrapper &PMW) const {
  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass<PHIEliminationPass>(SILowerControlFlowPass());

  insertPass<TwoAddressInstructionPass>(SIWholeQuadModePass());

  return Base::addFastRegAlloc(PMW);
}

Error AMDGPUCodeGenPassBuilder::addRegAssignmentFast(
    PassManagerWrapper &PMW) const {
  if (auto Err = validateRegAllocOptions())
    return Err;

  addMachineFunctionPass(GCNPreRALongBranchRegPass(), PMW);

  // SGPR allocation - default to fast at -O0.
  if (SGPRRegAllocNPM == RegAllocType::Greedy)
    addMachineFunctionPass(RAGreedyPass({onlyAllocateSGPRs, "sgpr"}), PMW);
  else
    addMachineFunctionPass(RegAllocFastPass({onlyAllocateSGPRs, "sgpr", false}),
                           PMW);

  // Equivalent of PEI for SGPRs.
  addMachineFunctionPass(SILowerSGPRSpillsPass(), PMW);

  // To allocate WWM registers used in whole quad mode operations (for
  // shaders).
  addMachineFunctionPass(SIPreAllocateWWMRegsPass(), PMW);

  // WWM allocation - default to fast at -O0.
  if (WWMRegAllocNPM == RegAllocType::Greedy)
    addMachineFunctionPass(RAGreedyPass({onlyAllocateWWMRegs, "wwm"}), PMW);
  else
    addMachineFunctionPass(
        RegAllocFastPass({onlyAllocateWWMRegs, "wwm", false}), PMW);

  addMachineFunctionPass(SILowerWWMCopiesPass(), PMW);
  addMachineFunctionPass(AMDGPUReserveWWMRegsPass(), PMW);

  // VGPR allocation - default to fast at -O0.
  if (VGPRRegAllocNPM == RegAllocType::Greedy)
    addMachineFunctionPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"}), PMW);
  else
    addMachineFunctionPass(RegAllocFastPass({onlyAllocateVGPRs, "vgpr"}), PMW);

  return Error::success();
}

Error AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
    PassManagerWrapper &PMW) const {
  if (EnableDCEInRA)
    insertPass<DetectDeadLanesPass>(DeadMachineInstructionElimPass());

  // FIXME: When an instruction has a killed operand and the instruction is
  // inside a bundle, it seems only the BUNDLE instruction appears as the kill
  // of the register in LiveVariables. This would trigger a failure in the
  // verifier; we should fix it and enable the verifier.
  if (OptVGPRLiveRange)
    insertPass<RequireAnalysisPass<LiveVariablesAnalysis, MachineFunction>>(
        SIOptimizeVGPRLiveRangePass());

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass<PHIEliminationPass>(SILowerControlFlowPass());

  if (EnableRewritePartialRegUses)
    insertPass<RenameIndependentSubregsPass>(GCNRewritePartialRegUsesPass());

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass<MachineSchedulerPass>(GCNPreRAOptimizationsPass());

  // Allow the scheduler to run before SIWholeQuadMode inserts exec
  // manipulation instructions that cause scheduling barriers.
  insertPass<MachineSchedulerPass>(SIWholeQuadModePass());

  if (OptExecMaskPreRA)
    insertPass<MachineSchedulerPass>(SIOptimizeExecMaskingPreRAPass());

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM.getOptLevel() > CodeGenOptLevel::Less)
    insertPass<MachineSchedulerPass>(SIFormMemoryClausesPass());

  return Base::addOptimizedRegAlloc(PMW);
}

void AMDGPUCodeGenPassBuilder::addPreRegAlloc(PassManagerWrapper &PMW) const {
  if (getOptLevel() != CodeGenOptLevel::None)
    addMachineFunctionPass(AMDGPUPrepareAGPRAllocPass(), PMW);
}

Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
    PassManagerWrapper &PMW) const {
  if (auto Err = validateRegAllocOptions())
    return Err;

  addMachineFunctionPass(GCNPreRALongBranchRegPass(), PMW);

  // SGPR allocation - default to greedy at -O1 and above.
  if (SGPRRegAllocNPM == RegAllocType::Fast)
    addMachineFunctionPass(RegAllocFastPass({onlyAllocateSGPRs, "sgpr", false}),
                           PMW);
  else
    addMachineFunctionPass(RAGreedyPass({onlyAllocateSGPRs, "sgpr"}), PMW);

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addMachineFunctionPass(VirtRegRewriterPass(false), PMW);

  // At this point, the sgpr-regalloc has been done and it is good to have the
  // stack slot coloring to try to optimize the SGPR spill stack indices before
  // attempting the custom SGPR spill lowering.
  addMachineFunctionPass(StackSlotColoringPass(), PMW);

  // Equivalent of PEI for SGPRs.
  addMachineFunctionPass(SILowerSGPRSpillsPass(), PMW);

  // To allocate WWM registers used in whole quad mode operations (for
  // shaders).
  addMachineFunctionPass(SIPreAllocateWWMRegsPass(), PMW);

  // WWM allocation - default to greedy at -O1 and above.
  if (WWMRegAllocNPM == RegAllocType::Fast)
    addMachineFunctionPass(
        RegAllocFastPass({onlyAllocateWWMRegs, "wwm", false}), PMW);
  else
    addMachineFunctionPass(RAGreedyPass({onlyAllocateWWMRegs, "wwm"}), PMW);
  addMachineFunctionPass(SILowerWWMCopiesPass(), PMW);
  addMachineFunctionPass(VirtRegRewriterPass(false), PMW);
  addMachineFunctionPass(AMDGPUReserveWWMRegsPass(), PMW);

  // VGPR allocation - default to greedy at -O1 and above.
  if (VGPRRegAllocNPM == RegAllocType::Fast)
    addMachineFunctionPass(RegAllocFastPass({onlyAllocateVGPRs, "vgpr"}), PMW);
  else
    addMachineFunctionPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"}), PMW);

  addPreRewrite(PMW);
  addMachineFunctionPass(VirtRegRewriterPass(true), PMW);

  addMachineFunctionPass(AMDGPUMarkLastScratchLoadPass(), PMW);
  return Error::success();
}

void AMDGPUCodeGenPassBuilder::addPostRegAlloc(PassManagerWrapper &PMW) const {
  addMachineFunctionPass(SIFixVGPRCopiesPass(), PMW);
  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addMachineFunctionPass(SIOptimizeExecMaskingPass(), PMW);
  Base::addPostRegAlloc(PMW);
}

void AMDGPUCodeGenPassBuilder::addPreSched2(PassManagerWrapper &PMW) const {
  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addMachineFunctionPass(SIShrinkInstructionsPass(), PMW);
  addMachineFunctionPass(SIPostRABundlerPass(), PMW);
}

void AMDGPUCodeGenPassBuilder::addPostBBSections(
    PassManagerWrapper &PMW) const {
  // We run this later to avoid passes like livedebugvalues and BBSections
  // having to deal with the apparent multi-entry functions we may generate.
  addMachineFunctionPass(AMDGPUPreloadKernArgPrologPass(), PMW);
}

void AMDGPUCodeGenPassBuilder::addPreEmitPass(PassManagerWrapper &PMW) const {
  if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) {
    addMachineFunctionPass(GCNCreateVOPDPass(), PMW);
  }

  addMachineFunctionPass(SIMemoryLegalizerPass(), PMW);
  addMachineFunctionPass(SIInsertWaitcntsPass(), PMW);

  addMachineFunctionPass(SIModeRegisterPass(), PMW);

  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addMachineFunctionPass(SIInsertHardClausesPass(), PMW);

  addMachineFunctionPass(SILateBranchLoweringPass(), PMW);

  if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
    addMachineFunctionPass(AMDGPUSetWavePriorityPass(), PMW);

  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addMachineFunctionPass(SIPreEmitPeepholePass(), PMW);

  // The hazard recognizer that runs as part of the post-RA scheduler is not
  // guaranteed to handle all hazards correctly. This is because, if there are
  // multiple scheduling regions in a basic block, the regions are scheduled
  // bottom up, so when we begin to schedule a region we don't know what
  // instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addMachineFunctionPass(PostRAHazardRecognizerPass(), PMW);
  addMachineFunctionPass(AMDGPUWaitSGPRHazardsPass(), PMW);
  addMachineFunctionPass(AMDGPULowerVGPREncodingPass(), PMW);

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) {
    addMachineFunctionPass(AMDGPUInsertDelayAluPass(), PMW);
  }

  addMachineFunctionPass(BranchRelaxationPass(), PMW);
}

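// An explicit occurrence of the option on the command line always wins;
// otherwise the pass is enabled only if the codegen optimization level is at
// least Level and the option's default value is true.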
bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt,
                                             CodeGenOptLevel Level) const {
  if (Opt.getNumOccurrences())
    return Opt;
  if (TM.getOptLevel() < Level)
    return false;
  return Opt;
}

void AMDGPUCodeGenPassBuilder::addEarlyCSEOrGVNPass(
    PassManagerWrapper &PMW) const {
  if (TM.getOptLevel() == CodeGenOptLevel::Aggressive)
    addFunctionPass(GVNPass(), PMW);
  else
    addFunctionPass(EarlyCSEPass(), PMW);
}

void AMDGPUCodeGenPassBuilder::addStraightLineScalarOptimizationPasses(
    PassManagerWrapper &PMW) const {
  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
    addFunctionPass(LoopDataPrefetchPass(), PMW);

  addFunctionPass(SeparateConstOffsetFromGEPPass(), PMW);

  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addFunctionPass(StraightLineStrengthReducePass(), PMW);

  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass(PMW);

  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addFunctionPass(NaryReassociatePass(), PMW);

  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addFunctionPass(EarlyCSEPass(), PMW);
}