//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains both the AMDGPU target machine and the CodeGen pass
/// builder. The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for SI+ GPUs in the legacy pass manager
/// pipeline. The CodeGen pass builder handles the pass pipeline for the new
/// pass manager.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUBarrierLatency.h"
#include "AMDGPUCtorDtorLowering.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUExportKernelRuntimeHandles.h"
#include "AMDGPUHazardLatency.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPULowerVGPREncoding.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPUPreloadKernArgProlog.h"
#include "AMDGPUPrepareAGPRAlloc.h"
#include "AMDGPURemoveIncompatibleFunctions.h"
#include "AMDGPUReserveWWMRegs.h"
#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUUnifyDivergentExitNodes.h"
#include "AMDGPUWaitSGPRHazards.h"
#include "GCNDPPCombine.h"
#include "GCNIterativeScheduler.h"
#include "GCNNSAReassign.h"
#include "GCNPreRALongBranchReg.h"
#include "GCNPreRAOptimizations.h"
#include "GCNRewritePartialRegUses.h"
#include "GCNSchedStrategy.h"
#include "GCNVOPDUtils.h"
#include "R600.h"
#include "R600TargetMachine.h"
#include "SIFixSGPRCopies.h"
#include "SIFixVGPRCopies.h"
#include "SIFoldOperands.h"
#include "SIFormMemoryClauses.h"
#include "SILoadStoreOptimizer.h"
#include "SILowerControlFlow.h"
#include "SILowerSGPRSpills.h"
#include "SILowerWWMCopies.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "SIOptimizeExecMasking.h"
#include "SIOptimizeExecMaskingPreRA.h"
#include "SIOptimizeVGPRLiveRange.h"
#include "SIPeepholeSDWA.h"
#include "SIPostRABundler.h"
#include "SIPreAllocateWWMRegs.h"
#include "SIShrinkInstructions.h"
#include "SIWholeQuadMode.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/KernelInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/AtomicExpand.h"
#include "llvm/CodeGen/BranchRelaxation.h"
#include "llvm/CodeGen/DeadMachineInstructionElim.h"
#include "llvm/CodeGen/EarlyIfConversion.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MachineCSE.h"
#include "llvm/CodeGen/MachineLICM.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/PostRAHazardRecognizer.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/CodeGenPassBuilder.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Transforms/HipStdPar/HipStdPar.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/ExpandVariadics.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/EarlyCSE.h"
#include "llvm/Transforms/Scalar/FlattenCFG.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Scalar/LICM.h"
#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Scalar/NaryReassociate.h"
#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
#include "llvm/Transforms/Scalar/Sink.h"
#include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h"
#include "llvm/Transforms/Scalar/StructurizeCFG.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/FixIrreducible.h"
#include "llvm/Transforms/Utils/LCSSA.h"
#include "llvm/Transforms/Utils/LowerSwitch.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include "llvm/Transforms/Utils/UnifyLoopExits.h"
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

namespace {
//===----------------------------------------------------------------------===//
// AMDGPU CodeGen Pass Builder interface.
//===----------------------------------------------------------------------===//

class AMDGPUCodeGenPassBuilder
    : public CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine> {
  using Base = CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine>;

public:
  AMDGPUCodeGenPassBuilder(GCNTargetMachine &TM,
                           const CGPassBuilderOption &Opts,
                           PassInstrumentationCallbacks *PIC);

  void addIRPasses(PassManagerWrapper &PMW) const;
  void addCodeGenPrepare(PassManagerWrapper &PMW) const;
  void addPreISel(PassManagerWrapper &PMW) const;
  void addILPOpts(PassManagerWrapper &PMW) const;
  void addAsmPrinterBegin(PassManagerWrapper &PMW, CreateMCStreamer) const;
  void addAsmPrinter(PassManagerWrapper &PMW, CreateMCStreamer) const;
  void addAsmPrinterEnd(PassManagerWrapper &PMW, CreateMCStreamer) const;
  Error addInstSelector(PassManagerWrapper &PMW) const;
  void addPreRewrite(PassManagerWrapper &PMW) const;
  void addMachineSSAOptimization(PassManagerWrapper &PMW) const;
  void addPostRegAlloc(PassManagerWrapper &PMW) const;
  void addPreEmitPass(PassManagerWrapper &PMW) const;
  void addPreEmitRegAlloc(PassManagerWrapper &PMW) const;
  Error addRegAssignmentFast(PassManagerWrapper &PMW) const;
  Error addRegAssignmentOptimized(PassManagerWrapper &PMW) const;
  void addPreRegAlloc(PassManagerWrapper &PMW) const;
  Error addFastRegAlloc(PassManagerWrapper &PMW) const;
  Error addOptimizedRegAlloc(PassManagerWrapper &PMW) const;
  void addPreSched2(PassManagerWrapper &PMW) const;
  void addPostBBSections(PassManagerWrapper &PMW) const;

private:
  Error validateRegAllocOptions() const;

public:
  /// Check if a pass is enabled given \p Opt option. If the option is used
  /// explicitly, it always overrides the default. Otherwise the pass is
  /// enabled whenever the optimization level is at least \p Level.
  bool isPassEnabled(const cl::opt<bool> &Opt,
                     CodeGenOptLevel Level = CodeGenOptLevel::Default) const;
  void addEarlyCSEOrGVNPass(PassManagerWrapper &PMW) const;
  void addStraightLineScalarOptimizationPasses(PassManagerWrapper &PMW) const;
};

class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
public:
  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
public:
  WWMRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

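// Register-class filter predicates for the split register allocation scheme.
// AMDGPU allocates SGPRs, WWM registers, and VGPRs in separate regalloc runs,
// and each run only considers the virtual registers its predicate accepts.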
static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
                              const MachineRegisterInfo &MRI,
                              const Register Reg) {
  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
}

static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
                              const MachineRegisterInfo &MRI,
                              const Register Reg) {
  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
}

static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI,
                                const MachineRegisterInfo &MRI,
                                const Register Reg) {
  const SIMachineFunctionInfo *MFI =
      MRI.getMF().getInfo<SIMachineFunctionInfo>();
  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) &&
         MFI->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
}

/// -{sgpr|wwm|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;

static SGPRRegisterRegAlloc
    defaultSGPRRegAlloc("default",
                        "pick SGPR register allocator based on -O option",
                        useDefaultRegisterAllocator);

static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<SGPRRegisterRegAlloc>>
    SGPRRegAlloc("sgpr-regalloc", cl::Hidden,
                 cl::init(&useDefaultRegisterAllocator),
                 cl::desc("Register allocator to use for SGPRs"));

static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<VGPRRegisterRegAlloc>>
    VGPRRegAlloc("vgpr-regalloc", cl::Hidden,
                 cl::init(&useDefaultRegisterAllocator),
                 cl::desc("Register allocator to use for VGPRs"));

static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<WWMRegisterRegAlloc>>
    WWMRegAlloc("wwm-regalloc", cl::Hidden,
                cl::init(&useDefaultRegisterAllocator),
                cl::desc("Register allocator to use for WWM registers"));
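// For example, an allocator can be chosen per register class on the legacy
// pass manager command line (hypothetical llc invocation; the allocator names
// are the ones registered below):
//   llc -mtriple=amdgcn -sgpr-regalloc=greedy -vgpr-regalloc=fast ...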

// New pass manager register allocator options for AMDGPU
static cl::opt<RegAllocType, false, RegAllocTypeParser> SGPRRegAllocNPM(
    "sgpr-regalloc-npm", cl::Hidden, cl::init(RegAllocType::Default),
    cl::desc("Register allocator for SGPRs (new pass manager)"));

static cl::opt<RegAllocType, false, RegAllocTypeParser> VGPRRegAllocNPM(
    "vgpr-regalloc-npm", cl::Hidden, cl::init(RegAllocType::Default),
    cl::desc("Register allocator for VGPRs (new pass manager)"));

static cl::opt<RegAllocType, false, RegAllocTypeParser> WWMRegAllocNPM(
    "wwm-regalloc-npm", cl::Hidden, cl::init(RegAllocType::Default),
    cl::desc("Register allocator for WWM registers (new pass manager)"));
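// The NPM variants are selected analogously, e.g. (hypothetical invocation,
// assuming a new-pass-manager-enabled llc):
//   llc -mtriple=amdgcn -sgpr-regalloc-npm=greedy -vgpr-regalloc-npm=fast ...
// Only the fast and greedy allocators are accepted; see
// validateRegAllocOptions() below.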

/// Check if the given RegAllocType is supported for AMDGPU NPM register
/// allocation. Only Fast and Greedy are supported; Basic and PBQP are not.
static Error checkRegAllocSupported(RegAllocType RAType, StringRef RegName) {
  if (RAType == RegAllocType::Basic || RAType == RegAllocType::PBQP) {
    return make_error<StringError>(
        Twine("unsupported register allocator '") +
            (RAType == RegAllocType::Basic ? "basic" : "pbqp") + "' for " +
            RegName + " registers",
        inconvertibleErrorCode());
  }
  return Error::success();
}

Error AMDGPUCodeGenPassBuilder::validateRegAllocOptions() const {
  // 1. Generic --regalloc-npm is not supported for AMDGPU.
  if (Opt.RegAlloc != RegAllocType::Unset) {
    return make_error<StringError>(
        "-regalloc-npm not supported for amdgcn. Use -sgpr-regalloc-npm, "
        "-vgpr-regalloc-npm, and -wwm-regalloc-npm",
        inconvertibleErrorCode());
  }

  // 2. Legacy PM regalloc options are not compatible with NPM.
  if (SGPRRegAlloc.getNumOccurrences() > 0 ||
      VGPRRegAlloc.getNumOccurrences() > 0 ||
      WWMRegAlloc.getNumOccurrences() > 0) {
    return make_error<StringError>(
        "-sgpr-regalloc, -vgpr-regalloc, and -wwm-regalloc are legacy PM "
        "options. Use -sgpr-regalloc-npm, -vgpr-regalloc-npm, and "
        "-wwm-regalloc-npm with the new pass manager",
        inconvertibleErrorCode());
  }

  // 3. Only Fast and Greedy allocators are supported for AMDGPU.
  if (auto Err = checkRegAllocSupported(SGPRRegAllocNPM, "SGPR"))
    return Err;
  if (auto Err = checkRegAllocSupported(WWMRegAllocNPM, "WWM"))
    return Err;
  if (auto Err = checkRegAllocSupported(VGPRRegAllocNPM, "VGPR"))
    return Err;

  return Error::success();
}

static void initializeDefaultSGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = SGPRRegAlloc;
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
  }
}

static void initializeDefaultVGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = VGPRRegAlloc;
    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
  }
}

static void initializeDefaultWWMRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = WWMRegAlloc;
    WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
  }
}

static FunctionPass *createBasicSGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createGreedySGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
}

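// Note on ClearVirtRegs: the SGPR and WWM allocators pass false because
// virtual registers must survive until the final VGPR run, which passes true
// and finishes rewriting all remaining virtual registers.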
static FunctionPass *createFastSGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateSGPRs,
                                     /*ClearVirtRegs=*/false);
}

static FunctionPass *createBasicVGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createGreedyVGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createFastVGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateVGPRs,
                                     /*ClearVirtRegs=*/true);
}

static FunctionPass *createBasicWWMRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateWWMRegs);
}

static FunctionPass *createGreedyWWMRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateWWMRegs);
}

static FunctionPass *createFastWWMRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateWWMRegs,
                                     /*ClearVirtRegs=*/false);
}

static SGPRRegisterRegAlloc basicRegAllocSGPR(
    "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
    "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);

static SGPRRegisterRegAlloc fastRegAllocSGPR(
    "fast", "fast register allocator", createFastSGPRRegisterAllocator);

static VGPRRegisterRegAlloc basicRegAllocVGPR(
    "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
static VGPRRegisterRegAlloc greedyRegAllocVGPR(
    "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);

static VGPRRegisterRegAlloc fastRegAllocVGPR(
    "fast", "fast register allocator", createFastVGPRRegisterAllocator);

static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
                                               "basic register allocator",
                                               createBasicWWMRegisterAllocator);
static WWMRegisterRegAlloc
    greedyRegAllocWWMReg("greedy", "greedy register allocator",
                         createGreedyWWMRegisterAllocator);
static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator",
                                              createFastWWMRegisterAllocator);

static bool isLTOPreLink(ThinOrFullLTOPhase Phase) {
  return Phase == ThinOrFullLTOPhase::FullLTOPreLink ||
         Phase == ThinOrFullLTOPhase::ThinLTOPreLink;
}
} // anonymous namespace

static cl::opt<bool>
    EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                            cl::desc("Run early if-conversion"),
                            cl::init(false));

static cl::opt<bool>
    OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                     cl::desc("Run pre-RA exec mask optimizations"),
                     cl::init(true));

static cl::opt<bool>
    LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
                  cl::desc("Lower GPU ctor / dtors to globals on the device."),
                  cl::init(true), cl::Hidden);

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
    "amdgpu-load-store-vectorizer",
    cl::desc("Enable load store vectorizer"),
    cl::init(true),
    cl::Hidden);

// Option to control global loads scalarization.
static cl::opt<bool> ScalarizeGlobal(
    "amdgpu-scalarize-global-loads",
    cl::desc("Enable global load scalarization"),
    cl::init(true),
    cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
    "amdgpu-internalize-symbols",
    cl::desc("Enable elimination of non-kernel functions and unused globals"),
    cl::init(false),
    cl::Hidden);

// Option to inline all functions early.
static cl::opt<bool> EarlyInlineAll(
    "amdgpu-early-inline-all",
    cl::desc("Inline all functions early"),
    cl::init(false),
    cl::Hidden);

static cl::opt<bool> RemoveIncompatibleFunctions(
    "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
    cl::desc("Enable removal of functions when they "
             "use features not supported by the target GPU"),
    cl::init(true));

static cl::opt<bool> EnableSDWAPeephole(
    "amdgpu-sdwa-peephole",
    cl::desc("Enable SDWA peepholer"),
    cl::init(true));

static cl::opt<bool> EnableDPPCombine(
    "amdgpu-dpp-combine",
    cl::desc("Enable DPP combiner"),
    cl::init(true));

// Enable address space based alias analysis.
static cl::opt<bool>
    EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
                              cl::desc("Enable AMDGPU Alias Analysis"),
                              cl::init(true));

// Enable library call simplifications.
static cl::opt<bool> EnableLibCallSimplify(
    "amdgpu-simplify-libcall",
    cl::desc("Enable amdgpu library simplifications"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
    "amdgpu-ir-lower-kernel-arguments",
    cl::desc("Lower kernel argument loads in IR pass"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableRegReassign(
    "amdgpu-reassign-regs",
    cl::desc("Enable register reassign optimizations on gfx10+"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> OptVGPRLiveRange(
    "amdgpu-opt-vgpr-liverange",
    cl::desc("Enable VGPR liverange optimizations for if-else structure"),
    cl::init(true), cl::Hidden);

static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
    "amdgpu-atomic-optimizer-strategy",
    cl::desc("Select DPP or Iterative strategy for scan"),
    cl::init(ScanOptions::Iterative),
    cl::values(
        clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
        clEnumValN(ScanOptions::Iterative, "Iterative",
                   "Use Iterative approach for scan"),
        clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));

// Enable Mode register optimization.
static cl::opt<bool> EnableSIModeRegisterPass(
    "amdgpu-mode-register",
    cl::desc("Enable mode register pass"),
    cl::init(true),
    cl::Hidden);

// Enable GFX11+ s_delay_alu insertion.
static cl::opt<bool>
    EnableInsertDelayAlu("amdgpu-enable-delay-alu",
                         cl::desc("Enable s_delay_alu insertion"),
                         cl::init(true), cl::Hidden);

// Enable GFX11+ VOPD.
static cl::opt<bool>
    EnableVOPD("amdgpu-enable-vopd",
               cl::desc("Enable VOPD, dual issue of VALU in wave32"),
               cl::init(true), cl::Hidden);

// This option lets lit tests disable machine DCE inside regalloc so the
// patterns being inspected are not dead-code eliminated.
static cl::opt<bool>
    EnableDCEInRA("amdgpu-dce-in-ra",
                  cl::init(true), cl::Hidden,
                  cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
                                           cl::desc("Adjust wave priority"),
                                           cl::init(false), cl::Hidden);

static cl::opt<bool> EnableScalarIRPasses(
    "amdgpu-scalar-ir-passes",
    cl::desc("Enable scalar IR passes"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableLowerExecSync(
    "amdgpu-enable-lower-exec-sync",
    cl::desc("Enable lowering of execution synchronization."), cl::init(true),
    cl::Hidden);

static cl::opt<bool>
    EnableSwLowerLDS("amdgpu-enable-sw-lower-lds",
                     cl::desc("Enable the pass that lowers LDS to global "
                              "memory and instruments the resulting IR for "
                              "ASan."),
                     cl::init(true), cl::Hidden);

static cl::opt<bool, true> EnableLowerModuleLDS(
    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePreRAOptimizations(
    "amdgpu-enable-pre-ra-optimizations",
    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePromoteKernelArguments(
    "amdgpu-enable-promote-kernel-arguments",
    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
    cl::Hidden, cl::init(true));

static cl::opt<bool> EnableImageIntrinsicOptimizer(
    "amdgpu-enable-image-intrinsic-optimizer",
    cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool>
    EnableLoopPrefetch("amdgpu-loop-prefetch",
                       cl::desc("Enable loop data prefetch on AMDGPU"),
                       cl::Hidden, cl::init(false));

static cl::opt<std::string>
    AMDGPUSchedStrategy("amdgpu-sched-strategy",
                        cl::desc("Select custom AMDGPU scheduling strategy."),
                        cl::Hidden, cl::init(""));
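// The recognized strategy names are the ones matched in
// GCNTargetMachine::createMachineScheduler below: "max-ilp",
// "max-memory-clause", "iterative-ilp", "iterative-minreg", and
// "iterative-maxocc"; anything else falls back to the default max-occupancy
// scheduler.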

static cl::opt<bool> EnableRewritePartialRegUses(
    "amdgpu-enable-rewrite-partial-reg-uses",
    cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableHipStdPar(
    "amdgpu-enable-hipstdpar",
    cl::desc("Enable HIP Standard Parallelism Offload support"),
    cl::init(false), cl::Hidden);

static cl::opt<bool>
    EnableAMDGPUAttributor("amdgpu-attributor-enable",
                           cl::desc("Enable AMDGPUAttributorPass"),
                           cl::init(true), cl::Hidden);

static cl::opt<bool> NewRegBankSelect(
    "new-reg-bank-select",
    cl::desc("Run amdgpu-regbankselect and amdgpu-regbanklegalize instead of "
             "regbankselect"),
    cl::init(false), cl::Hidden);

static cl::opt<bool> HasClosedWorldAssumption(
    "amdgpu-link-time-closed-world",
    cl::desc("Whether the closed-world assumption holds at link time"),
    cl::init(false), cl::Hidden);

static cl::opt<bool> EnableUniformIntrinsicCombine(
    "amdgpu-enable-uniform-intrinsic-combine",
    cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
    cl::init(true), cl::Hidden);

extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target.
  RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeR600EmitClauseMarkersPass(*PR);
  initializeR600MachineCFGStructurizerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUAsmPrinterPass(*PR);
  initializeAMDGPUDAGToDAGISelLegacyPass(*PR);
  initializeAMDGPUPrepareAGPRAllocLegacyPass(*PR);
  initializeGCNDPPCombineLegacyPass(*PR);
  initializeSILowerI1CopiesLegacyPass(*PR);
  initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
  initializeAMDGPURegBankSelectPass(*PR);
  initializeAMDGPURegBankLegalizePass(*PR);
  initializeSILowerWWMCopiesLegacyPass(*PR);
  initializeAMDGPUMarkLastScratchLoadLegacyPass(*PR);
  initializeSILowerSGPRSpillsLegacyPass(*PR);
  initializeSIFixSGPRCopiesLegacyPass(*PR);
  initializeSIFixVGPRCopiesLegacyPass(*PR);
  initializeSIFoldOperandsLegacyPass(*PR);
  initializeSIPeepholeSDWALegacyPass(*PR);
  initializeSIShrinkInstructionsLegacyPass(*PR);
  initializeSIOptimizeExecMaskingPreRALegacyPass(*PR);
  initializeSIOptimizeVGPRLiveRangeLegacyPass(*PR);
  initializeSILoadStoreOptimizerLegacyPass(*PR);
  initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPULowerExecSyncLegacyPass(*PR);
  initializeAMDGPUSwLowerLDSLegacyPass(*PR);
  initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPUPromoteKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR);
  initializeAMDGPUPostLegalizerCombinerPass(*PR);
  initializeAMDGPUPreLegalizerCombinerPass(*PR);
  initializeAMDGPURegBankCombinerPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPULateCodeGenPrepareLegacyPass(*PR);
  initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);
  initializeAMDGPULowerModuleLDSLegacyPass(*PR);
  initializeAMDGPULowerBufferFatPointersPass(*PR);
  initializeAMDGPULowerIntrinsicsLegacyPass(*PR);
  initializeAMDGPUReserveWWMRegsLegacyPass(*PR);
  initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
  initializeSIAnnotateControlFlowLegacyPass(*PR);
  initializeAMDGPUInsertDelayAluLegacyPass(*PR);
  initializeAMDGPULowerVGPREncodingLegacyPass(*PR);
  initializeSIInsertHardClausesLegacyPass(*PR);
  initializeSIInsertWaitcntsLegacyPass(*PR);
  initializeSIModeRegisterLegacyPass(*PR);
  initializeSIWholeQuadModeLegacyPass(*PR);
  initializeSILowerControlFlowLegacyPass(*PR);
  initializeSIPreEmitPeepholeLegacyPass(*PR);
  initializeSILateBranchLoweringLegacyPass(*PR);
  initializeSIMemoryLegalizerLegacyPass(*PR);
  initializeSIOptimizeExecMaskingLegacyPass(*PR);
  initializeSIPreAllocateWWMRegsLegacyPass(*PR);
  initializeSIFormMemoryClausesLegacyPass(*PR);
  initializeSIPostRABundlerLegacyPass(*PR);
  initializeGCNCreateVOPDLegacyPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUExternalAAWrapperPass(*PR);
  initializeAMDGPUImageIntrinsicOptimizerPass(*PR);
  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
  initializeAMDGPUResourceUsageAnalysisWrapperPassPass(*PR);
  initializeGCNNSAReassignLegacyPass(*PR);
  initializeGCNPreRAOptimizationsLegacyPass(*PR);
  initializeGCNPreRALongBranchRegLegacyPass(*PR);
  initializeGCNRewritePartialRegUsesLegacyPass(*PR);
  initializeGCNRegPressurePrinterPass(*PR);
  initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
  initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
  initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR);
  initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
      C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
  DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF));
  return DAG;
}

static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
      new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  return DAG;
}

static ScheduleDAGInstrs *
createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
      C, std::make_unique<GCNMaxMemoryClauseSchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
  DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF));
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto *DAG = new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  auto *DAG = new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto *DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  return DAG;
}

static MachineSchedRegistry
    SISchedRegistry("si", "Run SI's custom scheduler",
                    createSIMachineScheduler);

static MachineSchedRegistry
    GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                                 "Run GCN scheduler to maximize occupancy",
                                 createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
    GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
                           createGCNMaxILPMachineScheduler);

static MachineSchedRegistry GCNMaxMemoryClauseSchedRegistry(
    "gcn-max-memory-clause", "Run GCN scheduler to maximize memory clause",
    createGCNMaxMemoryClauseMachineScheduler);

static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
    "gcn-iterative-max-occupancy-experimental",
    "Run GCN scheduler to maximize occupancy (experimental)",
    createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry GCNMinRegSchedRegistry(
    "gcn-iterative-minreg",
    "Run GCN iterative scheduler for minimal register usage (experimental)",
    createMinRegScheduler);

static MachineSchedRegistry GCNILPSchedRegistry(
    "gcn-iterative-ilp",
    "Run GCN iterative scheduler for ILP scheduling (experimental)",
    createIterativeILPMachineScheduler);

LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.isAMDGCN())
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel() {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         const TargetOptions &Options,
                                         std::optional<Reloc::Model> RM,
                                         std::optional<CodeModel::Model> CM,
                                         CodeGenOptLevel OptLevel)
    : CodeGenTargetMachineImpl(
          T, TT.computeDataLayout(), TT, getGPUOrDefault(TT, CPU), FS, Options,
          getEffectiveRelocModel(), getEffectiveCodeModel(CM, CodeModel::Small),
          OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
  if (TT.isAMDGCN()) {
    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
  }
}

bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.isValid() ? FSAttr.getValueAsString()
                          : getTargetFeatureString();
}

llvm::ScheduleDAGInstrs *
AMDGPUTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = createSchedLive(C);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || F->getName().starts_with("__asan_") ||
           F->getName().starts_with("__sanitizer_") ||
           AMDGPU::isEntryFunctionCC(F->getCallingConv());

  GV.removeDeadConstantUsers();
  return !GV.use_empty();
}

void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  if (EnableAMDGPUAliasAnalysis)
    AAM.registerFunctionAnalysis<AMDGPUAA>();
}

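// Parses the scan strategy parameter of the atomic optimizer from a new pass
// manager pipeline string, e.g. (hypothetical invocation; the pass name is
// assumed from the pass registry, the parameter syntax is what this parser
// consumes):
//   opt -passes='amdgpu-atomic-optimizer<strategy=dpp>' ...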
static Expected<ScanOptions>
parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
  if (Params.empty())
    return ScanOptions::Iterative;
  Params.consume_front("strategy=");
  auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
                    .Case("dpp", ScanOptions::DPP)
                    .Cases({"iterative", ""}, ScanOptions::Iterative)
                    .Case("none", ScanOptions::None)
                    .Default(std::nullopt);
  if (Result)
    return *Result;
  return make_error<StringError>("invalid parameter", inconvertibleErrorCode());
}

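// Parses AMDGPUAttributor pass options from a pipeline string. The only
// recognized parameter is "closed-world", e.g. (hypothetical invocation,
// pass name assumed from the pass registry):
//   opt -passes='amdgpu-attributor<closed-world>' ...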
Expected<AMDGPUAttributorOptions>
parseAMDGPUAttributorPassOptions(StringRef Params) {
  AMDGPUAttributorOptions Result;
  while (!Params.empty()) {
    StringRef ParamName;
    std::tie(ParamName, Params) = Params.split(';');
    if (ParamName == "closed-world") {
      Result.IsClosedWorld = true;
    } else {
      return make_error<StringError>(
          formatv("invalid AMDGPUAttributor pass parameter '{0}' ", ParamName)
              .str(),
          inconvertibleErrorCode());
    }
  }
  return Result;
}

void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {

#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"

  PB.registerPipelineParsingCallback(
      [this](StringRef Name, CGSCCPassManager &PM,
             ArrayRef<PassBuilder::PipelineElement> Pipeline) {
        if (Name == "amdgpu-attributor-cgscc" && getTargetTriple().isAMDGCN()) {
          PM.addPass(AMDGPUAttributorCGSCCPass(
              *static_cast<GCNTargetMachine *>(this)));
          return true;
        }
        return false;
      });

  PB.registerScalarOptimizerLateEPCallback(
      [](FunctionPassManager &FPM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FPM.addPass(InferAddressSpacesPass());
      });

  PB.registerVectorizerEndEPCallback(
      [](FunctionPassManager &FPM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FPM.addPass(InferAddressSpacesPass());
      });

  PB.registerPipelineEarlySimplificationEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level,
             ThinOrFullLTOPhase Phase) {
        if (!isLTOPreLink(Phase) && getTargetTriple().isAMDGCN()) {
          // When we are not using -fgpu-rdc, we can run accelerator code
          // selection relatively early, but still after linking to prevent
          // eager removal of potentially reachable symbols.
          if (EnableHipStdPar) {
            PM.addPass(HipStdParMathFixupPass());
            PM.addPass(HipStdParAcceleratorCodeSelectionPass());
          }

          PM.addPass(AMDGPUPrintfRuntimeBindingPass());
        }

        if (Level == OptimizationLevel::O0)
          return;

        // We don't want to run internalization at the per-module stage.
        if (InternalizeSymbols && !isLTOPreLink(Phase)) {
          PM.addPass(InternalizePass(mustPreserveGV));
          PM.addPass(GlobalDCEPass());
        }

        if (EarlyInlineAll && !EnableFunctionCalls)
          PM.addPass(AMDGPUAlwaysInlinePass());
      });

  PB.registerPeepholeEPCallback(
      [](FunctionPassManager &FPM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FPM.addPass(AMDGPUUseNativeCallsPass());
        if (EnableLibCallSimplify)
          FPM.addPass(AMDGPUSimplifyLibCallsPass());

        if (EnableUniformIntrinsicCombine)
          FPM.addPass(AMDGPUUniformIntrinsicCombinePass());
      });

  PB.registerCGSCCOptimizerLateEPCallback(
      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(AMDGPULowerKernelAttributesPass());

        if (Level != OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
      });

  // FIXME: Why is AMDGPUAttributor not in CGSCC?
  PB.registerOptimizerLastEPCallback([this](ModulePassManager &MPM,
                                            OptimizationLevel Level,
                                            ThinOrFullLTOPhase Phase) {
    if (Level != OptimizationLevel::O0) {
      if (!isLTOPreLink(Phase)) {
        if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) {
          AMDGPUAttributorOptions Opts;
          MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase));
        }
      }
    }
  });

  PB.registerFullLinkTimeOptimizationLastEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        // When we are using -fgpu-rdc, we can only run accelerator code
        // selection after linking; otherwise we would end up removing
        // potentially reachable symbols that were exported as external in
        // other modules.
        if (EnableHipStdPar) {
          PM.addPass(HipStdParMathFixupPass());
          PM.addPass(HipStdParAcceleratorCodeSelectionPass());
        }
        // We want to support the -lto-partitions=N option as "best effort".
        // For that, we need to lower LDS earlier in the pipeline before the
        // module is partitioned for codegen.
        if (EnableLowerExecSync)
          PM.addPass(AMDGPULowerExecSyncPass());
        if (EnableSwLowerLDS)
          PM.addPass(AMDGPUSwLowerLDSPass(*this));
        if (EnableLowerModuleLDS)
          PM.addPass(AMDGPULowerModuleLDSPass(*this));
        if (Level != OptimizationLevel::O0) {
          // We only want to run this with O2 or higher since inliner and SROA
          // don't run in O1.
          if (Level != OptimizationLevel::O1) {
            PM.addPass(
                createModuleToFunctionPassAdaptor(InferAddressSpacesPass()));
          }
          // Do we really need internalization in LTO?
          if (InternalizeSymbols) {
            PM.addPass(InternalizePass(mustPreserveGV));
            PM.addPass(GlobalDCEPass());
          }
          if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) {
            AMDGPUAttributorOptions Opt;
            if (HasClosedWorldAssumption)
              Opt.IsClosedWorld = true;
            PM.addPass(AMDGPUAttributorPass(
                *this, Opt, ThinOrFullLTOPhase::FullLTOPostLink));
          }
        }
        if (!NoKernelInfoEndLTO) {
          FunctionPassManager FPM;
          FPM.addPass(KernelInfoPrinter(this));
          PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
        }
      });

  PB.registerRegClassFilterParsingCallback(
      [](StringRef FilterName) -> RegAllocFilterFunc {
        if (FilterName == "sgpr")
          return onlyAllocateSGPRs;
        if (FilterName == "vgpr")
          return onlyAllocateVGPRs;
        if (FilterName == "wwm")
          return onlyAllocateWWMRegs;
        return nullptr;
      });
}

bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
         AMDGPU::isFlatGlobalAddrSpace(DestAS);
}

unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  if (auto *Arg = dyn_cast<Argument>(V);
      Arg &&
      AMDGPU::isModuleEntryFunctionCC(Arg->getParent()->getCallingConv()) &&
      !Arg->hasByRefAttr())
    return AMDGPUAS::GLOBAL_ADDRESS;

  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD) // TODO: Handle invariant load like constant.
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // It must be a load of a generic pointer.
  assert(V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // A generic pointer loaded from constant memory can be assumed to be a
  // global pointer, since constant memory is only populated on the host side.
  // As implied by the offload programming model, only global pointers can be
  // referenced on the host side.
  return AMDGPUAS::GLOBAL_ADDRESS;
}

std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
    case Intrinsic::amdgcn_is_private:
      return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
    default:
      break;
    }
    return std::pair(nullptr, -1);
  }
  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that logical 'and' is commutative
  // and the order of 'is_shared' and 'is_private' is not significant.
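  // For example, this matches IR of the form (value names are illustrative
  // only):
  //   %ns = xor i1 %is.shared.p, true
  //   %np = xor i1 %is.private.p, true
  //   %cond = and i1 %ns, %np
  // where %is.shared.p and %is.private.p are calls to llvm.amdgcn.is.shared
  // and llvm.amdgcn.is.private on the same pointer.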
  Value *Ptr;
  if (match(
          const_cast<Value *>(V),
          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                      m_Deferred(Ptr))))))
    return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);

  return std::pair(nullptr, -1);
}

unsigned
AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
  switch (Kind) {
  case PseudoSourceValue::Stack:
  case PseudoSourceValue::FixedStack:
    return AMDGPUAS::PRIVATE_ADDRESS;
  case PseudoSourceValue::ConstantPool:
  case PseudoSourceValue::GOT:
  case PseudoSourceValue::JumpTable:
  case PseudoSourceValue::GlobalValueCallEntry:
  case PseudoSourceValue::ExternalSymbolCallEntry:
    return AMDGPUAS::CONSTANT_ADDRESS;
  }
  return AMDGPUAS::FLAT_ADDRESS;
}

bool AMDGPUTargetMachine::splitModule(
    Module &M, unsigned NumParts,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
  // FIXME(?): Would be better to use an already existing Analysis/PassManager,
  // but all current users of this API don't have one ready and would need to
  // create one anyway. Let's hide the boilerplate for now to keep it simple.

  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  PassBuilder PB(this);
  PB.registerModuleAnalyses(MAM);
  PB.registerFunctionAnalyses(FAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(AMDGPUSplitModulePass(NumParts, ModuleCallback));
  MPM.run(M, MAM);
  return true;
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   const TargetOptions &Options,
                                   std::optional<Reloc::Model> RM,
                                   std::optional<CodeModel::Model> CM,
                                   CodeGenOptLevel OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const TargetSubtargetInfo *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(std::make_unique<GCNTTIImpl>(this, F));
}

Error GCNTargetMachine::buildCodeGenPipeline(
    ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
    CodeGenFileType FileType, const CGPassBuilderOption &Opts, MCContext &Ctx,
    PassInstrumentationCallbacks *PIC) {
  AMDGPUCodeGenPassBuilder CGPB(*this, Opts, PIC);
  return CGPB.buildPipeline(MPM, Out, DwoOut, FileType, Ctx);
}

ScheduleDAGInstrs *
GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);

  Attribute SchedStrategyAttr =
      C->MF->getFunction().getFnAttribute("amdgpu-sched-strategy");
  StringRef SchedStrategy = SchedStrategyAttr.isValid()
                                ? SchedStrategyAttr.getValueAsString()
                                : AMDGPUSchedStrategy;

  if (SchedStrategy == "max-ilp")
    return createGCNMaxILPMachineScheduler(C);

  if (SchedStrategy == "max-memory-clause")
    return createGCNMaxMemoryClauseMachineScheduler(C);

  if (SchedStrategy == "iterative-ilp")
    return createIterativeILPMachineScheduler(C);

  if (SchedStrategy == "iterative-minreg")
    return createMinRegScheduler(C);

  if (SchedStrategy == "iterative-maxocc")
    return createIterativeGCNMaxOccupancyMachineScheduler(C);

  return createGCNMaxOccupancyMachineScheduler(C);
}

ScheduleDAGInstrs *
GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
  ScheduleDAGMI *DAG =
      new GCNPostScheduleDAGMILive(C, std::make_unique<PostGenericScheduler>(C),
                                   /*RemoveKillFlags=*/true);
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
  if ((EnableVOPD.getNumOccurrences() ||
       getOptLevel() >= CodeGenOptLevel::Less) &&
      EnableVOPD)
    DAG->addMutation(createVOPDPairingMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
  DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF));
  return DAG;
}

//===----------------------------------------------------------------------===//
// AMDGPU Legacy Pass Setup
//===----------------------------------------------------------------------===//

std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

namespace {

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(TargetMachine &TM, PassManagerBase &PM)
      : AMDGPUPassConfig(TM, PM) {
    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  void addPreRegBankSelect() override;
  bool addRegBankSelect() override;
  void addPreGlobalInstructionSelect() override;
  bool addGlobalInstructionSelect() override;
  void addPreRegAlloc() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  FunctionPass *createSGPRAllocPass(bool Optimized);
  FunctionPass *createVGPRAllocPass(bool Optimized);
  FunctionPass *createWWMRegAllocPass(bool Optimized);
  FunctionPass *createRegAllocPass(bool Optimized) override;

  bool addRegAssignAndRewriteFast() override;
  bool addRegAssignAndRewriteOptimized() override;

  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
  void addPostBBSections() override;
};

} // end anonymous namespace

AMDGPUPassConfig::AMDGPUPassConfig(TargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  // Garbage collection is not supported.
  disablePass(&GCLoweringID);
  disablePass(&ShadowStackGCLoweringID);
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOptLevel::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
    addPass(createLoopDataPrefetchPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN
  // or EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN())
    addPass(createAMDGPURemoveIncompatibleFunctionsPass(&TM));

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  if (TM.getTargetTriple().isAMDGCN())
    addPass(createAMDGPUPrintfRuntimeBinding());

  if (LowerCtorDtor)
    addPass(createAMDGPUCtorDtorLoweringLegacyPass());

  if (TM.getTargetTriple().isAMDGCN() &&
      isPassEnabled(EnableImageIntrinsicOptimizer))
    addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));

  if (EnableUniformIntrinsicCombine)
    addPass(createAMDGPUUniformIntrinsicCombineLegacyPass());

  // This can be disabled by passing ::Disable here or on the command line
  // with --expand-variadics-override=disable.
  addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Make enqueued block runtime handles externally visible.
  addPass(createAMDGPUExportKernelRuntimeHandlesLegacyPass());

  // Lower special LDS accesses.
  if (EnableLowerExecSync)
    addPass(createAMDGPULowerExecSyncLegacyPass());

  // Lower LDS accesses to global memory if address sanitizer is enabled.
  if (EnableSwLowerLDS)
    addPass(createAMDGPUSwLowerLDSLegacyPass(&TM));

  // Runs before PromoteAlloca so the latter can account for function uses.
  if (EnableLowerModuleLDS) {
    addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
  }

  // Run the atomic optimizer before AtomicExpand.
  if ((TM.getTargetTriple().isAMDGCN()) &&
      (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
    addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
  }

  addPass(createAtomicExpandLegacyPass());

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addPass(createAMDGPUPromoteAlloca());

    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
      }));
    }

    if (TM.getTargetTriple().isAMDGCN()) {
      // TODO: May want to move later or split into an early and late one.
      addPass(createAMDGPUCodeGenPreparePass());
    }

    // Try to hoist loop invariant parts of divisions that
    // AMDGPUCodeGenPrepare may have expanded.
    if (TM.getOptLevel() > CodeGenOptLevel::Less)
      addPass(createLICMPass());
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass();
}
1482
1483void AMDGPUPassConfig::addCodeGenPrepare() {
1484 if (TM->getTargetTriple().isAMDGCN() &&
1485 TM->getOptLevel() > CodeGenOptLevel::None)
1486 addPass(P: createAMDGPUPreloadKernelArgumentsLegacyPass(TM));
1487
1488 if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments)
1489 addPass(P: createAMDGPULowerKernelArgumentsPass());
1490
1491 TargetPassConfig::addCodeGenPrepare();
1492
1493 if (isPassEnabled(Opt: EnableLoadStoreVectorizer))
1494 addPass(P: createLoadStoreVectorizerPass());
1495
1496 if (TM->getTargetTriple().isAMDGCN()) {
1497 // This lowering has been placed after codegenprepare to take advantage of
1498 // address mode matching (which is why it isn't put with the LDS lowerings).
1499 // It could be placed anywhere before uniformity annotations (an analysis
1500 // that it changes by splitting up fat pointers into their components)
1501 // but has been put before switch lowering and CFG flattening so that those
1502 // passes can run on the more optimized control flow this pass creates in
1503 // many cases.
1504 addPass(P: createAMDGPULowerBufferFatPointersPass());
1505 addPass(P: createAMDGPULowerIntrinsicsLegacyPass());
1506 }
1507
1508 // LowerSwitch pass may introduce unreachable blocks that can
1509 // cause unexpected behavior for subsequent passes. Placing it
1510 // here seems better that these blocks would get cleaned up by
1511 // UnreachableBlockElim inserted next in the pass flow.
1512 addPass(P: createLowerSwitchPass());
1513}
1514
1515bool AMDGPUPassConfig::addPreISel() {
1516 if (TM->getOptLevel() > CodeGenOptLevel::None)
1517 addPass(P: createFlattenCFGPass());
1518 return false;
1519}
1520
1521bool AMDGPUPassConfig::addInstSelector() {
1522 addPass(P: createAMDGPUISelDag(TM&: getAMDGPUTargetMachine(), OptLevel: getOptLevel()));
1523 return false;
1524}
1525
1526bool AMDGPUPassConfig::addGCPasses() {
1527 // Do nothing. GC is not supported.
1528 return false;
1529}
1530
1531//===----------------------------------------------------------------------===//
1532// GCN Legacy Pass Setup
1533//===----------------------------------------------------------------------===//
1534
1535bool GCNPassConfig::addPreISel() {
1536 AMDGPUPassConfig::addPreISel();
1537
1538 if (TM->getOptLevel() > CodeGenOptLevel::None)
1539 addPass(P: createSinkingPass());
1540
1541 if (TM->getOptLevel() > CodeGenOptLevel::None)
1542 addPass(P: createAMDGPULateCodeGenPrepareLegacyPass());
1543
1544 // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
1545 // regions formed by them.
1546 addPass(PassID: &AMDGPUUnifyDivergentExitNodesID);
1547 addPass(P: createFixIrreduciblePass());
1548 addPass(P: createUnifyLoopExitsPass());
1549 addPass(P: createStructurizeCFGPass(SkipUniformRegions: false)); // true -> SkipUniformRegions
1550
1551 addPass(P: createAMDGPUAnnotateUniformValuesLegacy());
1552 addPass(P: createSIAnnotateControlFlowLegacyPass());
1553 // TODO: Move this right after structurizeCFG to avoid extra divergence
1554 // analysis. This depends on stopping SIAnnotateControlFlow from making
1555 // control flow modifications.
1556 addPass(P: createAMDGPURewriteUndefForPHILegacyPass());
1557
1558 // SDAG requires LCSSA, GlobalISel does not. Disable LCSSA for -global-isel
1559 // with -new-reg-bank-select and without any of the fallback options.
1560 if (!getCGPassBuilderOption().EnableGlobalISelOption ||
1561 !isGlobalISelAbortEnabled() || !NewRegBankSelect)
1562 addPass(P: createLCSSAPass());
1563
1564 if (TM->getOptLevel() > CodeGenOptLevel::Less)
1565 addPass(PassID: &AMDGPUPerfHintAnalysisLegacyID);
1566
1567 return false;
1568}
1569
1570void GCNPassConfig::addMachineSSAOptimization() {
1571 TargetPassConfig::addMachineSSAOptimization();
1572
1573 // We want to fold operands after PeepholeOptimizer has run (or as part of
1574 // it), because it will eliminate extra copies making it easier to fold the
1575 // real source operand. We want to eliminate dead instructions after, so that
1576 // we see fewer uses of the copies. We then need to clean up the dead
1577 // instructions leftover after the operands are folded as well.
1578 //
1579 // XXX - Can we get away without running DeadMachineInstructionElim again?
1580 addPass(PassID: &SIFoldOperandsLegacyID);
1581 if (EnableDPPCombine)
1582 addPass(PassID: &GCNDPPCombineLegacyID);
1583 addPass(PassID: &SILoadStoreOptimizerLegacyID);
1584 if (isPassEnabled(Opt: EnableSDWAPeephole)) {
1585 addPass(PassID: &SIPeepholeSDWALegacyID);
1586 addPass(PassID: &EarlyMachineLICMID);
1587 addPass(PassID: &MachineCSELegacyID);
1588 addPass(PassID: &SIFoldOperandsLegacyID);
1589 }
1590 addPass(PassID: &DeadMachineInstructionElimID);
1591 addPass(P: createSIShrinkInstructionsLegacyPass());
1592}
1593
1594bool GCNPassConfig::addILPOpts() {
1595 if (EnableEarlyIfConversion)
1596 addPass(PassID: &EarlyIfConverterLegacyID);
1597
1598 TargetPassConfig::addILPOpts();
1599 return false;
1600}
1601
1602bool GCNPassConfig::addInstSelector() {
1603 AMDGPUPassConfig::addInstSelector();
1604 addPass(PassID: &SIFixSGPRCopiesLegacyID);
1605 addPass(P: createSILowerI1CopiesLegacyPass());
1606 return false;
1607}
1608
1609bool GCNPassConfig::addIRTranslator() {
1610 addPass(P: new IRTranslator(getOptLevel()));
1611 return false;
1612}
1613
1614void GCNPassConfig::addPreLegalizeMachineIR() {
1615 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1616 addPass(P: createAMDGPUPreLegalizeCombiner(IsOptNone));
1617 addPass(P: new Localizer());
1618}
1619
1620bool GCNPassConfig::addLegalizeMachineIR() {
1621 addPass(P: new Legalizer());
1622 return false;
1623}
1624
1625void GCNPassConfig::addPreRegBankSelect() {
1626 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1627 addPass(P: createAMDGPUPostLegalizeCombiner(IsOptNone));
1628 addPass(P: createAMDGPUGlobalISelDivergenceLoweringPass());
1629}
1630
1631bool GCNPassConfig::addRegBankSelect() {
1632 if (NewRegBankSelect) {
1633 addPass(P: createAMDGPURegBankSelectPass());
1634 addPass(P: createAMDGPURegBankLegalizePass());
1635 } else {
1636 addPass(P: new RegBankSelect());
1637 }
1638 return false;
1639}
1640
1641void GCNPassConfig::addPreGlobalInstructionSelect() {
1642 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1643 addPass(P: createAMDGPURegBankCombiner(IsOptNone));
1644}
1645
1646bool GCNPassConfig::addGlobalInstructionSelect() {
1647 addPass(P: new InstructionSelect(getOptLevel()));
1648 return false;
1649}
1650
1651void GCNPassConfig::addFastRegAlloc() {
1652 // FIXME: We have to disable the verifier here because of PHIElimination +
1653 // TwoAddressInstructions disabling it.
1654
1655 // This must be run immediately after phi elimination and before
1656 // TwoAddressInstructions, otherwise the processing of the tied operand of
1657 // SI_ELSE will introduce a copy of the tied operand source after the else.
1658 insertPass(TargetPassID: &PHIEliminationID, InsertedPassID: &SILowerControlFlowLegacyID);
1659
1660 insertPass(TargetPassID: &TwoAddressInstructionPassID, InsertedPassID: &SIWholeQuadModeID);
1661
1662 TargetPassConfig::addFastRegAlloc();
1663}
1664
1665void GCNPassConfig::addPreRegAlloc() {
1666 if (getOptLevel() != CodeGenOptLevel::None)
1667 addPass(PassID: &AMDGPUPrepareAGPRAllocLegacyID);
1668}
1669
1670void GCNPassConfig::addOptimizedRegAlloc() {
1671 if (EnableDCEInRA)
1672 insertPass(TargetPassID: &DetectDeadLanesID, InsertedPassID: &DeadMachineInstructionElimID);
1673
1674 // FIXME: when an instruction has a Killed operand, and the instruction is
1675 // inside a bundle, seems only the BUNDLE instruction appears as the Kills of
1676 // the register in LiveVariables, this would trigger a failure in verifier,
1677 // we should fix it and enable the verifier.
1678 if (OptVGPRLiveRange)
1679 insertPass(TargetPassID: &LiveVariablesID, InsertedPassID: &SIOptimizeVGPRLiveRangeLegacyID);
1680
1681 // This must be run immediately after phi elimination and before
1682 // TwoAddressInstructions, otherwise the processing of the tied operand of
1683 // SI_ELSE will introduce a copy of the tied operand source after the else.
1684 insertPass(TargetPassID: &PHIEliminationID, InsertedPassID: &SILowerControlFlowLegacyID);
1685
1686 if (EnableRewritePartialRegUses)
1687 insertPass(TargetPassID: &RenameIndependentSubregsID, InsertedPassID: &GCNRewritePartialRegUsesID);
1688
1689 if (isPassEnabled(Opt: EnablePreRAOptimizations))
1690 insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &GCNPreRAOptimizationsID);
1691
1692 // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
1693 // instructions that cause scheduling barriers.
1694 insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &SIWholeQuadModeID);
1695
1696 if (OptExecMaskPreRA)
1697 insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &SIOptimizeExecMaskingPreRAID);
1698
1699 // This is not an essential optimization and it has a noticeable impact on
1700 // compilation time, so we only enable it from O2.
1701 if (TM->getOptLevel() > CodeGenOptLevel::Less)
1702 insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &SIFormMemoryClausesID);
1703
1704 TargetPassConfig::addOptimizedRegAlloc();
1705}
1706
1707bool GCNPassConfig::addPreRewrite() {
1708 if (EnableRegReassign)
1709 addPass(PassID: &GCNNSAReassignID);
1710
1711 addPass(PassID: &AMDGPURewriteAGPRCopyMFMALegacyID);
1712 return true;
1713}
1714
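// The three allocator factories below consult the RegisterRegAlloc registries
// first, so the default allocator for each register class can be overridden
// on the command line (the -sgpr-regalloc, -wwm-regalloc and -vgpr-regalloc
// options named in RegAllocOptNotSupportedMessage below).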
FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyRegisterAllocator(onlyAllocateSGPRs);

  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
                  initializeDefaultVGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyVGPRRegisterAllocator();

  return createFastVGPRRegisterAllocator();
}

FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultWWMRegisterAllocatorFlag,
                  initializeDefaultWWMRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyWWMRegisterAllocator();

  return createFastWWMRegisterAllocator();
}

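// AMDGPU allocates registers in three phases (SGPR, then WWM, then VGPR), so
// TargetPassConfig's single-allocator hook must never be reached; the
// per-class factories above are used instead.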
FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
    "-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, "
    "and -vgpr-regalloc";

bool GCNPassConfig::addRegAssignAndRewriteFast() {
  if (!usingDefaultRegAlloc())
    reportFatalUsageError(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsLegacyID);

  // Allocate WWM registers used in whole quad mode operations (for shaders).
  addPass(&SIPreAllocateWWMRegsLegacyID);

  // Allocate the other WWM register operands.
  addPass(createWWMRegAllocPass(false));

  addPass(&SILowerWWMCopiesLegacyID);
  addPass(&AMDGPUReserveWWMRegsLegacyID);

  // Allocate per-thread VGPRs.
  addPass(createVGPRAllocPass(false));

  return true;
}

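// The optimized path below mirrors the fast path above, but uses the greedy
// allocators and commits each phase with VirtRegRewriter, adding
// StackSlotColoring after the SGPR phase.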
bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    reportFatalUsageError(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(createVirtRegRewriter(false));

  // At this point, the sgpr-regalloc has been done and it is good to have the
  // stack slot coloring to try to optimize the SGPR spill stack indices before
  // attempting the custom SGPR spill lowering.
  addPass(&StackSlotColoringID);

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsLegacyID);

  // Allocate WWM registers used in whole quad mode operations (for shaders).
  addPass(&SIPreAllocateWWMRegsLegacyID);

  // Allocate the other whole wave mode registers.
  addPass(createWWMRegAllocPass(true));
  addPass(&SILowerWWMCopiesLegacyID);
  addPass(createVirtRegRewriter(false));
  addPass(&AMDGPUReserveWWMRegsLegacyID);

  // Allocate per-thread VGPRs.
  addPass(createVGPRAllocPass(true));

  addPreRewrite();
  addPass(&VirtRegRewriterID);

  addPass(&AMDGPUMarkLastScratchLoadID);

  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIOptimizeExecMaskingLegacyID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createSIShrinkInstructionsLegacyPass());
  addPass(&SIPostRABundlerLegacyID);
}

void GCNPassConfig::addPreEmitPass() {
  if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
    addPass(&GCNCreateVOPDID);
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
    addPass(createAMDGPUSetWavePriorityPass());
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee that it can handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  addPass(&AMDGPUWaitSGPRHazardsLegacyID);

  addPass(&AMDGPULowerVGPREncodingLegacyID);

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
    addPass(&AMDGPUInsertDelayAluID);

  addPass(&BranchRelaxationPassID);
}

void GCNPassConfig::addPostBBSections() {
  // We run this later to avoid passes like livedebugvalues and BBSections
  // having to deal with the apparent multi-entry functions we may generate.
  addPass(createAMDGPUPreloadKernArgPrologLegacyPass());
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

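// Register the SI function info as a delegate of the MachineRegisterInfo so
// it observes virtual register events (presumably to keep the per-vreg state
// it tracks in sync as registers are created).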
void GCNTargetMachine::registerMachineRegisterInfoCallback(
    MachineFunction &MF) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MF.getRegInfo().addDelegate(MFI);
}

MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
      Allocator, F, static_cast<const GCNSubtarget *>(STI));
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
}

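// A minimal sketch of the machineFunctionInfo YAML block this consumes; the
// register values are illustrative only:
//
//   machineFunctionInfo:
//     scratchRSrcReg:    '$sgpr0_sgpr1_sgpr2_sgpr3'
//     frameOffsetReg:    '$sgpr33'
//     stackPtrOffsetReg: '$sgpr32'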
bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fixup the subtarget dependent default value.
    MFI->Occupancy = ST.getOccupancyWithWorkGroupSizes(MF).second;
  }

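  // Parse a named register reference into RegVal. Parsing goes through a
  // temporary so RegVal is only written on success and keeps its prior value
  // on failure.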
  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

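  // An empty string means the field was absent from the YAML; treat that as
  // success rather than a parse error.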
  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
                            MFI->LongBranchReservedReg))
    return true;

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         {}, {});
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(ParsedReg);
  }

  for (const auto &[_, Info] : PFS.VRegInfosNamed) {
    MFI->setFlag(Info->VReg, Info->Flags);
  }
  for (const auto &[_, Info] : PFS.VRegInfos) {
    MFI->setFlag(Info->VReg, Info->Flags);
  }

  for (const auto &YamlRegStr : YamlMFI.SpillPhysVGPRS) {
    Register ParsedReg;
    if (parseRegister(YamlRegStr, ParsedReg))
      return true;
    MFI->SpillPhysVGPRs.push_back(ParsedReg);
  }

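  // Parse one optional ABI argument: resolve it to a register (verifying the
  // expected register class) or a stack offset, apply the optional mask, and
  // account for the user/system SGPRs it consumes.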
  auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, *A->Mask);

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.LDSKernelId, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  // Parse FirstKernArgPreloadReg separately, since it's a Register, not an
  // ArgDescriptor.
  if (YamlMFI.ArgInfo && YamlMFI.ArgInfo->FirstKernArgPreloadReg) {
    const yaml::SIArgument &A = *YamlMFI.ArgInfo->FirstKernArgPreloadReg;

    if (!A.IsRegister) {
      // For stack arguments we don't have RegisterName.SourceRange, but we
      // should have some location info from the YAML parser.
      const MemoryBuffer &Buffer =
          *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
      // Create a minimal valid source range.
      SMLoc Loc = SMLoc::getFromPointer(Buffer.getBufferStart());
      SMRange Range(Loc, Loc);

      Error = SMDiagnostic(
          *PFS.SM, Loc, Buffer.getBufferIdentifier(), 1, 0, SourceMgr::DK_Error,
          "firstKernArgPreloadReg must be a register, not a stack location", "",
          {}, {});

      SourceRange = Range;
      return true;
    }

    Register Reg;
    if (parseNamedRegisterReference(PFS, Reg, A.RegisterName.Value, Error)) {
      SourceRange = A.RegisterName.SourceRange;
      return true;
    }

    if (!AMDGPU::SGPR_32RegClass.contains(Reg))
      return diagnoseRegisterClass(A.RegisterName);

    MFI->ArgInfo.FirstKernArgPreloadReg = Reg;
    MFI->NumUserSGPRs += YamlMFI.NumKernargPreloadSGPRs;
  }

  if (ST.hasFeature(AMDGPU::FeatureDX10ClampAndIEEEMode)) {
    MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
    MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
  }

  // FIXME: Move proper support for denormal-fp-math into base MachineFunction.
  MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
                                      ? DenormalMode::IEEE
                                      : DenormalMode::PreserveSign;
  MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
                                       ? DenormalMode::IEEE
                                       : DenormalMode::PreserveSign;

  MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
                                          ? DenormalMode::IEEE
                                          : DenormalMode::PreserveSign;
  MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
                                           ? DenormalMode::IEEE
                                           : DenormalMode::PreserveSign;

  if (YamlMFI.HasInitWholeWave)
    MFI->setInitWholeWave();

  return false;
}

//===----------------------------------------------------------------------===//
// AMDGPU CodeGen Pass Builder interface.
//===----------------------------------------------------------------------===//
AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder(
    GCNTargetMachine &TM, const CGPassBuilderOption &Opts,
    PassInstrumentationCallbacks *PIC)
    : CodeGenPassBuilder(TM, Opts, PIC) {
  Opt.MISchedPostRA = true;
  Opt.RequiresCodeGenSCCOrder = true;
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  // Garbage collection is not supported.
  disablePass<StackMapLivenessPass, FuncletLayoutPass, PatchableFunctionPass,
              ShadowStackGCLoweringPass, GCLoweringPass>();
}

void AMDGPUCodeGenPassBuilder::addIRPasses(PassManagerWrapper &PMW) const {
  if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN()) {
    flushFPMsToMPM(PMW);
    addModulePass(AMDGPURemoveIncompatibleFunctionsPass(TM), PMW);
  }

  flushFPMsToMPM(PMW);

  if (TM.getTargetTriple().isAMDGCN())
    addModulePass(AMDGPUPrintfRuntimeBindingPass(), PMW);

  if (LowerCtorDtor)
    addModulePass(AMDGPUCtorDtorLoweringPass(), PMW);

  if (isPassEnabled(EnableImageIntrinsicOptimizer))
    addFunctionPass(AMDGPUImageIntrinsicOptimizerPass(TM), PMW);

  if (EnableUniformIntrinsicCombine)
    addFunctionPass(AMDGPUUniformIntrinsicCombinePass(), PMW);
  // This can be disabled by passing ::Disable here or on the command line
  // with --expand-variadics-override=disable.
  flushFPMsToMPM(PMW);
  addModulePass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering), PMW);

  addModulePass(AMDGPUAlwaysInlinePass(), PMW);
  addModulePass(AlwaysInlinerPass(), PMW);

  addModulePass(AMDGPUExportKernelRuntimeHandlesPass(), PMW);

  if (EnableLowerExecSync)
    addModulePass(AMDGPULowerExecSyncPass(), PMW);

  if (EnableSwLowerLDS)
    addModulePass(AMDGPUSwLowerLDSPass(TM), PMW);

  // Runs before PromoteAlloca so the latter can account for function uses.
  if (EnableLowerModuleLDS)
    addModulePass(AMDGPULowerModuleLDSPass(TM), PMW);

  // Run the atomic optimizer before AtomicExpand.
  if (TM.getOptLevel() >= CodeGenOptLevel::Less &&
      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None))
    addFunctionPass(
        AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy), PMW);

  addFunctionPass(AtomicExpandPass(TM), PMW);

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addFunctionPass(AMDGPUPromoteAllocaPass(TM), PMW);
    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses(PMW);

    // TODO: Handle EnableAMDGPUAliasAnalysis

    // TODO: May want to move later or split into an early and late one.
    addFunctionPass(AMDGPUCodeGenPreparePass(TM), PMW);

    // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
    // have expanded.
    if (TM.getOptLevel() > CodeGenOptLevel::Less) {
      addFunctionPass(createFunctionToLoopPassAdaptor(LICMPass(LICMOptions()),
                                                      /*UseMemorySSA=*/true),
                      PMW);
    }
  }

  Base::addIRPasses(PMW);

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass(PMW);
}

void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(
    PassManagerWrapper &PMW) const {
  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    flushFPMsToMPM(PMW);
    addModulePass(AMDGPUPreloadKernelArgumentsPass(TM), PMW);
  }

  if (EnableLowerKernelArguments)
    addFunctionPass(AMDGPULowerKernelArgumentsPass(TM), PMW);

  Base::addCodeGenPrepare(PMW);

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addFunctionPass(LoadStoreVectorizerPass(), PMW);

  // This lowering has been placed after codegenprepare to take advantage of
  // address mode matching (which is why it isn't put with the LDS lowerings).
  // It could be placed anywhere before uniformity annotations (an analysis
  // that it changes by splitting up fat pointers into their components),
  // but has been put before switch lowering and CFG flattening so that those
  // passes can run on the more optimized control flow this pass creates in
  // many cases.
  flushFPMsToMPM(PMW);
  addModulePass(AMDGPULowerBufferFatPointersPass(TM), PMW);
  flushFPMsToMPM(PMW);
  requireCGSCCOrder(PMW);

  addModulePass(AMDGPULowerIntrinsicsPass(TM), PMW);

  // The LowerSwitch pass may introduce unreachable blocks that can cause
  // unexpected behavior for subsequent passes. Placing it here lets those
  // blocks get cleaned up by UnreachableBlockElim, inserted next in the pass
  // flow.
  addFunctionPass(LowerSwitchPass(), PMW);
}

void AMDGPUCodeGenPassBuilder::addPreISel(PassManagerWrapper &PMW) const {
  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addFunctionPass(FlattenCFGPass(), PMW);
    addFunctionPass(SinkingPass(), PMW);
    addFunctionPass(AMDGPULateCodeGenPreparePass(TM), PMW);
  }

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addFunctionPass(AMDGPUUnifyDivergentExitNodesPass(), PMW);
  addFunctionPass(FixIrreduciblePass(), PMW);
  addFunctionPass(UnifyLoopExitsPass(), PMW);
  addFunctionPass(StructurizeCFGPass(/*SkipUniformRegions=*/false), PMW);

  addFunctionPass(AMDGPUAnnotateUniformValuesPass(), PMW);

  addFunctionPass(SIAnnotateControlFlowPass(TM), PMW);

  // TODO: Move this right after structurizeCFG to avoid extra divergence
  // analysis. This depends on stopping SIAnnotateControlFlow from making
  // control flow modifications.
  addFunctionPass(AMDGPURewriteUndefForPHIPass(), PMW);

  if (!getCGPassBuilderOption().EnableGlobalISelOption ||
      !isGlobalISelAbortEnabled() || !NewRegBankSelect)
    addFunctionPass(LCSSAPass(), PMW);

  if (TM.getOptLevel() > CodeGenOptLevel::Less) {
    flushFPMsToMPM(PMW);
    addModulePass(AMDGPUPerfHintAnalysisPass(TM), PMW);
  }

  // FIXME: Why isn't this queried as required from AMDGPUISelDAGToDAG, and why
  // isn't this in addInstSelector?
  addFunctionPass(RequireAnalysisPass<UniformityInfoAnalysis, Function>(), PMW,
                  /*Force=*/true);
}

void AMDGPUCodeGenPassBuilder::addILPOpts(PassManagerWrapper &PMW) const {
  if (EnableEarlyIfConversion)
    addMachineFunctionPass(EarlyIfConverterPass(), PMW);

  Base::addILPOpts(PMW);
}

void AMDGPUCodeGenPassBuilder::addAsmPrinterBegin(
    PassManagerWrapper &PMW, CreateMCStreamer CreateStreamer) const {
  // TODO: Add AsmPrinterBegin.
}

void AMDGPUCodeGenPassBuilder::addAsmPrinter(
    PassManagerWrapper &PMW, CreateMCStreamer CreateStreamer) const {
  // TODO: Add AsmPrinter.
}

void AMDGPUCodeGenPassBuilder::addAsmPrinterEnd(
    PassManagerWrapper &PMW, CreateMCStreamer CreateStreamer) const {
  // TODO: Add AsmPrinterEnd.
}

Error AMDGPUCodeGenPassBuilder::addInstSelector(PassManagerWrapper &PMW) const {
  addMachineFunctionPass(AMDGPUISelDAGToDAGPass(TM), PMW);
  addMachineFunctionPass(SIFixSGPRCopiesPass(), PMW);
  addMachineFunctionPass(SILowerI1CopiesPass(), PMW);
  return Error::success();
}

void AMDGPUCodeGenPassBuilder::addPreRewrite(PassManagerWrapper &PMW) const {
  if (EnableRegReassign) {
    addMachineFunctionPass(GCNNSAReassignPass(), PMW);
  }

  addMachineFunctionPass(AMDGPURewriteAGPRCopyMFMAPass(), PMW);
}

void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization(
    PassManagerWrapper &PMW) const {
  Base::addMachineSSAOptimization(PMW);

  addMachineFunctionPass(SIFoldOperandsPass(), PMW);
  if (EnableDPPCombine) {
    addMachineFunctionPass(GCNDPPCombinePass(), PMW);
  }
  addMachineFunctionPass(SILoadStoreOptimizerPass(), PMW);
  if (isPassEnabled(EnableSDWAPeephole)) {
    addMachineFunctionPass(SIPeepholeSDWAPass(), PMW);
    addMachineFunctionPass(EarlyMachineLICMPass(), PMW);
    addMachineFunctionPass(MachineCSEPass(), PMW);
    addMachineFunctionPass(SIFoldOperandsPass(), PMW);
  }
  addMachineFunctionPass(DeadMachineInstructionElimPass(), PMW);
  addMachineFunctionPass(SIShrinkInstructionsPass(), PMW);
}

Error AMDGPUCodeGenPassBuilder::addFastRegAlloc(PassManagerWrapper &PMW) const {
  insertPass<PHIEliminationPass>(SILowerControlFlowPass());

  insertPass<TwoAddressInstructionPass>(SIWholeQuadModePass());

  return Base::addFastRegAlloc(PMW);
}

Error AMDGPUCodeGenPassBuilder::addRegAssignmentFast(
    PassManagerWrapper &PMW) const {
  if (auto Err = validateRegAllocOptions())
    return Err;

  addMachineFunctionPass(GCNPreRALongBranchRegPass(), PMW);

  // SGPR allocation - default to fast at -O0.
  if (SGPRRegAllocNPM == RegAllocType::Greedy)
    addMachineFunctionPass(RAGreedyPass({onlyAllocateSGPRs, "sgpr"}), PMW);
  else
    addMachineFunctionPass(RegAllocFastPass({onlyAllocateSGPRs, "sgpr", false}),
                           PMW);

  // Equivalent of PEI for SGPRs.
  addMachineFunctionPass(SILowerSGPRSpillsPass(), PMW);

  // Allocate WWM registers used in whole quad mode operations (for shaders).
  addMachineFunctionPass(SIPreAllocateWWMRegsPass(), PMW);

  // WWM allocation - default to fast at -O0.
  if (WWMRegAllocNPM == RegAllocType::Greedy)
    addMachineFunctionPass(RAGreedyPass({onlyAllocateWWMRegs, "wwm"}), PMW);
  else
    addMachineFunctionPass(
        RegAllocFastPass({onlyAllocateWWMRegs, "wwm", false}), PMW);

  addMachineFunctionPass(SILowerWWMCopiesPass(), PMW);
  addMachineFunctionPass(AMDGPUReserveWWMRegsPass(), PMW);

  // VGPR allocation - default to fast at -O0.
  if (VGPRRegAllocNPM == RegAllocType::Greedy)
    addMachineFunctionPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"}), PMW);
  else
    addMachineFunctionPass(RegAllocFastPass({onlyAllocateVGPRs, "vgpr"}), PMW);

  return Error::success();
}

Error AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
    PassManagerWrapper &PMW) const {
  if (EnableDCEInRA)
    insertPass<DetectDeadLanesPass>(DeadMachineInstructionElimPass());

  // FIXME: when an instruction has a Killed operand, and the instruction is
  // inside a bundle, it seems that only the BUNDLE instruction appears as the
  // kill of the register in LiveVariables. This would trigger a failure in the
  // verifier; we should fix it and enable the verifier.
  if (OptVGPRLiveRange)
    insertPass<RequireAnalysisPass<LiveVariablesAnalysis, MachineFunction>>(
        SIOptimizeVGPRLiveRangePass());

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass<PHIEliminationPass>(SILowerControlFlowPass());

  if (EnableRewritePartialRegUses)
    insertPass<RenameIndependentSubregsPass>(GCNRewritePartialRegUsesPass());

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass<MachineSchedulerPass>(GCNPreRAOptimizationsPass());

  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
  // instructions that cause scheduling barriers.
  insertPass<MachineSchedulerPass>(SIWholeQuadModePass());

  if (OptExecMaskPreRA)
    insertPass<MachineSchedulerPass>(SIOptimizeExecMaskingPreRAPass());

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM.getOptLevel() > CodeGenOptLevel::Less)
    insertPass<MachineSchedulerPass>(SIFormMemoryClausesPass());

  return Base::addOptimizedRegAlloc(PMW);
}

void AMDGPUCodeGenPassBuilder::addPreRegAlloc(PassManagerWrapper &PMW) const {
  if (getOptLevel() != CodeGenOptLevel::None)
    addMachineFunctionPass(AMDGPUPrepareAGPRAllocPass(), PMW);
}

Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
    PassManagerWrapper &PMW) const {
  if (auto Err = validateRegAllocOptions())
    return Err;

  addMachineFunctionPass(GCNPreRALongBranchRegPass(), PMW);

  // SGPR allocation - default to greedy at -O1 and above.
  if (SGPRRegAllocNPM == RegAllocType::Fast)
    addMachineFunctionPass(RegAllocFastPass({onlyAllocateSGPRs, "sgpr", false}),
                           PMW);
  else
    addMachineFunctionPass(RAGreedyPass({onlyAllocateSGPRs, "sgpr"}), PMW);

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addMachineFunctionPass(VirtRegRewriterPass(false), PMW);

  // At this point, the sgpr-regalloc has been done and it is good to have the
  // stack slot coloring to try to optimize the SGPR spill stack indices before
  // attempting the custom SGPR spill lowering.
  addMachineFunctionPass(StackSlotColoringPass(), PMW);

  // Equivalent of PEI for SGPRs.
  addMachineFunctionPass(SILowerSGPRSpillsPass(), PMW);

  // Allocate WWM registers used in whole quad mode operations (for shaders).
  addMachineFunctionPass(SIPreAllocateWWMRegsPass(), PMW);

  // WWM allocation - default to greedy at -O1 and above.
  if (WWMRegAllocNPM == RegAllocType::Fast)
    addMachineFunctionPass(
        RegAllocFastPass({onlyAllocateWWMRegs, "wwm", false}), PMW);
  else
    addMachineFunctionPass(RAGreedyPass({onlyAllocateWWMRegs, "wwm"}), PMW);
  addMachineFunctionPass(SILowerWWMCopiesPass(), PMW);
  addMachineFunctionPass(VirtRegRewriterPass(false), PMW);
  addMachineFunctionPass(AMDGPUReserveWWMRegsPass(), PMW);

  // VGPR allocation - default to greedy at -O1 and above.
  if (VGPRRegAllocNPM == RegAllocType::Fast)
    addMachineFunctionPass(RegAllocFastPass({onlyAllocateVGPRs, "vgpr"}), PMW);
  else
    addMachineFunctionPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"}), PMW);

  addPreRewrite(PMW);
  addMachineFunctionPass(VirtRegRewriterPass(true), PMW);

  addMachineFunctionPass(AMDGPUMarkLastScratchLoadPass(), PMW);
  return Error::success();
}

void AMDGPUCodeGenPassBuilder::addPostRegAlloc(PassManagerWrapper &PMW) const {
  addMachineFunctionPass(SIFixVGPRCopiesPass(), PMW);
  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addMachineFunctionPass(SIOptimizeExecMaskingPass(), PMW);
  Base::addPostRegAlloc(PMW);
}

void AMDGPUCodeGenPassBuilder::addPreSched2(PassManagerWrapper &PMW) const {
  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addMachineFunctionPass(SIShrinkInstructionsPass(), PMW);
  addMachineFunctionPass(SIPostRABundlerPass(), PMW);
}

void AMDGPUCodeGenPassBuilder::addPostBBSections(
    PassManagerWrapper &PMW) const {
  // We run this later to avoid passes like livedebugvalues and BBSections
  // having to deal with the apparent multi-entry functions we may generate.
  addMachineFunctionPass(AMDGPUPreloadKernArgPrologPass(), PMW);
}

void AMDGPUCodeGenPassBuilder::addPreEmitPass(PassManagerWrapper &PMW) const {
  if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) {
    addMachineFunctionPass(GCNCreateVOPDPass(), PMW);
  }

  addMachineFunctionPass(SIMemoryLegalizerPass(), PMW);
  addMachineFunctionPass(SIInsertWaitcntsPass(), PMW);

  addMachineFunctionPass(SIModeRegisterPass(), PMW);

  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addMachineFunctionPass(SIInsertHardClausesPass(), PMW);

  addMachineFunctionPass(SILateBranchLoweringPass(), PMW);

  if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
    addMachineFunctionPass(AMDGPUSetWavePriorityPass(), PMW);

  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addMachineFunctionPass(SIPreEmitPeepholePass(), PMW);

  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee that it can handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addMachineFunctionPass(PostRAHazardRecognizerPass(), PMW);
  addMachineFunctionPass(AMDGPUWaitSGPRHazardsPass(), PMW);
  addMachineFunctionPass(AMDGPULowerVGPREncodingPass(), PMW);

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) {
    addMachineFunctionPass(AMDGPUInsertDelayAluPass(), PMW);
  }

  addMachineFunctionPass(BranchRelaxationPass(), PMW);
}

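// An explicit occurrence of the option on the command line always wins;
// otherwise the option's default value applies only at or above Level.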
bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt,
                                             CodeGenOptLevel Level) const {
  if (Opt.getNumOccurrences())
    return Opt;
  if (TM.getOptLevel() < Level)
    return false;
  return Opt;
}

void AMDGPUCodeGenPassBuilder::addEarlyCSEOrGVNPass(
    PassManagerWrapper &PMW) const {
  if (TM.getOptLevel() == CodeGenOptLevel::Aggressive)
    addFunctionPass(GVNPass(), PMW);
  else
    addFunctionPass(EarlyCSEPass(), PMW);
}

void AMDGPUCodeGenPassBuilder::addStraightLineScalarOptimizationPasses(
    PassManagerWrapper &PMW) const {
  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
    addFunctionPass(LoopDataPrefetchPass(), PMW);

  addFunctionPass(SeparateConstOffsetFromGEPPass(), PMW);

  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addFunctionPass(StraightLineStrengthReducePass(), PMW);

  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass(PMW);

  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addFunctionPass(NaryReassociatePass(), PMW);

  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addFunctionPass(EarlyCSEPass(), PMW);
}