1//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file contains both AMDGPU target machine and the CodeGen pass builder.
11/// The AMDGPU target machine contains all of the hardware specific information
12/// needed to emit code for SI+ GPUs in the legacy pass manager pipeline. The
13/// CodeGen pass builder handles the pass pipeline for new pass manager.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUTargetMachine.h"
18#include "AMDGPU.h"
19#include "AMDGPUAliasAnalysis.h"
20#include "AMDGPUBarrierLatency.h"
21#include "AMDGPUCtorDtorLowering.h"
22#include "AMDGPUExportClustering.h"
23#include "AMDGPUExportKernelRuntimeHandles.h"
24#include "AMDGPUHazardLatency.h"
25#include "AMDGPUIGroupLP.h"
26#include "AMDGPUISelDAGToDAG.h"
27#include "AMDGPULowerVGPREncoding.h"
28#include "AMDGPUMacroFusion.h"
29#include "AMDGPUPerfHintAnalysis.h"
30#include "AMDGPUPreloadKernArgProlog.h"
31#include "AMDGPUPrepareAGPRAlloc.h"
32#include "AMDGPURemoveIncompatibleFunctions.h"
33#include "AMDGPUReserveWWMRegs.h"
34#include "AMDGPUResourceUsageAnalysis.h"
35#include "AMDGPUSplitModule.h"
36#include "AMDGPUTargetObjectFile.h"
37#include "AMDGPUTargetTransformInfo.h"
38#include "AMDGPUUnifyDivergentExitNodes.h"
39#include "AMDGPUWaitSGPRHazards.h"
40#include "GCNDPPCombine.h"
41#include "GCNIterativeScheduler.h"
42#include "GCNNSAReassign.h"
43#include "GCNPreRALongBranchReg.h"
44#include "GCNPreRAOptimizations.h"
45#include "GCNRewritePartialRegUses.h"
46#include "GCNSchedStrategy.h"
47#include "GCNVOPDUtils.h"
48#include "R600.h"
49#include "R600TargetMachine.h"
50#include "SIFixSGPRCopies.h"
51#include "SIFixVGPRCopies.h"
52#include "SIFoldOperands.h"
53#include "SIFormMemoryClauses.h"
54#include "SILoadStoreOptimizer.h"
55#include "SILowerControlFlow.h"
56#include "SILowerSGPRSpills.h"
57#include "SILowerWWMCopies.h"
58#include "SIMachineFunctionInfo.h"
59#include "SIMachineScheduler.h"
60#include "SIOptimizeExecMasking.h"
61#include "SIOptimizeExecMaskingPreRA.h"
62#include "SIOptimizeVGPRLiveRange.h"
63#include "SIPeepholeSDWA.h"
64#include "SIPostRABundler.h"
65#include "SIPreAllocateWWMRegs.h"
66#include "SIShrinkInstructions.h"
67#include "SIWholeQuadMode.h"
68#include "TargetInfo/AMDGPUTargetInfo.h"
69#include "Utils/AMDGPUBaseInfo.h"
70#include "llvm/Analysis/CGSCCPassManager.h"
71#include "llvm/Analysis/CallGraphSCCPass.h"
72#include "llvm/Analysis/KernelInfo.h"
73#include "llvm/Analysis/UniformityAnalysis.h"
74#include "llvm/CodeGen/AtomicExpand.h"
75#include "llvm/CodeGen/BranchRelaxation.h"
76#include "llvm/CodeGen/DeadMachineInstructionElim.h"
77#include "llvm/CodeGen/EarlyIfConversion.h"
78#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
79#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
80#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
81#include "llvm/CodeGen/GlobalISel/Legalizer.h"
82#include "llvm/CodeGen/GlobalISel/Localizer.h"
83#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
84#include "llvm/CodeGen/MIRParser/MIParser.h"
85#include "llvm/CodeGen/MachineCSE.h"
86#include "llvm/CodeGen/MachineLICM.h"
87#include "llvm/CodeGen/MachineScheduler.h"
88#include "llvm/CodeGen/Passes.h"
89#include "llvm/CodeGen/PostRAHazardRecognizer.h"
90#include "llvm/CodeGen/RegAllocRegistry.h"
91#include "llvm/CodeGen/TargetPassConfig.h"
92#include "llvm/IR/IntrinsicsAMDGPU.h"
93#include "llvm/IR/PassManager.h"
94#include "llvm/IR/PatternMatch.h"
95#include "llvm/InitializePasses.h"
96#include "llvm/MC/TargetRegistry.h"
97#include "llvm/Passes/CodeGenPassBuilder.h"
98#include "llvm/Passes/PassBuilder.h"
99#include "llvm/Support/Compiler.h"
100#include "llvm/Support/FormatVariadic.h"
101#include "llvm/Transforms/HipStdPar/HipStdPar.h"
102#include "llvm/Transforms/IPO.h"
103#include "llvm/Transforms/IPO/AlwaysInliner.h"
104#include "llvm/Transforms/IPO/ExpandVariadics.h"
105#include "llvm/Transforms/IPO/GlobalDCE.h"
106#include "llvm/Transforms/IPO/Internalize.h"
107#include "llvm/Transforms/Scalar.h"
108#include "llvm/Transforms/Scalar/EarlyCSE.h"
109#include "llvm/Transforms/Scalar/FlattenCFG.h"
110#include "llvm/Transforms/Scalar/GVN.h"
111#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
112#include "llvm/Transforms/Scalar/LICM.h"
113#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
114#include "llvm/Transforms/Scalar/LoopPassManager.h"
115#include "llvm/Transforms/Scalar/NaryReassociate.h"
116#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
117#include "llvm/Transforms/Scalar/Sink.h"
118#include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h"
119#include "llvm/Transforms/Scalar/StructurizeCFG.h"
120#include "llvm/Transforms/Utils.h"
121#include "llvm/Transforms/Utils/FixIrreducible.h"
122#include "llvm/Transforms/Utils/LCSSA.h"
123#include "llvm/Transforms/Utils/LowerSwitch.h"
124#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
125#include "llvm/Transforms/Utils/UnifyLoopExits.h"
126#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
127#include <optional>
128
129using namespace llvm;
130using namespace llvm::PatternMatch;
131
132namespace {
133//===----------------------------------------------------------------------===//
134// AMDGPU CodeGen Pass Builder interface.
135//===----------------------------------------------------------------------===//
136
137class AMDGPUCodeGenPassBuilder
138 : public CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine> {
139 using Base = CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine>;
140
141public:
142 AMDGPUCodeGenPassBuilder(GCNTargetMachine &TM,
143 const CGPassBuilderOption &Opts,
144 PassInstrumentationCallbacks *PIC);
145
146 void addIRPasses(PassManagerWrapper &PMW) const;
147 void addCodeGenPrepare(PassManagerWrapper &PMW) const;
148 void addPreISel(PassManagerWrapper &PMW) const;
149 void addILPOpts(PassManagerWrapper &PMWM) const;
150 void addAsmPrinterBegin(PassManagerWrapper &PMW, CreateMCStreamer) const;
151 void addAsmPrinter(PassManagerWrapper &PMW, CreateMCStreamer) const;
152 void addAsmPrinterEnd(PassManagerWrapper &PMW, CreateMCStreamer) const;
153 Error addInstSelector(PassManagerWrapper &PMW) const;
154 void addPreRewrite(PassManagerWrapper &PMW) const;
155 void addMachineSSAOptimization(PassManagerWrapper &PMW) const;
156 void addPostRegAlloc(PassManagerWrapper &PMW) const;
157 void addPreEmitPass(PassManagerWrapper &PMWM) const;
158 void addPreEmitRegAlloc(PassManagerWrapper &PMW) const;
159 Error addRegAssignmentFast(PassManagerWrapper &PMW) const;
160 Error addRegAssignmentOptimized(PassManagerWrapper &PMW) const;
161 void addPreRegAlloc(PassManagerWrapper &PMW) const;
162 Error addFastRegAlloc(PassManagerWrapper &PMW) const;
163 Error addOptimizedRegAlloc(PassManagerWrapper &PMW) const;
164 void addPreSched2(PassManagerWrapper &PMW) const;
165 void addPostBBSections(PassManagerWrapper &PMW) const;
166
167private:
168 Error validateRegAllocOptions() const;
169
170public:
171 /// Check if a pass is enabled given \p Opt option. The option always
172 /// overrides defaults if explicitly used. Otherwise its default will be used
173 /// given that a pass shall work at an optimization \p Level minimum.
174 bool isPassEnabled(const cl::opt<bool> &Opt,
175 CodeGenOptLevel Level = CodeGenOptLevel::Default) const;
176 void addEarlyCSEOrGVNPass(PassManagerWrapper &PMW) const;
177 void addStraightLineScalarOptimizationPasses(PassManagerWrapper &PMW) const;
178};
179
/// Registry node type for command-line-selectable SGPR register allocators
/// (the -sgpr-regalloc option defined below).
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
  // N = option value name, D = description, C = allocator pass factory.
  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
    : RegisterRegAllocBase(N, D, C) {}
};
185
/// Registry node type for command-line-selectable VGPR register allocators
/// (the -vgpr-regalloc option defined below).
class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
public:
  // N = option value name, D = description, C = allocator pass factory.
  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
    : RegisterRegAllocBase(N, D, C) {}
};
191
/// Registry node type for command-line-selectable WWM register allocators
/// (the -wwm-regalloc option defined below).
class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
public:
  // N = option value name, D = description, C = allocator pass factory.
  WWMRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
    : RegisterRegAllocBase(N, D, C) {}
};
197
198static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
199 const MachineRegisterInfo &MRI,
200 const Register Reg) {
201 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
202 return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
203}
204
205static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
206 const MachineRegisterInfo &MRI,
207 const Register Reg) {
208 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
209 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
210}
211
212static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI,
213 const MachineRegisterInfo &MRI,
214 const Register Reg) {
215 const SIMachineFunctionInfo *MFI =
216 MRI.getMF().getInfo<SIMachineFunctionInfo>();
217 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
218 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) &&
219 MFI->checkFlag(Reg, Flag: AMDGPU::VirtRegFlag::WWM_REG);
220}
221
/// -{sgpr|wwm|vgpr}-regalloc=... command line option.
/// Returning null signals "no explicit allocator chosen"; the once-init
/// helpers below then pick a default based on the -O level.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
// One flag per allocation phase, guarding one-time default selection.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;
230
231static SGPRRegisterRegAlloc
232defaultSGPRRegAlloc("default",
233 "pick SGPR register allocator based on -O option",
234 useDefaultRegisterAllocator);
235
236static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
237 RegisterPassParser<SGPRRegisterRegAlloc>>
238SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(Val: &useDefaultRegisterAllocator),
239 cl::desc("Register allocator to use for SGPRs"));
240
241static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
242 RegisterPassParser<VGPRRegisterRegAlloc>>
243VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(Val: &useDefaultRegisterAllocator),
244 cl::desc("Register allocator to use for VGPRs"));
245
246static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
247 RegisterPassParser<WWMRegisterRegAlloc>>
248 WWMRegAlloc("wwm-regalloc", cl::Hidden,
249 cl::init(Val: &useDefaultRegisterAllocator),
250 cl::desc("Register allocator to use for WWM registers"));
251
252// New pass manager register allocator options for AMDGPU
253static cl::opt<RegAllocType, false, RegAllocTypeParser> SGPRRegAllocNPM(
254 "sgpr-regalloc-npm", cl::Hidden, cl::init(Val: RegAllocType::Default),
255 cl::desc("Register allocator for SGPRs (new pass manager)"));
256
257static cl::opt<RegAllocType, false, RegAllocTypeParser> VGPRRegAllocNPM(
258 "vgpr-regalloc-npm", cl::Hidden, cl::init(Val: RegAllocType::Default),
259 cl::desc("Register allocator for VGPRs (new pass manager)"));
260
261static cl::opt<RegAllocType, false, RegAllocTypeParser> WWMRegAllocNPM(
262 "wwm-regalloc-npm", cl::Hidden, cl::init(Val: RegAllocType::Default),
263 cl::desc("Register allocator for WWM registers (new pass manager)"));
264
265/// Check if the given RegAllocType is supported for AMDGPU NPM register
266/// allocation. Only Fast and Greedy are supported; Basic and PBQP are not.
267static Error checkRegAllocSupported(RegAllocType RAType, StringRef RegName) {
268 if (RAType == RegAllocType::Basic || RAType == RegAllocType::PBQP) {
269 return make_error<StringError>(
270 Args: Twine("unsupported register allocator '") +
271 (RAType == RegAllocType::Basic ? "basic" : "pbqp") + "' for " +
272 RegName + " registers",
273 Args: inconvertibleErrorCode());
274 }
275 return Error::success();
276}
277
278Error AMDGPUCodeGenPassBuilder::validateRegAllocOptions() const {
279 // 1. Generic --regalloc-npm is not supported for AMDGPU.
280 if (Opt.RegAlloc != RegAllocType::Unset) {
281 return make_error<StringError>(
282 Args: "-regalloc-npm not supported for amdgcn. Use -sgpr-regalloc-npm, "
283 "-vgpr-regalloc-npm, and -wwm-regalloc-npm",
284 Args: inconvertibleErrorCode());
285 }
286
287 // 2. Legacy PM regalloc options are not compatible with NPM.
288 if (SGPRRegAlloc.getNumOccurrences() > 0 ||
289 VGPRRegAlloc.getNumOccurrences() > 0 ||
290 WWMRegAlloc.getNumOccurrences() > 0) {
291 return make_error<StringError>(
292 Args: "-sgpr-regalloc, -vgpr-regalloc, and -wwm-regalloc are legacy PM "
293 "options. Use -sgpr-regalloc-npm, -vgpr-regalloc-npm, and "
294 "-wwm-regalloc-npm with the new pass manager",
295 Args: inconvertibleErrorCode());
296 }
297
298 // 3. Only Fast and Greedy allocators are supported for AMDGPU.
299 if (auto Err = checkRegAllocSupported(RAType: SGPRRegAllocNPM, RegName: "SGPR"))
300 return Err;
301 if (auto Err = checkRegAllocSupported(RAType: WWMRegAllocNPM, RegName: "WWM"))
302 return Err;
303 if (auto Err = checkRegAllocSupported(RAType: VGPRRegAllocNPM, RegName: "VGPR"))
304 return Err;
305
306 return Error::success();
307}
308
309static void initializeDefaultSGPRRegisterAllocatorOnce() {
310 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
311
312 if (!Ctor) {
313 Ctor = SGPRRegAlloc;
314 SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
315 }
316}
317
318static void initializeDefaultVGPRRegisterAllocatorOnce() {
319 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
320
321 if (!Ctor) {
322 Ctor = VGPRRegAlloc;
323 VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
324 }
325}
326
327static void initializeDefaultWWMRegisterAllocatorOnce() {
328 RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
329
330 if (!Ctor) {
331 Ctor = WWMRegAlloc;
332 WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
333 }
334}
335
336static FunctionPass *createBasicSGPRRegisterAllocator() {
337 return createBasicRegisterAllocator(F: onlyAllocateSGPRs);
338}
339
340static FunctionPass *createGreedySGPRRegisterAllocator() {
341 return createGreedyRegisterAllocator(F: onlyAllocateSGPRs);
342}
343
344static FunctionPass *createFastSGPRRegisterAllocator() {
345 return createFastRegisterAllocator(F: onlyAllocateSGPRs, ClearVirtRegs: false);
346}
347
348static FunctionPass *createBasicVGPRRegisterAllocator() {
349 return createBasicRegisterAllocator(F: onlyAllocateVGPRs);
350}
351
352static FunctionPass *createGreedyVGPRRegisterAllocator() {
353 return createGreedyRegisterAllocator(F: onlyAllocateVGPRs);
354}
355
356static FunctionPass *createFastVGPRRegisterAllocator() {
357 return createFastRegisterAllocator(F: onlyAllocateVGPRs, ClearVirtRegs: true);
358}
359
360static FunctionPass *createBasicWWMRegisterAllocator() {
361 return createBasicRegisterAllocator(F: onlyAllocateWWMRegs);
362}
363
364static FunctionPass *createGreedyWWMRegisterAllocator() {
365 return createGreedyRegisterAllocator(F: onlyAllocateWWMRegs);
366}
367
368static FunctionPass *createFastWWMRegisterAllocator() {
369 return createFastRegisterAllocator(F: onlyAllocateWWMRegs, ClearVirtRegs: false);
370}
371
// Register the named allocator choices for each of the three allocation
// phases. These make "basic", "greedy", and "fast" valid values for the
// corresponding -{sgpr,vgpr,wwm}-regalloc options.
static SGPRRegisterRegAlloc basicRegAllocSGPR(
  "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
  "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);

static SGPRRegisterRegAlloc fastRegAllocSGPR(
  "fast", "fast register allocator", createFastSGPRRegisterAllocator);


static VGPRRegisterRegAlloc basicRegAllocVGPR(
  "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
static VGPRRegisterRegAlloc greedyRegAllocVGPR(
  "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);

static VGPRRegisterRegAlloc fastRegAllocVGPR(
  "fast", "fast register allocator", createFastVGPRRegisterAllocator);
static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
                                               "basic register allocator",
                                               createBasicWWMRegisterAllocator);
static WWMRegisterRegAlloc
    greedyRegAllocWWMReg("greedy", "greedy register allocator",
                         createGreedyWWMRegisterAllocator);
static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator",
                                              createFastWWMRegisterAllocator);
396
397static bool isLTOPreLink(ThinOrFullLTOPhase Phase) {
398 return Phase == ThinOrFullLTOPhase::FullLTOPreLink ||
399 Phase == ThinOrFullLTOPhase::ThinLTOPreLink;
400}
401} // anonymous namespace
402
403static cl::opt<bool>
404EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
405 cl::desc("Run early if-conversion"),
406 cl::init(Val: false));
407
408static cl::opt<bool>
409OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
410 cl::desc("Run pre-RA exec mask optimizations"),
411 cl::init(Val: true));
412
413static cl::opt<bool>
414 LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
415 cl::desc("Lower GPU ctor / dtors to globals on the device."),
416 cl::init(Val: true), cl::Hidden);
417
418// Option to disable vectorizer for tests.
419static cl::opt<bool> EnableLoadStoreVectorizer(
420 "amdgpu-load-store-vectorizer",
421 cl::desc("Enable load store vectorizer"),
422 cl::init(Val: true),
423 cl::Hidden);
424
425// Option to control global loads scalarization
426static cl::opt<bool> ScalarizeGlobal(
427 "amdgpu-scalarize-global-loads",
428 cl::desc("Enable global load scalarization"),
429 cl::init(Val: true),
430 cl::Hidden);
431
432// Option to run internalize pass.
433static cl::opt<bool> InternalizeSymbols(
434 "amdgpu-internalize-symbols",
435 cl::desc("Enable elimination of non-kernel functions and unused globals"),
436 cl::init(Val: false),
437 cl::Hidden);
438
439// Option to inline all early.
440static cl::opt<bool> EarlyInlineAll(
441 "amdgpu-early-inline-all",
442 cl::desc("Inline all functions early"),
443 cl::init(Val: false),
444 cl::Hidden);
445
446static cl::opt<bool> RemoveIncompatibleFunctions(
447 "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
448 cl::desc("Enable removal of functions when they"
449 "use features not supported by the target GPU"),
450 cl::init(Val: true));
451
452static cl::opt<bool> EnableSDWAPeephole(
453 "amdgpu-sdwa-peephole",
454 cl::desc("Enable SDWA peepholer"),
455 cl::init(Val: true));
456
457static cl::opt<bool> EnableDPPCombine(
458 "amdgpu-dpp-combine",
459 cl::desc("Enable DPP combiner"),
460 cl::init(Val: true));
461
462// Enable address space based alias analysis
463static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
464 cl::desc("Enable AMDGPU Alias Analysis"),
465 cl::init(Val: true));
466
467// Enable lib calls simplifications
468static cl::opt<bool> EnableLibCallSimplify(
469 "amdgpu-simplify-libcall",
470 cl::desc("Enable amdgpu library simplifications"),
471 cl::init(Val: true),
472 cl::Hidden);
473
474static cl::opt<bool> EnableLowerKernelArguments(
475 "amdgpu-ir-lower-kernel-arguments",
476 cl::desc("Lower kernel argument loads in IR pass"),
477 cl::init(Val: true),
478 cl::Hidden);
479
480static cl::opt<bool> EnableRegReassign(
481 "amdgpu-reassign-regs",
482 cl::desc("Enable register reassign optimizations on gfx10+"),
483 cl::init(Val: true),
484 cl::Hidden);
485
486static cl::opt<bool> OptVGPRLiveRange(
487 "amdgpu-opt-vgpr-liverange",
488 cl::desc("Enable VGPR liverange optimizations for if-else structure"),
489 cl::init(Val: true), cl::Hidden);
490
491static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
492 "amdgpu-atomic-optimizer-strategy",
493 cl::desc("Select DPP or Iterative strategy for scan"),
494 cl::init(Val: ScanOptions::Iterative),
495 cl::values(
496 clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
497 clEnumValN(ScanOptions::Iterative, "Iterative",
498 "Use Iterative approach for scan"),
499 clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
500
501// Enable Mode register optimization
502static cl::opt<bool> EnableSIModeRegisterPass(
503 "amdgpu-mode-register",
504 cl::desc("Enable mode register pass"),
505 cl::init(Val: true),
506 cl::Hidden);
507
508// Enable GFX11+ s_delay_alu insertion
509static cl::opt<bool>
510 EnableInsertDelayAlu("amdgpu-enable-delay-alu",
511 cl::desc("Enable s_delay_alu insertion"),
512 cl::init(Val: true), cl::Hidden);
513
514// Enable GFX11+ VOPD
515static cl::opt<bool>
516 EnableVOPD("amdgpu-enable-vopd",
517 cl::desc("Enable VOPD, dual issue of VALU in wave32"),
518 cl::init(Val: true), cl::Hidden);
519
520// Option is used in lit tests to prevent deadcoding of patterns inspected.
521static cl::opt<bool>
522EnableDCEInRA("amdgpu-dce-in-ra",
523 cl::init(Val: true), cl::Hidden,
524 cl::desc("Enable machine DCE inside regalloc"));
525
526static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
527 cl::desc("Adjust wave priority"),
528 cl::init(Val: false), cl::Hidden);
529
530static cl::opt<bool> EnableScalarIRPasses(
531 "amdgpu-scalar-ir-passes",
532 cl::desc("Enable scalar IR passes"),
533 cl::init(Val: true),
534 cl::Hidden);
535
536static cl::opt<bool> EnableLowerExecSync(
537 "amdgpu-enable-lower-exec-sync",
538 cl::desc("Enable lowering of execution synchronization."), cl::init(Val: true),
539 cl::Hidden);
540
541static cl::opt<bool>
542 EnableSwLowerLDS("amdgpu-enable-sw-lower-lds",
543 cl::desc("Enable lowering of lds to global memory pass "
544 "and asan instrument resulting IR."),
545 cl::init(Val: true), cl::Hidden);
546
547static cl::opt<bool, true> EnableLowerModuleLDS(
548 "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
549 cl::location(L&: AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(Val: true),
550 cl::Hidden);
551
552static cl::opt<bool> EnablePreRAOptimizations(
553 "amdgpu-enable-pre-ra-optimizations",
554 cl::desc("Enable Pre-RA optimizations pass"), cl::init(Val: true),
555 cl::Hidden);
556
557static cl::opt<bool> EnablePromoteKernelArguments(
558 "amdgpu-enable-promote-kernel-arguments",
559 cl::desc("Enable promotion of flat kernel pointer arguments to global"),
560 cl::Hidden, cl::init(Val: true));
561
562static cl::opt<bool> EnableImageIntrinsicOptimizer(
563 "amdgpu-enable-image-intrinsic-optimizer",
564 cl::desc("Enable image intrinsic optimizer pass"), cl::init(Val: true),
565 cl::Hidden);
566
567static cl::opt<bool>
568 EnableLoopPrefetch("amdgpu-loop-prefetch",
569 cl::desc("Enable loop data prefetch on AMDGPU"),
570 cl::Hidden, cl::init(Val: false));
571
572static cl::opt<std::string>
573 AMDGPUSchedStrategy("amdgpu-sched-strategy",
574 cl::desc("Select custom AMDGPU scheduling strategy."),
575 cl::Hidden, cl::init(Val: ""));
576
577static cl::opt<bool> EnableRewritePartialRegUses(
578 "amdgpu-enable-rewrite-partial-reg-uses",
579 cl::desc("Enable rewrite partial reg uses pass"), cl::init(Val: true),
580 cl::Hidden);
581
582static cl::opt<bool> EnableHipStdPar(
583 "amdgpu-enable-hipstdpar",
584 cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(Val: false),
585 cl::Hidden);
586
587static cl::opt<bool>
588 EnableAMDGPUAttributor("amdgpu-attributor-enable",
589 cl::desc("Enable AMDGPUAttributorPass"),
590 cl::init(Val: true), cl::Hidden);
591
592static cl::opt<bool> NewRegBankSelect(
593 "new-reg-bank-select",
594 cl::desc("Run amdgpu-regbankselect and amdgpu-regbanklegalize instead of "
595 "regbankselect"),
596 cl::init(Val: false), cl::Hidden);
597
598static cl::opt<bool> HasClosedWorldAssumption(
599 "amdgpu-link-time-closed-world",
600 cl::desc("Whether has closed-world assumption at link time"),
601 cl::init(Val: false), cl::Hidden);
602
603static cl::opt<bool> EnableUniformIntrinsicCombine(
604 "amdgpu-enable-uniform-intrinsic-combine",
605 cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
606 cl::init(Val: true), cl::Hidden);
607
/// Entry point invoked by the target registry: registers the R600 and GCN
/// target machines and the legacy-PM passes implemented by this backend with
/// the global PassRegistry.
extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  // R600-specific passes.
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeR600EmitClauseMarkersPass(*PR);
  initializeR600MachineCFGStructurizerPass(*PR);
  // GlobalISel infrastructure and GCN/SI passes.
  initializeGlobalISel(*PR);
  initializeAMDGPUAsmPrinterPass(*PR);
  initializeAMDGPUDAGToDAGISelLegacyPass(*PR);
  initializeAMDGPUPrepareAGPRAllocLegacyPass(*PR);
  initializeGCNDPPCombineLegacyPass(*PR);
  initializeSILowerI1CopiesLegacyPass(*PR);
  initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
  initializeAMDGPURegBankSelectPass(*PR);
  initializeAMDGPURegBankLegalizePass(*PR);
  initializeSILowerWWMCopiesLegacyPass(*PR);
  initializeAMDGPUMarkLastScratchLoadLegacyPass(*PR);
  initializeSILowerSGPRSpillsLegacyPass(*PR);
  initializeSIFixSGPRCopiesLegacyPass(*PR);
  initializeSIFixVGPRCopiesLegacyPass(*PR);
  initializeSIFoldOperandsLegacyPass(*PR);
  initializeSIPeepholeSDWALegacyPass(*PR);
  initializeSIShrinkInstructionsLegacyPass(*PR);
  initializeSIOptimizeExecMaskingPreRALegacyPass(*PR);
  initializeSIOptimizeVGPRLiveRangeLegacyPass(*PR);
  initializeSILoadStoreOptimizerLegacyPass(*PR);
  initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPULowerExecSyncLegacyPass(*PR);
  initializeAMDGPUSwLowerLDSLegacyPass(*PR);
  initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPUPromoteKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR);
  initializeAMDGPUPostLegalizerCombinerPass(*PR);
  initializeAMDGPUPreLegalizerCombinerPass(*PR);
  initializeAMDGPURegBankCombinerPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPULateCodeGenPrepareLegacyPass(*PR);
  initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);
  initializeAMDGPULowerModuleLDSLegacyPass(*PR);
  initializeAMDGPULowerBufferFatPointersPass(*PR);
  initializeAMDGPULowerIntrinsicsLegacyPass(*PR);
  initializeAMDGPUReserveWWMRegsLegacyPass(*PR);
  initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
  initializeSIAnnotateControlFlowLegacyPass(*PR);
  initializeAMDGPUInsertDelayAluLegacyPass(*PR);
  initializeAMDGPULowerVGPREncodingLegacyPass(*PR);
  initializeSIInsertHardClausesLegacyPass(*PR);
  initializeSIInsertWaitcntsLegacyPass(*PR);
  initializeSIModeRegisterLegacyPass(*PR);
  initializeSIWholeQuadModeLegacyPass(*PR);
  initializeSILowerControlFlowLegacyPass(*PR);
  initializeSIPreEmitPeepholeLegacyPass(*PR);
  initializeSILateBranchLoweringLegacyPass(*PR);
  initializeSIMemoryLegalizerLegacyPass(*PR);
  initializeSIOptimizeExecMaskingLegacyPass(*PR);
  initializeSIPreAllocateWWMRegsLegacyPass(*PR);
  initializeSIFormMemoryClausesLegacyPass(*PR);
  initializeSIPostRABundlerLegacyPass(*PR);
  initializeGCNCreateVOPDLegacyPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  // Analyses and utilities.
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUExternalAAWrapperPass(*PR);
  initializeAMDGPUImageIntrinsicOptimizerPass(*PR);
  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
  initializeAMDGPUResourceUsageAnalysisWrapperPassPass(*PR);
  initializeGCNNSAReassignLegacyPass(*PR);
  initializeGCNPreRAOptimizationsLegacyPass(*PR);
  initializeGCNPreRALongBranchRegLegacyPass(*PR);
  initializeGCNRewritePartialRegUsesLegacyPass(*PR);
  initializeGCNRegPressurePrinterPass(*PR);
  initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
  initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
  initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR);
  initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR);
}
697
698static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
699 return std::make_unique<AMDGPUTargetObjectFile>();
700}
701
702static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
703 return new SIScheduleDAGMI(C);
704}
705
706static ScheduleDAGInstrs *
707createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
708 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
709 ScheduleDAGMILive *DAG =
710 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(args&: C));
711 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
712 if (ST.shouldClusterStores())
713 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
714 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
715 DAG->addMutation(Mutation: createAMDGPUMacroFusionDAGMutation());
716 DAG->addMutation(Mutation: createAMDGPUExportClusteringDAGMutation());
717 DAG->addMutation(Mutation: createAMDGPUBarrierLatencyDAGMutation(MF: C->MF));
718 DAG->addMutation(Mutation: createAMDGPUHazardLatencyDAGMutation(MF: C->MF));
719 return DAG;
720}
721
722static ScheduleDAGInstrs *
723createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
724 ScheduleDAGMILive *DAG =
725 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(args&: C));
726 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
727 return DAG;
728}
729
730static ScheduleDAGInstrs *
731createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
732 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
733 ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
734 C, std::make_unique<GCNMaxMemoryClauseSchedStrategy>(args&: C));
735 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
736 if (ST.shouldClusterStores())
737 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
738 DAG->addMutation(Mutation: createAMDGPUExportClusteringDAGMutation());
739 DAG->addMutation(Mutation: createAMDGPUBarrierLatencyDAGMutation(MF: C->MF));
740 DAG->addMutation(Mutation: createAMDGPUHazardLatencyDAGMutation(MF: C->MF));
741 return DAG;
742}
743
744static ScheduleDAGInstrs *
745createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
746 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
747 auto *DAG = new GCNIterativeScheduler(
748 C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
749 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
750 if (ST.shouldClusterStores())
751 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
752 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
753 return DAG;
754}
755
756static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
757 auto *DAG = new GCNIterativeScheduler(
758 C, GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
759 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
760 return DAG;
761}
762
763static ScheduleDAGInstrs *
764createIterativeILPMachineScheduler(MachineSchedContext *C) {
765 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
766 auto *DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);
767 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
768 if (ST.shouldClusterStores())
769 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
770 DAG->addMutation(Mutation: createAMDGPUMacroFusionDAGMutation());
771 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
772 return DAG;
773}
774
// Register each custom scheduler with the machine-scheduler registry so it
// can be selected by name (e.g. via -misched=<name>).
static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
    GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
                           createGCNMaxILPMachineScheduler);

static MachineSchedRegistry GCNMaxMemoryClauseSchedRegistry(
    "gcn-max-memory-clause", "Run GCN scheduler to maximize memory clause",
    createGCNMaxMemoryClauseMachineScheduler);

// The "iterative" schedulers below are experimental alternatives built on
// GCNIterativeScheduler.
static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
    "gcn-iterative-max-occupancy-experimental",
    "Run GCN scheduler to maximize occupancy (experimental)",
    createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry GCNMinRegSchedRegistry(
    "gcn-iterative-minreg",
    "Run GCN iterative scheduler for minimal register usage (experimental)",
    createMinRegScheduler);

static MachineSchedRegistry GCNILPSchedRegistry(
    "gcn-iterative-ilp",
    "Run GCN iterative scheduler for ILP scheduling (experimental)",
    createIterativeILPMachineScheduler);
806
807LLVM_READNONE
808static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
809 if (!GPU.empty())
810 return GPU;
811
812 // Need to default to a target with flat support for HSA.
813 if (TT.isAMDGCN())
814 return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
815
816 return "r600";
817}
818
819static Reloc::Model getEffectiveRelocModel() {
820 // The AMDGPU toolchain only supports generating shared objects, so we
821 // must always use PIC.
822 return Reloc::PIC_;
823}
824
// Common target machine for both R600 and GCN. Defaults the CPU/reloc/code
// model when unspecified, then sets up MC-level info.
AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         const TargetOptions &Options,
                                         std::optional<Reloc::Model> RM,
                                         std::optional<CodeModel::Model> CM,
                                         CodeGenOptLevel OptLevel)
    : CodeGenTargetMachineImpl(
          T, TT.computeDataLayout(), TT, getGPUOrDefault(TT, GPU: CPU), FS, Options,
          getEffectiveRelocModel(), getEffectiveCodeModel(CM, Default: CodeModel::Small),
          OptLevel),
      TLOF(createTLOF(TT: getTargetTriple())) {
  initAsmInfo();
  // Pick the DWARF register-number flavour matching the wavefront size baked
  // into the feature string (wave64 vs wave32 use different numberings).
  if (TT.isAMDGCN()) {
    if (getMCSubtargetInfo()->checkFeatures(FS: "+wavefrontsize64"))
      MRI.reset(p: llvm::createGCNMCRegisterInfo(DwarfFlavour: AMDGPUDwarfFlavour::Wave64));
    else if (getMCSubtargetInfo()->checkFeatures(FS: "+wavefrontsize32"))
      MRI.reset(p: llvm::createGCNMCRegisterInfo(DwarfFlavour: AMDGPUDwarfFlavour::Wave32));
  }
}
844
// Static knobs consulted by the pass pipelines below (e.g. EarlyInlineAll
// honors EnableFunctionCalls; LDS lowering passes honor EnableLowerModuleLDS).
// Presumably bound to cl::opts elsewhere in this file — confirm there.
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
849
850StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
851 Attribute GPUAttr = F.getFnAttribute(Kind: "target-cpu");
852 return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
853}
854
855StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
856 Attribute FSAttr = F.getFnAttribute(Kind: "target-features");
857
858 return FSAttr.isValid() ? FSAttr.getValueAsString()
859 : getTargetFeatureString();
860}
861
862llvm::ScheduleDAGInstrs *
863AMDGPUTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
864 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
865 ScheduleDAGMILive *DAG = createSchedLive(C);
866 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
867 if (ST.shouldClusterStores())
868 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
869 return DAG;
870}
871
872/// Predicate for Internalize pass.
873static bool mustPreserveGV(const GlobalValue &GV) {
874 if (const Function *F = dyn_cast<Function>(Val: &GV))
875 return F->isDeclaration() || F->getName().starts_with(Prefix: "__asan_") ||
876 F->getName().starts_with(Prefix: "__sanitizer_") ||
877 AMDGPU::isEntryFunctionCC(CC: F->getCallingConv());
878
879 GV.removeDeadConstantUsers();
880 return !GV.use_empty();
881}
882
883void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
884 if (EnableAMDGPUAliasAnalysis)
885 AAM.registerFunctionAnalysis<AMDGPUAA>();
886}
887
888static Expected<ScanOptions>
889parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
890 if (Params.empty())
891 return ScanOptions::Iterative;
892 Params.consume_front(Prefix: "strategy=");
893 auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
894 .Case(S: "dpp", Value: ScanOptions::DPP)
895 .Cases(CaseStrings: {"iterative", ""}, Value: ScanOptions::Iterative)
896 .Case(S: "none", Value: ScanOptions::None)
897 .Default(Value: std::nullopt);
898 if (Result)
899 return *Result;
900 return make_error<StringError>(Args: "invalid parameter", Args: inconvertibleErrorCode());
901}
902
903Expected<AMDGPUAttributorOptions>
904parseAMDGPUAttributorPassOptions(StringRef Params) {
905 AMDGPUAttributorOptions Result;
906 while (!Params.empty()) {
907 StringRef ParamName;
908 std::tie(args&: ParamName, args&: Params) = Params.split(Separator: ';');
909 if (ParamName == "closed-world") {
910 Result.IsClosedWorld = true;
911 } else {
912 return make_error<StringError>(
913 Args: formatv(Fmt: "invalid AMDGPUAttributor pass parameter '{0}' ", Vals&: ParamName)
914 .str(),
915 Args: inconvertibleErrorCode());
916 }
917 }
918 return Result;
919}
920
// Hook AMDGPU-specific IR passes into the new-pass-manager optimization
// pipeline at the PassBuilder extension points. Callback order within each
// extension point is significant.
void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {

// Pull in the generated target pass registrations from the .def file.
#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"

  // Re-run address-space inference late in the scalar pipeline and after
  // vectorization; earlier passes can expose more flat->specific rewrites.
  PB.registerScalarOptimizerLateEPCallback(
      C: [](FunctionPassManager &FPM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FPM.addPass(Pass: InferAddressSpacesPass());
      });

  PB.registerVectorizerEndEPCallback(
      C: [](FunctionPassManager &FPM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FPM.addPass(Pass: InferAddressSpacesPass());
      });

  PB.registerPipelineEarlySimplificationEPCallback(
      C: [](ModulePassManager &PM, OptimizationLevel Level,
           ThinOrFullLTOPhase Phase) {
        if (!isLTOPreLink(Phase)) {
          // When we are not using -fgpu-rdc, we can run accelerator code
          // selection relatively early, but still after linking to prevent
          // eager removal of potentially reachable symbols.
          if (EnableHipStdPar) {
            PM.addPass(Pass: HipStdParMathFixupPass());
            PM.addPass(Pass: HipStdParAcceleratorCodeSelectionPass());
          }
          PM.addPass(Pass: AMDGPUPrintfRuntimeBindingPass());
        }

        if (Level == OptimizationLevel::O0)
          return;

        // We don't want to run internalization at per-module stage.
        if (InternalizeSymbols && !isLTOPreLink(Phase)) {
          PM.addPass(Pass: InternalizePass(mustPreserveGV));
          PM.addPass(Pass: GlobalDCEPass());
        }

        if (EarlyInlineAll && !EnableFunctionCalls)
          PM.addPass(Pass: AMDGPUAlwaysInlinePass());
      });

  PB.registerPeepholeEPCallback(
      C: [](FunctionPassManager &FPM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        // Math-library simplifications and uniform-intrinsic cleanups.
        FPM.addPass(Pass: AMDGPUUseNativeCallsPass());
        if (EnableLibCallSimplify)
          FPM.addPass(Pass: AMDGPUSimplifyLibCallsPass());

        if (EnableUniformIntrinsicCombine)
          FPM.addPass(Pass: AMDGPUUniformIntrinsicCombinePass());
      });

  PB.registerCGSCCOptimizerLateEPCallback(
      C: [this](CGSCCPassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(Pass: AMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(Pass: InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(Pass: AMDGPULowerKernelAttributesPass());

        if (Level != OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(Pass: AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(Pass: createCGSCCToFunctionPassAdaptor(Pass: std::move(FPM)));
      });

  // FIXME: Why is AMDGPUAttributor not in CGSCC?
  PB.registerOptimizerLastEPCallback(C: [this](ModulePassManager &MPM,
                                          OptimizationLevel Level,
                                          ThinOrFullLTOPhase Phase) {
    if (Level != OptimizationLevel::O0) {
      if (!isLTOPreLink(Phase)) {
        if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) {
          AMDGPUAttributorOptions Opts;
          MPM.addPass(Pass: AMDGPUAttributorPass(*this, Opts, Phase));
        }
      }
    }
  });

  PB.registerFullLinkTimeOptimizationLastEPCallback(
      C: [this](ModulePassManager &PM, OptimizationLevel Level) {
        // When we are using -fgpu-rdc, we can only run accelerator code
        // selection after linking to prevent, otherwise we end up removing
        // potentially reachable symbols that were exported as external in other
        // modules.
        if (EnableHipStdPar) {
          PM.addPass(Pass: HipStdParMathFixupPass());
          PM.addPass(Pass: HipStdParAcceleratorCodeSelectionPass());
        }
        // We want to support the -lto-partitions=N option as "best effort".
        // For that, we need to lower LDS earlier in the pipeline before the
        // module is partitioned for codegen.
        if (EnableLowerExecSync)
          PM.addPass(Pass: AMDGPULowerExecSyncPass());
        if (EnableSwLowerLDS)
          PM.addPass(Pass: AMDGPUSwLowerLDSPass(*this));
        if (EnableLowerModuleLDS)
          PM.addPass(Pass: AMDGPULowerModuleLDSPass(*this));
        if (Level != OptimizationLevel::O0) {
          // We only want to run this with O2 or higher since inliner and SROA
          // don't run in O1.
          if (Level != OptimizationLevel::O1) {
            PM.addPass(
                Pass: createModuleToFunctionPassAdaptor(Pass: InferAddressSpacesPass()));
          }
          // Do we really need internalization in LTO?
          if (InternalizeSymbols) {
            PM.addPass(Pass: InternalizePass(mustPreserveGV));
            PM.addPass(Pass: GlobalDCEPass());
          }
          if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) {
            AMDGPUAttributorOptions Opt;
            if (HasClosedWorldAssumption)
              Opt.IsClosedWorld = true;
            PM.addPass(Pass: AMDGPUAttributorPass(
                *this, Opt, ThinOrFullLTOPhase::FullLTOPostLink));
          }
        }
        if (!NoKernelInfoEndLTO) {
          FunctionPassManager FPM;
          FPM.addPass(Pass: KernelInfoPrinter(this));
          PM.addPass(Pass: createModuleToFunctionPassAdaptor(Pass: std::move(FPM)));
        }
      });

  // Map regalloc filter names ("sgpr"/"vgpr"/"wwm") onto the predicates that
  // restrict allocation to the corresponding register classes.
  PB.registerRegClassFilterParsingCallback(
      C: [](StringRef FilterName) -> RegAllocFilterFunc {
        if (FilterName == "sgpr")
          return onlyAllocateSGPRs;
        if (FilterName == "vgpr")
          return onlyAllocateVGPRs;
        if (FilterName == "wwm")
          return onlyAllocateWWMRegs;
        return nullptr;
      });
}
1085
1086int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
1087 return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1088 AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1089 AddrSpace == AMDGPUAS::REGION_ADDRESS)
1090 ? -1
1091 : 0;
1092}
1093
1094bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
1095 unsigned DestAS) const {
1096 return AMDGPU::isFlatGlobalAddrSpace(AS: SrcAS) &&
1097 AMDGPU::isFlatGlobalAddrSpace(AS: DestAS);
1098}
1099
1100unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
1101 if (auto *Arg = dyn_cast<Argument>(Val: V);
1102 Arg &&
1103 AMDGPU::isModuleEntryFunctionCC(CC: Arg->getParent()->getCallingConv()) &&
1104 !Arg->hasByRefAttr())
1105 return AMDGPUAS::GLOBAL_ADDRESS;
1106
1107 const auto *LD = dyn_cast<LoadInst>(Val: V);
1108 if (!LD) // TODO: Handle invariant load like constant.
1109 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
1110
1111 // It must be a generic pointer loaded.
1112 assert(V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
1113
1114 const auto *Ptr = LD->getPointerOperand();
1115 if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
1116 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
1117 // For a generic pointer loaded from the constant memory, it could be assumed
1118 // as a global pointer since the constant memory is only populated on the
1119 // host side. As implied by the offload programming model, only global
1120 // pointers could be referenced on the host side.
1121 return AMDGPUAS::GLOBAL_ADDRESS;
1122}
1123
// Identify a pointer whose address space is implied by a predicate guarding
// the current code path. Returns {pointer, implied AS}, or {nullptr, -1}
// when nothing can be deduced from \p V.
std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  // A direct is_shared/is_private check implies the queried pointer's AS on
  // the true edge.
  if (auto *II = dyn_cast<IntrinsicInst>(Val: V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::pair(II->getArgOperand(i: 0), AMDGPUAS::LOCAL_ADDRESS);
    case Intrinsic::amdgcn_is_private:
      return std::pair(II->getArgOperand(i: 0), AMDGPUAS::PRIVATE_ADDRESS);
    default:
      break;
    }
    return std::pair(nullptr, -1);
  }
  // Check the global pointer predication based on
  // (!is_share(p) && !is_private(p)). Note that logic 'and' is commutative and
  // the order of 'is_shared' and 'is_private' is not significant.
  Value *Ptr;
  if (match(
          V: const_cast<Value *>(V),
          P: m_c_And(L: m_Not(V: m_Intrinsic<Intrinsic::amdgcn_is_shared>(Op0: m_Value(V&: Ptr))),
                  R: m_Not(V: m_Intrinsic<Intrinsic::amdgcn_is_private>(
                      Op0: m_Deferred(V: Ptr))))))
    return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);

  return std::pair(nullptr, -1);
}
1150
1151unsigned
1152AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
1153 switch (Kind) {
1154 case PseudoSourceValue::Stack:
1155 case PseudoSourceValue::FixedStack:
1156 return AMDGPUAS::PRIVATE_ADDRESS;
1157 case PseudoSourceValue::ConstantPool:
1158 case PseudoSourceValue::GOT:
1159 case PseudoSourceValue::JumpTable:
1160 case PseudoSourceValue::GlobalValueCallEntry:
1161 case PseudoSourceValue::ExternalSymbolCallEntry:
1162 return AMDGPUAS::CONSTANT_ADDRESS;
1163 }
1164 return AMDGPUAS::FLAT_ADDRESS;
1165}
1166
// Split \p M into \p NumParts pieces, invoking \p ModuleCallback on each.
// Always returns true (the split itself is best-effort inside the pass).
bool AMDGPUTargetMachine::splitModule(
    Module &M, unsigned NumParts,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
  // FIXME(?): Would be better to use an already existing Analysis/PassManager,
  // but all current users of this API don't have one ready and would need to
  // create one anyway. Let's hide the boilerplate for now to keep it simple.

  // Note: declaration order of the analysis managers matters (proxies hold
  // references across them), so keep LAM/FAM/CGAM/MAM in this order.
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  PassBuilder PB(this);
  PB.registerModuleAnalyses(MAM);
  PB.registerFunctionAnalyses(FAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(Pass: AMDGPUSplitModulePass(NumParts, ModuleCallback));
  MPM.run(IR&: M, AM&: MAM);
  return true;
}
1189
1190//===----------------------------------------------------------------------===//
1191// GCN Target Machine (SI+)
1192//===----------------------------------------------------------------------===//
1193
// GCN target machine simply forwards to the common AMDGPU constructor; the
// JIT flag is currently unused here.
GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   const TargetOptions &Options,
                                   std::optional<Reloc::Model> RM,
                                   std::optional<CodeModel::Model> CM,
                                   CodeGenOptLevel OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
1201
1202const TargetSubtargetInfo *
1203GCNTargetMachine::getSubtargetImpl(const Function &F) const {
1204 StringRef GPU = getGPUName(F);
1205 StringRef FS = getFeatureString(F);
1206
1207 SmallString<128> SubtargetKey(GPU);
1208 SubtargetKey.append(RHS: FS);
1209
1210 auto &I = SubtargetMap[SubtargetKey];
1211 if (!I) {
1212 // This needs to be done before we create a new subtarget since any
1213 // creation will depend on the TM and the code generation flags on the
1214 // function that reside in TargetOptions.
1215 resetTargetOptions(F);
1216 I = std::make_unique<GCNSubtarget>(args: TargetTriple, args&: GPU, args&: FS, args: *this);
1217 }
1218
1219 I->setScalarizeGlobalBehavior(ScalarizeGlobal);
1220
1221 return I.get();
1222}
1223
// Cost-model / TTI hook: wrap the GCN-specific implementation for \p F.
TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(std::make_unique<GCNTTIImpl>(args: this, args: F));
}
1228
// New-pass-manager codegen entry point: delegate pipeline construction to
// the AMDGPU CodeGen pass builder.
Error GCNTargetMachine::buildCodeGenPipeline(
    ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
    CodeGenFileType FileType, const CGPassBuilderOption &Opts, MCContext &Ctx,
    PassInstrumentationCallbacks *PIC) {
  AMDGPUCodeGenPassBuilder CGPB(*this, Opts, PIC);
  return CGPB.buildPipeline(MPM, Out, DwoOut, FileType, Ctx);
}
1236
1237ScheduleDAGInstrs *
1238GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
1239 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1240 if (ST.enableSIScheduler())
1241 return createSIMachineScheduler(C);
1242
1243 Attribute SchedStrategyAttr =
1244 C->MF->getFunction().getFnAttribute(Kind: "amdgpu-sched-strategy");
1245 StringRef SchedStrategy = SchedStrategyAttr.isValid()
1246 ? SchedStrategyAttr.getValueAsString()
1247 : AMDGPUSchedStrategy;
1248
1249 if (SchedStrategy == "max-ilp")
1250 return createGCNMaxILPMachineScheduler(C);
1251
1252 if (SchedStrategy == "max-memory-clause")
1253 return createGCNMaxMemoryClauseMachineScheduler(C);
1254
1255 if (SchedStrategy == "iterative-ilp")
1256 return createIterativeILPMachineScheduler(C);
1257
1258 if (SchedStrategy == "iterative-minreg")
1259 return createMinRegScheduler(C);
1260
1261 if (SchedStrategy == "iterative-maxocc")
1262 return createIterativeGCNMaxOccupancyMachineScheduler(C);
1263
1264 return createGCNMaxOccupancyMachineScheduler(C);
1265}
1266
1267ScheduleDAGInstrs *
1268GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
1269 ScheduleDAGMI *DAG =
1270 new GCNPostScheduleDAGMILive(C, std::make_unique<PostGenericScheduler>(args&: C),
1271 /*RemoveKillFlags=*/true);
1272 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1273 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
1274 if (ST.shouldClusterStores())
1275 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
1276 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PostRA));
1277 if ((EnableVOPD.getNumOccurrences() ||
1278 getOptLevel() >= CodeGenOptLevel::Less) &&
1279 EnableVOPD)
1280 DAG->addMutation(Mutation: createVOPDPairingMutation());
1281 DAG->addMutation(Mutation: createAMDGPUExportClusteringDAGMutation());
1282 DAG->addMutation(Mutation: createAMDGPUBarrierLatencyDAGMutation(MF: C->MF));
1283 DAG->addMutation(Mutation: createAMDGPUHazardLatencyDAGMutation(MF: C->MF));
1284 return DAG;
1285}
1286//===----------------------------------------------------------------------===//
1287// AMDGPU Legacy Pass Setup
1288//===----------------------------------------------------------------------===//
1289
// GlobalISel CSE configuration: use the standard config for the current
// optimization level.
std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(Level: TM->getOptLevel());
}
1293
namespace {

/// Legacy pass-manager pass configuration for GCN (SI+) code generation.
/// Overrides the generic TargetPassConfig hooks with the AMDGPU pipeline.
class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(TargetMachine &TM, PassManagerBase &PM)
      : AMDGPUPassConfig(TM, PM) {
    // Use the post-RA machine scheduler in place of the generic post-RA
    // list scheduler.
    substitutePass(StandardID: &PostRASchedulerID, TargetID: &PostMachineSchedulerID);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  // SelectionDAG / shared pipeline hooks.
  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  // GlobalISel pipeline hooks.
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  void addPreRegBankSelect() override;
  bool addRegBankSelect() override;
  void addPreGlobalInstructionSelect() override;
  bool addGlobalInstructionSelect() override;
  // Register allocation hooks (SGPR/VGPR/WWM are allocated separately).
  void addPreRegAlloc() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  FunctionPass *createSGPRAllocPass(bool Optimized);
  FunctionPass *createVGPRAllocPass(bool Optimized);
  FunctionPass *createWWMRegAllocPass(bool Optimized);
  FunctionPass *createRegAllocPass(bool Optimized) override;

  bool addRegAssignAndRewriteFast() override;
  bool addRegAssignAndRewriteOptimized() override;

  // Late pipeline hooks.
  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
  void addPostBBSections() override;
};

} // end anonymous namespace
1338
// Common legacy pass configuration shared by R600 and GCN: disable standard
// codegen passes that can never apply on AMDGPU.
AMDGPUPassConfig::AMDGPUPassConfig(TargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  disablePass(PassID: &StackMapLivenessID);
  disablePass(PassID: &FuncletLayoutID);
  // Garbage collection is not supported.
  disablePass(PassID: &GCLoweringID);
  disablePass(PassID: &ShadowStackGCLoweringID);
}
1349
1350void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
1351 if (getOptLevel() == CodeGenOptLevel::Aggressive)
1352 addPass(P: createGVNPass());
1353 else
1354 addPass(P: createEarlyCSEPass());
1355}
1356
// Scalar optimizations that mostly target straight-line code (GEP/addressing
// cleanups). Pass order is deliberate — see the inline notes.
void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  if (isPassEnabled(Opt: EnableLoopPrefetch, Level: CodeGenOptLevel::Aggressive))
    addPass(P: createLoopDataPrefetchPass());
  addPass(P: createSeparateConstOffsetFromGEPPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(P: createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(P: createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(P: createEarlyCSEPass());
}
1373
// IR-level portion of the legacy codegen pipeline. Pass order is
// significant throughout; see the inline comments.
void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // Strip functions whose feature requirements the selected GPU cannot meet.
  if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN())
    addPass(P: createAMDGPURemoveIncompatibleFunctionsPass(&TM));

  // There is no reason to run these.
  disablePass(PassID: &StackMapLivenessID);
  disablePass(PassID: &FuncletLayoutID);
  disablePass(PassID: &PatchableFunctionID);

  addPass(P: createAMDGPUPrintfRuntimeBinding());
  if (LowerCtorDtor)
    addPass(P: createAMDGPUCtorDtorLoweringLegacyPass());

  if (TM.getTargetTriple().isAMDGCN() &&
      isPassEnabled(Opt: EnableImageIntrinsicOptimizer))
    addPass(P: createAMDGPUImageIntrinsicOptimizerPass(&TM));

  if (EnableUniformIntrinsicCombine)
    addPass(P: createAMDGPUUniformIntrinsicCombineLegacyPass());

  // This can be disabled by passing ::Disable here or on the command line
  // with --expand-variadics-override=disable.
  addPass(P: createExpandVariadicsPass(ExpandVariadicsMode::Lowering));

  // Function calls are not supported, so make sure we inline everything.
  addPass(P: createAMDGPUAlwaysInlinePass());
  addPass(P: createAlwaysInlinerLegacyPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(P: createR600OpenCLImageTypeLoweringPass());

  // Make enqueued block runtime handles externally visible.
  addPass(P: createAMDGPUExportKernelRuntimeHandlesLegacyPass());

  // Lower special LDS accesses.
  if (EnableLowerExecSync)
    addPass(P: createAMDGPULowerExecSyncLegacyPass());

  // Lower LDS accesses to global memory pass if address sanitizer is enabled.
  if (EnableSwLowerLDS)
    addPass(P: createAMDGPUSwLowerLDSLegacyPass(TM: &TM));

  // Runs before PromoteAlloca so the latter can account for function uses
  if (EnableLowerModuleLDS) {
    addPass(P: createAMDGPULowerModuleLDSLegacyPass(TM: &TM));
  }

  // Run atomic optimizer before Atomic Expand
  if ((TM.getTargetTriple().isAMDGCN()) &&
      (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
    addPass(P: createAMDGPUAtomicOptimizerPass(ScanStrategy: AMDGPUAtomicOptimizerStrategy));
  }

  addPass(P: createAtomicExpandLegacyPass());

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addPass(P: createAMDGPUPromoteAlloca());

    if (isPassEnabled(Opt: EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses();

    // Make the AMDGPU address-space AA available to subsequent IR passes.
    if (EnableAMDGPUAliasAnalysis) {
      addPass(P: createAMDGPUAAWrapperPass());
      addPass(P: createExternalAAWrapperPass(Callback: [](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(AAResult&: WrapperPass->getResult());
      }));
    }

    if (TM.getTargetTriple().isAMDGCN()) {
      // TODO: May want to move later or split into an early and late one.
      addPass(P: createAMDGPUCodeGenPreparePass());
    }

    // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
    // have expanded.
    if (TM.getOptLevel() > CodeGenOptLevel::Less)
      addPass(P: createLICMPass());
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  // %0 = add %a, %b
  // %1 = add %b, %a
  //
  // and
  //
  // %0 = shl nsw %a, 2
  // %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(Opt: EnableScalarIRPasses))
    addEarlyCSEOrGVNPass();
}
1476
// Late IR preparation just before instruction selection. Order matters; see
// the inline comments.
void AMDGPUPassConfig::addCodeGenPrepare() {
  // Preload eligible kernel arguments into SGPRs when optimizing.
  if (TM->getTargetTriple().isAMDGCN() &&
      TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(P: createAMDGPUPreloadKernelArgumentsLegacyPass(TM));

  if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments)
    addPass(P: createAMDGPULowerKernelArgumentsPass());

  TargetPassConfig::addCodeGenPrepare();

  if (isPassEnabled(Opt: EnableLoadStoreVectorizer))
    addPass(P: createLoadStoreVectorizerPass());

  if (TM->getTargetTriple().isAMDGCN()) {
    // This lowering has been placed after codegenprepare to take advantage of
    // address mode matching (which is why it isn't put with the LDS lowerings).
    // It could be placed anywhere before uniformity annotations (an analysis
    // that it changes by splitting up fat pointers into their components)
    // but has been put before switch lowering and CFG flattening so that those
    // passes can run on the more optimized control flow this pass creates in
    // many cases.
    addPass(P: createAMDGPULowerBufferFatPointersPass());
    addPass(P: createAMDGPULowerIntrinsicsLegacyPass());
  }

  // LowerSwitch pass may introduce unreachable blocks that can
  // cause unexpected behavior for subsequent passes. Placing it
  // here seems better that these blocks would get cleaned up by
  // UnreachableBlockElim inserted next in the pass flow.
  addPass(P: createLowerSwitchPass());
}
1508
1509bool AMDGPUPassConfig::addPreISel() {
1510 if (TM->getOptLevel() > CodeGenOptLevel::None)
1511 addPass(P: createFlattenCFGPass());
1512 return false;
1513}
1514
// Install the AMDGPU SelectionDAG instruction selector.
bool AMDGPUPassConfig::addInstSelector() {
  addPass(P: createAMDGPUISelDag(TM&: getAMDGPUTargetMachine(), OptLevel: getOptLevel()));
  return false;
}
1519
// Garbage-collection lowering is intentionally skipped on AMDGPU.
bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}
1524
1525//===----------------------------------------------------------------------===//
1526// GCN Legacy Pass Setup
1527//===----------------------------------------------------------------------===//
1528
// GCN pre-ISel: structurize control flow and annotate uniformity so the
// selectors can lower divergent control flow. Pass order is significant.
bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(P: createSinkingPass());

  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(P: createAMDGPULateCodeGenPrepareLegacyPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(PassID: &AMDGPUUnifyDivergentExitNodesID);
  // StructurizeCFG needs reducible loops with single exits.
  addPass(P: createFixIrreduciblePass());
  addPass(P: createUnifyLoopExitsPass());
  addPass(P: createStructurizeCFGPass(SkipUniformRegions: false)); // true -> SkipUniformRegions

  addPass(P: createAMDGPUAnnotateUniformValuesLegacy());
  addPass(P: createSIAnnotateControlFlowLegacyPass());
  // TODO: Move this right after structurizeCFG to avoid extra divergence
  // analysis. This depends on stopping SIAnnotateControlFlow from making
  // control flow modifications.
  addPass(P: createAMDGPURewriteUndefForPHILegacyPass());

  // SDAG requires LCSSA, GlobalISel does not. Disable LCSSA for -global-isel
  // with -new-reg-bank-select and without any of the fallback options.
  if (!getCGPassBuilderOption().EnableGlobalISelOption ||
      !isGlobalISelAbortEnabled() || !NewRegBankSelect)
    addPass(P: createLCSSAPass());

  if (TM->getOptLevel() > CodeGenOptLevel::Less)
    addPass(PassID: &AMDGPUPerfHintAnalysisLegacyID);

  return false;
}
1563
// Machine-SSA cleanups after the generic optimizations: operand folding,
// DPP combining, load/store merging, and SDWA peepholes.
void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(PassID: &SIFoldOperandsLegacyID);
  if (EnableDPPCombine)
    addPass(PassID: &GCNDPPCombineLegacyID);
  addPass(PassID: &SILoadStoreOptimizerLegacyID);
  if (isPassEnabled(Opt: EnableSDWAPeephole)) {
    // SDWA rewriting exposes more folding/CSE, so re-run those afterwards.
    addPass(PassID: &SIPeepholeSDWALegacyID);
    addPass(PassID: &EarlyMachineLICMID);
    addPass(PassID: &MachineCSELegacyID);
    addPass(PassID: &SIFoldOperandsLegacyID);
  }
  addPass(PassID: &DeadMachineInstructionElimID);
  addPass(P: createSIShrinkInstructionsLegacyPass());
}
1587
1588bool GCNPassConfig::addILPOpts() {
1589 if (EnableEarlyIfConversion)
1590 addPass(PassID: &EarlyIfConverterLegacyID);
1591
1592 TargetPassConfig::addILPOpts();
1593 return false;
1594}
1595
// SelectionDAG instruction selection, followed by AMDGPU post-isel fixups
// for SGPR copies and i1 copies. The fixups must run in this order.
bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(PassID: &SIFixSGPRCopiesLegacyID);
  addPass(P: createSILowerI1CopiesLegacyPass());
  return false;
}
1602
1603bool GCNPassConfig::addIRTranslator() {
1604 addPass(P: new IRTranslator(getOptLevel()));
1605 return false;
1606}
1607
1608void GCNPassConfig::addPreLegalizeMachineIR() {
1609 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1610 addPass(P: createAMDGPUPreLegalizeCombiner(IsOptNone));
1611 addPass(P: new Localizer());
1612}
1613
1614bool GCNPassConfig::addLegalizeMachineIR() {
1615 addPass(P: new Legalizer());
1616 return false;
1617}
1618
1619void GCNPassConfig::addPreRegBankSelect() {
1620 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1621 addPass(P: createAMDGPUPostLegalizeCombiner(IsOptNone));
1622 addPass(P: createAMDGPUGlobalISelDivergenceLoweringPass());
1623}
1624
1625bool GCNPassConfig::addRegBankSelect() {
1626 if (NewRegBankSelect) {
1627 addPass(P: createAMDGPURegBankSelectPass());
1628 addPass(P: createAMDGPURegBankLegalizePass());
1629 } else {
1630 addPass(P: new RegBankSelect());
1631 }
1632 return false;
1633}
1634
1635void GCNPassConfig::addPreGlobalInstructionSelect() {
1636 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1637 addPass(P: createAMDGPURegBankCombiner(IsOptNone));
1638}
1639
1640bool GCNPassConfig::addGlobalInstructionSelect() {
1641 addPass(P: new InstructionSelect(getOptLevel()));
1642 return false;
1643}
1644
// Set up the fast (-O0) register allocation pipeline, inserting the AMDGPU
// control-flow lowering passes at the required points.
void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(TargetPassID: &PHIEliminationID, InsertedPassID: &SILowerControlFlowLegacyID);

  insertPass(TargetPassID: &TwoAddressInstructionPassID, InsertedPassID: &SIWholeQuadModeID);

  TargetPassConfig::addFastRegAlloc();
}
1658
1659void GCNPassConfig::addPreRegAlloc() {
1660 if (getOptLevel() != CodeGenOptLevel::None)
1661 addPass(PassID: &AMDGPUPrepareAGPRAllocLegacyID);
1662}
1663
// Set up the optimized register allocation pipeline, inserting AMDGPU passes
// at precise points relative to the generic passes. Insertion anchors matter;
// see the inline comments.
void GCNPassConfig::addOptimizedRegAlloc() {
  if (EnableDCEInRA)
    insertPass(TargetPassID: &DetectDeadLanesID, InsertedPassID: &DeadMachineInstructionElimID);

  // FIXME: when an instruction has a Killed operand, and the instruction is
  // inside a bundle, seems only the BUNDLE instruction appears as the Kills of
  // the register in LiveVariables, this would trigger a failure in verifier,
  // we should fix it and enable the verifier.
  if (OptVGPRLiveRange)
    insertPass(TargetPassID: &LiveVariablesID, InsertedPassID: &SIOptimizeVGPRLiveRangeLegacyID);

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(TargetPassID: &PHIEliminationID, InsertedPassID: &SILowerControlFlowLegacyID);

  if (EnableRewritePartialRegUses)
    insertPass(TargetPassID: &RenameIndependentSubregsID, InsertedPassID: &GCNRewritePartialRegUsesID);

  if (isPassEnabled(Opt: EnablePreRAOptimizations))
    insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &GCNPreRAOptimizationsID);

  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
  // instructions that cause scheduling barriers.
  insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &SIWholeQuadModeID);

  if (OptExecMaskPreRA)
    insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &SIOptimizeExecMaskingPreRAID);

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM->getOptLevel() > CodeGenOptLevel::Less)
    insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &SIFormMemoryClausesID);

  TargetPassConfig::addOptimizedRegAlloc();
}
1700
1701bool GCNPassConfig::addPreRewrite() {
1702 if (EnableRegReassign)
1703 addPass(PassID: &GCNNSAReassignID);
1704
1705 addPass(PassID: &AMDGPURewriteAGPRCopyMFMALegacyID);
1706 return true;
1707}
1708
1709FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
1710 // Initialize the global default.
1711 llvm::call_once(flag&: InitializeDefaultSGPRRegisterAllocatorFlag,
1712 F&: initializeDefaultSGPRRegisterAllocatorOnce);
1713
1714 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
1715 if (Ctor != useDefaultRegisterAllocator)
1716 return Ctor();
1717
1718 if (Optimized)
1719 return createGreedyRegisterAllocator(F: onlyAllocateSGPRs);
1720
1721 return createFastRegisterAllocator(F: onlyAllocateSGPRs, ClearVirtRegs: false);
1722}
1723
1724FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
1725 // Initialize the global default.
1726 llvm::call_once(flag&: InitializeDefaultVGPRRegisterAllocatorFlag,
1727 F&: initializeDefaultVGPRRegisterAllocatorOnce);
1728
1729 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
1730 if (Ctor != useDefaultRegisterAllocator)
1731 return Ctor();
1732
1733 if (Optimized)
1734 return createGreedyVGPRRegisterAllocator();
1735
1736 return createFastVGPRRegisterAllocator();
1737}
1738
1739FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
1740 // Initialize the global default.
1741 llvm::call_once(flag&: InitializeDefaultWWMRegisterAllocatorFlag,
1742 F&: initializeDefaultWWMRegisterAllocatorOnce);
1743
1744 RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
1745 if (Ctor != useDefaultRegisterAllocator)
1746 return Ctor();
1747
1748 if (Optimized)
1749 return createGreedyWWMRegisterAllocator();
1750
1751 return createFastWWMRegisterAllocator();
1752}
1753
// The generic single-allocator hook is unsupported here: AMDGPU splits
// register allocation into separate SGPR/WWM/VGPR allocation passes (see
// createSGPRAllocPass and friends above).
FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}
1757
// Error text reported when the generic -regalloc option is used with amdgcn;
// the per-register-class allocator options must be used instead.
static const char RegAllocOptNotSupportedMessage[] =
    "-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, "
    "and -vgpr-regalloc";
1761
// Fast (-O0) register assignment: allocate SGPRs first, then WWM registers,
// then per-thread VGPRs, interleaving the AMDGPU spill/WWM lowering passes.
bool GCNPassConfig::addRegAssignAndRewriteFast() {
  if (!usingDefaultRegAlloc())
    reportFatalUsageError(reason: RegAllocOptNotSupportedMessage);

  addPass(PassID: &GCNPreRALongBranchRegID);

  addPass(P: createSGPRAllocPass(Optimized: false));

  // Equivalent of PEI for SGPRs.
  addPass(PassID: &SILowerSGPRSpillsLegacyID);

  // To allocate wwm registers used in whole quad mode operations (for shaders).
  addPass(PassID: &SIPreAllocateWWMRegsLegacyID);

  // For allocating other wwm register operands.
  addPass(P: createWWMRegAllocPass(Optimized: false));

  addPass(PassID: &SILowerWWMCopiesLegacyID);
  addPass(PassID: &AMDGPUReserveWWMRegsLegacyID);

  // For allocating per-thread VGPRs.
  addPass(P: createVGPRAllocPass(Optimized: false));

  return true;
}
1787
// Optimized register assignment: greedy SGPR allocation, SGPR spill lowering,
// WWM allocation, then per-thread VGPR allocation, with a rewrite after each
// allocation stage.
bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    reportFatalUsageError(reason: RegAllocOptNotSupportedMessage);

  addPass(PassID: &GCNPreRALongBranchRegID);

  addPass(P: createSGPRAllocPass(Optimized: true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(P: createVirtRegRewriter(ClearVirtRegs: false));

  // At this point, the sgpr-regalloc has been done and it is good to have the
  // stack slot coloring to try to optimize the SGPR spill stack indices before
  // attempting the custom SGPR spill lowering.
  addPass(PassID: &StackSlotColoringID);

  // Equivalent of PEI for SGPRs.
  addPass(PassID: &SILowerSGPRSpillsLegacyID);

  // To allocate wwm registers used in whole quad mode operations (for shaders).
  addPass(PassID: &SIPreAllocateWWMRegsLegacyID);

  // For allocating other whole wave mode registers.
  addPass(P: createWWMRegAllocPass(Optimized: true));
  addPass(PassID: &SILowerWWMCopiesLegacyID);
  addPass(P: createVirtRegRewriter(ClearVirtRegs: false));
  addPass(PassID: &AMDGPUReserveWWMRegsLegacyID);

  // For allocating per-thread VGPRs.
  addPass(P: createVGPRAllocPass(Optimized: true));

  addPreRewrite();
  addPass(PassID: &VirtRegRewriterID);

  addPass(PassID: &AMDGPUMarkLastScratchLoadID);

  return true;
}
1829
// Post-RA fixups: repair VGPR copies, and above -O0 optimize exec masking,
// before the generic post-RA passes.
void GCNPassConfig::addPostRegAlloc() {
  addPass(PassID: &SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(PassID: &SIOptimizeExecMaskingLegacyID);
  TargetPassConfig::addPostRegAlloc();
}
1836
// Before the second (post-RA) scheduler: shrink instructions above -O0 and
// form post-RA bundles.
void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(P: createSIShrinkInstructionsLegacyPass());
  addPass(PassID: &SIPostRABundlerLegacyID);
}
1842
// Late pre-emission pipeline: memory legalization, waitcnt insertion, mode
// register handling, hazard handling and finally branch relaxation. The
// ordering below is significant.
void GCNPassConfig::addPreEmitPass() {
  if (isPassEnabled(Opt: EnableVOPD, Level: CodeGenOptLevel::Less))
    addPass(PassID: &GCNCreateVOPDID);
  addPass(P: createSIMemoryLegalizerPass());
  addPass(P: createSIInsertWaitcntsPass());

  addPass(P: createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(PassID: &SIInsertHardClausesID);

  addPass(PassID: &SILateBranchLoweringPassID);
  if (isPassEnabled(Opt: EnableSetWavePriority, Level: CodeGenOptLevel::Less))
    addPass(P: createAMDGPUSetWavePriorityPass());
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(PassID: &SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(PassID: &PostRAHazardRecognizerID);

  addPass(PassID: &AMDGPUWaitSGPRHazardsLegacyID);

  addPass(PassID: &AMDGPULowerVGPREncodingLegacyID);

  if (isPassEnabled(Opt: EnableInsertDelayAlu, Level: CodeGenOptLevel::Less))
    addPass(PassID: &AMDGPUInsertDelayAluID);

  addPass(PassID: &BranchRelaxationPassID);
}
1878
// Runs after basic-block sections have been assigned.
void GCNPassConfig::addPostBBSections() {
  // We run this later to avoid passes like livedebugvalues and BBSections
  // having to deal with the apparent multi-entry functions we may generate.
  addPass(P: createAMDGPUPreloadKernArgPrologLegacyPass());
}
1884
1885TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
1886 return new GCNPassConfig(*this, PM);
1887}
1888
1889void GCNTargetMachine::registerMachineRegisterInfoCallback(
1890 MachineFunction &MF) const {
1891 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1892 MF.getRegInfo().addDelegate(delegate: MFI);
1893}
1894
1895MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
1896 BumpPtrAllocator &Allocator, const Function &F,
1897 const TargetSubtargetInfo *STI) const {
1898 return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
1899 Allocator, F, STI: static_cast<const GCNSubtarget *>(STI));
1900}
1901
1902yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
1903 return new yaml::SIMachineFunctionInfo();
1904}
1905
1906yaml::MachineFunctionInfo *
1907GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
1908 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1909 return new yaml::SIMachineFunctionInfo(
1910 *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
1911}
1912
// Deserialize SI-specific machine function info from parsed MIR YAML into the
// in-memory SIMachineFunctionInfo. Returns true on error (setting Error and
// SourceRange for the diagnostic), false on success.
bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fixup the subtarget dependent default value.
    MFI->Occupancy = ST.getOccupancyWithWorkGroupSizes(MF).second;
  }

  // Parse a named register reference; on failure record its source range.
  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, Reg&: TempReg, Src: RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  // Like parseRegister, but an empty string means the field is absent and is
  // not an error.
  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
                            MFI->LongBranchReservedReg))
    return true;

  // Report a "wrong register class" error for the given register field.
  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(i: PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         {}, {});
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  // Validate the register classes of the stack-related registers; the
  // sentinel values (PRIVATE_RSRC_REG/FP_REG/SP_REG) are always accepted.
  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(Reg: MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(Reg: MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(Reg: MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  // Reserve any WWM registers listed in the YAML.
  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(Reg: ParsedReg);
  }

  // Propagate per-virtual-register flags parsed from the MIR body.
  for (const auto &[_, Info] : PFS.VRegInfosNamed) {
    MFI->setFlag(Reg: Info->VReg, Flag: Info->Flags);
  }
  for (const auto &[_, Info] : PFS.VRegInfos) {
    MFI->setFlag(Reg: Info->VReg, Flag: Info->Flags);
  }

  for (const auto &YamlRegStr : YamlMFI.SpillPhysVGPRS) {
    Register ParsedReg;
    if (parseRegister(YamlRegStr, ParsedReg))
      return true;
    MFI->SpillPhysVGPRs.push_back(Elt: ParsedReg);
  }

  // Parse one optional argument descriptor (register or stack slot),
  // validating the register class and accumulating the user/system SGPR
  // counts it contributes.
  auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, Src: A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(Offset: A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, Mask: *A->Mask);

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.LDSKernelId, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  // Parse FirstKernArgPreloadReg separately, since it's a Register,
  // not ArgDescriptor.
  if (YamlMFI.ArgInfo && YamlMFI.ArgInfo->FirstKernArgPreloadReg) {
    const yaml::SIArgument &A = *YamlMFI.ArgInfo->FirstKernArgPreloadReg;

    if (!A.IsRegister) {
      // For stack arguments, we don't have RegisterName.SourceRange,
      // but we should have some location info from the YAML parser
      const MemoryBuffer &Buffer =
          *PFS.SM->getMemoryBuffer(i: PFS.SM->getMainFileID());
      // Create a minimal valid source range
      SMLoc Loc = SMLoc::getFromPointer(Ptr: Buffer.getBufferStart());
      SMRange Range(Loc, Loc);

      Error = SMDiagnostic(
          *PFS.SM, Loc, Buffer.getBufferIdentifier(), 1, 0, SourceMgr::DK_Error,
          "firstKernArgPreloadReg must be a register, not a stack location", "",
          {}, {});

      SourceRange = Range;
      return true;
    }

    Register Reg;
    if (parseNamedRegisterReference(PFS, Reg, Src: A.RegisterName.Value, Error)) {
      SourceRange = A.RegisterName.SourceRange;
      return true;
    }

    if (!AMDGPU::SGPR_32RegClass.contains(Reg))
      return diagnoseRegisterClass(A.RegisterName);

    MFI->ArgInfo.FirstKernArgPreloadReg = Reg;
    MFI->NumUserSGPRs += YamlMFI.NumKernargPreloadSGPRs;
  }

  // Mode bits are only honored if the subtarget actually has the mode.
  if (ST.hasIEEEMode())
    MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  if (ST.hasDX10ClampMode())
    MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;

  // FIXME: Move proper support for denormal-fp-math into base MachineFunction
  MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
                                      ? DenormalMode::IEEE
                                      : DenormalMode::PreserveSign;
  MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
                                       ? DenormalMode::IEEE
                                       : DenormalMode::PreserveSign;

  MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
                                          ? DenormalMode::IEEE
                                          : DenormalMode::PreserveSign;
  MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
                                           ? DenormalMode::IEEE
                                           : DenormalMode::PreserveSign;

  if (YamlMFI.HasInitWholeWave)
    MFI->setInitWholeWave();

  return false;
}
2155
2156//===----------------------------------------------------------------------===//
2157// AMDGPU CodeGen Pass Builder interface.
2158//===----------------------------------------------------------------------===//
2159
// New-pass-manager pass builder for GCN code generation. Configures the
// generic builder options and disables passes that can never fire on AMDGPU.
AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder(
    GCNTargetMachine &TM, const CGPassBuilderOption &Opts,
    PassInstrumentationCallbacks *PIC)
    : CodeGenPassBuilder(TM, Opts, PIC) {
  // Use the MI scheduler post-RA, and require codegen to visit functions in
  // call-graph SCC order.
  Opt.MISchedPostRA = true;
  Opt.RequiresCodeGenSCCOrder = true;
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  // Garbage collection is not supported.
  disablePass<StackMapLivenessPass, FuncletLayoutPass, PatchableFunctionPass,
              ShadowStackGCLoweringPass, GCLoweringPass>();
}
2172
// Build the AMDGPU IR optimization pipeline for the new pass manager:
// module-level lowerings first, then per-function scalar optimizations.
// Mirrors the legacy AMDGPUPassConfig::addIRPasses pipeline.
void AMDGPUCodeGenPassBuilder::addIRPasses(PassManagerWrapper &PMW) const {
  if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN()) {
    flushFPMsToMPM(PMW);
    addModulePass(Pass: AMDGPURemoveIncompatibleFunctionsPass(TM), PMW);
  }

  flushFPMsToMPM(PMW);
  addModulePass(Pass: AMDGPUPrintfRuntimeBindingPass(), PMW);
  if (LowerCtorDtor)
    addModulePass(Pass: AMDGPUCtorDtorLoweringPass(), PMW);

  if (isPassEnabled(Opt: EnableImageIntrinsicOptimizer))
    addFunctionPass(Pass: AMDGPUImageIntrinsicOptimizerPass(TM), PMW);

  if (EnableUniformIntrinsicCombine)
    addFunctionPass(Pass: AMDGPUUniformIntrinsicCombinePass(), PMW);
  // This can be disabled by passing ::Disable here or on the command line
  // with --expand-variadics-override=disable.
  flushFPMsToMPM(PMW);
  addModulePass(Pass: ExpandVariadicsPass(ExpandVariadicsMode::Lowering), PMW);

  addModulePass(Pass: AMDGPUAlwaysInlinePass(), PMW);
  addModulePass(Pass: AlwaysInlinerPass(), PMW);

  addModulePass(Pass: AMDGPUExportKernelRuntimeHandlesPass(), PMW);

  if (EnableLowerExecSync)
    addModulePass(Pass: AMDGPULowerExecSyncPass(), PMW);

  if (EnableSwLowerLDS)
    addModulePass(Pass: AMDGPUSwLowerLDSPass(TM), PMW);

  // Runs before PromoteAlloca so the latter can account for function uses
  if (EnableLowerModuleLDS)
    addModulePass(Pass: AMDGPULowerModuleLDSPass(TM), PMW);

  // Run atomic optimizer before Atomic Expand
  if (TM.getOptLevel() >= CodeGenOptLevel::Less &&
      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None))
    addFunctionPass(
        Pass: AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy), PMW);

  addFunctionPass(Pass: AtomicExpandPass(TM), PMW);

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addFunctionPass(Pass: AMDGPUPromoteAllocaPass(TM), PMW);
    if (isPassEnabled(Opt: EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses(PMW);

    // TODO: Handle EnableAMDGPUAliasAnalysis

    // TODO: May want to move later or split into an early and late one.
    addFunctionPass(Pass: AMDGPUCodeGenPreparePass(TM), PMW);

    // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
    // have expanded.
    if (TM.getOptLevel() > CodeGenOptLevel::Less) {
      addFunctionPass(Pass: createFunctionToLoopPassAdaptor(Pass: LICMPass(LICMOptions()),
                                                    /*UseMemorySSA=*/true),
                      PMW);
    }
  }

  Base::addIRPasses(PMW);

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  // %0 = add %a, %b
  // %1 = add %b, %a
  //
  // and
  //
  // %0 = shl nsw %a, 2
  // %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(Opt: EnableScalarIRPasses))
    addEarlyCSEOrGVNPass(PMW);
}
2253
// CodeGenPrepare stage (new pass manager): kernel-argument handling, the
// generic CodeGenPrepare, then AMDGPU lowerings that benefit from running
// after address-mode matching.
void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(
    PassManagerWrapper &PMW) const {
  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    flushFPMsToMPM(PMW);
    addModulePass(Pass: AMDGPUPreloadKernelArgumentsPass(TM), PMW);
  }

  if (EnableLowerKernelArguments)
    addFunctionPass(Pass: AMDGPULowerKernelArgumentsPass(TM), PMW);

  Base::addCodeGenPrepare(PMW);

  if (isPassEnabled(Opt: EnableLoadStoreVectorizer))
    addFunctionPass(Pass: LoadStoreVectorizerPass(), PMW);

  // This lowering has been placed after codegenprepare to take advantage of
  // address mode matching (which is why it isn't put with the LDS lowerings).
  // It could be placed anywhere before uniformity annotations (an analysis
  // that it changes by splitting up fat pointers into their components)
  // but has been put before switch lowering and CFG flattening so that those
  // passes can run on the more optimized control flow this pass creates in
  // many cases.
  flushFPMsToMPM(PMW);
  addModulePass(Pass: AMDGPULowerBufferFatPointersPass(TM), PMW);
  flushFPMsToMPM(PMW);
  requireCGSCCOrder(PMW);

  addModulePass(Pass: AMDGPULowerIntrinsicsPass(TM), PMW);

  // LowerSwitch pass may introduce unreachable blocks that can cause unexpected
  // behavior for subsequent passes. Placing it here seems better, as these
  // blocks will get cleaned up by UnreachableBlockElim inserted next in the
  // pass flow.
  addFunctionPass(Pass: LowerSwitchPass(), PMW);
}
2289
// Pre-instruction-selection pipeline (new pass manager): CFG structurization
// and uniformity/control-flow annotation needed by instruction selection.
void AMDGPUCodeGenPassBuilder::addPreISel(PassManagerWrapper &PMW) const {

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addFunctionPass(Pass: FlattenCFGPass(), PMW);
    addFunctionPass(Pass: SinkingPass(), PMW);
    addFunctionPass(Pass: AMDGPULateCodeGenPreparePass(TM), PMW);
  }

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.

  addFunctionPass(Pass: AMDGPUUnifyDivergentExitNodesPass(), PMW);
  addFunctionPass(Pass: FixIrreduciblePass(), PMW);
  addFunctionPass(Pass: UnifyLoopExitsPass(), PMW);
  addFunctionPass(Pass: StructurizeCFGPass(/*SkipUniformRegions=*/false), PMW);

  addFunctionPass(Pass: AMDGPUAnnotateUniformValuesPass(), PMW);

  addFunctionPass(Pass: SIAnnotateControlFlowPass(TM), PMW);

  // TODO: Move this right after structurizeCFG to avoid extra divergence
  // analysis. This depends on stopping SIAnnotateControlFlow from making
  // control flow modifications.
  addFunctionPass(Pass: AMDGPURewriteUndefForPHIPass(), PMW);

  // SDAG requires LCSSA, GlobalISel does not. Disable LCSSA for -global-isel
  // with -new-reg-bank-select and without any of the fallback options.
  if (!getCGPassBuilderOption().EnableGlobalISelOption ||
      !isGlobalISelAbortEnabled() || !NewRegBankSelect)
    addFunctionPass(Pass: LCSSAPass(), PMW);

  if (TM.getOptLevel() > CodeGenOptLevel::Less) {
    flushFPMsToMPM(PMW);
    addModulePass(Pass: AMDGPUPerfHintAnalysisPass(TM), PMW);
  }

  // FIXME: Why isn't this queried as required from AMDGPUISelDAGToDAG, and why
  // isn't this in addInstSelector?
  addFunctionPass(Pass: RequireAnalysisPass<UniformityInfoAnalysis, Function>(), PMW,
                  /*Force=*/true);
}
2329
2330void AMDGPUCodeGenPassBuilder::addILPOpts(PassManagerWrapper &PMW) const {
2331 if (EnableEarlyIfConversion)
2332 addMachineFunctionPass(Pass: EarlyIfConverterPass(), PMW);
2333
2334 Base::addILPOpts(PMW);
2335}
2336
// Intentionally empty: new-PM AsmPrinter support is not implemented yet.
void AMDGPUCodeGenPassBuilder::addAsmPrinterBegin(
    PassManagerWrapper &PMW, CreateMCStreamer CreateStreamer) const {
  // TODO: Add AsmPrinterBegin
}
2341
// Intentionally empty: new-PM AsmPrinter support is not implemented yet.
void AMDGPUCodeGenPassBuilder::addAsmPrinter(
    PassManagerWrapper &PMW, CreateMCStreamer CreateStreamer) const {
  // TODO: Add AsmPrinter.
}
2346
// Intentionally empty: new-PM AsmPrinter support is not implemented yet.
void AMDGPUCodeGenPassBuilder::addAsmPrinterEnd(
    PassManagerWrapper &PMW, CreateMCStreamer CreateStreamer) const {
  // TODO: Add AsmPrinterEnd
}
2351
// SelectionDAG instruction selection followed by AMDGPU post-isel fixups for
// SGPR copies and i1 copies (new pass manager). Ordering matters.
Error AMDGPUCodeGenPassBuilder::addInstSelector(PassManagerWrapper &PMW) const {
  addMachineFunctionPass(Pass: AMDGPUISelDAGToDAGPass(TM), PMW);
  addMachineFunctionPass(Pass: SIFixSGPRCopiesPass(), PMW);
  addMachineFunctionPass(Pass: SILowerI1CopiesPass(), PMW);
  return Error::success();
}
2358
2359void AMDGPUCodeGenPassBuilder::addPreRewrite(PassManagerWrapper &PMW) const {
2360 if (EnableRegReassign) {
2361 addMachineFunctionPass(Pass: GCNNSAReassignPass(), PMW);
2362 }
2363
2364 addMachineFunctionPass(Pass: AMDGPURewriteAGPRCopyMFMAPass(), PMW);
2365}
2366
// AMDGPU machine-SSA cleanups for the new pass manager; mirrors the legacy
// GCNPassConfig::addMachineSSAOptimization pipeline.
void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization(
    PassManagerWrapper &PMW) const {
  Base::addMachineSSAOptimization(PMW);

  // Fold operands after the generic SSA optimizations have eliminated extra
  // copies, making the real source operands easier to fold; clean up dead
  // instructions afterwards (same rationale as the legacy pipeline).
  addMachineFunctionPass(Pass: SIFoldOperandsPass(), PMW);
  if (EnableDPPCombine) {
    addMachineFunctionPass(Pass: GCNDPPCombinePass(), PMW);
  }
  addMachineFunctionPass(Pass: SILoadStoreOptimizerPass(), PMW);
  if (isPassEnabled(Opt: EnableSDWAPeephole)) {
    addMachineFunctionPass(Pass: SIPeepholeSDWAPass(), PMW);
    addMachineFunctionPass(Pass: EarlyMachineLICMPass(), PMW);
    addMachineFunctionPass(Pass: MachineCSEPass(), PMW);
    // SDWA/LICM/CSE may expose new folding opportunities, so fold again.
    addMachineFunctionPass(Pass: SIFoldOperandsPass(), PMW);
  }
  addMachineFunctionPass(Pass: DeadMachineInstructionElimPass(), PMW);
  addMachineFunctionPass(Pass: SIShrinkInstructionsPass(), PMW);
}
2385
// Fast (-O0) register allocation setup for the new pass manager.
Error AMDGPUCodeGenPassBuilder::addFastRegAlloc(PassManagerWrapper &PMW) const {
  // This must run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else
  // (same constraint as the legacy pipeline).
  insertPass<PHIEliminationPass>(Pass: SILowerControlFlowPass());

  insertPass<TwoAddressInstructionPass>(Pass: SIWholeQuadModePass());

  return Base::addFastRegAlloc(PMW);
}
2393
// Register assignment for the new pass manager: SGPRs, then WWM registers,
// then per-thread VGPRs, each honoring the per-class allocator option.
Error AMDGPUCodeGenPassBuilder::addRegAssignmentFast(
    PassManagerWrapper &PMW) const {
  if (auto Err = validateRegAllocOptions())
    return Err;

  addMachineFunctionPass(Pass: GCNPreRALongBranchRegPass(), PMW);

  // SGPR allocation - default to fast at -O0.
  if (SGPRRegAllocNPM == RegAllocType::Greedy)
    addMachineFunctionPass(Pass: RAGreedyPass({onlyAllocateSGPRs, "sgpr"}), PMW);
  else
    addMachineFunctionPass(Pass: RegAllocFastPass({onlyAllocateSGPRs, "sgpr", false}),
                           PMW);

  // Equivalent of PEI for SGPRs.
  addMachineFunctionPass(Pass: SILowerSGPRSpillsPass(), PMW);

  // To allocate wwm registers used in whole quad mode operations (for shaders).
  addMachineFunctionPass(Pass: SIPreAllocateWWMRegsPass(), PMW);

  // WWM allocation - default to fast at -O0.
  if (WWMRegAllocNPM == RegAllocType::Greedy)
    addMachineFunctionPass(Pass: RAGreedyPass({onlyAllocateWWMRegs, "wwm"}), PMW);
  else
    addMachineFunctionPass(
        Pass: RegAllocFastPass({onlyAllocateWWMRegs, "wwm", false}), PMW);

  addMachineFunctionPass(Pass: SILowerWWMCopiesPass(), PMW);
  addMachineFunctionPass(Pass: AMDGPUReserveWWMRegsPass(), PMW);

  // VGPR allocation - default to fast at -O0.
  if (VGPRRegAllocNPM == RegAllocType::Greedy)
    addMachineFunctionPass(Pass: RAGreedyPass({onlyAllocateVGPRs, "vgpr"}), PMW);
  else
    addMachineFunctionPass(Pass: RegAllocFastPass({onlyAllocateVGPRs, "vgpr"}), PMW);

  return Error::success();
}
2432
2433Error AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
2434 PassManagerWrapper &PMW) const {
2435 if (EnableDCEInRA)
2436 insertPass<DetectDeadLanesPass>(Pass: DeadMachineInstructionElimPass());
2437
2438 // FIXME: when an instruction has a Killed operand, and the instruction is
2439 // inside a bundle, seems only the BUNDLE instruction appears as the Kills of
2440 // the register in LiveVariables, this would trigger a failure in verifier,
2441 // we should fix it and enable the verifier.
2442 if (OptVGPRLiveRange)
2443 insertPass<RequireAnalysisPass<LiveVariablesAnalysis, MachineFunction>>(
2444 Pass: SIOptimizeVGPRLiveRangePass());
2445
2446 // This must be run immediately after phi elimination and before
2447 // TwoAddressInstructions, otherwise the processing of the tied operand of
2448 // SI_ELSE will introduce a copy of the tied operand source after the else.
2449 insertPass<PHIEliminationPass>(Pass: SILowerControlFlowPass());
2450
2451 if (EnableRewritePartialRegUses)
2452 insertPass<RenameIndependentSubregsPass>(Pass: GCNRewritePartialRegUsesPass());
2453
2454 if (isPassEnabled(Opt: EnablePreRAOptimizations))
2455 insertPass<MachineSchedulerPass>(Pass: GCNPreRAOptimizationsPass());
2456
2457 // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
2458 // instructions that cause scheduling barriers.
2459 insertPass<MachineSchedulerPass>(Pass: SIWholeQuadModePass());
2460
2461 if (OptExecMaskPreRA)
2462 insertPass<MachineSchedulerPass>(Pass: SIOptimizeExecMaskingPreRAPass());
2463
2464 // This is not an essential optimization and it has a noticeable impact on
2465 // compilation time, so we only enable it from O2.
2466 if (TM.getOptLevel() > CodeGenOptLevel::Less)
2467 insertPass<MachineSchedulerPass>(Pass: SIFormMemoryClausesPass());
2468
2469 return Base::addOptimizedRegAlloc(PMW);
2470}
2471
2472void AMDGPUCodeGenPassBuilder::addPreRegAlloc(PassManagerWrapper &PMW) const {
2473 if (getOptLevel() != CodeGenOptLevel::None)
2474 addMachineFunctionPass(Pass: AMDGPUPrepareAGPRAllocPass(), PMW);
2475}
2476
2477Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
2478 PassManagerWrapper &PMW) const {
2479 if (auto Err = validateRegAllocOptions())
2480 return Err;
2481
2482 addMachineFunctionPass(Pass: GCNPreRALongBranchRegPass(), PMW);
2483
2484 // SGPR allocation - default to greedy at -O1 and above.
2485 if (SGPRRegAllocNPM == RegAllocType::Fast)
2486 addMachineFunctionPass(Pass: RegAllocFastPass({onlyAllocateSGPRs, "sgpr", false}),
2487 PMW);
2488 else
2489 addMachineFunctionPass(Pass: RAGreedyPass({onlyAllocateSGPRs, "sgpr"}), PMW);
2490
2491 // Commit allocated register changes. This is mostly necessary because too
2492 // many things rely on the use lists of the physical registers, such as the
2493 // verifier. This is only necessary with allocators which use LiveIntervals,
2494 // since FastRegAlloc does the replacements itself.
2495 addMachineFunctionPass(Pass: VirtRegRewriterPass(false), PMW);
2496
2497 // At this point, the sgpr-regalloc has been done and it is good to have the
2498 // stack slot coloring to try to optimize the SGPR spill stack indices before
2499 // attempting the custom SGPR spill lowering.
2500 addMachineFunctionPass(Pass: StackSlotColoringPass(), PMW);
2501
2502 // Equivalent of PEI for SGPRs.
2503 addMachineFunctionPass(Pass: SILowerSGPRSpillsPass(), PMW);
2504
2505 // To Allocate wwm registers used in whole quad mode operations (for shaders).
2506 addMachineFunctionPass(Pass: SIPreAllocateWWMRegsPass(), PMW);
2507
2508 // WWM allocation - default to greedy at -O1 and above.
2509 if (WWMRegAllocNPM == RegAllocType::Fast)
2510 addMachineFunctionPass(
2511 Pass: RegAllocFastPass({onlyAllocateWWMRegs, "wwm", false}), PMW);
2512 else
2513 addMachineFunctionPass(Pass: RAGreedyPass({onlyAllocateWWMRegs, "wwm"}), PMW);
2514 addMachineFunctionPass(Pass: SILowerWWMCopiesPass(), PMW);
2515 addMachineFunctionPass(Pass: VirtRegRewriterPass(false), PMW);
2516 addMachineFunctionPass(Pass: AMDGPUReserveWWMRegsPass(), PMW);
2517
2518 // VGPR allocation - default to greedy at -O1 and above.
2519 if (VGPRRegAllocNPM == RegAllocType::Fast)
2520 addMachineFunctionPass(Pass: RegAllocFastPass({onlyAllocateVGPRs, "vgpr"}), PMW);
2521 else
2522 addMachineFunctionPass(Pass: RAGreedyPass({onlyAllocateVGPRs, "vgpr"}), PMW);
2523
2524 addPreRewrite(PMW);
2525 addMachineFunctionPass(Pass: VirtRegRewriterPass(true), PMW);
2526
2527 addMachineFunctionPass(Pass: AMDGPUMarkLastScratchLoadPass(), PMW);
2528 return Error::success();
2529}
2530
2531void AMDGPUCodeGenPassBuilder::addPostRegAlloc(PassManagerWrapper &PMW) const {
2532 addMachineFunctionPass(Pass: SIFixVGPRCopiesPass(), PMW);
2533 if (TM.getOptLevel() > CodeGenOptLevel::None)
2534 addMachineFunctionPass(Pass: SIOptimizeExecMaskingPass(), PMW);
2535 Base::addPostRegAlloc(PMW);
2536}
2537
2538void AMDGPUCodeGenPassBuilder::addPreSched2(PassManagerWrapper &PMW) const {
2539 if (TM.getOptLevel() > CodeGenOptLevel::None)
2540 addMachineFunctionPass(Pass: SIShrinkInstructionsPass(), PMW);
2541 addMachineFunctionPass(Pass: SIPostRABundlerPass(), PMW);
2542}
2543
2544void AMDGPUCodeGenPassBuilder::addPostBBSections(
2545 PassManagerWrapper &PMW) const {
2546 // We run this later to avoid passes like livedebugvalues and BBSections
2547 // having to deal with the apparent multi-entry functions we may generate.
2548 addMachineFunctionPass(Pass: AMDGPUPreloadKernArgPrologPass(), PMW);
2549}
2550
2551void AMDGPUCodeGenPassBuilder::addPreEmitPass(PassManagerWrapper &PMW) const {
2552 if (isPassEnabled(Opt: EnableVOPD, Level: CodeGenOptLevel::Less)) {
2553 addMachineFunctionPass(Pass: GCNCreateVOPDPass(), PMW);
2554 }
2555
2556 addMachineFunctionPass(Pass: SIMemoryLegalizerPass(), PMW);
2557 addMachineFunctionPass(Pass: SIInsertWaitcntsPass(), PMW);
2558
2559 addMachineFunctionPass(Pass: SIModeRegisterPass(), PMW);
2560
2561 if (TM.getOptLevel() > CodeGenOptLevel::None)
2562 addMachineFunctionPass(Pass: SIInsertHardClausesPass(), PMW);
2563
2564 addMachineFunctionPass(Pass: SILateBranchLoweringPass(), PMW);
2565
2566 if (isPassEnabled(Opt: EnableSetWavePriority, Level: CodeGenOptLevel::Less))
2567 addMachineFunctionPass(Pass: AMDGPUSetWavePriorityPass(), PMW);
2568
2569 if (TM.getOptLevel() > CodeGenOptLevel::None)
2570 addMachineFunctionPass(Pass: SIPreEmitPeepholePass(), PMW);
2571
2572 // The hazard recognizer that runs as part of the post-ra scheduler does not
2573 // guarantee to be able handle all hazards correctly. This is because if there
2574 // are multiple scheduling regions in a basic block, the regions are scheduled
2575 // bottom up, so when we begin to schedule a region we don't know what
2576 // instructions were emitted directly before it.
2577 //
2578 // Here we add a stand-alone hazard recognizer pass which can handle all
2579 // cases.
2580 addMachineFunctionPass(Pass: PostRAHazardRecognizerPass(), PMW);
2581 addMachineFunctionPass(Pass: AMDGPUWaitSGPRHazardsPass(), PMW);
2582 addMachineFunctionPass(Pass: AMDGPULowerVGPREncodingPass(), PMW);
2583
2584 if (isPassEnabled(Opt: EnableInsertDelayAlu, Level: CodeGenOptLevel::Less)) {
2585 addMachineFunctionPass(Pass: AMDGPUInsertDelayAluPass(), PMW);
2586 }
2587
2588 addMachineFunctionPass(Pass: BranchRelaxationPass(), PMW);
2589}
2590
2591bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt,
2592 CodeGenOptLevel Level) const {
2593 if (Opt.getNumOccurrences())
2594 return Opt;
2595 if (TM.getOptLevel() < Level)
2596 return false;
2597 return Opt;
2598}
2599
2600void AMDGPUCodeGenPassBuilder::addEarlyCSEOrGVNPass(
2601 PassManagerWrapper &PMW) const {
2602 if (TM.getOptLevel() == CodeGenOptLevel::Aggressive)
2603 addFunctionPass(Pass: GVNPass(), PMW);
2604 else
2605 addFunctionPass(Pass: EarlyCSEPass(), PMW);
2606}
2607
2608void AMDGPUCodeGenPassBuilder::addStraightLineScalarOptimizationPasses(
2609 PassManagerWrapper &PMW) const {
2610 if (isPassEnabled(Opt: EnableLoopPrefetch, Level: CodeGenOptLevel::Aggressive))
2611 addFunctionPass(Pass: LoopDataPrefetchPass(), PMW);
2612
2613 addFunctionPass(Pass: SeparateConstOffsetFromGEPPass(), PMW);
2614
2615 // ReassociateGEPs exposes more opportunities for SLSR. See
2616 // the example in reassociate-geps-and-slsr.ll.
2617 addFunctionPass(Pass: StraightLineStrengthReducePass(), PMW);
2618
2619 // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
2620 // EarlyCSE can reuse.
2621 addEarlyCSEOrGVNPass(PMW);
2622
2623 // Run NaryReassociate after EarlyCSE/GVN to be more effective.
2624 addFunctionPass(Pass: NaryReassociatePass(), PMW);
2625
2626 // NaryReassociate on GEPs creates redundant common expressions, so run
2627 // EarlyCSE after it.
2628 addFunctionPass(Pass: EarlyCSEPass(), PMW);
2629}
2630