1//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file contains both AMDGPU target machine and the CodeGen pass builder.
11/// The AMDGPU target machine contains all of the hardware specific information
12/// needed to emit code for SI+ GPUs in the legacy pass manager pipeline. The
13/// CodeGen pass builder handles the pass pipeline for new pass manager.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUTargetMachine.h"
18#include "AMDGPU.h"
19#include "AMDGPUAliasAnalysis.h"
20#include "AMDGPUCtorDtorLowering.h"
21#include "AMDGPUExportClustering.h"
22#include "AMDGPUExportKernelRuntimeHandles.h"
23#include "AMDGPUIGroupLP.h"
24#include "AMDGPUISelDAGToDAG.h"
25#include "AMDGPUMacroFusion.h"
26#include "AMDGPUPerfHintAnalysis.h"
27#include "AMDGPUPreloadKernArgProlog.h"
28#include "AMDGPURemoveIncompatibleFunctions.h"
29#include "AMDGPUReserveWWMRegs.h"
30#include "AMDGPUSplitModule.h"
31#include "AMDGPUTargetObjectFile.h"
32#include "AMDGPUTargetTransformInfo.h"
33#include "AMDGPUUnifyDivergentExitNodes.h"
34#include "AMDGPUWaitSGPRHazards.h"
35#include "GCNDPPCombine.h"
36#include "GCNIterativeScheduler.h"
37#include "GCNNSAReassign.h"
38#include "GCNPreRALongBranchReg.h"
39#include "GCNPreRAOptimizations.h"
40#include "GCNRewritePartialRegUses.h"
41#include "GCNSchedStrategy.h"
42#include "GCNVOPDUtils.h"
43#include "R600.h"
44#include "R600TargetMachine.h"
45#include "SIFixSGPRCopies.h"
46#include "SIFixVGPRCopies.h"
47#include "SIFoldOperands.h"
48#include "SIFormMemoryClauses.h"
49#include "SILoadStoreOptimizer.h"
50#include "SILowerControlFlow.h"
51#include "SILowerSGPRSpills.h"
52#include "SILowerWWMCopies.h"
53#include "SIMachineFunctionInfo.h"
54#include "SIMachineScheduler.h"
55#include "SIOptimizeExecMasking.h"
56#include "SIOptimizeExecMaskingPreRA.h"
57#include "SIOptimizeVGPRLiveRange.h"
58#include "SIPeepholeSDWA.h"
59#include "SIPostRABundler.h"
60#include "SIPreAllocateWWMRegs.h"
61#include "SIShrinkInstructions.h"
62#include "SIWholeQuadMode.h"
63#include "TargetInfo/AMDGPUTargetInfo.h"
64#include "Utils/AMDGPUBaseInfo.h"
65#include "llvm/Analysis/CGSCCPassManager.h"
66#include "llvm/Analysis/CallGraphSCCPass.h"
67#include "llvm/Analysis/KernelInfo.h"
68#include "llvm/Analysis/UniformityAnalysis.h"
69#include "llvm/CodeGen/AtomicExpand.h"
70#include "llvm/CodeGen/BranchRelaxation.h"
71#include "llvm/CodeGen/DeadMachineInstructionElim.h"
72#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
73#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
74#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
75#include "llvm/CodeGen/GlobalISel/Legalizer.h"
76#include "llvm/CodeGen/GlobalISel/Localizer.h"
77#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
78#include "llvm/CodeGen/MIRParser/MIParser.h"
79#include "llvm/CodeGen/MachineCSE.h"
80#include "llvm/CodeGen/MachineLICM.h"
81#include "llvm/CodeGen/MachineScheduler.h"
82#include "llvm/CodeGen/Passes.h"
83#include "llvm/CodeGen/PostRAHazardRecognizer.h"
84#include "llvm/CodeGen/RegAllocRegistry.h"
85#include "llvm/CodeGen/TargetPassConfig.h"
86#include "llvm/IR/IntrinsicsAMDGPU.h"
87#include "llvm/IR/PassManager.h"
88#include "llvm/IR/PatternMatch.h"
89#include "llvm/InitializePasses.h"
90#include "llvm/MC/TargetRegistry.h"
91#include "llvm/Passes/PassBuilder.h"
92#include "llvm/Support/Compiler.h"
93#include "llvm/Support/FormatVariadic.h"
94#include "llvm/Transforms/HipStdPar/HipStdPar.h"
95#include "llvm/Transforms/IPO.h"
96#include "llvm/Transforms/IPO/AlwaysInliner.h"
97#include "llvm/Transforms/IPO/ExpandVariadics.h"
98#include "llvm/Transforms/IPO/GlobalDCE.h"
99#include "llvm/Transforms/IPO/Internalize.h"
100#include "llvm/Transforms/Scalar.h"
101#include "llvm/Transforms/Scalar/EarlyCSE.h"
102#include "llvm/Transforms/Scalar/FlattenCFG.h"
103#include "llvm/Transforms/Scalar/GVN.h"
104#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
105#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
106#include "llvm/Transforms/Scalar/NaryReassociate.h"
107#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
108#include "llvm/Transforms/Scalar/Sink.h"
109#include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h"
110#include "llvm/Transforms/Scalar/StructurizeCFG.h"
111#include "llvm/Transforms/Utils.h"
112#include "llvm/Transforms/Utils/FixIrreducible.h"
113#include "llvm/Transforms/Utils/LCSSA.h"
114#include "llvm/Transforms/Utils/LowerSwitch.h"
115#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
116#include "llvm/Transforms/Utils/UnifyLoopExits.h"
117#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
118#include <optional>
119
120using namespace llvm;
121using namespace llvm::PatternMatch;
122
123namespace {
124class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
125public:
126 SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
127 : RegisterRegAllocBase(N, D, C) {}
128};
129
130class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
131public:
132 VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
133 : RegisterRegAllocBase(N, D, C) {}
134};
135
136class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
137public:
138 WWMRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
139 : RegisterRegAllocBase(N, D, C) {}
140};
141
142static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
143 const MachineRegisterInfo &MRI,
144 const Register Reg) {
145 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
146 return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
147}
148
149static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
150 const MachineRegisterInfo &MRI,
151 const Register Reg) {
152 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
153 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
154}
155
156static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI,
157 const MachineRegisterInfo &MRI,
158 const Register Reg) {
159 const SIMachineFunctionInfo *MFI =
160 MRI.getMF().getInfo<SIMachineFunctionInfo>();
161 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
162 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) &&
163 MFI->checkFlag(Reg, Flag: AMDGPU::VirtRegFlag::WWM_REG);
164}
165
166/// -{sgpr|wwm|vgpr}-regalloc=... command line option.
167static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
168
169/// A dummy default pass factory indicates whether the register allocator is
170/// overridden on the command line.
171static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
172static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
173static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;
174
175static SGPRRegisterRegAlloc
176defaultSGPRRegAlloc("default",
177 "pick SGPR register allocator based on -O option",
178 useDefaultRegisterAllocator);
179
180static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
181 RegisterPassParser<SGPRRegisterRegAlloc>>
182SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(Val: &useDefaultRegisterAllocator),
183 cl::desc("Register allocator to use for SGPRs"));
184
185static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
186 RegisterPassParser<VGPRRegisterRegAlloc>>
187VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(Val: &useDefaultRegisterAllocator),
188 cl::desc("Register allocator to use for VGPRs"));
189
190static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
191 RegisterPassParser<WWMRegisterRegAlloc>>
192 WWMRegAlloc("wwm-regalloc", cl::Hidden,
193 cl::init(Val: &useDefaultRegisterAllocator),
194 cl::desc("Register allocator to use for WWM registers"));
195
196static void initializeDefaultSGPRRegisterAllocatorOnce() {
197 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
198
199 if (!Ctor) {
200 Ctor = SGPRRegAlloc;
201 SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
202 }
203}
204
205static void initializeDefaultVGPRRegisterAllocatorOnce() {
206 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
207
208 if (!Ctor) {
209 Ctor = VGPRRegAlloc;
210 VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
211 }
212}
213
214static void initializeDefaultWWMRegisterAllocatorOnce() {
215 RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
216
217 if (!Ctor) {
218 Ctor = WWMRegAlloc;
219 WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
220 }
221}
222
223static FunctionPass *createBasicSGPRRegisterAllocator() {
224 return createBasicRegisterAllocator(F: onlyAllocateSGPRs);
225}
226
227static FunctionPass *createGreedySGPRRegisterAllocator() {
228 return createGreedyRegisterAllocator(F: onlyAllocateSGPRs);
229}
230
231static FunctionPass *createFastSGPRRegisterAllocator() {
232 return createFastRegisterAllocator(F: onlyAllocateSGPRs, ClearVirtRegs: false);
233}
234
235static FunctionPass *createBasicVGPRRegisterAllocator() {
236 return createBasicRegisterAllocator(F: onlyAllocateVGPRs);
237}
238
239static FunctionPass *createGreedyVGPRRegisterAllocator() {
240 return createGreedyRegisterAllocator(F: onlyAllocateVGPRs);
241}
242
243static FunctionPass *createFastVGPRRegisterAllocator() {
244 return createFastRegisterAllocator(F: onlyAllocateVGPRs, ClearVirtRegs: true);
245}
246
247static FunctionPass *createBasicWWMRegisterAllocator() {
248 return createBasicRegisterAllocator(F: onlyAllocateWWMRegs);
249}
250
251static FunctionPass *createGreedyWWMRegisterAllocator() {
252 return createGreedyRegisterAllocator(F: onlyAllocateWWMRegs);
253}
254
255static FunctionPass *createFastWWMRegisterAllocator() {
256 return createFastRegisterAllocator(F: onlyAllocateWWMRegs, ClearVirtRegs: false);
257}
258
259static SGPRRegisterRegAlloc basicRegAllocSGPR(
260 "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
261static SGPRRegisterRegAlloc greedyRegAllocSGPR(
262 "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
263
264static SGPRRegisterRegAlloc fastRegAllocSGPR(
265 "fast", "fast register allocator", createFastSGPRRegisterAllocator);
266
267
268static VGPRRegisterRegAlloc basicRegAllocVGPR(
269 "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
270static VGPRRegisterRegAlloc greedyRegAllocVGPR(
271 "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
272
273static VGPRRegisterRegAlloc fastRegAllocVGPR(
274 "fast", "fast register allocator", createFastVGPRRegisterAllocator);
275static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
276 "basic register allocator",
277 createBasicWWMRegisterAllocator);
278static WWMRegisterRegAlloc
279 greedyRegAllocWWMReg("greedy", "greedy register allocator",
280 createGreedyWWMRegisterAllocator);
281static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator",
282 createFastWWMRegisterAllocator);
283
284static bool isLTOPreLink(ThinOrFullLTOPhase Phase) {
285 return Phase == ThinOrFullLTOPhase::FullLTOPreLink ||
286 Phase == ThinOrFullLTOPhase::ThinLTOPreLink;
287}
288} // anonymous namespace
289
290static cl::opt<bool>
291EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
292 cl::desc("Run early if-conversion"),
293 cl::init(Val: false));
294
295static cl::opt<bool>
296OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
297 cl::desc("Run pre-RA exec mask optimizations"),
298 cl::init(Val: true));
299
300static cl::opt<bool>
301 LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
302 cl::desc("Lower GPU ctor / dtors to globals on the device."),
303 cl::init(Val: true), cl::Hidden);
304
305// Option to disable vectorizer for tests.
306static cl::opt<bool> EnableLoadStoreVectorizer(
307 "amdgpu-load-store-vectorizer",
308 cl::desc("Enable load store vectorizer"),
309 cl::init(Val: true),
310 cl::Hidden);
311
312// Option to control global loads scalarization
313static cl::opt<bool> ScalarizeGlobal(
314 "amdgpu-scalarize-global-loads",
315 cl::desc("Enable global load scalarization"),
316 cl::init(Val: true),
317 cl::Hidden);
318
319// Option to run internalize pass.
320static cl::opt<bool> InternalizeSymbols(
321 "amdgpu-internalize-symbols",
322 cl::desc("Enable elimination of non-kernel functions and unused globals"),
323 cl::init(Val: false),
324 cl::Hidden);
325
326// Option to inline all early.
327static cl::opt<bool> EarlyInlineAll(
328 "amdgpu-early-inline-all",
329 cl::desc("Inline all functions early"),
330 cl::init(Val: false),
331 cl::Hidden);
332
333static cl::opt<bool> RemoveIncompatibleFunctions(
334 "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
335 cl::desc("Enable removal of functions when they"
336 "use features not supported by the target GPU"),
337 cl::init(Val: true));
338
339static cl::opt<bool> EnableSDWAPeephole(
340 "amdgpu-sdwa-peephole",
341 cl::desc("Enable SDWA peepholer"),
342 cl::init(Val: true));
343
344static cl::opt<bool> EnableDPPCombine(
345 "amdgpu-dpp-combine",
346 cl::desc("Enable DPP combiner"),
347 cl::init(Val: true));
348
349// Enable address space based alias analysis
350static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
351 cl::desc("Enable AMDGPU Alias Analysis"),
352 cl::init(Val: true));
353
354// Enable lib calls simplifications
355static cl::opt<bool> EnableLibCallSimplify(
356 "amdgpu-simplify-libcall",
357 cl::desc("Enable amdgpu library simplifications"),
358 cl::init(Val: true),
359 cl::Hidden);
360
361static cl::opt<bool> EnableLowerKernelArguments(
362 "amdgpu-ir-lower-kernel-arguments",
363 cl::desc("Lower kernel argument loads in IR pass"),
364 cl::init(Val: true),
365 cl::Hidden);
366
367static cl::opt<bool> EnableRegReassign(
368 "amdgpu-reassign-regs",
369 cl::desc("Enable register reassign optimizations on gfx10+"),
370 cl::init(Val: true),
371 cl::Hidden);
372
373static cl::opt<bool> OptVGPRLiveRange(
374 "amdgpu-opt-vgpr-liverange",
375 cl::desc("Enable VGPR liverange optimizations for if-else structure"),
376 cl::init(Val: true), cl::Hidden);
377
378static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
379 "amdgpu-atomic-optimizer-strategy",
380 cl::desc("Select DPP or Iterative strategy for scan"),
381 cl::init(Val: ScanOptions::Iterative),
382 cl::values(
383 clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
384 clEnumValN(ScanOptions::Iterative, "Iterative",
385 "Use Iterative approach for scan"),
386 clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
387
388// Enable Mode register optimization
389static cl::opt<bool> EnableSIModeRegisterPass(
390 "amdgpu-mode-register",
391 cl::desc("Enable mode register pass"),
392 cl::init(Val: true),
393 cl::Hidden);
394
395// Enable GFX11+ s_delay_alu insertion
396static cl::opt<bool>
397 EnableInsertDelayAlu("amdgpu-enable-delay-alu",
398 cl::desc("Enable s_delay_alu insertion"),
399 cl::init(Val: true), cl::Hidden);
400
401// Enable GFX11+ VOPD
402static cl::opt<bool>
403 EnableVOPD("amdgpu-enable-vopd",
404 cl::desc("Enable VOPD, dual issue of VALU in wave32"),
405 cl::init(Val: true), cl::Hidden);
406
407// Option is used in lit tests to prevent deadcoding of patterns inspected.
408static cl::opt<bool>
409EnableDCEInRA("amdgpu-dce-in-ra",
410 cl::init(Val: true), cl::Hidden,
411 cl::desc("Enable machine DCE inside regalloc"));
412
413static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
414 cl::desc("Adjust wave priority"),
415 cl::init(Val: false), cl::Hidden);
416
417static cl::opt<bool> EnableScalarIRPasses(
418 "amdgpu-scalar-ir-passes",
419 cl::desc("Enable scalar IR passes"),
420 cl::init(Val: true),
421 cl::Hidden);
422
423static cl::opt<bool>
424 EnableSwLowerLDS("amdgpu-enable-sw-lower-lds",
425 cl::desc("Enable lowering of lds to global memory pass "
426 "and asan instrument resulting IR."),
427 cl::init(Val: true), cl::Hidden);
428
429static cl::opt<bool, true> EnableLowerModuleLDS(
430 "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
431 cl::location(L&: AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(Val: true),
432 cl::Hidden);
433
434static cl::opt<bool> EnablePreRAOptimizations(
435 "amdgpu-enable-pre-ra-optimizations",
436 cl::desc("Enable Pre-RA optimizations pass"), cl::init(Val: true),
437 cl::Hidden);
438
439static cl::opt<bool> EnablePromoteKernelArguments(
440 "amdgpu-enable-promote-kernel-arguments",
441 cl::desc("Enable promotion of flat kernel pointer arguments to global"),
442 cl::Hidden, cl::init(Val: true));
443
444static cl::opt<bool> EnableImageIntrinsicOptimizer(
445 "amdgpu-enable-image-intrinsic-optimizer",
446 cl::desc("Enable image intrinsic optimizer pass"), cl::init(Val: true),
447 cl::Hidden);
448
449static cl::opt<bool>
450 EnableLoopPrefetch("amdgpu-loop-prefetch",
451 cl::desc("Enable loop data prefetch on AMDGPU"),
452 cl::Hidden, cl::init(Val: false));
453
454static cl::opt<std::string>
455 AMDGPUSchedStrategy("amdgpu-sched-strategy",
456 cl::desc("Select custom AMDGPU scheduling strategy."),
457 cl::Hidden, cl::init(Val: ""));
458
459static cl::opt<bool> EnableRewritePartialRegUses(
460 "amdgpu-enable-rewrite-partial-reg-uses",
461 cl::desc("Enable rewrite partial reg uses pass"), cl::init(Val: true),
462 cl::Hidden);
463
464static cl::opt<bool> EnableHipStdPar(
465 "amdgpu-enable-hipstdpar",
466 cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(Val: false),
467 cl::Hidden);
468
469static cl::opt<bool>
470 EnableAMDGPUAttributor("amdgpu-attributor-enable",
471 cl::desc("Enable AMDGPUAttributorPass"),
472 cl::init(Val: true), cl::Hidden);
473
474static cl::opt<bool> NewRegBankSelect(
475 "new-reg-bank-select",
476 cl::desc("Run amdgpu-regbankselect and amdgpu-regbanklegalize instead of "
477 "regbankselect"),
478 cl::init(Val: false), cl::Hidden);
479
480static cl::opt<bool> HasClosedWorldAssumption(
481 "amdgpu-link-time-closed-world",
482 cl::desc("Whether has closed-world assumption at link time"),
483 cl::init(Val: false), cl::Hidden);
484
485extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
486 // Register the target
487 RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
488 RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
489
490 PassRegistry *PR = PassRegistry::getPassRegistry();
491 initializeR600ClauseMergePassPass(*PR);
492 initializeR600ControlFlowFinalizerPass(*PR);
493 initializeR600PacketizerPass(*PR);
494 initializeR600ExpandSpecialInstrsPassPass(*PR);
495 initializeR600VectorRegMergerPass(*PR);
496 initializeR600EmitClauseMarkersPass(*PR);
497 initializeR600MachineCFGStructurizerPass(*PR);
498 initializeGlobalISel(*PR);
499 initializeAMDGPUAsmPrinterPass(*PR);
500 initializeAMDGPUDAGToDAGISelLegacyPass(*PR);
501 initializeGCNDPPCombineLegacyPass(*PR);
502 initializeSILowerI1CopiesLegacyPass(*PR);
503 initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
504 initializeAMDGPURegBankSelectPass(*PR);
505 initializeAMDGPURegBankLegalizePass(*PR);
506 initializeSILowerWWMCopiesLegacyPass(*PR);
507 initializeAMDGPUMarkLastScratchLoadLegacyPass(*PR);
508 initializeSILowerSGPRSpillsLegacyPass(*PR);
509 initializeSIFixSGPRCopiesLegacyPass(*PR);
510 initializeSIFixVGPRCopiesLegacyPass(*PR);
511 initializeSIFoldOperandsLegacyPass(*PR);
512 initializeSIPeepholeSDWALegacyPass(*PR);
513 initializeSIShrinkInstructionsLegacyPass(*PR);
514 initializeSIOptimizeExecMaskingPreRALegacyPass(*PR);
515 initializeSIOptimizeVGPRLiveRangeLegacyPass(*PR);
516 initializeSILoadStoreOptimizerLegacyPass(*PR);
517 initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
518 initializeAMDGPUAlwaysInlinePass(*PR);
519 initializeAMDGPUSwLowerLDSLegacyPass(*PR);
520 initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
521 initializeAMDGPUArgumentUsageInfoPass(*PR);
522 initializeAMDGPUAtomicOptimizerPass(*PR);
523 initializeAMDGPULowerKernelArgumentsPass(*PR);
524 initializeAMDGPUPromoteKernelArgumentsPass(*PR);
525 initializeAMDGPULowerKernelAttributesPass(*PR);
526 initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR);
527 initializeAMDGPUPostLegalizerCombinerPass(*PR);
528 initializeAMDGPUPreLegalizerCombinerPass(*PR);
529 initializeAMDGPURegBankCombinerPass(*PR);
530 initializeAMDGPUPromoteAllocaPass(*PR);
531 initializeAMDGPUCodeGenPreparePass(*PR);
532 initializeAMDGPULateCodeGenPrepareLegacyPass(*PR);
533 initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);
534 initializeAMDGPULowerModuleLDSLegacyPass(*PR);
535 initializeAMDGPULowerBufferFatPointersPass(*PR);
536 initializeAMDGPUReserveWWMRegsLegacyPass(*PR);
537 initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR);
538 initializeAMDGPURewriteOutArgumentsPass(*PR);
539 initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
540 initializeSIAnnotateControlFlowLegacyPass(*PR);
541 initializeAMDGPUInsertDelayAluLegacyPass(*PR);
542 initializeSIInsertHardClausesLegacyPass(*PR);
543 initializeSIInsertWaitcntsLegacyPass(*PR);
544 initializeSIModeRegisterLegacyPass(*PR);
545 initializeSIWholeQuadModeLegacyPass(*PR);
546 initializeSILowerControlFlowLegacyPass(*PR);
547 initializeSIPreEmitPeepholeLegacyPass(*PR);
548 initializeSILateBranchLoweringLegacyPass(*PR);
549 initializeSIMemoryLegalizerLegacyPass(*PR);
550 initializeSIOptimizeExecMaskingLegacyPass(*PR);
551 initializeSIPreAllocateWWMRegsLegacyPass(*PR);
552 initializeSIFormMemoryClausesLegacyPass(*PR);
553 initializeSIPostRABundlerLegacyPass(*PR);
554 initializeGCNCreateVOPDLegacyPass(*PR);
555 initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
556 initializeAMDGPUAAWrapperPassPass(*PR);
557 initializeAMDGPUExternalAAWrapperPass(*PR);
558 initializeAMDGPUImageIntrinsicOptimizerPass(*PR);
559 initializeAMDGPUPrintfRuntimeBindingPass(*PR);
560 initializeAMDGPUResourceUsageAnalysisPass(*PR);
561 initializeGCNNSAReassignLegacyPass(*PR);
562 initializeGCNPreRAOptimizationsLegacyPass(*PR);
563 initializeGCNPreRALongBranchRegLegacyPass(*PR);
564 initializeGCNRewritePartialRegUsesLegacyPass(*PR);
565 initializeGCNRegPressurePrinterPass(*PR);
566 initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
567 initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
568 initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR);
569}
570
571static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
572 return std::make_unique<AMDGPUTargetObjectFile>();
573}
574
575static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
576 return new SIScheduleDAGMI(C);
577}
578
579static ScheduleDAGInstrs *
580createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
581 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
582 ScheduleDAGMILive *DAG =
583 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(args&: C));
584 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
585 if (ST.shouldClusterStores())
586 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
587 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
588 DAG->addMutation(Mutation: createAMDGPUMacroFusionDAGMutation());
589 DAG->addMutation(Mutation: createAMDGPUExportClusteringDAGMutation());
590 return DAG;
591}
592
593static ScheduleDAGInstrs *
594createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
595 ScheduleDAGMILive *DAG =
596 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(args&: C));
597 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
598 return DAG;
599}
600
601static ScheduleDAGInstrs *
602createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
603 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
604 ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
605 C, std::make_unique<GCNMaxMemoryClauseSchedStrategy>(args&: C));
606 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
607 if (ST.shouldClusterStores())
608 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
609 DAG->addMutation(Mutation: createAMDGPUExportClusteringDAGMutation());
610 return DAG;
611}
612
613static ScheduleDAGInstrs *
614createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
615 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
616 auto *DAG = new GCNIterativeScheduler(
617 C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
618 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
619 if (ST.shouldClusterStores())
620 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
621 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
622 return DAG;
623}
624
625static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
626 auto *DAG = new GCNIterativeScheduler(
627 C, GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
628 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
629 return DAG;
630}
631
632static ScheduleDAGInstrs *
633createIterativeILPMachineScheduler(MachineSchedContext *C) {
634 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
635 auto *DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);
636 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
637 if (ST.shouldClusterStores())
638 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
639 DAG->addMutation(Mutation: createAMDGPUMacroFusionDAGMutation());
640 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
641 return DAG;
642}
643
644static MachineSchedRegistry
645SISchedRegistry("si", "Run SI's custom scheduler",
646 createSIMachineScheduler);
647
648static MachineSchedRegistry
649GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
650 "Run GCN scheduler to maximize occupancy",
651 createGCNMaxOccupancyMachineScheduler);
652
653static MachineSchedRegistry
654 GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
655 createGCNMaxILPMachineScheduler);
656
657static MachineSchedRegistry GCNMaxMemoryClauseSchedRegistry(
658 "gcn-max-memory-clause", "Run GCN scheduler to maximize memory clause",
659 createGCNMaxMemoryClauseMachineScheduler);
660
661static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
662 "gcn-iterative-max-occupancy-experimental",
663 "Run GCN scheduler to maximize occupancy (experimental)",
664 createIterativeGCNMaxOccupancyMachineScheduler);
665
666static MachineSchedRegistry GCNMinRegSchedRegistry(
667 "gcn-iterative-minreg",
668 "Run GCN iterative scheduler for minimal register usage (experimental)",
669 createMinRegScheduler);
670
671static MachineSchedRegistry GCNILPSchedRegistry(
672 "gcn-iterative-ilp",
673 "Run GCN iterative scheduler for ILP scheduling (experimental)",
674 createIterativeILPMachineScheduler);
675
676static StringRef computeDataLayout(const Triple &TT) {
677 if (TT.getArch() == Triple::r600) {
678 // 32-bit pointers.
679 return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
680 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
681 }
682
683 // 32-bit private, local, and region pointers. 64-bit global, constant and
684 // flat. 160-bit non-integral fat buffer pointers that include a 128-bit
685 // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
686 // (address space 7), and 128-bit non-integral buffer resourcees (address
687 // space 8) which cannot be non-trivilally accessed by LLVM memory operations
688 // like getelementptr.
689 return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
690 "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-"
691 "v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-"
692 "v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9";
693}
694
695LLVM_READNONE
696static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
697 if (!GPU.empty())
698 return GPU;
699
700 // Need to default to a target with flat support for HSA.
701 if (TT.isAMDGCN())
702 return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
703
704 return "r600";
705}
706
707static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
708 // The AMDGPU toolchain only supports generating shared objects, so we
709 // must always use PIC.
710 return Reloc::PIC_;
711}
712
713AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
714 StringRef CPU, StringRef FS,
715 const TargetOptions &Options,
716 std::optional<Reloc::Model> RM,
717 std::optional<CodeModel::Model> CM,
718 CodeGenOptLevel OptLevel)
719 : CodeGenTargetMachineImpl(
720 T, computeDataLayout(TT), TT, getGPUOrDefault(TT, GPU: CPU), FS, Options,
721 getEffectiveRelocModel(RM),
722 getEffectiveCodeModel(CM, Default: CodeModel::Small), OptLevel),
723 TLOF(createTLOF(TT: getTargetTriple())) {
724 initAsmInfo();
725 if (TT.isAMDGCN()) {
726 if (getMCSubtargetInfo()->checkFeatures(FS: "+wavefrontsize64"))
727 MRI.reset(p: llvm::createGCNMCRegisterInfo(DwarfFlavour: AMDGPUDwarfFlavour::Wave64));
728 else if (getMCSubtargetInfo()->checkFeatures(FS: "+wavefrontsize32"))
729 MRI.reset(p: llvm::createGCNMCRegisterInfo(DwarfFlavour: AMDGPUDwarfFlavour::Wave32));
730 }
731}
732
733bool AMDGPUTargetMachine::EnableFunctionCalls = false;
734bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
735
736AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
737
738StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
739 Attribute GPUAttr = F.getFnAttribute(Kind: "target-cpu");
740 return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
741}
742
743StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
744 Attribute FSAttr = F.getFnAttribute(Kind: "target-features");
745
746 return FSAttr.isValid() ? FSAttr.getValueAsString()
747 : getTargetFeatureString();
748}
749
750llvm::ScheduleDAGInstrs *
751AMDGPUTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
752 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
753 ScheduleDAGMILive *DAG = createSchedLive(C);
754 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
755 if (ST.shouldClusterStores())
756 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
757 return DAG;
758}
759
760/// Predicate for Internalize pass.
761static bool mustPreserveGV(const GlobalValue &GV) {
762 if (const Function *F = dyn_cast<Function>(Val: &GV))
763 return F->isDeclaration() || F->getName().starts_with(Prefix: "__asan_") ||
764 F->getName().starts_with(Prefix: "__sanitizer_") ||
765 AMDGPU::isEntryFunctionCC(CC: F->getCallingConv());
766
767 GV.removeDeadConstantUsers();
768 return !GV.use_empty();
769}
770
771void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
772 AAM.registerFunctionAnalysis<AMDGPUAA>();
773}
774
775static Expected<ScanOptions>
776parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
777 if (Params.empty())
778 return ScanOptions::Iterative;
779 Params.consume_front(Prefix: "strategy=");
780 auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
781 .Case(S: "dpp", Value: ScanOptions::DPP)
782 .Cases(S0: "iterative", S1: "", Value: ScanOptions::Iterative)
783 .Case(S: "none", Value: ScanOptions::None)
784 .Default(Value: std::nullopt);
785 if (Result)
786 return *Result;
787 return make_error<StringError>(Args: "invalid parameter", Args: inconvertibleErrorCode());
788}
789
790Expected<AMDGPUAttributorOptions>
791parseAMDGPUAttributorPassOptions(StringRef Params) {
792 AMDGPUAttributorOptions Result;
793 while (!Params.empty()) {
794 StringRef ParamName;
795 std::tie(args&: ParamName, args&: Params) = Params.split(Separator: ';');
796 if (ParamName == "closed-world") {
797 Result.IsClosedWorld = true;
798 } else {
799 return make_error<StringError>(
800 Args: formatv(Fmt: "invalid AMDGPUAttributor pass parameter '{0}' ", Vals&: ParamName)
801 .str(),
802 Args: inconvertibleErrorCode());
803 }
804 }
805 return Result;
806}
807
808void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
809
810#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
811#include "llvm/Passes/TargetPassRegistry.inc"
812
813 PB.registerScalarOptimizerLateEPCallback(
814 C: [](FunctionPassManager &FPM, OptimizationLevel Level) {
815 if (Level == OptimizationLevel::O0)
816 return;
817
818 FPM.addPass(Pass: InferAddressSpacesPass());
819 });
820
821 PB.registerVectorizerEndEPCallback(
822 C: [](FunctionPassManager &FPM, OptimizationLevel Level) {
823 if (Level == OptimizationLevel::O0)
824 return;
825
826 FPM.addPass(Pass: InferAddressSpacesPass());
827 });
828
829 PB.registerPipelineEarlySimplificationEPCallback(
830 C: [](ModulePassManager &PM, OptimizationLevel Level,
831 ThinOrFullLTOPhase Phase) {
832 if (!isLTOPreLink(Phase)) {
833 // When we are not using -fgpu-rdc, we can run accelerator code
834 // selection relatively early, but still after linking to prevent
835 // eager removal of potentially reachable symbols.
836 if (EnableHipStdPar)
837 PM.addPass(Pass: HipStdParAcceleratorCodeSelectionPass());
838 PM.addPass(Pass: AMDGPUPrintfRuntimeBindingPass());
839 }
840
841 if (Level == OptimizationLevel::O0)
842 return;
843
844 PM.addPass(Pass: AMDGPUUnifyMetadataPass());
845
846 // We don't want to run internalization at per-module stage.
847 if (InternalizeSymbols && !isLTOPreLink(Phase)) {
848 PM.addPass(Pass: InternalizePass(mustPreserveGV));
849 PM.addPass(Pass: GlobalDCEPass());
850 }
851
852 if (EarlyInlineAll && !EnableFunctionCalls)
853 PM.addPass(Pass: AMDGPUAlwaysInlinePass());
854 });
855
856 PB.registerPeepholeEPCallback(
857 C: [](FunctionPassManager &FPM, OptimizationLevel Level) {
858 if (Level == OptimizationLevel::O0)
859 return;
860
861 FPM.addPass(Pass: AMDGPUUseNativeCallsPass());
862 if (EnableLibCallSimplify)
863 FPM.addPass(Pass: AMDGPUSimplifyLibCallsPass());
864 });
865
866 PB.registerCGSCCOptimizerLateEPCallback(
867 C: [this](CGSCCPassManager &PM, OptimizationLevel Level) {
868 if (Level == OptimizationLevel::O0)
869 return;
870
871 FunctionPassManager FPM;
872
873 // Add promote kernel arguments pass to the opt pipeline right before
874 // infer address spaces which is needed to do actual address space
875 // rewriting.
876 if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
877 EnablePromoteKernelArguments)
878 FPM.addPass(Pass: AMDGPUPromoteKernelArgumentsPass());
879
880 // Add infer address spaces pass to the opt pipeline after inlining
881 // but before SROA to increase SROA opportunities.
882 FPM.addPass(Pass: InferAddressSpacesPass());
883
884 // This should run after inlining to have any chance of doing
885 // anything, and before other cleanup optimizations.
886 FPM.addPass(Pass: AMDGPULowerKernelAttributesPass());
887
888 if (Level != OptimizationLevel::O0) {
889 // Promote alloca to vector before SROA and loop unroll. If we
890 // manage to eliminate allocas before unroll we may choose to unroll
891 // less.
892 FPM.addPass(Pass: AMDGPUPromoteAllocaToVectorPass(*this));
893 }
894
895 PM.addPass(Pass: createCGSCCToFunctionPassAdaptor(Pass: std::move(FPM)));
896 });
897
898 // FIXME: Why is AMDGPUAttributor not in CGSCC?
899 PB.registerOptimizerLastEPCallback(C: [this](ModulePassManager &MPM,
900 OptimizationLevel Level,
901 ThinOrFullLTOPhase Phase) {
902 if (Level != OptimizationLevel::O0) {
903 if (!isLTOPreLink(Phase)) {
904 AMDGPUAttributorOptions Opts;
905 MPM.addPass(Pass: AMDGPUAttributorPass(*this, Opts, Phase));
906 }
907 }
908 });
909
910 PB.registerFullLinkTimeOptimizationLastEPCallback(
911 C: [this](ModulePassManager &PM, OptimizationLevel Level) {
912 // When we are using -fgpu-rdc, we can only run accelerator code
913 // selection after linking to prevent, otherwise we end up removing
914 // potentially reachable symbols that were exported as external in other
915 // modules.
916 if (EnableHipStdPar)
917 PM.addPass(Pass: HipStdParAcceleratorCodeSelectionPass());
918 // We want to support the -lto-partitions=N option as "best effort".
919 // For that, we need to lower LDS earlier in the pipeline before the
920 // module is partitioned for codegen.
921 if (EnableSwLowerLDS)
922 PM.addPass(Pass: AMDGPUSwLowerLDSPass(*this));
923 if (EnableLowerModuleLDS)
924 PM.addPass(Pass: AMDGPULowerModuleLDSPass(*this));
925 if (Level != OptimizationLevel::O0) {
926 // We only want to run this with O2 or higher since inliner and SROA
927 // don't run in O1.
928 if (Level != OptimizationLevel::O1) {
929 PM.addPass(
930 Pass: createModuleToFunctionPassAdaptor(Pass: InferAddressSpacesPass()));
931 }
932 // Do we really need internalization in LTO?
933 if (InternalizeSymbols) {
934 PM.addPass(Pass: InternalizePass(mustPreserveGV));
935 PM.addPass(Pass: GlobalDCEPass());
936 }
937 if (EnableAMDGPUAttributor) {
938 AMDGPUAttributorOptions Opt;
939 if (HasClosedWorldAssumption)
940 Opt.IsClosedWorld = true;
941 PM.addPass(Pass: AMDGPUAttributorPass(
942 *this, Opt, ThinOrFullLTOPhase::FullLTOPostLink));
943 }
944 }
945 if (!NoKernelInfoEndLTO) {
946 FunctionPassManager FPM;
947 FPM.addPass(Pass: KernelInfoPrinter(this));
948 PM.addPass(Pass: createModuleToFunctionPassAdaptor(Pass: std::move(FPM)));
949 }
950 });
951
952 PB.registerRegClassFilterParsingCallback(
953 C: [](StringRef FilterName) -> RegAllocFilterFunc {
954 if (FilterName == "sgpr")
955 return onlyAllocateSGPRs;
956 if (FilterName == "vgpr")
957 return onlyAllocateVGPRs;
958 if (FilterName == "wwm")
959 return onlyAllocateWWMRegs;
960 return nullptr;
961 });
962}
963
964int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
965 return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
966 AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
967 AddrSpace == AMDGPUAS::REGION_ADDRESS)
968 ? -1
969 : 0;
970}
971
972bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
973 unsigned DestAS) const {
974 return AMDGPU::isFlatGlobalAddrSpace(AS: SrcAS) &&
975 AMDGPU::isFlatGlobalAddrSpace(AS: DestAS);
976}
977
978unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
979 if (auto *Arg = dyn_cast<Argument>(Val: V);
980 Arg &&
981 AMDGPU::isModuleEntryFunctionCC(CC: Arg->getParent()->getCallingConv()) &&
982 !Arg->hasByRefAttr())
983 return AMDGPUAS::GLOBAL_ADDRESS;
984
985 const auto *LD = dyn_cast<LoadInst>(Val: V);
986 if (!LD) // TODO: Handle invariant load like constant.
987 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
988
989 // It must be a generic pointer loaded.
990 assert(V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
991
992 const auto *Ptr = LD->getPointerOperand();
993 if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
994 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
995 // For a generic pointer loaded from the constant memory, it could be assumed
996 // as a global pointer since the constant memory is only populated on the
997 // host side. As implied by the offload programming model, only global
998 // pointers could be referenced on the host side.
999 return AMDGPUAS::GLOBAL_ADDRESS;
1000}
1001
1002std::pair<const Value *, unsigned>
1003AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
1004 if (auto *II = dyn_cast<IntrinsicInst>(Val: V)) {
1005 switch (II->getIntrinsicID()) {
1006 case Intrinsic::amdgcn_is_shared:
1007 return std::pair(II->getArgOperand(i: 0), AMDGPUAS::LOCAL_ADDRESS);
1008 case Intrinsic::amdgcn_is_private:
1009 return std::pair(II->getArgOperand(i: 0), AMDGPUAS::PRIVATE_ADDRESS);
1010 default:
1011 break;
1012 }
1013 return std::pair(nullptr, -1);
1014 }
1015 // Check the global pointer predication based on
1016 // (!is_share(p) && !is_private(p)). Note that logic 'and' is commutative and
1017 // the order of 'is_shared' and 'is_private' is not significant.
1018 Value *Ptr;
1019 if (match(
1020 V: const_cast<Value *>(V),
1021 P: m_c_And(L: m_Not(V: m_Intrinsic<Intrinsic::amdgcn_is_shared>(Op0: m_Value(V&: Ptr))),
1022 R: m_Not(V: m_Intrinsic<Intrinsic::amdgcn_is_private>(
1023 Op0: m_Deferred(V: Ptr))))))
1024 return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);
1025
1026 return std::pair(nullptr, -1);
1027}
1028
1029unsigned
1030AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
1031 switch (Kind) {
1032 case PseudoSourceValue::Stack:
1033 case PseudoSourceValue::FixedStack:
1034 return AMDGPUAS::PRIVATE_ADDRESS;
1035 case PseudoSourceValue::ConstantPool:
1036 case PseudoSourceValue::GOT:
1037 case PseudoSourceValue::JumpTable:
1038 case PseudoSourceValue::GlobalValueCallEntry:
1039 case PseudoSourceValue::ExternalSymbolCallEntry:
1040 return AMDGPUAS::CONSTANT_ADDRESS;
1041 }
1042 return AMDGPUAS::FLAT_ADDRESS;
1043}
1044
1045bool AMDGPUTargetMachine::splitModule(
1046 Module &M, unsigned NumParts,
1047 function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
1048 // FIXME(?): Would be better to use an already existing Analysis/PassManager,
1049 // but all current users of this API don't have one ready and would need to
1050 // create one anyway. Let's hide the boilerplate for now to keep it simple.
1051
1052 LoopAnalysisManager LAM;
1053 FunctionAnalysisManager FAM;
1054 CGSCCAnalysisManager CGAM;
1055 ModuleAnalysisManager MAM;
1056
1057 PassBuilder PB(this);
1058 PB.registerModuleAnalyses(MAM);
1059 PB.registerFunctionAnalyses(FAM);
1060 PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
1061
1062 ModulePassManager MPM;
1063 MPM.addPass(Pass: AMDGPUSplitModulePass(NumParts, ModuleCallback));
1064 MPM.run(IR&: M, AM&: MAM);
1065 return true;
1066}
1067
1068//===----------------------------------------------------------------------===//
1069// GCN Target Machine (SI+)
1070//===----------------------------------------------------------------------===//
1071
1072GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
1073 StringRef CPU, StringRef FS,
1074 const TargetOptions &Options,
1075 std::optional<Reloc::Model> RM,
1076 std::optional<CodeModel::Model> CM,
1077 CodeGenOptLevel OL, bool JIT)
1078 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
1079
1080const TargetSubtargetInfo *
1081GCNTargetMachine::getSubtargetImpl(const Function &F) const {
1082 StringRef GPU = getGPUName(F);
1083 StringRef FS = getFeatureString(F);
1084
1085 SmallString<128> SubtargetKey(GPU);
1086 SubtargetKey.append(RHS: FS);
1087
1088 auto &I = SubtargetMap[SubtargetKey];
1089 if (!I) {
1090 // This needs to be done before we create a new subtarget since any
1091 // creation will depend on the TM and the code generation flags on the
1092 // function that reside in TargetOptions.
1093 resetTargetOptions(F);
1094 I = std::make_unique<GCNSubtarget>(args: TargetTriple, args&: GPU, args&: FS, args: *this);
1095 }
1096
1097 I->setScalarizeGlobalBehavior(ScalarizeGlobal);
1098
1099 return I.get();
1100}
1101
1102TargetTransformInfo
1103GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
1104 return TargetTransformInfo(std::make_unique<GCNTTIImpl>(args: this, args: F));
1105}
1106
1107Error GCNTargetMachine::buildCodeGenPipeline(
1108 ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
1109 CodeGenFileType FileType, const CGPassBuilderOption &Opts,
1110 PassInstrumentationCallbacks *PIC) {
1111 AMDGPUCodeGenPassBuilder CGPB(*this, Opts, PIC);
1112 return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
1113}
1114
1115ScheduleDAGInstrs *
1116GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
1117 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1118 if (ST.enableSIScheduler())
1119 return createSIMachineScheduler(C);
1120
1121 Attribute SchedStrategyAttr =
1122 C->MF->getFunction().getFnAttribute(Kind: "amdgpu-sched-strategy");
1123 StringRef SchedStrategy = SchedStrategyAttr.isValid()
1124 ? SchedStrategyAttr.getValueAsString()
1125 : AMDGPUSchedStrategy;
1126
1127 if (SchedStrategy == "max-ilp")
1128 return createGCNMaxILPMachineScheduler(C);
1129
1130 if (SchedStrategy == "max-memory-clause")
1131 return createGCNMaxMemoryClauseMachineScheduler(C);
1132
1133 if (SchedStrategy == "iterative-ilp")
1134 return createIterativeILPMachineScheduler(C);
1135
1136 if (SchedStrategy == "iterative-minreg")
1137 return createMinRegScheduler(C);
1138
1139 if (SchedStrategy == "iterative-maxocc")
1140 return createIterativeGCNMaxOccupancyMachineScheduler(C);
1141
1142 return createGCNMaxOccupancyMachineScheduler(C);
1143}
1144
1145ScheduleDAGInstrs *
1146GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
1147 ScheduleDAGMI *DAG =
1148 new GCNPostScheduleDAGMILive(C, std::make_unique<PostGenericScheduler>(args&: C),
1149 /*RemoveKillFlags=*/true);
1150 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1151 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
1152 if (ST.shouldClusterStores())
1153 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
1154 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PostRA));
1155 if ((EnableVOPD.getNumOccurrences() ||
1156 getOptLevel() >= CodeGenOptLevel::Less) &&
1157 EnableVOPD)
1158 DAG->addMutation(Mutation: createVOPDPairingMutation());
1159 DAG->addMutation(Mutation: createAMDGPUExportClusteringDAGMutation());
1160 return DAG;
1161}
1162//===----------------------------------------------------------------------===//
1163// AMDGPU Legacy Pass Setup
1164//===----------------------------------------------------------------------===//
1165
1166std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
1167 return getStandardCSEConfigForOpt(Level: TM->getOptLevel());
1168}
1169
1170namespace {
1171
1172class GCNPassConfig final : public AMDGPUPassConfig {
1173public:
1174 GCNPassConfig(TargetMachine &TM, PassManagerBase &PM)
1175 : AMDGPUPassConfig(TM, PM) {
1176 // It is necessary to know the register usage of the entire call graph. We
1177 // allow calls without EnableAMDGPUFunctionCalls if they are marked
1178 // noinline, so this is always required.
1179 setRequiresCodeGenSCCOrder(true);
1180 substitutePass(StandardID: &PostRASchedulerID, TargetID: &PostMachineSchedulerID);
1181 }
1182
1183 GCNTargetMachine &getGCNTargetMachine() const {
1184 return getTM<GCNTargetMachine>();
1185 }
1186
1187 bool addPreISel() override;
1188 void addMachineSSAOptimization() override;
1189 bool addILPOpts() override;
1190 bool addInstSelector() override;
1191 bool addIRTranslator() override;
1192 void addPreLegalizeMachineIR() override;
1193 bool addLegalizeMachineIR() override;
1194 void addPreRegBankSelect() override;
1195 bool addRegBankSelect() override;
1196 void addPreGlobalInstructionSelect() override;
1197 bool addGlobalInstructionSelect() override;
1198 void addFastRegAlloc() override;
1199 void addOptimizedRegAlloc() override;
1200
1201 FunctionPass *createSGPRAllocPass(bool Optimized);
1202 FunctionPass *createVGPRAllocPass(bool Optimized);
1203 FunctionPass *createWWMRegAllocPass(bool Optimized);
1204 FunctionPass *createRegAllocPass(bool Optimized) override;
1205
1206 bool addRegAssignAndRewriteFast() override;
1207 bool addRegAssignAndRewriteOptimized() override;
1208
1209 bool addPreRewrite() override;
1210 void addPostRegAlloc() override;
1211 void addPreSched2() override;
1212 void addPreEmitPass() override;
1213 void addPostBBSections() override;
1214};
1215
1216} // end anonymous namespace
1217
1218AMDGPUPassConfig::AMDGPUPassConfig(TargetMachine &TM, PassManagerBase &PM)
1219 : TargetPassConfig(TM, PM) {
1220 // Exceptions and StackMaps are not supported, so these passes will never do
1221 // anything.
1222 disablePass(PassID: &StackMapLivenessID);
1223 disablePass(PassID: &FuncletLayoutID);
1224 // Garbage collection is not supported.
1225 disablePass(PassID: &GCLoweringID);
1226 disablePass(PassID: &ShadowStackGCLoweringID);
1227}
1228
1229void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
1230 if (getOptLevel() == CodeGenOptLevel::Aggressive)
1231 addPass(P: createGVNPass());
1232 else
1233 addPass(P: createEarlyCSEPass());
1234}
1235
1236void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
1237 if (isPassEnabled(Opt: EnableLoopPrefetch, Level: CodeGenOptLevel::Aggressive))
1238 addPass(P: createLoopDataPrefetchPass());
1239 addPass(P: createSeparateConstOffsetFromGEPPass());
1240 // ReassociateGEPs exposes more opportunities for SLSR. See
1241 // the example in reassociate-geps-and-slsr.ll.
1242 addPass(P: createStraightLineStrengthReducePass());
1243 // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
1244 // EarlyCSE can reuse.
1245 addEarlyCSEOrGVNPass();
1246 // Run NaryReassociate after EarlyCSE/GVN to be more effective.
1247 addPass(P: createNaryReassociatePass());
1248 // NaryReassociate on GEPs creates redundant common expressions, so run
1249 // EarlyCSE after it.
1250 addPass(P: createEarlyCSEPass());
1251}
1252
1253void AMDGPUPassConfig::addIRPasses() {
1254 const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
1255
1256 if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN())
1257 addPass(P: createAMDGPURemoveIncompatibleFunctionsPass(&TM));
1258
1259 // There is no reason to run these.
1260 disablePass(PassID: &StackMapLivenessID);
1261 disablePass(PassID: &FuncletLayoutID);
1262 disablePass(PassID: &PatchableFunctionID);
1263
1264 addPass(P: createAMDGPUPrintfRuntimeBinding());
1265 if (LowerCtorDtor)
1266 addPass(P: createAMDGPUCtorDtorLoweringLegacyPass());
1267
1268 if (isPassEnabled(Opt: EnableImageIntrinsicOptimizer))
1269 addPass(P: createAMDGPUImageIntrinsicOptimizerPass(&TM));
1270
1271 // This can be disabled by passing ::Disable here or on the command line
1272 // with --expand-variadics-override=disable.
1273 addPass(P: createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
1274
1275 // Function calls are not supported, so make sure we inline everything.
1276 addPass(P: createAMDGPUAlwaysInlinePass());
1277 addPass(P: createAlwaysInlinerLegacyPass());
1278
1279 // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
1280 if (TM.getTargetTriple().getArch() == Triple::r600)
1281 addPass(P: createR600OpenCLImageTypeLoweringPass());
1282
1283 // Make enqueued block runtime handles externally visible.
1284 addPass(P: createAMDGPUExportKernelRuntimeHandlesLegacyPass());
1285
1286 // Lower LDS accesses to global memory pass if address sanitizer is enabled.
1287 if (EnableSwLowerLDS)
1288 addPass(P: createAMDGPUSwLowerLDSLegacyPass(TM: &TM));
1289
1290 // Runs before PromoteAlloca so the latter can account for function uses
1291 if (EnableLowerModuleLDS) {
1292 addPass(P: createAMDGPULowerModuleLDSLegacyPass(TM: &TM));
1293 }
1294
1295 // Run atomic optimizer before Atomic Expand
1296 if ((TM.getTargetTriple().isAMDGCN()) &&
1297 (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
1298 (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
1299 addPass(P: createAMDGPUAtomicOptimizerPass(ScanStrategy: AMDGPUAtomicOptimizerStrategy));
1300 }
1301
1302 addPass(P: createAtomicExpandLegacyPass());
1303
1304 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1305 addPass(P: createAMDGPUPromoteAlloca());
1306
1307 if (isPassEnabled(Opt: EnableScalarIRPasses))
1308 addStraightLineScalarOptimizationPasses();
1309
1310 if (EnableAMDGPUAliasAnalysis) {
1311 addPass(P: createAMDGPUAAWrapperPass());
1312 addPass(P: createExternalAAWrapperPass(Callback: [](Pass &P, Function &,
1313 AAResults &AAR) {
1314 if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
1315 AAR.addAAResult(AAResult&: WrapperPass->getResult());
1316 }));
1317 }
1318
1319 if (TM.getTargetTriple().isAMDGCN()) {
1320 // TODO: May want to move later or split into an early and late one.
1321 addPass(P: createAMDGPUCodeGenPreparePass());
1322 }
1323
1324 // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
1325 // have expanded.
1326 if (TM.getOptLevel() > CodeGenOptLevel::Less)
1327 addPass(P: createLICMPass());
1328 }
1329
1330 TargetPassConfig::addIRPasses();
1331
1332 // EarlyCSE is not always strong enough to clean up what LSR produces. For
1333 // example, GVN can combine
1334 //
1335 // %0 = add %a, %b
1336 // %1 = add %b, %a
1337 //
1338 // and
1339 //
1340 // %0 = shl nsw %a, 2
1341 // %1 = shl %a, 2
1342 //
1343 // but EarlyCSE can do neither of them.
1344 if (isPassEnabled(Opt: EnableScalarIRPasses))
1345 addEarlyCSEOrGVNPass();
1346}
1347
1348void AMDGPUPassConfig::addCodeGenPrepare() {
1349 if (TM->getTargetTriple().isAMDGCN() &&
1350 TM->getOptLevel() > CodeGenOptLevel::None)
1351 addPass(P: createAMDGPUPreloadKernelArgumentsLegacyPass(TM));
1352
1353 if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments)
1354 addPass(P: createAMDGPULowerKernelArgumentsPass());
1355
1356 if (TM->getTargetTriple().isAMDGCN()) {
1357 // This lowering has been placed after codegenprepare to take advantage of
1358 // address mode matching (which is why it isn't put with the LDS lowerings).
1359 // It could be placed anywhere before uniformity annotations (an analysis
1360 // that it changes by splitting up fat pointers into their components)
1361 // but has been put before switch lowering and CFG flattening so that those
1362 // passes can run on the more optimized control flow this pass creates in
1363 // many cases.
1364 //
1365 // FIXME: This should ideally be put after the LoadStoreVectorizer.
1366 // However, due to some annoying facts about ResourceUsageAnalysis,
1367 // (especially as exercised in the resource-usage-dead-function test),
1368 // we need all the function passes codegenprepare all the way through
1369 // said resource usage analysis to run on the call graph produced
1370 // before codegenprepare runs (because codegenprepare will knock some
1371 // nodes out of the graph, which leads to function-level passes not
1372 // being run on them, which causes crashes in the resource usage analysis).
1373 addPass(P: createAMDGPULowerBufferFatPointersPass());
1374 // In accordance with the above FIXME, manually force all the
1375 // function-level passes into a CGSCCPassManager.
1376 addPass(P: new DummyCGSCCPass());
1377 }
1378
1379 TargetPassConfig::addCodeGenPrepare();
1380
1381 if (isPassEnabled(Opt: EnableLoadStoreVectorizer))
1382 addPass(P: createLoadStoreVectorizerPass());
1383
1384 // LowerSwitch pass may introduce unreachable blocks that can
1385 // cause unexpected behavior for subsequent passes. Placing it
1386 // here seems better that these blocks would get cleaned up by
1387 // UnreachableBlockElim inserted next in the pass flow.
1388 addPass(P: createLowerSwitchPass());
1389}
1390
1391bool AMDGPUPassConfig::addPreISel() {
1392 if (TM->getOptLevel() > CodeGenOptLevel::None)
1393 addPass(P: createFlattenCFGPass());
1394 return false;
1395}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// GCN Legacy Pass Setup
//===----------------------------------------------------------------------===//

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createSinkingPass());

  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createAMDGPULateCodeGenPrepareLegacyPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  addPass(createFixIrreduciblePass());
  addPass(createUnifyLoopExitsPass());
  addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions

  addPass(createAMDGPUAnnotateUniformValuesLegacy());
  addPass(createSIAnnotateControlFlowLegacyPass());
  // TODO: Move this right after structurizeCFG to avoid extra divergence
  // analysis. This depends on stopping SIAnnotateControlFlow from making
  // control flow modifications.
  addPass(createAMDGPURewriteUndefForPHILegacyPass());

  // SDAG requires LCSSA, GlobalISel does not. Disable LCSSA for -global-isel
  // with -new-reg-bank-select and without any of the fallback options.
  if (!getCGPassBuilderOption().EnableGlobalISelOption ||
      !isGlobalISelAbortEnabled() || !NewRegBankSelect)
    addPass(createLCSSAPass());

  if (TM->getOptLevel() > CodeGenOptLevel::Less)
    addPass(&AMDGPUPerfHintAnalysisLegacyID);

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
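  //
  // Schematically (a hedged sketch, not literal MIR), given
  //
  //   %imm = V_MOV_B32 42
  //   %cpy = COPY %imm
  //   %sum = V_ADD_U32 %cpy, %x
  //
  // copy cleanup lets SIFoldOperands rewrite the add to use the inline
  // constant 42 directly, after which the V_MOV_B32 and COPY become dead and
  // are cleaned up by DeadMachineInstructionElim.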
  addPass(&SIFoldOperandsLegacyID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineLegacyID);
  addPass(&SILoadStoreOptimizerLegacyID);
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(&SIPeepholeSDWALegacyID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSELegacyID);
    addPass(&SIFoldOperandsLegacyID);
  }
  addPass(&DeadMachineInstructionElimID);
  addPass(createSIShrinkInstructionsLegacyPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterLegacyID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesLegacyID);
  addPass(createSILowerI1CopiesLegacyPass());
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreLegalizeMachineIR() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
  addPass(new Localizer());
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

void GCNPassConfig::addPreRegBankSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
  addPass(createAMDGPUGlobalISelDivergenceLoweringPass());
}

bool GCNPassConfig::addRegBankSelect() {
  if (NewRegBankSelect) {
    addPass(createAMDGPURegBankSelectPass());
    addPass(createAMDGPURegBankLegalizePass());
  } else {
    addPass(new RegBankSelect());
  }
  return false;
}

void GCNPassConfig::addPreGlobalInstructionSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPURegBankCombiner(IsOptNone));
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect(getOptLevel()));
  return false;
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);

  insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);

  TargetPassConfig::addFastRegAlloc();
}

void GCNPassConfig::addOptimizedRegAlloc() {
  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  // FIXME: When an instruction has a killed operand and the instruction is
  // inside a bundle, it seems that only the BUNDLE instruction appears as the
  // kill of the register in LiveVariables. This triggers a verifier failure;
  // we should fix it and enable the verifier.
  if (OptVGPRLiveRange)
    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeLegacyID);

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);

  if (EnableRewritePartialRegUses)
    insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass(&MachineSchedulerID, &GCNPreRAOptimizationsID);

  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
  // instructions that cause scheduling barriers.
  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);

  if (OptExecMaskPreRA)
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM->getOptLevel() > CodeGenOptLevel::Less)
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  if (EnableRegReassign)
    addPass(&GCNNSAReassignID);

  addPass(&AMDGPURewriteAGPRCopyMFMALegacyID);
  return true;
}

FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyRegisterAllocator(onlyAllocateSGPRs);

  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
                  initializeDefaultVGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyVGPRRegisterAllocator();

  return createFastVGPRRegisterAllocator();
}

FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultWWMRegisterAllocatorFlag,
                  initializeDefaultWWMRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyWWMRegisterAllocator();

  return createFastWWMRegisterAllocator();
}

FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
    "-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, "
    "and -vgpr-regalloc";
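
// For example (a hedged sketch; the option names come from the SGPR/WWM/VGPR
// RegisterRegAlloc registries used above, and the exact value spellings may
// vary by build):
//
//   llc -mtriple=amdgcn -sgpr-regalloc=greedy -wwm-regalloc=fast \
//       -vgpr-regalloc=greedy foo.ll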

bool GCNPassConfig::addRegAssignAndRewriteFast() {
  if (!usingDefaultRegAlloc())
    reportFatalUsageError(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsLegacyID);

  // To allocate WWM registers used in whole quad mode operations (for shaders).
  addPass(&SIPreAllocateWWMRegsLegacyID);

  // For allocating other WWM register operands.
  addPass(createWWMRegAllocPass(false));

  addPass(&SILowerWWMCopiesLegacyID);
  addPass(&AMDGPUReserveWWMRegsLegacyID);

  // For allocating per-thread VGPRs.
  addPass(createVGPRAllocPass(false));

  return true;
}

bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    reportFatalUsageError(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(createVirtRegRewriter(false));

  // At this point, the sgpr-regalloc has been done and it is good to have the
  // stack slot coloring to try to optimize the SGPR spill stack indices before
  // attempting the custom SGPR spill lowering.
  addPass(&StackSlotColoringID);

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsLegacyID);

  // To allocate WWM registers used in whole quad mode operations (for shaders).
  addPass(&SIPreAllocateWWMRegsLegacyID);

  // For allocating other whole wave mode registers.
  addPass(createWWMRegAllocPass(true));
  addPass(&SILowerWWMCopiesLegacyID);
  addPass(createVirtRegRewriter(false));
  addPass(&AMDGPUReserveWWMRegsLegacyID);

  // For allocating per-thread VGPRs.
  addPass(createVGPRAllocPass(true));

  addPreRewrite();
  addPass(&VirtRegRewriterID);

  addPass(&AMDGPUMarkLastScratchLoadID);

  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIOptimizeExecMaskingLegacyID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createSIShrinkInstructionsLegacyPass());
  addPass(&SIPostRABundlerLegacyID);
}

void GCNPassConfig::addPreEmitPass() {
  if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
    addPass(&GCNCreateVOPDID);
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
    addPass(createAMDGPUSetWavePriorityPass());
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler is not
  // guaranteed to handle all hazards correctly. This is because if there are
  // multiple scheduling regions in a basic block, the regions are scheduled
  // bottom up, so when we begin to schedule a region we don't know what
  // instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  addPass(&AMDGPUWaitSGPRHazardsLegacyID);

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
    addPass(&AMDGPUInsertDelayAluID);

  addPass(&BranchRelaxationPassID);
}

void GCNPassConfig::addPostBBSections() {
  // We run this later to avoid passes like livedebugvalues and BBSections
  // having to deal with the apparent multi-entry functions we may generate.
  addPass(createAMDGPUPreloadKernArgPrologLegacyPass());
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

void GCNTargetMachine::registerMachineRegisterInfoCallback(
    MachineFunction &MF) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MF.getRegInfo().addDelegate(MFI);
}

MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
      Allocator, F, static_cast<const GCNSubtarget *>(STI));
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fix up the subtarget-dependent default value.
    MFI->Occupancy = ST.getOccupancyWithWorkGroupSizes(MF).second;
  }

  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };
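
  // An empty register string means the field was not serialized in the YAML;
  // treat it as absent rather than as a parse error.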
  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
                            MFI->LongBranchReservedReg))
    return true;

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         {}, {});
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(ParsedReg);
  }

  for (const auto &[_, Info] : PFS.VRegInfosNamed) {
    MFI->setFlag(Info->VReg, Info->Flags);
  }
  for (const auto &[_, Info] : PFS.VRegInfos) {
    MFI->setFlag(Info->VReg, Info->Flags);
  }

  for (const auto &YamlRegStr : YamlMFI.SpillPhysVGPRS) {
    Register ParsedReg;
    if (parseRegister(YamlRegStr, ParsedReg))
      return true;
    MFI->SpillPhysVGPRs.push_back(ParsedReg);
  }
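
  // parseAndCheckArgument parses one ABI argument descriptor from the YAML:
  // it validates the register class, records either the register or the stack
  // offset, applies the optional mask, and accumulates the user/system SGPR
  // counts on the function info.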
  auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, *A->Mask);

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.LDSKernelId, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  if (ST.hasIEEEMode())
    MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  if (ST.hasDX10ClampMode())
    MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;

  // FIXME: Move proper support for denormal-fp-math into base MachineFunction
  MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
                                      ? DenormalMode::IEEE
                                      : DenormalMode::PreserveSign;
  MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
                                       ? DenormalMode::IEEE
                                       : DenormalMode::PreserveSign;

  MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
                                          ? DenormalMode::IEEE
                                          : DenormalMode::PreserveSign;
  MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
                                           ? DenormalMode::IEEE
                                           : DenormalMode::PreserveSign;

  if (YamlMFI.HasInitWholeWave)
    MFI->setInitWholeWave();

  return false;
}

//===----------------------------------------------------------------------===//
// AMDGPU CodeGen Pass Builder interface.
//===----------------------------------------------------------------------===//

AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder(
    GCNTargetMachine &TM, const CGPassBuilderOption &Opts,
    PassInstrumentationCallbacks *PIC)
    : CodeGenPassBuilder(TM, Opts, PIC) {
  Opt.MISchedPostRA = true;
  Opt.RequiresCodeGenSCCOrder = true;
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  // Garbage collection is not supported.
  disablePass<StackMapLivenessPass, FuncletLayoutPass,
              ShadowStackGCLoweringPass>();
}

void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
  if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN())
    addPass(AMDGPURemoveIncompatibleFunctionsPass(TM));

  addPass(AMDGPUPrintfRuntimeBindingPass());
  if (LowerCtorDtor)
    addPass(AMDGPUCtorDtorLoweringPass());

  if (isPassEnabled(EnableImageIntrinsicOptimizer))
    addPass(AMDGPUImageIntrinsicOptimizerPass(TM));

  // This can be disabled by passing ::Disable here or on the command line
  // with --expand-variadics-override=disable.
  addPass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering));

  addPass(AMDGPUAlwaysInlinePass());
  addPass(AlwaysInlinerPass());

  addPass(AMDGPUExportKernelRuntimeHandlesPass());

  if (EnableSwLowerLDS)
    addPass(AMDGPUSwLowerLDSPass(TM));

  // Runs before PromoteAlloca so the latter can account for function uses.
  if (EnableLowerModuleLDS)
    addPass(AMDGPULowerModuleLDSPass(TM));

  // Run the atomic optimizer before AtomicExpand.
  if (TM.getOptLevel() >= CodeGenOptLevel::Less &&
      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None))
    addPass(AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy));

  addPass(AtomicExpandPass(&TM));

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addPass(AMDGPUPromoteAllocaPass(TM));
    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses(addPass);

    // TODO: Handle EnableAMDGPUAliasAnalysis

    // TODO: May want to move later or split into an early and late one.
    addPass(AMDGPUCodeGenPreparePass(TM));

    // TODO: LICM
  }

  Base::addIRPasses(addPass);

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass(addPass);
}

void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(AMDGPUPreloadKernelArgumentsPass(TM));

  if (EnableLowerKernelArguments)
    addPass(AMDGPULowerKernelArgumentsPass(TM));

  // This lowering has been placed after codegenprepare to take advantage of
  // address mode matching (which is why it isn't put with the LDS lowerings).
  // It could be placed anywhere before uniformity annotations (an analysis
  // that it changes by splitting up fat pointers into their components)
  // but has been put before switch lowering and CFG flattening so that those
  // passes can run on the more optimized control flow this pass creates in
  // many cases.
  //
  // FIXME: This should ideally be put after the LoadStoreVectorizer.
  // However, due to some annoying facts about ResourceUsageAnalysis
  // (especially as exercised in the resource-usage-dead-function test),
  // we need all the function passes from codegenprepare all the way through
  // said resource usage analysis to run on the call graph produced
  // before codegenprepare runs (because codegenprepare will knock some
  // nodes out of the graph, which leads to function-level passes not
  // being run on them, which causes crashes in the resource usage analysis).
  addPass(AMDGPULowerBufferFatPointersPass(TM));

  Base::addCodeGenPrepare(addPass);

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(LoadStoreVectorizerPass());

  // The LowerSwitch pass may introduce unreachable blocks that can cause
  // unexpected behavior for subsequent passes. Placing it here lets those
  // blocks be cleaned up by UnreachableBlockElim, which is inserted next in
  // the pass flow.
  addPass(LowerSwitchPass());
}

void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addPass(FlattenCFGPass());
    addPass(SinkingPass());
    addPass(AMDGPULateCodeGenPreparePass(TM));
  }

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(AMDGPUUnifyDivergentExitNodesPass());
  addPass(FixIrreduciblePass());
  addPass(UnifyLoopExitsPass());
  addPass(StructurizeCFGPass(/*SkipUniformRegions=*/false));

  addPass(AMDGPUAnnotateUniformValuesPass());

  addPass(SIAnnotateControlFlowPass(TM));

  // TODO: Move this right after structurizeCFG to avoid extra divergence
  // analysis. This depends on stopping SIAnnotateControlFlow from making
  // control flow modifications.
  addPass(AMDGPURewriteUndefForPHIPass());

  if (!getCGPassBuilderOption().EnableGlobalISelOption ||
      !isGlobalISelAbortEnabled() || !NewRegBankSelect)
    addPass(LCSSAPass());

  if (TM.getOptLevel() > CodeGenOptLevel::Less)
    addPass(AMDGPUPerfHintAnalysisPass(TM));

  // FIXME: Why isn't this queried as required from AMDGPUISelDAGToDAG, and why
  // isn't this in addInstSelector?
  addPass(RequireAnalysisPass<UniformityInfoAnalysis, Function>());
}

void AMDGPUCodeGenPassBuilder::addILPOpts(AddMachinePass &addPass) const {
  if (EnableEarlyIfConversion)
    addPass(EarlyIfConverterPass());

  Base::addILPOpts(addPass);
}

void AMDGPUCodeGenPassBuilder::addAsmPrinter(AddMachinePass &addPass,
                                             CreateMCStreamer) const {
  // TODO: Add AsmPrinter.
}

Error AMDGPUCodeGenPassBuilder::addInstSelector(AddMachinePass &addPass) const {
  addPass(AMDGPUISelDAGToDAGPass(TM));
  addPass(SIFixSGPRCopiesPass());
  addPass(SILowerI1CopiesPass());
  return Error::success();
}

void AMDGPUCodeGenPassBuilder::addPreRewrite(AddMachinePass &addPass) const {
  if (EnableRegReassign) {
    addPass(GCNNSAReassignPass());
  }
}

void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization(
    AddMachinePass &addPass) const {
  Base::addMachineSSAOptimization(addPass);

  addPass(SIFoldOperandsPass());
  if (EnableDPPCombine) {
    addPass(GCNDPPCombinePass());
  }
  addPass(SILoadStoreOptimizerPass());
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(SIPeepholeSDWAPass());
    addPass(EarlyMachineLICMPass());
    addPass(MachineCSEPass());
    addPass(SIFoldOperandsPass());
  }
  addPass(DeadMachineInstructionElimPass());
  addPass(SIShrinkInstructionsPass());
}
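
// This mirrors the legacy GCNPassConfig::addRegAssignAndRewriteOptimized()
// sequence above: SGPR allocation, SGPR spill lowering, WWM allocation, and
// finally per-thread VGPR allocation, with virtual-register rewrites in
// between.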
Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
    AddMachinePass &addPass) const {
  // TODO: Check --regalloc-npm option

  addPass(GCNPreRALongBranchRegPass());

  addPass(RAGreedyPass({onlyAllocateSGPRs, "sgpr"}));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(VirtRegRewriterPass(false));

  // At this point, the sgpr-regalloc has been done and it is good to have the
  // stack slot coloring to try to optimize the SGPR spill stack indices before
  // attempting the custom SGPR spill lowering.
  addPass(StackSlotColoringPass());

  // Equivalent of PEI for SGPRs.
  addPass(SILowerSGPRSpillsPass());

  // To allocate WWM registers used in whole quad mode operations (for shaders).
  addPass(SIPreAllocateWWMRegsPass());

  // For allocating other WWM register operands.
  // addRegAlloc<RAGreedyPass>(addPass, RegAllocPhase::WWM);
  addPass(RAGreedyPass({onlyAllocateWWMRegs, "wwm"}));
  addPass(SILowerWWMCopiesPass());
  addPass(VirtRegRewriterPass(false));
  addPass(AMDGPUReserveWWMRegsPass());

  // For allocating per-thread VGPRs.
  // addRegAlloc<RAGreedyPass>(addPass, RegAllocPhase::VGPR);
  addPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"}));

  addPreRewrite(addPass);
  addPass(VirtRegRewriterPass(true));

  // TODO: addPass(AMDGPUMarkLastScratchLoadPass());
  return Error::success();
}

void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const {
  addPass(SIFixVGPRCopiesPass());
  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(SIOptimizeExecMaskingPass());
  Base::addPostRegAlloc(addPass);
}

void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
  if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) {
    addPass(GCNCreateVOPDPass());
  }

  addPass(SIMemoryLegalizerPass());
  addPass(SIInsertWaitcntsPass());

  // TODO: addPass(SIModeRegisterPass());

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    // TODO: addPass(SIInsertHardClausesPass());
  }

  addPass(SILateBranchLoweringPass());

  if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
    addPass(AMDGPUSetWavePriorityPass());

  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(SIPreEmitPeepholePass());

  // The hazard recognizer that runs as part of the post-ra scheduler is not
  // guaranteed to handle all hazards correctly. This is because if there are
  // multiple scheduling regions in a basic block, the regions are scheduled
  // bottom up, so when we begin to schedule a region we don't know what
  // instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(PostRAHazardRecognizerPass());
  addPass(AMDGPUWaitSGPRHazardsPass());

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) {
    addPass(AMDGPUInsertDelayAluPass());
  }

  addPass(BranchRelaxationPass());
}
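
// Returns true if the pass gated by Opt should run: an explicit occurrence of
// the flag on the command line always wins; otherwise the pass is enabled only
// when the optimization level is at least Level.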
bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt,
                                             CodeGenOptLevel Level) const {
  if (Opt.getNumOccurrences())
    return Opt;
  if (TM.getOptLevel() < Level)
    return false;
  return Opt;
}

void AMDGPUCodeGenPassBuilder::addEarlyCSEOrGVNPass(AddIRPass &addPass) const {
  if (TM.getOptLevel() == CodeGenOptLevel::Aggressive)
    addPass(GVNPass());
  else
    addPass(EarlyCSEPass());
}

void AMDGPUCodeGenPassBuilder::addStraightLineScalarOptimizationPasses(
    AddIRPass &addPass) const {
  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
    addPass(LoopDataPrefetchPass());

  addPass(SeparateConstOffsetFromGEPPass());

  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(StraightLineStrengthReducePass());

  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass(addPass);

  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(NaryReassociatePass());

  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(EarlyCSEPass());
}