//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for SI+ GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCodeGenPassBuilder.h"
#include "AMDGPUCtorDtorLowering.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPURegBankSelect.h"
#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUUnifyDivergentExitNodes.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "GCNVOPDUtils.h"
#include "R600.h"
#include "R600MachineFunctionInfo.h"
#include "R600TargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/HipStdPar/HipStdPar.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/ExpandVariadics.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

namespace {
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
public:
  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
                              const MachineRegisterInfo &MRI,
                              const Register Reg) {
  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
}

static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
                              const MachineRegisterInfo &MRI,
                              const Register Reg) {
  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
}

/// -{sgpr|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;

static SGPRRegisterRegAlloc
    defaultSGPRRegAlloc("default",
                        "pick SGPR register allocator based on -O option",
                        useDefaultRegisterAllocator);

static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<SGPRRegisterRegAlloc>>
    SGPRRegAlloc("sgpr-regalloc", cl::Hidden,
                 cl::init(&useDefaultRegisterAllocator),
                 cl::desc("Register allocator to use for SGPRs"));

static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<VGPRRegisterRegAlloc>>
    VGPRRegAlloc("vgpr-regalloc", cl::Hidden,
                 cl::init(&useDefaultRegisterAllocator),
                 cl::desc("Register allocator to use for VGPRs"));
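
// Example (illustrative): the two allocators can be selected independently,
// e.g. "llc -mtriple=amdgcn -sgpr-regalloc=greedy -vgpr-regalloc=fast ...".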

static void initializeDefaultSGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = SGPRRegAlloc;
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
  }
}

static void initializeDefaultVGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = VGPRRegAlloc;
    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
  }
}

static FunctionPass *createBasicSGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createGreedySGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createFastSGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

static FunctionPass *createBasicVGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createGreedyVGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createFastVGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}

static SGPRRegisterRegAlloc basicRegAllocSGPR(
    "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
    "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);

static SGPRRegisterRegAlloc fastRegAllocSGPR(
    "fast", "fast register allocator", createFastSGPRRegisterAllocator);

static VGPRRegisterRegAlloc basicRegAllocVGPR(
    "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
static VGPRRegisterRegAlloc greedyRegAllocVGPR(
    "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);

static VGPRRegisterRegAlloc fastRegAllocVGPR(
    "fast", "fast register allocator", createFastVGPRRegisterAllocator);
} // anonymous namespace

static cl::opt<bool>
    EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                            cl::desc("Run early if-conversion"),
                            cl::init(false));

static cl::opt<bool>
    OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                     cl::desc("Run pre-RA exec mask optimizations"),
                     cl::init(true));

static cl::opt<bool>
    LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
                  cl::desc("Lower GPU ctor / dtors to globals on the device."),
                  cl::init(true), cl::Hidden);

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
    "amdgpu-load-store-vectorizer",
    cl::desc("Enable load store vectorizer"),
    cl::init(true),
    cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
    "amdgpu-scalarize-global-loads",
    cl::desc("Enable global load scalarization"),
    cl::init(true),
    cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
    "amdgpu-internalize-symbols",
    cl::desc("Enable elimination of non-kernel functions and unused globals"),
    cl::init(false),
    cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
    "amdgpu-early-inline-all",
    cl::desc("Inline all functions early"),
    cl::init(false),
    cl::Hidden);

static cl::opt<bool> RemoveIncompatibleFunctions(
    "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
    cl::desc("Enable removal of functions when they "
             "use features not supported by the target GPU"),
    cl::init(true));

static cl::opt<bool> EnableSDWAPeephole(
    "amdgpu-sdwa-peephole",
    cl::desc("Enable SDWA peepholer"),
    cl::init(true));

static cl::opt<bool> EnableDPPCombine(
    "amdgpu-dpp-combine",
    cl::desc("Enable DPP combiner"),
    cl::init(true));

// Enable address space based alias analysis
static cl::opt<bool>
    EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
                              cl::desc("Enable AMDGPU Alias Analysis"),
                              cl::init(true));

// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
    "amdgpu-late-structurize",
    cl::desc("Enable late CFG structurization"),
    cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
    cl::Hidden);

// Disable structurizer-based control-flow lowering in order to test convergence
// control tokens. This should eventually be replaced by the wave-transform.
static cl::opt<bool, true> DisableStructurizer(
    "amdgpu-disable-structurizer",
    cl::desc("Disable structurizer for experiments; produces unusable code"),
    cl::location(AMDGPUTargetMachine::DisableStructurizer), cl::ReallyHidden);

// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
    "amdgpu-simplify-libcall",
    cl::desc("Enable amdgpu library simplifications"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
    "amdgpu-ir-lower-kernel-arguments",
    cl::desc("Lower kernel argument loads in IR pass"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableRegReassign(
    "amdgpu-reassign-regs",
    cl::desc("Enable register reassign optimizations on gfx10+"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> OptVGPRLiveRange(
    "amdgpu-opt-vgpr-liverange",
    cl::desc("Enable VGPR liverange optimizations for if-else structure"),
    cl::init(true), cl::Hidden);

static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
    "amdgpu-atomic-optimizer-strategy",
    cl::desc("Select DPP or Iterative strategy for scan"),
    cl::init(ScanOptions::Iterative),
    cl::values(
        clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
        clEnumValN(ScanOptions::Iterative, "Iterative",
                   "Use Iterative approach for scan"),
        clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
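
// For example (illustrative), the scan strategy can be forced from the command
// line with "-amdgpu-atomic-optimizer-strategy=DPP".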

// Enable Mode register optimization
static cl::opt<bool> EnableSIModeRegisterPass(
    "amdgpu-mode-register",
    cl::desc("Enable mode register pass"),
    cl::init(true),
    cl::Hidden);

// Enable GFX11.5+ s_singleuse_vdst insertion
static cl::opt<bool>
    EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
                              cl::desc("Enable s_singleuse_vdst insertion"),
                              cl::init(false), cl::Hidden);

// Enable GFX11+ s_delay_alu insertion
static cl::opt<bool>
    EnableInsertDelayAlu("amdgpu-enable-delay-alu",
                         cl::desc("Enable s_delay_alu insertion"),
                         cl::init(true), cl::Hidden);

// Enable GFX11+ VOPD
static cl::opt<bool>
    EnableVOPD("amdgpu-enable-vopd",
               cl::desc("Enable VOPD, dual issue of VALU in wave32"),
               cl::init(true), cl::Hidden);
// This option is used in lit tests to prevent dead-coding of the patterns
// being inspected.
static cl::opt<bool>
    EnableDCEInRA("amdgpu-dce-in-ra",
                  cl::init(true), cl::Hidden,
                  cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
                                           cl::desc("Adjust wave priority"),
                                           cl::init(false), cl::Hidden);

static cl::opt<bool> EnableScalarIRPasses(
    "amdgpu-scalar-ir-passes",
    cl::desc("Enable scalar IR passes"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableStructurizerWorkarounds(
    "amdgpu-enable-structurizer-workarounds",
    cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool, true> EnableLowerModuleLDS(
    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePreRAOptimizations(
    "amdgpu-enable-pre-ra-optimizations",
    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePromoteKernelArguments(
    "amdgpu-enable-promote-kernel-arguments",
    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
    cl::Hidden, cl::init(true));

static cl::opt<bool> EnableImageIntrinsicOptimizer(
    "amdgpu-enable-image-intrinsic-optimizer",
    cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool>
    EnableLoopPrefetch("amdgpu-loop-prefetch",
                       cl::desc("Enable loop data prefetch on AMDGPU"),
                       cl::Hidden, cl::init(false));

static cl::opt<bool> EnableMaxIlpSchedStrategy(
    "amdgpu-enable-max-ilp-scheduling-strategy",
    cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
    cl::Hidden, cl::init(false));

static cl::opt<bool> EnableRewritePartialRegUses(
    "amdgpu-enable-rewrite-partial-reg-uses",
    cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableHipStdPar(
    "amdgpu-enable-hipstdpar",
    cl::desc("Enable HIP Standard Parallelism Offload support"),
    cl::init(false), cl::Hidden);

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUDAGToDAGISelLegacyPass(*PR);
  initializeGCNDPPCombinePass(*PR);
  initializeSILowerI1CopiesPass(*PR);
  initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
  initializeSILowerWWMCopiesPass(*PR);
  initializeAMDGPUMarkLastScratchLoadPass(*PR);
  initializeSILowerSGPRSpillsPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFixVGPRCopiesPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIPeepholeSDWAPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIOptimizeExecMaskingPreRAPass(*PR);
  initializeSIOptimizeVGPRLiveRangePass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPUAttributorLegacyPass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPUPromoteKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  initializeAMDGPUPostLegalizerCombinerPass(*PR);
  initializeAMDGPUPreLegalizerCombinerPass(*PR);
  initializeAMDGPURegBankCombinerPass(*PR);
  initializeAMDGPURegBankSelectPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUPromoteAllocaToVectorPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPULateCodeGenPreparePass(*PR);
  initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
  initializeAMDGPULowerModuleLDSLegacyPass(*PR);
  initializeAMDGPULowerBufferFatPointersPass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeAMDGPUInsertSingleUseVDSTPass(*PR);
  initializeAMDGPUInsertDelayAluPass(*PR);
  initializeSIInsertHardClausesPass(*PR);
  initializeSIInsertWaitcntsPass(*PR);
  initializeSIModeRegisterPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIPreEmitPeepholePass(*PR);
  initializeSILateBranchLoweringPass(*PR);
  initializeSIMemoryLegalizerPass(*PR);
  initializeSIOptimizeExecMaskingPass(*PR);
  initializeSIPreAllocateWWMRegsPass(*PR);
  initializeSIFormMemoryClausesPass(*PR);
  initializeSIPostRABundlerPass(*PR);
  initializeGCNCreateVOPDPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUExternalAAWrapperPass(*PR);
  initializeAMDGPUImageIntrinsicOptimizerPass(*PR);
  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
  initializeAMDGPUResourceUsageAnalysisPass(*PR);
  initializeGCNNSAReassignPass(*PR);
  initializeGCNPreRAOptimizationsPass(*PR);
  initializeGCNPreRALongBranchRegPass(*PR);
  initializeGCNRewritePartialRegUsesPass(*PR);
  initializeGCNRegPressurePrinterPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
      C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
      new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto DAG = new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static MachineSchedRegistry
    SISchedRegistry("si", "Run SI's custom scheduler",
                    createSIMachineScheduler);

static MachineSchedRegistry
    GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                                 "Run GCN scheduler to maximize occupancy",
                                 createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
    GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
                           createGCNMaxILPMachineScheduler);

static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
    "gcn-iterative-max-occupancy-experimental",
    "Run GCN scheduler to maximize occupancy (experimental)",
    createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry GCNMinRegSchedRegistry(
    "gcn-iterative-minreg",
    "Run GCN iterative scheduler for minimal register usage (experimental)",
    createMinRegScheduler);

static MachineSchedRegistry GCNILPSchedRegistry(
    "gcn-iterative-ilp",
    "Run GCN iterative scheduler for ILP scheduling (experimental)",
    createIterativeILPMachineScheduler);
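
// Any of the schedulers registered above can be selected explicitly through
// the machine-scheduler registry option (illustrative), e.g.
// "-misched=gcn-max-ilp".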

static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat. 160-bit non-integral fat buffer pointers that include a 128-bit
  // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
  // (address space 7), and 128-bit non-integral buffer resources (address
  // space 8) which cannot be non-trivially accessed by LLVM memory operations
  // like getelementptr.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-"
         "v32:32-v48:64-v96:"
         "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
         "G1-ni:7:8:9";
}
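
// A quick key to the less obvious data-layout tokens above: "pN:<size>:<abi>"
// gives the pointer size and alignment for address space N, "A5" selects
// address space 5 as the alloca address space, "G1" selects address space 1
// as the default globals address space, "S32" is the natural stack alignment
// in bits, "n32:64" lists the native integer widths, and "ni:7:8:9" marks
// address spaces 7, 8 and 9 as non-integral.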

LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         const TargetOptions &Options,
                                         std::optional<Reloc::Model> RM,
                                         std::optional<CodeModel::Model> CM,
                                         CodeGenOptLevel OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM),
                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
  if (TT.getArch() == Triple::amdgcn) {
    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
  }
}

bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
bool AMDGPUTargetMachine::DisableStructurizer = false;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.isValid() ? FSAttr.getValueAsString()
                          : getTargetFeatureString();
}

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || F->getName().starts_with("__asan_") ||
           F->getName().starts_with("__sanitizer_") ||
           AMDGPU::isEntryFunctionCC(F->getCallingConv());

  GV.removeDeadConstantUsers();
  return !GV.use_empty();
}

void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<AMDGPUAA>();
}

static Expected<ScanOptions>
parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
  if (Params.empty())
    return ScanOptions::Iterative;
  Params.consume_front("strategy=");
  auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
                    .Case("dpp", ScanOptions::DPP)
                    .Cases("iterative", "", ScanOptions::Iterative)
                    .Case("none", ScanOptions::None)
                    .Default(std::nullopt);
  if (Result)
    return *Result;
  return make_error<StringError>("invalid parameter", inconvertibleErrorCode());
}
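
// This parser backs the new-pass-manager registration of the atomic optimizer,
// so the strategy can also be chosen when invoking the pass directly, e.g.
// (illustrative): opt -passes='amdgpu-atomic-optimizer<strategy=dpp>'.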

Error AMDGPUTargetMachine::buildCodeGenPipeline(
    ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
    CodeGenFileType FileType, const CGPassBuilderOption &Opts,
    PassInstrumentationCallbacks *PIC) {
  AMDGPUCodeGenPassBuilder CGPB(*this, Opts, PIC);
  return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
}

void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {

#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"

  PB.registerPipelineStartEPCallback(
      [](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
        if (EnableHipStdPar)
          PM.addPass(HipStdParAcceleratorCodeSelectionPass());
      });

  PB.registerPipelineEarlySimplificationEPCallback(
      [](ModulePassManager &PM, OptimizationLevel Level) {
        PM.addPass(AMDGPUPrintfRuntimeBindingPass());

        if (Level == OptimizationLevel::O0)
          return;

        PM.addPass(AMDGPUUnifyMetadataPass());

        if (InternalizeSymbols) {
          PM.addPass(InternalizePass(mustPreserveGV));
          PM.addPass(GlobalDCEPass());
        }

        if (EarlyInlineAll && !EnableFunctionCalls)
          PM.addPass(AMDGPUAlwaysInlinePass());
      });

  PB.registerPeepholeEPCallback(
      [](FunctionPassManager &FPM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FPM.addPass(AMDGPUUseNativeCallsPass());
        if (EnableLibCallSimplify)
          FPM.addPass(AMDGPUSimplifyLibCallsPass());
      });

  PB.registerCGSCCOptimizerLateEPCallback(
      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(AMDGPULowerKernelAttributesPass());

        if (Level != OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
      });

  // FIXME: Why is AMDGPUAttributor not in CGSCC?
  PB.registerOptimizerLastEPCallback(
      [this](ModulePassManager &MPM, OptimizationLevel Level) {
        if (Level != OptimizationLevel::O0) {
          MPM.addPass(AMDGPUAttributorPass(*this));
        }
      });

  PB.registerFullLinkTimeOptimizationLastEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        // We want to support the -lto-partitions=N option as "best effort".
        // For that, we need to lower LDS earlier in the pipeline before the
        // module is partitioned for codegen.
        if (EnableLowerModuleLDS)
          PM.addPass(AMDGPULowerModuleLDSPass(*this));
      });

  PB.registerRegClassFilterParsingCallback(
      [](StringRef FilterName) -> RegAllocFilterFunc {
        if (FilterName == "sgpr")
          return onlyAllocateSGPRs;
        if (FilterName == "vgpr")
          return onlyAllocateVGPRs;
        return nullptr;
      });
}

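// Private, local and region memory use an all-ones pattern (-1) as the null
// pointer value; every other address space uses zero.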
int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
  return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
          AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
          AddrSpace == AMDGPUAS::REGION_ADDRESS)
             ? -1
             : 0;
}

bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
         AMDGPU::isFlatGlobalAddrSpace(DestAS);
}

unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // It must be a load of a generic (flat) pointer.
  assert(V->getType()->isPointerTy() &&
         V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // A generic pointer loaded from constant memory can be assumed to be a
  // global pointer, since constant memory is only populated on the host side
  // and, as implied by the offload programming model, only global pointers
  // can be referenced on the host side.
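  // A typical case (illustrative) is a flat pointer loaded from a
  // constant-address-space location such as a kernel argument:
  //   %p = load ptr, ptr addrspace(4) %kernarg.ptr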
  return AMDGPUAS::GLOBAL_ADDRESS;
}

std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
    case Intrinsic::amdgcn_is_private:
      return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
    default:
      break;
    }
    return std::pair(nullptr, -1);
  }
  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that logical 'and' is commutative
  // and the order of 'is_shared' and 'is_private' is not significant.
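  // In IR the matched pattern looks roughly like (illustrative):
  //   %s = call i1 @llvm.amdgcn.is.shared(ptr %p)
  //   %ns = xor i1 %s, true
  //   %pr = call i1 @llvm.amdgcn.is.private(ptr %p)
  //   %np = xor i1 %pr, true
  //   %global = and i1 %ns, %np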
  Value *Ptr;
  if (match(
          const_cast<Value *>(V),
          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                      m_Deferred(Ptr))))))
    return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);

  return std::pair(nullptr, -1);
}

unsigned
AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
  switch (Kind) {
  case PseudoSourceValue::Stack:
  case PseudoSourceValue::FixedStack:
    return AMDGPUAS::PRIVATE_ADDRESS;
  case PseudoSourceValue::ConstantPool:
  case PseudoSourceValue::GOT:
  case PseudoSourceValue::JumpTable:
  case PseudoSourceValue::GlobalValueCallEntry:
  case PseudoSourceValue::ExternalSymbolCallEntry:
    return AMDGPUAS::CONSTANT_ADDRESS;
  }
  return AMDGPUAS::FLAT_ADDRESS;
}

bool AMDGPUTargetMachine::splitModule(
    Module &M, unsigned NumParts,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
  // FIXME(?): Would be better to use an already existing Analysis/PassManager,
  // but all current users of this API don't have one ready and would need to
  // create one anyway. Let's hide the boilerplate for now to keep it simple.

  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  PassBuilder PB(this);
  PB.registerModuleAnalyses(MAM);
  PB.registerFunctionAnalyses(FAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(AMDGPUSplitModulePass(NumParts, ModuleCallback));
  MPM.run(M, MAM);
  return true;
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   const TargetOptions &Options,
                                   std::optional<Reloc::Model> RM,
                                   std::optional<CodeModel::Model> CM,
                                   CodeGenOptLevel OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const TargetSubtargetInfo *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

namespace {

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
      : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  ScheduleDAGInstrs *
  createPostMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
        C, std::make_unique<PostGenericScheduler>(C),
        /*RemoveKillFlags=*/true);
    const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    if (ST.shouldClusterStores())
      DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
    DAG->addMutation(
        createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
    if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
      DAG->addMutation(createVOPDPairingMutation());
    return DAG;
  }

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  void addPreRegBankSelect() override;
  bool addRegBankSelect() override;
  void addPreGlobalInstructionSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  FunctionPass *createSGPRAllocPass(bool Optimized);
  FunctionPass *createVGPRAllocPass(bool Optimized);
  FunctionPass *createRegAllocPass(bool Optimized) override;

  bool addRegAssignAndRewriteFast() override;
  bool addRegAssignAndRewriteOptimized() override;

  void addPreRegAlloc() override;
  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  // Garbage collection is not supported.
  disablePass(&GCLoweringID);
  disablePass(&ShadowStackGCLoweringID);
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOptLevel::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
    addPass(createLoopDataPrefetchPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  Triple::ArchType Arch = TM.getTargetTriple().getArch();
  if (RemoveIncompatibleFunctions && Arch == Triple::amdgcn)
    addPass(createAMDGPURemoveIncompatibleFunctionsPass(&TM));

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());
  if (LowerCtorDtor)
    addPass(createAMDGPUCtorDtorLoweringLegacyPass());

  if (isPassEnabled(EnableImageIntrinsicOptimizer))
    addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));

  // This can be disabled by passing ::Disable here or on the command line
  // with --expand-variadics-override=disable.
  addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (Arch == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  // Runs before PromoteAlloca so the latter can account for function uses
  if (EnableLowerModuleLDS) {
    addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
  }

  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(createInferAddressSpacesPass());

  // Run atomic optimizer before Atomic Expand
  if ((TM.getTargetTriple().getArch() == Triple::amdgcn) &&
      (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
    addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
  }

  addPass(createAtomicExpandLegacyPass());

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addPass(createAMDGPUPromoteAlloca());

    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
      }));
    }

    if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
      // TODO: May want to move later or split into an early and late one.
      addPass(createAMDGPUCodeGenPreparePass());
    }

    // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
    // have expanded.
    if (TM.getOptLevel() > CodeGenOptLevel::Less)
      addPass(createLICMPass());
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  // %0 = add %a, %b
  // %1 = add %b, %a
  //
  // and
  //
  // %0 = shl nsw %a, 2
  // %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    // FIXME: This pass adds 2 hacky attributes that can be replaced with an
    // analysis, and should be removed.
    addPass(createAMDGPUAnnotateKernelFeaturesPass());
  }

  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    // This lowering has been placed after codegenprepare to take advantage of
    // address mode matching (which is why it isn't put with the LDS lowerings).
    // It could be placed anywhere before uniformity annotations (an analysis
    // that it changes by splitting up fat pointers into their components)
    // but has been put before switch lowering and CFG flattening so that those
    // passes can run on the more optimized control flow this pass creates in
    // many cases.
    //
    // FIXME: This should ideally be put after the LoadStoreVectorizer.
    // However, due to some annoying facts about ResourceUsageAnalysis
    // (especially as exercised in the resource-usage-dead-function test),
    // we need all the function passes from codegenprepare all the way through
    // said resource usage analysis to run on the call graph produced
    // before codegenprepare runs (because codegenprepare will knock some
    // nodes out of the graph, which leads to function-level passes not
    // being run on them, which causes crashes in the resource usage analysis).
    addPass(createAMDGPULowerBufferFatPointersPass());
    // In accordance with the above FIXME, manually force all the
    // function-level passes into a CGSCCPassManager.
    addPass(new DummyCGSCCPass());
  }

  TargetPassConfig::addCodeGenPrepare();

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(createLoadStoreVectorizerPass());

  // The LowerSwitch pass may introduce unreachable blocks that can cause
  // unexpected behavior for subsequent passes. Placing it here lets those
  // blocks get cleaned up by UnreachableBlockElim, which is inserted next in
  // the pass flow.
  addPass(createLowerSwitchPass());
}
1152 | |
1153 | bool AMDGPUPassConfig::addPreISel() { |
1154 | if (TM->getOptLevel() > CodeGenOptLevel::None) |
1155 | addPass(P: createFlattenCFGPass()); |
1156 | return false; |
1157 | } |
1158 | |
1159 | bool AMDGPUPassConfig::addInstSelector() { |
1160 | addPass(P: createAMDGPUISelDag(TM&: getAMDGPUTargetMachine(), OptLevel: getOptLevel())); |
1161 | return false; |
1162 | } |
1163 | |
1164 | bool AMDGPUPassConfig::addGCPasses() { |
1165 | // Do nothing. GC is not supported. |
1166 | return false; |
1167 | } |
1168 | |
1169 | llvm::ScheduleDAGInstrs * |
1170 | AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const { |
1171 | const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); |
1172 | ScheduleDAGMILive *DAG = createGenericSchedLive(C); |
1173 | DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI)); |
1174 | if (ST.shouldClusterStores()) |
1175 | DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI)); |
1176 | return DAG; |
1177 | } |
1178 | |
1179 | MachineFunctionInfo *R600TargetMachine::createMachineFunctionInfo( |
1180 | BumpPtrAllocator &Allocator, const Function &F, |
1181 | const TargetSubtargetInfo *STI) const { |
1182 | return R600MachineFunctionInfo::create<R600MachineFunctionInfo>( |
1183 | Allocator, F, STI: static_cast<const R600Subtarget *>(STI)); |
1184 | } |
1185 | |
1186 | //===----------------------------------------------------------------------===// |
1187 | // GCN Pass Setup |
1188 | //===----------------------------------------------------------------------===// |
1189 | |
1190 | ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler( |
1191 | MachineSchedContext *C) const { |
1192 | const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); |
1193 | if (ST.enableSIScheduler()) |
1194 | return createSIMachineScheduler(C); |
1195 | |
1196 | if (EnableMaxIlpSchedStrategy) |
1197 | return createGCNMaxILPMachineScheduler(C); |
1198 | |
1199 | return createGCNMaxOccupancyMachineScheduler(C); |
1200 | } |
1201 | |
1202 | bool GCNPassConfig::addPreISel() { |
1203 | AMDGPUPassConfig::addPreISel(); |
1204 | |
1205 | if (TM->getOptLevel() > CodeGenOptLevel::None) |
1206 | addPass(P: createSinkingPass()); |
1207 | |
1208 | if (TM->getOptLevel() > CodeGenOptLevel::None) |
1209 | addPass(P: createAMDGPULateCodeGenPreparePass()); |
1210 | |
1211 | // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit |
1212 | // regions formed by them. |
1213 | addPass(PassID: &AMDGPUUnifyDivergentExitNodesID); |
1214 | if (!LateCFGStructurize && !DisableStructurizer) { |
1215 | if (EnableStructurizerWorkarounds) { |
1216 | addPass(P: createFixIrreduciblePass()); |
1217 | addPass(P: createUnifyLoopExitsPass()); |
1218 | } |
1219 | addPass(P: createStructurizeCFGPass(SkipUniformRegions: false)); // true -> SkipUniformRegions |
1220 | } |
1221 | addPass(P: createAMDGPUAnnotateUniformValues()); |
1222 | if (!LateCFGStructurize && !DisableStructurizer) { |
1223 | addPass(P: createSIAnnotateControlFlowPass()); |
1224 | // TODO: Move this right after structurizeCFG to avoid extra divergence |
1225 | // analysis. This depends on stopping SIAnnotateControlFlow from making |
1226 | // control flow modifications. |
1227 | addPass(P: createAMDGPURewriteUndefForPHILegacyPass()); |
1228 | } |
1229 | addPass(P: createLCSSAPass()); |
1230 | |
1231 | if (TM->getOptLevel() > CodeGenOptLevel::Less) |
1232 | addPass(PassID: &AMDGPUPerfHintAnalysisID); |
1233 | |
1234 | return false; |
1235 | } |
1236 | |
1237 | void GCNPassConfig::addMachineSSAOptimization() { |
1238 | TargetPassConfig::addMachineSSAOptimization(); |
1239 | |
1240 | // We want to fold operands after PeepholeOptimizer has run (or as part of |
1241 | // it), because it will eliminate extra copies making it easier to fold the |
1242 | // real source operand. We want to eliminate dead instructions after, so that |
1243 | // we see fewer uses of the copies. We then need to clean up the dead |
1244 | // instructions leftover after the operands are folded as well. |
1245 | // |
1246 | // XXX - Can we get away without running DeadMachineInstructionElim again? |
1247 | addPass(PassID: &SIFoldOperandsID); |
1248 | if (EnableDPPCombine) |
1249 | addPass(PassID: &GCNDPPCombineID); |
1250 | addPass(PassID: &SILoadStoreOptimizerID); |
1251 | if (isPassEnabled(Opt: EnableSDWAPeephole)) { |
1252 | addPass(PassID: &SIPeepholeSDWAID); |
1253 | addPass(PassID: &EarlyMachineLICMID); |
1254 | addPass(PassID: &MachineCSEID); |
1255 | addPass(PassID: &SIFoldOperandsID); |
1256 | } |
1257 | addPass(PassID: &DeadMachineInstructionElimID); |
1258 | addPass(P: createSIShrinkInstructionsPass()); |
1259 | } |
1260 | |
1261 | bool GCNPassConfig::addILPOpts() { |
1262 | if (EnableEarlyIfConversion) |
1263 | addPass(PassID: &EarlyIfConverterID); |
1264 | |
1265 | TargetPassConfig::addILPOpts(); |
1266 | return false; |
1267 | } |
1268 | |
1269 | bool GCNPassConfig::addInstSelector() { |
1270 | AMDGPUPassConfig::addInstSelector(); |
1271 | addPass(PassID: &SIFixSGPRCopiesID); |
1272 | addPass(P: createSILowerI1CopiesPass()); |
1273 | return false; |
1274 | } |
1275 | |
1276 | bool GCNPassConfig::addIRTranslator() { |
1277 | addPass(P: new IRTranslator(getOptLevel())); |
1278 | return false; |
1279 | } |
1280 | |
1281 | void GCNPassConfig::addPreLegalizeMachineIR() { |
1282 | bool IsOptNone = getOptLevel() == CodeGenOptLevel::None; |
1283 | addPass(P: createAMDGPUPreLegalizeCombiner(IsOptNone)); |
1284 | addPass(P: new Localizer()); |
1285 | } |
1286 | |
1287 | bool GCNPassConfig::addLegalizeMachineIR() { |
1288 | addPass(P: new Legalizer()); |
1289 | return false; |
1290 | } |
1291 | |
1292 | void GCNPassConfig::addPreRegBankSelect() { |
1293 | bool IsOptNone = getOptLevel() == CodeGenOptLevel::None; |
1294 | addPass(P: createAMDGPUPostLegalizeCombiner(IsOptNone)); |
1295 | addPass(P: createAMDGPUGlobalISelDivergenceLoweringPass()); |
1296 | } |
1297 | |
1298 | bool GCNPassConfig::addRegBankSelect() { |
1299 | addPass(P: new AMDGPURegBankSelect()); |
1300 | return false; |
1301 | } |
1302 | |
1303 | void GCNPassConfig::addPreGlobalInstructionSelect() { |
1304 | bool IsOptNone = getOptLevel() == CodeGenOptLevel::None; |
1305 | addPass(P: createAMDGPURegBankCombiner(IsOptNone)); |
1306 | } |
1307 | |
1308 | bool GCNPassConfig::addGlobalInstructionSelect() { |
1309 | addPass(P: new InstructionSelect(getOptLevel())); |
1310 | return false; |
1311 | } |
1312 | |
void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);

  TargetPassConfig::addFastRegAlloc();
}

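// Insert the SI-specific passes at fixed points in the optimized
// register-allocation pipeline before deferring to the generic implementation.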
void GCNPassConfig::addOptimizedRegAlloc() {
  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
  // instructions that cause scheduling barriers.
  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);

  if (OptExecMaskPreRA)
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  if (EnableRewritePartialRegUses)
    insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM->getOptLevel() > CodeGenOptLevel::Less)
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
  // FIXME: When an instruction inside a bundle has a killed operand, it seems
  // that only the BUNDLE instruction appears as the kill of the register in
  // LiveVariables. This triggers a verifier failure; we should fix it and
  // enable the verifier.
  if (OptVGPRLiveRange)
    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  addPass(&SILowerWWMCopiesID);
  if (EnableRegReassign)
    addPass(&GCNNSAReassignID);
  return true;
}

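// SGPRs and VGPRs are allocated by two separate allocator runs (see
// addRegAssignAndRewriteFast/Optimized below). These helpers construct the
// allocator for each phase, honoring any -sgpr-regalloc/-vgpr-regalloc
// override registered on the command line.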
FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyRegisterAllocator(onlyAllocateSGPRs);

  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
                  initializeDefaultVGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyVGPRRegisterAllocator();

  return createFastVGPRRegisterAllocator();
}

FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
    "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";

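// -O0 register allocation: fast SGPR allocation, SGPR spill and WWM register
// lowering, then fast VGPR allocation and WWM copy lowering.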
bool GCNPassConfig::addRegAssignAndRewriteFast() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);
  addPass(&SIPreAllocateWWMRegsID);

  addPass(createVGPRAllocPass(false));

  addPass(&SILowerWWMCopiesID);
  return true;
}

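// Optimized register allocation: greedy SGPR allocation and rewrite, SGPR
// spill/WWM lowering, then greedy VGPR allocation followed by the pre-rewrite
// passes and the final virtual register rewriter.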
bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(createVirtRegRewriter(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);
  addPass(&SIPreAllocateWWMRegsID);

  addPass(createVGPRAllocPass(true));

  addPreRewrite();
  addPass(&VirtRegRewriterID);

  addPass(&AMDGPUMarkLastScratchLoadID);

  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createSIShrinkInstructionsPass());
  addPass(&SIPostRABundlerID);
}

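// Final MI-level lowering before emission: memory legalization, waitcnt and
// mode-register insertion, hazard handling, and branch relaxation.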
void GCNPassConfig::addPreEmitPass() {
  if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
    addPass(&GCNCreateVOPDID);
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
    addPass(createAMDGPUSetWavePriorityPass());
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee that it can handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  if (isPassEnabled(EnableInsertSingleUseVDST, CodeGenOptLevel::Less))
    addPass(&AMDGPUInsertSingleUseVDSTID);

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
    addPass(&AMDGPUInsertDelayAluID);

  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

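// Register the SI function info as a MachineRegisterInfo delegate so it can
// observe virtual register creation and cloning.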
void GCNTargetMachine::registerMachineRegisterInfoCallback(
    MachineFunction &MF) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MF.getRegInfo().addDelegate(MFI);
}

MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
      Allocator, F, static_cast<const GCNSubtarget *>(STI));
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

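// Serialize the SI-specific function state into its YAML mirror for MIR
// printing.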
yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
}

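// Deserialize the SI-specific function state from MIR YAML, validating
// register operands and register classes as it goes.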
bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fixup the subtarget dependent default value.
    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
  }

  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
                            MFI->LongBranchReservedReg))
    return true;

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         std::nullopt, std::nullopt);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(ParsedReg);
  }

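  // Parse one optional entry of SIArgumentInfo: read the register or stack
  // location, check the expected register class, and account for the user and
  // system SGPRs it consumes.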
  auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, *A->Mask);

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.LDSKernelId, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  if (ST.hasIEEEMode())
    MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  if (ST.hasDX10ClampMode())
    MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;

  // FIXME: Move proper support for denormal-fp-math into base MachineFunction
  MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
                                      ? DenormalMode::IEEE
                                      : DenormalMode::PreserveSign;
  MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
                                       ? DenormalMode::IEEE
                                       : DenormalMode::PreserveSign;

  MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
                                          ? DenormalMode::IEEE
                                          : DenormalMode::PreserveSign;
  MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
                                           ? DenormalMode::IEEE
                                           : DenormalMode::PreserveSign;

  return false;
}