1 | //===- Construction of pass pipelines -------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// \file |
9 | /// |
10 | /// This file provides the implementation of the PassBuilder based on our |
11 | /// static pass registry as well as related functionality. It also provides |
12 | /// helpers to aid in analyzing, debugging, and testing passes and pass |
13 | /// pipelines. |
14 | /// |
15 | //===----------------------------------------------------------------------===// |
16 | |
17 | #include "llvm/ADT/Statistic.h" |
18 | #include "llvm/Analysis/AliasAnalysis.h" |
19 | #include "llvm/Analysis/BasicAliasAnalysis.h" |
20 | #include "llvm/Analysis/CGSCCPassManager.h" |
21 | #include "llvm/Analysis/CtxProfAnalysis.h" |
22 | #include "llvm/Analysis/GlobalsModRef.h" |
23 | #include "llvm/Analysis/InlineAdvisor.h" |
24 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
25 | #include "llvm/Analysis/ScopedNoAliasAA.h" |
26 | #include "llvm/Analysis/TypeBasedAliasAnalysis.h" |
27 | #include "llvm/IR/PassManager.h" |
28 | #include "llvm/Pass.h" |
29 | #include "llvm/Passes/OptimizationLevel.h" |
30 | #include "llvm/Passes/PassBuilder.h" |
31 | #include "llvm/Support/CommandLine.h" |
32 | #include "llvm/Support/ErrorHandling.h" |
33 | #include "llvm/Support/PGOOptions.h" |
34 | #include "llvm/Support/VirtualFileSystem.h" |
35 | #include "llvm/Target/TargetMachine.h" |
36 | #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h" |
37 | #include "llvm/Transforms/Coroutines/CoroAnnotationElide.h" |
38 | #include "llvm/Transforms/Coroutines/CoroCleanup.h" |
39 | #include "llvm/Transforms/Coroutines/CoroConditionalWrapper.h" |
40 | #include "llvm/Transforms/Coroutines/CoroEarly.h" |
41 | #include "llvm/Transforms/Coroutines/CoroElide.h" |
42 | #include "llvm/Transforms/Coroutines/CoroSplit.h" |
43 | #include "llvm/Transforms/HipStdPar/HipStdPar.h" |
44 | #include "llvm/Transforms/IPO/AlwaysInliner.h" |
45 | #include "llvm/Transforms/IPO/Annotation2Metadata.h" |
46 | #include "llvm/Transforms/IPO/ArgumentPromotion.h" |
47 | #include "llvm/Transforms/IPO/Attributor.h" |
48 | #include "llvm/Transforms/IPO/CalledValuePropagation.h" |
49 | #include "llvm/Transforms/IPO/ConstantMerge.h" |
50 | #include "llvm/Transforms/IPO/CrossDSOCFI.h" |
51 | #include "llvm/Transforms/IPO/DeadArgumentElimination.h" |
52 | #include "llvm/Transforms/IPO/ElimAvailExtern.h" |
53 | #include "llvm/Transforms/IPO/EmbedBitcodePass.h" |
54 | #include "llvm/Transforms/IPO/ExpandVariadics.h" |
55 | #include "llvm/Transforms/IPO/FatLTOCleanup.h" |
56 | #include "llvm/Transforms/IPO/ForceFunctionAttrs.h" |
57 | #include "llvm/Transforms/IPO/FunctionAttrs.h" |
58 | #include "llvm/Transforms/IPO/GlobalDCE.h" |
59 | #include "llvm/Transforms/IPO/GlobalOpt.h" |
60 | #include "llvm/Transforms/IPO/GlobalSplit.h" |
61 | #include "llvm/Transforms/IPO/HotColdSplitting.h" |
62 | #include "llvm/Transforms/IPO/IROutliner.h" |
63 | #include "llvm/Transforms/IPO/InferFunctionAttrs.h" |
64 | #include "llvm/Transforms/IPO/Inliner.h" |
65 | #include "llvm/Transforms/IPO/LowerTypeTests.h" |
66 | #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h" |
67 | #include "llvm/Transforms/IPO/MergeFunctions.h" |
68 | #include "llvm/Transforms/IPO/ModuleInliner.h" |
69 | #include "llvm/Transforms/IPO/OpenMPOpt.h" |
70 | #include "llvm/Transforms/IPO/PartialInlining.h" |
71 | #include "llvm/Transforms/IPO/SCCP.h" |
72 | #include "llvm/Transforms/IPO/SampleProfile.h" |
73 | #include "llvm/Transforms/IPO/SampleProfileProbe.h" |
74 | #include "llvm/Transforms/IPO/WholeProgramDevirt.h" |
75 | #include "llvm/Transforms/InstCombine/InstCombine.h" |
76 | #include "llvm/Transforms/Instrumentation/CGProfile.h" |
77 | #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h" |
78 | #include "llvm/Transforms/Instrumentation/InstrProfiling.h" |
79 | #include "llvm/Transforms/Instrumentation/MemProfInstrumentation.h" |
80 | #include "llvm/Transforms/Instrumentation/MemProfUse.h" |
81 | #include "llvm/Transforms/Instrumentation/PGOCtxProfFlattening.h" |
82 | #include "llvm/Transforms/Instrumentation/PGOCtxProfLowering.h" |
83 | #include "llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h" |
84 | #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" |
85 | #include "llvm/Transforms/Scalar/ADCE.h" |
86 | #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h" |
87 | #include "llvm/Transforms/Scalar/AnnotationRemarks.h" |
88 | #include "llvm/Transforms/Scalar/BDCE.h" |
89 | #include "llvm/Transforms/Scalar/CallSiteSplitting.h" |
90 | #include "llvm/Transforms/Scalar/ConstraintElimination.h" |
91 | #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h" |
92 | #include "llvm/Transforms/Scalar/DFAJumpThreading.h" |
93 | #include "llvm/Transforms/Scalar/DeadStoreElimination.h" |
94 | #include "llvm/Transforms/Scalar/DivRemPairs.h" |
95 | #include "llvm/Transforms/Scalar/EarlyCSE.h" |
96 | #include "llvm/Transforms/Scalar/Float2Int.h" |
97 | #include "llvm/Transforms/Scalar/GVN.h" |
98 | #include "llvm/Transforms/Scalar/IndVarSimplify.h" |
99 | #include "llvm/Transforms/Scalar/InferAlignment.h" |
100 | #include "llvm/Transforms/Scalar/InstSimplifyPass.h" |
101 | #include "llvm/Transforms/Scalar/JumpTableToSwitch.h" |
102 | #include "llvm/Transforms/Scalar/JumpThreading.h" |
103 | #include "llvm/Transforms/Scalar/LICM.h" |
104 | #include "llvm/Transforms/Scalar/LoopDeletion.h" |
105 | #include "llvm/Transforms/Scalar/LoopDistribute.h" |
106 | #include "llvm/Transforms/Scalar/LoopFlatten.h" |
107 | #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h" |
108 | #include "llvm/Transforms/Scalar/LoopInstSimplify.h" |
109 | #include "llvm/Transforms/Scalar/LoopInterchange.h" |
110 | #include "llvm/Transforms/Scalar/LoopLoadElimination.h" |
111 | #include "llvm/Transforms/Scalar/LoopPassManager.h" |
112 | #include "llvm/Transforms/Scalar/LoopRotation.h" |
113 | #include "llvm/Transforms/Scalar/LoopSimplifyCFG.h" |
114 | #include "llvm/Transforms/Scalar/LoopSink.h" |
115 | #include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h" |
116 | #include "llvm/Transforms/Scalar/LoopUnrollPass.h" |
117 | #include "llvm/Transforms/Scalar/LoopVersioningLICM.h" |
118 | #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h" |
119 | #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h" |
120 | #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h" |
121 | #include "llvm/Transforms/Scalar/MemCpyOptimizer.h" |
122 | #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h" |
123 | #include "llvm/Transforms/Scalar/NewGVN.h" |
124 | #include "llvm/Transforms/Scalar/Reassociate.h" |
125 | #include "llvm/Transforms/Scalar/SCCP.h" |
126 | #include "llvm/Transforms/Scalar/SROA.h" |
127 | #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" |
128 | #include "llvm/Transforms/Scalar/SimplifyCFG.h" |
129 | #include "llvm/Transforms/Scalar/SpeculativeExecution.h" |
130 | #include "llvm/Transforms/Scalar/TailRecursionElimination.h" |
131 | #include "llvm/Transforms/Scalar/WarnMissedTransforms.h" |
132 | #include "llvm/Transforms/Utils/AddDiscriminators.h" |
133 | #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" |
134 | #include "llvm/Transforms/Utils/CanonicalizeAliases.h" |
135 | #include "llvm/Transforms/Utils/CountVisits.h" |
136 | #include "llvm/Transforms/Utils/EntryExitInstrumenter.h" |
137 | #include "llvm/Transforms/Utils/ExtraPassManager.h" |
138 | #include "llvm/Transforms/Utils/InjectTLIMappings.h" |
139 | #include "llvm/Transforms/Utils/LibCallsShrinkWrap.h" |
140 | #include "llvm/Transforms/Utils/Mem2Reg.h" |
141 | #include "llvm/Transforms/Utils/MoveAutoInit.h" |
142 | #include "llvm/Transforms/Utils/NameAnonGlobals.h" |
143 | #include "llvm/Transforms/Utils/RelLookupTableConverter.h" |
144 | #include "llvm/Transforms/Utils/SimplifyCFGOptions.h" |
145 | #include "llvm/Transforms/Vectorize/LoopVectorize.h" |
146 | #include "llvm/Transforms/Vectorize/SLPVectorizer.h" |
147 | #include "llvm/Transforms/Vectorize/VectorCombine.h" |
148 | |
149 | using namespace llvm; |
150 | |
151 | static cl::opt<InliningAdvisorMode> UseInlineAdvisor( |
152 | "enable-ml-inliner" , cl::init(Val: InliningAdvisorMode::Default), cl::Hidden, |
153 | cl::desc("Enable ML policy for inliner. Currently trained for -Oz only" ), |
154 | cl::values(clEnumValN(InliningAdvisorMode::Default, "default" , |
155 | "Heuristics-based inliner version" ), |
156 | clEnumValN(InliningAdvisorMode::Development, "development" , |
157 | "Use development mode (runtime-loadable model)" ), |
158 | clEnumValN(InliningAdvisorMode::Release, "release" , |
159 | "Use release mode (AOT-compiled model)" ))); |
160 | |
161 | /// Flag to enable inline deferral during PGO. |
162 | static cl::opt<bool> |
163 | EnablePGOInlineDeferral("enable-npm-pgo-inline-deferral" , cl::init(Val: true), |
164 | cl::Hidden, |
165 | cl::desc("Enable inline deferral during PGO" )); |
166 | |
167 | static cl::opt<bool> EnableModuleInliner("enable-module-inliner" , |
168 | cl::init(Val: false), cl::Hidden, |
169 | cl::desc("Enable module inliner" )); |
170 | |
171 | static cl::opt<bool> PerformMandatoryInliningsFirst( |
172 | "mandatory-inlining-first" , cl::init(Val: false), cl::Hidden, |
173 | cl::desc("Perform mandatory inlinings module-wide, before performing " |
174 | "inlining" )); |
175 | |
176 | static cl::opt<bool> EnableEagerlyInvalidateAnalyses( |
177 | "eagerly-invalidate-analyses" , cl::init(Val: true), cl::Hidden, |
178 | cl::desc("Eagerly invalidate more analyses in default pipelines" )); |
179 | |
180 | static cl::opt<bool> EnableMergeFunctions( |
181 | "enable-merge-functions" , cl::init(Val: false), cl::Hidden, |
182 | cl::desc("Enable function merging as part of the optimization pipeline" )); |
183 | |
184 | static cl::opt<bool> EnablePostPGOLoopRotation( |
185 | "enable-post-pgo-loop-rotation" , cl::init(Val: true), cl::Hidden, |
186 | cl::desc("Run the loop rotation transformation after PGO instrumentation" )); |
187 | |
188 | static cl::opt<bool> EnableGlobalAnalyses( |
189 | "enable-global-analyses" , cl::init(Val: true), cl::Hidden, |
190 | cl::desc("Enable inter-procedural analyses" )); |
191 | |
192 | static cl::opt<bool> RunPartialInlining("enable-partial-inlining" , |
193 | cl::init(Val: false), cl::Hidden, |
194 | cl::desc("Run Partial inlining pass" )); |
195 | |
196 | static cl::opt<bool> ( |
197 | "extra-vectorizer-passes" , cl::init(Val: false), cl::Hidden, |
198 | cl::desc("Run cleanup optimization passes after vectorization" )); |
199 | |
200 | static cl::opt<bool> RunNewGVN("enable-newgvn" , cl::init(Val: false), cl::Hidden, |
201 | cl::desc("Run the NewGVN pass" )); |
202 | |
203 | static cl::opt<bool> |
204 | EnableLoopInterchange("enable-loopinterchange" , cl::init(Val: false), cl::Hidden, |
205 | cl::desc("Enable the LoopInterchange Pass" )); |
206 | |
207 | static cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam" , |
208 | cl::init(Val: false), cl::Hidden, |
209 | cl::desc("Enable Unroll And Jam Pass" )); |
210 | |
211 | static cl::opt<bool> EnableLoopFlatten("enable-loop-flatten" , cl::init(Val: false), |
212 | cl::Hidden, |
213 | cl::desc("Enable the LoopFlatten Pass" )); |
214 | |
215 | // Experimentally allow loop header duplication. This should allow for better |
216 | // optimization at Oz, since loop-idiom recognition can then recognize things |
217 | // like memcpy. If this ends up being useful for many targets, we should drop |
218 | // this flag and make a code generation option that can be controlled |
219 | // independent of the opt level and exposed through the frontend. |
220 | static cl::opt<bool> ( |
221 | "enable-loop-header-duplication" , cl::init(Val: false), cl::Hidden, |
222 | cl::desc("Enable loop header duplication at any optimization level" )); |
223 | |
224 | static cl::opt<bool> |
225 | EnableDFAJumpThreading("enable-dfa-jump-thread" , |
226 | cl::desc("Enable DFA jump threading" ), |
227 | cl::init(Val: false), cl::Hidden); |
228 | |
229 | static cl::opt<bool> |
230 | EnableHotColdSplit("hot-cold-split" , |
231 | cl::desc("Enable hot-cold splitting pass" )); |
232 | |
233 | static cl::opt<bool> EnableIROutliner("ir-outliner" , cl::init(Val: false), |
234 | cl::Hidden, |
235 | cl::desc("Enable ir outliner pass" )); |
236 | |
237 | static cl::opt<bool> |
238 | DisablePreInliner("disable-preinline" , cl::init(Val: false), cl::Hidden, |
239 | cl::desc("Disable pre-instrumentation inliner" )); |
240 | |
241 | static cl::opt<int> PreInlineThreshold( |
242 | "preinline-threshold" , cl::Hidden, cl::init(Val: 75), |
243 | cl::desc("Control the amount of inlining in pre-instrumentation inliner " |
244 | "(default = 75)" )); |
245 | |
246 | static cl::opt<bool> |
247 | EnableGVNHoist("enable-gvn-hoist" , |
248 | cl::desc("Enable the GVN hoisting pass (default = off)" )); |
249 | |
250 | static cl::opt<bool> |
251 | EnableGVNSink("enable-gvn-sink" , |
252 | cl::desc("Enable the GVN sinking pass (default = off)" )); |
253 | |
254 | static cl::opt<bool> EnableJumpTableToSwitch( |
255 | "enable-jump-table-to-switch" , |
256 | cl::desc("Enable JumpTableToSwitch pass (default = off)" )); |
257 | |
258 | // This option is used in simplifying testing SampleFDO optimizations for |
259 | // profile loading. |
260 | static cl::opt<bool> |
261 | EnableCHR("enable-chr" , cl::init(Val: true), cl::Hidden, |
262 | cl::desc("Enable control height reduction optimization (CHR)" )); |
263 | |
264 | static cl::opt<bool> FlattenedProfileUsed( |
265 | "flattened-profile-used" , cl::init(Val: false), cl::Hidden, |
266 | cl::desc("Indicate the sample profile being used is flattened, i.e., " |
267 | "no inline hierarchy exists in the profile" )); |
268 | |
269 | static cl::opt<bool> |
270 | EnableMatrix("enable-matrix" , cl::init(Val: false), cl::Hidden, |
271 | cl::desc("Enable lowering of the matrix intrinsics" )); |
272 | |
273 | static cl::opt<bool> EnableConstraintElimination( |
274 | "enable-constraint-elimination" , cl::init(Val: true), cl::Hidden, |
275 | cl::desc( |
276 | "Enable pass to eliminate conditions based on linear constraints" )); |
277 | |
278 | static cl::opt<AttributorRunOption> AttributorRun( |
279 | "attributor-enable" , cl::Hidden, cl::init(Val: AttributorRunOption::NONE), |
280 | cl::desc("Enable the attributor inter-procedural deduction pass" ), |
281 | cl::values(clEnumValN(AttributorRunOption::ALL, "all" , |
282 | "enable all attributor runs" ), |
283 | clEnumValN(AttributorRunOption::MODULE, "module" , |
284 | "enable module-wide attributor runs" ), |
285 | clEnumValN(AttributorRunOption::CGSCC, "cgscc" , |
286 | "enable call graph SCC attributor runs" ), |
287 | clEnumValN(AttributorRunOption::NONE, "none" , |
288 | "disable attributor runs" ))); |
289 | |
290 | static cl::opt<bool> EnableSampledInstr( |
291 | "enable-sampled-instrumentation" , cl::init(Val: false), cl::Hidden, |
292 | cl::desc("Enable profile instrumentation sampling (default = off)" )); |
293 | static cl::opt<bool> UseLoopVersioningLICM( |
294 | "enable-loop-versioning-licm" , cl::init(Val: false), cl::Hidden, |
295 | cl::desc("Enable the experimental Loop Versioning LICM pass" )); |
296 | |
297 | static cl::opt<std::string> InstrumentColdFuncOnlyPath( |
298 | "instrument-cold-function-only-path" , cl::init(Val: "" ), |
299 | cl::desc("File path for cold function only instrumentation(requires use " |
300 | "with --pgo-instrument-cold-function-only)" ), |
301 | cl::Hidden); |
302 | |
// Declarations only; the definitions are not visible in this part of the file.
// NOTE(review): these two are declared at global scope rather than inside
// namespace llvm -- confirm that matches where they are defined.
extern cl::opt<std::string> UseCtxProfile;
extern cl::opt<bool> PGOInstrumentColdFunctionOnly;

namespace llvm {
// Declaration only; the definition is not visible in this part of the file.
extern cl::opt<bool> EnableMemProfContextDisambiguation;
} // namespace llvm
309 | |
310 | PipelineTuningOptions::PipelineTuningOptions() { |
311 | LoopInterleaving = true; |
312 | LoopVectorization = true; |
313 | SLPVectorization = false; |
314 | LoopUnrolling = true; |
315 | LoopInterchange = EnableLoopInterchange; |
316 | ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll; |
317 | LicmMssaOptCap = SetLicmMssaOptCap; |
318 | LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap; |
319 | CallGraphProfile = true; |
320 | UnifiedLTO = false; |
321 | MergeFunctions = EnableMergeFunctions; |
322 | InlinerThreshold = -1; |
323 | EagerlyInvalidateAnalyses = EnableEagerlyInvalidateAnalyses; |
324 | } |
325 | |
namespace llvm {
// Declaration only; the definition is not visible in this part of the file.
extern cl::opt<unsigned> MaxDevirtIterations;
} // namespace llvm
329 | |
330 | void PassBuilder::invokePeepholeEPCallbacks(FunctionPassManager &FPM, |
331 | OptimizationLevel Level) { |
332 | for (auto &C : PeepholeEPCallbacks) |
333 | C(FPM, Level); |
334 | } |
335 | void PassBuilder::invokeLateLoopOptimizationsEPCallbacks( |
336 | LoopPassManager &LPM, OptimizationLevel Level) { |
337 | for (auto &C : LateLoopOptimizationsEPCallbacks) |
338 | C(LPM, Level); |
339 | } |
340 | void PassBuilder::invokeLoopOptimizerEndEPCallbacks(LoopPassManager &LPM, |
341 | OptimizationLevel Level) { |
342 | for (auto &C : LoopOptimizerEndEPCallbacks) |
343 | C(LPM, Level); |
344 | } |
345 | void PassBuilder::invokeScalarOptimizerLateEPCallbacks( |
346 | FunctionPassManager &FPM, OptimizationLevel Level) { |
347 | for (auto &C : ScalarOptimizerLateEPCallbacks) |
348 | C(FPM, Level); |
349 | } |
350 | void PassBuilder::invokeCGSCCOptimizerLateEPCallbacks(CGSCCPassManager &CGPM, |
351 | OptimizationLevel Level) { |
352 | for (auto &C : CGSCCOptimizerLateEPCallbacks) |
353 | C(CGPM, Level); |
354 | } |
355 | void PassBuilder::invokeVectorizerStartEPCallbacks(FunctionPassManager &FPM, |
356 | OptimizationLevel Level) { |
357 | for (auto &C : VectorizerStartEPCallbacks) |
358 | C(FPM, Level); |
359 | } |
360 | void PassBuilder::invokeVectorizerEndEPCallbacks(FunctionPassManager &FPM, |
361 | OptimizationLevel Level) { |
362 | for (auto &C : VectorizerEndEPCallbacks) |
363 | C(FPM, Level); |
364 | } |
365 | void PassBuilder::invokeOptimizerEarlyEPCallbacks(ModulePassManager &MPM, |
366 | OptimizationLevel Level, |
367 | ThinOrFullLTOPhase Phase) { |
368 | for (auto &C : OptimizerEarlyEPCallbacks) |
369 | C(MPM, Level, Phase); |
370 | } |
371 | void PassBuilder::invokeOptimizerLastEPCallbacks(ModulePassManager &MPM, |
372 | OptimizationLevel Level, |
373 | ThinOrFullLTOPhase Phase) { |
374 | for (auto &C : OptimizerLastEPCallbacks) |
375 | C(MPM, Level, Phase); |
376 | } |
377 | void PassBuilder::invokeFullLinkTimeOptimizationEarlyEPCallbacks( |
378 | ModulePassManager &MPM, OptimizationLevel Level) { |
379 | for (auto &C : FullLinkTimeOptimizationEarlyEPCallbacks) |
380 | C(MPM, Level); |
381 | } |
382 | void PassBuilder::invokeFullLinkTimeOptimizationLastEPCallbacks( |
383 | ModulePassManager &MPM, OptimizationLevel Level) { |
384 | for (auto &C : FullLinkTimeOptimizationLastEPCallbacks) |
385 | C(MPM, Level); |
386 | } |
387 | void PassBuilder::invokePipelineStartEPCallbacks(ModulePassManager &MPM, |
388 | OptimizationLevel Level) { |
389 | for (auto &C : PipelineStartEPCallbacks) |
390 | C(MPM, Level); |
391 | } |
392 | void PassBuilder::invokePipelineEarlySimplificationEPCallbacks( |
393 | ModulePassManager &MPM, OptimizationLevel Level, ThinOrFullLTOPhase Phase) { |
394 | for (auto &C : PipelineEarlySimplificationEPCallbacks) |
395 | C(MPM, Level, Phase); |
396 | } |
397 | |
398 | // Helper to add AnnotationRemarksPass. |
399 | static void (ModulePassManager &MPM) { |
400 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor(Pass: AnnotationRemarksPass())); |
401 | } |
402 | |
403 | // Helper to check if the current compilation phase is preparing for LTO |
404 | static bool isLTOPreLink(ThinOrFullLTOPhase Phase) { |
405 | return Phase == ThinOrFullLTOPhase::ThinLTOPreLink || |
406 | Phase == ThinOrFullLTOPhase::FullLTOPreLink; |
407 | } |
408 | |
409 | // Helper to check if the current compilation phase is LTO backend |
410 | static bool isLTOPostLink(ThinOrFullLTOPhase Phase) { |
411 | return Phase == ThinOrFullLTOPhase::ThinLTOPostLink || |
412 | Phase == ThinOrFullLTOPhase::FullLTOPostLink; |
413 | } |
414 | |
415 | // Helper to wrap conditionally Coro passes. |
416 | static CoroConditionalWrapper buildCoroWrapper(ThinOrFullLTOPhase Phase) { |
417 | // TODO: Skip passes according to Phase. |
418 | ModulePassManager CoroPM; |
419 | CoroPM.addPass(Pass: CoroEarlyPass()); |
420 | CGSCCPassManager CGPM; |
421 | CGPM.addPass(Pass: CoroSplitPass()); |
422 | CoroPM.addPass(Pass: createModuleToPostOrderCGSCCPassAdaptor(Pass: std::move(CGPM))); |
423 | CoroPM.addPass(Pass: CoroCleanupPass()); |
424 | CoroPM.addPass(Pass: GlobalDCEPass()); |
425 | return CoroConditionalWrapper(std::move(CoroPM)); |
426 | } |
427 | |
428 | // TODO: Investigate the cost/benefit of tail call elimination on debugging. |
429 | FunctionPassManager |
430 | PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, |
431 | ThinOrFullLTOPhase Phase) { |
432 | |
433 | FunctionPassManager FPM; |
434 | |
435 | if (AreStatisticsEnabled()) |
436 | FPM.addPass(Pass: CountVisitsPass()); |
437 | |
438 | // Form SSA out of local memory accesses after breaking apart aggregates into |
439 | // scalars. |
440 | FPM.addPass(Pass: SROAPass(SROAOptions::ModifyCFG)); |
441 | |
442 | // Catch trivial redundancies |
443 | FPM.addPass(Pass: EarlyCSEPass(true /* Enable mem-ssa. */)); |
444 | |
445 | // Hoisting of scalars and load expressions. |
446 | FPM.addPass( |
447 | Pass: SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(B: true))); |
448 | FPM.addPass(Pass: InstCombinePass()); |
449 | |
450 | FPM.addPass(Pass: LibCallsShrinkWrapPass()); |
451 | |
452 | invokePeepholeEPCallbacks(FPM, Level); |
453 | |
454 | FPM.addPass( |
455 | Pass: SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(B: true))); |
456 | |
457 | // Form canonically associated expression trees, and simplify the trees using |
458 | // basic mathematical properties. For example, this will form (nearly) |
459 | // minimal multiplication trees. |
460 | FPM.addPass(Pass: ReassociatePass()); |
461 | |
462 | // Add the primary loop simplification pipeline. |
463 | // FIXME: Currently this is split into two loop pass pipelines because we run |
464 | // some function passes in between them. These can and should be removed |
465 | // and/or replaced by scheduling the loop pass equivalents in the correct |
466 | // positions. But those equivalent passes aren't powerful enough yet. |
467 | // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still |
468 | // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to |
469 | // fully replace `SimplifyCFGPass`, and the closest to the other we have is |
470 | // `LoopInstSimplify`. |
471 | LoopPassManager LPM1, LPM2; |
472 | |
473 | // Simplify the loop body. We do this initially to clean up after other loop |
474 | // passes run, either when iterating on a loop or on inner loops with |
475 | // implications on the outer loop. |
476 | LPM1.addPass(Pass: LoopInstSimplifyPass()); |
477 | LPM1.addPass(Pass: LoopSimplifyCFGPass()); |
478 | |
479 | // Try to remove as much code from the loop header as possible, |
480 | // to reduce amount of IR that will have to be duplicated. However, |
481 | // do not perform speculative hoisting the first time as LICM |
482 | // will destroy metadata that may not need to be destroyed if run |
483 | // after loop rotation. |
484 | // TODO: Investigate promotion cap for O1. |
485 | LPM1.addPass(Pass: LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, |
486 | /*AllowSpeculation=*/false)); |
487 | |
488 | LPM1.addPass(Pass: LoopRotatePass(/* Disable header duplication */ true, |
489 | isLTOPreLink(Phase))); |
490 | // TODO: Investigate promotion cap for O1. |
491 | LPM1.addPass(Pass: LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, |
492 | /*AllowSpeculation=*/true)); |
493 | LPM1.addPass(Pass: SimpleLoopUnswitchPass()); |
494 | if (EnableLoopFlatten) |
495 | LPM1.addPass(Pass: LoopFlattenPass()); |
496 | |
497 | LPM2.addPass(Pass: LoopIdiomRecognizePass()); |
498 | LPM2.addPass(Pass: IndVarSimplifyPass()); |
499 | |
500 | invokeLateLoopOptimizationsEPCallbacks(LPM&: LPM2, Level); |
501 | |
502 | LPM2.addPass(Pass: LoopDeletionPass()); |
503 | |
504 | // Do not enable unrolling in PreLinkThinLTO phase during sample PGO |
505 | // because it changes IR to makes profile annotation in back compile |
506 | // inaccurate. The normal unroller doesn't pay attention to forced full unroll |
507 | // attributes so we need to make sure and allow the full unroll pass to pay |
508 | // attention to it. |
509 | if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt || |
510 | PGOOpt->Action != PGOOptions::SampleUse) |
511 | LPM2.addPass(Pass: LoopFullUnrollPass(Level.getSpeedupLevel(), |
512 | /* OnlyWhenForced= */ !PTO.LoopUnrolling, |
513 | PTO.ForgetAllSCEVInLoopUnroll)); |
514 | |
515 | invokeLoopOptimizerEndEPCallbacks(LPM&: LPM2, Level); |
516 | |
517 | FPM.addPass(Pass: createFunctionToLoopPassAdaptor(Pass: std::move(LPM1), |
518 | /*UseMemorySSA=*/true, |
519 | /*UseBlockFrequencyInfo=*/true)); |
520 | FPM.addPass( |
521 | Pass: SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(B: true))); |
522 | FPM.addPass(Pass: InstCombinePass()); |
523 | // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA. |
524 | // *All* loop passes must preserve it, in order to be able to use it. |
525 | FPM.addPass(Pass: createFunctionToLoopPassAdaptor(Pass: std::move(LPM2), |
526 | /*UseMemorySSA=*/false, |
527 | /*UseBlockFrequencyInfo=*/false)); |
528 | |
529 | // Delete small array after loop unroll. |
530 | FPM.addPass(Pass: SROAPass(SROAOptions::ModifyCFG)); |
531 | |
532 | // Specially optimize memory movement as it doesn't look like dataflow in SSA. |
533 | FPM.addPass(Pass: MemCpyOptPass()); |
534 | |
535 | // Sparse conditional constant propagation. |
536 | // FIXME: It isn't clear why we do this *after* loop passes rather than |
537 | // before... |
538 | FPM.addPass(Pass: SCCPPass()); |
539 | |
540 | // Delete dead bit computations (instcombine runs after to fold away the dead |
541 | // computations, and then ADCE will run later to exploit any new DCE |
542 | // opportunities that creates). |
543 | FPM.addPass(Pass: BDCEPass()); |
544 | |
545 | // Run instcombine after redundancy and dead bit elimination to exploit |
546 | // opportunities opened up by them. |
547 | FPM.addPass(Pass: InstCombinePass()); |
548 | invokePeepholeEPCallbacks(FPM, Level); |
549 | |
550 | FPM.addPass(Pass: CoroElidePass()); |
551 | |
552 | invokeScalarOptimizerLateEPCallbacks(FPM, Level); |
553 | |
554 | // Finally, do an expensive DCE pass to catch all the dead code exposed by |
555 | // the simplifications and basic cleanup after all the simplifications. |
556 | // TODO: Investigate if this is too expensive. |
557 | FPM.addPass(Pass: ADCEPass()); |
558 | FPM.addPass( |
559 | Pass: SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(B: true))); |
560 | FPM.addPass(Pass: InstCombinePass()); |
561 | invokePeepholeEPCallbacks(FPM, Level); |
562 | |
563 | return FPM; |
564 | } |
565 | |
566 | FunctionPassManager |
567 | PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, |
568 | ThinOrFullLTOPhase Phase) { |
569 | assert(Level != OptimizationLevel::O0 && "Must request optimizations!" ); |
570 | |
571 | // The O1 pipeline has a separate pipeline creation function to simplify |
572 | // construction readability. |
573 | if (Level.getSpeedupLevel() == 1) |
574 | return buildO1FunctionSimplificationPipeline(Level, Phase); |
575 | |
576 | FunctionPassManager FPM; |
577 | |
578 | if (AreStatisticsEnabled()) |
579 | FPM.addPass(Pass: CountVisitsPass()); |
580 | |
581 | // Form SSA out of local memory accesses after breaking apart aggregates into |
582 | // scalars. |
583 | FPM.addPass(Pass: SROAPass(SROAOptions::ModifyCFG)); |
584 | |
585 | // Catch trivial redundancies |
586 | FPM.addPass(Pass: EarlyCSEPass(true /* Enable mem-ssa. */)); |
587 | if (EnableKnowledgeRetention) |
588 | FPM.addPass(Pass: AssumeSimplifyPass()); |
589 | |
590 | // Hoisting of scalars and load expressions. |
591 | if (EnableGVNHoist) |
592 | FPM.addPass(Pass: GVNHoistPass()); |
593 | |
594 | // Global value numbering based sinking. |
595 | if (EnableGVNSink) { |
596 | FPM.addPass(Pass: GVNSinkPass()); |
597 | FPM.addPass( |
598 | Pass: SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(B: true))); |
599 | } |
600 | |
601 | // Speculative execution if the target has divergent branches; otherwise nop. |
602 | FPM.addPass(Pass: SpeculativeExecutionPass(/* OnlyIfDivergentTarget =*/true)); |
603 | |
604 | // Optimize based on known information about branches, and cleanup afterward. |
605 | FPM.addPass(Pass: JumpThreadingPass()); |
606 | FPM.addPass(Pass: CorrelatedValuePropagationPass()); |
607 | |
608 | // Jump table to switch conversion. |
609 | if (EnableJumpTableToSwitch) |
610 | FPM.addPass(Pass: JumpTableToSwitchPass()); |
611 | |
612 | FPM.addPass( |
613 | Pass: SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(B: true))); |
614 | FPM.addPass(Pass: InstCombinePass()); |
615 | FPM.addPass(Pass: AggressiveInstCombinePass()); |
616 | |
617 | if (!Level.isOptimizingForSize()) |
618 | FPM.addPass(Pass: LibCallsShrinkWrapPass()); |
619 | |
620 | invokePeepholeEPCallbacks(FPM, Level); |
621 | |
622 | // For PGO use pipeline, try to optimize memory intrinsics such as memcpy |
623 | // using the size value profile. Don't perform this when optimizing for size. |
624 | if (PGOOpt && PGOOpt->Action == PGOOptions::IRUse && |
625 | !Level.isOptimizingForSize()) |
626 | FPM.addPass(Pass: PGOMemOPSizeOpt()); |
627 | |
628 | FPM.addPass(Pass: TailCallElimPass(/*UpdateFunctionEntryCount=*/ |
629 | isInstrumentedPGOUse())); |
630 | FPM.addPass( |
631 | Pass: SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(B: true))); |
632 | |
633 | // Form canonically associated expression trees, and simplify the trees using |
634 | // basic mathematical properties. For example, this will form (nearly) |
635 | // minimal multiplication trees. |
636 | FPM.addPass(Pass: ReassociatePass()); |
637 | |
638 | if (EnableConstraintElimination) |
639 | FPM.addPass(Pass: ConstraintEliminationPass()); |
640 | |
641 | // Add the primary loop simplification pipeline. |
642 | // FIXME: Currently this is split into two loop pass pipelines because we run |
643 | // some function passes in between them. These can and should be removed |
644 | // and/or replaced by scheduling the loop pass equivalents in the correct |
645 | // positions. But those equivalent passes aren't powerful enough yet. |
646 | // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still |
647 | // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to |
648 | // fully replace `SimplifyCFGPass`, and the closest to the other we have is |
649 | // `LoopInstSimplify`. |
650 | LoopPassManager LPM1, LPM2; |
651 | |
652 | // Simplify the loop body. We do this initially to clean up after other loop |
653 | // passes run, either when iterating on a loop or on inner loops with |
654 | // implications on the outer loop. |
655 | LPM1.addPass(Pass: LoopInstSimplifyPass()); |
656 | LPM1.addPass(Pass: LoopSimplifyCFGPass()); |
657 | |
658 | // Try to remove as much code from the loop header as possible, |
659 | // to reduce amount of IR that will have to be duplicated. However, |
660 | // do not perform speculative hoisting the first time as LICM |
661 | // will destroy metadata that may not need to be destroyed if run |
662 | // after loop rotation. |
663 | // TODO: Investigate promotion cap for O1. |
664 | LPM1.addPass(Pass: LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, |
665 | /*AllowSpeculation=*/false)); |
666 | |
667 | // Disable header duplication in loop rotation at -Oz. |
668 | LPM1.addPass(Pass: LoopRotatePass(EnableLoopHeaderDuplication || |
669 | Level != OptimizationLevel::Oz, |
670 | isLTOPreLink(Phase))); |
671 | // TODO: Investigate promotion cap for O1. |
672 | LPM1.addPass(Pass: LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, |
673 | /*AllowSpeculation=*/true)); |
674 | LPM1.addPass( |
675 | Pass: SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3)); |
676 | if (EnableLoopFlatten) |
677 | LPM1.addPass(Pass: LoopFlattenPass()); |
678 | |
679 | LPM2.addPass(Pass: LoopIdiomRecognizePass()); |
680 | LPM2.addPass(Pass: IndVarSimplifyPass()); |
681 | |
682 | { |
683 | ExtraLoopPassManager<ShouldRunExtraSimpleLoopUnswitch> ; |
684 | ExtraPasses.addPass(Pass: SimpleLoopUnswitchPass(/* NonTrivial */ Level == |
685 | OptimizationLevel::O3)); |
686 | LPM2.addPass(Pass: std::move(ExtraPasses)); |
687 | } |
688 | |
689 | invokeLateLoopOptimizationsEPCallbacks(LPM&: LPM2, Level); |
690 | |
691 | LPM2.addPass(Pass: LoopDeletionPass()); |
692 | |
693 | // Do not enable unrolling in PreLinkThinLTO phase during sample PGO |
694 | // because it changes IR to makes profile annotation in back compile |
695 | // inaccurate. The normal unroller doesn't pay attention to forced full unroll |
696 | // attributes so we need to make sure and allow the full unroll pass to pay |
697 | // attention to it. |
698 | if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt || |
699 | PGOOpt->Action != PGOOptions::SampleUse) |
700 | LPM2.addPass(Pass: LoopFullUnrollPass(Level.getSpeedupLevel(), |
701 | /* OnlyWhenForced= */ !PTO.LoopUnrolling, |
702 | PTO.ForgetAllSCEVInLoopUnroll)); |
703 | |
704 | invokeLoopOptimizerEndEPCallbacks(LPM&: LPM2, Level); |
705 | |
706 | FPM.addPass(Pass: createFunctionToLoopPassAdaptor(Pass: std::move(LPM1), |
707 | /*UseMemorySSA=*/true, |
708 | /*UseBlockFrequencyInfo=*/true)); |
709 | FPM.addPass( |
710 | Pass: SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(B: true))); |
711 | FPM.addPass(Pass: InstCombinePass()); |
712 | // The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass, |
713 | // LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA. |
714 | // *All* loop passes must preserve it, in order to be able to use it. |
715 | FPM.addPass(Pass: createFunctionToLoopPassAdaptor(Pass: std::move(LPM2), |
716 | /*UseMemorySSA=*/false, |
717 | /*UseBlockFrequencyInfo=*/false)); |
718 | |
719 | // Delete small array after loop unroll. |
720 | FPM.addPass(Pass: SROAPass(SROAOptions::ModifyCFG)); |
721 | |
722 | // Try vectorization/scalarization transforms that are both improvements |
723 | // themselves and can allow further folds with GVN and InstCombine. |
724 | FPM.addPass(Pass: VectorCombinePass(/*TryEarlyFoldsOnly=*/true)); |
725 | |
726 | // Eliminate redundancies. |
727 | FPM.addPass(Pass: MergedLoadStoreMotionPass()); |
728 | if (RunNewGVN) |
729 | FPM.addPass(Pass: NewGVNPass()); |
730 | else |
731 | FPM.addPass(Pass: GVNPass()); |
732 | |
733 | // Sparse conditional constant propagation. |
734 | // FIXME: It isn't clear why we do this *after* loop passes rather than |
735 | // before... |
736 | FPM.addPass(Pass: SCCPPass()); |
737 | |
738 | // Delete dead bit computations (instcombine runs after to fold away the dead |
739 | // computations, and then ADCE will run later to exploit any new DCE |
740 | // opportunities that creates). |
741 | FPM.addPass(Pass: BDCEPass()); |
742 | |
743 | // Run instcombine after redundancy and dead bit elimination to exploit |
744 | // opportunities opened up by them. |
745 | FPM.addPass(Pass: InstCombinePass()); |
746 | invokePeepholeEPCallbacks(FPM, Level); |
747 | |
748 | // Re-consider control flow based optimizations after redundancy elimination, |
749 | // redo DCE, etc. |
750 | if (EnableDFAJumpThreading) |
751 | FPM.addPass(Pass: DFAJumpThreadingPass()); |
752 | |
753 | FPM.addPass(Pass: JumpThreadingPass()); |
754 | FPM.addPass(Pass: CorrelatedValuePropagationPass()); |
755 | |
756 | // Finally, do an expensive DCE pass to catch all the dead code exposed by |
757 | // the simplifications and basic cleanup after all the simplifications. |
758 | // TODO: Investigate if this is too expensive. |
759 | FPM.addPass(Pass: ADCEPass()); |
760 | |
761 | // Specially optimize memory movement as it doesn't look like dataflow in SSA. |
762 | FPM.addPass(Pass: MemCpyOptPass()); |
763 | |
764 | FPM.addPass(Pass: DSEPass()); |
765 | FPM.addPass(Pass: MoveAutoInitPass()); |
766 | |
767 | FPM.addPass(Pass: createFunctionToLoopPassAdaptor( |
768 | Pass: LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, |
769 | /*AllowSpeculation=*/true), |
770 | /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false)); |
771 | |
772 | FPM.addPass(Pass: CoroElidePass()); |
773 | |
774 | invokeScalarOptimizerLateEPCallbacks(FPM, Level); |
775 | |
776 | FPM.addPass(Pass: SimplifyCFGPass(SimplifyCFGOptions() |
777 | .convertSwitchRangeToICmp(B: true) |
778 | .hoistCommonInsts(B: true) |
779 | .sinkCommonInsts(B: true))); |
780 | FPM.addPass(Pass: InstCombinePass()); |
781 | invokePeepholeEPCallbacks(FPM, Level); |
782 | |
783 | return FPM; |
784 | } |
785 | |
786 | void PassBuilder::addRequiredLTOPreLinkPasses(ModulePassManager &MPM) { |
787 | MPM.addPass(Pass: CanonicalizeAliasesPass()); |
788 | MPM.addPass(Pass: NameAnonGlobalPass()); |
789 | } |
790 | |
791 | void PassBuilder::addPreInlinerPasses(ModulePassManager &MPM, |
792 | OptimizationLevel Level, |
793 | ThinOrFullLTOPhase LTOPhase) { |
794 | assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!" ); |
795 | if (DisablePreInliner) |
796 | return; |
797 | InlineParams IP; |
798 | |
799 | IP.DefaultThreshold = PreInlineThreshold; |
800 | |
801 | // FIXME: The hint threshold has the same value used by the regular inliner |
802 | // when not optimzing for size. This should probably be lowered after |
803 | // performance testing. |
804 | // FIXME: this comment is cargo culted from the old pass manager, revisit). |
805 | IP.HintThreshold = Level.isOptimizingForSize() ? PreInlineThreshold : 325; |
806 | ModuleInlinerWrapperPass MIWP( |
807 | IP, /* MandatoryFirst */ true, |
808 | InlineContext{.LTOPhase: LTOPhase, .Pass: InlinePass::EarlyInliner}); |
809 | CGSCCPassManager &CGPipeline = MIWP.getPM(); |
810 | |
811 | FunctionPassManager FPM; |
812 | FPM.addPass(Pass: SROAPass(SROAOptions::ModifyCFG)); |
813 | FPM.addPass(Pass: EarlyCSEPass()); // Catch trivial redundancies. |
814 | FPM.addPass(Pass: SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp( |
815 | B: true))); // Merge & remove basic blocks. |
816 | FPM.addPass(Pass: InstCombinePass()); // Combine silly sequences. |
817 | invokePeepholeEPCallbacks(FPM, Level); |
818 | |
819 | CGPipeline.addPass(Pass: createCGSCCToFunctionPassAdaptor( |
820 | Pass: std::move(FPM), EagerlyInvalidate: PTO.EagerlyInvalidateAnalyses)); |
821 | |
822 | MPM.addPass(Pass: std::move(MIWP)); |
823 | |
824 | // Delete anything that is now dead to make sure that we don't instrument |
825 | // dead code. Instrumentation can end up keeping dead code around and |
826 | // dramatically increase code size. |
827 | MPM.addPass(Pass: GlobalDCEPass()); |
828 | } |
829 | |
830 | void PassBuilder::addPostPGOLoopRotation(ModulePassManager &MPM, |
831 | OptimizationLevel Level) { |
832 | if (EnablePostPGOLoopRotation) { |
833 | // Disable header duplication in loop rotation at -Oz. |
834 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor( |
835 | Pass: createFunctionToLoopPassAdaptor( |
836 | Pass: LoopRotatePass(EnableLoopHeaderDuplication || |
837 | Level != OptimizationLevel::Oz), |
838 | /*UseMemorySSA=*/false, |
839 | /*UseBlockFrequencyInfo=*/false), |
840 | EagerlyInvalidate: PTO.EagerlyInvalidateAnalyses)); |
841 | } |
842 | } |
843 | |
844 | void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, |
845 | OptimizationLevel Level, bool RunProfileGen, |
846 | bool IsCS, bool AtomicCounterUpdate, |
847 | std::string ProfileFile, |
848 | std::string ProfileRemappingFile, |
849 | IntrusiveRefCntPtr<vfs::FileSystem> FS) { |
850 | assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!" ); |
851 | |
852 | if (!RunProfileGen) { |
853 | assert(!ProfileFile.empty() && "Profile use expecting a profile file!" ); |
854 | MPM.addPass( |
855 | Pass: PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS, FS)); |
856 | // Cache ProfileSummaryAnalysis once to avoid the potential need to insert |
857 | // RequireAnalysisPass for PSI before subsequent non-module passes. |
858 | MPM.addPass(Pass: RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); |
859 | return; |
860 | } |
861 | |
862 | // Perform PGO instrumentation. |
863 | MPM.addPass(Pass: PGOInstrumentationGen(IsCS ? PGOInstrumentationType::CSFDO |
864 | : PGOInstrumentationType::FDO)); |
865 | |
866 | addPostPGOLoopRotation(MPM, Level); |
867 | // Add the profile lowering pass. |
868 | InstrProfOptions Options; |
869 | if (!ProfileFile.empty()) |
870 | Options.InstrProfileOutput = ProfileFile; |
871 | // Do counter promotion at Level greater than O0. |
872 | Options.DoCounterPromotion = true; |
873 | Options.UseBFIInPromotion = IsCS; |
874 | if (EnableSampledInstr) { |
875 | Options.Sampling = true; |
876 | // With sampling, there is little beneifit to enable counter promotion. |
877 | // But note that sampling does work with counter promotion. |
878 | Options.DoCounterPromotion = false; |
879 | } |
880 | Options.Atomic = AtomicCounterUpdate; |
881 | MPM.addPass(Pass: InstrProfilingLoweringPass(Options, IsCS)); |
882 | } |
883 | |
884 | void PassBuilder::addPGOInstrPassesForO0( |
885 | ModulePassManager &MPM, bool RunProfileGen, bool IsCS, |
886 | bool AtomicCounterUpdate, std::string ProfileFile, |
887 | std::string ProfileRemappingFile, IntrusiveRefCntPtr<vfs::FileSystem> FS) { |
888 | if (!RunProfileGen) { |
889 | assert(!ProfileFile.empty() && "Profile use expecting a profile file!" ); |
890 | MPM.addPass( |
891 | Pass: PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS, FS)); |
892 | // Cache ProfileSummaryAnalysis once to avoid the potential need to insert |
893 | // RequireAnalysisPass for PSI before subsequent non-module passes. |
894 | MPM.addPass(Pass: RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); |
895 | return; |
896 | } |
897 | |
898 | // Perform PGO instrumentation. |
899 | MPM.addPass(Pass: PGOInstrumentationGen(IsCS ? PGOInstrumentationType::CSFDO |
900 | : PGOInstrumentationType::FDO)); |
901 | // Add the profile lowering pass. |
902 | InstrProfOptions Options; |
903 | if (!ProfileFile.empty()) |
904 | Options.InstrProfileOutput = ProfileFile; |
905 | // Do not do counter promotion at O0. |
906 | Options.DoCounterPromotion = false; |
907 | Options.UseBFIInPromotion = IsCS; |
908 | Options.Atomic = AtomicCounterUpdate; |
909 | MPM.addPass(Pass: InstrProfilingLoweringPass(Options, IsCS)); |
910 | } |
911 | |
912 | static InlineParams getInlineParamsFromOptLevel(OptimizationLevel Level) { |
913 | return getInlineParams(OptLevel: Level.getSpeedupLevel(), SizeOptLevel: Level.getSizeLevel()); |
914 | } |
915 | |
916 | ModuleInlinerWrapperPass |
917 | PassBuilder::buildInlinerPipeline(OptimizationLevel Level, |
918 | ThinOrFullLTOPhase Phase) { |
919 | InlineParams IP; |
920 | if (PTO.InlinerThreshold == -1) |
921 | IP = getInlineParamsFromOptLevel(Level); |
922 | else |
923 | IP = getInlineParams(Threshold: PTO.InlinerThreshold); |
924 | // For PreLinkThinLTO + SamplePGO or PreLinkFullLTO + SamplePGO, |
925 | // set hot-caller threshold to 0 to disable hot |
926 | // callsite inline (as much as possible [1]) because it makes |
927 | // profile annotation in the backend inaccurate. |
928 | // |
929 | // [1] Note the cost of a function could be below zero due to erased |
930 | // prologue / epilogue. |
931 | if (isLTOPreLink(Phase) && PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) |
932 | IP.HotCallSiteThreshold = 0; |
933 | |
934 | if (PGOOpt) |
935 | IP.EnableDeferral = EnablePGOInlineDeferral; |
936 | |
937 | ModuleInlinerWrapperPass MIWP(IP, PerformMandatoryInliningsFirst, |
938 | InlineContext{.LTOPhase: Phase, .Pass: InlinePass::CGSCCInliner}, |
939 | UseInlineAdvisor, MaxDevirtIterations); |
940 | |
941 | // Require the GlobalsAA analysis for the module so we can query it within |
942 | // the CGSCC pipeline. |
943 | if (EnableGlobalAnalyses) { |
944 | MIWP.addModulePass(Pass: RequireAnalysisPass<GlobalsAA, Module>()); |
945 | // Invalidate AAManager so it can be recreated and pick up the newly |
946 | // available GlobalsAA. |
947 | MIWP.addModulePass( |
948 | Pass: createModuleToFunctionPassAdaptor(Pass: InvalidateAnalysisPass<AAManager>())); |
949 | } |
950 | |
951 | // Require the ProfileSummaryAnalysis for the module so we can query it within |
952 | // the inliner pass. |
953 | MIWP.addModulePass(Pass: RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); |
954 | |
955 | // Now begin the main postorder CGSCC pipeline. |
956 | // FIXME: The current CGSCC pipeline has its origins in the legacy pass |
957 | // manager and trying to emulate its precise behavior. Much of this doesn't |
958 | // make a lot of sense and we should revisit the core CGSCC structure. |
959 | CGSCCPassManager &MainCGPipeline = MIWP.getPM(); |
960 | |
961 | // Note: historically, the PruneEH pass was run first to deduce nounwind and |
962 | // generally clean up exception handling overhead. It isn't clear this is |
963 | // valuable as the inliner doesn't currently care whether it is inlining an |
964 | // invoke or a call. |
965 | |
966 | if (AttributorRun & AttributorRunOption::CGSCC) |
967 | MainCGPipeline.addPass(Pass: AttributorCGSCCPass()); |
968 | |
969 | // Deduce function attributes. We do another run of this after the function |
970 | // simplification pipeline, so this only needs to run when it could affect the |
971 | // function simplification pipeline, which is only the case with recursive |
972 | // functions. |
973 | MainCGPipeline.addPass(Pass: PostOrderFunctionAttrsPass(/*SkipNonRecursive*/ true)); |
974 | |
975 | // When at O3 add argument promotion to the pass pipeline. |
976 | // FIXME: It isn't at all clear why this should be limited to O3. |
977 | if (Level == OptimizationLevel::O3) |
978 | MainCGPipeline.addPass(Pass: ArgumentPromotionPass()); |
979 | |
980 | // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if |
981 | // there are no OpenMP runtime calls present in the module. |
982 | if (Level == OptimizationLevel::O2 || Level == OptimizationLevel::O3) |
983 | MainCGPipeline.addPass(Pass: OpenMPOptCGSCCPass(Phase)); |
984 | |
985 | invokeCGSCCOptimizerLateEPCallbacks(CGPM&: MainCGPipeline, Level); |
986 | |
987 | // Add the core function simplification pipeline nested inside the |
988 | // CGSCC walk. |
989 | MainCGPipeline.addPass(Pass: createCGSCCToFunctionPassAdaptor( |
990 | Pass: buildFunctionSimplificationPipeline(Level, Phase), |
991 | EagerlyInvalidate: PTO.EagerlyInvalidateAnalyses, /*NoRerun=*/true)); |
992 | |
993 | // Finally, deduce any function attributes based on the fully simplified |
994 | // function. |
995 | MainCGPipeline.addPass(Pass: PostOrderFunctionAttrsPass()); |
996 | |
997 | // Mark that the function is fully simplified and that it shouldn't be |
998 | // simplified again if we somehow revisit it due to CGSCC mutations unless |
999 | // it's been modified since. |
1000 | MainCGPipeline.addPass(Pass: createCGSCCToFunctionPassAdaptor( |
1001 | Pass: RequireAnalysisPass<ShouldNotRunFunctionPassesAnalysis, Function>())); |
1002 | |
1003 | if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink) { |
1004 | MainCGPipeline.addPass(Pass: CoroSplitPass(Level != OptimizationLevel::O0)); |
1005 | MainCGPipeline.addPass(Pass: CoroAnnotationElidePass()); |
1006 | } |
1007 | |
1008 | // Make sure we don't affect potential future NoRerun CGSCC adaptors. |
1009 | MIWP.addLateModulePass(Pass: createModuleToFunctionPassAdaptor( |
1010 | Pass: InvalidateAnalysisPass<ShouldNotRunFunctionPassesAnalysis>())); |
1011 | |
1012 | return MIWP; |
1013 | } |
1014 | |
1015 | ModulePassManager |
1016 | PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level, |
1017 | ThinOrFullLTOPhase Phase) { |
1018 | ModulePassManager MPM; |
1019 | |
1020 | InlineParams IP = getInlineParamsFromOptLevel(Level); |
1021 | // For PreLinkThinLTO + SamplePGO or PreLinkFullLTO + SamplePGO, |
1022 | // set hot-caller threshold to 0 to disable hot |
1023 | // callsite inline (as much as possible [1]) because it makes |
1024 | // profile annotation in the backend inaccurate. |
1025 | // |
1026 | // [1] Note the cost of a function could be below zero due to erased |
1027 | // prologue / epilogue. |
1028 | if (isLTOPreLink(Phase) && PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) |
1029 | IP.HotCallSiteThreshold = 0; |
1030 | |
1031 | if (PGOOpt) |
1032 | IP.EnableDeferral = EnablePGOInlineDeferral; |
1033 | |
1034 | // The inline deferral logic is used to avoid losing some |
1035 | // inlining chance in future. It is helpful in SCC inliner, in which |
1036 | // inlining is processed in bottom-up order. |
1037 | // While in module inliner, the inlining order is a priority-based order |
1038 | // by default. The inline deferral is unnecessary there. So we disable the |
1039 | // inline deferral logic in module inliner. |
1040 | IP.EnableDeferral = false; |
1041 | |
1042 | MPM.addPass(Pass: ModuleInlinerPass(IP, UseInlineAdvisor, Phase)); |
1043 | if (!UseCtxProfile.empty() && Phase == ThinOrFullLTOPhase::ThinLTOPostLink) { |
1044 | MPM.addPass(Pass: GlobalOptPass()); |
1045 | MPM.addPass(Pass: GlobalDCEPass()); |
1046 | MPM.addPass(Pass: PGOCtxProfFlatteningPass(/*IsPreThinlink=*/false)); |
1047 | } |
1048 | |
1049 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor( |
1050 | Pass: buildFunctionSimplificationPipeline(Level, Phase), |
1051 | EagerlyInvalidate: PTO.EagerlyInvalidateAnalyses)); |
1052 | |
1053 | if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink) { |
1054 | MPM.addPass(Pass: createModuleToPostOrderCGSCCPassAdaptor( |
1055 | Pass: CoroSplitPass(Level != OptimizationLevel::O0))); |
1056 | MPM.addPass( |
1057 | Pass: createModuleToPostOrderCGSCCPassAdaptor(Pass: CoroAnnotationElidePass())); |
1058 | } |
1059 | |
1060 | return MPM; |
1061 | } |
1062 | |
1063 | ModulePassManager |
1064 | PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, |
1065 | ThinOrFullLTOPhase Phase) { |
1066 | assert(Level != OptimizationLevel::O0 && |
1067 | "Should not be used for O0 pipeline" ); |
1068 | |
1069 | assert(Phase != ThinOrFullLTOPhase::FullLTOPostLink && |
1070 | "FullLTOPostLink shouldn't call buildModuleSimplificationPipeline!" ); |
1071 | |
1072 | ModulePassManager MPM; |
1073 | |
1074 | // Place pseudo probe instrumentation as the first pass of the pipeline to |
1075 | // minimize the impact of optimization changes. |
1076 | if (PGOOpt && PGOOpt->PseudoProbeForProfiling && |
1077 | Phase != ThinOrFullLTOPhase::ThinLTOPostLink) |
1078 | MPM.addPass(Pass: SampleProfileProbePass(TM)); |
1079 | |
1080 | bool HasSampleProfile = PGOOpt && (PGOOpt->Action == PGOOptions::SampleUse); |
1081 | |
1082 | // In ThinLTO mode, when flattened profile is used, all the available |
1083 | // profile information will be annotated in PreLink phase so there is |
1084 | // no need to load the profile again in PostLink. |
1085 | bool LoadSampleProfile = |
1086 | HasSampleProfile && |
1087 | !(FlattenedProfileUsed && Phase == ThinOrFullLTOPhase::ThinLTOPostLink); |
1088 | |
1089 | // During the ThinLTO backend phase we perform early indirect call promotion |
1090 | // here, before globalopt. Otherwise imported available_externally functions |
1091 | // look unreferenced and are removed. If we are going to load the sample |
1092 | // profile then defer until later. |
1093 | // TODO: See if we can move later and consolidate with the location where |
1094 | // we perform ICP when we are loading a sample profile. |
1095 | // TODO: We pass HasSampleProfile (whether there was a sample profile file |
1096 | // passed to the compile) to the SamplePGO flag of ICP. This is used to |
1097 | // determine whether the new direct calls are annotated with prof metadata. |
1098 | // Ideally this should be determined from whether the IR is annotated with |
1099 | // sample profile, and not whether the a sample profile was provided on the |
1100 | // command line. E.g. for flattened profiles where we will not be reloading |
1101 | // the sample profile in the ThinLTO backend, we ideally shouldn't have to |
1102 | // provide the sample profile file. |
1103 | if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink && !LoadSampleProfile) |
1104 | MPM.addPass(Pass: PGOIndirectCallPromotion(true /* InLTO */, HasSampleProfile)); |
1105 | |
1106 | // Create an early function pass manager to cleanup the output of the |
1107 | // frontend. Not necessary with LTO post link pipelines since the pre link |
1108 | // pipeline already cleaned up the frontend output. |
1109 | if (Phase != ThinOrFullLTOPhase::ThinLTOPostLink) { |
1110 | // Do basic inference of function attributes from known properties of system |
1111 | // libraries and other oracles. |
1112 | MPM.addPass(Pass: InferFunctionAttrsPass()); |
1113 | MPM.addPass(Pass: CoroEarlyPass()); |
1114 | |
1115 | FunctionPassManager EarlyFPM; |
1116 | EarlyFPM.addPass(Pass: EntryExitInstrumenterPass(/*PostInlining=*/false)); |
1117 | // Lower llvm.expect to metadata before attempting transforms. |
1118 | // Compare/branch metadata may alter the behavior of passes like |
1119 | // SimplifyCFG. |
1120 | EarlyFPM.addPass(Pass: LowerExpectIntrinsicPass()); |
1121 | EarlyFPM.addPass(Pass: SimplifyCFGPass()); |
1122 | EarlyFPM.addPass(Pass: SROAPass(SROAOptions::ModifyCFG)); |
1123 | EarlyFPM.addPass(Pass: EarlyCSEPass()); |
1124 | if (Level == OptimizationLevel::O3) |
1125 | EarlyFPM.addPass(Pass: CallSiteSplittingPass()); |
1126 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor( |
1127 | Pass: std::move(EarlyFPM), EagerlyInvalidate: PTO.EagerlyInvalidateAnalyses)); |
1128 | } |
1129 | |
1130 | if (LoadSampleProfile) { |
1131 | // Annotate sample profile right after early FPM to ensure freshness of |
1132 | // the debug info. |
1133 | MPM.addPass(Pass: SampleProfileLoaderPass(PGOOpt->ProfileFile, |
1134 | PGOOpt->ProfileRemappingFile, Phase)); |
1135 | // Cache ProfileSummaryAnalysis once to avoid the potential need to insert |
1136 | // RequireAnalysisPass for PSI before subsequent non-module passes. |
1137 | MPM.addPass(Pass: RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); |
1138 | // Do not invoke ICP in the LTOPrelink phase as it makes it hard |
1139 | // for the profile annotation to be accurate in the LTO backend. |
1140 | if (!isLTOPreLink(Phase)) |
1141 | // We perform early indirect call promotion here, before globalopt. |
1142 | // This is important for the ThinLTO backend phase because otherwise |
1143 | // imported available_externally functions look unreferenced and are |
1144 | // removed. |
1145 | MPM.addPass( |
1146 | Pass: PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */)); |
1147 | } |
1148 | |
1149 | // Try to perform OpenMP specific optimizations on the module. This is a |
1150 | // (quick!) no-op if there are no OpenMP runtime calls present in the module. |
1151 | MPM.addPass(Pass: OpenMPOptPass(Phase)); |
1152 | |
1153 | if (AttributorRun & AttributorRunOption::MODULE) |
1154 | MPM.addPass(Pass: AttributorPass()); |
1155 | |
1156 | // Lower type metadata and the type.test intrinsic in the ThinLTO |
1157 | // post link pipeline after ICP. This is to enable usage of the type |
1158 | // tests in ICP sequences. |
1159 | if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink) |
1160 | MPM.addPass(Pass: LowerTypeTestsPass(nullptr, nullptr, |
1161 | lowertypetests::DropTestKind::Assume)); |
1162 | |
1163 | invokePipelineEarlySimplificationEPCallbacks(MPM, Level, Phase); |
1164 | |
1165 | // Interprocedural constant propagation now that basic cleanup has occurred |
1166 | // and prior to optimizing globals. |
1167 | // FIXME: This position in the pipeline hasn't been carefully considered in |
1168 | // years, it should be re-analyzed. |
1169 | MPM.addPass(Pass: IPSCCPPass( |
1170 | IPSCCPOptions(/*AllowFuncSpec=*/ |
1171 | Level != OptimizationLevel::Os && |
1172 | Level != OptimizationLevel::Oz && |
1173 | !isLTOPreLink(Phase)))); |
1174 | |
1175 | // Attach metadata to indirect call sites indicating the set of functions |
1176 | // they may target at run-time. This should follow IPSCCP. |
1177 | MPM.addPass(Pass: CalledValuePropagationPass()); |
1178 | |
1179 | // Optimize globals to try and fold them into constants. |
1180 | MPM.addPass(Pass: GlobalOptPass()); |
1181 | |
1182 | // Create a small function pass pipeline to cleanup after all the global |
1183 | // optimizations. |
1184 | FunctionPassManager GlobalCleanupPM; |
1185 | // FIXME: Should this instead by a run of SROA? |
1186 | GlobalCleanupPM.addPass(Pass: PromotePass()); |
1187 | GlobalCleanupPM.addPass(Pass: InstCombinePass()); |
1188 | invokePeepholeEPCallbacks(FPM&: GlobalCleanupPM, Level); |
1189 | GlobalCleanupPM.addPass( |
1190 | Pass: SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(B: true))); |
1191 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor(Pass: std::move(GlobalCleanupPM), |
1192 | EagerlyInvalidate: PTO.EagerlyInvalidateAnalyses)); |
1193 | |
1194 | // We already asserted this happens in non-FullLTOPostLink earlier. |
1195 | const bool IsPreLink = Phase != ThinOrFullLTOPhase::ThinLTOPostLink; |
1196 | // Enable contextual profiling instrumentation. |
1197 | const bool IsCtxProfGen = |
1198 | IsPreLink && PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled(); |
1199 | const bool IsPGOPreLink = !IsCtxProfGen && PGOOpt && IsPreLink; |
1200 | const bool IsPGOInstrGen = |
1201 | IsPGOPreLink && PGOOpt->Action == PGOOptions::IRInstr; |
1202 | const bool IsPGOInstrUse = |
1203 | IsPGOPreLink && PGOOpt->Action == PGOOptions::IRUse; |
1204 | const bool IsMemprofUse = IsPGOPreLink && !PGOOpt->MemoryProfile.empty(); |
1205 | // We don't want to mix pgo ctx gen and pgo gen; we also don't currently |
1206 | // enable ctx profiling from the frontend. |
1207 | assert(!(IsPGOInstrGen && PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled()) && |
1208 | "Enabling both instrumented PGO and contextual instrumentation is not " |
1209 | "supported." ); |
1210 | const bool IsCtxProfUse = |
1211 | !UseCtxProfile.empty() && Phase == ThinOrFullLTOPhase::ThinLTOPreLink; |
1212 | |
1213 | assert( |
1214 | (InstrumentColdFuncOnlyPath.empty() || PGOInstrumentColdFunctionOnly) && |
1215 | "--instrument-cold-function-only-path is provided but " |
1216 | "--pgo-instrument-cold-function-only is not enabled" ); |
1217 | const bool IsColdFuncOnlyInstrGen = PGOInstrumentColdFunctionOnly && |
1218 | IsPGOPreLink && |
1219 | !InstrumentColdFuncOnlyPath.empty(); |
1220 | |
1221 | if (IsPGOInstrGen || IsPGOInstrUse || IsMemprofUse || IsCtxProfGen || |
1222 | IsCtxProfUse || IsColdFuncOnlyInstrGen) |
1223 | addPreInlinerPasses(MPM, Level, LTOPhase: Phase); |
1224 | |
1225 | // Add all the requested passes for instrumentation PGO, if requested. |
1226 | if (IsPGOInstrGen || IsPGOInstrUse) { |
1227 | addPGOInstrPasses(MPM, Level, |
1228 | /*RunProfileGen=*/IsPGOInstrGen, |
1229 | /*IsCS=*/false, AtomicCounterUpdate: PGOOpt->AtomicCounterUpdate, |
1230 | ProfileFile: PGOOpt->ProfileFile, ProfileRemappingFile: PGOOpt->ProfileRemappingFile, |
1231 | FS: PGOOpt->FS); |
1232 | } else if (IsCtxProfGen || IsCtxProfUse) { |
1233 | MPM.addPass(Pass: PGOInstrumentationGen(PGOInstrumentationType::CTXPROF)); |
1234 | // In pre-link, we just want the instrumented IR. We use the contextual |
1235 | // profile in the post-thinlink phase. |
1236 | // The instrumentation will be removed in post-thinlink after IPO. |
1237 | // FIXME(mtrofin): move AssignGUIDPass if there is agreement to use this |
1238 | // mechanism for GUIDs. |
1239 | MPM.addPass(Pass: AssignGUIDPass()); |
1240 | if (IsCtxProfUse) { |
1241 | MPM.addPass(Pass: PGOCtxProfFlatteningPass(/*IsPreThinlink=*/true)); |
1242 | return MPM; |
1243 | } |
1244 | // Block further inlining in the instrumented ctxprof case. This avoids |
1245 | // confusingly collecting profiles for the same GUID corresponding to |
1246 | // different variants of the function. We could do like PGO and identify |
1247 | // functions by a (GUID, Hash) tuple, but since the ctxprof "use" waits for |
1248 | // thinlto to happen before performing any further optimizations, it's |
1249 | // unnecessary to collect profiles for non-prevailing copies. |
1250 | MPM.addPass(Pass: NoinlineNonPrevailing()); |
1251 | addPostPGOLoopRotation(MPM, Level); |
1252 | MPM.addPass(Pass: PGOCtxProfLoweringPass()); |
1253 | } else if (IsColdFuncOnlyInstrGen) { |
1254 | addPGOInstrPasses( |
1255 | MPM, Level, /* RunProfileGen */ true, /* IsCS */ false, |
1256 | /* AtomicCounterUpdate */ false, ProfileFile: InstrumentColdFuncOnlyPath, |
1257 | /* ProfileRemappingFile */ "" , FS: IntrusiveRefCntPtr<vfs::FileSystem>()); |
1258 | } |
1259 | |
1260 | if (IsPGOInstrGen || IsPGOInstrUse || IsCtxProfGen) |
1261 | MPM.addPass(Pass: PGOIndirectCallPromotion(false, false)); |
1262 | |
1263 | if (IsPGOPreLink && PGOOpt->CSAction == PGOOptions::CSIRInstr) |
1264 | MPM.addPass(Pass: PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile, |
1265 | EnableSampledInstr)); |
1266 | |
1267 | if (IsMemprofUse) |
1268 | MPM.addPass(Pass: MemProfUsePass(PGOOpt->MemoryProfile, PGOOpt->FS)); |
1269 | |
1270 | if (PGOOpt && (PGOOpt->Action == PGOOptions::IRUse || |
1271 | PGOOpt->Action == PGOOptions::SampleUse)) |
1272 | MPM.addPass(Pass: PGOForceFunctionAttrsPass(PGOOpt->ColdOptType)); |
1273 | |
1274 | MPM.addPass(Pass: AlwaysInlinerPass(/*InsertLifetimeIntrinsics=*/true)); |
1275 | |
1276 | if (EnableModuleInliner) |
1277 | MPM.addPass(Pass: buildModuleInlinerPipeline(Level, Phase)); |
1278 | else |
1279 | MPM.addPass(Pass: buildInlinerPipeline(Level, Phase)); |
1280 | |
1281 | // Remove any dead arguments exposed by cleanups, constant folding globals, |
1282 | // and argument promotion. |
1283 | MPM.addPass(Pass: DeadArgumentEliminationPass()); |
1284 | |
1285 | if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink) |
1286 | MPM.addPass(Pass: SimplifyTypeTestsPass()); |
1287 | |
1288 | if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink) |
1289 | MPM.addPass(Pass: CoroCleanupPass()); |
1290 | |
1291 | // Optimize globals now that functions are fully simplified. |
1292 | MPM.addPass(Pass: GlobalOptPass()); |
1293 | MPM.addPass(Pass: GlobalDCEPass()); |
1294 | |
1295 | return MPM; |
1296 | } |
1297 | |
1298 | /// TODO: Should LTO cause any differences to this set of passes? |
1299 | void PassBuilder::addVectorPasses(OptimizationLevel Level, |
1300 | FunctionPassManager &FPM, bool IsFullLTO) { |
1301 | FPM.addPass(Pass: LoopVectorizePass( |
1302 | LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); |
1303 | |
1304 | FPM.addPass(Pass: InferAlignmentPass()); |
1305 | if (IsFullLTO) { |
1306 | // The vectorizer may have significantly shortened a loop body; unroll |
1307 | // again. Unroll small loops to hide loop backedge latency and saturate any |
1308 | // parallel execution resources of an out-of-order processor. We also then |
1309 | // need to clean up redundancies and loop invariant code. |
1310 | // FIXME: It would be really good to use a loop-integrated instruction |
1311 | // combiner for cleanup here so that the unrolling and LICM can be pipelined |
1312 | // across the loop nests. |
1313 | // We do UnrollAndJam in a separate LPM to ensure it happens before unroll |
1314 | if (EnableUnrollAndJam && PTO.LoopUnrolling) |
1315 | FPM.addPass(Pass: createFunctionToLoopPassAdaptor( |
1316 | Pass: LoopUnrollAndJamPass(Level.getSpeedupLevel()))); |
1317 | FPM.addPass(Pass: LoopUnrollPass(LoopUnrollOptions( |
1318 | Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, |
1319 | PTO.ForgetAllSCEVInLoopUnroll))); |
1320 | FPM.addPass(Pass: WarnMissedTransformationsPass()); |
1321 | // Now that we are done with loop unrolling, be it either by LoopVectorizer, |
1322 | // or LoopUnroll passes, some variable-offset GEP's into alloca's could have |
1323 | // become constant-offset, thus enabling SROA and alloca promotion. Do so. |
1324 | // NOTE: we are very late in the pipeline, and we don't have any LICM |
1325 | // or SimplifyCFG passes scheduled after us, that would cleanup |
1326 | // the CFG mess this may created if allowed to modify CFG, so forbid that. |
1327 | FPM.addPass(Pass: SROAPass(SROAOptions::PreserveCFG)); |
1328 | } |
1329 | |
1330 | if (!IsFullLTO) { |
1331 | // Eliminate loads by forwarding stores from the previous iteration to loads |
1332 | // of the current iteration. |
1333 | FPM.addPass(Pass: LoopLoadEliminationPass()); |
1334 | } |
1335 | // Cleanup after the loop optimization passes. |
1336 | FPM.addPass(Pass: InstCombinePass()); |
1337 | |
1338 | if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { |
1339 | ExtraFunctionPassManager<ShouldRunExtraVectorPasses> ; |
1340 | // At higher optimization levels, try to clean up any runtime overlap and |
1341 | // alignment checks inserted by the vectorizer. We want to track correlated |
1342 | // runtime checks for two inner loops in the same outer loop, fold any |
1343 | // common computations, hoist loop-invariant aspects out of any outer loop, |
1344 | // and unswitch the runtime checks if possible. Once hoisted, we may have |
1345 | // dead (or speculatable) control flows or more combining opportunities. |
1346 | ExtraPasses.addPass(Pass: EarlyCSEPass()); |
1347 | ExtraPasses.addPass(Pass: CorrelatedValuePropagationPass()); |
1348 | ExtraPasses.addPass(Pass: InstCombinePass()); |
1349 | LoopPassManager LPM; |
1350 | LPM.addPass(Pass: LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, |
1351 | /*AllowSpeculation=*/true)); |
1352 | LPM.addPass(Pass: SimpleLoopUnswitchPass(/* NonTrivial */ Level == |
1353 | OptimizationLevel::O3)); |
1354 | ExtraPasses.addPass( |
1355 | Pass: createFunctionToLoopPassAdaptor(Pass: std::move(LPM), /*UseMemorySSA=*/true, |
1356 | /*UseBlockFrequencyInfo=*/true)); |
1357 | ExtraPasses.addPass( |
1358 | Pass: SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(B: true))); |
1359 | ExtraPasses.addPass(Pass: InstCombinePass()); |
1360 | FPM.addPass(Pass: std::move(ExtraPasses)); |
1361 | } |
1362 | |
1363 | // Now that we've formed fast to execute loop structures, we do further |
1364 | // optimizations. These are run afterward as they might block doing complex |
1365 | // analyses and transforms such as what are needed for loop vectorization. |
1366 | |
1367 | // Cleanup after loop vectorization, etc. Simplification passes like CVP and |
1368 | // GVN, loop transforms, and others have already run, so it's now better to |
1369 | // convert to more optimized IR using more aggressive simplify CFG options. |
1370 | // The extra sinking transform can create larger basic blocks, so do this |
1371 | // before SLP vectorization. |
1372 | FPM.addPass(Pass: SimplifyCFGPass(SimplifyCFGOptions() |
1373 | .forwardSwitchCondToPhi(B: true) |
1374 | .convertSwitchRangeToICmp(B: true) |
1375 | .convertSwitchToLookupTable(B: true) |
1376 | .needCanonicalLoops(B: false) |
1377 | .hoistCommonInsts(B: true) |
1378 | .sinkCommonInsts(B: true))); |
1379 | |
1380 | if (IsFullLTO) { |
1381 | FPM.addPass(Pass: SCCPPass()); |
1382 | FPM.addPass(Pass: InstCombinePass()); |
1383 | FPM.addPass(Pass: BDCEPass()); |
1384 | } |
1385 | |
1386 | // Optimize parallel scalar instruction chains into SIMD instructions. |
1387 | if (PTO.SLPVectorization) { |
1388 | FPM.addPass(Pass: SLPVectorizerPass()); |
1389 | if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { |
1390 | FPM.addPass(Pass: EarlyCSEPass()); |
1391 | } |
1392 | } |
1393 | // Enhance/cleanup vector code. |
1394 | FPM.addPass(Pass: VectorCombinePass()); |
1395 | |
1396 | if (!IsFullLTO) { |
1397 | FPM.addPass(Pass: InstCombinePass()); |
1398 | // Unroll small loops to hide loop backedge latency and saturate any |
1399 | // parallel execution resources of an out-of-order processor. We also then |
1400 | // need to clean up redundancies and loop invariant code. |
1401 | // FIXME: It would be really good to use a loop-integrated instruction |
1402 | // combiner for cleanup here so that the unrolling and LICM can be pipelined |
1403 | // across the loop nests. |
1404 | // We do UnrollAndJam in a separate LPM to ensure it happens before unroll |
1405 | if (EnableUnrollAndJam && PTO.LoopUnrolling) { |
1406 | FPM.addPass(Pass: createFunctionToLoopPassAdaptor( |
1407 | Pass: LoopUnrollAndJamPass(Level.getSpeedupLevel()))); |
1408 | } |
1409 | FPM.addPass(Pass: LoopUnrollPass(LoopUnrollOptions( |
1410 | Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, |
1411 | PTO.ForgetAllSCEVInLoopUnroll))); |
1412 | FPM.addPass(Pass: WarnMissedTransformationsPass()); |
1413 | // Now that we are done with loop unrolling, be it either by LoopVectorizer, |
1414 | // or LoopUnroll passes, some variable-offset GEP's into alloca's could have |
1415 | // become constant-offset, thus enabling SROA and alloca promotion. Do so. |
1416 | // NOTE: we are very late in the pipeline, and we don't have any LICM |
1417 | // or SimplifyCFG passes scheduled after us, that would cleanup |
1418 | // the CFG mess this may created if allowed to modify CFG, so forbid that. |
1419 | FPM.addPass(Pass: SROAPass(SROAOptions::PreserveCFG)); |
1420 | } |
1421 | |
1422 | FPM.addPass(Pass: InferAlignmentPass()); |
1423 | FPM.addPass(Pass: InstCombinePass()); |
1424 | |
1425 | // This is needed for two reasons: |
1426 | // 1. It works around problems that instcombine introduces, such as sinking |
1427 | // expensive FP divides into loops containing multiplications using the |
1428 | // divide result. |
1429 | // 2. It helps to clean up some loop-invariant code created by the loop |
1430 | // unroll pass when IsFullLTO=false. |
1431 | FPM.addPass(Pass: createFunctionToLoopPassAdaptor( |
1432 | Pass: LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, |
1433 | /*AllowSpeculation=*/true), |
1434 | /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false)); |
1435 | |
1436 | // Now that we've vectorized and unrolled loops, we may have more refined |
1437 | // alignment information, try to re-derive it here. |
1438 | FPM.addPass(Pass: AlignmentFromAssumptionsPass()); |
1439 | } |
1440 | |
1441 | ModulePassManager |
1442 | PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, |
1443 | ThinOrFullLTOPhase LTOPhase) { |
1444 | const bool LTOPreLink = isLTOPreLink(Phase: LTOPhase); |
1445 | ModulePassManager MPM; |
1446 | |
1447 | // Run partial inlining pass to partially inline functions that have |
1448 | // large bodies. |
1449 | if (RunPartialInlining) |
1450 | MPM.addPass(Pass: PartialInlinerPass()); |
1451 | |
1452 | // Remove avail extern fns and globals definitions since we aren't compiling |
1453 | // an object file for later LTO. For LTO we want to preserve these so they |
1454 | // are eligible for inlining at link-time. Note if they are unreferenced they |
1455 | // will be removed by GlobalDCE later, so this only impacts referenced |
1456 | // available externally globals. Eventually they will be suppressed during |
1457 | // codegen, but eliminating here enables more opportunity for GlobalDCE as it |
1458 | // may make globals referenced by available external functions dead and saves |
1459 | // running remaining passes on the eliminated functions. These should be |
1460 | // preserved during prelinking for link-time inlining decisions. |
1461 | if (!LTOPreLink) |
1462 | MPM.addPass(Pass: EliminateAvailableExternallyPass()); |
1463 | |
1464 | // Do RPO function attribute inference across the module to forward-propagate |
1465 | // attributes where applicable. |
1466 | // FIXME: Is this really an optimization rather than a canonicalization? |
1467 | MPM.addPass(Pass: ReversePostOrderFunctionAttrsPass()); |
1468 | |
1469 | // Do a post inline PGO instrumentation and use pass. This is a context |
1470 | // sensitive PGO pass. We don't want to do this in LTOPreLink phrase as |
1471 | // cross-module inline has not been done yet. The context sensitive |
1472 | // instrumentation is after all the inlines are done. |
1473 | if (!LTOPreLink && PGOOpt) { |
1474 | if (PGOOpt->CSAction == PGOOptions::CSIRInstr) |
1475 | addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/true, |
1476 | /*IsCS=*/true, AtomicCounterUpdate: PGOOpt->AtomicCounterUpdate, |
1477 | ProfileFile: PGOOpt->CSProfileGenFile, ProfileRemappingFile: PGOOpt->ProfileRemappingFile, |
1478 | FS: PGOOpt->FS); |
1479 | else if (PGOOpt->CSAction == PGOOptions::CSIRUse) |
1480 | addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/false, |
1481 | /*IsCS=*/true, AtomicCounterUpdate: PGOOpt->AtomicCounterUpdate, |
1482 | ProfileFile: PGOOpt->ProfileFile, ProfileRemappingFile: PGOOpt->ProfileRemappingFile, |
1483 | FS: PGOOpt->FS); |
1484 | } |
1485 | |
1486 | // Re-compute GlobalsAA here prior to function passes. This is particularly |
1487 | // useful as the above will have inlined, DCE'ed, and function-attr |
1488 | // propagated everything. We should at this point have a reasonably minimal |
1489 | // and richly annotated call graph. By computing aliasing and mod/ref |
1490 | // information for all local globals here, the late loop passes and notably |
1491 | // the vectorizer will be able to use them to help recognize vectorizable |
1492 | // memory operations. |
1493 | if (EnableGlobalAnalyses) |
1494 | MPM.addPass(Pass: RecomputeGlobalsAAPass()); |
1495 | |
1496 | invokeOptimizerEarlyEPCallbacks(MPM, Level, Phase: LTOPhase); |
1497 | |
1498 | FunctionPassManager OptimizePM; |
1499 | // Scheduling LoopVersioningLICM when inlining is over, because after that |
1500 | // we may see more accurate aliasing. Reason to run this late is that too |
1501 | // early versioning may prevent further inlining due to increase of code |
1502 | // size. Other optimizations which runs later might get benefit of no-alias |
1503 | // assumption in clone loop. |
1504 | if (UseLoopVersioningLICM) { |
1505 | OptimizePM.addPass( |
1506 | Pass: createFunctionToLoopPassAdaptor(Pass: LoopVersioningLICMPass())); |
1507 | // LoopVersioningLICM pass might increase new LICM opportunities. |
1508 | OptimizePM.addPass(Pass: createFunctionToLoopPassAdaptor( |
1509 | Pass: LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, |
1510 | /*AllowSpeculation=*/true), |
1511 | /*USeMemorySSA=*/UseMemorySSA: true, /*UseBlockFrequencyInfo=*/false)); |
1512 | } |
1513 | |
1514 | OptimizePM.addPass(Pass: Float2IntPass()); |
1515 | OptimizePM.addPass(Pass: LowerConstantIntrinsicsPass()); |
1516 | |
1517 | if (EnableMatrix) { |
1518 | OptimizePM.addPass(Pass: LowerMatrixIntrinsicsPass()); |
1519 | OptimizePM.addPass(Pass: EarlyCSEPass()); |
1520 | } |
1521 | |
1522 | // CHR pass should only be applied with the profile information. |
1523 | // The check is to check the profile summary information in CHR. |
1524 | if (EnableCHR && Level == OptimizationLevel::O3) |
1525 | OptimizePM.addPass(Pass: ControlHeightReductionPass()); |
1526 | |
1527 | // FIXME: We need to run some loop optimizations to re-rotate loops after |
1528 | // simplifycfg and others undo their rotation. |
1529 | |
1530 | // Optimize the loop execution. These passes operate on entire loop nests |
1531 | // rather than on each loop in an inside-out manner, and so they are actually |
1532 | // function passes. |
1533 | |
1534 | invokeVectorizerStartEPCallbacks(FPM&: OptimizePM, Level); |
1535 | |
1536 | LoopPassManager LPM; |
1537 | // First rotate loops that may have been un-rotated by prior passes. |
1538 | // Disable header duplication at -Oz. |
1539 | LPM.addPass(Pass: LoopRotatePass(EnableLoopHeaderDuplication || |
1540 | Level != OptimizationLevel::Oz, |
1541 | LTOPreLink)); |
1542 | // Some loops may have become dead by now. Try to delete them. |
1543 | // FIXME: see discussion in https://reviews.llvm.org/D112851, |
1544 | // this may need to be revisited once we run GVN before loop deletion |
1545 | // in the simplification pipeline. |
1546 | LPM.addPass(Pass: LoopDeletionPass()); |
1547 | |
1548 | if (PTO.LoopInterchange) |
1549 | LPM.addPass(Pass: LoopInterchangePass()); |
1550 | |
1551 | OptimizePM.addPass(Pass: createFunctionToLoopPassAdaptor( |
1552 | Pass: std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false)); |
1553 | |
1554 | // Distribute loops to allow partial vectorization. I.e. isolate dependences |
1555 | // into separate loop that would otherwise inhibit vectorization. This is |
1556 | // currently only performed for loops marked with the metadata |
1557 | // llvm.loop.distribute=true or when -enable-loop-distribute is specified. |
1558 | OptimizePM.addPass(Pass: LoopDistributePass()); |
1559 | |
1560 | // Populates the VFABI attribute with the scalar-to-vector mappings |
1561 | // from the TargetLibraryInfo. |
1562 | OptimizePM.addPass(Pass: InjectTLIMappings()); |
1563 | |
1564 | addVectorPasses(Level, FPM&: OptimizePM, /* IsFullLTO */ false); |
1565 | |
1566 | invokeVectorizerEndEPCallbacks(FPM&: OptimizePM, Level); |
1567 | |
1568 | // LoopSink pass sinks instructions hoisted by LICM, which serves as a |
1569 | // canonicalization pass that enables other optimizations. As a result, |
1570 | // LoopSink pass needs to be a very late IR pass to avoid undoing LICM |
1571 | // result too early. |
1572 | OptimizePM.addPass(Pass: LoopSinkPass()); |
1573 | |
1574 | // And finally clean up LCSSA form before generating code. |
1575 | OptimizePM.addPass(Pass: InstSimplifyPass()); |
1576 | |
1577 | // This hoists/decomposes div/rem ops. It should run after other sink/hoist |
1578 | // passes to avoid re-sinking, but before SimplifyCFG because it can allow |
1579 | // flattening of blocks. |
1580 | OptimizePM.addPass(Pass: DivRemPairsPass()); |
1581 | |
1582 | // Try to annotate calls that were created during optimization. |
1583 | OptimizePM.addPass( |
1584 | Pass: TailCallElimPass(/*UpdateFunctionEntryCount=*/isInstrumentedPGOUse())); |
1585 | |
1586 | // LoopSink (and other loop passes since the last simplifyCFG) might have |
1587 | // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. |
1588 | OptimizePM.addPass( |
1589 | Pass: SimplifyCFGPass(SimplifyCFGOptions() |
1590 | .convertSwitchRangeToICmp(B: true) |
1591 | .speculateUnpredictables(B: true) |
1592 | .hoistLoadsStoresWithCondFaulting(B: true))); |
1593 | |
1594 | // Add the core optimizing pipeline. |
1595 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor(Pass: std::move(OptimizePM), |
1596 | EagerlyInvalidate: PTO.EagerlyInvalidateAnalyses)); |
1597 | |
1598 | invokeOptimizerLastEPCallbacks(MPM, Level, Phase: LTOPhase); |
1599 | |
1600 | // Split out cold code. Splitting is done late to avoid hiding context from |
1601 | // other optimizations and inadvertently regressing performance. The tradeoff |
1602 | // is that this has a higher code size cost than splitting early. |
1603 | if (EnableHotColdSplit && !LTOPreLink) |
1604 | MPM.addPass(Pass: HotColdSplittingPass()); |
1605 | |
1606 | // Search the code for similar regions of code. If enough similar regions can |
1607 | // be found where extracting the regions into their own function will decrease |
1608 | // the size of the program, we extract the regions, a deduplicate the |
1609 | // structurally similar regions. |
1610 | if (EnableIROutliner) |
1611 | MPM.addPass(Pass: IROutlinerPass()); |
1612 | |
1613 | // Now we need to do some global optimization transforms. |
1614 | // FIXME: It would seem like these should come first in the optimization |
1615 | // pipeline and maybe be the bottom of the canonicalization pipeline? Weird |
1616 | // ordering here. |
1617 | MPM.addPass(Pass: GlobalDCEPass()); |
1618 | MPM.addPass(Pass: ConstantMergePass()); |
1619 | |
1620 | // Merge functions if requested. It has a better chance to merge functions |
1621 | // after ConstantMerge folded jump tables. |
1622 | if (PTO.MergeFunctions) |
1623 | MPM.addPass(Pass: MergeFunctionsPass()); |
1624 | |
1625 | if (PTO.CallGraphProfile && !LTOPreLink) |
1626 | MPM.addPass(Pass: CGProfilePass(isLTOPostLink(Phase: LTOPhase))); |
1627 | |
1628 | // RelLookupTableConverterPass runs later in LTO post-link pipeline. |
1629 | if (!LTOPreLink) |
1630 | MPM.addPass(Pass: RelLookupTableConverterPass()); |
1631 | |
1632 | return MPM; |
1633 | } |
1634 | |
1635 | ModulePassManager |
1636 | PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, |
1637 | ThinOrFullLTOPhase Phase) { |
1638 | if (Level == OptimizationLevel::O0) |
1639 | return buildO0DefaultPipeline(Level, Phase); |
1640 | |
1641 | ModulePassManager MPM; |
1642 | |
1643 | // Convert @llvm.global.annotations to !annotation metadata. |
1644 | MPM.addPass(Pass: Annotation2MetadataPass()); |
1645 | |
1646 | // Force any function attributes we want the rest of the pipeline to observe. |
1647 | MPM.addPass(Pass: ForceFunctionAttrsPass()); |
1648 | |
1649 | if (PGOOpt && PGOOpt->DebugInfoForProfiling) |
1650 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor(Pass: AddDiscriminatorsPass())); |
1651 | |
1652 | // Apply module pipeline start EP callback. |
1653 | invokePipelineStartEPCallbacks(MPM, Level); |
1654 | |
1655 | // Add the core simplification pipeline. |
1656 | MPM.addPass(Pass: buildModuleSimplificationPipeline(Level, Phase)); |
1657 | |
1658 | // Now add the optimization pipeline. |
1659 | MPM.addPass(Pass: buildModuleOptimizationPipeline(Level, LTOPhase: Phase)); |
1660 | |
1661 | if (PGOOpt && PGOOpt->PseudoProbeForProfiling && |
1662 | PGOOpt->Action == PGOOptions::SampleUse) |
1663 | MPM.addPass(Pass: PseudoProbeUpdatePass()); |
1664 | |
1665 | // Emit annotation remarks. |
1666 | addAnnotationRemarksPass(MPM); |
1667 | |
1668 | if (isLTOPreLink(Phase)) |
1669 | addRequiredLTOPreLinkPasses(MPM); |
1670 | return MPM; |
1671 | } |
1672 | |
1673 | ModulePassManager |
1674 | PassBuilder::buildFatLTODefaultPipeline(OptimizationLevel Level, bool ThinLTO, |
1675 | bool EmitSummary) { |
1676 | ModulePassManager MPM; |
1677 | if (ThinLTO) |
1678 | MPM.addPass(Pass: buildThinLTOPreLinkDefaultPipeline(Level)); |
1679 | else |
1680 | MPM.addPass(Pass: buildLTOPreLinkDefaultPipeline(Level)); |
1681 | MPM.addPass(Pass: EmbedBitcodePass(ThinLTO, EmitSummary)); |
1682 | |
1683 | // Perform any cleanups to the IR that aren't suitable for per TU compilation, |
1684 | // like removing CFI/WPD related instructions. Note, we reuse |
1685 | // LowerTypeTestsPass to clean up type tests rather than duplicate that logic |
1686 | // in FatLtoCleanup. |
1687 | MPM.addPass(Pass: FatLtoCleanup()); |
1688 | |
1689 | // If we're doing FatLTO w/ CFI enabled, we don't want the type tests in the |
1690 | // object code, only in the bitcode section, so drop it before we run |
1691 | // module optimization and generate machine code. If llvm.type.test() isn't in |
1692 | // the IR, this won't do anything. |
1693 | MPM.addPass( |
1694 | Pass: LowerTypeTestsPass(nullptr, nullptr, lowertypetests::DropTestKind::All)); |
1695 | |
1696 | // Use the ThinLTO post-link pipeline with sample profiling |
1697 | if (ThinLTO && PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) |
1698 | MPM.addPass(Pass: buildThinLTODefaultPipeline(Level, /*ImportSummary=*/nullptr)); |
1699 | else { |
1700 | // ModuleSimplification does not run the coroutine passes for |
1701 | // ThinLTOPreLink, so we need the coroutine passes to run for ThinLTO |
1702 | // builds, otherwise they will miscompile. |
1703 | if (ThinLTO) { |
1704 | // TODO: replace w/ buildCoroWrapper() when it takes phase and level into |
1705 | // consideration. |
1706 | CGSCCPassManager CGPM; |
1707 | CGPM.addPass(Pass: CoroSplitPass(Level != OptimizationLevel::O0)); |
1708 | CGPM.addPass(Pass: CoroAnnotationElidePass()); |
1709 | MPM.addPass(Pass: createModuleToPostOrderCGSCCPassAdaptor(Pass: std::move(CGPM))); |
1710 | MPM.addPass(Pass: CoroCleanupPass()); |
1711 | } |
1712 | |
1713 | // otherwise, just use module optimization |
1714 | MPM.addPass( |
1715 | Pass: buildModuleOptimizationPipeline(Level, LTOPhase: ThinOrFullLTOPhase::None)); |
1716 | // Emit annotation remarks. |
1717 | addAnnotationRemarksPass(MPM); |
1718 | } |
1719 | return MPM; |
1720 | } |
1721 | |
1722 | ModulePassManager |
1723 | PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) { |
1724 | if (Level == OptimizationLevel::O0) |
1725 | return buildO0DefaultPipeline(Level, Phase: ThinOrFullLTOPhase::ThinLTOPreLink); |
1726 | |
1727 | ModulePassManager MPM; |
1728 | |
1729 | // Convert @llvm.global.annotations to !annotation metadata. |
1730 | MPM.addPass(Pass: Annotation2MetadataPass()); |
1731 | |
1732 | // Force any function attributes we want the rest of the pipeline to observe. |
1733 | MPM.addPass(Pass: ForceFunctionAttrsPass()); |
1734 | |
1735 | if (PGOOpt && PGOOpt->DebugInfoForProfiling) |
1736 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor(Pass: AddDiscriminatorsPass())); |
1737 | |
1738 | // Apply module pipeline start EP callback. |
1739 | invokePipelineStartEPCallbacks(MPM, Level); |
1740 | |
1741 | // If we are planning to perform ThinLTO later, we don't bloat the code with |
1742 | // unrolling/vectorization/... now. Just simplify the module as much as we |
1743 | // can. |
1744 | MPM.addPass(Pass: buildModuleSimplificationPipeline( |
1745 | Level, Phase: ThinOrFullLTOPhase::ThinLTOPreLink)); |
1746 | // In pre-link, for ctx prof use, we stop here with an instrumented IR. We let |
1747 | // thinlto use the contextual info to perform imports; then use the contextual |
1748 | // profile in the post-thinlink phase. |
1749 | if (!UseCtxProfile.empty()) { |
1750 | addRequiredLTOPreLinkPasses(MPM); |
1751 | return MPM; |
1752 | } |
1753 | |
1754 | // Run partial inlining pass to partially inline functions that have |
1755 | // large bodies. |
1756 | // FIXME: It isn't clear whether this is really the right place to run this |
1757 | // in ThinLTO. Because there is another canonicalization and simplification |
1758 | // phase that will run after the thin link, running this here ends up with |
1759 | // less information than will be available later and it may grow functions in |
1760 | // ways that aren't beneficial. |
1761 | if (RunPartialInlining) |
1762 | MPM.addPass(Pass: PartialInlinerPass()); |
1763 | |
1764 | if (PGOOpt && PGOOpt->PseudoProbeForProfiling && |
1765 | PGOOpt->Action == PGOOptions::SampleUse) |
1766 | MPM.addPass(Pass: PseudoProbeUpdatePass()); |
1767 | |
1768 | // Handle Optimizer{Early,Last}EPCallbacks added by clang on PreLink. Actual |
1769 | // optimization is going to be done in PostLink stage, but clang can't add |
1770 | // callbacks there in case of in-process ThinLTO called by linker. |
1771 | invokeOptimizerEarlyEPCallbacks(MPM, Level, |
1772 | /*Phase=*/ThinOrFullLTOPhase::ThinLTOPreLink); |
1773 | invokeOptimizerLastEPCallbacks(MPM, Level, |
1774 | /*Phase=*/ThinOrFullLTOPhase::ThinLTOPreLink); |
1775 | |
1776 | // Emit annotation remarks. |
1777 | addAnnotationRemarksPass(MPM); |
1778 | |
1779 | addRequiredLTOPreLinkPasses(MPM); |
1780 | |
1781 | return MPM; |
1782 | } |
1783 | |
1784 | ModulePassManager PassBuilder::buildThinLTODefaultPipeline( |
1785 | OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary) { |
1786 | ModulePassManager MPM; |
1787 | |
1788 | if (ImportSummary) { |
1789 | // For ThinLTO we must apply the context disambiguation decisions early, to |
1790 | // ensure we can correctly match the callsites to summary data. |
1791 | if (EnableMemProfContextDisambiguation) |
1792 | MPM.addPass(Pass: MemProfContextDisambiguation( |
1793 | ImportSummary, PGOOpt && PGOOpt->Action == PGOOptions::SampleUse)); |
1794 | |
1795 | // These passes import type identifier resolutions for whole-program |
1796 | // devirtualization and CFI. They must run early because other passes may |
1797 | // disturb the specific instruction patterns that these passes look for, |
1798 | // creating dependencies on resolutions that may not appear in the summary. |
1799 | // |
1800 | // For example, GVN may transform the pattern assume(type.test) appearing in |
1801 | // two basic blocks into assume(phi(type.test, type.test)), which would |
1802 | // transform a dependency on a WPD resolution into a dependency on a type |
1803 | // identifier resolution for CFI. |
1804 | // |
1805 | // Also, WPD has access to more precise information than ICP and can |
1806 | // devirtualize more effectively, so it should operate on the IR first. |
1807 | // |
1808 | // The WPD and LowerTypeTest passes need to run at -O0 to lower type |
1809 | // metadata and intrinsics. |
1810 | MPM.addPass(Pass: WholeProgramDevirtPass(nullptr, ImportSummary)); |
1811 | MPM.addPass(Pass: LowerTypeTestsPass(nullptr, ImportSummary)); |
1812 | } |
1813 | |
1814 | if (Level == OptimizationLevel::O0) { |
1815 | // Run a second time to clean up any type tests left behind by WPD for use |
1816 | // in ICP. |
1817 | MPM.addPass(Pass: LowerTypeTestsPass(nullptr, nullptr, |
1818 | lowertypetests::DropTestKind::Assume)); |
1819 | // Drop available_externally and unreferenced globals. This is necessary |
1820 | // with ThinLTO in order to avoid leaving undefined references to dead |
1821 | // globals in the object file. |
1822 | MPM.addPass(Pass: EliminateAvailableExternallyPass()); |
1823 | MPM.addPass(Pass: GlobalDCEPass()); |
1824 | return MPM; |
1825 | } |
1826 | if (!UseCtxProfile.empty()) { |
1827 | MPM.addPass( |
1828 | Pass: buildModuleInlinerPipeline(Level, Phase: ThinOrFullLTOPhase::ThinLTOPostLink)); |
1829 | } else { |
1830 | // Add the core simplification pipeline. |
1831 | MPM.addPass(Pass: buildModuleSimplificationPipeline( |
1832 | Level, Phase: ThinOrFullLTOPhase::ThinLTOPostLink)); |
1833 | } |
1834 | // Now add the optimization pipeline. |
1835 | MPM.addPass(Pass: buildModuleOptimizationPipeline( |
1836 | Level, LTOPhase: ThinOrFullLTOPhase::ThinLTOPostLink)); |
1837 | |
1838 | // Emit annotation remarks. |
1839 | addAnnotationRemarksPass(MPM); |
1840 | |
1841 | return MPM; |
1842 | } |
1843 | |
1844 | ModulePassManager |
1845 | PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level) { |
1846 | // FIXME: We should use a customized pre-link pipeline! |
1847 | return buildPerModuleDefaultPipeline(Level, |
1848 | Phase: ThinOrFullLTOPhase::FullLTOPreLink); |
1849 | } |
1850 | |
1851 | ModulePassManager |
1852 | PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, |
1853 | ModuleSummaryIndex *ExportSummary) { |
1854 | ModulePassManager MPM; |
1855 | |
1856 | invokeFullLinkTimeOptimizationEarlyEPCallbacks(MPM, Level); |
1857 | |
1858 | // Create a function that performs CFI checks for cross-DSO calls with targets |
1859 | // in the current module. |
1860 | MPM.addPass(Pass: CrossDSOCFIPass()); |
1861 | |
1862 | if (Level == OptimizationLevel::O0) { |
1863 | // The WPD and LowerTypeTest passes need to run at -O0 to lower type |
1864 | // metadata and intrinsics. |
1865 | MPM.addPass(Pass: WholeProgramDevirtPass(ExportSummary, nullptr)); |
1866 | MPM.addPass(Pass: LowerTypeTestsPass(ExportSummary, nullptr)); |
1867 | // Run a second time to clean up any type tests left behind by WPD for use |
1868 | // in ICP. |
1869 | MPM.addPass(Pass: LowerTypeTestsPass(nullptr, nullptr, |
1870 | lowertypetests::DropTestKind::Assume)); |
1871 | |
1872 | MPM.addPass(Pass: buildCoroWrapper(Phase: ThinOrFullLTOPhase::FullLTOPostLink)); |
1873 | |
1874 | invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); |
1875 | |
1876 | // Emit annotation remarks. |
1877 | addAnnotationRemarksPass(MPM); |
1878 | |
1879 | return MPM; |
1880 | } |
1881 | |
1882 | if (PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) { |
1883 | // Load sample profile before running the LTO optimization pipeline. |
1884 | MPM.addPass(Pass: SampleProfileLoaderPass(PGOOpt->ProfileFile, |
1885 | PGOOpt->ProfileRemappingFile, |
1886 | ThinOrFullLTOPhase::FullLTOPostLink)); |
1887 | // Cache ProfileSummaryAnalysis once to avoid the potential need to insert |
1888 | // RequireAnalysisPass for PSI before subsequent non-module passes. |
1889 | MPM.addPass(Pass: RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); |
1890 | } |
1891 | |
1892 | // Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present. |
1893 | MPM.addPass(Pass: OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink)); |
1894 | |
1895 | // Remove unused virtual tables to improve the quality of code generated by |
1896 | // whole-program devirtualization and bitset lowering. |
1897 | MPM.addPass(Pass: GlobalDCEPass(/*InLTOPostLink=*/true)); |
1898 | |
1899 | // Do basic inference of function attributes from known properties of system |
1900 | // libraries and other oracles. |
1901 | MPM.addPass(Pass: InferFunctionAttrsPass()); |
1902 | |
1903 | if (Level.getSpeedupLevel() > 1) { |
1904 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor( |
1905 | Pass: CallSiteSplittingPass(), EagerlyInvalidate: PTO.EagerlyInvalidateAnalyses)); |
1906 | |
1907 | // Indirect call promotion. This should promote all the targets that are |
1908 | // left by the earlier promotion pass that promotes intra-module targets. |
1909 | // This two-step promotion is to save the compile time. For LTO, it should |
1910 | // produce the same result as if we only do promotion here. |
1911 | MPM.addPass(Pass: PGOIndirectCallPromotion( |
1912 | true /* InLTO */, PGOOpt && PGOOpt->Action == PGOOptions::SampleUse)); |
1913 | |
1914 | // Promoting by-reference arguments to by-value exposes more constants to |
1915 | // IPSCCP. |
1916 | CGSCCPassManager CGPM; |
1917 | CGPM.addPass(Pass: PostOrderFunctionAttrsPass()); |
1918 | CGPM.addPass(Pass: ArgumentPromotionPass()); |
1919 | CGPM.addPass( |
1920 | Pass: createCGSCCToFunctionPassAdaptor(Pass: SROAPass(SROAOptions::ModifyCFG))); |
1921 | MPM.addPass(Pass: createModuleToPostOrderCGSCCPassAdaptor(Pass: std::move(CGPM))); |
1922 | |
1923 | // Propagate constants at call sites into the functions they call. This |
1924 | // opens opportunities for globalopt (and inlining) by substituting function |
1925 | // pointers passed as arguments to direct uses of functions. |
1926 | MPM.addPass(Pass: IPSCCPPass(IPSCCPOptions(/*AllowFuncSpec=*/ |
1927 | Level != OptimizationLevel::Os && |
1928 | Level != OptimizationLevel::Oz))); |
1929 | |
1930 | // Attach metadata to indirect call sites indicating the set of functions |
1931 | // they may target at run-time. This should follow IPSCCP. |
1932 | MPM.addPass(Pass: CalledValuePropagationPass()); |
1933 | } |
1934 | |
1935 | // Do RPO function attribute inference across the module to forward-propagate |
1936 | // attributes where applicable. |
1937 | // FIXME: Is this really an optimization rather than a canonicalization? |
1938 | MPM.addPass(Pass: ReversePostOrderFunctionAttrsPass()); |
1939 | |
1940 | // Use in-range annotations on GEP indices to split globals where beneficial. |
1941 | MPM.addPass(Pass: GlobalSplitPass()); |
1942 | |
1943 | // Run whole program optimization of virtual call when the list of callees |
1944 | // is fixed. |
1945 | MPM.addPass(Pass: WholeProgramDevirtPass(ExportSummary, nullptr)); |
1946 | |
1947 | // Stop here at -O1. |
1948 | if (Level == OptimizationLevel::O1) { |
1949 | // The LowerTypeTestsPass needs to run to lower type metadata and the |
1950 | // type.test intrinsics. The pass does nothing if CFI is disabled. |
1951 | MPM.addPass(Pass: LowerTypeTestsPass(ExportSummary, nullptr)); |
1952 | // Run a second time to clean up any type tests left behind by WPD for use |
1953 | // in ICP (which is performed earlier than this in the regular LTO |
1954 | // pipeline). |
1955 | MPM.addPass(Pass: LowerTypeTestsPass(nullptr, nullptr, |
1956 | lowertypetests::DropTestKind::Assume)); |
1957 | |
1958 | MPM.addPass(Pass: buildCoroWrapper(Phase: ThinOrFullLTOPhase::FullLTOPostLink)); |
1959 | |
1960 | invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); |
1961 | |
1962 | // Emit annotation remarks. |
1963 | addAnnotationRemarksPass(MPM); |
1964 | |
1965 | return MPM; |
1966 | } |
1967 | |
1968 | // TODO: Skip to match buildCoroWrapper. |
1969 | MPM.addPass(Pass: CoroEarlyPass()); |
1970 | |
1971 | // Optimize globals to try and fold them into constants. |
1972 | MPM.addPass(Pass: GlobalOptPass()); |
1973 | |
1974 | // Promote any localized globals to SSA registers. |
1975 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor(Pass: PromotePass())); |
1976 | |
1977 | // Linking modules together can lead to duplicate global constant, only |
1978 | // keep one copy of each constant. |
1979 | MPM.addPass(Pass: ConstantMergePass()); |
1980 | |
1981 | // Remove unused arguments from functions. |
1982 | MPM.addPass(Pass: DeadArgumentEliminationPass()); |
1983 | |
1984 | // Reduce the code after globalopt and ipsccp. Both can open up significant |
1985 | // simplification opportunities, and both can propagate functions through |
1986 | // function pointers. When this happens, we often have to resolve varargs |
1987 | // calls, etc, so let instcombine do this. |
1988 | FunctionPassManager PeepholeFPM; |
1989 | PeepholeFPM.addPass(Pass: InstCombinePass()); |
1990 | if (Level.getSpeedupLevel() > 1) |
1991 | PeepholeFPM.addPass(Pass: AggressiveInstCombinePass()); |
1992 | invokePeepholeEPCallbacks(FPM&: PeepholeFPM, Level); |
1993 | |
1994 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor(Pass: std::move(PeepholeFPM), |
1995 | EagerlyInvalidate: PTO.EagerlyInvalidateAnalyses)); |
1996 | |
1997 | // Lower variadic functions for supported targets prior to inlining. |
1998 | MPM.addPass(Pass: ExpandVariadicsPass(ExpandVariadicsMode::Optimize)); |
1999 | |
2000 | // Note: historically, the PruneEH pass was run first to deduce nounwind and |
2001 | // generally clean up exception handling overhead. It isn't clear this is |
2002 | // valuable as the inliner doesn't currently care whether it is inlining an |
2003 | // invoke or a call. |
2004 | // Run the inliner now. |
2005 | if (EnableModuleInliner) { |
2006 | MPM.addPass(Pass: ModuleInlinerPass(getInlineParamsFromOptLevel(Level), |
2007 | UseInlineAdvisor, |
2008 | ThinOrFullLTOPhase::FullLTOPostLink)); |
2009 | } else { |
2010 | MPM.addPass(Pass: ModuleInlinerWrapperPass( |
2011 | getInlineParamsFromOptLevel(Level), |
2012 | /* MandatoryFirst */ true, |
2013 | InlineContext{.LTOPhase: ThinOrFullLTOPhase::FullLTOPostLink, |
2014 | .Pass: InlinePass::CGSCCInliner})); |
2015 | } |
2016 | |
2017 | // Perform context disambiguation after inlining, since that would reduce the |
2018 | // amount of additional cloning required to distinguish the allocation |
2019 | // contexts. |
2020 | if (EnableMemProfContextDisambiguation) |
2021 | MPM.addPass(Pass: MemProfContextDisambiguation( |
2022 | /*Summary=*/nullptr, |
2023 | PGOOpt && PGOOpt->Action == PGOOptions::SampleUse)); |
2024 | |
2025 | // Optimize globals again after we ran the inliner. |
2026 | MPM.addPass(Pass: GlobalOptPass()); |
2027 | |
2028 | // Run the OpenMPOpt pass again after global optimizations. |
2029 | MPM.addPass(Pass: OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink)); |
2030 | |
2031 | // Garbage collect dead functions. |
2032 | MPM.addPass(Pass: GlobalDCEPass(/*InLTOPostLink=*/true)); |
2033 | |
2034 | // If we didn't decide to inline a function, check to see if we can |
2035 | // transform it to pass arguments by value instead of by reference. |
2036 | CGSCCPassManager CGPM; |
2037 | CGPM.addPass(Pass: ArgumentPromotionPass()); |
2038 | CGPM.addPass(Pass: CoroSplitPass(Level != OptimizationLevel::O0)); |
2039 | CGPM.addPass(Pass: CoroAnnotationElidePass()); |
2040 | MPM.addPass(Pass: createModuleToPostOrderCGSCCPassAdaptor(Pass: std::move(CGPM))); |
2041 | |
2042 | FunctionPassManager FPM; |
2043 | // The IPO Passes may leave cruft around. Clean up after them. |
2044 | FPM.addPass(Pass: InstCombinePass()); |
2045 | invokePeepholeEPCallbacks(FPM, Level); |
2046 | |
2047 | if (EnableConstraintElimination) |
2048 | FPM.addPass(Pass: ConstraintEliminationPass()); |
2049 | |
2050 | FPM.addPass(Pass: JumpThreadingPass()); |
2051 | |
2052 | // Do a post inline PGO instrumentation and use pass. This is a context |
2053 | // sensitive PGO pass. |
2054 | if (PGOOpt) { |
2055 | if (PGOOpt->CSAction == PGOOptions::CSIRInstr) |
2056 | addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/true, |
2057 | /*IsCS=*/true, AtomicCounterUpdate: PGOOpt->AtomicCounterUpdate, |
2058 | ProfileFile: PGOOpt->CSProfileGenFile, ProfileRemappingFile: PGOOpt->ProfileRemappingFile, |
2059 | FS: PGOOpt->FS); |
2060 | else if (PGOOpt->CSAction == PGOOptions::CSIRUse) |
2061 | addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/false, |
2062 | /*IsCS=*/true, AtomicCounterUpdate: PGOOpt->AtomicCounterUpdate, |
2063 | ProfileFile: PGOOpt->ProfileFile, ProfileRemappingFile: PGOOpt->ProfileRemappingFile, |
2064 | FS: PGOOpt->FS); |
2065 | } |
2066 | |
2067 | // Break up allocas |
2068 | FPM.addPass(Pass: SROAPass(SROAOptions::ModifyCFG)); |
2069 | |
2070 | // LTO provides additional opportunities for tailcall elimination due to |
2071 | // link-time inlining, and visibility of nocapture attribute. |
2072 | FPM.addPass( |
2073 | Pass: TailCallElimPass(/*UpdateFunctionEntryCount=*/isInstrumentedPGOUse())); |
2074 | |
2075 | // Run a few AA driver optimizations here and now to cleanup the code. |
2076 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor(Pass: std::move(FPM), |
2077 | EagerlyInvalidate: PTO.EagerlyInvalidateAnalyses)); |
2078 | |
2079 | MPM.addPass( |
2080 | Pass: createModuleToPostOrderCGSCCPassAdaptor(Pass: PostOrderFunctionAttrsPass())); |
2081 | |
2082 | // Require the GlobalsAA analysis for the module so we can query it within |
2083 | // MainFPM. |
2084 | if (EnableGlobalAnalyses) { |
2085 | MPM.addPass(Pass: RequireAnalysisPass<GlobalsAA, Module>()); |
2086 | // Invalidate AAManager so it can be recreated and pick up the newly |
2087 | // available GlobalsAA. |
2088 | MPM.addPass( |
2089 | Pass: createModuleToFunctionPassAdaptor(Pass: InvalidateAnalysisPass<AAManager>())); |
2090 | } |
2091 | |
2092 | FunctionPassManager MainFPM; |
2093 | MainFPM.addPass(Pass: createFunctionToLoopPassAdaptor( |
2094 | Pass: LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, |
2095 | /*AllowSpeculation=*/true), |
2096 | /*USeMemorySSA=*/UseMemorySSA: true, /*UseBlockFrequencyInfo=*/false)); |
2097 | |
2098 | if (RunNewGVN) |
2099 | MainFPM.addPass(Pass: NewGVNPass()); |
2100 | else |
2101 | MainFPM.addPass(Pass: GVNPass()); |
2102 | |
2103 | // Remove dead memcpy()'s. |
2104 | MainFPM.addPass(Pass: MemCpyOptPass()); |
2105 | |
2106 | // Nuke dead stores. |
2107 | MainFPM.addPass(Pass: DSEPass()); |
2108 | MainFPM.addPass(Pass: MoveAutoInitPass()); |
2109 | MainFPM.addPass(Pass: MergedLoadStoreMotionPass()); |
2110 | |
2111 | invokeVectorizerStartEPCallbacks(FPM&: MainFPM, Level); |
2112 | |
2113 | LoopPassManager LPM; |
2114 | if (EnableLoopFlatten && Level.getSpeedupLevel() > 1) |
2115 | LPM.addPass(Pass: LoopFlattenPass()); |
2116 | LPM.addPass(Pass: IndVarSimplifyPass()); |
2117 | LPM.addPass(Pass: LoopDeletionPass()); |
2118 | // FIXME: Add loop interchange. |
2119 | |
2120 | // Unroll small loops and perform peeling. |
2121 | LPM.addPass(Pass: LoopFullUnrollPass(Level.getSpeedupLevel(), |
2122 | /* OnlyWhenForced= */ !PTO.LoopUnrolling, |
2123 | PTO.ForgetAllSCEVInLoopUnroll)); |
2124 | // The loop passes in LPM (LoopFullUnrollPass) do not preserve MemorySSA. |
2125 | // *All* loop passes must preserve it, in order to be able to use it. |
2126 | MainFPM.addPass(Pass: createFunctionToLoopPassAdaptor( |
2127 | Pass: std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/true)); |
2128 | |
2129 | MainFPM.addPass(Pass: LoopDistributePass()); |
2130 | |
2131 | addVectorPasses(Level, FPM&: MainFPM, /* IsFullLTO */ true); |
2132 | |
2133 | invokeVectorizerEndEPCallbacks(FPM&: MainFPM, Level); |
2134 | |
2135 | // Run the OpenMPOpt CGSCC pass again late. |
2136 | MPM.addPass(Pass: createModuleToPostOrderCGSCCPassAdaptor( |
2137 | Pass: OpenMPOptCGSCCPass(ThinOrFullLTOPhase::FullLTOPostLink))); |
2138 | |
2139 | invokePeepholeEPCallbacks(FPM&: MainFPM, Level); |
2140 | MainFPM.addPass(Pass: JumpThreadingPass()); |
2141 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor(Pass: std::move(MainFPM), |
2142 | EagerlyInvalidate: PTO.EagerlyInvalidateAnalyses)); |
2143 | |
2144 | // Lower type metadata and the type.test intrinsic. This pass supports |
2145 | // clang's control flow integrity mechanisms (-fsanitize=cfi*) and needs |
2146 | // to be run at link time if CFI is enabled. This pass does nothing if |
2147 | // CFI is disabled. |
2148 | MPM.addPass(Pass: LowerTypeTestsPass(ExportSummary, nullptr)); |
2149 | // Run a second time to clean up any type tests left behind by WPD for use |
2150 | // in ICP (which is performed earlier than this in the regular LTO pipeline). |
2151 | MPM.addPass(Pass: LowerTypeTestsPass(nullptr, nullptr, |
2152 | lowertypetests::DropTestKind::Assume)); |
2153 | |
2154 | // Enable splitting late in the FullLTO post-link pipeline. |
2155 | if (EnableHotColdSplit) |
2156 | MPM.addPass(Pass: HotColdSplittingPass()); |
2157 | |
2158 | // Add late LTO optimization passes. |
2159 | FunctionPassManager LateFPM; |
2160 | |
2161 | // LoopSink pass sinks instructions hoisted by LICM, which serves as a |
2162 | // canonicalization pass that enables other optimizations. As a result, |
2163 | // LoopSink pass needs to be a very late IR pass to avoid undoing LICM |
2164 | // result too early. |
2165 | LateFPM.addPass(Pass: LoopSinkPass()); |
2166 | |
2167 | // This hoists/decomposes div/rem ops. It should run after other sink/hoist |
2168 | // passes to avoid re-sinking, but before SimplifyCFG because it can allow |
2169 | // flattening of blocks. |
2170 | LateFPM.addPass(Pass: DivRemPairsPass()); |
2171 | |
2172 | // Delete basic blocks, which optimization passes may have killed. |
2173 | LateFPM.addPass(Pass: SimplifyCFGPass(SimplifyCFGOptions() |
2174 | .convertSwitchRangeToICmp(B: true) |
2175 | .hoistCommonInsts(B: true) |
2176 | .speculateUnpredictables(B: true))); |
2177 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor(Pass: std::move(LateFPM))); |
2178 | |
  // Drop bodies of available externally objects to improve GlobalDCE.
2180 | MPM.addPass(Pass: EliminateAvailableExternallyPass()); |
2181 | |
2182 | // Now that we have optimized the program, discard unreachable functions. |
2183 | MPM.addPass(Pass: GlobalDCEPass(/*InLTOPostLink=*/true)); |
2184 | |
2185 | if (PTO.MergeFunctions) |
2186 | MPM.addPass(Pass: MergeFunctionsPass()); |
2187 | |
2188 | MPM.addPass(Pass: RelLookupTableConverterPass()); |
2189 | |
2190 | if (PTO.CallGraphProfile) |
2191 | MPM.addPass(Pass: CGProfilePass(/*InLTOPostLink=*/true)); |
2192 | |
2193 | MPM.addPass(Pass: CoroCleanupPass()); |
2194 | |
2195 | invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); |
2196 | |
2197 | // Emit annotation remarks. |
2198 | addAnnotationRemarksPass(MPM); |
2199 | |
2200 | return MPM; |
2201 | } |
2202 | |
2203 | ModulePassManager |
2204 | PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level, |
2205 | ThinOrFullLTOPhase Phase) { |
2206 | assert(Level == OptimizationLevel::O0 && |
2207 | "buildO0DefaultPipeline should only be used with O0" ); |
2208 | |
2209 | ModulePassManager MPM; |
2210 | |
2211 | // Perform pseudo probe instrumentation in O0 mode. This is for the |
2212 | // consistency between different build modes. For example, a LTO build can be |
2213 | // mixed with an O0 prelink and an O2 postlink. Loading a sample profile in |
2214 | // the postlink will require pseudo probe instrumentation in the prelink. |
2215 | if (PGOOpt && PGOOpt->PseudoProbeForProfiling) |
2216 | MPM.addPass(Pass: SampleProfileProbePass(TM)); |
2217 | |
2218 | if (PGOOpt && (PGOOpt->Action == PGOOptions::IRInstr || |
2219 | PGOOpt->Action == PGOOptions::IRUse)) |
2220 | addPGOInstrPassesForO0( |
2221 | MPM, |
2222 | /*RunProfileGen=*/(PGOOpt->Action == PGOOptions::IRInstr), |
2223 | /*IsCS=*/false, AtomicCounterUpdate: PGOOpt->AtomicCounterUpdate, ProfileFile: PGOOpt->ProfileFile, |
2224 | ProfileRemappingFile: PGOOpt->ProfileRemappingFile, FS: PGOOpt->FS); |
2225 | |
2226 | // Instrument function entry and exit before all inlining. |
2227 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor( |
2228 | Pass: EntryExitInstrumenterPass(/*PostInlining=*/false))); |
2229 | |
2230 | invokePipelineStartEPCallbacks(MPM, Level); |
2231 | |
2232 | if (PGOOpt && PGOOpt->DebugInfoForProfiling) |
2233 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor(Pass: AddDiscriminatorsPass())); |
2234 | |
2235 | if (PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) { |
2236 | // Explicitly disable sample loader inlining and use flattened profile in O0 |
2237 | // pipeline. |
2238 | MPM.addPass(Pass: SampleProfileLoaderPass(PGOOpt->ProfileFile, |
2239 | PGOOpt->ProfileRemappingFile, |
2240 | ThinOrFullLTOPhase::None, nullptr, |
2241 | /*DisableSampleProfileInlining=*/true, |
2242 | /*UseFlattenedProfile=*/true)); |
2243 | // Cache ProfileSummaryAnalysis once to avoid the potential need to insert |
2244 | // RequireAnalysisPass for PSI before subsequent non-module passes. |
2245 | MPM.addPass(Pass: RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); |
2246 | } |
2247 | |
2248 | invokePipelineEarlySimplificationEPCallbacks(MPM, Level, Phase); |
2249 | |
2250 | // Build a minimal pipeline based on the semantics required by LLVM, |
2251 | // which is just that always inlining occurs. Further, disable generating |
2252 | // lifetime intrinsics to avoid enabling further optimizations during |
2253 | // code generation. |
2254 | MPM.addPass(Pass: AlwaysInlinerPass( |
2255 | /*InsertLifetimeIntrinsics=*/false)); |
2256 | |
2257 | if (PTO.MergeFunctions) |
2258 | MPM.addPass(Pass: MergeFunctionsPass()); |
2259 | |
2260 | if (EnableMatrix) |
2261 | MPM.addPass( |
2262 | Pass: createModuleToFunctionPassAdaptor(Pass: LowerMatrixIntrinsicsPass(true))); |
2263 | |
2264 | if (!CGSCCOptimizerLateEPCallbacks.empty()) { |
2265 | CGSCCPassManager CGPM; |
2266 | invokeCGSCCOptimizerLateEPCallbacks(CGPM, Level); |
2267 | if (!CGPM.isEmpty()) |
2268 | MPM.addPass(Pass: createModuleToPostOrderCGSCCPassAdaptor(Pass: std::move(CGPM))); |
2269 | } |
2270 | if (!LateLoopOptimizationsEPCallbacks.empty()) { |
2271 | LoopPassManager LPM; |
2272 | invokeLateLoopOptimizationsEPCallbacks(LPM, Level); |
2273 | if (!LPM.isEmpty()) { |
2274 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor( |
2275 | Pass: createFunctionToLoopPassAdaptor(Pass: std::move(LPM)))); |
2276 | } |
2277 | } |
2278 | if (!LoopOptimizerEndEPCallbacks.empty()) { |
2279 | LoopPassManager LPM; |
2280 | invokeLoopOptimizerEndEPCallbacks(LPM, Level); |
2281 | if (!LPM.isEmpty()) { |
2282 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor( |
2283 | Pass: createFunctionToLoopPassAdaptor(Pass: std::move(LPM)))); |
2284 | } |
2285 | } |
2286 | if (!ScalarOptimizerLateEPCallbacks.empty()) { |
2287 | FunctionPassManager FPM; |
2288 | invokeScalarOptimizerLateEPCallbacks(FPM, Level); |
2289 | if (!FPM.isEmpty()) |
2290 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor(Pass: std::move(FPM))); |
2291 | } |
2292 | |
2293 | invokeOptimizerEarlyEPCallbacks(MPM, Level, Phase); |
2294 | |
2295 | if (!VectorizerStartEPCallbacks.empty()) { |
2296 | FunctionPassManager FPM; |
2297 | invokeVectorizerStartEPCallbacks(FPM, Level); |
2298 | if (!FPM.isEmpty()) |
2299 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor(Pass: std::move(FPM))); |
2300 | } |
2301 | |
2302 | if (!VectorizerEndEPCallbacks.empty()) { |
2303 | FunctionPassManager FPM; |
2304 | invokeVectorizerEndEPCallbacks(FPM, Level); |
2305 | if (!FPM.isEmpty()) |
2306 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor(Pass: std::move(FPM))); |
2307 | } |
2308 | |
2309 | MPM.addPass(Pass: buildCoroWrapper(Phase)); |
2310 | |
2311 | invokeOptimizerLastEPCallbacks(MPM, Level, Phase); |
2312 | |
2313 | if (isLTOPreLink(Phase)) |
2314 | addRequiredLTOPreLinkPasses(MPM); |
2315 | |
2316 | MPM.addPass(Pass: createModuleToFunctionPassAdaptor(Pass: AnnotationRemarksPass())); |
2317 | |
2318 | return MPM; |
2319 | } |
2320 | |
2321 | AAManager PassBuilder::buildDefaultAAPipeline() { |
2322 | AAManager AA; |
2323 | |
2324 | // The order in which these are registered determines their priority when |
2325 | // being queried. |
2326 | |
2327 | // Add any target-specific alias analyses that should be run early. |
2328 | if (TM) |
2329 | TM->registerEarlyDefaultAliasAnalyses(AA); |
2330 | |
2331 | // First we register the basic alias analysis that provides the majority of |
2332 | // per-function local AA logic. This is a stateless, on-demand local set of |
2333 | // AA techniques. |
2334 | AA.registerFunctionAnalysis<BasicAA>(); |
2335 | |
2336 | // Next we query fast, specialized alias analyses that wrap IR-embedded |
2337 | // information about aliasing. |
2338 | AA.registerFunctionAnalysis<ScopedNoAliasAA>(); |
2339 | AA.registerFunctionAnalysis<TypeBasedAA>(); |
2340 | |
2341 | // Add support for querying global aliasing information when available. |
2342 | // Because the `AAManager` is a function analysis and `GlobalsAA` is a module |
2343 | // analysis, all that the `AAManager` can do is query for any *cached* |
2344 | // results from `GlobalsAA` through a readonly proxy. |
2345 | if (EnableGlobalAnalyses) |
2346 | AA.registerModuleAnalysis<GlobalsAA>(); |
2347 | |
2348 | // Add target-specific alias analyses. |
2349 | if (TM) |
2350 | TM->registerDefaultAliasAnalyses(AA); |
2351 | |
2352 | return AA; |
2353 | } |
2354 | |
2355 | bool PassBuilder::isInstrumentedPGOUse() const { |
2356 | return (PGOOpt && PGOOpt->Action == PGOOptions::IRUse) || |
2357 | !UseCtxProfile.empty(); |
2358 | } |