//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Top-level implementation for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetMachine.h"
#include "NVPTX.h"
#include "NVPTXAliasAnalysis.h"
#include "NVPTXAllocaHoisting.h"
#include "NVPTXAtomicLower.h"
#include "NVPTXCtorDtorLowering.h"
#include "NVPTXLowerAggrCopies.h"
#include "NVPTXMachineFunctionInfo.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXTargetTransformInfo.h"
#include "TargetInfo/NVPTXTargetInfo.h"
#include "llvm/Analysis/KernelInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Pass.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/IPO/ExpandVariadics.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include <cassert>
#include <optional>
#include <string>

using namespace llvm;

// LSV is still relatively new; this switch lets us turn it off in case we
// encounter (or suspect) a bug.
static cl::opt<bool>
    DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
                               cl::desc("Disable load/store vectorizer"),
                               cl::init(false), cl::Hidden);

// TODO: Remove this flag when we are confident with no regressions.
static cl::opt<bool> DisableRequireStructuredCFG(
    "disable-nvptx-require-structured-cfg",
    cl::desc("Transitional flag to turn off NVPTX's requirement on preserving "
             "structured CFG. The requirement should be disabled only when "
             "unexpected regressions happen."),
    cl::init(false), cl::Hidden);

static cl::opt<bool> UseShortPointersOpt(
    "nvptx-short-ptr",
    cl::desc(
        "Use 32-bit pointers for accessing const/local/shared address spaces."),
    cl::init(false), cl::Hidden);

// byval arguments in NVPTX are special. We're only allowed to read from them
// using a special instruction, and if we ever need to write to them or take
// an address, we must make a local copy and use it instead.
//
// The problem is that local copies are very expensive, and we create them very
// late in the compilation pipeline, so LLVM does not have much of a chance to
// eliminate them if they turn out to be unnecessary.
//
// One way around that is to create such copies early on and let them percolate
// through the optimizations. The copying itself will never trigger creation of
// another copy later on, as the reads are allowed. If LLVM can eliminate the
// copy, it's a win. If the full optimization pipeline can't remove the copy,
// that's as good as we could have done anyway, and it's certainly a much
// better effort than what we do now.
//
// This early injection of the copies has the potential to create undesirable
// side effects, so it's disabled by default, for now, until it sees more
// testing.
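//
// As an illustrative sketch (hypothetical IR, not actual pass output), the
// early copy for a byval argument would look roughly like:
//
//   define void @foo(ptr byval(i32) align 4 %arg) {
//     %arg.copy = alloca i32, align 4   ; local copy, created early
//     call void @llvm.memcpy.p0.p0.i64(ptr %arg.copy, ptr %arg, i64 4, i1 false)
//     ; ...all subsequent uses refer to %arg.copy, which SROA and friends
//     ; are free to optimize away if it proves unnecessary...
//   }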
static cl::opt<bool> EarlyByValArgsCopy(
    "nvptx-early-byval-copy",
    cl::desc("Create a copy of byval function arguments early."),
    cl::init(false), cl::Hidden);

extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
  // Register the target.
  RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
  RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());

  PassRegistry &PR = *PassRegistry::getPassRegistry();
  // FIXME: This pass is really intended to be invoked during IR optimization,
  // but it's very NVPTX-specific.
  initializeNVVMReflectLegacyPassPass(PR);
  initializeNVVMIntrRangePass(PR);
  initializeGenericToNVVMLegacyPassPass(PR);
  initializeNVPTXAllocaHoistingPass(PR);
  initializeNVPTXAsmPrinterPass(PR);
  initializeNVPTXAssignValidGlobalNamesPass(PR);
  initializeNVPTXAtomicLowerPass(PR);
  initializeNVPTXLowerArgsLegacyPassPass(PR);
  initializeNVPTXLowerAllocaPass(PR);
  initializeNVPTXLowerUnreachablePass(PR);
  initializeNVPTXCtorDtorLoweringLegacyPass(PR);
  initializeNVPTXLowerAggrCopiesPass(PR);
  initializeNVPTXProxyRegErasurePass(PR);
  initializeNVPTXForwardParamsPassPass(PR);
  initializeNVPTXDAGToDAGISelLegacyPass(PR);
  initializeNVPTXAAWrapperPassPass(PR);
  initializeNVPTXExternalAAWrapperPass(PR);
  initializeNVPTXPeepholePass(PR);
  initializeNVPTXTagInvariantLoadLegacyPassPass(PR);
  initializeNVPTXPrologEpilogPassPass(PR);
}

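// For reference (an illustration derived from the string-building logic
// below): the default 64-bit layout is
//   "e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
// and with -nvptx-short-ptr it becomes
//   "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"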
static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
  std::string Ret = "e";

  // Tensor Memory (addrspace:6) is always 32-bit.
  // Distributed Shared Memory (addrspace:7) follows shared memory
  // (addrspace:3).
  if (!is64Bit)
    Ret += "-p:32:32-p6:32:32-p7:32:32";
  else if (UseShortPointers)
    Ret += "-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32";
  else
    Ret += "-p6:32:32";

  Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";

  return Ret;
}

NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       std::optional<Reloc::Model> RM,
                                       std::optional<CodeModel::Model> CM,
                                       CodeGenOptLevel OL, bool is64bit)
    // The pic relocation model is used regardless of what the client has
    // specified, as it is the only relocation model currently supported.
    : CodeGenTargetMachineImpl(T,
                               computeDataLayout(is64bit, UseShortPointersOpt),
                               TT, CPU, FS, Options, Reloc::PIC_,
                               getEffectiveCodeModel(CM, CodeModel::Small), OL),
      is64bit(is64bit), TLOF(std::make_unique<NVPTXTargetObjectFile>()),
      Subtarget(TT, std::string(CPU), std::string(FS), *this),
      StrPool(StrAlloc) {
  if (TT.getOS() == Triple::NVCL)
    drvInterface = NVPTX::NVCL;
  else
    drvInterface = NVPTX::CUDA;
  if (!DisableRequireStructuredCFG)
    setRequiresStructuredCFG(true);
  initAsmInfo();
}

NVPTXTargetMachine::~NVPTXTargetMachine() = default;

void NVPTXTargetMachine32::anchor() {}

NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           std::optional<Reloc::Model> RM,
                                           std::optional<CodeModel::Model> CM,
                                           CodeGenOptLevel OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}

void NVPTXTargetMachine64::anchor() {}

NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           std::optional<Reloc::Model> RM,
                                           std::optional<CodeModel::Model> CM,
                                           CodeGenOptLevel OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
namespace {

class NVPTXPassConfig : public TargetPassConfig {
public:
  NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  NVPTXTargetMachine &getNVPTXTargetMachine() const {
    return getTM<NVPTXTargetMachine>();
  }

  void addIRPasses() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addMachineSSAOptimization() override;

  FunctionPass *createTargetRegisterAllocator(bool) override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  bool addRegAssignAndRewriteFast() override {
    llvm_unreachable("should not be used");
  }

  bool addRegAssignAndRewriteOptimized() override {
    llvm_unreachable("should not be used");
  }

private:
  // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
  // function is only called in opt mode.
  void addEarlyCSEOrGVNPass();

  // Add passes that propagate special memory spaces.
  void addAddressSpaceInferencePasses();

  // Add passes that perform straight-line scalar optimizations.
  void addStraightLineScalarOptimizationPasses();
};

} // end anonymous namespace

TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new NVPTXPassConfig(*this, PM);
}

MachineFunctionInfo *NVPTXTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return NVPTXMachineFunctionInfo::create<NVPTXMachineFunctionInfo>(Allocator,
                                                                    F, STI);
}

void NVPTXTargetMachine::registerEarlyDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<NVPTXAA>();
}

void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
#define GET_PASS_REGISTRY "NVPTXPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"

  PB.registerPipelineStartEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        // We do not want to fold out calls to nvvm.reflect early if the user
        // has not provided a target architecture just yet.
        if (Subtarget.hasTargetName())
          PM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));

        FunctionPassManager FPM;
        // Note: NVVMIntrRangePass was causing numerical discrepancies at one
        // point; if issues crop up, consider disabling.
        FPM.addPass(NVVMIntrRangePass());
        if (EarlyByValArgsCopy)
          FPM.addPass(NVPTXCopyByValArgsPass());
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });

  if (!NoKernelInfoEndLTO) {
    PB.registerFullLinkTimeOptimizationLastEPCallback(
        [this](ModulePassManager &PM, OptimizationLevel Level) {
          FunctionPassManager FPM;
          FPM.addPass(KernelInfoPrinter(this));
          PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
        });
  }
}

TargetTransformInfo
NVPTXTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(std::make_unique<NVPTXTTIImpl>(this, F));
}

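// Map a call to an llvm.nvvm.isspacep.* address-space predicate to the
// pointer operand it tests and the address space it tests for; returns
// {nullptr, -1} if V is not such a predicate.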
std::pair<const Value *, unsigned>
NVPTXTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::nvvm_isspacep_const:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_CONST);
    case Intrinsic::nvvm_isspacep_global:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_GLOBAL);
    case Intrinsic::nvvm_isspacep_local:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_LOCAL);
    case Intrinsic::nvvm_isspacep_shared:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_SHARED);
    case Intrinsic::nvvm_isspacep_shared_cluster:
      return std::make_pair(II->getArgOperand(0),
                            llvm::ADDRESS_SPACE_SHARED_CLUSTER);
    default:
      break;
    }
  }
  return std::make_pair(nullptr, -1);
}

void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOptLevel::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addAddressSpaceInferencePasses() {
  // NVPTXLowerArgs emits allocas for byval parameters which can often
  // be eliminated by SROA.
  addPass(createSROAPass());
  addPass(createNVPTXLowerAllocaPass());
  // TODO: Consider running InferAddressSpaces during opt, earlier in the
  // compilation flow.
  addPass(createInferAddressSpacesPass());
  addPass(createNVPTXAtomicLowerPass());
}

void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
  // for some of our benchmarks.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addIRPasses() {
  // The following passes are known to not play well with virtual regs hanging
  // around after register allocation (which in our case, is *all* registers).
  // We explicitly disable them here. We do, however, need some functionality
  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
  disablePass(&PrologEpilogCodeInserterID);
  disablePass(&MachineLateInstrsCleanupID);
  disablePass(&MachineCopyPropagationID);
  disablePass(&TailDuplicateLegacyID);
  disablePass(&StackMapLivenessID);
  disablePass(&PostRAMachineSinkingID);
  disablePass(&PostRASchedulerID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);
  disablePass(&ShrinkWrapID);
  disablePass(&RemoveLoadsIntoFakeUsesID);

  addPass(createNVPTXAAWrapperPass());
  addPass(createNVPTXExternalAAWrapperPass());

  // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
  // it here does nothing. But since we need it for correctness when lowering
  // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
  // call addEarlyAsPossiblePasses.
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
  addPass(createNVVMReflectPass(ST.getSmVersion()));

  if (getOptLevel() != CodeGenOptLevel::None)
    addPass(createNVPTXImageOptimizerPass());
  addPass(createNVPTXAssignValidGlobalNamesPass());
  addPass(createGenericToNVVMLegacyPass());

  // NVPTXLowerArgs is required for correctness and should be run right
  // before the address space inference passes.
  addPass(createNVPTXLowerArgsPass());
  if (getOptLevel() != CodeGenOptLevel::None) {
    addAddressSpaceInferencePasses();
    addStraightLineScalarOptimizationPasses();
  }

  addPass(createAtomicExpandLegacyPass());
  addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
  addPass(createNVPTXCtorDtorLoweringLegacyPass());

  // === LSR and other generic IR passes ===
  TargetPassConfig::addIRPasses();
  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOptLevel::None) {
    addEarlyCSEOrGVNPass();
    if (!DisableLoadStoreVectorizer)
      addPass(createLoadStoreVectorizerPass());
    addPass(createSROAPass());
    addPass(createNVPTXTagInvariantLoadsPass());
  }

  if (ST.hasPTXASUnreachableBug()) {
    // Run LowerUnreachable to WAR a ptxas bug. See the commit description of
    // 1ee4d880e8760256c606fe55b7af85a4f70d006d for more details.
    const auto &Options = getNVPTXTargetMachine().Options;
    addPass(createNVPTXLowerUnreachablePass(Options.TrapUnreachable,
                                            Options.NoTrapAfterNoreturn));
  }
}

bool NVPTXPassConfig::addInstSelector() {
  addPass(createLowerAggrCopies());
  addPass(createAllocaHoisting());
  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));
  addPass(createNVPTXReplaceImageHandlesPass());

  return false;
}

void NVPTXPassConfig::addPreRegAlloc() {
  addPass(createNVPTXForwardParamsPass());
  // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive.
  addPass(createNVPTXProxyRegErasurePass());
}

void NVPTXPassConfig::addPostRegAlloc() {
  addPass(createNVPTXPrologEpilogPass());
  if (getOptLevel() != CodeGenOptLevel::None) {
    // NVPTXPrologEpilogPass calculates frame object offsets and replaces frame
    // indices with the VRFrame register. NVPTXPeephole needs to run after that
    // and will replace VRFrame with VRFrameLocal when possible.
    addPass(createNVPTXPeephole());
  }
}

FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
  return nullptr; // No reg alloc
}

void NVPTXPassConfig::addFastRegAlloc() {
  addPass(&PHIEliminationID);
  addPass(&TwoAddressInstructionPassID);
}

void NVPTXPassConfig::addOptimizedRegAlloc() {
  addPass(&ProcessImplicitDefsID);
  addPass(&LiveVariablesID);
  addPass(&MachineLoopInfoID);
  addPass(&PHIEliminationID);

  addPass(&TwoAddressInstructionPassID);
  addPass(&RegisterCoalescerID);

  // PreRA instruction scheduling.
  if (addPass(&MachineSchedulerID))
    printAndVerify("After Machine Scheduling");

  addPass(&StackSlotColoringID);

  // FIXME: Needs physical registers
  // addPass(&MachineLICMID);

  printAndVerify("After StackSlotColoring");
}

void NVPTXPassConfig::addMachineSSAOptimization() {
  // Pre-RA tail duplication.
  if (addPass(&EarlyTailDuplicateLegacyID))
    printAndVerify("After Pre-RegAlloc TailDuplicate");

  // Optimize PHIs before DCE: removing dead PHI cycles may make more
  // instructions dead.
  addPass(&OptimizePHIsLegacyID);

  // This pass merges large allocas. StackSlotColoring is a different pass
  // which merges spill slots.
  addPass(&StackColoringLegacyID);

  // If the target requests it, assign local variables to stack slots relative
  // to one another and simplify frame index references where possible.
  addPass(&LocalStackSlotAllocationID);

  // With optimization, dead code should already be eliminated. However
  // there is one known exception: lowered code for arguments that are only
  // used by tail calls, where the tail calls reuse the incoming stack
  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
  addPass(&DeadMachineInstructionElimID);
  printAndVerify("After codegen DCE pass");

  // Allow targets to insert passes that improve instruction level parallelism,
  // like if-conversion. Such passes will typically need dominator trees and
  // loop info, just like LICM and CSE below.
  if (addILPOpts())
    printAndVerify("After ILP optimizations");

  addPass(&EarlyMachineLICMID);
  addPass(&MachineCSELegacyID);

  addPass(&MachineSinkingLegacyID);
  printAndVerify("After Machine LICM, CSE and Sinking passes");

  addPass(&PeepholeOptimizerLegacyID);
  printAndVerify("After codegen peephole optimization pass");
}