1//===-- SILateBranchLowering.cpp - Final preparation of branches ----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass mainly lowers early terminate pseudo instructions.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
15#include "AMDGPULaneMaskUtils.h"
16#include "GCNSubtarget.h"
17#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
18#include "SIMachineFunctionInfo.h"
19#include "llvm/CodeGen/MachineDominators.h"
20#include "llvm/CodeGen/MachineLoopInfo.h"
21#include "llvm/CodeGen/MachinePassManager.h"
22#include "llvm/InitializePasses.h"
23
24using namespace llvm;
25
26#define DEBUG_TYPE "si-late-branch-lowering"
27
28namespace {
29
/// Implementation shared by the legacy and new pass manager entry points.
/// Lowers late branch pseudos (early-terminate, chain tail calls) and
/// removes redundant branches to the layout successor.
class SILateBranchLowering {
private:
  const GCNSubtarget &ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  MachineDominatorTree *MDT;
  MachineLoopInfo *MLI; // May be null; loop info is only updated when present.
  const AMDGPU::LaneMaskConstants &LMC;

  // Expand a SI_CS_CHAIN_TC_* pseudo into EXEC setup plus the actual
  // tail-call pseudo (SI_TCRETURN_CHAIN).
  void expandChainCall(MachineInstr &MI, const GCNSubtarget &ST,
                       bool DynamicVGPR);
  // Lower an early-terminate pseudo into a conditional branch to
  // \p EarlyExitBlock, splitting the block if needed.
  void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock);

public:
  SILateBranchLowering(const GCNSubtarget &ST, MachineDominatorTree *MDT,
                       MachineLoopInfo *MLI)
      : ST(ST), TII(ST.getInstrInfo()), TRI(&TII->getRegisterInfo()), MDT(MDT),
        MLI(MLI), LMC(AMDGPU::LaneMaskConstants::get(ST)) {}

  /// Run the lowering over \p MF; returns true if the function was modified.
  bool run(MachineFunction &MF);
};
51
52class SILateBranchLoweringLegacy : public MachineFunctionPass {
53public:
54 static char ID;
55 SILateBranchLoweringLegacy() : MachineFunctionPass(ID) {}
56
57 bool runOnMachineFunction(MachineFunction &MF) override {
58 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
59 auto *MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
60 auto *MLIWP = getAnalysisIfAvailable<MachineLoopInfoWrapperPass>();
61 MachineLoopInfo *MLI = MLIWP ? &MLIWP->getLI() : nullptr;
62 return SILateBranchLowering(ST, MDT, MLI).run(MF);
63 }
64
65 StringRef getPassName() const override {
66 return "SI Final Branch Preparation";
67 }
68
69 void getAnalysisUsage(AnalysisUsage &AU) const override {
70 AU.addRequired<MachineDominatorTreeWrapperPass>();
71 AU.addPreserved<MachineDominatorTreeWrapperPass>();
72 AU.addPreserved<MachineLoopInfoWrapperPass>();
73 MachineFunctionPass::getAnalysisUsage(AU);
74 }
75};
76
77} // end anonymous namespace
78
char SILateBranchLoweringLegacy::ID = 0;

// NOTE(review): the registered description below looks stale — the file
// header says this pass lowers early terminate pseudos, not that it inserts
// s_cbranch_execz instructions; confirm before relying on it.
INITIALIZE_PASS_BEGIN(SILateBranchLoweringLegacy, DEBUG_TYPE,
                      "SI insert s_cbranch_execz instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SILateBranchLoweringLegacy, DEBUG_TYPE,
                    "SI insert s_cbranch_execz instructions", false, false)

// ID other passes use to identify/schedule this pass.
char &llvm::SILateBranchLoweringPassID = SILateBranchLoweringLegacy::ID;
88
89static void generateEndPgm(MachineBasicBlock &MBB,
90 MachineBasicBlock::iterator I, DebugLoc DL,
91 const SIInstrInfo *TII, MachineFunction &MF) {
92 const Function &F = MF.getFunction();
93 bool IsPS = F.getCallingConv() == CallingConv::AMDGPU_PS;
94
95 // Check if hardware has been configured to expect color or depth exports.
96 bool HasColorExports = AMDGPU::getHasColorExport(F);
97 bool HasDepthExports = AMDGPU::getHasDepthExport(F);
98 bool HasExports = HasColorExports || HasDepthExports;
99
100 // Prior to GFX10, hardware always expects at least one export for PS.
101 bool MustExport = !AMDGPU::isGFX10Plus(STI: TII->getSubtarget());
102
103 if (IsPS && (HasExports || MustExport)) {
104 // Generate "null export" if hardware is expecting PS to export.
105 const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
106 int Target =
107 ST.hasNullExportTarget()
108 ? AMDGPU::Exp::ET_NULL
109 : (HasColorExports ? AMDGPU::Exp::ET_MRT0 : AMDGPU::Exp::ET_MRTZ);
110 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::EXP_DONE))
111 .addImm(Val: Target)
112 .addReg(RegNo: AMDGPU::VGPR0, Flags: RegState::Undef)
113 .addReg(RegNo: AMDGPU::VGPR0, Flags: RegState::Undef)
114 .addReg(RegNo: AMDGPU::VGPR0, Flags: RegState::Undef)
115 .addReg(RegNo: AMDGPU::VGPR0, Flags: RegState::Undef)
116 .addImm(Val: 1) // vm
117 .addImm(Val: 0) // compr
118 .addImm(Val: 0); // en
119 }
120
121 // s_endpgm
122 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ENDPGM)).addImm(Val: 0);
123}
124
125static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
126 MachineDominatorTree *MDT, MachineLoopInfo *MLI) {
127 MachineBasicBlock *SplitBB = MBB.splitAt(SplitInst&: MI, /*UpdateLiveIns*/ true);
128
129 // Update dominator tree
130 using DomTreeT = DomTreeBase<MachineBasicBlock>;
131 SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
132 for (MachineBasicBlock *Succ : SplitBB->successors()) {
133 DTUpdates.push_back(Elt: {DomTreeT::Insert, SplitBB, Succ});
134 DTUpdates.push_back(Elt: {DomTreeT::Delete, &MBB, Succ});
135 }
136 DTUpdates.push_back(Elt: {DomTreeT::Insert, &MBB, SplitBB});
137 MDT->applyUpdates(Updates: DTUpdates);
138
139 // Update loop info if available
140 if (MLI) {
141 if (MachineLoop *Loop = MLI->getLoopFor(BB: &MBB))
142 Loop->addBasicBlockToLoop(NewBB: SplitBB, LI&: *MLI);
143 }
144}
145
146static void copyOpWithoutRegFlags(MachineInstrBuilder &MIB,
147 MachineOperand &Op) {
148 if (Op.isReg())
149 MIB.addReg(RegNo: Op.getReg());
150 else
151 MIB.add(MO: Op);
152}
153
/// Expand a SI_CS_CHAIN_TC_* chain-call pseudo.
///
/// The pseudo is lowered into at least two instructions: one (or, with
/// dynamic VGPRs, three) setting up EXEC/callee, followed by the pseudo
/// itself morphed into SI_TCRETURN_CHAIN with the extra operands removed.
void SILateBranchLowering::expandChainCall(MachineInstr &MI,
                                           const GCNSubtarget &ST,
                                           bool DynamicVGPR) {
  // This is a tail call that needs to be expanded into at least
  // 2 instructions, one for setting EXEC and one for the actual tail call.
  int ExecIdx =
      AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::exec);
  assert(ExecIdx != -1 && "Missing EXEC operand");
  const DebugLoc &DL = MI.getDebugLoc();
  if (DynamicVGPR) {
    // We have 3 extra operands and we need to:
    // * Try to change the VGPR allocation
    // * Select the callee based on the result of the reallocation attempt
    // * Select the EXEC mask based on the result of the reallocation attempt
    // If any of the register operands of the chain pseudo is used in more than
    // one of these instructions, we need to make sure that the kill flags
    // aren't copied along.
    auto AllocMI =
        BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ALLOC_VGPR));
    copyOpWithoutRegFlags(MIB&: AllocMI,
                          Op&: *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::numvgprs));

    // Pick the real callee (src0) or the fallback callee depending on the
    // outcome of the reallocation attempt.
    auto SelectCallee =
        BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B64))
            .addDef(RegNo: TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0)->getReg());
    copyOpWithoutRegFlags(MIB&: SelectCallee,
                          Op&: *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0));
    copyOpWithoutRegFlags(MIB&: SelectCallee,
                          Op&: *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::fbcallee));

    // Likewise pick the regular or fallback EXEC mask, using the
    // wave-size-appropriate select opcode.
    auto SelectExec = BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: TII->get(Opcode: LMC.CSelectOpc))
                          .addDef(RegNo: LMC.ExecReg);

    copyOpWithoutRegFlags(MIB&: SelectExec,
                          Op&: *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::exec));
    copyOpWithoutRegFlags(MIB&: SelectExec,
                          Op&: *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::fbexec));
  } else {
    // Static VGPR allocation: simply move the requested mask into EXEC.
    auto SetExec =
        BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg);
    copyOpWithoutRegFlags(MIB&: SetExec,
                          Op&: *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::exec));
  }

  // Drop the EXEC operand and everything after it (now consumed above),
  // then turn the pseudo into the real chain tail-call pseudo.
  for (int OpIdx = MI.getNumExplicitOperands() - 1; OpIdx >= ExecIdx; --OpIdx)
    MI.removeOperand(OpNo: OpIdx);

  MI.setDesc(TII->get(Opcode: AMDGPU::SI_TCRETURN_CHAIN));
}
203
204void SILateBranchLowering::earlyTerm(MachineInstr &MI,
205 MachineBasicBlock *EarlyExitBlock) {
206 MachineBasicBlock &MBB = *MI.getParent();
207 const DebugLoc &DL = MI.getDebugLoc();
208
209 auto BranchMI = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC0))
210 .addMBB(MBB: EarlyExitBlock);
211 auto Next = std::next(x: MI.getIterator());
212
213 if (Next != MBB.end() && !Next->isTerminator())
214 splitBlock(MBB, MI&: *BranchMI, MDT, MLI);
215
216 MBB.addSuccessor(Succ: EarlyExitBlock);
217 MDT->insertEdge(From: &MBB, To: EarlyExitBlock);
218}
219
220PreservedAnalyses
221llvm::SILateBranchLoweringPass::run(MachineFunction &MF,
222 MachineFunctionAnalysisManager &MFAM) {
223 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
224 auto *MDT = &MFAM.getResult<MachineDominatorTreeAnalysis>(IR&: MF);
225 auto *MLI = MFAM.getCachedResult<MachineLoopAnalysis>(IR&: MF);
226 if (!SILateBranchLowering(ST, MDT, MLI).run(MF))
227 return PreservedAnalyses::all();
228
229 auto PA = getMachineFunctionPassPreservedAnalyses();
230 PA.preserve<MachineDominatorTreeAnalysis>();
231 PA.preserve<MachineLoopAnalysis>();
232 return PA;
233}
234
/// Run the late branch lowering over \p MF.
///
/// Performs three rewrites:
///  * deletes S_BRANCH instructions that target the layout successor,
///  * expands SI_CS_CHAIN_TC_* chain-call pseudos,
///  * lowers SI_EARLY_TERMINATE_SCC0 pseudos into branches to a shared exit
///    block, and redirects SI_RETURN_TO_EPILOG instructions that are not at
///    the function end to a common empty end block.
///
/// \returns true if the function was modified.
bool SILateBranchLowering::run(MachineFunction &MF) {
  SmallVector<MachineInstr *, 4> EarlyTermInstrs;
  SmallVector<MachineInstr *, 1> EpilogInstrs;
  bool MadeChange = false;

  // First pass: lower branches/chain calls in place, and collect the
  // early-terminate and epilog-return pseudos for the fixups below.
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : llvm::make_early_inc_range(Range&: MBB)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // This only occurs in -O0 when BranchFolding is not executed.
        if (MBB.isLayoutSuccessor(MBB: MI.getOperand(i: 0).getMBB())) {
          assert(&MI == &MBB.back());
          MI.eraseFromParent();
          MadeChange = true;
        }
        break;

      case AMDGPU::SI_CS_CHAIN_TC_W32:
      case AMDGPU::SI_CS_CHAIN_TC_W64:
        expandChainCall(MI, ST, /*DynamicVGPR=*/false);
        MadeChange = true;
        break;
      case AMDGPU::SI_CS_CHAIN_TC_W32_DVGPR:
      case AMDGPU::SI_CS_CHAIN_TC_W64_DVGPR:
        expandChainCall(MI, ST, /*DynamicVGPR=*/true);
        MadeChange = true;
        break;

      case AMDGPU::SI_EARLY_TERMINATE_SCC0:
        EarlyTermInstrs.push_back(Elt: &MI);
        break;

      case AMDGPU::SI_RETURN_TO_EPILOG:
        EpilogInstrs.push_back(Elt: &MI);
        break;

      default:
        break;
      }
    }
  }

  // Lower any early exit branches first
  if (!EarlyTermInstrs.empty()) {
    // All early terminations share a single exit block that zeroes EXEC
    // and ends the program (with a null export if the HW requires one).
    MachineBasicBlock *EarlyExitBlock = MF.CreateMachineBasicBlock();
    DebugLoc DL;

    MF.insert(MBBI: MF.end(), MBB: EarlyExitBlock);
    BuildMI(BB&: *EarlyExitBlock, I: EarlyExitBlock->end(), MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc),
            DestReg: LMC.ExecReg)
        .addImm(Val: 0);
    generateEndPgm(MBB&: *EarlyExitBlock, I: EarlyExitBlock->end(), DL, TII, MF);

    for (MachineInstr *Instr : EarlyTermInstrs) {
      // Early termination in GS does nothing
      if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS)
        earlyTerm(MI&: *Instr, EarlyExitBlock);
      Instr->eraseFromParent();
    }

    EarlyTermInstrs.clear();
    MadeChange = true;
  }

  // Now check return to epilog instructions occur at function end
  if (!EpilogInstrs.empty()) {
    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
    assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

    // If there are multiple returns to epilog then all will
    // become jumps to new empty end block.
    if (EpilogInstrs.size() > 1) {
      EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
      MF.insert(MBBI: MF.end(), MBB: EmptyMBBAtEnd);
    }

    for (auto *MI : EpilogInstrs) {
      auto *MBB = MI->getParent();
      // Already the last instruction of the last block: nothing to do.
      if (MBB == &MF.back() && MI == &MBB->back())
        continue;

      // SI_RETURN_TO_EPILOG is not the last instruction.
      // Jump to empty block at function end.
      if (!EmptyMBBAtEnd) {
        EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
        MF.insert(MBBI: MF.end(), MBB: EmptyMBBAtEnd);
      }

      // Replace the pseudo with an unconditional branch to the end block
      // and record the new edge in the dominator tree.
      MBB->addSuccessor(Succ: EmptyMBBAtEnd);
      MDT->insertEdge(From: MBB, To: EmptyMBBAtEnd);
      BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_BRANCH))
          .addMBB(MBB: EmptyMBBAtEnd);
      MI->eraseFromParent();
      MadeChange = true;
    }

    EpilogInstrs.clear();
  }

  return MadeChange;
}
337