//===-- SILateBranchLowering.cpp - Final preparation of branches ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs final lowering of branches: it mainly expands the
/// early-terminate pseudo instructions, and also expands amdgpu_cs_chain
/// tail-call pseudos, removes branches to the layout successor, and ensures
/// SI_RETURN_TO_EPILOG only appears as the last instruction of the function.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachinePassManager.h"

using namespace llvm;

#define DEBUG_TYPE "si-late-branch-lowering"

namespace {

class SILateBranchLowering {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  MachineDominatorTree *MDT = nullptr;

  void expandChainCall(MachineInstr &MI, const GCNSubtarget &ST,
                       bool DynamicVGPR);
  void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock);

public:
  SILateBranchLowering(MachineDominatorTree *MDT) : MDT(MDT) {}

  bool run(MachineFunction &MF);
  // Wave-size dependent move opcode and EXEC register, initialized in run().
  unsigned MovOpc;
  Register ExecReg;
};

class SILateBranchLoweringLegacy : public MachineFunctionPass {
public:
  static char ID;
  SILateBranchLoweringLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    auto *MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
    return SILateBranchLowering(MDT).run(MF);
  }

  StringRef getPassName() const override {
    return "SI Final Branch Preparation";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SILateBranchLoweringLegacy::ID = 0;

INITIALIZE_PASS_BEGIN(SILateBranchLoweringLegacy, DEBUG_TYPE,
                      "SI insert s_cbranch_execz instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SILateBranchLoweringLegacy, DEBUG_TYPE,
                    "SI insert s_cbranch_execz instructions", false, false)

char &llvm::SILateBranchLoweringPassID = SILateBranchLoweringLegacy::ID;

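// Emit program termination at the insertion point. For pixel shaders that the
// hardware expects to export, a "null export" must precede s_endpgm; the
// emitted sequence is roughly (a sketch; the export target depends on
// ST.hasNullExportTarget() and on which exports the function declares):
//
//   exp null off, off, off, off done vm
//   s_endpgm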
static void generateEndPgm(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, DebugLoc DL,
                           const SIInstrInfo *TII, MachineFunction &MF) {
  const Function &F = MF.getFunction();
  bool IsPS = F.getCallingConv() == CallingConv::AMDGPU_PS;

  // Check if hardware has been configured to expect color or depth exports.
  bool HasColorExports = AMDGPU::getHasColorExport(F);
  bool HasDepthExports = AMDGPU::getHasDepthExport(F);
  bool HasExports = HasColorExports || HasDepthExports;

  // Prior to GFX10, hardware always expects at least one export for PS.
  bool MustExport = !AMDGPU::isGFX10Plus(TII->getSubtarget());

  if (IsPS && (HasExports || MustExport)) {
    // Generate "null export" if hardware is expecting PS to export.
    const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
    int Target =
        ST.hasNullExportTarget()
            ? AMDGPU::Exp::ET_NULL
            : (HasColorExports ? AMDGPU::Exp::ET_MRT0 : AMDGPU::Exp::ET_MRTZ);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
        .addImm(Target)
        .addReg(AMDGPU::VGPR0, RegState::Undef)
        .addReg(AMDGPU::VGPR0, RegState::Undef)
        .addReg(AMDGPU::VGPR0, RegState::Undef)
        .addReg(AMDGPU::VGPR0, RegState::Undef)
        .addImm(1)  // vm
        .addImm(0)  // compr
        .addImm(0); // en
  }

  // s_endpgm
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
}

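// Split MBB after MI and incrementally update the dominator tree: SplitBB
// takes over MBB's former successors, and MBB gains SplitBB as its single
// new successor.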
static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
                       MachineDominatorTree *MDT) {
  MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/ true);

  // Update dominator tree
  using DomTreeT = DomTreeBase<MachineBasicBlock>;
  SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
  for (MachineBasicBlock *Succ : SplitBB->successors()) {
    DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
    DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
  }
  DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
  MDT->applyUpdates(DTUpdates);
}

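// Copy an operand onto a new instruction, dropping any register flags (e.g.
// kill). This lets the same source operand be reused by several of the
// instructions a chain pseudo expands into without duplicating kill flags.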
static void copyOpWithoutRegFlags(MachineInstrBuilder &MIB,
                                  MachineOperand &Op) {
  if (Op.isReg())
    MIB.addReg(Op.getReg());
  else
    MIB.add(Op);
}

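// Expand an SI_CS_CHAIN_TC_* pseudo into an EXEC update followed by a
// SI_TCRETURN. Roughly (a sketch; the opcodes depend on the wave size):
//
//   exec = S_MOV_B32/B64 <exec>
//   SI_TCRETURN <callee>, ...
//
// or, for the dynamic-VGPR variants:
//
//   S_ALLOC_VGPR <numvgprs>
//   <callee> = S_CSELECT_B64 <callee>, <fbcallee>
//   exec     = S_CSELECT_B32/B64 <exec>, <fbexec>
//   SI_TCRETURN <callee>, ...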
void SILateBranchLowering::expandChainCall(MachineInstr &MI,
                                           const GCNSubtarget &ST,
                                           bool DynamicVGPR) {
  // This is a tail call that needs to be expanded into at least
  // 2 instructions, one for setting EXEC and one for the actual tail call.
  int ExecIdx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::exec);
  assert(ExecIdx != -1 && "Missing EXEC operand");
  const DebugLoc &DL = MI.getDebugLoc();
  if (DynamicVGPR) {
    // We have 3 extra operands and we need to:
    // * Try to change the VGPR allocation
    // * Select the callee based on the result of the reallocation attempt
    // * Select the EXEC mask based on the result of the reallocation attempt
    // If any of the register operands of the chain pseudo is used in more than
    // one of these instructions, we need to make sure that the kill flags
    // aren't copied along.
    auto AllocMI =
        BuildMI(*MI.getParent(), MI, DL, TII->get(AMDGPU::S_ALLOC_VGPR));
    copyOpWithoutRegFlags(AllocMI,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::numvgprs));

    auto SelectCallee =
        BuildMI(*MI.getParent(), MI, DL, TII->get(AMDGPU::S_CSELECT_B64))
            .addDef(TII->getNamedOperand(MI, AMDGPU::OpName::src0)->getReg());
    copyOpWithoutRegFlags(SelectCallee,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::src0));
    copyOpWithoutRegFlags(SelectCallee,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::fbcallee));

    auto SelectExec = BuildMI(*MI.getParent(), MI, DL,
                              TII->get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                                     : AMDGPU::S_CSELECT_B64))
                          .addDef(ExecReg);

    copyOpWithoutRegFlags(SelectExec,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::exec));
    copyOpWithoutRegFlags(SelectExec,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::fbexec));
  } else {
    auto SetExec = BuildMI(*MI.getParent(), MI, DL, TII->get(MovOpc), ExecReg);
    copyOpWithoutRegFlags(SetExec,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::exec));
  }

  // Strip the EXEC operand and everything after it; what remains matches the
  // SI_TCRETURN operand list.
  for (int OpIdx = MI.getNumExplicitOperands() - 1; OpIdx >= ExecIdx; --OpIdx)
    MI.removeOperand(OpIdx);

  MI.setDesc(TII->get(AMDGPU::SI_TCRETURN));
}

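// Lower SI_EARLY_TERMINATE_SCC0: branch to the shared early-exit block when
// SCC is clear, splitting the current block first if the terminate point is
// not already at a block boundary.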
void SILateBranchLowering::earlyTerm(MachineInstr &MI,
                                     MachineBasicBlock *EarlyExitBlock) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc DL = MI.getDebugLoc();

  auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
                      .addMBB(EarlyExitBlock);
  auto Next = std::next(MI.getIterator());

  if (Next != MBB.end() && !Next->isTerminator())
    splitBlock(MBB, *BranchMI, MDT);

  MBB.addSuccessor(EarlyExitBlock);
  MDT->insertEdge(&MBB, EarlyExitBlock);
}

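// New pass manager entry point. The dominator tree is kept up to date during
// lowering, so it is the only analysis preserved when a change is made.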
PreservedAnalyses
llvm::SILateBranchLoweringPass::run(MachineFunction &MF,
                                    MachineFunctionAnalysisManager &MFAM) {
  auto *MDT = &MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
  if (!SILateBranchLowering(MDT).run(MF))
    return PreservedAnalyses::all();

  return getMachineFunctionPassPreservedAnalyses()
      .preserve<MachineDominatorTreeAnalysis>();
}

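// Walk all instructions once, expanding chain calls and redundant branches in
// place, and collecting early-terminate and return-to-epilog pseudos, which
// are lowered afterwards since they may create new blocks.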
bool SILateBranchLowering::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

  SmallVector<MachineInstr *, 4> EarlyTermInstrs;
  SmallVector<MachineInstr *, 1> EpilogInstrs;
  bool MadeChange = false;

  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // This only occurs at -O0, when BranchFolding has not been run.
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          assert(&MI == &MBB.back());
          MI.eraseFromParent();
          MadeChange = true;
        }
        break;

      case AMDGPU::SI_CS_CHAIN_TC_W32:
      case AMDGPU::SI_CS_CHAIN_TC_W64:
        expandChainCall(MI, ST, /*DynamicVGPR=*/false);
        MadeChange = true;
        break;
      case AMDGPU::SI_CS_CHAIN_TC_W32_DVGPR:
      case AMDGPU::SI_CS_CHAIN_TC_W64_DVGPR:
        expandChainCall(MI, ST, /*DynamicVGPR=*/true);
        MadeChange = true;
        break;

      case AMDGPU::SI_EARLY_TERMINATE_SCC0:
        EarlyTermInstrs.push_back(&MI);
        break;

      case AMDGPU::SI_RETURN_TO_EPILOG:
        EpilogInstrs.push_back(&MI);
        break;

      default:
        break;
      }
    }
  }

  // Lower any early exit branches first
  if (!EarlyTermInstrs.empty()) {
    MachineBasicBlock *EarlyExitBlock = MF.CreateMachineBasicBlock();
    DebugLoc DL;

    MF.insert(MF.end(), EarlyExitBlock);
    BuildMI(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII->get(MovOpc),
            ExecReg)
        .addImm(0);
    generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII, MF);

    for (MachineInstr *Instr : EarlyTermInstrs) {
      // Early termination in GS does nothing
      if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS)
        earlyTerm(*Instr, EarlyExitBlock);
      Instr->eraseFromParent();
    }

    EarlyTermInstrs.clear();
    MadeChange = true;
  }

  // Now ensure any SI_RETURN_TO_EPILOG occurs only at the function end.
  if (!EpilogInstrs.empty()) {
    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
    assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

    // If there are multiple returns to epilog, they all become jumps to a
    // new empty block at the end of the function.
    if (EpilogInstrs.size() > 1) {
      EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
      MF.insert(MF.end(), EmptyMBBAtEnd);
    }

    for (auto *MI : EpilogInstrs) {
      auto *MBB = MI->getParent();
      if (MBB == &MF.back() && MI == &MBB->back())
        continue;

      // SI_RETURN_TO_EPILOG is not the last instruction.
      // Jump to empty block at function end.
      if (!EmptyMBBAtEnd) {
        EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
        MF.insert(MF.end(), EmptyMBBAtEnd);
      }

      MBB->addSuccessor(EmptyMBBAtEnd);
      MDT->insertEdge(MBB, EmptyMBBAtEnd);
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
          .addMBB(EmptyMBBAtEnd);
      MI->eraseFromParent();
      MadeChange = true;
    }

    EpilogInstrs.clear();
  }

  return MadeChange;
}