//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs peephole optimizations before code emission.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/BranchProbability.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

namespace {

class SIPreEmitPeephole {
private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  bool optimizeVccBranch(MachineInstr &MI) const;
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
                            MachineBasicBlock *&TrueMBB,
                            MachineBasicBlock *&FalseMBB,
                            SmallVectorImpl<MachineOperand> &Cond);
  bool mustRetainExeczBranch(const MachineInstr &Branch,
                             const MachineBasicBlock &From,
                             const MachineBasicBlock &To) const;
  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);

public:
  bool run(MachineFunction &MF);
};

class SIPreEmitPeepholeLegacy : public MachineFunctionPass {
public:
  static char ID;

  SIPreEmitPeepholeLegacy() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholeLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    return SIPreEmitPeephole().run(MF);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPreEmitPeepholeLegacy, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeepholeLegacy::ID = 0;

char &llvm::SIPreEmitPeepholeID = SIPreEmitPeepholeLegacy::ID;

bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens while combining a block which assigns -1 or 0 to a saved mask
  // and another block which consumes that saved mask and then a branch.
  //
  // While searching this also performs the following substitution:
  // vcc = V_CMP
  // vcc = S_AND exec, vcc
  // S_CBRANCH_VCC[N]Z
  // =>
  // vcc = V_CMP
  // S_CBRANCH_VCC[N]Z
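  //
  // A concrete instance of the first pattern (wave64 shapes, illustrative):
  //   s_mov_b64 s[2:3], -1
  //   s_and_b64 vcc, exec, s[2:3]
  //   s_cbranch_vccnz .LBB0_3
  // becomes:
  //   s_cbranch_execnz .LBB0_3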

  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
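  // Bound the backwards search for the VCC definition so compile time stays
  // in check.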
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    bool ModifiesExec = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
      ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
    }
    if (M == E)
      return Changed;
    // If SReg is VCC and its definition is a VALU comparison, the S_AND with
    // EXEC is not required: erase the S_AND and return.
    // Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS.
    if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
        TII->isVOPC(*M)) {
      A->eraseFromParent();
      return true;
    }
    if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // If SReg is only used in the AND instruction, fold the immediate into
    // the AND.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert mask for s_andn2
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      // Replace AND with MOV
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addReg(ExecReg);
      }
    }
    // Remove AND instruction
    A->eraseFromParent();
  }

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch
    // Remove all successors shadowed by new unconditional branch
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (auto *BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to unconditional branch
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, TRI, false /*Kill*/));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}

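// Check whether \p MI, an S_SET_GPR_IDX_ON, is identical to the earlier
// S_SET_GPR_IDX_ON \p First and nothing in between changes M0, the index
// register, or the indexing mode; if so, \p MI is redundant and is erased,
// along with any intervening S_SET_GPR_IDX_OFF. A sketch of the pattern
// (illustrative):
//   s_set_gpr_idx_on s2, gpr_idx(SRC0)
//   v_mov_b32 v0, v1      ; indirect use under the active index
//   s_set_gpr_idx_off
//   s_set_gpr_idx_on s2, gpr_idx(SRC0) ; same index and mode: removable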
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  if (!MI.isIdenticalTo(First))
    return false;

  // Scan the instructions between the two identical S_SET_GPR_IDX_ON to make
  // sure nothing invalidates reusing the first one.
  for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
                                         E = MI.getIterator();
       I != E; ++I) {
    if (I->isBundle())
      continue;
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(),
                       [&MRI, this](const MachineOperand &MO) {
                         return MO.isReg() &&
                                TRI->isVectorRegister(MRI, MO.getReg());
                       })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
                        I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
          return false;
      }
    }
  }

  MI.eraseFromBundle();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromBundle();
  return true;
}

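// Analyze the terminators of \p SrcMBB. Returns true on success, with
// TrueMBB/FalseMBB set to the taken and fall-through destinations; FalseMBB
// defaults to the layout successor when there is no explicit false edge.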
bool SIPreEmitPeephole::getBlockDestinations(
    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  if (!FalseMBB)
    FalseMBB = SrcMBB.getNextNode();

  return true;
}

namespace {
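// Estimates whether unconditionally executing the instructions that an
// s_cbranch_execz would skip is cheaper, on average, than keeping the
// branch: the cost of the then-block is accumulated per instruction and
// weighed against the expected cost of the branch itself.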
class BranchWeightCostModel {
  const SIInstrInfo &TII;
  const TargetSchedModel &SchedModel;
  BranchProbability BranchProb;
  static constexpr uint64_t BranchNotTakenCost = 1;
  uint64_t BranchTakenCost;
  uint64_t ThenCyclesCost = 0;

public:
  BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
                        const MachineBasicBlock &Succ)
      : TII(TII), SchedModel(TII.getSchedModel()) {
    const MachineBasicBlock &Head = *Branch.getParent();
    const auto *FromIt = find(Head.successors(), &Succ);
    assert(FromIt != Head.succ_end());

    BranchProb = Head.getSuccProbability(FromIt);
    if (BranchProb.isUnknown())
      BranchProb = BranchProbability::getZero();
    BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
  }

  bool isProfitable(const MachineInstr &MI) {
    if (TII.isWaitcnt(MI.getOpcode()))
      return false;

    ThenCyclesCost += SchedModel.computeInstrLatency(&MI);

    // Consider `P = N/D` to be the probability that execz is false, i.e. that
    // the branch skipping the then-block is not taken. The transformation is
    // profitable if always executing the 'then' block is cheaper than
    // sometimes executing 'then' and always executing s_cbranch_execz:
    // * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
    //   BranchNotTakenCost
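    //
    // For instance (illustrative numbers): with P = 1/2 (N = 1, D = 2) and
    // BranchTakenCost = 4, the then-block remains profitable up to
    // ThenCyclesCost <= BranchTakenCost + BranchNotTakenCost = 5 cycles.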
    uint64_t Numerator = BranchProb.getNumerator();
    uint64_t Denominator = BranchProb.getDenominator();
    return (Denominator - Numerator) * ThenCyclesCost <=
           ((Denominator - Numerator) * BranchTakenCost +
            Numerator * BranchNotTakenCost);
  }
};
} // namespace

bool SIPreEmitPeephole::mustRetainExeczBranch(
    const MachineInstr &Branch, const MachineBasicBlock &From,
    const MachineBasicBlock &To) const {
  assert(is_contained(Branch.getParent()->successors(), &From));
  BranchWeightCostModel CostModel{*TII, Branch, From};

  const MachineFunction *MF = From.getParent();
  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (const MachineInstr &MI : MBB) {
      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might never be taken when EXEC = 0.
      // Hence we should retain cbranch out of the loop lest it become
      // infinite.
      if (MI.isConditionalBranch())
        return true;

      if (MI.isUnconditionalBranch() &&
          TII->getBranchDestBlock(MI) != MBB.getNextNode())
        return true;

      if (MI.isMetaInstruction())
        continue;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
        return true;

      if (!CostModel.isProfitable(MI))
        return true;
    }
  }

  return false;
}

// Returns true if the skip branch instruction is removed.
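// The shape being optimized is (illustrative):
//   s_cbranch_execz .LBB0_2 ; skip the then-block when no lanes are active
//   ...                     ; then-block
// .LBB0_2:
// Dropping the branch makes the then-block execute unconditionally; with
// EXEC = 0 its instructions are no-ops unless they have unwanted effects,
// which mustRetainExeczBranch checks for.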
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
                                          MachineBasicBlock &SrcMBB) {
  if (!TII->getSchedModel().hasInstrSchedModel())
    return false;

  MachineBasicBlock *TrueMBB = nullptr;
  MachineBasicBlock *FalseMBB = nullptr;
  SmallVector<MachineOperand, 1> Cond;

  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  // Consider only the forward branches.
  if (SrcMBB.getNumber() >= TrueMBB->getNumber())
    return false;

  // Consider only when it is legal and profitable.
  if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
    return false;

  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
  MI.eraseFromParent();
  SrcMBB.removeSuccessor(TrueMBB);

  return true;
}

PreservedAnalyses
llvm::SIPreEmitPeepholePass::run(MachineFunction &MF,
                                 MachineFunctionAnalysisManager &MFAM) {
  if (!SIPreEmitPeephole().run(MF))
    return PreservedAnalyses::all();

  return getMachineFunctionPassPreservedAnalyses();
}

bool SIPreEmitPeephole::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  bool Changed = false;

  MF.RenumberBlocks();

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
    // Check first terminator for branches to optimize
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        break;
      case AMDGPU::S_CBRANCH_EXECZ:
        Changed |= removeExeczBranch(MI, MBB);
        break;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
    // second is not needed. Do expensive checks in the optimizeSetGPR()
    // and limit the distance to 20 instructions for compile time purposes.
    // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
    // may be bundled with the instructions they modify.
    for (auto &MI : make_early_inc_range(MBB.instrs())) {
      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  return Changed;
}