//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs the peephole optimizations before code emission.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/BranchProbability.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

namespace {

class SIPreEmitPeephole {
private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  bool optimizeVccBranch(MachineInstr &MI) const;
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
                            MachineBasicBlock *&TrueMBB,
                            MachineBasicBlock *&FalseMBB,
                            SmallVectorImpl<MachineOperand> &Cond);
  bool mustRetainExeczBranch(const MachineInstr &Branch,
                             const MachineBasicBlock &From,
                             const MachineBasicBlock &To) const;
  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);

public:
  bool run(MachineFunction &MF);
};

class SIPreEmitPeepholeLegacy : public MachineFunctionPass {
public:
  static char ID;

  SIPreEmitPeepholeLegacy() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholeLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    return SIPreEmitPeephole().run(MF);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPreEmitPeepholeLegacy, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeepholeLegacy::ID = 0;

char &llvm::SIPreEmitPeepholeID = SIPreEmitPeepholeLegacy::ID;

bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens while combining a block which assigns -1 or 0 to a saved mask
  // and another block which consumes that saved mask and then a branch.
  //
  // While searching this also performs the following substitution:
  // vcc = V_CMP
  // vcc = S_AND exec, vcc
  // S_CBRANCH_VCC[N]Z
  // =>
  // vcc = V_CMP
  // S_CBRANCH_VCC[N]Z

  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

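  // Walk backwards from the branch within a small window, looking for the
  // S_AND/S_ANDN2 that defines vcc. Give up if exec is clobbered first, or if
  // vcc is modified by anything other than such an AND.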
  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

  int64_t MaskValue = 0;
  Register SReg;
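  // If the mask operand is a register, keep scanning backwards for its
  // definition: the fold is only possible when it is known to be 0, -1, or a
  // VALU comparison result in vcc.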
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    bool ModifiesExec = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
      ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
    }
    if (M == E)
      return Changed;
    // If SReg is VCC and its definition is a VALU comparison, the S_AND with
    // EXEC is not required: erase the S_AND and return.
    // Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS.
    if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
        TII->isVOPC(*M)) {
      A->eraseFromParent();
      return true;
    }
    if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // If SReg is only used by the AND instruction, fold the immediate into
    // the AND.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert mask for s_andn2
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      // Replace AND with MOV
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addReg(ExecReg);
      }
    }
    // Remove AND instruction
    A->eraseFromParent();
  }

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch
    // Remove all successors shadowed by the new unconditional branch
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (auto *BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

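    // The implicit fall-through successor can no longer be reached once the
    // branch is unconditional, so drop it as well.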
    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to unconditional branch
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, TRI, false /*Kill*/));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}

bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  if (!MI.isIdenticalTo(First))
    return false;

  // Scan the instructions between the two identical S_SET_GPR_IDX_ON to make
  // sure nothing invalidates reusing the first one in place of the second.
  for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
                                         E = MI.getIterator();
       I != E; ++I) {
    if (I->isBundle())
      continue;
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(),
                       [&MRI, this](const MachineOperand &MO) {
                         return MO.isReg() &&
                                TRI->isVectorRegister(MRI, MO.getReg());
                       })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
                        I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
          return false;
      }
    }
  }

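  // The redundant S_SET_GPR_IDX_ON and any intervening S_SET_GPR_IDX_OFF may
  // sit inside bundles, so unlink them with eraseFromBundle rather than
  // eraseFromParent.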
  MI.eraseFromBundle();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromBundle();
  return true;
}

bool SIPreEmitPeephole::getBlockDestinations(
    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
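  // analyzeBranch returns true when it cannot understand the terminators.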
  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  if (!FalseMBB)
    FalseMBB = SrcMBB.getNextNode();

  return true;
}

namespace {
class BranchWeightCostModel {
  const SIInstrInfo &TII;
  const TargetSchedModel &SchedModel;
  BranchProbability BranchProb;
  static constexpr uint64_t BranchNotTakenCost = 1;
  uint64_t BranchTakenCost;
  uint64_t ThenCyclesCost = 0;

public:
  BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
                        const MachineBasicBlock &Succ)
      : TII(TII), SchedModel(TII.getSchedModel()) {
    const MachineBasicBlock &Head = *Branch.getParent();
    const auto *FromIt = find(Head.successors(), &Succ);
    assert(FromIt != Head.succ_end());

    BranchProb = Head.getSuccProbability(FromIt);
    // Without profile data, fall back to zero probability, the most
    // conservative choice for the profitability check below.
    if (BranchProb.isUnknown())
      BranchProb = BranchProbability::getZero();
    BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
  }

  bool isProfitable(const MachineInstr &MI) {
    // How long a waitcnt actually waits depends on run-time state, so its
    // cost cannot be estimated here; conservatively keep the branch.
    if (TII.isWaitcnt(MI.getOpcode()))
      return false;

    ThenCyclesCost += SchedModel.computeInstrLatency(&MI);

    // Consider `P = N/D` to be the probability of execz being false, i.e. of
    // falling through and executing the then-block. The transformation is
    // profitable if always executing the then-block is cheaper than sometimes
    // executing it and always executing s_cbranch_execz:
    // * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
    //   BranchNotTakenCost
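    // For example, with P = 3/4 (N = 3, D = 4), BranchTakenCost = 4 and
    // BranchNotTakenCost = 1 (illustrative numbers, not real latencies),
    // removing the branch stays profitable while
    // ThenCyclesCost <= ((4 - 3) * 4 + 3 * 1) / (4 - 3) = 7 cycles.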
    uint64_t Numerator = BranchProb.getNumerator();
    uint64_t Denominator = BranchProb.getDenominator();
    return (Denominator - Numerator) * ThenCyclesCost <=
           ((Denominator - Numerator) * BranchTakenCost +
            Numerator * BranchNotTakenCost);
  }
};

bool SIPreEmitPeephole::mustRetainExeczBranch(
    const MachineInstr &Branch, const MachineBasicBlock &From,
    const MachineBasicBlock &To) const {
  assert(is_contained(Branch.getParent()->successors(), &From));
  BranchWeightCostModel CostModel{*TII, Branch, From};

  const MachineFunction *MF = From.getParent();
  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (const MachineInstr &MI : MBB) {
      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might never be taken when EXEC = 0.
      // Hence we should retain cbranch out of the loop lest it become
      // infinite.
      if (MI.isConditionalBranch())
        return true;

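      // An unconditional branch that does not target the layout successor
      // leaves the [From, To) region being scanned, so the analysis here
      // would miss code that actually executes; keep the execz branch.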
      if (MI.isUnconditionalBranch() &&
          TII->getBranchDestBlock(MI) != MBB.getNextNode())
        return true;

      if (MI.isMetaInstruction())
        continue;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
        return true;

      if (!CostModel.isProfitable(MI))
        return true;
    }
  }

  return false;
}
} // namespace

// Returns true if the skip branch instruction is removed.
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
                                          MachineBasicBlock &SrcMBB) {
  // The cost model in mustRetainExeczBranch() needs per-instruction latencies
  // from an instruction scheduling model.
  if (!TII->getSchedModel().hasInstrSchedModel())
    return false;

  MachineBasicBlock *TrueMBB = nullptr;
  MachineBasicBlock *FalseMBB = nullptr;
  SmallVector<MachineOperand, 1> Cond;

  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  // Consider only the forward branches.
  if (SrcMBB.getNumber() >= TrueMBB->getNumber())
    return false;

  // Consider only when it is legal and profitable.
  if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
    return false;

  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
  MI.eraseFromParent();
  SrcMBB.removeSuccessor(TrueMBB);

  return true;
}

PreservedAnalyses
llvm::SIPreEmitPeepholePass::run(MachineFunction &MF,
                                 MachineFunctionAnalysisManager &MFAM) {
  if (!SIPreEmitPeephole().run(MF))
    return PreservedAnalyses::all();

  return getMachineFunctionPassPreservedAnalyses();
}

bool SIPreEmitPeephole::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  bool Changed = false;

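  // Renumber blocks so that MBB numbers follow layout order;
  // removeExeczBranch relies on this to recognize forward branches.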
  MF.RenumberBlocks();

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
    // Check first terminator for branches to optimize.
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        break;
      case AMDGPU::S_CBRANCH_EXECZ:
        Changed |= removeExeczBranch(MI, MBB);
        break;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for pairs of S_SET_GPR_IDX_ON instructions to see if the
    // second one is redundant. The expensive checks are done in
    // optimizeSetGPR(), and the distance between the pair is limited to 20
    // instructions for compile-time purposes.
    // Note: this needs to work on bundles, as S_SET_GPR_IDX* instructions
    // may be bundled with the instructions they modify.
    for (auto &MI : make_early_inc_range(MBB.instrs())) {
      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  return Changed;
}