//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs peephole optimizations before code emission.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

static unsigned SkipThreshold;

static cl::opt<unsigned, true> SkipThresholdFlag(
    "amdgpu-skip-threshold", cl::Hidden,
    cl::desc(
        "Number of instructions before jumping over divergent control flow"),
    cl::location(SkipThreshold), cl::init(12));
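
// For experiments, the threshold can be overridden on the command line, e.g.
// (illustrative invocation; kernel.ll is a placeholder input):
//   llc -mtriple=amdgcn -amdgpu-skip-threshold=4 kernel.ll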

namespace {

class SIPreEmitPeephole : public MachineFunctionPass {
private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  bool optimizeVccBranch(MachineInstr &MI) const;
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
                            MachineBasicBlock *&TrueMBB,
                            MachineBasicBlock *&FalseMBB,
                            SmallVectorImpl<MachineOperand> &Cond);
  bool mustRetainExeczBranch(const MachineBasicBlock &From,
                             const MachineBasicBlock &To) const;
  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);

public:
  static char ID;

  SIPreEmitPeephole() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeephole::ID = 0;

char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;
bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens when a block that assigns -1 or 0 to a saved mask is combined
  // with a block that consumes that saved mask and then branches.
  //
  // While searching, this also performs the following substitution:
  // vcc = V_CMP
  // vcc = S_AND exec, vcc
  // S_CBRANCH_VCC[N]Z
  // =>
  // vcc = V_CMP
  // S_CBRANCH_VCC[N]Z
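  //
  // A concrete wave64 shape of the first pattern (illustrative):
  //   s_mov_b64 s[2:3], -1
  //   s_and_b64 vcc, exec, s[2:3]
  //   s_cbranch_vccz .LBB0_3
  // =>
  //   s_cbranch_execz .LBB0_3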

  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    bool ModifiesExec = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
      ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
    }
    if (M == E)
      return Changed;
    // If SReg is VCC and its definition is a VALU comparison, the S_AND with
    // EXEC is not required: erase the S_AND and return.
    // Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS.
    if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
        TII->isVOPC(*M)) {
      A->eraseFromParent();
      return true;
    }
    if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // If SReg is only used by the AND instruction, fold the immediate into
    // the AND.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert mask for s_andn2
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      // Replace AND with MOV
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addReg(ExecReg);
      }
    }
    // Remove AND instruction
    A->eraseFromParent();
  }

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch
    // Remove all successors shadowed by new unconditional branch
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (auto *BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to unconditional branch
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, TRI, false /*Kill*/));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}

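// Check whether the S_SET_GPR_IDX_ON instruction MI is identical to First and
// therefore redundant, erasing it together with any intervening
// S_SET_GPR_IDX_OFF. A sketch of the removed pattern (illustrative):
//   s_set_gpr_idx_on s2, gpr_idx(SRC0)
//   v_mov_b32_e32 v0, v1
//   s_set_gpr_idx_off                    ; erased
//   s_set_gpr_idx_on s2, gpr_idx(SRC0)   ; erased, the mode is unchanged
//   v_mov_b32_e32 v3, v4
//   s_set_gpr_idx_off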
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  if (!MI.isIdenticalTo(First))
    return false;

  // Scan the instructions between First and MI to verify that nothing in
  // between changes M0, the index register, or the indexing mode.
  for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
                                         E = MI.getIterator();
       I != E; ++I) {
    if (I->isBundle())
      continue;
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(),
                       [&MRI, this](const MachineOperand &MO) {
                         return MO.isReg() &&
                                TRI->isVectorRegister(MRI, MO.getReg());
                       })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
                        I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
          return false;
      }
    }
  }

  MI.eraseFromBundle();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromBundle();
  return true;
}

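// Find the true/false destinations of SrcMBB's terminating branch via
// analyzeBranch(). Returns false if the branch cannot be analyzed; if there
// is no explicit false destination, the layout successor is used.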
bool SIPreEmitPeephole::getBlockDestinations(
    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  if (!FalseMBB)
    FalseMBB = SrcMBB.getNextNode();

  return true;
}

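// Returns true if the s_cbranch_execz that skips the blocks from From
// (inclusive) to To (exclusive) must be retained: removing it would make the
// wave execute code that may be unsafe or too expensive when EXEC = 0.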
bool SIPreEmitPeephole::mustRetainExeczBranch(
    const MachineBasicBlock &From, const MachineBasicBlock &To) const {
  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();

  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (const MachineInstr &MI : MBB) {
      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might never be taken when EXEC = 0.
      // Hence we should retain the cbranch out of the loop lest it become
      // infinite.
      if (MI.isConditionalBranch())
        return true;

      if (MI.isMetaInstruction())
        continue;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
        return true;

      // These instructions are potentially expensive even if EXEC = 0.
      if (TII->isSMRD(MI) || TII->isVMEM(MI) || TII->isFLAT(MI) ||
          TII->isDS(MI) || TII->isWaitcnt(MI.getOpcode()))
        return true;

      ++NumInstr;
      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

// Returns true if the skip branch instruction is removed.
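// For example (illustrative), a short side-effect-free skipped region makes
// the skip branch unprofitable:
//   s_cbranch_execz .LBB0_2   ; removed by this peephole
//   v_add_f32_e32 v0, v1, v2  ; VALU ops do nothing when EXEC = 0
// .LBB0_2: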
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
                                          MachineBasicBlock &SrcMBB) {
  MachineBasicBlock *TrueMBB = nullptr;
  MachineBasicBlock *FalseMBB = nullptr;
  SmallVector<MachineOperand, 1> Cond;

  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  // Consider only the forward branches.
  if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
      mustRetainExeczBranch(*FalseMBB, *TrueMBB))
    return false;

  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
  MI.eraseFromParent();
  SrcMBB.removeSuccessor(TrueMBB);

  return true;
}

bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  bool Changed = false;

  MF.RenumberBlocks();

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
    // Check the first terminator for branches to optimize.
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        break;
      case AMDGPU::S_CBRANCH_EXECZ:
        Changed |= removeExeczBranch(MI, MBB);
        break;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for pairs of S_SET_GPR_IDX_ON instructions to see if the
    // second one is redundant. The expensive checks are done in
    // optimizeSetGPR(); the distance between the pair is limited to 20
    // instructions for compile-time reasons.
    // Note: this needs to work on bundles, as S_SET_GPR_IDX* instructions
    // may be bundled with the instructions they modify.
    for (auto &MI : make_early_inc_range(MBB.instrs())) {
      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  return Changed;
}