//===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
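//
/// \file
/// Post-RA cleanup of exec mask manipulation. This pass folds copies to and
/// from exec around scalar logical operations into the corresponding
/// s_*_saveexec instructions, rewrites s_or_saveexec followed by an xor with
/// exec into s_andn2_saveexec, and, on GFX10.3+, replaces v_cmp plus
/// s_and_saveexec pairs with an s_mov and a v_cmpx instruction.
//
//===----------------------------------------------------------------------===//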

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-optimize-exec-masking"

namespace {

class SIOptimizeExecMasking : public MachineFunctionPass {
  MachineFunction *MF = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  MCRegister Exec;

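  // Candidates recorded while scanning each basic block; the corresponding
  // rewrites (and kill-flag clearing) are performed once scanning is done.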
  DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors;
  SmallVector<MachineOperand *, 1> KillFlagCandidates;

  Register isCopyFromExec(const MachineInstr &MI) const;
  Register isCopyToExec(const MachineInstr &MI) const;
  bool removeTerminatorBit(MachineInstr &MI) const;
  MachineBasicBlock::reverse_iterator
  fixTerminators(MachineBasicBlock &MBB) const;
  MachineBasicBlock::reverse_iterator
  findExecCopy(MachineBasicBlock &MBB,
               MachineBasicBlock::reverse_iterator I) const;
  bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
                              MCRegister Reg, bool UseLiveOuts = false,
                              bool IgnoreStart = false) const;
  bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg) const;
  MachineInstr *findInstrBackwards(
      MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
      ArrayRef<MCRegister> NonModifiableRegs,
      MachineInstr *Terminator = nullptr,
      SmallVectorImpl<MachineOperand *> *KillFlagCandidates = nullptr,
      unsigned MaxInstructions = 20) const;
  bool optimizeExecSequence();
  void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI);
  bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
                                    MachineInstr &VCmp, MCRegister Exec) const;

  void tryRecordOrSaveexecXorSequence(MachineInstr &MI);
  bool optimizeOrSaveexecXorSequences();

public:
  static char ID;

  SIOptimizeExecMasking() : MachineFunctionPass(ID) {
    initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI optimize exec mask operations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE,
                      "SI optimize exec mask operations", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE,
                    "SI optimize exec mask operations", false, false)

char SIOptimizeExecMasking::ID = 0;

char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;

/// If \p MI is a copy from exec, return the register copied to.
Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B32_term: {
    const MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() && Src.getReg() == Exec)
      return MI.getOperand(0).getReg();
  }
  }

  return AMDGPU::NoRegister;
}

/// If \p MI is a copy to exec, return the register copied from.
Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B32: {
    const MachineOperand &Dst = MI.getOperand(0);
    if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg())
      return MI.getOperand(1).getReg();
    break;
  }
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32_term:
    llvm_unreachable("should have been replaced");
  }

  return Register();
}

/// If \p MI is a logical operation on an exec value,
/// return the register copied to.
static Register isLogicalOpOnExec(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
  case AMDGPU::S_ANDN2_B64:
  case AMDGPU::S_ORN2_B64:
  case AMDGPU::S_NAND_B64:
  case AMDGPU::S_NOR_B64:
  case AMDGPU::S_XNOR_B64: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::S_AND_B32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
  case AMDGPU::S_ANDN2_B32:
  case AMDGPU::S_ORN2_B32:
  case AMDGPU::S_NAND_B32:
  case AMDGPU::S_NOR_B32:
  case AMDGPU::S_XNOR_B32: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    break;
  }
  }

  return AMDGPU::NoRegister;
}

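// Map a scalar bitwise opcode to its *_SAVEEXEC_* form, or
// INSTRUCTION_LIST_END if there is no such form.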
static unsigned getSaveExecOp(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::S_AND_B64:
    return AMDGPU::S_AND_SAVEEXEC_B64;
  case AMDGPU::S_OR_B64:
    return AMDGPU::S_OR_SAVEEXEC_B64;
  case AMDGPU::S_XOR_B64:
    return AMDGPU::S_XOR_SAVEEXEC_B64;
  case AMDGPU::S_ANDN2_B64:
    return AMDGPU::S_ANDN2_SAVEEXEC_B64;
  case AMDGPU::S_ORN2_B64:
    return AMDGPU::S_ORN2_SAVEEXEC_B64;
  case AMDGPU::S_NAND_B64:
    return AMDGPU::S_NAND_SAVEEXEC_B64;
  case AMDGPU::S_NOR_B64:
    return AMDGPU::S_NOR_SAVEEXEC_B64;
  case AMDGPU::S_XNOR_B64:
    return AMDGPU::S_XNOR_SAVEEXEC_B64;
  case AMDGPU::S_AND_B32:
    return AMDGPU::S_AND_SAVEEXEC_B32;
  case AMDGPU::S_OR_B32:
    return AMDGPU::S_OR_SAVEEXEC_B32;
  case AMDGPU::S_XOR_B32:
    return AMDGPU::S_XOR_SAVEEXEC_B32;
  case AMDGPU::S_ANDN2_B32:
    return AMDGPU::S_ANDN2_SAVEEXEC_B32;
  case AMDGPU::S_ORN2_B32:
    return AMDGPU::S_ORN2_SAVEEXEC_B32;
  case AMDGPU::S_NAND_B32:
    return AMDGPU::S_NAND_SAVEEXEC_B32;
  case AMDGPU::S_NOR_B32:
    return AMDGPU::S_NOR_SAVEEXEC_B32;
  case AMDGPU::S_XNOR_B32:
    return AMDGPU::S_XNOR_SAVEEXEC_B32;
  default:
    return AMDGPU::INSTRUCTION_LIST_END;
  }
}

// These are only terminators to get correct spill code placement during
// register allocation, so turn them back into normal instructions.
bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::S_MOV_B32_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
    return true;
  }
  case AMDGPU::S_MOV_B64_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
    return true;
  }
  case AMDGPU::S_XOR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_XOR_B64));
    return true;
  }
  case AMDGPU::S_XOR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_XOR_B32));
    return true;
  }
  case AMDGPU::S_OR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_OR_B64));
    return true;
  }
  case AMDGPU::S_OR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_OR_B32));
    return true;
  }
  case AMDGPU::S_ANDN2_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B64));
    return true;
  }
  case AMDGPU::S_ANDN2_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B32));
    return true;
  }
  case AMDGPU::S_AND_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_AND_B64));
    return true;
  }
  case AMDGPU::S_AND_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_AND_B32));
    return true;
  }
  default:
    return false;
  }
}

// Turn all pseudoterminators in the block into their equivalent non-terminator
// instructions. Returns the reverse iterator to the first non-terminator
// instruction in the block.
MachineBasicBlock::reverse_iterator
SIOptimizeExecMasking::fixTerminators(MachineBasicBlock &MBB) const {
  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();

  bool Seen = false;
  MachineBasicBlock::reverse_iterator FirstNonTerm = I;
  for (; I != E; ++I) {
    if (!I->isTerminator())
      return Seen ? FirstNonTerm : I;

    if (removeTerminatorBit(*I)) {
      if (!Seen) {
        FirstNonTerm = I;
        Seen = true;
      }
    }
  }

  return FirstNonTerm;
}

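// Scan backwards from I within a small instruction window (InstLimit) for a
// copy from exec; return an iterator to it, or MBB.rend() if none is found.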
MachineBasicBlock::reverse_iterator SIOptimizeExecMasking::findExecCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I) const {
  const unsigned InstLimit = 25;

  auto E = MBB.rend();
  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
    Register CopyFromExec = isCopyFromExec(*I);
    if (CopyFromExec.isValid())
      return I;
  }

  return E;
}

// XXX - Seems LiveRegUnits doesn't work correctly since it will incorrectly
// report the register as unavailable because a super-register with a lane mask
// is unavailable.
static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
  for (MachineBasicBlock *Succ : MBB.successors()) {
    if (Succ->isLiveIn(Reg))
      return true;
  }

  return false;
}

// Backwards-iterate from Origin (for n=MaxInstructions iterations) until
// either the beginning of the BB is reached or Pred evaluates to true - which
// can be an arbitrary condition based on the current MachineInstr, for
// instance a specific target instruction. Breaks prematurely by returning
// nullptr if one of the registers given in NonModifiableRegs is modified by
// the current instruction.
MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
    MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
    ArrayRef<MCRegister> NonModifiableRegs, MachineInstr *Terminator,
    SmallVectorImpl<MachineOperand *> *KillFlagCandidates,
    unsigned MaxInstructions) const {
  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
                                      E = Origin.getParent()->rend();
  unsigned CurrentIteration = 0;

  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
    if (A->isDebugInstr())
      continue;

    if (Pred(&*A))
      return &*A;

    for (MCRegister Reg : NonModifiableRegs) {
      if (A->modifiesRegister(Reg, TRI))
        return nullptr;

      // Check for kills that appear after the terminator instruction and
      // would not be detected by clearKillFlags; they would leave the
      // register dead at a later point and make the verifier fail.
      // We use the candidates to clear the kill flags later.
      if (Terminator && KillFlagCandidates && A != Terminator &&
          A->killsRegister(Reg, TRI)) {
        for (MachineOperand &MO : A->operands()) {
          if (MO.isReg() && MO.isKill()) {
            Register Candidate = MO.getReg();
            if (Candidate != Reg && TRI->regsOverlap(Candidate, Reg))
              KillFlagCandidates->push_back(&MO);
          }
        }
      }
    }

    ++CurrentIteration;
  }

  return nullptr;
}

// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..Start].
// It does so by calculating liveness backwards from the end of the BB until
// either Stop or the beginning of the BB is reached.
// After liveness is calculated, we can determine if Reg is still in use and
// not defined in between the instructions.
bool SIOptimizeExecMasking::isRegisterInUseBetween(MachineInstr &Stop,
                                                   MachineInstr &Start,
                                                   MCRegister Reg,
                                                   bool UseLiveOuts,
                                                   bool IgnoreStart) const {
  LiveRegUnits LR(*TRI);
  if (UseLiveOuts)
    LR.addLiveOuts(*Stop.getParent());

  MachineBasicBlock::reverse_iterator A(Start);

  if (IgnoreStart)
    ++A;

  for (; A != Stop.getParent()->rend() && A != Stop; ++A) {
    LR.stepBackward(*A);
  }

  return !LR.available(Reg) || MRI->isReserved(Reg);
}

// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..BB.end].
bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
                                                 MCRegister Reg) const {
  return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, true);
}

// Optimize sequences emitted for control flow lowering. They are originally
// emitted as separate operations because spill code may need to be
// inserted for the saved copy of exec.
//
// x = copy exec
// z = s_<op>_b64 x, y
// exec = copy z
// =>
// x = s_<op>_saveexec_b64 y
//
bool SIOptimizeExecMasking::optimizeExecSequence() {
  bool Changed = false;
  for (MachineBasicBlock &MBB : *MF) {
    MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
    MachineBasicBlock::reverse_iterator E = MBB.rend();
    if (I == E)
      continue;

    // It's possible to see other terminator copies after the exec copy. This
    // can happen if control flow pseudos had their outputs used by phis.
    Register CopyToExec;

    unsigned SearchCount = 0;
    const unsigned SearchLimit = 5;
    while (I != E && SearchCount++ < SearchLimit) {
      CopyToExec = isCopyToExec(*I);
      if (CopyToExec)
        break;
      ++I;
    }

    if (!CopyToExec)
      continue;

    // Scan backwards to find the def.
    auto *CopyToExecInst = &*I;
    auto CopyFromExecInst = findExecCopy(MBB, I);
    if (CopyFromExecInst == E) {
      auto PrepareExecInst = std::next(I);
      if (PrepareExecInst == E)
        continue;
      // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
      if (CopyToExecInst->getOperand(1).isKill() &&
          isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
        LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);

        PrepareExecInst->getOperand(0).setReg(Exec);

        LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');

        CopyToExecInst->eraseFromParent();
        Changed = true;
      }

      continue;
    }

    if (isLiveOut(MBB, CopyToExec)) {
      // The copied register is live out and has a second use in another block.
      LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n");
      continue;
    }

    Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
    MachineInstr *SaveExecInst = nullptr;
    SmallVector<MachineInstr *, 4> OtherUseInsts;

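    // Walk forward from the copy-from-exec to the copy-to-exec and identify
    // the single logical op that consumes the saved exec value. Give up if
    // the copy has other uses before that op, if something else also writes
    // the destination, or if exec is read again after the candidate is found.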
    for (MachineBasicBlock::iterator
             J = std::next(CopyFromExecInst->getIterator()),
             JE = I->getIterator();
         J != JE; ++J) {
      if (SaveExecInst && J->readsRegister(Exec, TRI)) {
        LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
        // Make sure this is inserted after any VALU ops that may have been
        // scheduled in between.
        SaveExecInst = nullptr;
        break;
      }

      bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);

      if (J->modifiesRegister(CopyToExec, TRI)) {
        if (SaveExecInst) {
          LLVM_DEBUG(dbgs() << "Multiple instructions modify "
                            << printReg(CopyToExec, TRI) << '\n');
          SaveExecInst = nullptr;
          break;
        }

        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
          break;

        if (ReadsCopyFromExec) {
          SaveExecInst = &*J;
          LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
          continue;
        }
        LLVM_DEBUG(dbgs() << "Instruction does not read exec copy: " << *J
                          << '\n');
        break;
      }
      if (ReadsCopyFromExec && !SaveExecInst) {
        // Make sure no other instruction is trying to use this copy, before it
        // will be rewritten by the saveexec, i.e. hasOneUse. There may have
        // been another use, such as an inserted spill. For example:
        //
        // %sgpr0_sgpr1 = COPY %exec
        // spill %sgpr0_sgpr1
        // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
        //
        LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
                          << '\n');
        break;
      }

      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
        assert(SaveExecInst != &*J);
        OtherUseInsts.push_back(&*J);
      }
    }

    if (!SaveExecInst)
      continue;

    LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');

    MachineOperand &Src0 = SaveExecInst->getOperand(1);
    MachineOperand &Src1 = SaveExecInst->getOperand(2);

    MachineOperand *OtherOp = nullptr;

    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
      OtherOp = &Src1;
    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
      if (!SaveExecInst->isCommutable())
        break;

      OtherOp = &Src0;
    } else
      llvm_unreachable("unexpected");

    CopyFromExecInst->eraseFromParent();

    auto InsPt = SaveExecInst->getIterator();
    const DebugLoc &DL = SaveExecInst->getDebugLoc();

    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
            CopyFromExec)
        .addReg(OtherOp->getReg());
    SaveExecInst->eraseFromParent();

    CopyToExecInst->eraseFromParent();

    for (MachineInstr *OtherInst : OtherUseInsts) {
      OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister,
                                    *TRI);
    }

    Changed = true;
  }

  return Changed;
}

// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
// operands extracted from a v_cmp ..., s_and_saveexec pattern.
bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
    MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const {
  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());

  if (NewOpcode == -1)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);

  Register MoveDest = SaveExecInstr.getOperand(0).getReg();

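  // If the saveexec result still has uses, keep the old exec mask available
  // by materializing it with an explicit s_mov before the compare.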
  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
  if (!SaveExecInstr.uses().empty()) {
    bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, *MRI) == 32;
    unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
        .addReg(Exec);
  }

  // Omit dst as V_CMPX is implicitly writing to EXEC.
  // Add dummy src and clamp modifiers, if needed.
  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
                         VCmp.getDebugLoc(), TII->get(NewOpcode));

  auto TryAddImmediateValueFromNamedOperand =
      [&](unsigned OperandName) -> void {
    if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
      Builder.addImm(Mod->getImm());
  };

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
  Builder.add(*Src0);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
  Builder.add(*Src1);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);

  // The kill flags may no longer be correct.
  if (Src0->isReg())
    MRI->clearKillFlags(Src0->getReg());
  if (Src1->isReg())
    MRI->clearKillFlags(Src1->getReg());

  for (MachineOperand *MO : KillFlagCandidates)
    MO->setIsKill(false);

  SaveExecInstr.eraseFromParent();
  VCmp.eraseFromParent();

  return true;
}

// Record (on GFX10.3 and later) occurrences of
// v_cmp_* SGPR, IMM, VGPR
// s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
// to be replaced with
// s_mov_b32 EXEC_SGPR_DEST, exec_lo
// v_cmpx_* IMM, VGPR
// to reduce pipeline stalls.
void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
    MachineInstr &MI) {
  if (!ST->hasGFX10_3Insts())
    return;

  const unsigned AndSaveExecOpcode =
      ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;

  if (MI.getOpcode() != AndSaveExecOpcode)
    return;

  Register SaveExecDest = MI.getOperand(0).getReg();
  if (!TRI->isSGPRReg(*MRI, SaveExecDest))
    return;

  MachineOperand *SaveExecSrc0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  if (!SaveExecSrc0->isReg())
    return;

  // Try to find a v_cmp ..., s_and_saveexec sequence that can be optimized,
  // starting from this s_and_saveexec instruction. VCmp is set to the v_cmp
  // instruction if it is safe to replace the sequence (see the conditions
  // below). This runs after register allocation, so some checks on operand
  // dependencies need to be considered.
  MachineInstr *VCmp = nullptr;

  // Try to find the last v_cmp instruction that defs the saveexec input
  // operand without any write to Exec or the saveexec input operand
  // in between.
  VCmp = findInstrBackwards(
      MI,
      [&](MachineInstr *Check) {
        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
      },
      {Exec, SaveExecSrc0->getReg()});

  if (!VCmp)
    return;

  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
  assert(VCmpDest && "Should have an sdst operand!");

  // Check if any of the v_cmp source operands is written by the saveexec.
  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
  if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
      MI.modifiesRegister(Src0->getReg(), TRI))
    return;

  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
  if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
      MI.modifiesRegister(Src1->getReg(), TRI))
    return;

  // Don't do the transformation if the destination operand is included in
  // its MBB live-outs, meaning it's used in any of its successors, leading
  // to incorrect code if the v_cmp and therefore the def of
  // the dest operand is removed.
  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
    return;

  // If the v_cmp target is in use between v_cmp and s_and_saveexec or after
  // the s_and_saveexec, skip the optimization.
  if (isRegisterInUseBetween(*VCmp, MI, VCmpDest->getReg(), false, true) ||
      isRegisterInUseAfter(MI, VCmpDest->getReg()))
    return;

  // Try to determine if there is a write to any of the VCmp
  // operands between the saveexec and the vcmp.
  // If yes, additional VGPR spilling might need to be inserted. In this case,
  // it's not worth replacing the instruction sequence.
  SmallVector<MCRegister, 2> NonDefRegs;
  if (Src0->isReg())
    NonDefRegs.push_back(Src0->getReg());

  if (Src1->isReg())
    NonDefRegs.push_back(Src1->getReg());

  if (!findInstrBackwards(
          MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs,
          VCmp, &KillFlagCandidates))
    return;

  if (VCmp)
    SaveExecVCmpMapping[&MI] = VCmp;
}

// Record occurrences of
// s_or_saveexec s_o, s_i
// s_xor exec, exec, s_o
// to be replaced with
// s_andn2_saveexec s_o, s_i.
void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) {
  const unsigned XorOpcode =
      ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;

  if (MI.getOpcode() == XorOpcode && &MI != &MI.getParent()->front()) {
    const MachineOperand &XorDst = MI.getOperand(0);
    const MachineOperand &XorSrc0 = MI.getOperand(1);
    const MachineOperand &XorSrc1 = MI.getOperand(2);

    if (XorDst.isReg() && XorDst.getReg() == Exec && XorSrc0.isReg() &&
        XorSrc1.isReg() &&
        (XorSrc0.getReg() == Exec || XorSrc1.getReg() == Exec)) {
      const unsigned OrSaveexecOpcode = ST->isWave32()
                                            ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64;

      // Peek at the previous instruction and check if this is a relevant
      // s_or_saveexec instruction.
      MachineInstr &PossibleOrSaveexec = *MI.getPrevNode();
      if (PossibleOrSaveexec.getOpcode() != OrSaveexecOpcode)
        return;

      const MachineOperand &OrDst = PossibleOrSaveexec.getOperand(0);
      const MachineOperand &OrSrc0 = PossibleOrSaveexec.getOperand(1);
      if (OrDst.isReg() && OrSrc0.isReg()) {
        if ((XorSrc0.getReg() == Exec && XorSrc1.getReg() == OrDst.getReg()) ||
            (XorSrc0.getReg() == OrDst.getReg() && XorSrc1.getReg() == Exec)) {
          OrXors.emplace_back(&PossibleOrSaveexec, &MI);
        }
      }
    }
  }
}

bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() {
  if (OrXors.empty()) {
    return false;
  }

  bool Changed = false;
  const unsigned Andn2Opcode = ST->isWave32() ? AMDGPU::S_ANDN2_SAVEEXEC_B32
                                              : AMDGPU::S_ANDN2_SAVEEXEC_B64;

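  // Rewrite each recorded pair in place: the s_or_saveexec becomes an
  // s_andn2_saveexec with the same operands, and the trailing xor with exec
  // is dropped entirely.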
  for (const auto &Pair : OrXors) {
    MachineInstr *Or = nullptr;
    MachineInstr *Xor = nullptr;
    std::tie(Or, Xor) = Pair;
    BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(),
            TII->get(Andn2Opcode), Or->getOperand(0).getReg())
        .addReg(Or->getOperand(1).getReg());

    Or->eraseFromParent();
    Xor->eraseFromParent();

    Changed = true;
  }

  return Changed;
}

bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  this->MF = &MF;
  ST = &MF.getSubtarget<GCNSubtarget>();
  TRI = ST->getRegisterInfo();
  TII = ST->getInstrInfo();
  MRI = &MF.getRegInfo();
  Exec = TRI->getExec();

  bool Changed = optimizeExecSequence();

  OrXors.clear();
  SaveExecVCmpMapping.clear();
  KillFlagCandidates.clear();
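  // Scan the last few instructions of each block (bounded by SearchWindow)
  // and record candidate or_saveexec/xor and v_cmp/saveexec sequences; the
  // actual rewrites happen after the scan.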
  static unsigned SearchWindow = 10;
  for (MachineBasicBlock &MBB : MF) {
    unsigned SearchCount = 0;

    for (auto &MI : llvm::reverse(MBB)) {
      if (MI.isDebugInstr())
        continue;

      if (SearchCount >= SearchWindow) {
        break;
      }

      tryRecordOrSaveexecXorSequence(MI);
      tryRecordVCmpxAndSaveexecSequence(MI);

      if (MI.modifiesRegister(Exec, TRI)) {
        break;
      }

      ++SearchCount;
    }
  }

  Changed |= optimizeOrSaveexecXorSequences();
  for (const auto &Entry : SaveExecVCmpMapping) {
    MachineInstr *SaveExecInstr = Entry.getFirst();
    MachineInstr *VCmpInstr = Entry.getSecond();

    Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec);
  }

  return Changed;
}