//===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "SIOptimizeExecMasking.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-optimize-exec-masking"

namespace {

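// Shared implementation used by both the legacy pass and the new pass
// manager pass below.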
class SIOptimizeExecMasking {
  MachineFunction *MF = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  MCRegister Exec;

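  // Candidates recorded while scanning each block: s_and_saveexec
  // instructions mapped to the v_cmp they can be fused with, s_or_saveexec /
  // s_xor pairs, and kill flags that must be cleared if a v_cmp is replaced
  // by a v_cmpx at the saveexec's position.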
  DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors;
  SmallVector<MachineOperand *, 1> KillFlagCandidates;

  Register isCopyFromExec(const MachineInstr &MI) const;
  Register isCopyToExec(const MachineInstr &MI) const;
  bool removeTerminatorBit(MachineInstr &MI) const;
  MachineBasicBlock::reverse_iterator
  fixTerminators(MachineBasicBlock &MBB) const;
  MachineBasicBlock::reverse_iterator
  findExecCopy(MachineBasicBlock &MBB,
               MachineBasicBlock::reverse_iterator I) const;
  bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
                              MCRegister Reg, bool UseLiveOuts = false,
                              bool IgnoreStart = false) const;
  bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg) const;
  MachineInstr *findInstrBackwards(
      MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
      ArrayRef<MCRegister> NonModifiableRegs,
      MachineInstr *Terminator = nullptr,
      SmallVectorImpl<MachineOperand *> *KillFlagCandidates = nullptr,
      unsigned MaxInstructions = 20) const;
  bool optimizeExecSequence();
  void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI);
  bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
                                    MachineInstr &VCmp, MCRegister Exec) const;

  void tryRecordOrSaveexecXorSequence(MachineInstr &MI);
  bool optimizeOrSaveexecXorSequences();

public:
  bool run(MachineFunction &MF);
};

class SIOptimizeExecMaskingLegacy : public MachineFunctionPass {
public:
  static char ID;

  SIOptimizeExecMaskingLegacy() : MachineFunctionPass(ID) {
    initializeSIOptimizeExecMaskingLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI optimize exec mask operations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

PreservedAnalyses
SIOptimizeExecMaskingPass::run(MachineFunction &MF,
                               MachineFunctionAnalysisManager &) {
  SIOptimizeExecMasking Impl;

  if (!Impl.run(MF))
    return PreservedAnalyses::all();

  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

INITIALIZE_PASS_BEGIN(SIOptimizeExecMaskingLegacy, DEBUG_TYPE,
                      "SI optimize exec mask operations", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_END(SIOptimizeExecMaskingLegacy, DEBUG_TYPE,
                    "SI optimize exec mask operations", false, false)

char SIOptimizeExecMaskingLegacy::ID = 0;

char &llvm::SIOptimizeExecMaskingLegacyID = SIOptimizeExecMaskingLegacy::ID;

/// If \p MI is a copy from exec, return the register copied to.
Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B32_term: {
    const MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() && Src.getReg() == Exec)
      return MI.getOperand(0).getReg();
  }
  }

  return AMDGPU::NoRegister;
}

/// If \p MI is a copy to exec, return the register copied from.
Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B32: {
    const MachineOperand &Dst = MI.getOperand(0);
    if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg())
      return MI.getOperand(1).getReg();
    break;
  }
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32_term:
    llvm_unreachable("should have been replaced");
  }

  return Register();
}

/// If \p MI is a logical operation on an exec value,
/// return the register copied to.
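/// For example, given "%dst = S_AND_B64 $exec, %src" this returns %dst.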
static Register isLogicalOpOnExec(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
  case AMDGPU::S_ANDN2_B64:
  case AMDGPU::S_ORN2_B64:
  case AMDGPU::S_NAND_B64:
  case AMDGPU::S_NOR_B64:
  case AMDGPU::S_XNOR_B64: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::S_AND_B32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
  case AMDGPU::S_ANDN2_B32:
  case AMDGPU::S_ORN2_B32:
  case AMDGPU::S_NAND_B32:
  case AMDGPU::S_NOR_B32:
  case AMDGPU::S_XNOR_B32: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    break;
  }
  }

  return AMDGPU::NoRegister;
}

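// Map a scalar logical opcode to its fused *_SAVEEXEC_* form, or return
// AMDGPU::INSTRUCTION_LIST_END if no such form exists.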
static unsigned getSaveExecOp(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::S_AND_B64:
    return AMDGPU::S_AND_SAVEEXEC_B64;
  case AMDGPU::S_OR_B64:
    return AMDGPU::S_OR_SAVEEXEC_B64;
  case AMDGPU::S_XOR_B64:
    return AMDGPU::S_XOR_SAVEEXEC_B64;
  case AMDGPU::S_ANDN2_B64:
    return AMDGPU::S_ANDN2_SAVEEXEC_B64;
  case AMDGPU::S_ORN2_B64:
    return AMDGPU::S_ORN2_SAVEEXEC_B64;
  case AMDGPU::S_NAND_B64:
    return AMDGPU::S_NAND_SAVEEXEC_B64;
  case AMDGPU::S_NOR_B64:
    return AMDGPU::S_NOR_SAVEEXEC_B64;
  case AMDGPU::S_XNOR_B64:
    return AMDGPU::S_XNOR_SAVEEXEC_B64;
  case AMDGPU::S_AND_B32:
    return AMDGPU::S_AND_SAVEEXEC_B32;
  case AMDGPU::S_OR_B32:
    return AMDGPU::S_OR_SAVEEXEC_B32;
  case AMDGPU::S_XOR_B32:
    return AMDGPU::S_XOR_SAVEEXEC_B32;
  case AMDGPU::S_ANDN2_B32:
    return AMDGPU::S_ANDN2_SAVEEXEC_B32;
  case AMDGPU::S_ORN2_B32:
    return AMDGPU::S_ORN2_SAVEEXEC_B32;
  case AMDGPU::S_NAND_B32:
    return AMDGPU::S_NAND_SAVEEXEC_B32;
  case AMDGPU::S_NOR_B32:
    return AMDGPU::S_NOR_SAVEEXEC_B32;
  case AMDGPU::S_XNOR_B32:
    return AMDGPU::S_XNOR_SAVEEXEC_B32;
  default:
    return AMDGPU::INSTRUCTION_LIST_END;
  }
}

// These are only terminators to get correct spill code placement during
// register allocation, so turn them back into normal instructions.
bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::S_MOV_B32_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
    return true;
  }
  case AMDGPU::S_MOV_B64_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
    return true;
  }
  case AMDGPU::S_XOR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_XOR_B64));
    return true;
  }
  case AMDGPU::S_XOR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_XOR_B32));
    return true;
  }
  case AMDGPU::S_OR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_OR_B64));
    return true;
  }
  case AMDGPU::S_OR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_OR_B32));
    return true;
  }
  case AMDGPU::S_ANDN2_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B64));
    return true;
  }
  case AMDGPU::S_ANDN2_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B32));
    return true;
  }
  case AMDGPU::S_AND_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_AND_B64));
    return true;
  }
  case AMDGPU::S_AND_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_AND_B32));
    return true;
  }
  default:
    return false;
  }
}

// Turn all pseudoterminators in the block into their equivalent non-terminator
// instructions. Returns the reverse iterator to the first non-terminator
// instruction in the block.
MachineBasicBlock::reverse_iterator
SIOptimizeExecMasking::fixTerminators(MachineBasicBlock &MBB) const {
  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();

  bool Seen = false;
  MachineBasicBlock::reverse_iterator FirstNonTerm = I;
  for (; I != E; ++I) {
    if (!I->isTerminator())
      return Seen ? FirstNonTerm : I;

    if (removeTerminatorBit(*I)) {
      if (!Seen) {
        FirstNonTerm = I;
        Seen = true;
      }
    }
  }

  return FirstNonTerm;
}

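// Scan backwards from \p I for a nearby copy from exec; returns MBB.rend() if
// none is found within the search limit.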
MachineBasicBlock::reverse_iterator SIOptimizeExecMasking::findExecCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I) const {
  const unsigned InstLimit = 25;

  auto E = MBB.rend();
  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
    Register CopyFromExec = isCopyFromExec(*I);
    if (CopyFromExec.isValid())
      return I;
  }

  return E;
}

// XXX - Seems LiveRegUnits doesn't work correctly since it will incorrectly
// report the register as unavailable because a super-register with a lane mask
// is unavailable.
static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
  for (MachineBasicBlock *Succ : MBB.successors()) {
    if (Succ->isLiveIn(Reg))
      return true;
  }

  return false;
}

// Backwards-iterate from Origin (for n=MaxInstructions iterations) until
// either the beginning of the BB is reached or Pred evaluates to true - which
// can be an arbitrary condition based on the current MachineInstr, for
// instance a target instruction. Breaks prematurely by returning nullptr if
// one of the registers given in NonModifiableRegs is modified by the current
// instruction.
MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
    MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
    ArrayRef<MCRegister> NonModifiableRegs, MachineInstr *Terminator,
    SmallVectorImpl<MachineOperand *> *KillFlagCandidates,
    unsigned MaxInstructions) const {
  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
                                      E = Origin.getParent()->rend();
  unsigned CurrentIteration = 0;

  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
    if (A->isDebugInstr())
      continue;

    if (Pred(&*A))
      return &*A;

    for (MCRegister Reg : NonModifiableRegs) {
      if (A->modifiesRegister(Reg, TRI))
        return nullptr;

      // Check for kills that appear after the terminator instruction, that
      // would not be detected by clearKillFlags, since they will cause the
      // register to be dead at a later place, causing the verifier to fail.
      // We use the candidates to clear the kill flags later.
      if (Terminator && KillFlagCandidates && A != Terminator &&
          A->killsRegister(Reg, TRI)) {
        for (MachineOperand &MO : A->operands()) {
          if (MO.isReg() && MO.isKill()) {
            Register Candidate = MO.getReg();
            if (Candidate != Reg && TRI->regsOverlap(Candidate, Reg))
              KillFlagCandidates->push_back(&MO);
          }
        }
      }
    }

    ++CurrentIteration;
  }

  return nullptr;
}

// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..Start].
// It does so by backwards calculating liveness from the end of the BB until
// either Stop or the beginning of the BB is reached.
// After liveness is calculated, we can determine if Reg is still in use and
// not defined in between the instructions.
bool SIOptimizeExecMasking::isRegisterInUseBetween(MachineInstr &Stop,
                                                   MachineInstr &Start,
                                                   MCRegister Reg,
                                                   bool UseLiveOuts,
                                                   bool IgnoreStart) const {
  LiveRegUnits LR(*TRI);
  if (UseLiveOuts)
    LR.addLiveOuts(*Stop.getParent());

  MachineBasicBlock::reverse_iterator A(Start);

  if (IgnoreStart)
    ++A;

  for (; A != Stop.getParent()->rend() && A != Stop; ++A) {
    LR.stepBackward(*A);
  }

  return !LR.available(Reg) || MRI->isReserved(Reg);
}

// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..BB.end].
bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
                                                 MCRegister Reg) const {
  return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, true);
}

// Optimize sequences emitted for control flow lowering. They are originally
// emitted as the separate operations because spill code may need to be
// inserted for the saved copy of exec.
//
// x = copy exec
// z = s_<op>_b64 x, y
// exec = copy z
// =>
// x = s_<op>_saveexec_b64 y
//
bool SIOptimizeExecMasking::optimizeExecSequence() {
  bool Changed = false;
  for (MachineBasicBlock &MBB : *MF) {
    MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
    MachineBasicBlock::reverse_iterator E = MBB.rend();
    if (I == E)
      continue;

    // It's possible to see other terminator copies after the exec copy. This
    // can happen if control flow pseudos had their outputs used by phis.
    Register CopyToExec;

    unsigned SearchCount = 0;
    const unsigned SearchLimit = 5;
    while (I != E && SearchCount++ < SearchLimit) {
      CopyToExec = isCopyToExec(*I);
      if (CopyToExec)
        break;
      ++I;
    }

    if (!CopyToExec)
      continue;

    // Scan backwards to find the def.
    auto *CopyToExecInst = &*I;
    auto CopyFromExecInst = findExecCopy(MBB, I);
    if (CopyFromExecInst == E) {
      auto PrepareExecInst = std::next(I);
      if (PrepareExecInst == E)
        continue;
      // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
      if (CopyToExecInst->getOperand(1).isKill() &&
          isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
        LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);

        PrepareExecInst->getOperand(0).setReg(Exec);

        LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');

        CopyToExecInst->eraseFromParent();
        Changed = true;
      }

      continue;
    }

    if (isLiveOut(MBB, CopyToExec)) {
      // The copied register is live out and has a second use in another block.
      LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n");
      continue;
    }

    Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
    MachineInstr *SaveExecInst = nullptr;
    SmallVector<MachineInstr *, 4> OtherUseInsts;

    for (MachineBasicBlock::iterator
             J = std::next(CopyFromExecInst->getIterator()),
             JE = I->getIterator();
         J != JE; ++J) {
      if (SaveExecInst && J->readsRegister(Exec, TRI)) {
        LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
        // Make sure this is inserted after any VALU ops that may have been
        // scheduled in between.
        SaveExecInst = nullptr;
        break;
      }

      bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);

      if (J->modifiesRegister(CopyToExec, TRI)) {
        if (SaveExecInst) {
          LLVM_DEBUG(dbgs() << "Multiple instructions modify "
                            << printReg(CopyToExec, TRI) << '\n');
          SaveExecInst = nullptr;
          break;
        }

        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
          break;

        if (ReadsCopyFromExec) {
          SaveExecInst = &*J;
          LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
          continue;
        }
        LLVM_DEBUG(dbgs() << "Instruction does not read exec copy: " << *J
                          << '\n');
        break;
      }
      if (ReadsCopyFromExec && !SaveExecInst) {
        // Make sure no other instruction is trying to use this copy, before it
        // will be rewritten by the saveexec, i.e. hasOneUse. There may have
        // been another use, such as an inserted spill. For example:
        //
        // %sgpr0_sgpr1 = COPY %exec
        // spill %sgpr0_sgpr1
        // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
        //
        LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
                          << '\n');
        break;
      }

      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
        assert(SaveExecInst != &*J);
        OtherUseInsts.push_back(&*J);
      }
    }

    if (!SaveExecInst)
      continue;

    LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');

    MachineOperand &Src0 = SaveExecInst->getOperand(1);
    MachineOperand &Src1 = SaveExecInst->getOperand(2);

    MachineOperand *OtherOp = nullptr;

    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
      OtherOp = &Src1;
    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
      if (!SaveExecInst->isCommutable())
        break;

      OtherOp = &Src0;
    } else
      llvm_unreachable("unexpected");

    CopyFromExecInst->eraseFromParent();

    auto InsPt = SaveExecInst->getIterator();
    const DebugLoc &DL = SaveExecInst->getDebugLoc();

    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
            CopyFromExec)
        .addReg(OtherOp->getReg());
    SaveExecInst->eraseFromParent();

    CopyToExecInst->eraseFromParent();

    for (MachineInstr *OtherInst : OtherUseInsts) {
      OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister,
                                    *TRI);
    }

    Changed = true;
  }

  return Changed;
}

// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
// operands extracted from a v_cmp ..., s_and_saveexec pattern.
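//
// For example, on wave32 a recorded pair
//   v_cmp_lt_f32 s0, 0, v0
//   s_and_saveexec_b32 s1, s0
// becomes
//   s_mov_b32 s1, exec_lo
//   v_cmpx_lt_f32 0, v0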
bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
    MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const {
  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());

  if (NewOpcode == -1)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);

  Register MoveDest = SaveExecInstr.getOperand(0).getReg();

  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
  if (!SaveExecInstr.uses().empty()) {
    bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, *MRI) == 32;
    unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
        .addReg(Exec);
  }

  // Omit dst as V_CMPX is implicitly writing to EXEC.
  // Add dummy src and clamp modifiers, if needed.
  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
                         VCmp.getDebugLoc(), TII->get(NewOpcode));

  auto TryAddImmediateValueFromNamedOperand =
      [&](AMDGPU::OpName OperandName) -> void {
    if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
      Builder.addImm(Mod->getImm());
  };

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
  Builder.add(*Src0);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
  Builder.add(*Src1);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::op_sel);

  // The kill flags may no longer be correct.
  if (Src0->isReg())
    MRI->clearKillFlags(Src0->getReg());
  if (Src1->isReg())
    MRI->clearKillFlags(Src1->getReg());

  for (MachineOperand *MO : KillFlagCandidates)
    MO->setIsKill(false);

  SaveExecInstr.eraseFromParent();
  VCmp.eraseFromParent();

  return true;
}

// Record (on GFX10.3 and later) occurrences of
// v_cmp_* SGPR, IMM, VGPR
// s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
// to be replaced with
// s_mov_b32 EXEC_SGPR_DEST, exec_lo
// v_cmpx_* IMM, VGPR
// to reduce pipeline stalls.
void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
    MachineInstr &MI) {
  if (!ST->hasGFX10_3Insts())
    return;

  const unsigned AndSaveExecOpcode =
      ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;

  if (MI.getOpcode() != AndSaveExecOpcode)
    return;

  Register SaveExecDest = MI.getOperand(0).getReg();
  if (!TRI->isSGPRReg(*MRI, SaveExecDest))
    return;

  MachineOperand *SaveExecSrc0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  if (!SaveExecSrc0->isReg())
    return;

  // Try to find a v_cmp ..., s_and_saveexec sequence that can be optimized,
  // starting from this s_and_saveexec instruction. If the sequence is safe to
  // replace (see the conditions below), the pair is recorded in
  // SaveExecVCmpMapping. This runs after register allocation, so some checks
  // on operand dependencies need to be considered.
  MachineInstr *VCmp = nullptr;

  // Try to find the last v_cmp instruction that defs the saveexec input
  // operand without any write to Exec or the saveexec input operand in
  // between.
  VCmp = findInstrBackwards(
      MI,
      [&](MachineInstr *Check) {
        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
      },
      {Exec, SaveExecSrc0->getReg()});

  if (!VCmp)
    return;

  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
  assert(VCmpDest && "Should have an sdst operand!");

  // Check if any of the v_cmp source operands is written by the saveexec.
  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
  if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
      MI.modifiesRegister(Src0->getReg(), TRI))
    return;

  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
  if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
      MI.modifiesRegister(Src1->getReg(), TRI))
    return;

  // Don't do the transformation if the destination operand is included in
  // its MBB live-outs, meaning it's used in any of its successors, leading
  // to incorrect code if the v_cmp and therefore the def of
  // the dest operand is removed.
  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
    return;

  // If the v_cmp target is in use between v_cmp and s_and_saveexec or after
  // the s_and_saveexec, skip the optimization.
  if (isRegisterInUseBetween(*VCmp, MI, VCmpDest->getReg(), false, true) ||
      isRegisterInUseAfter(MI, VCmpDest->getReg()))
    return;

  // Try to determine if there is a write to any of the VCmp
  // operands between the saveexec and the vcmp.
  // If yes, additional VGPR spilling might need to be inserted. In this case,
  // it's not worth replacing the instruction sequence.
  SmallVector<MCRegister, 2> NonDefRegs;
  if (Src0->isReg())
    NonDefRegs.push_back(Src0->getReg());

  if (Src1->isReg())
    NonDefRegs.push_back(Src1->getReg());

  if (!findInstrBackwards(
          MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs,
          VCmp, &KillFlagCandidates))
    return;

  if (VCmp)
    SaveExecVCmpMapping[&MI] = VCmp;
}

// Record occurrences of
// s_or_saveexec s_o, s_i
// s_xor exec, exec, s_o
// to be replaced with
// s_andn2_saveexec s_o, s_i.
void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) {
  const unsigned XorOpcode =
      ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;

  if (MI.getOpcode() == XorOpcode && &MI != &MI.getParent()->front()) {
    const MachineOperand &XorDst = MI.getOperand(0);
    const MachineOperand &XorSrc0 = MI.getOperand(1);
    const MachineOperand &XorSrc1 = MI.getOperand(2);

    if (XorDst.isReg() && XorDst.getReg() == Exec && XorSrc0.isReg() &&
        XorSrc1.isReg() &&
        (XorSrc0.getReg() == Exec || XorSrc1.getReg() == Exec)) {
      const unsigned OrSaveexecOpcode = ST->isWave32()
                                            ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64;

      // Peek at the previous instruction and check if this is a relevant
      // s_or_saveexec instruction.
      MachineInstr &PossibleOrSaveexec = *MI.getPrevNode();
      if (PossibleOrSaveexec.getOpcode() != OrSaveexecOpcode)
        return;

      const MachineOperand &OrDst = PossibleOrSaveexec.getOperand(0);
      const MachineOperand &OrSrc0 = PossibleOrSaveexec.getOperand(1);
      if (OrDst.isReg() && OrSrc0.isReg()) {
        if ((XorSrc0.getReg() == Exec && XorSrc1.getReg() == OrDst.getReg()) ||
            (XorSrc0.getReg() == OrDst.getReg() && XorSrc1.getReg() == Exec)) {
          OrXors.emplace_back(&PossibleOrSaveexec, &MI);
        }
      }
    }
  }
}

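// Rewrite each recorded s_or_saveexec / s_xor pair into a single
// s_andn2_saveexec.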
bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() {
  if (OrXors.empty()) {
    return false;
  }

  bool Changed = false;
  const unsigned Andn2Opcode = ST->isWave32() ? AMDGPU::S_ANDN2_SAVEEXEC_B32
                                              : AMDGPU::S_ANDN2_SAVEEXEC_B64;

  for (const auto &Pair : OrXors) {
    MachineInstr *Or = nullptr;
    MachineInstr *Xor = nullptr;
    std::tie(Or, Xor) = Pair;
    BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(),
            TII->get(Andn2Opcode), Or->getOperand(0).getReg())
        .addReg(Or->getOperand(1).getReg());

    Or->eraseFromParent();
    Xor->eraseFromParent();

    Changed = true;
  }

  return Changed;
}

bool SIOptimizeExecMaskingLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return SIOptimizeExecMasking().run(MF);
}

bool SIOptimizeExecMasking::run(MachineFunction &MF) {
  this->MF = &MF;
  ST = &MF.getSubtarget<GCNSubtarget>();
  TRI = ST->getRegisterInfo();
  TII = ST->getInstrInfo();
  MRI = &MF.getRegInfo();
  Exec = TRI->getExec();

  bool Changed = optimizeExecSequence();

  OrXors.clear();
  SaveExecVCmpMapping.clear();
  KillFlagCandidates.clear();
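  // Scan a small window of instructions at the end of each block and record
  // candidate or_saveexec/xor and v_cmp/saveexec sequences; stop early once
  // exec is clobbered.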
  static unsigned SearchWindow = 10;
  for (MachineBasicBlock &MBB : MF) {
    unsigned SearchCount = 0;

    for (auto &MI : llvm::reverse(MBB)) {
      if (MI.isDebugInstr())
        continue;

      if (SearchCount >= SearchWindow) {
        break;
      }

      tryRecordOrSaveexecXorSequence(MI);
      tryRecordVCmpxAndSaveexecSequence(MI);

      if (MI.modifiesRegister(Exec, TRI)) {
        break;
      }

      ++SearchCount;
    }
  }

  Changed |= optimizeOrSaveexecXorSequences();
  for (const auto &Entry : SaveExecVCmpMapping) {
    MachineInstr *SaveExecInstr = Entry.getFirst();
    MachineInstr *VCmpInstr = Entry.getSecond();

    Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec);
  }

  return Changed;
}