1 | //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// This pass adds instructions to enable whole quad mode (strict or non-strict) |
11 | /// for pixel shaders, and strict whole wavefront mode for all programs. |
12 | /// |
13 | /// The "strict" prefix indicates that inactive lanes do not take part in |
14 | /// control flow, specifically an inactive lane enabled by a strict WQM/WWM will |
15 | /// always be enabled irrespective of control flow decisions. Conversely in |
16 | /// non-strict WQM inactive lanes may control flow decisions. |
17 | /// |
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass ensures that WQM
/// is enabled when necessary, but disabled around stores and atomics.
21 | /// |
22 | /// When necessary, this pass creates a function prolog |
23 | /// |
24 | /// S_MOV_B64 LiveMask, EXEC |
25 | /// S_WQM_B64 EXEC, EXEC |
26 | /// |
27 | /// to enter WQM at the top of the function and surrounds blocks of Exact |
28 | /// instructions by |
29 | /// |
30 | /// S_AND_SAVEEXEC_B64 Tmp, LiveMask |
31 | /// ... |
32 | /// S_MOV_B64 EXEC, Tmp |
33 | /// |
34 | /// We also compute when a sequence of instructions requires strict whole |
35 | /// wavefront mode (StrictWWM) and insert instructions to save and restore it: |
36 | /// |
37 | /// S_OR_SAVEEXEC_B64 Tmp, -1 |
38 | /// ... |
39 | /// S_MOV_B64 EXEC, Tmp |
40 | /// |
41 | /// When a sequence of instructions requires strict whole quad mode (StrictWQM) |
42 | /// we use a similar save and restore mechanism and force whole quad mode for |
43 | /// those instructions: |
44 | /// |
45 | /// S_MOV_B64 Tmp, EXEC |
46 | /// S_WQM_B64 EXEC, EXEC |
47 | /// ... |
48 | /// S_MOV_B64 EXEC, Tmp |
49 | /// |
50 | /// In order to avoid excessive switching during sequences of Exact |
51 | /// instructions, the pass first analyzes which instructions must be run in WQM |
52 | /// (aka which instructions produce values that lead to derivative |
53 | /// computations). |
54 | /// |
55 | /// Basic blocks are always exited in WQM as long as some successor needs WQM. |
56 | /// |
57 | /// There is room for improvement given better control flow analysis: |
58 | /// |
59 | /// (1) at the top level (outside of control flow statements, and as long as |
60 | /// kill hasn't been used), one SGPR can be saved by recovering WQM from |
61 | /// the LiveMask (this is implemented for the entry block). |
62 | /// |
63 | /// (2) when entire regions (e.g. if-else blocks or entire loops) only |
64 | /// consist of exact and don't-care instructions, the switch only has to |
65 | /// be done at the entry and exit points rather than potentially in each |
66 | /// block of the region. |
67 | /// |
68 | //===----------------------------------------------------------------------===// |
69 | |
70 | #include "SIWholeQuadMode.h" |
71 | #include "AMDGPU.h" |
72 | #include "GCNSubtarget.h" |
73 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
74 | #include "llvm/ADT/MapVector.h" |
75 | #include "llvm/ADT/PostOrderIterator.h" |
76 | #include "llvm/CodeGen/LiveIntervals.h" |
77 | #include "llvm/CodeGen/MachineBasicBlock.h" |
78 | #include "llvm/CodeGen/MachineDominators.h" |
79 | #include "llvm/CodeGen/MachineFunctionPass.h" |
80 | #include "llvm/CodeGen/MachineInstr.h" |
81 | #include "llvm/CodeGen/MachinePostDominators.h" |
82 | #include "llvm/IR/CallingConv.h" |
83 | #include "llvm/InitializePasses.h" |
84 | #include "llvm/Support/raw_ostream.h" |
85 | |
86 | using namespace llvm; |
87 | |
88 | #define DEBUG_TYPE "si-wqm" |
89 | |
90 | namespace { |
91 | |
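// Execution mask states tracked by the analysis. These are bitmask flags so
// that an instruction can be marked as compatible with more than one state;
// StateStrict is shorthand for either of the two strict modes.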
92 | enum { |
93 | StateWQM = 0x1, |
94 | StateStrictWWM = 0x2, |
95 | StateStrictWQM = 0x4, |
96 | StateExact = 0x8, |
97 | StateStrict = StateStrictWWM | StateStrictWQM, |
98 | }; |
99 | |
100 | struct PrintState { |
101 | public: |
102 | int State; |
103 | |
104 | explicit PrintState(int State) : State(State) {} |
105 | }; |
106 | |
107 | #ifndef NDEBUG |
108 | static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) { |
109 | |
110 | static const std::pair<char, const char *> Mapping[] = { |
      std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
      std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
113 | char State = PS.State; |
114 | for (auto M : Mapping) { |
115 | if (State & M.first) { |
116 | OS << M.second; |
117 | State &= ~M.first; |
118 | |
119 | if (State) |
120 | OS << '|'; |
121 | } |
122 | } |
123 | assert(State == 0); |
124 | return OS; |
125 | } |
126 | #endif |
127 | |
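// Per-instruction analysis results: the states an instruction requires
// (Needs), the states it must never execute in (Disabled), the states
// required after it (OutNeeds), and every state that was ever requested for
// it during marking (MarkedStates), including requests dropped because they
// were disabled.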
128 | struct InstrInfo { |
129 | char Needs = 0; |
130 | char Disabled = 0; |
131 | char OutNeeds = 0; |
132 | char MarkedStates = 0; |
133 | }; |
134 | |
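// Per-block analysis results: the states required somewhere inside the block
// (Needs), at its entry (InNeeds) and at its exit (OutNeeds), the state the
// block is entered in once rewriting is done (InitialState), and whether the
// block contains instructions (kills, V_SET_INACTIVE) that still need
// per-block lowering.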
135 | struct BlockInfo { |
136 | char Needs = 0; |
137 | char InNeeds = 0; |
138 | char OutNeeds = 0; |
139 | char InitialState = 0; |
140 | bool NeedsLowering = false; |
141 | }; |
142 | |
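// A propagation worklist entry: either a single instruction or a whole basic
// block whose flags may need to be (re)propagated.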
143 | struct WorkItem { |
144 | MachineBasicBlock *MBB = nullptr; |
145 | MachineInstr *MI = nullptr; |
146 | |
147 | WorkItem() = default; |
148 | WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {} |
149 | WorkItem(MachineInstr *MI) : MI(MI) {} |
150 | }; |
151 | |
152 | class SIWholeQuadMode { |
153 | public: |
154 | SIWholeQuadMode(MachineFunction &MF, LiveIntervals *LIS, |
155 | MachineDominatorTree *MDT, MachinePostDominatorTree *PDT) |
156 | : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()), |
157 | TRI(&TII->getRegisterInfo()), MRI(&MF.getRegInfo()), LIS(LIS), MDT(MDT), |
158 | PDT(PDT) {} |
159 | bool run(MachineFunction &MF); |
160 | |
161 | private: |
162 | const GCNSubtarget *ST; |
163 | const SIInstrInfo *TII; |
164 | const SIRegisterInfo *TRI; |
165 | MachineRegisterInfo *MRI; |
166 | LiveIntervals *LIS; |
167 | MachineDominatorTree *MDT; |
168 | MachinePostDominatorTree *PDT; |
169 | |
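  // Wave-size dependent opcodes and the EXEC register to use, selected for
  // the subtarget's wave size before any rewriting takes place.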
170 | unsigned AndOpc; |
171 | unsigned AndTermOpc; |
172 | unsigned AndN2Opc; |
173 | unsigned XorOpc; |
174 | unsigned AndSaveExecOpc; |
175 | unsigned AndSaveExecTermOpc; |
176 | unsigned WQMOpc; |
177 | Register Exec; |
178 | Register LiveMaskReg; |
179 | |
180 | DenseMap<const MachineInstr *, InstrInfo> Instructions; |
181 | MapVector<MachineBasicBlock *, BlockInfo> Blocks; |
182 | |
183 | // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction |
184 | DenseMap<const MachineInstr *, char> StateTransition; |
185 | |
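  // Instructions collected during scanning that are rewritten once the
  // analysis is complete.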
186 | SmallVector<MachineInstr *, 2> LiveMaskQueries; |
187 | SmallVector<MachineInstr *, 4> LowerToMovInstrs; |
188 | SmallSetVector<MachineInstr *, 4> LowerToCopyInstrs; |
189 | SmallVector<MachineInstr *, 4> KillInstrs; |
190 | SmallVector<MachineInstr *, 4> InitExecInstrs; |
191 | SmallVector<MachineInstr *, 4> SetInactiveInstrs; |
192 | |
193 | void printInfo(); |
194 | |
195 | void markInstruction(MachineInstr &MI, char Flag, |
196 | std::vector<WorkItem> &Worklist); |
197 | void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg, |
198 | unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist); |
199 | void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag, |
200 | std::vector<WorkItem> &Worklist); |
201 | void markInstructionUses(const MachineInstr &MI, char Flag, |
202 | std::vector<WorkItem> &Worklist); |
203 | char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist); |
204 | void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist); |
205 | void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist); |
206 | char analyzeFunction(MachineFunction &MF); |
207 | |
208 | MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB, |
209 | MachineBasicBlock::iterator Before); |
210 | MachineBasicBlock::iterator |
211 | prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First, |
212 | MachineBasicBlock::iterator Last, bool PreferLast, |
213 | bool SaveSCC); |
214 | void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, |
215 | Register SaveWQM); |
216 | void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, |
217 | Register SavedWQM); |
218 | void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, |
219 | Register SaveOrig, char StrictStateNeeded); |
220 | void fromStrictMode(MachineBasicBlock &MBB, |
221 | MachineBasicBlock::iterator Before, Register SavedOrig, |
222 | char NonStrictState, char CurrentStrictState); |
223 | |
224 | void splitBlock(MachineInstr *TermMI); |
225 | MachineInstr *lowerKillI1(MachineInstr &MI, bool IsWQM); |
226 | MachineInstr *lowerKillF32(MachineInstr &MI); |
227 | |
228 | void lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI); |
229 | void processBlock(MachineBasicBlock &MBB, BlockInfo &BI, bool IsEntry); |
230 | |
231 | bool lowerLiveMaskQueries(); |
232 | bool lowerCopyInstrs(); |
233 | bool lowerKillInstrs(bool IsWQM); |
234 | void lowerInitExec(MachineInstr &MI); |
235 | MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry, |
236 | bool &Changed); |
237 | }; |
238 | |
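// Legacy pass manager wrapper around the SIWholeQuadMode implementation.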
239 | class SIWholeQuadModeLegacy : public MachineFunctionPass { |
240 | public: |
241 | static char ID; |
242 | |
243 | SIWholeQuadModeLegacy() : MachineFunctionPass(ID) {} |
244 | |
245 | bool runOnMachineFunction(MachineFunction &MF) override; |
246 | |
  StringRef getPassName() const override { return "SI Whole Quad Mode"; }
248 | |
249 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
250 | AU.addRequired<LiveIntervalsWrapperPass>(); |
251 | AU.addPreserved<SlotIndexesWrapperPass>(); |
252 | AU.addPreserved<LiveIntervalsWrapperPass>(); |
253 | AU.addPreserved<MachineDominatorTreeWrapperPass>(); |
254 | AU.addPreserved<MachinePostDominatorTreeWrapperPass>(); |
255 | MachineFunctionPass::getAnalysisUsage(AU); |
256 | } |
257 | |
258 | MachineFunctionProperties getClearedProperties() const override { |
259 | return MachineFunctionProperties().setIsSSA(); |
260 | } |
261 | }; |
262 | } // end anonymous namespace |
263 | |
264 | char SIWholeQuadModeLegacy::ID = 0; |
265 | |
INITIALIZE_PASS_BEGIN(SIWholeQuadModeLegacy, DEBUG_TYPE, "SI Whole Quad Mode",
267 | false, false) |
268 | INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) |
269 | INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) |
270 | INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) |
INITIALIZE_PASS_END(SIWholeQuadModeLegacy, DEBUG_TYPE, "SI Whole Quad Mode",
272 | false, false) |
273 | |
274 | char &llvm::SIWholeQuadModeID = SIWholeQuadModeLegacy::ID; |
275 | |
276 | FunctionPass *llvm::createSIWholeQuadModeLegacyPass() { |
277 | return new SIWholeQuadModeLegacy; |
278 | } |
279 | |
280 | #ifndef NDEBUG |
281 | LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() { |
282 | for (const auto &BII : Blocks) { |
283 | dbgs() << "\n" |
284 | << printMBBReference(*BII.first) << ":\n" |
285 | << " InNeeds = " << PrintState(BII.second.InNeeds) |
286 | << ", Needs = " << PrintState(BII.second.Needs) |
287 | << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n" ; |
288 | |
289 | for (const MachineInstr &MI : *BII.first) { |
290 | auto III = Instructions.find(&MI); |
291 | if (III != Instructions.end()) { |
292 | dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs) |
293 | << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n'; |
294 | } |
295 | } |
296 | } |
297 | } |
298 | #endif |
299 | |
300 | void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, |
301 | std::vector<WorkItem> &Worklist) { |
302 | InstrInfo &II = Instructions[&MI]; |
303 | |
304 | assert(!(Flag & StateExact) && Flag != 0); |
305 | |
306 | // Capture all states requested in marking including disabled ones. |
307 | II.MarkedStates |= Flag; |
308 | |
309 | // Remove any disabled states from the flag. The user that required it gets |
310 | // an undefined value in the helper lanes. For example, this can happen if |
  // the result of an atomic is used by an instruction that requires WQM, where
312 | // ignoring the request for WQM is correct as per the relevant specs. |
313 | Flag &= ~II.Disabled; |
314 | |
315 | // Ignore if the flag is already encompassed by the existing needs, or we |
316 | // just disabled everything. |
317 | if ((II.Needs & Flag) == Flag) |
318 | return; |
319 | |
320 | LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI); |
321 | II.Needs |= Flag; |
  Worklist.emplace_back(&MI);
323 | } |
324 | |
325 | /// Mark all relevant definitions of register \p Reg in usage \p UseMI. |
326 | void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR, |
327 | Register Reg, unsigned SubReg, char Flag, |
328 | std::vector<WorkItem> &Worklist) { |
329 | LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI); |
330 | |
  LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
332 | const VNInfo *Value = UseLRQ.valueIn(); |
333 | if (!Value) |
334 | return; |
335 | |
336 | // Note: this code assumes that lane masks on AMDGPU completely |
337 | // cover registers. |
338 | const LaneBitmask UseLanes = |
      SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
340 | : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg) |
341 | : LaneBitmask::getNone()); |
342 | |
343 | // Perform a depth-first iteration of the LiveRange graph marking defs. |
344 | // Stop processing of a given branch when all use lanes have been defined. |
345 | // The first definition stops processing for a physical register. |
346 | struct PhiEntry { |
347 | const VNInfo *Phi; |
348 | unsigned PredIdx; |
349 | LaneBitmask DefinedLanes; |
350 | |
351 | PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes) |
352 | : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {} |
353 | }; |
354 | using VisitKey = std::pair<const VNInfo *, LaneBitmask>; |
355 | SmallVector<PhiEntry, 2> PhiStack; |
356 | SmallSet<VisitKey, 4> Visited; |
357 | LaneBitmask DefinedLanes; |
358 | unsigned NextPredIdx = 0; // Only used for processing phi nodes |
359 | do { |
360 | const VNInfo *NextValue = nullptr; |
361 | const VisitKey Key(Value, DefinedLanes); |
362 | |
    if (Visited.insert(Key).second) {
      // On first visit to a phi, start processing at the first predecessor
365 | NextPredIdx = 0; |
366 | } |
367 | |
368 | if (Value->isPHIDef()) { |
369 | // Each predecessor node in the phi must be processed as a subgraph |
      const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
      assert(MBB && "Phi-def has no defining MBB");
372 | |
373 | // Find next predecessor to process |
374 | unsigned Idx = NextPredIdx; |
375 | const auto *PI = MBB->pred_begin() + Idx; |
376 | const auto *PE = MBB->pred_end(); |
377 | for (; PI != PE && !NextValue; ++PI, ++Idx) { |
        if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
          if (!Visited.count(VisitKey(VN, DefinedLanes)))
380 | NextValue = VN; |
381 | } |
382 | } |
383 | |
      // If there are more predecessors to process, add phi to stack
385 | if (PI != PE) |
        PhiStack.emplace_back(Value, Idx, DefinedLanes);
    } else {
      MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
      assert(MI && "Def has no defining instruction");
390 | |
391 | if (Reg.isVirtual()) { |
392 | // Iterate over all operands to find relevant definitions |
393 | bool HasDef = false; |
394 | for (const MachineOperand &Op : MI->all_defs()) { |
395 | if (Op.getReg() != Reg) |
396 | continue; |
397 | |
398 | // Compute lanes defined and overlap with use |
399 | LaneBitmask OpLanes = |
400 | Op.isUndef() ? LaneBitmask::getAll() |
                           : TRI->getSubRegIndexLaneMask(Op.getSubReg());
402 | LaneBitmask Overlap = (UseLanes & OpLanes); |
403 | |
          // Record whether this instruction defined any lanes of the use
405 | HasDef |= Overlap.any(); |
406 | |
407 | // Mark any lanes defined |
408 | DefinedLanes |= OpLanes; |
409 | } |
410 | |
411 | // Check if all lanes of use have been defined |
412 | if ((DefinedLanes & UseLanes) != UseLanes) { |
413 | // Definition not complete; need to process input value |
          LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
          if (const VNInfo *VN = LRQ.valueIn()) {
            if (!Visited.count(VisitKey(VN, DefinedLanes)))
417 | NextValue = VN; |
418 | } |
419 | } |
420 | |
421 | // Only mark the instruction if it defines some part of the use |
422 | if (HasDef) |
          markInstruction(*MI, Flag, Worklist);
424 | } else { |
425 | // For physical registers simply mark the defining instruction |
        markInstruction(*MI, Flag, Worklist);
427 | } |
428 | } |
429 | |
430 | if (!NextValue && !PhiStack.empty()) { |
      // Reached the end of a chain; revert to processing the last phi
432 | PhiEntry &Entry = PhiStack.back(); |
433 | NextValue = Entry.Phi; |
434 | NextPredIdx = Entry.PredIdx; |
435 | DefinedLanes = Entry.DefinedLanes; |
436 | PhiStack.pop_back(); |
437 | } |
438 | |
439 | Value = NextValue; |
440 | } while (Value); |
441 | } |
442 | |
443 | void SIWholeQuadMode::markOperand(const MachineInstr &MI, |
444 | const MachineOperand &Op, char Flag, |
445 | std::vector<WorkItem> &Worklist) { |
446 | assert(Op.isReg()); |
447 | Register Reg = Op.getReg(); |
448 | |
449 | // Ignore some hardware registers |
450 | switch (Reg) { |
451 | case AMDGPU::EXEC: |
452 | case AMDGPU::EXEC_LO: |
453 | return; |
454 | default: |
455 | break; |
456 | } |
457 | |
458 | LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op |
459 | << " for " << MI); |
460 | if (Reg.isVirtual()) { |
461 | LiveRange &LR = LIS->getInterval(Reg); |
    markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
463 | } else { |
464 | // Handle physical registers that we need to track; this is mostly relevant |
465 | // for VCC, which can appear as the (implicit) input of a uniform branch, |
466 | // e.g. when a loop counter is stored in a VGPR. |
    for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
      LiveRange &LR = LIS->getRegUnit(Unit);
      const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
      if (Value)
        markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
472 | } |
473 | } |
474 | } |
475 | |
476 | /// Mark all instructions defining the uses in \p MI with \p Flag. |
477 | void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag, |
478 | std::vector<WorkItem> &Worklist) { |
479 | LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": " |
480 | << MI); |
481 | |
482 | for (const MachineOperand &Use : MI.all_uses()) |
    markOperand(MI, Use, Flag, Worklist);
484 | } |
485 | |
486 | // Scan instructions to determine which ones require an Exact execmask and |
487 | // which ones seed WQM requirements. |
488 | char SIWholeQuadMode::scanInstructions(MachineFunction &MF, |
489 | std::vector<WorkItem> &Worklist) { |
490 | char GlobalFlags = 0; |
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
492 | SmallVector<MachineInstr *, 4> SoftWQMInstrs; |
493 | bool HasImplicitDerivatives = |
494 | MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; |
495 | |
496 | // We need to visit the basic blocks in reverse post-order so that we visit |
497 | // defs before uses, in particular so that we don't accidentally mark an |
498 | // instruction as needing e.g. WQM before visiting it and realizing it needs |
499 | // WQM disabled. |
500 | ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); |
501 | for (MachineBasicBlock *MBB : RPOT) { |
502 | BlockInfo &BBI = Blocks[MBB]; |
503 | |
504 | for (MachineInstr &MI : *MBB) { |
505 | InstrInfo &III = Instructions[&MI]; |
506 | unsigned Opcode = MI.getOpcode(); |
507 | char Flags = 0; |
508 | |
509 | if (TII->isWQM(Opcode)) { |
510 | // If LOD is not supported WQM is not needed. |
511 | // Only generate implicit WQM if implicit derivatives are required. |
512 | // This avoids inserting unintended WQM if a shader type without |
513 | // implicit derivatives uses an image sampling instruction. |
514 | if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) { |
515 | // Sampling instructions don't need to produce results for all pixels |
516 | // in a quad, they just require all inputs of a quad to have been |
517 | // computed for derivatives. |
          markInstructionUses(MI, StateWQM, Worklist);
519 | GlobalFlags |= StateWQM; |
520 | } |
521 | } else if (Opcode == AMDGPU::WQM) { |
522 | // The WQM intrinsic requires its output to have all the helper lanes |
523 | // correct, so we need it to be in WQM. |
524 | Flags = StateWQM; |
        LowerToCopyInstrs.insert(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        LowerToCopyInstrs.insert(&MI);
        SoftWQMInstrs.push_back(&MI);
529 | } else if (Opcode == AMDGPU::STRICT_WWM) { |
        // The STRICT_WWM intrinsic doesn't make the same guarantee, and in
        // addition it needs to be executed in WQM or Exact so that its copy
        // doesn't clobber inactive lanes.
        markInstructionUses(MI, StateStrictWWM, Worklist);
        GlobalFlags |= StateStrictWWM;
        LowerToMovInstrs.push_back(&MI);
536 | } else if (Opcode == AMDGPU::STRICT_WQM || |
537 | TII->isDualSourceBlendEXP(MI)) { |
538 | // STRICT_WQM is similar to STRICTWWM, but instead of enabling all |
539 | // threads of the wave like STRICTWWM, STRICT_WQM enables all threads in |
540 | // quads that have at least one active thread. |
        markInstructionUses(MI, StateStrictWQM, Worklist);
        GlobalFlags |= StateStrictWQM;

        if (Opcode == AMDGPU::STRICT_WQM) {
          LowerToMovInstrs.push_back(&MI);
546 | } else { |
547 | // Dual source blend export acts as implicit strict-wqm, its sources |
548 | // need to be shuffled in strict wqm, but the export itself needs to |
549 | // run in exact mode. |
550 | BBI.Needs |= StateExact; |
551 | if (!(BBI.InNeeds & StateExact)) { |
552 | BBI.InNeeds |= StateExact; |
            Worklist.emplace_back(MBB);
554 | } |
555 | GlobalFlags |= StateExact; |
556 | III.Disabled = StateWQM | StateStrict; |
557 | } |
558 | } else if (Opcode == AMDGPU::LDS_PARAM_LOAD || |
559 | Opcode == AMDGPU::DS_PARAM_LOAD || |
560 | Opcode == AMDGPU::LDS_DIRECT_LOAD || |
561 | Opcode == AMDGPU::DS_DIRECT_LOAD) { |
        // Mark these STRICTWQM, but only for the instruction, not its operands.
        // This avoids unnecessarily marking M0 as requiring WQM.
564 | III.Needs |= StateStrictWQM; |
565 | GlobalFlags |= StateStrictWQM; |
566 | } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) { |
567 | // Disable strict states; StrictWQM will be added as required later. |
568 | III.Disabled = StateStrict; |
        MachineOperand &Inactive = MI.getOperand(4);
        if (Inactive.isReg()) {
          if (Inactive.isUndef() && MI.getOperand(3).getImm() == 0)
            LowerToCopyInstrs.insert(&MI);
          else
            markOperand(MI, Inactive, StateStrictWWM, Worklist);
        }
        SetInactiveInstrs.push_back(&MI);
577 | BBI.NeedsLowering = true; |
578 | } else if (TII->isDisableWQM(MI)) { |
579 | BBI.Needs |= StateExact; |
580 | if (!(BBI.InNeeds & StateExact)) { |
581 | BBI.InNeeds |= StateExact; |
          Worklist.emplace_back(MBB);
583 | } |
584 | GlobalFlags |= StateExact; |
585 | III.Disabled = StateWQM | StateStrict; |
586 | } else if (Opcode == AMDGPU::SI_PS_LIVE || |
587 | Opcode == AMDGPU::SI_LIVE_MASK) { |
        LiveMaskQueries.push_back(&MI);
589 | } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR || |
590 | Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR || |
591 | Opcode == AMDGPU::SI_DEMOTE_I1) { |
        KillInstrs.push_back(&MI);
593 | BBI.NeedsLowering = true; |
594 | } else if (Opcode == AMDGPU::SI_INIT_EXEC || |
595 | Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT || |
596 | Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) { |
        InitExecInstrs.push_back(&MI);
598 | } else if (WQMOutputs) { |
599 | // The function is in machine SSA form, which means that physical |
600 | // VGPRs correspond to shader inputs and outputs. Inputs are |
601 | // only used, outputs are only defined. |
602 | // FIXME: is this still valid? |
603 | for (const MachineOperand &MO : MI.defs()) { |
604 | Register Reg = MO.getReg(); |
605 | if (Reg.isPhysical() && |
              TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
607 | Flags = StateWQM; |
608 | break; |
609 | } |
610 | } |
611 | } |
612 | |
613 | if (Flags) { |
        markInstruction(MI, Flags, Worklist);
615 | GlobalFlags |= Flags; |
616 | } |
617 | } |
618 | } |
619 | |
  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
621 | // ever used anywhere in the function. This implements the corresponding |
622 | // semantics of @llvm.amdgcn.set.inactive. |
623 | // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm. |
624 | if (GlobalFlags & StateWQM) { |
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
629 | } |
630 | |
631 | return GlobalFlags; |
632 | } |
633 | |
634 | void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, |
635 | std::vector<WorkItem>& Worklist) { |
636 | MachineBasicBlock *MBB = MI.getParent(); |
637 | InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references |
638 | BlockInfo &BI = Blocks[MBB]; |
639 | |
640 | // Control flow-type instructions and stores to temporary memory that are |
641 | // followed by WQM computations must themselves be in WQM. |
642 | if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) && |
643 | (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) { |
644 | Instructions[&MI].Needs = StateWQM; |
645 | II.Needs = StateWQM; |
646 | } |
647 | |
648 | // Propagate to block level |
649 | if (II.Needs & StateWQM) { |
650 | BI.Needs |= StateWQM; |
651 | if (!(BI.InNeeds & StateWQM)) { |
652 | BI.InNeeds |= StateWQM; |
      Worklist.emplace_back(MBB);
654 | } |
655 | } |
656 | |
657 | // Propagate backwards within block |
658 | if (MachineInstr *PrevMI = MI.getPrevNode()) { |
659 | char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds; |
660 | if (!PrevMI->isPHI()) { |
661 | InstrInfo &PrevII = Instructions[PrevMI]; |
662 | if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) { |
663 | PrevII.OutNeeds |= InNeeds; |
        Worklist.emplace_back(PrevMI);
665 | } |
666 | } |
667 | } |
668 | |
669 | // Propagate WQM flag to instruction inputs |
670 | assert(!(II.Needs & StateExact)); |
671 | |
672 | if (II.Needs != 0) |
    markInstructionUses(MI, II.Needs, Worklist);
674 | |
675 | // Ensure we process a block containing StrictWWM/StrictWQM, even if it does |
676 | // not require any WQM transitions. |
677 | if (II.Needs & StateStrictWWM) |
678 | BI.Needs |= StateStrictWWM; |
679 | if (II.Needs & StateStrictWQM) |
680 | BI.Needs |= StateStrictWQM; |
681 | } |
682 | |
683 | void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB, |
684 | std::vector<WorkItem>& Worklist) { |
685 | BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references. |
686 | |
687 | // Propagate through instructions |
688 | if (!MBB.empty()) { |
689 | MachineInstr *LastMI = &*MBB.rbegin(); |
690 | InstrInfo &LastII = Instructions[LastMI]; |
691 | if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) { |
692 | LastII.OutNeeds |= BI.OutNeeds; |
      Worklist.emplace_back(LastMI);
694 | } |
695 | } |
696 | |
697 | // Predecessor blocks must provide for our WQM/Exact needs. |
698 | for (MachineBasicBlock *Pred : MBB.predecessors()) { |
699 | BlockInfo &PredBI = Blocks[Pred]; |
700 | if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds) |
701 | continue; |
702 | |
703 | PredBI.OutNeeds |= BI.InNeeds; |
704 | PredBI.InNeeds |= BI.InNeeds; |
    Worklist.emplace_back(Pred);
706 | } |
707 | |
708 | // All successors must be prepared to accept the same set of WQM/Exact data. |
709 | for (MachineBasicBlock *Succ : MBB.successors()) { |
710 | BlockInfo &SuccBI = Blocks[Succ]; |
711 | if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds) |
712 | continue; |
713 | |
714 | SuccBI.InNeeds |= BI.OutNeeds; |
    Worklist.emplace_back(Succ);
716 | } |
717 | } |
718 | |
719 | char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) { |
720 | std::vector<WorkItem> Worklist; |
721 | char GlobalFlags = scanInstructions(MF, Worklist); |
722 | |
723 | while (!Worklist.empty()) { |
724 | WorkItem WI = Worklist.back(); |
725 | Worklist.pop_back(); |
726 | |
727 | if (WI.MI) |
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
731 | } |
732 | |
733 | return GlobalFlags; |
734 | } |
735 | |
736 | MachineBasicBlock::iterator |
737 | SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB, |
738 | MachineBasicBlock::iterator Before) { |
  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);
751 | |
752 | return Restore; |
753 | } |
754 | |
755 | void SIWholeQuadMode::splitBlock(MachineInstr *TermMI) { |
756 | MachineBasicBlock *BB = TermMI->getParent(); |
757 | LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ " |
                    << *TermMI << "\n");
759 | |
760 | MachineBasicBlock *SplitBB = |
      BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
762 | |
763 | // Convert last instruction in block to a terminator. |
764 | // Note: this only covers the expected patterns |
765 | unsigned NewOpcode = 0; |
766 | switch (TermMI->getOpcode()) { |
767 | case AMDGPU::S_AND_B32: |
768 | NewOpcode = AMDGPU::S_AND_B32_term; |
769 | break; |
770 | case AMDGPU::S_AND_B64: |
771 | NewOpcode = AMDGPU::S_AND_B64_term; |
772 | break; |
773 | case AMDGPU::S_MOV_B32: |
774 | NewOpcode = AMDGPU::S_MOV_B32_term; |
775 | break; |
776 | case AMDGPU::S_MOV_B64: |
777 | NewOpcode = AMDGPU::S_MOV_B64_term; |
778 | break; |
779 | case AMDGPU::S_ANDN2_B32: |
780 | NewOpcode = AMDGPU::S_ANDN2_B32_term; |
781 | break; |
782 | case AMDGPU::S_ANDN2_B64: |
783 | NewOpcode = AMDGPU::S_ANDN2_B64_term; |
784 | break; |
785 | default: |
786 | llvm_unreachable("Unexpected instruction" ); |
787 | } |
788 | |
789 | // These terminators fallthrough to the next block, no need to add an |
790 | // unconditional branch to the next block (SplitBB). |
  TermMI->setDesc(TII->get(NewOpcode));
792 | |
793 | if (SplitBB != BB) { |
794 | // Update dominator trees |
795 | using DomTreeT = DomTreeBase<MachineBasicBlock>; |
796 | SmallVector<DomTreeT::UpdateType, 16> DTUpdates; |
797 | for (MachineBasicBlock *Succ : SplitBB->successors()) { |
      DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
      DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
    }
    DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
    if (MDT)
      MDT->applyUpdates(DTUpdates);
    if (PDT)
      PDT->applyUpdates(DTUpdates);
806 | } |
807 | } |
808 | |
809 | MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) { |
810 | assert(LiveMaskReg.isVirtual()); |
811 | |
812 | const DebugLoc &DL = MI.getDebugLoc(); |
813 | unsigned Opcode = 0; |
814 | |
815 | assert(MI.getOperand(0).isReg()); |
816 | |
817 | // Comparison is for live lanes; however here we compute the inverse |
818 | // (killed lanes). This is because VCMP will always generate 0 bits |
819 | // for inactive lanes so a mask of live lanes would not be correct |
820 | // inside control flow. |
821 | // Invert the comparison by swapping the operands and adjusting |
822 | // the comparison codes. |
823 | |
  switch (MI.getOperand(2).getImm()) {
825 | case ISD::SETUEQ: |
826 | Opcode = AMDGPU::V_CMP_LG_F32_e64; |
827 | break; |
828 | case ISD::SETUGT: |
829 | Opcode = AMDGPU::V_CMP_GE_F32_e64; |
830 | break; |
831 | case ISD::SETUGE: |
832 | Opcode = AMDGPU::V_CMP_GT_F32_e64; |
833 | break; |
834 | case ISD::SETULT: |
835 | Opcode = AMDGPU::V_CMP_LE_F32_e64; |
836 | break; |
837 | case ISD::SETULE: |
838 | Opcode = AMDGPU::V_CMP_LT_F32_e64; |
839 | break; |
840 | case ISD::SETUNE: |
841 | Opcode = AMDGPU::V_CMP_EQ_F32_e64; |
842 | break; |
843 | case ISD::SETO: |
844 | Opcode = AMDGPU::V_CMP_O_F32_e64; |
845 | break; |
846 | case ISD::SETUO: |
847 | Opcode = AMDGPU::V_CMP_U_F32_e64; |
848 | break; |
849 | case ISD::SETOEQ: |
850 | case ISD::SETEQ: |
851 | Opcode = AMDGPU::V_CMP_NEQ_F32_e64; |
852 | break; |
853 | case ISD::SETOGT: |
854 | case ISD::SETGT: |
855 | Opcode = AMDGPU::V_CMP_NLT_F32_e64; |
856 | break; |
857 | case ISD::SETOGE: |
858 | case ISD::SETGE: |
859 | Opcode = AMDGPU::V_CMP_NLE_F32_e64; |
860 | break; |
861 | case ISD::SETOLT: |
862 | case ISD::SETLT: |
863 | Opcode = AMDGPU::V_CMP_NGT_F32_e64; |
864 | break; |
865 | case ISD::SETOLE: |
866 | case ISD::SETLE: |
867 | Opcode = AMDGPU::V_CMP_NGE_F32_e64; |
868 | break; |
869 | case ISD::SETONE: |
870 | case ISD::SETNE: |
871 | Opcode = AMDGPU::V_CMP_NLG_F32_e64; |
872 | break; |
873 | default: |
874 | llvm_unreachable("invalid ISD:SET cond code" ); |
875 | } |
876 | |
877 | MachineBasicBlock &MBB = *MI.getParent(); |
878 | |
879 | // Pick opcode based on comparison type. |
880 | MachineInstr *VcmpMI; |
  const MachineOperand &Op0 = MI.getOperand(0);
  const MachineOperand &Op1 = MI.getOperand(1);
883 | |
884 | // VCC represents lanes killed. |
885 | Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; |
886 | |
  if (TRI->isVGPR(*MRI, Op0.getReg())) {
    Opcode = AMDGPU::getVOPe32(Opcode);
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
  } else {
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
                 .addReg(VCC, RegState::Define)
                 .addImm(0) // src0 modifiers
                 .add(Op1)
                 .addImm(0) // src1 modifiers
                 .add(Op0)
                 .addImm(0); // omod
898 | } |
899 | |
900 | MachineInstr *MaskUpdateMI = |
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
          .addReg(LiveMaskReg)
          .addReg(VCC);
904 | |
  // The state of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
907 | MachineInstr *EarlyTermMI = |
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
909 | |
910 | MachineInstr *ExecMaskMI = |
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
912 | |
913 | assert(MBB.succ_size() == 1); |
914 | |
915 | // Update live intervals |
  LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
  MBB.remove(&MI);
918 | |
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  LIS->InsertMachineInstrInMaps(*ExecMaskMI);
922 | |
923 | return ExecMaskMI; |
924 | } |
925 | |
926 | MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) { |
927 | assert(LiveMaskReg.isVirtual()); |
928 | |
929 | MachineBasicBlock &MBB = *MI.getParent(); |
930 | |
931 | const DebugLoc &DL = MI.getDebugLoc(); |
932 | MachineInstr *MaskUpdateMI = nullptr; |
933 | |
934 | const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1); |
  const MachineOperand &Op = MI.getOperand(0);
  int64_t KillVal = MI.getOperand(1).getImm();
937 | MachineInstr *ComputeKilledMaskMI = nullptr; |
938 | Register CndReg = !Op.isImm() ? Op.getReg() : Register(); |
939 | Register TmpReg; |
940 | |
941 | // Is this a static or dynamic kill? |
942 | if (Op.isImm()) { |
943 | if (Op.getImm() == KillVal) { |
944 | // Static: all active lanes are killed |
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(Exec);
948 | } else { |
949 | // Static: kill does nothing |
      bool IsLastTerminator = std::next(MI.getIterator()) == MBB.end();
951 | if (!IsLastTerminator) { |
952 | LIS->RemoveMachineInstrFromMaps(MI); |
953 | } else { |
954 | assert(MBB.succ_size() == 1 && MI.getOpcode() != AMDGPU::SI_DEMOTE_I1); |
        MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                                    .addMBB(*MBB.succ_begin());
        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
958 | } |
      MBB.remove(&MI);
960 | return nullptr; |
961 | } |
962 | } else { |
963 | if (!KillVal) { |
964 | // Op represents live lanes after kill, |
965 | // so exec mask needs to be factored in. |
      TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
      ComputeKilledMaskMI =
          BuildMI(MBB, MI, DL, TII->get(AndN2Opc), TmpReg).addReg(Exec).add(Op);
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(TmpReg);
    } else {
      // Op represents lanes to kill
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .add(Op);
977 | } |
978 | } |
979 | |
  // The state of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
982 | MachineInstr *EarlyTermMI = |
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
984 | |
  // If we got this far, some lanes are still live;
  // update EXEC to deactivate lanes as appropriate.
987 | MachineInstr *NewTerm; |
988 | MachineInstr *WQMMaskMI = nullptr; |
989 | Register LiveMaskWQM; |
990 | if (IsDemote) { |
991 | // Demote - deactivate quads with only helper lanes |
    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
    WQMMaskMI =
        BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
    NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
                  .addReg(Exec)
                  .addReg(LiveMaskWQM);
998 | } else { |
999 | // Kill - deactivate lanes no longer in live mask |
1000 | if (Op.isImm()) { |
1001 | unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
    } else if (!IsWQM) {
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
                    .addReg(Exec)
                    .addReg(LiveMaskReg);
    } else {
      unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
      NewTerm =
          BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
1011 | } |
1012 | } |
1013 | |
1014 | // Update live intervals |
1015 | LIS->RemoveMachineInstrFromMaps(MI); |
  MBB.remove(&MI);
1017 | assert(EarlyTermMI); |
1018 | assert(MaskUpdateMI); |
1019 | assert(NewTerm); |
1020 | if (ComputeKilledMaskMI) |
    LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  if (WQMMaskMI)
    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);
1027 | |
1028 | if (CndReg) { |
    LIS->removeInterval(CndReg);
    LIS->createAndComputeVirtRegInterval(CndReg);
  }
  if (TmpReg)
    LIS->createAndComputeVirtRegInterval(TmpReg);
  if (LiveMaskWQM)
    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1036 | |
1037 | return NewTerm; |
1038 | } |
1039 | |
1040 | // Replace (or supplement) instructions accessing live mask. |
1041 | // This can only happen once all the live mask registers have been created |
1042 | // and the execute state (WQM/StrictWWM/Exact) of instructions is known. |
1043 | void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI) { |
1044 | if (!BI.NeedsLowering) |
1045 | return; |
1046 | |
1047 | LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n" ); |
1048 | |
1049 | SmallVector<MachineInstr *, 4> SplitPoints; |
1050 | Register ActiveLanesReg = 0; |
1051 | char State = BI.InitialState; |
1052 | |
  for (MachineInstr &MI : llvm::make_early_inc_range(
           llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
    auto MIState = StateTransition.find(&MI);
1056 | if (MIState != StateTransition.end()) |
1057 | State = MIState->second; |
1058 | |
1059 | MachineInstr *SplitPoint = nullptr; |
1060 | switch (MI.getOpcode()) { |
1061 | case AMDGPU::SI_DEMOTE_I1: |
1062 | case AMDGPU::SI_KILL_I1_TERMINATOR: |
      SplitPoint = lowerKillI1(MI, State == StateWQM);
1064 | break; |
1065 | case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: |
1066 | SplitPoint = lowerKillF32(MI); |
1067 | break; |
1068 | case AMDGPU::ENTER_STRICT_WWM: |
      ActiveLanesReg = MI.getOperand(0).getReg();
1070 | break; |
1071 | case AMDGPU::EXIT_STRICT_WWM: |
1072 | ActiveLanesReg = 0; |
1073 | break; |
1074 | case AMDGPU::V_SET_INACTIVE_B32: |
1075 | if (ActiveLanesReg) { |
        LiveInterval &LI = LIS->getInterval(MI.getOperand(5).getReg());
        MRI->constrainRegClass(ActiveLanesReg, TRI->getWaveMaskRegClass());
        MI.getOperand(5).setReg(ActiveLanesReg);
        LIS->shrinkToUses(&LI);
1080 | } else { |
1081 | assert(State == StateExact || State == StateWQM); |
1082 | } |
1083 | break; |
1084 | default: |
1085 | break; |
1086 | } |
1087 | if (SplitPoint) |
      SplitPoints.push_back(SplitPoint);
1089 | } |
1090 | |
1091 | // Perform splitting after instruction scan to simplify iteration. |
1092 | for (MachineInstr *MI : SplitPoints) |
    splitBlock(MI);
1094 | } |
1095 | |
1096 | // Return an iterator in the (inclusive) range [First, Last] at which |
1097 | // instructions can be safely inserted, keeping in mind that some of the |
1098 | // instructions we want to add necessarily clobber SCC. |
1099 | MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( |
1100 | MachineBasicBlock &MBB, MachineBasicBlock::iterator First, |
1101 | MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) { |
1102 | if (!SaveSCC) |
1103 | return PreferLast ? Last : First; |
1104 | |
1105 | LiveRange &LR = |
      LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
1112 | SlotIndex Idx = PreferLast ? LastIdx : FirstIdx; |
1113 | const LiveRange::Segment *S; |
1114 | |
1115 | for (;;) { |
1116 | S = LR.getSegmentContaining(Idx); |
1117 | if (!S) |
1118 | break; |
1119 | |
1120 | if (PreferLast) { |
1121 | SlotIndex Next = S->start.getBaseIndex(); |
1122 | if (Next < FirstIdx) |
1123 | break; |
1124 | Idx = Next; |
1125 | } else { |
      MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
      assert(EndMI && "Segment does not end on valid instruction");
      auto NextI = std::next(EndMI->getIterator());
1129 | if (NextI == MBB.end()) |
1130 | break; |
      SlotIndex Next = LIS->getInstructionIndex(*NextI);
1132 | if (Next > LastIdx) |
1133 | break; |
1134 | Idx = Next; |
1135 | } |
1136 | } |
1137 | |
1138 | MachineBasicBlock::iterator MBBI; |
1139 | |
  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1141 | MBBI = MI; |
1142 | else { |
1143 | assert(Idx == LIS->getMBBEndIdx(&MBB)); |
1144 | MBBI = MBB.end(); |
1145 | } |
1146 | |
1147 | // Move insertion point past any operations modifying EXEC. |
1148 | // This assumes that the value of SCC defined by any of these operations |
1149 | // does not need to be preserved. |
1150 | while (MBBI != Last) { |
1151 | bool IsExecDef = false; |
1152 | for (const MachineOperand &MO : MBBI->all_defs()) { |
1153 | IsExecDef |= |
1154 | MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC; |
1155 | } |
1156 | if (!IsExecDef) |
1157 | break; |
1158 | MBBI++; |
1159 | S = nullptr; |
1160 | } |
1161 | |
1162 | if (S) |
    MBBI = saveSCC(MBB, MBBI);
1164 | |
1165 | return MBBI; |
1166 | } |
1167 | |
1168 | void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, |
1169 | MachineBasicBlock::iterator Before, |
1170 | Register SaveWQM) { |
1171 | assert(LiveMaskReg.isVirtual()); |
1172 | |
1173 | bool IsTerminator = Before == MBB.end(); |
1174 | if (!IsTerminator) { |
1175 | auto FirstTerm = MBB.getFirstTerminator(); |
1176 | if (FirstTerm != MBB.end()) { |
      SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
      SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
1179 | IsTerminator = BeforeIdx > FirstTermIdx; |
1180 | } |
1181 | } |
1182 | |
1183 | MachineInstr *MI; |
1184 | |
1185 | if (SaveWQM) { |
1186 | unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc; |
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
             .addReg(Exec)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
1197 | StateTransition[MI] = StateExact; |
1198 | } |
1199 | |
1200 | void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, |
1201 | MachineBasicBlock::iterator Before, |
1202 | Register SavedWQM) { |
1203 | MachineInstr *MI; |
1204 | |
1205 | if (SavedWQM) { |
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
  }

  LIS->InsertMachineInstrInMaps(*MI);
1213 | StateTransition[MI] = StateWQM; |
1214 | } |
1215 | |
1216 | void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB, |
1217 | MachineBasicBlock::iterator Before, |
1218 | Register SaveOrig, char StrictStateNeeded) { |
1219 | MachineInstr *MI; |
1220 | assert(SaveOrig); |
1221 | assert(StrictStateNeeded == StateStrictWWM || |
1222 | StrictStateNeeded == StateStrictWQM); |
1223 | |
1224 | if (StrictStateNeeded == StateStrictWWM) { |
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
                 SaveOrig)
             .addImm(-1);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
                 SaveOrig)
             .addImm(-1);
  }
  LIS->InsertMachineInstrInMaps(*MI);
1234 | StateTransition[MI] = StrictStateNeeded; |
1235 | } |
1236 | |
1237 | void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB, |
1238 | MachineBasicBlock::iterator Before, |
1239 | Register SavedOrig, char NonStrictState, |
1240 | char CurrentStrictState) { |
1241 | MachineInstr *MI; |
1242 | |
1243 | assert(SavedOrig); |
1244 | assert(CurrentStrictState == StateStrictWWM || |
1245 | CurrentStrictState == StateStrictWQM); |
1246 | |
1247 | if (CurrentStrictState == StateStrictWWM) { |
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
                 Exec)
             .addReg(SavedOrig);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
                 Exec)
             .addReg(SavedOrig);
  }
  LIS->InsertMachineInstrInMaps(*MI);
1257 | StateTransition[MI] = NonStrictState; |
1258 | } |
1259 | |
1260 | void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, BlockInfo &BI, |
1261 | bool IsEntry) { |
1262 | // This is a non-entry block that is WQM throughout, so no need to do |
1263 | // anything. |
1264 | if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) { |
1265 | BI.InitialState = StateWQM; |
1266 | return; |
1267 | } |
1268 | |
1269 | LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) |
1270 | << ":\n" ); |
1271 | |
1272 | Register SavedWQMReg; |
1273 | Register SavedNonStrictReg; |
1274 | bool WQMFromExec = IsEntry; |
1275 | char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM; |
1276 | char NonStrictState = 0; |
1277 | const TargetRegisterClass *BoolRC = TRI->getBoolRC(); |
1278 | |
1279 | auto II = MBB.getFirstNonPHI(), IE = MBB.end(); |
1280 | if (IsEntry) { |
1281 | // Skip the instruction that saves LiveMask |
1282 | if (II != IE && II->getOpcode() == AMDGPU::COPY && |
        II->getOperand(1).getReg() == TRI->getExec())
1284 | ++II; |
1285 | } |
1286 | |
1287 | // This stores the first instruction where it's safe to switch from WQM to |
1288 | // Exact or vice versa. |
1289 | MachineBasicBlock::iterator FirstWQM = IE; |
1290 | |
1291 | // This stores the first instruction where it's safe to switch from Strict |
1292 | // mode to Exact/WQM or to switch to Strict mode. It must always be the same |
1293 | // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must |
1294 | // be safe to switch to/from WQM as well. |
1295 | MachineBasicBlock::iterator FirstStrict = IE; |
1296 | |
  // Record initial state in block information.
1298 | BI.InitialState = State; |
1299 | |
1300 | for (unsigned Idx = 0;; ++Idx) { |
1301 | MachineBasicBlock::iterator Next = II; |
1302 | char Needs = StateExact | StateWQM; // Strict mode is disabled by default. |
1303 | char OutNeeds = 0; |
1304 | |
1305 | if (FirstWQM == IE) |
1306 | FirstWQM = II; |
1307 | |
1308 | if (FirstStrict == IE) |
1309 | FirstStrict = II; |
1310 | |
    // Adjust needs if this is the first instruction of a WQM-requiring shader.
1312 | if (IsEntry && Idx == 0 && (BI.InNeeds & StateWQM)) |
1313 | Needs = StateWQM; |
1314 | |
1315 | // First, figure out the allowed states (Needs) based on the propagated |
1316 | // flags. |
1317 | if (II != IE) { |
1318 | MachineInstr &MI = *II; |
1319 | |
      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
        auto III = Instructions.find(&MI);
1322 | if (III != Instructions.end()) { |
1323 | if (III->second.Needs & StateStrictWWM) |
1324 | Needs = StateStrictWWM; |
1325 | else if (III->second.Needs & StateStrictWQM) |
1326 | Needs = StateStrictWQM; |
1327 | else if (III->second.Needs & StateWQM) |
1328 | Needs = StateWQM; |
1329 | else |
1330 | Needs &= ~III->second.Disabled; |
1331 | OutNeeds = III->second.OutNeeds; |
1332 | } |
1333 | } else { |
1334 | // If the instruction doesn't actually need a correct EXEC, then we can |
1335 | // safely leave Strict mode enabled. |
1336 | Needs = StateExact | StateWQM | StateStrict; |
1337 | } |
1338 | |
1339 | // Exact mode exit can occur in terminators, but must be before branches. |
1340 | if (MI.isBranch() && OutNeeds == StateExact) |
1341 | Needs = StateExact; |
1342 | |
1343 | ++Next; |
1344 | } else { |
1345 | // End of basic block |
1346 | if (BI.OutNeeds & StateWQM) |
1347 | Needs = StateWQM; |
1348 | else if (BI.OutNeeds == StateExact) |
1349 | Needs = StateExact; |
1350 | else |
1351 | Needs = StateWQM | StateExact; |
1352 | } |
1353 | |
1354 | // Now, transition if necessary. |
1355 | if (!(Needs & State)) { |
1356 | MachineBasicBlock::iterator First; |
1357 | if (State == StateStrictWWM || Needs == StateStrictWWM || |
1358 | State == StateStrictWQM || Needs == StateStrictWQM) { |
1359 | // We must switch to or from Strict mode. |
1360 | First = FirstStrict; |
1361 | } else { |
1362 | // We only need to switch to/from WQM, so we can use FirstWQM. |
1363 | First = FirstWQM; |
1364 | } |
1365 | |
1366 | // Whether we need to save SCC depends on start and end states. |
1367 | bool SaveSCC = false; |
1368 | switch (State) { |
1369 | case StateExact: |
1370 | case StateStrictWWM: |
1371 | case StateStrictWQM: |
1372 | // Exact/Strict -> Strict: save SCC |
1373 | // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec |
1374 | // Exact/Strict -> Exact: no save |
1375 | SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec); |
1376 | break; |
1377 | case StateWQM: |
1378 | // WQM -> Exact/Strict: save SCC |
1379 | SaveSCC = !(Needs & StateWQM); |
1380 | break; |
1381 | default: |
1382 | llvm_unreachable("Unknown state" ); |
1383 | break; |
1384 | } |
1385 | char StartState = State & StateStrict ? NonStrictState : State; |
1386 | bool WQMToExact = |
1387 | StartState == StateWQM && (Needs & StateExact) && !(Needs & StateWQM); |
1388 | bool ExactToWQM = StartState == StateExact && (Needs & StateWQM) && |
1389 | !(Needs & StateExact); |
1390 | bool PreferLast = Needs == StateWQM; |
1391 | // Exact regions in divergent control flow may run at EXEC=0, so try to |
1392 | // exclude instructions with unexpected effects from them. |
1393 | // FIXME: ideally we would branch over these when EXEC=0, |
1394 | // but this requires updating implicit values, live intervals and CFG. |
1395 | if ((WQMToExact && (OutNeeds & StateWQM)) || ExactToWQM) { |
1396 | for (MachineBasicBlock::iterator I = First; I != II; ++I) { |
          if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) {
1398 | PreferLast = WQMToExact; |
1399 | break; |
1400 | } |
1401 | } |
1402 | } |
1403 | MachineBasicBlock::iterator Before = |
          prepareInsertion(MBB, First, II, PreferLast, SaveSCC);
1405 | |
1406 | if (State & StateStrict) { |
1407 | assert(State == StateStrictWWM || State == StateStrictWQM); |
1408 | assert(SavedNonStrictReg); |
        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);

        LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1412 | SavedNonStrictReg = 0; |
1413 | State = NonStrictState; |
1414 | } |
1415 | |
1416 | if (Needs & StateStrict) { |
1417 | NonStrictState = State; |
1418 | assert(Needs == StateStrictWWM || Needs == StateStrictWQM); |
1419 | assert(!SavedNonStrictReg); |
        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);

        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
1423 | State = Needs; |
1424 | } else { |
1425 | if (WQMToExact) { |
1426 | if (!WQMFromExec && (OutNeeds & StateWQM)) { |
1427 | assert(!SavedWQMReg); |
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1429 | } |
1430 | |
          toExact(MBB, Before, SavedWQMReg);
1432 | State = StateExact; |
1433 | } else if (ExactToWQM) { |
1434 | assert(WQMFromExec == (SavedWQMReg == 0)); |
1435 | |
          toWQM(MBB, Before, SavedWQMReg);
1437 | |
1438 | if (SavedWQMReg) { |
1439 | LIS->createAndComputeVirtRegInterval(Reg: SavedWQMReg); |
1440 | SavedWQMReg = 0; |
1441 | } |
1442 | State = StateWQM; |
1443 | } else { |
1444 | // We can get here if we transitioned from StrictWWM to a |
1445 | // non-StrictWWM state that already matches our needs, but we |
1446 | // shouldn't need to do anything. |
1447 | assert(Needs & State); |
1448 | } |
1449 | } |
1450 | } |
1451 | |
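    // If this instruction constrained the allowed states, later mode switches
    // must not be hoisted above it: reset FirstStrict, and also FirstWQM
    // unless the instruction still allows both Exact and WQM.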
    if (Needs != (StateExact | StateWQM | StateStrict)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstStrict = IE;
    }

    if (II == IE)
      break;

    II = Next;
  }
  assert(!SavedWQMReg);
  assert(!SavedNonStrictReg);
}

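// Replace the live-mask query pseudos gathered during analysis with copies of
// LiveMaskReg, the register tracking the original live mask.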
bool SIWholeQuadMode::lowerLiveMaskQueries() {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    Register Dest = MI->getOperand(0).getReg();

    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
  return !LiveMaskQueries.empty();
}

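// Rewrite the pseudos recorded in LowerToMovInstrs and LowerToCopyInstrs into
// plain moves or copies now that WQM/WWM lowering no longer needs them.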
bool SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToMovInstrs) {
    assert(MI->getNumExplicitOperands() == 2);

    const Register Reg = MI->getOperand(0).getReg();

    const TargetRegisterClass *regClass =
        TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
    if (TRI->isVGPRClass(regClass)) {
      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // Check that it already implicitly depends on exec (like all VALU movs
      // should do).
      assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
        return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
      }));
    } else {
      // Remove early-clobber and exec dependency from simple SGPR copies.
      // This allows some to be eliminated during/post RA.
      LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
      if (MI->getOperand(0).isEarlyClobber()) {
        LIS->removeInterval(Reg);
        MI->getOperand(0).setIsEarlyClobber(false);
        LIS->createAndComputeVirtRegInterval(Reg);
      }
      int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
      while (Index >= 0) {
        MI->removeOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
      }
      MI->setDesc(TII->get(AMDGPU::COPY));
      LLVM_DEBUG(dbgs() << " -> " << *MI);
    }
  }
  for (MachineInstr *MI : LowerToCopyInstrs) {
    LLVM_DEBUG(dbgs() << "simplify: " << *MI);

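    // A V_SET_INACTIVE_B32 being lowered to a copy only needs its destination
    // and operand 2 (the value used for active lanes): drop the remaining
    // operands and shrink the interval of the operand-4 register, which loses
    // a use here.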
    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) {
      assert(MI->getNumExplicitOperands() == 6);

      LiveInterval *RecomputeLI = nullptr;
      if (MI->getOperand(4).isReg())
        RecomputeLI = &LIS->getInterval(MI->getOperand(4).getReg());

      MI->removeOperand(5);
      MI->removeOperand(4);
      MI->removeOperand(3);
      MI->removeOperand(1);

      if (RecomputeLI)
        LIS->shrinkToUses(RecomputeLI);
    } else {
      assert(MI->getNumExplicitOperands() == 2);
    }

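    // COPY cannot take an immediate source, so fall back to a mov opcode
    // matching the destination register class when the source is not a
    // register.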
    unsigned CopyOp = MI->getOperand(1).isReg()
                          ? (unsigned)AMDGPU::COPY
                          : TII->getMovOpcode(TRI->getRegClassForOperandReg(
                                *MRI, MI->getOperand(0)));
    MI->setDesc(TII->get(CopyOp));
    LLVM_DEBUG(dbgs() << " -> " << *MI);
  }
  return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
}

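// Lower the collected kill and demote pseudos. Lowering may return a split
// point, in which case the containing block is split there.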
bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
  for (MachineInstr *MI : KillInstrs) {
    MachineInstr *SplitPoint = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(*MI, IsWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(*MI);
      break;
    }
    if (SplitPoint)
      splitBlock(SplitPoint);
  }
  return !KillInstrs.empty();
}

void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getParent();
  bool IsWave32 = ST->isWave32();

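  // SI_INIT_WHOLE_WAVE: enable all lanes for the prolog and expose the
  // original EXEC mask through the pseudo's result register.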
  if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
    assert(MBB == &MBB->getParent()->front() &&
           "init whole wave not in entry block");
    Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
    MachineInstr *SaveExec =
        BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
                TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
                                  : AMDGPU::S_OR_SAVEEXEC_B64),
                EntryExec)
            .addImm(-1);

    // Replace all uses of MI's destination reg with EntryExec.
    MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);

    if (LIS) {
      LIS->RemoveMachineInstrFromMaps(MI);
    }

    MI.eraseFromParent();

    if (LIS) {
      LIS->InsertMachineInstrInMaps(*SaveExec);
      LIS->createAndComputeVirtRegInterval(EntryExec);
    }
    return;
  }

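  // SI_INIT_EXEC: materialize the requested constant EXEC mask at the top of
  // the block.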
  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
    // This should be before all vector instructions.
    MachineInstr *InitMI =
        BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
                TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
                Exec)
            .addImm(MI.getOperand(0).getImm());
    if (LIS) {
      LIS->RemoveMachineInstrFromMaps(MI);
      LIS->InsertMachineInstrInMaps(*InitMI);
    }
    MI.eraseFromParent();
    return;
  }

  // Extract the thread count from an SGPR input and set EXEC accordingly.
  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
  //
  // S_BFE_U32 count, input, {shift, 7}
  // S_BFM_B64 exec, count, 0
  // S_CMP_EQ_U32 count, 64
  // S_CMOV_B64 exec, -1
  Register InputReg = MI.getOperand(0).getReg();
  MachineInstr *FirstMI = &*MBB->begin();
  if (InputReg.isVirtual()) {
    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
    assert(DefInstr && DefInstr->isCopy());
    if (DefInstr->getParent() == MBB) {
      if (DefInstr != FirstMI) {
        // If `InputReg` is defined in the current block, we also need to
        // move that instruction to the beginning of the block.
        DefInstr->removeFromParent();
        MBB->insert(FirstMI, DefInstr);
        if (LIS)
          LIS->handleMove(*DefInstr);
      } else {
        // If the first instruction is the definition, just move the insertion
        // point past it.
        FirstMI = &*std::next(FirstMI->getIterator());
      }
    }
  }

  // Insert instruction sequence at block beginning (before vector operations).
  const DebugLoc DL = MI.getDebugLoc();
  const unsigned WavefrontSize = ST->getWavefrontSize();
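  // The BFE control operand packs {offset, width}: 0x70000 requests a 7-bit
  // field, wide enough for any thread count up to the wavefront size, and
  // Mask bounds the offset taken from the pseudo's immediate operand.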
  const unsigned Mask = (WavefrontSize << 1) - 1;
  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
                   .addReg(InputReg)
                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
  auto BfmMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
          .addReg(CountReg)
          .addImm(0);
  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
                   .addReg(CountReg, RegState::Kill)
                   .addImm(WavefrontSize);
  auto CmovMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
              Exec)
          .addImm(-1);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*BfeMI);
  LIS->InsertMachineInstrInMaps(*BfmMI);
  LIS->InsertMachineInstrInMaps(*CmpMI);
  LIS->InsertMachineInstrInMaps(*CmovMI);

  LIS->removeInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(CountReg);
}

/// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
/// for instructions that depend on EXEC.
MachineBasicBlock::iterator
SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) {
  MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();

  for (MachineInstr *MI : InitExecInstrs) {
    // Try to handle undefined cases gracefully:
    // - multiple INIT_EXEC instructions
    // - INIT_EXEC instructions not in the entry block
    if (MI->getParent() == &Entry)
      InsertPt = std::next(MI->getIterator());

    lowerInitExec(*MI);
    Changed = true;
  }

  return InsertPt;
}

bool SIWholeQuadMode::run(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                    << " ------------- \n");
  LLVM_DEBUG(MF.dump(););

  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  LowerToMovInstrs.clear();
  KillInstrs.clear();
  InitExecInstrs.clear();
  SetInactiveInstrs.clear();
  StateTransition.clear();

  if (ST->isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    AndTermOpc = AMDGPU::S_AND_B32_term;
    AndN2Opc = AMDGPU::S_ANDN2_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
    WQMOpc = AMDGPU::S_WQM_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    AndTermOpc = AMDGPU::S_AND_B64_term;
    AndN2Opc = AMDGPU::S_ANDN2_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
    WQMOpc = AMDGPU::S_WQM_B64;
    Exec = AMDGPU::EXEC;
  }

  const char GlobalFlags = analyzeFunction(MF);
  bool Changed = false;

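  // By default the live mask is simply EXEC; a dedicated copy is only created
  // below when something actually needs the original mask.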
  LiveMaskReg = Exec;

  MachineBasicBlock &Entry = MF.front();
  MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed);

  // Store a copy of the original live mask when required.
  const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
  const bool HasWaveModes = GlobalFlags & ~StateExact;
  const bool HasKills = !KillInstrs.empty();
  const bool UsesWQM = GlobalFlags & StateWQM;
  if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
    MachineInstr *MI =
        BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
            .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
    Changed = true;
  }

  // Check if V_SET_INACTIVE was touched by a strict state mode.
  // If so, promote to WWM; otherwise lower to COPY.
  for (MachineInstr *MI : SetInactiveInstrs) {
    if (LowerToCopyInstrs.contains(MI))
      continue;
    auto &Info = Instructions[MI];
    if (Info.MarkedStates & StateStrict) {
      Info.Needs |= StateStrictWWM;
      Info.Disabled &= ~StateStrictWWM;
      Blocks[MI->getParent()].Needs |= StateStrictWWM;
    } else {
      LLVM_DEBUG(dbgs() << "Has no WWM marking: " << *MI);
      LowerToCopyInstrs.insert(MI);
    }
  }

  LLVM_DEBUG(printInfo());

  Changed |= lowerLiveMaskQueries();
  Changed |= lowerCopyInstrs();

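  // Pick a lowering strategy: exact-only shaders need no mode switching,
  // WQM-only shaders just enter WQM once in the entry block, and everything
  // else goes through the full per-block state machine.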
  if (!HasWaveModes) {
    // No wave mode execution
    Changed |= lowerKillInstrs(false);
  } else if (GlobalFlags == StateWQM) {
    // Shader only needs WQM
    auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
                  .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
    lowerKillInstrs(true);
    Changed = true;
  } else {
    // Mark entry for WQM if required.
    if (GlobalFlags & StateWQM)
      Blocks[&Entry].InNeeds |= StateWQM;
    // Wave mode switching requires full lowering pass.
    for (auto &BII : Blocks)
      processBlock(*BII.first, BII.second, BII.first == &Entry);
    // Lowering blocks causes block splitting so perform as a second pass.
    for (auto &BII : Blocks)
      lowerBlock(*BII.first, BII.second);
    Changed = true;
  }

  // Compute live range for live mask
  if (LiveMaskReg != Exec)
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);

  // If we performed any kills or lowered INIT_EXEC then recompute EXEC
  if (!KillInstrs.empty() || !InitExecInstrs.empty())
    LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);

  return Changed;
}

bool SIWholeQuadModeLegacy::runOnMachineFunction(MachineFunction &MF) {
  LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
  auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
  MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
  auto *PDTWrapper =
      getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
  MachinePostDominatorTree *PDT =
      PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
  SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
  return Impl.run(MF);
}

PreservedAnalyses
SIWholeQuadModePass::run(MachineFunction &MF,
                         MachineFunctionAnalysisManager &MFAM) {
  MFPropsModifier _(*this, MF);

  LiveIntervals *LIS = &MFAM.getResult<LiveIntervalsAnalysis>(MF);
  MachineDominatorTree *MDT =
      MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF);
  MachinePostDominatorTree *PDT =
      MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF);
  SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
  bool Changed = Impl.run(MF);
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserve<SlotIndexesAnalysis>();
  PA.preserve<LiveIntervalsAnalysis>();
  PA.preserve<MachineDominatorTreeAnalysis>();
  PA.preserve<MachinePostDominatorTreeAnalysis>();
  return PA;
}