1 | //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// This pass adds instructions to enable whole quad mode (strict or non-strict) |
11 | /// for pixel shaders, and strict whole wavefront mode for all programs. |
12 | /// |
13 | /// The "strict" prefix indicates that inactive lanes do not take part in |
14 | /// control flow, specifically an inactive lane enabled by a strict WQM/WWM will |
/// always be enabled irrespective of control flow decisions. Conversely, in
/// non-strict WQM inactive lanes may take part in control flow decisions.
17 | /// |
18 | /// Whole quad mode is required for derivative computations, but it interferes |
/// with shader side effects (stores and atomics). This pass ensures that WQM is
20 | /// enabled when necessary, but disabled around stores and atomics. |
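///
/// For example, the inputs of an image sample that computes implicit
/// derivatives must be computed in WQM so that helper lanes hold valid data,
/// while a buffer store of the result must run with the exact mask so that
/// helper lanes do not write memory.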
21 | /// |
22 | /// When necessary, this pass creates a function prolog |
23 | /// |
24 | /// S_MOV_B64 LiveMask, EXEC |
25 | /// S_WQM_B64 EXEC, EXEC |
26 | /// |
27 | /// to enter WQM at the top of the function and surrounds blocks of Exact |
28 | /// instructions by |
29 | /// |
30 | /// S_AND_SAVEEXEC_B64 Tmp, LiveMask |
31 | /// ... |
32 | /// S_MOV_B64 EXEC, Tmp |
33 | /// |
34 | /// We also compute when a sequence of instructions requires strict whole |
35 | /// wavefront mode (StrictWWM) and insert instructions to save and restore it: |
36 | /// |
37 | /// S_OR_SAVEEXEC_B64 Tmp, -1 |
38 | /// ... |
39 | /// S_MOV_B64 EXEC, Tmp |
40 | /// |
41 | /// When a sequence of instructions requires strict whole quad mode (StrictWQM) |
42 | /// we use a similar save and restore mechanism and force whole quad mode for |
43 | /// those instructions: |
44 | /// |
45 | /// S_MOV_B64 Tmp, EXEC |
46 | /// S_WQM_B64 EXEC, EXEC |
47 | /// ... |
48 | /// S_MOV_B64 EXEC, Tmp |
49 | /// |
50 | /// In order to avoid excessive switching during sequences of Exact |
51 | /// instructions, the pass first analyzes which instructions must be run in WQM |
52 | /// (aka which instructions produce values that lead to derivative |
53 | /// computations). |
54 | /// |
55 | /// Basic blocks are always exited in WQM as long as some successor needs WQM. |
56 | /// |
57 | /// There is room for improvement given better control flow analysis: |
58 | /// |
59 | /// (1) at the top level (outside of control flow statements, and as long as |
60 | /// kill hasn't been used), one SGPR can be saved by recovering WQM from |
61 | /// the LiveMask (this is implemented for the entry block). |
62 | /// |
63 | /// (2) when entire regions (e.g. if-else blocks or entire loops) only |
64 | /// consist of exact and don't-care instructions, the switch only has to |
65 | /// be done at the entry and exit points rather than potentially in each |
66 | /// block of the region. |
67 | /// |
68 | //===----------------------------------------------------------------------===// |
69 | |
70 | #include "AMDGPU.h" |
71 | #include "GCNSubtarget.h" |
72 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
73 | #include "llvm/ADT/MapVector.h" |
74 | #include "llvm/ADT/PostOrderIterator.h" |
75 | #include "llvm/CodeGen/LiveIntervals.h" |
76 | #include "llvm/CodeGen/MachineBasicBlock.h" |
77 | #include "llvm/CodeGen/MachineDominators.h" |
78 | #include "llvm/CodeGen/MachineFunctionPass.h" |
79 | #include "llvm/CodeGen/MachineInstr.h" |
80 | #include "llvm/CodeGen/MachinePostDominators.h" |
81 | #include "llvm/IR/CallingConv.h" |
82 | #include "llvm/InitializePasses.h" |
83 | #include "llvm/Support/raw_ostream.h" |
84 | |
85 | using namespace llvm; |
86 | |
87 | #define DEBUG_TYPE "si-wqm" |
88 | |
89 | namespace { |
90 | |
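// Execution-mask states tracked by this pass. These are bitmask flags, so a
// value such as StateExact | StateWQM means "either state is acceptable",
// and StateStrict covers both strict modes.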
91 | enum { |
92 | StateWQM = 0x1, |
93 | StateStrictWWM = 0x2, |
94 | StateStrictWQM = 0x4, |
95 | StateExact = 0x8, |
96 | StateStrict = StateStrictWWM | StateStrictWQM, |
97 | }; |
98 | |
99 | struct PrintState { |
100 | public: |
101 | int State; |
102 | |
103 | explicit PrintState(int State) : State(State) {} |
104 | }; |
105 | |
106 | #ifndef NDEBUG |
107 | static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) { |
108 | |
109 | static const std::pair<char, const char *> Mapping[] = { |
      std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
      std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
112 | char State = PS.State; |
113 | for (auto M : Mapping) { |
114 | if (State & M.first) { |
115 | OS << M.second; |
116 | State &= ~M.first; |
117 | |
118 | if (State) |
119 | OS << '|'; |
120 | } |
121 | } |
122 | assert(State == 0); |
123 | return OS; |
124 | } |
125 | #endif |
126 | |
struct InstrInfo {
  char Needs = 0;    // States this instruction must execute in.
  char Disabled = 0; // States this instruction must never execute in.
  char OutNeeds = 0; // States required after this instruction executes.
};

struct BlockInfo {
  char Needs = 0;             // States needed somewhere within this block.
  char InNeeds = 0;           // States that must hold on entry to this block.
  char OutNeeds = 0;          // States required at the end of this block.
  char InitialState = 0;      // State this block starts in (see processBlock).
  bool NeedsLowering = false; // Block contains kills/demotes to lower.
};
140 | |
141 | struct WorkItem { |
142 | MachineBasicBlock *MBB = nullptr; |
143 | MachineInstr *MI = nullptr; |
144 | |
145 | WorkItem() = default; |
146 | WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {} |
147 | WorkItem(MachineInstr *MI) : MI(MI) {} |
148 | }; |
149 | |
150 | class SIWholeQuadMode : public MachineFunctionPass { |
151 | private: |
152 | const SIInstrInfo *TII; |
153 | const SIRegisterInfo *TRI; |
154 | const GCNSubtarget *ST; |
155 | MachineRegisterInfo *MRI; |
156 | LiveIntervals *LIS; |
157 | MachineDominatorTree *MDT; |
158 | MachinePostDominatorTree *PDT; |
159 | |
160 | unsigned AndOpc; |
161 | unsigned AndTermOpc; |
162 | unsigned AndN2Opc; |
163 | unsigned XorOpc; |
164 | unsigned AndSaveExecOpc; |
165 | unsigned AndSaveExecTermOpc; |
166 | unsigned WQMOpc; |
167 | Register Exec; |
168 | Register LiveMaskReg; |
169 | |
170 | DenseMap<const MachineInstr *, InstrInfo> Instructions; |
171 | MapVector<MachineBasicBlock *, BlockInfo> Blocks; |
172 | |
173 | // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction |
174 | DenseMap<const MachineInstr *, char> StateTransition; |
175 | |
  SmallVector<MachineInstr *, 2> LiveMaskQueries;   // SI_PS_LIVE / SI_LIVE_MASK
  SmallVector<MachineInstr *, 4> LowerToMovInstrs;  // STRICT_WWM / STRICT_WQM
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs; // WQM / SOFT_WQM / undef V_SET_INACTIVE
  SmallVector<MachineInstr *, 4> KillInstrs;        // Kill and demote pseudos
  SmallVector<MachineInstr *, 4> InitExecInstrs;    // SI_INIT_EXEC pseudos
181 | |
182 | void printInfo(); |
183 | |
184 | void markInstruction(MachineInstr &MI, char Flag, |
185 | std::vector<WorkItem> &Worklist); |
186 | void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg, |
187 | unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist); |
188 | void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag, |
189 | std::vector<WorkItem> &Worklist); |
190 | void markInstructionUses(const MachineInstr &MI, char Flag, |
191 | std::vector<WorkItem> &Worklist); |
192 | char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist); |
193 | void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist); |
194 | void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist); |
195 | char analyzeFunction(MachineFunction &MF); |
196 | |
197 | MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB, |
198 | MachineBasicBlock::iterator Before); |
199 | MachineBasicBlock::iterator |
200 | prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First, |
201 | MachineBasicBlock::iterator Last, bool PreferLast, |
202 | bool SaveSCC); |
203 | void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, |
204 | Register SaveWQM); |
205 | void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, |
206 | Register SavedWQM); |
207 | void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, |
208 | Register SaveOrig, char StrictStateNeeded); |
209 | void fromStrictMode(MachineBasicBlock &MBB, |
210 | MachineBasicBlock::iterator Before, Register SavedOrig, |
211 | char NonStrictState, char CurrentStrictState); |
212 | |
213 | MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI); |
214 | |
215 | MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI, |
216 | bool IsWQM); |
217 | MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI); |
218 | |
219 | void lowerBlock(MachineBasicBlock &MBB); |
220 | void processBlock(MachineBasicBlock &MBB, bool IsEntry); |
221 | |
222 | bool lowerLiveMaskQueries(); |
223 | bool lowerCopyInstrs(); |
224 | bool lowerKillInstrs(bool IsWQM); |
225 | void lowerInitExec(MachineInstr &MI); |
226 | MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry, |
227 | bool &Changed); |
228 | |
229 | public: |
230 | static char ID; |
231 | |
  SIWholeQuadMode() : MachineFunctionPass(ID) {}
234 | |
235 | bool runOnMachineFunction(MachineFunction &MF) override; |
236 | |
  StringRef getPassName() const override { return "SI Whole Quad Mode"; }
238 | |
239 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
240 | AU.addRequired<LiveIntervalsWrapperPass>(); |
241 | AU.addPreserved<SlotIndexesWrapperPass>(); |
242 | AU.addPreserved<LiveIntervalsWrapperPass>(); |
243 | AU.addPreserved<MachineDominatorTreeWrapperPass>(); |
244 | AU.addPreserved<MachinePostDominatorTreeWrapperPass>(); |
245 | MachineFunctionPass::getAnalysisUsage(AU); |
246 | } |
247 | |
248 | MachineFunctionProperties getClearedProperties() const override { |
249 | return MachineFunctionProperties().set( |
250 | MachineFunctionProperties::Property::IsSSA); |
251 | } |
252 | }; |
253 | |
254 | } // end anonymous namespace |
255 | |
256 | char SIWholeQuadMode::ID = 0; |
257 | |
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)
265 | |
266 | char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID; |
267 | |
268 | FunctionPass *llvm::createSIWholeQuadModePass() { |
269 | return new SIWholeQuadMode; |
270 | } |
271 | |
272 | #ifndef NDEBUG |
273 | LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() { |
274 | for (const auto &BII : Blocks) { |
275 | dbgs() << "\n" |
276 | << printMBBReference(*BII.first) << ":\n" |
277 | << " InNeeds = " << PrintState(BII.second.InNeeds) |
278 | << ", Needs = " << PrintState(BII.second.Needs) |
279 | << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n" ; |
280 | |
281 | for (const MachineInstr &MI : *BII.first) { |
282 | auto III = Instructions.find(&MI); |
283 | if (III != Instructions.end()) { |
284 | dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs) |
285 | << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n'; |
286 | } |
287 | } |
288 | } |
289 | } |
290 | #endif |
291 | |
292 | void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, |
293 | std::vector<WorkItem> &Worklist) { |
294 | InstrInfo &II = Instructions[&MI]; |
295 | |
296 | assert(!(Flag & StateExact) && Flag != 0); |
297 | |
298 | // Remove any disabled states from the flag. The user that required it gets |
299 | // an undefined value in the helper lanes. For example, this can happen if |
  // the result of an atomic is used by an instruction that requires WQM, where
301 | // ignoring the request for WQM is correct as per the relevant specs. |
302 | Flag &= ~II.Disabled; |
303 | |
304 | // Ignore if the flag is already encompassed by the existing needs, or we |
305 | // just disabled everything. |
306 | if ((II.Needs & Flag) == Flag) |
307 | return; |
308 | |
309 | LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI); |
310 | II.Needs |= Flag; |
  Worklist.emplace_back(&MI);
312 | } |
313 | |
314 | /// Mark all relevant definitions of register \p Reg in usage \p UseMI. |
315 | void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR, |
316 | Register Reg, unsigned SubReg, char Flag, |
317 | std::vector<WorkItem> &Worklist) { |
318 | LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI); |
319 | |
  LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
321 | const VNInfo *Value = UseLRQ.valueIn(); |
322 | if (!Value) |
323 | return; |
324 | |
325 | // Note: this code assumes that lane masks on AMDGPU completely |
326 | // cover registers. |
  const LaneBitmask UseLanes =
      SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
             : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
                                : LaneBitmask::getNone());
331 | |
332 | // Perform a depth-first iteration of the LiveRange graph marking defs. |
333 | // Stop processing of a given branch when all use lanes have been defined. |
334 | // The first definition stops processing for a physical register. |
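  // Lane tracking matters for partial (subregister) definitions: e.g. a use of
  // a 64-bit register written by two 32-bit subregister defs is only complete
  // once both halves of the use mask have been defined.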
335 | struct PhiEntry { |
336 | const VNInfo *Phi; |
337 | unsigned PredIdx; |
338 | LaneBitmask DefinedLanes; |
339 | |
340 | PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes) |
341 | : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {} |
342 | }; |
343 | using VisitKey = std::pair<const VNInfo *, LaneBitmask>; |
344 | SmallVector<PhiEntry, 2> PhiStack; |
345 | SmallSet<VisitKey, 4> Visited; |
346 | LaneBitmask DefinedLanes; |
347 | unsigned NextPredIdx = 0; // Only used for processing phi nodes |
348 | do { |
349 | const VNInfo *NextValue = nullptr; |
350 | const VisitKey Key(Value, DefinedLanes); |
351 | |
    if (Visited.insert(Key).second) {
353 | // On first visit to a phi then start processing first predecessor |
354 | NextPredIdx = 0; |
355 | } |
356 | |
357 | if (Value->isPHIDef()) { |
358 | // Each predecessor node in the phi must be processed as a subgraph |
      const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
      assert(MBB && "Phi-def has no defining MBB");
361 | |
362 | // Find next predecessor to process |
363 | unsigned Idx = NextPredIdx; |
364 | auto PI = MBB->pred_begin() + Idx; |
365 | auto PE = MBB->pred_end(); |
366 | for (; PI != PE && !NextValue; ++PI, ++Idx) { |
        if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
          if (!Visited.count(VisitKey(VN, DefinedLanes)))
369 | NextValue = VN; |
370 | } |
371 | } |
372 | |
373 | // If there are more predecessors to process; add phi to stack |
374 | if (PI != PE) |
        PhiStack.emplace_back(Value, Idx, DefinedLanes);
376 | } else { |
      MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
      assert(MI && "Def has no defining instruction");
379 | |
380 | if (Reg.isVirtual()) { |
381 | // Iterate over all operands to find relevant definitions |
382 | bool HasDef = false; |
383 | for (const MachineOperand &Op : MI->all_defs()) { |
384 | if (Op.getReg() != Reg) |
385 | continue; |
386 | |
387 | // Compute lanes defined and overlap with use |
          LaneBitmask OpLanes =
              Op.isUndef() ? LaneBitmask::getAll()
                           : TRI->getSubRegIndexLaneMask(Op.getSubReg());
391 | LaneBitmask Overlap = (UseLanes & OpLanes); |
392 | |
393 | // Record if this instruction defined any of use |
394 | HasDef |= Overlap.any(); |
395 | |
396 | // Mark any lanes defined |
397 | DefinedLanes |= OpLanes; |
398 | } |
399 | |
400 | // Check if all lanes of use have been defined |
401 | if ((DefinedLanes & UseLanes) != UseLanes) { |
402 | // Definition not complete; need to process input value |
          LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
          if (const VNInfo *VN = LRQ.valueIn()) {
            if (!Visited.count(VisitKey(VN, DefinedLanes)))
406 | NextValue = VN; |
407 | } |
408 | } |
409 | |
410 | // Only mark the instruction if it defines some part of the use |
411 | if (HasDef) |
          markInstruction(*MI, Flag, Worklist);
413 | } else { |
414 | // For physical registers simply mark the defining instruction |
        markInstruction(*MI, Flag, Worklist);
416 | } |
417 | } |
418 | |
419 | if (!NextValue && !PhiStack.empty()) { |
420 | // Reach end of chain; revert to processing last phi |
421 | PhiEntry &Entry = PhiStack.back(); |
422 | NextValue = Entry.Phi; |
423 | NextPredIdx = Entry.PredIdx; |
424 | DefinedLanes = Entry.DefinedLanes; |
425 | PhiStack.pop_back(); |
426 | } |
427 | |
428 | Value = NextValue; |
429 | } while (Value); |
430 | } |
431 | |
432 | void SIWholeQuadMode::markOperand(const MachineInstr &MI, |
433 | const MachineOperand &Op, char Flag, |
434 | std::vector<WorkItem> &Worklist) { |
435 | assert(Op.isReg()); |
436 | Register Reg = Op.getReg(); |
437 | |
438 | // Ignore some hardware registers |
439 | switch (Reg) { |
440 | case AMDGPU::EXEC: |
441 | case AMDGPU::EXEC_LO: |
442 | return; |
443 | default: |
444 | break; |
445 | } |
446 | |
447 | LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op |
448 | << " for " << MI); |
449 | if (Reg.isVirtual()) { |
450 | LiveRange &LR = LIS->getInterval(Reg); |
    markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
452 | } else { |
453 | // Handle physical registers that we need to track; this is mostly relevant |
454 | // for VCC, which can appear as the (implicit) input of a uniform branch, |
455 | // e.g. when a loop counter is stored in a VGPR. |
    for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
      LiveRange &LR = LIS->getRegUnit(Unit);
      const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
      if (Value)
        markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
461 | } |
462 | } |
463 | } |
464 | |
465 | /// Mark all instructions defining the uses in \p MI with \p Flag. |
466 | void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag, |
467 | std::vector<WorkItem> &Worklist) { |
468 | LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": " |
469 | << MI); |
470 | |
471 | for (const MachineOperand &Use : MI.all_uses()) |
    markOperand(MI, Use, Flag, Worklist);
473 | } |
474 | |
475 | // Scan instructions to determine which ones require an Exact execmask and |
476 | // which ones seed WQM requirements. |
477 | char SIWholeQuadMode::scanInstructions(MachineFunction &MF, |
478 | std::vector<WorkItem> &Worklist) { |
479 | char GlobalFlags = 0; |
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
481 | SmallVector<MachineInstr *, 4> SetInactiveInstrs; |
482 | SmallVector<MachineInstr *, 4> SoftWQMInstrs; |
483 | bool HasImplicitDerivatives = |
484 | MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; |
485 | |
486 | // We need to visit the basic blocks in reverse post-order so that we visit |
487 | // defs before uses, in particular so that we don't accidentally mark an |
488 | // instruction as needing e.g. WQM before visiting it and realizing it needs |
489 | // WQM disabled. |
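  // For example, an atomic whose result feeds an image sample must have its
  // Disabled flags recorded before the sample's WQM requirement is propagated
  // back to it; otherwise the atomic could be incorrectly marked as WQM.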
490 | ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); |
491 | for (MachineBasicBlock *MBB : RPOT) { |
492 | BlockInfo &BBI = Blocks[MBB]; |
493 | |
494 | for (MachineInstr &MI : *MBB) { |
495 | InstrInfo &III = Instructions[&MI]; |
496 | unsigned Opcode = MI.getOpcode(); |
497 | char Flags = 0; |
498 | |
499 | if (TII->isWQM(Opcode)) { |
500 | // If LOD is not supported WQM is not needed. |
501 | // Only generate implicit WQM if implicit derivatives are required. |
502 | // This avoids inserting unintended WQM if a shader type without |
503 | // implicit derivatives uses an image sampling instruction. |
504 | if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) { |
505 | // Sampling instructions don't need to produce results for all pixels |
506 | // in a quad, they just require all inputs of a quad to have been |
507 | // computed for derivatives. |
          markInstructionUses(MI, StateWQM, Worklist);
509 | GlobalFlags |= StateWQM; |
510 | } |
511 | } else if (Opcode == AMDGPU::WQM) { |
512 | // The WQM intrinsic requires its output to have all the helper lanes |
513 | // correct, so we need it to be in WQM. |
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        LowerToCopyInstrs.push_back(&MI);
        SoftWQMInstrs.push_back(&MI);
519 | } else if (Opcode == AMDGPU::STRICT_WWM) { |
        // The STRICT_WWM intrinsic doesn't make the same guarantee, and it
        // also needs to be executed in WQM or Exact so that its copy doesn't
        // clobber inactive lanes.
        markInstructionUses(MI, StateStrictWWM, Worklist);
        GlobalFlags |= StateStrictWWM;
        LowerToMovInstrs.push_back(&MI);
526 | } else if (Opcode == AMDGPU::STRICT_WQM || |
527 | TII->isDualSourceBlendEXP(MI)) { |
528 | // STRICT_WQM is similar to STRICTWWM, but instead of enabling all |
529 | // threads of the wave like STRICTWWM, STRICT_WQM enables all threads in |
530 | // quads that have at least one active thread. |
        markInstructionUses(MI, StateStrictWQM, Worklist);
        GlobalFlags |= StateStrictWQM;

        if (Opcode == AMDGPU::STRICT_WQM) {
          LowerToMovInstrs.push_back(&MI);
        } else {
          // Dual source blend export acts as an implicit strict-WQM; its
          // sources need to be shuffled in strict WQM, but the export itself
          // needs to run in exact mode.
          BBI.Needs |= StateExact;
          if (!(BBI.InNeeds & StateExact)) {
            BBI.InNeeds |= StateExact;
            Worklist.emplace_back(MBB);
544 | } |
545 | GlobalFlags |= StateExact; |
546 | III.Disabled = StateWQM | StateStrict; |
547 | } |
548 | } else if (Opcode == AMDGPU::LDS_PARAM_LOAD || |
549 | Opcode == AMDGPU::DS_PARAM_LOAD || |
550 | Opcode == AMDGPU::LDS_DIRECT_LOAD || |
551 | Opcode == AMDGPU::DS_DIRECT_LOAD) { |
        // Mark these as StrictWQM, but only for the instruction, not its
        // operands. This avoids unnecessarily marking M0 as requiring WQM.
554 | III.Needs |= StateStrictWQM; |
555 | GlobalFlags |= StateStrictWQM; |
556 | } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 || |
557 | Opcode == AMDGPU::V_SET_INACTIVE_B64) { |
558 | III.Disabled = StateStrict; |
559 | MachineOperand &Inactive = MI.getOperand(i: 2); |
560 | if (Inactive.isReg()) { |
561 | if (Inactive.isUndef()) { |
            LowerToCopyInstrs.push_back(&MI);
          } else {
            markOperand(MI, Inactive, StateStrictWWM, Worklist);
          }
        }
        SetInactiveInstrs.push_back(&MI);
568 | } else if (TII->isDisableWQM(MI)) { |
569 | BBI.Needs |= StateExact; |
570 | if (!(BBI.InNeeds & StateExact)) { |
571 | BBI.InNeeds |= StateExact; |
572 | Worklist.emplace_back(args&: MBB); |
573 | } |
574 | GlobalFlags |= StateExact; |
575 | III.Disabled = StateWQM | StateStrict; |
576 | } else if (Opcode == AMDGPU::SI_PS_LIVE || |
577 | Opcode == AMDGPU::SI_LIVE_MASK) { |
578 | LiveMaskQueries.push_back(Elt: &MI); |
579 | } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR || |
580 | Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR || |
581 | Opcode == AMDGPU::SI_DEMOTE_I1) { |
582 | KillInstrs.push_back(Elt: &MI); |
583 | BBI.NeedsLowering = true; |
584 | } else if (Opcode == AMDGPU::SI_INIT_EXEC || |
585 | Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) { |
586 | InitExecInstrs.push_back(Elt: &MI); |
587 | } else if (WQMOutputs) { |
588 | // The function is in machine SSA form, which means that physical |
589 | // VGPRs correspond to shader inputs and outputs. Inputs are |
590 | // only used, outputs are only defined. |
591 | // FIXME: is this still valid? |
592 | for (const MachineOperand &MO : MI.defs()) { |
593 | Register Reg = MO.getReg(); |
594 | if (Reg.isPhysical() && |
595 | TRI->hasVectorRegisters(RC: TRI->getPhysRegBaseClass(Reg))) { |
596 | Flags = StateWQM; |
597 | break; |
598 | } |
599 | } |
600 | } |
601 | |
602 | if (Flags) { |
603 | markInstruction(MI, Flag: Flags, Worklist); |
604 | GlobalFlags |= Flags; |
605 | } |
606 | } |
607 | } |
608 | |
  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }
619 | |
620 | return GlobalFlags; |
621 | } |
622 | |
623 | void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, |
624 | std::vector<WorkItem>& Worklist) { |
625 | MachineBasicBlock *MBB = MI.getParent(); |
626 | InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references |
627 | BlockInfo &BI = Blocks[MBB]; |
628 | |
629 | // Control flow-type instructions and stores to temporary memory that are |
630 | // followed by WQM computations must themselves be in WQM. |
631 | if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) && |
632 | (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) { |
633 | Instructions[&MI].Needs = StateWQM; |
634 | II.Needs = StateWQM; |
635 | } |
636 | |
637 | // Propagate to block level |
638 | if (II.Needs & StateWQM) { |
639 | BI.Needs |= StateWQM; |
640 | if (!(BI.InNeeds & StateWQM)) { |
641 | BI.InNeeds |= StateWQM; |
642 | Worklist.emplace_back(args&: MBB); |
643 | } |
644 | } |
645 | |
646 | // Propagate backwards within block |
647 | if (MachineInstr *PrevMI = MI.getPrevNode()) { |
648 | char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds; |
649 | if (!PrevMI->isPHI()) { |
650 | InstrInfo &PrevII = Instructions[PrevMI]; |
651 | if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) { |
652 | PrevII.OutNeeds |= InNeeds; |
653 | Worklist.emplace_back(args&: PrevMI); |
654 | } |
655 | } |
656 | } |
657 | |
658 | // Propagate WQM flag to instruction inputs |
659 | assert(!(II.Needs & StateExact)); |
660 | |
661 | if (II.Needs != 0) |
662 | markInstructionUses(MI, Flag: II.Needs, Worklist); |
663 | |
664 | // Ensure we process a block containing StrictWWM/StrictWQM, even if it does |
665 | // not require any WQM transitions. |
666 | if (II.Needs & StateStrictWWM) |
667 | BI.Needs |= StateStrictWWM; |
668 | if (II.Needs & StateStrictWQM) |
669 | BI.Needs |= StateStrictWQM; |
670 | } |
671 | |
672 | void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB, |
673 | std::vector<WorkItem>& Worklist) { |
674 | BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references. |
675 | |
676 | // Propagate through instructions |
677 | if (!MBB.empty()) { |
678 | MachineInstr *LastMI = &*MBB.rbegin(); |
679 | InstrInfo &LastII = Instructions[LastMI]; |
680 | if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) { |
681 | LastII.OutNeeds |= BI.OutNeeds; |
682 | Worklist.emplace_back(args&: LastMI); |
683 | } |
684 | } |
685 | |
686 | // Predecessor blocks must provide for our WQM/Exact needs. |
687 | for (MachineBasicBlock *Pred : MBB.predecessors()) { |
688 | BlockInfo &PredBI = Blocks[Pred]; |
689 | if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds) |
690 | continue; |
691 | |
692 | PredBI.OutNeeds |= BI.InNeeds; |
693 | PredBI.InNeeds |= BI.InNeeds; |
694 | Worklist.emplace_back(args&: Pred); |
695 | } |
696 | |
697 | // All successors must be prepared to accept the same set of WQM/Exact data. |
698 | for (MachineBasicBlock *Succ : MBB.successors()) { |
699 | BlockInfo &SuccBI = Blocks[Succ]; |
700 | if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds) |
701 | continue; |
702 | |
703 | SuccBI.InNeeds |= BI.OutNeeds; |
704 | Worklist.emplace_back(args&: Succ); |
705 | } |
706 | } |
707 | |
708 | char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) { |
709 | std::vector<WorkItem> Worklist; |
710 | char GlobalFlags = scanInstructions(MF, Worklist); |
711 | |
712 | while (!Worklist.empty()) { |
713 | WorkItem WI = Worklist.back(); |
714 | Worklist.pop_back(); |
715 | |
716 | if (WI.MI) |
717 | propagateInstruction(MI&: *WI.MI, Worklist); |
718 | else |
719 | propagateBlock(MBB&: *WI.MBB, Worklist); |
720 | } |
721 | |
722 | return GlobalFlags; |
723 | } |
724 | |
725 | MachineBasicBlock::iterator |
726 | SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB, |
727 | MachineBasicBlock::iterator Before) { |
  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);
740 | |
741 | return Restore; |
742 | } |
743 | |
744 | MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB, |
745 | MachineInstr *TermMI) { |
746 | LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ " |
747 | << *TermMI << "\n" ); |
748 | |
749 | MachineBasicBlock *SplitBB = |
750 | BB->splitAt(SplitInst&: *TermMI, /*UpdateLiveIns*/ true, LIS); |
751 | |
752 | // Convert last instruction in block to a terminator. |
753 | // Note: this only covers the expected patterns |
754 | unsigned NewOpcode = 0; |
755 | switch (TermMI->getOpcode()) { |
756 | case AMDGPU::S_AND_B32: |
757 | NewOpcode = AMDGPU::S_AND_B32_term; |
758 | break; |
759 | case AMDGPU::S_AND_B64: |
760 | NewOpcode = AMDGPU::S_AND_B64_term; |
761 | break; |
762 | case AMDGPU::S_MOV_B32: |
763 | NewOpcode = AMDGPU::S_MOV_B32_term; |
764 | break; |
765 | case AMDGPU::S_MOV_B64: |
766 | NewOpcode = AMDGPU::S_MOV_B64_term; |
767 | break; |
768 | default: |
769 | break; |
770 | } |
771 | if (NewOpcode) |
772 | TermMI->setDesc(TII->get(Opcode: NewOpcode)); |
773 | |
774 | if (SplitBB != BB) { |
775 | // Update dominator trees |
776 | using DomTreeT = DomTreeBase<MachineBasicBlock>; |
777 | SmallVector<DomTreeT::UpdateType, 16> DTUpdates; |
778 | for (MachineBasicBlock *Succ : SplitBB->successors()) { |
779 | DTUpdates.push_back(Elt: {DomTreeT::Insert, SplitBB, Succ}); |
780 | DTUpdates.push_back(Elt: {DomTreeT::Delete, BB, Succ}); |
781 | } |
782 | DTUpdates.push_back(Elt: {DomTreeT::Insert, BB, SplitBB}); |
783 | if (MDT) |
784 | MDT->getBase().applyUpdates(Updates: DTUpdates); |
785 | if (PDT) |
786 | PDT->applyUpdates(Updates: DTUpdates); |
787 | |
788 | // Link blocks |
789 | MachineInstr *MI = |
790 | BuildMI(BB&: *BB, I: BB->end(), MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_BRANCH)) |
791 | .addMBB(MBB: SplitBB); |
792 | LIS->InsertMachineInstrInMaps(MI&: *MI); |
793 | } |
794 | |
795 | return SplitBB; |
796 | } |
797 | |
798 | MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB, |
799 | MachineInstr &MI) { |
800 | assert(LiveMaskReg.isVirtual()); |
801 | |
802 | const DebugLoc &DL = MI.getDebugLoc(); |
803 | unsigned Opcode = 0; |
804 | |
805 | assert(MI.getOperand(0).isReg()); |
806 | |
807 | // Comparison is for live lanes; however here we compute the inverse |
808 | // (killed lanes). This is because VCMP will always generate 0 bits |
809 | // for inactive lanes so a mask of live lanes would not be correct |
810 | // inside control flow. |
811 | // Invert the comparison by swapping the operands and adjusting |
812 | // the comparison codes. |
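  // For example, a kill predicated on SETOLT (lanes are live if Op0 < Op1) is
  // lowered to V_CMP_NGT_F32 with the operands swapped, so VCC holds
  // !(Op0 < Op1), i.e. exactly the lanes to kill.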
813 | |
814 | switch (MI.getOperand(i: 2).getImm()) { |
815 | case ISD::SETUEQ: |
816 | Opcode = AMDGPU::V_CMP_LG_F32_e64; |
817 | break; |
818 | case ISD::SETUGT: |
819 | Opcode = AMDGPU::V_CMP_GE_F32_e64; |
820 | break; |
821 | case ISD::SETUGE: |
822 | Opcode = AMDGPU::V_CMP_GT_F32_e64; |
823 | break; |
824 | case ISD::SETULT: |
825 | Opcode = AMDGPU::V_CMP_LE_F32_e64; |
826 | break; |
827 | case ISD::SETULE: |
828 | Opcode = AMDGPU::V_CMP_LT_F32_e64; |
829 | break; |
830 | case ISD::SETUNE: |
831 | Opcode = AMDGPU::V_CMP_EQ_F32_e64; |
832 | break; |
833 | case ISD::SETO: |
834 | Opcode = AMDGPU::V_CMP_O_F32_e64; |
835 | break; |
836 | case ISD::SETUO: |
837 | Opcode = AMDGPU::V_CMP_U_F32_e64; |
838 | break; |
839 | case ISD::SETOEQ: |
840 | case ISD::SETEQ: |
841 | Opcode = AMDGPU::V_CMP_NEQ_F32_e64; |
842 | break; |
843 | case ISD::SETOGT: |
844 | case ISD::SETGT: |
845 | Opcode = AMDGPU::V_CMP_NLT_F32_e64; |
846 | break; |
847 | case ISD::SETOGE: |
848 | case ISD::SETGE: |
849 | Opcode = AMDGPU::V_CMP_NLE_F32_e64; |
850 | break; |
851 | case ISD::SETOLT: |
852 | case ISD::SETLT: |
853 | Opcode = AMDGPU::V_CMP_NGT_F32_e64; |
854 | break; |
855 | case ISD::SETOLE: |
856 | case ISD::SETLE: |
857 | Opcode = AMDGPU::V_CMP_NGE_F32_e64; |
858 | break; |
859 | case ISD::SETONE: |
860 | case ISD::SETNE: |
861 | Opcode = AMDGPU::V_CMP_NLG_F32_e64; |
862 | break; |
863 | default: |
864 | llvm_unreachable("invalid ISD:SET cond code" ); |
865 | } |
866 | |
867 | // Pick opcode based on comparison type. |
868 | MachineInstr *VcmpMI; |
869 | const MachineOperand &Op0 = MI.getOperand(i: 0); |
870 | const MachineOperand &Op1 = MI.getOperand(i: 1); |
871 | |
872 | // VCC represents lanes killed. |
873 | Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; |
874 | |
875 | if (TRI->isVGPR(MRI: *MRI, Reg: Op0.getReg())) { |
876 | Opcode = AMDGPU::getVOPe32(Opcode); |
877 | VcmpMI = BuildMI(BB&: MBB, I: &MI, MIMD: DL, MCID: TII->get(Opcode)).add(MO: Op1).add(MO: Op0); |
878 | } else { |
879 | VcmpMI = BuildMI(BB&: MBB, I: &MI, MIMD: DL, MCID: TII->get(Opcode)) |
880 | .addReg(RegNo: VCC, flags: RegState::Define) |
881 | .addImm(Val: 0) // src0 modifiers |
882 | .add(MO: Op1) |
883 | .addImm(Val: 0) // src1 modifiers |
884 | .add(MO: Op0) |
885 | .addImm(Val: 0); // omod |
886 | } |
887 | |
888 | MachineInstr *MaskUpdateMI = |
889 | BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AndN2Opc), DestReg: LiveMaskReg) |
890 | .addReg(RegNo: LiveMaskReg) |
891 | .addReg(RegNo: VCC); |
892 | |
  // The state of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
895 | MachineInstr *EarlyTermMI = |
896 | BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_EARLY_TERMINATE_SCC0)); |
897 | |
898 | MachineInstr *ExecMaskMI = |
899 | BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AndN2Opc), DestReg: Exec).addReg(RegNo: Exec).addReg(RegNo: VCC); |
900 | |
901 | assert(MBB.succ_size() == 1); |
902 | MachineInstr *NewTerm = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_BRANCH)) |
903 | .addMBB(MBB: *MBB.succ_begin()); |
904 | |
905 | // Update live intervals |
906 | LIS->ReplaceMachineInstrInMaps(MI, NewMI&: *VcmpMI); |
907 | MBB.remove(I: &MI); |
908 | |
909 | LIS->InsertMachineInstrInMaps(MI&: *MaskUpdateMI); |
910 | LIS->InsertMachineInstrInMaps(MI&: *ExecMaskMI); |
911 | LIS->InsertMachineInstrInMaps(MI&: *EarlyTermMI); |
912 | LIS->InsertMachineInstrInMaps(MI&: *NewTerm); |
913 | |
914 | return NewTerm; |
915 | } |
916 | |
917 | MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB, |
918 | MachineInstr &MI, bool IsWQM) { |
919 | assert(LiveMaskReg.isVirtual()); |
920 | |
921 | const DebugLoc &DL = MI.getDebugLoc(); |
922 | MachineInstr *MaskUpdateMI = nullptr; |
923 | |
924 | const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1); |
925 | const MachineOperand &Op = MI.getOperand(i: 0); |
926 | int64_t KillVal = MI.getOperand(i: 1).getImm(); |
927 | MachineInstr *ComputeKilledMaskMI = nullptr; |
928 | Register CndReg = !Op.isImm() ? Op.getReg() : Register(); |
929 | Register TmpReg; |
930 | |
931 | // Is this a static or dynamic kill? |
932 | if (Op.isImm()) { |
933 | if (Op.getImm() == KillVal) { |
934 | // Static: all active lanes are killed |
935 | MaskUpdateMI = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AndN2Opc), DestReg: LiveMaskReg) |
936 | .addReg(RegNo: LiveMaskReg) |
937 | .addReg(RegNo: Exec); |
938 | } else { |
939 | // Static: kill does nothing |
940 | MachineInstr *NewTerm = nullptr; |
941 | if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) { |
942 | LIS->RemoveMachineInstrFromMaps(MI); |
943 | } else { |
944 | assert(MBB.succ_size() == 1); |
945 | NewTerm = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_BRANCH)) |
946 | .addMBB(MBB: *MBB.succ_begin()); |
947 | LIS->ReplaceMachineInstrInMaps(MI, NewMI&: *NewTerm); |
948 | } |
949 | MBB.remove(I: &MI); |
950 | return NewTerm; |
951 | } |
952 | } else { |
953 | if (!KillVal) { |
954 | // Op represents live lanes after kill, |
955 | // so exec mask needs to be factored in. |
956 | TmpReg = MRI->createVirtualRegister(RegClass: TRI->getBoolRC()); |
957 | ComputeKilledMaskMI = |
958 | BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: XorOpc), DestReg: TmpReg).add(MO: Op).addReg(RegNo: Exec); |
959 | MaskUpdateMI = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AndN2Opc), DestReg: LiveMaskReg) |
960 | .addReg(RegNo: LiveMaskReg) |
961 | .addReg(RegNo: TmpReg); |
962 | } else { |
963 | // Op represents lanes to kill |
964 | MaskUpdateMI = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AndN2Opc), DestReg: LiveMaskReg) |
965 | .addReg(RegNo: LiveMaskReg) |
966 | .add(MO: Op); |
967 | } |
968 | } |
969 | |
  // The state of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
972 | MachineInstr *EarlyTermMI = |
973 | BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_EARLY_TERMINATE_SCC0)); |
974 | |
  // If we got this far, some lanes are still live;
  // update EXEC to deactivate lanes as appropriate.
977 | MachineInstr *NewTerm; |
978 | MachineInstr *WQMMaskMI = nullptr; |
979 | Register LiveMaskWQM; |
980 | if (IsDemote) { |
981 | // Demote - deactivate quads with only helper lanes |
982 | LiveMaskWQM = MRI->createVirtualRegister(RegClass: TRI->getBoolRC()); |
983 | WQMMaskMI = |
984 | BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: WQMOpc), DestReg: LiveMaskWQM).addReg(RegNo: LiveMaskReg); |
985 | NewTerm = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AndOpc), DestReg: Exec) |
986 | .addReg(RegNo: Exec) |
987 | .addReg(RegNo: LiveMaskWQM); |
988 | } else { |
989 | // Kill - deactivate lanes no longer in live mask |
990 | if (Op.isImm()) { |
991 | unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
992 | NewTerm = BuildMI(BB&: MBB, I: &MI, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: Exec).addImm(Val: 0); |
993 | } else if (!IsWQM) { |
994 | NewTerm = BuildMI(BB&: MBB, I: &MI, MIMD: DL, MCID: TII->get(Opcode: AndOpc), DestReg: Exec) |
995 | .addReg(RegNo: Exec) |
996 | .addReg(RegNo: LiveMaskReg); |
997 | } else { |
998 | unsigned Opcode = KillVal ? AndN2Opc : AndOpc; |
999 | NewTerm = |
1000 | BuildMI(BB&: MBB, I: &MI, MIMD: DL, MCID: TII->get(Opcode), DestReg: Exec).addReg(RegNo: Exec).add(MO: Op); |
1001 | } |
1002 | } |
1003 | |
1004 | // Update live intervals |
1005 | LIS->RemoveMachineInstrFromMaps(MI); |
1006 | MBB.remove(I: &MI); |
1007 | assert(EarlyTermMI); |
1008 | assert(MaskUpdateMI); |
1009 | assert(NewTerm); |
1010 | if (ComputeKilledMaskMI) |
1011 | LIS->InsertMachineInstrInMaps(MI&: *ComputeKilledMaskMI); |
1012 | LIS->InsertMachineInstrInMaps(MI&: *MaskUpdateMI); |
1013 | LIS->InsertMachineInstrInMaps(MI&: *EarlyTermMI); |
1014 | if (WQMMaskMI) |
1015 | LIS->InsertMachineInstrInMaps(MI&: *WQMMaskMI); |
1016 | LIS->InsertMachineInstrInMaps(MI&: *NewTerm); |
1017 | |
1018 | if (CndReg) { |
1019 | LIS->removeInterval(Reg: CndReg); |
1020 | LIS->createAndComputeVirtRegInterval(Reg: CndReg); |
1021 | } |
1022 | if (TmpReg) |
1023 | LIS->createAndComputeVirtRegInterval(Reg: TmpReg); |
1024 | if (LiveMaskWQM) |
1025 | LIS->createAndComputeVirtRegInterval(Reg: LiveMaskWQM); |
1026 | |
1027 | return NewTerm; |
1028 | } |
1029 | |
// Replace (or supplement) instructions accessing the live mask.
// This can only be done once all the live mask registers have been created
// and the execution state (WQM/StrictWWM/Exact) of each instruction is known.
1033 | void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { |
1034 | auto BII = Blocks.find(Key: &MBB); |
1035 | if (BII == Blocks.end()) |
1036 | return; |
1037 | |
1038 | const BlockInfo &BI = BII->second; |
1039 | if (!BI.NeedsLowering) |
1040 | return; |
1041 | |
1042 | LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n" ); |
1043 | |
1044 | SmallVector<MachineInstr *, 4> SplitPoints; |
1045 | char State = BI.InitialState; |
1046 | |
1047 | for (MachineInstr &MI : llvm::make_early_inc_range( |
1048 | Range: llvm::make_range(x: MBB.getFirstNonPHI(), y: MBB.end()))) { |
1049 | if (StateTransition.count(Val: &MI)) |
1050 | State = StateTransition[&MI]; |
1051 | |
1052 | MachineInstr *SplitPoint = nullptr; |
1053 | switch (MI.getOpcode()) { |
1054 | case AMDGPU::SI_DEMOTE_I1: |
1055 | case AMDGPU::SI_KILL_I1_TERMINATOR: |
1056 | SplitPoint = lowerKillI1(MBB, MI, IsWQM: State == StateWQM); |
1057 | break; |
1058 | case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: |
1059 | SplitPoint = lowerKillF32(MBB, MI); |
1060 | break; |
1061 | default: |
1062 | break; |
1063 | } |
1064 | if (SplitPoint) |
1065 | SplitPoints.push_back(Elt: SplitPoint); |
1066 | } |
1067 | |
1068 | // Perform splitting after instruction scan to simplify iteration. |
1069 | if (!SplitPoints.empty()) { |
1070 | MachineBasicBlock *BB = &MBB; |
1071 | for (MachineInstr *MI : SplitPoints) { |
1072 | BB = splitBlock(BB, TermMI: MI); |
1073 | } |
1074 | } |
1075 | } |
1076 | |
1077 | // Return an iterator in the (inclusive) range [First, Last] at which |
1078 | // instructions can be safely inserted, keeping in mind that some of the |
1079 | // instructions we want to add necessarily clobber SCC. |
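// Roughly: walk the live segments of SCC looking for a point in the range
// where SCC is dead; if the chosen point still lies within a live segment of
// SCC, SCC is saved and restored around it (see saveSCC).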
1080 | MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( |
1081 | MachineBasicBlock &MBB, MachineBasicBlock::iterator First, |
1082 | MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) { |
1083 | if (!SaveSCC) |
1084 | return PreferLast ? Last : First; |
1085 | |
1086 | LiveRange &LR = |
1087 | LIS->getRegUnit(Unit: *TRI->regunits(Reg: MCRegister::from(Val: AMDGPU::SCC)).begin()); |
1088 | auto MBBE = MBB.end(); |
1089 | SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(Instr: *First) |
1090 | : LIS->getMBBEndIdx(mbb: &MBB); |
1091 | SlotIndex LastIdx = |
1092 | Last != MBBE ? LIS->getInstructionIndex(Instr: *Last) : LIS->getMBBEndIdx(mbb: &MBB); |
1093 | SlotIndex Idx = PreferLast ? LastIdx : FirstIdx; |
1094 | const LiveRange::Segment *S; |
1095 | |
1096 | for (;;) { |
1097 | S = LR.getSegmentContaining(Idx); |
1098 | if (!S) |
1099 | break; |
1100 | |
1101 | if (PreferLast) { |
1102 | SlotIndex Next = S->start.getBaseIndex(); |
1103 | if (Next < FirstIdx) |
1104 | break; |
1105 | Idx = Next; |
1106 | } else { |
1107 | MachineInstr *EndMI = LIS->getInstructionFromIndex(index: S->end.getBaseIndex()); |
1108 | assert(EndMI && "Segment does not end on valid instruction" ); |
1109 | auto NextI = std::next(x: EndMI->getIterator()); |
1110 | if (NextI == MBB.end()) |
1111 | break; |
1112 | SlotIndex Next = LIS->getInstructionIndex(Instr: *NextI); |
1113 | if (Next > LastIdx) |
1114 | break; |
1115 | Idx = Next; |
1116 | } |
1117 | } |
1118 | |
1119 | MachineBasicBlock::iterator MBBI; |
1120 | |
1121 | if (MachineInstr *MI = LIS->getInstructionFromIndex(index: Idx)) |
1122 | MBBI = MI; |
1123 | else { |
1124 | assert(Idx == LIS->getMBBEndIdx(&MBB)); |
1125 | MBBI = MBB.end(); |
1126 | } |
1127 | |
1128 | // Move insertion point past any operations modifying EXEC. |
1129 | // This assumes that the value of SCC defined by any of these operations |
1130 | // does not need to be preserved. |
1131 | while (MBBI != Last) { |
1132 | bool IsExecDef = false; |
1133 | for (const MachineOperand &MO : MBBI->all_defs()) { |
1134 | IsExecDef |= |
1135 | MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC; |
1136 | } |
1137 | if (!IsExecDef) |
1138 | break; |
1139 | MBBI++; |
1140 | S = nullptr; |
1141 | } |
1142 | |
1143 | if (S) |
1144 | MBBI = saveSCC(MBB, Before: MBBI); |
1145 | |
1146 | return MBBI; |
1147 | } |
1148 | |
1149 | void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, |
1150 | MachineBasicBlock::iterator Before, |
1151 | Register SaveWQM) { |
1152 | assert(LiveMaskReg.isVirtual()); |
1153 | |
1154 | bool IsTerminator = Before == MBB.end(); |
1155 | if (!IsTerminator) { |
1156 | auto FirstTerm = MBB.getFirstTerminator(); |
1157 | if (FirstTerm != MBB.end()) { |
1158 | SlotIndex FirstTermIdx = LIS->getInstructionIndex(Instr: *FirstTerm); |
1159 | SlotIndex BeforeIdx = LIS->getInstructionIndex(Instr: *Before); |
1160 | IsTerminator = BeforeIdx > FirstTermIdx; |
1161 | } |
1162 | } |
1163 | |
1164 | MachineInstr *MI; |
1165 | |
1166 | if (SaveWQM) { |
1167 | unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc; |
1168 | MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode), DestReg: SaveWQM) |
1169 | .addReg(RegNo: LiveMaskReg); |
1170 | } else { |
1171 | unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc; |
1172 | MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode), DestReg: Exec) |
1173 | .addReg(RegNo: Exec) |
1174 | .addReg(RegNo: LiveMaskReg); |
1175 | } |
1176 | |
1177 | LIS->InsertMachineInstrInMaps(MI&: *MI); |
1178 | StateTransition[MI] = StateExact; |
1179 | } |
1180 | |
1181 | void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, |
1182 | MachineBasicBlock::iterator Before, |
1183 | Register SavedWQM) { |
1184 | MachineInstr *MI; |
1185 | |
1186 | if (SavedWQM) { |
1187 | MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: Exec) |
1188 | .addReg(RegNo: SavedWQM); |
1189 | } else { |
1190 | MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: WQMOpc), DestReg: Exec).addReg(RegNo: Exec); |
1191 | } |
1192 | |
1193 | LIS->InsertMachineInstrInMaps(MI&: *MI); |
1194 | StateTransition[MI] = StateWQM; |
1195 | } |
1196 | |
1197 | void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB, |
1198 | MachineBasicBlock::iterator Before, |
1199 | Register SaveOrig, char StrictStateNeeded) { |
1200 | MachineInstr *MI; |
1201 | assert(SaveOrig); |
1202 | assert(StrictStateNeeded == StateStrictWWM || |
1203 | StrictStateNeeded == StateStrictWQM); |
1204 | |
1205 | if (StrictStateNeeded == StateStrictWWM) { |
1206 | MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::ENTER_STRICT_WWM), |
1207 | DestReg: SaveOrig) |
1208 | .addImm(Val: -1); |
1209 | } else { |
1210 | MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::ENTER_STRICT_WQM), |
1211 | DestReg: SaveOrig) |
1212 | .addImm(Val: -1); |
1213 | } |
1214 | LIS->InsertMachineInstrInMaps(MI&: *MI); |
1215 | StateTransition[MI] = StrictStateNeeded; |
1216 | } |
1217 | |
1218 | void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB, |
1219 | MachineBasicBlock::iterator Before, |
1220 | Register SavedOrig, char NonStrictState, |
1221 | char CurrentStrictState) { |
1222 | MachineInstr *MI; |
1223 | |
1224 | assert(SavedOrig); |
1225 | assert(CurrentStrictState == StateStrictWWM || |
1226 | CurrentStrictState == StateStrictWQM); |
1227 | |
1228 | if (CurrentStrictState == StateStrictWWM) { |
1229 | MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::EXIT_STRICT_WWM), |
1230 | DestReg: Exec) |
1231 | .addReg(RegNo: SavedOrig); |
1232 | } else { |
1233 | MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::EXIT_STRICT_WQM), |
1234 | DestReg: Exec) |
1235 | .addReg(RegNo: SavedOrig); |
1236 | } |
1237 | LIS->InsertMachineInstrInMaps(MI&: *MI); |
1238 | StateTransition[MI] = NonStrictState; |
1239 | } |
1240 | |
1241 | void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) { |
1242 | auto BII = Blocks.find(Key: &MBB); |
1243 | if (BII == Blocks.end()) |
1244 | return; |
1245 | |
1246 | BlockInfo &BI = BII->second; |
1247 | |
1248 | // This is a non-entry block that is WQM throughout, so no need to do |
1249 | // anything. |
1250 | if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) { |
1251 | BI.InitialState = StateWQM; |
1252 | return; |
1253 | } |
1254 | |
1255 | LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) |
1256 | << ":\n" ); |
1257 | |
1258 | Register SavedWQMReg; |
1259 | Register SavedNonStrictReg; |
1260 | bool WQMFromExec = IsEntry; |
1261 | char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM; |
1262 | char NonStrictState = 0; |
1263 | const TargetRegisterClass *BoolRC = TRI->getBoolRC(); |
1264 | |
1265 | auto II = MBB.getFirstNonPHI(), IE = MBB.end(); |
1266 | if (IsEntry) { |
1267 | // Skip the instruction that saves LiveMask |
1268 | if (II != IE && II->getOpcode() == AMDGPU::COPY && |
1269 | II->getOperand(i: 1).getReg() == TRI->getExec()) |
1270 | ++II; |
1271 | } |
1272 | |
1273 | // This stores the first instruction where it's safe to switch from WQM to |
1274 | // Exact or vice versa. |
1275 | MachineBasicBlock::iterator FirstWQM = IE; |
1276 | |
1277 | // This stores the first instruction where it's safe to switch from Strict |
1278 | // mode to Exact/WQM or to switch to Strict mode. It must always be the same |
1279 | // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must |
1280 | // be safe to switch to/from WQM as well. |
1281 | MachineBasicBlock::iterator FirstStrict = IE; |
1282 | |
  // Record the initial state in the block information.
1284 | BI.InitialState = State; |
1285 | |
1286 | for (;;) { |
1287 | MachineBasicBlock::iterator Next = II; |
1288 | char Needs = StateExact | StateWQM; // Strict mode is disabled by default. |
1289 | char OutNeeds = 0; |
1290 | |
1291 | if (FirstWQM == IE) |
1292 | FirstWQM = II; |
1293 | |
1294 | if (FirstStrict == IE) |
1295 | FirstStrict = II; |
1296 | |
1297 | // First, figure out the allowed states (Needs) based on the propagated |
1298 | // flags. |
1299 | if (II != IE) { |
1300 | MachineInstr &MI = *II; |
1301 | |
1302 | if (MI.isTerminator() || TII->mayReadEXEC(MRI: *MRI, MI)) { |
1303 | auto III = Instructions.find(Val: &MI); |
1304 | if (III != Instructions.end()) { |
1305 | if (III->second.Needs & StateStrictWWM) |
1306 | Needs = StateStrictWWM; |
1307 | else if (III->second.Needs & StateStrictWQM) |
1308 | Needs = StateStrictWQM; |
1309 | else if (III->second.Needs & StateWQM) |
1310 | Needs = StateWQM; |
1311 | else |
1312 | Needs &= ~III->second.Disabled; |
1313 | OutNeeds = III->second.OutNeeds; |
1314 | } |
1315 | } else { |
1316 | // If the instruction doesn't actually need a correct EXEC, then we can |
1317 | // safely leave Strict mode enabled. |
1318 | Needs = StateExact | StateWQM | StateStrict; |
1319 | } |
1320 | |
1321 | // Exact mode exit can occur in terminators, but must be before branches. |
1322 | if (MI.isBranch() && OutNeeds == StateExact) |
1323 | Needs = StateExact; |
1324 | |
1325 | ++Next; |
1326 | } else { |
1327 | // End of basic block |
1328 | if (BI.OutNeeds & StateWQM) |
1329 | Needs = StateWQM; |
1330 | else if (BI.OutNeeds == StateExact) |
1331 | Needs = StateExact; |
1332 | else |
1333 | Needs = StateWQM | StateExact; |
1334 | } |
1335 | |
1336 | // Now, transition if necessary. |
1337 | if (!(Needs & State)) { |
1338 | MachineBasicBlock::iterator First; |
1339 | if (State == StateStrictWWM || Needs == StateStrictWWM || |
1340 | State == StateStrictWQM || Needs == StateStrictWQM) { |
1341 | // We must switch to or from Strict mode. |
1342 | First = FirstStrict; |
1343 | } else { |
1344 | // We only need to switch to/from WQM, so we can use FirstWQM. |
1345 | First = FirstWQM; |
1346 | } |
1347 | |
1348 | // Whether we need to save SCC depends on start and end states. |
1349 | bool SaveSCC = false; |
1350 | switch (State) { |
1351 | case StateExact: |
1352 | case StateStrictWWM: |
1353 | case StateStrictWQM: |
1354 | // Exact/Strict -> Strict: save SCC |
1355 | // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec |
1356 | // Exact/Strict -> Exact: no save |
1357 | SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec); |
1358 | break; |
1359 | case StateWQM: |
1360 | // WQM -> Exact/Strict: save SCC |
1361 | SaveSCC = !(Needs & StateWQM); |
1362 | break; |
1363 | default: |
1364 | llvm_unreachable("Unknown state" ); |
1365 | break; |
1366 | } |
1367 | MachineBasicBlock::iterator Before = |
1368 | prepareInsertion(MBB, First, Last: II, PreferLast: Needs == StateWQM, SaveSCC); |
1369 | |
1370 | if (State & StateStrict) { |
1371 | assert(State == StateStrictWWM || State == StateStrictWQM); |
1372 | assert(SavedNonStrictReg); |
1373 | fromStrictMode(MBB, Before, SavedOrig: SavedNonStrictReg, NonStrictState, CurrentStrictState: State); |
1374 | |
1375 | LIS->createAndComputeVirtRegInterval(Reg: SavedNonStrictReg); |
1376 | SavedNonStrictReg = 0; |
1377 | State = NonStrictState; |
1378 | } |
1379 | |
1380 | if (Needs & StateStrict) { |
1381 | NonStrictState = State; |
1382 | assert(Needs == StateStrictWWM || Needs == StateStrictWQM); |
1383 | assert(!SavedNonStrictReg); |
1384 | SavedNonStrictReg = MRI->createVirtualRegister(RegClass: BoolRC); |
1385 | |
1386 | toStrictMode(MBB, Before, SaveOrig: SavedNonStrictReg, StrictStateNeeded: Needs); |
1387 | State = Needs; |
1388 | |
1389 | } else { |
1390 | if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) { |
1391 | if (!WQMFromExec && (OutNeeds & StateWQM)) { |
1392 | assert(!SavedWQMReg); |
1393 | SavedWQMReg = MRI->createVirtualRegister(RegClass: BoolRC); |
1394 | } |
1395 | |
1396 | toExact(MBB, Before, SaveWQM: SavedWQMReg); |
1397 | State = StateExact; |
1398 | } else if (State == StateExact && (Needs & StateWQM) && |
1399 | !(Needs & StateExact)) { |
1400 | assert(WQMFromExec == (SavedWQMReg == 0)); |
1401 | |
1402 | toWQM(MBB, Before, SavedWQM: SavedWQMReg); |
1403 | |
1404 | if (SavedWQMReg) { |
1405 | LIS->createAndComputeVirtRegInterval(Reg: SavedWQMReg); |
1406 | SavedWQMReg = 0; |
1407 | } |
1408 | State = StateWQM; |
1409 | } else { |
          // We can get here if we transitioned from StrictWWM to a
          // non-StrictWWM state that already matches our needs, in which
          // case we don't need to do anything.
1413 | assert(Needs & State); |
1414 | } |
1415 | } |
1416 | } |
1417 | |
1418 | if (Needs != (StateExact | StateWQM | StateStrict)) { |
1419 | if (Needs != (StateExact | StateWQM)) |
1420 | FirstWQM = IE; |
1421 | FirstStrict = IE; |
1422 | } |
1423 | |
1424 | if (II == IE) |
1425 | break; |
1426 | |
1427 | II = Next; |
1428 | } |
1429 | assert(!SavedWQMReg); |
1430 | assert(!SavedNonStrictReg); |
1431 | } |
1432 | |
1433 | bool SIWholeQuadMode::lowerLiveMaskQueries() { |
1434 | for (MachineInstr *MI : LiveMaskQueries) { |
1435 | const DebugLoc &DL = MI->getDebugLoc(); |
1436 | Register Dest = MI->getOperand(i: 0).getReg(); |
1437 | |
1438 | MachineInstr *Copy = |
1439 | BuildMI(BB&: *MI->getParent(), I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: Dest) |
1440 | .addReg(RegNo: LiveMaskReg); |
1441 | |
1442 | LIS->ReplaceMachineInstrInMaps(MI&: *MI, NewMI&: *Copy); |
1443 | MI->eraseFromParent(); |
1444 | } |
1445 | return !LiveMaskQueries.empty(); |
1446 | } |
1447 | |
bool SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToMovInstrs) {
    assert(MI->getNumExplicitOperands() == 2);

    const Register Reg = MI->getOperand(0).getReg();

    const TargetRegisterClass *regClass =
        TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
    if (TRI->isVGPRClass(regClass)) {
      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // Check that it already implicitly depends on exec (like all VALU movs
      // should do).
      assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
        return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
      }));
    } else {
      // Remove early-clobber and exec dependency from simple SGPR copies.
      // This allows some to be eliminated during/post RA.
      LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
      if (MI->getOperand(0).isEarlyClobber()) {
        LIS->removeInterval(Reg);
        MI->getOperand(0).setIsEarlyClobber(false);
        LIS->createAndComputeVirtRegInterval(Reg);
      }
      int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
      while (Index >= 0) {
        MI->removeOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
      }
      MI->setDesc(TII->get(AMDGPU::COPY));
      LLVM_DEBUG(dbgs() << " -> " << *MI);
    }
  }
  for (MachineInstr *MI : LowerToCopyInstrs) {
    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
        MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
      assert(MI->getNumExplicitOperands() == 3);
      // The only reason we should be here is that V_SET_INACTIVE has an undef
      // input, so it is being replaced by a simple copy. There should be a
      // second undef source that we should remove.
      assert(MI->getOperand(2).isUndef());
      MI->removeOperand(2);
      MI->untieRegOperand(1);
    } else {
      assert(MI->getNumExplicitOperands() == 2);
    }

    unsigned CopyOp = MI->getOperand(1).isReg()
                          ? (unsigned)AMDGPU::COPY
                          : TII->getMovOpcode(TRI->getRegClassForOperandReg(
                                *MRI, MI->getOperand(0)));
    MI->setDesc(TII->get(CopyOp));
  }
  return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
}

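/// Lower all kill and demote pseudo instructions collected during analysis,
/// splitting the containing block afterwards when the lowering produces a
/// split point.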
bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
  for (MachineInstr *MI : KillInstrs) {
    MachineBasicBlock *MBB = MI->getParent();
    MachineInstr *SplitPoint = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(*MBB, *MI);
      break;
    }
    if (SplitPoint)
      splitBlock(MBB, SplitPoint);
  }
  return !KillInstrs.empty();
}

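/// Lower a single EXEC initialization pseudo: either a direct move of an
/// immediate mask (SI_INIT_EXEC), or a sequence that builds the mask from a
/// thread count held in an SGPR input.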
void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getParent();
  bool IsWave32 = ST->isWave32();

  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
    // This should be before all vector instructions.
    MachineInstr *InitMI =
        BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
                TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
                Exec)
            .addImm(MI.getOperand(0).getImm());
    if (LIS) {
      LIS->RemoveMachineInstrFromMaps(MI);
      LIS->InsertMachineInstrInMaps(*InitMI);
    }
    MI.eraseFromParent();
    return;
  }

  // Extract the thread count from an SGPR input and set EXEC accordingly.
  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
  //
  // S_BFE_U32 count, input, {shift, 7}
  // S_BFM_B64 exec, count, 0
  // S_CMP_EQ_U32 count, 64
  // S_CMOV_B64 exec, -1
  Register InputReg = MI.getOperand(0).getReg();
  MachineInstr *FirstMI = &*MBB->begin();
  if (InputReg.isVirtual()) {
    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
    assert(DefInstr && DefInstr->isCopy());
    if (DefInstr->getParent() == MBB) {
      if (DefInstr != FirstMI) {
        // If `InputReg` is defined in the current block, we also need to move
        // its defining instruction to the beginning of the block.
        DefInstr->removeFromParent();
        MBB->insert(FirstMI, DefInstr);
        if (LIS)
          LIS->handleMove(*DefInstr);
      } else {
        // If the first instruction is the definition, move the insertion
        // point past it.
        FirstMI = &*std::next(FirstMI->getIterator());
      }
    }
  }

  // Insert instruction sequence at block beginning (before vector operations).
  const DebugLoc DL = MI.getDebugLoc();
  const unsigned WavefrontSize = ST->getWavefrontSize();
  const unsigned Mask = (WavefrontSize << 1) - 1;
  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
                   .addReg(InputReg)
                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
  auto BfmMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
          .addReg(CountReg)
          .addImm(0);
  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
                   .addReg(CountReg, RegState::Kill)
                   .addImm(WavefrontSize);
  auto CmovMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
              Exec)
          .addImm(-1);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*BfeMI);
  LIS->InsertMachineInstrInMaps(*BfmMI);
  LIS->InsertMachineInstrInMaps(*CmpMI);
  LIS->InsertMachineInstrInMaps(*CmovMI);

  LIS->removeInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(CountReg);
}

/// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
/// for instructions that depend on EXEC.
MachineBasicBlock::iterator
SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) {
  MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();

  for (MachineInstr *MI : InitExecInstrs) {
    // Try to handle undefined cases gracefully:
    // - multiple INIT_EXEC instructions
    // - INIT_EXEC instructions not in the entry block
    if (MI->getParent() == &Entry)
      InsertPt = std::next(MI->getIterator());

    lowerInitExec(*MI);
    Changed = true;
  }

  return InsertPt;
}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                    << " ------------- \n");
  LLVM_DEBUG(MF.dump(););

  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  LowerToMovInstrs.clear();
  KillInstrs.clear();
  InitExecInstrs.clear();
  StateTransition.clear();

  ST = &MF.getSubtarget<GCNSubtarget>();

  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
  auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
  MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
  auto *PDTWrapper =
      getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
  PDT = PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;

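  // Select the wavefront-size dependent opcodes and EXEC register once up
  // front so the rest of the pass can be written generically.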
  if (ST->isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    AndTermOpc = AMDGPU::S_AND_B32_term;
    AndN2Opc = AMDGPU::S_ANDN2_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
    WQMOpc = AMDGPU::S_WQM_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    AndTermOpc = AMDGPU::S_AND_B64_term;
    AndN2Opc = AMDGPU::S_ANDN2_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
    WQMOpc = AMDGPU::S_WQM_B64;
    Exec = AMDGPU::EXEC;
  }

  const char GlobalFlags = analyzeFunction(MF);
  bool Changed = false;

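  // By default the live mask is simply EXEC; a separate copy is only created
  // below when something will actually consume it.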
  LiveMaskReg = Exec;

  MachineBasicBlock &Entry = MF.front();
  MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed);

  // Store a copy of the original live mask when required
  const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
  const bool HasWaveModes = GlobalFlags & ~StateExact;
  const bool HasKills = !KillInstrs.empty();
  const bool UsesWQM = GlobalFlags & StateWQM;
  if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
    MachineInstr *MI =
        BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
            .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
    Changed = true;
  }

  LLVM_DEBUG(printInfo());

  Changed |= lowerLiveMaskQueries();
  Changed |= lowerCopyInstrs();

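  // Pick the lowering strategy: no wave-mode switching at all, a single
  // whole-function switch into WQM, or the full per-block state machine.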
  if (!HasWaveModes) {
    // No wave mode execution
    Changed |= lowerKillInstrs(false);
  } else if (GlobalFlags == StateWQM) {
    // Shader only needs WQM
    auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
                  .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
    lowerKillInstrs(true);
    Changed = true;
  } else {
    // Wave mode switching requires full lowering pass.
    for (auto BII : Blocks)
      processBlock(*BII.first, BII.first == &Entry);
    // Lowering blocks causes block splitting so perform as a second pass.
    for (auto BII : Blocks)
      lowerBlock(*BII.first);
    Changed = true;
  }

  // Compute live range for live mask
  if (LiveMaskReg != Exec)
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);

  // If we performed any kills then recompute EXEC
  if (!KillInstrs.empty() || !InitExecInstrs.empty())
    LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);

  return Changed;
}
