1//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass adds instructions to enable whole quad mode (strict or non-strict)
11/// for pixel shaders, and strict whole wavefront mode for all programs.
12///
/// The "strict" prefix indicates that inactive lanes do not take part in
/// control flow; specifically, an inactive lane enabled by a strict WQM/WWM
/// will always be enabled irrespective of control flow decisions. Conversely,
/// in non-strict WQM inactive lanes may take part in control flow decisions.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass ensures that WQM
/// is enabled when necessary, but disabled around stores and atomics.
21///
22/// When necessary, this pass creates a function prolog
23///
24/// S_MOV_B64 LiveMask, EXEC
25/// S_WQM_B64 EXEC, EXEC
26///
27/// to enter WQM at the top of the function and surrounds blocks of Exact
28/// instructions by
29///
30/// S_AND_SAVEEXEC_B64 Tmp, LiveMask
31/// ...
32/// S_MOV_B64 EXEC, Tmp
33///
34/// We also compute when a sequence of instructions requires strict whole
35/// wavefront mode (StrictWWM) and insert instructions to save and restore it:
36///
37/// S_OR_SAVEEXEC_B64 Tmp, -1
38/// ...
39/// S_MOV_B64 EXEC, Tmp
40///
41/// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42/// we use a similar save and restore mechanism and force whole quad mode for
43/// those instructions:
44///
45/// S_MOV_B64 Tmp, EXEC
46/// S_WQM_B64 EXEC, EXEC
47/// ...
48/// S_MOV_B64 EXEC, Tmp
49///
50/// In order to avoid excessive switching during sequences of Exact
51/// instructions, the pass first analyzes which instructions must be run in WQM
52/// (aka which instructions produce values that lead to derivative
53/// computations).
54///
55/// Basic blocks are always exited in WQM as long as some successor needs WQM.
56///
57/// There is room for improvement given better control flow analysis:
58///
59/// (1) at the top level (outside of control flow statements, and as long as
60/// kill hasn't been used), one SGPR can be saved by recovering WQM from
61/// the LiveMask (this is implemented for the entry block).
62///
63/// (2) when entire regions (e.g. if-else blocks or entire loops) only
64/// consist of exact and don't-care instructions, the switch only has to
65/// be done at the entry and exit points rather than potentially in each
66/// block of the region.
67///
68//===----------------------------------------------------------------------===//
69
70#include "SIWholeQuadMode.h"
71#include "AMDGPU.h"
72#include "GCNSubtarget.h"
73#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
74#include "llvm/ADT/MapVector.h"
75#include "llvm/ADT/PostOrderIterator.h"
76#include "llvm/CodeGen/LiveIntervals.h"
77#include "llvm/CodeGen/MachineBasicBlock.h"
78#include "llvm/CodeGen/MachineDominators.h"
79#include "llvm/CodeGen/MachineFunctionPass.h"
80#include "llvm/CodeGen/MachineInstr.h"
81#include "llvm/CodeGen/MachinePostDominators.h"
82#include "llvm/IR/CallingConv.h"
83#include "llvm/InitializePasses.h"
84#include "llvm/Support/raw_ostream.h"
85
86using namespace llvm;
87
88#define DEBUG_TYPE "si-wqm"
89
90namespace {
91
92enum {
93 StateWQM = 0x1,
94 StateStrictWWM = 0x2,
95 StateStrictWQM = 0x4,
96 StateExact = 0x8,
97 StateStrict = StateStrictWWM | StateStrictWQM,
98};
99
100struct PrintState {
101public:
102 int State;
103
104 explicit PrintState(int State) : State(State) {}
105};
106
107#ifndef NDEBUG
108static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
109
110 static const std::pair<char, const char *> Mapping[] = {
111 std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
112 std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
113 char State = PS.State;
114 for (auto M : Mapping) {
115 if (State & M.first) {
116 OS << M.second;
117 State &= ~M.first;
118
119 if (State)
120 OS << '|';
121 }
122 }
123 assert(State == 0);
124 return OS;
125}
126#endif
127
128struct InstrInfo {
129 char Needs = 0;
130 char Disabled = 0;
131 char OutNeeds = 0;
132 char MarkedStates = 0;
133};
134
135struct BlockInfo {
136 char Needs = 0;
137 char InNeeds = 0;
138 char OutNeeds = 0;
139 char InitialState = 0;
140 bool NeedsLowering = false;
141};
142
143struct WorkItem {
144 MachineBasicBlock *MBB = nullptr;
145 MachineInstr *MI = nullptr;
146
147 WorkItem() = default;
148 WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
149 WorkItem(MachineInstr *MI) : MI(MI) {}
150};
151
152class SIWholeQuadMode {
153public:
154 SIWholeQuadMode(MachineFunction &MF, LiveIntervals *LIS,
155 MachineDominatorTree *MDT, MachinePostDominatorTree *PDT)
156 : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
157 TRI(&TII->getRegisterInfo()), MRI(&MF.getRegInfo()), LIS(LIS), MDT(MDT),
158 PDT(PDT) {}
159 bool run(MachineFunction &MF);
160
161private:
162 const GCNSubtarget *ST;
163 const SIInstrInfo *TII;
164 const SIRegisterInfo *TRI;
165 MachineRegisterInfo *MRI;
166 LiveIntervals *LIS;
167 MachineDominatorTree *MDT;
168 MachinePostDominatorTree *PDT;
169
170 unsigned AndOpc;
171 unsigned AndTermOpc;
172 unsigned AndN2Opc;
173 unsigned XorOpc;
174 unsigned AndSaveExecOpc;
175 unsigned AndSaveExecTermOpc;
176 unsigned WQMOpc;
177 Register Exec;
178 Register LiveMaskReg;
179
180 DenseMap<const MachineInstr *, InstrInfo> Instructions;
181 MapVector<MachineBasicBlock *, BlockInfo> Blocks;
182
183 // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
184 DenseMap<const MachineInstr *, char> StateTransition;
185
186 SmallVector<MachineInstr *, 2> LiveMaskQueries;
187 SmallVector<MachineInstr *, 4> LowerToMovInstrs;
188 SmallSetVector<MachineInstr *, 4> LowerToCopyInstrs;
189 SmallVector<MachineInstr *, 4> KillInstrs;
190 SmallVector<MachineInstr *, 4> InitExecInstrs;
191 SmallVector<MachineInstr *, 4> SetInactiveInstrs;
192
193 void printInfo();
194
195 void markInstruction(MachineInstr &MI, char Flag,
196 std::vector<WorkItem> &Worklist);
197 void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
198 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
199 void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
200 std::vector<WorkItem> &Worklist);
201 void markInstructionUses(const MachineInstr &MI, char Flag,
202 std::vector<WorkItem> &Worklist);
203 char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
204 void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
205 void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
206 char analyzeFunction(MachineFunction &MF);
207
208 MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
209 MachineBasicBlock::iterator Before);
210 MachineBasicBlock::iterator
211 prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
212 MachineBasicBlock::iterator Last, bool PreferLast,
213 bool SaveSCC);
214 void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
215 Register SaveWQM);
216 void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
217 Register SavedWQM);
218 void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
219 Register SaveOrig, char StrictStateNeeded);
220 void fromStrictMode(MachineBasicBlock &MBB,
221 MachineBasicBlock::iterator Before, Register SavedOrig,
222 char NonStrictState, char CurrentStrictState);
223
224 void splitBlock(MachineInstr *TermMI);
225 MachineInstr *lowerKillI1(MachineInstr &MI, bool IsWQM);
226 MachineInstr *lowerKillF32(MachineInstr &MI);
227
228 void lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI);
229 void processBlock(MachineBasicBlock &MBB, BlockInfo &BI, bool IsEntry);
230
231 bool lowerLiveMaskQueries();
232 bool lowerCopyInstrs();
233 bool lowerKillInstrs(bool IsWQM);
234 void lowerInitExec(MachineInstr &MI);
235 MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry,
236 bool &Changed);
237};
238
239class SIWholeQuadModeLegacy : public MachineFunctionPass {
240public:
241 static char ID;
242
243 SIWholeQuadModeLegacy() : MachineFunctionPass(ID) {}
244
245 bool runOnMachineFunction(MachineFunction &MF) override;
246
247 StringRef getPassName() const override { return "SI Whole Quad Mode"; }
248
249 void getAnalysisUsage(AnalysisUsage &AU) const override {
250 AU.addRequired<LiveIntervalsWrapperPass>();
251 AU.addPreserved<SlotIndexesWrapperPass>();
252 AU.addPreserved<LiveIntervalsWrapperPass>();
253 AU.addPreserved<MachineDominatorTreeWrapperPass>();
254 AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
255 MachineFunctionPass::getAnalysisUsage(AU);
256 }
257
258 MachineFunctionProperties getClearedProperties() const override {
259 return MachineFunctionProperties().setIsSSA();
260 }
261};
262} // end anonymous namespace
263
264char SIWholeQuadModeLegacy::ID = 0;
265
266INITIALIZE_PASS_BEGIN(SIWholeQuadModeLegacy, DEBUG_TYPE, "SI Whole Quad Mode",
267 false, false)
268INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
269INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
270INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
271INITIALIZE_PASS_END(SIWholeQuadModeLegacy, DEBUG_TYPE, "SI Whole Quad Mode",
272 false, false)
273
274char &llvm::SIWholeQuadModeID = SIWholeQuadModeLegacy::ID;
275
276FunctionPass *llvm::createSIWholeQuadModeLegacyPass() {
277 return new SIWholeQuadModeLegacy;
278}
279
280#ifndef NDEBUG
281LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
282 for (const auto &BII : Blocks) {
283 dbgs() << "\n"
284 << printMBBReference(*BII.first) << ":\n"
285 << " InNeeds = " << PrintState(BII.second.InNeeds)
286 << ", Needs = " << PrintState(BII.second.Needs)
287 << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
288
289 for (const MachineInstr &MI : *BII.first) {
290 auto III = Instructions.find(&MI);
291 if (III != Instructions.end()) {
292 dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs)
293 << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
294 }
295 }
296 }
297}
298#endif
299
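/// Mark \p MI as needing the states in \p Flag (minus any states it has
/// disabled) and add it to \p Worklist if its recorded needs changed.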
300void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
301 std::vector<WorkItem> &Worklist) {
302 InstrInfo &II = Instructions[&MI];
303
304 assert(!(Flag & StateExact) && Flag != 0);
305
306 // Capture all states requested in marking including disabled ones.
307 II.MarkedStates |= Flag;
308
309 // Remove any disabled states from the flag. The user that required it gets
310 // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM, where
312 // ignoring the request for WQM is correct as per the relevant specs.
313 Flag &= ~II.Disabled;
314
315 // Ignore if the flag is already encompassed by the existing needs, or we
316 // just disabled everything.
317 if ((II.Needs & Flag) == Flag)
318 return;
319
320 LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
321 II.Needs |= Flag;
322 Worklist.emplace_back(args: &MI);
323}
324
325/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
326void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
327 Register Reg, unsigned SubReg, char Flag,
328 std::vector<WorkItem> &Worklist) {
329 LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
330
331 LiveQueryResult UseLRQ = LR.Query(Idx: LIS->getInstructionIndex(Instr: UseMI));
332 const VNInfo *Value = UseLRQ.valueIn();
333 if (!Value)
334 return;
335
336 // Note: this code assumes that lane masks on AMDGPU completely
337 // cover registers.
338 const LaneBitmask UseLanes =
339 SubReg ? TRI->getSubRegIndexLaneMask(SubIdx: SubReg)
340 : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
341 : LaneBitmask::getNone());
342
343 // Perform a depth-first iteration of the LiveRange graph marking defs.
344 // Stop processing of a given branch when all use lanes have been defined.
345 // The first definition stops processing for a physical register.
346 struct PhiEntry {
347 const VNInfo *Phi;
348 unsigned PredIdx;
349 LaneBitmask DefinedLanes;
350
351 PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
352 : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
353 };
354 using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
355 SmallVector<PhiEntry, 2> PhiStack;
356 SmallSet<VisitKey, 4> Visited;
357 LaneBitmask DefinedLanes;
358 unsigned NextPredIdx = 0; // Only used for processing phi nodes
359 do {
360 const VNInfo *NextValue = nullptr;
361 const VisitKey Key(Value, DefinedLanes);
362
363 if (Visited.insert(V: Key).second) {
      // On the first visit to a phi, start processing from the first predecessor
365 NextPredIdx = 0;
366 }
367
368 if (Value->isPHIDef()) {
369 // Each predecessor node in the phi must be processed as a subgraph
370 const MachineBasicBlock *MBB = LIS->getMBBFromIndex(index: Value->def);
371 assert(MBB && "Phi-def has no defining MBB");
372
373 // Find next predecessor to process
374 unsigned Idx = NextPredIdx;
375 const auto *PI = MBB->pred_begin() + Idx;
376 const auto *PE = MBB->pred_end();
377 for (; PI != PE && !NextValue; ++PI, ++Idx) {
378 if (const VNInfo *VN = LR.getVNInfoBefore(Idx: LIS->getMBBEndIdx(mbb: *PI))) {
379 if (!Visited.count(V: VisitKey(VN, DefinedLanes)))
380 NextValue = VN;
381 }
382 }
383
      // If there are more predecessors to process, add the phi to the stack
385 if (PI != PE)
386 PhiStack.emplace_back(Args&: Value, Args&: Idx, Args&: DefinedLanes);
387 } else {
388 MachineInstr *MI = LIS->getInstructionFromIndex(index: Value->def);
389 assert(MI && "Def has no defining instruction");
390
391 if (Reg.isVirtual()) {
392 // Iterate over all operands to find relevant definitions
393 bool HasDef = false;
394 for (const MachineOperand &Op : MI->all_defs()) {
395 if (Op.getReg() != Reg)
396 continue;
397
398 // Compute lanes defined and overlap with use
399 LaneBitmask OpLanes =
400 Op.isUndef() ? LaneBitmask::getAll()
401 : TRI->getSubRegIndexLaneMask(SubIdx: Op.getSubReg());
402 LaneBitmask Overlap = (UseLanes & OpLanes);
403
          // Record whether this instruction defined any lanes of the use
405 HasDef |= Overlap.any();
406
407 // Mark any lanes defined
408 DefinedLanes |= OpLanes;
409 }
410
411 // Check if all lanes of use have been defined
412 if ((DefinedLanes & UseLanes) != UseLanes) {
413 // Definition not complete; need to process input value
414 LiveQueryResult LRQ = LR.Query(Idx: LIS->getInstructionIndex(Instr: *MI));
415 if (const VNInfo *VN = LRQ.valueIn()) {
416 if (!Visited.count(V: VisitKey(VN, DefinedLanes)))
417 NextValue = VN;
418 }
419 }
420
421 // Only mark the instruction if it defines some part of the use
422 if (HasDef)
423 markInstruction(MI&: *MI, Flag, Worklist);
424 } else {
425 // For physical registers simply mark the defining instruction
426 markInstruction(MI&: *MI, Flag, Worklist);
427 }
428 }
429
430 if (!NextValue && !PhiStack.empty()) {
      // Reached the end of a chain; revert to processing the last phi
432 PhiEntry &Entry = PhiStack.back();
433 NextValue = Entry.Phi;
434 NextPredIdx = Entry.PredIdx;
435 DefinedLanes = Entry.DefinedLanes;
436 PhiStack.pop_back();
437 }
438
439 Value = NextValue;
440 } while (Value);
441}
442
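/// Mark the instructions defining register operand \p Op of \p MI with
/// \p Flag. EXEC and EXEC_LO are ignored; other physical registers are
/// tracked through their register units.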
443void SIWholeQuadMode::markOperand(const MachineInstr &MI,
444 const MachineOperand &Op, char Flag,
445 std::vector<WorkItem> &Worklist) {
446 assert(Op.isReg());
447 Register Reg = Op.getReg();
448
449 // Ignore some hardware registers
450 switch (Reg) {
451 case AMDGPU::EXEC:
452 case AMDGPU::EXEC_LO:
453 return;
454 default:
455 break;
456 }
457
458 LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
459 << " for " << MI);
460 if (Reg.isVirtual()) {
461 LiveRange &LR = LIS->getInterval(Reg);
462 markDefs(UseMI: MI, LR, Reg, SubReg: Op.getSubReg(), Flag, Worklist);
463 } else {
464 // Handle physical registers that we need to track; this is mostly relevant
465 // for VCC, which can appear as the (implicit) input of a uniform branch,
466 // e.g. when a loop counter is stored in a VGPR.
467 for (MCRegUnit Unit : TRI->regunits(Reg: Reg.asMCReg())) {
468 LiveRange &LR = LIS->getRegUnit(Unit);
469 const VNInfo *Value = LR.Query(Idx: LIS->getInstructionIndex(Instr: MI)).valueIn();
470 if (Value)
471 markDefs(UseMI: MI, LR, Reg: Unit, SubReg: AMDGPU::NoSubRegister, Flag, Worklist);
472 }
473 }
474}
475
476/// Mark all instructions defining the uses in \p MI with \p Flag.
477void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
478 std::vector<WorkItem> &Worklist) {
479 LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
480 << MI);
481
482 for (const MachineOperand &Use : MI.all_uses())
483 markOperand(MI, Op: Use, Flag, Worklist);
484}
485
486// Scan instructions to determine which ones require an Exact execmask and
487// which ones seed WQM requirements.
488char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
489 std::vector<WorkItem> &Worklist) {
490 char GlobalFlags = 0;
491 bool WQMOutputs = MF.getFunction().hasFnAttribute(Kind: "amdgpu-ps-wqm-outputs");
492 SmallVector<MachineInstr *, 4> SoftWQMInstrs;
493 bool HasImplicitDerivatives =
494 MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
495
496 // We need to visit the basic blocks in reverse post-order so that we visit
497 // defs before uses, in particular so that we don't accidentally mark an
498 // instruction as needing e.g. WQM before visiting it and realizing it needs
499 // WQM disabled.
500 ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
501 for (MachineBasicBlock *MBB : RPOT) {
502 BlockInfo &BBI = Blocks[MBB];
503
504 for (MachineInstr &MI : *MBB) {
505 InstrInfo &III = Instructions[&MI];
506 unsigned Opcode = MI.getOpcode();
507 char Flags = 0;
508
509 if (TII->isWQM(Opcode)) {
510 // If LOD is not supported WQM is not needed.
511 // Only generate implicit WQM if implicit derivatives are required.
512 // This avoids inserting unintended WQM if a shader type without
513 // implicit derivatives uses an image sampling instruction.
514 if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
515 // Sampling instructions don't need to produce results for all pixels
516 // in a quad, they just require all inputs of a quad to have been
517 // computed for derivatives.
518 markInstructionUses(MI, Flag: StateWQM, Worklist);
519 GlobalFlags |= StateWQM;
520 }
521 } else if (Opcode == AMDGPU::WQM) {
522 // The WQM intrinsic requires its output to have all the helper lanes
523 // correct, so we need it to be in WQM.
524 Flags = StateWQM;
525 LowerToCopyInstrs.insert(X: &MI);
526 } else if (Opcode == AMDGPU::SOFT_WQM) {
527 LowerToCopyInstrs.insert(X: &MI);
528 SoftWQMInstrs.push_back(Elt: &MI);
529 } else if (Opcode == AMDGPU::STRICT_WWM) {
        // The STRICT_WWM intrinsic doesn't make the same guarantee, and in
        // addition it needs to be executed in WQM or Exact so that its copy
        // doesn't clobber inactive lanes.
533 markInstructionUses(MI, Flag: StateStrictWWM, Worklist);
534 GlobalFlags |= StateStrictWWM;
535 LowerToMovInstrs.push_back(Elt: &MI);
536 } else if (Opcode == AMDGPU::STRICT_WQM ||
537 TII->isDualSourceBlendEXP(MI)) {
        // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
        // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads
        // in quads that have at least one active thread.
541 markInstructionUses(MI, Flag: StateStrictWQM, Worklist);
542 GlobalFlags |= StateStrictWQM;
543
544 if (Opcode == AMDGPU::STRICT_WQM) {
545 LowerToMovInstrs.push_back(Elt: &MI);
546 } else {
          // Dual source blend export acts as implicit strict-wqm: its sources
          // need to be shuffled in strict WQM, but the export itself needs to
          // run in exact mode.
550 BBI.Needs |= StateExact;
551 if (!(BBI.InNeeds & StateExact)) {
552 BBI.InNeeds |= StateExact;
553 Worklist.emplace_back(args&: MBB);
554 }
555 GlobalFlags |= StateExact;
556 III.Disabled = StateWQM | StateStrict;
557 }
558 } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
559 Opcode == AMDGPU::DS_PARAM_LOAD ||
560 Opcode == AMDGPU::LDS_DIRECT_LOAD ||
561 Opcode == AMDGPU::DS_DIRECT_LOAD) {
        // Mark these STRICT_WQM, but only for the instruction, not its operands.
        // This avoids unnecessarily marking M0 as requiring WQM.
564 III.Needs |= StateStrictWQM;
565 GlobalFlags |= StateStrictWQM;
566 } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) {
567 // Disable strict states; StrictWQM will be added as required later.
568 III.Disabled = StateStrict;
569 MachineOperand &Inactive = MI.getOperand(i: 4);
570 if (Inactive.isReg()) {
571 if (Inactive.isUndef() && MI.getOperand(i: 3).getImm() == 0)
572 LowerToCopyInstrs.insert(X: &MI);
573 else
574 markOperand(MI, Op: Inactive, Flag: StateStrictWWM, Worklist);
575 }
576 SetInactiveInstrs.push_back(Elt: &MI);
577 BBI.NeedsLowering = true;
578 } else if (TII->isDisableWQM(MI)) {
579 BBI.Needs |= StateExact;
580 if (!(BBI.InNeeds & StateExact)) {
581 BBI.InNeeds |= StateExact;
582 Worklist.emplace_back(args&: MBB);
583 }
584 GlobalFlags |= StateExact;
585 III.Disabled = StateWQM | StateStrict;
586 } else if (Opcode == AMDGPU::SI_PS_LIVE ||
587 Opcode == AMDGPU::SI_LIVE_MASK) {
588 LiveMaskQueries.push_back(Elt: &MI);
589 } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
590 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
591 Opcode == AMDGPU::SI_DEMOTE_I1) {
592 KillInstrs.push_back(Elt: &MI);
593 BBI.NeedsLowering = true;
594 } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
595 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
596 Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
597 InitExecInstrs.push_back(Elt: &MI);
598 } else if (WQMOutputs) {
599 // The function is in machine SSA form, which means that physical
600 // VGPRs correspond to shader inputs and outputs. Inputs are
601 // only used, outputs are only defined.
602 // FIXME: is this still valid?
603 for (const MachineOperand &MO : MI.defs()) {
604 Register Reg = MO.getReg();
605 if (Reg.isPhysical() &&
606 TRI->hasVectorRegisters(RC: TRI->getPhysRegBaseClass(Reg))) {
607 Flags = StateWQM;
608 break;
609 }
610 }
611 }
612
613 if (Flags) {
614 markInstruction(MI, Flag: Flags, Worklist);
615 GlobalFlags |= Flags;
616 }
617 }
618 }
619
  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
621 // ever used anywhere in the function. This implements the corresponding
622 // semantics of @llvm.amdgcn.set.inactive.
623 // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
624 if (GlobalFlags & StateWQM) {
625 for (MachineInstr *MI : SetInactiveInstrs)
626 markInstruction(MI&: *MI, Flag: StateWQM, Worklist);
627 for (MachineInstr *MI : SoftWQMInstrs)
628 markInstruction(MI&: *MI, Flag: StateWQM, Worklist);
629 }
630
631 return GlobalFlags;
632}
633
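/// Propagate the computed needs of \p MI to its containing block, to the
/// preceding instruction in the block, and to the instructions defining its
/// uses.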
634void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
635 std::vector<WorkItem>& Worklist) {
636 MachineBasicBlock *MBB = MI.getParent();
637 InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
638 BlockInfo &BI = Blocks[MBB];
639
640 // Control flow-type instructions and stores to temporary memory that are
641 // followed by WQM computations must themselves be in WQM.
642 if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
643 (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
644 Instructions[&MI].Needs = StateWQM;
645 II.Needs = StateWQM;
646 }
647
648 // Propagate to block level
649 if (II.Needs & StateWQM) {
650 BI.Needs |= StateWQM;
651 if (!(BI.InNeeds & StateWQM)) {
652 BI.InNeeds |= StateWQM;
653 Worklist.emplace_back(args&: MBB);
654 }
655 }
656
657 // Propagate backwards within block
658 if (MachineInstr *PrevMI = MI.getPrevNode()) {
659 char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
660 if (!PrevMI->isPHI()) {
661 InstrInfo &PrevII = Instructions[PrevMI];
662 if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
663 PrevII.OutNeeds |= InNeeds;
664 Worklist.emplace_back(args&: PrevMI);
665 }
666 }
667 }
668
669 // Propagate WQM flag to instruction inputs
670 assert(!(II.Needs & StateExact));
671
672 if (II.Needs != 0)
673 markInstructionUses(MI, Flag: II.Needs, Worklist);
674
675 // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
676 // not require any WQM transitions.
677 if (II.Needs & StateStrictWWM)
678 BI.Needs |= StateStrictWWM;
679 if (II.Needs & StateStrictWQM)
680 BI.Needs |= StateStrictWQM;
681}
682
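/// Propagate the needs of \p MBB: its OutNeeds into its last instruction and
/// its successors' InNeeds, and its InNeeds into its predecessors, which must
/// provide them.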
683void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
684 std::vector<WorkItem>& Worklist) {
685 BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
686
687 // Propagate through instructions
688 if (!MBB.empty()) {
689 MachineInstr *LastMI = &*MBB.rbegin();
690 InstrInfo &LastII = Instructions[LastMI];
691 if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
692 LastII.OutNeeds |= BI.OutNeeds;
693 Worklist.emplace_back(args&: LastMI);
694 }
695 }
696
697 // Predecessor blocks must provide for our WQM/Exact needs.
698 for (MachineBasicBlock *Pred : MBB.predecessors()) {
699 BlockInfo &PredBI = Blocks[Pred];
700 if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
701 continue;
702
703 PredBI.OutNeeds |= BI.InNeeds;
704 PredBI.InNeeds |= BI.InNeeds;
705 Worklist.emplace_back(args&: Pred);
706 }
707
708 // All successors must be prepared to accept the same set of WQM/Exact data.
709 for (MachineBasicBlock *Succ : MBB.successors()) {
710 BlockInfo &SuccBI = Blocks[Succ];
711 if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
712 continue;
713
714 SuccBI.InNeeds |= BI.OutNeeds;
715 Worklist.emplace_back(args&: Succ);
716 }
717}
718
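/// Scan the function and run the propagation worklist to a fixed point.
/// Returns the set of states (WQM/StrictWWM/StrictWQM/Exact) required
/// anywhere in \p MF.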
719char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
720 std::vector<WorkItem> Worklist;
721 char GlobalFlags = scanInstructions(MF, Worklist);
722
723 while (!Worklist.empty()) {
724 WorkItem WI = Worklist.back();
725 Worklist.pop_back();
726
727 if (WI.MI)
728 propagateInstruction(MI&: *WI.MI, Worklist);
729 else
730 propagateBlock(MBB&: *WI.MBB, Worklist);
731 }
732
733 return GlobalFlags;
734}
735
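/// Save SCC to a scratch SGPR and restore it again, with both copies inserted
/// before \p Before. Returns the restoring copy, so that SCC-clobbering code
/// can be inserted between the save and the restore.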
736MachineBasicBlock::iterator
737SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
738 MachineBasicBlock::iterator Before) {
739 Register SaveReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
740
741 MachineInstr *Save =
742 BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: SaveReg)
743 .addReg(RegNo: AMDGPU::SCC);
744 MachineInstr *Restore =
745 BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::SCC)
746 .addReg(RegNo: SaveReg);
747
748 LIS->InsertMachineInstrInMaps(MI&: *Save);
749 LIS->InsertMachineInstrInMaps(MI&: *Restore);
750 LIS->createAndComputeVirtRegInterval(Reg: SaveReg);
751
752 return Restore;
753}
754
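/// Split the basic block after \p TermMI and convert \p TermMI into the
/// corresponding *_term terminator opcode, updating the dominator and
/// post-dominator trees as needed.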
755void SIWholeQuadMode::splitBlock(MachineInstr *TermMI) {
756 MachineBasicBlock *BB = TermMI->getParent();
757 LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
758 << *TermMI << "\n");
759
760 MachineBasicBlock *SplitBB =
761 BB->splitAt(SplitInst&: *TermMI, /*UpdateLiveIns*/ true, LIS);
762
763 // Convert last instruction in block to a terminator.
764 // Note: this only covers the expected patterns
765 unsigned NewOpcode = 0;
766 switch (TermMI->getOpcode()) {
767 case AMDGPU::S_AND_B32:
768 NewOpcode = AMDGPU::S_AND_B32_term;
769 break;
770 case AMDGPU::S_AND_B64:
771 NewOpcode = AMDGPU::S_AND_B64_term;
772 break;
773 case AMDGPU::S_MOV_B32:
774 NewOpcode = AMDGPU::S_MOV_B32_term;
775 break;
776 case AMDGPU::S_MOV_B64:
777 NewOpcode = AMDGPU::S_MOV_B64_term;
778 break;
779 case AMDGPU::S_ANDN2_B32:
780 NewOpcode = AMDGPU::S_ANDN2_B32_term;
781 break;
782 case AMDGPU::S_ANDN2_B64:
783 NewOpcode = AMDGPU::S_ANDN2_B64_term;
784 break;
785 default:
786 llvm_unreachable("Unexpected instruction");
787 }
788
  // These terminators fall through to the next block, so there is no need to
  // add an unconditional branch to the next block (SplitBB).
791 TermMI->setDesc(TII->get(Opcode: NewOpcode));
792
793 if (SplitBB != BB) {
794 // Update dominator trees
795 using DomTreeT = DomTreeBase<MachineBasicBlock>;
796 SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
797 for (MachineBasicBlock *Succ : SplitBB->successors()) {
798 DTUpdates.push_back(Elt: {DomTreeT::Insert, SplitBB, Succ});
799 DTUpdates.push_back(Elt: {DomTreeT::Delete, BB, Succ});
800 }
801 DTUpdates.push_back(Elt: {DomTreeT::Insert, BB, SplitBB});
802 if (MDT)
803 MDT->applyUpdates(Updates: DTUpdates);
804 if (PDT)
805 PDT->applyUpdates(Updates: DTUpdates);
806 }
807}
808
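/// Lower SI_KILL_F32_COND_IMM_TERMINATOR: emit an inverted V_CMP into VCC,
/// clear the killed lanes from the live mask, emit an early-terminate check,
/// and remove the killed lanes from EXEC. Returns the instruction at which the
/// block should subsequently be split.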
809MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) {
810 assert(LiveMaskReg.isVirtual());
811
812 const DebugLoc &DL = MI.getDebugLoc();
813 unsigned Opcode = 0;
814
815 assert(MI.getOperand(0).isReg());
816
817 // Comparison is for live lanes; however here we compute the inverse
818 // (killed lanes). This is because VCMP will always generate 0 bits
819 // for inactive lanes so a mask of live lanes would not be correct
820 // inside control flow.
821 // Invert the comparison by swapping the operands and adjusting
822 // the comparison codes.
823
824 switch (MI.getOperand(i: 2).getImm()) {
825 case ISD::SETUEQ:
826 Opcode = AMDGPU::V_CMP_LG_F32_e64;
827 break;
828 case ISD::SETUGT:
829 Opcode = AMDGPU::V_CMP_GE_F32_e64;
830 break;
831 case ISD::SETUGE:
832 Opcode = AMDGPU::V_CMP_GT_F32_e64;
833 break;
834 case ISD::SETULT:
835 Opcode = AMDGPU::V_CMP_LE_F32_e64;
836 break;
837 case ISD::SETULE:
838 Opcode = AMDGPU::V_CMP_LT_F32_e64;
839 break;
840 case ISD::SETUNE:
841 Opcode = AMDGPU::V_CMP_EQ_F32_e64;
842 break;
843 case ISD::SETO:
844 Opcode = AMDGPU::V_CMP_O_F32_e64;
845 break;
846 case ISD::SETUO:
847 Opcode = AMDGPU::V_CMP_U_F32_e64;
848 break;
849 case ISD::SETOEQ:
850 case ISD::SETEQ:
851 Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
852 break;
853 case ISD::SETOGT:
854 case ISD::SETGT:
855 Opcode = AMDGPU::V_CMP_NLT_F32_e64;
856 break;
857 case ISD::SETOGE:
858 case ISD::SETGE:
859 Opcode = AMDGPU::V_CMP_NLE_F32_e64;
860 break;
861 case ISD::SETOLT:
862 case ISD::SETLT:
863 Opcode = AMDGPU::V_CMP_NGT_F32_e64;
864 break;
865 case ISD::SETOLE:
866 case ISD::SETLE:
867 Opcode = AMDGPU::V_CMP_NGE_F32_e64;
868 break;
869 case ISD::SETONE:
870 case ISD::SETNE:
871 Opcode = AMDGPU::V_CMP_NLG_F32_e64;
872 break;
873 default:
874 llvm_unreachable("invalid ISD:SET cond code");
875 }
876
877 MachineBasicBlock &MBB = *MI.getParent();
878
879 // Pick opcode based on comparison type.
880 MachineInstr *VcmpMI;
881 const MachineOperand &Op0 = MI.getOperand(i: 0);
882 const MachineOperand &Op1 = MI.getOperand(i: 1);
883
884 // VCC represents lanes killed.
885 Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
886
887 if (TRI->isVGPR(MRI: *MRI, Reg: Op0.getReg())) {
888 Opcode = AMDGPU::getVOPe32(Opcode);
889 VcmpMI = BuildMI(BB&: MBB, I: &MI, MIMD: DL, MCID: TII->get(Opcode)).add(MO: Op1).add(MO: Op0);
890 } else {
891 VcmpMI = BuildMI(BB&: MBB, I: &MI, MIMD: DL, MCID: TII->get(Opcode))
892 .addReg(RegNo: VCC, flags: RegState::Define)
893 .addImm(Val: 0) // src0 modifiers
894 .add(MO: Op1)
895 .addImm(Val: 0) // src1 modifiers
896 .add(MO: Op0)
897 .addImm(Val: 0); // omod
898 }
899
900 MachineInstr *MaskUpdateMI =
901 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AndN2Opc), DestReg: LiveMaskReg)
902 .addReg(RegNo: LiveMaskReg)
903 .addReg(RegNo: VCC);
904
  // The state of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
907 MachineInstr *EarlyTermMI =
908 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_EARLY_TERMINATE_SCC0));
909
910 MachineInstr *ExecMaskMI =
911 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AndN2Opc), DestReg: Exec).addReg(RegNo: Exec).addReg(RegNo: VCC);
912
913 assert(MBB.succ_size() == 1);
914
915 // Update live intervals
916 LIS->ReplaceMachineInstrInMaps(MI, NewMI&: *VcmpMI);
917 MBB.remove(I: &MI);
918
919 LIS->InsertMachineInstrInMaps(MI&: *MaskUpdateMI);
920 LIS->InsertMachineInstrInMaps(MI&: *EarlyTermMI);
921 LIS->InsertMachineInstrInMaps(MI&: *ExecMaskMI);
922
923 return ExecMaskMI;
924}
925
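/// Lower SI_KILL_I1_TERMINATOR and SI_DEMOTE_I1: update the live mask, emit an
/// early-terminate check, and rewrite EXEC (in WQM for demotes). Returns the
/// new terminator to split the block at, or null if the kill was a no-op.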
926MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) {
927 assert(LiveMaskReg.isVirtual());
928
929 MachineBasicBlock &MBB = *MI.getParent();
930
931 const DebugLoc &DL = MI.getDebugLoc();
932 MachineInstr *MaskUpdateMI = nullptr;
933
934 const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
935 const MachineOperand &Op = MI.getOperand(i: 0);
936 int64_t KillVal = MI.getOperand(i: 1).getImm();
937 MachineInstr *ComputeKilledMaskMI = nullptr;
938 Register CndReg = !Op.isImm() ? Op.getReg() : Register();
939 Register TmpReg;
940
941 // Is this a static or dynamic kill?
942 if (Op.isImm()) {
943 if (Op.getImm() == KillVal) {
944 // Static: all active lanes are killed
945 MaskUpdateMI = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AndN2Opc), DestReg: LiveMaskReg)
946 .addReg(RegNo: LiveMaskReg)
947 .addReg(RegNo: Exec);
948 } else {
949 // Static: kill does nothing
950 bool IsLastTerminator = std::next(x: MI.getIterator()) == MBB.end();
951 if (!IsLastTerminator) {
952 LIS->RemoveMachineInstrFromMaps(MI);
953 } else {
954 assert(MBB.succ_size() == 1 && MI.getOpcode() != AMDGPU::SI_DEMOTE_I1);
955 MachineInstr *NewTerm = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_BRANCH))
956 .addMBB(MBB: *MBB.succ_begin());
957 LIS->ReplaceMachineInstrInMaps(MI, NewMI&: *NewTerm);
958 }
959 MBB.remove(I: &MI);
960 return nullptr;
961 }
962 } else {
963 if (!KillVal) {
964 // Op represents live lanes after kill,
965 // so exec mask needs to be factored in.
966 TmpReg = MRI->createVirtualRegister(RegClass: TRI->getBoolRC());
967 ComputeKilledMaskMI =
968 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AndN2Opc), DestReg: TmpReg).addReg(RegNo: Exec).add(MO: Op);
969 MaskUpdateMI = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AndN2Opc), DestReg: LiveMaskReg)
970 .addReg(RegNo: LiveMaskReg)
971 .addReg(RegNo: TmpReg);
972 } else {
973 // Op represents lanes to kill
974 MaskUpdateMI = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AndN2Opc), DestReg: LiveMaskReg)
975 .addReg(RegNo: LiveMaskReg)
976 .add(MO: Op);
977 }
978 }
979
  // The state of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
982 MachineInstr *EarlyTermMI =
983 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_EARLY_TERMINATE_SCC0));
984
  // If we got this far, some lanes are still live;
  // update EXEC to deactivate lanes as appropriate.
987 MachineInstr *NewTerm;
988 MachineInstr *WQMMaskMI = nullptr;
989 Register LiveMaskWQM;
990 if (IsDemote) {
991 // Demote - deactivate quads with only helper lanes
992 LiveMaskWQM = MRI->createVirtualRegister(RegClass: TRI->getBoolRC());
993 WQMMaskMI =
994 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: WQMOpc), DestReg: LiveMaskWQM).addReg(RegNo: LiveMaskReg);
995 NewTerm = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AndOpc), DestReg: Exec)
996 .addReg(RegNo: Exec)
997 .addReg(RegNo: LiveMaskWQM);
998 } else {
999 // Kill - deactivate lanes no longer in live mask
1000 if (Op.isImm()) {
1001 unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1002 NewTerm = BuildMI(BB&: MBB, I: &MI, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: Exec).addImm(Val: 0);
1003 } else if (!IsWQM) {
1004 NewTerm = BuildMI(BB&: MBB, I: &MI, MIMD: DL, MCID: TII->get(Opcode: AndOpc), DestReg: Exec)
1005 .addReg(RegNo: Exec)
1006 .addReg(RegNo: LiveMaskReg);
1007 } else {
1008 unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
1009 NewTerm =
1010 BuildMI(BB&: MBB, I: &MI, MIMD: DL, MCID: TII->get(Opcode), DestReg: Exec).addReg(RegNo: Exec).add(MO: Op);
1011 }
1012 }
1013
1014 // Update live intervals
1015 LIS->RemoveMachineInstrFromMaps(MI);
1016 MBB.remove(I: &MI);
1017 assert(EarlyTermMI);
1018 assert(MaskUpdateMI);
1019 assert(NewTerm);
1020 if (ComputeKilledMaskMI)
1021 LIS->InsertMachineInstrInMaps(MI&: *ComputeKilledMaskMI);
1022 LIS->InsertMachineInstrInMaps(MI&: *MaskUpdateMI);
1023 LIS->InsertMachineInstrInMaps(MI&: *EarlyTermMI);
1024 if (WQMMaskMI)
1025 LIS->InsertMachineInstrInMaps(MI&: *WQMMaskMI);
1026 LIS->InsertMachineInstrInMaps(MI&: *NewTerm);
1027
1028 if (CndReg) {
1029 LIS->removeInterval(Reg: CndReg);
1030 LIS->createAndComputeVirtRegInterval(Reg: CndReg);
1031 }
1032 if (TmpReg)
1033 LIS->createAndComputeVirtRegInterval(Reg: TmpReg);
1034 if (LiveMaskWQM)
1035 LIS->createAndComputeVirtRegInterval(Reg: LiveMaskWQM);
1036
1037 return NewTerm;
1038}
1039
// Replace (or supplement) instructions accessing the live mask.
// This can only happen once all the live mask registers have been created
// and the execution state (WQM/StrictWWM/Exact) of each instruction is known.
1043void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI) {
1044 if (!BI.NeedsLowering)
1045 return;
1046
1047 LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1048
1049 SmallVector<MachineInstr *, 4> SplitPoints;
1050 Register ActiveLanesReg = 0;
1051 char State = BI.InitialState;
1052
1053 for (MachineInstr &MI : llvm::make_early_inc_range(
1054 Range: llvm::make_range(x: MBB.getFirstNonPHI(), y: MBB.end()))) {
1055 auto MIState = StateTransition.find(Val: &MI);
1056 if (MIState != StateTransition.end())
1057 State = MIState->second;
1058
1059 MachineInstr *SplitPoint = nullptr;
1060 switch (MI.getOpcode()) {
1061 case AMDGPU::SI_DEMOTE_I1:
1062 case AMDGPU::SI_KILL_I1_TERMINATOR:
1063 SplitPoint = lowerKillI1(MI, IsWQM: State == StateWQM);
1064 break;
1065 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1066 SplitPoint = lowerKillF32(MI);
1067 break;
1068 case AMDGPU::ENTER_STRICT_WWM:
1069 ActiveLanesReg = MI.getOperand(i: 0).getReg();
1070 break;
1071 case AMDGPU::EXIT_STRICT_WWM:
1072 ActiveLanesReg = 0;
1073 break;
1074 case AMDGPU::V_SET_INACTIVE_B32:
1075 if (ActiveLanesReg) {
1076 LiveInterval &LI = LIS->getInterval(Reg: MI.getOperand(i: 5).getReg());
1077 MRI->constrainRegClass(Reg: ActiveLanesReg, RC: TRI->getWaveMaskRegClass());
1078 MI.getOperand(i: 5).setReg(ActiveLanesReg);
1079 LIS->shrinkToUses(li: &LI);
1080 } else {
1081 assert(State == StateExact || State == StateWQM);
1082 }
1083 break;
1084 default:
1085 break;
1086 }
1087 if (SplitPoint)
1088 SplitPoints.push_back(Elt: SplitPoint);
1089 }
1090
1091 // Perform splitting after instruction scan to simplify iteration.
1092 for (MachineInstr *MI : SplitPoints)
1093 splitBlock(TermMI: MI);
1094}
1095
1096// Return an iterator in the (inclusive) range [First, Last] at which
1097// instructions can be safely inserted, keeping in mind that some of the
1098// instructions we want to add necessarily clobber SCC.
1099MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
1100 MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
1101 MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
1102 if (!SaveSCC)
1103 return PreferLast ? Last : First;
1104
1105 LiveRange &LR =
1106 LIS->getRegUnit(Unit: *TRI->regunits(Reg: MCRegister::from(Val: AMDGPU::SCC)).begin());
1107 auto MBBE = MBB.end();
1108 SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(Instr: *First)
1109 : LIS->getMBBEndIdx(mbb: &MBB);
1110 SlotIndex LastIdx =
1111 Last != MBBE ? LIS->getInstructionIndex(Instr: *Last) : LIS->getMBBEndIdx(mbb: &MBB);
1112 SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
1113 const LiveRange::Segment *S;
1114
1115 for (;;) {
1116 S = LR.getSegmentContaining(Idx);
1117 if (!S)
1118 break;
1119
1120 if (PreferLast) {
1121 SlotIndex Next = S->start.getBaseIndex();
1122 if (Next < FirstIdx)
1123 break;
1124 Idx = Next;
1125 } else {
1126 MachineInstr *EndMI = LIS->getInstructionFromIndex(index: S->end.getBaseIndex());
1127 assert(EndMI && "Segment does not end on valid instruction");
1128 auto NextI = std::next(x: EndMI->getIterator());
1129 if (NextI == MBB.end())
1130 break;
1131 SlotIndex Next = LIS->getInstructionIndex(Instr: *NextI);
1132 if (Next > LastIdx)
1133 break;
1134 Idx = Next;
1135 }
1136 }
1137
1138 MachineBasicBlock::iterator MBBI;
1139
1140 if (MachineInstr *MI = LIS->getInstructionFromIndex(index: Idx))
1141 MBBI = MI;
1142 else {
1143 assert(Idx == LIS->getMBBEndIdx(&MBB));
1144 MBBI = MBB.end();
1145 }
1146
1147 // Move insertion point past any operations modifying EXEC.
1148 // This assumes that the value of SCC defined by any of these operations
1149 // does not need to be preserved.
1150 while (MBBI != Last) {
1151 bool IsExecDef = false;
1152 for (const MachineOperand &MO : MBBI->all_defs()) {
1153 IsExecDef |=
1154 MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1155 }
1156 if (!IsExecDef)
1157 break;
1158 MBBI++;
1159 S = nullptr;
1160 }
1161
1162 if (S)
1163 MBBI = saveSCC(MBB, Before: MBBI);
1164
1165 return MBBI;
1166}
1167
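/// Switch EXEC to Exact mode by ANDing it with the live mask; if \p SaveWQM is
/// a valid register, the previous (WQM) EXEC is saved there.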
1168void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
1169 MachineBasicBlock::iterator Before,
1170 Register SaveWQM) {
1171 assert(LiveMaskReg.isVirtual());
1172
1173 bool IsTerminator = Before == MBB.end();
1174 if (!IsTerminator) {
1175 auto FirstTerm = MBB.getFirstTerminator();
1176 if (FirstTerm != MBB.end()) {
1177 SlotIndex FirstTermIdx = LIS->getInstructionIndex(Instr: *FirstTerm);
1178 SlotIndex BeforeIdx = LIS->getInstructionIndex(Instr: *Before);
1179 IsTerminator = BeforeIdx > FirstTermIdx;
1180 }
1181 }
1182
1183 MachineInstr *MI;
1184
1185 if (SaveWQM) {
1186 unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
1187 MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode), DestReg: SaveWQM)
1188 .addReg(RegNo: LiveMaskReg);
1189 } else {
1190 unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
1191 MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode), DestReg: Exec)
1192 .addReg(RegNo: Exec)
1193 .addReg(RegNo: LiveMaskReg);
1194 }
1195
1196 LIS->InsertMachineInstrInMaps(MI&: *MI);
1197 StateTransition[MI] = StateExact;
1198}
1199
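/// Switch EXEC to WQM, either by restoring a previously saved mask from
/// \p SavedWQM or by applying S_WQM to the current EXEC.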
1200void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
1201 MachineBasicBlock::iterator Before,
1202 Register SavedWQM) {
1203 MachineInstr *MI;
1204
1205 if (SavedWQM) {
1206 MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: Exec)
1207 .addReg(RegNo: SavedWQM);
1208 } else {
1209 MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: WQMOpc), DestReg: Exec).addReg(RegNo: Exec);
1210 }
1211
1212 LIS->InsertMachineInstrInMaps(MI&: *MI);
1213 StateTransition[MI] = StateWQM;
1214}
1215
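/// Enter StrictWWM or StrictWQM (as selected by \p StrictStateNeeded), saving
/// the current EXEC in \p SaveOrig.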
1216void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
1217 MachineBasicBlock::iterator Before,
1218 Register SaveOrig, char StrictStateNeeded) {
1219 MachineInstr *MI;
1220 assert(SaveOrig);
1221 assert(StrictStateNeeded == StateStrictWWM ||
1222 StrictStateNeeded == StateStrictWQM);
1223
1224 if (StrictStateNeeded == StateStrictWWM) {
1225 MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::ENTER_STRICT_WWM),
1226 DestReg: SaveOrig)
1227 .addImm(Val: -1);
1228 } else {
1229 MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::ENTER_STRICT_WQM),
1230 DestReg: SaveOrig)
1231 .addImm(Val: -1);
1232 }
1233 LIS->InsertMachineInstrInMaps(MI&: *MI);
1234 StateTransition[MI] = StrictStateNeeded;
1235}
1236
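/// Leave strict mode by restoring EXEC from \p SavedOrig; \p NonStrictState is
/// recorded as the resulting state.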
1237void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
1238 MachineBasicBlock::iterator Before,
1239 Register SavedOrig, char NonStrictState,
1240 char CurrentStrictState) {
1241 MachineInstr *MI;
1242
1243 assert(SavedOrig);
1244 assert(CurrentStrictState == StateStrictWWM ||
1245 CurrentStrictState == StateStrictWQM);
1246
1247 if (CurrentStrictState == StateStrictWWM) {
1248 MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::EXIT_STRICT_WWM),
1249 DestReg: Exec)
1250 .addReg(RegNo: SavedOrig);
1251 } else {
1252 MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::EXIT_STRICT_WQM),
1253 DestReg: Exec)
1254 .addReg(RegNo: SavedOrig);
1255 }
1256 LIS->InsertMachineInstrInMaps(MI&: *MI);
1257 StateTransition[MI] = NonStrictState;
1258}
1259
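/// Insert the EXEC transitions (Exact/WQM/StrictWWM/StrictWQM) required by
/// \p MBB based on the per-instruction analysis, recording each transition in
/// StateTransition. \p IsEntry selects the special handling for the function
/// entry block.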
1260void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, BlockInfo &BI,
1261 bool IsEntry) {
  // A non-entry block that is WQM throughout needs no state switching, so
  // there is nothing to do.
1264 if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1265 BI.InitialState = StateWQM;
1266 return;
1267 }
1268
1269 LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
1270 << ":\n");
1271
1272 Register SavedWQMReg;
1273 Register SavedNonStrictReg;
1274 bool WQMFromExec = IsEntry;
1275 char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1276 char NonStrictState = 0;
1277 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
1278
1279 auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1280 if (IsEntry) {
1281 // Skip the instruction that saves LiveMask
1282 if (II != IE && II->getOpcode() == AMDGPU::COPY &&
1283 II->getOperand(i: 1).getReg() == TRI->getExec())
1284 ++II;
1285 }
1286
1287 // This stores the first instruction where it's safe to switch from WQM to
1288 // Exact or vice versa.
1289 MachineBasicBlock::iterator FirstWQM = IE;
1290
1291 // This stores the first instruction where it's safe to switch from Strict
1292 // mode to Exact/WQM or to switch to Strict mode. It must always be the same
1293 // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
1294 // be safe to switch to/from WQM as well.
1295 MachineBasicBlock::iterator FirstStrict = IE;
1296
  // Record the initial state in the block information.
1298 BI.InitialState = State;
1299
1300 for (unsigned Idx = 0;; ++Idx) {
1301 MachineBasicBlock::iterator Next = II;
1302 char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
1303 char OutNeeds = 0;
1304
1305 if (FirstWQM == IE)
1306 FirstWQM = II;
1307
1308 if (FirstStrict == IE)
1309 FirstStrict = II;
1310
    // Adjust needs if this is the first instruction of a WQM-requiring shader.
1312 if (IsEntry && Idx == 0 && (BI.InNeeds & StateWQM))
1313 Needs = StateWQM;
1314
1315 // First, figure out the allowed states (Needs) based on the propagated
1316 // flags.
1317 if (II != IE) {
1318 MachineInstr &MI = *II;
1319
1320 if (MI.isTerminator() || TII->mayReadEXEC(MRI: *MRI, MI)) {
1321 auto III = Instructions.find(Val: &MI);
1322 if (III != Instructions.end()) {
1323 if (III->second.Needs & StateStrictWWM)
1324 Needs = StateStrictWWM;
1325 else if (III->second.Needs & StateStrictWQM)
1326 Needs = StateStrictWQM;
1327 else if (III->second.Needs & StateWQM)
1328 Needs = StateWQM;
1329 else
1330 Needs &= ~III->second.Disabled;
1331 OutNeeds = III->second.OutNeeds;
1332 }
1333 } else {
1334 // If the instruction doesn't actually need a correct EXEC, then we can
1335 // safely leave Strict mode enabled.
1336 Needs = StateExact | StateWQM | StateStrict;
1337 }
1338
1339 // Exact mode exit can occur in terminators, but must be before branches.
1340 if (MI.isBranch() && OutNeeds == StateExact)
1341 Needs = StateExact;
1342
1343 ++Next;
1344 } else {
1345 // End of basic block
1346 if (BI.OutNeeds & StateWQM)
1347 Needs = StateWQM;
1348 else if (BI.OutNeeds == StateExact)
1349 Needs = StateExact;
1350 else
1351 Needs = StateWQM | StateExact;
1352 }
1353
1354 // Now, transition if necessary.
1355 if (!(Needs & State)) {
1356 MachineBasicBlock::iterator First;
1357 if (State == StateStrictWWM || Needs == StateStrictWWM ||
1358 State == StateStrictWQM || Needs == StateStrictWQM) {
1359 // We must switch to or from Strict mode.
1360 First = FirstStrict;
1361 } else {
1362 // We only need to switch to/from WQM, so we can use FirstWQM.
1363 First = FirstWQM;
1364 }
1365
1366 // Whether we need to save SCC depends on start and end states.
1367 bool SaveSCC = false;
1368 switch (State) {
1369 case StateExact:
1370 case StateStrictWWM:
1371 case StateStrictWQM:
1372 // Exact/Strict -> Strict: save SCC
1373 // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
1374 // Exact/Strict -> Exact: no save
1375 SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1376 break;
1377 case StateWQM:
1378 // WQM -> Exact/Strict: save SCC
1379 SaveSCC = !(Needs & StateWQM);
1380 break;
1381 default:
1382 llvm_unreachable("Unknown state");
1383 break;
1384 }
1385 char StartState = State & StateStrict ? NonStrictState : State;
1386 bool WQMToExact =
1387 StartState == StateWQM && (Needs & StateExact) && !(Needs & StateWQM);
1388 bool ExactToWQM = StartState == StateExact && (Needs & StateWQM) &&
1389 !(Needs & StateExact);
1390 bool PreferLast = Needs == StateWQM;
1391 // Exact regions in divergent control flow may run at EXEC=0, so try to
1392 // exclude instructions with unexpected effects from them.
1393 // FIXME: ideally we would branch over these when EXEC=0,
1394 // but this requires updating implicit values, live intervals and CFG.
1395 if ((WQMToExact && (OutNeeds & StateWQM)) || ExactToWQM) {
1396 for (MachineBasicBlock::iterator I = First; I != II; ++I) {
1397 if (TII->hasUnwantedEffectsWhenEXECEmpty(MI: *I)) {
1398 PreferLast = WQMToExact;
1399 break;
1400 }
1401 }
1402 }
1403 MachineBasicBlock::iterator Before =
1404 prepareInsertion(MBB, First, Last: II, PreferLast, SaveSCC);
1405
1406 if (State & StateStrict) {
1407 assert(State == StateStrictWWM || State == StateStrictWQM);
1408 assert(SavedNonStrictReg);
1409 fromStrictMode(MBB, Before, SavedOrig: SavedNonStrictReg, NonStrictState, CurrentStrictState: State);
1410
1411 LIS->createAndComputeVirtRegInterval(Reg: SavedNonStrictReg);
1412 SavedNonStrictReg = 0;
1413 State = NonStrictState;
1414 }
1415
1416 if (Needs & StateStrict) {
1417 NonStrictState = State;
1418 assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1419 assert(!SavedNonStrictReg);
1420 SavedNonStrictReg = MRI->createVirtualRegister(RegClass: BoolRC);
1421
1422 toStrictMode(MBB, Before, SaveOrig: SavedNonStrictReg, StrictStateNeeded: Needs);
1423 State = Needs;
1424 } else {
1425 if (WQMToExact) {
1426 if (!WQMFromExec && (OutNeeds & StateWQM)) {
1427 assert(!SavedWQMReg);
1428 SavedWQMReg = MRI->createVirtualRegister(RegClass: BoolRC);
1429 }
1430
1431 toExact(MBB, Before, SaveWQM: SavedWQMReg);
1432 State = StateExact;
1433 } else if (ExactToWQM) {
1434 assert(WQMFromExec == (SavedWQMReg == 0));
1435
1436 toWQM(MBB, Before, SavedWQM: SavedWQMReg);
1437
1438 if (SavedWQMReg) {
1439 LIS->createAndComputeVirtRegInterval(Reg: SavedWQMReg);
1440 SavedWQMReg = 0;
1441 }
1442 State = StateWQM;
1443 } else {
          // We can get here if we transitioned from StrictWWM to a
          // non-StrictWWM state that already matches our needs, in which
          // case there is nothing to do.
1447 assert(Needs & State);
1448 }
1449 }
1450 }
1451
1452 if (Needs != (StateExact | StateWQM | StateStrict)) {
1453 if (Needs != (StateExact | StateWQM))
1454 FirstWQM = IE;
1455 FirstStrict = IE;
1456 }
1457
1458 if (II == IE)
1459 break;
1460
1461 II = Next;
1462 }
1463 assert(!SavedWQMReg);
1464 assert(!SavedNonStrictReg);
1465}
1466
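/// Replace the live-mask query pseudos (SI_PS_LIVE, SI_LIVE_MASK) with copies
/// from LiveMaskReg.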
1467bool SIWholeQuadMode::lowerLiveMaskQueries() {
1468 for (MachineInstr *MI : LiveMaskQueries) {
1469 const DebugLoc &DL = MI->getDebugLoc();
1470 Register Dest = MI->getOperand(i: 0).getReg();
1471
1472 MachineInstr *Copy =
1473 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: Dest)
1474 .addReg(RegNo: LiveMaskReg);
1475
1476 LIS->ReplaceMachineInstrInMaps(MI&: *MI, NewMI&: *Copy);
1477 MI->eraseFromParent();
1478 }
1479 return !LiveMaskQueries.empty();
1480}
1481
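/// Rewrite the recorded WQM, SOFT_WQM, STRICT_WWM, STRICT_WQM and simplified
/// V_SET_INACTIVE pseudos into plain moves or copies.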
1482bool SIWholeQuadMode::lowerCopyInstrs() {
1483 for (MachineInstr *MI : LowerToMovInstrs) {
1484 assert(MI->getNumExplicitOperands() == 2);
1485
1486 const Register Reg = MI->getOperand(i: 0).getReg();
1487
1488 const TargetRegisterClass *regClass =
1489 TRI->getRegClassForOperandReg(MRI: *MRI, MO: MI->getOperand(i: 0));
1490 if (TRI->isVGPRClass(RC: regClass)) {
1491 const unsigned MovOp = TII->getMovOpcode(DstRC: regClass);
1492 MI->setDesc(TII->get(Opcode: MovOp));
1493
1494 // Check that it already implicitly depends on exec (like all VALU movs
1495 // should do).
1496 assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
1497 return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1498 }));
1499 } else {
1500 // Remove early-clobber and exec dependency from simple SGPR copies.
1501 // This allows some to be eliminated during/post RA.
1502 LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
1503 if (MI->getOperand(i: 0).isEarlyClobber()) {
1504 LIS->removeInterval(Reg);
1505 MI->getOperand(i: 0).setIsEarlyClobber(false);
1506 LIS->createAndComputeVirtRegInterval(Reg);
1507 }
1508 int Index = MI->findRegisterUseOperandIdx(Reg: AMDGPU::EXEC, /*TRI=*/nullptr);
1509 while (Index >= 0) {
1510 MI->removeOperand(OpNo: Index);
1511 Index = MI->findRegisterUseOperandIdx(Reg: AMDGPU::EXEC, /*TRI=*/nullptr);
1512 }
1513 MI->setDesc(TII->get(Opcode: AMDGPU::COPY));
1514 LLVM_DEBUG(dbgs() << " -> " << *MI);
1515 }
1516 }
1517 for (MachineInstr *MI : LowerToCopyInstrs) {
1518 LLVM_DEBUG(dbgs() << "simplify: " << *MI);
1519
1520 if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) {
1521 assert(MI->getNumExplicitOperands() == 6);
1522
1523 LiveInterval *RecomputeLI = nullptr;
1524 if (MI->getOperand(i: 4).isReg())
1525 RecomputeLI = &LIS->getInterval(Reg: MI->getOperand(i: 4).getReg());
1526
1527 MI->removeOperand(OpNo: 5);
1528 MI->removeOperand(OpNo: 4);
1529 MI->removeOperand(OpNo: 3);
1530 MI->removeOperand(OpNo: 1);
1531
1532 if (RecomputeLI)
1533 LIS->shrinkToUses(li: RecomputeLI);
1534 } else {
1535 assert(MI->getNumExplicitOperands() == 2);
1536 }
1537
1538 unsigned CopyOp = MI->getOperand(i: 1).isReg()
1539 ? (unsigned)AMDGPU::COPY
1540 : TII->getMovOpcode(DstRC: TRI->getRegClassForOperandReg(
1541 MRI: *MRI, MO: MI->getOperand(i: 0)));
1542 MI->setDesc(TII->get(Opcode: CopyOp));
1543 LLVM_DEBUG(dbgs() << " -> " << *MI);
1544 }
1545 return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
1546}
1547
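/// Lower all recorded kill and demote pseudos, treating the function as
/// running in WQM when \p IsWQM is set, and split blocks at the resulting
/// terminators.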
1548bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1549 for (MachineInstr *MI : KillInstrs) {
1550 MachineInstr *SplitPoint = nullptr;
1551 switch (MI->getOpcode()) {
1552 case AMDGPU::SI_DEMOTE_I1:
1553 case AMDGPU::SI_KILL_I1_TERMINATOR:
1554 SplitPoint = lowerKillI1(MI&: *MI, IsWQM);
1555 break;
1556 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1557 SplitPoint = lowerKillF32(MI&: *MI);
1558 break;
1559 }
1560 if (SplitPoint)
1561 splitBlock(TermMI: SplitPoint);
1562 }
1563 return !KillInstrs.empty();
1564}
1565
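/// Expand an EXEC initialization pseudo at the start of its block:
/// SI_INIT_WHOLE_WAVE becomes S_OR_SAVEEXEC with -1, SI_INIT_EXEC becomes an
/// S_MOV of the immediate, and SI_INIT_EXEC_FROM_INPUT becomes a
/// BFE/BFM/CMP/CMOV sequence that builds EXEC from a thread count.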
1566void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
1567 MachineBasicBlock *MBB = MI.getParent();
1568 bool IsWave32 = ST->isWave32();
1569
1570 if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
1571 assert(MBB == &MBB->getParent()->front() &&
1572 "init whole wave not in entry block");
1573 Register EntryExec = MRI->createVirtualRegister(RegClass: TRI->getBoolRC());
    MachineInstr *SaveExec =
        BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
                TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
                                  : AMDGPU::S_OR_SAVEEXEC_B64),
                EntryExec)
            .addImm(-1);

    // Replace all uses of MI's destination reg with EntryExec.
    MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);

    if (LIS) {
      LIS->RemoveMachineInstrFromMaps(MI);
    }

    MI.eraseFromParent();

    if (LIS) {
      LIS->InsertMachineInstrInMaps(*SaveExec);
      LIS->createAndComputeVirtRegInterval(EntryExec);
    }
    return;
  }

  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
    // This should be before all vector instructions.
    MachineInstr *InitMI =
        BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
                TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
                Exec)
            .addImm(MI.getOperand(0).getImm());
    if (LIS) {
      LIS->RemoveMachineInstrFromMaps(MI);
      LIS->InsertMachineInstrInMaps(*InitMI);
    }
    MI.eraseFromParent();
    return;
  }

  // Extract the thread count from an SGPR input and set EXEC accordingly.
  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
  //
  // S_BFE_U32 count, input, {shift, 7}
  // S_BFM_B64 exec, count, 0
  // S_CMP_EQ_U32 count, 64
  // S_CMOV_B64 exec, -1
  Register InputReg = MI.getOperand(0).getReg();
  MachineInstr *FirstMI = &*MBB->begin();
  if (InputReg.isVirtual()) {
    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
    assert(DefInstr && DefInstr->isCopy());
    if (DefInstr->getParent() == MBB) {
      if (DefInstr != FirstMI) {
        // If the `InputReg` is defined in the current block, we also need to
        // move that instruction to the beginning of the block.
        DefInstr->removeFromParent();
        MBB->insert(FirstMI, DefInstr);
        if (LIS)
          LIS->handleMove(*DefInstr);
      } else {
        // If the first instruction is the definition, move the insertion
        // point past it.
        FirstMI = &*std::next(FirstMI->getIterator());
      }
    }
  }

  // Insert instruction sequence at block beginning (before vector operations).
  const DebugLoc DL = MI.getDebugLoc();
  const unsigned WavefrontSize = ST->getWavefrontSize();
  const unsigned Mask = (WavefrontSize << 1) - 1;
  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
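  // The width field of the S_BFE immediate lives in its high bits while the
  // bit offset sits in the low bits; 0x70000 requests a 7-bit wide field,
  // matching the {shift, 7} in the sketch above.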
  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
                   .addReg(InputReg)
                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
  auto BfmMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
          .addReg(CountReg)
          .addImm(0);
  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
                   .addReg(CountReg, RegState::Kill)
                   .addImm(WavefrontSize);
  auto CmovMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
              Exec)
          .addImm(-1);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*BfeMI);
  LIS->InsertMachineInstrInMaps(*BfmMI);
  LIS->InsertMachineInstrInMaps(*CmpMI);
  LIS->InsertMachineInstrInMaps(*CmovMI);

  LIS->removeInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(CountReg);
}

/// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
/// for instructions that depend on EXEC.
MachineBasicBlock::iterator
SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) {
  MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();

  for (MachineInstr *MI : InitExecInstrs) {
    // Try to handle undefined cases gracefully:
    // - multiple INIT_EXEC instructions
    // - INIT_EXEC instructions not in the entry block
    if (MI->getParent() == &Entry)
      InsertPt = std::next(MI->getIterator());

    lowerInitExec(*MI);
    Changed = true;
  }

  return InsertPt;
}

bool SIWholeQuadMode::run(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                    << " ------------- \n");
  LLVM_DEBUG(MF.dump(););

  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  LowerToMovInstrs.clear();
  KillInstrs.clear();
  InitExecInstrs.clear();
  SetInactiveInstrs.clear();
  StateTransition.clear();

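  // Select the wave-size specific opcodes and the physical EXEC register used
  // by the rest of the lowering.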
  if (ST->isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    AndTermOpc = AMDGPU::S_AND_B32_term;
    AndN2Opc = AMDGPU::S_ANDN2_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
    WQMOpc = AMDGPU::S_WQM_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    AndTermOpc = AMDGPU::S_AND_B64_term;
    AndN2Opc = AMDGPU::S_ANDN2_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
    WQMOpc = AMDGPU::S_WQM_B64;
    Exec = AMDGPU::EXEC;
  }

  const char GlobalFlags = analyzeFunction(MF);
  bool Changed = false;

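  // Unless a dedicated live mask register is created below, EXEC itself acts
  // as the live mask.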
  LiveMaskReg = Exec;

  MachineBasicBlock &Entry = MF.front();
  MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed);

  // Store a copy of the original live mask when required
  const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
  const bool HasWaveModes = GlobalFlags & ~StateExact;
  const bool HasKills = !KillInstrs.empty();
  const bool UsesWQM = GlobalFlags & StateWQM;
  if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
    MachineInstr *MI =
        BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
            .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
    Changed = true;
  }

  // Check if V_SET_INACTIVE was touched by a strict state mode.
  // If so, promote to WWM; otherwise lower to COPY.
  for (MachineInstr *MI : SetInactiveInstrs) {
    if (LowerToCopyInstrs.contains(MI))
      continue;
    auto &Info = Instructions[MI];
    if (Info.MarkedStates & StateStrict) {
      Info.Needs |= StateStrictWWM;
      Info.Disabled &= ~StateStrictWWM;
      Blocks[MI->getParent()].Needs |= StateStrictWWM;
    } else {
      LLVM_DEBUG(dbgs() << "Has no WWM marking: " << *MI);
      LowerToCopyInstrs.insert(MI);
    }
  }

  LLVM_DEBUG(printInfo());

  Changed |= lowerLiveMaskQueries();
  Changed |= lowerCopyInstrs();

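  // Pick the lowering strategy based on which execution modes the function
  // actually needs.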
  if (!HasWaveModes) {
    // No wave mode execution
    Changed |= lowerKillInstrs(false);
  } else if (GlobalFlags == StateWQM) {
    // Shader only needs WQM
    auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
                  .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
    lowerKillInstrs(true);
    Changed = true;
  } else {
    // Mark entry for WQM if required.
    if (GlobalFlags & StateWQM)
      Blocks[&Entry].InNeeds |= StateWQM;
    // Wave mode switching requires full lowering pass.
    for (auto &BII : Blocks)
      processBlock(*BII.first, BII.second, BII.first == &Entry);
    // Lowering blocks causes block splitting so perform as a second pass.
    for (auto &BII : Blocks)
      lowerBlock(*BII.first, BII.second);
    Changed = true;
  }

  // Compute live range for live mask
  if (LiveMaskReg != Exec)
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);

  // If we performed any kills then recompute EXEC
  if (!KillInstrs.empty() || !InitExecInstrs.empty())
    LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);

  return Changed;
}

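// Legacy pass manager entry point: gather the required and optional analyses
// and delegate to the shared implementation.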
bool SIWholeQuadModeLegacy::runOnMachineFunction(MachineFunction &MF) {
  LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
  auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
  MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
  auto *PDTWrapper =
      getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
  MachinePostDominatorTree *PDT =
      PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
  SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
  return Impl.run(MF);
}

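// New pass manager entry point. The dominator trees are only used when they
// are already cached; when nothing changes, all analyses are preserved.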
PreservedAnalyses
SIWholeQuadModePass::run(MachineFunction &MF,
                         MachineFunctionAnalysisManager &MFAM) {
  MFPropsModifier _(*this, MF);

  LiveIntervals *LIS = &MFAM.getResult<LiveIntervalsAnalysis>(MF);
  MachineDominatorTree *MDT =
      MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF);
  MachinePostDominatorTree *PDT =
      MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF);
  SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
  bool Changed = Impl.run(MF);
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserve<SlotIndexesAnalysis>();
  PA.preserve<LiveIntervalsAnalysis>();
  PA.preserve<MachineDominatorTreeAnalysis>();
  PA.preserve<MachinePostDominatorTreeAnalysis>();
  return PA;
}