| 1 | //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | /// \file |
| 10 | /// This pass adds instructions to enable whole quad mode (strict or non-strict) |
| 11 | /// for pixel shaders, and strict whole wavefront mode for all programs. |
| 12 | /// |
| 13 | /// The "strict" prefix indicates that inactive lanes do not take part in |
| 14 | /// control flow; specifically, an inactive lane enabled by strict WQM/WWM will |
| 15 | /// always be enabled irrespective of control flow decisions. Conversely, in |
| 16 | /// non-strict WQM inactive lanes may take part in control flow decisions. |
| 17 | /// |
| 18 | /// Whole quad mode is required for derivative computations, but it interferes |
| 19 | /// with shader side effects (stores and atomics). This pass ensures that WQM is |
| 20 | /// enabled when necessary, but disabled around stores and atomics. |
| 21 | /// |
| 22 | /// When necessary, this pass creates a function prolog |
| 23 | /// |
| 24 | /// S_MOV_B64 LiveMask, EXEC |
| 25 | /// S_WQM_B64 EXEC, EXEC |
| 26 | /// |
| 27 | /// to enter WQM at the top of the function and surrounds blocks of Exact |
| 28 | /// instructions by |
| 29 | /// |
| 30 | /// S_AND_SAVEEXEC_B64 Tmp, LiveMask |
| 31 | /// ... |
| 32 | /// S_MOV_B64 EXEC, Tmp |
| 33 | /// |
| 34 | /// We also compute when a sequence of instructions requires strict whole |
| 35 | /// wavefront mode (StrictWWM) and insert instructions to save and restore it: |
| 36 | /// |
| 37 | /// S_OR_SAVEEXEC_B64 Tmp, -1 |
| 38 | /// ... |
| 39 | /// S_MOV_B64 EXEC, Tmp |
| 40 | /// |
| 41 | /// When a sequence of instructions requires strict whole quad mode (StrictWQM) |
| 42 | /// we use a similar save and restore mechanism and force whole quad mode for |
| 43 | /// those instructions: |
| 44 | /// |
| 45 | /// S_MOV_B64 Tmp, EXEC |
| 46 | /// S_WQM_B64 EXEC, EXEC |
| 47 | /// ... |
| 48 | /// S_MOV_B64 EXEC, Tmp |
| 49 | /// |
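|  | /// As an illustration (a sketch of how these pieces combine, not literal |
|  | /// pass output), a pixel shader that samples a texture and then stores the |
|  | /// result could end up as: |
|  | /// |
|  | ///    S_MOV_B64 LiveMask, EXEC |
|  | ///    S_WQM_B64 EXEC, EXEC            ; enter WQM for the sample |
|  | ///    IMAGE_SAMPLE ...                ; needs helper lanes for derivatives |
|  | ///    S_AND_B64 EXEC, EXEC, LiveMask  ; back to Exact before the store |
|  | ///    BUFFER_STORE_DWORD ... |
|  | /// |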
| 50 | /// In order to avoid excessive switching during sequences of Exact |
| 51 | /// instructions, the pass first analyzes which instructions must be run in WQM |
| 52 | /// (aka which instructions produce values that lead to derivative |
| 53 | /// computations). |
| 54 | /// |
| 55 | /// Basic blocks are always exited in WQM as long as some successor needs WQM. |
| 56 | /// |
| 57 | /// There is room for improvement given better control flow analysis: |
| 58 | /// |
| 59 | /// (1) at the top level (outside of control flow statements, and as long as |
| 60 | /// kill hasn't been used), one SGPR can be saved by recovering WQM from |
| 61 | /// the LiveMask (this is implemented for the entry block). |
| 62 | /// |
| 63 | /// (2) when entire regions (e.g. if-else blocks or entire loops) only |
| 64 | /// consist of exact and don't-care instructions, the switch only has to |
| 65 | /// be done at the entry and exit points rather than potentially in each |
| 66 | /// block of the region. |
| 67 | /// |
| 68 | //===----------------------------------------------------------------------===// |
| 69 | |
| 70 | #include "SIWholeQuadMode.h" |
| 71 | #include "AMDGPU.h" |
| 72 | #include "GCNSubtarget.h" |
| 73 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
| 74 | #include "llvm/ADT/MapVector.h" |
| 75 | #include "llvm/ADT/PostOrderIterator.h" |
| 76 | #include "llvm/CodeGen/LiveIntervals.h" |
| 77 | #include "llvm/CodeGen/MachineBasicBlock.h" |
| 78 | #include "llvm/CodeGen/MachineDominators.h" |
| 79 | #include "llvm/CodeGen/MachineFunctionPass.h" |
| 80 | #include "llvm/CodeGen/MachineInstr.h" |
| 81 | #include "llvm/CodeGen/MachinePostDominators.h" |
| 82 | #include "llvm/IR/CallingConv.h" |
| 83 | #include "llvm/InitializePasses.h" |
| 84 | #include "llvm/Support/raw_ostream.h" |
| 85 | |
| 86 | using namespace llvm; |
| 87 | |
| 88 | #define DEBUG_TYPE "si-wqm" |
| 89 | |
| 90 | namespace { |
| 91 | |
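|  | // Execution-mask states tracked by the pass. These are combined as bitmasks, |
|  | // so a single instruction or block may allow or require several of them. |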
| 92 | enum { |
| 93 | StateWQM = 0x1, |
| 94 | StateStrictWWM = 0x2, |
| 95 | StateStrictWQM = 0x4, |
| 96 | StateExact = 0x8, |
| 97 | StateStrict = StateStrictWWM | StateStrictWQM, |
| 98 | }; |
| 99 | |
| 100 | struct PrintState { |
| 101 | public: |
| 102 | int State; |
| 103 | |
| 104 | explicit PrintState(int State) : State(State) {} |
| 105 | }; |
| 106 | |
| 107 | #ifndef NDEBUG |
| 108 | static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) { |
| 109 | |
| 110 | static const std::pair<char, const char *> Mapping[] = { |
| 111 | std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"), |
| 112 | std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")}; |
| 113 | char State = PS.State; |
| 114 | for (auto M : Mapping) { |
| 115 | if (State & M.first) { |
| 116 | OS << M.second; |
| 117 | State &= ~M.first; |
| 118 | |
| 119 | if (State) |
| 120 | OS << '|'; |
| 121 | } |
| 122 | } |
| 123 | assert(State == 0); |
| 124 | return OS; |
| 125 | } |
| 126 | #endif |
| 127 | |
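|  | // Per-instruction analysis results. All fields are bitmasks of the State* |
|  | // flags: Needs is what the instruction itself requires, Disabled lists states |
|  | // it must never execute in, OutNeeds is what the instructions after it |
|  | // require, and MarkedStates records every state ever requested for it, |
|  | // including requests that were dropped because they are disabled. |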
| 128 | struct InstrInfo { |
| 129 | char Needs = 0; |
| 130 | char Disabled = 0; |
| 131 | char OutNeeds = 0; |
| 132 | char MarkedStates = 0; |
| 133 | }; |
| 134 | |
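|  | // Per-block analysis results. Needs/InNeeds/OutNeeds are the block-level |
|  | // analogues of the InstrInfo fields; InitialState is the state chosen for |
|  | // block entry by processBlock, and NeedsLowering marks blocks containing |
|  | // kills or V_SET_INACTIVE that lowerBlock must rewrite. |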
| 135 | struct BlockInfo { |
| 136 | char Needs = 0; |
| 137 | char InNeeds = 0; |
| 138 | char OutNeeds = 0; |
| 139 | char InitialState = 0; |
| 140 | bool NeedsLowering = false; |
| 141 | }; |
| 142 | |
| 143 | struct WorkItem { |
| 144 | MachineBasicBlock *MBB = nullptr; |
| 145 | MachineInstr *MI = nullptr; |
| 146 | |
| 147 | WorkItem() = default; |
| 148 | WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {} |
| 149 | WorkItem(MachineInstr *MI) : MI(MI) {} |
| 150 | }; |
| 151 | |
| 152 | class SIWholeQuadMode { |
| 153 | public: |
| 154 | SIWholeQuadMode(MachineFunction &MF, LiveIntervals *LIS, |
| 155 | MachineDominatorTree *MDT, MachinePostDominatorTree *PDT) |
| 156 | : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()), |
| 157 | TRI(&TII->getRegisterInfo()), MRI(&MF.getRegInfo()), LIS(LIS), MDT(MDT), |
| 158 | PDT(PDT) {} |
| 159 | bool run(MachineFunction &MF); |
| 160 | |
| 161 | private: |
| 162 | const GCNSubtarget *ST; |
| 163 | const SIInstrInfo *TII; |
| 164 | const SIRegisterInfo *TRI; |
| 165 | MachineRegisterInfo *MRI; |
| 166 | LiveIntervals *LIS; |
| 167 | MachineDominatorTree *MDT; |
| 168 | MachinePostDominatorTree *PDT; |
| 169 | |
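|  | // Wave-size dependent opcodes (B32 vs B64 forms) and the exec register, |
|  | // chosen when the pass runs; LiveMaskReg tracks the lanes that are still |
|  | // live, i.e. not yet killed. |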
| 170 | unsigned AndOpc; |
| 171 | unsigned AndTermOpc; |
| 172 | unsigned AndN2Opc; |
| 173 | unsigned XorOpc; |
| 174 | unsigned AndSaveExecOpc; |
| 175 | unsigned AndSaveExecTermOpc; |
| 176 | unsigned WQMOpc; |
| 177 | Register Exec; |
| 178 | Register LiveMaskReg; |
| 179 | |
| 180 | DenseMap<const MachineInstr *, InstrInfo> Instructions; |
| 181 | MapVector<MachineBasicBlock *, BlockInfo> Blocks; |
| 182 | |
| 183 | // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction |
| 184 | DenseMap<const MachineInstr *, char> StateTransition; |
| 185 | |
| 186 | SmallVector<MachineInstr *, 2> LiveMaskQueries; |
| 187 | SmallVector<MachineInstr *, 4> LowerToMovInstrs; |
| 188 | SmallSetVector<MachineInstr *, 4> LowerToCopyInstrs; |
| 189 | SmallVector<MachineInstr *, 4> KillInstrs; |
| 190 | SmallVector<MachineInstr *, 4> InitExecInstrs; |
| 191 | SmallVector<MachineInstr *, 4> SetInactiveInstrs; |
| 192 | |
| 193 | void printInfo(); |
| 194 | |
| 195 | void markInstruction(MachineInstr &MI, char Flag, |
| 196 | std::vector<WorkItem> &Worklist); |
| 197 | void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg, |
| 198 | unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist); |
| 199 | void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag, |
| 200 | std::vector<WorkItem> &Worklist); |
| 201 | void markInstructionUses(const MachineInstr &MI, char Flag, |
| 202 | std::vector<WorkItem> &Worklist); |
| 203 | char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist); |
| 204 | void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist); |
| 205 | void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist); |
| 206 | char analyzeFunction(MachineFunction &MF); |
| 207 | |
| 208 | MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB, |
| 209 | MachineBasicBlock::iterator Before); |
| 210 | MachineBasicBlock::iterator |
| 211 | prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First, |
| 212 | MachineBasicBlock::iterator Last, bool PreferLast, |
| 213 | bool SaveSCC); |
| 214 | void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, |
| 215 | Register SaveWQM); |
| 216 | void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, |
| 217 | Register SavedWQM); |
| 218 | void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, |
| 219 | Register SaveOrig, char StrictStateNeeded); |
| 220 | void fromStrictMode(MachineBasicBlock &MBB, |
| 221 | MachineBasicBlock::iterator Before, Register SavedOrig, |
| 222 | char NonStrictState, char CurrentStrictState); |
| 223 | |
| 224 | void splitBlock(MachineInstr *TermMI); |
| 225 | MachineInstr *lowerKillI1(MachineInstr &MI, bool IsWQM); |
| 226 | MachineInstr *lowerKillF32(MachineInstr &MI); |
| 227 | |
| 228 | void lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI); |
| 229 | void processBlock(MachineBasicBlock &MBB, BlockInfo &BI, bool IsEntry); |
| 230 | |
| 231 | bool lowerLiveMaskQueries(); |
| 232 | bool lowerCopyInstrs(); |
| 233 | bool lowerKillInstrs(bool IsWQM); |
| 234 | void lowerInitExec(MachineInstr &MI); |
| 235 | MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry, |
| 236 | bool &Changed); |
| 237 | }; |
| 238 | |
| 239 | class SIWholeQuadModeLegacy : public MachineFunctionPass { |
| 240 | public: |
| 241 | static char ID; |
| 242 | |
| 243 | SIWholeQuadModeLegacy() : MachineFunctionPass(ID) {} |
| 244 | |
| 245 | bool runOnMachineFunction(MachineFunction &MF) override; |
| 246 | |
| 247 | StringRef getPassName() const override { return "SI Whole Quad Mode" ; } |
| 248 | |
| 249 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
| 250 | AU.addRequired<LiveIntervalsWrapperPass>(); |
| 251 | AU.addPreserved<SlotIndexesWrapperPass>(); |
| 252 | AU.addPreserved<LiveIntervalsWrapperPass>(); |
| 253 | AU.addPreserved<MachineDominatorTreeWrapperPass>(); |
| 254 | AU.addPreserved<MachinePostDominatorTreeWrapperPass>(); |
| 255 | MachineFunctionPass::getAnalysisUsage(AU); |
| 256 | } |
| 257 | |
| 258 | MachineFunctionProperties getClearedProperties() const override { |
| 259 | return MachineFunctionProperties().setIsSSA(); |
| 260 | } |
| 261 | }; |
| 262 | } // end anonymous namespace |
| 263 | |
| 264 | char SIWholeQuadModeLegacy::ID = 0; |
| 265 | |
| 266 | INITIALIZE_PASS_BEGIN(SIWholeQuadModeLegacy, DEBUG_TYPE, "SI Whole Quad Mode" , |
| 267 | false, false) |
| 268 | INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) |
| 269 | INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) |
| 270 | INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) |
| 271 | INITIALIZE_PASS_END(SIWholeQuadModeLegacy, DEBUG_TYPE, "SI Whole Quad Mode" , |
| 272 | false, false) |
| 273 | |
| 274 | char &llvm::SIWholeQuadModeID = SIWholeQuadModeLegacy::ID; |
| 275 | |
| 276 | FunctionPass *llvm::createSIWholeQuadModeLegacyPass() { |
| 277 | return new SIWholeQuadModeLegacy; |
| 278 | } |
| 279 | |
| 280 | #ifndef NDEBUG |
| 281 | LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() { |
| 282 | for (const auto &BII : Blocks) { |
| 283 | dbgs() << "\n" |
| 284 | << printMBBReference(*BII.first) << ":\n" |
| 285 | << " InNeeds = " << PrintState(BII.second.InNeeds) |
| 286 | << ", Needs = " << PrintState(BII.second.Needs) |
| 287 | << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n" ; |
| 288 | |
| 289 | for (const MachineInstr &MI : *BII.first) { |
| 290 | auto III = Instructions.find(&MI); |
| 291 | if (III != Instructions.end()) { |
| 292 | dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs) |
| 293 | << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n'; |
| 294 | } |
| 295 | } |
| 296 | } |
| 297 | } |
| 298 | #endif |
| 299 | |
| 300 | void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, |
| 301 | std::vector<WorkItem> &Worklist) { |
| 302 | InstrInfo &II = Instructions[&MI]; |
| 303 | |
| 304 | assert(!(Flag & StateExact) && Flag != 0); |
| 305 | |
| 306 | // Capture all states requested in marking including disabled ones. |
| 307 | II.MarkedStates |= Flag; |
| 308 | |
| 309 | // Remove any disabled states from the flag. The user that required it gets |
| 310 | // an undefined value in the helper lanes. For example, this can happen if |
| 311 | // the result of an atomic is used by an instruction that requires WQM, where |
| 312 | // ignoring the request for WQM is correct as per the relevant specs. |
| 313 | Flag &= ~II.Disabled; |
| 314 | |
| 315 | // Ignore if the flag is already encompassed by the existing needs, or we |
| 316 | // just disabled everything. |
| 317 | if ((II.Needs & Flag) == Flag) |
| 318 | return; |
| 319 | |
| 320 | LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI); |
| 321 | II.Needs |= Flag; |
| 322 | Worklist.emplace_back(args: &MI); |
| 323 | } |
| 324 | |
| 325 | /// Mark all relevant definitions of register \p Reg reaching the use in \p UseMI. |
| 326 | void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR, |
| 327 | Register Reg, unsigned SubReg, char Flag, |
| 328 | std::vector<WorkItem> &Worklist) { |
| 329 | LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI); |
| 330 | |
| 331 | LiveQueryResult UseLRQ = LR.Query(Idx: LIS->getInstructionIndex(Instr: UseMI)); |
| 332 | const VNInfo *Value = UseLRQ.valueIn(); |
| 333 | if (!Value) |
| 334 | return; |
| 335 | |
| 336 | // Note: this code assumes that lane masks on AMDGPU completely |
| 337 | // cover registers. |
| 338 | const LaneBitmask UseLanes = |
| 339 | SubReg ? TRI->getSubRegIndexLaneMask(SubIdx: SubReg) |
| 340 | : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg) |
| 341 | : LaneBitmask::getNone()); |
| 342 | |
| 343 | // Perform a depth-first iteration of the LiveRange graph marking defs. |
| 344 | // Stop processing of a given branch when all use lanes have been defined. |
| 345 | // The first definition stops processing for a physical register. |
| 346 | struct PhiEntry { |
| 347 | const VNInfo *Phi; |
| 348 | unsigned PredIdx; |
| 349 | LaneBitmask DefinedLanes; |
| 350 | |
| 351 | PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes) |
| 352 | : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {} |
| 353 | }; |
| 354 | using VisitKey = std::pair<const VNInfo *, LaneBitmask>; |
| 355 | SmallVector<PhiEntry, 2> PhiStack; |
| 356 | SmallSet<VisitKey, 4> Visited; |
| 357 | LaneBitmask DefinedLanes; |
| 358 | unsigned NextPredIdx = 0; // Only used for processing phi nodes |
| 359 | do { |
| 360 | const VNInfo *NextValue = nullptr; |
| 361 | const VisitKey Key(Value, DefinedLanes); |
| 362 | |
| 363 | if (Visited.insert(V: Key).second) { |
| 364 | // On the first visit to a phi, start processing from the first predecessor |
| 365 | NextPredIdx = 0; |
| 366 | } |
| 367 | |
| 368 | if (Value->isPHIDef()) { |
| 369 | // Each predecessor node in the phi must be processed as a subgraph |
| 370 | const MachineBasicBlock *MBB = LIS->getMBBFromIndex(index: Value->def); |
| 371 | assert(MBB && "Phi-def has no defining MBB" ); |
| 372 | |
| 373 | // Find next predecessor to process |
| 374 | unsigned Idx = NextPredIdx; |
| 375 | const auto *PI = MBB->pred_begin() + Idx; |
| 376 | const auto *PE = MBB->pred_end(); |
| 377 | for (; PI != PE && !NextValue; ++PI, ++Idx) { |
| 378 | if (const VNInfo *VN = LR.getVNInfoBefore(Idx: LIS->getMBBEndIdx(mbb: *PI))) { |
| 379 | if (!Visited.count(V: VisitKey(VN, DefinedLanes))) |
| 380 | NextValue = VN; |
| 381 | } |
| 382 | } |
| 383 | |
| 384 | // If there are more predecessors to process, add the phi to the stack |
| 385 | if (PI != PE) |
| 386 | PhiStack.emplace_back(Args&: Value, Args&: Idx, Args&: DefinedLanes); |
| 387 | } else { |
| 388 | MachineInstr *MI = LIS->getInstructionFromIndex(index: Value->def); |
| 389 | assert(MI && "Def has no defining instruction" ); |
| 390 | |
| 391 | if (Reg.isVirtual()) { |
| 392 | // Iterate over all operands to find relevant definitions |
| 393 | bool HasDef = false; |
| 394 | for (const MachineOperand &Op : MI->all_defs()) { |
| 395 | if (Op.getReg() != Reg) |
| 396 | continue; |
| 397 | |
| 398 | // Compute lanes defined and overlap with use |
| 399 | LaneBitmask OpLanes = |
| 400 | Op.isUndef() ? LaneBitmask::getAll() |
| 401 | : TRI->getSubRegIndexLaneMask(SubIdx: Op.getSubReg()); |
| 402 | LaneBitmask Overlap = (UseLanes & OpLanes); |
| 403 | |
| 404 | // Record if this instruction defined any lanes of the use |
| 405 | HasDef |= Overlap.any(); |
| 406 | |
| 407 | // Mark any lanes defined |
| 408 | DefinedLanes |= OpLanes; |
| 409 | } |
| 410 | |
| 411 | // Check if all lanes of use have been defined |
| 412 | if ((DefinedLanes & UseLanes) != UseLanes) { |
| 413 | // Definition not complete; need to process input value |
| 414 | LiveQueryResult LRQ = LR.Query(Idx: LIS->getInstructionIndex(Instr: *MI)); |
| 415 | if (const VNInfo *VN = LRQ.valueIn()) { |
| 416 | if (!Visited.count(V: VisitKey(VN, DefinedLanes))) |
| 417 | NextValue = VN; |
| 418 | } |
| 419 | } |
| 420 | |
| 421 | // Only mark the instruction if it defines some part of the use |
| 422 | if (HasDef) |
| 423 | markInstruction(MI&: *MI, Flag, Worklist); |
| 424 | } else { |
| 425 | // For physical registers simply mark the defining instruction |
| 426 | markInstruction(MI&: *MI, Flag, Worklist); |
| 427 | } |
| 428 | } |
| 429 | |
| 430 | if (!NextValue && !PhiStack.empty()) { |
| 431 | // Reached the end of a chain; revert to processing the last phi |
| 432 | PhiEntry &Entry = PhiStack.back(); |
| 433 | NextValue = Entry.Phi; |
| 434 | NextPredIdx = Entry.PredIdx; |
| 435 | DefinedLanes = Entry.DefinedLanes; |
| 436 | PhiStack.pop_back(); |
| 437 | } |
| 438 | |
| 439 | Value = NextValue; |
| 440 | } while (Value); |
| 441 | } |
| 442 | |
| 443 | void SIWholeQuadMode::markOperand(const MachineInstr &MI, |
| 444 | const MachineOperand &Op, char Flag, |
| 445 | std::vector<WorkItem> &Worklist) { |
| 446 | assert(Op.isReg()); |
| 447 | Register Reg = Op.getReg(); |
| 448 | |
| 449 | // Ignore some hardware registers |
| 450 | switch (Reg) { |
| 451 | case AMDGPU::EXEC: |
| 452 | case AMDGPU::EXEC_LO: |
| 453 | return; |
| 454 | default: |
| 455 | break; |
| 456 | } |
| 457 | |
| 458 | LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op |
| 459 | << " for " << MI); |
| 460 | if (Reg.isVirtual()) { |
| 461 | LiveRange &LR = LIS->getInterval(Reg); |
| 462 | markDefs(UseMI: MI, LR, Reg, SubReg: Op.getSubReg(), Flag, Worklist); |
| 463 | } else { |
| 464 | // Handle physical registers that we need to track; this is mostly relevant |
| 465 | // for VCC, which can appear as the (implicit) input of a uniform branch, |
| 466 | // e.g. when a loop counter is stored in a VGPR. |
| 467 | for (MCRegUnit Unit : TRI->regunits(Reg: Reg.asMCReg())) { |
| 468 | LiveRange &LR = LIS->getRegUnit(Unit); |
| 469 | const VNInfo *Value = LR.Query(Idx: LIS->getInstructionIndex(Instr: MI)).valueIn(); |
| 470 | if (Value) |
| 471 | markDefs(UseMI: MI, LR, Reg: Unit, SubReg: AMDGPU::NoSubRegister, Flag, Worklist); |
| 472 | } |
| 473 | } |
| 474 | } |
| 475 | |
| 476 | /// Mark all instructions defining the uses in \p MI with \p Flag. |
| 477 | void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag, |
| 478 | std::vector<WorkItem> &Worklist) { |
| 479 | LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": " |
| 480 | << MI); |
| 481 | |
| 482 | for (const MachineOperand &Use : MI.all_uses()) |
| 483 | markOperand(MI, Op: Use, Flag, Worklist); |
| 484 | } |
| 485 | |
| 486 | // Scan instructions to determine which ones require an Exact execmask and |
| 487 | // which ones seed WQM requirements. |
| 488 | char SIWholeQuadMode::scanInstructions(MachineFunction &MF, |
| 489 | std::vector<WorkItem> &Worklist) { |
| 490 | char GlobalFlags = 0; |
| 491 | bool WQMOutputs = MF.getFunction().hasFnAttribute(Kind: "amdgpu-ps-wqm-outputs" ); |
| 492 | SmallVector<MachineInstr *, 4> SoftWQMInstrs; |
| 493 | bool HasImplicitDerivatives = |
| 494 | MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; |
| 495 | |
| 496 | // We need to visit the basic blocks in reverse post-order so that we visit |
| 497 | // defs before uses, in particular so that we don't accidentally mark an |
| 498 | // instruction as needing e.g. WQM before visiting it and realizing it needs |
| 499 | // WQM disabled. |
| 500 | ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); |
| 501 | for (MachineBasicBlock *MBB : RPOT) { |
| 502 | BlockInfo &BBI = Blocks[MBB]; |
| 503 | |
| 504 | for (MachineInstr &MI : *MBB) { |
| 505 | InstrInfo &III = Instructions[&MI]; |
| 506 | unsigned Opcode = MI.getOpcode(); |
| 507 | char Flags = 0; |
| 508 | |
| 509 | if (TII->isWQM(Opcode)) { |
| 510 | // If LOD is not supported, WQM is not needed. |
| 511 | // Only generate implicit WQM if implicit derivatives are required. |
| 512 | // This avoids inserting unintended WQM if a shader type without |
| 513 | // implicit derivatives uses an image sampling instruction. |
| 514 | if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) { |
| 515 | // Sampling instructions don't need to produce results for all pixels |
| 516 | // in a quad, they just require all inputs of a quad to have been |
| 517 | // computed for derivatives. |
| 518 | markInstructionUses(MI, Flag: StateWQM, Worklist); |
| 519 | GlobalFlags |= StateWQM; |
| 520 | } |
| 521 | } else if (Opcode == AMDGPU::WQM) { |
| 522 | // The WQM intrinsic requires its output to have all the helper lanes |
| 523 | // correct, so we need it to be in WQM. |
| 524 | Flags = StateWQM; |
| 525 | LowerToCopyInstrs.insert(X: &MI); |
| 526 | } else if (Opcode == AMDGPU::SOFT_WQM) { |
| 527 | LowerToCopyInstrs.insert(X: &MI); |
| 528 | SoftWQMInstrs.push_back(Elt: &MI); |
| 529 | } else if (Opcode == AMDGPU::STRICT_WWM) { |
| 530 | // The STRICT_WWM intrinsic doesn't make the same guarantee; in addition, |
| 531 | // it needs to be executed in WQM or Exact so that its copy doesn't |
| 532 | // clobber inactive lanes. |
| 533 | markInstructionUses(MI, Flag: StateStrictWWM, Worklist); |
| 534 | GlobalFlags |= StateStrictWWM; |
| 535 | LowerToMovInstrs.push_back(Elt: &MI); |
| 536 | } else if (Opcode == AMDGPU::STRICT_WQM || |
| 537 | TII->isDualSourceBlendEXP(MI)) { |
| 538 | // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all |
| 539 | // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads in |
| 540 | // quads that have at least one active thread. |
| 541 | markInstructionUses(MI, Flag: StateStrictWQM, Worklist); |
| 542 | GlobalFlags |= StateStrictWQM; |
| 543 | |
| 544 | if (Opcode == AMDGPU::STRICT_WQM) { |
| 545 | LowerToMovInstrs.push_back(Elt: &MI); |
| 546 | } else { |
| 547 | // Dual source blend export acts as implicit strict-WQM: its sources |
| 548 | // need to be shuffled in strict WQM, but the export itself needs to |
| 549 | // run in Exact mode. |
| 550 | BBI.Needs |= StateExact; |
| 551 | if (!(BBI.InNeeds & StateExact)) { |
| 552 | BBI.InNeeds |= StateExact; |
| 553 | Worklist.emplace_back(args&: MBB); |
| 554 | } |
| 555 | GlobalFlags |= StateExact; |
| 556 | III.Disabled = StateWQM | StateStrict; |
| 557 | } |
| 558 | } else if (Opcode == AMDGPU::LDS_PARAM_LOAD || |
| 559 | Opcode == AMDGPU::DS_PARAM_LOAD || |
| 560 | Opcode == AMDGPU::LDS_DIRECT_LOAD || |
| 561 | Opcode == AMDGPU::DS_DIRECT_LOAD) { |
| 562 | // Mark these StrictWQM, but only for the instruction, not its operands. |
| 563 | // This avoids unnecessarily marking M0 as requiring WQM. |
| 564 | III.Needs |= StateStrictWQM; |
| 565 | GlobalFlags |= StateStrictWQM; |
| 566 | } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) { |
| 567 | // Disable strict states; StrictWQM will be added as required later. |
| 568 | III.Disabled = StateStrict; |
| 569 | MachineOperand &Inactive = MI.getOperand(i: 4); |
| 570 | if (Inactive.isReg()) { |
| 571 | if (Inactive.isUndef() && MI.getOperand(i: 3).getImm() == 0) |
| 572 | LowerToCopyInstrs.insert(X: &MI); |
| 573 | else |
| 574 | markOperand(MI, Op: Inactive, Flag: StateStrictWWM, Worklist); |
| 575 | } |
| 576 | SetInactiveInstrs.push_back(Elt: &MI); |
| 577 | BBI.NeedsLowering = true; |
| 578 | } else if (TII->isDisableWQM(MI)) { |
| 579 | BBI.Needs |= StateExact; |
| 580 | if (!(BBI.InNeeds & StateExact)) { |
| 581 | BBI.InNeeds |= StateExact; |
| 582 | Worklist.emplace_back(args&: MBB); |
| 583 | } |
| 584 | GlobalFlags |= StateExact; |
| 585 | III.Disabled = StateWQM | StateStrict; |
| 586 | } else if (Opcode == AMDGPU::SI_PS_LIVE || |
| 587 | Opcode == AMDGPU::SI_LIVE_MASK) { |
| 588 | LiveMaskQueries.push_back(Elt: &MI); |
| 589 | } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR || |
| 590 | Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR || |
| 591 | Opcode == AMDGPU::SI_DEMOTE_I1) { |
| 592 | KillInstrs.push_back(Elt: &MI); |
| 593 | BBI.NeedsLowering = true; |
| 594 | } else if (Opcode == AMDGPU::SI_INIT_EXEC || |
| 595 | Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT || |
| 596 | Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) { |
| 597 | InitExecInstrs.push_back(Elt: &MI); |
| 598 | } else if (WQMOutputs) { |
| 599 | // The function is in machine SSA form, which means that physical |
| 600 | // VGPRs correspond to shader inputs and outputs. Inputs are |
| 601 | // only used, outputs are only defined. |
| 602 | // FIXME: is this still valid? |
| 603 | for (const MachineOperand &MO : MI.defs()) { |
| 604 | Register Reg = MO.getReg(); |
| 605 | if (Reg.isPhysical() && |
| 606 | TRI->hasVectorRegisters(RC: TRI->getPhysRegBaseClass(Reg))) { |
| 607 | Flags = StateWQM; |
| 608 | break; |
| 609 | } |
| 610 | } |
| 611 | } |
| 612 | |
| 613 | if (Flags) { |
| 614 | markInstruction(MI, Flag: Flags, Worklist); |
| 615 | GlobalFlags |= Flags; |
| 616 | } |
| 617 | } |
| 618 | } |
| 619 | |
| 620 | // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is |
| 621 | // ever used anywhere in the function. This implements the corresponding |
| 622 | // semantics of @llvm.amdgcn.set.inactive. |
| 623 | // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm. |
| 624 | if (GlobalFlags & StateWQM) { |
| 625 | for (MachineInstr *MI : SetInactiveInstrs) |
| 626 | markInstruction(MI&: *MI, Flag: StateWQM, Worklist); |
| 627 | for (MachineInstr *MI : SoftWQMInstrs) |
| 628 | markInstruction(MI&: *MI, Flag: StateWQM, Worklist); |
| 629 | } |
| 630 | |
| 631 | return GlobalFlags; |
| 632 | } |
| 633 | |
| 634 | void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, |
| 635 | std::vector<WorkItem>& Worklist) { |
| 636 | MachineBasicBlock *MBB = MI.getParent(); |
| 637 | InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references |
| 638 | BlockInfo &BI = Blocks[MBB]; |
| 639 | |
| 640 | // Control flow-type instructions and stores to temporary memory that are |
| 641 | // followed by WQM computations must themselves be in WQM. |
| 642 | if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) && |
| 643 | (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) { |
| 644 | Instructions[&MI].Needs = StateWQM; |
| 645 | II.Needs = StateWQM; |
| 646 | } |
| 647 | |
| 648 | // Propagate to block level |
| 649 | if (II.Needs & StateWQM) { |
| 650 | BI.Needs |= StateWQM; |
| 651 | if (!(BI.InNeeds & StateWQM)) { |
| 652 | BI.InNeeds |= StateWQM; |
| 653 | Worklist.emplace_back(args&: MBB); |
| 654 | } |
| 655 | } |
| 656 | |
| 657 | // Propagate backwards within block |
| 658 | if (MachineInstr *PrevMI = MI.getPrevNode()) { |
| 659 | char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds; |
| 660 | if (!PrevMI->isPHI()) { |
| 661 | InstrInfo &PrevII = Instructions[PrevMI]; |
| 662 | if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) { |
| 663 | PrevII.OutNeeds |= InNeeds; |
| 664 | Worklist.emplace_back(args&: PrevMI); |
| 665 | } |
| 666 | } |
| 667 | } |
| 668 | |
| 669 | // Propagate WQM flag to instruction inputs |
| 670 | assert(!(II.Needs & StateExact)); |
| 671 | |
| 672 | if (II.Needs != 0) |
| 673 | markInstructionUses(MI, Flag: II.Needs, Worklist); |
| 674 | |
| 675 | // Ensure we process a block containing StrictWWM/StrictWQM, even if it does |
| 676 | // not require any WQM transitions. |
| 677 | if (II.Needs & StateStrictWWM) |
| 678 | BI.Needs |= StateStrictWWM; |
| 679 | if (II.Needs & StateStrictWQM) |
| 680 | BI.Needs |= StateStrictWQM; |
| 681 | } |
| 682 | |
| 683 | void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB, |
| 684 | std::vector<WorkItem>& Worklist) { |
| 685 | BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references. |
| 686 | |
| 687 | // Propagate through instructions |
| 688 | if (!MBB.empty()) { |
| 689 | MachineInstr *LastMI = &*MBB.rbegin(); |
| 690 | InstrInfo &LastII = Instructions[LastMI]; |
| 691 | if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) { |
| 692 | LastII.OutNeeds |= BI.OutNeeds; |
| 693 | Worklist.emplace_back(args&: LastMI); |
| 694 | } |
| 695 | } |
| 696 | |
| 697 | // Predecessor blocks must provide for our WQM/Exact needs. |
| 698 | for (MachineBasicBlock *Pred : MBB.predecessors()) { |
| 699 | BlockInfo &PredBI = Blocks[Pred]; |
| 700 | if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds) |
| 701 | continue; |
| 702 | |
| 703 | PredBI.OutNeeds |= BI.InNeeds; |
| 704 | PredBI.InNeeds |= BI.InNeeds; |
| 705 | Worklist.emplace_back(args&: Pred); |
| 706 | } |
| 707 | |
| 708 | // All successors must be prepared to accept the same set of WQM/Exact data. |
| 709 | for (MachineBasicBlock *Succ : MBB.successors()) { |
| 710 | BlockInfo &SuccBI = Blocks[Succ]; |
| 711 | if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds) |
| 712 | continue; |
| 713 | |
| 714 | SuccBI.InNeeds |= BI.OutNeeds; |
| 715 | Worklist.emplace_back(args&: Succ); |
| 716 | } |
| 717 | } |
| 718 | |
| 719 | char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) { |
| 720 | std::vector<WorkItem> Worklist; |
| 721 | char GlobalFlags = scanInstructions(MF, Worklist); |
| 722 | |
| 723 | while (!Worklist.empty()) { |
| 724 | WorkItem WI = Worklist.back(); |
| 725 | Worklist.pop_back(); |
| 726 | |
| 727 | if (WI.MI) |
| 728 | propagateInstruction(MI&: *WI.MI, Worklist); |
| 729 | else |
| 730 | propagateBlock(MBB&: *WI.MBB, Worklist); |
| 731 | } |
| 732 | |
| 733 | return GlobalFlags; |
| 734 | } |
| 735 | |
| 736 | MachineBasicBlock::iterator |
| 737 | SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB, |
| 738 | MachineBasicBlock::iterator Before) { |
| 739 | Register SaveReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass); |
| 740 | |
| 741 | MachineInstr *Save = |
| 742 | BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: SaveReg) |
| 743 | .addReg(RegNo: AMDGPU::SCC); |
| 744 | MachineInstr *Restore = |
| 745 | BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::SCC) |
| 746 | .addReg(RegNo: SaveReg); |
| 747 | |
| 748 | LIS->InsertMachineInstrInMaps(MI&: *Save); |
| 749 | LIS->InsertMachineInstrInMaps(MI&: *Restore); |
| 750 | LIS->createAndComputeVirtRegInterval(Reg: SaveReg); |
| 751 | |
| 752 | return Restore; |
| 753 | } |
| 754 | |
| 755 | void SIWholeQuadMode::splitBlock(MachineInstr *TermMI) { |
| 756 | MachineBasicBlock *BB = TermMI->getParent(); |
| 757 | LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ " |
| 758 | << *TermMI << "\n" ); |
| 759 | |
| 760 | MachineBasicBlock *SplitBB = |
| 761 | BB->splitAt(SplitInst&: *TermMI, /*UpdateLiveIns*/ true, LIS); |
| 762 | |
| 763 | // Convert the last instruction in the block to a terminator. |
| 764 | // Note: this only covers the expected patterns. |
| 765 | unsigned NewOpcode = 0; |
| 766 | switch (TermMI->getOpcode()) { |
| 767 | case AMDGPU::S_AND_B32: |
| 768 | NewOpcode = AMDGPU::S_AND_B32_term; |
| 769 | break; |
| 770 | case AMDGPU::S_AND_B64: |
| 771 | NewOpcode = AMDGPU::S_AND_B64_term; |
| 772 | break; |
| 773 | case AMDGPU::S_MOV_B32: |
| 774 | NewOpcode = AMDGPU::S_MOV_B32_term; |
| 775 | break; |
| 776 | case AMDGPU::S_MOV_B64: |
| 777 | NewOpcode = AMDGPU::S_MOV_B64_term; |
| 778 | break; |
| 779 | case AMDGPU::S_ANDN2_B32: |
| 780 | NewOpcode = AMDGPU::S_ANDN2_B32_term; |
| 781 | break; |
| 782 | case AMDGPU::S_ANDN2_B64: |
| 783 | NewOpcode = AMDGPU::S_ANDN2_B64_term; |
| 784 | break; |
| 785 | default: |
| 786 | llvm_unreachable("Unexpected instruction" ); |
| 787 | } |
| 788 | |
| 789 | // These terminators fall through to the next block; there is no need to add |
| 790 | // an unconditional branch to the next block (SplitBB). |
| 791 | TermMI->setDesc(TII->get(Opcode: NewOpcode)); |
| 792 | |
| 793 | if (SplitBB != BB) { |
| 794 | // Update dominator trees |
| 795 | using DomTreeT = DomTreeBase<MachineBasicBlock>; |
| 796 | SmallVector<DomTreeT::UpdateType, 16> DTUpdates; |
| 797 | for (MachineBasicBlock *Succ : SplitBB->successors()) { |
| 798 | DTUpdates.push_back(Elt: {DomTreeT::Insert, SplitBB, Succ}); |
| 799 | DTUpdates.push_back(Elt: {DomTreeT::Delete, BB, Succ}); |
| 800 | } |
| 801 | DTUpdates.push_back(Elt: {DomTreeT::Insert, BB, SplitBB}); |
| 802 | if (MDT) |
| 803 | MDT->applyUpdates(Updates: DTUpdates); |
| 804 | if (PDT) |
| 805 | PDT->applyUpdates(Updates: DTUpdates); |
| 806 | } |
| 807 | } |
| 808 | |
| 809 | MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) { |
| 810 | assert(LiveMaskReg.isVirtual()); |
| 811 | |
| 812 | const DebugLoc &DL = MI.getDebugLoc(); |
| 813 | unsigned Opcode = 0; |
| 814 | |
| 815 | assert(MI.getOperand(0).isReg()); |
| 816 | |
| 817 | // Comparison is for live lanes; however here we compute the inverse |
| 818 | // (killed lanes). This is because VCMP will always generate 0 bits |
| 819 | // for inactive lanes so a mask of live lanes would not be correct |
| 820 | // inside control flow. |
| 821 | // Invert the comparison by swapping the operands and adjusting |
| 822 | // the comparison codes. |
| 823 | |
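|  | // For example, a kill that keeps lanes where op0 < op1 (SETOLT) becomes |
|  | // V_CMP_NGT_F32 with the operands swapped: !(op1 > op0) is !(op0 < op1), |
|  | // so VCC gets a bit set exactly for the lanes where the keep condition |
|  | // fails (including unordered results), i.e. the lanes to be killed. |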
| 824 | switch (MI.getOperand(i: 2).getImm()) { |
| 825 | case ISD::SETUEQ: |
| 826 | Opcode = AMDGPU::V_CMP_LG_F32_e64; |
| 827 | break; |
| 828 | case ISD::SETUGT: |
| 829 | Opcode = AMDGPU::V_CMP_GE_F32_e64; |
| 830 | break; |
| 831 | case ISD::SETUGE: |
| 832 | Opcode = AMDGPU::V_CMP_GT_F32_e64; |
| 833 | break; |
| 834 | case ISD::SETULT: |
| 835 | Opcode = AMDGPU::V_CMP_LE_F32_e64; |
| 836 | break; |
| 837 | case ISD::SETULE: |
| 838 | Opcode = AMDGPU::V_CMP_LT_F32_e64; |
| 839 | break; |
| 840 | case ISD::SETUNE: |
| 841 | Opcode = AMDGPU::V_CMP_EQ_F32_e64; |
| 842 | break; |
| 843 | case ISD::SETO: |
| 844 | Opcode = AMDGPU::V_CMP_O_F32_e64; |
| 845 | break; |
| 846 | case ISD::SETUO: |
| 847 | Opcode = AMDGPU::V_CMP_U_F32_e64; |
| 848 | break; |
| 849 | case ISD::SETOEQ: |
| 850 | case ISD::SETEQ: |
| 851 | Opcode = AMDGPU::V_CMP_NEQ_F32_e64; |
| 852 | break; |
| 853 | case ISD::SETOGT: |
| 854 | case ISD::SETGT: |
| 855 | Opcode = AMDGPU::V_CMP_NLT_F32_e64; |
| 856 | break; |
| 857 | case ISD::SETOGE: |
| 858 | case ISD::SETGE: |
| 859 | Opcode = AMDGPU::V_CMP_NLE_F32_e64; |
| 860 | break; |
| 861 | case ISD::SETOLT: |
| 862 | case ISD::SETLT: |
| 863 | Opcode = AMDGPU::V_CMP_NGT_F32_e64; |
| 864 | break; |
| 865 | case ISD::SETOLE: |
| 866 | case ISD::SETLE: |
| 867 | Opcode = AMDGPU::V_CMP_NGE_F32_e64; |
| 868 | break; |
| 869 | case ISD::SETONE: |
| 870 | case ISD::SETNE: |
| 871 | Opcode = AMDGPU::V_CMP_NLG_F32_e64; |
| 872 | break; |
| 873 | default: |
| 874 | llvm_unreachable("invalid ISD:SET cond code" ); |
| 875 | } |
| 876 | |
| 877 | MachineBasicBlock &MBB = *MI.getParent(); |
| 878 | |
| 879 | // Pick opcode based on comparison type. |
| 880 | MachineInstr *VcmpMI; |
| 881 | const MachineOperand &Op0 = MI.getOperand(i: 0); |
| 882 | const MachineOperand &Op1 = MI.getOperand(i: 1); |
| 883 | |
| 884 | // VCC represents lanes killed. |
| 885 | Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; |
| 886 | |
| 887 | if (TRI->isVGPR(MRI: *MRI, Reg: Op0.getReg())) { |
| 888 | Opcode = AMDGPU::getVOPe32(Opcode); |
| 889 | VcmpMI = BuildMI(BB&: MBB, I: &MI, MIMD: DL, MCID: TII->get(Opcode)).add(MO: Op1).add(MO: Op0); |
| 890 | } else { |
| 891 | VcmpMI = BuildMI(BB&: MBB, I: &MI, MIMD: DL, MCID: TII->get(Opcode)) |
| 892 | .addReg(RegNo: VCC, flags: RegState::Define) |
| 893 | .addImm(Val: 0) // src0 modifiers |
| 894 | .add(MO: Op1) |
| 895 | .addImm(Val: 0) // src1 modifiers |
| 896 | .add(MO: Op0) |
| 897 | .addImm(Val: 0); // omod |
| 898 | } |
| 899 | |
| 900 | MachineInstr *MaskUpdateMI = |
| 901 | BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AndN2Opc), DestReg: LiveMaskReg) |
| 902 | .addReg(RegNo: LiveMaskReg) |
| 903 | .addReg(RegNo: VCC); |
| 904 | |
| 905 | // The state of SCC represents whether any lanes are live in the mask; |
| 906 | // if SCC is 0 then no lanes will be alive anymore. |
| 907 | MachineInstr *EarlyTermMI = |
| 908 | BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_EARLY_TERMINATE_SCC0)); |
| 909 | |
| 910 | MachineInstr *ExecMaskMI = |
| 911 | BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AndN2Opc), DestReg: Exec).addReg(RegNo: Exec).addReg(RegNo: VCC); |
| 912 | |
| 913 | assert(MBB.succ_size() == 1); |
| 914 | |
| 915 | // Update live intervals |
| 916 | LIS->ReplaceMachineInstrInMaps(MI, NewMI&: *VcmpMI); |
| 917 | MBB.remove(I: &MI); |
| 918 | |
| 919 | LIS->InsertMachineInstrInMaps(MI&: *MaskUpdateMI); |
| 920 | LIS->InsertMachineInstrInMaps(MI&: *EarlyTermMI); |
| 921 | LIS->InsertMachineInstrInMaps(MI&: *ExecMaskMI); |
| 922 | |
| 923 | return ExecMaskMI; |
| 924 | } |
| 925 | |
| 926 | MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) { |
| 927 | assert(LiveMaskReg.isVirtual()); |
| 928 | |
| 929 | MachineBasicBlock &MBB = *MI.getParent(); |
| 930 | |
| 931 | const DebugLoc &DL = MI.getDebugLoc(); |
| 932 | MachineInstr *MaskUpdateMI = nullptr; |
| 933 | |
| 934 | const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1); |
| 935 | const MachineOperand &Op = MI.getOperand(i: 0); |
| 936 | int64_t KillVal = MI.getOperand(i: 1).getImm(); |
| 937 | MachineInstr *ComputeKilledMaskMI = nullptr; |
| 938 | Register CndReg = !Op.isImm() ? Op.getReg() : Register(); |
| 939 | Register TmpReg; |
| 940 | |
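|  | // Lanes where operand 0 equals KillVal (operand 1) are killed. For |
|  | // SI_DEMOTE_I1 in WQM, killed lanes are only demoted to helper lanes, so a |
|  | // quad keeps executing as long as any of its lanes is still live. |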
| 941 | // Is this a static or dynamic kill? |
| 942 | if (Op.isImm()) { |
| 943 | if (Op.getImm() == KillVal) { |
| 944 | // Static: all active lanes are killed |
| 945 | MaskUpdateMI = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AndN2Opc), DestReg: LiveMaskReg) |
| 946 | .addReg(RegNo: LiveMaskReg) |
| 947 | .addReg(RegNo: Exec); |
| 948 | } else { |
| 949 | // Static: kill does nothing |
| 950 | bool IsLastTerminator = std::next(x: MI.getIterator()) == MBB.end(); |
| 951 | if (!IsLastTerminator) { |
| 952 | LIS->RemoveMachineInstrFromMaps(MI); |
| 953 | } else { |
| 954 | assert(MBB.succ_size() == 1 && MI.getOpcode() != AMDGPU::SI_DEMOTE_I1); |
| 955 | MachineInstr *NewTerm = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_BRANCH)) |
| 956 | .addMBB(MBB: *MBB.succ_begin()); |
| 957 | LIS->ReplaceMachineInstrInMaps(MI, NewMI&: *NewTerm); |
| 958 | } |
| 959 | MBB.remove(I: &MI); |
| 960 | return nullptr; |
| 961 | } |
| 962 | } else { |
| 963 | if (!KillVal) { |
| 964 | // Op represents live lanes after kill, |
| 965 | // so exec mask needs to be factored in. |
| 966 | TmpReg = MRI->createVirtualRegister(RegClass: TRI->getBoolRC()); |
| 967 | ComputeKilledMaskMI = |
| 968 | BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AndN2Opc), DestReg: TmpReg).addReg(RegNo: Exec).add(MO: Op); |
| 969 | MaskUpdateMI = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AndN2Opc), DestReg: LiveMaskReg) |
| 970 | .addReg(RegNo: LiveMaskReg) |
| 971 | .addReg(RegNo: TmpReg); |
| 972 | } else { |
| 973 | // Op represents lanes to kill |
| 974 | MaskUpdateMI = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AndN2Opc), DestReg: LiveMaskReg) |
| 975 | .addReg(RegNo: LiveMaskReg) |
| 976 | .add(MO: Op); |
| 977 | } |
| 978 | } |
| 979 | |
| 980 | // The state of SCC represents whether any lanes are live in the mask; |
| 981 | // if SCC is 0 then no lanes will be alive anymore. |
| 982 | MachineInstr *EarlyTermMI = |
| 983 | BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_EARLY_TERMINATE_SCC0)); |
| 984 | |
| 985 | // If we got this far, some lanes are still live; |
| 986 | // update EXEC to deactivate lanes as appropriate. |
| 987 | MachineInstr *NewTerm; |
| 988 | MachineInstr *WQMMaskMI = nullptr; |
| 989 | Register LiveMaskWQM; |
| 990 | if (IsDemote) { |
| 991 | // Demote - deactivate quads with only helper lanes |
| 992 | LiveMaskWQM = MRI->createVirtualRegister(RegClass: TRI->getBoolRC()); |
| 993 | WQMMaskMI = |
| 994 | BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: WQMOpc), DestReg: LiveMaskWQM).addReg(RegNo: LiveMaskReg); |
| 995 | NewTerm = BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AndOpc), DestReg: Exec) |
| 996 | .addReg(RegNo: Exec) |
| 997 | .addReg(RegNo: LiveMaskWQM); |
| 998 | } else { |
| 999 | // Kill - deactivate lanes no longer in live mask |
| 1000 | if (Op.isImm()) { |
| 1001 | unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
| 1002 | NewTerm = BuildMI(BB&: MBB, I: &MI, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: Exec).addImm(Val: 0); |
| 1003 | } else if (!IsWQM) { |
| 1004 | NewTerm = BuildMI(BB&: MBB, I: &MI, MIMD: DL, MCID: TII->get(Opcode: AndOpc), DestReg: Exec) |
| 1005 | .addReg(RegNo: Exec) |
| 1006 | .addReg(RegNo: LiveMaskReg); |
| 1007 | } else { |
| 1008 | unsigned Opcode = KillVal ? AndN2Opc : AndOpc; |
| 1009 | NewTerm = |
| 1010 | BuildMI(BB&: MBB, I: &MI, MIMD: DL, MCID: TII->get(Opcode), DestReg: Exec).addReg(RegNo: Exec).add(MO: Op); |
| 1011 | } |
| 1012 | } |
| 1013 | |
| 1014 | // Update live intervals |
| 1015 | LIS->RemoveMachineInstrFromMaps(MI); |
| 1016 | MBB.remove(I: &MI); |
| 1017 | assert(EarlyTermMI); |
| 1018 | assert(MaskUpdateMI); |
| 1019 | assert(NewTerm); |
| 1020 | if (ComputeKilledMaskMI) |
| 1021 | LIS->InsertMachineInstrInMaps(MI&: *ComputeKilledMaskMI); |
| 1022 | LIS->InsertMachineInstrInMaps(MI&: *MaskUpdateMI); |
| 1023 | LIS->InsertMachineInstrInMaps(MI&: *EarlyTermMI); |
| 1024 | if (WQMMaskMI) |
| 1025 | LIS->InsertMachineInstrInMaps(MI&: *WQMMaskMI); |
| 1026 | LIS->InsertMachineInstrInMaps(MI&: *NewTerm); |
| 1027 | |
| 1028 | if (CndReg) { |
| 1029 | LIS->removeInterval(Reg: CndReg); |
| 1030 | LIS->createAndComputeVirtRegInterval(Reg: CndReg); |
| 1031 | } |
| 1032 | if (TmpReg) |
| 1033 | LIS->createAndComputeVirtRegInterval(Reg: TmpReg); |
| 1034 | if (LiveMaskWQM) |
| 1035 | LIS->createAndComputeVirtRegInterval(Reg: LiveMaskWQM); |
| 1036 | |
| 1037 | return NewTerm; |
| 1038 | } |
| 1039 | |
| 1040 | // Replace (or supplement) instructions accessing the live mask. |
| 1041 | // This can only happen once all the live mask registers have been created |
| 1042 | // and the execution state (WQM/StrictWWM/Exact) of instructions is known. |
| 1043 | void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI) { |
| 1044 | if (!BI.NeedsLowering) |
| 1045 | return; |
| 1046 | |
| 1047 | LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n" ); |
| 1048 | |
| 1049 | SmallVector<MachineInstr *, 4> SplitPoints; |
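|  | // ActiveLanesReg tracks the exec-mask save register of an enclosing |
|  | // ENTER_STRICT_WWM, so a V_SET_INACTIVE_B32 lowered inside strict WWM can |
|  | // read the pre-WWM exec mask from it rather than from EXEC. |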
| 1050 | Register ActiveLanesReg = 0; |
| 1051 | char State = BI.InitialState; |
| 1052 | |
| 1053 | for (MachineInstr &MI : llvm::make_early_inc_range( |
| 1054 | Range: llvm::make_range(x: MBB.getFirstNonPHI(), y: MBB.end()))) { |
| 1055 | auto MIState = StateTransition.find(Val: &MI); |
| 1056 | if (MIState != StateTransition.end()) |
| 1057 | State = MIState->second; |
| 1058 | |
| 1059 | MachineInstr *SplitPoint = nullptr; |
| 1060 | switch (MI.getOpcode()) { |
| 1061 | case AMDGPU::SI_DEMOTE_I1: |
| 1062 | case AMDGPU::SI_KILL_I1_TERMINATOR: |
| 1063 | SplitPoint = lowerKillI1(MI, IsWQM: State == StateWQM); |
| 1064 | break; |
| 1065 | case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: |
| 1066 | SplitPoint = lowerKillF32(MI); |
| 1067 | break; |
| 1068 | case AMDGPU::ENTER_STRICT_WWM: |
| 1069 | ActiveLanesReg = MI.getOperand(i: 0).getReg(); |
| 1070 | break; |
| 1071 | case AMDGPU::EXIT_STRICT_WWM: |
| 1072 | ActiveLanesReg = 0; |
| 1073 | break; |
| 1074 | case AMDGPU::V_SET_INACTIVE_B32: |
| 1075 | if (ActiveLanesReg) { |
| 1076 | LiveInterval &LI = LIS->getInterval(Reg: MI.getOperand(i: 5).getReg()); |
| 1077 | MRI->constrainRegClass(Reg: ActiveLanesReg, RC: TRI->getWaveMaskRegClass()); |
| 1078 | MI.getOperand(i: 5).setReg(ActiveLanesReg); |
| 1079 | LIS->shrinkToUses(li: &LI); |
| 1080 | } else { |
| 1081 | assert(State == StateExact || State == StateWQM); |
| 1082 | } |
| 1083 | break; |
| 1084 | default: |
| 1085 | break; |
| 1086 | } |
| 1087 | if (SplitPoint) |
| 1088 | SplitPoints.push_back(Elt: SplitPoint); |
| 1089 | } |
| 1090 | |
| 1091 | // Perform splitting after instruction scan to simplify iteration. |
| 1092 | for (MachineInstr *MI : SplitPoints) |
| 1093 | splitBlock(TermMI: MI); |
| 1094 | } |
| 1095 | |
| 1096 | // Return an iterator in the (inclusive) range [First, Last] at which |
| 1097 | // instructions can be safely inserted, keeping in mind that some of the |
| 1098 | // instructions we want to add necessarily clobber SCC. |
| 1099 | MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( |
| 1100 | MachineBasicBlock &MBB, MachineBasicBlock::iterator First, |
| 1101 | MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) { |
| 1102 | if (!SaveSCC) |
| 1103 | return PreferLast ? Last : First; |
| 1104 | |
| 1105 | LiveRange &LR = |
| 1106 | LIS->getRegUnit(Unit: *TRI->regunits(Reg: MCRegister::from(Val: AMDGPU::SCC)).begin()); |
| 1107 | auto MBBE = MBB.end(); |
| 1108 | SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(Instr: *First) |
| 1109 | : LIS->getMBBEndIdx(mbb: &MBB); |
| 1110 | SlotIndex LastIdx = |
| 1111 | Last != MBBE ? LIS->getInstructionIndex(Instr: *Last) : LIS->getMBBEndIdx(mbb: &MBB); |
| 1112 | SlotIndex Idx = PreferLast ? LastIdx : FirstIdx; |
| 1113 | const LiveRange::Segment *S; |
| 1114 | |
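|  | // Walk out of any live segment of SCC: when preferring the last position, |
|  | // step backwards to the segment start; otherwise step forwards past its end. |
|  | // If no SCC-dead point is found within [First, Last], SCC will be saved and |
|  | // restored around the insertion point below. |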
| 1115 | for (;;) { |
| 1116 | S = LR.getSegmentContaining(Idx); |
| 1117 | if (!S) |
| 1118 | break; |
| 1119 | |
| 1120 | if (PreferLast) { |
| 1121 | SlotIndex Next = S->start.getBaseIndex(); |
| 1122 | if (Next < FirstIdx) |
| 1123 | break; |
| 1124 | Idx = Next; |
| 1125 | } else { |
| 1126 | MachineInstr *EndMI = LIS->getInstructionFromIndex(index: S->end.getBaseIndex()); |
| 1127 | assert(EndMI && "Segment does not end on valid instruction" ); |
| 1128 | auto NextI = std::next(x: EndMI->getIterator()); |
| 1129 | if (NextI == MBB.end()) |
| 1130 | break; |
| 1131 | SlotIndex Next = LIS->getInstructionIndex(Instr: *NextI); |
| 1132 | if (Next > LastIdx) |
| 1133 | break; |
| 1134 | Idx = Next; |
| 1135 | } |
| 1136 | } |
| 1137 | |
| 1138 | MachineBasicBlock::iterator MBBI; |
| 1139 | |
| 1140 | if (MachineInstr *MI = LIS->getInstructionFromIndex(index: Idx)) |
| 1141 | MBBI = MI; |
| 1142 | else { |
| 1143 | assert(Idx == LIS->getMBBEndIdx(&MBB)); |
| 1144 | MBBI = MBB.end(); |
| 1145 | } |
| 1146 | |
| 1147 | // Move insertion point past any operations modifying EXEC. |
| 1148 | // This assumes that the value of SCC defined by any of these operations |
| 1149 | // does not need to be preserved. |
| 1150 | while (MBBI != Last) { |
| 1151 | bool IsExecDef = false; |
| 1152 | for (const MachineOperand &MO : MBBI->all_defs()) { |
| 1153 | IsExecDef |= |
| 1154 | MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC; |
| 1155 | } |
| 1156 | if (!IsExecDef) |
| 1157 | break; |
| 1158 | MBBI++; |
| 1159 | S = nullptr; |
| 1160 | } |
| 1161 | |
| 1162 | if (S) |
| 1163 | MBBI = saveSCC(MBB, Before: MBBI); |
| 1164 | |
| 1165 | return MBBI; |
| 1166 | } |
| 1167 | |
| 1168 | void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, |
| 1169 | MachineBasicBlock::iterator Before, |
| 1170 | Register SaveWQM) { |
| 1171 | assert(LiveMaskReg.isVirtual()); |
| 1172 | |
| 1173 | bool IsTerminator = Before == MBB.end(); |
| 1174 | if (!IsTerminator) { |
| 1175 | auto FirstTerm = MBB.getFirstTerminator(); |
| 1176 | if (FirstTerm != MBB.end()) { |
| 1177 | SlotIndex FirstTermIdx = LIS->getInstructionIndex(Instr: *FirstTerm); |
| 1178 | SlotIndex BeforeIdx = LIS->getInstructionIndex(Instr: *Before); |
| 1179 | IsTerminator = BeforeIdx > FirstTermIdx; |
| 1180 | } |
| 1181 | } |
| 1182 | |
| 1183 | MachineInstr *MI; |
| 1184 | |
| 1185 | if (SaveWQM) { |
| 1186 | unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc; |
| 1187 | MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode), DestReg: SaveWQM) |
| 1188 | .addReg(RegNo: LiveMaskReg); |
| 1189 | } else { |
| 1190 | unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc; |
| 1191 | MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode), DestReg: Exec) |
| 1192 | .addReg(RegNo: Exec) |
| 1193 | .addReg(RegNo: LiveMaskReg); |
| 1194 | } |
| 1195 | |
| 1196 | LIS->InsertMachineInstrInMaps(MI&: *MI); |
| 1197 | StateTransition[MI] = StateExact; |
| 1198 | } |
| 1199 | |
| 1200 | void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, |
| 1201 | MachineBasicBlock::iterator Before, |
| 1202 | Register SavedWQM) { |
| 1203 | MachineInstr *MI; |
| 1204 | |
| 1205 | if (SavedWQM) { |
| 1206 | MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: Exec) |
| 1207 | .addReg(RegNo: SavedWQM); |
| 1208 | } else { |
| 1209 | MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: WQMOpc), DestReg: Exec).addReg(RegNo: Exec); |
| 1210 | } |
| 1211 | |
| 1212 | LIS->InsertMachineInstrInMaps(MI&: *MI); |
| 1213 | StateTransition[MI] = StateWQM; |
| 1214 | } |
| 1215 | |
| 1216 | void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB, |
| 1217 | MachineBasicBlock::iterator Before, |
| 1218 | Register SaveOrig, char StrictStateNeeded) { |
| 1219 | MachineInstr *MI; |
| 1220 | assert(SaveOrig); |
| 1221 | assert(StrictStateNeeded == StateStrictWWM || |
| 1222 | StrictStateNeeded == StateStrictWQM); |
| 1223 | |
| 1224 | if (StrictStateNeeded == StateStrictWWM) { |
| 1225 | MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::ENTER_STRICT_WWM), |
| 1226 | DestReg: SaveOrig) |
| 1227 | .addImm(Val: -1); |
| 1228 | } else { |
| 1229 | MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::ENTER_STRICT_WQM), |
| 1230 | DestReg: SaveOrig) |
| 1231 | .addImm(Val: -1); |
| 1232 | } |
| 1233 | LIS->InsertMachineInstrInMaps(MI&: *MI); |
| 1234 | StateTransition[MI] = StrictStateNeeded; |
| 1235 | } |
| 1236 | |
| 1237 | void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB, |
| 1238 | MachineBasicBlock::iterator Before, |
| 1239 | Register SavedOrig, char NonStrictState, |
| 1240 | char CurrentStrictState) { |
| 1241 | MachineInstr *MI; |
| 1242 | |
| 1243 | assert(SavedOrig); |
| 1244 | assert(CurrentStrictState == StateStrictWWM || |
| 1245 | CurrentStrictState == StateStrictWQM); |
| 1246 | |
| 1247 | if (CurrentStrictState == StateStrictWWM) { |
| 1248 | MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::EXIT_STRICT_WWM), |
| 1249 | DestReg: Exec) |
| 1250 | .addReg(RegNo: SavedOrig); |
| 1251 | } else { |
| 1252 | MI = BuildMI(BB&: MBB, I: Before, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::EXIT_STRICT_WQM), |
| 1253 | DestReg: Exec) |
| 1254 | .addReg(RegNo: SavedOrig); |
| 1255 | } |
| 1256 | LIS->InsertMachineInstrInMaps(MI&: *MI); |
| 1257 | StateTransition[MI] = NonStrictState; |
| 1258 | } |
| 1259 | |
| 1260 | void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, BlockInfo &BI, |
| 1261 | bool IsEntry) { |
| 1262 | // This is a non-entry block that is WQM throughout, so no need to do |
| 1263 | // anything. |
| 1264 | if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) { |
| 1265 | BI.InitialState = StateWQM; |
| 1266 | return; |
| 1267 | } |
| 1268 | |
| 1269 | LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) |
| 1270 | << ":\n" ); |
| 1271 | |
| 1272 | Register SavedWQMReg; |
| 1273 | Register SavedNonStrictReg; |
| 1274 | bool WQMFromExec = IsEntry; |
| 1275 | char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM; |
| 1276 | char NonStrictState = 0; |
| 1277 | const TargetRegisterClass *BoolRC = TRI->getBoolRC(); |
| 1278 | |
| 1279 | auto II = MBB.getFirstNonPHI(), IE = MBB.end(); |
| 1280 | if (IsEntry) { |
| 1281 | // Skip the instruction that saves LiveMask |
| 1282 | if (II != IE && II->getOpcode() == AMDGPU::COPY && |
| 1283 | II->getOperand(i: 1).getReg() == TRI->getExec()) |
| 1284 | ++II; |
| 1285 | } |
| 1286 | |
| 1287 | // This stores the first instruction where it's safe to switch from WQM to |
| 1288 | // Exact or vice versa. |
| 1289 | MachineBasicBlock::iterator FirstWQM = IE; |
| 1290 | |
| 1291 | // This stores the first instruction where it's safe to switch from Strict |
| 1292 | // mode to Exact/WQM or to switch to Strict mode. It must always be the same |
| 1293 | // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must |
| 1294 | // be safe to switch to/from WQM as well. |
| 1295 | MachineBasicBlock::iterator FirstStrict = IE; |
| 1296 | |
| 1297 | // Record the initial state in the block information. |
| 1298 | BI.InitialState = State; |
| 1299 | |
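|  | // Walk the instructions, remembering the earliest points where a transition |
|  | // can be inserted (FirstWQM/FirstStrict). Whenever the states allowed for |
|  | // the next instruction exclude the current state, insert the required |
|  | // transition at the best position within that window. |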
| 1300 | for (unsigned Idx = 0;; ++Idx) { |
| 1301 | MachineBasicBlock::iterator Next = II; |
| 1302 | char Needs = StateExact | StateWQM; // Strict mode is disabled by default. |
| 1303 | char OutNeeds = 0; |
| 1304 | |
| 1305 | if (FirstWQM == IE) |
| 1306 | FirstWQM = II; |
| 1307 | |
| 1308 | if (FirstStrict == IE) |
| 1309 | FirstStrict = II; |
| 1310 | |
| 1311 | // Adjust needs if this is the first instruction of a WQM-requiring shader. |
| 1312 | if (IsEntry && Idx == 0 && (BI.InNeeds & StateWQM)) |
| 1313 | Needs = StateWQM; |
| 1314 | |
| 1315 | // First, figure out the allowed states (Needs) based on the propagated |
| 1316 | // flags. |
| 1317 | if (II != IE) { |
| 1318 | MachineInstr &MI = *II; |
| 1319 | |
| 1320 | if (MI.isTerminator() || TII->mayReadEXEC(MRI: *MRI, MI)) { |
| 1321 | auto III = Instructions.find(Val: &MI); |
| 1322 | if (III != Instructions.end()) { |
| 1323 | if (III->second.Needs & StateStrictWWM) |
| 1324 | Needs = StateStrictWWM; |
| 1325 | else if (III->second.Needs & StateStrictWQM) |
| 1326 | Needs = StateStrictWQM; |
| 1327 | else if (III->second.Needs & StateWQM) |
| 1328 | Needs = StateWQM; |
| 1329 | else |
| 1330 | Needs &= ~III->second.Disabled; |
| 1331 | OutNeeds = III->second.OutNeeds; |
| 1332 | } |
| 1333 | } else { |
| 1334 | // If the instruction doesn't actually need a correct EXEC, then we can |
| 1335 | // safely leave Strict mode enabled. |
| 1336 | Needs = StateExact | StateWQM | StateStrict; |
| 1337 | } |
| 1338 | |
| 1339 | // Exact mode exit can occur in terminators, but must be before branches. |
| 1340 | if (MI.isBranch() && OutNeeds == StateExact) |
| 1341 | Needs = StateExact; |
| 1342 | |
| 1343 | ++Next; |
| 1344 | } else { |
| 1345 | // End of basic block |
| 1346 | if (BI.OutNeeds & StateWQM) |
| 1347 | Needs = StateWQM; |
| 1348 | else if (BI.OutNeeds == StateExact) |
| 1349 | Needs = StateExact; |
| 1350 | else |
| 1351 | Needs = StateWQM | StateExact; |
| 1352 | } |
| 1353 | |
| 1354 | // Now, transition if necessary. |
| 1355 | if (!(Needs & State)) { |
| 1356 | MachineBasicBlock::iterator First; |
| 1357 | if (State == StateStrictWWM || Needs == StateStrictWWM || |
| 1358 | State == StateStrictWQM || Needs == StateStrictWQM) { |
| 1359 | // We must switch to or from Strict mode. |
| 1360 | First = FirstStrict; |
| 1361 | } else { |
| 1362 | // We only need to switch to/from WQM, so we can use FirstWQM. |
| 1363 | First = FirstWQM; |
| 1364 | } |
| 1365 | |
| 1366 | // Whether we need to save SCC depends on start and end states. |
| 1367 | bool SaveSCC = false; |
| 1368 | switch (State) { |
| 1369 | case StateExact: |
| 1370 | case StateStrictWWM: |
| 1371 | case StateStrictWQM: |
| 1372 | // Exact/Strict -> Strict: save SCC |
| 1373 | // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec |
| 1374 | // Exact/Strict -> Exact: no save |
| 1375 | SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec); |
| 1376 | break; |
| 1377 | case StateWQM: |
| 1378 | // WQM -> Exact/Strict: save SCC |
| 1379 | SaveSCC = !(Needs & StateWQM); |
| 1380 | break; |
| 1381 | default: |
| 1382 | llvm_unreachable("Unknown state");
| 1383 | break; |
| 1384 | } |
| 1385 | char StartState = State & StateStrict ? NonStrictState : State; |
| 1386 | bool WQMToExact = |
| 1387 | StartState == StateWQM && (Needs & StateExact) && !(Needs & StateWQM); |
| 1388 | bool ExactToWQM = StartState == StateExact && (Needs & StateWQM) && |
| 1389 | !(Needs & StateExact); |
| 1390 | bool PreferLast = Needs == StateWQM; |
| 1391 | // Exact regions in divergent control flow may run at EXEC=0, so try to |
| 1392 | // exclude instructions with unexpected effects from them. |
| 1393 | // FIXME: ideally we would branch over these when EXEC=0, |
| 1394 | // but this requires updating implicit values, live intervals and CFG. |
| 1395 | if ((WQMToExact && (OutNeeds & StateWQM)) || ExactToWQM) { |
| 1396 | for (MachineBasicBlock::iterator I = First; I != II; ++I) { |
| 1397 | if (TII->hasUnwantedEffectsWhenEXECEmpty(MI: *I)) { |
| 1398 | PreferLast = WQMToExact; |
| 1399 | break; |
| 1400 | } |
| 1401 | } |
| 1402 | } |
| 1403 | MachineBasicBlock::iterator Before = |
| 1404 | prepareInsertion(MBB, First, Last: II, PreferLast, SaveSCC); |
| 1405 | |
| 1406 | if (State & StateStrict) { |
| 1407 | assert(State == StateStrictWWM || State == StateStrictWQM); |
| 1408 | assert(SavedNonStrictReg); |
| 1409 | fromStrictMode(MBB, Before, SavedOrig: SavedNonStrictReg, NonStrictState, CurrentStrictState: State); |
| 1410 | |
| 1411 | LIS->createAndComputeVirtRegInterval(Reg: SavedNonStrictReg); |
| 1412 | SavedNonStrictReg = 0; |
| 1413 | State = NonStrictState; |
| 1414 | } |
| 1415 | |
| 1416 | if (Needs & StateStrict) { |
| 1417 | NonStrictState = State; |
| 1418 | assert(Needs == StateStrictWWM || Needs == StateStrictWQM); |
| 1419 | assert(!SavedNonStrictReg); |
| 1420 | SavedNonStrictReg = MRI->createVirtualRegister(RegClass: BoolRC); |
| 1421 | |
| 1422 | toStrictMode(MBB, Before, SaveOrig: SavedNonStrictReg, StrictStateNeeded: Needs); |
| 1423 | State = Needs; |
| 1424 | } else { |
| 1425 | if (WQMToExact) { |
| 1426 | if (!WQMFromExec && (OutNeeds & StateWQM)) { |
| 1427 | assert(!SavedWQMReg); |
| 1428 | SavedWQMReg = MRI->createVirtualRegister(RegClass: BoolRC); |
| 1429 | } |
| 1430 | |
| 1431 | toExact(MBB, Before, SaveWQM: SavedWQMReg); |
| 1432 | State = StateExact; |
| 1433 | } else if (ExactToWQM) { |
| 1434 | assert(WQMFromExec == (SavedWQMReg == 0)); |
| 1435 | |
| 1436 | toWQM(MBB, Before, SavedWQM: SavedWQMReg); |
| 1437 | |
| 1438 | if (SavedWQMReg) { |
| 1439 | LIS->createAndComputeVirtRegInterval(Reg: SavedWQMReg); |
| 1440 | SavedWQMReg = 0; |
| 1441 | } |
| 1442 | State = StateWQM; |
| 1443 | } else { |
| 1444 | // We can get here if we transitioned from StrictWWM to a
| 1445 | // non-StrictWWM state that already matches our needs; in that
| 1446 | // case there is nothing to do.
| 1447 | assert(Needs & State); |
| 1448 | } |
| 1449 | } |
| 1450 | } |
| 1451 | |
| 1452 | if (Needs != (StateExact | StateWQM | StateStrict)) { |
| 1453 | if (Needs != (StateExact | StateWQM)) |
| 1454 | FirstWQM = IE; |
| 1455 | FirstStrict = IE; |
| 1456 | } |
| 1457 | |
| 1458 | if (II == IE) |
| 1459 | break; |
| 1460 | |
| 1461 | II = Next; |
| 1462 | } |
| 1463 | assert(!SavedWQMReg); |
| 1464 | assert(!SavedNonStrictReg); |
| 1465 | } |
| 1466 | |
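/// Replace live-mask queries with copies of the saved live mask.
///
/// A minimal sketch of the rewrite, assuming wave64 and that the queries
/// gathered in LiveMaskQueries are SI_LIVE_MASK pseudos:
///
///   %q:sreg_64 = SI_LIVE_MASK          ; before
///   %q:sreg_64 = COPY %LiveMaskReg     ; after
///
/// LiveMaskReg is either the copy of EXEC made in the entry block or EXEC
/// itself when no copy was needed.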
| 1467 | bool SIWholeQuadMode::lowerLiveMaskQueries() { |
| 1468 | for (MachineInstr *MI : LiveMaskQueries) { |
| 1469 | const DebugLoc &DL = MI->getDebugLoc(); |
| 1470 | Register Dest = MI->getOperand(i: 0).getReg(); |
| 1471 | |
| 1472 | MachineInstr *Copy = |
| 1473 | BuildMI(BB&: *MI->getParent(), I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: Dest) |
| 1474 | .addReg(RegNo: LiveMaskReg); |
| 1475 | |
| 1476 | LIS->ReplaceMachineInstrInMaps(MI&: *MI, NewMI&: *Copy); |
| 1477 | MI->eraseFromParent(); |
| 1478 | } |
| 1479 | return !LiveMaskQueries.empty(); |
| 1480 | } |
| 1481 | |
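/// Rewrite the pseudos recorded during analysis into plain moves and copies.
///
/// Entries in LowerToMovInstrs become EXEC-dependent V_MOVs when they write a
/// VGPR; SGPR destinations are demoted to COPY with their artificial EXEC use
/// and early-clobber flag removed. Entries in LowerToCopyInstrs (including
/// V_SET_INACTIVE_B32 that never needed WWM) are reduced to a copy of their
/// active-lane source. A sketch of the V_SET_INACTIVE_B32 case, assuming the
/// two operands kept after stripping are the destination and the active-lane
/// value:
///
///   %dst = V_SET_INACTIVE_B32 mods, %active, mods, %inactive, ...  ; before
///   %dst = COPY %active                                            ; after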
| 1482 | bool SIWholeQuadMode::lowerCopyInstrs() { |
| 1483 | for (MachineInstr *MI : LowerToMovInstrs) { |
| 1484 | assert(MI->getNumExplicitOperands() == 2); |
| 1485 | |
| 1486 | const Register Reg = MI->getOperand(i: 0).getReg(); |
| 1487 | |
| 1488 | const TargetRegisterClass *regClass = |
| 1489 | TRI->getRegClassForOperandReg(MRI: *MRI, MO: MI->getOperand(i: 0)); |
| 1490 | if (TRI->isVGPRClass(RC: regClass)) { |
| 1491 | const unsigned MovOp = TII->getMovOpcode(DstRC: regClass); |
| 1492 | MI->setDesc(TII->get(Opcode: MovOp)); |
| 1493 | |
| 1494 | // Check that it already implicitly depends on exec (like all VALU movs |
| 1495 | // should do). |
| 1496 | assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) { |
| 1497 | return MO.isUse() && MO.getReg() == AMDGPU::EXEC; |
| 1498 | })); |
| 1499 | } else { |
| 1500 | // Remove early-clobber and exec dependency from simple SGPR copies. |
| 1501 | // This allows some to be eliminated during/post RA. |
| 1502 | LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI); |
| 1503 | if (MI->getOperand(i: 0).isEarlyClobber()) { |
| 1504 | LIS->removeInterval(Reg); |
| 1505 | MI->getOperand(i: 0).setIsEarlyClobber(false); |
| 1506 | LIS->createAndComputeVirtRegInterval(Reg); |
| 1507 | } |
| 1508 | int Index = MI->findRegisterUseOperandIdx(Reg: AMDGPU::EXEC, /*TRI=*/nullptr); |
| 1509 | while (Index >= 0) { |
| 1510 | MI->removeOperand(OpNo: Index); |
| 1511 | Index = MI->findRegisterUseOperandIdx(Reg: AMDGPU::EXEC, /*TRI=*/nullptr); |
| 1512 | } |
| 1513 | MI->setDesc(TII->get(Opcode: AMDGPU::COPY)); |
| 1514 | LLVM_DEBUG(dbgs() << " -> " << *MI); |
| 1515 | } |
| 1516 | } |
| 1517 | for (MachineInstr *MI : LowerToCopyInstrs) { |
| 1518 | LLVM_DEBUG(dbgs() << "simplify: " << *MI); |
| 1519 | |
| 1520 | if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) { |
| 1521 | assert(MI->getNumExplicitOperands() == 6); |
| 1522 | |
| 1523 | LiveInterval *RecomputeLI = nullptr; |
| 1524 | if (MI->getOperand(i: 4).isReg()) |
| 1525 | RecomputeLI = &LIS->getInterval(Reg: MI->getOperand(i: 4).getReg()); |
| 1526 | |
| 1527 | MI->removeOperand(OpNo: 5); |
| 1528 | MI->removeOperand(OpNo: 4); |
| 1529 | MI->removeOperand(OpNo: 3); |
| 1530 | MI->removeOperand(OpNo: 1); |
| 1531 | |
| 1532 | if (RecomputeLI) |
| 1533 | LIS->shrinkToUses(li: RecomputeLI); |
| 1534 | } else { |
| 1535 | assert(MI->getNumExplicitOperands() == 2); |
| 1536 | } |
| 1537 | |
| 1538 | unsigned CopyOp = MI->getOperand(i: 1).isReg() |
| 1539 | ? (unsigned)AMDGPU::COPY |
| 1540 | : TII->getMovOpcode(DstRC: TRI->getRegClassForOperandReg( |
| 1541 | MRI: *MRI, MO: MI->getOperand(i: 0))); |
| 1542 | MI->setDesc(TII->get(Opcode: CopyOp)); |
| 1543 | LLVM_DEBUG(dbgs() << " -> " << *MI); |
| 1544 | } |
| 1545 | return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty(); |
| 1546 | } |
| 1547 | |
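/// Expand kill and demote pseudos that were not already handled during block
/// lowering: SI_DEMOTE_I1 and SI_KILL_I1_TERMINATOR go through lowerKillI1,
/// SI_KILL_F32_COND_IMM_TERMINATOR through lowerKillF32. When an expansion
/// returns a split point (e.g. because it emitted an early-terminate branch),
/// the containing block is split there.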
| 1548 | bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) { |
| 1549 | for (MachineInstr *MI : KillInstrs) { |
| 1550 | MachineInstr *SplitPoint = nullptr; |
| 1551 | switch (MI->getOpcode()) { |
| 1552 | case AMDGPU::SI_DEMOTE_I1: |
| 1553 | case AMDGPU::SI_KILL_I1_TERMINATOR: |
| 1554 | SplitPoint = lowerKillI1(MI&: *MI, IsWQM); |
| 1555 | break; |
| 1556 | case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: |
| 1557 | SplitPoint = lowerKillF32(MI&: *MI); |
| 1558 | break; |
| 1559 | } |
| 1560 | if (SplitPoint) |
| 1561 | splitBlock(TermMI: SplitPoint); |
| 1562 | } |
| 1563 | return !KillInstrs.empty(); |
| 1564 | } |
| 1565 | |
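/// Lower one of the EXEC-initialization pseudos.
///
/// A sketch of the SI_INIT_WHOLE_WAVE expansion at the top of the entry block
/// (wave64 shown; wave32 uses the B32 opcode):
///
///   %entry_exec:sreg_64 = S_OR_SAVEEXEC_B64 -1
///
/// This enables all lanes and leaves the original EXEC in %entry_exec, and
/// every use of the pseudo's result is rewritten to %entry_exec. SI_INIT_EXEC
/// simply moves an immediate into EXEC, and the remaining case builds EXEC
/// from a thread count held in an SGPR (see the sequence below).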
| 1566 | void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) { |
| 1567 | MachineBasicBlock *MBB = MI.getParent(); |
| 1568 | bool IsWave32 = ST->isWave32(); |
| 1569 | |
| 1570 | if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) { |
| 1571 | assert(MBB == &MBB->getParent()->front() && |
| 1572 | "init whole wave not in entry block" ); |
| 1573 | Register EntryExec = MRI->createVirtualRegister(RegClass: TRI->getBoolRC()); |
| 1574 | MachineInstr *SaveExec = |
| 1575 | BuildMI(BB&: *MBB, I: MBB->begin(), MIMD: MI.getDebugLoc(), |
| 1576 | MCID: TII->get(Opcode: IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 |
| 1577 | : AMDGPU::S_OR_SAVEEXEC_B64), |
| 1578 | DestReg: EntryExec) |
| 1579 | .addImm(Val: -1); |
| 1580 | |
| 1581 | // Replace all uses of MI's destination reg with EntryExec. |
| 1582 | MRI->replaceRegWith(FromReg: MI.getOperand(i: 0).getReg(), ToReg: EntryExec); |
| 1583 | |
| 1584 | if (LIS) { |
| 1585 | LIS->RemoveMachineInstrFromMaps(MI); |
| 1586 | } |
| 1587 | |
| 1588 | MI.eraseFromParent(); |
| 1589 | |
| 1590 | if (LIS) { |
| 1591 | LIS->InsertMachineInstrInMaps(MI&: *SaveExec); |
| 1592 | LIS->createAndComputeVirtRegInterval(Reg: EntryExec); |
| 1593 | } |
| 1594 | return; |
| 1595 | } |
| 1596 | |
| 1597 | if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) { |
| 1598 | // This should be before all vector instructions. |
| 1599 | MachineInstr *InitMI = |
| 1600 | BuildMI(BB&: *MBB, I: MBB->begin(), MIMD: MI.getDebugLoc(), |
| 1601 | MCID: TII->get(Opcode: IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), |
| 1602 | DestReg: Exec) |
| 1603 | .addImm(Val: MI.getOperand(i: 0).getImm()); |
| 1604 | if (LIS) { |
| 1605 | LIS->RemoveMachineInstrFromMaps(MI); |
| 1606 | LIS->InsertMachineInstrInMaps(MI&: *InitMI); |
| 1607 | } |
| 1608 | MI.eraseFromParent(); |
| 1609 | return; |
| 1610 | } |
| 1611 | |
| 1612 | // Extract the thread count from an SGPR input and set EXEC accordingly. |
| 1613 | // Since BFM can't shift by 64, handle that case with CMP + CMOV. |
| 1614 | // |
| 1615 | // S_BFE_U32 count, input, {shift, 7} |
| 1616 | // S_BFM_B64 exec, count, 0 |
| 1617 | // S_CMP_EQ_U32 count, 64 |
| 1618 | // S_CMOV_B64 exec, -1 |
| 1619 | Register InputReg = MI.getOperand(i: 0).getReg(); |
| 1620 | MachineInstr *FirstMI = &*MBB->begin(); |
| 1621 | if (InputReg.isVirtual()) { |
| 1622 | MachineInstr *DefInstr = MRI->getVRegDef(Reg: InputReg); |
| 1623 | assert(DefInstr && DefInstr->isCopy()); |
| 1624 | if (DefInstr->getParent() == MBB) { |
| 1625 | if (DefInstr != FirstMI) { |
| 1626 | // If `InputReg` is defined in the current block, we also need to move
| 1627 | // that instruction to the beginning of the block.
| 1628 | DefInstr->removeFromParent(); |
| 1629 | MBB->insert(I: FirstMI, MI: DefInstr); |
| 1630 | if (LIS) |
| 1631 | LIS->handleMove(MI&: *DefInstr); |
| 1632 | } else { |
| 1633 | // If the first instruction is the definition, move the insertion point past it.
| 1634 | FirstMI = &*std::next(x: FirstMI->getIterator()); |
| 1635 | } |
| 1636 | } |
| 1637 | } |
| 1638 | |
| 1639 | // Insert instruction sequence at block beginning (before vector operations). |
| 1640 | const DebugLoc DL = MI.getDebugLoc(); |
| 1641 | const unsigned WavefrontSize = ST->getWavefrontSize(); |
| 1642 | const unsigned Mask = (WavefrontSize << 1) - 1; |
| 1643 | Register CountReg = MRI->createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass); |
| 1644 | auto BfeMI = BuildMI(BB&: *MBB, I: FirstMI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_BFE_U32), DestReg: CountReg) |
| 1645 | .addReg(RegNo: InputReg) |
| 1646 | .addImm(Val: (MI.getOperand(i: 1).getImm() & Mask) | 0x70000); |
| 1647 | auto BfmMI = |
| 1648 | BuildMI(BB&: *MBB, I: FirstMI, MIMD: DL, |
| 1649 | MCID: TII->get(Opcode: IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), DestReg: Exec) |
| 1650 | .addReg(RegNo: CountReg) |
| 1651 | .addImm(Val: 0); |
| 1652 | auto CmpMI = BuildMI(BB&: *MBB, I: FirstMI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_EQ_U32)) |
| 1653 | .addReg(RegNo: CountReg, flags: RegState::Kill) |
| 1654 | .addImm(Val: WavefrontSize); |
| 1655 | auto CmovMI = |
| 1656 | BuildMI(BB&: *MBB, I: FirstMI, MIMD: DL, |
| 1657 | MCID: TII->get(Opcode: IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64), |
| 1658 | DestReg: Exec) |
| 1659 | .addImm(Val: -1); |
| 1660 | |
| 1661 | if (!LIS) { |
| 1662 | MI.eraseFromParent(); |
| 1663 | return; |
| 1664 | } |
| 1665 | |
| 1666 | LIS->RemoveMachineInstrFromMaps(MI); |
| 1667 | MI.eraseFromParent(); |
| 1668 | |
| 1669 | LIS->InsertMachineInstrInMaps(MI&: *BfeMI); |
| 1670 | LIS->InsertMachineInstrInMaps(MI&: *BfmMI); |
| 1671 | LIS->InsertMachineInstrInMaps(MI&: *CmpMI); |
| 1672 | LIS->InsertMachineInstrInMaps(MI&: *CmovMI); |
| 1673 | |
| 1674 | LIS->removeInterval(Reg: InputReg); |
| 1675 | LIS->createAndComputeVirtRegInterval(Reg: InputReg); |
| 1676 | LIS->createAndComputeVirtRegInterval(Reg: CountReg); |
| 1677 | } |
| 1678 | |
| 1679 | /// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry |
| 1680 | /// for instructions that depend on EXEC. |
| 1681 | MachineBasicBlock::iterator |
| 1682 | SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) { |
| 1683 | MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI(); |
| 1684 | |
| 1685 | for (MachineInstr *MI : InitExecInstrs) { |
| 1686 | // Try to handle undefined cases gracefully: |
| 1687 | // - multiple INIT_EXEC instructions |
| 1688 | // - INIT_EXEC instructions not in the entry block |
| 1689 | if (MI->getParent() == &Entry) |
| 1690 | InsertPt = std::next(x: MI->getIterator()); |
| 1691 | |
| 1692 | lowerInitExec(MI&: *MI); |
| 1693 | Changed = true; |
| 1694 | } |
| 1695 | |
| 1696 | return InsertPt; |
| 1697 | } |
| 1698 | |
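/// Driver for the pass: analyze the function, then apply the lowerings.
///
/// The stages are: reset per-function state and select wave32/wave64 opcodes,
/// analyze which instructions need WQM/WWM, lower INIT_EXEC pseudos, copy the
/// live mask when kills, WQM or live-mask queries need it, decide whether each
/// V_SET_INACTIVE must be promoted to WWM or can become a copy, lower the
/// live-mask queries and copy pseudos, and finally either skip wave-mode
/// switching entirely, enable WQM for the whole shader, or run the per-block
/// state machine followed by block lowering.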
| 1699 | bool SIWholeQuadMode::run(MachineFunction &MF) { |
| 1700 | LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName() |
| 1701 | << " ------------- \n");
| 1702 | LLVM_DEBUG(MF.dump();); |
| 1703 | |
| 1704 | Instructions.clear(); |
| 1705 | Blocks.clear(); |
| 1706 | LiveMaskQueries.clear(); |
| 1707 | LowerToCopyInstrs.clear(); |
| 1708 | LowerToMovInstrs.clear(); |
| 1709 | KillInstrs.clear(); |
| 1710 | InitExecInstrs.clear(); |
| 1711 | SetInactiveInstrs.clear(); |
| 1712 | StateTransition.clear(); |
| 1713 | |
| 1714 | if (ST->isWave32()) { |
| 1715 | AndOpc = AMDGPU::S_AND_B32; |
| 1716 | AndTermOpc = AMDGPU::S_AND_B32_term; |
| 1717 | AndN2Opc = AMDGPU::S_ANDN2_B32; |
| 1718 | XorOpc = AMDGPU::S_XOR_B32; |
| 1719 | AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32; |
| 1720 | AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term; |
| 1721 | WQMOpc = AMDGPU::S_WQM_B32; |
| 1722 | Exec = AMDGPU::EXEC_LO; |
| 1723 | } else { |
| 1724 | AndOpc = AMDGPU::S_AND_B64; |
| 1725 | AndTermOpc = AMDGPU::S_AND_B64_term; |
| 1726 | AndN2Opc = AMDGPU::S_ANDN2_B64; |
| 1727 | XorOpc = AMDGPU::S_XOR_B64; |
| 1728 | AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64; |
| 1729 | AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term; |
| 1730 | WQMOpc = AMDGPU::S_WQM_B64; |
| 1731 | Exec = AMDGPU::EXEC; |
| 1732 | } |
| 1733 | |
| 1734 | const char GlobalFlags = analyzeFunction(MF); |
| 1735 | bool Changed = false; |
| 1736 | |
| 1737 | LiveMaskReg = Exec; |
| 1738 | |
| 1739 | MachineBasicBlock &Entry = MF.front(); |
| 1740 | MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed); |
| 1741 | |
| 1742 | // Store a copy of the original live mask when required |
| 1743 | const bool HasLiveMaskQueries = !LiveMaskQueries.empty(); |
| 1744 | const bool HasWaveModes = GlobalFlags & ~StateExact; |
| 1745 | const bool HasKills = !KillInstrs.empty(); |
| 1746 | const bool UsesWQM = GlobalFlags & StateWQM; |
| 1747 | if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) { |
| 1748 | LiveMaskReg = MRI->createVirtualRegister(RegClass: TRI->getBoolRC()); |
| 1749 | MachineInstr *MI = |
| 1750 | BuildMI(BB&: Entry, I: EntryMI, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: LiveMaskReg) |
| 1751 | .addReg(RegNo: Exec); |
| 1752 | LIS->InsertMachineInstrInMaps(MI&: *MI); |
| 1753 | Changed = true; |
| 1754 | } |
| 1755 | |
| 1756 | // Check if V_SET_INACTIVE was touched by a strict state mode. |
| 1757 | // If so, promote to WWM; otherwise lower to COPY. |
| 1758 | for (MachineInstr *MI : SetInactiveInstrs) { |
| 1759 | if (LowerToCopyInstrs.contains(key: MI)) |
| 1760 | continue; |
| 1761 | auto &Info = Instructions[MI]; |
| 1762 | if (Info.MarkedStates & StateStrict) { |
| 1763 | Info.Needs |= StateStrictWWM; |
| 1764 | Info.Disabled &= ~StateStrictWWM; |
| 1765 | Blocks[MI->getParent()].Needs |= StateStrictWWM; |
| 1766 | } else { |
| 1767 | LLVM_DEBUG(dbgs() << "Has no WWM marking: " << *MI); |
| 1768 | LowerToCopyInstrs.insert(X: MI); |
| 1769 | } |
| 1770 | } |
| 1771 | |
| 1772 | LLVM_DEBUG(printInfo()); |
| 1773 | |
| 1774 | Changed |= lowerLiveMaskQueries(); |
| 1775 | Changed |= lowerCopyInstrs(); |
| 1776 | |
| 1777 | if (!HasWaveModes) { |
| 1778 | // No wave mode execution |
| 1779 | Changed |= lowerKillInstrs(IsWQM: false); |
| 1780 | } else if (GlobalFlags == StateWQM) { |
| 1781 | // Shader only needs WQM |
| 1782 | auto MI = BuildMI(BB&: Entry, I: EntryMI, MIMD: DebugLoc(), MCID: TII->get(Opcode: WQMOpc), DestReg: Exec) |
| 1783 | .addReg(RegNo: Exec); |
| 1784 | LIS->InsertMachineInstrInMaps(MI&: *MI); |
| 1785 | lowerKillInstrs(IsWQM: true); |
| 1786 | Changed = true; |
| 1787 | } else { |
| 1788 | // Mark entry for WQM if required. |
| 1789 | if (GlobalFlags & StateWQM) |
| 1790 | Blocks[&Entry].InNeeds |= StateWQM; |
| 1791 | // Wave mode switching requires a full lowering pass.
| 1792 | for (auto &BII : Blocks) |
| 1793 | processBlock(MBB&: *BII.first, BI&: BII.second, IsEntry: BII.first == &Entry); |
| 1794 | // Lowering blocks causes block splitting, so perform it as a second pass.
| 1795 | for (auto &BII : Blocks) |
| 1796 | lowerBlock(MBB&: *BII.first, BI&: BII.second); |
| 1797 | Changed = true; |
| 1798 | } |
| 1799 | |
| 1800 | // Compute live range for live mask |
| 1801 | if (LiveMaskReg != Exec) |
| 1802 | LIS->createAndComputeVirtRegInterval(Reg: LiveMaskReg); |
| 1803 | |
| 1804 | // Physical registers like SCC aren't tracked by default anyway, so just |
| 1805 | // removing the ranges we computed is the simplest option for maintaining |
| 1806 | // the analysis results. |
| 1807 | LIS->removeAllRegUnitsForPhysReg(Reg: AMDGPU::SCC); |
| 1808 | |
| 1809 | // If we lowered any kills or INIT_EXEC pseudos then recompute EXEC.
| 1810 | if (!KillInstrs.empty() || !InitExecInstrs.empty()) |
| 1811 | LIS->removeAllRegUnitsForPhysReg(Reg: AMDGPU::EXEC); |
| 1812 | |
| 1813 | return Changed; |
| 1814 | } |
| 1815 | |
| 1816 | bool SIWholeQuadModeLegacy::runOnMachineFunction(MachineFunction &MF) { |
| 1817 | LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS(); |
| 1818 | auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>(); |
| 1819 | MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr; |
| 1820 | auto *PDTWrapper = |
| 1821 | getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>(); |
| 1822 | MachinePostDominatorTree *PDT = |
| 1823 | PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr; |
| 1824 | SIWholeQuadMode Impl(MF, LIS, MDT, PDT); |
| 1825 | return Impl.run(MF); |
| 1826 | } |
| 1827 | |
| 1828 | PreservedAnalyses |
| 1829 | SIWholeQuadModePass::run(MachineFunction &MF, |
| 1830 | MachineFunctionAnalysisManager &MFAM) { |
| 1831 | MFPropsModifier _(*this, MF); |
| 1832 | |
| 1833 | LiveIntervals *LIS = &MFAM.getResult<LiveIntervalsAnalysis>(IR&: MF); |
| 1834 | MachineDominatorTree *MDT = |
| 1835 | MFAM.getCachedResult<MachineDominatorTreeAnalysis>(IR&: MF); |
| 1836 | MachinePostDominatorTree *PDT = |
| 1837 | MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(IR&: MF); |
| 1838 | SIWholeQuadMode Impl(MF, LIS, MDT, PDT); |
| 1839 | bool Changed = Impl.run(MF); |
| 1840 | if (!Changed) |
| 1841 | return PreservedAnalyses::all(); |
| 1842 | |
| 1843 | PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses(); |
| 1844 | PA.preserve<SlotIndexesAnalysis>(); |
| 1845 | PA.preserve<LiveIntervalsAnalysis>(); |
| 1846 | PA.preserve<MachineDominatorTreeAnalysis>(); |
| 1847 | PA.preserve<MachinePostDominatorTreeAnalysis>(); |
| 1848 | return PA; |
| 1849 | } |
| 1850 | |