| 1 | //===- AMDGPULowerVGPREncoding.cpp - lower VGPRs above v255 ---------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | /// \file |
| 10 | /// Lower VGPRs above first 256 on gfx1250. |
| 11 | /// |
| 12 | /// The pass scans used VGPRs and inserts S_SET_VGPR_MSB instructions to switch |
| 13 | /// VGPR addressing mode. The mode change is effective until the next change. |
| 14 | /// This instruction provides high bits of a VGPR address for four of the |
| 15 | /// operands: vdst, src0, src1, and src2, or other 4 operands depending on the |
| 16 | /// instruction encoding. If bits are set they are added as MSB to the |
| 17 | /// corresponding operand VGPR number. |
| 18 | /// |
| 19 | /// There is no need to replace actual register operands because encoding of the |
| 20 | /// high and low VGPRs is the same. I.e. v0 has the encoding 0x100, so does |
| 21 | /// v256. v1 has the encoding 0x101 and v257 has the same encoding. So high |
| 22 | /// VGPRs will survive until actual encoding and will result in a same actual |
| 23 | /// bit encoding. |
| 24 | /// |
| 25 | /// As a result the pass only inserts S_SET_VGPR_MSB to provide an actual offset |
| 26 | /// to a VGPR address of the subseqent instructions. The InstPrinter will take |
| 27 | /// care of the printing a low VGPR instead of a high one. In prinicple this |
| 28 | /// shall be viable to print actual high VGPR numbers, but that would disagree |
| 29 | /// with a disasm printing and create a situation where asm text is not |
| 30 | /// deterministic. |
| 31 | /// |
| 32 | /// This pass creates a convention where non-fall through basic blocks shall |
| 33 | /// start with all 4 MSBs zero. Otherwise a disassembly would not be readable. |
| 34 | /// An optimization here is possible but deemed not desirable because of the |
| 35 | /// readbility concerns. |
| 36 | /// |
| 37 | /// Consequentially the ABI is set to expect all 4 MSBs to be zero on entry. |
| 38 | /// The pass must run very late in the pipeline to make sure no changes to VGPR |
| 39 | /// operands will be made after it. |
| 40 | // |
| 41 | //===----------------------------------------------------------------------===// |
| 42 | |
| 43 | #include "AMDGPULowerVGPREncoding.h" |
| 44 | #include "AMDGPU.h" |
| 45 | #include "GCNSubtarget.h" |
| 46 | #include "SIDefines.h" |
| 47 | #include "SIInstrInfo.h" |
| 48 | #include "llvm/ADT/bit.h" |
| 49 | #include "llvm/Support/Debug.h" |
| 50 | #include "llvm/Support/MathExtras.h" |
| 51 | |
| 52 | using namespace llvm; |
| 53 | |
| 54 | #define DEBUG_TYPE "amdgpu-lower-vgpr-encoding" |
| 55 | |
| 56 | namespace { |
| 57 | |
/// Tracks the VGPR MSB addressing mode across a machine function and inserts
/// S_SET_VGPR_MSB instructions where the mode has to change. See the file
/// comment for the overall scheme.
class AMDGPULowerVGPREncoding {
  /// Number of operand slots a single mode change controls (vdst, src0, src1
  /// and src2, or another 4 operands depending on the encoding).
  static constexpr unsigned OpNum = 4;
  /// Width of one operand's MSB field in the packed mode value.
  static constexpr unsigned BitsPerField = 2;
  /// Number of MSB fields in the packed mode value.
  static constexpr unsigned NumFields = 4;
  /// Total width of the packed mode value in bits.
  static constexpr unsigned ModeWidth = NumFields * BitsPerField;
  /// Mask covering the whole packed mode value.
  static constexpr unsigned ModeMask = (1 << ModeWidth) - 1;
  /// Bit position of the VGPR MSB field inside the MODE hardware register,
  /// derived from the field mask rather than hard-coded.
  static constexpr unsigned VGPRMSBShift =
      llvm::countr_zero_constexpr<unsigned>(Val: AMDGPU::Hwreg::DST_VGPR_MSB);

  /// MSB state of a single operand slot.
  struct OpMode {
    // No MSBs set means they are not required to be of a particular value.
    std::optional<unsigned> MSBits;

    /// Merge \p New into this slot. \returns true if the effective value
    /// (unset treated as 0) changed. Sets \p Rewritten if a previously known
    /// value was replaced, i.e. an already emitted mode set cannot simply be
    /// extended in place.
    bool update(const OpMode &New, bool &Rewritten) {
      bool Updated = false;
      if (New.MSBits) {
        if (*New.MSBits != MSBits.value_or(u: 0)) {
          Updated = true;
          Rewritten |= MSBits.has_value();
        }
        MSBits = New.MSBits;
      }
      return Updated;
    }
  };

  /// Packed MSB state for all four operand slots.
  struct ModeTy {
    OpMode Ops[OpNum];

    /// Merge \p New slot by slot. See OpMode::update for the meaning of the
    /// return value and \p Rewritten.
    bool update(const ModeTy &New, bool &Rewritten) {
      bool Updated = false;
      for (unsigned I : seq(Size: OpNum))
        Updated |= Ops[I].update(New: New.Ops[I], Rewritten);
      return Updated;
    }

    /// Pack this mode into the S_SET_VGPR_MSB immediate encoding; unset
    /// fields encode as 0.
    unsigned encode() const {
      // Layout: [src0 msb, src1 msb, src2 msb, dst msb].
      unsigned V = 0;
      for (const auto &[I, Op] : enumerate(First: Ops))
        V |= Op.MSBits.value_or(u: 0) << (I * 2);
      return V;
    }

    /// Debug-printing helper; unset fields print as '?'.
    void print(raw_ostream &OS) const {
      static const char *FieldNames[] = {"src0", "src1", "src2", "dst"};
      OS << '{';
      for (const auto &[I, Op] : enumerate(First: Ops)) {
        if (I)
          OS << ", ";
        OS << FieldNames[I] << '=';
        if (Op.MSBits)
          OS << *Op.MSBits;
        else
          OS << '?';
      }
      OS << '}';
    }

    // Check if this mode is compatible with required \p NewMode without
    // modification. Fields \p NewMode does not require are ignored; unset
    // fields on this side compare as 0.
    bool isCompatible(const ModeTy NewMode) const {
      for (unsigned I : seq(Size: OpNum)) {
        if (!NewMode.Ops[I].MSBits.has_value())
          continue;
        if (Ops[I].MSBits.value_or(u: 0) != NewMode.Ops[I].MSBits.value_or(u: 0))
          return false;
      }
      return true;
    }
  };

public:
  bool run(MachineFunction &MF);

private:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;

  // Current basic block.
  MachineBasicBlock *MBB;

  /// Most recent s_set_* instruction, used to piggyback later mode changes
  /// instead of emitting new instructions. Null if none is reusable.
  MachineInstr *MostRecentModeSet;

  /// Current mode bits.
  ModeTy CurrentMode;

  /// Number of current hard clause instructions.
  unsigned ClauseLen;

  /// Number of hard clause instructions remaining.
  unsigned ClauseRemaining;

  /// Clause group breaks.
  unsigned ClauseBreaks;

  /// Last hard clause instruction.
  MachineInstr *Clause;

  /// S_SET_VGPR_MSB immediately after S_SETREG_IMM32_B32 targeting MODE is
  /// silently dropped on GFX1250. When set, the next S_SET_VGPR_MSB insertion
  /// must be preceded by S_NOP to avoid the hazard.
  bool NeedNopBeforeSetVGPRMSB;

  /// Insert mode change before \p I. \returns true if mode was changed.
  bool setMode(ModeTy NewMode, MachineBasicBlock::instr_iterator I);

  /// Reset mode to default, i.e. all four MSB fields explicitly zero.
  void resetMode(MachineBasicBlock::instr_iterator I) {
    ModeTy Mode;
    for (OpMode &Op : Mode.Ops)
      Op.MSBits = 0;
    setMode(NewMode: Mode, I);
  }

  /// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt.
  std::optional<unsigned> getMSBs(const MachineOperand &MO) const;

  /// Handle single \p MI. \return true if changed.
  bool runOnMachineInstr(MachineInstr &MI);

  /// Compute the mode for a single \p MI given \p Ops operands
  /// bit mapping. Optionally takes second array \p Ops2 for VOPD.
  /// If provided and an operand from \p Ops is not a VGPR, then \p Ops2
  /// is checked.
  void computeMode(ModeTy &NewMode, const MachineInstr &MI,
                   const AMDGPU::OpName Ops[OpNum],
                   const AMDGPU::OpName *Ops2 = nullptr);

  /// Check if an instruction \p I is within a clause and returns a suitable
  /// iterator to insert mode change. It may also modify the S_CLAUSE
  /// instruction to extend it or drop the clause if it cannot be adjusted.
  MachineBasicBlock::instr_iterator
  handleClause(MachineBasicBlock::instr_iterator I);

  /// Check if an instruction \p I is immediately after another program state
  /// instruction which it cannot coissue with. If so, insert before that
  /// instruction to encourage more coissuing.
  MachineBasicBlock::instr_iterator
  handleCoissue(MachineBasicBlock::instr_iterator I);

  /// Handle S_SETREG_IMM32_B32 targeting MODE register. On certain hardware,
  /// this instruction clobbers VGPR MSB bits[12:19], so we need to restore
  /// the current mode. \returns true if the instruction was modified or a
  /// new one was inserted.
  bool handleSetregMode(MachineInstr &MI);

  /// Update bits[12:19] of the imm operand in S_SETREG_IMM32_B32 to contain
  /// the VGPR MSB mode value. \returns true if the immediate was changed.
  bool updateSetregModeImm(MachineInstr &MI, int64_t ModeValue);
};
| 210 | |
/// Make the mode required by \p NewMode effective at \p I, either by
/// piggybacking onto the most recent mode-setting instruction or by inserting
/// a new S_SET_VGPR_MSB. \returns true if anything was changed.
bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode,
                                      MachineBasicBlock::instr_iterator I) {
  LLVM_DEBUG({
    dbgs() << " setMode: NewMode=";
    NewMode.print(dbgs());
    dbgs() << " CurrentMode=";
    CurrentMode.print(dbgs());
    dbgs() << " MostRecentModeSet=" << (MostRecentModeSet ? "yes" : "null");
    if (I != MBB->instr_end())
      dbgs() << " before: " << *I;
    else
      dbgs() << " at end\n";
  });

  // Record previous mode into high 8 bits of the immediate; the new mode
  // occupies the low 8 bits when the instruction is built below.
  int64_t OldModeBits = CurrentMode.encode() << ModeWidth;

  bool Rewritten = false;
  if (!CurrentMode.update(New: NewMode, Rewritten)) {
    LLVM_DEBUG(dbgs() << " -> no change needed\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << " Rewritten=" << Rewritten << " after update\n");

  // If the merge only added new information (no already-set field was
  // rewritten), fold the bits into the previous mode-setting instruction
  // instead of emitting another one.
  if (MostRecentModeSet && !Rewritten) {
    // Update MostRecentModeSet with the new mode. It can be either
    // S_SET_VGPR_MSB or S_SETREG_IMM32_B32 (with Size <= 12).
    if (MostRecentModeSet->getOpcode() == AMDGPU::S_SET_VGPR_MSB) {
      MachineOperand &Op = MostRecentModeSet->getOperand(i: 0);
      // Carry old mode bits from the existing instruction. This deliberately
      // shadows the outer OldModeBits: the previous mode recorded in that
      // instruction is the one that was current when it was emitted.
      int64_t OldModeBits = Op.getImm() & (ModeMask << ModeWidth);
      Op.setImm(CurrentMode.encode() | OldModeBits);
      LLVM_DEBUG(dbgs() << " -> piggybacked onto S_SET_VGPR_MSB: "
                        << *MostRecentModeSet);
    } else {
      assert(MostRecentModeSet->getOpcode() == AMDGPU::S_SETREG_IMM32_B32 &&
             "unexpected MostRecentModeSet opcode");
      updateSetregModeImm(MI&: *MostRecentModeSet, ModeValue: CurrentMode.encode());
      LLVM_DEBUG(dbgs() << " -> piggybacked onto S_SETREG_IMM32_B32: "
                        << *MostRecentModeSet);
    }

    return true;
  }

  // Find a legal insertion point w.r.t. hard clauses and coissue rules.
  I = handleClause(I);
  I = handleCoissue(I);
  // Case 2 match in handleSetregMode: the setreg's imm[12:19] matched
  // current MSBs, but the next VALU needs different MSBs, so this
  // S_SET_VGPR_MSB would land right after the setreg. Insert S_NOP to
  // prevent it from being silently dropped.
  if (NeedNopBeforeSetVGPRMSB) {
    BuildMI(BB&: *MBB, I, MIMD: {}, MCID: TII->get(Opcode: AMDGPU::S_NOP)).addImm(Val: 0);
    NeedNopBeforeSetVGPRMSB = false;
  }
  MostRecentModeSet = BuildMI(BB&: *MBB, I, MIMD: {}, MCID: TII->get(Opcode: AMDGPU::S_SET_VGPR_MSB))
                          .addImm(Val: NewMode.encode() | OldModeBits);
  LLVM_DEBUG(dbgs() << " -> inserted new S_SET_VGPR_MSB: "
                    << *MostRecentModeSet);

  // The new instruction encodes NewMode with unset fields as 0, so track
  // NewMode itself rather than the merged state computed by update() above.
  CurrentMode = NewMode;
  return true;
}
| 275 | |
| 276 | std::optional<unsigned> |
| 277 | AMDGPULowerVGPREncoding::getMSBs(const MachineOperand &MO) const { |
| 278 | if (!MO.isReg()) |
| 279 | return std::nullopt; |
| 280 | |
| 281 | MCRegister Reg = MO.getReg(); |
| 282 | const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg); |
| 283 | if (!RC || !TRI->isVGPRClass(RC)) |
| 284 | return std::nullopt; |
| 285 | |
| 286 | unsigned Idx = TRI->getHWRegIndex(Reg); |
| 287 | return Idx >> 8; |
| 288 | } |
| 289 | |
/// Compute the MSB mode required by \p MI: for each of the four controlled
/// operand slots, record the MSBs of any VGPR operand found via \p Ops (or,
/// failing that, \p Ops2 for VOPD). Slots with no VGPR stay unset.
void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode,
                                          const MachineInstr &MI,
                                          const AMDGPU::OpName Ops[OpNum],
                                          const AMDGPU::OpName *Ops2) {
  // Start from an all-unset mode; only VGPR operands constrain the result.
  NewMode = {};

  for (unsigned I = 0; I < OpNum; ++I) {
    const MachineOperand *Op = TII->getNamedOperand(MI, OperandName: Ops[I]);

    std::optional<unsigned> MSBits;
    if (Op)
      MSBits = getMSBs(MO: *Op);

#if !defined(NDEBUG)
    // Sanity check: when both operand tables resolve for this slot, the two
    // VOPD halves must agree on the MSBs, since one field covers both.
    if (MSBits.has_value() && Ops2) {
      const MachineOperand *Op2 = TII->getNamedOperand(MI, Ops2[I]);
      if (Op2) {
        std::optional<unsigned> MSBits2;
        MSBits2 = getMSBs(*Op2);
        if (MSBits2.has_value() && MSBits != MSBits2)
          llvm_unreachable("Invalid VOPD pair was created");
      }
    }
#endif

    // Fall back to the second (VOPD) table if the first one did not yield a
    // VGPR for this slot.
    if (!MSBits.has_value() && Ops2) {
      Op = TII->getNamedOperand(MI, OperandName: Ops2[I]);
      if (Op)
        MSBits = getMSBs(MO: *Op);
    }

    if (!MSBits.has_value())
      continue;

    // Skip tied uses of src2 of VOP2, these will be handled along with defs and
    // only vdst bit affects these operands. We cannot skip tied uses of VOP3,
    // these uses are real even if must match the vdst.
    if (Ops[I] == AMDGPU::OpName::src2 && !Op->isDef() && Op->isTied() &&
        (SIInstrInfo::isVOP2(MI) ||
         (SIInstrInfo::isVOP3(MI) &&
          TII->hasVALU32BitEncoding(Opcode: MI.getOpcode()))))
      continue;

    NewMode.Ops[I].MSBits = MSBits.value();
  }
}
| 336 | |
/// Process one instruction: compute the mode it needs and make it effective,
/// possibly commuting the instruction to avoid a mode switch.
/// \returns true if MI or the surrounding code was changed.
bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
  // Ops.first is null for instructions without lowerable VGPR operands.
  auto Ops = AMDGPU::getVGPRLoweringOperandTables(Desc: MI.getDesc());
  if (Ops.first) {
    ModeTy NewMode;
    computeMode(NewMode, MI, Ops: Ops.first, Ops2: Ops.second);
    LLVM_DEBUG({
      dbgs() << " runOnMachineInstr: ";
      MI.print(dbgs());
      dbgs() << " computed NewMode=";
      NewMode.print(dbgs());
      dbgs() << " compatible=" << CurrentMode.isCompatible(NewMode) << '\n';
    });
    // If the required mode conflicts with the current one, try commuting:
    // the commuted operand order may fit the current mode and save a switch.
    if (!CurrentMode.isCompatible(NewMode) && MI.isCommutable() &&
        TII->commuteInstruction(MI)) {
      ModeTy NewModeCommuted;
      computeMode(NewMode&: NewModeCommuted, MI, Ops: Ops.first, Ops2: Ops.second);
      LLVM_DEBUG({
        dbgs() << " commuted NewMode=";
        NewModeCommuted.print(dbgs());
        dbgs() << " compatible=" << CurrentMode.isCompatible(NewModeCommuted)
               << '\n';
      });
      if (CurrentMode.isCompatible(NewMode: NewModeCommuted)) {
        // Update CurrentMode with mode bits the commuted instruction relies on.
        // This prevents later instructions from piggybacking and corrupting
        // those bits (e.g., a nullopt src treated as 0 could be overwritten).
        bool Unused = false;
        CurrentMode.update(New: NewModeCommuted, Rewritten&: Unused);
        // MI was modified by the commute above.
        return true;
      }
      // Commute back.
      if (!TII->commuteInstruction(MI))
        llvm_unreachable("Failed to restore commuted instruction.");
    }
    return setMode(NewMode, I: MI.getIterator());
  }
  // Instructions without an operand table must not use VGPRs at all (or be
  // meta/pseudo instructions that never reach encoding).
  assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo());

  return false;
}
| 378 | |
/// Adjust the insertion point (and, if needed, the S_CLAUSE instruction) so
/// that a mode change at \p I does not break hard-clause rules.
MachineBasicBlock::instr_iterator
AMDGPULowerVGPREncoding::handleClause(MachineBasicBlock::instr_iterator I) {
  // Not inside a hard clause: insert right at \p I.
  if (!ClauseRemaining)
    return I;

  // A clause cannot start with a special instruction, place it right before
  // the clause.
  if (ClauseRemaining == ClauseLen) {
    I = Clause->getPrevNode()->getIterator();
    assert(I->isBundle());
    return I;
  }

  // If a clause defines breaks each group cannot start with a mode change,
  // just drop the clause.
  if (ClauseBreaks) {
    Clause->eraseFromBundle();
    ClauseRemaining = 0;
    return I;
  }

  // Otherwise adjust a number of instructions in the clause if it fits.
  // If it does not clause will just become shorter. Since the length
  // recorded in the clause is one less, increment the length after the
  // update. Note that SIMM16[5:0] must be 1-62, not 0 or 63.
  if (ClauseLen < 63)
    Clause->getOperand(i: 0).setImm(ClauseLen | (ClauseBreaks << 8));

  ++ClauseLen;

  return I;
}
| 411 | |
| 412 | MachineBasicBlock::instr_iterator |
| 413 | AMDGPULowerVGPREncoding::handleCoissue(MachineBasicBlock::instr_iterator I) { |
| 414 | if (I.isEnd()) |
| 415 | return I; |
| 416 | |
| 417 | // "Program State instructions" are instructions which are used to control |
| 418 | // operation of the GPU rather than performing arithmetic. Such instructions |
| 419 | // have different coissuing rules w.r.t s_set_vgpr_msb. |
| 420 | auto isProgramStateInstr = [this](MachineInstr *MI) { |
| 421 | unsigned Opc = MI->getOpcode(); |
| 422 | return TII->isBarrier(Opcode: Opc) || TII->isWaitcnt(Opcode: Opc) || |
| 423 | Opc == AMDGPU::S_DELAY_ALU; |
| 424 | }; |
| 425 | |
| 426 | while (!I.isEnd() && I != I->getParent()->begin()) { |
| 427 | auto Prev = std::prev(x: I); |
| 428 | if (!isProgramStateInstr(&*Prev)) |
| 429 | return I; |
| 430 | I = Prev; |
| 431 | } |
| 432 | |
| 433 | return I; |
| 434 | } |
| 435 | |
/// Convert mode value from S_SET_VGPR_MSB format to MODE register format.
/// S_SET_VGPR_MSB uses: (src0[0-1], src1[2-3], src2[4-5], dst[6-7])
/// MODE register uses: (dst[0-1], src0[2-3], src1[4-5], src2[6-7])
/// This is a left rotation by 2 bits on an 8-bit value.
static int64_t convertModeToSetregFormat(int64_t Mode) {
  assert(static_cast<uint64_t>(Mode) <= 0xFF && "Mode expected to be 8-bit");
  const uint8_t Bits = static_cast<uint8_t>(Mode);
  // rotl(Bits, 2) spelled with shifts; the cast truncates back to 8 bits.
  return static_cast<uint8_t>((Bits << 2) | (Bits >> 6));
}
| 444 | |
| 445 | bool AMDGPULowerVGPREncoding::updateSetregModeImm(MachineInstr &MI, |
| 446 | int64_t ModeValue) { |
| 447 | assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32); |
| 448 | |
| 449 | // Convert from S_SET_VGPR_MSB format to MODE register format |
| 450 | int64_t SetregMode = convertModeToSetregFormat(Mode: ModeValue); |
| 451 | |
| 452 | MachineOperand *ImmOp = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::imm); |
| 453 | int64_t OldImm = ImmOp->getImm(); |
| 454 | int64_t NewImm = |
| 455 | (OldImm & ~AMDGPU::Hwreg::VGPR_MSB_MASK) | (SetregMode << VGPRMSBShift); |
| 456 | ImmOp->setImm(NewImm); |
| 457 | return NewImm != OldImm; |
| 458 | } |
| 459 | |
/// Post-process an S_SETREG_IMM32_B32. If it writes the MODE register it
/// clobbers the VGPR MSB field (bits[12:19]), so either fold the current
/// mode into its immediate or follow it with a fix-up instruction.
bool AMDGPULowerVGPREncoding::handleSetregMode(MachineInstr &MI) {
  using namespace AMDGPU::Hwreg;

  assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 &&
         "only S_SETREG_IMM32_B32 needs to be handled");

  LLVM_DEBUG(dbgs() << " handleSetregMode: " << MI);

  MachineOperand *SIMM16Op = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::simm16);
  assert(SIMM16Op && "SIMM16Op must be present");

  // Decode which hardware register and bit range the setreg targets.
  auto [HwRegId, Offset, Size] = HwregEncoding::decode(Encoded: SIMM16Op->getImm());
  (void)Offset;
  LLVM_DEBUG(dbgs() << " HwRegId=" << HwRegId << " Offset=" << Offset
                    << " Size=" << Size << '\n');
  if (HwRegId != ID_MODE) {
    LLVM_DEBUG(dbgs() << " -> not ID_MODE, skipping\n");
    return false;
  }

  int64_t ModeValue = CurrentMode.encode();
  LLVM_DEBUG({
    dbgs() << " CurrentMode=";
    CurrentMode.print(dbgs());
    dbgs() << " encoded=0x" << Twine::utohexstr(ModeValue)
           << " VGPRMSBShift=" << VGPRMSBShift << '\n';
  });

  // Case 1: Size <= 12 - the original instruction uses imm32[0:Size-1], so
  // imm32[12:19] is unused. Safe to set imm32[12:19] to the correct VGPR
  // MSBs.
  if (Size <= VGPRMSBShift) {
    LLVM_DEBUG(dbgs() << " Case 1: Size(" << Size << ") <= VGPRMSBShift("
                      << VGPRMSBShift
                      << "), treating as mode scope boundary\n");
    // This instruction is at the boundary of the old mode's control range.
    // Reset CurrentMode so that the next setMode call can freely piggyback
    // the required mode into bits[12:19] without triggering Rewritten.
    MostRecentModeSet = &MI;
    CurrentMode = {};
    bool Changed = updateSetregModeImm(MI, ModeValue: 0);
    LLVM_DEBUG(dbgs() << " -> reset CurrentMode, cleared bits[12:19]: "
                      << MI);
    return Changed;
  }

  // Case 2: Size > 12 - the original instruction uses bits beyond 11, so we
  // cannot arbitrarily modify imm32[12:19]. Check if it already matches VGPR
  // MSBs. Note: imm32[12:19] is in MODE register format, while ModeValue is
  // in S_SET_VGPR_MSB format, so we need to convert before comparing.
  MachineOperand *ImmOp = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::imm);
  assert(ImmOp && "ImmOp must be present");
  int64_t ImmBits12To19 = (ImmOp->getImm() & VGPR_MSB_MASK) >> VGPRMSBShift;
  int64_t SetregModeValue = convertModeToSetregFormat(Mode: ModeValue);
  LLVM_DEBUG(dbgs() << " Case 2: Size(" << Size << ") > VGPRMSBShift, "
                    << "ImmBits12To19=0x" << Twine::utohexstr(ImmBits12To19)
                    << " SetregModeValue=0x"
                    << Twine::utohexstr(SetregModeValue) << '\n');
  if (ImmBits12To19 == SetregModeValue) {
    // Already correct, but we must invalidate MostRecentModeSet because this
    // instruction will overwrite mode[12:19]. We can't update this instruction
    // via piggybacking (bits[12:19] are meaningful), so if CurrentMode changes,
    // a new s_set_vgpr_msb will be inserted after this instruction.
    MostRecentModeSet = nullptr;
    NeedNopBeforeSetVGPRMSB = true;
    LLVM_DEBUG(dbgs() << " -> bits[12:19] already correct, "
                         "invalidated MostRecentModeSet\n");
    return false;
  }

  // imm32[12:19] doesn't match VGPR MSBs - insert s_set_vgpr_msb after
  // the original instruction to restore the correct value. Insert S_NOP
  // to avoid the GFX1250 hazard where S_SET_VGPR_MSB immediately after
  // S_SETREG_IMM32_B32(MODE) is silently dropped.
  MachineBasicBlock::iterator InsertPt = std::next(x: MI.getIterator());
  BuildMI(BB&: *MBB, I: InsertPt, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_NOP)).addImm(Val: 0);
  MostRecentModeSet = BuildMI(BB&: *MBB, I: InsertPt, MIMD: MI.getDebugLoc(),
                              MCID: TII->get(Opcode: AMDGPU::S_SET_VGPR_MSB))
                          .addImm(Val: ModeValue);
  LLVM_DEBUG(dbgs() << " -> inserted S_SET_VGPR_MSB after setreg: "
                    << *MostRecentModeSet);
  return true;
}
| 543 | |
/// Driver: walk every instruction of \p MF, tracking and switching the VGPR
/// MSB mode as needed. \returns true if the function was changed.
bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  // Only subtargets with more than 256 addressable VGPRs need lowering.
  if (!ST.has1024AddressableVGPRs())
    return false;

  TII = ST.getInstrInfo();
  TRI = ST.getRegisterInfo();

  LLVM_DEBUG(dbgs() << "*** AMDGPULowerVGPREncoding on " << MF.getName()
                    << " ***\n");

  bool Changed = false;
  ClauseLen = ClauseRemaining = 0;
  // Per the ABI convention all 4 MSBs are zero on entry.
  CurrentMode = {};
  for (auto &MBB : MF) {
    // Piggybacking and the setreg hazard do not carry across blocks.
    MostRecentModeSet = nullptr;
    NeedNopBeforeSetVGPRMSB = false;
    this->MBB = &MBB;

    LLVM_DEBUG(dbgs() << "BB#" << MBB.getNumber() << ' ' << MBB.getName()
                      << ":\n");

    for (auto &MI : llvm::make_early_inc_range(Range: MBB.instrs())) {
      if (MI.isMetaInstruction())
        continue;

      if (MI.isTerminator() || MI.isCall()) {
        LLVM_DEBUG(dbgs() << " terminator/call: " << MI);
        // Nothing executes after an endpgm, just forget the mode; otherwise
        // restore the all-zero convention before control leaves the block.
        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED)
          CurrentMode = {};
        else
          resetMode(I: MI.getIterator());
        NeedNopBeforeSetVGPRMSB = false;
        continue;
      }

      if (MI.isInlineAsm()) {
        LLVM_DEBUG(dbgs() << " inline asm: " << MI);
        // Force the default mode before inline asm that touches VGPRs.
        if (TII->hasVGPRUses(MI))
          resetMode(I: MI.getIterator());
        NeedNopBeforeSetVGPRMSB = false;
        continue;
      }

      if (MI.getOpcode() == AMDGPU::S_CLAUSE) {
        assert(!ClauseRemaining && "Nested clauses are not supported");
        // SIMM16[5:0] is the clause length minus one, SIMM16[11:8] the
        // number of group breaks (see handleClause for re-encoding).
        ClauseLen = MI.getOperand(i: 0).getImm();
        ClauseBreaks = (ClauseLen >> 8) & 15;
        ClauseLen = ClauseRemaining = (ClauseLen & 63) + 1;
        Clause = &MI;
        LLVM_DEBUG(dbgs() << " clause: len=" << ClauseLen
                          << " breaks=" << ClauseBreaks << '\n');
        continue;
      }

      if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 &&
          ST.hasSetregVGPRMSBFixup()) {
        Changed |= handleSetregMode(MI);
        continue;
      }

      Changed |= runOnMachineInstr(MI);
      NeedNopBeforeSetVGPRMSB = false;

      if (ClauseRemaining)
        --ClauseRemaining;
    }

    // Reset the mode if we are falling through.
    LLVM_DEBUG(dbgs() << " end of BB, resetting mode\n");
    resetMode(I: MBB.instr_end());
  }

  return Changed;
}
| 620 | |
/// Legacy pass manager wrapper around AMDGPULowerVGPREncoding.
class AMDGPULowerVGPREncodingLegacy : public MachineFunctionPass {
public:
  static char ID; // Pass identification.

  AMDGPULowerVGPREncodingLegacy() : MachineFunctionPass(ID) {}

  /// Run the lowering on \p MF. \returns true if the function was changed.
  bool runOnMachineFunction(MachineFunction &MF) override {
    return AMDGPULowerVGPREncoding().run(MF);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // The pass only inserts/rewrites instructions inside blocks.
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
| 636 | |
| 637 | } // namespace |
| 638 | |
// Legacy pass-manager registration boilerplate.
char AMDGPULowerVGPREncodingLegacy::ID = 0;

// Exported handle used by the target to reference this pass by ID.
char &llvm::AMDGPULowerVGPREncodingLegacyID = AMDGPULowerVGPREncodingLegacy::ID;

INITIALIZE_PASS(AMDGPULowerVGPREncodingLegacy, DEBUG_TYPE,
                "AMDGPU Lower VGPR Encoding", false, false)
| 645 | |
| 646 | PreservedAnalyses |
| 647 | AMDGPULowerVGPREncodingPass::run(MachineFunction &MF, |
| 648 | MachineFunctionAnalysisManager &MFAM) { |
| 649 | if (!AMDGPULowerVGPREncoding().run(MF)) |
| 650 | return PreservedAnalyses::all(); |
| 651 | |
| 652 | return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>(); |
| 653 | } |
| 654 | |