1//===- AMDGPULowerVGPREncoding.cpp - lower VGPRs above v255 ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Lower VGPRs above first 256 on gfx1250.
11///
12/// The pass scans used VGPRs and inserts S_SET_VGPR_MSB instructions to switch
13/// VGPR addressing mode. The mode change is effective until the next change.
14/// This instruction provides high bits of a VGPR address for four of the
15/// operands: vdst, src0, src1, and src2, or other 4 operands depending on the
16/// instruction encoding. If bits are set they are added as MSB to the
17/// corresponding operand VGPR number.
18///
19/// There is no need to replace actual register operands because encoding of the
20/// high and low VGPRs is the same. I.e. v0 has the encoding 0x100, so does
21/// v256. v1 has the encoding 0x101 and v257 has the same encoding. So high
22/// VGPRs will survive until actual encoding and will result in a same actual
23/// bit encoding.
24///
/// As a result the pass only inserts S_SET_VGPR_MSB to provide an actual offset
/// to a VGPR address of the subsequent instructions. The InstPrinter will take
/// care of printing a low VGPR instead of a high one. In principle it would be
/// viable to print actual high VGPR numbers, but that would disagree with the
/// disassembler's printing and create a situation where asm text is not
/// deterministic.
31///
32/// This pass creates a convention where non-fall through basic blocks shall
33/// start with all 4 MSBs zero. Otherwise a disassembly would not be readable.
/// An optimization here is possible but deemed not desirable because of the
/// readability concerns.
36///
37/// Consequentially the ABI is set to expect all 4 MSBs to be zero on entry.
38/// The pass must run very late in the pipeline to make sure no changes to VGPR
39/// operands will be made after it.
40//
41//===----------------------------------------------------------------------===//
42
43#include "AMDGPULowerVGPREncoding.h"
44#include "AMDGPU.h"
45#include "GCNSubtarget.h"
46#include "SIDefines.h"
47#include "SIInstrInfo.h"
48#include "llvm/ADT/bit.h"
49#include "llvm/Support/MathExtras.h"
50
51using namespace llvm;
52
53#define DEBUG_TYPE "amdgpu-lower-vgpr-encoding"
54
55namespace {
56
/// Inserts S_SET_VGPR_MSB mode switches so that instructions may address VGPRs
/// above v255. Tracks the current MSB mode per basic block and updates or
/// inserts mode-set instructions as needed.
class AMDGPULowerVGPREncoding {
  /// Number of operands whose MSBs one S_SET_VGPR_MSB controls.
  static constexpr unsigned OpNum = 4;
  /// Width in bits of a single per-operand MSB field.
  static constexpr unsigned BitsPerField = 2;
  /// Number of MSB fields packed into the mode immediate.
  static constexpr unsigned NumFields = 4;
  /// Total width of the packed mode value (8 bits).
  static constexpr unsigned ModeWidth = NumFields * BitsPerField;
  /// Mask covering the packed mode value.
  static constexpr unsigned ModeMask = (1 << ModeWidth) - 1;
  /// Bit position of the VGPR MSB field within the MODE register immediate.
  static constexpr unsigned VGPRMSBShift =
      llvm::countr_zero_constexpr<unsigned>(Val: AMDGPU::Hwreg::DST_VGPR_MSB);

  /// MSB requirement for a single operand slot.
  struct OpMode {
    // No MSBs set means they are not required to be of a particular value.
    std::optional<unsigned> MSBits;

    /// Merge \p New into this mode. \returns true if the effective value
    /// changed; sets \p Rewritten if a previously pinned value was replaced.
    bool update(const OpMode &New, bool &Rewritten) {
      bool Updated = false;
      if (New.MSBits) {
        if (*New.MSBits != MSBits.value_or(u: 0)) {
          Updated = true;
          Rewritten |= MSBits.has_value();
        }
        MSBits = New.MSBits;
      }
      return Updated;
    }
  };

  /// Full mode state: one OpMode per controlled operand slot.
  struct ModeTy {
    OpMode Ops[OpNum];

    /// Merge \p New slot-by-slot. \returns true if any slot changed;
    /// \p Rewritten is set if any pinned slot value was replaced.
    bool update(const ModeTy &New, bool &Rewritten) {
      bool Updated = false;
      for (unsigned I : seq(Size: OpNum))
        Updated |= Ops[I].update(New: New.Ops[I], Rewritten);
      return Updated;
    }

    /// Pack the mode into the S_SET_VGPR_MSB immediate format.
    unsigned encode() const {
      // Layout: [src0 msb, src1 msb, src2 msb, dst msb].
      unsigned V = 0;
      for (const auto &[I, Op] : enumerate(First: Ops))
        V |= Op.MSBits.value_or(u: 0) << (I * 2);
      return V;
    }

    // Check if this mode is compatible with required \p NewMode without
    // modification.
    bool isCompatible(const ModeTy NewMode) const {
      for (unsigned I : seq(Size: OpNum)) {
        // Slots with no requirement are always compatible.
        if (!NewMode.Ops[I].MSBits.has_value())
          continue;
        if (Ops[I].MSBits.value_or(u: 0) != NewMode.Ops[I].MSBits.value_or(u: 0))
          return false;
      }
      return true;
    }
  };

public:
  bool run(MachineFunction &MF);

private:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;

  // Current basic block.
  MachineBasicBlock *MBB;

  /// Most recent s_set_* instruction. Used to piggyback further mode changes
  /// onto an already emitted instruction; null when no such instruction can
  /// be safely updated.
  MachineInstr *MostRecentModeSet;

  /// Current mode bits.
  ModeTy CurrentMode;

  /// Number of current hard clause instructions.
  unsigned ClauseLen;

  /// Number of hard clause instructions remaining.
  unsigned ClauseRemaining;

  /// Clause group breaks.
  unsigned ClauseBreaks;

  /// Last hard clause instruction.
  MachineInstr *Clause;

  /// Insert mode change before \p I. \returns true if mode was changed.
  bool setMode(ModeTy NewMode, MachineBasicBlock::instr_iterator I);

  /// Reset mode to default (all four MSB fields zero).
  void resetMode(MachineBasicBlock::instr_iterator I) {
    ModeTy Mode;
    for (OpMode &Op : Mode.Ops)
      Op.MSBits = 0;
    setMode(NewMode: Mode, I);
  }

  /// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt.
  std::optional<unsigned> getMSBs(const MachineOperand &MO) const;

  /// Handle single \p MI. \return true if changed.
  bool runOnMachineInstr(MachineInstr &MI);

  /// Compute the mode for a single \p MI given \p Ops operands
  /// bit mapping. Optionally takes second array \p Ops2 for VOPD.
  /// If provided and an operand from \p Ops is not a VGPR, then \p Ops2
  /// is checked.
  void computeMode(ModeTy &NewMode, const MachineInstr &MI,
                   const AMDGPU::OpName Ops[OpNum],
                   const AMDGPU::OpName *Ops2 = nullptr);

  /// Check if an instruction \p I is within a clause and returns a suitable
  /// iterator to insert mode change. It may also modify the S_CLAUSE
  /// instruction to extend it or drop the clause if it cannot be adjusted.
  MachineBasicBlock::instr_iterator
  handleClause(MachineBasicBlock::instr_iterator I);

  /// Check if an instruction \p I is immediately after another program state
  /// instruction which it cannot coissue with. If so, insert before that
  /// instruction to encourage more coissuing.
  MachineBasicBlock::instr_iterator
  handleCoissue(MachineBasicBlock::instr_iterator I);

  /// Handle S_SETREG_IMM32_B32 targeting MODE register. On certain hardware,
  /// this instruction clobbers VGPR MSB bits[12:19], so we need to restore
  /// the current mode. \returns true if the instruction was modified or a
  /// new one was inserted.
  bool handleSetregMode(MachineInstr &MI);

  /// Update bits[12:19] of the imm operand in S_SETREG_IMM32_B32 to contain
  /// the VGPR MSB mode value. \returns true if the immediate was changed.
  bool updateSetregModeImm(MachineInstr &MI, int64_t ModeValue);
};
189
/// Switch the tracked mode to \p NewMode, inserting or updating a mode-set
/// instruction before \p I as needed. \returns true if the mode was changed.
bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode,
                                      MachineBasicBlock::instr_iterator I) {
  // Record previous mode into high 8 bits of the immediate.
  int64_t OldModeBits = CurrentMode.encode() << ModeWidth;

  bool Rewritten = false;
  if (!CurrentMode.update(New: NewMode, Rewritten))
    return false;

  // If no pinned field had to be rewritten we can piggyback on the last
  // mode-setting instruction instead of emitting a new one.
  if (MostRecentModeSet && !Rewritten) {
    // Update MostRecentModeSet with the new mode. It can be either
    // S_SET_VGPR_MSB or S_SETREG_IMM32_B32 (with Size <= 12).
    if (MostRecentModeSet->getOpcode() == AMDGPU::S_SET_VGPR_MSB) {
      MachineOperand &Op = MostRecentModeSet->getOperand(i: 0);
      // Carry old mode bits from the existing instruction.
      // Note: intentionally shadows the outer OldModeBits.
      int64_t OldModeBits = Op.getImm() & (ModeMask << ModeWidth);
      Op.setImm(CurrentMode.encode() | OldModeBits);
    } else {
      assert(MostRecentModeSet->getOpcode() == AMDGPU::S_SETREG_IMM32_B32 &&
             "unexpected MostRecentModeSet opcode");
      updateSetregModeImm(MI&: *MostRecentModeSet, ModeValue: CurrentMode.encode());
    }

    return true;
  }

  // Otherwise emit a fresh S_SET_VGPR_MSB, hoisted out of clauses and past
  // program-state instructions to improve coissue.
  I = handleClause(I);
  I = handleCoissue(I);
  MostRecentModeSet = BuildMI(BB&: *MBB, I, MIMD: {}, MCID: TII->get(Opcode: AMDGPU::S_SET_VGPR_MSB))
                          .addImm(Val: NewMode.encode() | OldModeBits);

  CurrentMode = NewMode;
  return true;
}
224
225std::optional<unsigned>
226AMDGPULowerVGPREncoding::getMSBs(const MachineOperand &MO) const {
227 if (!MO.isReg())
228 return std::nullopt;
229
230 MCRegister Reg = MO.getReg();
231 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
232 if (!RC || !TRI->isVGPRClass(RC))
233 return std::nullopt;
234
235 unsigned Idx = TRI->getHWRegIndex(Reg);
236 return Idx >> 8;
237}
238
/// Compute the mode required by \p MI: for each of the four controlled
/// operand slots, record the VGPR MSBs that operand needs (or leave the slot
/// unconstrained).
void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode,
                                          const MachineInstr &MI,
                                          const AMDGPU::OpName Ops[OpNum],
                                          const AMDGPU::OpName *Ops2) {
  NewMode = {};

  for (unsigned I = 0; I < OpNum; ++I) {
    const MachineOperand *Op = TII->getNamedOperand(MI, OperandName: Ops[I]);

    std::optional<unsigned> MSBits;
    if (Op)
      MSBits = getMSBs(MO: *Op);

#if !defined(NDEBUG)
    // In a VOPD pair both halves share the MSB fields, so the corresponding
    // operands of the two halves must agree on their MSBs.
    if (MSBits.has_value() && Ops2) {
      const MachineOperand *Op2 = TII->getNamedOperand(MI, Ops2[I]);
      if (Op2) {
        std::optional<unsigned> MSBits2;
        MSBits2 = getMSBs(*Op2);
        if (MSBits2.has_value() && MSBits != MSBits2)
          llvm_unreachable("Invalid VOPD pair was created");
      }
    }
#endif

    // For VOPD, fall back to the second half's operand when the first half's
    // slot is not a VGPR.
    if (!MSBits.has_value() && Ops2) {
      Op = TII->getNamedOperand(MI, OperandName: Ops2[I]);
      if (Op)
        MSBits = getMSBs(MO: *Op);
    }

    if (!MSBits.has_value())
      continue;

    // Skip tied uses of src2 of VOP2, these will be handled along with defs and
    // only vdst bit affects these operands. We cannot skip tied uses of VOP3,
    // these uses are real even if must match the vdst.
    if (Ops[I] == AMDGPU::OpName::src2 && !Op->isDef() && Op->isTied() &&
        (SIInstrInfo::isVOP2(MI) ||
         (SIInstrInfo::isVOP3(MI) &&
          TII->hasVALU32BitEncoding(Opcode: MI.getOpcode()))))
      continue;

    NewMode.Ops[I].MSBits = MSBits.value();
  }
}
285
/// Process one instruction: compute the mode it needs and switch the mode if
/// required, trying to commute the instruction first to avoid a switch.
/// \returns true if anything was changed.
bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
  // First table maps named operands to MSB fields; null if this instruction
  // is not subject to VGPR MSB lowering. Second table is the VOPD fallback.
  auto Ops = AMDGPU::getVGPRLoweringOperandTables(Desc: MI.getDesc());
  if (Ops.first) {
    ModeTy NewMode;
    computeMode(NewMode, MI, Ops: Ops.first, Ops2: Ops.second);
    // A commuted form may fit the current mode and save a mode switch.
    if (!CurrentMode.isCompatible(NewMode) && MI.isCommutable() &&
        TII->commuteInstruction(MI)) {
      ModeTy NewModeCommuted;
      computeMode(NewMode&: NewModeCommuted, MI, Ops: Ops.first, Ops2: Ops.second);
      if (CurrentMode.isCompatible(NewMode: NewModeCommuted))
        return false;
      // Commute back.
      if (!TII->commuteInstruction(MI))
        llvm_unreachable("Failed to restore commuted instruction.");
    }
    return setMode(NewMode, I: MI.getIterator());
  }
  // Instructions without a lowering table must not use VGPRs unless they are
  // meta or pseudo instructions.
  assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo());

  return false;
}
307
/// Adjust the insertion point \p I when it falls inside a hard clause, since
/// a mode change cannot start a clause (or a clause group). May extend or
/// drop the S_CLAUSE instruction.
MachineBasicBlock::instr_iterator
AMDGPULowerVGPREncoding::handleClause(MachineBasicBlock::instr_iterator I) {
  // Not inside a clause: insert as-is.
  if (!ClauseRemaining)
    return I;

  // A clause cannot start with a special instruction, place it right before
  // the clause.
  if (ClauseRemaining == ClauseLen) {
    I = Clause->getPrevNode()->getIterator();
    assert(I->isBundle());
    return I;
  }

  // If a clause defines breaks each group cannot start with a mode change.
  // just drop the clause.
  if (ClauseBreaks) {
    Clause->eraseFromBundle();
    ClauseRemaining = 0;
    return I;
  }

  // Otherwise adjust a number of instructions in the clause if it fits.
  // If it does not clause will just become shorter. Since the length
  // recorded in the clause is one less, increment the length after the
  // update. Note that SIMM16[5:0] must be 1-62, not 0 or 63.
  if (ClauseLen < 63)
    Clause->getOperand(i: 0).setImm(ClauseLen | (ClauseBreaks << 8));

  ++ClauseLen;

  return I;
}
340
341MachineBasicBlock::instr_iterator
342AMDGPULowerVGPREncoding::handleCoissue(MachineBasicBlock::instr_iterator I) {
343 if (I.isEnd())
344 return I;
345
346 // "Program State instructions" are instructions which are used to control
347 // operation of the GPU rather than performing arithmetic. Such instructions
348 // have different coissuing rules w.r.t s_set_vgpr_msb.
349 auto isProgramStateInstr = [this](MachineInstr *MI) {
350 unsigned Opc = MI->getOpcode();
351 return TII->isBarrier(Opcode: Opc) || TII->isWaitcnt(Opcode: Opc) ||
352 Opc == AMDGPU::S_DELAY_ALU;
353 };
354
355 while (!I.isEnd() && I != I->getParent()->begin()) {
356 auto Prev = std::prev(x: I);
357 if (!isProgramStateInstr(&*Prev))
358 return I;
359 I = Prev;
360 }
361
362 return I;
363}
364
/// Convert mode value from S_SET_VGPR_MSB format to MODE register format.
/// S_SET_VGPR_MSB uses: (src0[0-1], src1[2-3], src2[4-5], dst[6-7])
/// MODE register uses: (dst[0-1], src0[2-3], src1[4-5], src2[6-7])
/// This is a left rotation by 2 bits on an 8-bit value.
static int64_t convertModeToSetregFormat(int64_t Mode) {
  assert(Mode >= 0 && Mode <= 0xFF && "Mode expected to be 8-bit");
  const uint8_t Bits = static_cast<uint8_t>(Mode);
  // Rotate left by 2 within 8 bits: the two top bits wrap to the bottom.
  return static_cast<uint8_t>((Bits << 2) | (Bits >> 6));
}
373
374bool AMDGPULowerVGPREncoding::updateSetregModeImm(MachineInstr &MI,
375 int64_t ModeValue) {
376 assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32);
377
378 // Convert from S_SET_VGPR_MSB format to MODE register format
379 int64_t SetregMode = convertModeToSetregFormat(Mode: ModeValue);
380
381 MachineOperand *ImmOp = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::imm);
382 int64_t OldImm = ImmOp->getImm();
383 int64_t NewImm =
384 (OldImm & ~AMDGPU::Hwreg::VGPR_MSB_MASK) | (SetregMode << VGPRMSBShift);
385 ImmOp->setImm(NewImm);
386 return NewImm != OldImm;
387}
388
/// Work around hardware where S_SETREG_IMM32_B32 writing the MODE register
/// clobbers the VGPR MSB bits[12:19]: either fold the current mode into the
/// immediate, or emit a restoring s_set_vgpr_msb after the instruction.
/// \returns true if the instruction was modified or a new one was inserted.
bool AMDGPULowerVGPREncoding::handleSetregMode(MachineInstr &MI) {
  using namespace AMDGPU::Hwreg;

  assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 &&
         "only S_SETREG_IMM32_B32 needs to be handled");

  MachineOperand *SIMM16Op = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::simm16);
  assert(SIMM16Op && "SIMM16Op must be present");

  // Only writes to the MODE register can clobber the VGPR MSB field.
  auto [HwRegId, Offset, Size] = HwregEncoding::decode(Encoded: SIMM16Op->getImm());
  (void)Offset;
  if (HwRegId != ID_MODE)
    return false;

  int64_t ModeValue = CurrentMode.encode();

  // Case 1: Size <= 12 - the original instruction uses imm32[0:Size-1], so
  // imm32[12:19] is unused. Safe to set imm32[12:19] to the correct VGPR
  // MSBs.
  if (Size <= VGPRMSBShift) {
    // This instruction now acts as MostRecentModeSet so it can be updated if
    // CurrentMode changes via piggybacking.
    MostRecentModeSet = &MI;
    return updateSetregModeImm(MI, ModeValue);
  }

  // Case 2: Size > 12 - the original instruction uses bits beyond 11, so we
  // cannot arbitrarily modify imm32[12:19]. Check if it already matches VGPR
  // MSBs. Note: imm32[12:19] is in MODE register format, while ModeValue is
  // in S_SET_VGPR_MSB format, so we need to convert before comparing.
  MachineOperand *ImmOp = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::imm);
  assert(ImmOp && "ImmOp must be present");
  int64_t ImmBits12To19 = (ImmOp->getImm() & VGPR_MSB_MASK) >> VGPRMSBShift;
  int64_t SetregModeValue = convertModeToSetregFormat(Mode: ModeValue);
  if (ImmBits12To19 == SetregModeValue) {
    // Already correct, but we must invalidate MostRecentModeSet because this
    // instruction will overwrite mode[12:19]. We can't update this instruction
    // via piggybacking (bits[12:19] are meaningful), so if CurrentMode changes,
    // a new s_set_vgpr_msb will be inserted after this instruction.
    MostRecentModeSet = nullptr;
    return false;
  }

  // imm32[12:19] doesn't match VGPR MSBs - insert s_set_vgpr_msb after
  // the original instruction to restore the correct value.
  MachineBasicBlock::iterator InsertPt = std::next(x: MI.getIterator());
  MostRecentModeSet = BuildMI(BB&: *MBB, I: InsertPt, MIMD: MI.getDebugLoc(),
                              MCID: TII->get(Opcode: AMDGPU::S_SET_VGPR_MSB))
                          .addImm(Val: ModeValue);
  return true;
}
440
/// Pass entry point: walk every instruction, tracking the VGPR MSB mode and
/// inserting/updating mode switches. \returns true if the function changed.
bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  // Only subtargets with 1024 addressable VGPRs need the lowering.
  if (!ST.has1024AddressableVGPRs())
    return false;

  TII = ST.getInstrInfo();
  TRI = ST.getRegisterInfo();

  bool Changed = false;
  ClauseLen = ClauseRemaining = 0;
  // ABI: all 4 MSBs are zero on function entry.
  CurrentMode = {};
  for (auto &MBB : MF) {
    // Piggybacking never crosses a block boundary.
    MostRecentModeSet = nullptr;
    this->MBB = &MBB;

    for (auto &MI : llvm::make_early_inc_range(Range: MBB.instrs())) {
      if (MI.isMetaInstruction())
        continue;

      // Blocks reached by control flow must start in the default mode.
      if (MI.isTerminator() || MI.isCall()) {
        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED)
          CurrentMode = {}; // Program ends; no reset instruction needed.
        else
          resetMode(I: MI.getIterator());
        continue;
      }

      // Inline asm may reference VGPRs by number; make sure it sees the
      // default mode.
      if (MI.isInlineAsm()) {
        if (TII->hasVGPRUses(MI))
          resetMode(I: MI.getIterator());
        continue;
      }

      // Record clause state so handleClause() can adjust insertion points.
      if (MI.getOpcode() == AMDGPU::S_CLAUSE) {
        assert(!ClauseRemaining && "Nested clauses are not supported");
        ClauseLen = MI.getOperand(i: 0).getImm();
        ClauseBreaks = (ClauseLen >> 8) & 15;
        // SIMM16[5:0] records length minus one.
        ClauseLen = ClauseRemaining = (ClauseLen & 63) + 1;
        Clause = &MI;
        continue;
      }

      // Hardware with the setreg fixup needs MODE writes patched to preserve
      // the VGPR MSB field.
      if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 &&
          ST.hasSetregVGPRMSBFixup()) {
        Changed |= handleSetregMode(MI);
        continue;
      }

      Changed |= runOnMachineInstr(MI);

      if (ClauseRemaining)
        --ClauseRemaining;
    }

    // Reset the mode if we are falling through.
    resetMode(I: MBB.instr_end());
  }

  return Changed;
}
502
503class AMDGPULowerVGPREncodingLegacy : public MachineFunctionPass {
504public:
505 static char ID;
506
507 AMDGPULowerVGPREncodingLegacy() : MachineFunctionPass(ID) {}
508
509 bool runOnMachineFunction(MachineFunction &MF) override {
510 return AMDGPULowerVGPREncoding().run(MF);
511 }
512
513 void getAnalysisUsage(AnalysisUsage &AU) const override {
514 AU.setPreservesCFG();
515 MachineFunctionPass::getAnalysisUsage(AU);
516 }
517};
518
519} // namespace
520
// Pass identification for the legacy pass manager.
char AMDGPULowerVGPREncodingLegacy::ID = 0;

// Exported handle so the target can reference this pass by ID.
char &llvm::AMDGPULowerVGPREncodingLegacyID = AMDGPULowerVGPREncodingLegacy::ID;

INITIALIZE_PASS(AMDGPULowerVGPREncodingLegacy, DEBUG_TYPE,
                "AMDGPU Lower VGPR Encoding", false, false)
527
528PreservedAnalyses
529AMDGPULowerVGPREncodingPass::run(MachineFunction &MF,
530 MachineFunctionAnalysisManager &MFAM) {
531 if (!AMDGPULowerVGPREncoding().run(MF))
532 return PreservedAnalyses::all();
533
534 return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>();
535}
536