//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs the peephole optimizations before code emission.
///
/// Additionally, this pass also unpacks packed instructions (V_PK_MUL_F32/F16,
/// V_PK_ADD_F32/F16, V_PK_FMA_F32) adjacent to MFMAs such that they can be
/// co-issued. This helps with overlapping MFMA and certain vector instructions
/// in machine schedules and is expected to improve performance. Only packed
/// instructions that are overlapped by the MFMA latency are unpacked; the rest
/// remain untouched.
/// TODO: Add support for F16 packed instructions.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/BranchProbability.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

namespace {

class SIPreEmitPeephole {
private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  bool optimizeVccBranch(MachineInstr &MI) const;
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
                            MachineBasicBlock *&TrueMBB,
                            MachineBasicBlock *&FalseMBB,
                            SmallVectorImpl<MachineOperand> &Cond);
  bool mustRetainExeczBranch(const MachineInstr &Branch,
                             const MachineBasicBlock &From,
                             const MachineBasicBlock &To) const;
  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
  // Creates a list of packed instructions following an MFMA that are suitable
  // for unpacking.
  void collectUnpackingCandidates(MachineInstr &BeginMI,
                                  SetVector<MachineInstr *> &InstrsToUnpack,
                                  uint16_t NumMFMACycles);
  // v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3] op_sel:[1,1,1]
  //              op_sel_hi:[0,0,0]
  // ==>
  // v_fma_f32 v0, v1, v3, v3
  // v_fma_f32 v1, v0, v2, v2
  // Here, we have overwritten v0 before we use it. This function checks if
  // unpacking can lead to such a situation.
  bool canUnpackingClobberRegister(const MachineInstr &MI);
  // Unpack and insert F32 packed instructions such as V_PK_MUL, V_PK_ADD, and
  // V_PK_FMA. These are currently the only opcodes supported by this
  // transformation.
  void performF32Unpacking(MachineInstr &I);
  // Select the corresponding unpacked instruction.
  uint16_t mapToUnpackedOpcode(MachineInstr &I);
  // Creates the unpacked instruction to be inserted. Adds source modifiers to
  // the unpacked instructions based on the source modifiers in the packed
  // instruction.
  MachineInstrBuilder createUnpackedMI(MachineInstr &I, uint16_t UnpackedOpcode,
                                       bool IsHiBits);
  // Process operands/source modifiers from packed instructions and insert the
  // appropriate source modifiers and operands into the unpacked instructions.
  void addOperandAndMods(MachineInstrBuilder &NewMI, unsigned SrcMods,
                         bool IsHiBits, const MachineOperand &SrcMO);

public:
  bool run(MachineFunction &MF);
};

class SIPreEmitPeepholeLegacy : public MachineFunctionPass {
public:
  static char ID;

  SIPreEmitPeepholeLegacy() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholeLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    return SIPreEmitPeephole().run(MF);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPreEmitPeepholeLegacy, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeepholeLegacy::ID = 0;

char &llvm::SIPreEmitPeepholeID = SIPreEmitPeepholeLegacy::ID;

bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens while combining a block which assigns -1 or 0 to a saved mask
  // and another block which consumes that saved mask and then a branch.
  //
  // While searching this also performs the following substitution:
  // vcc = V_CMP
  // vcc = S_AND exec, vcc
  // S_CBRANCH_VCC[N]Z
  // =>
  // vcc = V_CMP
  // S_CBRANCH_VCC[N]Z

  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
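
  // Walk backwards from the branch, looking within a few instructions for the
  // S_AND/S_ANDN2 that defines the condition register. Give up if EXEC is
  // modified first or if the condition register is written by anything else.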
  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if ((!Op1.isReg() || Op1.getReg() != ExecReg) && Op2.isReg() &&
      Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (!Op1.isReg() || Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

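  // Determine the mask value fed to the AND. If Op2 is a register, scan
  // further back for its definition: either a VALU compare writing VCC
  // directly (making the AND redundant) or a move of a 0/-1 immediate.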
  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    bool ModifiesExec = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
      ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
    }
    if (M == E)
      return Changed;
    // If SReg is VCC and its definition is a VALU comparison, the S_AND with
    // EXEC is not required: erase the S_AND and return.
    // Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS.
    if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
        TII->isVOPC(*M)) {
      A->eraseFromParent();
      return true;
    }
    if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // If SReg is only used by the AND instruction, fold the immediate into the
    // AND and erase the move.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert mask for s_andn2
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      // Replace AND with MOV
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addReg(ExecReg);
      }
    }
    // Remove AND instruction
    A->eraseFromParent();
  }

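  // With the mask value known, fold the conditional branch: a branch on EXEC
  // itself, a branch that is always taken, a branch that is never taken, or a
  // branch that now depends only on EXEC.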
  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch
    // Remove all successors shadowed by new unconditional branch
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (auto *BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to unconditional branch
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, TRI, false /*Kill*/));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}

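// Remove a redundant S_SET_GPR_IDX_ON when an identical one is already in
// effect and nothing in between changes the indexing state, e.g.
// (schematically):
//   s_set_gpr_idx_on  s2, gpr_idx(SRC0)
//   v_mov_b32         v1, v9              ; indirect read
//   s_set_gpr_idx_off                     ; removed
//   s_set_gpr_idx_on  s2, gpr_idx(SRC0)   ; removed, identical to the first
//   v_mov_b32         v2, v9              ; indirect read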
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  if (!MI.isIdenticalTo(First))
    return false;

  // MI is identical to the earlier S_SET_GPR_IDX_ON. Scan the instructions in
  // between to make sure removing MI (and any intervening S_SET_GPR_IDX_OFF)
  // is safe.
  for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
                                         E = MI.getIterator();
       I != E; ++I) {
    if (I->isBundle() || I->isDebugInstr())
      continue;
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(), [&MRI, this](const MachineOperand &MO) {
            return MO.isReg() && TRI->isVectorRegister(MRI, MO.getReg());
          })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
                        I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
          return false;
      }
    }
  }

  MI.eraseFromBundle();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromBundle();
  return true;
}

bool SIPreEmitPeephole::getBlockDestinations(
    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  if (!FalseMBB)
    FalseMBB = SrcMBB.getNextNode();

  return true;
}

namespace {
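// Estimates whether unconditionally executing the "then" region is cheaper
// than keeping the s_cbranch_execz, weighting the branch-taken and
// branch-not-taken costs by the recorded branch probability.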
class BranchWeightCostModel {
  const SIInstrInfo &TII;
  const TargetSchedModel &SchedModel;
  BranchProbability BranchProb;
  static constexpr uint64_t BranchNotTakenCost = 1;
  uint64_t BranchTakenCost;
  uint64_t ThenCyclesCost = 0;

public:
  BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
                        const MachineBasicBlock &Succ)
      : TII(TII), SchedModel(TII.getSchedModel()) {
    const MachineBasicBlock &Head = *Branch.getParent();
    const auto *FromIt = find(Head.successors(), &Succ);
    assert(FromIt != Head.succ_end());

    BranchProb = Head.getSuccProbability(FromIt);
    if (BranchProb.isUnknown())
      BranchProb = BranchProbability::getZero();
    BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
  }

  bool isProfitable(const MachineInstr &MI) {
    if (TII.isWaitcnt(MI.getOpcode()))
      return false;

    ThenCyclesCost += SchedModel.computeInstrLatency(&MI);

    // Consider `P = N/D` to be the probability of execz being false, i.e. of
    // the then-block actually being executed. The transformation is profitable
    // if always executing the 'then' block is cheaper than sometimes executing
    // it and always executing the s_cbranch_execz:
    // * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
    //   BranchNotTakenCost
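    // For example, with BranchProb = 3/4 (N = 3, D = 4), BranchTakenCost = 4,
    // and BranchNotTakenCost = 1, removing the branch stays profitable while
    // ThenCyclesCost <= (1 * 4 + 3 * 1) / (4 - 3) = 7 cycles.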
    uint64_t Numerator = BranchProb.getNumerator();
    uint64_t Denominator = BranchProb.getDenominator();
    return (Denominator - Numerator) * ThenCyclesCost <=
           ((Denominator - Numerator) * BranchTakenCost +
            Numerator * BranchNotTakenCost);
  }
};

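// Walk every block between the execz branch and its target. The branch must be
// kept if any instruction in that region has unwanted effects when EXEC is
// zero, if the region contains other branches, or if executing the region
// unconditionally is not profitable according to the cost model.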
bool SIPreEmitPeephole::mustRetainExeczBranch(
    const MachineInstr &Branch, const MachineBasicBlock &From,
    const MachineBasicBlock &To) const {
  assert(is_contained(Branch.getParent()->successors(), &From));
  BranchWeightCostModel CostModel{*TII, Branch, From};

  const MachineFunction *MF = From.getParent();
  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (const MachineInstr &MI : MBB) {
      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might never be taken when EXEC = 0.
      // Hence we should retain cbranch out of the loop lest it become
      // infinite.
      if (MI.isConditionalBranch())
        return true;

      if (MI.isUnconditionalBranch() &&
          TII->getBranchDestBlock(MI) != MBB.getNextNode())
        return true;

      if (MI.isMetaInstruction())
        continue;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
        return true;

      if (!CostModel.isProfitable(MI))
        return true;
    }
  }

  return false;
}
} // namespace

// Returns true if the skip branch instruction is removed.
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
                                          MachineBasicBlock &SrcMBB) {
  if (!TII->getSchedModel().hasInstrSchedModel())
    return false;

  MachineBasicBlock *TrueMBB = nullptr;
  MachineBasicBlock *FalseMBB = nullptr;
  SmallVector<MachineOperand, 1> Cond;

  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  // Consider only the forward branches.
  if (SrcMBB.getNumber() >= TrueMBB->getNumber())
    return false;

  // Consider only when it is legal and profitable.
  if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
    return false;

  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
  MI.eraseFromParent();
  SrcMBB.removeSuccessor(TrueMBB);

  return true;
}

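// Returns true if unpacking MI would cause the instruction that writes the low
// half of the destination to clobber a register that the high-half instruction
// still needs to read.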
bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
  unsigned OpCode = MI.getOpcode();
  Register DstReg = MI.getOperand(0).getReg();
  // Only the first register in the register pair needs to be checked due to
  // the unpacking order. Packed instructions are unpacked such that the lower
  // 32 bits (i.e., the first register in the pair) are written first. This can
  // introduce dependencies if the first register is written in one instruction
  // and then read as part of the higher 32 bits in the subsequent instruction.
  // Such scenarios can arise due to specific combinations of op_sel and
  // op_sel_hi modifiers.
  Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);

  const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  if (Src0MO && Src0MO->isReg()) {
    Register SrcReg0 = Src0MO->getReg();
    unsigned Src0Mods =
        TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
                             ? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
                             : TRI->getSubReg(SrcReg0, AMDGPU::sub0);
    // Check if the register selected by op_sel_hi is the same as the first
    // register in the destination register pair.
    if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
      return true;
  }

  const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1MO && Src1MO->isReg()) {
    Register SrcReg1 = Src1MO->getReg();
    unsigned Src1Mods =
        TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
    Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
                             ? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
                             : TRI->getSubReg(SrcReg1, AMDGPU::sub0);
    if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
      return true;
  }

  // Applicable for packed instructions with 3 source operands, such as
  // V_PK_FMA.
  if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
    const MachineOperand *Src2MO =
        TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    if (Src2MO && Src2MO->isReg()) {
      Register SrcReg2 = Src2MO->getReg();
      unsigned Src2Mods =
          TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
      Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
                               ? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
                               : TRI->getSubReg(SrcReg2, AMDGPU::sub0);
      if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
        return true;
    }
  }
  return false;
}

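// Map a packed F32 opcode to the VOP3 e64 opcode that computes one 32-bit
// lane, or std::numeric_limits<uint16_t>::max() if the opcode is not supported
// for unpacking.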
uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
  unsigned Opcode = I.getOpcode();
  // Use the 64-bit encoding so that VOP3 instructions can be used: VOP3 e64
  // instructions allow source modifiers, while e32 instructions do not.
  switch (Opcode) {
  case AMDGPU::V_PK_ADD_F32:
    return AMDGPU::V_ADD_F32_e64;
  case AMDGPU::V_PK_MUL_F32:
    return AMDGPU::V_MUL_F32_e64;
  case AMDGPU::V_PK_FMA_F32:
    return AMDGPU::V_FMA_F32_e64;
  default:
    return std::numeric_limits<uint16_t>::max();
  }
  llvm_unreachable("Fully covered switch");
}

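// For example, for a source operand v[4:5]: the instruction computing the low
// half reads v5 when OP_SEL_0 is set and v4 otherwise, while the instruction
// computing the high half reads v5 when OP_SEL_1 is set and v4 otherwise.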
void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
                                          unsigned SrcMods, bool IsHiBits,
                                          const MachineOperand &SrcMO) {
  unsigned NewSrcMods = 0;
  unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
  unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
  // Packed instructions (VOP3P) do not support ABS. Hence, no checks are done
  // for ABS modifiers.
  // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit
  // lane.
  // NEG_HI shares the same bit position with ABS. But packed instructions do
  // not support ABS. Therefore, NEG_HI must be translated to NEG source
  // modifier for the higher 32 bits. Unpacked VOP3 instructions support
  // ABS, but do not support NEG_HI. Therefore we need to explicitly add the
  // NEG modifier if present in the packed instruction.
  if (SrcMods & NegModifier)
    NewSrcMods |= SISrcMods::NEG;
  // Src modifiers. Only negative modifiers are added if needed. Unpacked
  // operations do not have op_sel, therefore it must be handled explicitly as
  // done below.
  NewMI.addImm(NewSrcMods);
  if (SrcMO.isImm()) {
    NewMI.addImm(SrcMO.getImm());
    return;
  }
  // If op_sel == 0, select register 0 of reg:sub0_sub1.
  Register UnpackedSrcReg = (SrcMods & OpSelModifier)
                                ? TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
                                : TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);

  MachineOperand UnpackedSrcMO =
      MachineOperand::CreateReg(UnpackedSrcReg, /*isDef=*/false);
  if (SrcMO.isKill()) {
    // For each unpacked instruction, mark its source registers as killed if
    // the corresponding source register in the original packed instruction
    // was marked as killed.
    //
    // Exception:
    // If the op_sel and op_sel_hi modifiers require both unpacked instructions
    // to use the same register (e.g., due to overlapping access to low/high
    // bits of the same packed register), then only the *second* (latter)
    // instruction should mark the register as killed. This is because the
    // second instruction handles the higher bits and is effectively the last
    // user of the full register pair.

    bool OpSel = SrcMods & SISrcMods::OP_SEL_0;
    bool OpSelHi = SrcMods & SISrcMods::OP_SEL_1;
    bool KillState = true;
    if ((OpSel == OpSelHi) && !IsHiBits)
      KillState = false;
    UnpackedSrcMO.setIsKill(KillState);
  }
  NewMI.add(UnpackedSrcMO);
}

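// Scan forward from the MFMA at BeginMI, accumulating issue cycles, and record
// every packed F32 instruction that still falls within the MFMA's latency
// window and can be unpacked without clobbering registers.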
void SIPreEmitPeephole::collectUnpackingCandidates(
    MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
    uint16_t NumMFMACycles) {
  auto *BB = BeginMI.getParent();
  auto E = BB->end();
  int TotalCyclesBetweenCandidates = 0;
  auto SchedModel = TII->getSchedModel();
  Register MFMADef = BeginMI.getOperand(0).getReg();

  for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
    MachineInstr &Instr = *I;
    uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
    bool IsUnpackable =
        !(UnpackedOpCode == std::numeric_limits<uint16_t>::max());
    if (Instr.isMetaInstruction())
      continue;
    if ((Instr.isTerminator()) ||
        (TII->isNeverCoissue(Instr) && !IsUnpackable) ||
        (SIInstrInfo::modifiesModeRegister(Instr) &&
         Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
      return;

    const MCSchedClassDesc *InstrSchedClassDesc =
        SchedModel.resolveSchedClass(&Instr);
    uint16_t Latency =
        SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
    TotalCyclesBetweenCandidates += Latency;

    if (TotalCyclesBetweenCandidates >= NumMFMACycles - 1)
      return;
    // Identify register dependencies between those used by the MFMA
    // instruction and the following packed instructions. Also checks for
    // transitive dependencies between the MFMA def and candidate instruction
    // def and uses. Conservatively ensures that we do not incorrectly
    // read/write registers.
    for (const MachineOperand &InstrMO : Instr.operands()) {
      if (!InstrMO.isReg() || !InstrMO.getReg().isValid())
        continue;
      if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
        return;
    }
    if (!IsUnpackable)
      continue;

    if (canUnpackingClobberRegister(Instr))
      return;
    // If it's a packed instruction, adjust latency: remove the packed
    // latency, add latency of two unpacked instructions (currently estimated
    // as 2 cycles).
    TotalCyclesBetweenCandidates -= Latency;
    // TODO: improve latency handling based on instruction modeling.
    TotalCyclesBetweenCandidates += 2;
    // Subtract 1 to account for MFMA issue latency.
    if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
      InstrsToUnpack.insert(&Instr);
  }
}

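// Replace a packed F32 instruction with two unpacked VOP3 instructions, one
// writing the low 32 bits of the destination and one writing the high 32 bits,
// then erase the original. For example (schematically, with default
// op_sel/op_sel_hi):
//   v_pk_mul_f32 v[0:1], v[2:3], v[4:5]
//   ==>
//   v_mul_f32_e64 v0, v2, v4
//   v_mul_f32_e64 v1, v3, v5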
void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
  const MachineOperand &DstOp = I.getOperand(0);

  uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
  assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() &&
         "Unsupported Opcode");

  MachineInstrBuilder Op0LOp1L =
      createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false);
  MachineOperand &LoDstOp = Op0LOp1L->getOperand(0);

  LoDstOp.setIsUndef(DstOp.isUndef());

  MachineInstrBuilder Op0HOp1H =
      createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/true);
  MachineOperand &HiDstOp = Op0HOp1H->getOperand(0);

  uint32_t IFlags = I.getFlags();
  Op0LOp1L->setFlags(IFlags);
  Op0HOp1H->setFlags(IFlags);
  LoDstOp.setIsRenamable(DstOp.isRenamable());
  HiDstOp.setIsRenamable(DstOp.isRenamable());

  I.eraseFromParent();
}

MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
                                                        uint16_t UnpackedOpcode,
                                                        bool IsHiBits) {
  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
  const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
  Register DstReg = I.getOperand(0).getReg();
  unsigned OpCode = I.getOpcode();
  Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
                                     : TRI->getSubReg(DstReg, AMDGPU::sub0);

  int64_t ClampVal = TII->getNamedOperand(I, AMDGPU::OpName::clamp)->getImm();
  unsigned Src0Mods =
      TII->getNamedOperand(I, AMDGPU::OpName::src0_modifiers)->getImm();
  unsigned Src1Mods =
      TII->getNamedOperand(I, AMDGPU::OpName::src1_modifiers)->getImm();

  MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
  NewMI.addDef(UnpackedDstReg); // vdst
  addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0);
  addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1);

  if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
    const MachineOperand *SrcMO2 =
        TII->getNamedOperand(I, AMDGPU::OpName::src2);
    unsigned Src2Mods =
        TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
    addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2);
  }
  NewMI.addImm(ClampVal); // clamp
  // Packed instructions do not support output modifiers, so it is safe to use
  // 0 here.
  NewMI.addImm(0); // omod
  return NewMI;
}

PreservedAnalyses
llvm::SIPreEmitPeepholePass::run(MachineFunction &MF,
                                 MachineFunctionAnalysisManager &MFAM) {
  auto *MDT = MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF);
  auto *MPDT = MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF);

  if (!SIPreEmitPeephole().run(MF))
    return PreservedAnalyses::all();

  // The pass renumbers basic blocks, so refresh the block numbers of any
  // cached dominator trees.
  if (MDT)
    MDT->updateBlockNumbers();
  if (MPDT)
    MPDT->updateBlockNumbers();
  return getMachineFunctionPassPreservedAnalyses();
}

bool SIPreEmitPeephole::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  bool Changed = false;

  MF.RenumberBlocks();

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
    // Check first terminator for branches to optimize
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        break;
      case AMDGPU::S_CBRANCH_EXECZ:
        Changed |= removeExeczBranch(MI, MBB);
        break;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
    // second is not needed. Do expensive checks in the optimizeSetGPR()
    // and limit the distance to 20 instructions for compile time purposes.
    // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
    // may be bundled with the instructions they modify.
    for (auto &MI : make_early_inc_range(MBB.instrs())) {
      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  // TODO: Fold this into previous block, if possible. Evaluate and handle any
  // side effects.

  // Perform the extra MF scans only for supported archs
  if (!ST.hasGFX940Insts())
    return Changed;
  for (MachineBasicBlock &MBB : MF) {
    // Unpack packed instructions overlapped by MFMAs. This allows the
    // compiler to co-issue unpacked instructions with MFMA.
    auto SchedModel = TII->getSchedModel();
    SetVector<MachineInstr *> InstrsToUnpack;
    for (auto &MI : make_early_inc_range(MBB.instrs())) {
      if (!SIInstrInfo::isMFMA(MI))
        continue;
      const MCSchedClassDesc *SchedClassDesc =
          SchedModel.resolveSchedClass(&MI);
      uint16_t NumMFMACycles =
          SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
      collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
    }
    for (MachineInstr *MI : InstrsToUnpack) {
      performF32Unpacking(*MI);
    }
  }

  return Changed;
}