//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs the peephole optimizations before code emission.
///
/// Additionally, this pass unpacks packed F32 instructions (V_PK_MUL_F32,
/// V_PK_ADD_F32, V_PK_FMA_F32) adjacent to MFMAs so that they can be
/// co-issued. This helps overlap MFMAs with certain vector instructions in
/// machine schedules and is expected to improve performance. Only packed
/// instructions that fall within the MFMA's latency window are unpacked; the
/// rest are left untouched.
/// TODO: Add support for F16 packed instructions.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/BranchProbability.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

namespace {

class SIPreEmitPeephole {
private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  bool optimizeVccBranch(MachineInstr &MI) const;
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
                            MachineBasicBlock *&TrueMBB,
                            MachineBasicBlock *&FalseMBB,
                            SmallVectorImpl<MachineOperand> &Cond);
  bool mustRetainExeczBranch(const MachineInstr &Branch,
                             const MachineBasicBlock &From,
                             const MachineBasicBlock &To) const;
  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
  // Creates a list of packed instructions following an MFMA that are suitable
  // for unpacking.
  void collectUnpackingCandidates(MachineInstr &BeginMI,
                                  SetVector<MachineInstr *> &InstrsToUnpack,
                                  uint16_t NumMFMACycles);
  // v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3] op_sel:[1,1,1]
  //              op_sel_hi:[0,0,0]
  // ==>
  // v_fma_f32 v0, v1, v3, v3
  // v_fma_f32 v1, v0, v2, v2
  // Here, we have overwritten v0 before we use it. This function checks if
  // unpacking can lead to such a situation.
  bool canUnpackingClobberRegister(const MachineInstr &MI);
  // Unpack and insert F32 packed instructions. Currently, only V_PK_MUL_F32,
  // V_PK_ADD_F32, and V_PK_FMA_F32 are supported for this transformation.
  void performF32Unpacking(MachineInstr &I);
  // Select the corresponding unpacked instruction opcode.
  uint16_t mapToUnpackedOpcode(MachineInstr &I);
  // Creates the unpacked instruction to be inserted. Adds source modifiers to
  // the unpacked instructions based on the source modifiers in the packed
  // instruction.
  MachineInstrBuilder createUnpackedMI(MachineInstr &I, uint16_t UnpackedOpcode,
                                       bool IsHiBits);
  // Process operands/source modifiers from packed instructions and insert the
  // appropriate source modifiers and operands into the unpacked instructions.
  void addOperandAndMods(MachineInstrBuilder &NewMI, unsigned SrcMods,
                         bool IsHiBits, const MachineOperand &SrcMO);

public:
  bool run(MachineFunction &MF);
};

class SIPreEmitPeepholeLegacy : public MachineFunctionPass {
public:
  static char ID;

  SIPreEmitPeepholeLegacy() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholeLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    return SIPreEmitPeephole().run(MF);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPreEmitPeepholeLegacy, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeepholeLegacy::ID = 0;

char &llvm::SIPreEmitPeepholeID = SIPreEmitPeepholeLegacy::ID;

bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens while combining a block which assigns -1 or 0 to a saved mask
  // and another block which consumes that saved mask and then a branch.
  //
  // While searching this also performs the following substitution:
  // vcc = V_CMP
  // vcc = S_AND exec, vcc
  // S_CBRANCH_VCC[N]Z
  // =>
  // vcc = V_CMP
  // S_CBRANCH_VCC[N]Z

  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if ((!Op1.isReg() || Op1.getReg() != ExecReg) && Op2.isReg() &&
      Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (!Op1.isReg() || Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    bool ModifiesExec = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
      ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
    }
    if (M == E)
      return Changed;
    // If SReg is VCC and its definition is a VALU comparison, the S_AND with
    // EXEC is not required: erase the S_AND and return.
    // Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS.
    if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
        TII->isVOPC(*M)) {
      A->eraseFromParent();
      return true;
    }
    if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // First, if SReg is only used in the AND instruction, fold the immediate
    // into the AND.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert mask for s_andn2
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      // Replace AND with MOV
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addReg(ExecReg);
      }
    }
    // Remove AND instruction
    A->eraseFromParent();
  }

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch
    // Remove all successors shadowed by new unconditional branch
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (auto *BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to unconditional branch
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, TRI, false /*Kill*/));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}

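// If MI is an S_SET_GPR_IDX_ON identical to an earlier one (First), and the
// GPR index mode is not disturbed in between, then MI (and any intervening
// S_SET_GPR_IDX_OFF) is redundant and can be removed.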
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  if (!MI.isIdenticalTo(First))
    return false;

  // Scan the range between the two identical S_SET_GPR_IDX_ON instructions to
  // make sure the first one still applies at MI.
  for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
                                         E = MI.getIterator();
       I != E; ++I) {
    if (I->isBundle() || I->isDebugInstr())
      continue;
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(), [&MRI, this](const MachineOperand &MO) {
            return MO.isReg() && TRI->isVectorRegister(MRI, MO.getReg());
          })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
                        I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
          return false;
      }
    }
  }

  MI.eraseFromBundle();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromBundle();
  return true;
}

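// Use analyzeBranch to find the true/false destinations of SrcMBB's
// terminator. Returns false if the branch cannot be analyzed. When there is
// no explicit false destination, it defaults to the fall-through block.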
bool SIPreEmitPeephole::getBlockDestinations(
    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  if (!FalseMBB)
    FalseMBB = SrcMBB.getNextNode();

  return true;
}

namespace {
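// Cost model for removing an s_cbranch_execz that skips a 'then' region:
// compare the expected cost of always executing the region against the
// expected cost of keeping the branch, using the branch probability recorded
// on the CFG edge.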
class BranchWeightCostModel {
  const SIInstrInfo &TII;
  const TargetSchedModel &SchedModel;
  BranchProbability BranchProb;
  static constexpr uint64_t BranchNotTakenCost = 1;
  uint64_t BranchTakenCost;
  uint64_t ThenCyclesCost = 0;

public:
  BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
                        const MachineBasicBlock &Succ)
      : TII(TII), SchedModel(TII.getSchedModel()) {
    const MachineBasicBlock &Head = *Branch.getParent();
    const auto *FromIt = find(Head.successors(), &Succ);
    assert(FromIt != Head.succ_end());

    BranchProb = Head.getSuccProbability(FromIt);
    if (BranchProb.isUnknown())
      BranchProb = BranchProbability::getZero();
    BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
  }

  bool isProfitable(const MachineInstr &MI) {
    if (TII.isWaitcnt(MI.getOpcode()))
      return false;

    ThenCyclesCost += SchedModel.computeInstrLatency(&MI);

    // Consider `P = N/D` to be the probability of execz being false (skipping
    // the then-block). The transformation is profitable if always executing
    // the 'then' block is cheaper than sometimes executing 'then' and always
    // executing s_cbranch_execz:
    // * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
    //   BranchNotTakenCost
    uint64_t Numerator = BranchProb.getNumerator();
    uint64_t Denominator = BranchProb.getDenominator();
    return (Denominator - Numerator) * ThenCyclesCost <=
           ((Denominator - Numerator) * BranchTakenCost +
            Numerator * BranchNotTakenCost);
  }
};

bool SIPreEmitPeephole::mustRetainExeczBranch(
    const MachineInstr &Branch, const MachineBasicBlock &From,
    const MachineBasicBlock &To) const {
  assert(is_contained(Branch.getParent()->successors(), &From));
  BranchWeightCostModel CostModel{*TII, Branch, From};

  const MachineFunction *MF = From.getParent();
  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (const MachineInstr &MI : MBB) {
      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might never be taken when EXEC = 0.
      // Hence we should retain the cbranch out of the loop lest it become
      // infinite.
      if (MI.isConditionalBranch())
        return true;

      if (MI.isUnconditionalBranch() &&
          TII->getBranchDestBlock(MI) != MBB.getNextNode())
        return true;

      if (MI.isMetaInstruction())
        continue;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
        return true;

      if (!CostModel.isProfitable(MI))
        return true;
    }
  }

  return false;
}
} // namespace

// Returns true if the skip branch instruction is removed.
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
                                          MachineBasicBlock &SrcMBB) {
  if (!TII->getSchedModel().hasInstrSchedModel())
    return false;

  MachineBasicBlock *TrueMBB = nullptr;
  MachineBasicBlock *FalseMBB = nullptr;
  SmallVector<MachineOperand, 1> Cond;

  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  // Consider only the forward branches.
  if (SrcMBB.getNumber() >= TrueMBB->getNumber())
    return false;

  // Consider only when it is legal and profitable.
  if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
    return false;

  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
  MI.eraseFromParent();
  SrcMBB.removeSuccessor(TrueMBB);

  return true;
}

bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
  unsigned OpCode = MI.getOpcode();
  Register DstReg = MI.getOperand(0).getReg();
  // Only the first register in the register pair needs to be checked due to
  // the unpacking order. Packed instructions are unpacked such that the lower
  // 32 bits (i.e., the first register in the pair) are written first. This
  // can introduce dependencies if the first register is written in one
  // instruction and then read as part of the higher 32 bits in the subsequent
  // instruction. Such scenarios can arise due to specific combinations of
  // op_sel and op_sel_hi modifiers.
  Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);

  const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  if (Src0MO && Src0MO->isReg()) {
    Register SrcReg0 = Src0MO->getReg();
    unsigned Src0Mods =
        TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
                             ? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
                             : TRI->getSubReg(SrcReg0, AMDGPU::sub0);
    // Check if the register selected by op_sel_hi is the same as the first
    // register in the destination register pair.
    if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
      return true;
  }

  const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1MO && Src1MO->isReg()) {
    Register SrcReg1 = Src1MO->getReg();
    unsigned Src1Mods =
        TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
    Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
                             ? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
                             : TRI->getSubReg(SrcReg1, AMDGPU::sub0);
    if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
      return true;
  }

  // Applicable for packed instructions with 3 source operands, such as
  // V_PK_FMA.
  if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
    const MachineOperand *Src2MO =
        TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    if (Src2MO && Src2MO->isReg()) {
      Register SrcReg2 = Src2MO->getReg();
      unsigned Src2Mods =
          TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
      Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
                               ? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
                               : TRI->getSubReg(SrcReg2, AMDGPU::sub0);
      if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
        return true;
    }
  }
  return false;
}

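// Map a packed opcode to its unpacked VOP3 (e64) counterpart. Returns
// std::numeric_limits<uint16_t>::max() if the opcode has no supported
// unpacked form.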
uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
  unsigned Opcode = I.getOpcode();
  // Use the 64-bit encoding to allow use of VOP3 instructions: VOP3 e64
  // instructions allow source modifiers, while e32 instructions do not.
  switch (Opcode) {
  case AMDGPU::V_PK_ADD_F32:
    return AMDGPU::V_ADD_F32_e64;
  case AMDGPU::V_PK_MUL_F32:
    return AMDGPU::V_MUL_F32_e64;
  case AMDGPU::V_PK_FMA_F32:
    return AMDGPU::V_FMA_F32_e64;
  default:
    return std::numeric_limits<uint16_t>::max();
  }
  llvm_unreachable("Fully covered switch");
}

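// For example (a sketch; other operands omitted): with op_sel_hi = 1 on a
// source of a packed instruction, the unpacked instruction computing the
// high 32 bits reads sub1 of that source; with op_sel_hi = 0 it reads sub0.
// A NEG/NEG_HI bit on the packed source becomes the plain NEG modifier on
// the corresponding unpacked instruction.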
void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
                                          unsigned SrcMods, bool IsHiBits,
                                          const MachineOperand &SrcMO) {
  unsigned NewSrcMods = 0;
  unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
  unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
  // Packed instructions (VOP3P) do not support ABS. Hence, no checks are done
  // for ABS modifiers.
  // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit
  // lane.
  // NEG_HI shares the same bit position with ABS. But packed instructions do
  // not support ABS. Therefore, NEG_HI must be translated to NEG source
  // modifier for the higher 32 bits. Unpacked VOP3 instructions support
  // ABS, but do not support NEG_HI. Therefore we need to explicitly add the
  // NEG modifier if present in the packed instruction.
  if (SrcMods & NegModifier)
    NewSrcMods |= SISrcMods::NEG;
  // Src modifiers. Only negative modifiers are added if needed. Unpacked
  // operations do not have op_sel, therefore it must be handled explicitly as
  // done below.
  NewMI.addImm(NewSrcMods);
  if (SrcMO.isImm()) {
    NewMI.addImm(SrcMO.getImm());
    return;
  }
  // If op_sel == 0, select register 0 of reg:sub0_sub1.
  Register UnpackedSrcReg = (SrcMods & OpSelModifier)
                                ? TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
                                : TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);

  MachineOperand UnpackedSrcMO =
      MachineOperand::CreateReg(UnpackedSrcReg, /*isDef=*/false);
  if (SrcMO.isKill()) {
    // For each unpacked instruction, mark its source registers as killed if
    // the corresponding source register in the original packed instruction
    // was marked as killed.
    //
    // Exception:
    // If the op_sel and op_sel_hi modifiers require both unpacked
    // instructions to use the same register (e.g., due to overlapping access
    // to low/high bits of the same packed register), then only the *second*
    // (latter) instruction should mark the register as killed. This is
    // because the second instruction handles the higher bits and is
    // effectively the last user of the full register pair.
    bool OpSel = SrcMods & SISrcMods::OP_SEL_0;
    bool OpSelHi = SrcMods & SISrcMods::OP_SEL_1;
    bool KillState = true;
    if ((OpSel == OpSelHi) && !IsHiBits)
      KillState = false;
    UnpackedSrcMO.setIsKill(KillState);
  }
  NewMI.add(UnpackedSrcMO);
}

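// Starting right after the MFMA (BeginMI), walk the block and accumulate an
// estimate of issue cycles. Stop at terminators, at instructions that can
// neither be co-issued nor unpacked, at instructions that change the mode
// register and EXEC, at any register overlap with the MFMA def, or once the
// accumulated cycles cover the MFMA latency. Packed instructions that still
// fit within the remaining latency are recorded for unpacking.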
void SIPreEmitPeephole::collectUnpackingCandidates(
    MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
    uint16_t NumMFMACycles) {
  auto *BB = BeginMI.getParent();
  auto E = BB->end();
  int TotalCyclesBetweenCandidates = 0;
  auto SchedModel = TII->getSchedModel();
  Register MFMADef = BeginMI.getOperand(0).getReg();

  for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
    MachineInstr &Instr = *I;
    uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
    bool IsUnpackable =
        !(UnpackedOpCode == std::numeric_limits<uint16_t>::max());
    if (Instr.isMetaInstruction())
      continue;
    if ((Instr.isTerminator()) ||
        (TII->isNeverCoissue(Instr) && !IsUnpackable) ||
        (SIInstrInfo::modifiesModeRegister(Instr) &&
         Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
      return;

    const MCSchedClassDesc *InstrSchedClassDesc =
        SchedModel.resolveSchedClass(&Instr);
    uint16_t Latency =
        SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
    TotalCyclesBetweenCandidates += Latency;

    if (TotalCyclesBetweenCandidates >= NumMFMACycles - 1)
      return;
    // Identify register dependencies between those used by the MFMA
    // instruction and the following packed instructions. Also checks for
    // transitive dependencies between the MFMA def and candidate instruction
    // def and uses. Conservatively ensures that we do not incorrectly
    // read/write registers.
    for (const MachineOperand &InstrMO : Instr.operands()) {
      if (!InstrMO.isReg() || !InstrMO.getReg().isValid())
        continue;
      if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
        return;
    }
    if (!IsUnpackable)
      continue;

    if (canUnpackingClobberRegister(Instr))
      return;
    // If it's a packed instruction, adjust latency: remove the packed
    // latency, add latency of two unpacked instructions (currently estimated
    // as 2 cycles).
    TotalCyclesBetweenCandidates -= Latency;
    // TODO: improve latency handling based on instruction modeling.
    TotalCyclesBetweenCandidates += 2;
    // Subtract 1 to account for MFMA issue latency.
    if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
      InstrsToUnpack.insert(&Instr);
  }
}

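// Replace a packed F32 instruction with two unpacked VOP3 instructions: one
// writing the low 32 bits (sub0) and one writing the high 32 bits (sub1) of
// the original destination. For example, with op_sel:[0,0] op_sel_hi:[1,1]
// and no other source modifiers (register numbers are illustrative):
//   v_pk_add_f32 v[4:5], v[0:1], v[2:3]
// becomes:
//   v_add_f32_e64 v4, v0, v2
//   v_add_f32_e64 v5, v1, v3
// The original packed instruction is erased afterwards.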
void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
  const MachineOperand &DstOp = I.getOperand(0);

  uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
  assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() &&
         "Unsupported Opcode");

  MachineInstrBuilder Op0LOp1L =
      createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false);
  MachineOperand LoDstOp = Op0LOp1L->getOperand(0);

  LoDstOp.setIsUndef(DstOp.isUndef());

  MachineInstrBuilder Op0HOp1H =
      createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/true);
  MachineOperand HiDstOp = Op0HOp1H->getOperand(0);

  uint32_t IFlags = I.getFlags();
  Op0LOp1L->setFlags(IFlags);
  Op0HOp1H->setFlags(IFlags);
  LoDstOp.setIsRenamable(DstOp.isRenamable());
  HiDstOp.setIsRenamable(DstOp.isRenamable());

  I.eraseFromParent();
}

MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
                                                        uint16_t UnpackedOpcode,
                                                        bool IsHiBits) {
  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
  const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
  Register DstReg = I.getOperand(0).getReg();
  unsigned OpCode = I.getOpcode();
  Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
                                     : TRI->getSubReg(DstReg, AMDGPU::sub0);

  int64_t ClampVal = TII->getNamedOperand(I, AMDGPU::OpName::clamp)->getImm();
  unsigned Src0Mods =
      TII->getNamedOperand(I, AMDGPU::OpName::src0_modifiers)->getImm();
  unsigned Src1Mods =
      TII->getNamedOperand(I, AMDGPU::OpName::src1_modifiers)->getImm();

  MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
  NewMI.addDef(UnpackedDstReg); // vdst
  addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0);
  addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1);

  if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
    const MachineOperand *SrcMO2 =
        TII->getNamedOperand(I, AMDGPU::OpName::src2);
    unsigned Src2Mods =
        TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
    addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2);
  }
  NewMI.addImm(ClampVal); // clamp
  // Packed instructions do not support output modifiers, so it is safe to
  // set omod to 0 for this use case.
  NewMI.addImm(0); // omod
  return NewMI;
}

PreservedAnalyses
llvm::SIPreEmitPeepholePass::run(MachineFunction &MF,
                                 MachineFunctionAnalysisManager &MFAM) {
  auto *MDT = MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF);
  auto *MPDT = MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF);

  if (SIPreEmitPeephole().run(MF))
    return getMachineFunctionPassPreservedAnalyses();

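  // run() calls MF.RenumberBlocks() even when nothing is folded, so before
  // reporting all analyses as preserved, refresh the block numbers in any
  // cached dominator trees.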
  if (MDT)
    MDT->updateBlockNumbers();
  if (MPDT)
    MPDT->updateBlockNumbers();
  return PreservedAnalyses::all();
}

bool SIPreEmitPeephole::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  bool Changed = false;

  MF.RenumberBlocks();

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
    // Check first terminator for branches to optimize.
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        break;
      case AMDGPU::S_CBRANCH_EXECZ:
        Changed |= removeExeczBranch(MI, MBB);
        break;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
    // second is not needed. Do expensive checks in optimizeSetGPR() and
    // limit the distance to 20 instructions for compile time purposes.
    // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
    // may be bundled with the instructions they modify.
    for (auto &MI : make_early_inc_range(MBB.instrs())) {
      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  // TODO: Fold this into the previous block, if possible. Evaluate and handle
  // any side effects.

  // Perform the extra MF scans only for supported archs.
  if (!ST.hasGFX940Insts())
    return Changed;
  for (MachineBasicBlock &MBB : MF) {
    // Unpack packed instructions overlapped by MFMAs. This allows the
    // compiler to co-issue unpacked instructions with MFMAs.
    auto SchedModel = TII->getSchedModel();
    SetVector<MachineInstr *> InstrsToUnpack;
    for (auto &MI : make_early_inc_range(MBB.instrs())) {
      if (!SIInstrInfo::isMFMA(MI))
        continue;
      const MCSchedClassDesc *SchedClassDesc =
          SchedModel.resolveSchedClass(&MI);
      uint16_t NumMFMACycles =
          SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
      collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
    }
    for (MachineInstr *MI : InstrsToUnpack) {
      performF32Unpacking(*MI);
    }
  }

  return Changed;
}
| 807 | |