| 1 | //===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | /// \file |
| 10 | /// Copies from VGPR to SGPR registers are illegal and the register coalescer |
| 11 | /// will sometimes generate these illegal copies in situations like this: |
| 12 | /// |
| 13 | /// Register Class <vsrc> is the union of <vgpr> and <sgpr> |
| 14 | /// |
| 15 | /// BB0: |
| 16 | /// %0 <sgpr> = SCALAR_INST |
| 17 | /// %1 <vsrc> = COPY %0 <sgpr> |
| 18 | /// ... |
| 19 | /// BRANCH %cond BB1, BB2 |
| 20 | /// BB1: |
| 21 | /// %2 <vgpr> = VECTOR_INST |
| 22 | /// %3 <vsrc> = COPY %2 <vgpr> |
| 23 | /// BB2: |
| 24 | /// %4 <vsrc> = PHI %1 <vsrc>, <%bb.0>, %3 <vsrc>, <%bb.1> |
| 25 | /// %5 <vgpr> = VECTOR_INST %4 <vsrc> |
| 26 | /// |
| 27 | /// |
| 28 | /// The coalescer will begin at BB0 and eliminate its copy, then the resulting |
| 29 | /// code will look like this: |
| 30 | /// |
| 31 | /// BB0: |
| 32 | /// %0 <sgpr> = SCALAR_INST |
| 33 | /// ... |
| 34 | /// BRANCH %cond BB1, BB2 |
| 35 | /// BB1: |
| 36 | /// %2 <vgpr> = VECTOR_INST |
| 37 | /// %3 <vsrc> = COPY %2 <vgpr> |
| 38 | /// BB2: |
| 39 | /// %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <vsrc>, <%bb.1> |
| 40 | /// %5 <vgpr> = VECTOR_INST %4 <sgpr> |
| 41 | /// |
| 42 | /// Now that the result of the PHI instruction is an SGPR, the register |
| 43 | /// allocator is forced to constrain the register class of %3 to |
| 44 | /// <sgpr>, so we end up with final code like this: |
| 45 | /// |
| 46 | /// BB0: |
| 47 | /// %0 <sgpr> = SCALAR_INST |
| 48 | /// ... |
| 49 | /// BRANCH %cond BB1, BB2 |
| 50 | /// BB1: |
| 51 | /// %2 <vgpr> = VECTOR_INST |
| 52 | /// %3 <sgpr> = COPY %2 <vgpr> |
| 53 | /// BB2: |
| 54 | /// %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <sgpr>, <%bb.1> |
| 55 | /// %5 <vgpr> = VECTOR_INST %4 <sgpr> |
| 56 | /// |
| 57 | /// Now this code contains an illegal copy from a VGPR to an SGPR. |
| 58 | /// |
| 59 | /// In order to avoid this problem, this pass searches for PHI instructions |
| 60 | /// which define a <vsrc> register and constrains their definition class to |
| 61 | /// <vgpr> if any user of the PHI's definition register is a vector instruction. |
| 62 | /// If the PHI's definition class is constrained to <vgpr>, then the coalescer |
| 63 | /// will be unable to perform the COPY removal from the above example, which |
| 64 | /// is what ultimately led to the creation of an illegal COPY. |
| 65 | //===----------------------------------------------------------------------===// |
| 66 | |
| 67 | #include "SIFixSGPRCopies.h" |
| 68 | #include "AMDGPU.h" |
| 69 | #include "GCNSubtarget.h" |
| 70 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
| 71 | #include "llvm/CodeGen/MachineDominators.h" |
| 72 | #include "llvm/InitializePasses.h" |
| 73 | #include "llvm/Target/TargetMachine.h" |
| 74 | |
| 75 | using namespace llvm; |
| 76 | |
| 77 | #define DEBUG_TYPE "si-fix-sgpr-copies" |
| 78 | |
| 79 | static cl::opt<bool> EnableM0Merge( |
| 80 | "amdgpu-enable-merge-m0" , |
| 81 | cl::desc("Merge and hoist M0 initializations" ), |
| 82 | cl::init(Val: true)); |
| 83 | |
| 84 | namespace { |
| 85 | |
| 86 | class V2SCopyInfo { |
| 87 | public: |
| 88 | // VGPR to SGPR copy being processed |
| 89 | MachineInstr *Copy; |
| 90 | // All SALU instructions reachable from this copy in the SSA graph |
| 91 | SetVector<MachineInstr *> SChain; |
| 92 | // Number of SGPR to VGPR copies that are used to put the SALU computation |
| 93 | // results back to VALU. |
| 94 | unsigned NumSVCopies = 0; |
| 95 | |
| 96 | unsigned Score = 0; |
| 97 | // Actual count of v_readfirstlane_b32 instructions |
| 98 | // which need to be inserted to keep the SChain SALU |
| 99 | unsigned NumReadfirstlanes = 0; |
| 100 | // Current score state. Used to speed up the selection of V2SCopyInfos for processing. |
| 101 | bool NeedToBeConvertedToVALU = false; |
| 102 | // Unique ID. Used as a key for mapping to keep permanent order. |
| 103 | unsigned ID; |
| 104 | |
| 105 | // Count of other VGPR to SGPR copies that contribute to the |
| 106 | // current copy's SChain |
| 107 | unsigned SiblingPenalty = 0; |
| 108 | SetVector<unsigned> Siblings; |
| 109 | V2SCopyInfo() : Copy(nullptr), ID(0){}; |
| 110 | V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width) |
| 111 | : Copy(C), NumReadfirstlanes(Width / 32), ID(Id){}; |
| 112 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 113 | void dump() { |
| 114 | dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size() |
| 115 | << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty |
| 116 | << "\nScore: " << Score << "\n" ; |
| 117 | } |
| 118 | #endif |
| 119 | }; |
| 120 | |
| 121 | class SIFixSGPRCopies { |
| 122 | MachineDominatorTree *MDT; |
| 123 | SmallVector<MachineInstr*, 4> SCCCopies; |
| 124 | SmallVector<MachineInstr*, 4> RegSequences; |
| 125 | SmallVector<MachineInstr*, 4> PHINodes; |
| 126 | SmallVector<MachineInstr*, 4> S2VCopies; |
| 127 | unsigned NextVGPRToSGPRCopyID = 0; |
| 128 | MapVector<unsigned, V2SCopyInfo> V2SCopies; |
| 129 | DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty; |
| 130 | DenseSet<MachineInstr *> PHISources; |
| 131 | |
| 132 | public: |
| 133 | MachineRegisterInfo *MRI; |
| 134 | const SIRegisterInfo *TRI; |
| 135 | const SIInstrInfo *TII; |
| 136 | |
| 137 | SIFixSGPRCopies(MachineDominatorTree *MDT) : MDT(MDT) {} |
| 138 | |
| 139 | bool run(MachineFunction &MF); |
| 140 | void fixSCCCopies(MachineFunction &MF); |
| 141 | void prepareRegSequenceAndPHIs(MachineFunction &MF); |
| 142 | unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; } |
| 143 | bool needToBeConvertedToVALU(V2SCopyInfo *I); |
| 144 | void analyzeVGPRToSGPRCopy(MachineInstr *MI); |
| 145 | void lowerVGPR2SGPRCopies(MachineFunction &MF); |
| 146 | // Handles copies whose source register is: |
| 147 | // 1. A physical register |
| 148 | // 2. An AGPR |
| 149 | // 3. Defined by an instruction that merely moves an immediate |
| 150 | bool lowerSpecialCase(MachineInstr &MI, MachineBasicBlock::iterator &I); |
| 151 | |
| 152 | void processPHINode(MachineInstr &MI); |
| 153 | |
| 154 | // Check if MO is an immediate materialized into a VGPR, and if so replace it |
| 155 | // with an SGPR immediate. The VGPR immediate is also deleted if it does not |
| 156 | // have any other uses. |
| 157 | bool tryMoveVGPRConstToSGPR(MachineOperand &MO, Register NewDst, |
| 158 | MachineBasicBlock *BlockToInsertTo, |
| 159 | MachineBasicBlock::iterator PointToInsertTo, |
| 160 | const DebugLoc &DL); |
| 161 | }; |
| 162 | |
| 163 | class SIFixSGPRCopiesLegacy : public MachineFunctionPass { |
| 164 | public: |
| 165 | static char ID; |
| 166 | |
| 167 | SIFixSGPRCopiesLegacy() : MachineFunctionPass(ID) {} |
| 168 | |
| 169 | bool runOnMachineFunction(MachineFunction &MF) override { |
| 170 | MachineDominatorTree *MDT = |
| 171 | &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); |
| 172 | SIFixSGPRCopies Impl(MDT); |
| 173 | return Impl.run(MF); |
| 174 | } |
| 175 | |
| 176 | StringRef getPassName() const override { return "SI Fix SGPR copies"; } |
| 177 | |
| 178 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
| 179 | AU.addRequired<MachineDominatorTreeWrapperPass>(); |
| 180 | AU.addPreserved<MachineDominatorTreeWrapperPass>(); |
| 181 | AU.setPreservesCFG(); |
| 182 | MachineFunctionPass::getAnalysisUsage(AU); |
| 183 | } |
| 184 | }; |
| 185 | |
| 186 | } // end anonymous namespace |
| 187 | |
| 188 | INITIALIZE_PASS_BEGIN(SIFixSGPRCopiesLegacy, DEBUG_TYPE, "SI Fix SGPR copies", |
| 189 | false, false) |
| 190 | INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) |
| 191 | INITIALIZE_PASS_END(SIFixSGPRCopiesLegacy, DEBUG_TYPE, "SI Fix SGPR copies", |
| 192 | false, false) |
| 193 | |
| 194 | char SIFixSGPRCopiesLegacy::ID = 0; |
| 195 | |
| 196 | char &llvm::SIFixSGPRCopiesLegacyID = SIFixSGPRCopiesLegacy::ID; |
| 197 | |
| 198 | FunctionPass *llvm::createSIFixSGPRCopiesLegacyPass() { |
| 199 | return new SIFixSGPRCopiesLegacy(); |
| 200 | } |
| 201 | |
| 202 | static std::pair<const TargetRegisterClass *, const TargetRegisterClass *> |
| 203 | getCopyRegClasses(const MachineInstr &Copy, |
| 204 | const SIRegisterInfo &TRI, |
| 205 | const MachineRegisterInfo &MRI) { |
| 206 | Register DstReg = Copy.getOperand(i: 0).getReg(); |
| 207 | Register SrcReg = Copy.getOperand(i: 1).getReg(); |
| 208 | |
| 209 | const TargetRegisterClass *SrcRC = SrcReg.isVirtual() |
| 210 | ? MRI.getRegClass(Reg: SrcReg) |
| 211 | : TRI.getPhysRegBaseClass(Reg: SrcReg); |
| 212 | |
| 213 | // We don't really care about the subregister here. |
| 214 | // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg()); |
| 215 | |
| 216 | const TargetRegisterClass *DstRC = DstReg.isVirtual() |
| 217 | ? MRI.getRegClass(Reg: DstReg) |
| 218 | : TRI.getPhysRegBaseClass(Reg: DstReg); |
| 219 | |
| 220 | return std::pair(SrcRC, DstRC); |
| 221 | } |
| 222 | |
| 223 | static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC, |
| 224 | const TargetRegisterClass *DstRC, |
| 225 | const SIRegisterInfo &TRI) { |
| 226 | return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(RC: DstRC) && |
| 227 | TRI.hasVectorRegisters(RC: SrcRC); |
| 228 | } |
| 229 | |
| 230 | static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, |
| 231 | const TargetRegisterClass *DstRC, |
| 232 | const SIRegisterInfo &TRI) { |
| 233 | return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(RC: SrcRC) && |
| 234 | TRI.hasVectorRegisters(RC: DstRC); |
| 235 | } |
| 236 | |
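// Try to turn an SGPR -> VGPR copy into an SGPR -> SGPR copy by constraining
// the destination register class to SGPR. This only succeeds when both
// registers are virtual and every other user of the destination is a target
// instruction in the same block that accepts the SGPR source as a legal
// operand.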
| 237 | static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, |
| 238 | const SIRegisterInfo *TRI, |
| 239 | const SIInstrInfo *TII) { |
| 240 | MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); |
| 241 | auto &Src = MI.getOperand(i: 1); |
| 242 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 243 | Register SrcReg = Src.getReg(); |
| 244 | if (!SrcReg.isVirtual() || !DstReg.isVirtual()) |
| 245 | return false; |
| 246 | |
| 247 | for (const auto &MO : MRI.reg_nodbg_operands(Reg: DstReg)) { |
| 248 | const auto *UseMI = MO.getParent(); |
| 249 | if (UseMI == &MI) |
| 250 | continue; |
| 251 | if (MO.isDef() || UseMI->getParent() != MI.getParent() || |
| 252 | UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) |
| 253 | return false; |
| 254 | |
| 255 | unsigned OpIdx = MO.getOperandNo(); |
| 256 | if (OpIdx >= UseMI->getDesc().getNumOperands() || |
| 257 | !TII->isOperandLegal(MI: *UseMI, OpIdx, MO: &Src)) |
| 258 | return false; |
| 259 | } |
| 260 | // Change VGPR to SGPR destination. |
| 261 | MRI.setRegClass(Reg: DstReg, RC: TRI->getEquivalentSGPRClass(VRC: MRI.getRegClass(Reg: DstReg))); |
| 262 | return true; |
| 263 | } |
| 264 | |
| 265 | // Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. |
| 266 | // |
| 267 | // SGPRx = ... |
| 268 | // SGPRy = REG_SEQUENCE SGPRx, sub0 ... |
| 269 | // VGPRz = COPY SGPRy |
| 270 | // |
| 271 | // ==> |
| 272 | // |
| 273 | // VGPRx = COPY SGPRx |
| 274 | // VGPRz = REG_SEQUENCE VGPRx, sub0 |
| 275 | // |
| 276 | // This exposes immediate folding opportunities when materializing 64-bit |
| 277 | // immediates. |
| 278 | static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, |
| 279 | const SIRegisterInfo *TRI, |
| 280 | const SIInstrInfo *TII, |
| 281 | MachineRegisterInfo &MRI) { |
| 282 | assert(MI.isRegSequence()); |
| 283 | |
| 284 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 285 | if (!TRI->isSGPRClass(RC: MRI.getRegClass(Reg: DstReg))) |
| 286 | return false; |
| 287 | |
| 288 | if (!MRI.hasOneUse(RegNo: DstReg)) |
| 289 | return false; |
| 290 | |
| 291 | MachineInstr &CopyUse = *MRI.use_instr_begin(RegNo: DstReg); |
| 292 | if (!CopyUse.isCopy()) |
| 293 | return false; |
| 294 | |
| 295 | // It is illegal to have vreg inputs to a physreg defining reg_sequence. |
| 296 | if (CopyUse.getOperand(i: 0).getReg().isPhysical()) |
| 297 | return false; |
| 298 | |
| 299 | const TargetRegisterClass *SrcRC, *DstRC; |
| 300 | std::tie(args&: SrcRC, args&: DstRC) = getCopyRegClasses(Copy: CopyUse, TRI: *TRI, MRI); |
| 301 | |
| 302 | if (!isSGPRToVGPRCopy(SrcRC, DstRC, TRI: *TRI)) |
| 303 | return false; |
| 304 | |
| 305 | if (tryChangeVGPRtoSGPRinCopy(MI&: CopyUse, TRI, TII)) |
| 306 | return true; |
| 307 | |
| 308 | // TODO: Could have multiple extracts? |
| 309 | unsigned SubReg = CopyUse.getOperand(i: 1).getSubReg(); |
| 310 | if (SubReg != AMDGPU::NoSubRegister) |
| 311 | return false; |
| 312 | |
| 313 | MRI.setRegClass(Reg: DstReg, RC: DstRC); |
| 314 | |
| 315 | // SGPRx = ... |
| 316 | // SGPRy = REG_SEQUENCE SGPRx, sub0 ... |
| 317 | // VGPRz = COPY SGPRy |
| 318 | |
| 319 | // => |
| 320 | // VGPRx = COPY SGPRx |
| 321 | // VGPRz = REG_SEQUENCE VGPRx, sub0 |
| 322 | |
| 323 | MI.getOperand(i: 0).setReg(CopyUse.getOperand(i: 0).getReg()); |
| 324 | bool IsAGPR = TRI->isAGPRClass(RC: DstRC); |
| 325 | |
| 326 | for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { |
| 327 | const TargetRegisterClass *SrcRC = |
| 328 | TRI->getRegClassForOperandReg(MRI, MO: MI.getOperand(i: I)); |
| 329 | assert(TRI->isSGPRClass(SrcRC) && |
| 330 | "Expected SGPR REG_SEQUENCE to only have SGPR inputs" ); |
| 331 | const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SRC: SrcRC); |
| 332 | |
| 333 | Register TmpReg = MRI.createVirtualRegister(RegClass: NewSrcRC); |
| 334 | |
| 335 | BuildMI(BB&: *MI.getParent(), I: &MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::COPY), |
| 336 | DestReg: TmpReg) |
| 337 | .add(MO: MI.getOperand(i: I)); |
| 338 | |
| 339 | if (IsAGPR) { |
| 340 | const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SRC: SrcRC); |
| 341 | Register TmpAReg = MRI.createVirtualRegister(RegClass: NewSrcRC); |
| 342 | unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ? |
| 343 | AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::COPY; |
| 344 | BuildMI(BB&: *MI.getParent(), I: &MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: Opc), |
| 345 | DestReg: TmpAReg) |
| 346 | .addReg(RegNo: TmpReg, flags: RegState::Kill); |
| 347 | TmpReg = TmpAReg; |
| 348 | } |
| 349 | |
| 350 | MI.getOperand(i: I).setReg(TmpReg); |
| 351 | } |
| 352 | |
| 353 | CopyUse.eraseFromParent(); |
| 354 | return true; |
| 355 | } |
| 356 | |
| 357 | static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, |
| 358 | const MachineInstr *MoveImm, |
| 359 | const SIInstrInfo *TII, |
| 360 | unsigned &SMovOp, |
| 361 | int64_t &Imm) { |
| 362 | if (Copy->getOpcode() != AMDGPU::COPY) |
| 363 | return false; |
| 364 | |
| 365 | if (!MoveImm->isMoveImmediate()) |
| 366 | return false; |
| 367 | |
| 368 | const MachineOperand *ImmOp = |
| 369 | TII->getNamedOperand(MI: *MoveImm, OperandName: AMDGPU::OpName::src0); |
| 370 | if (!ImmOp->isImm()) |
| 371 | return false; |
| 372 | |
| 373 | // FIXME: Handle copies with sub-regs. |
| 374 | if (Copy->getOperand(i: 1).getSubReg()) |
| 375 | return false; |
| 376 | |
| 377 | switch (MoveImm->getOpcode()) { |
| 378 | default: |
| 379 | return false; |
| 380 | case AMDGPU::V_MOV_B32_e32: |
| 381 | SMovOp = AMDGPU::S_MOV_B32; |
| 382 | break; |
| 383 | case AMDGPU::V_MOV_B64_PSEUDO: |
| 384 | SMovOp = AMDGPU::S_MOV_B64_IMM_PSEUDO; |
| 385 | break; |
| 386 | } |
| 387 | Imm = ImmOp->getImm(); |
| 388 | return true; |
| 389 | } |
| 390 | |
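// Walk the predecessors of MBB transitively, stopping at CutOff, and return
// true if any visited block satisfies Predicate.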
| 391 | template <class UnaryPredicate> |
| 392 | bool searchPredecessors(const MachineBasicBlock *MBB, |
| 393 | const MachineBasicBlock *CutOff, |
| 394 | UnaryPredicate Predicate) { |
| 395 | if (MBB == CutOff) |
| 396 | return false; |
| 397 | |
| 398 | DenseSet<const MachineBasicBlock *> Visited; |
| 399 | SmallVector<MachineBasicBlock *, 4> Worklist(MBB->predecessors()); |
| 400 | |
| 401 | while (!Worklist.empty()) { |
| 402 | MachineBasicBlock *MBB = Worklist.pop_back_val(); |
| 403 | |
| 404 | if (!Visited.insert(V: MBB).second) |
| 405 | continue; |
| 406 | if (MBB == CutOff) |
| 407 | continue; |
| 408 | if (Predicate(MBB)) |
| 409 | return true; |
| 410 | |
| 411 | Worklist.append(in_start: MBB->pred_begin(), in_end: MBB->pred_end()); |
| 412 | } |
| 413 | |
| 414 | return false; |
| 415 | } |
| 416 | |
| 417 | // Checks if there is a potential path from the From instruction to the To |
| 418 | // instruction. If CutOff is specified and sits on that path, we ignore the |
| 419 | // portion of the path above it and report the destination as not reachable. |
| 420 | static bool isReachable(const MachineInstr *From, |
| 421 | const MachineInstr *To, |
| 422 | const MachineBasicBlock *CutOff, |
| 423 | MachineDominatorTree &MDT) { |
| 424 | if (MDT.dominates(A: From, B: To)) |
| 425 | return true; |
| 426 | |
| 427 | const MachineBasicBlock *MBBFrom = From->getParent(); |
| 428 | const MachineBasicBlock *MBBTo = To->getParent(); |
| 429 | |
| 430 | // Do predecessor search. |
| 431 | // We should almost never get here since we do not usually produce M0 stores |
| 432 | // other than -1. |
| 433 | return searchPredecessors(MBB: MBBTo, CutOff, Predicate: [MBBFrom] |
| 434 | (const MachineBasicBlock *MBB) { return MBB == MBBFrom; }); |
| 435 | } |
| 436 | |
| 437 | // Return the first non-prologue instruction in the block. |
| 438 | static MachineBasicBlock::iterator |
| 439 | getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) { |
| 440 | MachineBasicBlock::iterator I = MBB->getFirstNonPHI(); |
| 441 | while (I != MBB->end() && TII->isBasicBlockPrologue(MI: *I)) |
| 442 | ++I; |
| 443 | |
| 444 | return I; |
| 445 | } |
| 446 | |
| 447 | // Hoist and merge identical SGPR initializations into a common predecessor. |
| 448 | // This is intended to combine M0 initializations, but can work with any |
| 449 | // SGPR. A VGPR cannot be processed since we cannot guarantee vector |
| 450 | // execution. |
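// For example (schematically), two identical initializations
//
//   bb.1:  $m0 = S_MOV_B32 -1
//   bb.2:  $m0 = S_MOV_B32 -1
//
// whose blocks share a common dominator bb.0 may be merged into a single
//   bb.0:  $m0 = S_MOV_B32 -1
// provided no clobbering definition of M0 interferes on the paths involved.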
| 451 | static bool hoistAndMergeSGPRInits(unsigned Reg, |
| 452 | const MachineRegisterInfo &MRI, |
| 453 | const TargetRegisterInfo *TRI, |
| 454 | MachineDominatorTree &MDT, |
| 455 | const TargetInstrInfo *TII) { |
| 456 | // List of inits by immediate value. |
| 457 | using InitListMap = std::map<unsigned, std::list<MachineInstr *>>; |
| 458 | InitListMap Inits; |
| 459 | // List of clobbering instructions. |
| 460 | SmallVector<MachineInstr*, 8> Clobbers; |
| 461 | // List of instructions marked for deletion. |
| 462 | SmallSet<MachineInstr*, 8> MergedInstrs; |
| 463 | |
| 464 | bool Changed = false; |
| 465 | |
| 466 | for (auto &MI : MRI.def_instructions(Reg)) { |
| 467 | MachineOperand *Imm = nullptr; |
| 468 | for (auto &MO : MI.operands()) { |
| 469 | if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) || |
| 470 | (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) { |
| 471 | Imm = nullptr; |
| 472 | break; |
| 473 | } |
| 474 | if (MO.isImm()) |
| 475 | Imm = &MO; |
| 476 | } |
| 477 | if (Imm) |
| 478 | Inits[Imm->getImm()].push_front(x: &MI); |
| 479 | else |
| 480 | Clobbers.push_back(Elt: &MI); |
| 481 | } |
| 482 | |
| 483 | for (auto &Init : Inits) { |
| 484 | auto &Defs = Init.second; |
| 485 | |
| 486 | for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) { |
| 487 | MachineInstr *MI1 = *I1; |
| 488 | |
| 489 | for (auto I2 = std::next(x: I1); I2 != E; ) { |
| 490 | MachineInstr *MI2 = *I2; |
| 491 | |
| 492 | // Check any possible interference |
| 493 | auto interferes = [&](MachineBasicBlock::iterator From, |
| 494 | MachineBasicBlock::iterator To) -> bool { |
| 495 | |
| 496 | assert(MDT.dominates(&*To, &*From)); |
| 497 | |
| 498 | auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool { |
| 499 | const MachineBasicBlock *MBBFrom = From->getParent(); |
| 500 | const MachineBasicBlock *MBBTo = To->getParent(); |
| 501 | bool MayClobberFrom = isReachable(From: Clobber, To: &*From, CutOff: MBBTo, MDT); |
| 502 | bool MayClobberTo = isReachable(From: Clobber, To: &*To, CutOff: MBBTo, MDT); |
| 503 | if (!MayClobberFrom && !MayClobberTo) |
| 504 | return false; |
| 505 | if ((MayClobberFrom && !MayClobberTo) || |
| 506 | (!MayClobberFrom && MayClobberTo)) |
| 507 | return true; |
| 508 | // Both can clobber. This is not an interference only if both are |
| 509 | // dominated by Clobber and belong to the same block, or if Clobber |
| 510 | // properly dominates To; given that To >> From, it then dominates |
| 511 | // both and is located in a common dominator. |
| 512 | return !((MBBFrom == MBBTo && |
| 513 | MDT.dominates(A: Clobber, B: &*From) && |
| 514 | MDT.dominates(A: Clobber, B: &*To)) || |
| 515 | MDT.properlyDominates(A: Clobber->getParent(), B: MBBTo)); |
| 516 | }; |
| 517 | |
| 518 | return (llvm::any_of(Range&: Clobbers, P: interferes)) || |
| 519 | (llvm::any_of(Range&: Inits, P: [&](InitListMap::value_type &C) { |
| 520 | return C.first != Init.first && |
| 521 | llvm::any_of(Range&: C.second, P: interferes); |
| 522 | })); |
| 523 | }; |
| 524 | |
| 525 | if (MDT.dominates(A: MI1, B: MI2)) { |
| 526 | if (!interferes(MI2, MI1)) { |
| 527 | LLVM_DEBUG(dbgs() |
| 528 | << "Erasing from " |
| 529 | << printMBBReference(*MI2->getParent()) << " " << *MI2); |
| 530 | MergedInstrs.insert(Ptr: MI2); |
| 531 | Changed = true; |
| 532 | ++I2; |
| 533 | continue; |
| 534 | } |
| 535 | } else if (MDT.dominates(A: MI2, B: MI1)) { |
| 536 | if (!interferes(MI1, MI2)) { |
| 537 | LLVM_DEBUG(dbgs() |
| 538 | << "Erasing from " |
| 539 | << printMBBReference(*MI1->getParent()) << " " << *MI1); |
| 540 | MergedInstrs.insert(Ptr: MI1); |
| 541 | Changed = true; |
| 542 | ++I1; |
| 543 | break; |
| 544 | } |
| 545 | } else { |
| 546 | auto *MBB = MDT.findNearestCommonDominator(A: MI1->getParent(), |
| 547 | B: MI2->getParent()); |
| 548 | if (!MBB) { |
| 549 | ++I2; |
| 550 | continue; |
| 551 | } |
| 552 | |
| 553 | MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII); |
| 554 | if (!interferes(MI1, I) && !interferes(MI2, I)) { |
| 555 | LLVM_DEBUG(dbgs() |
| 556 | << "Erasing from " |
| 557 | << printMBBReference(*MI1->getParent()) << " " << *MI1 |
| 558 | << "and moving from " |
| 559 | << printMBBReference(*MI2->getParent()) << " to " |
| 560 | << printMBBReference(*I->getParent()) << " " << *MI2); |
| 561 | I->getParent()->splice(Where: I, Other: MI2->getParent(), From: MI2); |
| 562 | MergedInstrs.insert(Ptr: MI1); |
| 563 | Changed = true; |
| 564 | ++I1; |
| 565 | break; |
| 566 | } |
| 567 | } |
| 568 | ++I2; |
| 569 | } |
| 570 | ++I1; |
| 571 | } |
| 572 | } |
| 573 | |
| 574 | // Remove initializations that were merged into another. |
| 575 | for (auto &Init : Inits) { |
| 576 | auto &Defs = Init.second; |
| 577 | auto I = Defs.begin(); |
| 578 | while (I != Defs.end()) { |
| 579 | if (MergedInstrs.count(Ptr: *I)) { |
| 580 | (*I)->eraseFromParent(); |
| 581 | I = Defs.erase(position: I); |
| 582 | } else |
| 583 | ++I; |
| 584 | } |
| 585 | } |
| 586 | |
| 587 | // Try to schedule SGPR initializations as early as possible in the MBB. |
| 588 | for (auto &Init : Inits) { |
| 589 | auto &Defs = Init.second; |
| 590 | for (auto *MI : Defs) { |
| 591 | auto *MBB = MI->getParent(); |
| 592 | MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII); |
| 593 | MachineBasicBlock::reverse_iterator B(BoundaryMI); |
| 594 | // Check if B should actually be a boundary. If not, set the previous |
| 595 | // instruction as the boundary instead. |
| 596 | if (!TII->isBasicBlockPrologue(MI: *B)) |
| 597 | B++; |
| 598 | |
| 599 | auto R = std::next(x: MI->getReverseIterator()); |
| 600 | const unsigned Threshold = 50; |
| 601 | // Search until B or Threshold for a place to insert the initialization. |
| 602 | for (unsigned I = 0; R != B && I < Threshold; ++R, ++I) |
| 603 | if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) || |
| 604 | TII->isSchedulingBoundary(MI: *R, MBB, MF: *MBB->getParent())) |
| 605 | break; |
| 606 | |
| 607 | // Move to directly after R. |
| 608 | if (&*--R != MI) |
| 609 | MBB->splice(Where: *R, Other: MBB, From: MI); |
| 610 | } |
| 611 | } |
| 612 | |
| 613 | if (Changed) |
| 614 | MRI.clearKillFlags(Reg); |
| 615 | |
| 616 | return Changed; |
| 617 | } |
| 618 | |
| 619 | bool SIFixSGPRCopies::run(MachineFunction &MF) { |
| 620 | // Only need to run this in SelectionDAG path. |
| 621 | if (MF.getProperties().hasSelected()) |
| 622 | return false; |
| 623 | |
| 624 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
| 625 | MRI = &MF.getRegInfo(); |
| 626 | TRI = ST.getRegisterInfo(); |
| 627 | TII = ST.getInstrInfo(); |
| 628 | |
| 629 | for (MachineBasicBlock &MBB : MF) { |
| 630 | for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; |
| 631 | ++I) { |
| 632 | MachineInstr &MI = *I; |
| 633 | |
| 634 | switch (MI.getOpcode()) { |
| 635 | default: |
| 636 | continue; |
| 637 | case AMDGPU::COPY: { |
| 638 | const TargetRegisterClass *SrcRC, *DstRC; |
| 639 | std::tie(args&: SrcRC, args&: DstRC) = getCopyRegClasses(Copy: MI, TRI: *TRI, MRI: *MRI); |
| 640 | |
| 641 | if (isSGPRToVGPRCopy(SrcRC, DstRC, TRI: *TRI)) { |
| 642 | // Since SGPR to VGPR copies affect the VGPR to SGPR copy |
| 643 | // score and, hence, the lowering decision, let's try to get rid of |
| 644 | // them as early as possible. |
| 645 | if (tryChangeVGPRtoSGPRinCopy(MI, TRI, TII)) |
| 646 | continue; |
| 647 | |
| 648 | // Collect those that were not changed so we can retry them after VGPR to |
| 649 | // SGPR copy lowering, when there will be more opportunities. |
| 650 | S2VCopies.push_back(Elt: &MI); |
| 651 | } |
| 652 | if (!isVGPRToSGPRCopy(SrcRC, DstRC, TRI: *TRI)) |
| 653 | continue; |
| 654 | if (lowerSpecialCase(MI, I)) |
| 655 | continue; |
| 656 | |
| 657 | analyzeVGPRToSGPRCopy(MI: &MI); |
| 658 | |
| 659 | break; |
| 660 | } |
| 661 | case AMDGPU::WQM: |
| 662 | case AMDGPU::STRICT_WQM: |
| 663 | case AMDGPU::SOFT_WQM: |
| 664 | case AMDGPU::STRICT_WWM: |
| 665 | case AMDGPU::INSERT_SUBREG: |
| 666 | case AMDGPU::PHI: |
| 667 | case AMDGPU::REG_SEQUENCE: { |
| 668 | if (TRI->isSGPRClass(RC: TII->getOpRegClass(MI, OpNo: 0))) { |
| 669 | for (MachineOperand &MO : MI.operands()) { |
| 670 | if (!MO.isReg() || !MO.getReg().isVirtual()) |
| 671 | continue; |
| 672 | const TargetRegisterClass *SrcRC = MRI->getRegClass(Reg: MO.getReg()); |
| 673 | if (SrcRC == &AMDGPU::VReg_1RegClass) |
| 674 | continue; |
| 675 | |
| 676 | if (TRI->hasVectorRegisters(RC: SrcRC)) { |
| 677 | const TargetRegisterClass *DestRC = |
| 678 | TRI->getEquivalentSGPRClass(VRC: SrcRC); |
| 679 | Register NewDst = MRI->createVirtualRegister(RegClass: DestRC); |
| 680 | MachineBasicBlock *BlockToInsertCopy = |
| 681 | MI.isPHI() ? MI.getOperand(i: MO.getOperandNo() + 1).getMBB() |
| 682 | : &MBB; |
| 683 | MachineBasicBlock::iterator PointToInsertCopy = |
| 684 | MI.isPHI() ? BlockToInsertCopy->getFirstInstrTerminator() : I; |
| 685 | |
| 686 | const DebugLoc &DL = MI.getDebugLoc(); |
| 687 | if (!tryMoveVGPRConstToSGPR(MO, NewDst, BlockToInsertTo: BlockToInsertCopy, |
| 688 | PointToInsertTo: PointToInsertCopy, DL)) { |
| 689 | MachineInstr *NewCopy = |
| 690 | BuildMI(BB&: *BlockToInsertCopy, I: PointToInsertCopy, MIMD: DL, |
| 691 | MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: NewDst) |
| 692 | .addReg(RegNo: MO.getReg()); |
| 693 | MO.setReg(NewDst); |
| 694 | analyzeVGPRToSGPRCopy(MI: NewCopy); |
| 695 | PHISources.insert(V: NewCopy); |
| 696 | } |
| 697 | } |
| 698 | } |
| 699 | } |
| 700 | |
| 701 | if (MI.isPHI()) |
| 702 | PHINodes.push_back(Elt: &MI); |
| 703 | else if (MI.isRegSequence()) |
| 704 | RegSequences.push_back(Elt: &MI); |
| 705 | |
| 706 | break; |
| 707 | } |
| 708 | case AMDGPU::V_WRITELANE_B32: { |
| 709 | // Some architectures allow more than one constant bus access without |
| 710 | // SGPR restriction |
| 711 | if (ST.getConstantBusLimit(Opcode: MI.getOpcode()) != 1) |
| 712 | break; |
| 713 | |
| 714 | // Writelane is special in that it can use SGPR and M0 (which would |
| 715 | // normally count as using the constant bus twice - but in this case it |
| 716 | // is allowed since the lane selector doesn't count as a use of the |
| 717 | // constant bus). However, it is still required to abide by the 1 SGPR |
| 718 | // rule. Apply a fix here as we might have multiple SGPRs after |
| 719 | // legalizing VGPRs to SGPRs |
| 720 | int Src0Idx = |
| 721 | AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src0); |
| 722 | int Src1Idx = |
| 723 | AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src1); |
| 724 | MachineOperand &Src0 = MI.getOperand(i: Src0Idx); |
| 725 | MachineOperand &Src1 = MI.getOperand(i: Src1Idx); |
| 726 | |
| 727 | // Check to see if the instruction violates the 1 SGPR rule |
| 728 | if ((Src0.isReg() && TRI->isSGPRReg(MRI: *MRI, Reg: Src0.getReg()) && |
| 729 | Src0.getReg() != AMDGPU::M0) && |
| 730 | (Src1.isReg() && TRI->isSGPRReg(MRI: *MRI, Reg: Src1.getReg()) && |
| 731 | Src1.getReg() != AMDGPU::M0)) { |
| 732 | |
| 733 | // Check for trivially easy constant propagation into one of the operands. |
| 734 | // If this is the case, then perform the propagation now to resolve the SGPR |
| 735 | // issue. If we don't do that here, we will always insert a mov to m0 |
| 736 | // that can't be resolved by the later operand-folding pass. |
| 737 | bool Resolved = false; |
| 738 | for (MachineOperand *MO : {&Src0, &Src1}) { |
| 739 | if (MO->getReg().isVirtual()) { |
| 740 | MachineInstr *DefMI = MRI->getVRegDef(Reg: MO->getReg()); |
| 741 | if (DefMI && TII->isFoldableCopy(MI: *DefMI)) { |
| 742 | const MachineOperand &Def = DefMI->getOperand(i: 0); |
| 743 | if (Def.isReg() && |
| 744 | MO->getReg() == Def.getReg() && |
| 745 | MO->getSubReg() == Def.getSubReg()) { |
| 746 | const MachineOperand &Copied = DefMI->getOperand(i: 1); |
| 747 | if (Copied.isImm() && |
| 748 | TII->isInlineConstant(Imm: APInt(64, Copied.getImm(), true))) { |
| 749 | MO->ChangeToImmediate(ImmVal: Copied.getImm()); |
| 750 | Resolved = true; |
| 751 | break; |
| 752 | } |
| 753 | } |
| 754 | } |
| 755 | } |
| 756 | } |
| 757 | |
| 758 | if (!Resolved) { |
| 759 | // We haven't managed to resolve this by replacing an SGPR with an |
| 760 | // immediate, so move src1 into M0 instead. |
| 761 | BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: MI.getDebugLoc(), |
| 762 | MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0) |
| 763 | .add(MO: Src1); |
| 764 | Src1.ChangeToRegister(Reg: AMDGPU::M0, isDef: false); |
| 765 | } |
| 766 | } |
| 767 | break; |
| 768 | } |
| 769 | } |
| 770 | } |
| 771 | } |
| 772 | |
| 773 | lowerVGPR2SGPRCopies(MF); |
| 774 | // Postprocessing |
| 775 | fixSCCCopies(MF); |
| 776 | for (auto *MI : S2VCopies) { |
| 777 | // Check if it is still valid |
| 778 | if (MI->isCopy()) { |
| 779 | const TargetRegisterClass *SrcRC, *DstRC; |
| 780 | std::tie(args&: SrcRC, args&: DstRC) = getCopyRegClasses(Copy: *MI, TRI: *TRI, MRI: *MRI); |
| 781 | if (isSGPRToVGPRCopy(SrcRC, DstRC, TRI: *TRI)) |
| 782 | tryChangeVGPRtoSGPRinCopy(MI&: *MI, TRI, TII); |
| 783 | } |
| 784 | } |
| 785 | for (auto *MI : RegSequences) { |
| 786 | // Check if it is still valid |
| 787 | if (MI->isRegSequence()) |
| 788 | foldVGPRCopyIntoRegSequence(MI&: *MI, TRI, TII, MRI&: *MRI); |
| 789 | } |
| 790 | for (auto *MI : PHINodes) { |
| 791 | processPHINode(MI&: *MI); |
| 792 | } |
| 793 | if (MF.getTarget().getOptLevel() > CodeGenOptLevel::None && EnableM0Merge) |
| 794 | hoistAndMergeSGPRInits(Reg: AMDGPU::M0, MRI: *MRI, TRI, MDT&: *MDT, TII); |
| 795 | |
| 796 | SiblingPenalty.clear(); |
| 797 | V2SCopies.clear(); |
| 798 | SCCCopies.clear(); |
| 799 | RegSequences.clear(); |
| 800 | PHINodes.clear(); |
| 801 | S2VCopies.clear(); |
| 802 | PHISources.clear(); |
| 803 | |
| 804 | return true; |
| 805 | } |
| 806 | |
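// Constrain the register class of a PHI result based on its transitive uses:
// move the PHI to an AGPR class if every use is an AGPR (or a copy into one),
// and legalize its operands if the result is a vector register. The new
// register class is then propagated to PHI operands that are themselves PHIs.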
| 807 | void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { |
| 808 | bool AllAGPRUses = true; |
| 809 | SetVector<const MachineInstr *> worklist; |
| 810 | SmallSet<const MachineInstr *, 4> Visited; |
| 811 | SetVector<MachineInstr *> PHIOperands; |
| 812 | worklist.insert(X: &MI); |
| 813 | Visited.insert(Ptr: &MI); |
| 814 | // HACK to make MIR tests with no uses happy |
| 815 | bool HasUses = false; |
| 816 | while (!worklist.empty()) { |
| 817 | const MachineInstr *Instr = worklist.pop_back_val(); |
| 818 | Register Reg = Instr->getOperand(i: 0).getReg(); |
| 819 | for (const auto &Use : MRI->use_operands(Reg)) { |
| 820 | HasUses = true; |
| 821 | const MachineInstr *UseMI = Use.getParent(); |
| 822 | AllAGPRUses &= (UseMI->isCopy() && |
| 823 | TRI->isAGPR(MRI: *MRI, Reg: UseMI->getOperand(i: 0).getReg())) || |
| 824 | TRI->isAGPR(MRI: *MRI, Reg: Use.getReg()); |
| 825 | if (UseMI->isCopy() || UseMI->isRegSequence()) { |
| 826 | if (Visited.insert(Ptr: UseMI).second) |
| 827 | worklist.insert(X: UseMI); |
| 828 | |
| 829 | continue; |
| 830 | } |
| 831 | } |
| 832 | } |
| 833 | |
| 834 | Register PHIRes = MI.getOperand(i: 0).getReg(); |
| 835 | const TargetRegisterClass *RC0 = MRI->getRegClass(Reg: PHIRes); |
| 836 | if (HasUses && AllAGPRUses && !TRI->isAGPRClass(RC: RC0)) { |
| 837 | LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI); |
| 838 | MRI->setRegClass(Reg: PHIRes, RC: TRI->getEquivalentAGPRClass(SRC: RC0)); |
| 839 | for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { |
| 840 | MachineInstr *DefMI = MRI->getVRegDef(Reg: MI.getOperand(i: I).getReg()); |
| 841 | if (DefMI && DefMI->isPHI()) |
| 842 | PHIOperands.insert(X: DefMI); |
| 843 | } |
| 844 | } |
| 845 | |
| 846 | if (TRI->isVectorRegister(MRI: *MRI, Reg: PHIRes) || |
| 847 | RC0 == &AMDGPU::VReg_1RegClass) { |
| 848 | LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI); |
| 849 | TII->legalizeOperands(MI, MDT); |
| 850 | } |
| 851 | |
| 852 | // Propagate register class back to PHI operands which are PHI themselves. |
| 853 | while (!PHIOperands.empty()) { |
| 854 | processPHINode(MI&: *PHIOperands.pop_back_val()); |
| 855 | } |
| 856 | } |
| 857 | |
| 858 | bool SIFixSGPRCopies::tryMoveVGPRConstToSGPR( |
| 859 | MachineOperand &MaybeVGPRConstMO, Register DstReg, |
| 860 | MachineBasicBlock *BlockToInsertTo, |
| 861 | MachineBasicBlock::iterator PointToInsertTo, const DebugLoc &DL) { |
| 862 | |
| 863 | MachineInstr *DefMI = MRI->getVRegDef(Reg: MaybeVGPRConstMO.getReg()); |
| 864 | if (!DefMI || !DefMI->isMoveImmediate()) |
| 865 | return false; |
| 866 | |
| 867 | MachineOperand *SrcConst = TII->getNamedOperand(MI&: *DefMI, OperandName: AMDGPU::OpName::src0); |
| 868 | if (SrcConst->isReg()) |
| 869 | return false; |
| 870 | |
| 871 | const TargetRegisterClass *SrcRC = |
| 872 | MRI->getRegClass(Reg: MaybeVGPRConstMO.getReg()); |
| 873 | unsigned MoveSize = TRI->getRegSizeInBits(RC: *SrcRC); |
| 874 | unsigned MoveOp = MoveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; |
| 875 | BuildMI(BB&: *BlockToInsertTo, I: PointToInsertTo, MIMD: DL, MCID: TII->get(Opcode: MoveOp), DestReg: DstReg) |
| 876 | .add(MO: *SrcConst); |
| 877 | if (MRI->hasOneUse(RegNo: MaybeVGPRConstMO.getReg())) |
| 878 | DefMI->eraseFromParent(); |
| 879 | MaybeVGPRConstMO.setReg(DstReg); |
| 880 | return true; |
| 881 | } |
| 882 | |
| 883 | bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, |
| 884 | MachineBasicBlock::iterator &I) { |
| 885 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 886 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 887 | if (!DstReg.isVirtual()) { |
| 888 | // If the destination register is a physical register there isn't |
| 889 | // really much we can do to fix this. |
| 890 | // Some special instructions use M0 as an input. Some even only use |
| 891 | // the first lane. Insert a readfirstlane and hope for the best. |
| 892 | if (DstReg == AMDGPU::M0 && |
| 893 | TRI->hasVectorRegisters(RC: MRI->getRegClass(Reg: SrcReg))) { |
| 894 | Register TmpReg = |
| 895 | MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass); |
| 896 | BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: MI.getDebugLoc(), |
| 897 | MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: TmpReg) |
| 898 | .add(MO: MI.getOperand(i: 1)); |
| 899 | MI.getOperand(i: 1).setReg(TmpReg); |
| 900 | } else if (tryMoveVGPRConstToSGPR(MaybeVGPRConstMO&: MI.getOperand(i: 1), DstReg, BlockToInsertTo: MI.getParent(), |
| 901 | PointToInsertTo: MI, DL: MI.getDebugLoc())) { |
| 902 | I = std::next(x: I); |
| 903 | MI.eraseFromParent(); |
| 904 | } |
| 905 | return true; |
| 906 | } |
| 907 | if (!SrcReg.isVirtual() || TRI->isAGPR(MRI: *MRI, Reg: SrcReg)) { |
| 908 | SIInstrWorklist worklist; |
| 909 | worklist.insert(MI: &MI); |
| 910 | TII->moveToVALU(Worklist&: worklist, MDT); |
| 911 | return true; |
| 912 | } |
| 913 | |
| 914 | unsigned SMovOp; |
| 915 | int64_t Imm; |
| 916 | // If we are just copying an immediate, we can replace the copy with |
| 917 | // s_mov_b32. |
| 918 | if (isSafeToFoldImmIntoCopy(Copy: &MI, MoveImm: MRI->getVRegDef(Reg: SrcReg), TII, SMovOp, Imm)) { |
| 919 | MI.getOperand(i: 1).ChangeToImmediate(ImmVal: Imm); |
| 920 | MI.addImplicitDefUseOperands(MF&: *MI.getParent()->getParent()); |
| 921 | MI.setDesc(TII->get(Opcode: SMovOp)); |
| 922 | return true; |
| 923 | } |
| 924 | return false; |
| 925 | } |
| 926 | |
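// Walk the SSA graph reachable from a VGPR to SGPR copy and record it as a
// V2SCopyInfo: the chain of SALU instructions the copy feeds (SChain), the
// number of SGPR to VGPR copies moving results back to the VALU (NumSVCopies),
// and, via SiblingPenalty, which other V2S copies share parts of that chain.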
| 927 | void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) { |
| 928 | if (PHISources.contains(V: MI)) |
| 929 | return; |
| 930 | Register DstReg = MI->getOperand(i: 0).getReg(); |
| 931 | const TargetRegisterClass *DstRC = MRI->getRegClass(Reg: DstReg); |
| 932 | |
| 933 | V2SCopyInfo Info(getNextVGPRToSGPRCopyId(), MI, |
| 934 | TRI->getRegSizeInBits(RC: *DstRC)); |
| 935 | SmallVector<MachineInstr *, 8> AnalysisWorklist; |
| 936 | // Needed because the SSA is not a tree but a graph and may have |
| 937 | // forks and joins. We should not walk the same path twice. |
| 938 | DenseSet<MachineInstr *> Visited; |
| 939 | AnalysisWorklist.push_back(Elt: Info.Copy); |
| 940 | while (!AnalysisWorklist.empty()) { |
| 941 | |
| 942 | MachineInstr *Inst = AnalysisWorklist.pop_back_val(); |
| 943 | |
| 944 | if (!Visited.insert(V: Inst).second) |
| 945 | continue; |
| 946 | |
| 947 | // Copies and REG_SEQUENCEs do not contribute to the final assembly, |
| 948 | // so skip them, but take care of the SGPR to VGPR copy bookkeeping. |
| 949 | if (Inst->isCopy() || Inst->isRegSequence()) { |
| 950 | if (TRI->isVGPR(MRI: *MRI, Reg: Inst->getOperand(i: 0).getReg())) { |
| 951 | if (!Inst->isCopy() || |
| 952 | !tryChangeVGPRtoSGPRinCopy(MI&: *Inst, TRI, TII)) { |
| 953 | Info.NumSVCopies++; |
| 954 | continue; |
| 955 | } |
| 956 | } |
| 957 | } |
| 958 | |
| 959 | SiblingPenalty[Inst].insert(X: Info.ID); |
| 960 | |
| 961 | SmallVector<MachineInstr *, 4> Users; |
| 962 | if ((TII->isSALU(MI: *Inst) && Inst->isCompare()) || |
| 963 | (Inst->isCopy() && Inst->getOperand(i: 0).getReg() == AMDGPU::SCC)) { |
| 964 | auto I = Inst->getIterator(); |
| 965 | auto E = Inst->getParent()->end(); |
| 966 | while (++I != E && |
| 967 | !I->findRegisterDefOperand(Reg: AMDGPU::SCC, /*TRI=*/nullptr)) { |
| 968 | if (I->readsRegister(Reg: AMDGPU::SCC, /*TRI=*/nullptr)) |
| 969 | Users.push_back(Elt: &*I); |
| 970 | } |
| 971 | } else if (Inst->getNumExplicitDefs() != 0) { |
| 972 | Register Reg = Inst->getOperand(i: 0).getReg(); |
| 973 | if (Reg.isVirtual() && TRI->isSGPRReg(MRI: *MRI, Reg) && !TII->isVALU(MI: *Inst)) { |
| 974 | for (auto &U : MRI->use_instructions(Reg)) |
| 975 | Users.push_back(Elt: &U); |
| 976 | } |
| 977 | } |
| 978 | for (auto *U : Users) { |
| 979 | if (TII->isSALU(MI: *U)) |
| 980 | Info.SChain.insert(X: U); |
| 981 | AnalysisWorklist.push_back(Elt: U); |
| 982 | } |
| 983 | } |
| 984 | V2SCopies[Info.ID] = Info; |
| 985 | } |
| 986 | |
| 987 | // The main function that computes the VGPR to SGPR copy score and determines |
| 988 | // how the copy is further lowered: v_readfirstlane_b32 or moveToVALU. |
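// For example (with illustrative numbers): a copy whose SChain contains six
// SALU instructions, with one SGPR to VGPR copy feeding results back to the
// VALU, a sibling penalty of one and a 32-bit source (one
// v_readfirstlane_b32) scores 6 - (1 + 1 + 1) = 3 and is kept scalar;
// anything scoring below 3 is converted to VALU instead.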
| 989 | bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) { |
| 990 | if (Info->SChain.empty()) { |
| 991 | Info->Score = 0; |
| 992 | return true; |
| 993 | } |
| 994 | Info->Siblings = SiblingPenalty[*llvm::max_element( |
| 995 | Range&: Info->SChain, C: [&](MachineInstr *A, MachineInstr *B) -> bool { |
| 996 | return SiblingPenalty[A].size() < SiblingPenalty[B].size(); |
| 997 | })]; |
| 998 | Info->Siblings.remove_if(P: [&](unsigned ID) { return ID == Info->ID; }); |
| 999 | // The loop below computes the number of other VGPR to SGPR V2SCopies |
| 1000 | // which contribute to the current copy's SALU chain. We assume that all the |
| 1001 | // V2SCopies with the same source virtual register will be squashed into one |
| 1002 | // by regalloc. We also take care of the V2SCopies of the different subregs |
| 1003 | // of the same register. |
| 1004 | SmallSet<std::pair<Register, unsigned>, 4> SrcRegs; |
| 1005 | for (auto J : Info->Siblings) { |
| 1006 | auto *InfoIt = V2SCopies.find(Key: J); |
| 1007 | if (InfoIt != V2SCopies.end()) { |
| 1008 | MachineInstr *SiblingCopy = InfoIt->second.Copy; |
| 1009 | if (SiblingCopy->isImplicitDef()) |
| 1010 | // the COPY has already been MoveToVALUed |
| 1011 | continue; |
| 1012 | |
| 1013 | SrcRegs.insert(V: std::pair(SiblingCopy->getOperand(i: 1).getReg(), |
| 1014 | SiblingCopy->getOperand(i: 1).getSubReg())); |
| 1015 | } |
| 1016 | } |
| 1017 | Info->SiblingPenalty = SrcRegs.size(); |
| 1018 | |
| 1019 | unsigned Penalty = |
| 1020 | Info->NumSVCopies + Info->SiblingPenalty + Info->NumReadfirstlanes; |
| 1021 | unsigned Profit = Info->SChain.size(); |
| 1022 | Info->Score = Penalty > Profit ? 0 : Profit - Penalty; |
| 1023 | Info->NeedToBeConvertedToVALU = Info->Score < 3; |
| 1024 | return Info->NeedToBeConvertedToVALU; |
| 1025 | } |
| 1026 | |
| 1027 | void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { |
| 1028 | |
| 1029 | SmallVector<unsigned, 8> LoweringWorklist; |
| 1030 | for (auto &C : V2SCopies) { |
| 1031 | if (needToBeConvertedToVALU(Info: &C.second)) |
| 1032 | LoweringWorklist.push_back(Elt: C.second.ID); |
| 1033 | } |
| 1034 | |
| 1035 | // Store all the V2S copy instructions that need to be moved to VALU |
| 1036 | // in the Copies worklist. |
| 1037 | SIInstrWorklist Copies; |
| 1038 | |
| 1039 | while (!LoweringWorklist.empty()) { |
| 1040 | unsigned CurID = LoweringWorklist.pop_back_val(); |
| 1041 | auto *CurInfoIt = V2SCopies.find(Key: CurID); |
| 1042 | if (CurInfoIt != V2SCopies.end()) { |
| 1043 | V2SCopyInfo C = CurInfoIt->second; |
| 1044 | LLVM_DEBUG(dbgs() << "Processing ...\n" ; C.dump()); |
| 1045 | for (auto S : C.Siblings) { |
| 1046 | auto *SibInfoIt = V2SCopies.find(Key: S); |
| 1047 | if (SibInfoIt != V2SCopies.end()) { |
| 1048 | V2SCopyInfo &SI = SibInfoIt->second; |
| 1049 | LLVM_DEBUG(dbgs() << "Sibling:\n" ; SI.dump()); |
| 1050 | if (!SI.NeedToBeConvertedToVALU) { |
| 1051 | SI.SChain.set_subtract(C.SChain); |
| 1052 | if (needToBeConvertedToVALU(Info: &SI)) |
| 1053 | LoweringWorklist.push_back(Elt: SI.ID); |
| 1054 | } |
| 1055 | SI.Siblings.remove_if(P: [&](unsigned ID) { return ID == C.ID; }); |
| 1056 | } |
| 1057 | } |
| 1058 | LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy |
| 1059 | << " is being turned to VALU\n" ); |
| 1060 | // TODO: MapVector::erase is inefficient. Do bulk removal with remove_if |
| 1061 | // instead. |
| 1062 | V2SCopies.erase(Key: C.ID); |
| 1063 | Copies.insert(MI: C.Copy); |
| 1064 | } |
| 1065 | } |
| 1066 | |
| 1067 | TII->moveToVALU(Worklist&: Copies, MDT); |
| 1068 | Copies.clear(); |
| 1069 | |
| 1070 | // Now do actual lowering |
| 1071 | for (auto C : V2SCopies) { |
| 1072 | MachineInstr *MI = C.second.Copy; |
| 1073 | MachineBasicBlock *MBB = MI->getParent(); |
| 1074 | // We decided to turn this V2S copy into a v_readfirstlane_b32; |
| 1075 | // remove it from V2SCopies and from all of its siblings. |
| 1076 | LLVM_DEBUG(dbgs() << "V2S copy " << *MI |
| 1077 | << " is being turned to v_readfirstlane_b32" |
| 1078 | << " Score: " << C.second.Score << "\n"); |
| 1079 | Register DstReg = MI->getOperand(i: 0).getReg(); |
| 1080 | MRI->constrainRegClass(Reg: DstReg, RC: &AMDGPU::SReg_32_XM0RegClass); |
| 1081 | |
| 1082 | Register SrcReg = MI->getOperand(i: 1).getReg(); |
| 1083 | unsigned SubReg = MI->getOperand(i: 1).getSubReg(); |
| 1084 | const TargetRegisterClass *SrcRC = |
| 1085 | TRI->getRegClassForOperandReg(MRI: *MRI, MO: MI->getOperand(i: 1)); |
| 1086 | size_t SrcSize = TRI->getRegSizeInBits(RC: *SrcRC); |
| 1087 | if (SrcSize == 16) { |
| 1088 | assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() && |
| 1089 | "We do not expect to see 16-bit copies from VGPR to SGPR unless " |
| 1090 | "we have 16-bit VGPRs" ); |
| 1091 | assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass || |
| 1092 | MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass || |
| 1093 | MRI->getRegClass(DstReg) == &AMDGPU::SReg_32_XM0RegClass); |
| 1094 | // There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32 bits |
| 1095 | MRI->setRegClass(Reg: DstReg, RC: &AMDGPU::SReg_32_XM0RegClass); |
| 1096 | Register VReg32 = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
| 1097 | const DebugLoc &DL = MI->getDebugLoc(); |
| 1098 | Register Undef = MRI->createVirtualRegister(RegClass: &AMDGPU::VGPR_16RegClass); |
| 1099 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Undef); |
| 1100 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: VReg32) |
| 1101 | .addReg(RegNo: SrcReg, flags: 0, SubReg) |
| 1102 | .addImm(Val: AMDGPU::lo16) |
| 1103 | .addReg(RegNo: Undef) |
| 1104 | .addImm(Val: AMDGPU::hi16); |
| 1105 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg) |
| 1106 | .addReg(RegNo: VReg32); |
| 1107 | } else if (SrcSize == 32) { |
| 1108 | auto MIB = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), |
| 1109 | MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg); |
| 1110 | MIB.addReg(RegNo: SrcReg, flags: 0, SubReg); |
| 1111 | } else { |
| 1112 | auto Result = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), |
| 1113 | MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: DstReg); |
| 1114 | int N = TRI->getRegSizeInBits(RC: *SrcRC) / 32; |
| 1115 | for (int i = 0; i < N; i++) { |
| 1116 | Register PartialSrc = TII->buildExtractSubReg( |
| 1117 | MI: Result, MRI&: *MRI, SuperReg: MI->getOperand(i: 1), SuperRC: SrcRC, |
| 1118 | SubIdx: TRI->getSubRegFromChannel(Channel: i), SubRC: &AMDGPU::VGPR_32RegClass); |
| 1119 | Register PartialDst = |
| 1120 | MRI->createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass); |
| 1121 | BuildMI(BB&: *MBB, I&: *Result, MIMD: Result->getDebugLoc(), |
| 1122 | MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: PartialDst) |
| 1123 | .addReg(RegNo: PartialSrc); |
| 1124 | Result.addReg(RegNo: PartialDst).addImm(Val: TRI->getSubRegFromChannel(Channel: i)); |
| 1125 | } |
| 1126 | } |
| 1127 | MI->eraseFromParent(); |
| 1128 | } |
| 1129 | } |
| 1130 | |
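// Rewrite copies involving SCC: a copy from SCC is materialized as an
// S_CSELECT of -1/0 into a wave-mask register, and a copy to SCC becomes an
// S_AND of the source with EXEC, whose SCC def supplies the value.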
| 1131 | void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) { |
| 1132 | bool IsWave32 = MF.getSubtarget<GCNSubtarget>().isWave32(); |
| 1133 | for (MachineBasicBlock &MBB : MF) { |
| 1134 | for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; |
| 1135 | ++I) { |
| 1136 | MachineInstr &MI = *I; |
| 1137 | // May already have been lowered. |
| 1138 | if (!MI.isCopy()) |
| 1139 | continue; |
| 1140 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
| 1141 | Register DstReg = MI.getOperand(i: 0).getReg(); |
| 1142 | if (SrcReg == AMDGPU::SCC) { |
| 1143 | Register SCCCopy = |
| 1144 | MRI->createVirtualRegister(RegClass: TRI->getWaveMaskRegClass()); |
| 1145 | I = BuildMI(BB&: *MI.getParent(), I: std::next(x: MachineBasicBlock::iterator(MI)), |
| 1146 | MIMD: MI.getDebugLoc(), |
| 1147 | MCID: TII->get(Opcode: IsWave32 ? AMDGPU::S_CSELECT_B32 |
| 1148 | : AMDGPU::S_CSELECT_B64), |
| 1149 | DestReg: SCCCopy) |
| 1150 | .addImm(Val: -1) |
| 1151 | .addImm(Val: 0); |
| 1152 | I = BuildMI(BB&: *MI.getParent(), I: std::next(x: I), MIMD: I->getDebugLoc(), |
| 1153 | MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: DstReg) |
| 1154 | .addReg(RegNo: SCCCopy); |
| 1155 | MI.eraseFromParent(); |
| 1156 | continue; |
| 1157 | } |
| 1158 | if (DstReg == AMDGPU::SCC) { |
| 1159 | unsigned Opcode = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; |
| 1160 | Register Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; |
| 1161 | Register Tmp = MRI->createVirtualRegister(RegClass: TRI->getBoolRC()); |
| 1162 | I = BuildMI(BB&: *MI.getParent(), I: std::next(x: MachineBasicBlock::iterator(MI)), |
| 1163 | MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode)) |
| 1164 | .addReg(RegNo: Tmp, flags: getDefRegState(B: true)) |
| 1165 | .addReg(RegNo: SrcReg) |
| 1166 | .addReg(RegNo: Exec); |
| 1167 | MI.eraseFromParent(); |
| 1168 | } |
| 1169 | } |
| 1170 | } |
| 1171 | } |
| 1172 | |
| 1173 | PreservedAnalyses |
| 1174 | SIFixSGPRCopiesPass::run(MachineFunction &MF, |
| 1175 | MachineFunctionAnalysisManager &MFAM) { |
| 1176 | MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(IR&: MF); |
| 1177 | SIFixSGPRCopies Impl(&MDT); |
| 1178 | bool Changed = Impl.run(MF); |
| 1179 | if (!Changed) |
| 1180 | return PreservedAnalyses::all(); |
| 1181 | |
| 1182 | // TODO: We could detect whether the CFG changed. |
| 1183 | auto PA = getMachineFunctionPassPreservedAnalyses(); |
| 1184 | return PA; |
| 1185 | } |
| 1186 | |