SIShrinkInstructions.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp]

1	//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	/// The pass tries to use the 32-bit encoding for instructions when possible.
8	//===----------------------------------------------------------------------===//
9	//
10
11	#include "SIShrinkInstructions.h"
12	#include "AMDGPU.h"
13	#include "GCNSubtarget.h"
14	#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
15	#include "Utils/AMDGPUBaseInfo.h"
16	#include "llvm/ADT/Statistic.h"
17	#include "llvm/CodeGen/MachineFunctionPass.h"
18
19	#define DEBUG_TYPE "si-shrink-instructions"
20
21	STATISTIC(NumInstructionsShrunk,
22	"Number of 64-bit instruction reduced to 32-bit.");
23	STATISTIC(NumLiteralConstantsFolded,
24	"Number of literal constants folded into 32-bit instructions.");
25
26	using namespace llvm;
27
28	namespace {
29
/// Result of a shrinking attempt: nothing changed, only register allocation
/// hints were updated, or the instruction itself was modified.
enum ChangeKind { None, UpdateHint, UpdateInst };
31
32	class SIShrinkInstructions {
33	MachineFunction *MF;
34	MachineRegisterInfo *MRI;
35	const GCNSubtarget *ST;
36	const SIInstrInfo *TII;
37	const SIRegisterInfo *TRI;
38	bool IsPostRA;
39
40	bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
41	bool shouldShrinkTrue16(MachineInstr &MI) const;
42	bool isKImmOperand(const MachineOperand &Src) const;
43	bool isKUImmOperand(const MachineOperand &Src) const;
44	bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
45	void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
46	bool shrinkScalarCompare(MachineInstr &MI) const;
47	bool shrinkMIMG(MachineInstr &MI) const;
48	bool shrinkMadFma(MachineInstr &MI) const;
49	ChangeKind shrinkScalarLogicOp(MachineInstr &MI) const;
50	bool tryReplaceDeadSDST(MachineInstr &MI) const;
51	bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
52	Register Reg, unsigned SubReg) const;
53	bool instReadsReg(const MachineInstr MI, unsigned* Reg,
54	unsigned SubReg) const;
55	bool instModifiesReg(const MachineInstr MI, unsigned* Reg,
56	unsigned SubReg) const;
57	TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
58	unsigned I) const;
59	void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
60	MachineInstr matchSwap(MachineInstr &MovT) const*;
61
62	public:
63	SIShrinkInstructions() = default;
64	bool run(MachineFunction &MF);
65	};
66
67	class SIShrinkInstructionsLegacy : public MachineFunctionPass {
68
69	public:
70	static char ID;
71
72	SIShrinkInstructionsLegacy() : MachineFunctionPass (ID) {}
73
74	bool runOnMachineFunction(MachineFunction &MF) override;
75
76	StringRef getPassName() const override { return "SI Shrink Instructions"; }
77
78	void getAnalysisUsage(AnalysisUsage &AU) const override {
79	AU.setPreservesCFG();
80	MachineFunctionPass::getAnalysisUsage(AU);
81	}
82	};
83
84	} // End anonymous namespace.
85
86	INITIALIZE_PASS(SIShrinkInstructionsLegacy, DEBUG_TYPE,
87	"SI Shrink Instructions", false, false)
88
89	char SIShrinkInstructionsLegacy::ID = `0`;
90
91	FunctionPass *llvm::createSIShrinkInstructionsLegacyPass() {
92	return new SIShrinkInstructionsLegacy ();
93	}
94
95	/// This function checks \p MI for operands defined by a move immediate
96	/// instruction and then folds the literal constant into the instruction if it
97	/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instructions.
98	bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
99	bool TryToCommute) const {
100	assert(TII->isVOP1(MI) \|\| TII->isVOP2(MI) \|\| TII->isVOPC(MI));
101
102	int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src0);
103
104	// Try to fold Src0
105	MachineOperand &Src0 = MI.getOperand(i: Src0Idx);
106	if (Src0.isReg()) {
107	Register Reg = Src0.getReg();
108	if (Reg.isVirtual()) {
109	MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
110	if (Def && Def->isMoveImmediate()) {
111	MachineOperand &MovSrc = Def->getOperand(i: `1`);
112	bool ConstantFolded = false;
113
114	if (TII->isOperandLegal(MI, OpIdx: Src0Idx, MO: &MovSrc)) {
115	if (MovSrc.isImm()) {
116	Src0.ChangeToImmediate(ImmVal: MovSrc.getImm());
117	ConstantFolded = true;
118	} else if (MovSrc.isFI()) {
119	Src0.ChangeToFrameIndex(Idx: MovSrc.getIndex());
120	ConstantFolded = true;
121	} else if (MovSrc.isGlobal()) {
122	Src0.ChangeToGA(GV: MovSrc.getGlobal(), Offset: MovSrc.getOffset(),
123	TargetFlags: MovSrc.getTargetFlags());
124	ConstantFolded = true;
125	}
126	}
127
128	if (ConstantFolded) {
129	if (MRI->use_nodbg_empty(RegNo: Reg))
130	Def->eraseFromParent();
131	++NumLiteralConstantsFolded;
132	return true;
133	}
134	}
135	}
136	}
137
138	// We have failed to fold src0, so commute the instruction and try again.
139	if (TryToCommute && MI.isCommutable()) {
140	if (TII->commuteInstruction(MI)) {
141	if (foldImmediates(MI, TryToCommute: false))
142	return true;
143
144	// Commute back.
145	TII->commuteInstruction(MI);
146	}
147	}
148
149	return false;
150	}
151
152	/// Do not shrink the instruction if its registers are not expressible in the
153	/// shrunk encoding.
154	bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const {
155	for (unsigned I = `0`, E = MI.getNumExplicitOperands(); I != E; ++I) {
156	const MachineOperand &MO = MI.getOperand(i: I);
157	if (MO.isReg()) {
158	Register Reg = MO.getReg();
159	assert(!Reg.isVirtual() && "Prior checks should ensure we only shrink "
160	"True16 Instructions post-RA");
161	if (AMDGPU::VGPR_32RegClass.contains(Reg) &&
162	!AMDGPU::VGPR_32_Lo128RegClass.contains(Reg))
163	return false;
164
165	if (AMDGPU::VGPR_16RegClass.contains(Reg) &&
166	!AMDGPU::VGPR_16_Lo128RegClass.contains(Reg))
167	return false;
168	}
169	}
170	return true;
171	}
172
173	bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
174	return isInt<`16`>(x: SignExtend64(X: Src.getImm(), B: `32`)) &&
175	!TII->isInlineConstant(MI: *Src.getParent(), OpIdx: Src.getOperandNo());
176	}
177
178	bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
179	return isUInt<`16`>(x: Src.getImm()) &&
180	!TII->isInlineConstant(MI: *Src.getParent(), OpIdx: Src.getOperandNo());
181	}
182
183	bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
184	bool &IsUnsigned) const {
185	if (isInt<`16`>(x: SignExtend64(X: Src.getImm(), B: `32`))) {
186	IsUnsigned = false;
187	return !TII->isInlineConstant(MO: Src);
188	}
189
190	if (isUInt<`16`>(x: Src.getImm())) {
191	IsUnsigned = true;
192	return !TII->isInlineConstant(MO: Src);
193	}
194
195	return false;
196	}
197
198	/// \returns the opcode of an instruction a move immediate of the constant \p
199	/// Src can be replaced with if the constant is replaced with \p ModifiedImm.
200	/// i.e.
201	///
202	/// If the bitreverse of a constant is an inline immediate, reverse the
203	/// immediate and return the bitreverse opcode.
204	///
205	/// If the bitwise negation of a constant is an inline immediate, reverse the
206	/// immediate and return the bitwise not opcode.
207	static unsigned canModifyToInlineImmOp32(const SIInstrInfo *TII,
208	const MachineOperand &Src,
209	int32_t &ModifiedImm, bool Scalar) {
210	if (TII->isInlineConstant(MO: Src))
211	return `0`;
212	int32_t SrcImm = static_cast<int32_t>(Src.getImm());
213
214	if (!Scalar) {
215	// We could handle the scalar case with here, but we would need to check
216	// that SCC is not live as S_NOT_B32 clobbers it. It's probably not worth
217	// it, as the reasonable values are already covered by s_movk_i32.
218	ModifiedImm = ~SrcImm;
219	if (TII->isInlineConstant(Imm: APInt (`32`, ModifiedImm, true)))
220	return AMDGPU::V_NOT_B32_e32;
221	}
222
223	ModifiedImm = reverseBits<int32_t>(Val: SrcImm);
224	if (TII->isInlineConstant(Imm: APInt (`32`, ModifiedImm, true)))
225	return Scalar ? AMDGPU::S_BREV_B32 : AMDGPU::V_BFREV_B32_e32;
226
227	return `0`;
228	}
229
230	/// Copy implicit register operands from specified instruction to this
231	/// instruction that are not part of the instruction definition.
232	void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
233	MachineInstr &MI) const {
234	MachineFunction &MF = *MI.getMF();
235	for (unsigned i = MI.getDesc().getNumOperands() +
236	MI.getDesc().implicit_uses().size() +
237	MI.getDesc().implicit_defs().size(),
238	e = MI.getNumOperands();
239	i != e; ++i) {
240	const MachineOperand &MO = MI.getOperand(i);
241	if ((MO.isReg() && MO.isImplicit()) \|\| MO.isRegMask())
242	NewMI.addOperand(MF, Op: MO);
243	}
244	}
245
246	bool SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
247	if (!ST->hasSCmpK())
248	return false;
249
250	// cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
251	// get constants on the RHS.
252	bool Changed = false;
253	if (!MI.getOperand(i: `0`).isReg()) {
254	if (TII->commuteInstruction(MI, NewMI: false, OpIdx1: `0`, OpIdx2: `1`))
255	Changed = true;
256	}
257
258	// cmpk requires src0 to be a register
259	const MachineOperand &Src0 = MI.getOperand(i: `0`);
260	if (!Src0.isReg())
261	return Changed;
262
263	MachineOperand &Src1 = MI.getOperand(i: `1`);
264	if (!Src1.isImm())
265	return Changed;
266
267	int SOPKOpc = AMDGPU::getSOPKOp(Opcode: MI.getOpcode());
268	if (SOPKOpc == -`1`)
269	return Changed;
270
271	// eq/ne is special because the imm16 can be treated as signed or unsigned,
272	// and initially selected to the unsigned versions.
273	if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 \|\| SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
274	bool HasUImm;
275	if (isKImmOrKUImmOperand(Src: Src1, IsUnsigned&: HasUImm)) {
276	if (!HasUImm) {
277	SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
278	AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
279	Src1.setImm(SignExtend32(X: Src1.getImm(), B: `32`));
280	}
281
282	MI.setDesc(TII->get(Opcode: SOPKOpc));
283	Changed = true;
284	}
285
286	return Changed;
287	}
288
289	const MCInstrDesc &NewDesc = TII->get(Opcode: SOPKOpc);
290
291	if ((SIInstrInfo::sopkIsZext(Opcode: SOPKOpc) && isKUImmOperand(Src: Src1)) \|\|
292	(!SIInstrInfo::sopkIsZext(Opcode: SOPKOpc) && isKImmOperand(Src: Src1))) {
293	if (!SIInstrInfo::sopkIsZext(Opcode: SOPKOpc))
294	Src1.setImm(SignExtend64(X: Src1.getImm(), B: `32`));
295	MI.setDesc(NewDesc);
296	Changed = true;
297	}
298	return Changed;
299	}
300
301	// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
302	bool SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
303	const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: MI.getOpcode());
304	if (!Info)
305	return false;
306
307	uint8_t NewEncoding;
308	switch (Info->MIMGEncoding) {
309	case AMDGPU::MIMGEncGfx10NSA:
310	NewEncoding = AMDGPU::MIMGEncGfx10Default;
311	break;
312	case AMDGPU::MIMGEncGfx11NSA:
313	NewEncoding = AMDGPU::MIMGEncGfx11Default;
314	break;
315	default:
316	return false;
317	}
318
319	int VAddr0Idx =
320	AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::vaddr0);
321	unsigned NewAddrDwords = Info->VAddrDwords;
322	const TargetRegisterClass *RC;
323
324	if (Info->VAddrDwords == `2`) {
325	RC = &AMDGPU::VReg_64RegClass;
326	} else if (Info->VAddrDwords == `3`) {
327	RC = &AMDGPU::VReg_96RegClass;
328	} else if (Info->VAddrDwords == `4`) {
329	RC = &AMDGPU::VReg_128RegClass;
330	} else if (Info->VAddrDwords == `5`) {
331	RC = &AMDGPU::VReg_160RegClass;
332	} else if (Info->VAddrDwords == `6`) {
333	RC = &AMDGPU::VReg_192RegClass;
334	} else if (Info->VAddrDwords == `7`) {
335	RC = &AMDGPU::VReg_224RegClass;
336	} else if (Info->VAddrDwords == `8`) {
337	RC = &AMDGPU::VReg_256RegClass;
338	} else if (Info->VAddrDwords == `9`) {
339	RC = &AMDGPU::VReg_288RegClass;
340	} else if (Info->VAddrDwords == `10`) {
341	RC = &AMDGPU::VReg_320RegClass;
342	} else if (Info->VAddrDwords == `11`) {
343	RC = &AMDGPU::VReg_352RegClass;
344	} else if (Info->VAddrDwords == `12`) {
345	RC = &AMDGPU::VReg_384RegClass;
346	} else {
347	RC = &AMDGPU::VReg_512RegClass;
348	NewAddrDwords = `16`;
349	}
350
351	unsigned VgprBase = `0`;
352	unsigned NextVgpr = `0`;
353	bool IsUndef = true;
354	bool IsKill = NewAddrDwords == Info->VAddrDwords;
355	const unsigned NSAMaxSize = ST->getNSAMaxSize();
356	const bool IsPartialNSA = NewAddrDwords > NSAMaxSize;
357	const unsigned EndVAddr = IsPartialNSA ? NSAMaxSize : Info->VAddrOperands;
358	for (unsigned Idx = `0`; Idx < EndVAddr; ++Idx) {
359	const MachineOperand &Op = MI.getOperand(i: VAddr0Idx + Idx);
360	unsigned Vgpr = TRI->getHWRegIndex(Reg: Op.getReg());
361	unsigned Dwords = TRI->getRegSizeInBits(Reg: Op.getReg(), MRI: *MRI) / `32`;
362	assert(Dwords > `0` && "Un-implemented for less than 32 bit regs");
363
364	if (Idx == `0`) {
365	VgprBase = Vgpr;
366	NextVgpr = Vgpr + Dwords;
367	} else if (Vgpr == NextVgpr) {
368	NextVgpr = Vgpr + Dwords;
369	} else {
370	return false;
371	}
372
373	if (!Op.isUndef())
374	IsUndef = false;
375	if (!Op.isKill())
376	IsKill = false;
377	}
378
379	if (VgprBase + NewAddrDwords > `256`)
380	return false;
381
382	// Further check for implicit tied operands - this may be present if TFE is
383	// enabled
384	int TFEIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::tfe);
385	int LWEIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::lwe);
386	unsigned TFEVal = (TFEIdx == -`1`) ? `0` : MI.getOperand(i: TFEIdx).getImm();
387	unsigned LWEVal = (LWEIdx == -`1`) ? `0` : MI.getOperand(i: LWEIdx).getImm();
388	int ToUntie = -`1`;
389	if (TFEVal \|\| LWEVal) {
390	// TFE/LWE is enabled so we need to deal with an implicit tied operand
391	for (unsigned i = LWEIdx + `1`, e = MI.getNumOperands(); i != e; ++i) {
392	if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
393	MI.getOperand(i).isImplicit()) {
394	// This is the tied operand
395	assert(
396	ToUntie == -`1` &&
397	"found more than one tied implicit operand when expecting only 1");
398	ToUntie = i;
399	MI.untieRegOperand(OpIdx: ToUntie);
400	}
401	}
402	}
403
404	unsigned NewOpcode = AMDGPU::getMIMGOpcode(BaseOpcode: Info->BaseOpcode, MIMGEncoding: NewEncoding,
405	VDataDwords: Info->VDataDwords, VAddrDwords: NewAddrDwords);
406	MI.setDesc(TII->get(Opcode: NewOpcode));
407	MI.getOperand(i: VAddr0Idx).setReg(RC->getRegister(i: VgprBase));
408	MI.getOperand(i: VAddr0Idx).setIsUndef(IsUndef);
409	MI.getOperand(i: VAddr0Idx).setIsKill(IsKill);
410
411	for (unsigned i = `1`; i < EndVAddr; ++i)
412	MI.removeOperand(OpNo: VAddr0Idx + `1`);
413
414	if (ToUntie >= `0`) {
415	MI.tieOperands(
416	DefIdx: AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::vdata),
417	UseIdx: ToUntie - (EndVAddr - `1`));
418	}
419	return true;
420	}
421
422	// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
423	bool SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
424	// Pre-GFX10 VOP3 instructions like MAD/FMA cannot take a literal operand so
425	// there is no reason to try to shrink them.
426	if (!ST->hasVOP3Literal())
427	return false;
428
429	// There is no advantage to doing this pre-RA.
430	if (!IsPostRA)
431	return false;
432
433	if (TII->hasAnyModifiersSet(MI))
434	return false;
435
436	const unsigned Opcode = MI.getOpcode();
437	MachineOperand &Src0 = *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src0);
438	MachineOperand &Src1 = *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src1);
439	MachineOperand &Src2 = *TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
440	unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;
441
442	bool Swap;
443
444	// Detect "Dst = VSrc VGPR + Imm" and convert to AK form.*
445	if (Src2.isImm() && !TII->isInlineConstant(MO: Src2)) {
446	if (Src1.isReg() && TRI->isVGPR(MRI: *MRI, Reg: Src1.getReg()))
447	Swap = false;
448	else if (Src0.isReg() && TRI->isVGPR(MRI: *MRI, Reg: Src0.getReg()))
449	Swap = true;
450	else
451	return false;
452
453	switch (Opcode) {
454	default:
455	llvm_unreachable("Unexpected mad/fma opcode!");
456	case AMDGPU::V_MAD_F32_e64:
457	NewOpcode = AMDGPU::V_MADAK_F32;
458	break;
459	case AMDGPU::V_FMA_F32_e64:
460	NewOpcode = AMDGPU::V_FMAAK_F32;
461	break;
462	case AMDGPU::V_MAD_F16_e64:
463	NewOpcode = AMDGPU::V_MADAK_F16;
464	break;
465	case AMDGPU::V_FMA_F16_e64:
466	case AMDGPU::V_FMA_F16_gfx9_e64:
467	NewOpcode = AMDGPU::V_FMAAK_F16;
468	break;
469	case AMDGPU::V_FMA_F16_gfx9_t16_e64:
470	NewOpcode = AMDGPU::V_FMAAK_F16_t16;
471	break;
472	case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
473	NewOpcode = AMDGPU::V_FMAAK_F16_fake16;
474	break;
475	case AMDGPU::V_FMA_F64_e64:
476	if (ST->hasFmaakFmamkF64Insts())
477	NewOpcode = AMDGPU::V_FMAAK_F64;
478	break;
479	}
480	}
481
482	// Detect "Dst = VSrc Imm + VGPR" and convert to MK form.*
483	if (Src2.isReg() && TRI->isVGPR(MRI: *MRI, Reg: Src2.getReg())) {
484	if (Src1.isImm() && !TII->isInlineConstant(MO: Src1))
485	Swap = false;
486	else if (Src0.isImm() && !TII->isInlineConstant(MO: Src0))
487	Swap = true;
488	else
489	return false;
490
491	switch (Opcode) {
492	default:
493	llvm_unreachable("Unexpected mad/fma opcode!");
494	case AMDGPU::V_MAD_F32_e64:
495	NewOpcode = AMDGPU::V_MADMK_F32;
496	break;
497	case AMDGPU::V_FMA_F32_e64:
498	NewOpcode = AMDGPU::V_FMAMK_F32;
499	break;
500	case AMDGPU::V_MAD_F16_e64:
501	NewOpcode = AMDGPU::V_MADMK_F16;
502	break;
503	case AMDGPU::V_FMA_F16_e64:
504	case AMDGPU::V_FMA_F16_gfx9_e64:
505	NewOpcode = AMDGPU::V_FMAMK_F16;
506	break;
507	case AMDGPU::V_FMA_F16_gfx9_t16_e64:
508	NewOpcode = AMDGPU::V_FMAMK_F16_t16;
509	break;
510	case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
511	NewOpcode = AMDGPU::V_FMAMK_F16_fake16;
512	break;
513	case AMDGPU::V_FMA_F64_e64:
514	if (ST->hasFmaakFmamkF64Insts())
515	NewOpcode = AMDGPU::V_FMAMK_F64;
516	break;
517	}
518	}
519
520	if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
521	return false;
522
523	if (AMDGPU::isTrue16Inst(Opc: NewOpcode) && !shouldShrinkTrue16(MI))
524	return false;
525
526	if (Swap) {
527	// Swap Src0 and Src1 by building a new instruction.
528	BuildMI(BB&: *MI.getParent(), I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: NewOpcode),
529	DestReg: MI.getOperand(i: `0`).getReg())
530	.add(MO: Src1)
531	.add(MO: Src0)
532	.add(MO: Src2)
533	.setMIFlags(MI.getFlags());
534	MI.eraseFromParent();
535	} else {
536	TII->removeModOperands(MI);
537	MI.setDesc(TII->get(Opcode: NewOpcode));
538	}
539	return true;
540	}
541
542	/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
543	/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
544	/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
545	/// XNOR (as a ^ b == ~(a ^ ~b)).
546	/// \return ChangeKind::None if no changes were made.
547	/// ChangeKind::UpdateHint if regalloc hints were updated.
548	/// ChangeKind::UpdateInst if the instruction was modified.
549	ChangeKind SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
550	unsigned Opc = MI.getOpcode();
551	const MachineOperand *Dest = &MI.getOperand(i: `0`);
552	MachineOperand *Src0 = &MI.getOperand(i: `1`);
553	MachineOperand *Src1 = &MI.getOperand(i: `2`);
554	MachineOperand *SrcReg = Src0;
555	MachineOperand *SrcImm = Src1;
556
557	if (!SrcImm->isImm() \|\|
558	AMDGPU::isInlinableLiteral32(Literal: SrcImm->getImm(), HasInv2Pi: ST->hasInv2PiInlineImm()))
559	return ChangeKind::None;
560
561	uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
562	uint32_t NewImm = `0`;
563
564	if (Opc == AMDGPU::S_AND_B32) {
565	if (isPowerOf2_32(Value: ~Imm) &&
566	MI.findRegisterDefOperand(Reg: AMDGPU::SCC, /TRI=/nullptr)->isDead()) {
567	NewImm = llvm::countr_one(Value: Imm);
568	Opc = AMDGPU::S_BITSET0_B32;
569	} else if (AMDGPU::isInlinableLiteral32(Literal: ~Imm, HasInv2Pi: ST->hasInv2PiInlineImm())) {
570	NewImm = ~Imm;
571	Opc = AMDGPU::S_ANDN2_B32;
572	}
573	} else if (Opc == AMDGPU::S_OR_B32) {
574	if (isPowerOf2_32(Value: Imm) &&
575	MI.findRegisterDefOperand(Reg: AMDGPU::SCC, /TRI=/nullptr)->isDead()) {
576	NewImm = llvm::countr_zero(Val: Imm);
577	Opc = AMDGPU::S_BITSET1_B32;
578	} else if (AMDGPU::isInlinableLiteral32(Literal: ~Imm, HasInv2Pi: ST->hasInv2PiInlineImm())) {
579	NewImm = ~Imm;
580	Opc = AMDGPU::S_ORN2_B32;
581	}
582	} else if (Opc == AMDGPU::S_XOR_B32) {
583	if (AMDGPU::isInlinableLiteral32(Literal: ~Imm, HasInv2Pi: ST->hasInv2PiInlineImm())) {
584	NewImm = ~Imm;
585	Opc = AMDGPU::S_XNOR_B32;
586	}
587	} else {
588	llvm_unreachable("unexpected opcode");
589	}
590
591	if (NewImm != `0`) {
592	if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
593	MRI->setRegAllocationHint(VReg: Dest->getReg(), Type: `0`, PrefReg: SrcReg->getReg());
594	MRI->setRegAllocationHint(VReg: SrcReg->getReg(), Type: `0`, PrefReg: Dest->getReg());
595	return ChangeKind::UpdateHint;
596	}
597
598	if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
599	const bool IsUndef = SrcReg->isUndef();
600	const bool IsKill = SrcReg->isKill();
601	TII->mutateAndCleanupImplicit(MI, NewDesc: TII->get(Opcode: Opc));
602	if (Opc == AMDGPU::S_BITSET0_B32 \|\|
603	Opc == AMDGPU::S_BITSET1_B32) {
604	Src0->ChangeToImmediate(ImmVal: NewImm);
605	// Remove the immediate and add the tied input.
606	MI.getOperand(i: `2`).ChangeToRegister(Reg: Dest->getReg(), /IsDef/ isDef: false,
607	/isImp/ false, isKill: IsKill,
608	/isDead/ false, isUndef: IsUndef);
609	MI.tieOperands(DefIdx: `0`, UseIdx: `2`);
610	} else {
611	SrcImm->setImm(NewImm);
612	}
613	return ChangeKind::UpdateInst;
614	}
615	}
616
617	return ChangeKind::None;
618	}
619
620	// This is the same as MachineInstr::readsRegister/modifiesRegister except
621	// it takes subregs into account.
622	bool SIShrinkInstructions::instAccessReg(
623	iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
624	unsigned SubReg) const {
625	for (const MachineOperand &MO : R) {
626	if (!MO.isReg())
627	continue;
628
629	if (Reg.isPhysical() && MO.getReg().isPhysical()) {
630	if (TRI->regsOverlap(RegA: Reg, RegB: MO.getReg()))
631	return true;
632	} else if (MO.getReg() == Reg && Reg.isVirtual()) {
633	LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubIdx: SubReg) &
634	TRI->getSubRegIndexLaneMask(SubIdx: MO.getSubReg());
635	if (Overlap.any())
636	return true;
637	}
638	}
639	return false;
640	}
641
642	bool SIShrinkInstructions::instReadsReg(const MachineInstr MI, unsigned* Reg,
643	unsigned SubReg) const {
644	return instAccessReg(R: MI->uses(), Reg, SubReg);
645	}
646
647	bool SIShrinkInstructions::instModifiesReg(const MachineInstr MI, unsigned* Reg,
648	unsigned SubReg) const {
649	return instAccessReg(R: MI->defs(), Reg, SubReg);
650	}
651
652	TargetInstrInfo::RegSubRegPair
653	SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
654	unsigned I) const {
655	if (TRI->getRegSizeInBits(Reg, MRI: *MRI) != `32`) {
656	if (Reg.isPhysical()) {
657	Reg = TRI->getSubReg(Reg, Idx: TRI->getSubRegFromChannel(Channel: I));
658	} else {
659	Sub = TRI->getSubRegFromChannel(Channel: I + TRI->getChannelFromSubReg(SubReg: Sub));
660	}
661	}
662	return TargetInstrInfo::RegSubRegPair (Reg, Sub);
663	}
664
665	void SIShrinkInstructions::dropInstructionKeepingImpDefs(
666	MachineInstr &MI) const {
667	for (unsigned i = MI.getDesc().getNumOperands() +
668	MI.getDesc().implicit_uses().size() +
669	MI.getDesc().implicit_defs().size(),
670	e = MI.getNumOperands();
671	i != e; ++i) {
672	const MachineOperand &Op = MI.getOperand(i);
673	if (!Op.isDef())
674	continue;
675	BuildMI(BB&: *MI.getParent(), I: MI.getIterator(), MIMD: MI.getDebugLoc(),
676	MCID: TII->get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: Op.getReg());
677	}
678
679	MI.eraseFromParent();
680	}
681
682	// Match:
683	// mov t, x
684	// mov x, y
685	// mov y, t
686	//
687	// =>
688	//
689	// mov t, x (t is potentially dead and move eliminated)
690	// v_swap_b32 x, y
691	//
692	// Returns next valid instruction pointer if was able to create v_swap_b32.
693	//
694	// This shall not be done too early not to prevent possible folding which may
695	// remove matched moves, and this should preferably be done before RA to
696	// release saved registers and also possibly after RA which can insert copies
697	// too.
698	//
699	// This is really just a generic peephole that is not a canonical shrinking,
700	// although requirements match the pass placement and it reduces code size too.
701	MachineInstr SIShrinkInstructions::matchSwap(MachineInstr &MovT) const* {
702	assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 \|\|
703	MovT.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 \|\|
704	MovT.getOpcode() == AMDGPU::COPY);
705
706	Register T = MovT.getOperand(i: `0`).getReg();
707	unsigned Tsub = MovT.getOperand(i: `0`).getSubReg();
708	MachineOperand &Xop = MovT.getOperand(i: `1`);
709
710	if (!Xop.isReg())
711	return nullptr;
712	Register X = Xop.getReg();
713	unsigned Xsub = Xop.getSubReg();
714	Register Y;
715	unsigned Ysub;
716
717	unsigned Size = TII->getOpSize(MI: MovT, OpNo: `0`);
718
719	// We can't match v_swap_b16 pre-RA, because VGPR_16_Lo128 registers
720	// are not allocatble.
721	if (Size == `2` && X.isVirtual())
722	return nullptr;
723
724	if (!TRI->isVGPR(MRI: *MRI, Reg: X))
725	return nullptr;
726
727	const unsigned SearchLimit = `16`;
728	unsigned Count = `0`;
729
730	MachineInstr MovX = nullptr*;
731	MachineInstr InsertionPt = nullptr*;
732	MachineInstr MovY = nullptr*;
733
734	for (auto Iter = std::next(x: MovT.getIterator()),
735	E = MovT.getParent()->instr_end();
736	Iter != E && Count < SearchLimit; ++Iter) {
737	if (Iter ->isDebugInstr())
738	continue;
739	++Count;
740
741	if (instModifiesReg(MI: &*Iter, Reg: T, SubReg: Tsub))
742	return nullptr;
743
744	if (!MovX) {
745	// Search for mov x, y.
746	if ((Iter ->getOpcode() == AMDGPU::V_MOV_B32_e32 \|\|
747	Iter ->getOpcode() == AMDGPU::V_MOV_B16_t16_e32 \|\|
748	Iter ->getOpcode() == AMDGPU::COPY) &&
749	Iter ->getOperand(i: `0`).getReg() == X &&
750	Iter ->getOperand(i: `0`).getSubReg() == Xsub &&
751	Iter ->getOperand(i: `1`).isReg()) {
752	MovX = &*Iter;
753	Y = MovX->getOperand(i: `1`).getReg();
754	Ysub = MovX->getOperand(i: `1`).getSubReg();
755	} else if (instModifiesReg(MI: &*Iter, Reg: X, SubReg: Xsub)) {
756	// Writes to x are not allowed until mov x, y has been found
757	return nullptr;
758	}
759	} else {
760	// mov x, y has been found.
761	// Search for mov y, t.
762	if ((Iter ->getOpcode() == AMDGPU::V_MOV_B32_e32 \|\|
763	Iter ->getOpcode() == AMDGPU::V_MOV_B16_t16_e32 \|\|
764	Iter ->getOpcode() == AMDGPU::COPY) &&
765	Iter ->getOperand(i: `0`).getReg() == Y &&
766	Iter ->getOperand(i: `0`).getSubReg() == Ysub &&
767	Iter ->getOperand(i: `1`).isReg() && Iter ->getOperand(i: `1`).getReg() == T &&
768	Iter ->getOperand(i: `1`).getSubReg() == Tsub) {
769	MovY = &*Iter;
770	break;
771	}
772
773	// Effectively, mov x, y must be moved downward
774	// and mov y, t must be moved upward so that they can be fused into a
775	// swap. A write to y creates a barrier that prevents the two moves from
776	// being moved adjacent to each other.
777	if (instModifiesReg(MI: &*Iter, Reg: Y, SubReg: Ysub))
778	return nullptr;
779
780	// Reads or writes to x prevent mov x, y from being moved farther
781	// downward. Select this to be the insertion point.
782	if (!InsertionPt &&
783	(instReadsReg(MI: &Iter, Reg: X, SubReg: Xsub) \|\| instModifiesReg(MI: &Iter, Reg: X, SubReg: Xsub))) {
784	InsertionPt = &*Iter;
785	}
786	// If the insertion point has been found, then mov y, t must be moved
787	// upward past all subsequent instructions. A read of y will block this
788	// movement.
789	if (InsertionPt) {
790	if (instReadsReg(MI: &*Iter, Reg: Y, SubReg: Ysub))
791	return nullptr;
792	}
793	}
794	}
795	if (MovY) {
796	LLVM_DEBUG(dbgs() << "Matched v_swap:\n" << MovT << MovX << MovY);
797
798	MachineBasicBlock &MBB = *MovT.getParent();
799	SmallVector<MachineInstr *, `4`> Swaps;
800
801	if (!InsertionPt)
802	InsertionPt = MovY;
803	if (Size == `2`) {
804	auto *MIB = BuildMI(BB&: MBB, I: InsertionPt->getIterator(), MIMD: MovT.getDebugLoc(),
805	MCID: TII->get(Opcode: AMDGPU::V_SWAP_B16))
806	.addDef(RegNo: X)
807	.addDef(RegNo: Y)
808	.addReg(RegNo: Y)
809	.addReg(RegNo: X)
810	.getInstr();
811	Swaps.push_back(Elt: MIB);
812	} else {
813	assert(Size > `0` && Size % `4` == `0`);
814	for (unsigned I = `0`; I < Size / `4`; ++I) {
815	TargetInstrInfo::RegSubRegPair X1, Y1;
816	X1 = getSubRegForIndex(Reg: X, Sub: Xsub, I);
817	Y1 = getSubRegForIndex(Reg: Y, Sub: Ysub, I);
818	auto *MIB = BuildMI(BB&: MBB, I: InsertionPt->getIterator(), MIMD: MovT.getDebugLoc(),
819	MCID: TII->get(Opcode: AMDGPU::V_SWAP_B32))
820	.addDef(RegNo: X1.Reg, Flags: {}, SubReg: X1.SubReg)
821	.addDef(RegNo: Y1.Reg, Flags: {}, SubReg: Y1.SubReg)
822	.addReg(RegNo: Y1.Reg, Flags: {}, SubReg: Y1.SubReg)
823	.addReg(RegNo: X1.Reg, Flags: {}, SubReg: X1.SubReg)
824	.getInstr();
825	Swaps.push_back(Elt: MIB);
826	}
827	}
828	// Drop implicit EXEC.
829	if (MovX->hasRegisterImplicitUseOperand(Reg: AMDGPU::EXEC)) {
830	for (MachineInstr *Swap : Swaps) {
831	Swap->removeOperand(OpNo: Swap->getNumExplicitOperands());
832	Swap->copyImplicitOps(MF&: MBB.getParent(), MI: MovX);
833	}
834	}
835	MovX->eraseFromParent();
836	dropInstructionKeepingImpDefs(MI&: *MovY);
837	MachineInstr Next = &std::next(x: MovT.getIterator());
838
839	if (T.isVirtual() && MRI->use_nodbg_empty(RegNo: T)) {
840	dropInstructionKeepingImpDefs(MI&: MovT);
841	} else {
842	Xop.setIsKill(false);
843	for (int I = MovT.getNumImplicitOperands() - `1`; I >= `0`; --I ) {
844	unsigned OpNo = MovT.getNumExplicitOperands() + I;
845	const MachineOperand &Op = MovT.getOperand(i: OpNo);
846	if (Op.isKill() && TRI->regsOverlap(RegA: X, RegB: Op.getReg()))
847	MovT.removeOperand(OpNo);
848	}
849	}
850
851	return Next;
852	}
853	return nullptr;
854	}
855
856	// If an instruction has dead sdst replace it with NULL register on gfx1030+
857	bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
858	if (!ST->hasGFX10_3Insts())
859	return false;
860
861	MachineOperand *Op = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst);
862	if (!Op)
863	return false;
864	Register SDstReg = Op->getReg();
865	if (SDstReg.isPhysical() \|\| !MRI->use_nodbg_empty(RegNo: SDstReg))
866	return false;
867
868	Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64);
869	return true;
870	}
871
872	bool SIShrinkInstructions::run(MachineFunction &MF) {
873
874	this->MF = &MF;
875	MRI = &MF.getRegInfo();
876	ST = &MF.getSubtarget<GCNSubtarget>();
877	TII = ST->getInstrInfo();
878	TRI = &TII->getRegisterInfo();
879	IsPostRA = MF.getProperties().hasNoVRegs();
880
881	unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
882	bool Changed = false;
883
884	for (MachineBasicBlock &MBB : MF) {
885	MachineBasicBlock::iterator I, Next;
886	for (I = MBB.begin(); I != MBB.end(); I = Next) {
887	Next = std::next(x: I);
888	MachineInstr &MI = *I;
889
890	if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
891	// If this has a literal constant source that is the same as the
892	// reversed bits of an inline immediate, replace with a bitreverse of
893	// that constant. This saves 4 bytes in the common case of materializing
894	// sign bits.
895
896	// Test if we are after regalloc. We only want to do this after any
897	// optimizations happen because this will confuse them.
898	MachineOperand &Src = MI.getOperand(i: `1`);
899	if (Src.isImm() && IsPostRA) {
900	int32_t ModImm;
901	unsigned ModOpcode =
902	canModifyToInlineImmOp32(TII, Src, ModifiedImm&: ModImm, /Scalar=/false);
903	if (ModOpcode != `0`) {
904	MI.setDesc(TII->get(Opcode: ModOpcode));
905	Src.setImm(static_cast<int64_t>(ModImm));
906	Changed = true;
907	continue;
908	}
909	}
910	}
911
912	if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 \|\|
913	MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 \|\|
914	MI.getOpcode() == AMDGPU::COPY)) {
915	if (auto *NextMI = matchSwap(MovT&: MI)) {
916	Next = NextMI->getIterator();
917	Changed = true;
918	continue;
919	}
920	}
921
922	// Shrink scalar logic operations.
923	if (MI.getOpcode() == AMDGPU::S_AND_B32 \|\|
924	MI.getOpcode() == AMDGPU::S_OR_B32 \|\|
925	MI.getOpcode() == AMDGPU::S_XOR_B32) {
926	ChangeKind CK = shrinkScalarLogicOp(MI);
927	if (CK == ChangeKind::UpdateHint)
928	continue;
929	Changed \|= (CK == ChangeKind::UpdateInst);
930	}
931
932	// Try to use S_ADDK_I32 and S_MULK_I32.
933	if (MI.getOpcode() == AMDGPU::S_ADD_I32 \|\|
934	MI.getOpcode() == AMDGPU::S_MUL_I32 \|\|
935	(MI.getOpcode() == AMDGPU::S_OR_B32 &&
936	MI.getFlag(Flag: MachineInstr::MIFlag::Disjoint))) {
937	const MachineOperand *Dest = &MI.getOperand(i: `0`);
938	MachineOperand *Src0 = &MI.getOperand(i: `1`);
939	MachineOperand *Src1 = &MI.getOperand(i: `2`);
940
941	if (!Src0->isReg() && Src1->isReg()) {
942	if (TII->commuteInstruction(MI, NewMI: false, OpIdx1: `1`, OpIdx2: `2`)) {
943	std::swap(a&: Src0, b&: Src1);
944	Changed = true;
945	}
946	}
947
948	// FIXME: This could work better if hints worked with subregisters. If
949	// we have a vector add of a constant, we usually don't get the correct
950	// allocation due to the subregister usage.
951	if (Dest->getReg().isVirtual() && Src0->isReg()) {
952	MRI->setRegAllocationHint(VReg: Dest->getReg(), Type: `0`, PrefReg: Src0->getReg());
953	MRI->setRegAllocationHint(VReg: Src0->getReg(), Type: `0`, PrefReg: Dest->getReg());
954	continue;
955	}
956	if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
957	if (Src1->isImm() && isKImmOperand(Src: *Src1)) {
958	unsigned Opc = (MI.getOpcode() == AMDGPU::S_MUL_I32)
959	? AMDGPU::S_MULK_I32
960	: AMDGPU::S_ADDK_I32;
961	Src1->setImm(SignExtend64(X: Src1->getImm(), B: `32`));
962	MI.setDesc(TII->get(Opcode: Opc));
963	MI.tieOperands(DefIdx: `0`, UseIdx: `1`);
964	Changed = true;
965	}
966	}
967	}
968
969	// Try to use s_cmpk_*
970	if (MI.isCompare() && TII->isSOPC(MI)) {
971	Changed \|= shrinkScalarCompare(MI);
972	continue;
973	}
974
975	// Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
976	if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
977	const MachineOperand &Dst = MI.getOperand(i: `0`);
978	MachineOperand &Src = MI.getOperand(i: `1`);
979
980	if (Src.isImm() && Dst.getReg().isPhysical()) {
981	unsigned ModOpc;
982	int32_t ModImm;
983	if (isKImmOperand(Src)) {
984	MI.setDesc(TII->get(Opcode: AMDGPU::S_MOVK_I32));
985	Src.setImm(SignExtend64(X: Src.getImm(), B: `32`));
986	Changed = true;
987	} else if ((ModOpc = canModifyToInlineImmOp32(TII, Src, ModifiedImm&: ModImm,
988	/Scalar=/true))) {
989	MI.setDesc(TII->get(Opcode: ModOpc));
990	Src.setImm(static_cast<int64_t>(ModImm));
991	Changed = true;
992	}
993	}
994
995	continue;
996	}
997
998	if (IsPostRA && TII->isMIMG(Opcode: MI.getOpcode()) &&
999	ST->getGeneration() >= AMDGPUSubtarget::GFX10) {
1000	Changed \|= shrinkMIMG(MI);
1001	continue;
1002	}
1003
1004	if (!TII->isVOP3(MI))
1005	continue;
1006
1007	if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 \|\|
1008	MI.getOpcode() == AMDGPU::V_FMA_F32_e64 \|\|
1009	MI.getOpcode() == AMDGPU::V_MAD_F16_e64 \|\|
1010	MI.getOpcode() == AMDGPU::V_FMA_F16_e64 \|\|
1011	MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 \|\|
1012	MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 \|\|
1013	MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64 \|\|
1014	(MI.getOpcode() == AMDGPU::V_FMA_F64_e64 &&
1015	ST->hasFmaakFmamkF64Insts())) {
1016	Changed \|= shrinkMadFma(MI);
1017	continue;
1018	}
1019
1020	// If there is no chance we will shrink it and use VCC as sdst to get
1021	// a 32 bit form try to replace dead sdst with NULL.
1022	if (TII->isVOP3(Opcode: MI.getOpcode())) {
1023	Changed \|= tryReplaceDeadSDST(MI);
1024	if (!TII->hasVALU32BitEncoding(Opcode: MI.getOpcode())) {
1025	continue;
1026	}
1027	}
1028
1029	if (!TII->canShrink(MI, MRI: *MRI)) {
1030	// Try commuting the instruction and see if that enables us to shrink
1031	// it.
1032	if (!MI.isCommutable() \|\| !TII->commuteInstruction(MI) \|\|
1033	!TII->canShrink(MI, MRI: *MRI)) {
1034	Changed \|= tryReplaceDeadSDST(MI);
1035	continue;
1036	}
1037
1038	// Operands were commuted.
1039	Changed = true;
1040	}
1041
1042	int Op32 = AMDGPU::getVOPe32(Opcode: MI.getOpcode());
1043
1044	if (TII->isVOPC(Opcode: Op32)) {
1045	MachineOperand &Op0 = MI.getOperand(i: `0`);
1046	if (Op0.isReg()) {
1047	// Exclude VOPCX instructions as these don't explicitly write a
1048	// dst.
1049	Register DstReg = Op0.getReg();
1050	if (DstReg.isVirtual()) {
1051	// VOPC instructions can only write to the VCC register. We can't
1052	// force them to use VCC here, because this is only one register and
1053	// cannot deal with sequences which would require multiple copies of
1054	// VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
1055	//
1056	// So, instead of forcing the instruction to write to VCC, we
1057	// provide a hint to the register allocator to use VCC and then we
1058	// will run this pass again after RA and shrink it if it outputs to
1059	// VCC.
1060	MRI->setRegAllocationHint(VReg: DstReg, Type: `0`, PrefReg: VCCReg);
1061	continue;
1062	}
1063	if (DstReg != VCCReg)
1064	continue;
1065	}
1066	}
1067
1068	if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
1069	// We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
1070	// instructions.
1071	const MachineOperand *Src2 =
1072	TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
1073	if (!Src2->isReg())
1074	continue;
1075	Register SReg = Src2->getReg();
1076	if (SReg.isVirtual()) {
1077	MRI->setRegAllocationHint(VReg: SReg, Type: `0`, PrefReg: VCCReg);
1078	continue;
1079	}
1080	if (SReg != VCCReg)
1081	continue;
1082	}
1083
1084	// Check for the bool flag output for instructions like V_ADD_I32_e64.
1085	const MachineOperand *SDst = TII->getNamedOperand(MI,
1086	OperandName: AMDGPU::OpName::sdst);
1087
1088	if (SDst) {
1089	bool Next = false;
1090
1091	if (SDst->getReg() != VCCReg) {
1092	if (SDst->getReg().isVirtual())
1093	MRI->setRegAllocationHint(VReg: SDst->getReg(), Type: `0`, PrefReg: VCCReg);
1094	Next = true;
1095	}
1096
1097	// All of the instructions with carry outs also have an SGPR input in
1098	// src2.
1099	const MachineOperand *Src2 = TII->getNamedOperand(MI,
1100	OperandName: AMDGPU::OpName::src2);
1101	if (Src2 && Src2->getReg() != VCCReg) {
1102	if (Src2->getReg().isVirtual())
1103	MRI->setRegAllocationHint(VReg: Src2->getReg(), Type: `0`, PrefReg: VCCReg);
1104	Next = true;
1105	}
1106
1107	if (Next)
1108	continue;
1109	}
1110
1111	// Pre-GFX10, shrinking VOP3 instructions pre-RA gave us the chance to
1112	// fold an immediate into the shrunk instruction as a literal operand. In
1113	// GFX10 VOP3 instructions can take a literal operand anyway, so there is
1114	// no advantage to doing this.
1115	// However, if 64-bit literals are allowed we still need to shrink it
1116	// for such literal to be able to fold.
1117	if (ST->hasVOP3Literal() &&
1118	(!ST->has64BitLiterals() \|\| AMDGPU::isTrue16Inst(Opc: MI.getOpcode())) &&
1119	!IsPostRA)
1120	continue;
1121
1122	if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(Opc: MI.getOpcode()) &&
1123	!shouldShrinkTrue16(MI))
1124	continue;
1125
1126	// We can shrink this instruction
1127	LLVM_DEBUG(dbgs() << "Shrinking " << MI);
1128
1129	MachineInstr *Inst32 = TII->buildShrunkInst(MI, NewOpcode: Op32);
1130	++NumInstructionsShrunk;
1131
1132	// Copy extra operands not present in the instruction definition.
1133	copyExtraImplicitOps(NewMI&: *Inst32, MI);
1134
1135	// Copy deadness from the old explicit vcc def to the new implicit def.
1136	if (SDst && SDst->isDead())
1137	Inst32->findRegisterDefOperand(Reg: VCCReg, /TRI=/nullptr)->setIsDead();
1138
1139	MI.eraseFromParent();
1140	foldImmediates(MI&: *Inst32);
1141
1142	LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << `'\n'`);
1143	Changed = true;
1144	}
1145	}
1146	return Changed;
1147	}
1148
1149	bool SIShrinkInstructionsLegacy::runOnMachineFunction(MachineFunction &MF) {
1150	if (skipFunction(F: MF.getFunction()))
1151	return false;
1152
1153	return SIShrinkInstructions ().run(MF);
1154	}
1155
1156	PreservedAnalyses
1157	SIShrinkInstructionsPass::run(MachineFunction &MF,
1158	MachineFunctionAnalysisManager &) {
1159	if (MF.getFunction().hasOptNone() \|\| !SIShrinkInstructions ().run(MF))
1160	return PreservedAnalyses::all();
1161
1162	auto PA = getMachineFunctionPassPreservedAnalyses();
1163	PA.preserveSet<CFGAnalyses>();
1164	return PA;
1165	}
1166

// Source: llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp (LLVM project).