//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
  MachineFunction *MF;
  MachineRegisterInfo *MRI;
  const GCNSubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;

public:
  static char ID;

public:
  SIShrinkInstructions() : MachineFunctionPass(ID) {}

  bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
  bool shouldShrinkTrue16(MachineInstr &MI) const;
  bool isKImmOperand(const MachineOperand &Src) const;
  bool isKUImmOperand(const MachineOperand &Src) const;
  bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
  void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
  void shrinkScalarCompare(MachineInstr &MI) const;
  void shrinkMIMG(MachineInstr &MI) const;
  void shrinkMadFma(MachineInstr &MI) const;
  bool shrinkScalarLogicOp(MachineInstr &MI) const;
  bool tryReplaceDeadSDST(MachineInstr &MI) const;
  bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                     Register Reg, unsigned SubReg) const;
  bool instReadsReg(const MachineInstr *MI, unsigned Reg,
                    unsigned SubReg) const;
  bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
                       unsigned SubReg) const;
  TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
                                                   unsigned I) const;
  void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
  MachineInstr *matchSwap(MachineInstr &MovT) const;

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
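/// For example (an illustrative sketch; registers and the literal value are
/// made up, and the fold only happens when the operand is legal for \p MI):
///   %0 = V_MOV_B32_e32 0x42f60000
///   %1 = V_ADD_F32_e32 %0, %2
/// -->
///   %1 = V_ADD_F32_e32 0x42f60000, %2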
bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
                                          bool TryToCommute) const {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    Register Reg = Src0.getReg();
    if (Reg.isVirtual()) {
      MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
          if (MovSrc.isImm()) {
            Src0.ChangeToImmediate(MovSrc.getImm());
            ConstantFolded = true;
          } else if (MovSrc.isFI()) {
            Src0.ChangeToFrameIndex(MovSrc.getIndex());
            ConstantFolded = true;
          } else if (MovSrc.isGlobal()) {
            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                            MovSrc.getTargetFlags());
            ConstantFolded = true;
          }
        }

        if (ConstantFolded) {
          if (MRI->use_nodbg_empty(Reg))
            Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

/// Do not shrink the instruction if its registers are not expressible in the
/// shrunk encoding.
bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const {
  for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
    const MachineOperand &MO = MI.getOperand(I);
    if (MO.isReg()) {
      Register Reg = MO.getReg();
      assert(!Reg.isVirtual() && "Prior checks should ensure we only shrink "
                                 "True16 Instructions post-RA");
      if (AMDGPU::VGPR_32RegClass.contains(Reg) &&
          !AMDGPU::VGPR_32_Lo128RegClass.contains(Reg))
        return false;
    }
  }
  return true;
}

bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
  return isInt<16>(SignExtend64(Src.getImm(), 32)) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
  return isUInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
                                                bool &IsUnsigned) const {
  if (isInt<16>(SignExtend64(Src.getImm(), 32))) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}
/// \returns the opcode that a move immediate of the constant \p Src can be
/// replaced with, assuming the constant itself is replaced with
/// \p ModifiedImm. i.e.
///
/// If the bitreverse of a constant is an inline immediate, reverse the
/// immediate and return the bitreverse opcode.
///
/// If the bitwise negation of a constant is an inline immediate, invert the
/// immediate and return the bitwise not opcode.
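///
/// For example, 0xffffffbf is not an inline immediate, but its bitwise NOT is
/// the inline immediate 64, so a move of 0xffffffbf can instead be encoded as
/// "v_not_b32 v0, 64" (an illustrative register), avoiding the 32-bit
/// literal.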
static unsigned canModifyToInlineImmOp32(const SIInstrInfo *TII,
                                         const MachineOperand &Src,
                                         int32_t &ModifiedImm, bool Scalar) {
  if (TII->isInlineConstant(Src))
    return 0;
  int32_t SrcImm = static_cast<int32_t>(Src.getImm());

  if (!Scalar) {
    // We could handle the scalar case here, but we would need to check that
    // SCC is not live as S_NOT_B32 clobbers it. It's probably not worth it,
    // as the reasonable values are already covered by s_movk_i32.
    ModifiedImm = ~SrcImm;
    if (TII->isInlineConstant(APInt(32, ModifiedImm)))
      return AMDGPU::V_NOT_B32_e32;
  }

  ModifiedImm = reverseBits<int32_t>(SrcImm);
  if (TII->isInlineConstant(APInt(32, ModifiedImm)))
    return Scalar ? AMDGPU::S_BREV_B32 : AMDGPU::V_BFREV_B32_e32;

  return 0;
}

/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
                                                MachineInstr &MI) const {
  MachineFunction &MF = *MI.getMF();
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

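// For example (an illustrative immediate; inline constants are skipped since
// they are already free to encode),
//   s_cmp_lg_u32 s0, 0x8765
// can become the SOPK form, which carries the constant in its imm16 field and
// so avoids the trailing 32-bit literal:
//   s_cmpk_lg_u32 s0, 0x8765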
void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
  if (!ST->hasSCmpK())
    return;

  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction
  // to get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  // cmpk requires src0 to be a register
  const MachineOperand &Src0 = MI.getOperand(0);
  if (!Src0.isReg())
    return;

  MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
            AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
        Src1.setImm(SignExtend32(Src1.getImm(), 32));
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((SIInstrInfo::sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
      (!SIInstrInfo::sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
    if (!SIInstrInfo::sopkIsZext(SOPKOpc))
      Src1.setImm(SignExtend64(Src1.getImm(), 32));
    MI.setDesc(NewDesc);
  }
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
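// For example (schematically; operands are made up and modifiers are
// omitted), a gfx10 image_sample whose NSA address registers happen to be
// contiguous,
//   image_sample v0, [v4, v5, v6], s[0:7], s[8:11]
// can use the shorter non-NSA encoding with a single base register:
//   image_sample v0, v[4:6], s[0:7], s[8:11]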
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info)
    return;

  uint8_t NewEncoding;
  switch (Info->MIMGEncoding) {
  case AMDGPU::MIMGEncGfx10NSA:
    NewEncoding = AMDGPU::MIMGEncGfx10Default;
    break;
  case AMDGPU::MIMGEncGfx11NSA:
    NewEncoding = AMDGPU::MIMGEncGfx11Default;
    break;
  default:
    return;
  }

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords == 5) {
    RC = &AMDGPU::VReg_160RegClass;
  } else if (Info->VAddrDwords == 6) {
    RC = &AMDGPU::VReg_192RegClass;
  } else if (Info->VAddrDwords == 7) {
    RC = &AMDGPU::VReg_224RegClass;
  } else if (Info->VAddrDwords == 8) {
    RC = &AMDGPU::VReg_256RegClass;
  } else if (Info->VAddrDwords == 9) {
    RC = &AMDGPU::VReg_288RegClass;
  } else if (Info->VAddrDwords == 10) {
    RC = &AMDGPU::VReg_320RegClass;
  } else if (Info->VAddrDwords == 11) {
    RC = &AMDGPU::VReg_352RegClass;
  } else if (Info->VAddrDwords == 12) {
    RC = &AMDGPU::VReg_384RegClass;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  unsigned NextVgpr = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  const unsigned NSAMaxSize = ST->getNSAMaxSize();
  const bool IsPartialNSA = NewAddrDwords > NSAMaxSize;
  const unsigned EndVAddr = IsPartialNSA ? NSAMaxSize : Info->VAddrOperands;
  for (unsigned Idx = 0; Idx < EndVAddr; ++Idx) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx);
    unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());
    unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32;
    assert(Dwords > 0 && "Un-implemented for less than 32 bit regs");

    if (Idx == 0) {
      VgprBase = Vgpr;
      NextVgpr = Vgpr + Dwords;
    } else if (Vgpr == NextVgpr) {
      NextVgpr = Vgpr + Dwords;
    } else {
      return;
    }

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for implicit tied operands - this may be present if TFE is
  // enabled
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding,
                                             Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (unsigned i = 1; i < EndVAddr; ++i)
    MI.removeOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (EndVAddr - 1));
  }
}

// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
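// For example (an illustrative literal), when the addend is a non-inline
// immediate,
//   v_fma_f32 v0, v1, v2, 0x42f60000
// can shrink to the "AK" form
//   v_fmaak_f32 v0, v1, v2, 0x42f60000
// and when a multiplicand is a non-inline immediate,
//   v_fma_f32 v0, v1, 0x42f60000, v2
// can shrink to the "MK" form
//   v_fmamk_f32 v0, v1, 0x42f60000, v2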
void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
  // Pre-GFX10 VOP3 instructions like MAD/FMA cannot take a literal operand so
  // there is no reason to try to shrink them.
  if (!ST->hasVOP3Literal())
    return;

  // There is no advantage to doing this pre-RA.
  if (!MF->getProperties().hasProperty(
          MachineFunctionProperties::Property::NoVRegs))
    return;

  if (TII->hasAnyModifiersSet(MI))
    return;

  const unsigned Opcode = MI.getOpcode();
  MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;

  bool Swap;

  // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
  if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
    if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
      Swap = false;
    else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADAK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAAK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADAK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
                                          : AMDGPU::V_FMAAK_F16;
      break;
    }
  }

  // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
  if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
    if (Src1.isImm() && !TII->isInlineConstant(Src1))
      Swap = false;
    else if (Src0.isImm() && !TII->isInlineConstant(Src0))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADMK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAMK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADMK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
                                          : AMDGPU::V_FMAMK_F16;
      break;
    }
  }

  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
    return;

  if (AMDGPU::isTrue16Inst(NewOpcode) && !shouldShrinkTrue16(MI))
    return;

  if (Swap) {
    // Swap Src0 and Src1 by building a new instruction.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
            MI.getOperand(0).getReg())
        .add(Src1)
        .add(Src0)
        .add(Src2)
        .setMIFlags(MI.getFlags());
    MI.eraseFromParent();
  } else {
    TII->removeModOperands(MI);
    MI.setDesc(TII->get(NewOpcode));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
/// \returns true if the caller should continue the machine function iterator
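///
/// For example (illustrative immediates; the rewrites apply post-RA when the
/// destination register matches src0):
///   s_and_b32 s0, s0, 0xffffdfff  ->  s_bitset0_b32 s0, 13
///   s_or_b32  s0, s0, 0x00000400  ->  s_bitset1_b32 s0, 10
///   s_xor_b32 s0, s0, 0xffffffc0  ->  s_xnor_b32 s0, s0, 63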
bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (!SrcImm->isImm() ||
      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
    return false;

  uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
  uint32_t NewImm = 0;

  if (Opc == AMDGPU::S_AND_B32) {
    if (isPowerOf2_32(~Imm)) {
      NewImm = llvm::countr_one(Imm);
      Opc = AMDGPU::S_BITSET0_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ANDN2_B32;
    }
  } else if (Opc == AMDGPU::S_OR_B32) {
    if (isPowerOf2_32(Imm)) {
      NewImm = llvm::countr_zero(Imm);
      Opc = AMDGPU::S_BITSET1_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ORN2_B32;
    }
  } else if (Opc == AMDGPU::S_XOR_B32) {
    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_XNOR_B32;
    }
  } else {
    llvm_unreachable("unexpected opcode");
  }

  if (NewImm != 0) {
    if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
      return true;
    }

    if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
      const bool IsUndef = SrcReg->isUndef();
      const bool IsKill = SrcReg->isKill();
      MI.setDesc(TII->get(Opc));
      if (Opc == AMDGPU::S_BITSET0_B32 ||
          Opc == AMDGPU::S_BITSET1_B32) {
        Src0->ChangeToImmediate(NewImm);
        // Remove the immediate and add the tied input.
        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
                                          /*isImp*/ false, IsKill,
                                          /*isDead*/ false, IsUndef);
        MI.tieOperands(0, 2);
      } else {
        SrcImm->setImm(NewImm);
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
bool SIShrinkInstructions::instAccessReg(
    iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
    unsigned SubReg) const {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (Reg.isPhysical() && MO.getReg().isPhysical()) {
      if (TRI->regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg && Reg.isVirtual()) {
      LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
                            TRI->getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
                                        unsigned SubReg) const {
  return instAccessReg(MI->uses(), Reg, SubReg);
}

bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
                                           unsigned SubReg) const {
  return instAccessReg(MI->defs(), Reg, SubReg);
}

TargetInstrInfo::RegSubRegPair
SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
                                        unsigned I) const {
  if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
    if (Reg.isPhysical()) {
      Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
    } else {
      Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

void SIShrinkInstructions::dropInstructionKeepingImpDefs(
    MachineInstr &MI) const {
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &Op = MI.getOperand(i);
    if (!Op.isDef())
      continue;
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
  }

  MI.eraseFromParent();
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and move eliminated)
// v_swap_b32 x, y
//
// Returns the next valid instruction pointer if it was able to create a
// v_swap_b32.
//
// This should not be done too early, so as not to prevent possible folding
// that may remove the matched moves. It should preferably run before RA to
// release saved registers, and also possibly after RA, which can insert
// copies too.
//
// This is really just a generic peephole that is not a canonical shrinking,
// although requirements match the pass placement and it reduces code size too.
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  Register T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  Register X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  if (!TRI->isVGPR(*MRI, X))
    return nullptr;

  const unsigned SearchLimit = 16;
  unsigned Count = 0;
  bool KilledT = false;
  for (auto Iter = std::next(MovT.getIterator()),
            E = MovT.getParent()->instr_end();
       Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {

    MachineInstr *MovY = &*Iter;
    KilledT = MovY->killsRegister(T, TRI);

    if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY->getOpcode() != AMDGPU::COPY) ||
        !MovY->getOperand(1).isReg() ||
        MovY->getOperand(1).getReg() != T ||
        MovY->getOperand(1).getSubReg() != Tsub)
      continue;

    Register Y = MovY->getOperand(0).getReg();
    unsigned Ysub = MovY->getOperand(0).getSubReg();

    if (!TRI->isVGPR(*MRI, Y))
      continue;

    MachineInstr *MovX = nullptr;
    for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
         I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
          instModifiesReg(&*I, T, Tsub) ||
          (MovX && instModifiesReg(&*I, X, Xsub))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }

      if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
        continue;

      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);

    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I);
      Y1 = getSubRegForIndex(Y, Ysub, I);
      MachineBasicBlock &MBB = *MovT.getParent();
      auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                         TII->get(AMDGPU::V_SWAP_B32))
                     .addDef(X1.Reg, 0, X1.SubReg)
                     .addDef(Y1.Reg, 0, Y1.SubReg)
                     .addReg(Y1.Reg, 0, Y1.SubReg)
                     .addReg(X1.Reg, 0, X1.SubReg).getInstr();
      if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
        // Drop implicit EXEC.
        MIB->removeOperand(MIB->getNumExplicitOperands());
        MIB->copyImplicitOps(*MBB.getParent(), *MovX);
      }
    }
    MovX->eraseFromParent();
    dropInstructionKeepingImpDefs(*MovY);
    MachineInstr *Next = &*std::next(MovT.getIterator());

    if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
      dropInstructionKeepingImpDefs(MovT);
    } else {
      Xop.setIsKill(false);
      for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I) {
        unsigned OpNo = MovT.getNumExplicitOperands() + I;
        const MachineOperand &Op = MovT.getOperand(OpNo);
        if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
          MovT.removeOperand(OpNo);
      }
    }

    return Next;
  }

  return nullptr;
}

// If an instruction has a dead sdst, replace it with the NULL register on
// gfx1030+.
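// For example (illustrative registers), a carry-out that is never read,
//   v_add_co_u32 v0, s0, v1, v2   (s0 dead)
// can be rewritten as
//   v_add_co_u32 v0, null, v1, v2
// which frees the SGPR.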
bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
  if (!ST->hasGFX10_3Insts())
    return false;

  MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Op)
    return false;
  Register SDstReg = Op->getReg();
  if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg))
    return false;

  Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64);
  return true;
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  this->MF = &MF;
  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of
        // materializing sign bits.
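        // For example, "v_mov_b32 v0, 0x80000000" (the sign bit, which is
        // not an inline immediate) can become "v_bfrev_b32 v0, 1".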

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
          int32_t ModImm;
          unsigned ModOpcode =
              canModifyToInlineImmOp32(TII, Src, ModImm, /*Scalar=*/false);
          if (ModOpcode != 0) {
            MI.setDesc(TII->get(ModOpcode));
            Src.setImm(static_cast<int64_t>(ModImm));
            continue;
          }
        }
      }

      if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                            MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // Try to use S_ADDK_I32 and S_MULK_I32.
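      // For example, "s_add_i32 s0, s0, 0x1234" can become
      // "s_addk_i32 s0, 0x1234" (illustrative immediate; the destination must
      // end up tied to src0).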
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (Dest->getReg().isVirtual() && Src0->isReg()) {
          MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(*Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
                AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            Src1->setImm(SignExtend64(Src1->getImm(), 32));
            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
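      // For example, "s_mov_b32 s0, 0x1234" can become "s_movk_i32 s0, 0x1234"
      // (illustrative immediate), dropping the trailing 32-bit literal dword.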
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && Dst.getReg().isPhysical()) {
          unsigned ModOpc;
          int32_t ModImm;
          if (isKImmOperand(Src)) {
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
            Src.setImm(SignExtend64(Src.getImm(), 32));
          } else if ((ModOpc = canModifyToInlineImmOp32(TII, Src, ModImm,
                                                        /*Scalar=*/true))) {
            MI.setDesc(TII->get(ModOpc));
            Src.setImm(static_cast<int64_t>(ModImm));
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(MI))
          continue;
      }

      if (TII->isMIMG(MI.getOpcode()) &&
          ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->isVOP3(MI))
        continue;

      if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64) {
        shrinkMadFma(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
        // If there is no chance we will shrink it and use VCC as sdst to get
        // a 32 bit form try to replace dead sdst with NULL.
        tryReplaceDeadSDST(MI);
        continue;
      }

      if (!TII->canShrink(MI, *MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, *MRI)) {
          tryReplaceDeadSDST(MI);
          continue;
        }
      }

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        MachineOperand &Op0 = MI.getOperand(0);
        if (Op0.isReg()) {
          // Exclude VOPCX instructions as these don't explicitly write a
          // dst.
          Register DstReg = Op0.getReg();
          if (DstReg.isVirtual()) {
            // VOPC instructions can only write to the VCC register. We can't
            // force them to use VCC here, because this is only one register
            // and cannot deal with sequences which would require multiple
            // copies of VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
            //
            // So, instead of forcing the instruction to write to VCC, we
            // provide a hint to the register allocator to use VCC and then we
            // will run this pass again after RA and shrink it if it outputs to
            // VCC.
            MRI->setRegAllocationHint(DstReg, 0, VCCReg);
            continue;
          }
          if (DstReg != VCCReg)
            continue;
        }
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        Register SReg = Src2->getReg();
        if (SReg.isVirtual()) {
          MRI->setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      if (SDst) {
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (SDst->getReg().isVirtual())
            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                          AMDGPU::OpName::src2);
        if (Src2 && Src2->getReg() != VCCReg) {
          if (Src2->getReg().isVirtual())
            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // Pre-GFX10, shrinking VOP3 instructions pre-RA gave us the chance to
      // fold an immediate into the shrunk instruction as a literal operand. In
      // GFX10 VOP3 instructions can take a literal operand anyway, so there is
      // no advantage to doing this.
      if (ST->hasVOP3Literal() &&
          !MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs))
        continue;

      if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) &&
          !shouldShrinkTrue16(MI))
        continue;

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MI);

      // Copy deadness from the old explicit vcc def to the new implicit def.
      if (SDst && SDst->isDead())
        Inst32->findRegisterDefOperand(VCCReg, /*TRI=*/nullptr)->setIsDead();

      MI.eraseFromParent();
      foldImmediates(*Inst32);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}