GCNDPPCombine.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp]

1	//=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	// The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
9	// operand. If any of the use instructions cannot be combined with the mov, the
10	// whole sequence is reverted.
11	//
12	// $old = ...
13	// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
14	// dpp_controls..., $row_mask, $bank_mask, $bound_ctrl
15	// $res = VALU $dpp_value [, src1]
16	//
17	// to
18	//
19	// $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,]
20	// dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl
21	//
22	// Combining rules :
23	//
24	// if $row_mask and $bank_mask are fully enabled (0xF) and
25	// $bound_ctrl==DPP_BOUND_ZERO or $old==0
26	// -> $combined_old = undef,
27	// $combined_bound_ctrl = DPP_BOUND_ZERO
28	//
29	// if the VALU op is binary and
30	// $bound_ctrl==DPP_BOUND_OFF and
31	// $old==identity value (immediate) for the VALU op
32	// -> $combined_old = src1,
33	// $combined_bound_ctrl = DPP_BOUND_OFF
34	//
35	// Otherwise cancel.
36	//
37	// The mov_dpp instruction should reside in the same BB as all its uses
38	//===----------------------------------------------------------------------===//
39
#include "GCNDPPCombine.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include <limits>
47
48	using namespace llvm;
49
50	#define DEBUG_TYPE "gcn-dpp-combine"
51
52	STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");
53
54	namespace {
55
56	class GCNDPPCombine {
57	MachineRegisterInfo *MRI;
58	const SIInstrInfo *TII;
59	const GCNSubtarget *ST;
60
61	using RegSubRegPair = TargetInstrInfo::RegSubRegPair;
62
63	MachineOperand getOldOpndValue(MachineOperand &OldOpnd) const*;
64
65	MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI,
66	RegSubRegPair CombOldVGPR,
67	MachineOperand OldOpnd, bool* CombBCZ,
68	bool IsShrinkable) const;
69
70	MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI,
71	RegSubRegPair CombOldVGPR, bool CombBCZ,
72	bool IsShrinkable) const;
73
74	bool hasNoImmOrEqual(MachineInstr &MI, AMDGPU::OpName OpndName, int64_t Value,
75	int64_t Mask = -`1`) const;
76
77	bool combineDPPMov(MachineInstr &MI) const;
78
79	int getDPPOp(unsigned Op, bool IsShrinkable) const;
80	bool isShrinkable(MachineInstr &MI) const;
81
82	public:
83	bool run(MachineFunction &MF);
84	};
85
86	class GCNDPPCombineLegacy : public MachineFunctionPass {
87	public:
88	static char ID;
89
90	GCNDPPCombineLegacy() : MachineFunctionPass (ID) {}
91
92	bool runOnMachineFunction(MachineFunction &MF) override;
93
94	StringRef getPassName() const override { return "GCN DPP Combine"; }
95
96	void getAnalysisUsage(AnalysisUsage &AU) const override {
97	AU.setPreservesCFG();
98	MachineFunctionPass::getAnalysisUsage(AU);
99	}
100
101	MachineFunctionProperties getRequiredProperties() const override {
102	return MachineFunctionProperties ().setIsSSA();
103	}
104	};
105
106	} // end anonymous namespace
107
108	INITIALIZE_PASS(GCNDPPCombineLegacy, DEBUG_TYPE, "GCN DPP Combine", false,
109	false)
110
111	char GCNDPPCombineLegacy::ID = `0`;
112
113	char &llvm::GCNDPPCombineLegacyID = GCNDPPCombineLegacy::ID;
114
115	FunctionPass *llvm::createGCNDPPCombinePass() {
116	return new GCNDPPCombineLegacy ();
117	}
118
119	bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const {
120	unsigned Op = MI.getOpcode();
121	if (!TII->isVOP3(Opcode: Op)) {
122	return false;
123	}
124	if (!TII->hasVALU32BitEncoding(Opcode: Op)) {
125	LLVM_DEBUG(dbgs() << " Inst hasn't e32 equivalent\n");
126	return false;
127	}
128	// Do not shrink True16 instructions pre-RA to avoid the restriction in
129	// register allocation from only being able to use 128 VGPRs
130	if (AMDGPU::isTrue16Inst(Opc: Op))
131	return false;
132	if (const auto *SDst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst)) {
133	// Give up if there are any uses of the sdst in carry-out or VOPC.
134	// The shrunken form of the instruction would write it to vcc instead of to
135	// a virtual register. If we rewrote the uses the shrinking would be
136	// possible.
137	if (!MRI->use_nodbg_empty(RegNo: SDst->getReg()))
138	return false;
139	}
140	// check if other than abs\|neg modifiers are set (opsel for example)
141	const int64_t Mask = ~(SISrcMods::ABS \| SISrcMods::NEG);
142	if (!hasNoImmOrEqual(MI, OpndName: AMDGPU::OpName::src0_modifiers, Value: `0`, Mask) \|\|
143	!hasNoImmOrEqual(MI, OpndName: AMDGPU::OpName::src1_modifiers, Value: `0`, Mask) \|\|
144	!hasNoImmOrEqual(MI, OpndName: AMDGPU::OpName::clamp, Value: `0`) \|\|
145	!hasNoImmOrEqual(MI, OpndName: AMDGPU::OpName::omod, Value: `0`) \|\|
146	!hasNoImmOrEqual(MI, OpndName: AMDGPU::OpName::byte_sel, Value: `0`)) {
147	LLVM_DEBUG(dbgs() << " Inst has non-default modifiers\n");
148	return false;
149	}
150	return true;
151	}
152
153	int GCNDPPCombine::getDPPOp(unsigned Op, bool IsShrinkable) const {
154	int DPP32 = AMDGPU::getDPPOp32(Opcode: Op);
155	if (IsShrinkable) {
156	assert(DPP32 == -`1`);
157	int E32 = AMDGPU::getVOPe32(Opcode: Op);
158	DPP32 = (E32 == -`1`) ? -`1` : AMDGPU::getDPPOp32(Opcode: E32);
159	}
160	if (DPP32 != -`1` && TII->pseudoToMCOpcode(Opcode: DPP32) != -`1`)
161	return DPP32;
162	int DPP64 = -`1`;
163	if (ST->hasVOP3DPP())
164	DPP64 = AMDGPU::getDPPOp64(Opcode: Op);
165	if (DPP64 != -`1` && TII->pseudoToMCOpcode(Opcode: DPP64) != -`1`)
166	return DPP64;
167	return -`1`;
168	}
169
170	// tracks the register operand definition and returns:
171	// 1. immediate operand used to initialize the register if found
172	// 2. nullptr if the register operand is undef
173	// 3. the operand itself otherwise
174	MachineOperand GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const* {
175	auto Def = getVRegSubRegDef(P: getRegSubRegPair(O: OldOpnd), MRI: MRI);
176	if (!Def)
177	return nullptr;
178
179	switch(Def->getOpcode()) {
180	default: break;
181	case AMDGPU::IMPLICIT_DEF:
182	return nullptr;
183	case AMDGPU::COPY:
184	case AMDGPU::V_MOV_B32_e32:
185	case AMDGPU::V_MOV_B64_PSEUDO:
186	case AMDGPU::V_MOV_B64_e32:
187	case AMDGPU::V_MOV_B64_e64: {
188	auto &Op1 = Def->getOperand(i: `1`);
189	if (Op1.isImm())
190	return &Op1;
191	break;
192	}
193	}
194	return &OldOpnd;
195	}
196
197	MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
198	MachineInstr &MovMI,
199	RegSubRegPair CombOldVGPR,
200	bool CombBCZ,
201	bool IsShrinkable) const {
202	assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp \|\|
203	MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp \|\|
204	MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
205
206	bool HasVOP3DPP = ST->hasVOP3DPP();
207	auto OrigOp = OrigMI.getOpcode();
208	if (ST->useRealTrue16Insts() && AMDGPU::isTrue16Inst(Opc: OrigOp)) {
209	LLVM_DEBUG(
210	dbgs() << " failed: Did not expect any 16-bit uses of dpp values\n");
211	return nullptr;
212	}
213	auto DPPOp = getDPPOp(Op: OrigOp, IsShrinkable);
214	if (DPPOp == -`1`) {
215	LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n");
216	return nullptr;
217	}
218	int OrigOpE32 = AMDGPU::getVOPe32(Opcode: OrigOp);
219	// Prior checks cover Mask with VOPC condition, but not on purpose
220	auto *RowMaskOpnd = TII->getNamedOperand(MI&: MovMI, OperandName: AMDGPU::OpName::row_mask);
221	assert(RowMaskOpnd && RowMaskOpnd->isImm());
222	auto *BankMaskOpnd = TII->getNamedOperand(MI&: MovMI, OperandName: AMDGPU::OpName::bank_mask);
223	assert(BankMaskOpnd && BankMaskOpnd->isImm());
224	const bool MaskAllLanes =
225	RowMaskOpnd->getImm() == `0xF` && BankMaskOpnd->getImm() == `0xF`;
226	(void)MaskAllLanes;
227	assert((MaskAllLanes \|\|
228	!(TII->isVOPC(DPPOp) \|\| (TII->isVOP3(DPPOp) && OrigOpE32 != -`1` &&
229	TII->isVOPC(OrigOpE32)))) &&
230	"VOPC cannot form DPP unless mask is full");
231
232	auto DPPInst = BuildMI(BB&: *OrigMI.getParent(), I&: OrigMI,
233	MIMD: OrigMI.getDebugLoc(), MCID: TII->get(Opcode: DPPOp))
234	.setMIFlags(OrigMI.getFlags());
235
236	bool Fail = false;
237	do {
238	int NumOperands = `0`;
239	if (auto *Dst = TII->getNamedOperand(MI&: OrigMI, OperandName: AMDGPU::OpName::vdst)) {
240	DPPInst.add(MO: *Dst);
241	++NumOperands;
242	}
243	if (auto *SDst = TII->getNamedOperand(MI&: OrigMI, OperandName: AMDGPU::OpName::sdst)) {
244	if (AMDGPU::hasNamedOperand(Opcode: DPPOp, NamedIdx: AMDGPU::OpName::sdst)) {
245	DPPInst.add(MO: *SDst);
246	++NumOperands;
247	}
248	// If we shrunk a 64bit vop3b to 32bits, just ignore the sdst
249	}
250
251	const int OldIdx = AMDGPU::getNamedOperandIdx(Opcode: DPPOp, Name: AMDGPU::OpName::old);
252	if (OldIdx != -`1`) {
253	assert(OldIdx == NumOperands);
254	assert(isOfRegClass(
255	CombOldVGPR,
256	*MRI->getRegClass(
257	TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg()),
258	*MRI));
259	auto Def = getVRegSubRegDef(P: CombOldVGPR, MRI: MRI);
260	DPPInst.addReg(RegNo: CombOldVGPR.Reg, Flags: getUndefRegState(B: !Def),
261	SubReg: CombOldVGPR.SubReg);
262	++NumOperands;
263	} else if (TII->isVOPC(Opcode: DPPOp) \|\| (TII->isVOP3(Opcode: DPPOp) && OrigOpE32 != -`1` &&
264	TII->isVOPC(Opcode: OrigOpE32))) {
265	// VOPC DPP and VOPC promoted to VOP3 DPP do not have an old operand
266	// because they write to SGPRs not VGPRs
267	} else {
268	// TODO: this discards MAC/FMA instructions for now, let's add it later
269	LLVM_DEBUG(dbgs() << " failed: no old operand in DPP instruction,"
270	" TBD\n");
271	Fail = true;
272	break;
273	}
274
275	auto *Mod0 = TII->getNamedOperand(MI&: OrigMI, OperandName: AMDGPU::OpName::src0_modifiers);
276	if (Mod0) {
277	assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
278	AMDGPU::OpName::src0_modifiers));
279	assert(HasVOP3DPP \|\|
280	(`0LL` == (Mod0->getImm() & ~(SISrcMods::ABS \| SISrcMods::NEG))));
281	DPPInst.addImm(Val: Mod0->getImm());
282	++NumOperands;
283	} else if (AMDGPU::hasNamedOperand(Opcode: DPPOp, NamedIdx: AMDGPU::OpName::src0_modifiers)) {
284	DPPInst.addImm(Val: `0`);
285	++NumOperands;
286	}
287	auto *Src0 = TII->getNamedOperand(MI&: MovMI, OperandName: AMDGPU::OpName::src0);
288	assert(Src0);
289	[[maybe_unused]] int Src0Idx = NumOperands;
290
291	DPPInst.add(MO: *Src0);
292	DPPInst ->getOperand(i: NumOperands).setIsKill(false);
293	++NumOperands;
294
295	auto *Mod1 = TII->getNamedOperand(MI&: OrigMI, OperandName: AMDGPU::OpName::src1_modifiers);
296	if (Mod1) {
297	assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
298	AMDGPU::OpName::src1_modifiers));
299	assert(HasVOP3DPP \|\|
300	(`0LL` == (Mod1->getImm() & ~(SISrcMods::ABS \| SISrcMods::NEG))));
301	DPPInst.addImm(Val: Mod1->getImm());
302	++NumOperands;
303	} else if (AMDGPU::hasNamedOperand(Opcode: DPPOp, NamedIdx: AMDGPU::OpName::src1_modifiers)) {
304	DPPInst.addImm(Val: `0`);
305	++NumOperands;
306	}
307	auto *Src1 = TII->getNamedOperand(MI&: OrigMI, OperandName: AMDGPU::OpName::src1);
308	if (Src1) {
309	assert(AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src1) &&
310	"dpp version of instruction missing src1");
311	// If subtarget does not support SGPRs for src1 operand then the
312	// requirements are the same as for src0. We check src0 instead because
313	// pseudos are shared between subtargets and allow SGPR for src1 on all.
314	if (!ST->hasDPPSrc1SGPR()) {
315	assert(TII->getOpSize(*DPPInst, Src0Idx) ==
316	TII->getOpSize(*DPPInst, NumOperands) &&
317	"Src0 and Src1 operands should have the same size");
318	}
319
320	DPPInst.add(MO: *Src1);
321	++NumOperands;
322	}
323
324	auto *Mod2 = TII->getNamedOperand(MI&: OrigMI, OperandName: AMDGPU::OpName::src2_modifiers);
325	if (Mod2) {
326	assert(NumOperands ==
327	AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers));
328	assert(HasVOP3DPP \|\|
329	(`0LL` == (Mod2->getImm() & ~(SISrcMods::ABS \| SISrcMods::NEG))));
330	DPPInst.addImm(Val: Mod2->getImm());
331	++NumOperands;
332	}
333	auto *Src2 = TII->getNamedOperand(MI&: OrigMI, OperandName: AMDGPU::OpName::src2);
334	if (Src2) {
335	if (!AMDGPU::hasNamedOperand(Opcode: DPPOp, NamedIdx: AMDGPU::OpName::src2)) {
336	LLVM_DEBUG(dbgs() << " failed: dpp does not have src2\n");
337	Fail = true;
338	break;
339	}
340	DPPInst.add(MO: *Src2);
341	++NumOperands;
342	}
343
344	if (HasVOP3DPP) {
345	auto *ClampOpr = TII->getNamedOperand(MI&: OrigMI, OperandName: AMDGPU::OpName::clamp);
346	if (ClampOpr && AMDGPU::hasNamedOperand(Opcode: DPPOp, NamedIdx: AMDGPU::OpName::clamp)) {
347	DPPInst.addImm(Val: ClampOpr->getImm());
348	}
349	auto *VdstInOpr = TII->getNamedOperand(MI&: OrigMI, OperandName: AMDGPU::OpName::vdst_in);
350	if (VdstInOpr &&
351	AMDGPU::hasNamedOperand(Opcode: DPPOp, NamedIdx: AMDGPU::OpName::vdst_in)) {
352	DPPInst.add(MO: *VdstInOpr);
353	}
354	auto *OmodOpr = TII->getNamedOperand(MI&: OrigMI, OperandName: AMDGPU::OpName::omod);
355	if (OmodOpr && AMDGPU::hasNamedOperand(Opcode: DPPOp, NamedIdx: AMDGPU::OpName::omod)) {
356	DPPInst.addImm(Val: OmodOpr->getImm());
357	}
358	// Validate OP_SEL has to be set to all 0 and OP_SEL_HI has to be set to
359	// all 1.
360	if (TII->getNamedOperand(MI&: OrigMI, OperandName: AMDGPU::OpName::op_sel)) {
361	int64_t OpSel = `0`;
362	OpSel \|= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_0) << `0`) : `0`);
363	OpSel \|= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_0) << `1`) : `0`);
364	OpSel \|= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_0) << `2`) : `0`);
365	if (Mod0 && TII->isVOP3(MI: OrigMI) && !TII->isVOP3P(MI: OrigMI))
366	OpSel \|= !!(Mod0->getImm() & SISrcMods::DST_OP_SEL) << `3`;
367
368	if (OpSel != `0`) {
369	LLVM_DEBUG(dbgs() << " failed: op_sel must be zero\n");
370	Fail = true;
371	break;
372	}
373	if (AMDGPU::hasNamedOperand(Opcode: DPPOp, NamedIdx: AMDGPU::OpName::op_sel))
374	DPPInst.addImm(Val: OpSel);
375	}
376	if (TII->getNamedOperand(MI&: OrigMI, OperandName: AMDGPU::OpName::op_sel_hi)) {
377	int64_t OpSelHi = `0`;
378	OpSelHi \|= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_1) << `0`) : `0`);
379	OpSelHi \|= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_1) << `1`) : `0`);
380	OpSelHi \|= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_1) << `2`) : `0`);
381
382	// Only vop3p has op_sel_hi, and all vop3p have 3 operands, so check
383	// the bitmask for 3 op_sel_hi bits set
384	assert(Src2 && "Expected vop3p with 3 operands");
385	if (OpSelHi != `7`) {
386	LLVM_DEBUG(dbgs() << " failed: op_sel_hi must be all set to one\n");
387	Fail = true;
388	break;
389	}
390	if (AMDGPU::hasNamedOperand(Opcode: DPPOp, NamedIdx: AMDGPU::OpName::op_sel_hi))
391	DPPInst.addImm(Val: OpSelHi);
392	}
393	auto *NegOpr = TII->getNamedOperand(MI&: OrigMI, OperandName: AMDGPU::OpName::neg_lo);
394	if (NegOpr && AMDGPU::hasNamedOperand(Opcode: DPPOp, NamedIdx: AMDGPU::OpName::neg_lo)) {
395	DPPInst.addImm(Val: NegOpr->getImm());
396	}
397	auto *NegHiOpr = TII->getNamedOperand(MI&: OrigMI, OperandName: AMDGPU::OpName::neg_hi);
398	if (NegHiOpr && AMDGPU::hasNamedOperand(Opcode: DPPOp, NamedIdx: AMDGPU::OpName::neg_hi)) {
399	DPPInst.addImm(Val: NegHiOpr->getImm());
400	}
401	auto *ByteSelOpr = TII->getNamedOperand(MI&: OrigMI, OperandName: AMDGPU::OpName::byte_sel);
402	if (ByteSelOpr &&
403	AMDGPU::hasNamedOperand(Opcode: DPPOp, NamedIdx: AMDGPU::OpName::byte_sel)) {
404	DPPInst.addImm(Val: ByteSelOpr->getImm());
405	}
406	if (MachineOperand *BitOp3 =
407	TII->getNamedOperand(MI&: OrigMI, OperandName: AMDGPU::OpName::bitop3)) {
408	assert(AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::bitop3));
409	DPPInst.add(MO: *BitOp3);
410	}
411	}
412	DPPInst.add(MO: *TII->getNamedOperand(MI&: MovMI, OperandName: AMDGPU::OpName::dpp_ctrl));
413	DPPInst.add(MO: *TII->getNamedOperand(MI&: MovMI, OperandName: AMDGPU::OpName::row_mask));
414	DPPInst.add(MO: *TII->getNamedOperand(MI&: MovMI, OperandName: AMDGPU::OpName::bank_mask));
415	DPPInst.addImm(Val: CombBCZ ? `1` : `0`);
416
417	constexpr AMDGPU::OpName Srcs[] = {
418	AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
419
420	// FIXME: isOperandLegal expects to operate on an completely built
421	// instruction. We should have better legality APIs to check if the
422	// candidate operands will be legal without building the instruction first.
423	for (auto [I, OpName] : enumerate(First: Srcs)) {
424	int OpIdx = AMDGPU::getNamedOperandIdx(Opcode: DPPOp, Name: OpName);
425	if (OpIdx == -`1`)
426	break;
427
428	if (!TII->isOperandLegal(MI: *DPPInst, OpIdx)) {
429	LLVM_DEBUG(dbgs() << " failed: src" << I << " operand is illegal\n");
430	Fail = true;
431	break;
432	}
433	}
434	} while (false);
435
436	if (Fail) {
437	DPPInst.getInstr()->eraseFromParent();
438	return nullptr;
439	}
440	LLVM_DEBUG(dbgs() << " combined: " << *DPPInst.getInstr());
441	return DPPInst.getInstr();
442	}
443
444	static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
445	assert(OldOpnd->isImm());
446	switch (OrigMIOp) {
447	default: break;
448	case AMDGPU::V_ADD_U32_e32:
449	case AMDGPU::V_ADD_U32_e64:
450	case AMDGPU::V_ADD_CO_U32_e32:
451	case AMDGPU::V_ADD_CO_U32_e64:
452	case AMDGPU::V_OR_B32_e32:
453	case AMDGPU::V_OR_B32_e64:
454	case AMDGPU::V_SUBREV_U32_e32:
455	case AMDGPU::V_SUBREV_U32_e64:
456	case AMDGPU::V_SUBREV_CO_U32_e32:
457	case AMDGPU::V_SUBREV_CO_U32_e64:
458	case AMDGPU::V_MAX_U32_e32:
459	case AMDGPU::V_MAX_U32_e64:
460	case AMDGPU::V_XOR_B32_e32:
461	case AMDGPU::V_XOR_B32_e64:
462	if (OldOpnd->getImm() == `0`)
463	return true;
464	break;
465	case AMDGPU::V_AND_B32_e32:
466	case AMDGPU::V_AND_B32_e64:
467	case AMDGPU::V_MIN_U32_e32:
468	case AMDGPU::V_MIN_U32_e64:
469	if (static_cast<uint32_t>(OldOpnd->getImm()) ==
470	std::numeric_limits<uint32_t>::max())
471	return true;
472	break;
473	case AMDGPU::V_MIN_I32_e32:
474	case AMDGPU::V_MIN_I32_e64:
475	if (static_cast<int32_t>(OldOpnd->getImm()) ==
476	std::numeric_limits<int32_t>::max())
477	return true;
478	break;
479	case AMDGPU::V_MAX_I32_e32:
480	case AMDGPU::V_MAX_I32_e64:
481	if (static_cast<int32_t>(OldOpnd->getImm()) ==
482	std::numeric_limits<int32_t>::min())
483	return true;
484	break;
485	case AMDGPU::V_MUL_I32_I24_e32:
486	case AMDGPU::V_MUL_I32_I24_e64:
487	case AMDGPU::V_MUL_U32_U24_e32:
488	case AMDGPU::V_MUL_U32_U24_e64:
489	if (OldOpnd->getImm() == `1`)
490	return true;
491	break;
492	}
493	return false;
494	}
495
496	MachineInstr *GCNDPPCombine::createDPPInst(
497	MachineInstr &OrigMI, MachineInstr &MovMI, RegSubRegPair CombOldVGPR,
498	MachineOperand OldOpndValue, bool* CombBCZ, bool IsShrinkable) const {
499	assert(CombOldVGPR.Reg);
500	if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) {
501	auto *Src1 = TII->getNamedOperand(MI&: OrigMI, OperandName: AMDGPU::OpName::src1);
502	if (!Src1 \|\| !Src1->isReg()) {
503	LLVM_DEBUG(dbgs() << " failed: no src1 or it isn't a register\n");
504	return nullptr;
505	}
506	if (!isIdentityValue(OrigMIOp: OrigMI.getOpcode(), OldOpnd: OldOpndValue)) {
507	LLVM_DEBUG(dbgs() << " failed: old immediate isn't an identity\n");
508	return nullptr;
509	}
510	CombOldVGPR = getRegSubRegPair(O: *Src1);
511	auto *MovDst = TII->getNamedOperand(MI&: MovMI, OperandName: AMDGPU::OpName::vdst);
512	const TargetRegisterClass *RC = MRI->getRegClass(Reg: MovDst->getReg());
513	if (!isOfRegClass(P: CombOldVGPR, TRC: RC, MRI&: MRI)) {
514	LLVM_DEBUG(dbgs() << " failed: src1 has wrong register class\n");
515	return nullptr;
516	}
517	}
518	return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ, IsShrinkable);
519	}
520
521	// returns true if MI doesn't have OpndName immediate operand or the
522	// operand has Value
523	bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, AMDGPU::OpName OpndName,
524	int64_t Value, int64_t Mask) const {
525	auto *Imm = TII->getNamedOperand(MI, OperandName: OpndName);
526	if (!Imm)
527	return true;
528
529	assert(Imm->isImm());
530	return (Imm->getImm() & Mask) == Value;
531	}
532
533	bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
534	assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp \|\|
535	MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp \|\|
536	MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
537	LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);
538
539	auto *DstOpnd = TII->getNamedOperand(MI&: MovMI, OperandName: AMDGPU::OpName::vdst);
540	assert(DstOpnd && DstOpnd->isReg());
541	auto DPPMovReg = DstOpnd->getReg();
542	if (DPPMovReg.isPhysical()) {
543	LLVM_DEBUG(dbgs() << " failed: dpp move writes physreg\n");
544	return false;
545	}
546	if (execMayBeModifiedBeforeAnyUse(MRI: *MRI, VReg: DPPMovReg, DefMI: MovMI)) {
547	LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"
548	" for all uses\n");
549	return false;
550	}
551
552	auto *DppCtrl = TII->getNamedOperand(MI&: MovMI, OperandName: AMDGPU::OpName::dpp_ctrl);
553	assert(DppCtrl && DppCtrl->isImm());
554	unsigned DppCtrlVal = DppCtrl->getImm();
555	if ((MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO \|\|
556	MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp)) {
557	if (!ST->hasFeature(Feature: AMDGPU::FeatureDPALU_DPP)) {
558	LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move is unsupported\n");
559	// Split it.
560	return false;
561	}
562	if (!AMDGPU::isLegalDPALU_DPPControl(ST: *ST, DC: DppCtrlVal)) {
563	LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move uses unsupported"
564	" control value\n");
565	// Let it split, then control may become legal.
566	return false;
567	}
568	}
569
570	auto *RowMaskOpnd = TII->getNamedOperand(MI&: MovMI, OperandName: AMDGPU::OpName::row_mask);
571	assert(RowMaskOpnd && RowMaskOpnd->isImm());
572	auto *BankMaskOpnd = TII->getNamedOperand(MI&: MovMI, OperandName: AMDGPU::OpName::bank_mask);
573	assert(BankMaskOpnd && BankMaskOpnd->isImm());
574	const bool MaskAllLanes = RowMaskOpnd->getImm() == `0xF` &&
575	BankMaskOpnd->getImm() == `0xF`;
576
577	auto *BCZOpnd = TII->getNamedOperand(MI&: MovMI, OperandName: AMDGPU::OpName::bound_ctrl);
578	assert(BCZOpnd && BCZOpnd->isImm());
579	bool BoundCtrlZero = BCZOpnd->getImm();
580
581	auto *OldOpnd = TII->getNamedOperand(MI&: MovMI, OperandName: AMDGPU::OpName::old);
582	auto *SrcOpnd = TII->getNamedOperand(MI&: MovMI, OperandName: AMDGPU::OpName::src0);
583	assert(OldOpnd && OldOpnd->isReg());
584	assert(SrcOpnd && SrcOpnd->isReg());
585	if (OldOpnd->getReg().isPhysical() \|\| SrcOpnd->getReg().isPhysical()) {
586	LLVM_DEBUG(dbgs() << " failed: dpp move reads physreg\n");
587	return false;
588	}
589
590	auto * const OldOpndValue = getOldOpndValue(OldOpnd&: *OldOpnd);
591	// OldOpndValue is either undef (IMPLICIT_DEF) or immediate or something else
592	// We could use: assert(!OldOpndValue \|\| OldOpndValue->isImm())
593	// but the third option is used to distinguish undef from non-immediate
594	// to reuse IMPLICIT_DEF instruction later
595	assert(!OldOpndValue \|\| OldOpndValue->isImm() \|\| OldOpndValue == OldOpnd);
596
597	bool CombBCZ = false;
598
599	if (MaskAllLanes && BoundCtrlZero) { // [1]
600	CombBCZ = true;
601	} else {
602	if (!OldOpndValue \|\| !OldOpndValue->isImm()) {
603	LLVM_DEBUG(dbgs() << " failed: the DPP mov isn't combinable\n");
604	return false;
605	}
606
607	if (OldOpndValue->getImm() == `0`) {
608	if (MaskAllLanes) {
609	assert(!BoundCtrlZero); // by check [1]
610	CombBCZ = true;
611	}
612	} else if (BoundCtrlZero) {
613	assert(!MaskAllLanes); // by check [1]
614	LLVM_DEBUG(dbgs() <<
615	" failed: old!=0 and bctrl:0 and not all lanes isn't combinable\n");
616	return false;
617	}
618	}
619
620	LLVM_DEBUG(dbgs() << " old=";
621	if (!OldOpndValue)
622	dbgs() << "undef";
623	else
624	dbgs() << *OldOpndValue;
625	dbgs() << ", bound_ctrl=" << CombBCZ << `'\n'`);
626
627	SmallVector<MachineInstr*, `4`> OrigMIs, DPPMIs;
628	DenseMap<MachineInstr, SmallVector<unsigned*, `4`>> RegSeqWithOpNos;
629	auto CombOldVGPR = getRegSubRegPair(O: *OldOpnd);
630	// try to reuse previous old reg if its undefined (IMPLICIT_DEF)
631	if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
632	const TargetRegisterClass *RC = MRI->getRegClass(Reg: DPPMovReg);
633	CombOldVGPR = RegSubRegPair (
634	MRI->createVirtualRegister(RegClass: RC));
635	auto UndefInst = BuildMI(BB&: *MovMI.getParent(), I&: MovMI, MIMD: MovMI.getDebugLoc(),
636	MCID: TII->get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: CombOldVGPR.Reg);
637	DPPMIs.push_back(Elt: UndefInst.getInstr());
638	}
639
640	OrigMIs.push_back(Elt: &MovMI);
641	bool Rollback = true;
642	SmallVector<MachineOperand *, `16`> Uses(
643	llvm::make_pointer_range(Range: MRI->use_nodbg_operands(Reg: DPPMovReg)));
644
645	while (!Uses.empty()) {
646	MachineOperand *Use = Uses.pop_back_val();
647	Rollback = true;
648
649	auto &OrigMI = *Use->getParent();
650	LLVM_DEBUG(dbgs() << " try: " << OrigMI);
651
652	auto OrigOp = OrigMI.getOpcode();
653	assert((TII->get(OrigOp).getSize() != `4` \|\| !AMDGPU::isTrue16Inst(OrigOp)) &&
654	"There should not be e32 True16 instructions pre-RA");
655	if (OrigOp == AMDGPU::REG_SEQUENCE) {
656	Register FwdReg = OrigMI.getOperand(i: `0`).getReg();
657	unsigned FwdSubReg = `0`;
658
659	if (execMayBeModifiedBeforeAnyUse(MRI: *MRI, VReg: FwdReg, DefMI: OrigMI)) {
660	LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"
661	" for all uses\n");
662	break;
663	}
664
665	unsigned OpNo, E = OrigMI.getNumOperands();
666	for (OpNo = `1`; OpNo < E; OpNo += `2`) {
667	if (OrigMI.getOperand(i: OpNo).getReg() == DPPMovReg) {
668	FwdSubReg = OrigMI.getOperand(i: OpNo + `1`).getImm();
669	break;
670	}
671	}
672
673	if (!FwdSubReg)
674	break;
675
676	for (auto &Op : MRI->use_nodbg_operands(Reg: FwdReg)) {
677	if (Op.getSubReg() == FwdSubReg)
678	Uses.push_back(Elt: &Op);
679	}
680	RegSeqWithOpNos [&OrigMI].push_back(Elt: OpNo);
681	continue;
682	}
683
684	bool IsShrinkable = isShrinkable(MI&: OrigMI);
685	if (!(IsShrinkable \|\|
686	((TII->isVOP3P(Opcode: OrigOp) \|\| TII->isVOPC(Opcode: OrigOp) \|\|
687	TII->isVOP3(Opcode: OrigOp)) &&
688	ST->hasVOP3DPP()) \|\|
689	TII->isVOP1(Opcode: OrigOp) \|\| TII->isVOP2(Opcode: OrigOp))) {
690	LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3/3P/C\n");
691	break;
692	}
693	if (OrigMI.modifiesRegister(Reg: AMDGPU::EXEC, TRI: ST->getRegisterInfo())) {
694	LLVM_DEBUG(dbgs() << " failed: can't combine v_cmpx\n");
695	break;
696	}
697
698	auto *Src0 = TII->getNamedOperand(MI&: OrigMI, OperandName: AMDGPU::OpName::src0);
699	auto *Src1 = TII->getNamedOperand(MI&: OrigMI, OperandName: AMDGPU::OpName::src1);
700	if (Use != Src0 && !(Use == Src1 && OrigMI.isCommutable())) { // [1]
701	LLVM_DEBUG(dbgs() << " failed: no suitable operands\n");
702	break;
703	}
704
705	auto *Src2 = TII->getNamedOperand(MI&: OrigMI, OperandName: AMDGPU::OpName::src2);
706	assert(Src0 && "Src1 without Src0?");
707	if ((Use == Src0 && ((Src1 && Src1->isIdenticalTo(Other: *Src0)) \|\|
708	(Src2 && Src2->isIdenticalTo(Other: *Src0)))) \|\|
709	(Use == Src1 && (Src1->isIdenticalTo(Other: *Src0) \|\|
710	(Src2 && Src2->isIdenticalTo(Other: *Src1))))) {
711	LLVM_DEBUG(
712	dbgs()
713	<< " " << OrigMI
714	<< " failed: DPP register is used more than once per instruction\n");
715	break;
716	}
717
718	if (!ST->hasFeature(Feature: AMDGPU::FeatureDPALU_DPP) &&
719	AMDGPU::isDPALU_DPP32BitOpc(Opc: OrigOp)) {
720	LLVM_DEBUG(dbgs() << " " << OrigMI
721	<< " failed: DPP ALU DPP is not supported\n");
722	break;
723	}
724
725	if (!AMDGPU::isLegalDPALU_DPPControl(ST: *ST, DC: DppCtrlVal) &&
726	AMDGPU::isDPALU_DPP(OpDesc: TII->get(Opcode: OrigOp), MII: TII, ST: ST)) {
727	LLVM_DEBUG(dbgs() << " " << OrigMI
728	<< " failed: not valid 64-bit DPP control value\n");
729	break;
730	}
731
732	LLVM_DEBUG(dbgs() << " combining: " << OrigMI);
733	if (Use == Src0) {
734	if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
735	OldOpndValue, CombBCZ, IsShrinkable)) {
736	DPPMIs.push_back(Elt: DPPInst);
737	Rollback = false;
738	}
739	} else {
740	assert(Use == Src1 && OrigMI.isCommutable()); // by check [1]
741	auto *BB = OrigMI.getParent();
742	auto *NewMI = BB->getParent()->CloneMachineInstr(Orig: &OrigMI);
743	BB->insert(I: OrigMI, MI: NewMI);
744	if (TII->commuteInstruction(MI&: *NewMI)) {
745	LLVM_DEBUG(dbgs() << " commuted: " << *NewMI);
746	if (auto *DPPInst =
747	createDPPInst(OrigMI&: *NewMI, MovMI, CombOldVGPR, OldOpndValue, CombBCZ,
748	IsShrinkable)) {
749	DPPMIs.push_back(Elt: DPPInst);
750	Rollback = false;
751	}
752	} else
753	LLVM_DEBUG(dbgs() << " failed: cannot be commuted\n");
754	NewMI->eraseFromParent();
755	}
756	if (Rollback)
757	break;
758	OrigMIs.push_back(Elt: &OrigMI);
759	}
760
761	Rollback \|= !Uses.empty();
762
763	for (auto MI : (Rollback? &DPPMIs : &OrigMIs))
764	MI->eraseFromParent();
765
766	if (!Rollback) {
767	for (auto &S : RegSeqWithOpNos) {
768	if (MRI->use_nodbg_empty(RegNo: S.first->getOperand(i: `0`).getReg())) {
769	S.first->eraseFromParent();
770	continue;
771	}
772	while (!S.second.empty())
773	S.first->getOperand(i: S.second.pop_back_val()).setIsUndef();
774	}
775	}
776
777	return !Rollback;
778	}
779
780	bool GCNDPPCombineLegacy::runOnMachineFunction(MachineFunction &MF) {
781	if (skipFunction(F: MF.getFunction()))
782	return false;
783
784	return GCNDPPCombine ().run(MF);
785	}
786
787	bool GCNDPPCombine::run(MachineFunction &MF) {
788	ST = &MF.getSubtarget<GCNSubtarget>();
789	if (!ST->hasDPP())
790	return false;
791
792	MRI = &MF.getRegInfo();
793	TII = ST->getInstrInfo();
794
795	bool Changed = false;
796	for (auto &MBB : MF) {
797	for (MachineInstr &MI : llvm::make_early_inc_range(Range: llvm::reverse(C&: MBB))) {
798	if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MovMI&: MI)) {
799	Changed = true;
800	++NumDPPMovsCombined;
801	} else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO \|\|
802	MI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
803	if (ST->hasDPALU_DPP() && combineDPPMov(MovMI&: MI)) {
804	Changed = true;
805	++NumDPPMovsCombined;
806	} else {
807	auto Split = TII->expandMovDPP64(MI);
808	for (auto *M : {Split.first, Split.second}) {
809	if (M && combineDPPMov(MovMI&: *M))
810	++NumDPPMovsCombined;
811	}
812	Changed = true;
813	}
814	}
815	}
816	}
817	return Changed;
818	}
819
820	PreservedAnalyses GCNDPPCombinePass::run(MachineFunction &MF,
821	MachineFunctionAnalysisManager &) {
822	MFPropsModifier _(*this, MF);
823
824	if (MF.getFunction().hasOptNone())
825	return PreservedAnalyses::all();
826
827	bool Changed = GCNDPPCombine ().run(MF);
828	if (!Changed)
829	return PreservedAnalyses::all();
830
831	auto PA = getMachineFunctionPassPreservedAnalyses();
832	PA.preserveSet<CFGAnalyses>();
833	return PA;
834	}
835

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp