//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %0, 16, %1
///   V_ADD_CO_U32_e32 %2, %0, %3
///   V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace:
///   V_ADD_CO_U32_sdwa %4, %1, %3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//

#include "SIPeepholeSDWA.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
                         const SIInstrInfo *TII);
class SDWAOperand;
class SDWADstOperand;

using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>;

class SIPeepholeSDWA {
private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  SDWAOperandsMap PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  std::optional<int64_t> foldToImm(const MachineOperand &Op) const;

  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  void pseudoOpConvertToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const;
  void convertVcndmaskToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const;
  MachineInstr *createSDWAVersion(MachineInstr &MI);
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

public:
  bool run(MachineFunction &MF);
};

class SIPeepholeSDWALegacy : public MachineFunctionPass {
public:
  static char ID;

  SIPeepholeSDWALegacy() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

using namespace AMDGPU::SDWA;

class SDWAOperand {
private:
  MachineOperand *Target;   // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

  /// Returns true iff the SDWA selection of this SDWAOperand can be combined
  /// with the SDWA selections of its uses in \p MI.
  virtual bool canCombineSelections(const MachineInstr &MI,
                                    const SIInstrInfo *TII) = 0;

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  virtual MachineInstr *
  potentialToConvert(const SIInstrInfo *TII, const GCNSubtarget &ST,
                     SDWAOperandsMap *PotentialMatches = nullptr) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream &OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};

class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
        Neg(Neg_), Sext(Sext_) {}

  MachineInstr *
  potentialToConvert(const SIInstrInfo *TII, const GCNSubtarget &ST,
                     SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS) const override;
#endif
};

class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *
  potentialToConvert(const SIInstrInfo *TII, const GCNSubtarget &ST,
                     SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS) const override;
#endif
};

class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS) const override;
#endif
};

} // end anonymous namespace

INITIALIZE_PASS(SIPeepholeSDWALegacy, DEBUG_TYPE, "SI Peephole SDWA", false,
                false)

char SIPeepholeSDWALegacy::ID = 0;

char &llvm::SIPeepholeSDWALegacyID = SIPeepholeSDWALegacy::ID;

FunctionPass *llvm::createSIPeepholeSDWALegacyPass() {
  return new SIPeepholeSDWALegacy();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream &operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch (Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD:  OS << "DWORD";  break;
  }
  return OS;
}

static raw_ostream &operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch (Un) {
  case UNUSED_PAD:      OS << "UNUSED_PAD";      break;
  case UNUSED_SEXT:     OS << "UNUSED_SEXT";     break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream &OS) const {
  OS << "SDWA src: " << *getTargetOperand()
     << " src_sel:" << getSrcSel()
     << " abs:" << getAbs() << " neg:" << getNeg()
     << " sext:" << getSext() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream &OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " dst_unused:" << getDstUnused() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream &OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " preserve:" << *getPreservedOperand() << '\n';
}

#endif

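/// Copy the register, subregister and the status flags that are relevant for
/// the operand kind (undef, plus kill for uses or dead for defs) from \p From
/// to \p To. Both operands must be registers.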
static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

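/// If the register defined by \p Reg is read by exactly one instruction and
/// never through a subregister, return the first such use operand; otherwise
/// return nullptr. \p Reg itself must be a register def operand.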
static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  MachineOperand *ResMO = nullptr;
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    // If there exists a use of a subreg of Reg then return nullptr.
    if (!isSameReg(UseMO, *Reg))
      return nullptr;

    // Check that there is only one instruction that uses Reg.
    if (!ResMO) {
      ResMO = &UseMO;
    } else if (ResMO->getParent() != UseMO.getParent()) {
      return nullptr;
    }
  }

  return ResMO;
}

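/// Return the explicit def operand of the unique instruction that defines
/// the register read by \p Reg, or nullptr if there is no unique defining
/// instruction or the register is only defined implicitly there.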
static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
  if (!DefInstr)
    return nullptr;

  for (auto &DefMO : DefInstr->defs()) {
    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
      return &DefMO;
  }

  // Ignore implicit defs.
  return nullptr;
}

/// Combine an SDWA instruction's existing SDWA selection \p Sel with
/// the SDWA selection \p OperandSel of its operand. If the selections
/// are compatible, return the combined selection, otherwise return
/// std::nullopt.
/// For example, if we have Sel = BYTE_0 and OperandSel = WORD_1:
///   BYTE_0 Sel (WORD_1 Sel (%X)) -> BYTE_2 Sel (%X)
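/// Further examples, following the combination rules implemented below:
///   WORD_0 Sel (WORD_1 Sel (%X)) -> WORD_1 Sel (%X)
///   BYTE_1 Sel (WORD_1 Sel (%X)) -> BYTE_3 Sel (%X)
///   BYTE_0 Sel (BYTE_1 Sel (%X)) -> incompatible, std::nullopt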
static std::optional<SdwaSel> combineSdwaSel(SdwaSel Sel, SdwaSel OperandSel) {
  if (Sel == SdwaSel::DWORD)
    return OperandSel;

  if (Sel == OperandSel || OperandSel == SdwaSel::DWORD)
    return Sel;

  if (Sel == SdwaSel::WORD_1 || Sel == SdwaSel::BYTE_2 ||
      Sel == SdwaSel::BYTE_3)
    return {};

  if (OperandSel == SdwaSel::WORD_0)
    return Sel;

  if (OperandSel == SdwaSel::WORD_1) {
    if (Sel == SdwaSel::BYTE_0)
      return SdwaSel::BYTE_2;
    if (Sel == SdwaSel::BYTE_1)
      return SdwaSel::BYTE_3;
    if (Sel == SdwaSel::WORD_0)
      return SdwaSel::WORD_1;
  }

  return {};
}

uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}

MachineInstr *
SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches) {
  if (PotentialMatches != nullptr) {
    // Fill out the map for all uses if all can be converted.
    MachineOperand *Reg = getReplacedOperand();
    if (!Reg->isReg() || !Reg->isDef())
      return nullptr;

    for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
      // Check that all instructions that use Reg can be converted.
      if (!isConvertibleToSDWA(UseMI, ST, TII) ||
          !canCombineSelections(UseMI, TII))
        return nullptr;

    // Now that it's guaranteed all uses are legal, iterate over the uses again
    // to add them for later conversion.
    for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
      // Should not get a subregister here.
      assert(isSameReg(UseMO, *Reg));

      SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
      MachineInstr *UseMI = UseMO.getParent();
      potentialMatchesMap[UseMI].push_back(this);
    }
    return nullptr;
  }

  // For an SDWA src operand, the potential instruction is the one that uses
  // the register defined by the parent instruction.
  MachineOperand *PotentialMO =
      findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  MachineInstr *Parent = PotentialMO->getParent();

  return canCombineSelections(*Parent, TII) ? Parent : nullptr;
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_CVT_F32_FP8_sdwa:
  case AMDGPU::V_CVT_F32_BF8_sdwa:
  case AMDGPU::V_CVT_PK_F32_FP8_sdwa:
  case AMDGPU::V_CVT_PK_F32_BF8_sdwa:
    // Does not support input modifiers: noabs, noneg, nosext.
    return false;
  case AMDGPU::V_CNDMASK_B32_sdwa:
    // SISrcMods uses the same bitmask for SEXT and NEG modifiers and
    // hence the compiler can only support one type of modifier for
    // each SDWA instruction. For V_CNDMASK_B32_sdwa, this is NEG
    // since its operands get printed using
    // AMDGPUInstPrinter::printOperandAndFPInputMods which produces
    // the output intended for NEG if SEXT is set.
    //
    // The ISA does actually support both modifiers on most SDWA
    // instructions.
    //
    // FIXME: Accept SEXT here after fixing this issue.
    if (Sext)
      return false;
    break;
  }

  // Find the operand in the instruction that matches the source operand,
  // replace it with the target operand, and set the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1.
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src || !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for UNUSED_PRESERVE, in
      // which case we can either abandon the peephole attempt, or, if legal,
      // copy the target operand into the tied slot if the preserve operation
      // will effectively cause the same result by overwriting the rest of
      // the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
          TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src.
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply the src
      // operand to src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SdwaSel ExistingSel = static_cast<SdwaSel>(SrcSel->getImm());
    SrcSel->setImm(*combineSdwaSel(ExistingSel, getSrcSel()));
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}

/// Verify that the SDWA selection operand \p SrcSelOpName of the SDWA
/// instruction \p MI can be combined with the selection \p OpSel.
static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
                            AMDGPU::OpName SrcSelOpName, SdwaSel OpSel) {
  assert(TII->isSDWA(MI.getOpcode()));

  const MachineOperand *SrcSelOp = TII->getNamedOperand(MI, SrcSelOpName);
  SdwaSel SrcSel = static_cast<SdwaSel>(SrcSelOp->getImm());

  return combineSdwaSel(SrcSel, OpSel).has_value();
}

/// Verify that \p Op is the same register as the operand of the SDWA
/// instruction \p MI named by \p SrcOpName and that the SDWA
/// selection \p SrcSelOpName can be combined with \p OpSel.
static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
                            AMDGPU::OpName SrcOpName,
                            AMDGPU::OpName SrcSelOpName, MachineOperand *Op,
                            SdwaSel OpSel) {
  assert(TII->isSDWA(MI.getOpcode()));

  const MachineOperand *Src = TII->getNamedOperand(MI, SrcOpName);
  if (!Src || !isSameReg(*Src, *Op))
    return true;

  return canCombineOpSel(MI, TII, SrcSelOpName, OpSel);
}

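// The replaced register may be read by MI as both src0 and src1, so this
// operand's selection has to be combinable with the existing selection of
// every source slot it occupies.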
bool SDWASrcOperand::canCombineSelections(const MachineInstr &MI,
                                          const SIInstrInfo *TII) {
  if (!TII->isSDWA(MI.getOpcode()))
    return true;

  using namespace AMDGPU;

  return canCombineOpSel(MI, TII, OpName::src0, OpName::src0_sel,
                         getReplacedOperand(), getSrcSel()) &&
         canCombineOpSel(MI, TII, OpName::src1, OpName::src1_sel,
                         getReplacedOperand(), getSrcSel());
}

MachineInstr *
SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches) {
  // For an SDWA dst operand, the potential instruction is the one that
  // defines the register that this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced
  // register.
  for (MachineInstr &UseInst :
       MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  MachineInstr *Parent = PotentialMO->getParent();
  return canCombineSelections(*Parent, TII) ? Parent : nullptr;
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace the vdst operand in MI with the target operand. Set dst_sel and
  // dst_unused.

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD.
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);

  SdwaSel ExistingSel = static_cast<SdwaSel>(DstSel->getImm());
  DstSel->setImm(combineSdwaSel(ExistingSel, getDstSel()).value());

  MachineOperand *DstUnused =
      TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because it would conflict with our new
  // instruction by register definition.
  getParentInst()->eraseFromParent();
  return true;
}

bool SDWADstOperand::canCombineSelections(const MachineInstr &MI,
                                          const SIInstrInfo *TII) {
  if (!TII->isSDWA(MI.getOpcode()))
    return true;

  return canCombineOpSel(MI, TII, AMDGPU::OpName::dst_sel, getDstSel());
}

bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands or else
  // we can encounter problems with uses of killed operands.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32.
  MI.getParent()->remove(&MI);
  getParentInst()->getParent()->insert(getParentInst(), &MI);

  // Add an implicit use of the preserved register.
  MachineInstrBuilder MIB(*MI.getMF(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to the implicit use.
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                            AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32.
  return SDWADstOperand::convertToSDWA(MI, TII);
}

bool SDWADstPreserveOperand::canCombineSelections(const MachineInstr &MI,
                                                  const SIInstrInfo *TII) {
  return SDWADstOperand::canCombineSelections(MI, TII);
}

std::optional<int64_t>
SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %1 = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return std::nullopt;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return std::nullopt;

      return Copied.getImm();
    }
  }

  return std::nullopt;
}

std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    }
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
        Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
            Opcode != AMDGPU::V_LSHRREV_B32_e64);
    break;
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_opsel_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_opsel_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_opsel_e64 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64)
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, BYTE_1, false, false,
        Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
            Opcode != AMDGPU::V_LSHRREV_B16_opsel_e64 &&
            Opcode != AMDGPU::V_LSHRREV_B16_e64);
    break;
  }

  case AMDGPU::V_BFE_I32_e64:
  case AMDGPU::V_BFE_U32_e64: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    // 0      | 8     | BYTE_0
    // 0      | 16    | WORD_0
    // 0      | 32    | DWORD ?
    // 8      | 8     | BYTE_1
    // 16     | 8     | BYTE_2
    // 16     | 16    | WORD_1
    // 24     | 8     | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src0->isReg() || Src0->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto *ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src1_sel:WORD_1 src2_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE
    // preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction.
    using CheckRetType =
        std::optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
        [&](const MachineOperand *Op1,
            const MachineOperand *Op2) -> CheckRetType {
      if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
        return CheckRetType(std::nullopt);

      MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
      if (!Op1Def)
        return CheckRetType(std::nullopt);

      MachineInstr *Op1Inst = Op1Def->getParent();
      if (!TII->isSDWA(*Op1Inst))
        return CheckRetType(std::nullopt);

      MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
      if (!Op2Def)
        return CheckRetType(std::nullopt);

      return CheckRetType(std::pair(Op1Def, Op2Def));
    };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInst is actually bitwise compatible with SDWAInst,
    // i.e. their destination bit patterns don't overlap. A compatible
    // instruction can be either a regular instruction with compatible
    // bitness or an SDWA instruction with the correct dst_sel.
    // SDWAInst | OtherInst bitness     / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                    / no
    // WORD_0   | no                    / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit              / BYTE_0/1/3, WORD_0
    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular
    // instructions there is no way to determine if the instruction writes
    // only 8/16/24 bits out of the full register size, and all registers are
    // at least 32 bits wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0:
      DstSelAgree = ((OtherDstSel == BYTE_2) || (OtherDstSel == BYTE_3) ||
                     (OtherDstSel == WORD_1));
      break;
    case WORD_1:
      DstSelAgree = ((OtherDstSel == BYTE_0) || (OtherDstSel == BYTE_1) ||
                     (OtherDstSel == WORD_0));
      break;
    case BYTE_0:
      DstSelAgree = ((OtherDstSel == BYTE_1) || (OtherDstSel == BYTE_2) ||
                     (OtherDstSel == BYTE_3) || (OtherDstSel == WORD_1));
      break;
    case BYTE_1:
      DstSelAgree = ((OtherDstSel == BYTE_0) || (OtherDstSel == BYTE_2) ||
                     (OtherDstSel == BYTE_3) || (OtherDstSel == WORD_1));
      break;
    case BYTE_2:
      DstSelAgree = ((OtherDstSel == BYTE_0) || (OtherDstSel == BYTE_1) ||
                     (OtherDstSel == BYTE_3) || (OtherDstSel == WORD_0));
      break;
    case BYTE_3:
      DstSelAgree = ((OtherDstSel == BYTE_0) || (OtherDstSel == BYTE_1) ||
                     (OtherDstSel == BYTE_2) || (OtherDstSel == WORD_0));
      break;
    default:
      DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD.
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create DstPreserveOperand.
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(OrDst, OrSDWADef,
                                                    OrOtherDef, DstSel);
  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}

#if !defined(NDEBUG)
static raw_ostream &operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}
#endif

void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}

// Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows
// isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into
// V_ADD_CO_U32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
//   %19:vgpr_32 = V_AND_B32_e32 255,
//       killed %16:vgpr_32, implicit $exec
//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
//   %47:vgpr_32 = V_ADD_CO_U32_sdwa
//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
//       implicit-def $vcc, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();

  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) ||
      !MRI->use_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::LivenessQueryResult Liveness =
      MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check if VCC is referenced in the range of (MI, MISucc].
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }

  // Replace MI with V_{SUB|ADD}_I32_e32.
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
      .setMIFlags(MI.getFlags());

  MI.eraseFromParent();

  // Since the carry output of MI is now VCC, update its use in MISucc.

  MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}

/// Try to convert an instruction \p MI in VOP3 form which takes an src2
/// carry-in operand into the corresponding VOP2 form, which expects the
/// argument in VCC. To this end, add a copy from the carry-in to VCC. The
/// conversion is only applied if \p MI can be shrunk to VOP2 and if VCC can
/// be proven to be dead before \p MI.
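///
/// Schematically, with modifier and immediate operands elided:
///   %dst = V_CNDMASK_B32_e64 ..., %a, ..., %b, %cc
/// becomes
///   $vcc = COPY %cc
///   %dst = V_CNDMASK_B32_e32 %a, %b, implicit $vcc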
void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);

  LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI);
  if (!TII->canShrink(MI, *MRI)) {
    LLVM_DEBUG(dbgs() << "Cannot shrink instruction\n");
    return;
  }

  const MachineOperand &CarryIn =
      *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  Register CarryReg = CarryIn.getReg();
  MachineInstr *CarryDef = MRI->getVRegDef(CarryReg);
  if (!CarryDef) {
    LLVM_DEBUG(dbgs() << "Missing carry-in operand definition\n");
    return;
  }

  // Make sure VCC or its subregs are dead before MI.
  MCRegister Vcc = TRI->getVCC();
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::LivenessQueryResult Liveness =
      MBB.computeRegisterLiveness(TRI, Vcc, MI);
  if (Liveness != MachineBasicBlock::LQR_Dead) {
    LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction\n");
    return;
  }

  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc).add(CarryIn);

  auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(),
                           TII->get(AMDGPU::getVOPe32(MI.getOpcode())))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
                       .setMIFlags(MI.getFlags());
  TII->fixImplicitOperands(*Converted);
  LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted);
  (void)Converted;
  MI.eraseFromParent();
}

namespace {
bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
                         const SIInstrInfo *TII) {
  // Check if this is already an SDWA instruction.
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // Can only be handled after earlier conversion to
  // AMDGPU::V_CNDMASK_B32_e32, which is not always possible.
  if (Opc == AMDGPU::V_CNDMASK_B32_e64)
    return false;

  // Check if this instruction has an opcode that supports SDWA.
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst =
          TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if the target supports this SDWA opcode.
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
    if (!Src0->isReg() && !Src0->isImm())
      return false;
  }

  if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
    if (!Src1->isReg() && !Src1->isImm())
      return false;
  }

  return true;
}
} // namespace

MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  assert(!TII->isSDWA(Opcode));

  int SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
  if (SDWAOpcode == -1)
    SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create the SDWA version of instruction MI and initialize its operands.
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
          .setMIFlags(MI.getFlags());

  // Copy dst; if it is present in the original then it should also be present
  // in the SDWA version.
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0 and initialize src0_modifiers. All SDWA instructions have src0
  // and src0_modifiers (except for v_nop_sdwa, but it can't get here).
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) &&
         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers));
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present and initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) &&
           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers));
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has an additional src2 operand tied to vdst.
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise.
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::clamp));
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed.
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::omod)) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Initialize SDWA-specific operands.
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel))
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused))
    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);

  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Check for a preserved register that needs to be copied.
  MachineInstr *Ret = SDWAInst.getInstr();
  TII->fixImplicitOperands(*Ret);
  return Ret;
}

bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {
  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  MachineInstr *SDWAInst;
  if (TII->isSDWA(MI.getOpcode())) {
    // Clone the instruction to allow revoking changes
    // made to MI during the processing of the operands
    // if the conversion fails.
    SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI);
    MI.getParent()->insert(MI.getIterator(), SDWAInst);
  } else {
    SDWAInst = createSDWAVersion(MI);
  }

  // Apply all SDWA operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs,
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold the 2nd instruction
    // into the 3rd (v_add_u32_sdwa) and then try to fold the 1st instruction
    // into the 2nd (which was already destroyed). So if an SDWAOperand is also
    // a potential MI then do not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }

  if (!Converted) {
    SDWAInst->eraseFromParent();
    return false;
  }

  ConvertedInstructions.push_back(SDWAInst);
  for (MachineOperand &MO : SDWAInst->uses()) {
    if (!MO.isReg())
      continue;

    MRI->clearKillFlags(MO.getReg());
  }
  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

// If an instruction was converted to SDWA it should not have immediates or
// SGPR operands (one SGPR operand is allowed on GFX9). Copy its scalar
// operands into VGPRs.
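//
// Schematically, for an offending operand:
//   ..._sdwa %dst, %sgpr_or_imm, ...
// becomes
//   %vgpr:vgpr_32 = V_MOV_B32_e32 %sgpr_or_imm
//   ..._sdwa %dst, %vgpr, ...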
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    unsigned I = Op.getOperandNo();
    if (Desc.operands()[I].RegClass == -1 ||
        !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass)))
      continue;

    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}

bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return SIPeepholeSDWA().run(MF);
}

bool SIPeepholeSDWA::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA())
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they can be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
        if (!PotentialMI)
          continue;

        switch (PotentialMI->getOpcode()) {
        case AMDGPU::V_ADD_CO_U32_e64:
        case AMDGPU::V_SUB_CO_U32_e64:
          pseudoOpConvertToVOP2(*PotentialMI, ST);
          break;
        case AMDGPU::V_CNDMASK_B32_e64:
          convertVcndmaskToVOP2(*PotentialMI, ST);
          break;
        }
      }
      SDWAOperands.clear();

      // Generate the potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI =
            Operand->potentialToConvert(TII, ST, &PotentialMatches);

        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII))
          PotentialMatches[PotentialMI].push_back(Operand.get());
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}

PreservedAnalyses SIPeepholeSDWAPass::run(MachineFunction &MF,
                                          MachineFunctionAnalysisManager &) {
  if (MF.getFunction().hasOptNone() || !SIPeepholeSDWA().run(MF))
    return PreservedAnalyses::all();

  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}