AArch64SIMDInstrOpt.cpp source code [llvm_projects/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp]

1	//
2	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
3	// See https://llvm.org/LICENSE.txt for license information.
4	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5	//
6	//===----------------------------------------------------------------------===//
7	//
8	// This file contains a pass that performs optimization on SIMD instructions
9	// with high latency by splitting them into more efficient series of
10	// instructions.
11	//
12	// 1. Rewrite certain SIMD instructions with vector element due to their
13	// inefficiency on some targets.
14	//
15	// For example:
16	// fmla v0.4s, v1.4s, v2.s[1]
17	//
18	// Is rewritten into:
19	// dup v3.4s, v2.s[1]
20	// fmla v0.4s, v1.4s, v3.4s
21	//
22	// 2. Rewrite interleaved memory access instructions due to their
23	// inefficiency on some targets.
24	//
25	// For example:
26	// st2 {v0.4s, v1.4s}, addr
27	//
28	// Is rewritten into:
29	// zip1 v2.4s, v0.4s, v1.4s
30	// zip2 v3.4s, v0.4s, v1.4s
31	// stp q2, q3, addr
32	//
33	//===----------------------------------------------------------------------===//
34
35	#include "AArch64InstrInfo.h"
36	#include "AArch64Subtarget.h"
37	#include "llvm/ADT/SmallVector.h"
38	#include "llvm/ADT/Statistic.h"
39	#include "llvm/ADT/StringRef.h"
40	#include "llvm/CodeGen/MachineBasicBlock.h"
41	#include "llvm/CodeGen/MachineFunction.h"
42	#include "llvm/CodeGen/MachineFunctionPass.h"
43	#include "llvm/CodeGen/MachineInstr.h"
44	#include "llvm/CodeGen/MachineInstrBuilder.h"
45	#include "llvm/CodeGen/MachineOperand.h"
46	#include "llvm/CodeGen/MachineRegisterInfo.h"
47	#include "llvm/CodeGen/TargetInstrInfo.h"
48	#include "llvm/CodeGen/TargetSchedule.h"
49	#include "llvm/CodeGen/TargetSubtargetInfo.h"
50	#include "llvm/MC/MCInstrDesc.h"
51	#include "llvm/MC/MCSchedule.h"
52	#include "llvm/Pass.h"
53	#include <map>
54	#include <unordered_map>
55
56	using namespace llvm;
57
58	#define DEBUG_TYPE "aarch64-simdinstr-opt"
59
60	STATISTIC(NumModifiedInstr,
61	"Number of SIMD instructions modified");
62
63	#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \
64	"AArch64 SIMD instructions optimization pass"
65
66	namespace {
67
68	struct AArch64SIMDInstrOpt : public MachineFunctionPass {
69	static char ID;
70
71	const AArch64InstrInfo *TII;
72	MachineRegisterInfo *MRI;
73	TargetSchedModel SchedModel;
74
75	// The two maps below are used to cache decisions instead of recomputing:
76	// This is used to cache instruction replacement decisions within function
77	// units and across function units.
78	std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;
79	// This is used to cache the decision of whether to leave the interleaved
80	// store instructions replacement pass early or not for a particular target.
81	std::unordered_map<std::string, bool> InterlEarlyExit;
82
83	typedef enum {
84	VectorElem,
85	Interleave
86	} Subpass;
87
88	// Instruction represented by OrigOpc is replaced by instructions in ReplOpc.
89	struct InstReplInfo {
90	unsigned OrigOpc;
91	std::vector<unsigned> ReplOpc;
92	const TargetRegisterClass RC;
93	};
94
95	#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
96	{OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
97	#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
98	OpcR7, OpcR8, OpcR9, RC) \
99	{OpcOrg, \
100	{OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}
101
102	// The Instruction Replacement Table:
103	std::vector<InstReplInfo> IRT = {
104	// ST2 instructions
105	RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
106	AArch64::STPQi, AArch64::FPR128RegClass),
107	RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
108	AArch64::STPQi, AArch64::FPR128RegClass),
109	RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
110	AArch64::STPDi, AArch64::FPR64RegClass),
111	RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
112	AArch64::STPQi, AArch64::FPR128RegClass),
113	RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
114	AArch64::STPDi, AArch64::FPR64RegClass),
115	RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
116	AArch64::STPQi, AArch64::FPR128RegClass),
117	RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
118	AArch64::STPDi, AArch64::FPR64RegClass),
119	// ST4 instructions
120	RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
121	AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
122	AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
123	AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
124	RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
125	AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
126	AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
127	AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
128	RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
129	AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
130	AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
131	AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
132	RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
133	AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
134	AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
135	AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
136	RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
137	AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
138	AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
139	AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
140	RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
141	AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
142	AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
143	AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
144	RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
145	AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
146	AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
147	AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
148	};
149
150	// A costly instruction is replaced in this work by N efficient instructions
151	// The maximum of N is currently 10 and it is for ST4 case.
152	static const unsigned MaxNumRepl = `10`;
153
154	AArch64SIMDInstrOpt() : MachineFunctionPass (ID) {}
155
156	/// Based only on latency of instructions, determine if it is cost efficient
157	/// to replace the instruction InstDesc by the instructions stored in the
158	/// array InstDescRepl.
159	/// Return true if replacement is expected to be faster.
160	bool shouldReplaceInst(MachineFunction MF, const* MCInstrDesc *InstDesc,
161	SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID);
162
163	/// Determine if we need to exit the instruction replacement optimization
164	/// passes early. This makes sure that no compile time is spent in this pass
165	/// for targets with no need for any of these optimizations.
166	/// Return true if early exit of the pass is recommended.
167	bool shouldExitEarly(MachineFunction *MF, Subpass SP);
168
169	/// Check whether an equivalent DUP instruction has already been
170	/// created or not.
171	/// Return true when the DUP instruction already exists. In this case,
172	/// DestReg will point to the destination of the already created DUP.
173	bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
174	unsigned LaneNumber, unsigned DestReg) const*;
175
176	/// Certain SIMD instructions with vector element operand are not efficient.
177	/// Rewrite them into SIMD instructions with vector operands. This rewrite
178	/// is driven by the latency of the instructions.
179	/// Return true if the SIMD instruction is modified.
180	bool optimizeVectElement(MachineInstr &MI);
181
182	/// Process The REG_SEQUENCE instruction, and extract the source
183	/// operands of the ST2/4 instruction from it.
184	/// Example of such instructions.
185	/// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
186	/// Return true when the instruction is processed successfully.
187	bool processSeqRegInst(MachineInstr DefiningMI, unsigned* *StReg,
188	RegState StRegKill, unsigned* NumArg) const;
189
190	/// Load/Store Interleaving instructions are not always beneficial.
191	/// Replace them by ZIP instructionand classical load/store.
192	/// Return true if the SIMD instruction is modified.
193	bool optimizeLdStInterleave(MachineInstr &MI);
194
195	/// Return the number of useful source registers for this
196	/// instruction (2 for ST2 and 4 for ST4).
197	unsigned determineSrcReg(MachineInstr &MI) const;
198
199	bool runOnMachineFunction(MachineFunction &Fn) override;
200
201	StringRef getPassName() const override {
202	return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
203	}
204	};
205
206	char AArch64SIMDInstrOpt::ID = `0`;
207
208	} // end anonymous namespace
209
210	INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt",
211	AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
212
213	/// Based only on latency of instructions, determine if it is cost efficient
214	/// to replace the instruction InstDesc by the instructions stored in the
215	/// array InstDescRepl.
216	/// Return true if replacement is expected to be faster.
217	bool AArch64SIMDInstrOpt::
218	shouldReplaceInst(MachineFunction MF, const* MCInstrDesc *InstDesc,
219	SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) {
220	// Check if replacement decision is already available in the cached table.
221	// if so, return it.
222	std::string Subtarget = std::string (SchedModel.getSubtargetInfo()->getCPU());
223	auto InstID = std::make_pair(x: InstDesc->getOpcode(), y&: Subtarget);
224	auto It = SIMDInstrTable.find(x: InstID);
225	if (It != SIMDInstrTable.end())
226	return It ->second;
227
228	unsigned SCIdx = InstDesc->getSchedClass();
229	const MCSchedClassDesc *SCDesc =
230	SchedModel.getMCSchedModel()->getSchedClassDesc(SchedClassIdx: SCIdx);
231
232	// If a target does not define resources for the instructions
233	// of interest, then return false for no replacement.
234	const MCSchedClassDesc *SCDescRepl;
235	if (!SCDesc->isValid() \|\| SCDesc->isVariant())
236	{
237	SIMDInstrTable [InstID] = false;
238	return false;
239	}
240	for (const auto *IDesc : InstDescRepl)
241	{
242	SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc(
243	SchedClassIdx: IDesc->getSchedClass());
244	if (!SCDescRepl->isValid() \|\| SCDescRepl->isVariant())
245	{
246	SIMDInstrTable [InstID] = false;
247	return false;
248	}
249	}
250
251	// Replacement cost.
252	unsigned ReplCost = `0`;
253	for (const auto *IDesc :InstDescRepl)
254	ReplCost += SchedModel.computeInstrLatency(Opcode: IDesc->getOpcode());
255
256	if (SchedModel.computeInstrLatency(Opcode: InstDesc->getOpcode()) > ReplCost)
257	{
258	SIMDInstrTable [InstID] = true;
259	return true;
260	}
261	else
262	{
263	SIMDInstrTable [InstID] = false;
264	return false;
265	}
266	}
267
268	/// Determine if we need to exit this pass for a kind of instruction replacement
269	/// early. This makes sure that no compile time is spent in this pass for
270	/// targets with no need for any of these optimizations beyond performing this
271	/// check.
272	/// Return true if early exit of this pass for a kind of instruction
273	/// replacement is recommended for a target.
274	bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
275	const MCInstrDesc* OriginalMCID;
276	SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;
277
278	switch (SP) {
279	// For this optimization, check by comparing the latency of a representative
280	// instruction to that of the replacement instructions.
281	// TODO: check for all concerned instructions.
282	case VectorElem:
283	OriginalMCID = &TII->get(Opcode: AArch64::FMLAv4i32_indexed);
284	ReplInstrMCID.push_back(Elt: &TII->get(Opcode: AArch64::DUPv4i32lane));
285	ReplInstrMCID.push_back(Elt: &TII->get(Opcode: AArch64::FMLAv4f32));
286	if (shouldReplaceInst(MF, InstDesc: OriginalMCID, InstDescRepl&: ReplInstrMCID))
287	return false;
288	break;
289
290	// For this optimization, check for all concerned instructions.
291	case Interleave:
292	std::string Subtarget =
293	std::string (SchedModel.getSubtargetInfo()->getCPU());
294	auto It = InterlEarlyExit.find(x: Subtarget);
295	if (It != InterlEarlyExit.end())
296	return It ->second;
297
298	for (auto &I : IRT) {
299	OriginalMCID = &TII->get(Opcode: I.OrigOpc);
300	for (auto &Repl : I.ReplOpc)
301	ReplInstrMCID.push_back(Elt: &TII->get(Opcode: Repl));
302	if (shouldReplaceInst(MF, InstDesc: OriginalMCID, InstDescRepl&: ReplInstrMCID)) {
303	InterlEarlyExit [Subtarget] = false;
304	return false;
305	}
306	ReplInstrMCID.clear();
307	}
308	InterlEarlyExit [Subtarget] = true;
309	break;
310	}
311
312	return true;
313	}
314
315	/// Check whether an equivalent DUP instruction has already been
316	/// created or not.
317	/// Return true when the DUP instruction already exists. In this case,
318	/// DestReg will point to the destination of the already created DUP.
319	bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
320	unsigned SrcReg, unsigned LaneNumber,
321	unsigned DestReg) const* {
322	for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
323	MII != MIE;) {
324	MII --;
325	MachineInstr CurrentMI = &MII;
326
327	if (CurrentMI->getOpcode() == DupOpcode &&
328	CurrentMI->getNumOperands() == `3` &&
329	CurrentMI->getOperand(i: `1`).getReg() == SrcReg &&
330	CurrentMI->getOperand(i: `2`).getImm() == LaneNumber) {
331	*DestReg = CurrentMI->getOperand(i: `0`).getReg();
332	return true;
333	}
334	}
335
336	return false;
337	}
338
339	/// Certain SIMD instructions with vector element operand are not efficient.
340	/// Rewrite them into SIMD instructions with vector operands. This rewrite
341	/// is driven by the latency of the instructions.
342	/// The instruction of concerns are for the time being FMLA, FMLS, FMUL,
343	/// and FMULX and hence they are hardcoded.
344	///
345	/// For example:
346	/// fmla v0.4s, v1.4s, v2.s[1]
347	///
348	/// Is rewritten into
349	/// dup v3.4s, v2.s[1] // DUP not necessary if redundant
350	/// fmla v0.4s, v1.4s, v3.4s
351	///
352	/// Return true if the SIMD instruction is modified.
353	bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
354	const MCInstrDesc MulMCID, DupMCID;
355	const TargetRegisterClass *RC = &AArch64::FPR128RegClass;
356
357	switch (MI.getOpcode()) {
358	default:
359	return false;
360
361	// 4X32 instructions
362	case AArch64::FMLAv4i32_indexed:
363	DupMCID = &TII->get(Opcode: AArch64::DUPv4i32lane);
364	MulMCID = &TII->get(Opcode: AArch64::FMLAv4f32);
365	break;
366	case AArch64::FMLSv4i32_indexed:
367	DupMCID = &TII->get(Opcode: AArch64::DUPv4i32lane);
368	MulMCID = &TII->get(Opcode: AArch64::FMLSv4f32);
369	break;
370	case AArch64::FMULXv4i32_indexed:
371	DupMCID = &TII->get(Opcode: AArch64::DUPv4i32lane);
372	MulMCID = &TII->get(Opcode: AArch64::FMULXv4f32);
373	break;
374	case AArch64::FMULv4i32_indexed:
375	DupMCID = &TII->get(Opcode: AArch64::DUPv4i32lane);
376	MulMCID = &TII->get(Opcode: AArch64::FMULv4f32);
377	break;
378
379	// 2X64 instructions
380	case AArch64::FMLAv2i64_indexed:
381	DupMCID = &TII->get(Opcode: AArch64::DUPv2i64lane);
382	MulMCID = &TII->get(Opcode: AArch64::FMLAv2f64);
383	break;
384	case AArch64::FMLSv2i64_indexed:
385	DupMCID = &TII->get(Opcode: AArch64::DUPv2i64lane);
386	MulMCID = &TII->get(Opcode: AArch64::FMLSv2f64);
387	break;
388	case AArch64::FMULXv2i64_indexed:
389	DupMCID = &TII->get(Opcode: AArch64::DUPv2i64lane);
390	MulMCID = &TII->get(Opcode: AArch64::FMULXv2f64);
391	break;
392	case AArch64::FMULv2i64_indexed:
393	DupMCID = &TII->get(Opcode: AArch64::DUPv2i64lane);
394	MulMCID = &TII->get(Opcode: AArch64::FMULv2f64);
395	break;
396
397	// 2X32 instructions
398	case AArch64::FMLAv2i32_indexed:
399	RC = &AArch64::FPR64RegClass;
400	DupMCID = &TII->get(Opcode: AArch64::DUPv2i32lane);
401	MulMCID = &TII->get(Opcode: AArch64::FMLAv2f32);
402	break;
403	case AArch64::FMLSv2i32_indexed:
404	RC = &AArch64::FPR64RegClass;
405	DupMCID = &TII->get(Opcode: AArch64::DUPv2i32lane);
406	MulMCID = &TII->get(Opcode: AArch64::FMLSv2f32);
407	break;
408	case AArch64::FMULXv2i32_indexed:
409	RC = &AArch64::FPR64RegClass;
410	DupMCID = &TII->get(Opcode: AArch64::DUPv2i32lane);
411	MulMCID = &TII->get(Opcode: AArch64::FMULXv2f32);
412	break;
413	case AArch64::FMULv2i32_indexed:
414	RC = &AArch64::FPR64RegClass;
415	DupMCID = &TII->get(Opcode: AArch64::DUPv2i32lane);
416	MulMCID = &TII->get(Opcode: AArch64::FMULv2f32);
417	break;
418	}
419
420	SmallVector<const MCInstrDesc*, `2`> ReplInstrMCID;
421	ReplInstrMCID.push_back(Elt: DupMCID);
422	ReplInstrMCID.push_back(Elt: MulMCID);
423	if (!shouldReplaceInst(MF: MI.getParent()->getParent(), InstDesc: &TII->get(Opcode: MI.getOpcode()),
424	InstDescRepl&: ReplInstrMCID))
425	return false;
426
427	const DebugLoc &DL = MI.getDebugLoc();
428	MachineBasicBlock &MBB = *MI.getParent();
429	MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
430
431	// Get the operands of the current SIMD arithmetic instruction.
432	Register MulDest = MI.getOperand(i: `0`).getReg();
433	Register SrcReg0 = MI.getOperand(i: `1`).getReg();
434	RegState Src0IsKill = getKillRegState(B: MI.getOperand(i: `1`).isKill());
435	Register SrcReg1 = MI.getOperand(i: `2`).getReg();
436	RegState Src1IsKill = getKillRegState(B: MI.getOperand(i: `2`).isKill());
437	unsigned DupDest;
438
439	// Instructions of interest have either 4 or 5 operands.
440	if (MI.getNumOperands() == `5`) {
441	Register SrcReg2 = MI.getOperand(i: `3`).getReg();
442	RegState Src2IsKill = getKillRegState(B: MI.getOperand(i: `3`).isKill());
443	unsigned LaneNumber = MI.getOperand(i: `4`).getImm();
444	// Create a new DUP instruction. Note that if an equivalent DUP instruction
445	// has already been created before, then use that one instead of creating
446	// a new one.
447	if (!reuseDUP(MI, DupOpcode: DupMCID->getOpcode(), SrcReg: SrcReg2, LaneNumber, DestReg: &DupDest)) {
448	DupDest = MRI.createVirtualRegister(RegClass: RC);
449	BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: *DupMCID, DestReg: DupDest)
450	.addReg(RegNo: SrcReg2, Flags: Src2IsKill)
451	.addImm(Val: LaneNumber);
452	}
453	BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: *MulMCID, DestReg: MulDest)
454	.addReg(RegNo: SrcReg0, Flags: Src0IsKill)
455	.addReg(RegNo: SrcReg1, Flags: Src1IsKill)
456	.addReg(RegNo: DupDest, Flags: Src2IsKill);
457	} else if (MI.getNumOperands() == `4`) {
458	unsigned LaneNumber = MI.getOperand(i: `3`).getImm();
459	if (!reuseDUP(MI, DupOpcode: DupMCID->getOpcode(), SrcReg: SrcReg1, LaneNumber, DestReg: &DupDest)) {
460	DupDest = MRI.createVirtualRegister(RegClass: RC);
461	BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: *DupMCID, DestReg: DupDest)
462	.addReg(RegNo: SrcReg1, Flags: Src1IsKill)
463	.addImm(Val: LaneNumber);
464	}
465	BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: *MulMCID, DestReg: MulDest)
466	.addReg(RegNo: SrcReg0, Flags: Src0IsKill)
467	.addReg(RegNo: DupDest, Flags: Src1IsKill);
468	} else {
469	return false;
470	}
471
472	++NumModifiedInstr;
473	return true;
474	}
475
476	/// Load/Store Interleaving instructions are not always beneficial.
477	/// Replace them by ZIP instructions and classical load/store.
478	///
479	/// For example:
480	/// st2 {v0.4s, v1.4s}, addr
481	///
482	/// Is rewritten into:
483	/// zip1 v2.4s, v0.4s, v1.4s
484	/// zip2 v3.4s, v0.4s, v1.4s
485	/// stp q2, q3, addr
486	//
487	/// For example:
488	/// st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
489	///
490	/// Is rewritten into:
491	/// zip1 v4.4s, v0.4s, v2.4s
492	/// zip2 v5.4s, v0.4s, v2.4s
493	/// zip1 v6.4s, v1.4s, v3.4s
494	/// zip2 v7.4s, v1.4s, v3.4s
495	/// zip1 v8.4s, v4.4s, v6.4s
496	/// zip2 v9.4s, v4.4s, v6.4s
497	/// zip1 v10.4s, v5.4s, v7.4s
498	/// zip2 v11.4s, v5.4s, v7.4s
499	/// stp q8, q9, addr
500	/// stp q10, q11, addr+32
501	///
502	/// Currently only instructions related to ST2 and ST4 are considered.
503	/// Other may be added later.
504	/// Return true if the SIMD instruction is modified.
505	bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {
506
507	unsigned SeqReg, AddrReg;
508	unsigned StReg[`4`];
509	RegState StRegKill[`4`];
510	MachineInstr *DefiningMI;
511	const DebugLoc &DL = MI.getDebugLoc();
512	MachineBasicBlock &MBB = *MI.getParent();
513	SmallVector<unsigned, MaxNumRepl> ZipDest;
514	SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;
515
516	// If current instruction matches any of the rewriting rules, then
517	// gather information about parameters of the new instructions.
518	bool Match = false;
519	for (auto &I : IRT) {
520	if (MI.getOpcode() == I.OrigOpc) {
521	SeqReg = MI.getOperand(i: `0`).getReg();
522	AddrReg = MI.getOperand(i: `1`).getReg();
523	DefiningMI = MRI->getUniqueVRegDef(Reg: SeqReg);
524	unsigned NumReg = determineSrcReg(MI);
525	if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumArg: NumReg))
526	return false;
527
528	for (auto &Repl : I.ReplOpc) {
529	ReplInstrMCID.push_back(Elt: &TII->get(Opcode: Repl));
530	// Generate destination registers but only for non-store instruction.
531	if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
532	ZipDest.push_back(Elt: MRI->createVirtualRegister(RegClass: &I.RC));
533	}
534	Match = true;
535	break;
536	}
537	}
538
539	if (!Match)
540	return false;
541
542	// Determine if it is profitable to replace MI by the series of instructions
543	// represented in ReplInstrMCID.
544	if (!shouldReplaceInst(MF: MI.getParent()->getParent(), InstDesc: &TII->get(Opcode: MI.getOpcode()),
545	InstDescRepl&: ReplInstrMCID))
546	return false;
547
548	// Generate the replacement instructions composed of ZIP1, ZIP2, and STP (at
549	// this point, the code generation is hardcoded and does not rely on the IRT
550	// table used above given that code generation for ST2 replacement is somewhat
551	// different than for ST4 replacement. We could have added more info into the
552	// table related to how we build new instructions but we may be adding more
553	// complexity with that).
554	switch (MI.getOpcode()) {
555	default:
556	return false;
557
558	case AArch64::ST2Twov16b:
559	case AArch64::ST2Twov8b:
560	case AArch64::ST2Twov8h:
561	case AArch64::ST2Twov4h:
562	case AArch64::ST2Twov4s:
563	case AArch64::ST2Twov2s:
564	case AArch64::ST2Twov2d:
565	// ZIP instructions
566	BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: *ReplInstrMCID [`0`], DestReg: ZipDest [`0`])
567	.addReg(RegNo: StReg[`0`])
568	.addReg(RegNo: StReg[`1`]);
569	BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: *ReplInstrMCID [`1`], DestReg: ZipDest [`1`])
570	.addReg(RegNo: StReg[`0`], Flags: StRegKill[`0`])
571	.addReg(RegNo: StReg[`1`], Flags: StRegKill[`1`]);
572	// STP instructions
573	BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: *ReplInstrMCID [`2`])
574	.addReg(RegNo: ZipDest [`0`])
575	.addReg(RegNo: ZipDest [`1`])
576	.addReg(RegNo: AddrReg)
577	.addImm(Val: `0`);
578	break;
579
580	case AArch64::ST4Fourv16b:
581	case AArch64::ST4Fourv8b:
582	case AArch64::ST4Fourv8h:
583	case AArch64::ST4Fourv4h:
584	case AArch64::ST4Fourv4s:
585	case AArch64::ST4Fourv2s:
586	case AArch64::ST4Fourv2d:
587	// ZIP instructions
588	BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: *ReplInstrMCID [`0`], DestReg: ZipDest [`0`])
589	.addReg(RegNo: StReg[`0`])
590	.addReg(RegNo: StReg[`2`]);
591	BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: *ReplInstrMCID [`1`], DestReg: ZipDest [`1`])
592	.addReg(RegNo: StReg[`0`], Flags: StRegKill[`0`])
593	.addReg(RegNo: StReg[`2`], Flags: StRegKill[`2`]);
594	BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: *ReplInstrMCID [`2`], DestReg: ZipDest [`2`])
595	.addReg(RegNo: StReg[`1`])
596	.addReg(RegNo: StReg[`3`]);
597	BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: *ReplInstrMCID [`3`], DestReg: ZipDest [`3`])
598	.addReg(RegNo: StReg[`1`], Flags: StRegKill[`1`])
599	.addReg(RegNo: StReg[`3`], Flags: StRegKill[`3`]);
600	BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: *ReplInstrMCID [`4`], DestReg: ZipDest [`4`])
601	.addReg(RegNo: ZipDest [`0`])
602	.addReg(RegNo: ZipDest [`2`]);
603	BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: *ReplInstrMCID [`5`], DestReg: ZipDest [`5`])
604	.addReg(RegNo: ZipDest [`0`])
605	.addReg(RegNo: ZipDest [`2`]);
606	BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: *ReplInstrMCID [`6`], DestReg: ZipDest [`6`])
607	.addReg(RegNo: ZipDest [`1`])
608	.addReg(RegNo: ZipDest [`3`]);
609	BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: *ReplInstrMCID [`7`], DestReg: ZipDest [`7`])
610	.addReg(RegNo: ZipDest [`1`])
611	.addReg(RegNo: ZipDest [`3`]);
612	// stp instructions
613	BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: *ReplInstrMCID [`8`])
614	.addReg(RegNo: ZipDest [`4`])
615	.addReg(RegNo: ZipDest [`5`])
616	.addReg(RegNo: AddrReg)
617	.addImm(Val: `0`);
618	BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: *ReplInstrMCID [`9`])
619	.addReg(RegNo: ZipDest [`6`])
620	.addReg(RegNo: ZipDest [`7`])
621	.addReg(RegNo: AddrReg)
622	.addImm(Val: `2`);
623	break;
624	}
625
626	++NumModifiedInstr;
627	return true;
628	}
629
630	/// Process The REG_SEQUENCE instruction, and extract the source
631	/// operands of the ST2/4 instruction from it.
632	/// Example of such instruction.
633	/// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
634	/// Return true when the instruction is processed successfully.
635	bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
636	unsigned *StReg,
637	RegState *StRegKill,
638	unsigned NumArg) const {
639	assert(DefiningMI != nullptr);
640	if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
641	return false;
642
643	for (unsigned i=`0`; i<NumArg; i++) {
644	StReg[i] = DefiningMI->getOperand(i: `2`*i+`1`).getReg();
645	StRegKill[i] = getKillRegState(B: DefiningMI->getOperand(i: `2`*i+`1`).isKill());
646
647	// Validation check for the other arguments.
648	if (DefiningMI->getOperand(i: `2`*i+`2`).isImm()) {
649	switch (DefiningMI->getOperand(i: `2`*i+`2`).getImm()) {
650	default:
651	return false;
652
653	case AArch64::dsub0:
654	case AArch64::dsub1:
655	case AArch64::dsub2:
656	case AArch64::dsub3:
657	case AArch64::qsub0:
658	case AArch64::qsub1:
659	case AArch64::qsub2:
660	case AArch64::qsub3:
661	break;
662	}
663	}
664	else
665	return false;
666	}
667	return true;
668	}
669
670	/// Return the number of useful source registers for this instruction
671	/// (2 for ST2 and 4 for ST4).
672	unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
673	switch (MI.getOpcode()) {
674	default:
675	llvm_unreachable("Unsupported instruction for this pass");
676
677	case AArch64::ST2Twov16b:
678	case AArch64::ST2Twov8b:
679	case AArch64::ST2Twov8h:
680	case AArch64::ST2Twov4h:
681	case AArch64::ST2Twov4s:
682	case AArch64::ST2Twov2s:
683	case AArch64::ST2Twov2d:
684	return `2`;
685
686	case AArch64::ST4Fourv16b:
687	case AArch64::ST4Fourv8b:
688	case AArch64::ST4Fourv8h:
689	case AArch64::ST4Fourv4h:
690	case AArch64::ST4Fourv4s:
691	case AArch64::ST4Fourv2s:
692	case AArch64::ST4Fourv2d:
693	return `4`;
694	}
695	}
696
697	bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
698	if (skipFunction(F: MF.getFunction()))
699	return false;
700
701	MRI = &MF.getRegInfo();
702	const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
703	TII = ST.getInstrInfo();
704	SchedModel.init(TSInfo: &ST);
705	if (!SchedModel.hasInstrSchedModel())
706	return false;
707
708	bool Changed = false;
709	for (auto OptimizationKind : {VectorElem, Interleave}) {
710	if (!shouldExitEarly(MF: &MF, SP: OptimizationKind)) {
711	SmallVector<MachineInstr *, `8`> RemoveMIs;
712	for (MachineBasicBlock &MBB : MF) {
713	for (MachineInstr &MI : MBB) {
714	bool InstRewrite;
715	if (OptimizationKind == VectorElem)
716	InstRewrite = optimizeVectElement(MI) ;
717	else
718	InstRewrite = optimizeLdStInterleave(MI);
719	if (InstRewrite) {
720	// Add MI to the list of instructions to be removed given that it
721	// has been replaced.
722	RemoveMIs.push_back(Elt: &MI);
723	Changed = true;
724	}
725	}
726	}
727	for (MachineInstr *MI : RemoveMIs)
728	MI->eraseFromParent();
729	}
730	}
731
732	return Changed;
733	}
734
735	/// Returns an instance of the high cost ASIMD instruction replacement
736	/// optimization pass.
737	FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
738	return new AArch64SIMDInstrOpt ();
739	}
740

Browse the source code of llvm_projects/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp