//===- ARMLatencyMutations.cpp - ARM Latency Mutations --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file contains the ARM definition DAG scheduling mutations which
/// change inter-instruction latencies.
//
//===----------------------------------------------------------------------===//

#include "ARMLatencyMutations.h"
#include "ARMSubtarget.h"
#include "Thumb2InstrInfo.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include <algorithm>
#include <array>
#include <initializer_list>
#include <memory>

namespace llvm {

namespace {

// Precompute information about opcodes to speed up pass

class InstructionInformation {
protected:
  struct IInfo {
    bool HasBRegAddr : 1;      // B-side of addr gen is a register
    bool HasBRegAddrShift : 1; // B-side of addr gen has a shift
    bool IsDivide : 1;         // Some form of integer divide
    bool IsInlineShiftALU : 1; // Inline shift+ALU
    bool IsMultiply : 1;       // Some form of integer multiply
    bool IsMVEIntMAC : 1;      // MVE 8/16/32-bit integer MAC operation
    bool IsNonSubwordLoad : 1; // Load which is a word or larger
    bool IsShift : 1;          // Shift operation
    bool IsRev : 1;            // REV operation
    bool ProducesQP : 1;       // Produces a vector register result
    bool ProducesDP : 1;       // Produces a double-precision register result
    bool ProducesSP : 1;       // Produces a single-precision register result
    bool ConsumesQP : 1;       // Consumes a vector register result
    bool ConsumesDP : 1;       // Consumes a double-precision register result
    bool ConsumesSP : 1;       // Consumes a single-precision register result
    unsigned MVEIntMACMatched; // Matched operand type (for MVE)
    unsigned AddressOpMask;    // Mask indicating which operands go into AGU
    IInfo()
        : HasBRegAddr(false), HasBRegAddrShift(false), IsDivide(false),
          IsInlineShiftALU(false), IsMultiply(false), IsMVEIntMAC(false),
          IsNonSubwordLoad(false), IsShift(false), IsRev(false),
          ProducesQP(false), ProducesDP(false), ProducesSP(false),
          ConsumesQP(false), ConsumesDP(false), ConsumesSP(false),
          MVEIntMACMatched(0), AddressOpMask(0) {}
  };
  typedef std::array<IInfo, ARM::INSTRUCTION_LIST_END> IInfoArray;
  IInfoArray Info;

public:
  // Always available information
  unsigned getAddressOpMask(unsigned Op) { return Info[Op].AddressOpMask; }
  bool hasBRegAddr(unsigned Op) { return Info[Op].HasBRegAddr; }
  bool hasBRegAddrShift(unsigned Op) { return Info[Op].HasBRegAddrShift; }
  bool isDivide(unsigned Op) { return Info[Op].IsDivide; }
  bool isInlineShiftALU(unsigned Op) { return Info[Op].IsInlineShiftALU; }
  bool isMultiply(unsigned Op) { return Info[Op].IsMultiply; }
  bool isMVEIntMAC(unsigned Op) { return Info[Op].IsMVEIntMAC; }
  bool isNonSubwordLoad(unsigned Op) { return Info[Op].IsNonSubwordLoad; }
  bool isRev(unsigned Op) { return Info[Op].IsRev; }
  bool isShift(unsigned Op) { return Info[Op].IsShift; }

  // Information available only if markDPProducersConsumers has been called.
  bool producesQP(unsigned Op) { return Info[Op].ProducesQP; }
  bool producesDP(unsigned Op) { return Info[Op].ProducesDP; }
  bool producesSP(unsigned Op) { return Info[Op].ProducesSP; }
  bool consumesQP(unsigned Op) { return Info[Op].ConsumesQP; }
  bool consumesDP(unsigned Op) { return Info[Op].ConsumesDP; }
  bool consumesSP(unsigned Op) { return Info[Op].ConsumesSP; }

  bool isMVEIntMACMatched(unsigned SrcOp, unsigned DstOp) {
    return SrcOp == DstOp || Info[DstOp].MVEIntMACMatched == SrcOp;
  }

  InstructionInformation(const ARMBaseInstrInfo *TII);

protected:
  void markDPProducersConsumers(const ARMBaseInstrInfo *TII);
};

InstructionInformation::InstructionInformation(const ARMBaseInstrInfo *TII) {
  using namespace ARM;

  std::initializer_list<unsigned> hasBRegAddrList = {
      t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
      tLDRr, tLDRBr, tLDRHr, tSTRr, tSTRBr, tSTRHr,
  };
  for (auto op : hasBRegAddrList) {
    Info[op].HasBRegAddr = true;
  }

  std::initializer_list<unsigned> hasBRegAddrShiftList = {
      t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
  };
  for (auto op : hasBRegAddrShiftList) {
    Info[op].HasBRegAddrShift = true;
  }

  Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;

  std::initializer_list<unsigned> isInlineShiftALUList = {
      t2ADCrs, t2ADDSrs, t2ADDrs, t2BICrs, t2EORrs,
      t2ORNrs, t2RSBSrs, t2RSBrs, t2SBCrs, t2SUBrs,
      t2SUBSrs, t2CMPrs, t2CMNzrs, t2TEQrs, t2TSTrs,
  };
  for (auto op : isInlineShiftALUList) {
    Info[op].IsInlineShiftALU = true;
  }

  std::initializer_list<unsigned> isMultiplyList = {
      t2MUL, t2MLA, t2MLS, t2SMLABB, t2SMLABT, t2SMLAD, t2SMLADX,
      t2SMLAL, t2SMLALBB, t2SMLALBT, t2SMLALD, t2SMLALDX, t2SMLALTB, t2SMLALTT,
      t2SMLATB, t2SMLATT, t2SMLAWT, t2SMLSD, t2SMLSDX, t2SMLSLD, t2SMLSLDX,
      t2SMMLA, t2SMMLAR, t2SMMLS, t2SMMLSR, t2SMMUL, t2SMMULR, t2SMUAD,
      t2SMUADX, t2SMULBB, t2SMULBT, t2SMULL, t2SMULTB, t2SMULTT, t2SMULWT,
      t2SMUSD, t2SMUSDX, t2UMAAL, t2UMLAL, t2UMULL, tMUL,
  };
  for (auto op : isMultiplyList) {
    Info[op].IsMultiply = true;
  }

  std::initializer_list<unsigned> isMVEIntMACList = {
      MVE_VMLAS_qr_i16, MVE_VMLAS_qr_i32, MVE_VMLAS_qr_i8,
      MVE_VMLA_qr_i16, MVE_VMLA_qr_i32, MVE_VMLA_qr_i8,
      MVE_VQDMLAH_qrs16, MVE_VQDMLAH_qrs32, MVE_VQDMLAH_qrs8,
      MVE_VQDMLASH_qrs16, MVE_VQDMLASH_qrs32, MVE_VQDMLASH_qrs8,
      MVE_VQRDMLAH_qrs16, MVE_VQRDMLAH_qrs32, MVE_VQRDMLAH_qrs8,
      MVE_VQRDMLASH_qrs16, MVE_VQRDMLASH_qrs32, MVE_VQRDMLASH_qrs8,
      MVE_VQDMLADHXs16, MVE_VQDMLADHXs32, MVE_VQDMLADHXs8,
      MVE_VQDMLADHs16, MVE_VQDMLADHs32, MVE_VQDMLADHs8,
      MVE_VQDMLSDHXs16, MVE_VQDMLSDHXs32, MVE_VQDMLSDHXs8,
      MVE_VQDMLSDHs16, MVE_VQDMLSDHs32, MVE_VQDMLSDHs8,
      MVE_VQRDMLADHXs16, MVE_VQRDMLADHXs32, MVE_VQRDMLADHXs8,
      MVE_VQRDMLADHs16, MVE_VQRDMLADHs32, MVE_VQRDMLADHs8,
      MVE_VQRDMLSDHXs16, MVE_VQRDMLSDHXs32, MVE_VQRDMLSDHXs8,
      MVE_VQRDMLSDHs16, MVE_VQRDMLSDHs32, MVE_VQRDMLSDHs8,
  };
  for (auto op : isMVEIntMACList) {
    Info[op].IsMVEIntMAC = true;
  }

  std::initializer_list<unsigned> isNonSubwordLoadList = {
      t2LDRi12, t2LDRi8, t2LDR_POST, t2LDR_PRE, t2LDRpci,
      t2LDRs, t2LDRDi8, t2LDRD_POST, t2LDRD_PRE, tLDRi,
      tLDRpci, tLDRr, tLDRspi,
  };
  for (auto op : isNonSubwordLoadList) {
    Info[op].IsNonSubwordLoad = true;
  }

  std::initializer_list<unsigned> isRevList = {
      t2REV, t2REV16, t2REVSH, t2RBIT, tREV, tREV16, tREVSH,
  };
  for (auto op : isRevList) {
    Info[op].IsRev = true;
  }

  std::initializer_list<unsigned> isShiftList = {
      t2ASRri, t2ASRrr, t2LSLri, t2LSLrr, t2LSRri, t2LSRrr, t2RORri, t2RORrr,
      tASRri, tASRrr, tLSLSri, tLSLri, tLSLrr, tLSRri, tLSRrr, tROR,
  };
  for (auto op : isShiftList) {
    Info[op].IsShift = true;
  }

  std::initializer_list<unsigned> Address1List = {
      t2LDRBi12,
      t2LDRBi8,
      t2LDRBpci,
      t2LDRBs,
      t2LDRHi12,
      t2LDRHi8,
      t2LDRHpci,
      t2LDRHs,
      t2LDRSBi12,
      t2LDRSBi8,
      t2LDRSBpci,
      t2LDRSBs,
      t2LDRSHi12,
      t2LDRSHi8,
      t2LDRSHpci,
      t2LDRSHs,
      t2LDRi12,
      t2LDRi8,
      t2LDRpci,
      t2LDRs,
      tLDRBi,
      tLDRBr,
      tLDRHi,
      tLDRHr,
      tLDRSB,
      tLDRSH,
      tLDRi,
      tLDRpci,
      tLDRr,
      tLDRspi,
      t2STRBi12,
      t2STRBi8,
      t2STRBs,
      t2STRHi12,
      t2STRHi8,
      t2STRHs,
      t2STRi12,
      t2STRi8,
      t2STRs,
      tSTRBi,
      tSTRBr,
      tSTRHi,
      tSTRHr,
      tSTRi,
      tSTRr,
      tSTRspi,
      VLDRD,
      VLDRH,
      VLDRS,
      VSTRD,
      VSTRH,
      VSTRS,
      MVE_VLD20_16,
      MVE_VLD20_32,
      MVE_VLD20_8,
      MVE_VLD21_16,
      MVE_VLD21_32,
      MVE_VLD21_8,
      MVE_VLD40_16,
      MVE_VLD40_32,
      MVE_VLD40_8,
      MVE_VLD41_16,
      MVE_VLD41_32,
      MVE_VLD41_8,
      MVE_VLD42_16,
      MVE_VLD42_32,
      MVE_VLD42_8,
      MVE_VLD43_16,
      MVE_VLD43_32,
      MVE_VLD43_8,
      MVE_VLDRBS16,
      MVE_VLDRBS16_rq,
      MVE_VLDRBS32,
      MVE_VLDRBS32_rq,
      MVE_VLDRBU16,
      MVE_VLDRBU16_rq,
      MVE_VLDRBU32,
      MVE_VLDRBU32_rq,
      MVE_VLDRBU8,
      MVE_VLDRBU8_rq,
      MVE_VLDRDU64_qi,
      MVE_VLDRDU64_rq,
      MVE_VLDRDU64_rq_u,
      MVE_VLDRHS32,
      MVE_VLDRHS32_rq,
      MVE_VLDRHS32_rq_u,
      MVE_VLDRHU16,
      MVE_VLDRHU16_rq,
      MVE_VLDRHU16_rq_u,
      MVE_VLDRHU32,
      MVE_VLDRHU32_rq,
      MVE_VLDRHU32_rq_u,
      MVE_VLDRWU32,
      MVE_VLDRWU32_qi,
      MVE_VLDRWU32_rq,
      MVE_VLDRWU32_rq_u,
      MVE_VST20_16,
      MVE_VST20_32,
      MVE_VST20_8,
      MVE_VST21_16,
      MVE_VST21_32,
      MVE_VST21_8,
      MVE_VST40_16,
      MVE_VST40_32,
      MVE_VST40_8,
      MVE_VST41_16,
      MVE_VST41_32,
      MVE_VST41_8,
      MVE_VST42_16,
      MVE_VST42_32,
      MVE_VST42_8,
      MVE_VST43_16,
      MVE_VST43_32,
      MVE_VST43_8,
      MVE_VSTRB16,
      MVE_VSTRB16_rq,
      MVE_VSTRB32,
      MVE_VSTRB32_rq,
      MVE_VSTRBU8,
      MVE_VSTRB8_rq,
      MVE_VSTRD64_qi,
      MVE_VSTRD64_rq,
      MVE_VSTRD64_rq_u,
      MVE_VSTRH32,
      MVE_VSTRH32_rq,
      MVE_VSTRH32_rq_u,
      MVE_VSTRHU16,
      MVE_VSTRH16_rq,
      MVE_VSTRH16_rq_u,
      MVE_VSTRWU32,
      MVE_VSTRW32_qi,
      MVE_VSTRW32_rq,
      MVE_VSTRW32_rq_u,
  };
  std::initializer_list<unsigned> Address2List = {
      t2LDRB_POST,
      t2LDRB_PRE,
      t2LDRDi8,
      t2LDRH_POST,
      t2LDRH_PRE,
      t2LDRSB_POST,
      t2LDRSB_PRE,
      t2LDRSH_POST,
      t2LDRSH_PRE,
      t2LDR_POST,
      t2LDR_PRE,
      t2STRB_POST,
      t2STRB_PRE,
      t2STRDi8,
      t2STRH_POST,
      t2STRH_PRE,
      t2STR_POST,
      t2STR_PRE,
      MVE_VLD20_16_wb,
      MVE_VLD20_32_wb,
      MVE_VLD20_8_wb,
      MVE_VLD21_16_wb,
      MVE_VLD21_32_wb,
      MVE_VLD21_8_wb,
      MVE_VLD40_16_wb,
      MVE_VLD40_32_wb,
      MVE_VLD40_8_wb,
      MVE_VLD41_16_wb,
      MVE_VLD41_32_wb,
      MVE_VLD41_8_wb,
      MVE_VLD42_16_wb,
      MVE_VLD42_32_wb,
      MVE_VLD42_8_wb,
      MVE_VLD43_16_wb,
      MVE_VLD43_32_wb,
      MVE_VLD43_8_wb,
      MVE_VLDRBS16_post,
      MVE_VLDRBS16_pre,
      MVE_VLDRBS32_post,
      MVE_VLDRBS32_pre,
      MVE_VLDRBU16_post,
      MVE_VLDRBU16_pre,
      MVE_VLDRBU32_post,
      MVE_VLDRBU32_pre,
      MVE_VLDRBU8_post,
      MVE_VLDRBU8_pre,
      MVE_VLDRDU64_qi_pre,
      MVE_VLDRHS32_post,
      MVE_VLDRHS32_pre,
      MVE_VLDRHU16_post,
      MVE_VLDRHU16_pre,
      MVE_VLDRHU32_post,
      MVE_VLDRHU32_pre,
      MVE_VLDRWU32_post,
      MVE_VLDRWU32_pre,
      MVE_VLDRWU32_qi_pre,
      MVE_VST20_16_wb,
      MVE_VST20_32_wb,
      MVE_VST20_8_wb,
      MVE_VST21_16_wb,
      MVE_VST21_32_wb,
      MVE_VST21_8_wb,
      MVE_VST40_16_wb,
      MVE_VST40_32_wb,
      MVE_VST40_8_wb,
      MVE_VST41_16_wb,
      MVE_VST41_32_wb,
      MVE_VST41_8_wb,
      MVE_VST42_16_wb,
      MVE_VST42_32_wb,
      MVE_VST42_8_wb,
      MVE_VST43_16_wb,
      MVE_VST43_32_wb,
      MVE_VST43_8_wb,
      MVE_VSTRB16_post,
      MVE_VSTRB16_pre,
      MVE_VSTRB32_post,
      MVE_VSTRB32_pre,
      MVE_VSTRBU8_post,
      MVE_VSTRBU8_pre,
      MVE_VSTRD64_qi_pre,
      MVE_VSTRH32_post,
      MVE_VSTRH32_pre,
      MVE_VSTRHU16_post,
      MVE_VSTRHU16_pre,
      MVE_VSTRWU32_post,
      MVE_VSTRWU32_pre,
      MVE_VSTRW32_qi_pre,
  };
  std::initializer_list<unsigned> Address3List = {
      t2LDRD_POST,
      t2LDRD_PRE,
      t2STRD_POST,
      t2STRD_PRE,
  };
  // Compute a mask of which operands are involved in address computation
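  // Bit N of AddressOpMask marks machine operand N as feeding address
  // generation: 0x6 covers operands 1-2, 0xc operands 2-3 and 0x18 operands
  // 3-4, with bit 3 also set for the shifted B-register forms.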
  for (auto &op : Address1List) {
    Info[op].AddressOpMask = 0x6;
  }
  for (auto &op : Address2List) {
    Info[op].AddressOpMask = 0xc;
  }
  for (auto &op : Address3List) {
    Info[op].AddressOpMask = 0x18;
  }
  for (auto &op : hasBRegAddrShiftList) {
    Info[op].AddressOpMask |= 0x8;
  }
}

void InstructionInformation::markDPProducersConsumers(
    const ARMBaseInstrInfo *TII) {
  // Learn about all instructions which have FP source/dest registers
  for (unsigned MI = 0; MI < ARM::INSTRUCTION_LIST_END; ++MI) {
    const MCInstrDesc &MID = TII->get(MI);
    auto Operands = MID.operands();
    for (unsigned OI = 0, OIE = MID.getNumOperands(); OI != OIE; ++OI) {
      bool MarkQP = false, MarkDP = false, MarkSP = false;
      switch (Operands[OI].RegClass) {
      case ARM::MQPRRegClassID:
      case ARM::DPRRegClassID:
      case ARM::DPR_8RegClassID:
      case ARM::DPR_VFP2RegClassID:
      case ARM::DPairRegClassID:
      case ARM::DPairSpcRegClassID:
      case ARM::DQuadRegClassID:
      case ARM::DQuadSpcRegClassID:
      case ARM::DTripleRegClassID:
      case ARM::DTripleSpcRegClassID:
        MarkDP = true;
        break;
      case ARM::QPRRegClassID:
      case ARM::QPR_8RegClassID:
      case ARM::QPR_VFP2RegClassID:
      case ARM::QQPRRegClassID:
      case ARM::QQQQPRRegClassID:
        MarkQP = true;
        break;
      case ARM::SPRRegClassID:
      case ARM::SPR_8RegClassID:
      case ARM::FPWithVPRRegClassID:
        MarkSP = true;
        break;
      default:
        break;
      }
      if (MarkQP) {
        if (OI < MID.getNumDefs())
          Info[MI].ProducesQP = true;
        else
          Info[MI].ConsumesQP = true;
      }
      if (MarkDP) {
        if (OI < MID.getNumDefs())
          Info[MI].ProducesDP = true;
        else
          Info[MI].ConsumesDP = true;
      }
      if (MarkSP) {
        if (OI < MID.getNumDefs())
          Info[MI].ProducesSP = true;
        else
          Info[MI].ConsumesSP = true;
      }
    }
  }
}

} // anonymous namespace

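// Returns true if MI implicitly reads CPSR. "True" flag consumers read CPSR
// through an implicit use; predicated instructions read it through an explicit
// predicate operand instead (see the comments in modifyBypasses below).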
static bool hasImplicitCPSRUse(const MachineInstr *MI) {
  return MI->getDesc().hasImplicitUseOfPhysReg(ARM::CPSR);
}

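// Update the latency of a dependence edge in both directions: the DAG keeps a
// copy of each edge on the producer's Succs list and on the consumer's Preds
// list, so both copies are changed and the cached depth/height invalidated.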
void ARMOverrideBypasses::setBidirLatencies(SUnit &SrcSU, SDep &SrcDep,
                                            unsigned latency) {
  SDep Reverse = SrcDep;
  Reverse.setSUnit(&SrcSU);
  for (SDep &PDep : SrcDep.getSUnit()->Preds) {
    if (PDep == Reverse) {
      PDep.setLatency(latency);
      SrcDep.getSUnit()->setDepthDirty();
      break;
    }
  }
  SrcDep.setLatency(latency);
  SrcSU.setHeightDirty();
}

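// Two predicates "match" if they are the same condition or inverses of each
// other. The ARMCC encodings place a condition and its inverse in adjacent
// values differing only in bit 0, so masking with 0xe compares the pair
// rather than the exact condition.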
static bool mismatchedPred(ARMCC::CondCodes a, ARMCC::CondCodes b) {
  return (a & 0xe) != (b & 0xe);
}

// Set output dependences to zero latency for processors which can
// simultaneously issue to the same register. Returns true if a change
// was made.
bool ARMOverrideBypasses::zeroOutputDependences(SUnit &ISU, SDep &Dep) {
  if (Dep.getKind() == SDep::Output) {
    setBidirLatencies(ISU, Dep, 0);
    return true;
  }
  return false;
}

// The graph doesn't look inside of bundles to determine their
// scheduling boundaries and reports zero latency into and out of them
// (except for CPSR into the bundle, which has latency 1).
// Make some better scheduling assumptions:
// 1) CPSR uses have zero latency; other uses have incoming latency 1
// 2) CPSR defs retain a latency of zero; others have a latency of 1.
//
// Returns 1 if a use change was made; 2 if a def change was made; 0 otherwise
unsigned ARMOverrideBypasses::makeBundleAssumptions(SUnit &ISU, SDep &Dep) {

  SUnit &DepSU = *Dep.getSUnit();
  const MachineInstr *SrcMI = ISU.getInstr();
  unsigned SrcOpcode = SrcMI->getOpcode();
  const MachineInstr *DstMI = DepSU.getInstr();
  unsigned DstOpcode = DstMI->getOpcode();

  if (DstOpcode == ARM::BUNDLE && TII->isPredicated(*DstMI)) {
    setBidirLatencies(
        ISU, Dep,
        (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR) ? 0 : 1);
    return 1;
  }
  if (SrcOpcode == ARM::BUNDLE && TII->isPredicated(*SrcMI) &&
      Dep.isAssignedRegDep() && Dep.getReg() != ARM::CPSR) {
    setBidirLatencies(ISU, Dep, 1);
    return 2;
  }
  return 0;
}

// Determine whether there is a memory RAW hazard here and set up latency
// accordingly
bool ARMOverrideBypasses::memoryRAWHazard(SUnit &ISU, SDep &Dep,
                                          unsigned latency) {
  if (!Dep.isNormalMemory())
    return false;
  auto &SrcInst = *ISU.getInstr();
  auto &DstInst = *Dep.getSUnit()->getInstr();
  if (!SrcInst.mayStore() || !DstInst.mayLoad())
    return false;

  auto SrcMO = *SrcInst.memoperands().begin();
  auto DstMO = *DstInst.memoperands().begin();
  auto SrcVal = SrcMO->getValue();
  auto DstVal = DstMO->getValue();
  auto SrcPseudoVal = SrcMO->getPseudoValue();
  auto DstPseudoVal = DstMO->getPseudoValue();
  if (SrcVal && DstVal && AA->alias(SrcVal, DstVal) == AliasResult::MustAlias &&
      SrcMO->getOffset() == DstMO->getOffset()) {
    setBidirLatencies(ISU, Dep, latency);
    return true;
  } else if (SrcPseudoVal && DstPseudoVal &&
             SrcPseudoVal->kind() == DstPseudoVal->kind() &&
             SrcPseudoVal->kind() == PseudoSourceValue::FixedStack) {
    // Spills/fills
    auto FS0 = cast<FixedStackPseudoSourceValue>(SrcPseudoVal);
    auto FS1 = cast<FixedStackPseudoSourceValue>(DstPseudoVal);
    if (FS0 == FS1) {
      setBidirLatencies(ISU, Dep, latency);
      return true;
    }
  }
  return false;
}

namespace {

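// Shared opcode-information table, built lazily by the first subtarget
// override object that is constructed and reused for the rest of the run.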
std::unique_ptr<InstructionInformation> II;

class CortexM7InstructionInformation : public InstructionInformation {
public:
  CortexM7InstructionInformation(const ARMBaseInstrInfo *TII)
      : InstructionInformation(TII) {}
};

class CortexM7Overrides : public ARMOverrideBypasses {
public:
  CortexM7Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
      : ARMOverrideBypasses(TII, AA) {
    if (!II)
      II.reset(new CortexM7InstructionInformation(TII));
  }

  void modifyBypasses(SUnit &) override;
};

void CortexM7Overrides::modifyBypasses(SUnit &ISU) {
  const MachineInstr *SrcMI = ISU.getInstr();
  unsigned SrcOpcode = SrcMI->getOpcode();
  bool isNSWload = II->isNonSubwordLoad(SrcOpcode);

  // Walk the successors looking for latency overrides that are needed
  for (SDep &Dep : ISU.Succs) {

    // Output dependences should have 0 latency, as the M7 is able to
    // schedule writers to the same register for simultaneous issue.
    if (zeroOutputDependences(ISU, Dep))
      continue;

    if (memoryRAWHazard(ISU, Dep, 4))
      continue;

    // Ignore dependencies other than data
    if (Dep.getKind() != SDep::Data)
      continue;

    SUnit &DepSU = *Dep.getSUnit();
    if (DepSU.isBoundaryNode())
      continue;

    if (makeBundleAssumptions(ISU, Dep) == 1)
      continue;

    const MachineInstr *DstMI = DepSU.getInstr();
    unsigned DstOpcode = DstMI->getOpcode();

    // Word loads feeding any multiply or divide instruction are treated as
    // unable to bypass their scheduling stage. This isn't done in the .td
    // file because we cannot easily create a ReadAdvance that is 0 from
    // certain writer classes and 1 from all the rest.
    // (The other way around would have been easy.)
    if (isNSWload && (II->isMultiply(DstOpcode) || II->isDivide(DstOpcode)))
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

    // Word loads feeding the B operand of a load/store are treated as unable
    // to bypass their scheduling stage. This cannot be done in the .td file
    // because we would need to decide between -1 and -2 for the ReadAdvance.
    if (isNSWload && II->hasBRegAddr(DstOpcode) &&
        DstMI->getOperand(2).getReg() == Dep.getReg())
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

    // Multiplies feeding any address generation cannot bypass from EX3. This
    // cannot be done in the .td file because we would need to decide between
    // -1 and -2 for the ReadAdvance.
    if (II->isMultiply(SrcOpcode)) {
      unsigned OpMask = II->getAddressOpMask(DstOpcode) >> 1;
      for (unsigned i = 1; OpMask; ++i, OpMask >>= 1) {
        if ((OpMask & 1) && DstMI->getOperand(i).isReg() &&
            DstMI->getOperand(i).getReg() == Dep.getReg()) {
          setBidirLatencies(ISU, Dep, 4); // first legal bypass is EX4->EX1
          break;
        }
      }
    }

    // Mismatched conditional producers take longer on M7; they end up looking
    // like they were produced at EX3 and read at IS.
    if (TII->isPredicated(*SrcMI) && Dep.isAssignedRegDep() &&
        (SrcOpcode == ARM::BUNDLE ||
         mismatchedPred(TII->getPredicate(*SrcMI),
                        TII->getPredicate(*DstMI)))) {
      unsigned Lat = 1;
      // Operand A of shift+ALU is treated as an EX1 read instead of EX2.
      if (II->isInlineShiftALU(DstOpcode) && DstMI->getOperand(3).getImm() &&
          DstMI->getOperand(1).getReg() == Dep.getReg())
        Lat = 2;
      Lat = std::min(3u, Dep.getLatency() + Lat);
      setBidirLatencies(ISU, Dep, std::max(Dep.getLatency(), Lat));
    }

    // A CC setter feeding a conditional producer shouldn't have a latency of
    // more than 1 unless it's due to an implicit read. (All the "true"
    // readers of the condition code use an implicit read, and predicates use
    // an explicit one.)
    if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
        TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
      setBidirLatencies(ISU, Dep, 1);

    // REV instructions cannot bypass directly into the EX1 shifter. The
    // code is slightly inexact as it doesn't attempt to ensure that the
    // bypass is to the shifter operands.
    if (II->isRev(SrcOpcode)) {
      if (II->isInlineShiftALU(DstOpcode))
        setBidirLatencies(ISU, Dep, 2);
      else if (II->isShift(DstOpcode))
        setBidirLatencies(ISU, Dep, 1);
    }
  }
}

class M85InstructionInformation : public InstructionInformation {
public:
  M85InstructionInformation(const ARMBaseInstrInfo *t)
      : InstructionInformation(t) {
    markDPProducersConsumers(t);
  }
};

class M85Overrides : public ARMOverrideBypasses {
public:
  M85Overrides(const ARMBaseInstrInfo *t, AAResults *a)
      : ARMOverrideBypasses(t, a) {
    if (!II)
      II.reset(new M85InstructionInformation(t));
  }

  void modifyBypasses(SUnit &) override;

private:
  unsigned computeBypassStage(const MCSchedClassDesc *SCD);
  signed modifyMixedWidthFP(const MachineInstr *SrcMI,
                            const MachineInstr *DstMI, unsigned RegID,
                            const MCSchedClassDesc *SCD);
};

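// Map a scheduling class's write latency onto the pipeline stage that
// presumably produces the result: latencies of 3 or less map to themselves,
// a latency of 4 maps to stage 2, and anything longer maps to stage 3.
// Defaults to 2 when no write latency entry is available.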
unsigned M85Overrides::computeBypassStage(const MCSchedClassDesc *SCDesc) {
  auto SM = DAG->getSchedModel();
  unsigned DefIdx = 0; // just look for the first output's timing
  if (DefIdx < SCDesc->NumWriteLatencyEntries) {
    // Lookup the definition's write latency in SubtargetInfo.
    const MCWriteLatencyEntry *WLEntry =
        SM->getSubtargetInfo()->getWriteLatencyEntry(SCDesc, DefIdx);
    unsigned Latency = WLEntry->Cycles >= 0 ? WLEntry->Cycles : 1000;
    if (Latency == 4)
      return 2;
    else if (Latency == 5)
      return 3;
    else if (Latency > 3)
      return 3;
    else
      return Latency;
  }
  return 2;
}

// Latency changes for bypassing between FP registers of different sizes:
//
// Note that mixed DP/SP are unlikely because of the semantics
// of C. Mixed MVE/SP are quite common when MVE intrinsics are used.
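//
// The returned value is a signed adjustment that is added to the edge latency
// at the call site (and clamped at zero there); positive values lengthen the
// bypass and negative values shorten it, depending on which subregister of
// the wider register the narrower value occupies.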
signed M85Overrides::modifyMixedWidthFP(const MachineInstr *SrcMI,
                                        const MachineInstr *DstMI,
                                        unsigned RegID,
                                        const MCSchedClassDesc *SCD) {

  if (!II->producesSP(SrcMI->getOpcode()) &&
      !II->producesDP(SrcMI->getOpcode()) &&
      !II->producesQP(SrcMI->getOpcode()))
    return 0;

  if (Register::isVirtualRegister(RegID)) {
    if (II->producesSP(SrcMI->getOpcode()) &&
        II->consumesDP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return 5 - computeBypassStage(SCD);
    } else if (II->producesSP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
            (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
          return 5 - computeBypassStage(SCD) -
                 ((OP.getSubReg() == ARM::ssub_2 ||
                   OP.getSubReg() == ARM::ssub_3)
                      ? 1
                      : 0);
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return -1;
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      for (auto &OP : DstMI->operands())
        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return 5 - computeBypassStage(SCD);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      for (auto &OP : DstMI->operands())
        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
            (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
          return 5 - computeBypassStage(SCD) +
                 ((OP.getSubReg() == ARM::ssub_2 ||
                   OP.getSubReg() == ARM::ssub_3)
                      ? 1
                      : 0);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesDP(DstMI->getOpcode())) {
      for (auto &OP : DstMI->operands())
        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return 1;
    }
  } else if (Register::isPhysicalRegister(RegID)) {
    // Note that when the producer is narrower, not all of the producers
    // may be present in the scheduling graph; somewhere earlier in the
    // compiler, an implicit def/use of the aliased full register gets
    // added to the producer, and so only that producer is seen as *the*
    // single producer. This behavior also has the unfortunate effect of
    // serializing the producers in the compiler's view of things.
    if (II->producesSP(SrcMI->getOpcode()) &&
        II->consumesDP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
            OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
            (OP.getReg() == RegID ||
             (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
             (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
          return 5 - computeBypassStage(SCD);
    } else if (II->producesSP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
            OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
            (OP.getReg() == RegID ||
             (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
             (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
          return 5 - computeBypassStage(SCD) -
                 (((OP.getReg() - ARM::S0) / 2) % 2 ? 1 : 0);
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::D0 &&
            OP.getReg() <= ARM::D15 && (OP.getReg() - ARM::D0) % 2 &&
            (OP.getReg() == RegID ||
             (OP.getReg() - ARM::D0) / 2 + ARM::Q0 == RegID))
          return -1;
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
        return 5 - computeBypassStage(SCD);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
        return 5 - computeBypassStage(SCD) +
               (((RegID - ARM::S0) / 2) % 2 ? 1 : 0);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesDP(DstMI->getOpcode())) {
      if (RegID >= ARM::D1 && RegID <= ARM::D15 && (RegID - ARM::D0) % 2)
        return 1;
    }
  }
  return 0;
}

void M85Overrides::modifyBypasses(SUnit &ISU) {
  const MachineInstr *SrcMI = ISU.getInstr();
  unsigned SrcOpcode = SrcMI->getOpcode();
  bool isNSWload = II->isNonSubwordLoad(SrcOpcode);

  // Walk the successors looking for latency overrides that are needed
  for (SDep &Dep : ISU.Succs) {

    // Output dependences should have 0 latency, as the Cortex-M85 is able to
    // schedule writers to the same register for simultaneous issue.
    if (zeroOutputDependences(ISU, Dep))
      continue;

    if (memoryRAWHazard(ISU, Dep, 3))
      continue;

    // Ignore dependencies other than data.
    if (Dep.getKind() != SDep::Data)
      continue;

    SUnit &DepSU = *Dep.getSUnit();
    if (DepSU.isBoundaryNode())
      continue;

    if (makeBundleAssumptions(ISU, Dep) == 1)
      continue;

    const MachineInstr *DstMI = DepSU.getInstr();
    unsigned DstOpcode = DstMI->getOpcode();

    // Word loads feeding the shifted B operand of a load/store cannot bypass
    // their scheduling stage. This cannot be done in the .td file because we
    // would need to decide between -1 and -2 for the ReadAdvance.

    if (isNSWload && II->hasBRegAddrShift(DstOpcode) &&
        DstMI->getOperand(3).getImm() != 0 && // shift operand
        DstMI->getOperand(2).getReg() == Dep.getReg())
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

    if (isNSWload && isMVEVectorInstruction(DstMI)) {
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
    }

    if (II->isMVEIntMAC(DstOpcode) &&
        II->isMVEIntMACMatched(SrcOpcode, DstOpcode) &&
        DstMI->getOperand(0).isReg() &&
        DstMI->getOperand(0).getReg() == Dep.getReg())
      setBidirLatencies(ISU, Dep, Dep.getLatency() - 1);

    // A CC setter feeding a conditional producer shouldn't have a latency of
    // more than 0 unless it's due to an implicit read.
    if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
        TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
      setBidirLatencies(ISU, Dep, 0);

    if (signed ALat = modifyMixedWidthFP(SrcMI, DstMI, Dep.getReg(),
                                         DAG->getSchedClass(&ISU)))
      setBidirLatencies(ISU, Dep, std::max(0, signed(Dep.getLatency()) + ALat));

    if (II->isRev(SrcOpcode)) {
      if (II->isInlineShiftALU(DstOpcode))
        setBidirLatencies(ISU, Dep, 1);
      else if (II->isShift(DstOpcode))
        setBidirLatencies(ISU, Dep, 1);
    }
  }
}

// Add M55 specific overrides for latencies between instructions. Currently it:
// - Adds an extra cycle latency between MVE VMLAV and scalar instructions.
class CortexM55Overrides : public ARMOverrideBypasses {
public:
  CortexM55Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
      : ARMOverrideBypasses(TII, AA) {}

  void modifyBypasses(SUnit &SU) override {
    MachineInstr *SrcMI = SU.getInstr();
    if (!(SrcMI->getDesc().TSFlags & ARMII::HorizontalReduction))
      return;

    for (SDep &Dep : SU.Succs) {
      if (Dep.getKind() != SDep::Data)
        continue;
      SUnit &DepSU = *Dep.getSUnit();
      if (DepSU.isBoundaryNode())
        continue;
      MachineInstr *DstMI = DepSU.getInstr();

      if (!isMVEVectorInstruction(DstMI) && !DstMI->mayStore())
        setBidirLatencies(SU, Dep, 3);
    }
  }
};

} // end anonymous namespace

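// Entry point for the mutation: walk every node in the scheduling DAG
// (including the ExitSU, if it wraps a real instruction) and let the
// subtarget-specific override adjust the latencies of its successor edges.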
void ARMOverrideBypasses::apply(ScheduleDAGInstrs *DAGInstrs) {
  DAG = DAGInstrs;
  for (SUnit &ISU : DAGInstrs->SUnits) {
    if (ISU.isBoundaryNode())
      continue;
    modifyBypasses(ISU);
  }
  if (DAGInstrs->ExitSU.getInstr())
    modifyBypasses(DAGInstrs->ExitSU);
}

std::unique_ptr<ScheduleDAGMutation>
createARMLatencyMutations(const ARMSubtarget &ST, AAResults *AA) {
  if (ST.isCortexM85())
    return std::make_unique<M85Overrides>(ST.getInstrInfo(), AA);
  else if (ST.isCortexM7())
    return std::make_unique<CortexM7Overrides>(ST.getInstrInfo(), AA);
  else if (ST.isCortexM55())
    return std::make_unique<CortexM55Overrides>(ST.getInstrInfo(), AA);

  return nullptr;
}

} // end namespace llvm