| 1 | //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// \file |
| 9 | /// Finalize v8.1-m low-overhead loops by converting the associated pseudo |
| 10 | /// instructions into machine operations. |
| 11 | /// The expectation is that the loop contains three pseudo instructions: |
| 12 | /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop |
| 13 | /// form should be in the preheader, whereas the while form should be in the |
| 14 | /// preheader's only predecessor. |
| 15 | /// - t2LoopDec - placed within the loop body. |
| 16 | /// - t2LoopEnd - the loop latch terminator. |
| 17 | /// |
| 18 | /// In addition to this, we also look for the presence of the VCTP instruction, |
| 19 | /// which determines whether we can generate the tail-predicated low-overhead |
| 20 | /// loop form. |
| 21 | /// |
| 22 | /// Assumptions and Dependencies: |
| 23 | /// Low-overhead loops are constructed and executed using a setup instruction: |
| 24 | /// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP. |
| 25 | /// WLS(TP) and LE(TP) are branching instructions with a (large) limited range |
| 26 | /// but fixed polarity: WLS can only branch forwards and LE can only branch |
| 27 | /// backwards. These restrictions mean that this pass is dependent upon block |
| 28 | /// layout and block sizes, which is why it's the last pass to run. The same is |
| 29 | /// true for ConstantIslands, but this pass does not increase the size of the |
| 30 | /// basic blocks, nor does it change the CFG. Instructions are mainly removed |
| 31 | /// during the transform and pseudo instructions are replaced by real ones. In |
| 32 | /// some cases, when we have to revert to a 'normal' loop, we have to introduce |
| 33 | /// multiple instructions for a single pseudo (see RevertWhile and |
| 34 | /// RevertLoopEnd). To handle this situation, t2WhileLoopStartLR and t2LoopEnd |
| 35 | /// are defined to be as large as this maximum sequence of replacement |
| 36 | /// instructions. |
| 37 | /// |
| 38 | /// A note on VPR.P0 (the lane mask): |
| 39 | /// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a |
| 40 | /// "VPT Active" context (which includes low-overhead loops and vpt blocks). |
| 41 | /// They will simply "and" the result of their calculation with the current |
| 42 | /// value of VPR.P0. You can think of it like this: |
| 43 | /// \verbatim |
| 44 | /// if VPT active: ; Between a DLSTP/LETP, or for predicated instrs |
| 45 | /// VPR.P0 &= Value |
| 46 | /// else |
| 47 | /// VPR.P0 = Value |
| 48 | /// \endverbatim |
| 49 | /// When we're inside the low-overhead loop (between DLSTP and LETP), we always |
| 50 | /// fall in the "VPT active" case, so we can consider that all VPR writes by |
| 51 | /// one of those instructions is actually an "and". |
| 52 | //===----------------------------------------------------------------------===// |
| 53 | |
| 54 | #include "ARM.h" |
| 55 | #include "ARMBaseInstrInfo.h" |
| 56 | #include "ARMBasicBlockInfo.h" |
| 57 | #include "ARMSubtarget.h" |
| 58 | #include "MVETailPredUtils.h" |
| 59 | #include "Thumb2InstrInfo.h" |
| 60 | #include "llvm/ADT/SetVector.h" |
| 61 | #include "llvm/CodeGen/LivePhysRegs.h" |
| 62 | #include "llvm/CodeGen/MachineFrameInfo.h" |
| 63 | #include "llvm/CodeGen/MachineFunctionPass.h" |
| 64 | #include "llvm/CodeGen/MachineLoopInfo.h" |
| 65 | #include "llvm/CodeGen/MachineLoopUtils.h" |
| 66 | #include "llvm/CodeGen/MachineRegisterInfo.h" |
| 67 | #include "llvm/CodeGen/Passes.h" |
| 68 | #include "llvm/CodeGen/ReachingDefAnalysis.h" |
| 69 | #include "llvm/MC/MCInstrDesc.h" |
| 70 | |
| 71 | using namespace llvm; |
| 72 | |
| 73 | #define DEBUG_TYPE "arm-low-overhead-loops" |
| 74 | #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass" |
| 75 | |
// Debug/testing knob: force the pass to never form a tail-predicated
// (DLSTP/WLSTP ... LETP) loop, falling back to plain DLS/WLS loops.
static cl::opt<bool>
DisableTailPredication("arm-loloops-disable-tailpred" , cl::Hidden,
    cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass" ),
    cl::init(Val: false));

// Debug/testing knob: keep the redundant 'dls lr, lr' even where it could
// be omitted.
static cl::opt<bool>
DisableOmitDLS("arm-disable-omit-dls" , cl::Hidden,
    cl::desc("Disable omitting 'dls lr, lr' instructions" ),
    cl::init(Val: false));
| 85 | |
| 86 | static bool isVectorPredicated(MachineInstr *MI) { |
| 87 | int PIdx = llvm::findFirstVPTPredOperandIdx(MI: *MI); |
| 88 | return PIdx != -1 && MI->getOperand(i: PIdx + 1).getReg() == ARM::VPR; |
| 89 | } |
| 90 | |
| 91 | static bool isVectorPredicate(MachineInstr *MI) { |
| 92 | return MI->findRegisterDefOperandIdx(Reg: ARM::VPR, /*TRI=*/nullptr) != -1; |
| 93 | } |
| 94 | |
| 95 | static bool hasVPRUse(MachineInstr &MI) { |
| 96 | return MI.findRegisterUseOperandIdx(Reg: ARM::VPR, /*TRI=*/nullptr) != -1; |
| 97 | } |
| 98 | |
| 99 | static bool isDomainMVE(MachineInstr *MI) { |
| 100 | uint64_t Domain = MI->getDesc().TSFlags & ARMII::DomainMask; |
| 101 | return Domain == ARMII::DomainMVE; |
| 102 | } |
| 103 | |
| 104 | static int getVecSize(const MachineInstr &MI) { |
| 105 | const MCInstrDesc &MCID = MI.getDesc(); |
| 106 | uint64_t Flags = MCID.TSFlags; |
| 107 | return (Flags & ARMII::VecSize) >> ARMII::VecSizeShift; |
| 108 | } |
| 109 | |
| 110 | static bool shouldInspect(MachineInstr &MI) { |
| 111 | if (MI.isDebugInstr()) |
| 112 | return false; |
| 113 | return isDomainMVE(MI: &MI) || isVectorPredicate(MI: &MI) || hasVPRUse(MI); |
| 114 | } |
| 115 | |
| 116 | static bool isHorizontalReduction(const MachineInstr &MI) { |
| 117 | const MCInstrDesc &MCID = MI.getDesc(); |
| 118 | uint64_t Flags = MCID.TSFlags; |
| 119 | return (Flags & ARMII::HorizontalReduction) != 0; |
| 120 | } |
| 121 | |
| 122 | namespace { |
| 123 | |
| 124 | using InstSet = SmallPtrSetImpl<MachineInstr *>; |
| 125 | |
| 126 | class PostOrderLoopTraversal { |
| 127 | MachineLoop &ML; |
| 128 | MachineLoopInfo &MLI; |
| 129 | SmallPtrSet<MachineBasicBlock*, 4> Visited; |
| 130 | SmallVector<MachineBasicBlock*, 4> Order; |
| 131 | |
| 132 | public: |
| 133 | PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI) |
| 134 | : ML(ML), MLI(MLI) { } |
| 135 | |
| 136 | const SmallVectorImpl<MachineBasicBlock*> &getOrder() const { |
| 137 | return Order; |
| 138 | } |
| 139 | |
| 140 | // Visit all the blocks within the loop, as well as exit blocks and any |
| 141 | // blocks properly dominating the header. |
| 142 | void ProcessLoop() { |
| 143 | std::function<void(MachineBasicBlock *)> Search = |
| 144 | [this, &Search](MachineBasicBlock *MBB) -> void { |
| 145 | if (!Visited.insert(Ptr: MBB).second) |
| 146 | return; |
| 147 | |
| 148 | for (auto *Succ : MBB->successors()) { |
| 149 | if (!ML.contains(BB: Succ)) |
| 150 | continue; |
| 151 | Search(Succ); |
| 152 | } |
| 153 | Order.push_back(Elt: MBB); |
| 154 | }; |
| 155 | |
| 156 | // Insert exit blocks. |
| 157 | SmallVector<MachineBasicBlock*, 2> ExitBlocks; |
| 158 | ML.getExitBlocks(ExitBlocks); |
| 159 | append_range(C&: Order, R&: ExitBlocks); |
| 160 | |
| 161 | // Then add the loop body. |
| 162 | Search(ML.getHeader()); |
| 163 | |
| 164 | // Then try the preheader and its predecessors. |
| 165 | std::function<void(MachineBasicBlock*)> GetPredecessor = |
| 166 | [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void { |
| 167 | Order.push_back(Elt: MBB); |
| 168 | if (MBB->pred_size() == 1) |
| 169 | GetPredecessor(*MBB->pred_begin()); |
| 170 | }; |
| 171 | |
| 172 | if (auto * = ML.getLoopPreheader()) |
| 173 | GetPredecessor(Preheader); |
| 174 | else if (auto * = MLI.findLoopPreheader(L: &ML, SpeculativePreheader: true, FindMultiLoopPreheader: true)) |
| 175 | GetPredecessor(Preheader); |
| 176 | } |
| 177 | }; |
| 178 | |
| 179 | class VPTBlock { |
| 180 | SmallVector<MachineInstr *, 4> Insts; |
| 181 | |
| 182 | public: |
| 183 | VPTBlock(MachineInstr *MI) { Insts.push_back(Elt: MI); } |
| 184 | |
| 185 | // Have we found an instruction within the block which defines the vpr? If |
| 186 | // so, not all the instructions in the block will have the same predicate. |
| 187 | bool hasUniformPredicate() { return getDivergent() == nullptr; } |
| 188 | |
| 189 | // If it exists, return the first internal instruction which modifies the |
| 190 | // VPR. |
| 191 | MachineInstr *getDivergent() { |
| 192 | SmallVectorImpl<MachineInstr *> &Insts = getInsts(); |
| 193 | for (unsigned i = 1; i < Insts.size(); ++i) { |
| 194 | MachineInstr *Next = Insts[i]; |
| 195 | if (isVectorPredicate(MI: Next)) |
| 196 | return Next; // Found an instruction altering the vpr. |
| 197 | } |
| 198 | return nullptr; |
| 199 | } |
| 200 | |
| 201 | void insert(MachineInstr *MI) { |
| 202 | Insts.push_back(Elt: MI); |
| 203 | // VPT/VPST + 4 predicated instructions. |
| 204 | assert(Insts.size() <= 5 && "Too many instructions in VPT block!" ); |
| 205 | } |
| 206 | |
| 207 | bool containsVCTP() const { return llvm::any_of(Range: Insts, P: isVCTP); } |
| 208 | |
| 209 | unsigned size() const { return Insts.size(); } |
| 210 | SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; } |
| 211 | }; |
| 212 | |
| 213 | // Represent the current state of the VPR and hold all instances which |
| 214 | // represent a VPT block, which is a list of instructions that begins with a |
| 215 | // VPT/VPST and has a maximum of four proceeding instructions. All |
| 216 | // instructions within the block are predicated upon the vpr and we allow |
| 217 | // instructions to define the vpr within in the block too. |
class VPTState {
  friend struct LowOverheadLoop;

  // All VPT blocks discovered so far, in program order.
  SmallVector<VPTBlock, 4> Blocks;
  // The set of instructions whose results are currently AND'ed together
  // to form the value of VPR.P0 (see the file header comment on VPR.P0).
  SetVector<MachineInstr *> CurrentPredicates;
  // For each recorded instruction, the snapshot of CurrentPredicates it
  // was executing under.
  std::map<MachineInstr *, SetVector<MachineInstr *>> PredicatedInsts;

  // Begin a new VPT block headed by MI, snapshotting the predicate set
  // that governs its execution.
  void CreateVPTBlock(MachineInstr *MI) {
    assert((CurrentPredicates.size() || MI->getParent()->isLiveIn(ARM::VPR))
           && "Can't begin VPT without predicate" );
    Blocks.emplace_back(Args&: MI);
    // The execution of MI is predicated upon the current set of instructions
    // that are AND'ed together to form the VPR predicate value. In the case
    // that MI is a VPT, CurrentPredicates will also just be MI.
    PredicatedInsts[MI] = CurrentPredicates;
  }

  // Append MI to the block currently being built and record the predicate
  // set it runs under.
  void addInst(MachineInstr *MI) {
    Blocks.back().insert(MI);
    PredicatedInsts[MI] = CurrentPredicates;
  }

  // MI writes VPR in a "VPT active" context, so its value ANDs into the
  // current predicate: add it to the set.
  void addPredicate(MachineInstr *MI) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI);
    CurrentPredicates.insert(X: MI);
  }

  // MI overwrites VPR outright: the predicate is now exactly MI's result.
  void resetPredicate(MachineInstr *MI) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI);
    CurrentPredicates.clear();
    CurrentPredicates.insert(X: MI);
  }

public:
  // Return whether the given instruction is predicated upon a VCTP.
  // With Exclusive set, the VCTP must be the only predicate in MI's set.
  bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) {
    SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI];
    if (Exclusive && Predicates.size() != 1)
      return false;
    // We do not know how to convert an else predicate of a VCTP.
    if (getVPTInstrPredicate(MI: *MI) == ARMVCC::Else)
      return false;
    return llvm::any_of(Range&: Predicates, P: isVCTP);
  }

  // Is the VPST, controlling the block entry, predicated upon a VCTP.
  bool isEntryPredicatedOnVCTP(VPTBlock &Block, bool Exclusive = false) {
    SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
    return isPredicatedOnVCTP(MI: Insts.front(), Exclusive);
  }

  // If this block begins with a VPT, we can check whether it's using
  // at least one predicated input(s), as well as possible loop invariant
  // which would result in it being implicitly predicated.
  bool hasImplicitlyValidVPT(VPTBlock &Block, ReachingDefAnalysis &RDA) {
    SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
    MachineInstr *VPT = Insts.front();
    assert(isVPTOpcode(VPT->getOpcode()) &&
           "Expected VPT block to begin with VPT/VPST" );

    // A VPST takes its predicate from a register rather than computing it,
    // so it can never be implicitly valid.
    if (VPT->getOpcode() == ARM::MVE_VPST)
      return false;

    // If the VPT block does not define something that is an "output", then
    // the tail-predicated version will just perform a subset of the original
    // vpt block, where the last lanes should not be used.
    if (isVPTOpcode(Opc: VPT->getOpcode()) &&
        all_of(Range&: Block.getInsts(), P: [](const MachineInstr *MI) {
          return !MI->mayStore() && !MI->mayLoad() &&
                 !isHorizontalReduction(MI: *MI) && !isVCTP(MI);
        }))
      return true;

    // Is operand Idx of MI produced by an instruction that is itself
    // predicated on a VCTP?
    auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) {
      MachineInstr *Op = RDA.getMIOperand(MI, MO&: MI->getOperand(i: Idx));
      return Op && PredicatedInsts.count(x: Op) && isPredicatedOnVCTP(MI: Op);
    };

    // Is operand Idx of MI invariant with respect to the VPT's block, i.e.
    // no reaching definition originates from within that block?
    auto IsOperandInvariant = [&](MachineInstr *MI, unsigned Idx) {
      MachineOperand &MO = MI->getOperand(i: Idx);
      if (!MO.isReg() || !MO.getReg())
        return true;

      SmallPtrSet<MachineInstr *, 2> Defs;
      RDA.getGlobalReachingDefs(MI, Reg: MO.getReg(), Defs);
      if (Defs.empty())
        return true;

      for (auto *Def : Defs)
        if (Def->getParent() == VPT->getParent())
          return false;
      return true;
    };

    // Check that at least one of the operands is directly predicated on a
    // vctp and allow an invariant value too.
    return (IsOperandPredicated(VPT, 1) || IsOperandPredicated(VPT, 2)) &&
           (IsOperandPredicated(VPT, 1) || IsOperandInvariant(VPT, 1)) &&
           (IsOperandPredicated(VPT, 2) || IsOperandInvariant(VPT, 2));
  }

  // Return true if every recorded VPT block is compatible with conversion
  // to a tail-predicated loop.
  bool isValid(ReachingDefAnalysis &RDA) {
    // All predication within the loop should be based on vctp. If the block
    // isn't predicated on entry, check whether the vctp is within the block
    // and that all other instructions are then predicated on it.
    for (auto &Block : Blocks) {
      if (isEntryPredicatedOnVCTP(Block, Exclusive: false) &&
          !any_of(Range: drop_begin(RangeOrContainer&: Block.getInsts()), P: [](const MachineInstr *MI) {
            return getVPTInstrPredicate(MI: *MI) == ARMVCC::Else;
          }))
        continue;
      if (hasImplicitlyValidVPT(Block, RDA))
        continue;

      SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
      // We don't know how to convert a block with just a VPT;VCTP into
      // anything valid once we remove the VCTP. For now just bail out.
      assert(isVPTOpcode(Insts.front()->getOpcode()) &&
             "Expected VPT block to start with a VPST or VPT!" );
      if (Insts.size() == 2 && Insts.front()->getOpcode() != ARM::MVE_VPST &&
          isVCTP(MI: Insts.back()))
        return false;

      for (auto *MI : Insts) {
        // Check that any internal VCTPs are 'Then' predicated.
        if (isVCTP(MI) && getVPTInstrPredicate(MI: *MI) != ARMVCC::Then)
          return false;
        // Skip other instructions that build up the predicate.
        if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI))
          continue;
        // Check that any other instructions are predicated upon a vctp.
        // TODO: We could infer when VPTs are implicitly predicated on the
        // vctp (when the operands are predicated).
        if (!isPredicatedOnVCTP(MI)) {
          LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI);
          return false;
        }
      }
    }
    return true;
  }
};
| 360 | |
| 361 | struct LowOverheadLoop { |
| 362 | |
| 363 | MachineLoop &ML; |
| 364 | MachineBasicBlock * = nullptr; |
| 365 | MachineLoopInfo &MLI; |
| 366 | ReachingDefAnalysis &RDA; |
| 367 | const TargetRegisterInfo &TRI; |
| 368 | const ARMBaseInstrInfo &TII; |
| 369 | MachineFunction *MF = nullptr; |
| 370 | MachineBasicBlock::iterator StartInsertPt; |
| 371 | MachineBasicBlock *StartInsertBB = nullptr; |
| 372 | MachineInstr *Start = nullptr; |
| 373 | MachineInstr *Dec = nullptr; |
| 374 | MachineInstr *End = nullptr; |
| 375 | MachineOperand TPNumElements; |
| 376 | SmallVector<MachineInstr *, 4> VCTPs; |
| 377 | SmallPtrSet<MachineInstr *, 4> ToRemove; |
| 378 | SmallPtrSet<MachineInstr *, 4> BlockMasksToRecompute; |
| 379 | SmallPtrSet<MachineInstr *, 4> DoubleWidthResultInstrs; |
| 380 | SmallPtrSet<MachineInstr *, 4> VMOVCopies; |
| 381 | bool Revert = false; |
| 382 | bool CannotTailPredicate = false; |
| 383 | VPTState VPTstate; |
| 384 | |
| 385 | LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI, |
| 386 | ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI, |
| 387 | const ARMBaseInstrInfo &TII) |
| 388 | : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII), |
| 389 | TPNumElements(MachineOperand::CreateImm(Val: 0)) { |
| 390 | MF = ML.getHeader()->getParent(); |
| 391 | if (auto *MBB = ML.getLoopPreheader()) |
| 392 | Preheader = MBB; |
| 393 | else if (auto *MBB = MLI.findLoopPreheader(L: &ML, SpeculativePreheader: true, FindMultiLoopPreheader: true)) |
| 394 | Preheader = MBB; |
| 395 | } |
| 396 | |
| 397 | // If this is an MVE instruction, check that we know how to use tail |
| 398 | // predication with it. Record VPT blocks and return whether the |
| 399 | // instruction is valid for tail predication. |
| 400 | bool ValidateMVEInst(MachineInstr *MI); |
| 401 | |
| 402 | void AnalyseMVEInst(MachineInstr *MI) { |
| 403 | CannotTailPredicate = !ValidateMVEInst(MI); |
| 404 | } |
| 405 | |
| 406 | bool IsTailPredicationLegal() const { |
| 407 | // For now, let's keep things really simple and only support a single |
| 408 | // block for tail predication. |
| 409 | return !Revert && FoundAllComponents() && !VCTPs.empty() && |
| 410 | !CannotTailPredicate && ML.getNumBlocks() == 1; |
| 411 | } |
| 412 | |
| 413 | // Given that MI is a VCTP, check that is equivalent to any other VCTPs |
| 414 | // found. |
| 415 | bool AddVCTP(MachineInstr *MI); |
| 416 | |
| 417 | // Check that the predication in the loop will be equivalent once we |
| 418 | // perform the conversion. Also ensure that we can provide the number |
| 419 | // of elements to the loop start instruction. |
| 420 | bool ValidateTailPredicate(); |
| 421 | |
| 422 | // Check that any values available outside of the loop will be the same |
| 423 | // after tail predication conversion. |
| 424 | bool ValidateLiveOuts(); |
| 425 | |
| 426 | // Check the branch targets are within range and we satisfy our |
| 427 | // restrictions. |
| 428 | void Validate(ARMBasicBlockUtils *BBUtils); |
| 429 | |
| 430 | bool FoundAllComponents() const { |
| 431 | return Start && Dec && End; |
| 432 | } |
| 433 | |
| 434 | SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTstate.Blocks; } |
| 435 | |
| 436 | // Return the operand for the loop start instruction. This will be the loop |
| 437 | // iteration count, or the number of elements if we're tail predicating. |
| 438 | MachineOperand &getLoopStartOperand() { |
| 439 | if (IsTailPredicationLegal()) |
| 440 | return TPNumElements; |
| 441 | return Start->getOperand(i: 1); |
| 442 | } |
| 443 | |
| 444 | unsigned getStartOpcode() const { |
| 445 | bool IsDo = isDoLoopStart(MI: *Start); |
| 446 | if (!IsTailPredicationLegal()) |
| 447 | return IsDo ? ARM::t2DLS : ARM::t2WLS; |
| 448 | |
| 449 | return VCTPOpcodeToLSTP(Opcode: VCTPs.back()->getOpcode(), IsDoLoop: IsDo); |
| 450 | } |
| 451 | |
| 452 | void dump() const { |
| 453 | if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start; |
| 454 | if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec; |
| 455 | if (End) dbgs() << "ARM Loops: Found Loop End: " << *End; |
| 456 | if (!VCTPs.empty()) { |
| 457 | dbgs() << "ARM Loops: Found VCTP(s):\n" ; |
| 458 | for (auto *MI : VCTPs) |
| 459 | dbgs() << " - " << *MI; |
| 460 | } |
| 461 | if (!FoundAllComponents()) |
| 462 | dbgs() << "ARM Loops: Not a low-overhead loop.\n" ; |
| 463 | else if (!(Start && Dec && End)) |
| 464 | dbgs() << "ARM Loops: Failed to find all loop components.\n" ; |
| 465 | } |
| 466 | }; |
| 467 | |
// The machine-function pass itself: finds low-overhead-loop pseudos in each
// loop and either expands them into DLS(TP)/WLS(TP)/LE(TP) or reverts them
// to ordinary compare-and-branch sequences.
class ARMLowOverheadLoops : public MachineFunctionPass {
  MachineFunction *MF = nullptr;
  MachineLoopInfo *MLI = nullptr;
  ReachingDefAnalysis *RDA = nullptr;
  const ARMBaseInstrInfo *TII = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;

public:
  static char ID;

  ARMLowOverheadLoops() : MachineFunctionPass(ID) { }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // The pass removes/replaces instructions but never changes the CFG
    // (see the file header comment).
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfoWrapperPass>();
    AU.addRequired<ReachingDefAnalysis>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  MachineFunctionProperties getRequiredProperties() const override {
    // Runs after register allocation and requires liveness tracking.
    return MachineFunctionProperties().setNoVRegs().setTracksLiveness();
  }

  StringRef getPassName() const override {
    return ARM_LOW_OVERHEAD_LOOPS_NAME;
  }

private:
  // Process one loop (and, via recursion elsewhere, its children).
  bool ProcessLoop(MachineLoop *ML);

  // Revert any loop pseudos left outside of recognized loops.
  bool RevertNonLoops();

  // Lower a t2WhileLoopStart / t2DoLoopStart back to ordinary code.
  void RevertWhile(MachineInstr *MI) const;
  void RevertDo(MachineInstr *MI) const;

  bool RevertLoopDec(MachineInstr *MI) const;

  void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;

  void RevertLoopEndDec(MachineInstr *MI) const;

  // Remove VPST/adjust VPT masks once the loop is tail predicated.
  void ConvertVPTBlocks(LowOverheadLoop &LoLoop);

  // Replace the start pseudo with the real DLS/WLS/DLSTP/WLSTP.
  MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);

  // Perform the full conversion (or reversion) of one analysed loop.
  void Expand(LowOverheadLoop &LoLoop);

  // Dead-code-eliminate the now-unneeded iteration-count computation.
  void IterationCountDCE(LowOverheadLoop &LoLoop);
};
| 521 | } |
| 522 | |
char ARMLowOverheadLoops::ID = 0;

// Register the pass with LLVM's pass registry under DEBUG_TYPE.
INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
                false, false)
| 527 | |
// Try to mark MI (and everything it transitively makes dead) for removal,
// adding the dead instructions to ToRemove. Instructions in Ignore are not
// counted as remaining users. Returns false - removing nothing - if removal
// would leave a partially-emptied IT block behind.
static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA,
                      InstSet &ToRemove, InstSet &Ignore) {

  // Check that we can remove all of Killed without having to modify any IT
  // blocks.
  auto WontCorruptITs = [](InstSet &Killed, ReachingDefAnalysis &RDA) {
    // Collect the dead code and the MBBs in which they reside.
    SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks;
    for (auto *Dead : Killed)
      BasicBlocks.insert(Ptr: Dead->getParent());

    // Collect IT blocks in all affected basic blocks.
    std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks;
    for (auto *MBB : BasicBlocks) {
      for (auto &IT : *MBB) {
        if (IT.getOpcode() != ARM::t2IT)
          continue;
        // The members of an IT block are the local users of its ITSTATE.
        RDA.getReachingLocalUses(MI: &IT, Reg: MCRegister::from(Val: ARM::ITSTATE),
                                 Uses&: ITBlocks[&IT]);
      }
    }

    // If we're removing all of the instructions within an IT block, then
    // also remove the IT instruction.
    SmallPtrSet<MachineInstr *, 2> ModifiedITs;
    SmallPtrSet<MachineInstr *, 2> RemoveITs;
    for (auto *Dead : Killed) {
      if (MachineOperand *MO =
              Dead->findRegisterUseOperand(Reg: ARM::ITSTATE, /*TRI=*/nullptr)) {
        MachineInstr *IT = RDA.getMIOperand(MI: Dead, MO&: *MO);
        RemoveITs.insert(Ptr: IT);
        auto &CurrentBlock = ITBlocks[IT];
        CurrentBlock.erase(Ptr: Dead);
        // An IT whose block becomes empty can be deleted; one that keeps
        // some members would need its mask rewritten, which we don't do.
        if (CurrentBlock.empty())
          ModifiedITs.erase(Ptr: IT);
        else
          ModifiedITs.insert(Ptr: IT);
      }
    }
    if (!ModifiedITs.empty())
      return false;
    Killed.insert_range(R&: RemoveITs);
    return true;
  };

  SmallPtrSet<MachineInstr *, 2> Uses;
  if (!RDA.isSafeToRemove(MI, ToRemove&: Uses, Ignore))
    return false;

  if (WontCorruptITs(Uses, RDA)) {
    ToRemove.insert_range(R&: Uses);
    LLVM_DEBUG(dbgs() << "ARM Loops: Able to remove: " << *MI
               << " - can also remove:\n" ;
               for (auto *Use : Uses)
                 dbgs() << "   - " << *Use);

    // Removing MI may also kill the instructions that only fed it; fold
    // those in too when it's IT-safe.
    SmallPtrSet<MachineInstr*, 4> Killed;
    RDA.collectKilledOperands(MI, Dead&: Killed);
    if (WontCorruptITs(Killed, RDA)) {
      ToRemove.insert_range(R&: Killed);
      LLVM_DEBUG(for (auto *Dead : Killed)
                   dbgs() << "   - " << *Dead);
    }
    return true;
  }
  return false;
}
| 595 | |
| 596 | bool LowOverheadLoop::ValidateTailPredicate() { |
| 597 | if (!IsTailPredicationLegal()) { |
| 598 | LLVM_DEBUG(if (VCTPs.empty()) |
| 599 | dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n" ; |
| 600 | dbgs() << "ARM Loops: Tail-predication is not valid.\n" ); |
| 601 | return false; |
| 602 | } |
| 603 | |
| 604 | assert(!VCTPs.empty() && "VCTP instruction expected but is not set" ); |
| 605 | assert(ML.getBlocks().size() == 1 && |
| 606 | "Shouldn't be processing a loop with more than one block" ); |
| 607 | |
| 608 | if (DisableTailPredication) { |
| 609 | LLVM_DEBUG(dbgs() << "ARM Loops: tail-predication is disabled\n" ); |
| 610 | return false; |
| 611 | } |
| 612 | |
| 613 | if (!VPTstate.isValid(RDA)) { |
| 614 | LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n" ); |
| 615 | return false; |
| 616 | } |
| 617 | |
| 618 | if (!ValidateLiveOuts()) { |
| 619 | LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n" ); |
| 620 | return false; |
| 621 | } |
| 622 | |
| 623 | // For tail predication, we need to provide the number of elements, instead |
| 624 | // of the iteration count, to the loop start instruction. The number of |
| 625 | // elements is provided to the vctp instruction, so we need to check that |
| 626 | // we can use this register at InsertPt. |
| 627 | MachineInstr *VCTP = VCTPs.back(); |
| 628 | if (Start->getOpcode() == ARM::t2DoLoopStartTP || |
| 629 | Start->getOpcode() == ARM::t2WhileLoopStartTP) { |
| 630 | TPNumElements = Start->getOperand(i: 2); |
| 631 | StartInsertPt = Start; |
| 632 | StartInsertBB = Start->getParent(); |
| 633 | } else { |
| 634 | TPNumElements = VCTP->getOperand(i: 1); |
| 635 | MCRegister NumElements = TPNumElements.getReg().asMCReg(); |
| 636 | |
| 637 | // If the register is defined within loop, then we can't perform TP. |
| 638 | // TODO: Check whether this is just a mov of a register that would be |
| 639 | // available. |
| 640 | if (RDA.hasLocalDefBefore(MI: VCTP, Reg: NumElements)) { |
| 641 | LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n" ); |
| 642 | return false; |
| 643 | } |
| 644 | |
| 645 | // The element count register maybe defined after InsertPt, in which case we |
| 646 | // need to try to move either InsertPt or the def so that the [w|d]lstp can |
| 647 | // use the value. |
| 648 | |
| 649 | if (StartInsertPt != StartInsertBB->end() && |
| 650 | !RDA.isReachingDefLiveOut(MI: &*StartInsertPt, Reg: NumElements)) { |
| 651 | if (auto *ElemDef = |
| 652 | RDA.getLocalLiveOutMIDef(MBB: StartInsertBB, Reg: NumElements)) { |
| 653 | if (RDA.isSafeToMoveForwards(From: ElemDef, To: &*StartInsertPt)) { |
| 654 | ElemDef->removeFromParent(); |
| 655 | StartInsertBB->insert(I: StartInsertPt, MI: ElemDef); |
| 656 | LLVM_DEBUG(dbgs() |
| 657 | << "ARM Loops: Moved element count def: " << *ElemDef); |
| 658 | } else if (RDA.isSafeToMoveBackwards(From: &*StartInsertPt, To: ElemDef)) { |
| 659 | StartInsertPt->removeFromParent(); |
| 660 | StartInsertBB->insertAfter(I: MachineBasicBlock::iterator(ElemDef), |
| 661 | MI: &*StartInsertPt); |
| 662 | LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef); |
| 663 | } else { |
| 664 | // If we fail to move an instruction and the element count is provided |
| 665 | // by a mov, use the mov operand if it will have the same value at the |
| 666 | // insertion point |
| 667 | MachineOperand Operand = ElemDef->getOperand(i: 1); |
| 668 | if (isMovRegOpcode(Opc: ElemDef->getOpcode()) && |
| 669 | RDA.getUniqueReachingMIDef(MI: ElemDef, Reg: Operand.getReg().asMCReg()) == |
| 670 | RDA.getUniqueReachingMIDef(MI: &*StartInsertPt, |
| 671 | Reg: Operand.getReg().asMCReg())) { |
| 672 | TPNumElements = Operand; |
| 673 | NumElements = TPNumElements.getReg(); |
| 674 | } else { |
| 675 | LLVM_DEBUG(dbgs() |
| 676 | << "ARM Loops: Unable to move element count to loop " |
| 677 | << "start instruction.\n" ); |
| 678 | return false; |
| 679 | } |
| 680 | } |
| 681 | } |
| 682 | } |
| 683 | |
| 684 | // Especially in the case of while loops, InsertBB may not be the |
| 685 | // preheader, so we need to check that the register isn't redefined |
| 686 | // before entering the loop. |
| 687 | auto CannotProvideElements = [this](MachineBasicBlock *MBB, |
| 688 | MCRegister NumElements) { |
| 689 | if (MBB->empty()) |
| 690 | return false; |
| 691 | // NumElements is redefined in this block. |
| 692 | if (RDA.hasLocalDefBefore(MI: &MBB->back(), Reg: NumElements)) |
| 693 | return true; |
| 694 | |
| 695 | // Don't continue searching up through multiple predecessors. |
| 696 | if (MBB->pred_size() > 1) |
| 697 | return true; |
| 698 | |
| 699 | return false; |
| 700 | }; |
| 701 | |
| 702 | // Search backwards for a def, until we get to InsertBB. |
| 703 | MachineBasicBlock *MBB = Preheader; |
| 704 | while (MBB && MBB != StartInsertBB) { |
| 705 | if (CannotProvideElements(MBB, NumElements)) { |
| 706 | LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n" ); |
| 707 | return false; |
| 708 | } |
| 709 | MBB = *MBB->pred_begin(); |
| 710 | } |
| 711 | } |
| 712 | |
| 713 | // Could inserting the [W|D]LSTP cause some unintended affects? In a perfect |
| 714 | // world the [w|d]lstp instruction would be last instruction in the preheader |
| 715 | // and so it would only affect instructions within the loop body. But due to |
| 716 | // scheduling, and/or the logic in this pass (above), the insertion point can |
| 717 | // be moved earlier. So if the Loop Start isn't the last instruction in the |
| 718 | // preheader, and if the initial element count is smaller than the vector |
| 719 | // width, the Loop Start instruction will immediately generate one or more |
| 720 | // false lane mask which can, incorrectly, affect the proceeding MVE |
| 721 | // instructions in the preheader. |
| 722 | if (std::any_of(first: StartInsertPt, last: StartInsertBB->end(), pred: shouldInspect)) { |
| 723 | LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W|D]LSTP\n" ); |
| 724 | return false; |
| 725 | } |
| 726 | |
| 727 | // For any DoubleWidthResultInstrs we found whilst scanning instructions, they |
| 728 | // need to compute an output size that is smaller than the VCTP mask operates |
| 729 | // on. The VecSize of the DoubleWidthResult is the larger vector size - the |
| 730 | // size it extends into, so any VCTP VecSize <= is valid. |
| 731 | unsigned VCTPVecSize = getVecSize(MI: *VCTP); |
| 732 | for (MachineInstr *MI : DoubleWidthResultInstrs) { |
| 733 | unsigned InstrVecSize = getVecSize(MI: *MI); |
| 734 | if (InstrVecSize > VCTPVecSize) { |
| 735 | LLVM_DEBUG(dbgs() << "ARM Loops: Double width result larger than VCTP " |
| 736 | << "VecSize:\n" << *MI); |
| 737 | return false; |
| 738 | } |
| 739 | } |
| 740 | |
| 741 | // Check that the value change of the element count is what we expect and |
| 742 | // that the predication will be equivalent. For this we need: |
| 743 | // NumElements = NumElements - VectorWidth. The sub will be a sub immediate |
| 744 | // and we can also allow register copies within the chain too. |
| 745 | auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) { |
| 746 | return -getAddSubImmediate(MI&: *MI) == ExpectedVecWidth; |
| 747 | }; |
| 748 | |
| 749 | MachineBasicBlock *MBB = VCTP->getParent(); |
| 750 | // Remove modifications to the element count since they have no purpose in a |
| 751 | // tail predicated loop. Explicitly refer to the vctp operand no matter which |
| 752 | // register NumElements has been assigned to, since that is what the |
| 753 | // modifications will be using |
| 754 | if (auto *Def = RDA.getUniqueReachingMIDef( |
| 755 | MI: &MBB->back(), Reg: VCTP->getOperand(i: 1).getReg().asMCReg())) { |
| 756 | SmallPtrSet<MachineInstr*, 2> ElementChain; |
| 757 | SmallPtrSet<MachineInstr*, 2> Ignore; |
| 758 | unsigned ExpectedVectorWidth = getTailPredVectorWidth(Opcode: VCTP->getOpcode()); |
| 759 | |
| 760 | Ignore.insert_range(R&: VCTPs); |
| 761 | |
| 762 | if (TryRemove(MI: Def, RDA, ToRemove&: ElementChain, Ignore)) { |
| 763 | bool FoundSub = false; |
| 764 | |
| 765 | for (auto *MI : ElementChain) { |
| 766 | if (isMovRegOpcode(Opc: MI->getOpcode())) |
| 767 | continue; |
| 768 | |
| 769 | if (isSubImmOpcode(Opc: MI->getOpcode())) { |
| 770 | if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) { |
| 771 | LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element" |
| 772 | " count: " << *MI); |
| 773 | return false; |
| 774 | } |
| 775 | FoundSub = true; |
| 776 | } else { |
| 777 | LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element" |
| 778 | " count: " << *MI); |
| 779 | return false; |
| 780 | } |
| 781 | } |
| 782 | ToRemove.insert_range(R&: ElementChain); |
| 783 | } |
| 784 | } |
| 785 | |
| 786 | // If we converted the LoopStart to a t2DoLoopStartTP/t2WhileLoopStartTP, we |
| 787 | // can also remove any extra instructions in the preheader, which often |
| 788 | // includes a now unused MOV. |
| 789 | if ((Start->getOpcode() == ARM::t2DoLoopStartTP || |
| 790 | Start->getOpcode() == ARM::t2WhileLoopStartTP) && |
| 791 | Preheader && !Preheader->empty() && |
| 792 | !RDA.hasLocalDefBefore(MI: VCTP, Reg: VCTP->getOperand(i: 1).getReg())) { |
| 793 | if (auto *Def = RDA.getUniqueReachingMIDef( |
| 794 | MI: &Preheader->back(), Reg: VCTP->getOperand(i: 1).getReg().asMCReg())) { |
| 795 | SmallPtrSet<MachineInstr *, 2> Ignore(llvm::from_range, VCTPs); |
| 796 | TryRemove(MI: Def, RDA, ToRemove, Ignore); |
| 797 | } |
| 798 | } |
| 799 | |
| 800 | return true; |
| 801 | } |
| 802 | |
| 803 | static bool isRegInClass(const MachineOperand &MO, |
| 804 | const TargetRegisterClass *Class) { |
| 805 | return MO.isReg() && MO.getReg() && Class->contains(Reg: MO.getReg()); |
| 806 | } |
| 807 | |
| 808 | // MVE 'narrowing' operate on half a lane, reading from half and writing |
| 809 | // to half, which are referred to has the top and bottom half. The other |
| 810 | // half retains its previous value. |
| 811 | static bool retainsPreviousHalfElement(const MachineInstr &MI) { |
| 812 | const MCInstrDesc &MCID = MI.getDesc(); |
| 813 | uint64_t Flags = MCID.TSFlags; |
| 814 | return (Flags & ARMII::RetainsPreviousHalfElement) != 0; |
| 815 | } |
| 816 | |
| 817 | // Some MVE instructions read from the top/bottom halves of their operand(s) |
| 818 | // and generate a vector result with result elements that are double the |
| 819 | // width of the input. |
| 820 | static bool producesDoubleWidthResult(const MachineInstr &MI) { |
| 821 | const MCInstrDesc &MCID = MI.getDesc(); |
| 822 | uint64_t Flags = MCID.TSFlags; |
| 823 | return (Flags & ARMII::DoubleWidthResult) != 0; |
| 824 | } |
| 825 | |
| 826 | // Can this instruction generate a non-zero result when given only zeroed |
| 827 | // operands? This allows us to know that, given operands with false bytes |
| 828 | // zeroed by masked loads, that the result will also contain zeros in those |
| 829 | // bytes. |
| 830 | static bool canGenerateNonZeros(const MachineInstr &MI) { |
| 831 | |
| 832 | // Check for instructions which can write into a larger element size, |
| 833 | // possibly writing into a previous zero'd lane. |
| 834 | if (producesDoubleWidthResult(MI)) |
| 835 | return true; |
| 836 | |
| 837 | switch (MI.getOpcode()) { |
| 838 | default: |
| 839 | break; |
| 840 | // FIXME: VNEG FP and -0? I think we'll need to handle this once we allow |
| 841 | // fp16 -> fp32 vector conversions. |
| 842 | // Instructions that perform a NOT will generate 1s from 0s. |
| 843 | case ARM::MVE_VMVN: |
| 844 | case ARM::MVE_VORN: |
| 845 | // Count leading zeros will do just that! |
| 846 | case ARM::MVE_VCLZs8: |
| 847 | case ARM::MVE_VCLZs16: |
| 848 | case ARM::MVE_VCLZs32: |
| 849 | return true; |
| 850 | } |
| 851 | return false; |
| 852 | } |
| 853 | |
// Look at its register uses to see if it only can only receive zeros
// into its false lanes which would then produce zeros. Also check that
// the output register is also defined by an FalseLanesZero instruction
// so that if tail-predication happens, the lanes that aren't updated will
// still be zeros.
//
// \param MI             the instruction to inspect.
// \param QPRs           the MVE Q-register class, used to filter operands.
// \param RDA            reaching-def analysis used to find operand producers.
// \param FalseLanesZero set of instructions already known to produce zeros in
//                       their false lanes.
// \returns true if MI is guaranteed to produce zeros in its false lanes.
static bool producesFalseLanesZero(MachineInstr &MI,
                                   const TargetRegisterClass *QPRs,
                                   const ReachingDefAnalysis &RDA,
                                   InstSet &FalseLanesZero) {
  // If the instruction can create non-zero bits from zeroed inputs (NOT,
  // CLZ, widening ops), it can never qualify.
  if (canGenerateNonZeros(MI))
    return false;

  bool isPredicated = isVectorPredicated(MI: &MI);
  // Predicated loads will write zeros to the falsely predicated bytes of the
  // destination register.
  if (MI.mayLoad())
    return isPredicated;

  // Recognise an unpredicated 'vmov.i32 q, #0', which zeroes the whole
  // register and so trivially has zeroed false lanes.
  auto IsZeroInit = [](MachineInstr *Def) {
    return !isVectorPredicated(MI: Def) &&
           Def->getOpcode() == ARM::MVE_VMOVimmi32 &&
           Def->getOperand(i: 1).getImm() == 0;
  };

  // Horizontal reductions consume scalars too, so scalar operands may be
  // skipped for them; otherwise only Q-register operands are inspected.
  bool AllowScalars = isHorizontalReduction(MI);
  for (auto &MO : MI.operands()) {
    if (!MO.isReg() || !MO.getReg())
      continue;
    if (!isRegInClass(MO, Class: QPRs) && AllowScalars)
      continue;
    // Skip the lr predicate reg
    int PIdx = llvm::findFirstVPTPredOperandIdx(MI);
    if (PIdx != -1 && (int)MO.getOperandNo() == PIdx + 2)
      continue;

    // Check that this instruction will produce zeros in its false lanes:
    // - If it only consumes false lanes zero or constant 0 (vmov #0)
    // - If it's predicated, it only matters that it's def register already has
    //   false lane zeros, so we can ignore the uses.
    SmallPtrSet<MachineInstr *, 2> Defs;
    RDA.getGlobalReachingDefs(MI: &MI, Reg: MO.getReg(), Defs);
    // Conservatively reject operands with no visible producer.
    if (Defs.empty())
      return false;
    for (auto *Def : Defs) {
      if (Def == &MI || FalseLanesZero.count(Ptr: Def) || IsZeroInit(Def))
        continue;
      if (MO.isUse() && isPredicated)
        continue;
      return false;
    }
  }
  LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
  return true;
}
| 908 | |
| 909 | bool LowOverheadLoop::ValidateLiveOuts() { |
| 910 | // We want to find out if the tail-predicated version of this loop will |
| 911 | // produce the same values as the loop in its original form. For this to |
| 912 | // be true, the newly inserted implicit predication must not change the |
| 913 | // the (observable) results. |
| 914 | // We're doing this because many instructions in the loop will not be |
| 915 | // predicated and so the conversion from VPT predication to tail-predication |
| 916 | // can result in different values being produced; due to the tail-predication |
| 917 | // preventing many instructions from updating their falsely predicated |
| 918 | // lanes. This analysis assumes that all the instructions perform lane-wise |
| 919 | // operations and don't perform any exchanges. |
| 920 | // A masked load, whether through VPT or tail predication, will write zeros |
| 921 | // to any of the falsely predicated bytes. So, from the loads, we know that |
| 922 | // the false lanes are zeroed and here we're trying to track that those false |
| 923 | // lanes remain zero, or where they change, the differences are masked away |
| 924 | // by their user(s). |
| 925 | // All MVE stores have to be predicated, so we know that any predicate load |
| 926 | // operands, or stored results are equivalent already. Other explicitly |
| 927 | // predicated instructions will perform the same operation in the original |
| 928 | // loop and the tail-predicated form too. Because of this, we can insert |
| 929 | // loads, stores and other predicated instructions into our Predicated |
| 930 | // set and build from there. |
| 931 | const TargetRegisterClass *QPRs = TRI.getRegClass(i: ARM::MQPRRegClassID); |
| 932 | SetVector<MachineInstr *> FalseLanesUnknown; |
| 933 | SmallPtrSet<MachineInstr *, 4> FalseLanesZero; |
| 934 | SmallPtrSet<MachineInstr *, 4> Predicated; |
| 935 | MachineBasicBlock * = ML.getHeader(); |
| 936 | |
| 937 | LLVM_DEBUG(dbgs() << "ARM Loops: Validating Live outs\n" ); |
| 938 | |
| 939 | for (auto &MI : *Header) { |
| 940 | if (!shouldInspect(MI)) |
| 941 | continue; |
| 942 | |
| 943 | if (isVCTP(MI: &MI) || isVPTOpcode(Opc: MI.getOpcode())) |
| 944 | continue; |
| 945 | |
| 946 | bool isPredicated = isVectorPredicated(MI: &MI); |
| 947 | bool retainsOrReduces = |
| 948 | retainsPreviousHalfElement(MI) || isHorizontalReduction(MI); |
| 949 | |
| 950 | if (isPredicated) |
| 951 | Predicated.insert(Ptr: &MI); |
| 952 | if (producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero)) |
| 953 | FalseLanesZero.insert(Ptr: &MI); |
| 954 | else if (MI.getNumDefs() == 0) |
| 955 | continue; |
| 956 | else if (!isPredicated && retainsOrReduces) { |
| 957 | LLVM_DEBUG(dbgs() << " Unpredicated instruction that retainsOrReduces: " << MI); |
| 958 | return false; |
| 959 | } else if (!isPredicated && MI.getOpcode() != ARM::MQPRCopy) |
| 960 | FalseLanesUnknown.insert(X: &MI); |
| 961 | } |
| 962 | |
| 963 | LLVM_DEBUG({ |
| 964 | dbgs() << " Predicated:\n" ; |
| 965 | for (auto *I : Predicated) |
| 966 | dbgs() << " " << *I; |
| 967 | dbgs() << " FalseLanesZero:\n" ; |
| 968 | for (auto *I : FalseLanesZero) |
| 969 | dbgs() << " " << *I; |
| 970 | dbgs() << " FalseLanesUnknown:\n" ; |
| 971 | for (auto *I : FalseLanesUnknown) |
| 972 | dbgs() << " " << *I; |
| 973 | }); |
| 974 | |
| 975 | auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO, |
| 976 | SmallPtrSetImpl<MachineInstr *> &Predicated) { |
| 977 | SmallPtrSet<MachineInstr *, 2> Uses; |
| 978 | RDA.getGlobalUses(MI, Reg: MO.getReg().asMCReg(), Uses); |
| 979 | for (auto *Use : Uses) { |
| 980 | if (Use != MI && !Predicated.count(Ptr: Use)) |
| 981 | return false; |
| 982 | } |
| 983 | return true; |
| 984 | }; |
| 985 | |
| 986 | // Visit the unknowns in reverse so that we can start at the values being |
| 987 | // stored and then we can work towards the leaves, hopefully adding more |
| 988 | // instructions to Predicated. Successfully terminating the loop means that |
| 989 | // all the unknown values have to found to be masked by predicated user(s). |
| 990 | // For any unpredicated values, we store them in NonPredicated so that we |
| 991 | // can later check whether these form a reduction. |
| 992 | SmallPtrSet<MachineInstr*, 2> NonPredicated; |
| 993 | for (auto *MI : reverse(C&: FalseLanesUnknown)) { |
| 994 | for (auto &MO : MI->operands()) { |
| 995 | if (!isRegInClass(MO, Class: QPRs) || !MO.isDef()) |
| 996 | continue; |
| 997 | if (!HasPredicatedUsers(MI, MO, Predicated)) { |
| 998 | LLVM_DEBUG(dbgs() << " Found an unknown def of : " |
| 999 | << TRI.getRegAsmName(MO.getReg()) << " at " << *MI); |
| 1000 | NonPredicated.insert(Ptr: MI); |
| 1001 | break; |
| 1002 | } |
| 1003 | } |
| 1004 | // Any unknown false lanes have been masked away by the user(s). |
| 1005 | if (!NonPredicated.contains(Ptr: MI)) |
| 1006 | Predicated.insert(Ptr: MI); |
| 1007 | } |
| 1008 | |
| 1009 | SmallPtrSet<MachineInstr *, 2> LiveOutMIs; |
| 1010 | SmallVector<MachineBasicBlock *, 2> ExitBlocks; |
| 1011 | ML.getExitBlocks(ExitBlocks); |
| 1012 | assert(ML.getNumBlocks() == 1 && "Expected single block loop!" ); |
| 1013 | assert(ExitBlocks.size() == 1 && "Expected a single exit block" ); |
| 1014 | MachineBasicBlock *ExitBB = ExitBlocks.front(); |
| 1015 | for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) { |
| 1016 | // TODO: Instead of blocking predication, we could move the vctp to the exit |
| 1017 | // block and calculate it's operand there in or the preheader. |
| 1018 | if (RegMask.PhysReg == ARM::VPR) { |
| 1019 | LLVM_DEBUG(dbgs() << " VPR is live in to the exit block." ); |
| 1020 | return false; |
| 1021 | } |
| 1022 | // Check Q-regs that are live in the exit blocks. We don't collect scalars |
| 1023 | // because they won't be affected by lane predication. |
| 1024 | if (QPRs->contains(Reg: RegMask.PhysReg)) |
| 1025 | if (auto *MI = RDA.getLocalLiveOutMIDef(MBB: Header, Reg: RegMask.PhysReg)) |
| 1026 | LiveOutMIs.insert(Ptr: MI); |
| 1027 | } |
| 1028 | |
| 1029 | // We've already validated that any VPT predication within the loop will be |
| 1030 | // equivalent when we perform the predication transformation; so we know that |
| 1031 | // any VPT predicated instruction is predicated upon VCTP. Any live-out |
| 1032 | // instruction needs to be predicated, so check this here. The instructions |
| 1033 | // in NonPredicated have been found to be a reduction that we can ensure its |
| 1034 | // legality. Any MQPRCopy found will need to validate its input as if it was |
| 1035 | // live out. |
| 1036 | SmallVector<MachineInstr *> Worklist(LiveOutMIs.begin(), LiveOutMIs.end()); |
| 1037 | while (!Worklist.empty()) { |
| 1038 | MachineInstr *MI = Worklist.pop_back_val(); |
| 1039 | if (MI->getOpcode() == ARM::MQPRCopy) { |
| 1040 | VMOVCopies.insert(Ptr: MI); |
| 1041 | MachineInstr *CopySrc = |
| 1042 | RDA.getUniqueReachingMIDef(MI, Reg: MI->getOperand(i: 1).getReg()); |
| 1043 | if (CopySrc) |
| 1044 | Worklist.push_back(Elt: CopySrc); |
| 1045 | } else if (NonPredicated.count(Ptr: MI) && FalseLanesUnknown.contains(key: MI)) { |
| 1046 | LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI); |
| 1047 | VMOVCopies.clear(); |
| 1048 | return false; |
| 1049 | } |
| 1050 | } |
| 1051 | |
| 1052 | return true; |
| 1053 | } |
| 1054 | |
// Final validation step once all loop components have been collected: check
// the WLS/LE branch ranges and decide whether tail-predication is possible.
// Sets Revert and/or CannotTailPredicate on failure.
void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) {
  // An earlier analysis step may already have decided to revert; nothing
  // further to check in that case.
  if (Revert)
    return;

  // Check branch target ranges: WLS[TP] can only branch forwards and LE[TP]
  // can only jump back.
  auto ValidateRanges = [](MachineInstr *Start, MachineInstr *End,
                           ARMBasicBlockUtils *BBUtils, MachineLoop &ML) {
    // t2LoopEnd holds its target in operand 1; the combined t2LoopEndDec
    // form holds it in operand 2.
    MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd
                                   ? End->getOperand(i: 1).getMBB()
                                   : End->getOperand(i: 2).getMBB();
    // TODO Maybe there's cases where the target doesn't have to be the header,
    // but for now be safe and revert.
    if (TgtBB != ML.getHeader()) {
      LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n" );
      return false;
    }

    // The WLS and LE instructions have 12-bits for the label offset. WLS
    // requires a positive offset, while LE uses negative.
    if (BBUtils->getOffsetOf(MI: End) < BBUtils->getOffsetOf(MBB: ML.getHeader()) ||
        !BBUtils->isBBInRange(MI: End, DestBB: ML.getHeader(), MaxDisp: 4094)) {
      LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n" );
      return false;
    }

    if (isWhileLoopStart(MI: *Start)) {
      MachineBasicBlock *TargetBB = getWhileLoopStartTargetBB(MI: *Start);
      // WLS must branch forwards, and likewise within the 12-bit range.
      if (BBUtils->getOffsetOf(MI: Start) > BBUtils->getOffsetOf(MBB: TargetBB) ||
          !BBUtils->isBBInRange(MI: Start, DestBB: TargetBB, MaxDisp: 4094)) {
        LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n" );
        return false;
      }
    }
    return true;
  };

  // Default insertion point for the LoopStart: directly at the existing
  // pseudo. ValidateTailPredicate may reposition it.
  StartInsertPt = MachineBasicBlock::iterator(Start);
  StartInsertBB = Start->getParent();
  LLVM_DEBUG(dbgs() << "ARM Loops: Will insert LoopStart at "
                    << *StartInsertPt);

  Revert = !ValidateRanges(Start, End, BBUtils, ML);
  CannotTailPredicate = !ValidateTailPredicate();
}
| 1100 | |
| 1101 | bool LowOverheadLoop::AddVCTP(MachineInstr *MI) { |
| 1102 | LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI); |
| 1103 | if (VCTPs.empty()) { |
| 1104 | VCTPs.push_back(Elt: MI); |
| 1105 | return true; |
| 1106 | } |
| 1107 | |
| 1108 | // If we find another VCTP, check whether it uses the same value as the main VCTP. |
| 1109 | // If it does, store it in the VCTPs set, else refuse it. |
| 1110 | MachineInstr *Prev = VCTPs.back(); |
| 1111 | if (!Prev->getOperand(i: 1).isIdenticalTo(Other: MI->getOperand(i: 1)) || |
| 1112 | !RDA.hasSameReachingDef(A: Prev, B: MI, Reg: MI->getOperand(i: 1).getReg().asMCReg())) { |
| 1113 | LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching " |
| 1114 | "definition from the main VCTP" ); |
| 1115 | return false; |
| 1116 | } |
| 1117 | VCTPs.push_back(Elt: MI); |
| 1118 | return true; |
| 1119 | } |
| 1120 | |
// Decide whether an unpredicated MVE vector store is acceptable in a
// tail-predicated loop. Only spills to a fixed stack slot qualify, and only
// when no block reachable after the loop reloads that same slot before
// overwriting it.
// \returns true if MI is such a safe stack spill.
static bool ValidateMVEStore(MachineInstr *MI, MachineLoop *ML) {

  // Extract the fixed-stack frame index from a memory operand, or -1 if the
  // operand doesn't reference a fixed stack slot.
  auto GetFrameIndex = [](MachineMemOperand *Operand) {
    const PseudoSourceValue *PseudoValue = Operand->getPseudoValue();
    if (PseudoValue && PseudoValue->kind() == PseudoSourceValue::FixedStack) {
      if (const auto *FS = dyn_cast<FixedStackPseudoSourceValue>(Val: PseudoValue)) {
        return FS->getFrameIndex();
      }
    }
    return -1;
  };

  // Is this instruction an SP-relative MVE word load/store of a fixed stack
  // slot?
  auto IsStackOp = [GetFrameIndex](MachineInstr *I) {
    switch (I->getOpcode()) {
    case ARM::MVE_VSTRWU32:
    case ARM::MVE_VLDRWU32: {
      return I->getOperand(i: 1).getReg() == ARM::SP &&
             I->memoperands().size() == 1 &&
             GetFrameIndex(I->memoperands().front()) >= 0;
    }
    default:
      return false;
    }
  };

  // An unpredicated vector register spill is allowed if all of the uses of the
  // stack slot are within the loop
  if (MI->getOpcode() != ARM::MVE_VSTRWU32 || !IsStackOp(MI))
    return false;

  // Search all blocks after the loop for accesses to the same stack slot.
  // ReachingDefAnalysis doesn't work for sp as it relies on registers being
  // live-out (which sp never is) to know what blocks to look in
  if (MI->memoperands().size() == 0)
    return false;
  int FI = GetFrameIndex(MI->memoperands().front());

  auto &FrameInfo = MI->getParent()->getParent()->getFrameInfo();
  if (FI == -1 || !FrameInfo.isSpillSlotObjectIndex(ObjectIdx: FI))
    return false;

  // Breadth-first walk of the CFG starting at the loop's exit blocks,
  // using Frontier as the worklist (Idx is the next unprocessed entry).
  SmallVector<MachineBasicBlock *> Frontier;
  ML->getExitBlocks(ExitBlocks&: Frontier);
  SmallPtrSet<MachineBasicBlock *, 4> Visited{MI->getParent()};
  unsigned Idx = 0;
  while (Idx < Frontier.size()) {
    MachineBasicBlock *BB = Frontier[Idx];
    bool LookAtSuccessors = true;
    for (auto &I : *BB) {
      if (!IsStackOp(&I) || I.memoperands().size() == 0)
        continue;
      if (GetFrameIndex(I.memoperands().front()) != FI)
        continue;
      // If this block has a store to the stack slot before any loads then we
      // can ignore the block
      if (I.getOpcode() == ARM::MVE_VSTRWU32) {
        LookAtSuccessors = false;
        break;
      }
      // If the store and the load are using the same stack slot then the
      // store isn't valid for tail predication
      if (I.getOpcode() == ARM::MVE_VLDRWU32)
        return false;
    }

    // Only successors not yet visited or queued are added to the frontier.
    if (LookAtSuccessors) {
      for (auto *Succ : BB->successors()) {
        if (!Visited.contains(Ptr: Succ) && !is_contained(Range&: Frontier, Element: Succ))
          Frontier.push_back(Elt: Succ);
      }
    }
    Visited.insert(Ptr: BB);
    Idx++;
  }

  // No reload of the slot was found after the loop; the spill is safe.
  return true;
}
| 1198 | |
// Analyse a single instruction of the loop for tail-predication legality,
// recording VCTPs and building up VPT block state along the way.
// \returns true if the instruction is compatible with tail-predication.
bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) {
  // Once tail-predication has been ruled out, there's nothing more to track.
  if (CannotTailPredicate)
    return false;

  if (!shouldInspect(MI&: *MI))
    return true;

  if (MI->getOpcode() == ARM::MVE_VPSEL ||
      MI->getOpcode() == ARM::MVE_VPNOT) {
    // TODO: Allow VPSEL and VPNOT, we currently cannot because:
    // 1) It will use the VPR as a predicate operand, but doesn't have to be
    //    instead a VPT block, which means we can assert while building up
    //    the VPT block because we don't find another VPT or VPST to being a new
    //    one.
    // 2) VPSEL still requires a VPR operand even after tail predicating,
    //    which means we can't remove it unless there is another
    //    instruction, such as vcmp, that can provide the VPR def.
    return false;
  }

  // Record all VCTPs and check that they're equivalent to one another.
  if (isVCTP(MI) && !AddVCTP(MI))
    return false;

  // Inspect uses first so that any instructions that alter the VPR don't
  // alter the predicate upon themselves.
  const MCInstrDesc &MCID = MI->getDesc();
  bool IsUse = false;
  unsigned LastOpIdx = MI->getNumOperands() - 1;
  // Walk the operand descriptions in reverse, pairing each with the actual
  // operand at the mirrored index, looking for VPR uses.
  for (const auto &Op : enumerate(First: reverse(C: MCID.operands()))) {
    const MachineOperand &MO = MI->getOperand(i: LastOpIdx - Op.index());
    if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR)
      continue;

    // A VPR use is only acceptable through a proper vpred operand slot;
    // any other VPR-consuming instruction blocks the transform.
    if (ARM::isVpred(op: Op.value().OperandType)) {
      VPTstate.addInst(MI);
      IsUse = true;
    } else if (MI->getOpcode() != ARM::MVE_VPST) {
      LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
      return false;
    }
  }

  // If we find an instruction that has been marked as not valid for tail
  // predication, only allow the instruction if it's contained within a valid
  // VPT block.
  bool RequiresExplicitPredication =
      (MCID.TSFlags & ARMII::ValidForTailPredication) == 0;
  if (isDomainMVE(MI) && RequiresExplicitPredication) {
    if (MI->getOpcode() == ARM::MQPRCopy)
      return true;
    // Unpredicated double-width results are recorded for a later check
    // against the VCTP's vector size.
    if (!IsUse && producesDoubleWidthResult(MI: *MI)) {
      DoubleWidthResultInstrs.insert(Ptr: MI);
      return true;
    }

    LLVM_DEBUG(if (!IsUse) dbgs()
               << "ARM Loops: Can't tail predicate: " << *MI);
    return IsUse;
  }

  // If the instruction is already explicitly predicated, then the conversion
  // will be fine, but ensure that all store operations are predicated.
  if (MI->mayStore() && !ValidateMVEStore(MI, ML: &ML))
    return IsUse;

  // If this instruction defines the VPR, update the predicate for the
  // proceeding instructions.
  if (isVectorPredicate(MI)) {
    // Clear the existing predicate when we're not in VPT Active state,
    // otherwise we add to it.
    if (!isVectorPredicated(MI))
      VPTstate.resetPredicate(MI);
    else
      VPTstate.addPredicate(MI);
  }

  // Finally once the predicate has been modified, we can start a new VPT
  // block if necessary.
  if (isVPTOpcode(Opc: MI->getOpcode()))
    VPTstate.CreateVPTBlock(MI);

  return true;
}
| 1283 | |
// Pass entry point: process every outermost loop of the function and revert
// any stray loop pseudos outside loops.
// \returns true if the function was modified.
bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
  // Low-overhead branch extension is required for DLS/WLS/LE.
  const ARMSubtarget &ST = mf.getSubtarget<ARMSubtarget>();
  if (!ST.hasLOB())
    return false;

  MF = &mf;
  LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n" );

  // Cache the analyses and target info the per-loop processing relies on.
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  RDA = &getAnalysis<ReachingDefAnalysis>();
  MF->getProperties().setTracksLiveness();
  MRI = &MF->getRegInfo();
  TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
  TRI = ST.getRegisterInfo();
  // Block offsets/sizes are needed for the WLS/LE branch-range checks.
  BBUtils = std::make_unique<ARMBasicBlockUtils>(args&: *MF);
  BBUtils->computeAllBlockSizes();
  BBUtils->adjustBBOffsetsAfter(MBB: &MF->front());

  bool Changed = false;
  // ProcessLoop recurses into inner loops itself, so only start from the
  // outermost ones.
  for (auto *ML : *MLI) {
    if (ML->isOutermost())
      Changed |= ProcessLoop(ML);
  }
  Changed |= RevertNonLoops();
  return Changed;
}
| 1310 | |
// Find the low-overhead loop pseudos (start, dec, end) for ML, validate them,
// and expand the loop into real DLS/WLS/LE (or tail-predicated) form, or
// revert to compare-and-branch if that isn't possible.
// \returns true if any code was changed (including in inner loops).
bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
  bool Changed = false;

  // Process inner loops first.
  for (MachineLoop *L : *ML)
    Changed |= ProcessLoop(ML: L);

  LLVM_DEBUG({
    dbgs() << "ARM Loops: Processing loop containing:\n";
    if (auto *Preheader = ML->getLoopPreheader())
      dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
    else if (auto *Preheader = MLI->findLoopPreheader(ML, true, true))
      dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
    for (auto *MBB : ML->getBlocks())
      dbgs() << " - Block: " << printMBBReference(*MBB) << "\n";
  });

  // Search the given block for a loop start instruction. If one isn't found,
  // and there's only one predecessor block, search that one too.
  std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
      [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
    for (auto &MI : *MBB) {
      if (isLoopStart(MI))
        return &MI;
    }
    if (MBB->pred_size() == 1)
      return SearchForStart(*MBB->pred_begin());
    return nullptr;
  };

  LowOverheadLoop LoLoop(*ML, *MLI, *RDA, *TRI, *TII);
  // Search the preheader for the start intrinsic.
  // FIXME: I don't see why we shouldn't be supporting multiple predecessors
  // with potentially multiple set.loop.iterations, so we need to enable this.
  if (LoLoop.Preheader)
    LoLoop.Start = SearchForStart(LoLoop.Preheader);
  else
    return Changed;

  // Find the low-overhead loop components and decide whether or not to fall
  // back to a normal loop. Also look for a vctp instructions and decide
  // whether we can convert that predicate using tail predication.
  for (auto *MBB : reverse(C: ML->getBlocks())) {
    for (auto &MI : *MBB) {
      if (MI.isDebugValue())
        continue;
      else if (MI.getOpcode() == ARM::t2LoopDec)
        LoLoop.Dec = &MI;
      else if (MI.getOpcode() == ARM::t2LoopEnd)
        LoLoop.End = &MI;
      else if (MI.getOpcode() == ARM::t2LoopEndDec)
        LoLoop.End = LoLoop.Dec = &MI;
      else if (isLoopStart(MI))
        LoLoop.Start = &MI;
      else if (MI.getDesc().isCall()) {
        // TODO: Though the call will require LE to execute again, does this
        // mean we should revert? Always executing LE hopefully should be
        // faster than performing a sub,cmp,br or even subs,br.
        LoLoop.Revert = true;
        LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n" );
      } else {
        // Record VPR defs and build up their corresponding vpt blocks.
        // Check we know how to tail predicate any mve instructions.
        LoLoop.AnalyseMVEInst(MI: &MI);
      }
    }
  }

  LLVM_DEBUG(LoLoop.dump());
  if (!LoLoop.FoundAllComponents()) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n" );
    return Changed;
  }

  assert(LoLoop.Start->getOpcode() != ARM::t2WhileLoopStart &&
         "Expected t2WhileLoopStart to be removed before regalloc!" );

  // Check that the only instruction using LoopDec is LoopEnd. This can only
  // happen when the Dec and End are separate, not a single t2LoopEndDec.
  // TODO: Check for copy chains that really have no effect.
  if (LoLoop.Dec != LoLoop.End) {
    SmallPtrSet<MachineInstr *, 2> Uses;
    RDA->getReachingLocalUses(MI: LoLoop.Dec, Reg: MCRegister::from(Val: ARM::LR), Uses);
    if (Uses.size() > 1 || !Uses.count(Ptr: LoLoop.End)) {
      LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n" );
      LoLoop.Revert = true;
    }
  }
  // Validate decides between the low-overhead form and reversion; Expand
  // then performs whichever rewrite was chosen.
  LoLoop.Validate(BBUtils: BBUtils.get());
  Expand(LoLoop);
  return true;
}
| 1403 | |
| 1404 | // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a |
| 1405 | // beq that branches to the exit branch. |
| 1406 | // TODO: We could also try to generate a cbz if the value in LR is also in |
| 1407 | // another low register. |
| 1408 | void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { |
| 1409 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI); |
| 1410 | MachineBasicBlock *DestBB = getWhileLoopStartTargetBB(MI: *MI); |
| 1411 | unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, MaxDisp: 254) ? |
| 1412 | ARM::tBcc : ARM::t2Bcc; |
| 1413 | |
| 1414 | RevertWhileLoopStartLR(MI, TII, BrOpc); |
| 1415 | } |
| 1416 | |
| 1417 | void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const { |
| 1418 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI); |
| 1419 | RevertDoLoopStart(MI, TII); |
| 1420 | } |
| 1421 | |
| 1422 | bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { |
| 1423 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI); |
| 1424 | MachineBasicBlock *MBB = MI->getParent(); |
| 1425 | SmallPtrSet<MachineInstr*, 1> Ignore; |
| 1426 | for (auto I = MachineBasicBlock::iterator(MI), E = MBB->end(); I != E; ++I) { |
| 1427 | if (I->getOpcode() == ARM::t2LoopEnd) { |
| 1428 | Ignore.insert(Ptr: &*I); |
| 1429 | break; |
| 1430 | } |
| 1431 | } |
| 1432 | |
| 1433 | // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS. |
| 1434 | bool SetFlags = |
| 1435 | RDA->isSafeToDefRegAt(MI, Reg: MCRegister::from(Val: ARM::CPSR), Ignore); |
| 1436 | |
| 1437 | llvm::RevertLoopDec(MI, TII, SetFlags); |
| 1438 | return SetFlags; |
| 1439 | } |
| 1440 | |
| 1441 | // Generate a subs, or sub and cmp, and a branch instead of an LE. |
| 1442 | void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const { |
| 1443 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI); |
| 1444 | |
| 1445 | MachineBasicBlock *DestBB = MI->getOperand(i: 1).getMBB(); |
| 1446 | unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, MaxDisp: 254) ? |
| 1447 | ARM::tBcc : ARM::t2Bcc; |
| 1448 | |
| 1449 | llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp); |
| 1450 | } |
| 1451 | |
// Revert a combined t2LoopEndDec: generate a subs and a conditional branch
// instead of an LE, since this pseudo both decrements LR and loops back.
void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const {
  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI);
  assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!");
  MachineBasicBlock *MBB = MI->getParent();

  // Build "$lr = t2SUBri <count>, 1": dst, src (operand 1 of the pseudo),
  // immediate, AL predicate with no predicate register, then the optional
  // cc_out operand.
  MachineInstrBuilder MIB =
      BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::t2SUBri));
  MIB.addDef(RegNo: ARM::LR);
  MIB.add(MO: MI->getOperand(i: 1));
  MIB.addImm(Val: 1);
  MIB.addImm(Val: ARMCC::AL);
  MIB.addReg(RegNo: ARM::NoRegister);
  MIB.addReg(RegNo: ARM::CPSR);
  // Turn the cc_out operand (index 5, the CPSR added above) into a def so
  // the sub becomes a flag-setting subs for the branch below to test.
  MIB->getOperand(i: 5).setIsDef(true);

  // Branch target is operand 2 of the pseudo; pick the narrow tBcc when in
  // range, else the wide t2Bcc.
  MachineBasicBlock *DestBB = MI->getOperand(i: 2).getMBB();
  unsigned BrOpc =
      BBUtils->isBBInRange(MI, DestBB, MaxDisp: 254) ? ARM::tBcc : ARM::t2Bcc;

  // Create bne
  MIB = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: BrOpc));
  MIB.add(MO: MI->getOperand(i: 2)); // branch target
  MIB.addImm(Val: ARMCC::NE);       // condition code
  MIB.addReg(RegNo: ARM::CPSR);

  MI->eraseFromParent();
}
| 1480 | |
// Perform dead code elimination on the loop iteration count setup expression.
| 1482 | // If we are tail-predicating, the number of elements to be processed is the |
| 1483 | // operand of the VCTP instruction in the vector body, see getCount(), which is |
| 1484 | // register $r3 in this example: |
| 1485 | // |
| 1486 | // $lr = big-itercount-expression |
| 1487 | // .. |
| 1488 | // $lr = t2DoLoopStart renamable $lr |
| 1489 | // vector.body: |
| 1490 | // .. |
| 1491 | // $vpr = MVE_VCTP32 renamable $r3 |
| 1492 | // renamable $lr = t2LoopDec killed renamable $lr, 1 |
| 1493 | // t2LoopEnd renamable $lr, %vector.body |
| 1494 | // tB %end |
| 1495 | // |
// What we would like to achieve here is to replace the do-loop start pseudo
| 1497 | // instruction t2DoLoopStart with: |
| 1498 | // |
| 1499 | // $lr = MVE_DLSTP_32 killed renamable $r3 |
| 1500 | // |
| 1501 | // Thus, $r3 which defines the number of elements, is written to $lr, |
| 1502 | // and then we want to delete the whole chain that used to define $lr, |
| 1503 | // see the comment below how this chain could look like. |
| 1504 | // |
| 1505 | void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) { |
| 1506 | if (!LoLoop.IsTailPredicationLegal()) |
| 1507 | return; |
| 1508 | |
| 1509 | LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n" ); |
| 1510 | |
| 1511 | MachineInstr *Def = RDA->getMIOperand(MI: LoLoop.Start, Idx: 1); |
| 1512 | if (!Def) { |
| 1513 | LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n" ); |
| 1514 | return; |
| 1515 | } |
| 1516 | |
| 1517 | // Collect and remove the users of iteration count. |
| 1518 | SmallPtrSet<MachineInstr*, 4> Killed = { LoLoop.Start, LoLoop.Dec, |
| 1519 | LoLoop.End }; |
| 1520 | if (!TryRemove(MI: Def, RDA&: *RDA, ToRemove&: LoLoop.ToRemove, Ignore&: Killed)) |
| 1521 | LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n" ); |
| 1522 | } |
| 1523 | |
| 1524 | MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { |
| 1525 | LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n" ); |
| 1526 | // When using tail-predication, try to delete the dead code that was used to |
| 1527 | // calculate the number of loop iterations. |
| 1528 | IterationCountDCE(LoLoop); |
| 1529 | |
| 1530 | MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt; |
| 1531 | MachineInstr *Start = LoLoop.Start; |
| 1532 | MachineBasicBlock *MBB = LoLoop.StartInsertBB; |
| 1533 | unsigned Opc = LoLoop.getStartOpcode(); |
| 1534 | MachineOperand &Count = LoLoop.getLoopStartOperand(); |
| 1535 | |
| 1536 | // A DLS lr, lr we needn't emit |
| 1537 | MachineInstr* NewStart; |
| 1538 | if (!DisableOmitDLS && Opc == ARM::t2DLS && Count.isReg() && |
| 1539 | Count.getReg() == ARM::LR) { |
| 1540 | LLVM_DEBUG(dbgs() << "ARM Loops: Didn't insert start: DLS lr, lr" ); |
| 1541 | NewStart = nullptr; |
| 1542 | } else { |
| 1543 | MachineInstrBuilder MIB = |
| 1544 | BuildMI(BB&: *MBB, I: InsertPt, MIMD: Start->getDebugLoc(), MCID: TII->get(Opcode: Opc)); |
| 1545 | |
| 1546 | MIB.addDef(RegNo: ARM::LR); |
| 1547 | MIB.add(MO: Count); |
| 1548 | if (isWhileLoopStart(MI: *Start)) |
| 1549 | MIB.addMBB(MBB: getWhileLoopStartTargetBB(MI: *Start)); |
| 1550 | |
| 1551 | LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); |
| 1552 | NewStart = &*MIB; |
| 1553 | } |
| 1554 | |
| 1555 | LoLoop.ToRemove.insert(Ptr: Start); |
| 1556 | return NewStart; |
| 1557 | } |
| 1558 | |
// Rewrite the loop's VPT blocks for tail predication: blocks predicated
// solely on the VCTP become implicitly predicated by LETP, so their VPSTs
// are deleted and their instructions unpredicated; partially-predicated
// blocks get a fresh VPST (or a VCMP folded into a VPT) and have their
// block masks queued for recomputation. Finally all VCTPs are queued for
// removal.
void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
  // Strip the Then predicate from a predicated MVE instruction: clear the
  // predicate-kind operand and the VPR register use that accompanies it.
  auto RemovePredicate = [](MachineInstr *MI) {
    if (MI->isDebugInstr())
      return;
    LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);
    int PIdx = llvm::findFirstVPTPredOperandIdx(MI: *MI);
    assert(PIdx >= 1 && "Trying to unpredicate a non-predicated instruction");
    assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then &&
           "Expected Then predicate!");
    MI->getOperand(i: PIdx).setImm(ARMVCC::None);
    MI->getOperand(i: PIdx + 1).setReg(0);
  };

  for (auto &Block : LoLoop.getVPTBlocks()) {
    SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();

    // Fold TheVCMP into a VPT inserted at 'At'; the new VPT's mask is queued
    // for recomputation and the VCMP for deletion. TheVCMP is nulled out so
    // it cannot be merged twice.
    auto ReplaceVCMPWithVPT = [&](MachineInstr *&TheVCMP, MachineInstr *At) {
      assert(TheVCMP && "Replacing a removed or non-existent VCMP");
      // Replace the VCMP with a VPT
      MachineInstrBuilder MIB =
          BuildMI(BB&: *At->getParent(), I: At, MIMD: At->getDebugLoc(),
                  MCID: TII->get(Opcode: VCMPOpcodeToVPT(Opcode: TheVCMP->getOpcode())));
      MIB.addImm(Val: ARMVCC::Then);
      // Register one
      MIB.add(MO: TheVCMP->getOperand(i: 1));
      // Register two
      MIB.add(MO: TheVCMP->getOperand(i: 2));
      // The comparison code, e.g. ge, eq, lt
      MIB.add(MO: TheVCMP->getOperand(i: 3));
      LLVM_DEBUG(dbgs() << "ARM Loops: Combining with VCMP to VPT: " << *MIB);
      LoLoop.BlockMasksToRecompute.insert(Ptr: MIB.getInstr());
      LoLoop.ToRemove.insert(Ptr: TheVCMP);
      TheVCMP = nullptr;
    };

    if (LoLoop.VPTstate.isEntryPredicatedOnVCTP(Block, /*exclusive*/ Exclusive: true)) {
      MachineInstr *VPST = Insts.front();
      if (Block.hasUniformPredicate()) {
        // A vpt block starting with VPST, is only predicated upon vctp and
        // has no internal vpr defs:
        // - Remove vpst.
        // - Unpredicate the remaining instructions.
        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
        for (unsigned i = 1; i < Insts.size(); ++i)
          RemovePredicate(Insts[i]);
      } else {
        // The VPT block has a non-uniform predicate but it uses a vpst and its
        // entry is guarded only by a vctp, which means we:
        // - Need to remove the original vpst.
        // - Then need to unpredicate any following instructions, until
        //   we come across the divergent vpr def.
        // - Insert a new vpst to predicate the instruction(s) that follow
        //   the divergent vpr def.
        MachineInstr *Divergent = Block.getDivergent();
        MachineBasicBlock *MBB = Divergent->getParent();
        // Find the first non-debug instruction after the divergent def.
        auto DivergentNext = ++MachineBasicBlock::iterator(Divergent);
        while (DivergentNext != MBB->end() && DivergentNext->isDebugInstr())
          ++DivergentNext;

        bool DivergentNextIsPredicated =
            DivergentNext != MBB->end() &&
            getVPTInstrPredicate(MI: *DivergentNext) != ARMVCC::None;

        // Unpredicate everything between the VPST and the divergent def.
        for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext;
             I != E; ++I)
          RemovePredicate(&*I);

        // Check if the instruction defining vpr is a vcmp so it can be
        // combined with the VPST. This should be the divergent instruction.
        MachineInstr *VCMP =
            VCMPOpcodeToVPT(Opcode: Divergent->getOpcode()) != 0 ? Divergent : nullptr;

        if (DivergentNextIsPredicated) {
          // Insert a VPST at the divergent only if the next instruction
          // would actually use it. A VCMP following a VPST can be
          // merged into a VPT so do that instead if the VCMP exists.
          if (!VCMP) {
            // Create a VPST (with a null mask for now, we'll recompute it
            // later)
            MachineInstrBuilder MIB =
                BuildMI(BB&: *Divergent->getParent(), I: Divergent,
                        MIMD: Divergent->getDebugLoc(), MCID: TII->get(Opcode: ARM::MVE_VPST));
            MIB.addImm(Val: 0);
            LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
            LoLoop.BlockMasksToRecompute.insert(Ptr: MIB.getInstr());
          } else {
            // No RDA checks are necessary here since the VPST would have been
            // directly after the VCMP
            ReplaceVCMPWithVPT(VCMP, VCMP);
          }
        }
      }
      LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
      LoLoop.ToRemove.insert(Ptr: VPST);
    } else if (Block.containsVCTP()) {
      // The vctp will be removed, so either the entire block will be dead or
      // the block mask of the vp(s)t will need to be recomputed.
      MachineInstr *VPST = Insts.front();
      if (Block.size() == 2) {
        assert(VPST->getOpcode() == ARM::MVE_VPST &&
               "Found a VPST in an otherwise empty vpt block");
        LoLoop.ToRemove.insert(Ptr: VPST);
      } else
        LoLoop.BlockMasksToRecompute.insert(Ptr: VPST);
    } else if (Insts.front()->getOpcode() == ARM::MVE_VPST) {
      // If this block starts with a VPST then attempt to merge it with the
      // preceding un-merged VCMP into a VPT. This VCMP comes from a VPT
      // block that no longer exists
      MachineInstr *VPST = Insts.front();
      auto Next = ++MachineBasicBlock::iterator(VPST);
      assert(getVPTInstrPredicate(*Next) != ARMVCC::None &&
             "The instruction after a VPST must be predicated");
      (void)Next;
      MachineInstr *VprDef = RDA->getUniqueReachingMIDef(MI: VPST, Reg: ARM::VPR);
      if (VprDef && VCMPOpcodeToVPT(Opcode: VprDef->getOpcode()) &&
          !LoLoop.ToRemove.contains(Ptr: VprDef)) {
        MachineInstr *VCMP = VprDef;
        // The VCMP and VPST can only be merged if the VCMP's operands will
        // have the same values at the VPST.
        // If any of the instructions between the VCMP and VPST are predicated
        // then a different code path is expected to have merged the VCMP and
        // VPST already.
        if (std::none_of(first: ++MachineBasicBlock::iterator(VCMP),
                         last: MachineBasicBlock::iterator(VPST), pred: hasVPRUse) &&
            RDA->hasSameReachingDef(A: VCMP, B: VPST, Reg: VCMP->getOperand(i: 1).getReg()) &&
            RDA->hasSameReachingDef(A: VCMP, B: VPST, Reg: VCMP->getOperand(i: 2).getReg())) {
          ReplaceVCMPWithVPT(VCMP, VPST);
          LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
          LoLoop.ToRemove.insert(Ptr: VPST);
        }
      }
    }
  }

  // The VCTPs are no longer needed: ConvertVPTBlocks is only reached when
  // tail predication is legal, and LETP supplies the element predicate.
  LoLoop.ToRemove.insert_range(R&: LoLoop.VCTPs);
}
| 1695 | |
// Materialize the low-overhead loop: either revert every pseudo back to
// ordinary compare-and-branch code (when validation flagged Revert), or
// replace them with the real DLS(TP)/WLS(TP) and LE(TP) instructions, then
// erase dead instructions and refresh liveness and reaching-def state.
void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {

  // Combine the LoopDec and LoopEnd instructions into LE(TP).
  auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) {
    MachineInstr *End = LoLoop.End;
    MachineBasicBlock *MBB = End->getParent();
    // LETP is the tail-predicated form; t2LEUpdate the plain counted one.
    unsigned Opc = LoLoop.IsTailPredicationLegal() ?
      ARM::MVE_LETP : ARM::t2LEUpdate;
    MachineInstrBuilder MIB = BuildMI(BB&: *MBB, I: End, MIMD: End->getDebugLoc(),
                                      MCID: TII->get(Opcode: Opc));
    MIB.addDef(RegNo: ARM::LR);
    // When Dec and End were combined into a t2LoopEndDec, the pseudo has an
    // extra def operand, shifting the count/target operands by one.
    unsigned Off = LoLoop.Dec == LoLoop.End ? 1 : 0;
    MIB.add(MO: End->getOperand(i: Off + 0));
    MIB.add(MO: End->getOperand(i: Off + 1));
    LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
    LoLoop.ToRemove.insert(Ptr: LoLoop.Dec);
    LoLoop.ToRemove.insert(Ptr: End);
    return &*MIB;
  };

  // TODO: We should be able to automatically remove these branches before we
  // get here - probably by teaching analyzeBranch about the pseudo
  // instructions.
  // If there is an unconditional branch, after I, that just branches to the
  // next block, remove it.
  auto RemoveDeadBranch = [](MachineInstr *I) {
    MachineBasicBlock *BB = I->getParent();
    MachineInstr *Terminator = &BB->instr_back();
    if (Terminator->isUnconditionalBranch() && I != Terminator) {
      MachineBasicBlock *Succ = Terminator->getOperand(i: 0).getMBB();
      if (BB->isLayoutSuccessor(MBB: Succ)) {
        LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
        Terminator->eraseFromParent();
      }
    }
  };

  // And VMOVCopies need to become 2xVMOVD for tail predication to be valid.
  // Any other MQPRCopy can be converted to MVE_VORR later on.
  auto ExpandVMOVCopies = [this](SmallPtrSet<MachineInstr *, 4> &VMOVCopies) {
    for (auto *MI : VMOVCopies) {
      LLVM_DEBUG(dbgs() << "Converting copy to VMOVD: " << *MI);
      assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
      MachineBasicBlock *MBB = MI->getParent();
      Register Dst = MI->getOperand(i: 0).getReg();
      Register Src = MI->getOperand(i: 1).getReg();
      // Copy the low D register of the Q pair (Qn maps to D2n/D2n+1)...
      auto MIB1 = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::VMOVD),
                          DestReg: ARM::D0 + (Dst - ARM::Q0) * 2)
                      .addReg(RegNo: ARM::D0 + (Src - ARM::Q0) * 2)
                      .add(MOs: predOps(Pred: ARMCC::AL));
      (void)MIB1;
      LLVM_DEBUG(dbgs() << " into " << *MIB1);
      // ...and then the high D register.
      auto MIB2 = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::VMOVD),
                          DestReg: ARM::D0 + (Dst - ARM::Q0) * 2 + 1)
                      .addReg(RegNo: ARM::D0 + (Src - ARM::Q0) * 2 + 1)
                      .add(MOs: predOps(Pred: ARMCC::AL));
      LLVM_DEBUG(dbgs() << " and " << *MIB2);
      (void)MIB2;
      MI->eraseFromParent();
    }
  };

  if (LoLoop.Revert) {
    // Validation failed: lower every pseudo to plain compare/branch code.
    if (isWhileLoopStart(MI: *LoLoop.Start))
      RevertWhile(MI: LoLoop.Start);
    else
      RevertDo(MI: LoLoop.Start);
    if (LoLoop.Dec == LoLoop.End)
      RevertLoopEndDec(MI: LoLoop.End);
    else
      RevertLoopEnd(MI: LoLoop.End, SkipCmp: RevertLoopDec(MI: LoLoop.Dec));
  } else {
    ExpandVMOVCopies(LoLoop.VMOVCopies);
    LoLoop.Start = ExpandLoopStart(LoLoop);
    // ExpandLoopStart returns null when a redundant "DLS lr, lr" is elided.
    if (LoLoop.Start)
      RemoveDeadBranch(LoLoop.Start);
    LoLoop.End = ExpandLoopEnd(LoLoop);
    RemoveDeadBranch(LoLoop.End);
    if (LoLoop.IsTailPredicationLegal())
      ConvertVPTBlocks(LoLoop);
    for (auto *I : LoLoop.ToRemove) {
      LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I);
      I->eraseFromParent();
    }
    for (auto *I : LoLoop.BlockMasksToRecompute) {
      LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I);
      recomputeVPTBlockMask(Instr&: *I);
      LLVM_DEBUG(dbgs() << " ... done: " << *I);
    }
  }

  // Instructions were moved, removed and inserted, so recompute the
  // live-ins and liveness flags of every block in the loop.
  PostOrderLoopTraversal DFS(LoLoop.ML, *MLI);
  DFS.ProcessLoop();
  const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder();
  fullyRecomputeLiveIns(MBBs: PostOrder);

  for (auto *MBB : reverse(C: PostOrder))
    recomputeLivenessFlags(MBB&: *MBB);

  // We've moved, removed and inserted new instructions, so update RDA.
  RDA->reset();
}
| 1798 | |
| 1799 | bool ARMLowOverheadLoops::RevertNonLoops() { |
| 1800 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n" ); |
| 1801 | bool Changed = false; |
| 1802 | |
| 1803 | for (auto &MBB : *MF) { |
| 1804 | SmallVector<MachineInstr*, 4> Starts; |
| 1805 | SmallVector<MachineInstr*, 4> Decs; |
| 1806 | SmallVector<MachineInstr*, 4> Ends; |
| 1807 | SmallVector<MachineInstr *, 4> EndDecs; |
| 1808 | SmallVector<MachineInstr *, 4> MQPRCopies; |
| 1809 | |
| 1810 | for (auto &I : MBB) { |
| 1811 | if (isLoopStart(MI: I)) |
| 1812 | Starts.push_back(Elt: &I); |
| 1813 | else if (I.getOpcode() == ARM::t2LoopDec) |
| 1814 | Decs.push_back(Elt: &I); |
| 1815 | else if (I.getOpcode() == ARM::t2LoopEnd) |
| 1816 | Ends.push_back(Elt: &I); |
| 1817 | else if (I.getOpcode() == ARM::t2LoopEndDec) |
| 1818 | EndDecs.push_back(Elt: &I); |
| 1819 | else if (I.getOpcode() == ARM::MQPRCopy) |
| 1820 | MQPRCopies.push_back(Elt: &I); |
| 1821 | } |
| 1822 | |
| 1823 | if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty() && |
| 1824 | MQPRCopies.empty()) |
| 1825 | continue; |
| 1826 | |
| 1827 | Changed = true; |
| 1828 | |
| 1829 | for (auto *Start : Starts) { |
| 1830 | if (isWhileLoopStart(MI: *Start)) |
| 1831 | RevertWhile(MI: Start); |
| 1832 | else |
| 1833 | RevertDo(MI: Start); |
| 1834 | } |
| 1835 | for (auto *Dec : Decs) |
| 1836 | RevertLoopDec(MI: Dec); |
| 1837 | |
| 1838 | for (auto *End : Ends) |
| 1839 | RevertLoopEnd(MI: End); |
| 1840 | for (auto *End : EndDecs) |
| 1841 | RevertLoopEndDec(MI: End); |
| 1842 | for (auto *MI : MQPRCopies) { |
| 1843 | LLVM_DEBUG(dbgs() << "Converting copy to VORR: " << *MI); |
| 1844 | assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!" ); |
| 1845 | MachineBasicBlock *MBB = MI->getParent(); |
| 1846 | auto MIB = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::MVE_VORR), |
| 1847 | DestReg: MI->getOperand(i: 0).getReg()) |
| 1848 | .add(MO: MI->getOperand(i: 1)) |
| 1849 | .add(MO: MI->getOperand(i: 1)); |
| 1850 | addUnpredicatedMveVpredROp(MIB, DestReg: MI->getOperand(i: 0).getReg()); |
| 1851 | MI->eraseFromParent(); |
| 1852 | } |
| 1853 | } |
| 1854 | return Changed; |
| 1855 | } |
| 1856 | |
// Factory hooked into the ARM target's pass pipeline.
FunctionPass *llvm::createARMLowOverheadLoopsPass() {
  return new ARMLowOverheadLoops();
}
| 1860 | |