ARMLowOverheadLoops.cpp source code [llvm_projects/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp]

1	//===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---- C++ --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	/// \file
9	/// Finalize v8.1-m low-overhead loops by converting the associated pseudo
10	/// instructions into machine operations.
11	/// The expectation is that the loop contains three pseudo instructions:
12	/// - t2LoopStart - placed in the preheader or pre-preheader. The do-loop
13	/// form should be in the preheader, whereas the while form should be in the
14	/// preheader's only predecessor.
15	/// - t2LoopDec - placed within the loop body.
16	/// - t2LoopEnd - the loop latch terminator.
17	///
18	/// In addition to this, we also look for the presence of the VCTP instruction,
19	/// which determines whether we can generate the tail-predicated low-overhead
20	/// loop form.
21	///
22	/// Assumptions and Dependencies:
23	/// Low-overhead loops are constructed and executed using a setup instruction:
24	/// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP.
25	/// WLS(TP) and LE(TP) are branching instructions with a (large) limited range
26	/// but fixed polarity: WLS can only branch forwards and LE can only branch
27	/// backwards. These restrictions mean that this pass is dependent upon block
28	/// layout and block sizes, which is why it's the last pass to run. The same is
29	/// true for ConstantIslands, but this pass does not increase the size of the
30	/// basic blocks, nor does it change the CFG. Instructions are mainly removed
31	/// during the transform and pseudo instructions are replaced by real ones. In
32	/// some cases, when we have to revert to a 'normal' loop, we have to introduce
33	/// multiple instructions for a single pseudo (see RevertWhile and
34	/// RevertLoopEnd). To handle this situation, t2WhileLoopStartLR and t2LoopEnd
35	/// are defined to be as large as this maximum sequence of replacement
36	/// instructions.
37	///
38	/// A note on VPR.P0 (the lane mask):
39	/// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a
40	/// "VPT Active" context (which includes low-overhead loops and vpt blocks).
41	/// They will simply "and" the result of their calculation with the current
42	/// value of VPR.P0. You can think of it like this:
43	/// \verbatim
44	/// if VPT active: ; Between a DLSTP/LETP, or for predicated instrs
45	/// VPR.P0 &= Value
46	/// else
47	/// VPR.P0 = Value
48	/// \endverbatim
49	/// When we're inside the low-overhead loop (between DLSTP and LETP), we always
50	/// fall in the "VPT active" case, so we can consider that any VPR write by
51	/// one of those instructions is actually an "and".
52	//===----------------------------------------------------------------------===//
53
54	#include "ARM.h"
55	#include "ARMBaseInstrInfo.h"
56	#include "ARMBaseRegisterInfo.h"
57	#include "ARMBasicBlockInfo.h"
58	#include "ARMSubtarget.h"
59	#include "MVETailPredUtils.h"
60	#include "Thumb2InstrInfo.h"
61	#include "llvm/ADT/SetOperations.h"
62	#include "llvm/ADT/SetVector.h"
63	#include "llvm/CodeGen/LivePhysRegs.h"
64	#include "llvm/CodeGen/MachineFrameInfo.h"
65	#include "llvm/CodeGen/MachineFunctionPass.h"
66	#include "llvm/CodeGen/MachineLoopInfo.h"
67	#include "llvm/CodeGen/MachineLoopUtils.h"
68	#include "llvm/CodeGen/MachineRegisterInfo.h"
69	#include "llvm/CodeGen/Passes.h"
70	#include "llvm/CodeGen/ReachingDefAnalysis.h"
71	#include "llvm/MC/MCInstrDesc.h"
72
73	using namespace llvm;
74
75	#define DEBUG_TYPE "arm-low-overhead-loops"
76	#define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"
77
78	static cl::opt<bool>
79	DisableTailPredication("arm-loloops-disable-tailpred", cl::Hidden,
80	cl::desc ("Disable tail-predication in the ARM LowOverheadLoop pass"),
81	cl::init(Val: false));
82
83	static cl::opt<bool>
84	DisableOmitDLS("arm-disable-omit-dls", cl::Hidden,
85	cl::desc ("Disable omitting 'dls lr, lr' instructions"),
86	cl::init(Val: false));
87
88	static bool isVectorPredicated(MachineInstr *MI) {
89	int PIdx = llvm::findFirstVPTPredOperandIdx(MI: *MI);
90	return PIdx != -`1` && MI->getOperand(i: PIdx + `1`).getReg() == ARM::VPR;
91	}
92
93	static bool isVectorPredicate(MachineInstr *MI) {
94	return MI->findRegisterDefOperandIdx(Reg: ARM::VPR, /TRI=/nullptr) != -`1`;
95	}
96
97	static bool hasVPRUse(MachineInstr &MI) {
98	return MI.findRegisterUseOperandIdx(Reg: ARM::VPR, /TRI=/nullptr) != -`1`;
99	}
100
101	static bool isDomainMVE(MachineInstr *MI) {
102	uint64_t Domain = MI->getDesc().TSFlags & ARMII::DomainMask;
103	return Domain == ARMII::DomainMVE;
104	}
105
106	static int getVecSize(const MachineInstr &MI) {
107	const MCInstrDesc &MCID = MI.getDesc();
108	uint64_t Flags = MCID.TSFlags;
109	return (Flags & ARMII::VecSize) >> ARMII::VecSizeShift;
110	}
111
112	static bool shouldInspect(MachineInstr &MI) {
113	if (MI.isDebugInstr())
114	return false;
115	return isDomainMVE(MI: &MI) \|\| isVectorPredicate(MI: &MI) \|\| hasVPRUse(MI);
116	}
117
118	static bool isHorizontalReduction(const MachineInstr &MI) {
119	const MCInstrDesc &MCID = MI.getDesc();
120	uint64_t Flags = MCID.TSFlags;
121	return (Flags & ARMII::HorizontalReduction) != `0`;
122	}
123
124	namespace {
125
126	using InstSet = SmallPtrSetImpl<MachineInstr *>;
127
128	class PostOrderLoopTraversal {
129	MachineLoop &ML;
130	MachineLoopInfo &MLI;
131	SmallPtrSet<MachineBasicBlock*, `4`> Visited;
132	SmallVector<MachineBasicBlock*, `4`> Order;
133
134	public:
135	PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI)
136	: ML(ML), MLI(MLI) { }
137
138	const SmallVectorImpl<MachineBasicBlock> &getOrder() const* {
139	return Order;
140	}
141
142	// Visit all the blocks within the loop, as well as exit blocks and any
143	// blocks properly dominating the header.
144	void ProcessLoop() {
145	std::function<void(MachineBasicBlock)> Search = [this*, &Search]
146	(MachineBasicBlock MBB) -> void* {
147	if (Visited.count(Ptr: MBB))
148	return;
149
150	Visited.insert(Ptr: MBB);
151	for (auto *Succ : MBB->successors()) {
152	if (!ML.contains(BB: Succ))
153	continue;
154	Search (Succ);
155	}
156	Order.push_back(Elt: MBB);
157	};
158
159	// Insert exit blocks.
160	SmallVector<MachineBasicBlock*, `2`> ExitBlocks;
161	ML.getExitBlocks(ExitBlocks);
162	append_range(C&: Order, R&: ExitBlocks);
163
164	// Then add the loop body.
165	Search (ML.getHeader());
166
167	// Then try the preheader and its predecessors.
168	std::function<void(MachineBasicBlock*)> GetPredecessor =
169	[this, &GetPredecessor] (MachineBasicBlock MBB) -> void* {
170	Order.push_back(Elt: MBB);
171	if (MBB->pred_size() == `1`)
172	GetPredecessor (*MBB->pred_begin());
173	};
174
175	if (auto *Preheader = ML.getLoopPreheader())
176	GetPredecessor (Preheader);
177	else if (auto Preheader = MLI.findLoopPreheader(L: &ML, SpeculativePreheader: true, FindMultiLoopPreheader: true*))
178	GetPredecessor (Preheader);
179	}
180	};
181
182	class VPTBlock {
183	SmallVector<MachineInstr *, `4`> Insts;
184
185	public:
186	VPTBlock(MachineInstr *MI) { Insts.push_back(Elt: MI); }
187
188	// Have we found an instruction within the block which defines the vpr? If
189	// so, not all the instructions in the block will have the same predicate.
190	bool hasUniformPredicate() { return getDivergent() == nullptr; }
191
192	// If it exists, return the first internal instruction which modifies the
193	// VPR.
194	MachineInstr *getDivergent() {
195	SmallVectorImpl<MachineInstr *> &Insts = getInsts();
196	for (unsigned i = `1`; i < Insts.size(); ++i) {
197	MachineInstr *Next = Insts [i];
198	if (isVectorPredicate(MI: Next))
199	return Next; // Found an instruction altering the vpr.
200	}
201	return nullptr;
202	}
203
204	void insert(MachineInstr *MI) {
205	Insts.push_back(Elt: MI);
206	// VPT/VPST + 4 predicated instructions.
207	assert(Insts.size() <= `5` && "Too many instructions in VPT block!");
208	}
209
210	bool containsVCTP() const { return llvm::any_of(Range: Insts, P: isVCTP); }
211
212	unsigned size() const { return Insts.size(); }
213	SmallVectorImpl<MachineInstr > &getInsts() { return* Insts; }
214	};
215
216	// Represent the current state of the VPR and hold all instances which
217	// represent a VPT block, which is a list of instructions that begins with a
218	// VPT/VPST and has a maximum of four proceeding instructions. All
219	// instructions within the block are predicated upon the vpr and we allow
220	// instructions to define the vpr within in the block too.
221	class VPTState {
222	friend struct LowOverheadLoop;
223
224	SmallVector<VPTBlock, `4`> Blocks;
225	SetVector<MachineInstr *> CurrentPredicates;
226	std::map<MachineInstr , SetVector<MachineInstr >> PredicatedInsts;
227
228	void CreateVPTBlock(MachineInstr *MI) {
229	assert((CurrentPredicates.size() \|\| MI->getParent()->isLiveIn(ARM::VPR))
230	&& "Can't begin VPT without predicate");
231	Blocks.emplace_back(Args&: MI);
232	// The execution of MI is predicated upon the current set of instructions
233	// that are AND'ed together to form the VPR predicate value. In the case
234	// that MI is a VPT, CurrentPredicates will also just be MI.
235	PredicatedInsts [MI] = CurrentPredicates;
236	}
237
238	void addInst(MachineInstr *MI) {
239	Blocks.back().insert(MI);
240	PredicatedInsts [MI] = CurrentPredicates;
241	}
242
243	void addPredicate(MachineInstr *MI) {
244	LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI);
245	CurrentPredicates.insert(X: MI);
246	}
247
248	void resetPredicate(MachineInstr *MI) {
249	LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI);
250	CurrentPredicates.clear();
251	CurrentPredicates.insert(X: MI);
252	}
253
254	public:
255	// Return whether the given instruction is predicated upon a VCTP.
256	bool isPredicatedOnVCTP(MachineInstr MI, bool* Exclusive = false) {
257	SetVector<MachineInstr *> &Predicates = PredicatedInsts [MI];
258	if (Exclusive && Predicates.size() != `1`)
259	return false;
260	// We do not know how to convert an else predicate of a VCTP.
261	if (getVPTInstrPredicate(MI: *MI) == ARMVCC::Else)
262	return false;
263	return llvm::any_of(Range&: Predicates, P: isVCTP);
264	}
265
266	// Is the VPST, controlling the block entry, predicated upon a VCTP.
267	bool isEntryPredicatedOnVCTP(VPTBlock &Block, bool Exclusive = false) {
268	SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
269	return isPredicatedOnVCTP(MI: Insts.front(), Exclusive);
270	}
271
272	// If this block begins with a VPT, we can check whether it's using
273	// at least one predicated input(s), as well as possible loop invariant
274	// which would result in it being implicitly predicated.
275	bool hasImplicitlyValidVPT(VPTBlock &Block, ReachingDefAnalysis &RDA) {
276	SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
277	MachineInstr *VPT = Insts.front();
278	assert(isVPTOpcode(VPT->getOpcode()) &&
279	"Expected VPT block to begin with VPT/VPST");
280
281	if (VPT->getOpcode() == ARM::MVE_VPST)
282	return false;
283
284	// If the VPT block does not define something that is an "output", then
285	// the tail-predicated version will just perform a subset of the original
286	// vpt block, where the last lanes should not be used.
287	if (isVPTOpcode(Opc: VPT->getOpcode()) &&
288	all_of(Range&: Block.getInsts(), P: [](const MachineInstr *MI) {
289	return !MI->mayStore() && !MI->mayLoad() &&
290	!isHorizontalReduction(MI: *MI) && !isVCTP(MI);
291	}))
292	return true;
293
294	auto IsOperandPredicated = [&](MachineInstr MI, unsigned* Idx) {
295	MachineInstr *Op = RDA.getMIOperand(MI, MO&: MI->getOperand(i: Idx));
296	return Op && PredicatedInsts.count(x: Op) && isPredicatedOnVCTP(MI: Op);
297	};
298
299	auto IsOperandInvariant = [&](MachineInstr MI, unsigned* Idx) {
300	MachineOperand &MO = MI->getOperand(i: Idx);
301	if (!MO.isReg() \|\| !MO.getReg())
302	return true;
303
304	SmallPtrSet<MachineInstr *, `2`> Defs;
305	RDA.getGlobalReachingDefs(MI, PhysReg: MO.getReg(), Defs);
306	if (Defs.empty())
307	return true;
308
309	for (auto *Def : Defs)
310	if (Def->getParent() == VPT->getParent())
311	return false;
312	return true;
313	};
314
315	// Check that at least one of the operands is directly predicated on a
316	// vctp and allow an invariant value too.
317	return (IsOperandPredicated(VPT, `1`) \|\| IsOperandPredicated(VPT, `2`)) &&
318	(IsOperandPredicated(VPT, `1`) \|\| IsOperandInvariant(VPT, `1`)) &&
319	(IsOperandPredicated(VPT, `2`) \|\| IsOperandInvariant(VPT, `2`));
320	}
321
322	bool isValid(ReachingDefAnalysis &RDA) {
323	// All predication within the loop should be based on vctp. If the block
324	// isn't predicated on entry, check whether the vctp is within the block
325	// and that all other instructions are then predicated on it.
326	for (auto &Block : Blocks) {
327	if (isEntryPredicatedOnVCTP(Block, Exclusive: false) &&
328	!any_of(Range: drop_begin(RangeOrContainer&: Block.getInsts()), P: [](const MachineInstr *MI) {
329	return getVPTInstrPredicate(MI: *MI) == ARMVCC::Else;
330	}))
331	continue;
332	if (hasImplicitlyValidVPT(Block, RDA))
333	continue;
334
335	SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
336	// We don't know how to convert a block with just a VPT;VCTP into
337	// anything valid once we remove the VCTP. For now just bail out.
338	assert(isVPTOpcode(Insts.front()->getOpcode()) &&
339	"Expected VPT block to start with a VPST or VPT!");
340	if (Insts.size() == `2` && Insts.front()->getOpcode() != ARM::MVE_VPST &&
341	isVCTP(MI: Insts.back()))
342	return false;
343
344	for (auto *MI : Insts) {
345	// Check that any internal VCTPs are 'Then' predicated.
346	if (isVCTP(MI) && getVPTInstrPredicate(MI: *MI) != ARMVCC::Then)
347	return false;
348	// Skip other instructions that build up the predicate.
349	if (MI->getOpcode() == ARM::MVE_VPST \|\| isVectorPredicate(MI))
350	continue;
351	// Check that any other instructions are predicated upon a vctp.
352	// TODO: We could infer when VPTs are implicitly predicated on the
353	// vctp (when the operands are predicated).
354	if (!isPredicatedOnVCTP(MI)) {
355	LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI);
356	return false;
357	}
358	}
359	}
360	return true;
361	}
362	};
363
364	struct LowOverheadLoop {
365
366	MachineLoop &ML;
367	MachineBasicBlock Preheader = nullptr*;
368	MachineLoopInfo &MLI;
369	ReachingDefAnalysis &RDA;
370	const TargetRegisterInfo &TRI;
371	const ARMBaseInstrInfo &TII;
372	MachineFunction MF = nullptr*;
373	MachineBasicBlock::iterator StartInsertPt;
374	MachineBasicBlock StartInsertBB = nullptr*;
375	MachineInstr Start = nullptr*;
376	MachineInstr Dec = nullptr*;
377	MachineInstr End = nullptr*;
378	MachineOperand TPNumElements;
379	SmallVector<MachineInstr *, `4`> VCTPs;
380	SmallPtrSet<MachineInstr *, `4`> ToRemove;
381	SmallPtrSet<MachineInstr *, `4`> BlockMasksToRecompute;
382	SmallPtrSet<MachineInstr *, `4`> DoubleWidthResultInstrs;
383	SmallPtrSet<MachineInstr *, `4`> VMOVCopies;
384	bool Revert = false;
385	bool CannotTailPredicate = false;
386	VPTState VPTstate;
387
388	LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI,
389	ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI,
390	const ARMBaseInstrInfo &TII)
391	: ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII),
392	TPNumElements(MachineOperand::CreateImm(Val: `0`)) {
393	MF = ML.getHeader()->getParent();
394	if (auto *MBB = ML.getLoopPreheader())
395	Preheader = MBB;
396	else if (auto MBB = MLI.findLoopPreheader(L: &ML, SpeculativePreheader: true, FindMultiLoopPreheader: true*))
397	Preheader = MBB;
398	}
399
400	// If this is an MVE instruction, check that we know how to use tail
401	// predication with it. Record VPT blocks and return whether the
402	// instruction is valid for tail predication.
403	bool ValidateMVEInst(MachineInstr *MI);
404
405	void AnalyseMVEInst(MachineInstr *MI) {
406	CannotTailPredicate = !ValidateMVEInst(MI);
407	}
408
409	bool IsTailPredicationLegal() const {
410	// For now, let's keep things really simple and only support a single
411	// block for tail predication.
412	return !Revert && FoundAllComponents() && !VCTPs.empty() &&
413	!CannotTailPredicate && ML.getNumBlocks() == `1`;
414	}
415
416	// Given that MI is a VCTP, check that is equivalent to any other VCTPs
417	// found.
418	bool AddVCTP(MachineInstr *MI);
419
420	// Check that the predication in the loop will be equivalent once we
421	// perform the conversion. Also ensure that we can provide the number
422	// of elements to the loop start instruction.
423	bool ValidateTailPredicate();
424
425	// Check that any values available outside of the loop will be the same
426	// after tail predication conversion.
427	bool ValidateLiveOuts();
428
429	// Check the branch targets are within range and we satisfy our
430	// restrictions.
431	void Validate(ARMBasicBlockUtils *BBUtils);
432
433	bool FoundAllComponents() const {
434	return Start && Dec && End;
435	}
436
437	SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTstate.Blocks; }
438
439	// Return the operand for the loop start instruction. This will be the loop
440	// iteration count, or the number of elements if we're tail predicating.
441	MachineOperand &getLoopStartOperand() {
442	if (IsTailPredicationLegal())
443	return TPNumElements;
444	return Start->getOperand(i: `1`);
445	}
446
447	unsigned getStartOpcode() const {
448	bool IsDo = isDoLoopStart(MI: *Start);
449	if (!IsTailPredicationLegal())
450	return IsDo ? ARM::t2DLS : ARM::t2WLS;
451
452	return VCTPOpcodeToLSTP(Opcode: VCTPs.back()->getOpcode(), IsDoLoop: IsDo);
453	}
454
455	void dump() const {
456	if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
457	if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
458	if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;
459	if (!VCTPs.empty()) {
460	dbgs() << "ARM Loops: Found VCTP(s):\n";
461	for (auto *MI : VCTPs)
462	dbgs() << " - " << *MI;
463	}
464	if (!FoundAllComponents())
465	dbgs() << "ARM Loops: Not a low-overhead loop.\n";
466	else if (!(Start && Dec && End))
467	dbgs() << "ARM Loops: Failed to find all loop components.\n";
468	}
469	};
470
471	class ARMLowOverheadLoops : public MachineFunctionPass {
472	MachineFunction MF = nullptr*;
473	MachineLoopInfo MLI = nullptr*;
474	ReachingDefAnalysis RDA = nullptr*;
475	const ARMBaseInstrInfo TII = nullptr*;
476	MachineRegisterInfo MRI = nullptr*;
477	const TargetRegisterInfo TRI = nullptr*;
478	std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
479
480	public:
481	static char ID;
482
483	ARMLowOverheadLoops() : MachineFunctionPass (ID) { }
484
485	void getAnalysisUsage(AnalysisUsage &AU) const override {
486	AU.setPreservesCFG();
487	AU.addRequired<MachineLoopInfoWrapperPass>();
488	AU.addRequired<ReachingDefAnalysis>();
489	MachineFunctionPass::getAnalysisUsage(AU);
490	}
491
492	bool runOnMachineFunction(MachineFunction &MF) override;
493
494	MachineFunctionProperties getRequiredProperties() const override {
495	return MachineFunctionProperties ().set(
496	MachineFunctionProperties::Property::NoVRegs).set(
497	MachineFunctionProperties::Property::TracksLiveness);
498	}
499
500	StringRef getPassName() const override {
501	return ARM_LOW_OVERHEAD_LOOPS_NAME;
502	}
503
504	private:
505	bool ProcessLoop(MachineLoop *ML);
506
507	bool RevertNonLoops();
508
509	void RevertWhile(MachineInstr MI) const*;
510	void RevertDo(MachineInstr MI) const*;
511
512	bool RevertLoopDec(MachineInstr MI) const*;
513
514	void RevertLoopEnd(MachineInstr MI, bool* SkipCmp = false) const;
515
516	void RevertLoopEndDec(MachineInstr MI) const*;
517
518	void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
519
520	MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
521
522	void Expand(LowOverheadLoop &LoLoop);
523
524	void IterationCountDCE(LowOverheadLoop &LoLoop);
525	};
526	}
527
528	char ARMLowOverheadLoops::ID = `0`;
529
530	INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
531	false, false)
532
533	static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA,
534	InstSet &ToRemove, InstSet &Ignore) {
535
536	// Check that we can remove all of Killed without having to modify any IT
537	// blocks.
538	auto WontCorruptITs = [](InstSet &Killed, ReachingDefAnalysis &RDA) {
539	// Collect the dead code and the MBBs in which they reside.
540	SmallPtrSet<MachineBasicBlock*, `2`> BasicBlocks;
541	for (auto *Dead : Killed)
542	BasicBlocks.insert(Ptr: Dead->getParent());
543
544	// Collect IT blocks in all affected basic blocks.
545	std::map<MachineInstr , SmallPtrSet<MachineInstr , `2`>> ITBlocks;
546	for (auto *MBB : BasicBlocks) {
547	for (auto &IT : *MBB) {
548	if (IT.getOpcode() != ARM::t2IT)
549	continue;
550	RDA.getReachingLocalUses(MI: &IT, PhysReg: MCRegister::from(Val: ARM::ITSTATE),
551	Uses&: ITBlocks [&IT]);
552	}
553	}
554
555	// If we're removing all of the instructions within an IT block, then
556	// also remove the IT instruction.
557	SmallPtrSet<MachineInstr *, `2`> ModifiedITs;
558	SmallPtrSet<MachineInstr *, `2`> RemoveITs;
559	for (auto *Dead : Killed) {
560	if (MachineOperand *MO =
561	Dead->findRegisterUseOperand(Reg: ARM::ITSTATE, /TRI=/nullptr)) {
562	MachineInstr IT = RDA.getMIOperand(MI: Dead, MO&: MO);
563	RemoveITs.insert(Ptr: IT);
564	auto &CurrentBlock = ITBlocks [IT];
565	CurrentBlock.erase(Ptr: Dead);
566	if (CurrentBlock.empty())
567	ModifiedITs.erase(Ptr: IT);
568	else
569	ModifiedITs.insert(Ptr: IT);
570	}
571	}
572	if (!ModifiedITs.empty())
573	return false;
574	Killed.insert(I: RemoveITs.begin(), E: RemoveITs.end());
575	return true;
576	};
577
578	SmallPtrSet<MachineInstr *, `2`> Uses;
579	if (!RDA.isSafeToRemove(MI, ToRemove&: Uses, Ignore))
580	return false;
581
582	if (WontCorruptITs (Uses, RDA)) {
583	ToRemove.insert(I: Uses.begin(), E: Uses.end());
584	LLVM_DEBUG(dbgs() << "ARM Loops: Able to remove: " << *MI
585	<< " - can also remove:\n";
586	for (auto *Use : Uses)
587	dbgs() << " - " << *Use);
588
589	SmallPtrSet<MachineInstr*, `4`> Killed;
590	RDA.collectKilledOperands(MI, Dead&: Killed);
591	if (WontCorruptITs (Killed, RDA)) {
592	ToRemove.insert(I: Killed.begin(), E: Killed.end());
593	LLVM_DEBUG(for (auto *Dead : Killed)
594	dbgs() << " - " << *Dead);
595	}
596	return true;
597	}
598	return false;
599	}
600
601	bool LowOverheadLoop::ValidateTailPredicate() {
602	if (!IsTailPredicationLegal()) {
603	LLVM_DEBUG(if (VCTPs.empty())
604	dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";
605	dbgs() << "ARM Loops: Tail-predication is not valid.\n");
606	return false;
607	}
608
609	assert(!VCTPs.empty() && "VCTP instruction expected but is not set");
610	assert(ML.getBlocks().size() == `1` &&
611	"Shouldn't be processing a loop with more than one block");
612
613	if (DisableTailPredication) {
614	LLVM_DEBUG(dbgs() << "ARM Loops: tail-predication is disabled\n");
615	return false;
616	}
617
618	if (!VPTstate.isValid(RDA)) {
619	LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n");
620	return false;
621	}
622
623	if (!ValidateLiveOuts()) {
624	LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n");
625	return false;
626	}
627
628	// For tail predication, we need to provide the number of elements, instead
629	// of the iteration count, to the loop start instruction. The number of
630	// elements is provided to the vctp instruction, so we need to check that
631	// we can use this register at InsertPt.
632	MachineInstr *VCTP = VCTPs.back();
633	if (Start->getOpcode() == ARM::t2DoLoopStartTP \|\|
634	Start->getOpcode() == ARM::t2WhileLoopStartTP) {
635	TPNumElements = Start->getOperand(i: `2`);
636	StartInsertPt = Start;
637	StartInsertBB = Start->getParent();
638	} else {
639	TPNumElements = VCTP->getOperand(i: `1`);
640	MCRegister NumElements = TPNumElements.getReg().asMCReg();
641
642	// If the register is defined within loop, then we can't perform TP.
643	// TODO: Check whether this is just a mov of a register that would be
644	// available.
645	if (RDA.hasLocalDefBefore(MI: VCTP, PhysReg: NumElements)) {
646	LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");
647	return false;
648	}
649
650	// The element count register maybe defined after InsertPt, in which case we
651	// need to try to move either InsertPt or the def so that the [w\|d]lstp can
652	// use the value.
653
654	if (StartInsertPt != StartInsertBB->end() &&
655	!RDA.isReachingDefLiveOut(MI: &*StartInsertPt, PhysReg: NumElements)) {
656	if (auto *ElemDef =
657	RDA.getLocalLiveOutMIDef(MBB: StartInsertBB, PhysReg: NumElements)) {
658	if (RDA.isSafeToMoveForwards(From: ElemDef, To: &*StartInsertPt)) {
659	ElemDef->removeFromParent();
660	StartInsertBB->insert(I: StartInsertPt, MI: ElemDef);
661	LLVM_DEBUG(dbgs()
662	<< "ARM Loops: Moved element count def: " << *ElemDef);
663	} else if (RDA.isSafeToMoveBackwards(From: &*StartInsertPt, To: ElemDef)) {
664	StartInsertPt ->removeFromParent();
665	StartInsertBB->insertAfter(I: MachineBasicBlock::iterator (ElemDef),
666	MI: &*StartInsertPt);
667	LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
668	} else {
669	// If we fail to move an instruction and the element count is provided
670	// by a mov, use the mov operand if it will have the same value at the
671	// insertion point
672	MachineOperand Operand = ElemDef->getOperand(i: `1`);
673	if (isMovRegOpcode(Opc: ElemDef->getOpcode()) &&
674	RDA.getUniqueReachingMIDef(MI: ElemDef, PhysReg: Operand.getReg().asMCReg()) ==
675	RDA.getUniqueReachingMIDef(MI: &*StartInsertPt,
676	PhysReg: Operand.getReg().asMCReg())) {
677	TPNumElements = Operand;
678	NumElements = TPNumElements.getReg();
679	} else {
680	LLVM_DEBUG(dbgs()
681	<< "ARM Loops: Unable to move element count to loop "
682	<< "start instruction.\n");
683	return false;
684	}
685	}
686	}
687	}
688
689	// Especially in the case of while loops, InsertBB may not be the
690	// preheader, so we need to check that the register isn't redefined
691	// before entering the loop.
692	auto CannotProvideElements = [this](MachineBasicBlock *MBB,
693	MCRegister NumElements) {
694	if (MBB->empty())
695	return false;
696	// NumElements is redefined in this block.
697	if (RDA.hasLocalDefBefore(MI: &MBB->back(), PhysReg: NumElements))
698	return true;
699
700	// Don't continue searching up through multiple predecessors.
701	if (MBB->pred_size() > `1`)
702	return true;
703
704	return false;
705	};
706
707	// Search backwards for a def, until we get to InsertBB.
708	MachineBasicBlock *MBB = Preheader;
709	while (MBB && MBB != StartInsertBB) {
710	if (CannotProvideElements (MBB, NumElements)) {
711	LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n");
712	return false;
713	}
714	MBB = *MBB->pred_begin();
715	}
716	}
717
718	// Could inserting the [W\|D]LSTP cause some unintended affects? In a perfect
719	// world the [w\|d]lstp instruction would be last instruction in the preheader
720	// and so it would only affect instructions within the loop body. But due to
721	// scheduling, and/or the logic in this pass (above), the insertion point can
722	// be moved earlier. So if the Loop Start isn't the last instruction in the
723	// preheader, and if the initial element count is smaller than the vector
724	// width, the Loop Start instruction will immediately generate one or more
725	// false lane mask which can, incorrectly, affect the proceeding MVE
726	// instructions in the preheader.
727	if (std::any_of(first: StartInsertPt, last: StartInsertBB->end(), pred: shouldInspect)) {
728	LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W\|D]LSTP\n");
729	return false;
730	}
731
732	// For any DoubleWidthResultInstrs we found whilst scanning instructions, they
733	// need to compute an output size that is smaller than the VCTP mask operates
734	// on. The VecSize of the DoubleWidthResult is the larger vector size - the
735	// size it extends into, so any VCTP VecSize <= is valid.
736	unsigned VCTPVecSize = getVecSize(MI: *VCTP);
737	for (MachineInstr *MI : DoubleWidthResultInstrs) {
738	unsigned InstrVecSize = getVecSize(MI: *MI);
739	if (InstrVecSize > VCTPVecSize) {
740	LLVM_DEBUG(dbgs() << "ARM Loops: Double width result larger than VCTP "
741	<< "VecSize:\n" << *MI);
742	return false;
743	}
744	}
745
746	// Check that the value change of the element count is what we expect and
747	// that the predication will be equivalent. For this we need:
748	// NumElements = NumElements - VectorWidth. The sub will be a sub immediate
749	// and we can also allow register copies within the chain too.
750	auto IsValidSub = [](MachineInstr MI, int* ExpectedVecWidth) {
751	return -getAddSubImmediate(MI&: *MI) == ExpectedVecWidth;
752	};
753
754	MachineBasicBlock *MBB = VCTP->getParent();
755	// Remove modifications to the element count since they have no purpose in a
756	// tail predicated loop. Explicitly refer to the vctp operand no matter which
757	// register NumElements has been assigned to, since that is what the
758	// modifications will be using
759	if (auto *Def = RDA.getUniqueReachingMIDef(
760	MI: &MBB->back(), PhysReg: VCTP->getOperand(i: `1`).getReg().asMCReg())) {
761	SmallPtrSet<MachineInstr*, `2`> ElementChain;
762	SmallPtrSet<MachineInstr*, `2`> Ignore;
763	unsigned ExpectedVectorWidth = getTailPredVectorWidth(Opcode: VCTP->getOpcode());
764
765	Ignore.insert(I: VCTPs.begin(), E: VCTPs.end());
766
767	if (TryRemove(MI: Def, RDA, ToRemove&: ElementChain, Ignore)) {
768	bool FoundSub = false;
769
770	for (auto *MI : ElementChain) {
771	if (isMovRegOpcode(Opc: MI->getOpcode()))
772	continue;
773
774	if (isSubImmOpcode(Opc: MI->getOpcode())) {
775	if (FoundSub \|\| !IsValidSub (MI, ExpectedVectorWidth)) {
776	LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
777	" count: " << *MI);
778	return false;
779	}
780	FoundSub = true;
781	} else {
782	LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
783	" count: " << *MI);
784	return false;
785	}
786	}
787	ToRemove.insert(I: ElementChain.begin(), E: ElementChain.end());
788	}
789	}
790
791	// If we converted the LoopStart to a t2DoLoopStartTP/t2WhileLoopStartTP, we
792	// can also remove any extra instructions in the preheader, which often
793	// includes a now unused MOV.
794	if ((Start->getOpcode() == ARM::t2DoLoopStartTP \|\|
795	Start->getOpcode() == ARM::t2WhileLoopStartTP) &&
796	Preheader && !Preheader->empty() &&
797	!RDA.hasLocalDefBefore(MI: VCTP, PhysReg: VCTP->getOperand(i: `1`).getReg())) {
798	if (auto *Def = RDA.getUniqueReachingMIDef(
799	MI: &Preheader->back(), PhysReg: VCTP->getOperand(i: `1`).getReg().asMCReg())) {
800	SmallPtrSet<MachineInstr*, `2`> Ignore;
801	Ignore.insert(I: VCTPs.begin(), E: VCTPs.end());
802	TryRemove(MI: Def, RDA, ToRemove, Ignore);
803	}
804	}
805
806	return true;
807	}
808
809	static bool isRegInClass(const MachineOperand &MO,
810	const TargetRegisterClass *Class) {
811	return MO.isReg() && MO.getReg() && Class->contains(Reg: MO.getReg());
812	}
813
814	// MVE 'narrowing' operate on half a lane, reading from half and writing
815	// to half, which are referred to has the top and bottom half. The other
816	// half retains its previous value.
817	static bool retainsPreviousHalfElement(const MachineInstr &MI) {
818	const MCInstrDesc &MCID = MI.getDesc();
819	uint64_t Flags = MCID.TSFlags;
820	return (Flags & ARMII::RetainsPreviousHalfElement) != `0`;
821	}
822
823	// Some MVE instructions read from the top/bottom halves of their operand(s)
824	// and generate a vector result with result elements that are double the
825	// width of the input.
826	static bool producesDoubleWidthResult(const MachineInstr &MI) {
827	const MCInstrDesc &MCID = MI.getDesc();
828	uint64_t Flags = MCID.TSFlags;
829	return (Flags & ARMII::DoubleWidthResult) != `0`;
830	}
831
832	// Can this instruction generate a non-zero result when given only zeroed
833	// operands? This allows us to know that, given operands with false bytes
834	// zeroed by masked loads, that the result will also contain zeros in those
835	// bytes.
836	static bool canGenerateNonZeros(const MachineInstr &MI) {
837
838	// Check for instructions which can write into a larger element size,
839	// possibly writing into a previous zero'd lane.
840	if (producesDoubleWidthResult(MI))
841	return true;
842
843	switch (MI.getOpcode()) {
844	default:
845	break;
846	// FIXME: VNEG FP and -0? I think we'll need to handle this once we allow
847	// fp16 -> fp32 vector conversions.
848	// Instructions that perform a NOT will generate 1s from 0s.
849	case ARM::MVE_VMVN:
850	case ARM::MVE_VORN:
851	// Count leading zeros will do just that!
852	case ARM::MVE_VCLZs8:
853	case ARM::MVE_VCLZs16:
854	case ARM::MVE_VCLZs32:
855	return true;
856	}
857	return false;
858	}
859
860	// Look at its register uses to see if it only can only receive zeros
861	// into its false lanes which would then produce zeros. Also check that
862	// the output register is also defined by an FalseLanesZero instruction
863	// so that if tail-predication happens, the lanes that aren't updated will
864	// still be zeros.
865	static bool producesFalseLanesZero(MachineInstr &MI,
866	const TargetRegisterClass *QPRs,
867	const ReachingDefAnalysis &RDA,
868	InstSet &FalseLanesZero) {
869	if (canGenerateNonZeros(MI))
870	return false;
871
872	bool isPredicated = isVectorPredicated(MI: &MI);
873	// Predicated loads will write zeros to the falsely predicated bytes of the
874	// destination register.
875	if (MI.mayLoad())
876	return isPredicated;
877
878	auto IsZeroInit = [](MachineInstr *Def) {
879	return !isVectorPredicated(MI: Def) &&
880	Def->getOpcode() == ARM::MVE_VMOVimmi32 &&
881	Def->getOperand(i: `1`).getImm() == `0`;
882	};
883
884	bool AllowScalars = isHorizontalReduction(MI);
885	for (auto &MO : MI.operands()) {
886	if (!MO.isReg() \|\| !MO.getReg())
887	continue;
888	if (!isRegInClass(MO, Class: QPRs) && AllowScalars)
889	continue;
890	// Skip the lr predicate reg
891	int PIdx = llvm::findFirstVPTPredOperandIdx(MI);
892	if (PIdx != -`1` && (int)MO.getOperandNo() == PIdx + `2`)
893	continue;
894
895	// Check that this instruction will produce zeros in its false lanes:
896	// - If it only consumes false lanes zero or constant 0 (vmov #0)
897	// - If it's predicated, it only matters that it's def register already has
898	// false lane zeros, so we can ignore the uses.
899	SmallPtrSet<MachineInstr *, `2`> Defs;
900	RDA.getGlobalReachingDefs(MI: &MI, PhysReg: MO.getReg(), Defs);
901	if (Defs.empty())
902	return false;
903	for (auto *Def : Defs) {
904	if (Def == &MI \|\| FalseLanesZero.count(Ptr: Def) \|\| IsZeroInit (Def))
905	continue;
906	if (MO.isUse() && isPredicated)
907	continue;
908	return false;
909	}
910	}
911	LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
912	return true;
913	}
914
915	bool LowOverheadLoop::ValidateLiveOuts() {
916	// We want to find out if the tail-predicated version of this loop will
917	// produce the same values as the loop in its original form. For this to
918	// be true, the newly inserted implicit predication must not change the
919	// the (observable) results.
920	// We're doing this because many instructions in the loop will not be
921	// predicated and so the conversion from VPT predication to tail-predication
922	// can result in different values being produced; due to the tail-predication
923	// preventing many instructions from updating their falsely predicated
924	// lanes. This analysis assumes that all the instructions perform lane-wise
925	// operations and don't perform any exchanges.
926	// A masked load, whether through VPT or tail predication, will write zeros
927	// to any of the falsely predicated bytes. So, from the loads, we know that
928	// the false lanes are zeroed and here we're trying to track that those false
929	// lanes remain zero, or where they change, the differences are masked away
930	// by their user(s).
931	// All MVE stores have to be predicated, so we know that any predicate load
932	// operands, or stored results are equivalent already. Other explicitly
933	// predicated instructions will perform the same operation in the original
934	// loop and the tail-predicated form too. Because of this, we can insert
935	// loads, stores and other predicated instructions into our Predicated
936	// set and build from there.
937	const TargetRegisterClass *QPRs = TRI.getRegClass(i: ARM::MQPRRegClassID);
938	SetVector<MachineInstr *> FalseLanesUnknown;
939	SmallPtrSet<MachineInstr *, `4`> FalseLanesZero;
940	SmallPtrSet<MachineInstr *, `4`> Predicated;
941	MachineBasicBlock *Header = ML.getHeader();
942
943	LLVM_DEBUG(dbgs() << "ARM Loops: Validating Live outs\n");
944
945	for (auto &MI : *Header) {
946	if (!shouldInspect(MI))
947	continue;
948
949	if (isVCTP(MI: &MI) \|\| isVPTOpcode(Opc: MI.getOpcode()))
950	continue;
951
952	bool isPredicated = isVectorPredicated(MI: &MI);
953	bool retainsOrReduces =
954	retainsPreviousHalfElement(MI) \|\| isHorizontalReduction(MI);
955
956	if (isPredicated)
957	Predicated.insert(Ptr: &MI);
958	if (producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero))
959	FalseLanesZero.insert(Ptr: &MI);
960	else if (MI.getNumDefs() == `0`)
961	continue;
962	else if (!isPredicated && retainsOrReduces) {
963	LLVM_DEBUG(dbgs() << " Unpredicated instruction that retainsOrReduces: " << MI);
964	return false;
965	} else if (!isPredicated && MI.getOpcode() != ARM::MQPRCopy)
966	FalseLanesUnknown.insert(X: &MI);
967	}
968
969	LLVM_DEBUG({
970	dbgs() << " Predicated:\n";
971	for (auto *I : Predicated)
972	dbgs() << " " << *I;
973	dbgs() << " FalseLanesZero:\n";
974	for (auto *I : FalseLanesZero)
975	dbgs() << " " << *I;
976	dbgs() << " FalseLanesUnknown:\n";
977	for (auto *I : FalseLanesUnknown)
978	dbgs() << " " << *I;
979	});
980
981	auto HasPredicatedUsers = [this](MachineInstr MI, const* MachineOperand &MO,
982	SmallPtrSetImpl<MachineInstr *> &Predicated) {
983	SmallPtrSet<MachineInstr *, `2`> Uses;
984	RDA.getGlobalUses(MI, PhysReg: MO.getReg().asMCReg(), Uses);
985	for (auto *Use : Uses) {
986	if (Use != MI && !Predicated.count(Ptr: Use))
987	return false;
988	}
989	return true;
990	};
991
992	// Visit the unknowns in reverse so that we can start at the values being
993	// stored and then we can work towards the leaves, hopefully adding more
994	// instructions to Predicated. Successfully terminating the loop means that
995	// all the unknown values have to found to be masked by predicated user(s).
996	// For any unpredicated values, we store them in NonPredicated so that we
997	// can later check whether these form a reduction.
998	SmallPtrSet<MachineInstr*, `2`> NonPredicated;
999	for (auto *MI : reverse(C&: FalseLanesUnknown)) {
1000	for (auto &MO : MI->operands()) {
1001	if (!isRegInClass(MO, Class: QPRs) \|\| !MO.isDef())
1002	continue;
1003	if (!HasPredicatedUsers (MI, MO, Predicated)) {
1004	LLVM_DEBUG(dbgs() << " Found an unknown def of : "
1005	<< TRI.getRegAsmName(MO.getReg()) << " at " << *MI);
1006	NonPredicated.insert(Ptr: MI);
1007	break;
1008	}
1009	}
1010	// Any unknown false lanes have been masked away by the user(s).
1011	if (!NonPredicated.contains(Ptr: MI))
1012	Predicated.insert(Ptr: MI);
1013	}
1014
1015	SmallPtrSet<MachineInstr *, `2`> LiveOutMIs;
1016	SmallVector<MachineBasicBlock *, `2`> ExitBlocks;
1017	ML.getExitBlocks(ExitBlocks);
1018	assert(ML.getNumBlocks() == `1` && "Expected single block loop!");
1019	assert(ExitBlocks.size() == `1` && "Expected a single exit block");
1020	MachineBasicBlock *ExitBB = ExitBlocks.front();
1021	for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) {
1022	// TODO: Instead of blocking predication, we could move the vctp to the exit
1023	// block and calculate it's operand there in or the preheader.
1024	if (RegMask.PhysReg == ARM::VPR) {
1025	LLVM_DEBUG(dbgs() << " VPR is live in to the exit block.");
1026	return false;
1027	}
1028	// Check Q-regs that are live in the exit blocks. We don't collect scalars
1029	// because they won't be affected by lane predication.
1030	if (QPRs->contains(Reg: RegMask.PhysReg))
1031	if (auto *MI = RDA.getLocalLiveOutMIDef(MBB: Header, PhysReg: RegMask.PhysReg))
1032	LiveOutMIs.insert(Ptr: MI);
1033	}
1034
1035	// We've already validated that any VPT predication within the loop will be
1036	// equivalent when we perform the predication transformation; so we know that
1037	// any VPT predicated instruction is predicated upon VCTP. Any live-out
1038	// instruction needs to be predicated, so check this here. The instructions
1039	// in NonPredicated have been found to be a reduction that we can ensure its
1040	// legality. Any MQPRCopy found will need to validate its input as if it was
1041	// live out.
1042	SmallVector<MachineInstr *> Worklist(LiveOutMIs.begin(), LiveOutMIs.end());
1043	while (!Worklist.empty()) {
1044	MachineInstr *MI = Worklist.pop_back_val();
1045	if (MI->getOpcode() == ARM::MQPRCopy) {
1046	VMOVCopies.insert(Ptr: MI);
1047	MachineInstr *CopySrc =
1048	RDA.getUniqueReachingMIDef(MI, PhysReg: MI->getOperand(i: `1`).getReg());
1049	if (CopySrc)
1050	Worklist.push_back(Elt: CopySrc);
1051	} else if (NonPredicated.count(Ptr: MI) && FalseLanesUnknown.contains(key: MI)) {
1052	LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI);
1053	VMOVCopies.clear();
1054	return false;
1055	}
1056	}
1057
1058	return true;
1059	}
1060
1061	void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) {
1062	if (Revert)
1063	return;
1064
1065	// Check branch target ranges: WLS[TP] can only branch forwards and LE[TP]
1066	// can only jump back.
1067	auto ValidateRanges = [](MachineInstr Start, MachineInstr End,
1068	ARMBasicBlockUtils *BBUtils, MachineLoop &ML) {
1069	MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd
1070	? End->getOperand(i: `1`).getMBB()
1071	: End->getOperand(i: `2`).getMBB();
1072	// TODO Maybe there's cases where the target doesn't have to be the header,
1073	// but for now be safe and revert.
1074	if (TgtBB != ML.getHeader()) {
1075	LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n");
1076	return false;
1077	}
1078
1079	// The WLS and LE instructions have 12-bits for the label offset. WLS
1080	// requires a positive offset, while LE uses negative.
1081	if (BBUtils->getOffsetOf(MI: End) < BBUtils->getOffsetOf(MBB: ML.getHeader()) \|\|
1082	!BBUtils->isBBInRange(MI: End, DestBB: ML.getHeader(), MaxDisp: `4094`)) {
1083	LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
1084	return false;
1085	}
1086
1087	if (isWhileLoopStart(MI: *Start)) {
1088	MachineBasicBlock TargetBB = getWhileLoopStartTargetBB(MI: Start);
1089	if (BBUtils->getOffsetOf(MI: Start) > BBUtils->getOffsetOf(MBB: TargetBB) \|\|
1090	!BBUtils->isBBInRange(MI: Start, DestBB: TargetBB, MaxDisp: `4094`)) {
1091	LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
1092	return false;
1093	}
1094	}
1095	return true;
1096	};
1097
1098	StartInsertPt = MachineBasicBlock::iterator (Start);
1099	StartInsertBB = Start->getParent();
1100	LLVM_DEBUG(dbgs() << "ARM Loops: Will insert LoopStart at "
1101	<< *StartInsertPt);
1102
1103	Revert = !ValidateRanges (Start, End, BBUtils, ML);
1104	CannotTailPredicate = !ValidateTailPredicate();
1105	}
1106
1107	bool LowOverheadLoop::AddVCTP(MachineInstr *MI) {
1108	LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI);
1109	if (VCTPs.empty()) {
1110	VCTPs.push_back(Elt: MI);
1111	return true;
1112	}
1113
1114	// If we find another VCTP, check whether it uses the same value as the main VCTP.
1115	// If it does, store it in the VCTPs set, else refuse it.
1116	MachineInstr *Prev = VCTPs.back();
1117	if (!Prev->getOperand(i: `1`).isIdenticalTo(Other: MI->getOperand(i: `1`)) \|\|
1118	!RDA.hasSameReachingDef(A: Prev, B: MI, PhysReg: MI->getOperand(i: `1`).getReg().asMCReg())) {
1119	LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
1120	"definition from the main VCTP");
1121	return false;
1122	}
1123	VCTPs.push_back(Elt: MI);
1124	return true;
1125	}
1126
1127	static bool ValidateMVEStore(MachineInstr MI, MachineLoop ML) {
1128
1129	auto GetFrameIndex = [](MachineMemOperand *Operand) {
1130	const PseudoSourceValue *PseudoValue = Operand->getPseudoValue();
1131	if (PseudoValue && PseudoValue->kind() == PseudoSourceValue::FixedStack) {
1132	if (const auto *FS = dyn_cast<FixedStackPseudoSourceValue>(Val: PseudoValue)) {
1133	return FS->getFrameIndex();
1134	}
1135	}
1136	return -`1`;
1137	};
1138
1139	auto IsStackOp = [GetFrameIndex](MachineInstr *I) {
1140	switch (I->getOpcode()) {
1141	case ARM::MVE_VSTRWU32:
1142	case ARM::MVE_VLDRWU32: {
1143	return I->getOperand(i: `1`).getReg() == ARM::SP &&
1144	I->memoperands().size() == `1` &&
1145	GetFrameIndex (I->memoperands().front()) >= `0`;
1146	}
1147	default:
1148	return false;
1149	}
1150	};
1151
1152	// An unpredicated vector register spill is allowed if all of the uses of the
1153	// stack slot are within the loop
1154	if (MI->getOpcode() != ARM::MVE_VSTRWU32 \|\| !IsStackOp (MI))
1155	return false;
1156
1157	// Search all blocks after the loop for accesses to the same stack slot.
1158	// ReachingDefAnalysis doesn't work for sp as it relies on registers being
1159	// live-out (which sp never is) to know what blocks to look in
1160	if (MI->memoperands().size() == `0`)
1161	return false;
1162	int FI = GetFrameIndex (MI->memoperands().front());
1163
1164	auto &FrameInfo = MI->getParent()->getParent()->getFrameInfo();
1165	if (FI == -`1` \|\| !FrameInfo.isSpillSlotObjectIndex(ObjectIdx: FI))
1166	return false;
1167
1168	SmallVector<MachineBasicBlock *> Frontier;
1169	ML->getExitBlocks(ExitBlocks&: Frontier);
1170	SmallPtrSet<MachineBasicBlock *, `4`> Visited{MI->getParent()};
1171	unsigned Idx = `0`;
1172	while (Idx < Frontier.size()) {
1173	MachineBasicBlock *BB = Frontier [Idx];
1174	bool LookAtSuccessors = true;
1175	for (auto &I : *BB) {
1176	if (!IsStackOp (&I) \|\| I.memoperands().size() == `0`)
1177	continue;
1178	if (GetFrameIndex (I.memoperands().front()) != FI)
1179	continue;
1180	// If this block has a store to the stack slot before any loads then we
1181	// can ignore the block
1182	if (I.getOpcode() == ARM::MVE_VSTRWU32) {
1183	LookAtSuccessors = false;
1184	break;
1185	}
1186	// If the store and the load are using the same stack slot then the
1187	// store isn't valid for tail predication
1188	if (I.getOpcode() == ARM::MVE_VLDRWU32)
1189	return false;
1190	}
1191
1192	if (LookAtSuccessors) {
1193	for (auto *Succ : BB->successors()) {
1194	if (!Visited.contains(Ptr: Succ) && !is_contained(Range&: Frontier, Element: Succ))
1195	Frontier.push_back(Elt: Succ);
1196	}
1197	}
1198	Visited.insert(Ptr: BB);
1199	Idx++;
1200	}
1201
1202	return true;
1203	}
1204
1205	bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) {
1206	if (CannotTailPredicate)
1207	return false;
1208
1209	if (!shouldInspect(MI&: *MI))
1210	return true;
1211
1212	if (MI->getOpcode() == ARM::MVE_VPSEL \|\|
1213	MI->getOpcode() == ARM::MVE_VPNOT) {
1214	// TODO: Allow VPSEL and VPNOT, we currently cannot because:
1215	// 1) It will use the VPR as a predicate operand, but doesn't have to be
1216	// instead a VPT block, which means we can assert while building up
1217	// the VPT block because we don't find another VPT or VPST to being a new
1218	// one.
1219	// 2) VPSEL still requires a VPR operand even after tail predicating,
1220	// which means we can't remove it unless there is another
1221	// instruction, such as vcmp, that can provide the VPR def.
1222	return false;
1223	}
1224
1225	// Record all VCTPs and check that they're equivalent to one another.
1226	if (isVCTP(MI) && !AddVCTP(MI))
1227	return false;
1228
1229	// Inspect uses first so that any instructions that alter the VPR don't
1230	// alter the predicate upon themselves.
1231	const MCInstrDesc &MCID = MI->getDesc();
1232	bool IsUse = false;
1233	unsigned LastOpIdx = MI->getNumOperands() - `1`;
1234	for (const auto &Op : enumerate(First: reverse(C: MCID.operands()))) {
1235	const MachineOperand &MO = MI->getOperand(i: LastOpIdx - Op.index());
1236	if (!MO.isReg() \|\| !MO.isUse() \|\| MO.getReg() != ARM::VPR)
1237	continue;
1238
1239	if (ARM::isVpred(op: Op.value().OperandType)) {
1240	VPTstate.addInst(MI);
1241	IsUse = true;
1242	} else if (MI->getOpcode() != ARM::MVE_VPST) {
1243	LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
1244	return false;
1245	}
1246	}
1247
1248	// If we find an instruction that has been marked as not valid for tail
1249	// predication, only allow the instruction if it's contained within a valid
1250	// VPT block.
1251	bool RequiresExplicitPredication =
1252	(MCID.TSFlags & ARMII::ValidForTailPredication) == `0`;
1253	if (isDomainMVE(MI) && RequiresExplicitPredication) {
1254	if (MI->getOpcode() == ARM::MQPRCopy)
1255	return true;
1256	if (!IsUse && producesDoubleWidthResult(MI: *MI)) {
1257	DoubleWidthResultInstrs.insert(Ptr: MI);
1258	return true;
1259	}
1260
1261	LLVM_DEBUG(if (!IsUse) dbgs()
1262	<< "ARM Loops: Can't tail predicate: " << *MI);
1263	return IsUse;
1264	}
1265
1266	// If the instruction is already explicitly predicated, then the conversion
1267	// will be fine, but ensure that all store operations are predicated.
1268	if (MI->mayStore() && !ValidateMVEStore(MI, ML: &ML))
1269	return IsUse;
1270
1271	// If this instruction defines the VPR, update the predicate for the
1272	// proceeding instructions.
1273	if (isVectorPredicate(MI)) {
1274	// Clear the existing predicate when we're not in VPT Active state,
1275	// otherwise we add to it.
1276	if (!isVectorPredicated(MI))
1277	VPTstate.resetPredicate(MI);
1278	else
1279	VPTstate.addPredicate(MI);
1280	}
1281
1282	// Finally once the predicate has been modified, we can start a new VPT
1283	// block if necessary.
1284	if (isVPTOpcode(Opc: MI->getOpcode()))
1285	VPTstate.CreateVPTBlock(MI);
1286
1287	return true;
1288	}
1289
1290	bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
1291	const ARMSubtarget &ST = mf.getSubtarget<ARMSubtarget>();
1292	if (!ST.hasLOB())
1293	return false;
1294
1295	MF = &mf;
1296	LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n");
1297
1298	MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
1299	RDA = &getAnalysis<ReachingDefAnalysis>();
1300	MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
1301	MRI = &MF->getRegInfo();
1302	TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
1303	TRI = ST.getRegisterInfo();
1304	BBUtils = std::make_unique<ARMBasicBlockUtils>(args&: *MF);
1305	BBUtils ->computeAllBlockSizes();
1306	BBUtils ->adjustBBOffsetsAfter(MBB: &MF->front());
1307
1308	bool Changed = false;
1309	for (auto ML : MLI) {
1310	if (ML->isOutermost())
1311	Changed \|= ProcessLoop(ML);
1312	}
1313	Changed \|= RevertNonLoops();
1314	return Changed;
1315	}
1316
1317	bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
1318	bool Changed = false;
1319
1320	// Process inner loops first.
1321	for (MachineLoop L : ML)
1322	Changed \|= ProcessLoop(ML: L);
1323
1324	LLVM_DEBUG({
1325	dbgs() << "ARM Loops: Processing loop containing:\n";
1326	if (auto *Preheader = ML->getLoopPreheader())
1327	dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
1328	else if (auto Preheader = MLI->findLoopPreheader(ML, true, true*))
1329	dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
1330	for (auto *MBB : ML->getBlocks())
1331	dbgs() << " - Block: " << printMBBReference(*MBB) << "\n";
1332	});
1333
1334	// Search the given block for a loop start instruction. If one isn't found,
1335	// and there's only one predecessor block, search that one too.
1336	std::function<MachineInstr(MachineBasicBlock)> SearchForStart =
1337	[&SearchForStart](MachineBasicBlock MBB) -> MachineInstr {
1338	for (auto &MI : *MBB) {
1339	if (isLoopStart(MI))
1340	return &MI;
1341	}
1342	if (MBB->pred_size() == `1`)
1343	return SearchForStart (*MBB->pred_begin());
1344	return nullptr;
1345	};
1346
1347	LowOverheadLoop LoLoop(ML, MLI, RDA, TRI, *TII);
1348	// Search the preheader for the start intrinsic.
1349	// FIXME: I don't see why we shouldn't be supporting multiple predecessors
1350	// with potentially multiple set.loop.iterations, so we need to enable this.
1351	if (LoLoop.Preheader)
1352	LoLoop.Start = SearchForStart (LoLoop.Preheader);
1353	else
1354	return Changed;
1355
1356	// Find the low-overhead loop components and decide whether or not to fall
1357	// back to a normal loop. Also look for a vctp instructions and decide
1358	// whether we can convert that predicate using tail predication.
1359	for (auto *MBB : reverse(C: ML->getBlocks())) {
1360	for (auto &MI : *MBB) {
1361	if (MI.isDebugValue())
1362	continue;
1363	else if (MI.getOpcode() == ARM::t2LoopDec)
1364	LoLoop.Dec = &MI;
1365	else if (MI.getOpcode() == ARM::t2LoopEnd)
1366	LoLoop.End = &MI;
1367	else if (MI.getOpcode() == ARM::t2LoopEndDec)
1368	LoLoop.End = LoLoop.Dec = &MI;
1369	else if (isLoopStart(MI))
1370	LoLoop.Start = &MI;
1371	else if (MI.getDesc().isCall()) {
1372	// TODO: Though the call will require LE to execute again, does this
1373	// mean we should revert? Always executing LE hopefully should be
1374	// faster than performing a sub,cmp,br or even subs,br.
1375	LoLoop.Revert = true;
1376	LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n");
1377	} else {
1378	// Record VPR defs and build up their corresponding vpt blocks.
1379	// Check we know how to tail predicate any mve instructions.
1380	LoLoop.AnalyseMVEInst(MI: &MI);
1381	}
1382	}
1383	}
1384
1385	LLVM_DEBUG(LoLoop.dump());
1386	if (!LoLoop.FoundAllComponents()) {
1387	LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n");
1388	return Changed;
1389	}
1390
1391	assert(LoLoop.Start->getOpcode() != ARM::t2WhileLoopStart &&
1392	"Expected t2WhileLoopStart to be removed before regalloc!");
1393
1394	// Check that the only instruction using LoopDec is LoopEnd. This can only
1395	// happen when the Dec and End are separate, not a single t2LoopEndDec.
1396	// TODO: Check for copy chains that really have no effect.
1397	if (LoLoop.Dec != LoLoop.End) {
1398	SmallPtrSet<MachineInstr *, `2`> Uses;
1399	RDA->getReachingLocalUses(MI: LoLoop.Dec, PhysReg: MCRegister::from(Val: ARM::LR), Uses);
1400	if (Uses.size() > `1` \|\| !Uses.count(Ptr: LoLoop.End)) {
1401	LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n");
1402	LoLoop.Revert = true;
1403	}
1404	}
1405	LoLoop.Validate(BBUtils: BBUtils.get());
1406	Expand(LoLoop);
1407	return true;
1408	}
1409
1410	// WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
1411	// beq that branches to the exit branch.
1412	// TODO: We could also try to generate a cbz if the value in LR is also in
1413	// another low register.
1414	void ARMLowOverheadLoops::RevertWhile(MachineInstr MI) const* {
1415	LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
1416	MachineBasicBlock DestBB = getWhileLoopStartTargetBB(MI: MI);
1417	unsigned BrOpc = BBUtils ->isBBInRange(MI, DestBB, MaxDisp: `254`) ?
1418	ARM::tBcc : ARM::t2Bcc;
1419
1420	RevertWhileLoopStartLR(MI, TII, BrOpc);
1421	}
1422
1423	void ARMLowOverheadLoops::RevertDo(MachineInstr MI) const* {
1424	LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI);
1425	RevertDoLoopStart(MI, TII);
1426	}
1427
1428	bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr MI) const* {
1429	LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
1430	MachineBasicBlock *MBB = MI->getParent();
1431	SmallPtrSet<MachineInstr*, `1`> Ignore;
1432	for (auto I = MachineBasicBlock::iterator (MI), E = MBB->end(); I != E; ++I) {
1433	if (I ->getOpcode() == ARM::t2LoopEnd) {
1434	Ignore.insert(Ptr: &*I);
1435	break;
1436	}
1437	}
1438
1439	// If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
1440	bool SetFlags =
1441	RDA->isSafeToDefRegAt(MI, PhysReg: MCRegister::from(Val: ARM::CPSR), Ignore);
1442
1443	llvm::RevertLoopDec(MI, TII, SetFlags);
1444	return SetFlags;
1445	}
1446
1447	// Generate a subs, or sub and cmp, and a branch instead of an LE.
1448	void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr MI, bool* SkipCmp) const {
1449	LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);
1450
1451	MachineBasicBlock *DestBB = MI->getOperand(i: `1`).getMBB();
1452	unsigned BrOpc = BBUtils ->isBBInRange(MI, DestBB, MaxDisp: `254`) ?
1453	ARM::tBcc : ARM::t2Bcc;
1454
1455	llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp);
1456	}
1457
1458	// Generate a subs, or sub and cmp, and a branch instead of an LE.
1459	void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr MI) const* {
1460	LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI);
1461	assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!");
1462	MachineBasicBlock *MBB = MI->getParent();
1463
1464	MachineInstrBuilder MIB =
1465	BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::t2SUBri));
1466	MIB.addDef(RegNo: ARM::LR);
1467	MIB.add(MO: MI->getOperand(i: `1`));
1468	MIB.addImm(Val: `1`);
1469	MIB.addImm(Val: ARMCC::AL);
1470	MIB.addReg(RegNo: ARM::NoRegister);
1471	MIB.addReg(RegNo: ARM::CPSR);
1472	MIB ->getOperand(i: `5`).setIsDef(true);
1473
1474	MachineBasicBlock *DestBB = MI->getOperand(i: `2`).getMBB();
1475	unsigned BrOpc =
1476	BBUtils ->isBBInRange(MI, DestBB, MaxDisp: `254`) ? ARM::tBcc : ARM::t2Bcc;
1477
1478	// Create bne
1479	MIB = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: BrOpc));
1480	MIB.add(MO: MI->getOperand(i: `2`)); // branch target
1481	MIB.addImm(Val: ARMCC::NE); // condition code
1482	MIB.addReg(RegNo: ARM::CPSR);
1483
1484	MI->eraseFromParent();
1485	}
1486
1487	// Perform dead code elimation on the loop iteration count setup expression.
1488	// If we are tail-predicating, the number of elements to be processed is the
1489	// operand of the VCTP instruction in the vector body, see getCount(), which is
1490	// register $r3 in this example:
1491	//
1492	// $lr = big-itercount-expression
1493	// ..
1494	// $lr = t2DoLoopStart renamable $lr
1495	// vector.body:
1496	// ..
1497	// $vpr = MVE_VCTP32 renamable $r3
1498	// renamable $lr = t2LoopDec killed renamable $lr, 1
1499	// t2LoopEnd renamable $lr, %vector.body
1500	// tB %end
1501	//
1502	// What we would like achieve here is to replace the do-loop start pseudo
1503	// instruction t2DoLoopStart with:
1504	//
1505	// $lr = MVE_DLSTP_32 killed renamable $r3
1506	//
1507	// Thus, $r3 which defines the number of elements, is written to $lr,
1508	// and then we want to delete the whole chain that used to define $lr,
1509	// see the comment below how this chain could look like.
1510	//
1511	void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) {
1512	if (!LoLoop.IsTailPredicationLegal())
1513	return;
1514
1515	LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n");
1516
1517	MachineInstr *Def = RDA->getMIOperand(MI: LoLoop.Start, Idx: `1`);
1518	if (!Def) {
1519	LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n");
1520	return;
1521	}
1522
1523	// Collect and remove the users of iteration count.
1524	SmallPtrSet<MachineInstr*, `4`> Killed = { LoLoop.Start, LoLoop.Dec,
1525	LoLoop.End };
1526	if (!TryRemove(MI: Def, RDA&: *RDA, ToRemove&: LoLoop.ToRemove, Ignore&: Killed))
1527	LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n");
1528	}
1529
1530	MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
1531	LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n");
1532	// When using tail-predication, try to delete the dead code that was used to
1533	// calculate the number of loop iterations.
1534	IterationCountDCE(LoLoop);
1535
1536	MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt;
1537	MachineInstr *Start = LoLoop.Start;
1538	MachineBasicBlock *MBB = LoLoop.StartInsertBB;
1539	unsigned Opc = LoLoop.getStartOpcode();
1540	MachineOperand &Count = LoLoop.getLoopStartOperand();
1541
1542	// A DLS lr, lr we needn't emit
1543	MachineInstr* NewStart;
1544	if (!DisableOmitDLS && Opc == ARM::t2DLS && Count.isReg() &&
1545	Count.getReg() == ARM::LR) {
1546	LLVM_DEBUG(dbgs() << "ARM Loops: Didn't insert start: DLS lr, lr");
1547	NewStart = nullptr;
1548	} else {
1549	MachineInstrBuilder MIB =
1550	BuildMI(BB&: *MBB, I: InsertPt, MIMD: Start->getDebugLoc(), MCID: TII->get(Opcode: Opc));
1551
1552	MIB.addDef(RegNo: ARM::LR);
1553	MIB.add(MO: Count);
1554	if (isWhileLoopStart(MI: *Start))
1555	MIB.addMBB(MBB: getWhileLoopStartTargetBB(MI: *Start));
1556
1557	LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
1558	NewStart = &*MIB;
1559	}
1560
1561	LoLoop.ToRemove.insert(Ptr: Start);
1562	return NewStart;
1563	}
1564
1565	void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
1566	auto RemovePredicate = [](MachineInstr *MI) {
1567	if (MI->isDebugInstr())
1568	return;
1569	LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);
1570	int PIdx = llvm::findFirstVPTPredOperandIdx(MI: *MI);
1571	assert(PIdx >= `1` && "Trying to unpredicate a non-predicated instruction");
1572	assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then &&
1573	"Expected Then predicate!");
1574	MI->getOperand(i: PIdx).setImm(ARMVCC::None);
1575	MI->getOperand(i: PIdx + `1`).setReg(`0`);
1576	};
1577
1578	for (auto &Block : LoLoop.getVPTBlocks()) {
1579	SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
1580
1581	auto ReplaceVCMPWithVPT = [&](MachineInstr &TheVCMP, MachineInstr At) {
1582	assert(TheVCMP && "Replacing a removed or non-existent VCMP");
1583	// Replace the VCMP with a VPT
1584	MachineInstrBuilder MIB =
1585	BuildMI(BB&: *At->getParent(), I: At, MIMD: At->getDebugLoc(),
1586	MCID: TII->get(Opcode: VCMPOpcodeToVPT(Opcode: TheVCMP->getOpcode())));
1587	MIB.addImm(Val: ARMVCC::Then);
1588	// Register one
1589	MIB.add(MO: TheVCMP->getOperand(i: `1`));
1590	// Register two
1591	MIB.add(MO: TheVCMP->getOperand(i: `2`));
1592	// The comparison code, e.g. ge, eq, lt
1593	MIB.add(MO: TheVCMP->getOperand(i: `3`));
1594	LLVM_DEBUG(dbgs() << "ARM Loops: Combining with VCMP to VPT: " << *MIB);
1595	LoLoop.BlockMasksToRecompute.insert(Ptr: MIB.getInstr());
1596	LoLoop.ToRemove.insert(Ptr: TheVCMP);
1597	TheVCMP = nullptr;
1598	};
1599
1600	if (LoLoop.VPTstate.isEntryPredicatedOnVCTP(Block, /exclusive/ Exclusive: true)) {
1601	MachineInstr *VPST = Insts.front();
1602	if (Block.hasUniformPredicate()) {
1603	// A vpt block starting with VPST, is only predicated upon vctp and has no
1604	// internal vpr defs:
1605	// - Remove vpst.
1606	// - Unpredicate the remaining instructions.
1607	LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1608	for (unsigned i = `1`; i < Insts.size(); ++i)
1609	RemovePredicate (Insts [i]);
1610	} else {
1611	// The VPT block has a non-uniform predicate but it uses a vpst and its
1612	// entry is guarded only by a vctp, which means we:
1613	// - Need to remove the original vpst.
1614	// - Then need to unpredicate any following instructions, until
1615	// we come across the divergent vpr def.
1616	// - Insert a new vpst to predicate the instruction(s) that following
1617	// the divergent vpr def.
1618	MachineInstr *Divergent = Block.getDivergent();
1619	MachineBasicBlock *MBB = Divergent->getParent();
1620	auto DivergentNext = ++MachineBasicBlock::iterator (Divergent);
1621	while (DivergentNext != MBB->end() && DivergentNext ->isDebugInstr())
1622	++DivergentNext;
1623
1624	bool DivergentNextIsPredicated =
1625	DivergentNext != MBB->end() &&
1626	getVPTInstrPredicate(MI: *DivergentNext) != ARMVCC::None;
1627
1628	for (auto I = ++MachineBasicBlock::iterator (VPST), E = DivergentNext;
1629	I != E; ++I)
1630	RemovePredicate (&*I);
1631
1632	// Check if the instruction defining vpr is a vcmp so it can be combined
1633	// with the VPST This should be the divergent instruction
1634	MachineInstr *VCMP =
1635	VCMPOpcodeToVPT(Opcode: Divergent->getOpcode()) != `0` ? Divergent : nullptr;
1636
1637	if (DivergentNextIsPredicated) {
1638	// Insert a VPST at the divergent only if the next instruction
1639	// would actually use it. A VCMP following a VPST can be
1640	// merged into a VPT so do that instead if the VCMP exists.
1641	if (!VCMP) {
1642	// Create a VPST (with a null mask for now, we'll recompute it
1643	// later)
1644	MachineInstrBuilder MIB =
1645	BuildMI(BB&: *Divergent->getParent(), I: Divergent,
1646	MIMD: Divergent->getDebugLoc(), MCID: TII->get(Opcode: ARM::MVE_VPST));
1647	MIB.addImm(Val: `0`);
1648	LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
1649	LoLoop.BlockMasksToRecompute.insert(Ptr: MIB.getInstr());
1650	} else {
1651	// No RDA checks are necessary here since the VPST would have been
1652	// directly after the VCMP
1653	ReplaceVCMPWithVPT (VCMP, VCMP);
1654	}
1655	}
1656	}
1657	LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1658	LoLoop.ToRemove.insert(Ptr: VPST);
1659	} else if (Block.containsVCTP()) {
1660	// The vctp will be removed, so either the entire block will be dead or
1661	// the block mask of the vp(s)t will need to be recomputed.
1662	MachineInstr *VPST = Insts.front();
1663	if (Block.size() == `2`) {
1664	assert(VPST->getOpcode() == ARM::MVE_VPST &&
1665	"Found a VPST in an otherwise empty vpt block");
1666	LoLoop.ToRemove.insert(Ptr: VPST);
1667	} else
1668	LoLoop.BlockMasksToRecompute.insert(Ptr: VPST);
1669	} else if (Insts.front()->getOpcode() == ARM::MVE_VPST) {
1670	// If this block starts with a VPST then attempt to merge it with the
1671	// preceeding un-merged VCMP into a VPT. This VCMP comes from a VPT
1672	// block that no longer exists
1673	MachineInstr *VPST = Insts.front();
1674	auto Next = ++MachineBasicBlock::iterator (VPST);
1675	assert(getVPTInstrPredicate(*Next) != ARMVCC::None &&
1676	"The instruction after a VPST must be predicated");
1677	(void)Next;
1678	MachineInstr *VprDef = RDA->getUniqueReachingMIDef(MI: VPST, PhysReg: ARM::VPR);
1679	if (VprDef && VCMPOpcodeToVPT(Opcode: VprDef->getOpcode()) &&
1680	!LoLoop.ToRemove.contains(Ptr: VprDef)) {
1681	MachineInstr *VCMP = VprDef;
1682	// The VCMP and VPST can only be merged if the VCMP's operands will have
1683	// the same values at the VPST.
1684	// If any of the instructions between the VCMP and VPST are predicated
1685	// then a different code path is expected to have merged the VCMP and
1686	// VPST already.
1687	if (std::none_of(first: ++MachineBasicBlock::iterator (VCMP),
1688	last: MachineBasicBlock::iterator (VPST), pred: hasVPRUse) &&
1689	RDA->hasSameReachingDef(A: VCMP, B: VPST, PhysReg: VCMP->getOperand(i: `1`).getReg()) &&
1690	RDA->hasSameReachingDef(A: VCMP, B: VPST, PhysReg: VCMP->getOperand(i: `2`).getReg())) {
1691	ReplaceVCMPWithVPT (VCMP, VPST);
1692	LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1693	LoLoop.ToRemove.insert(Ptr: VPST);
1694	}
1695	}
1696	}
1697	}
1698
1699	LoLoop.ToRemove.insert(I: LoLoop.VCTPs.begin(), E: LoLoop.VCTPs.end());
1700	}
1701
1702	void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
1703
1704	// Combine the LoopDec and LoopEnd instructions into LE(TP).
1705	auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) {
1706	MachineInstr *End = LoLoop.End;
1707	MachineBasicBlock *MBB = End->getParent();
1708	unsigned Opc = LoLoop.IsTailPredicationLegal() ?
1709	ARM::MVE_LETP : ARM::t2LEUpdate;
1710	MachineInstrBuilder MIB = BuildMI(BB&: *MBB, I: End, MIMD: End->getDebugLoc(),
1711	MCID: TII->get(Opcode: Opc));
1712	MIB.addDef(RegNo: ARM::LR);
1713	unsigned Off = LoLoop.Dec == LoLoop.End ? `1` : `0`;
1714	MIB.add(MO: End->getOperand(i: Off + `0`));
1715	MIB.add(MO: End->getOperand(i: Off + `1`));
1716	LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
1717	LoLoop.ToRemove.insert(Ptr: LoLoop.Dec);
1718	LoLoop.ToRemove.insert(Ptr: End);
1719	return &*MIB;
1720	};
1721
1722	// TODO: We should be able to automatically remove these branches before we
1723	// get here - probably by teaching analyzeBranch about the pseudo
1724	// instructions.
1725	// If there is an unconditional branch, after I, that just branches to the
1726	// next block, remove it.
1727	auto RemoveDeadBranch = [](MachineInstr *I) {
1728	MachineBasicBlock *BB = I->getParent();
1729	MachineInstr *Terminator = &BB->instr_back();
1730	if (Terminator->isUnconditionalBranch() && I != Terminator) {
1731	MachineBasicBlock *Succ = Terminator->getOperand(i: `0`).getMBB();
1732	if (BB->isLayoutSuccessor(MBB: Succ)) {
1733	LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
1734	Terminator->eraseFromParent();
1735	}
1736	}
1737	};
1738
1739	// And VMOVCopies need to become 2xVMOVD for tail predication to be valid.
1740	// Anything other MQPRCopy can be converted to MVE_VORR later on.
1741	auto ExpandVMOVCopies = [this](SmallPtrSet<MachineInstr *, `4`> &VMOVCopies) {
1742	for (auto *MI : VMOVCopies) {
1743	LLVM_DEBUG(dbgs() << "Converting copy to VMOVD: " << *MI);
1744	assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
1745	MachineBasicBlock *MBB = MI->getParent();
1746	Register Dst = MI->getOperand(i: `0`).getReg();
1747	Register Src = MI->getOperand(i: `1`).getReg();
1748	auto MIB1 = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::VMOVD),
1749	DestReg: ARM::D0 + (Dst - ARM::Q0) * `2`)
1750	.addReg(RegNo: ARM::D0 + (Src - ARM::Q0) * `2`)
1751	.add(MOs: predOps(Pred: ARMCC::AL));
1752	(void)MIB1;
1753	LLVM_DEBUG(dbgs() << " into " << *MIB1);
1754	auto MIB2 = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::VMOVD),
1755	DestReg: ARM::D0 + (Dst - ARM::Q0) * `2` + `1`)
1756	.addReg(RegNo: ARM::D0 + (Src - ARM::Q0) * `2` + `1`)
1757	.add(MOs: predOps(Pred: ARMCC::AL));
1758	LLVM_DEBUG(dbgs() << " and " << *MIB2);
1759	(void)MIB2;
1760	MI->eraseFromParent();
1761	}
1762	};
1763
1764	if (LoLoop.Revert) {
1765	if (isWhileLoopStart(MI: *LoLoop.Start))
1766	RevertWhile(MI: LoLoop.Start);
1767	else
1768	RevertDo(MI: LoLoop.Start);
1769	if (LoLoop.Dec == LoLoop.End)
1770	RevertLoopEndDec(MI: LoLoop.End);
1771	else
1772	RevertLoopEnd(MI: LoLoop.End, SkipCmp: RevertLoopDec(MI: LoLoop.Dec));
1773	} else {
1774	ExpandVMOVCopies (LoLoop.VMOVCopies);
1775	LoLoop.Start = ExpandLoopStart(LoLoop);
1776	if (LoLoop.Start)
1777	RemoveDeadBranch (LoLoop.Start);
1778	LoLoop.End = ExpandLoopEnd (LoLoop);
1779	RemoveDeadBranch (LoLoop.End);
1780	if (LoLoop.IsTailPredicationLegal())
1781	ConvertVPTBlocks(LoLoop);
1782	for (auto *I : LoLoop.ToRemove) {
1783	LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I);
1784	I->eraseFromParent();
1785	}
1786	for (auto *I : LoLoop.BlockMasksToRecompute) {
1787	LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I);
1788	recomputeVPTBlockMask(Instr&: *I);
1789	LLVM_DEBUG(dbgs() << " ... done: " << *I);
1790	}
1791	}
1792
1793	PostOrderLoopTraversal DFS(LoLoop.ML, *MLI);
1794	DFS.ProcessLoop();
1795	const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder();
1796	fullyRecomputeLiveIns(MBBs: PostOrder);
1797
1798	for (auto *MBB : reverse(C: PostOrder))
1799	recomputeLivenessFlags(MBB&: *MBB);
1800
1801	// We've moved, removed and inserted new instructions, so update RDA.
1802	RDA->reset();
1803	}
1804
1805	bool ARMLowOverheadLoops::RevertNonLoops() {
1806	LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n");
1807	bool Changed = false;
1808
1809	for (auto &MBB : *MF) {
1810	SmallVector<MachineInstr*, `4`> Starts;
1811	SmallVector<MachineInstr*, `4`> Decs;
1812	SmallVector<MachineInstr*, `4`> Ends;
1813	SmallVector<MachineInstr *, `4`> EndDecs;
1814	SmallVector<MachineInstr *, `4`> MQPRCopies;
1815
1816	for (auto &I : MBB) {
1817	if (isLoopStart(MI: I))
1818	Starts.push_back(Elt: &I);
1819	else if (I.getOpcode() == ARM::t2LoopDec)
1820	Decs.push_back(Elt: &I);
1821	else if (I.getOpcode() == ARM::t2LoopEnd)
1822	Ends.push_back(Elt: &I);
1823	else if (I.getOpcode() == ARM::t2LoopEndDec)
1824	EndDecs.push_back(Elt: &I);
1825	else if (I.getOpcode() == ARM::MQPRCopy)
1826	MQPRCopies.push_back(Elt: &I);
1827	}
1828
1829	if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty() &&
1830	MQPRCopies.empty())
1831	continue;
1832
1833	Changed = true;
1834
1835	for (auto *Start : Starts) {
1836	if (isWhileLoopStart(MI: *Start))
1837	RevertWhile(MI: Start);
1838	else
1839	RevertDo(MI: Start);
1840	}
1841	for (auto *Dec : Decs)
1842	RevertLoopDec(MI: Dec);
1843
1844	for (auto *End : Ends)
1845	RevertLoopEnd(MI: End);
1846	for (auto *End : EndDecs)
1847	RevertLoopEndDec(MI: End);
1848	for (auto *MI : MQPRCopies) {
1849	LLVM_DEBUG(dbgs() << "Converting copy to VORR: " << *MI);
1850	assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
1851	MachineBasicBlock *MBB = MI->getParent();
1852	auto MIB = BuildMI(BB&: *MBB, I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: ARM::MVE_VORR),
1853	DestReg: MI->getOperand(i: `0`).getReg())
1854	.add(MO: MI->getOperand(i: `1`))
1855	.add(MO: MI->getOperand(i: `1`));
1856	addUnpredicatedMveVpredROp(MIB, DestReg: MI->getOperand(i: `0`).getReg());
1857	MI->eraseFromParent();
1858	}
1859	}
1860	return Changed;
1861	}
1862
1863	FunctionPass *llvm::createARMLowOverheadLoopsPass() {
1864	return new ARMLowOverheadLoops ();
1865	}
1866

Browse the source code of llvm_projects/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp