HexagonLoopIdiomRecognition.cpp source code [llvm_projects/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp]

1	//===- HexagonLoopIdiomRecognition.cpp ------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "HexagonLoopIdiomRecognition.h"
10	#include "Hexagon.h"
11	#include "llvm/ADT/APInt.h"
12	#include "llvm/ADT/DenseMap.h"
13	#include "llvm/ADT/SetVector.h"
14	#include "llvm/ADT/SmallPtrSet.h"
15	#include "llvm/ADT/SmallSet.h"
16	#include "llvm/ADT/SmallVector.h"
17	#include "llvm/ADT/StringRef.h"
18	#include "llvm/Analysis/AliasAnalysis.h"
19	#include "llvm/Analysis/InstructionSimplify.h"
20	#include "llvm/Analysis/LoopAnalysisManager.h"
21	#include "llvm/Analysis/LoopInfo.h"
22	#include "llvm/Analysis/LoopPass.h"
23	#include "llvm/Analysis/MemoryLocation.h"
24	#include "llvm/Analysis/ScalarEvolution.h"
25	#include "llvm/Analysis/ScalarEvolutionExpressions.h"
26	#include "llvm/Analysis/TargetLibraryInfo.h"
27	#include "llvm/Analysis/ValueTracking.h"
28	#include "llvm/IR/Attributes.h"
29	#include "llvm/IR/BasicBlock.h"
30	#include "llvm/IR/Constant.h"
31	#include "llvm/IR/Constants.h"
32	#include "llvm/IR/DataLayout.h"
33	#include "llvm/IR/DebugLoc.h"
34	#include "llvm/IR/DerivedTypes.h"
35	#include "llvm/IR/Dominators.h"
36	#include "llvm/IR/Function.h"
37	#include "llvm/IR/IRBuilder.h"
38	#include "llvm/IR/InstrTypes.h"
39	#include "llvm/IR/Instruction.h"
40	#include "llvm/IR/Instructions.h"
41	#include "llvm/IR/IntrinsicInst.h"
42	#include "llvm/IR/Intrinsics.h"
43	#include "llvm/IR/IntrinsicsHexagon.h"
44	#include "llvm/IR/Module.h"
45	#include "llvm/IR/PassManager.h"
46	#include "llvm/IR/PatternMatch.h"
47	#include "llvm/IR/Type.h"
48	#include "llvm/IR/User.h"
49	#include "llvm/IR/Value.h"
50	#include "llvm/InitializePasses.h"
51	#include "llvm/Pass.h"
52	#include "llvm/Support/Casting.h"
53	#include "llvm/Support/CommandLine.h"
54	#include "llvm/Support/Compiler.h"
55	#include "llvm/Support/Debug.h"
56	#include "llvm/Support/ErrorHandling.h"
57	#include "llvm/Support/KnownBits.h"
58	#include "llvm/Support/raw_ostream.h"
59	#include "llvm/TargetParser/Triple.h"
60	#include "llvm/Transforms/Scalar.h"
61	#include "llvm/Transforms/Utils.h"
62	#include "llvm/Transforms/Utils/Local.h"
63	#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
64	#include <algorithm>
65	#include <array>
66	#include <cassert>
67	#include <cstdint>
68	#include <cstdlib>
69	#include <deque>
70	#include <functional>
71	#include <iterator>
72	#include <map>
73	#include <set>
74	#include <utility>
75	#include <vector>
76
77	#define DEBUG_TYPE "hexagon-lir"
78
79	using namespace llvm;
80
81	static cl::opt<bool> DisableMemcpyIdiom("disable-memcpy-idiom",
82	cl::Hidden, cl::init(Val: false),
83	cl::desc ("Disable generation of memcpy in loop idiom recognition"));
84
85	static cl::opt<bool> DisableMemmoveIdiom("disable-memmove-idiom",
86	cl::Hidden, cl::init(Val: false),
87	cl::desc ("Disable generation of memmove in loop idiom recognition"));
88
89	static cl::opt<unsigned> RuntimeMemSizeThreshold("runtime-mem-idiom-threshold",
90	cl::Hidden, cl::init(Val: `0`), cl::desc ("Threshold (in bytes) for the runtime "
91	"check guarding the memmove."));
92
93	static cl::opt<unsigned> CompileTimeMemSizeThreshold(
94	"compile-time-mem-idiom-threshold", cl::Hidden, cl::init(Val: `64`),
95	cl::desc ("Threshold (in bytes) to perform the transformation, if the "
96	"runtime loop count (mem transfer size) is known at compile-time."));
97
98	static cl::opt<bool> OnlyNonNestedMemmove("only-nonnested-memmove-idiom",
99	cl::Hidden, cl::init(Val: true),
100	cl::desc ("Only enable generating memmove in non-nested loops"));
101
102	static cl::opt<bool> HexagonVolatileMemcpy(
103	"disable-hexagon-volatile-memcpy", cl::Hidden, cl::init(Val: false),
104	cl::desc ("Enable Hexagon-specific memcpy for volatile destination."));
105
106	static cl::opt<unsigned> SimplifyLimit("hlir-simplify-limit", cl::init(Val: `10000`),
107	cl::Hidden, cl::desc ("Maximum number of simplification steps in HLIR"));
108
109	static const char *HexagonVolatileMemcpyName
110	= "hexagon_memcpy_forward_vp4cp4n2";
111
112	namespace {
113
114	class HexagonLoopIdiomRecognize {
115	public:
116	explicit HexagonLoopIdiomRecognize(AliasAnalysis AA, DominatorTree DT,
117	LoopInfo LF, const* TargetLibraryInfo *TLI,
118	ScalarEvolution *SE)
119	: AA(AA), DT(DT), LF(LF), TLI(TLI), SE(SE) {}
120
121	bool run(Loop *L);
122
123	private:
124	int getSCEVStride(const SCEVAddRecExpr *StoreEv);
125	bool isLegalStore(Loop CurLoop, StoreInst SI);
126	void collectStores(Loop CurLoop, BasicBlock BB,
127	SmallVectorImpl<StoreInst *> &Stores);
128	bool processCopyingStore(Loop CurLoop, StoreInst SI, const SCEV *BECount);
129	bool coverLoop(Loop L, SmallVectorImpl<Instruction > &Insts) const;
130	bool runOnLoopBlock(Loop CurLoop, BasicBlock BB, const SCEV *BECount,
131	SmallVectorImpl<BasicBlock *> &ExitBlocks);
132	bool runOnCountableLoop(Loop *L);
133
134	AliasAnalysis *AA;
135	const DataLayout *DL;
136	DominatorTree *DT;
137	LoopInfo *LF;
138	const TargetLibraryInfo *TLI;
139	ScalarEvolution *SE;
140	bool HasMemcpy, HasMemmove;
141	};
142
143	class HexagonLoopIdiomRecognizeLegacyPass : public LoopPass {
144	public:
145	static char ID;
146
147	explicit HexagonLoopIdiomRecognizeLegacyPass() : LoopPass (ID) {}
148
149	StringRef getPassName() const override {
150	return "Recognize Hexagon-specific loop idioms";
151	}
152
153	void getAnalysisUsage(AnalysisUsage &AU) const override {
154	AU.addRequired<LoopInfoWrapperPass>();
155	AU.addRequiredID(ID&: LoopSimplifyID);
156	AU.addRequiredID(ID&: LCSSAID);
157	AU.addRequired<AAResultsWrapperPass>();
158	AU.addRequired<ScalarEvolutionWrapperPass>();
159	AU.addRequired<DominatorTreeWrapperPass>();
160	AU.addRequired<TargetLibraryInfoWrapperPass>();
161	AU.addPreserved<TargetLibraryInfoWrapperPass>();
162	}
163
164	bool runOnLoop(Loop *L, LPPassManager &LPM) override;
165	};
166
167	struct Simplifier {
168	struct Rule {
169	using FuncType = std::function<Value (Instruction , LLVMContext &)>;
170	Rule(StringRef N, FuncType F) : Name (N), Fn (F) {}
171	StringRef Name; // For debugging.
172	FuncType Fn;
173	};
174
175	void addRule(StringRef N, const Rule::FuncType &F) {
176	Rules.push_back(x: Rule (N, F));
177	}
178
179	private:
180	struct WorkListType {
181	WorkListType() = default;
182
183	void push_back(Value *V) {
184	// Do not push back duplicates.
185	if (S.insert(x: V).second)
186	Q.push_back(x: V);
187	}
188
189	Value *pop_front_val() {
190	Value *V = Q.front();
191	Q.pop_front();
192	S.erase(x: V);
193	return V;
194	}
195
196	bool empty() const { return Q.empty(); }
197
198	private:
199	std::deque<Value *> Q;
200	std::set<Value *> S;
201	};
202
203	using ValueSetType = std::set<Value *>;
204
205	std::vector<Rule> Rules;
206
207	public:
208	struct Context {
209	using ValueMapType = DenseMap<Value , Value >;
210
211	Value *Root;
212	ValueSetType Used; // The set of all cloned values used by Root.
213	ValueSetType Clones; // The set of all cloned values.
214	LLVMContext &Ctx;
215
216	Context(Instruction *Exp)
217	: Ctx(Exp->getParent()->getParent()->getContext()) {
218	initialize(Exp);
219	}
220
221	~Context() { cleanup(); }
222
223	void print(raw_ostream &OS, const Value V) const*;
224	Value materialize(BasicBlock B, BasicBlock::iterator At);
225
226	private:
227	friend struct Simplifier;
228
229	void initialize(Instruction *Exp);
230	void cleanup();
231
232	template <typename FuncT> void traverse(Value *V, FuncT F);
233	void record(Value *V);
234	void use(Value *V);
235	void unuse(Value *V);
236
237	bool equal(const Instruction I, const* Instruction J) const*;
238	Value find(Value Tree, Value Sub) const*;
239	Value subst(Value Tree, Value OldV, Value NewV);
240	void replace(Value OldV, Value NewV);
241	void link(Instruction I, BasicBlock B, BasicBlock::iterator At);
242	};
243
244	Value *simplify(Context &C);
245	};
246
247	struct PE {
248	PE(const Simplifier::Context &c, Value v = nullptr*) : C(c), V(v) {}
249
250	const Simplifier::Context &C;
251	const Value *V;
252	};
253
254	LLVM_ATTRIBUTE_USED
255	raw_ostream &operator<<(raw_ostream &OS, const PE &P) {
256	P.C.print(OS, V: P.V ? P.V : P.C.Root);
257	return OS;
258	}
259
260	} // end anonymous namespace
261
262	char HexagonLoopIdiomRecognizeLegacyPass::ID = `0`;
263
264	INITIALIZE_PASS_BEGIN(HexagonLoopIdiomRecognizeLegacyPass, "hexagon-loop-idiom",
265	"Recognize Hexagon-specific loop idioms", false, false)
266	INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
267	INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
268	INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
269	INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
270	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
271	INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
272	INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
273	INITIALIZE_PASS_END(HexagonLoopIdiomRecognizeLegacyPass, "hexagon-loop-idiom",
274	"Recognize Hexagon-specific loop idioms", false, false)
275
276	template <typename FuncT>
277	void Simplifier::Context::traverse(Value *V, FuncT F) {
278	WorkListType Q;
279	Q.push_back(V);
280
281	while (!Q.empty()) {
282	Instruction *U = dyn_cast<Instruction>(Val: Q.pop_front_val());
283	if (!U \|\| U->getParent())
284	continue;
285	if (!F(U))
286	continue;
287	for (Value *Op : U->operands())
288	Q.push_back(V: Op);
289	}
290	}
291
292	void Simplifier::Context::print(raw_ostream &OS, const Value V) const* {
293	const auto U = dyn_cast<const* Instruction>(Val: V);
294	if (!U) {
295	OS << V << `'('` << *V << `')'`;
296	return;
297	}
298
299	if (U->getParent()) {
300	OS << U << `'('`;
301	U->printAsOperand(O&: OS, PrintType: true);
302	OS << `')'`;
303	return;
304	}
305
306	unsigned N = U->getNumOperands();
307	if (N != `0`)
308	OS << U << `'('`;
309	OS << U->getOpcodeName();
310	for (const Value *Op : U->operands()) {
311	OS << `' '`;
312	print(OS, V: Op);
313	}
314	if (N != `0`)
315	OS << `')'`;
316	}
317
318	void Simplifier::Context::initialize(Instruction *Exp) {
319	// Perform a deep clone of the expression, set Root to the root
320	// of the clone, and build a map from the cloned values to the
321	// original ones.
322	ValueMapType M;
323	BasicBlock *Block = Exp->getParent();
324	WorkListType Q;
325	Q.push_back(V: Exp);
326
327	while (!Q.empty()) {
328	Value *V = Q.pop_front_val();
329	if (M.contains(Val: V))
330	continue;
331	if (Instruction *U = dyn_cast<Instruction>(Val: V)) {
332	if (isa<PHINode>(Val: U) \|\| U->getParent() != Block)
333	continue;
334	for (Value *Op : U->operands())
335	Q.push_back(V: Op);
336	M.insert(KV: {U, U->clone()});
337	}
338	}
339
340	for (std::pair<Value,Value> P : M) {
341	Instruction *U = cast<Instruction>(Val: P.second);
342	for (unsigned i = `0`, n = U->getNumOperands(); i != n; ++i) {
343	auto F = M.find(Val: U->getOperand(i));
344	if (F != M.end())
345	U->setOperand(i, Val: F ->second);
346	}
347	}
348
349	auto R = M.find(Val: Exp);
350	assert(R != M.end());
351	Root = R ->second;
352
353	record(V: Root);
354	use(V: Root);
355	}
356
357	void Simplifier::Context::record(Value *V) {
358	auto Record = [this](Instruction U) -> bool* {
359	Clones.insert(x: U);
360	return true;
361	};
362	traverse(V, F: Record);
363	}
364
365	void Simplifier::Context::use(Value *V) {
366	auto Use = [this](Instruction U) -> bool* {
367	Used.insert(x: U);
368	return true;
369	};
370	traverse(V, F: Use);
371	}
372
373	void Simplifier::Context::unuse(Value *V) {
374	if (!isa<Instruction>(Val: V) \|\| cast<Instruction>(Val: V)->getParent() != nullptr)
375	return;
376
377	auto Unuse = [this](Instruction U) -> bool* {
378	if (!U->use_empty())
379	return false;
380	Used.erase(x: U);
381	return true;
382	};
383	traverse(V, F: Unuse);
384	}
385
386	Value Simplifier::Context::subst(Value Tree, Value OldV, Value NewV) {
387	if (Tree == OldV)
388	return NewV;
389	if (OldV == NewV)
390	return Tree;
391
392	WorkListType Q;
393	Q.push_back(V: Tree);
394	while (!Q.empty()) {
395	Instruction *U = dyn_cast<Instruction>(Val: Q.pop_front_val());
396	// If U is not an instruction, or it's not a clone, skip it.
397	if (!U \|\| U->getParent())
398	continue;
399	for (unsigned i = `0`, n = U->getNumOperands(); i != n; ++i) {
400	Value *Op = U->getOperand(i);
401	if (Op == OldV) {
402	U->setOperand(i, Val: NewV);
403	unuse(V: OldV);
404	} else {
405	Q.push_back(V: Op);
406	}
407	}
408	}
409	return Tree;
410	}
411
412	void Simplifier::Context::replace(Value OldV, Value NewV) {
413	if (Root == OldV) {
414	Root = NewV;
415	use(V: Root);
416	return;
417	}
418
419	// NewV may be a complex tree that has just been created by one of the
420	// transformation rules. We need to make sure that it is commoned with
421	// the existing Root to the maximum extent possible.
422	// Identify all subtrees of NewV (including NewV itself) that have
423	// equivalent counterparts in Root, and replace those subtrees with
424	// these counterparts.
425	WorkListType Q;
426	Q.push_back(V: NewV);
427	while (!Q.empty()) {
428	Value *V = Q.pop_front_val();
429	Instruction *U = dyn_cast<Instruction>(Val: V);
430	if (!U \|\| U->getParent())
431	continue;
432	if (Value *DupV = find(Tree: Root, Sub: V)) {
433	if (DupV != V)
434	NewV = subst(Tree: NewV, OldV: V, NewV: DupV);
435	} else {
436	for (Value *Op : U->operands())
437	Q.push_back(V: Op);
438	}
439	}
440
441	// Now, simply replace OldV with NewV in Root.
442	Root = subst(Tree: Root, OldV, NewV);
443	use(V: Root);
444	}
445
446	void Simplifier::Context::cleanup() {
447	for (Value *V : Clones) {
448	Instruction *U = cast<Instruction>(Val: V);
449	if (!U->getParent())
450	U->dropAllReferences();
451	}
452
453	for (Value *V : Clones) {
454	Instruction *U = cast<Instruction>(Val: V);
455	if (!U->getParent())
456	U->deleteValue();
457	}
458	}
459
460	bool Simplifier::Context::equal(const Instruction *I,
461	const Instruction J) const* {
462	if (I == J)
463	return true;
464	if (!I->isSameOperationAs(I: J))
465	return false;
466	if (isa<PHINode>(Val: I))
467	return I->isIdenticalTo(I: J);
468
469	for (unsigned i = `0`, n = I->getNumOperands(); i != n; ++i) {
470	Value OpI = I->getOperand(i), OpJ = J->getOperand(i);
471	if (OpI == OpJ)
472	continue;
473	auto InI = dyn_cast<const* Instruction>(Val: OpI);
474	auto InJ = dyn_cast<const* Instruction>(Val: OpJ);
475	if (InI && InJ) {
476	if (!equal(I: InI, J: InJ))
477	return false;
478	} else if (InI != InJ \|\| !InI)
479	return false;
480	}
481	return true;
482	}
483
484	Value Simplifier::Context::find(Value Tree, Value Sub) const* {
485	Instruction *SubI = dyn_cast<Instruction>(Val: Sub);
486	WorkListType Q;
487	Q.push_back(V: Tree);
488
489	while (!Q.empty()) {
490	Value *V = Q.pop_front_val();
491	if (V == Sub)
492	return V;
493	Instruction *U = dyn_cast<Instruction>(Val: V);
494	if (!U \|\| U->getParent())
495	continue;
496	if (SubI && equal(I: SubI, J: U))
497	return U;
498	assert(!isa<PHINode>(U));
499	for (Value *Op : U->operands())
500	Q.push_back(V: Op);
501	}
502	return nullptr;
503	}
504
505	void Simplifier::Context::link(Instruction I, BasicBlock B,
506	BasicBlock::iterator At) {
507	if (I->getParent())
508	return;
509
510	for (Value *Op : I->operands()) {
511	if (Instruction *OpI = dyn_cast<Instruction>(Val: Op))
512	link(I: OpI, B, At);
513	}
514
515	I->insertInto(ParentBB: B, It: At);
516	}
517
518	Value Simplifier::Context::materialize(BasicBlock B,
519	BasicBlock::iterator At) {
520	if (Instruction *RootI = dyn_cast<Instruction>(Val: Root))
521	link(I: RootI, B, At);
522	return Root;
523	}
524
525	Value *Simplifier::simplify(Context &C) {
526	WorkListType Q;
527	Q.push_back(V: C.Root);
528	unsigned Count = `0`;
529	const unsigned Limit = SimplifyLimit;
530
531	while (!Q.empty()) {
532	if (Count++ >= Limit)
533	break;
534	Instruction *U = dyn_cast<Instruction>(Val: Q.pop_front_val());
535	if (!U \|\| U->getParent() \|\| !C.Used.count(x: U))
536	continue;
537	bool Changed = false;
538	for (Rule &R : Rules) {
539	Value *W = R.Fn (U, C.Ctx);
540	if (!W)
541	continue;
542	Changed = true;
543	C.record(V: W);
544	C.replace(OldV: U, NewV: W);
545	Q.push_back(V: C.Root);
546	break;
547	}
548	if (!Changed) {
549	for (Value *Op : U->operands())
550	Q.push_back(V: Op);
551	}
552	}
553	return Count < Limit ? C.Root : nullptr;
554	}
555
556	//===----------------------------------------------------------------------===//
557	//
558	// Implementation of PolynomialMultiplyRecognize
559	//
560	//===----------------------------------------------------------------------===//
561
562	namespace {
563
564	class PolynomialMultiplyRecognize {
565	public:
566	explicit PolynomialMultiplyRecognize(Loop loop, const* DataLayout &dl,
567	const DominatorTree &dt, const TargetLibraryInfo &tli,
568	ScalarEvolution &se)
569	: CurLoop(loop), DL(dl), DT(dt), TLI(tli), SE(se) {}
570
571	bool recognize();
572
573	private:
574	using ValueSeq = SetVector<Value *>;
575
576	IntegerType getPmpyType() const* {
577	LLVMContext &Ctx = CurLoop->getHeader()->getParent()->getContext();
578	return IntegerType::get(C&: Ctx, NumBits: `32`);
579	}
580
581	bool isPromotableTo(Value V, IntegerType Ty);
582	void promoteTo(Instruction In, IntegerType DestTy, BasicBlock *LoopB);
583	bool promoteTypes(BasicBlock LoopB, BasicBlock ExitB);
584
585	Value getCountIV(BasicBlock BB);
586	bool findCycle(Value Out, Value In, ValueSeq &Cycle);
587	void classifyCycle(Instruction *DivI, ValueSeq &Cycle, ValueSeq &Early,
588	ValueSeq &Late);
589	bool classifyInst(Instruction *UseI, ValueSeq &Early, ValueSeq &Late);
590	bool commutesWithShift(Instruction *I);
591	bool highBitsAreZero(Value V, unsigned* IterCount);
592	bool keepsHighBitsZero(Value V, unsigned* IterCount);
593	bool isOperandShifted(Instruction I, Value Op);
594	bool convertShiftsToLeft(BasicBlock LoopB, BasicBlock ExitB,
595	unsigned IterCount);
596	void cleanupLoopBody(BasicBlock *LoopB);
597
598	struct ParsedValues {
599	ParsedValues() = default;
600
601	Value M = nullptr*;
602	Value P = nullptr*;
603	Value Q = nullptr*;
604	Value R = nullptr*;
605	Value X = nullptr*;
606	Instruction Res = nullptr*;
607	unsigned IterCount = `0`;
608	bool Left = false;
609	bool Inv = false;
610	};
611
612	bool matchLeftShift(SelectInst SelI, Value CIV, ParsedValues &PV);
613	bool matchRightShift(SelectInst *SelI, ParsedValues &PV);
614	bool scanSelect(SelectInst SI, BasicBlock LoopB, BasicBlock *PrehB,
615	Value CIV, ParsedValues &PV, bool* PreScan);
616	unsigned getInverseMxN(unsigned QP);
617	Value *generate(BasicBlock::iterator At, ParsedValues &PV);
618
619	void setupPreSimplifier(Simplifier &S);
620	void setupPostSimplifier(Simplifier &S);
621
622	Loop *CurLoop;
623	const DataLayout &DL;
624	const DominatorTree &DT;
625	const TargetLibraryInfo &TLI;
626	ScalarEvolution &SE;
627	};
628
629	} // end anonymous namespace
630
631	Value PolynomialMultiplyRecognize::getCountIV(BasicBlock BB) {
632	pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
633	if (std::distance(first: PI, last: PE) != `2`)
634	return nullptr;
635	BasicBlock PB = (PI == BB) ? std::next(x: PI) : PI;
636
637	for (auto I = BB->begin(), E = BB->end(); I != E && isa<PHINode>(Val: I); ++I) {
638	auto *PN = cast<PHINode>(Val&: I);
639	Value *InitV = PN->getIncomingValueForBlock(BB: PB);
640	if (!isa<ConstantInt>(Val: InitV) \|\| !cast<ConstantInt>(Val: InitV)->isZero())
641	continue;
642	Value *IterV = PN->getIncomingValueForBlock(BB);
643	auto *BO = dyn_cast<BinaryOperator>(Val: IterV);
644	if (!BO)
645	continue;
646	if (BO->getOpcode() != Instruction::Add)
647	continue;
648	Value IncV = nullptr*;
649	if (BO->getOperand(i_nocapture: `0`) == PN)
650	IncV = BO->getOperand(i_nocapture: `1`);
651	else if (BO->getOperand(i_nocapture: `1`) == PN)
652	IncV = BO->getOperand(i_nocapture: `0`);
653	if (IncV == nullptr)
654	continue;
655
656	if (auto *T = dyn_cast<ConstantInt>(Val: IncV))
657	if (T->isOne())
658	return PN;
659	}
660	return nullptr;
661	}
662
663	static void replaceAllUsesOfWithIn(Value I, Value J, BasicBlock *BB) {
664	for (auto UI = I->user_begin(), UE = I->user_end(); UI != UE;) {
665	Use &TheUse = UI.getUse();
666	++UI;
667	if (auto *II = dyn_cast<Instruction>(Val: TheUse.getUser()))
668	if (BB == II->getParent())
669	II->replaceUsesOfWith(From: I, To: J);
670	}
671	}
672
673	bool PolynomialMultiplyRecognize::matchLeftShift(SelectInst *SelI,
674	Value *CIV, ParsedValues &PV) {
675	// Match the following:
676	// select (X & (1 << i)) != 0 ? R ^ (Q << i) : R
677	// select (X & (1 << i)) == 0 ? R : R ^ (Q << i)
678	// The condition may also check for equality with the masked value, i.e
679	// select (X & (1 << i)) == (1 << i) ? R ^ (Q << i) : R
680	// select (X & (1 << i)) != (1 << i) ? R : R ^ (Q << i);
681
682	Value *CondV = SelI->getCondition();
683	Value *TrueV = SelI->getTrueValue();
684	Value *FalseV = SelI->getFalseValue();
685
686	using namespace PatternMatch;
687
688	CmpPredicate P;
689	Value A = nullptr, B = nullptr, C = nullptr*;
690
691	if (!match(V: CondV, P: m_ICmp(Pred&: P, L: m_And(L: m_Value(V&: A), R: m_Value(V&: B)), R: m_Value(V&: C))) &&
692	!match(V: CondV, P: m_ICmp(Pred&: P, L: m_Value(V&: C), R: m_And(L: m_Value(V&: A), R: m_Value(V&: B)))))
693	return false;
694	if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
695	return false;
696	// Matched: select (A & B) == C ? ... : ...
697	// select (A & B) != C ? ... : ...
698
699	Value X = nullptr, Sh1 = nullptr;
700	// Check (A & B) for (X & (1 << i)):
701	if (match(V: A, P: m_Shl(L: m_One(), R: m_Specific(V: CIV)))) {
702	Sh1 = A;
703	X = B;
704	} else if (match(V: B, P: m_Shl(L: m_One(), R: m_Specific(V: CIV)))) {
705	Sh1 = B;
706	X = A;
707	} else {
708	// TODO: Could also check for an induction variable containing single
709	// bit shifted left by 1 in each iteration.
710	return false;
711	}
712
713	bool TrueIfZero;
714
715	// Check C against the possible values for comparison: 0 and (1 << i):
716	if (match(V: C, P: m_Zero()))
717	TrueIfZero = (P == CmpInst::ICMP_EQ);
718	else if (C == Sh1)
719	TrueIfZero = (P == CmpInst::ICMP_NE);
720	else
721	return false;
722
723	// So far, matched:
724	// select (X & (1 << i)) ? ... : ...
725	// including variations of the check against zero/non-zero value.
726
727	Value ShouldSameV = nullptr, ShouldXoredV = nullptr;
728	if (TrueIfZero) {
729	ShouldSameV = TrueV;
730	ShouldXoredV = FalseV;
731	} else {
732	ShouldSameV = FalseV;
733	ShouldXoredV = TrueV;
734	}
735
736	Value Q = nullptr, R = nullptr, Y = nullptr, Z = nullptr;
737	Value T = nullptr*;
738	if (match(V: ShouldXoredV, P: m_Xor(L: m_Value(V&: Y), R: m_Value(V&: Z)))) {
739	// Matched: select +++ ? ... : Y ^ Z
740	// select +++ ? Y ^ Z : ...
741	// where +++ denotes previously checked matches.
742	if (ShouldSameV == Y)
743	T = Z;
744	else if (ShouldSameV == Z)
745	T = Y;
746	else
747	return false;
748	R = ShouldSameV;
749	// Matched: select +++ ? R : R ^ T
750	// select +++ ? R ^ T : R
751	// depending on TrueIfZero.
752
753	} else if (match(V: ShouldSameV, P: m_Zero())) {
754	// Matched: select +++ ? 0 : ...
755	// select +++ ? ... : 0
756	if (!SelI->hasOneUse())
757	return false;
758	T = ShouldXoredV;
759	// Matched: select +++ ? 0 : T
760	// select +++ ? T : 0
761
762	Value U = SelI->user_begin();
763	if (!match(V: U, P: m_c_Xor(L: m_Specific(V: SelI), R: m_Value(V&: R))))
764	return false;
765	// Matched: xor (select +++ ? 0 : T), R
766	// xor (select +++ ? T : 0), R
767	} else
768	return false;
769
770	// The xor input value T is isolated into its own match so that it could
771	// be checked against an induction variable containing a shifted bit
772	// (todo).
773	// For now, check against (Q << i).
774	if (!match(V: T, P: m_Shl(L: m_Value(V&: Q), R: m_Specific(V: CIV))) &&
775	!match(V: T, P: m_Shl(L: m_ZExt(Op: m_Value(V&: Q)), R: m_ZExt(Op: m_Specific(V: CIV)))))
776	return false;
777	// Matched: select +++ ? R : R ^ (Q << i)
778	// select +++ ? R ^ (Q << i) : R
779
780	PV.X = X;
781	PV.Q = Q;
782	PV.R = R;
783	PV.Left = true;
784	return true;
785	}
786
787	bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI,
788	ParsedValues &PV) {
789	// Match the following:
790	// select (X & 1) != 0 ? (R >> 1) ^ Q : (R >> 1)
791	// select (X & 1) == 0 ? (R >> 1) : (R >> 1) ^ Q
792	// The condition may also check for equality with the masked value, i.e
793	// select (X & 1) == 1 ? (R >> 1) ^ Q : (R >> 1)
794	// select (X & 1) != 1 ? (R >> 1) : (R >> 1) ^ Q
795
796	Value *CondV = SelI->getCondition();
797	Value *TrueV = SelI->getTrueValue();
798	Value *FalseV = SelI->getFalseValue();
799
800	using namespace PatternMatch;
801
802	Value C = nullptr*;
803	CmpPredicate P;
804	bool TrueIfZero;
805
806	if (match(V: CondV, P: m_c_ICmp(Pred&: P, L: m_Value(V&: C), R: m_Zero()))) {
807	if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
808	return false;
809	// Matched: select C == 0 ? ... : ...
810	// select C != 0 ? ... : ...
811	TrueIfZero = (P == CmpInst::ICMP_EQ);
812	} else if (match(V: CondV, P: m_c_ICmp(Pred&: P, L: m_Value(V&: C), R: m_One()))) {
813	if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
814	return false;
815	// Matched: select C == 1 ? ... : ...
816	// select C != 1 ? ... : ...
817	TrueIfZero = (P == CmpInst::ICMP_NE);
818	} else
819	return false;
820
821	Value X = nullptr*;
822	if (!match(V: C, P: m_And(L: m_Value(V&: X), R: m_One())))
823	return false;
824	// Matched: select (X & 1) == +++ ? ... : ...
825	// select (X & 1) != +++ ? ... : ...
826
827	Value R = nullptr, Q = nullptr;
828	if (TrueIfZero) {
829	// The select's condition is true if the tested bit is 0.
830	// TrueV must be the shift, FalseV must be the xor.
831	if (!match(V: TrueV, P: m_LShr(L: m_Value(V&: R), R: m_One())))
832	return false;
833	// Matched: select +++ ? (R >> 1) : ...
834	if (!match(V: FalseV, P: m_c_Xor(L: m_Specific(V: TrueV), R: m_Value(V&: Q))))
835	return false;
836	// Matched: select +++ ? (R >> 1) : (R >> 1) ^ Q
837	// with commuting ^.
838	} else {
839	// The select's condition is true if the tested bit is 1.
840	// TrueV must be the xor, FalseV must be the shift.
841	if (!match(V: FalseV, P: m_LShr(L: m_Value(V&: R), R: m_One())))
842	return false;
843	// Matched: select +++ ? ... : (R >> 1)
844	if (!match(V: TrueV, P: m_c_Xor(L: m_Specific(V: FalseV), R: m_Value(V&: Q))))
845	return false;
846	// Matched: select +++ ? (R >> 1) ^ Q : (R >> 1)
847	// with commuting ^.
848	}
849
850	PV.X = X;
851	PV.Q = Q;
852	PV.R = R;
853	PV.Left = false;
854	return true;
855	}
856
857	bool PolynomialMultiplyRecognize::scanSelect(SelectInst *SelI,
858	BasicBlock LoopB, BasicBlock PrehB, Value *CIV, ParsedValues &PV,
859	bool PreScan) {
860	using namespace PatternMatch;
861
862	// The basic pattern for R = P.Q is:
863	// for i = 0..31
864	// R = phi (0, R')
865	// if (P & (1 << i)) ; test-bit(P, i)
866	// R' = R ^ (Q << i)
867	//
868	// Similarly, the basic pattern for R = (P/Q).Q - P
869	// for i = 0..31
870	// R = phi(P, R')
871	// if (R & (1 << i))
872	// R' = R ^ (Q << i)
873
874	// There exist idioms, where instead of Q being shifted left, P is shifted
875	// right. This produces a result that is shifted right by 32 bits (the
876	// non-shifted result is 64-bit).
877	//
878	// For R = P.Q, this would be:
879	// for i = 0..31
880	// R = phi (0, R')
881	// if ((P >> i) & 1)
882	// R' = (R >> 1) ^ Q ; R is cycled through the loop, so it must
883	// else ; be shifted by 1, not i.
884	// R' = R >> 1
885	//
886	// And for the inverse:
887	// for i = 0..31
888	// R = phi (P, R')
889	// if (R & 1)
890	// R' = (R >> 1) ^ Q
891	// else
892	// R' = R >> 1
893
894	// The left-shifting idioms share the same pattern:
895	// select (X & (1 << i)) ? R ^ (Q << i) : R
896	// Similarly for right-shifting idioms:
897	// select (X & 1) ? (R >> 1) ^ Q
898
899	if (matchLeftShift(SelI, CIV, PV)) {
900	// If this is a pre-scan, getting this far is sufficient.
901	if (PreScan)
902	return true;
903
904	// Need to make sure that the SelI goes back into R.
905	auto *RPhi = dyn_cast<PHINode>(Val: PV.R);
906	if (!RPhi)
907	return false;
908	if (SelI != RPhi->getIncomingValueForBlock(BB: LoopB))
909	return false;
910	PV.Res = SelI;
911
912	// If X is loop invariant, it must be the input polynomial, and the
913	// idiom is the basic polynomial multiply.
914	if (CurLoop->isLoopInvariant(V: PV.X)) {
915	PV.P = PV.X;
916	PV.Inv = false;
917	} else {
918	// X is not loop invariant. If X == R, this is the inverse pmpy.
919	// Otherwise, check for an xor with an invariant value. If the
920	// variable argument to the xor is R, then this is still a valid
921	// inverse pmpy.
922	PV.Inv = true;
923	if (PV.X != PV.R) {
924	Value Var = nullptr, Inv = nullptr, X1 = nullptr, X2 = nullptr;
925	if (!match(V: PV.X, P: m_Xor(L: m_Value(V&: X1), R: m_Value(V&: X2))))
926	return false;
927	auto *I1 = dyn_cast<Instruction>(Val: X1);
928	auto *I2 = dyn_cast<Instruction>(Val: X2);
929	if (!I1 \|\| I1->getParent() != LoopB) {
930	Var = X2;
931	Inv = X1;
932	} else if (!I2 \|\| I2->getParent() != LoopB) {
933	Var = X1;
934	Inv = X2;
935	} else
936	return false;
937	if (Var != PV.R)
938	return false;
939	PV.M = Inv;
940	}
941	// The input polynomial P still needs to be determined. It will be
942	// the entry value of R.
943	Value *EntryP = RPhi->getIncomingValueForBlock(BB: PrehB);
944	PV.P = EntryP;
945	}
946
947	return true;
948	}
949
950	if (matchRightShift(SelI, PV)) {
951	// If this is an inverse pattern, the Q polynomial must be known at
952	// compile time.
953	if (PV.Inv && !isa<ConstantInt>(Val: PV.Q))
954	return false;
955	if (PreScan)
956	return true;
957	// There is no exact matching of right-shift pmpy.
958	return false;
959	}
960
961	return false;
962	}
963
964	bool PolynomialMultiplyRecognize::isPromotableTo(Value *Val,
965	IntegerType *DestTy) {
966	IntegerType *T = dyn_cast<IntegerType>(Val: Val->getType());
967	if (!T \|\| T->getBitWidth() > DestTy->getBitWidth())
968	return false;
969	if (T->getBitWidth() == DestTy->getBitWidth())
970	return true;
971	// Non-instructions are promotable. The reason why an instruction may not
972	// be promotable is that it may produce a different result if its operands
973	// and the result are promoted, for example, it may produce more non-zero
974	// bits. While it would still be possible to represent the proper result
975	// in a wider type, it may require adding additional instructions (which
976	// we don't want to do).
977	Instruction *In = dyn_cast<Instruction>(Val);
978	if (!In)
979	return true;
980	// The bitwidth of the source type is smaller than the destination.
981	// Check if the individual operation can be promoted.
982	switch (In->getOpcode()) {
983	case Instruction::PHI:
984	case Instruction::ZExt:
985	case Instruction::And:
986	case Instruction::Or:
987	case Instruction::Xor:
988	case Instruction::LShr: // Shift right is ok.
989	case Instruction::Select:
990	case Instruction::Trunc:
991	return true;
992	case Instruction::ICmp:
993	if (CmpInst *CI = cast<CmpInst>(Val: In))
994	return CI->isEquality() \|\| CI->isUnsigned();
995	llvm_unreachable("Cast failed unexpectedly");
996	case Instruction::Add:
997	return In->hasNoSignedWrap() && In->hasNoUnsignedWrap();
998	}
999	return false;
1000	}
1001
1002	void PolynomialMultiplyRecognize::promoteTo(Instruction *In,
1003	IntegerType DestTy, BasicBlock LoopB) {
1004	Type *OrigTy = In->getType();
1005	assert(!OrigTy->isVoidTy() && "Invalid instruction to promote");
1006
1007	// Leave boolean values alone.
1008	if (!In->getType()->isIntegerTy(Bitwidth: `1`))
1009	In->mutateType(Ty: DestTy);
1010	unsigned DestBW = DestTy->getBitWidth();
1011
1012	// Handle PHIs.
1013	if (PHINode *P = dyn_cast<PHINode>(Val: In)) {
1014	unsigned N = P->getNumIncomingValues();
1015	for (unsigned i = `0`; i != N; ++i) {
1016	BasicBlock *InB = P->getIncomingBlock(i);
1017	if (InB == LoopB)
1018	continue;
1019	Value *InV = P->getIncomingValue(i);
1020	IntegerType *Ty = cast<IntegerType>(Val: InV->getType());
1021	// Do not promote values in PHI nodes of type i1.
1022	if (Ty != P->getType()) {
1023	// If the value type does not match the PHI type, the PHI type
1024	// must have been promoted.
1025	assert(Ty->getBitWidth() < DestBW);
1026	InV = IRBuilder<>(InB->getTerminator()).CreateZExt(V: InV, DestTy);
1027	P->setIncomingValue(i, V: InV);
1028	}
1029	}
1030	} else if (ZExtInst *Z = dyn_cast<ZExtInst>(Val: In)) {
1031	Value *Op = Z->getOperand(i_nocapture: `0`);
1032	if (Op->getType() == Z->getType())
1033	Z->replaceAllUsesWith(V: Op);
1034	Z->eraseFromParent();
1035	return;
1036	}
1037	if (TruncInst *T = dyn_cast<TruncInst>(Val: In)) {
1038	IntegerType *TruncTy = cast<IntegerType>(Val: OrigTy);
1039	Value *Mask = ConstantInt::get(Ty: DestTy, V: (`1u` << TruncTy->getBitWidth()) - `1`);
1040	Value *And = IRBuilder<>(In).CreateAnd(LHS: T->getOperand(i_nocapture: `0`), RHS: Mask);
1041	T->replaceAllUsesWith(V: And);
1042	T->eraseFromParent();
1043	return;
1044	}
1045
1046	// Promote immediates.
1047	for (unsigned i = `0`, n = In->getNumOperands(); i != n; ++i) {
1048	if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: In->getOperand(i)))
1049	if (CI->getBitWidth() < DestBW)
1050	In->setOperand(i, Val: ConstantInt::get(Ty: DestTy, V: CI->getZExtValue()));
1051	}
1052	}
1053
1054	bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock *LoopB,
1055	BasicBlock *ExitB) {
1056	assert(LoopB);
1057	// Skip loops where the exit block has more than one predecessor. The values
1058	// coming from the loop block will be promoted to another type, and so the
1059	// values coming into the exit block from other predecessors would also have
1060	// to be promoted.
1061	if (!ExitB \|\| (ExitB->getSinglePredecessor() != LoopB))
1062	return false;
1063	IntegerType *DestTy = getPmpyType();
1064	// Check if the exit values have types that are no wider than the type
1065	// that we want to promote to.
1066	unsigned DestBW = DestTy->getBitWidth();
1067	for (PHINode &P : ExitB->phis()) {
1068	if (P.getNumIncomingValues() != `1`)
1069	return false;
1070	assert(P.getIncomingBlock(`0`) == LoopB);
1071	IntegerType *T = dyn_cast<IntegerType>(Val: P.getType());
1072	if (!T \|\| T->getBitWidth() > DestBW)
1073	return false;
1074	}
1075
1076	// Check all instructions in the loop.
1077	for (Instruction &In : *LoopB)
1078	if (!In.isTerminator() && !isPromotableTo(Val: &In, DestTy))
1079	return false;
1080
1081	// Perform the promotion.
1082	SmallVector<Instruction > LoopIns(llvm::make_pointer_range(Range&: LoopB));
1083	for (Instruction *In : LoopIns)
1084	if (!In->isTerminator())
1085	promoteTo(In, DestTy, LoopB);
1086
1087	// Fix up the PHI nodes in the exit block.
1088	BasicBlock::iterator End = ExitB->getFirstNonPHIIt();
1089	for (auto I = ExitB->begin(); I != End; ++I) {
1090	PHINode *P = dyn_cast<PHINode>(Val&: I);
1091	if (!P)
1092	break;
1093	Type *Ty0 = P->getIncomingValue(i: `0`)->getType();
1094	Type *PTy = P->getType();
1095	if (PTy != Ty0) {
1096	assert(Ty0 == DestTy);
1097	// In order to create the trunc, P must have the promoted type.
1098	P->mutateType(Ty: Ty0);
1099	Value *T = IRBuilder<>(ExitB, End).CreateTrunc(V: P, DestTy: PTy);
1100	// In order for the RAUW to work, the types of P and T must match.
1101	P->mutateType(Ty: PTy);
1102	P->replaceAllUsesWith(V: T);
1103	// Final update of the P's type.
1104	P->mutateType(Ty: Ty0);
1105	cast<Instruction>(Val: T)->setOperand(i: `0`, Val: P);
1106	}
1107	}
1108
1109	return true;
1110	}
1111
1112	bool PolynomialMultiplyRecognize::findCycle(Value Out, Value In,
1113	ValueSeq &Cycle) {
1114	// Out = ..., In, ...
1115	if (Out == In)
1116	return true;
1117
1118	auto *BB = cast<Instruction>(Val: Out)->getParent();
1119	bool HadPhi = false;
1120
1121	for (auto *U : Out->users()) {
1122	auto I = dyn_cast<Instruction>(Val: &U);
1123	if (I == nullptr \|\| I->getParent() != BB)
1124	continue;
1125	// Make sure that there are no multi-iteration cycles, e.g.
1126	// p1 = phi(p2)
1127	// p2 = phi(p1)
1128	// The cycle p1->p2->p1 would span two loop iterations.
1129	// Check that there is only one phi in the cycle.
1130	bool IsPhi = isa<PHINode>(Val: I);
1131	if (IsPhi && HadPhi)
1132	return false;
1133	HadPhi \|= IsPhi;
1134	if (!Cycle.insert(X: I))
1135	return false;
1136	if (findCycle(Out: I, In, Cycle))
1137	break;
1138	Cycle.remove(X: I);
1139	}
1140	return !Cycle.empty();
1141	}
1142
1143	void PolynomialMultiplyRecognize::classifyCycle(Instruction *DivI,
1144	ValueSeq &Cycle, ValueSeq &Early, ValueSeq &Late) {
1145	// All the values in the cycle that are between the phi node and the
1146	// divider instruction will be classified as "early", all other values
1147	// will be "late".
1148
1149	bool IsE = true;
1150	unsigned I, N = Cycle.size();
1151	for (I = `0`; I < N; ++I) {
1152	Value *V = Cycle [I];
1153	if (DivI == V)
1154	IsE = false;
1155	else if (!isa<PHINode>(Val: V))
1156	continue;
1157	// Stop if found either.
1158	break;
1159	}
1160	// "I" is the index of either DivI or the phi node, whichever was first.
1161	// "E" is "false" or "true" respectively.
1162	ValueSeq &First = !IsE ? Early : Late;
1163	for (unsigned J = `0`; J < I; ++J)
1164	First.insert(X: Cycle [J]);
1165
1166	ValueSeq &Second = IsE ? Early : Late;
1167	Second.insert(X: Cycle [I]);
1168	for (++I; I < N; ++I) {
1169	Value *V = Cycle [I];
1170	if (DivI == V \|\| isa<PHINode>(Val: V))
1171	break;
1172	Second.insert(X: V);
1173	}
1174
1175	for (; I < N; ++I)
1176	First.insert(X: Cycle [I]);
1177	}
1178
1179	bool PolynomialMultiplyRecognize::classifyInst(Instruction *UseI,
1180	ValueSeq &Early, ValueSeq &Late) {
1181	// Select is an exception, since the condition value does not have to be
1182	// classified in the same way as the true/false values. The true/false
1183	// values do have to be both early or both late.
1184	if (UseI->getOpcode() == Instruction::Select) {
1185	Value TV = UseI->getOperand(i: `1`), FV = UseI->getOperand(i: `2`);
1186	if (Early.count(key: TV) \|\| Early.count(key: FV)) {
1187	if (Late.count(key: TV) \|\| Late.count(key: FV))
1188	return false;
1189	Early.insert(X: UseI);
1190	} else if (Late.count(key: TV) \|\| Late.count(key: FV)) {
1191	if (Early.count(key: TV) \|\| Early.count(key: FV))
1192	return false;
1193	Late.insert(X: UseI);
1194	}
1195	return true;
1196	}
1197
1198	// Not sure what would be the example of this, but the code below relies
1199	// on having at least one operand.
1200	if (UseI->getNumOperands() == `0`)
1201	return true;
1202
1203	bool AE = true, AL = true;
1204	for (auto &I : UseI->operands()) {
1205	if (Early.count(key: &*I))
1206	AL = false;
1207	else if (Late.count(key: &*I))
1208	AE = false;
1209	}
1210	// If the operands appear "all early" and "all late" at the same time,
1211	// then it means that none of them are actually classified as either.
1212	// This is harmless.
1213	if (AE && AL)
1214	return true;
1215	// Conversely, if they are neither "all early" nor "all late", then
1216	// we have a mixture of early and late operands that is not a known
1217	// exception.
1218	if (!AE && !AL)
1219	return false;
1220
1221	// Check that we have covered the two special cases.
1222	assert(AE != AL);
1223
1224	if (AE)
1225	Early.insert(X: UseI);
1226	else
1227	Late.insert(X: UseI);
1228	return true;
1229	}
1230
1231	bool PolynomialMultiplyRecognize::commutesWithShift(Instruction *I) {
1232	switch (I->getOpcode()) {
1233	case Instruction::And:
1234	case Instruction::Or:
1235	case Instruction::Xor:
1236	case Instruction::LShr:
1237	case Instruction::Shl:
1238	case Instruction::Select:
1239	case Instruction::ICmp:
1240	case Instruction::PHI:
1241	break;
1242	default:
1243	return false;
1244	}
1245	return true;
1246	}
1247
1248	bool PolynomialMultiplyRecognize::highBitsAreZero(Value *V,
1249	unsigned IterCount) {
1250	auto *T = dyn_cast<IntegerType>(Val: V->getType());
1251	if (!T)
1252	return false;
1253
1254	KnownBits Known(T->getBitWidth());
1255	computeKnownBits(V, Known, DL);
1256	return Known.countMinLeadingZeros() >= IterCount;
1257	}
1258
1259	bool PolynomialMultiplyRecognize::keepsHighBitsZero(Value *V,
1260	unsigned IterCount) {
1261	// Assume that all inputs to the value have the high bits zero.
1262	// Check if the value itself preserves the zeros in the high bits.
1263	if (auto *C = dyn_cast<ConstantInt>(Val: V))
1264	return C->getValue().countl_zero() >= IterCount;
1265
1266	if (auto *I = dyn_cast<Instruction>(Val: V)) {
1267	switch (I->getOpcode()) {
1268	case Instruction::And:
1269	case Instruction::Or:
1270	case Instruction::Xor:
1271	case Instruction::LShr:
1272	case Instruction::Select:
1273	case Instruction::ICmp:
1274	case Instruction::PHI:
1275	case Instruction::ZExt:
1276	return true;
1277	}
1278	}
1279
1280	return false;
1281	}
1282
1283	bool PolynomialMultiplyRecognize::isOperandShifted(Instruction I, Value Op) {
1284	unsigned Opc = I->getOpcode();
1285	if (Opc == Instruction::Shl \|\| Opc == Instruction::LShr)
1286	return Op != I->getOperand(i: `1`);
1287	return true;
1288	}
1289
1290	bool PolynomialMultiplyRecognize::convertShiftsToLeft(BasicBlock *LoopB,
1291	BasicBlock ExitB, unsigned* IterCount) {
1292	Value *CIV = getCountIV(BB: LoopB);
1293	if (CIV == nullptr)
1294	return false;
1295	auto *CIVTy = dyn_cast<IntegerType>(Val: CIV->getType());
1296	if (CIVTy == nullptr)
1297	return false;
1298
1299	ValueSeq RShifts;
1300	ValueSeq Early, Late, Cycled;
1301
1302	// Find all value cycles that contain logical right shifts by 1.
1303	for (Instruction &I : *LoopB) {
1304	using namespace PatternMatch;
1305
1306	Value V = nullptr*;
1307	if (!match(V: &I, P: m_LShr(L: m_Value(V), R: m_One())))
1308	continue;
1309	ValueSeq C;
1310	if (!findCycle(Out: &I, In: V, Cycle&: C))
1311	continue;
1312
1313	// Found a cycle.
1314	C.insert(X: &I);
1315	classifyCycle(DivI: &I, Cycle&: C, Early, Late);
1316	Cycled.insert_range(R&: C);
1317	RShifts.insert(X: &I);
1318	}
1319
1320	// Find the set of all values affected by the shift cycles, i.e. all
1321	// cycled values, and (recursively) all their users.
1322	ValueSeq Users(llvm::from_range, Cycled);
1323	for (unsigned i = `0`; i < Users.size(); ++i) {
1324	Value *V = Users [i];
1325	if (!isa<IntegerType>(Val: V->getType()))
1326	return false;
1327	auto *R = cast<Instruction>(Val: V);
1328	// If the instruction does not commute with shifts, the loop cannot
1329	// be unshifted.
1330	if (!commutesWithShift(I: R))
1331	return false;
1332	for (User *U : R->users()) {
1333	auto *T = cast<Instruction>(Val: U);
1334	// Skip users from outside of the loop. They will be handled later.
1335	// Also, skip the right-shifts and phi nodes, since they mix early
1336	// and late values.
1337	if (T->getParent() != LoopB \|\| RShifts.count(key: T) \|\| isa<PHINode>(Val: T))
1338	continue;
1339
1340	Users.insert(X: T);
1341	if (!classifyInst(UseI: T, Early, Late))
1342	return false;
1343	}
1344	}
1345
1346	if (Users.empty())
1347	return false;
1348
1349	// Verify that high bits remain zero.
1350	ValueSeq Internal(llvm::from_range, Users);
1351	ValueSeq Inputs;
1352	for (unsigned i = `0`; i < Internal.size(); ++i) {
1353	auto *R = dyn_cast<Instruction>(Val: Internal [i]);
1354	if (!R)
1355	continue;
1356	for (Value *Op : R->operands()) {
1357	auto *T = dyn_cast<Instruction>(Val: Op);
1358	if (T && T->getParent() != LoopB)
1359	Inputs.insert(X: Op);
1360	else
1361	Internal.insert(X: Op);
1362	}
1363	}
1364	for (Value *V : Inputs)
1365	if (!highBitsAreZero(V, IterCount))
1366	return false;
1367	for (Value *V : Internal)
1368	if (!keepsHighBitsZero(V, IterCount))
1369	return false;
1370
1371	// Finally, the work can be done. Unshift each user.
1372	IRBuilder<> IRB(LoopB);
1373	std::map<Value,Value> ShiftMap;
1374
1375	using CastMapType = std::map<std::pair<Value , Type >, Value *>;
1376
1377	CastMapType CastMap;
1378
1379	auto upcast = [](CastMapType &CM, IRBuilder<> &IRB, Value *V,
1380	IntegerType Ty) -> Value {
1381	auto [H, Inserted] = CM.try_emplace(k: std::make_pair(x&: V, y&: Ty));
1382	if (Inserted)
1383	H ->second = IRB.CreateIntCast(V, DestTy: Ty, isSigned: false);
1384	return H ->second;
1385	};
1386
1387	for (auto I = LoopB->begin(), E = LoopB->end(); I != E; ++I) {
1388	using namespace PatternMatch;
1389
1390	if (isa<PHINode>(Val: I) \|\| !Users.count(key: &*I))
1391	continue;
1392
1393	// Match lshr x, 1.
1394	Value V = nullptr*;
1395	if (match(V: &*I, P: m_LShr(L: m_Value(V), R: m_One()))) {
1396	replaceAllUsesOfWithIn(I: &*I, J: V, BB: LoopB);
1397	continue;
1398	}
1399	// For each non-cycled operand, replace it with the corresponding
1400	// value shifted left.
1401	for (auto &J : I ->operands()) {
1402	Value *Op = J.get();
1403	if (!isOperandShifted(I: &*I, Op))
1404	continue;
1405	if (Users.count(key: Op))
1406	continue;
1407	// Skip shifting zeros.
1408	if (isa<ConstantInt>(Val: Op) && cast<ConstantInt>(Val: Op)->isZero())
1409	continue;
1410	// Check if we have already generated a shift for this value.
1411	auto F = ShiftMap.find(x: Op);
1412	Value W = (F != ShiftMap.end()) ? F ->second : nullptr*;
1413	if (W == nullptr) {
1414	IRB.SetInsertPoint(&*I);
1415	// First, the shift amount will be CIV or CIV+1, depending on
1416	// whether the value is early or late. Instead of creating CIV+1,
1417	// do a single shift of the value.
1418	Value ShAmt = CIV, ShVal = Op;
1419	auto *VTy = cast<IntegerType>(Val: ShVal->getType());
1420	auto *ATy = cast<IntegerType>(Val: ShAmt->getType());
1421	if (Late.count(key: &*I))
1422	ShVal = IRB.CreateShl(LHS: Op, RHS: ConstantInt::get(Ty: VTy, V: `1`));
1423	// Second, the types of the shifted value and the shift amount
1424	// must match.
1425	if (VTy != ATy) {
1426	if (VTy->getBitWidth() < ATy->getBitWidth())
1427	ShVal = upcast (CastMap, IRB, ShVal, ATy);
1428	else
1429	ShAmt = upcast (CastMap, IRB, ShAmt, VTy);
1430	}
1431	// Ready to generate the shift and memoize it.
1432	W = IRB.CreateShl(LHS: ShVal, RHS: ShAmt);
1433	ShiftMap.insert(x: std::make_pair(x&: Op, y&: W));
1434	}
1435	I ->replaceUsesOfWith(From: Op, To: W);
1436	}
1437	}
1438
1439	// Update the users outside of the loop to account for having left
1440	// shifts. They would normally be shifted right in the loop, so shift
1441	// them right after the loop exit.
1442	// Take advantage of the loop-closed SSA form, which has all the post-
1443	// loop values in phi nodes.
1444	IRB.SetInsertPoint(TheBB: ExitB, IP: ExitB->getFirstInsertionPt());
1445	for (auto P = ExitB->begin(), Q = ExitB->end(); P != Q; ++P) {
1446	if (!isa<PHINode>(Val: P))
1447	break;
1448	auto *PN = cast<PHINode>(Val&: P);
1449	Value *U = PN->getIncomingValueForBlock(BB: LoopB);
1450	if (!Users.count(key: U))
1451	continue;
1452	Value *S = IRB.CreateLShr(LHS: PN, RHS: ConstantInt::get(Ty: PN->getType(), V: IterCount));
1453	PN->replaceAllUsesWith(V: S);
1454	// The above RAUW will create
1455	// S = lshr S, IterCount
1456	// so we need to fix it back into
1457	// S = lshr PN, IterCount
1458	cast<User>(Val: S)->replaceUsesOfWith(From: S, To: PN);
1459	}
1460
1461	return true;
1462	}
1463
1464	void PolynomialMultiplyRecognize::cleanupLoopBody(BasicBlock *LoopB) {
1465	for (auto &I : *LoopB)
1466	if (Value *SV = simplifyInstruction(I: &I, Q: {DL, &TLI, &DT}))
1467	I.replaceAllUsesWith(V: SV);
1468
1469	for (Instruction &I : llvm::make_early_inc_range(Range&: *LoopB))
1470	RecursivelyDeleteTriviallyDeadInstructions(V: &I, TLI: &TLI);
1471	}
1472
1473	unsigned PolynomialMultiplyRecognize::getInverseMxN(unsigned QP) {
1474	// Arrays of coefficients of Q and the inverse, C.
1475	// Q[i] = coefficient at x^i.
1476	std::array<char,`32`> Q, C;
1477
1478	for (unsigned i = `0`; i < `32`; ++i) {
1479	Q [i] = QP & `1`;
1480	QP >>= `1`;
1481	}
1482	assert(Q[`0`] == `1`);
1483
1484	// Find C, such that
1485	// (Q[n]x^n + ... + Q[1]x + Q[0]) (C[n]x^n + ... + C[1]x + C[0]) = 1*
1486	//
1487	// For it to have a solution, Q[0] must be 1. Since this is Z2[x], the
1488	// operations and + are & and ^ respectively.*
1489	//
1490	// Find C[i] recursively, by comparing i-th coefficient in the product
1491	// with 0 (or 1 for i=0).
1492	//
1493	// C[0] = 1, since C[0] = Q[0], and Q[0] = 1.
1494	C [`0`] = `1`;
1495	for (unsigned i = `1`; i < `32`; ++i) {
1496	// Solve for C[i] in:
1497	// C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i]Q[0] = 0
1498	// This is equivalent to
1499	// C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i] = 0
1500	// which is
1501	// C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] = C[i]
1502	unsigned T = `0`;
1503	for (unsigned j = `0`; j < i; ++j)
1504	T = T ^ (C [j] & Q [i-j]);
1505	C [i] = T;
1506	}
1507
1508	unsigned QV = `0`;
1509	for (unsigned i = `0`; i < `32`; ++i)
1510	if (C [i])
1511	QV \|= (`1` << i);
1512
1513	return QV;
1514	}
1515
1516	Value *PolynomialMultiplyRecognize::generate(BasicBlock::iterator At,
1517	ParsedValues &PV) {
1518	IRBuilder<> B(&*At);
1519	Module *M = At ->getParent()->getParent()->getParent();
1520	Function *PMF =
1521	Intrinsic::getOrInsertDeclaration(M, id: Intrinsic::hexagon_M4_pmpyw);
1522
1523	Value P = PV.P, Q = PV.Q, *P0 = P;
1524	unsigned IC = PV.IterCount;
1525
1526	if (PV.M != nullptr)
1527	P0 = P = B.CreateXor(LHS: P, RHS: PV.M);
1528
1529	// Create a bit mask to clear the high bits beyond IterCount.
1530	auto *BMI = ConstantInt::get(Ty: P->getType(), V: APInt::getLowBitsSet(numBits: `32`, loBitsSet: IC));
1531
1532	if (PV.IterCount != `32`)
1533	P = B.CreateAnd(LHS: P, RHS: BMI);
1534
1535	if (PV.Inv) {
1536	auto *QI = dyn_cast<ConstantInt>(Val: PV.Q);
1537	assert(QI && QI->getBitWidth() <= `32`);
1538
1539	// Again, clearing bits beyond IterCount.
1540	unsigned M = (`1` << PV.IterCount) - `1`;
1541	unsigned Tmp = (QI->getZExtValue() \| `1`) & M;
1542	unsigned QV = getInverseMxN(QP: Tmp) & M;
1543	auto *QVI = ConstantInt::get(Ty: QI->getType(), V: QV);
1544	P = B.CreateCall(Callee: PMF, Args: {P, QVI});
1545	P = B.CreateTrunc(V: P, DestTy: QI->getType());
1546	if (IC != `32`)
1547	P = B.CreateAnd(LHS: P, RHS: BMI);
1548	}
1549
1550	Value *R = B.CreateCall(Callee: PMF, Args: {P, Q});
1551
1552	if (PV.M != nullptr)
1553	R = B.CreateXor(LHS: R, RHS: B.CreateIntCast(V: P0, DestTy: R->getType(), isSigned: false));
1554
1555	return R;
1556	}
1557
1558	static bool hasZeroSignBit(const Value *V) {
1559	if (const auto CI = dyn_cast<const* ConstantInt>(Val: V))
1560	return CI->getValue().isNonNegative();
1561	const Instruction I = dyn_cast<const* Instruction>(Val: V);
1562	if (!I)
1563	return false;
1564	switch (I->getOpcode()) {
1565	case Instruction::LShr:
1566	if (const auto SI = dyn_cast<const ConstantInt>(Val: I->getOperand(i: `1`)))
1567	return SI->getZExtValue() > `0`;
1568	return false;
1569	case Instruction::Or:
1570	case Instruction::Xor:
1571	return hasZeroSignBit(V: I->getOperand(i: `0`)) &&
1572	hasZeroSignBit(V: I->getOperand(i: `1`));
1573	case Instruction::And:
1574	return hasZeroSignBit(V: I->getOperand(i: `0`)) \|\|
1575	hasZeroSignBit(V: I->getOperand(i: `1`));
1576	}
1577	return false;
1578	}
1579
1580	void PolynomialMultiplyRecognize::setupPreSimplifier(Simplifier &S) {
1581	S.addRule(N: "sink-zext",
1582	// Sink zext past bitwise operations.
1583	F: [](Instruction I, LLVMContext &Ctx) -> Value {
1584	if (I->getOpcode() != Instruction::ZExt)
1585	return nullptr;
1586	Instruction *T = dyn_cast<Instruction>(Val: I->getOperand(i: `0`));
1587	if (!T)
1588	return nullptr;
1589	switch (T->getOpcode()) {
1590	case Instruction::And:
1591	case Instruction::Or:
1592	case Instruction::Xor:
1593	break;
1594	default:
1595	return nullptr;
1596	}
1597	IRBuilder<> B(Ctx);
1598	return B.CreateBinOp(Opc: cast<BinaryOperator>(Val: T)->getOpcode(),
1599	LHS: B.CreateZExt(V: T->getOperand(i: `0`), DestTy: I->getType()),
1600	RHS: B.CreateZExt(V: T->getOperand(i: `1`), DestTy: I->getType()));
1601	});
1602	S.addRule(N: "xor/and -> and/xor",
1603	// (xor (and x a) (and y a)) -> (and (xor x y) a)
1604	F: [](Instruction I, LLVMContext &Ctx) -> Value {
1605	if (I->getOpcode() != Instruction::Xor)
1606	return nullptr;
1607	Instruction *And0 = dyn_cast<Instruction>(Val: I->getOperand(i: `0`));
1608	Instruction *And1 = dyn_cast<Instruction>(Val: I->getOperand(i: `1`));
1609	if (!And0 \|\| !And1)
1610	return nullptr;
1611	if (And0->getOpcode() != Instruction::And \|\|
1612	And1->getOpcode() != Instruction::And)
1613	return nullptr;
1614	if (And0->getOperand(i: `1`) != And1->getOperand(i: `1`))
1615	return nullptr;
1616	IRBuilder<> B(Ctx);
1617	return B.CreateAnd(LHS: B.CreateXor(LHS: And0->getOperand(i: `0`), RHS: And1->getOperand(i: `0`)),
1618	RHS: And0->getOperand(i: `1`));
1619	});
1620	S.addRule(N: "sink binop into select",
1621	// (Op (select c x y) z) -> (select c (Op x z) (Op y z))
1622	// (Op x (select c y z)) -> (select c (Op x y) (Op x z))
1623	F: [](Instruction I, LLVMContext &Ctx) -> Value {
1624	BinaryOperator *BO = dyn_cast<BinaryOperator>(Val: I);
1625	if (!BO)
1626	return nullptr;
1627	Instruction::BinaryOps Op = BO->getOpcode();
1628	if (SelectInst *Sel = dyn_cast<SelectInst>(Val: BO->getOperand(i_nocapture: `0`))) {
1629	IRBuilder<> B(Ctx);
1630	Value X = Sel->getTrueValue(), Y = Sel->getFalseValue();
1631	Value *Z = BO->getOperand(i_nocapture: `1`);
1632	return B.CreateSelect(C: Sel->getCondition(),
1633	True: B.CreateBinOp(Opc: Op, LHS: X, RHS: Z),
1634	False: B.CreateBinOp(Opc: Op, LHS: Y, RHS: Z));
1635	}
1636	if (SelectInst *Sel = dyn_cast<SelectInst>(Val: BO->getOperand(i_nocapture: `1`))) {
1637	IRBuilder<> B(Ctx);
1638	Value *X = BO->getOperand(i_nocapture: `0`);
1639	Value Y = Sel->getTrueValue(), Z = Sel->getFalseValue();
1640	return B.CreateSelect(C: Sel->getCondition(),
1641	True: B.CreateBinOp(Opc: Op, LHS: X, RHS: Y),
1642	False: B.CreateBinOp(Opc: Op, LHS: X, RHS: Z));
1643	}
1644	return nullptr;
1645	});
1646	S.addRule(N: "fold select-select",
1647	// (select c (select c x y) z) -> (select c x z)
1648	// (select c x (select c y z)) -> (select c x z)
1649	F: [](Instruction I, LLVMContext &Ctx) -> Value {
1650	SelectInst *Sel = dyn_cast<SelectInst>(Val: I);
1651	if (!Sel)
1652	return nullptr;
1653	IRBuilder<> B(Ctx);
1654	Value *C = Sel->getCondition();
1655	if (SelectInst *Sel0 = dyn_cast<SelectInst>(Val: Sel->getTrueValue())) {
1656	if (Sel0->getCondition() == C)
1657	return B.CreateSelect(C, True: Sel0->getTrueValue(), False: Sel->getFalseValue());
1658	}
1659	if (SelectInst *Sel1 = dyn_cast<SelectInst>(Val: Sel->getFalseValue())) {
1660	if (Sel1->getCondition() == C)
1661	return B.CreateSelect(C, True: Sel->getTrueValue(), False: Sel1->getFalseValue());
1662	}
1663	return nullptr;
1664	});
1665	S.addRule(N: "or-signbit -> xor-signbit",
1666	// (or (lshr x 1) 0x800.0) -> (xor (lshr x 1) 0x800.0)
1667	F: [](Instruction I, LLVMContext &Ctx) -> Value {
1668	if (I->getOpcode() != Instruction::Or)
1669	return nullptr;
1670	ConstantInt *Msb = dyn_cast<ConstantInt>(Val: I->getOperand(i: `1`));
1671	if (!Msb \|\| !Msb->getValue().isSignMask())
1672	return nullptr;
1673	if (!hasZeroSignBit(V: I->getOperand(i: `0`)))
1674	return nullptr;
1675	return IRBuilder<>(Ctx).CreateXor(LHS: I->getOperand(i: `0`), RHS: Msb);
1676	});
1677	S.addRule(N: "sink lshr into binop",
1678	// (lshr (BitOp x y) c) -> (BitOp (lshr x c) (lshr y c))
1679	F: [](Instruction I, LLVMContext &Ctx) -> Value {
1680	if (I->getOpcode() != Instruction::LShr)
1681	return nullptr;
1682	BinaryOperator *BitOp = dyn_cast<BinaryOperator>(Val: I->getOperand(i: `0`));
1683	if (!BitOp)
1684	return nullptr;
1685	switch (BitOp->getOpcode()) {
1686	case Instruction::And:
1687	case Instruction::Or:
1688	case Instruction::Xor:
1689	break;
1690	default:
1691	return nullptr;
1692	}
1693	IRBuilder<> B(Ctx);
1694	Value *S = I->getOperand(i: `1`);
1695	return B.CreateBinOp(Opc: BitOp->getOpcode(),
1696	LHS: B.CreateLShr(LHS: BitOp->getOperand(i_nocapture: `0`), RHS: S),
1697	RHS: B.CreateLShr(LHS: BitOp->getOperand(i_nocapture: `1`), RHS: S));
1698	});
1699	S.addRule(N: "expose bitop-const",
1700	// (BitOp1 (BitOp2 x a) b) -> (BitOp2 x (BitOp1 a b))
1701	F: [](Instruction I, LLVMContext &Ctx) -> Value {
1702	auto IsBitOp = [](unsigned Op) -> bool {
1703	switch (Op) {
1704	case Instruction::And:
1705	case Instruction::Or:
1706	case Instruction::Xor:
1707	return true;
1708	}
1709	return false;
1710	};
1711	BinaryOperator *BitOp1 = dyn_cast<BinaryOperator>(Val: I);
1712	if (!BitOp1 \|\| !IsBitOp(BitOp1->getOpcode()))
1713	return nullptr;
1714	BinaryOperator *BitOp2 = dyn_cast<BinaryOperator>(Val: BitOp1->getOperand(i_nocapture: `0`));
1715	if (!BitOp2 \|\| !IsBitOp(BitOp2->getOpcode()))
1716	return nullptr;
1717	ConstantInt *CA = dyn_cast<ConstantInt>(Val: BitOp2->getOperand(i_nocapture: `1`));
1718	ConstantInt *CB = dyn_cast<ConstantInt>(Val: BitOp1->getOperand(i_nocapture: `1`));
1719	if (!CA \|\| !CB)
1720	return nullptr;
1721	IRBuilder<> B(Ctx);
1722	Value *X = BitOp2->getOperand(i_nocapture: `0`);
1723	return B.CreateBinOp(Opc: BitOp2->getOpcode(), LHS: X,
1724	RHS: B.CreateBinOp(Opc: BitOp1->getOpcode(), LHS: CA, RHS: CB));
1725	});
1726	}
1727
1728	void PolynomialMultiplyRecognize::setupPostSimplifier(Simplifier &S) {
1729	S.addRule(N: "(and (xor (and x a) y) b) -> (and (xor x y) b), if b == b&a",
1730	F: [](Instruction I, LLVMContext &Ctx) -> Value {
1731	if (I->getOpcode() != Instruction::And)
1732	return nullptr;
1733	Instruction *Xor = dyn_cast<Instruction>(Val: I->getOperand(i: `0`));
1734	ConstantInt *C0 = dyn_cast<ConstantInt>(Val: I->getOperand(i: `1`));
1735	if (!Xor \|\| !C0)
1736	return nullptr;
1737	if (Xor->getOpcode() != Instruction::Xor)
1738	return nullptr;
1739	Instruction *And0 = dyn_cast<Instruction>(Val: Xor->getOperand(i: `0`));
1740	Instruction *And1 = dyn_cast<Instruction>(Val: Xor->getOperand(i: `1`));
1741	// Pick the first non-null and.
1742	if (!And0 \|\| And0->getOpcode() != Instruction::And)
1743	std::swap(a&: And0, b&: And1);
1744	ConstantInt *C1 = dyn_cast<ConstantInt>(Val: And0->getOperand(i: `1`));
1745	if (!C1)
1746	return nullptr;
1747	uint32_t V0 = C0->getZExtValue();
1748	uint32_t V1 = C1->getZExtValue();
1749	if (V0 != (V0 & V1))
1750	return nullptr;
1751	IRBuilder<> B(Ctx);
1752	return B.CreateAnd(LHS: B.CreateXor(LHS: And0->getOperand(i: `0`), RHS: And1), RHS: C0);
1753	});
1754	}
1755
1756	bool PolynomialMultiplyRecognize::recognize() {
1757	LLVM_DEBUG(dbgs() << "Starting PolynomialMultiplyRecognize on loop\n"
1758	<< *CurLoop << `'\n'`);
1759	// Restrictions:
1760	// - The loop must consist of a single block.
1761	// - The iteration count must be known at compile-time.
1762	// - The loop must have an induction variable starting from 0, and
1763	// incremented in each iteration of the loop.
1764	BasicBlock *LoopB = CurLoop->getHeader();
1765	LLVM_DEBUG(dbgs() << "Loop header:\n" << *LoopB);
1766
1767	if (LoopB != CurLoop->getLoopLatch())
1768	return false;
1769	BasicBlock *ExitB = CurLoop->getExitBlock();
1770	if (ExitB == nullptr)
1771	return false;
1772	BasicBlock *EntryB = CurLoop->getLoopPreheader();
1773	if (EntryB == nullptr)
1774	return false;
1775
1776	unsigned IterCount = `0`;
1777	const SCEV *CT = SE.getBackedgeTakenCount(L: CurLoop);
1778	if (isa<SCEVCouldNotCompute>(Val: CT))
1779	return false;
1780	if (auto *CV = dyn_cast<SCEVConstant>(Val: CT))
1781	IterCount = CV->getValue()->getZExtValue() + `1`;
1782
1783	Value *CIV = getCountIV(BB: LoopB);
1784	if (CIV == nullptr)
1785	return false;
1786	ParsedValues PV;
1787	Simplifier PreSimp;
1788	PV.IterCount = IterCount;
1789	LLVM_DEBUG(dbgs() << "Loop IV: " << *CIV << "\nIterCount: " << IterCount
1790	<< `'\n'`);
1791
1792	setupPreSimplifier(PreSimp);
1793
1794	// Perform a preliminary scan of select instructions to see if any of them
1795	// looks like a generator of the polynomial multiply steps. Assume that a
1796	// loop can only contain a single transformable operation, so stop the
1797	// traversal after the first reasonable candidate was found.
1798	// XXX: Currently this approach can modify the loop before being 100% sure
1799	// that the transformation can be carried out.
1800	bool FoundPreScan = false;
1801	auto FeedsPHI = [LoopB](const Value V) -> bool* {
1802	for (const Value *U : V->users()) {
1803	if (const auto P = dyn_cast<const* PHINode>(Val: U))
1804	if (P->getParent() == LoopB)
1805	return true;
1806	}
1807	return false;
1808	};
1809	for (Instruction &In : *LoopB) {
1810	SelectInst *SI = dyn_cast<SelectInst>(Val: &In);
1811	if (!SI \|\| !FeedsPHI (SI))
1812	continue;
1813
1814	Simplifier::Context C(SI);
1815	Value *T = PreSimp.simplify(C);
1816	SelectInst *SelI = (T && isa<SelectInst>(Val: T)) ? cast<SelectInst>(Val: T) : SI;
1817	LLVM_DEBUG(dbgs() << "scanSelect(pre-scan): " << PE(C, SelI) << `'\n'`);
1818	if (scanSelect(SelI, LoopB, PrehB: EntryB, CIV, PV, PreScan: true)) {
1819	FoundPreScan = true;
1820	if (SelI != SI) {
1821	Value *NewSel = C.materialize(B: LoopB, At: SI->getIterator());
1822	SI->replaceAllUsesWith(V: NewSel);
1823	RecursivelyDeleteTriviallyDeadInstructions(V: SI, TLI: &TLI);
1824	}
1825	break;
1826	}
1827	}
1828
1829	if (!FoundPreScan) {
1830	LLVM_DEBUG(dbgs() << "Have not found candidates for pmpy\n");
1831	return false;
1832	}
1833
1834	if (!PV.Left) {
1835	// The right shift version actually only returns the higher bits of
1836	// the result (each iteration discards the LSB). If we want to convert it
1837	// to a left-shifting loop, the working data type must be at least as
1838	// wide as the target's pmpy instruction.
1839	if (!promoteTypes(LoopB, ExitB))
1840	return false;
1841	// Run post-promotion simplifications.
1842	Simplifier PostSimp;
1843	setupPostSimplifier(PostSimp);
1844	for (Instruction &In : *LoopB) {
1845	SelectInst *SI = dyn_cast<SelectInst>(Val: &In);
1846	if (!SI \|\| !FeedsPHI (SI))
1847	continue;
1848	Simplifier::Context C(SI);
1849	Value *T = PostSimp.simplify(C);
1850	SelectInst *SelI = dyn_cast_or_null<SelectInst>(Val: T);
1851	if (SelI != SI) {
1852	Value *NewSel = C.materialize(B: LoopB, At: SI->getIterator());
1853	SI->replaceAllUsesWith(V: NewSel);
1854	RecursivelyDeleteTriviallyDeadInstructions(V: SI, TLI: &TLI);
1855	}
1856	break;
1857	}
1858
1859	if (!convertShiftsToLeft(LoopB, ExitB, IterCount))
1860	return false;
1861	cleanupLoopBody(LoopB);
1862	}
1863
1864	// Scan the loop again, find the generating select instruction.
1865	bool FoundScan = false;
1866	for (Instruction &In : *LoopB) {
1867	SelectInst *SelI = dyn_cast<SelectInst>(Val: &In);
1868	if (!SelI)
1869	continue;
1870	LLVM_DEBUG(dbgs() << "scanSelect: " << *SelI << `'\n'`);
1871	FoundScan = scanSelect(SelI, LoopB, PrehB: EntryB, CIV, PV, PreScan: false);
1872	if (FoundScan)
1873	break;
1874	}
1875	assert(FoundScan);
1876
1877	LLVM_DEBUG({
1878	StringRef PP = (PV.M ? "(P+M)" : "P");
1879	if (!PV.Inv)
1880	dbgs() << "Found pmpy idiom: R = " << PP << ".Q\n";
1881	else
1882	dbgs() << "Found inverse pmpy idiom: R = (" << PP << "/Q).Q) + "
1883	<< PP << "\n";
1884	dbgs() << " Res:" << PV.Res << "\n P:" << PV.P << "\n";
1885	if (PV.M)
1886	dbgs() << " M:" << *PV.M << "\n";
1887	dbgs() << " Q:" << *PV.Q << "\n";
1888	dbgs() << " Iteration count:" << PV.IterCount << "\n";
1889	});
1890
1891	BasicBlock::iterator At(EntryB->getTerminator());
1892	Value *PM = generate(At, PV);
1893	if (PM == nullptr)
1894	return false;
1895
1896	if (PM->getType() != PV.Res->getType())
1897	PM = IRBuilder<>(&At).CreateIntCast(V: PM, DestTy: PV.Res->getType(), isSigned: false*);
1898
1899	PV.Res->replaceAllUsesWith(V: PM);
1900	PV.Res->eraseFromParent();
1901	return true;
1902	}
1903
1904	int HexagonLoopIdiomRecognize::getSCEVStride(const SCEVAddRecExpr *S) {
1905	if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Val: S->getOperand(i: `1`)))
1906	return SC->getAPInt().getSExtValue();
1907	return `0`;
1908	}
1909
1910	bool HexagonLoopIdiomRecognize::isLegalStore(Loop CurLoop, StoreInst SI) {
1911	// Allow volatile stores if HexagonVolatileMemcpy is enabled.
1912	if (!(SI->isVolatile() && HexagonVolatileMemcpy) && !SI->isSimple())
1913	return false;
1914
1915	Value *StoredVal = SI->getValueOperand();
1916	Value *StorePtr = SI->getPointerOperand();
1917
1918	// Reject stores that are so large that they overflow an unsigned.
1919	uint64_t SizeInBits = DL->getTypeSizeInBits(Ty: StoredVal->getType());
1920	if ((SizeInBits & `7`) \|\| (SizeInBits >> `32`) != `0`)
1921	return false;
1922
1923	// See if the pointer expression is an AddRec like {base,+,1} on the current
1924	// loop, which indicates a strided store. If we have something else, it's a
1925	// random store we can't handle.
1926	auto *StoreEv = dyn_cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: StorePtr));
1927	if (!StoreEv \|\| StoreEv->getLoop() != CurLoop \|\| !StoreEv->isAffine())
1928	return false;
1929
1930	// Check to see if the stride matches the size of the store. If so, then we
1931	// know that every byte is touched in the loop.
1932	int Stride = getSCEVStride(S: StoreEv);
1933	if (Stride == `0`)
1934	return false;
1935	unsigned StoreSize = DL->getTypeStoreSize(Ty: SI->getValueOperand()->getType());
1936	if (StoreSize != unsigned(std::abs(x: Stride)))
1937	return false;
1938
1939	// The store must be feeding a non-volatile load.
1940	LoadInst *LI = dyn_cast<LoadInst>(Val: SI->getValueOperand());
1941	if (!LI \|\| !LI->isSimple())
1942	return false;
1943
1944	// See if the pointer expression is an AddRec like {base,+,1} on the current
1945	// loop, which indicates a strided load. If we have something else, it's a
1946	// random load we can't handle.
1947	Value *LoadPtr = LI->getPointerOperand();
1948	auto *LoadEv = dyn_cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: LoadPtr));
1949	if (!LoadEv \|\| LoadEv->getLoop() != CurLoop \|\| !LoadEv->isAffine())
1950	return false;
1951
1952	// The store and load must share the same stride.
1953	if (StoreEv->getOperand(i: `1`) != LoadEv->getOperand(i: `1`))
1954	return false;
1955
1956	// Success. This store can be converted into a memcpy.
1957	return true;
1958	}
1959
1960	/// mayLoopAccessLocation - Return true if the specified loop might access the
1961	/// specified pointer location, which is a loop-strided access. The 'Access'
1962	/// argument specifies what the verboten forms of access are (read or write).
1963	static bool
1964	mayLoopAccessLocation(Value Ptr, ModRefInfo Access, Loop L,
1965	const SCEV BECount, unsigned* StoreSize,
1966	AliasAnalysis &AA,
1967	SmallPtrSetImpl<Instruction *> &Ignored) {
1968	// Get the location that may be stored across the loop. Since the access
1969	// is strided positively through memory, we say that the modified location
1970	// starts at the pointer and has infinite size.
1971	LocationSize AccessSize = LocationSize::afterPointer();
1972
1973	// If the loop iterates a fixed number of times, we can refine the access
1974	// size to be exactly the size of the memset, which is (BECount+1)StoreSize*
1975	if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(Val: BECount))
1976	AccessSize = LocationSize::precise(Value: (BECst->getValue()->getZExtValue() + `1`) *
1977	StoreSize);
1978
1979	// TODO: For this to be really effective, we have to dive into the pointer
1980	// operand in the store. Store to &A[i] of 100 will always return may alias
1981	// with store of &A[100], we need to StoreLoc to be "A" with size of 100,
1982	// which will then no-alias a store to &A[100].
1983	MemoryLocation StoreLoc(Ptr, AccessSize);
1984
1985	for (auto *B : L->blocks())
1986	for (auto &I : *B)
1987	if (Ignored.count(Ptr: &I) == `0` &&
1988	isModOrRefSet(MRI: AA.getModRefInfo(I: &I, OptLoc: StoreLoc) & Access))
1989	return true;
1990
1991	return false;
1992	}
1993
1994	void HexagonLoopIdiomRecognize::collectStores(Loop CurLoop, BasicBlock BB,
1995	SmallVectorImpl<StoreInst*> &Stores) {
1996	Stores.clear();
1997	for (Instruction &I : *BB)
1998	if (StoreInst *SI = dyn_cast<StoreInst>(Val: &I))
1999	if (isLegalStore(CurLoop, SI))
2000	Stores.push_back(Elt: SI);
2001	}
2002
2003	bool HexagonLoopIdiomRecognize::processCopyingStore(Loop *CurLoop,
2004	StoreInst SI, const* SCEV *BECount) {
2005	assert((SI->isSimple() \|\| (SI->isVolatile() && HexagonVolatileMemcpy)) &&
2006	"Expected only non-volatile stores, or Hexagon-specific memcpy"
2007	"to volatile destination.");
2008
2009	Value *StorePtr = SI->getPointerOperand();
2010	auto *StoreEv = cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: StorePtr));
2011	unsigned Stride = getSCEVStride(S: StoreEv);
2012	unsigned StoreSize = DL->getTypeStoreSize(Ty: SI->getValueOperand()->getType());
2013	if (Stride != StoreSize)
2014	return false;
2015
2016	// See if the pointer expression is an AddRec like {base,+,1} on the current
2017	// loop, which indicates a strided load. If we have something else, it's a
2018	// random load we can't handle.
2019	auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
2020	auto *LoadEv = cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: LI->getPointerOperand()));
2021
2022	// The trip count of the loop and the base pointer of the addrec SCEV is
2023	// guaranteed to be loop invariant, which means that it should dominate the
2024	// header. This allows us to insert code for it in the preheader.
2025	BasicBlock *Preheader = CurLoop->getLoopPreheader();
2026	Instruction *ExpPt = Preheader->getTerminator();
2027	IRBuilder<> Builder(ExpPt);
2028	SCEVExpander Expander(SE, DL, "hexagon-loop-idiom");
2029
2030	Type IntPtrTy = Builder.getIntPtrTy(DL: DL, AddrSpace: SI->getPointerAddressSpace());
2031
2032	// Okay, we have a strided store "p[i]" of a loaded value. We can turn
2033	// this into a memcpy/memmove in the loop preheader now if we want. However,
2034	// this would be unsafe to do if there is anything else in the loop that may
2035	// read or write the memory region we're storing to. For memcpy, this
2036	// includes the load that feeds the stores. Check for an alias by generating
2037	// the base address and checking everything.
2038	Value *StoreBasePtr = Expander.expandCodeFor(SH: StoreEv->getStart(),
2039	Ty: Builder.getPtrTy(AddrSpace: SI->getPointerAddressSpace()), I: ExpPt);
2040	Value LoadBasePtr = nullptr*;
2041
2042	bool Overlap = false;
2043	bool DestVolatile = SI->isVolatile();
2044	Type *BECountTy = BECount->getType();
2045
2046	if (DestVolatile) {
2047	// The trip count must fit in i32, since it is the type of the "num_words"
2048	// argument to hexagon_memcpy_forward_vp4cp4n2.
2049	if (StoreSize != `4` \|\| DL->getTypeSizeInBits(Ty: BECountTy) > `32`) {
2050	CleanupAndExit:
2051	// If we generated new code for the base pointer, clean up.
2052	Expander.clear();
2053	if (StoreBasePtr && (LoadBasePtr != StoreBasePtr)) {
2054	RecursivelyDeleteTriviallyDeadInstructions(V: StoreBasePtr, TLI);
2055	StoreBasePtr = nullptr;
2056	}
2057	if (LoadBasePtr) {
2058	RecursivelyDeleteTriviallyDeadInstructions(V: LoadBasePtr, TLI);
2059	LoadBasePtr = nullptr;
2060	}
2061	return false;
2062	}
2063	}
2064
2065	SmallPtrSet<Instruction*, `2`> Ignore1;
2066	Ignore1.insert(Ptr: SI);
2067	if (mayLoopAccessLocation(Ptr: StoreBasePtr, Access: ModRefInfo::ModRef, L: CurLoop, BECount,
2068	StoreSize, AA&: *AA, Ignored&: Ignore1)) {
2069	// Check if the load is the offending instruction.
2070	Ignore1.insert(Ptr: LI);
2071	if (mayLoopAccessLocation(Ptr: StoreBasePtr, Access: ModRefInfo::ModRef, L: CurLoop,
2072	BECount, StoreSize, AA&: *AA, Ignored&: Ignore1)) {
2073	// Still bad. Nothing we can do.
2074	goto CleanupAndExit;
2075	}
2076	// It worked with the load ignored.
2077	Overlap = true;
2078	}
2079
2080	if (!Overlap) {
2081	if (DisableMemcpyIdiom \|\| !HasMemcpy)
2082	goto CleanupAndExit;
2083	} else {
2084	// Don't generate memmove if this function will be inlined. This is
2085	// because the caller will undergo this transformation after inlining.
2086	Function *Func = CurLoop->getHeader()->getParent();
2087	if (Func->hasFnAttribute(Kind: Attribute::AlwaysInline))
2088	goto CleanupAndExit;
2089
2090	// In case of a memmove, the call to memmove will be executed instead
2091	// of the loop, so we need to make sure that there is nothing else in
2092	// the loop than the load, store and instructions that these two depend
2093	// on.
2094	SmallVector<Instruction*,`2`> Insts;
2095	Insts.push_back(Elt: SI);
2096	Insts.push_back(Elt: LI);
2097	if (!coverLoop(L: CurLoop, Insts))
2098	goto CleanupAndExit;
2099
2100	if (DisableMemmoveIdiom \|\| !HasMemmove)
2101	goto CleanupAndExit;
2102	bool IsNested = CurLoop->getParentLoop() != nullptr;
2103	if (IsNested && OnlyNonNestedMemmove)
2104	goto CleanupAndExit;
2105	}
2106
2107	// For a memcpy, we have to make sure that the input array is not being
2108	// mutated by the loop.
2109	LoadBasePtr = Expander.expandCodeFor(SH: LoadEv->getStart(),
2110	Ty: Builder.getPtrTy(AddrSpace: LI->getPointerAddressSpace()), I: ExpPt);
2111
2112	SmallPtrSet<Instruction*, `2`> Ignore2;
2113	Ignore2.insert(Ptr: SI);
2114	if (mayLoopAccessLocation(Ptr: LoadBasePtr, Access: ModRefInfo::Mod, L: CurLoop, BECount,
2115	StoreSize, AA&: *AA, Ignored&: Ignore2))
2116	goto CleanupAndExit;
2117
2118	// Check the stride.
2119	bool StridePos = getSCEVStride(S: LoadEv) >= `0`;
2120
2121	// Currently, the volatile memcpy only emulates traversing memory forward.
2122	if (!StridePos && DestVolatile)
2123	goto CleanupAndExit;
2124
2125	bool RuntimeCheck = (Overlap \|\| DestVolatile);
2126
2127	BasicBlock *ExitB;
2128	if (RuntimeCheck) {
2129	// The runtime check needs a single exit block.
2130	SmallVector<BasicBlock*, `8`> ExitBlocks;
2131	CurLoop->getUniqueExitBlocks(ExitBlocks);
2132	if (ExitBlocks.size() != `1`)
2133	goto CleanupAndExit;
2134	ExitB = ExitBlocks [`0`];
2135	}
2136
2137	// The # stored bytes is (BECount+1)Size. Expand the trip count out to*
2138	// pointer size if it isn't already.
2139	LLVMContext &Ctx = SI->getContext();
2140	BECount = SE->getTruncateOrZeroExtend(V: BECount, Ty: IntPtrTy);
2141	DebugLoc DLoc = SI->getDebugLoc();
2142
2143	const SCEV *NumBytesS =
2144	SE->getAddExpr(LHS: BECount, RHS: SE->getOne(Ty: IntPtrTy), Flags: SCEV::FlagNUW);
2145	if (StoreSize != `1`)
2146	NumBytesS = SE->getMulExpr(LHS: NumBytesS, RHS: SE->getConstant(Ty: IntPtrTy, V: StoreSize),
2147	Flags: SCEV::FlagNUW);
2148	Value *NumBytes = Expander.expandCodeFor(SH: NumBytesS, Ty: IntPtrTy, I: ExpPt);
2149	if (Instruction *In = dyn_cast<Instruction>(Val: NumBytes))
2150	if (Value Simp = simplifyInstruction(I: In, Q: {DL, TLI, DT}))
2151	NumBytes = Simp;
2152
2153	CallInst *NewCall;
2154
2155	if (RuntimeCheck) {
2156	unsigned Threshold = RuntimeMemSizeThreshold;
2157	if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: NumBytes)) {
2158	uint64_t C = CI->getZExtValue();
2159	if (Threshold != `0` && C < Threshold)
2160	goto CleanupAndExit;
2161	if (C < CompileTimeMemSizeThreshold)
2162	goto CleanupAndExit;
2163	}
2164
2165	BasicBlock *Header = CurLoop->getHeader();
2166	Function *Func = Header->getParent();
2167	Loop *ParentL = LF->getLoopFor(BB: Preheader);
2168	StringRef HeaderName = Header->getName();
2169
2170	// Create a new (empty) preheader, and update the PHI nodes in the
2171	// header to use the new preheader.
2172	BasicBlock *NewPreheader = BasicBlock::Create(Context&: Ctx, Name: HeaderName +".rtli.ph",
2173	Parent: Func, InsertBefore: Header);
2174	if (ParentL)
2175	ParentL->addBasicBlockToLoop(NewBB: NewPreheader, LI&: *LF);
2176	IRBuilder<>(NewPreheader).CreateBr(Dest: Header);
2177	for (auto &In : *Header) {
2178	PHINode *PN = dyn_cast<PHINode>(Val: &In);
2179	if (!PN)
2180	break;
2181	int bx = PN->getBasicBlockIndex(BB: Preheader);
2182	if (bx >= `0`)
2183	PN->setIncomingBlock(i: bx, BB: NewPreheader);
2184	}
2185	DT->addNewBlock(BB: NewPreheader, DomBB: Preheader);
2186	DT->changeImmediateDominator(BB: Header, NewBB: NewPreheader);
2187
2188	// Check for safe conditions to execute memmove.
2189	// If stride is positive, copying things from higher to lower addresses
2190	// is equivalent to memmove. For negative stride, it's the other way
2191	// around. Copying forward in memory with positive stride may not be
2192	// same as memmove since we may be copying values that we just stored
2193	// in some previous iteration.
2194	Value *LA = Builder.CreatePtrToInt(V: LoadBasePtr, DestTy: IntPtrTy);
2195	Value *SA = Builder.CreatePtrToInt(V: StoreBasePtr, DestTy: IntPtrTy);
2196	Value *LowA = StridePos ? SA : LA;
2197	Value *HighA = StridePos ? LA : SA;
2198	Value *CmpA = Builder.CreateICmpULT(LHS: LowA, RHS: HighA);
2199	Value *Cond = CmpA;
2200
2201	// Check for distance between pointers. Since the case LowA < HighA
2202	// is checked for above, assume LowA >= HighA.
2203	Value *Dist = Builder.CreateSub(LHS: LowA, RHS: HighA);
2204	Value *CmpD = Builder.CreateICmpSLE(LHS: NumBytes, RHS: Dist);
2205	Value *CmpEither = Builder.CreateOr(LHS: Cond, RHS: CmpD);
2206	Cond = CmpEither;
2207
2208	if (Threshold != `0`) {
2209	Type *Ty = NumBytes->getType();
2210	Value *Thr = ConstantInt::get(Ty, V: Threshold);
2211	Value *CmpB = Builder.CreateICmpULT(LHS: Thr, RHS: NumBytes);
2212	Value *CmpBoth = Builder.CreateAnd(LHS: Cond, RHS: CmpB);
2213	Cond = CmpBoth;
2214	}
2215	BasicBlock *MemmoveB = BasicBlock::Create(Context&: Ctx, Name: Header->getName()+".rtli",
2216	Parent: Func, InsertBefore: NewPreheader);
2217	if (ParentL)
2218	ParentL->addBasicBlockToLoop(NewBB: MemmoveB, LI&: *LF);
2219	Instruction *OldT = Preheader->getTerminator();
2220	Builder.CreateCondBr(Cond, True: MemmoveB, False: NewPreheader);
2221	OldT->eraseFromParent();
2222	Preheader->setName(Preheader->getName()+".old");
2223	DT->addNewBlock(BB: MemmoveB, DomBB: Preheader);
2224	// Find the new immediate dominator of the exit block.
2225	BasicBlock *ExitD = Preheader;
2226	for (BasicBlock *PB : predecessors(BB: ExitB)) {
2227	ExitD = DT->findNearestCommonDominator(A: ExitD, B: PB);
2228	if (!ExitD)
2229	break;
2230	}
2231	// If the prior immediate dominator of ExitB was dominated by the
2232	// old preheader, then the old preheader becomes the new immediate
2233	// dominator. Otherwise don't change anything (because the newly
2234	// added blocks are dominated by the old preheader).
2235	if (ExitD && DT->dominates(A: Preheader, B: ExitD)) {
2236	DomTreeNode *BN = DT->getNode(BB: ExitB);
2237	DomTreeNode *DN = DT->getNode(BB: ExitD);
2238	BN->setIDom(DN);
2239	}
2240
2241	// Add a call to memmove to the conditional block.
2242	IRBuilder<> CondBuilder(MemmoveB);
2243	CondBuilder.CreateBr(Dest: ExitB);
2244	CondBuilder.SetInsertPoint(MemmoveB->getTerminator());
2245
2246	if (DestVolatile) {
2247	Type *Int32Ty = Type::getInt32Ty(C&: Ctx);
2248	Type *PtrTy = PointerType::get(C&: Ctx, AddressSpace: `0`);
2249	Type *VoidTy = Type::getVoidTy(C&: Ctx);
2250	Module *M = Func->getParent();
2251	FunctionCallee Fn = M->getOrInsertFunction(
2252	Name: HexagonVolatileMemcpyName, RetTy: VoidTy, Args: PtrTy, Args: PtrTy, Args: Int32Ty);
2253
2254	const SCEV *OneS = SE->getConstant(Ty: Int32Ty, V: `1`);
2255	const SCEV *BECount32 = SE->getTruncateOrZeroExtend(V: BECount, Ty: Int32Ty);
2256	const SCEV *NumWordsS = SE->getAddExpr(LHS: BECount32, RHS: OneS, Flags: SCEV::FlagNUW);
2257	Value *NumWords = Expander.expandCodeFor(SH: NumWordsS, Ty: Int32Ty,
2258	I: MemmoveB->getTerminator());
2259	if (Instruction *In = dyn_cast<Instruction>(Val: NumWords))
2260	if (Value Simp = simplifyInstruction(I: In, Q: {DL, TLI, DT}))
2261	NumWords = Simp;
2262
2263	NewCall = CondBuilder.CreateCall(Callee: Fn,
2264	Args: {StoreBasePtr, LoadBasePtr, NumWords});
2265	} else {
2266	NewCall = CondBuilder.CreateMemMove(
2267	Dst: StoreBasePtr, DstAlign: SI->getAlign(), Src: LoadBasePtr, SrcAlign: LI->getAlign(), Size: NumBytes);
2268	}
2269	} else {
2270	NewCall = Builder.CreateMemCpy(Dst: StoreBasePtr, DstAlign: SI->getAlign(), Src: LoadBasePtr,
2271	SrcAlign: LI->getAlign(), Size: NumBytes);
2272	// Okay, the memcpy has been formed. Zap the original store and
2273	// anything that feeds into it.
2274	RecursivelyDeleteTriviallyDeadInstructions(V: SI, TLI);
2275	}
2276
2277	NewCall->setDebugLoc(DLoc);
2278
2279	LLVM_DEBUG(dbgs() << " Formed " << (Overlap ? "memmove: " : "memcpy: ")
2280	<< *NewCall << "\n"
2281	<< " from load ptr=" << LoadEv << " at: " << LI << "\n"
2282	<< " from store ptr=" << StoreEv << " at: " << SI
2283	<< "\n");
2284
2285	return true;
2286	}
2287
2288	// Check if the instructions in Insts, together with their dependencies
2289	// cover the loop in the sense that the loop could be safely eliminated once
2290	// the instructions in Insts are removed.
2291	bool HexagonLoopIdiomRecognize::coverLoop(Loop *L,
2292	SmallVectorImpl<Instruction> &Insts) const* {
2293	SmallSet<BasicBlock*,`8`> LoopBlocks;
2294	LoopBlocks.insert_range(R: L->blocks());
2295
2296	SetVector<Instruction *> Worklist(llvm::from_range, Insts);
2297
2298	// Collect all instructions from the loop that the instructions in Insts
2299	// depend on (plus their dependencies, etc.). These instructions will
2300	// constitute the expression trees that feed those in Insts, but the trees
2301	// will be limited only to instructions contained in the loop.
2302	for (unsigned i = `0`; i < Worklist.size(); ++i) {
2303	Instruction *In = Worklist [i];
2304	for (auto I = In->op_begin(), E = In->op_end(); I != E; ++I) {
2305	Instruction *OpI = dyn_cast<Instruction>(Val: I);
2306	if (!OpI)
2307	continue;
2308	BasicBlock *PB = OpI->getParent();
2309	if (!LoopBlocks.count(Ptr: PB))
2310	continue;
2311	Worklist.insert(X: OpI);
2312	}
2313	}
2314
2315	// Scan all instructions in the loop, if any of them have a user outside
2316	// of the loop, or outside of the expressions collected above, then either
2317	// the loop has a side-effect visible outside of it, or there are
2318	// instructions in it that are not involved in the original set Insts.
2319	for (auto *B : L->blocks()) {
2320	for (auto &In : *B) {
2321	if (isa<BranchInst>(Val: In))
2322	continue;
2323	if (!Worklist.count(key: &In) && In.mayHaveSideEffects())
2324	return false;
2325	for (auto *K : In.users()) {
2326	Instruction *UseI = dyn_cast<Instruction>(Val: K);
2327	if (!UseI)
2328	continue;
2329	BasicBlock *UseB = UseI->getParent();
2330	if (LF->getLoopFor(BB: UseB) != L)
2331	return false;
2332	}
2333	}
2334	}
2335
2336	return true;
2337	}
2338
2339	/// runOnLoopBlock - Process the specified block, which lives in a counted loop
2340	/// with the specified backedge count. This block is known to be in the current
2341	/// loop and not in any subloops.
2342	bool HexagonLoopIdiomRecognize::runOnLoopBlock(Loop CurLoop, BasicBlock BB,
2343	const SCEV BECount, SmallVectorImpl<BasicBlock> &ExitBlocks) {
2344	// We can only promote stores in this block if they are unconditionally
2345	// executed in the loop. For a block to be unconditionally executed, it has
2346	// to dominate all the exit blocks of the loop. Verify this now.
2347	auto DominatedByBB = [this,BB] (BasicBlock EB) -> bool* {
2348	return DT->dominates(A: BB, B: EB);
2349	};
2350	if (!all_of(Range&: ExitBlocks, P: DominatedByBB))
2351	return false;
2352
2353	bool MadeChange = false;
2354	// Look for store instructions, which may be optimized to memset/memcpy.
2355	SmallVector<StoreInst*,`8`> Stores;
2356	collectStores(CurLoop, BB, Stores);
2357
2358	// Optimize the store into a memcpy, if it feeds an similarly strided load.
2359	for (auto &SI : Stores)
2360	MadeChange \|= processCopyingStore(CurLoop, SI, BECount);
2361
2362	return MadeChange;
2363	}
2364
2365	bool HexagonLoopIdiomRecognize::runOnCountableLoop(Loop *L) {
2366	PolynomialMultiplyRecognize PMR(L, DL, DT, TLI, SE);
2367	if (PMR.recognize())
2368	return true;
2369
2370	if (!HasMemcpy && !HasMemmove)
2371	return false;
2372
2373	const SCEV *BECount = SE->getBackedgeTakenCount(L);
2374	assert(!isa<SCEVCouldNotCompute>(BECount) &&
2375	"runOnCountableLoop() called on a loop without a predictable"
2376	"backedge-taken count");
2377
2378	SmallVector<BasicBlock *, `8`> ExitBlocks;
2379	L->getUniqueExitBlocks(ExitBlocks);
2380
2381	bool Changed = false;
2382
2383	// Scan all the blocks in the loop that are not in subloops.
2384	for (auto *BB : L->getBlocks()) {
2385	// Ignore blocks in subloops.
2386	if (LF->getLoopFor(BB) != L)
2387	continue;
2388	Changed \|= runOnLoopBlock(CurLoop: L, BB, BECount, ExitBlocks);
2389	}
2390
2391	return Changed;
2392	}
2393
2394	bool HexagonLoopIdiomRecognize::run(Loop *L) {
2395	const Module &M = *L->getHeader()->getParent()->getParent();
2396	if (M.getTargetTriple().getArch() != Triple::hexagon)
2397	return false;
2398
2399	// If the loop could not be converted to canonical form, it must have an
2400	// indirectbr in it, just give up.
2401	if (!L->getLoopPreheader())
2402	return false;
2403
2404	// Disable loop idiom recognition if the function's name is a common idiom.
2405	StringRef Name = L->getHeader()->getParent()->getName();
2406	if (Name == "memset" \|\| Name == "memcpy" \|\| Name == "memmove")
2407	return false;
2408
2409	DL = &L->getHeader()->getDataLayout();
2410
2411	HasMemcpy = TLI->has(F: LibFunc_memcpy);
2412	HasMemmove = TLI->has(F: LibFunc_memmove);
2413
2414	if (SE->hasLoopInvariantBackedgeTakenCount(L))
2415	return runOnCountableLoop(L);
2416	return false;
2417	}
2418
2419	bool HexagonLoopIdiomRecognizeLegacyPass::runOnLoop(Loop *L,
2420	LPPassManager &LPM) {
2421	if (skipLoop(L))
2422	return false;
2423
2424	auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2425	auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2426	auto *LF = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2427	auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
2428	F: *L->getHeader()->getParent());
2429	auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2430	return HexagonLoopIdiomRecognize (AA, DT, LF, TLI, SE).run(L);
2431	}
2432
2433	Pass *llvm::createHexagonLoopIdiomPass() {
2434	return new HexagonLoopIdiomRecognizeLegacyPass ();
2435	}
2436
2437	PreservedAnalyses
2438	HexagonLoopIdiomRecognitionPass::run(Loop &L, LoopAnalysisManager &AM,
2439	LoopStandardAnalysisResults &AR,
2440	LPMUpdater &U) {
2441	return HexagonLoopIdiomRecognize (&AR.AA, &AR.DT, &AR.LI, &AR.TLI, &AR.SE)
2442	.run(L: &L)
2443	? getLoopPassPreservedAnalyses()
2444	: PreservedAnalyses::all();
2445	}
2446

Browse the source code of llvm_projects/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp