//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements an idiom recognizer that transforms simple loops into a
// non-loop form. In cases that this kicks in, it can be a significant
// performance win.
//
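// As a purely illustrative example (not taken from the pass itself), a loop
// such as
//
//   for (i = 0; i != n; ++i)
//     A[i] = 0;
//
// can be replaced by a single call in the loop preheader:
//
//   memset(A, 0, n * sizeof(A[0]));
//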
// If compiling for code size we avoid idiom recognition if the resulting
// code could be larger than the code for the original loop. One way this could
// happen is if the loop is not removable after idiom recognition due to the
// presence of non-idiom instructions. The initial implementation of the
// heuristics applies to idioms in multi-block loops.
//
//===----------------------------------------------------------------------===//
//
// TODO List:
//
// Future loop memory idioms to recognize: memcmp, etc.
//
// This could recognize common matrix multiplies and dot product idioms and
// replace them with calls to BLAS (if linked in??).
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

using namespace llvm;
using namespace SCEVPatternMatch;

#define DEBUG_TYPE "loop-idiom"

STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
STATISTIC(NumStrLen, "Number of strlen's and wcslen's formed from loop loads");
STATISTIC(
    NumShiftUntilBitTest,
    "Number of uncountable loops recognized as 'shift until bit test' idiom");
STATISTIC(NumShiftUntilZero,
          "Number of uncountable loops recognized as 'shift until zero' idiom");

bool DisableLIRP::All;
static cl::opt<bool, true>
    DisableLIRPAll("disable-" DEBUG_TYPE "-all",
                   cl::desc("Options to disable Loop Idiom Recognize Pass."),
                   cl::location(DisableLIRP::All), cl::init(false),
                   cl::ReallyHidden);

bool DisableLIRP::Memset;
static cl::opt<bool, true>
    DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memset."),
                      cl::location(DisableLIRP::Memset), cl::init(false),
                      cl::ReallyHidden);

bool DisableLIRP::Memcpy;
static cl::opt<bool, true>
    DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memcpy."),
                      cl::location(DisableLIRP::Memcpy), cl::init(false),
                      cl::ReallyHidden);

bool DisableLIRP::Strlen;
static cl::opt<bool, true>
    DisableLIRPStrlen("disable-loop-idiom-strlen",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to strlen."),
                      cl::location(DisableLIRP::Strlen), cl::init(false),
                      cl::ReallyHidden);

bool DisableLIRP::Wcslen;
static cl::opt<bool, true>
    EnableLIRPWcslen("disable-loop-idiom-wcslen",
                     cl::desc("Proceed with loop idiom recognize pass, but do "
                              "not convert loop(s) to wcslen."),
                     cl::location(DisableLIRP::Wcslen), cl::init(false),
                     cl::ReallyHidden);

static cl::opt<bool> UseLIRCodeSizeHeurs(
    "use-lir-code-size-heurs",
    cl::desc("Use loop idiom recognition code size heuristics when compiling "
             "with -Os/-Oz"),
    cl::init(true), cl::Hidden);

namespace {

class LoopIdiomRecognize {
  Loop *CurLoop = nullptr;
  AliasAnalysis *AA;
  DominatorTree *DT;
  LoopInfo *LI;
  ScalarEvolution *SE;
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  const DataLayout *DL;
  OptimizationRemarkEmitter &ORE;
  bool ApplyCodeSizeHeuristics;
  std::unique_ptr<MemorySSAUpdater> MSSAU;

public:
  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
                              LoopInfo *LI, ScalarEvolution *SE,
                              TargetLibraryInfo *TLI,
                              const TargetTransformInfo *TTI, MemorySSA *MSSA,
                              const DataLayout *DL,
                              OptimizationRemarkEmitter &ORE)
      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
    if (MSSA)
      MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  }

  bool runOnLoop(Loop *L);

private:
  using StoreList = SmallVector<StoreInst *, 8>;
  using StoreListMap = MapVector<Value *, StoreList>;

  StoreListMap StoreRefsForMemset;
  StoreListMap StoreRefsForMemsetPattern;
  StoreList StoreRefsForMemcpy;
  bool HasMemset;
  bool HasMemsetPattern;
  bool HasMemcpy;

  /// Return code for isLegalStore()
  enum LegalStoreKind {
    None = 0,
    Memset,
    MemsetPattern,
    Memcpy,
    UnorderedAtomicMemcpy,
    DontUse // Dummy retval never to be used. Allows catching errors in retval
            // handling.
  };

  /// \name Countable Loop Idiom Handling
  /// @{

  bool runOnCountableLoop();
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock *> &ExitBlocks);

  void collectStores(BasicBlock *BB);
  LegalStoreKind isLegalStore(StoreInst *SI);
  enum class ForMemset { No, Yes };
  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                         ForMemset For);

  template <typename MemInst>
  bool processLoopMemIntrinsic(
      BasicBlock *BB,
      bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
      const SCEV *BECount);
  bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

  bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
                               MaybeAlign StoreAlignment, Value *StoredVal,
                               Instruction *TheStore,
                               SmallPtrSetImpl<Instruction *> &Stores,
                               const SCEVAddRecExpr *Ev, const SCEV *BECount,
                               bool IsNegStride, bool IsLoopMemset = false);
  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
                                  const SCEV *StoreSize, MaybeAlign StoreAlign,
                                  MaybeAlign LoadAlign, Instruction *TheStore,
                                  Instruction *TheLoad,
                                  const SCEVAddRecExpr *StoreEv,
                                  const SCEVAddRecExpr *LoadEv,
                                  const SCEV *BECount);
  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                 bool IsLoopMemset = false);

  /// @}
  /// \name Noncountable Loop Idiom Handling
  /// @{

  bool runOnNoncountableLoop();

  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
  bool isProfitableToInsertFFS(Intrinsic::ID IntrinID, Value *InitX,
                               bool ZeroCheck, size_t CanonicalSize);
  bool insertFFSIfProfitable(Intrinsic::ID IntrinID, Value *InitX,
                             Instruction *DefX, PHINode *CntPhi,
                             Instruction *CntInst);
  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
  bool recognizeShiftUntilLessThan();
  void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                Instruction *CntInst, PHINode *CntPhi,
                                Value *Var, Instruction *DefX,
                                const DebugLoc &DL, bool ZeroCheck,
                                bool IsCntPhiUsedOutsideLoop,
                                bool InsertSub = false);

  bool recognizeShiftUntilBitTest();
  bool recognizeShiftUntilZero();
  bool recognizeAndInsertStrLen();

  /// @}
};
} // end anonymous namespace

PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
                                              LoopStandardAnalysisResults &AR,
                                              LPMUpdater &) {
  if (DisableLIRP::All)
    return PreservedAnalyses::all();

  const auto *DL = &L.getHeader()->getDataLayout();

  // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
  // pass. Function analyses need to be preserved across loop transformations
  // but ORE cannot be preserved (see comment before the pass definition).
  OptimizationRemarkEmitter ORE(L.getHeader()->getParent());

  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
                         AR.MSSA, DL, ORE);
  if (!LIR.runOnLoop(&L))
    return PreservedAnalyses::all();

  auto PA = getLoopPassPreservedAnalyses();
  if (AR.MSSA)
    PA.preserve<MemorySSAAnalysis>();
  return PA;
}

static void deleteDeadInstruction(Instruction *I) {
  I->replaceAllUsesWith(PoisonValue::get(I->getType()));
  I->eraseFromParent();
}

//===----------------------------------------------------------------------===//
//
//          Implementation of LoopIdiomRecognize
//
//===----------------------------------------------------------------------===//

bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  CurLoop = L;
  // If the loop could not be converted to canonical form, it must have an
  // indirectbr in it, just give up.
  if (!L->getLoopPreheader())
    return false;

  // Disable loop idiom recognition if the function's name is a common idiom.
  StringRef Name = L->getHeader()->getParent()->getName();
  if (Name == "memset" || Name == "memcpy" || Name == "strlen" ||
      Name == "wcslen")
    return false;

  // Determine if code size heuristics need to be applied.
  ApplyCodeSizeHeuristics =
      L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;

  HasMemset = TLI->has(LibFunc_memset);
  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  HasMemcpy = TLI->has(LibFunc_memcpy);

  if (HasMemset || HasMemsetPattern || HasMemcpy)
    if (SE->hasLoopInvariantBackedgeTakenCount(L))
      return runOnCountableLoop();

  return runOnNoncountableLoop();
}

bool LoopIdiomRecognize::runOnCountableLoop() {
  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  assert(!isa<SCEVCouldNotCompute>(BECount) &&
         "runOnCountableLoop() called on a loop without a predictable "
         "backedge-taken count");

  // If this loop executes exactly one time, then it should be peeled, not
  // optimized by this pass.
  if (BECount->isZero())
    return false;

  SmallVector<BasicBlock *, 8> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Countable Loop %" << CurLoop->getHeader()->getName()
                    << "\n");

  // The following transforms hoist stores/memsets into the loop pre-header.
  // Give up if the loop has instructions that may throw.
  SimpleLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(CurLoop);
  if (SafetyInfo.anyBlockMayThrow())
    return false;

  bool MadeChange = false;

  // Scan all the blocks in the loop that are not in subloops.
  for (auto *BB : CurLoop->getBlocks()) {
    // Ignore blocks in subloops.
    if (LI->getLoopFor(BB) != CurLoop)
      continue;

    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
  }
  return MadeChange;
}

static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
  return ConstStride->getAPInt();
}

/// getMemSetPatternValue - If a strided store of the specified value is safe to
/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
/// be passed in. Otherwise, return null.
///
/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
/// just replicate their input array and then pass on to memset_pattern16.
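///
/// For example (illustrative only): a strided store of the i32 constant
/// 0x01020304 is not a bytewise splat, but it can be described by the 16-byte
/// pattern {04 03 02 01} x 4 (little endian) and emitted as a call to
/// memset_pattern16.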
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
  // FIXME: This could check for UndefValue because it can be merged into any
  // other valid pattern.

  // If the value isn't a constant, we can't promote it to being in a constant
  // array. We could theoretically do a store to an alloca or something, but
  // that doesn't seem worthwhile.
  Constant *C = dyn_cast<Constant>(V);
  if (!C || isa<ConstantExpr>(C))
    return nullptr;

  // Only handle simple values that are a power of two bytes in size.
  uint64_t Size = DL->getTypeSizeInBits(V->getType());
  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
    return nullptr;

  // Don't care enough about darwin/ppc to implement this.
  if (DL->isBigEndian())
    return nullptr;

  // Convert to size in bytes.
  Size /= 8;

  // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
  // if the top and bottom are the same (e.g. for vectors and large integers).
  if (Size > 16)
    return nullptr;

  // If the constant is exactly 16 bytes, just use it.
  if (Size == 16)
    return C;

  // Otherwise, we'll use an array of the constants.
  unsigned ArraySize = 16 / Size;
  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}

LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  // Don't touch volatile stores.
  if (SI->isVolatile())
    return LegalStoreKind::None;
  // We only want simple or unordered-atomic stores.
  if (!SI->isUnordered())
    return LegalStoreKind::None;

  // Avoid merging nontemporal stores.
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return LegalStoreKind::None;

  Value *StoredVal = SI->getValueOperand();
  Value *StorePtr = SI->getPointerOperand();

  // Don't convert stores of non-integral pointer types to memsets (which store
  // integers).
  if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
    return LegalStoreKind::None;

  // Reject stores that are so large that they overflow an unsigned.
  // When storing out scalable vectors we bail out for now, since the code
  // below currently only works for constant strides.
  TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  if (SizeInBits.isScalable() || (SizeInBits.getFixedValue() & 7) ||
      (SizeInBits.getFixedValue() >> 32) != 0)
    return LegalStoreKind::None;

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEV *StoreEv = SE->getSCEV(StorePtr);
  const SCEVConstant *Stride;
  if (!match(StoreEv, m_scev_AffineAddRec(m_SCEV(), m_SCEVConstant(Stride),
                                          m_SpecificLoop(CurLoop))))
    return LegalStoreKind::None;

  // See if the store can be turned into a memset.

  // If the stored value is a byte-wise value (like i32 -1), then it may be
  // turned into a memset of i8 -1, assuming that all the consecutive bytes
  // are stored. A store of i32 0x01020304 can never be turned into a memset,
  // but it can be turned into memset_pattern if the target supports it.
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);

  // Note: memset and memset_pattern on unordered-atomic are not yet supported.
  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();

  // If we're allowed to form a memset, and the stored value would be
  // acceptable for memset, use it.
  if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
      // Verify that the stored value is loop invariant. If not, we can't
      // promote the memset.
      CurLoop->isLoopInvariant(SplatValue)) {
    // It looks like we can use SplatValue.
    return LegalStoreKind::Memset;
  }
  if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
      // Don't create memset_pattern16s with address spaces.
      StorePtr->getType()->getPointerAddressSpace() == 0 &&
      getMemSetPatternValue(StoredVal, DL)) {
    // It looks like we can use PatternValue!
    return LegalStoreKind::MemsetPattern;
  }

  // Otherwise, see if the store can be turned into a memcpy.
  if (HasMemcpy && !DisableLIRP::Memcpy) {
    // Check to see if the stride matches the size of the store. If so, then we
    // know that every byte is touched in the loop.
    unsigned StoreSize =
        DL->getTypeStoreSize(SI->getValueOperand()->getType());
    APInt StrideAP = Stride->getAPInt();
    if (StoreSize != StrideAP && StoreSize != -StrideAP)
      return LegalStoreKind::None;

    // The store must be feeding a non-volatile load.
    LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());

    // Only allow non-volatile loads
    if (!LI || LI->isVolatile())
      return LegalStoreKind::None;
    // Only allow simple or unordered-atomic loads
    if (!LI->isUnordered())
      return LegalStoreKind::None;

    // See if the pointer expression is an AddRec like {base,+,1} on the current
    // loop, which indicates a strided load. If we have something else, it's a
    // random load we can't handle.
    const SCEV *LoadEv = SE->getSCEV(LI->getPointerOperand());

    // The store and load must share the same stride.
    if (!match(LoadEv, m_scev_AffineAddRec(m_SCEV(), m_scev_Specific(Stride),
                                           m_SpecificLoop(CurLoop))))
      return LegalStoreKind::None;

    // Success. This store can be converted into a memcpy.
    UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
                           : LegalStoreKind::Memcpy;
  }
  // This store can't be transformed into a memset/memcpy.
  return LegalStoreKind::None;
}

void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  StoreRefsForMemset.clear();
  StoreRefsForMemsetPattern.clear();
  StoreRefsForMemcpy.clear();
  for (Instruction &I : *BB) {
    StoreInst *SI = dyn_cast<StoreInst>(&I);
    if (!SI)
      continue;

    // Make sure this is a strided store with a constant stride.
    switch (isLegalStore(SI)) {
    case LegalStoreKind::None:
      // Nothing to do
      break;
    case LegalStoreKind::Memset: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemset[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::MemsetPattern: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemsetPattern[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::Memcpy:
    case LegalStoreKind::UnorderedAtomicMemcpy:
      StoreRefsForMemcpy.push_back(SI);
      break;
    default:
      assert(false && "unhandled return value");
      break;
    }
  }
}

/// runOnLoopBlock - Process the specified block, which lives in a counted loop
/// with the specified backedge count. This block is known to be in the current
/// loop and not in any subloops.
bool LoopIdiomRecognize::runOnLoopBlock(
    BasicBlock *BB, const SCEV *BECount,
    SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  // We can only promote stores in this block if they are unconditionally
  // executed in the loop. For a block to be unconditionally executed, it has
  // to dominate all the exit blocks of the loop. Verify this now.
  for (BasicBlock *ExitBlock : ExitBlocks)
    if (!DT->dominates(BB, ExitBlock))
      return false;

  bool MadeChange = false;
  // Look for store instructions, which may be optimized to memset/memcpy.
  collectStores(BB);

  // Look for a single store or sets of stores with a common base, which can be
  // optimized into a memset (memset_pattern). The latter most commonly happens
  // with structs and hand-unrolled loops.
  for (auto &SL : StoreRefsForMemset)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);

  for (auto &SL : StoreRefsForMemsetPattern)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);

  // Optimize the store into a memcpy, if it feeds a similarly strided load.
  for (auto &SI : StoreRefsForMemcpy)
    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);

  MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
      BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
  MadeChange |= processLoopMemIntrinsic<MemSetInst>(
      BB, &LoopIdiomRecognize::processLoopMemSet, BECount);

  return MadeChange;
}

/// See if this store(s) can be promoted to a memset.
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
                                           const SCEV *BECount, ForMemset For) {
  // Try to find consecutive stores that can be transformed into memsets.
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
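  //
  // For example (purely illustrative), a hand-unrolled loop such as
  //
  //   for (i = 0; i != n; i += 2) { A[i] = 0; A[i+1] = 0; }
  //
  // yields two stores whose common stride (2 * element size) matches neither
  // store's size in isolation, but the pair is consecutive and together covers
  // every byte, so the chain can still be turned into a single memset.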
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    assert(SL[i]->isSimple() && "Expected only non-volatile stores.");

    Value *FirstStoredVal = SL[i]->getValueOperand();
    Value *FirstStorePtr = SL[i]->getPointerOperand();
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize =
        DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());

    // See if we can optimize just this store in isolation.
    if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }

    Value *FirstSplatValue = nullptr;
    Constant *FirstPatternValue = nullptr;

    if (For == ForMemset::Yes)
      FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
    else
      FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);

    assert((FirstSplatValue || FirstPatternValue) &&
           "Expected either splat value or pattern value.");

    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search Stores
    // array according to the sequence: from i+1 to e, then from i-1 to 0.
    // This is because pairing with an immediately succeeding or preceding
    // candidate usually creates the best chance to find a memset opportunity.
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
      Value *SecondStorePtr = SL[k]->getPointerOperand();
      const SCEVAddRecExpr *SecondStoreEv =
          cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
      APInt SecondStride = getStoreStride(SecondStoreEv);

      if (FirstStride != SecondStride)
        continue;

      Value *SecondStoredVal = SL[k]->getValueOperand();
      Value *SecondSplatValue = nullptr;
      Constant *SecondPatternValue = nullptr;

      if (For == ForMemset::Yes)
        SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
      else
        SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);

      assert((SecondSplatValue || SecondPatternValue) &&
             "Expected either splat value or pattern value.");

      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        if (For == ForMemset::Yes) {
          if (isa<UndefValue>(FirstSplatValue))
            FirstSplatValue = SecondSplatValue;
          if (FirstSplatValue != SecondSplatValue)
            continue;
        } else {
          if (isa<UndefValue>(FirstPatternValue))
            FirstPatternValue = SecondPatternValue;
          if (FirstPatternValue != SecondPatternValue)
            continue;
        }
        Tails.insert(SL[k]);
        Heads.insert(SL[i]);
        ConsecutiveChain[SL[i]] = SL[k];
        break;
      }
    }
  }

  // We may run into multiple chains that merge into a single chain. We mark the
  // stores that we transformed so that we don't visit the same store twice.
  SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;

  // For stores that start but don't end a link in the chain:
  for (StoreInst *I : Heads) {
    if (Tails.count(I))
      continue;

    // We found a store instr that starts a chain. Now follow the chain and try
    // to transform it.
    SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;

    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);

      StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    Value *StoredVal = HeadStore->getValueOperand();
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);

    // Check to see if the stride matches the size of the stores. If so, then
    // we know that every byte is touched in the loop.
    if (StoreSize != Stride && StoreSize != -Stride)
      continue;

    bool IsNegStride = StoreSize == -Stride;

    Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
    const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
    if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
                                MaybeAlign(HeadStore->getAlign()), StoredVal,
                                HeadStore, AdjacentStores, StoreEv, BECount,
                                IsNegStride)) {
      TransformedStores.insert_range(AdjacentStores);
      Changed = true;
    }
  }

  return Changed;
}

/// processLoopMemIntrinsic - Template function for calling different processor
/// functions based on mem intrinsic type.
template <typename MemInst>
bool LoopIdiomRecognize::processLoopMemIntrinsic(
    BasicBlock *BB,
    bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
    const SCEV *BECount) {
  bool MadeChange = false;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *Inst = &*I++;
    // Look for memory instructions, which may be optimized to a larger one.
    if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
      WeakTrackingVH InstPtr(&*I);
      if (!(this->*Processor)(MI, BECount))
        continue;
      MadeChange = true;

      // If processing the instruction invalidated our iterator, start over from
      // the top of the block.
      if (!InstPtr)
        I = BB->begin();
    }
  }
  return MadeChange;
}

/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy
bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memcpys with a constant size.
  if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
    return false;

  // If we're not allowed to hack on memcpy, we fail.
  if ((!HasMemcpy && !MCI->isForceInlined()) || DisableLIRP::Memcpy)
    return false;

  Value *Dest = MCI->getDest();
  Value *Source = MCI->getSource();
  if (!Dest || !Source)
    return false;

  // See if the load and store pointer expressions are AddRec like {base,+,1} on
  // the current loop, which indicates a strided load and store. If we have
  // something else, it's a random load or store we can't handle.
  const SCEV *StoreEv = SE->getSCEV(Dest);
  const SCEV *LoadEv = SE->getSCEV(Source);
  const APInt *StoreStrideValue, *LoadStrideValue;
  if (!match(StoreEv,
             m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StoreStrideValue),
                                 m_SpecificLoop(CurLoop))) ||
      !match(LoadEv,
             m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(LoadStrideValue),
                                 m_SpecificLoop(CurLoop))))
    return false;

  // Reject memcpys that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Huge stride value - give up
  if (StoreStrideValue->getBitWidth() > 64 ||
      LoadStrideValue->getBitWidth() > 64)
    return false;

  if (SizeInBytes != *StoreStrideValue && SizeInBytes != -*StoreStrideValue) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
             << ore::NV("Inst", "memcpy") << " in "
             << ore::NV("Function", MCI->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "memcpy size is not equal to stride");
    });
    return false;
  }

  int64_t StoreStrideInt = StoreStrideValue->getSExtValue();
  int64_t LoadStrideInt = LoadStrideValue->getSExtValue();
  // Check if the load stride matches the store stride.
  if (StoreStrideInt != LoadStrideInt)
    return false;

  return processLoopStoreOfLoopLoad(
      Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
      MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI,
      cast<SCEVAddRecExpr>(StoreEv), cast<SCEVAddRecExpr>(LoadEv), BECount);
}

/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memsets.
  if (MSI->isVolatile())
    return false;

  // If we're not allowed to hack on memset, we fail.
  if (!HasMemset || DisableLIRP::Memset)
    return false;

  Value *Pointer = MSI->getDest();

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEV *Ev = SE->getSCEV(Pointer);
  const SCEV *PointerStrideSCEV;
  if (!match(Ev, m_scev_AffineAddRec(m_SCEV(), m_SCEV(PointerStrideSCEV),
                                     m_SpecificLoop(CurLoop)))) {
    LLVM_DEBUG(dbgs() << " Pointer is not affine, abort\n");
    return false;
  }

  const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());

  bool IsNegStride = false;
  const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());

  if (IsConstantSize) {
    // Memset size is constant.
    // Check if the pointer stride matches the memset size. If so, then
    // we know that every byte is touched in the loop.
    LLVM_DEBUG(dbgs() << " memset size is constant\n");
    uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
    const APInt *Stride;
    if (!match(PointerStrideSCEV, m_scev_APInt(Stride)))
      return false;

    if (SizeInBytes != *Stride && SizeInBytes != -*Stride)
      return false;

    IsNegStride = SizeInBytes == -*Stride;
  } else {
    // Memset size is non-constant.
    // Check if the pointer stride matches the memset size.
    // To be conservative, the pass would not promote pointers that aren't in
    // address space zero. Also, the pass only handles memset length and stride
    // that are invariant for the top level loop.
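    //
    // For example (purely illustrative), in
    //
    //   for (i = 0; i < m; ++i)
    //     memset(P + i * n, 0, n);
    //
    // both the pointer stride and the memset length are the loop-invariant
    // value n, so the stores are contiguous and the loop can become a single
    // memset of m * n bytes.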
    LLVM_DEBUG(dbgs() << " memset size is non-constant\n");
    if (Pointer->getType()->getPointerAddressSpace() != 0) {
      LLVM_DEBUG(dbgs() << " pointer is not in address space zero, "
                        << "abort\n");
      return false;
    }
    if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
      LLVM_DEBUG(dbgs() << " memset size is not a loop-invariant, "
                        << "abort\n");
      return false;
    }

    // Compare positive direction PointerStrideSCEV with MemsetSizeSCEV
    IsNegStride = PointerStrideSCEV->isNonConstantNegative();
    const SCEV *PositiveStrideSCEV =
        IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
                    : PointerStrideSCEV;
    LLVM_DEBUG(dbgs() << " MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
                      << " PositiveStrideSCEV: " << *PositiveStrideSCEV
                      << "\n");

    if (PositiveStrideSCEV != MemsetSizeSCEV) {
      // If an expression is covered by the loop guard, compare again and
      // proceed with optimization if equal.
      const SCEV *FoldedPositiveStride =
          SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
      const SCEV *FoldedMemsetSize =
          SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);

      LLVM_DEBUG(dbgs() << " Try to fold SCEV based on loop guard\n"
                        << " FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
                        << " FoldedPositiveStride: " << *FoldedPositiveStride
                        << "\n");

      if (FoldedPositiveStride != FoldedMemsetSize) {
        LLVM_DEBUG(dbgs() << " SCEV don't match, abort\n");
        return false;
      }
    }
  }

  // Verify that the memset value is loop invariant. If not, we can't promote
  // the memset.
  Value *SplatValue = MSI->getValue();
  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
    return false;

  SmallPtrSet<Instruction *, 1> MSIs;
  MSIs.insert(MSI);
  return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
                                 MSI->getDestAlign(), SplatValue, MSI, MSIs,
                                 cast<SCEVAddRecExpr>(Ev), BECount, IsNegStride,
                                 /*IsLoopMemset=*/true);
}

/// mayLoopAccessLocation - Return true if the specified loop might access the
/// specified pointer location, which is a loop-strided access. The 'Access'
/// argument specifies what the verboten forms of access are (read or write).
static bool
mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
                      const SCEV *BECount, const SCEV *StoreSizeSCEV,
                      AliasAnalysis &AA,
                      SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
  // Get the location that may be stored across the loop. Since the access is
  // strided positively through memory, we say that the modified location starts
  // at the pointer and has infinite size.
  LocationSize AccessSize = LocationSize::afterPointer();

  // If the loop iterates a fixed number of times, we can refine the access size
  // to be exactly the size of the memset, which is (BECount+1)*StoreSize
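  // (Illustration only: a loop with a backedge-taken count of 99 that stores 4
  // bytes per iteration runs 100 times and touches exactly 400 bytes.)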
  const APInt *BECst, *ConstSize;
  if (match(BECount, m_scev_APInt(BECst)) &&
      match(StoreSizeSCEV, m_scev_APInt(ConstSize))) {
    std::optional<uint64_t> BEInt = BECst->tryZExtValue();
    std::optional<uint64_t> SizeInt = ConstSize->tryZExtValue();
    // FIXME: Should this check for overflow?
    if (BEInt && SizeInt)
      AccessSize = LocationSize::precise((*BEInt + 1) * *SizeInt);
  }

  // TODO: For this to be really effective, we have to dive into the pointer
  // operand in the store. A store to &A[i] of 100 will always return may-alias
  // with a store of &A[100]; we need StoreLoc to be "A" with size of 100,
  // which will then no-alias a store to &A[100].
  MemoryLocation StoreLoc(Ptr, AccessSize);

  for (BasicBlock *B : L->blocks())
    for (Instruction &I : *B)
      if (!IgnoredInsts.contains(&I) &&
          isModOrRefSet(AA.getModRefInfo(&I, StoreLoc) & Access))
        return true;
  return false;
}

// If we have a negative stride, Start refers to the end of the memory location
// we're trying to memset. Therefore, we need to recompute the base pointer,
// which is just Start - BECount*Size.
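//
// For example (purely illustrative), for
//
//   for (i = n - 1; i >= 0; --i)
//     A[i] = 0;
//
// the address recurrence starts at &A[n-1], while the memset must begin at
// &A[0] == &A[n-1] - (n-1) * sizeof(A[0]), i.e. Start - BECount * StoreSize.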
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
                                        Type *IntPtr, const SCEV *StoreSizeSCEV,
                                        ScalarEvolution *SE) {
  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  if (!StoreSizeSCEV->isOne()) {
    // index = back edge count * store size
    Index = SE->getMulExpr(Index,
                           SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                           SCEV::FlagNUW);
  }
  // base pointer = start - index * store size
  return SE->getMinusSCEV(Start, Index);
}

/// Compute the number of bytes as a SCEV from the backedge taken count.
///
/// This also maps the SCEV into the provided type and tries to handle the
/// computation in a way that will fold cleanly.
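///
/// (Illustrative only: with a backedge-taken count of n-1 and a 4-byte store
/// size, this returns the SCEV for 4 * n, i.e. TripCount * StoreSize.)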
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
                               const SCEV *StoreSizeSCEV, Loop *CurLoop,
                               const DataLayout *DL, ScalarEvolution *SE) {
  const SCEV *TripCountSCEV =
      SE->getTripCountFromExitCount(BECount, IntPtr, CurLoop);
  return SE->getMulExpr(TripCountSCEV,
                        SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                        SCEV::FlagNUW);
}

/// processLoopStridedStore - We see a strided store of some value. If we can
/// transform this into a memset or memset_pattern in the loop preheader, do so.
bool LoopIdiomRecognize::processLoopStridedStore(
    Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
    Value *StoredVal, Instruction *TheStore,
    SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
    const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
  Module *M = TheStore->getModule();

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  SCEVExpanderCleaner ExpCleaner(Expander);

  Type *DestInt8PtrTy = Builder.getPtrTy(DestAS);
  Type *IntIdxTy = DL->getIndexType(DestPtr->getType());

  bool Changed = false;
  const SCEV *Start = Ev->getStart();
  // Handle negative strided loops.
  if (IsNegStride)
    Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!Expander.isSafeToExpand(Start))
    return Changed;

  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
  // this into a memset in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write to the aliased location. Check for any overlap by generating the
  // base pointer and checking the region.
  Value *BasePtr =
      Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise" with
  // the return value will read this comment, and leave them alone.
  Changed = true;

  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, Stores))
    return Changed;

  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
    return Changed;

  // Okay, everything looks good, insert the memset.

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!Expander.isSafeToExpand(NumBytesS))
    return Changed;

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  AAMDNodes AATags = TheStore->getAAMetadata();
  for (Instruction *Store : Stores)
    AATags = AATags.merge(Store->getAAMetadata());
  if (auto CI = dyn_cast<ConstantInt>(NumBytes))
    AATags = AATags.extendTo(CI->getZExtValue());
  else
    AATags = AATags.extendTo(-1);

  CallInst *NewCall;
  if (Value *SplatValue = isBytewiseValue(StoredVal, *DL)) {
    NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes,
                                   MaybeAlign(StoreAlignment),
                                   /*isVolatile=*/false, AATags);
  } else if (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) {
    // Everything is emitted in default address space
    Type *Int8PtrTy = DestInt8PtrTy;

    StringRef FuncName = "memset_pattern16";
    FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
                                            Builder.getVoidTy(), Int8PtrTy,
                                            Int8PtrTy, IntIdxTy);
    inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);

    // Otherwise we should form a memset_pattern16. PatternValue is known to be
    // a constant array of 16 bytes. Plop the value into a mergable global.
1100 Constant *PatternValue = getMemSetPatternValue(V: StoredVal, DL);
1101 assert(PatternValue && "Expected pattern value.");
1102 GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
1103 GlobalValue::PrivateLinkage,
1104 PatternValue, ".memset_pattern");
1105 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
1106 GV->setAlignment(Align(16));
1107 NewCall = Builder.CreateCall(Callee: MSP, Args: {BasePtr, GV, NumBytes});
1108 NewCall->setAAMetadata(AATags);
1109 } else {
1110 // Neither a memset, nor memset_pattern16
1111 return Changed;
1112 }
1113
1114 NewCall->setDebugLoc(TheStore->getDebugLoc());
1115
1116 if (MSSAU) {
1117 MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1118 I: NewCall, Definition: nullptr, BB: NewCall->getParent(), Point: MemorySSA::BeforeTerminator);
1119 MSSAU->insertDef(Def: cast<MemoryDef>(Val: NewMemAcc), RenameUses: true);
1120 }
1121
1122 LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
1123 << " from store to: " << *Ev << " at: " << *TheStore
1124 << "\n");
1125
1126 ORE.emit(RemarkBuilder: [&]() {
1127 OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
1128 NewCall->getDebugLoc(), Preheader);
1129 R << "Transformed loop-strided store in "
1130 << ore::NV("Function", TheStore->getFunction())
1131 << " function into a call to "
1132 << ore::NV("NewFunction", NewCall->getCalledFunction())
1133 << "() intrinsic";
1134 if (!Stores.empty())
1135 R << ore::setExtraArgs();
1136 for (auto *I : Stores) {
1137 R << ore::NV("FromBlock", I->getParent()->getName())
1138 << ore::NV("ToBlock", Preheader->getName());
1139 }
1140 return R;
1141 });
1142
1143 // Okay, the memset has been formed. Zap the original store and anything that
1144 // feeds into it.
1145 for (auto *I : Stores) {
1146 if (MSSAU)
1147 MSSAU->removeMemoryAccess(I, OptimizePhis: true);
1148 deleteDeadInstruction(I);
1149 }
1150 if (MSSAU && VerifyMemorySSA)
1151 MSSAU->getMemorySSA()->verifyMemorySSA();
1152 ++NumMemSet;
1153 ExpCleaner.markResultUsed();
1154 return true;
1155}
1156
1157/// If the stored value is a strided load in the same loop with the same stride
1158/// this may be transformable into a memcpy. This kicks in for stuff like
1159/// for (i) A[i] = B[i];
1160bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
1161 const SCEV *BECount) {
1162 assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
1163
1164 Value *StorePtr = SI->getPointerOperand();
1165 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: StorePtr));
1166 unsigned StoreSize = DL->getTypeStoreSize(Ty: SI->getValueOperand()->getType());
1167
1168 // The store must be feeding a non-volatile load.
1169 LoadInst *LI = cast<LoadInst>(Val: SI->getValueOperand());
1170 assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
1171
1172 // See if the pointer expression is an AddRec like {base,+,1} on the current
1173 // loop, which indicates a strided load. If we have something else, it's a
1174 // random load we can't handle.
1175 Value *LoadPtr = LI->getPointerOperand();
1176 const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: LoadPtr));
1177
1178 const SCEV *StoreSizeSCEV = SE->getConstant(Ty: StorePtr->getType(), V: StoreSize);
1179 return processLoopStoreOfLoopLoad(DestPtr: StorePtr, SourcePtr: LoadPtr, StoreSize: StoreSizeSCEV,
1180 StoreAlign: SI->getAlign(), LoadAlign: LI->getAlign(), TheStore: SI, TheLoad: LI,
1181 StoreEv, LoadEv, BECount);
1182}
1183
1184namespace {
1185class MemmoveVerifier {
1186public:
1187 explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
1188 const DataLayout &DL)
1189 : DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset(
1190 Ptr: LoadBasePtr.stripPointerCasts(), Offset&: LoadOff, DL)),
1191 BP2(llvm::GetPointerBaseWithConstantOffset(
1192 Ptr: StoreBasePtr.stripPointerCasts(), Offset&: StoreOff, DL)),
1193 IsSameObject(BP1 == BP2) {}
1194
1195 bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
1196 const Instruction &TheLoad,
1197 bool IsMemCpy) const {
1198 if (IsMemCpy) {
1199 // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
1200 // for negative stride.
1201 if ((!IsNegStride && LoadOff <= StoreOff) ||
1202 (IsNegStride && LoadOff >= StoreOff))
1203 return false;
1204 } else {
1205 // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
1206 // for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
1207 int64_t LoadSize =
1208 DL.getTypeSizeInBits(Ty: TheLoad.getType()).getFixedValue() / 8;
1209 if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
1210 return false;
1211 if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
1212 (IsNegStride && LoadOff + LoadSize > StoreOff))
1213 return false;
1214 }
1215 return true;
1216 }
1217
1218private:
1219 const DataLayout &DL;
1220 int64_t LoadOff = 0;
1221 int64_t StoreOff = 0;
1222 const Value *BP1;
1223 const Value *BP2;
1224
1225public:
1226 const bool IsSameObject;
1227};
1228} // namespace
1229
1230bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
1231 Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
1232 MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
1233 Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
1234 const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {
1235
1236 // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
1237 // conservatively bail here, since otherwise we may have to transform
1238 // llvm.memcpy.inline into llvm.memcpy which is illegal.
1239 if (auto *MCI = dyn_cast<MemCpyInst>(Val: TheStore); MCI && MCI->isForceInlined())
1240 return false;
1241
1242 // The trip count of the loop and the base pointer of the addrec SCEV is
1243 // guaranteed to be loop invariant, which means that it should dominate the
1244 // header. This allows us to insert code for it in the preheader.
1245 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1246 IRBuilder<> Builder(Preheader->getTerminator());
1247 SCEVExpander Expander(*SE, *DL, "loop-idiom");
1248
1249 SCEVExpanderCleaner ExpCleaner(Expander);
1250
1251 bool Changed = false;
1252 const SCEV *StrStart = StoreEv->getStart();
1253 unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
1254 Type *IntIdxTy = Builder.getIntNTy(N: DL->getIndexSizeInBits(AS: StrAS));
1255
1256 APInt Stride = getStoreStride(StoreEv);
1257 const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(Val: StoreSizeSCEV);
1258
1259 // TODO: Deal with non-constant size; Currently expect constant store size
1260 assert(ConstStoreSize && "store size is expected to be a constant");
1261
1262 int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
1263 bool IsNegStride = StoreSize == -Stride;
1264
1265 // Handle negative strided loops.
1266 if (IsNegStride)
1267 StrStart =
1268 getStartForNegStride(Start: StrStart, BECount, IntPtr: IntIdxTy, StoreSizeSCEV, SE);
1269
1270 // Okay, we have a strided store "p[i]" of a loaded value. We can turn
1271 // this into a memcpy in the loop preheader now if we want. However, this
1272 // would be unsafe to do if there is anything else in the loop that may read
1273 // or write the memory region we're storing to. This includes the load that
1274 // feeds the stores. Check for an alias by generating the base address and
1275 // checking everything.
1276 Value *StoreBasePtr = Expander.expandCodeFor(
1277 SH: StrStart, Ty: Builder.getPtrTy(AddrSpace: StrAS), I: Preheader->getTerminator());
1278
1279 // From here on out, conservatively report to the pass manager that we've
1280 // changed the IR, even if we later clean up these added instructions. There
1281 // may be structural differences e.g. in the order of use lists not accounted
1282 // for in just a textual dump of the IR. This is written as a variable, even
1283 // though statically all the places this dominates could be replaced with
1284 // 'true', with the hope that anyone trying to be clever / "more precise" with
1285 // the return value will read this comment, and leave them alone.
1286 Changed = true;
1287
1288 SmallPtrSet<Instruction *, 2> IgnoredInsts;
1289 IgnoredInsts.insert(Ptr: TheStore);
1290
1291 bool IsMemCpy = isa<MemCpyInst>(Val: TheStore);
1292 const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";
1293
1294 bool LoopAccessStore =
1295 mayLoopAccessLocation(Ptr: StoreBasePtr, Access: ModRefInfo::ModRef, L: CurLoop, BECount,
1296 StoreSizeSCEV, AA&: *AA, IgnoredInsts);
1297 if (LoopAccessStore) {
1298 // For memmove case it's not enough to guarantee that loop doesn't access
1299 // TheStore and TheLoad. Additionally we need to make sure that TheStore is
1300 // the only user of TheLoad.
1301 if (!TheLoad->hasOneUse())
1302 return Changed;
1303 IgnoredInsts.insert(Ptr: TheLoad);
1304 if (mayLoopAccessLocation(Ptr: StoreBasePtr, Access: ModRefInfo::ModRef, L: CurLoop,
1305 BECount, StoreSizeSCEV, AA&: *AA, IgnoredInsts)) {
1306 ORE.emit(RemarkBuilder: [&]() {
1307 return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
1308 TheStore)
1309 << ore::NV("Inst", InstRemark) << " in "
1310 << ore::NV("Function", TheStore->getFunction())
1311 << " function will not be hoisted: "
1312 << ore::NV("Reason", "The loop may access store location");
1313 });
1314 return Changed;
1315 }
1316 IgnoredInsts.erase(Ptr: TheLoad);
1317 }
1318
1319 const SCEV *LdStart = LoadEv->getStart();
1320 unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
1321
1322 // Handle negative strided loops.
1323 if (IsNegStride)
1324 LdStart =
1325 getStartForNegStride(Start: LdStart, BECount, IntPtr: IntIdxTy, StoreSizeSCEV, SE);
1326
1327 // For a memcpy, we have to make sure that the input array is not being
1328 // mutated by the loop.
1329 Value *LoadBasePtr = Expander.expandCodeFor(SH: LdStart, Ty: Builder.getPtrTy(AddrSpace: LdAS),
1330 I: Preheader->getTerminator());
1331
1332 // If the store is a memcpy instruction, we must check if it will write to
1333 // the load memory locations. So remove it from the ignored stores.
1334 MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
1335 if (IsMemCpy && !Verifier.IsSameObject)
1336 IgnoredInsts.erase(Ptr: TheStore);
1337 if (mayLoopAccessLocation(Ptr: LoadBasePtr, Access: ModRefInfo::Mod, L: CurLoop, BECount,
1338 StoreSizeSCEV, AA&: *AA, IgnoredInsts)) {
1339 ORE.emit(RemarkBuilder: [&]() {
1340 return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
1341 << ore::NV("Inst", InstRemark) << " in "
1342 << ore::NV("Function", TheStore->getFunction())
1343 << " function will not be hoisted: "
1344 << ore::NV("Reason", "The loop may access load location");
1345 });
1346 return Changed;
1347 }
1348
1349 bool IsAtomic = TheStore->isAtomic() || TheLoad->isAtomic();
1350 bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
1351
1352 if (IsAtomic) {
1353 // For now don't support unordered atomic memmove.
1354 if (UseMemMove)
1355 return Changed;
1356
1357 // We cannot allow unaligned ops for unordered load/store, so reject
1358 // anything where the alignment isn't at least the element size.
1359 assert((StoreAlign && LoadAlign) &&
1360 "Expect unordered load/store to have align.");
1361 if (*StoreAlign < StoreSize || *LoadAlign < StoreSize)
1362 return Changed;
1363
1364 // If the element.atomic memcpy is not lowered into explicit
1365 // loads/stores later, then it will be lowered into an element-size
1366 // specific lib call. If the lib call doesn't exist for our store size, then
1367 // we shouldn't generate the memcpy.
1368 if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1369 return Changed;
1370 }
1371
1372 if (UseMemMove)
1373 if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, TheLoad: *TheLoad,
1374 IsMemCpy))
1375 return Changed;
1376
1377 if (avoidLIRForMultiBlockLoop())
1378 return Changed;
1379
1380 // Okay, everything is safe, we can transform this!
1381
1382 const SCEV *NumBytesS =
1383 getNumBytes(BECount, IntPtr: IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
1384
1385 Value *NumBytes =
1386 Expander.expandCodeFor(SH: NumBytesS, Ty: IntIdxTy, I: Preheader->getTerminator());
1387
1388 AAMDNodes AATags = TheLoad->getAAMetadata();
1389 AAMDNodes StoreAATags = TheStore->getAAMetadata();
1390 AATags = AATags.merge(Other: StoreAATags);
1391 if (auto CI = dyn_cast<ConstantInt>(Val: NumBytes))
1392 AATags = AATags.extendTo(Len: CI->getZExtValue());
1393 else
1394 AATags = AATags.extendTo(Len: -1);
1395
1396 CallInst *NewCall = nullptr;
1397 // Check whether to generate an unordered atomic memcpy:
1398 // If the load or store are atomic, then they must necessarily be unordered
1399 // by previous checks.
1400 if (!IsAtomic) {
1401 if (UseMemMove)
1402 NewCall = Builder.CreateMemMove(Dst: StoreBasePtr, DstAlign: StoreAlign, Src: LoadBasePtr,
1403 SrcAlign: LoadAlign, Size: NumBytes,
1404 /*isVolatile=*/false, AAInfo: AATags);
1405 else
1406 NewCall =
1407 Builder.CreateMemCpy(Dst: StoreBasePtr, DstAlign: StoreAlign, Src: LoadBasePtr, SrcAlign: LoadAlign,
1408 Size: NumBytes, /*isVolatile=*/false, AAInfo: AATags);
1409 } else {
1410 // Create the call.
1411 // Note that unordered atomic loads/stores are *required* by the spec to
1412 // have an alignment but non-atomic loads/stores may not.
1413 NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1414 Dst: StoreBasePtr, DstAlign: *StoreAlign, Src: LoadBasePtr, SrcAlign: *LoadAlign, Size: NumBytes, ElementSize: StoreSize,
1415 AAInfo: AATags);
1416 }
1417 NewCall->setDebugLoc(TheStore->getDebugLoc());
1418
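  // Keep MemorySSA up to date: create a MemoryDef for the new call in its
  // block and let the updater rename the uses it now defines.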
1419 if (MSSAU) {
1420 MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1421 I: NewCall, Definition: nullptr, BB: NewCall->getParent(), Point: MemorySSA::BeforeTerminator);
1422 MSSAU->insertDef(Def: cast<MemoryDef>(Val: NewMemAcc), RenameUses: true);
1423 }
1424
1425 LLVM_DEBUG(dbgs() << " Formed new call: " << *NewCall << "\n"
1426 << " from load ptr=" << *LoadEv << " at: " << *TheLoad
1427 << "\n"
1428 << " from store ptr=" << *StoreEv << " at: " << *TheStore
1429 << "\n");
1430
1431 ORE.emit(RemarkBuilder: [&]() {
1432 return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
1433 NewCall->getDebugLoc(), Preheader)
1434 << "Formed a call to "
1435 << ore::NV("NewFunction", NewCall->getCalledFunction())
1436 << "() intrinsic from " << ore::NV("Inst", InstRemark)
1437 << " instruction in " << ore::NV("Function", TheStore->getFunction())
1438 << " function"
1439 << ore::setExtraArgs()
1440 << ore::NV("FromBlock", TheStore->getParent()->getName())
1441 << ore::NV("ToBlock", Preheader->getName());
1442 });
1443
1444 // Okay, a new call to memcpy/memmove has been formed. Zap the original store
1445 // and anything that feeds into it.
1446 if (MSSAU)
1447 MSSAU->removeMemoryAccess(I: TheStore, OptimizePhis: true);
1448 deleteDeadInstruction(I: TheStore);
1449 if (MSSAU && VerifyMemorySSA)
1450 MSSAU->getMemorySSA()->verifyMemorySSA();
1451 if (UseMemMove)
1452 ++NumMemMove;
1453 else
1454 ++NumMemCpy;
1455 ExpCleaner.markResultUsed();
1456 return true;
1457}
1458
// When compiling for code size we avoid idiom recognition for a multi-block
// loop unless it is a loop_memset idiom or a memset/memcpy idiom in a nested
// loop.
//
1462bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1463 bool IsLoopMemset) {
1464 if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1465 if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
1466 LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
1467 << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
1468 << " avoided: multi-block top-level loop\n");
1469 return true;
1470 }
1471 }
1472
1473 return false;
1474}
1475
1476bool LoopIdiomRecognize::runOnNoncountableLoop() {
1477 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
1478 << CurLoop->getHeader()->getParent()->getName()
1479 << "] Noncountable Loop %"
1480 << CurLoop->getHeader()->getName() << "\n");
1481
1482 return recognizePopcount() || recognizeAndInsertFFS() ||
1483 recognizeShiftUntilBitTest() || recognizeShiftUntilZero() ||
1484 recognizeShiftUntilLessThan() || recognizeAndInsertStrLen();
1485}
1486
/// Check if the given conditional branch is based on a comparison between
/// a variable and zero, and if the variable is non-zero (or zero, when
/// \p JmpOnZero is true) the control yields to the loop entry. If the branch
/// matches this behavior, the variable involved in the comparison is returned.
/// This function is called to check whether the precondition and postcondition
/// of the loop are in desirable form.
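/// For example, with \p JmpOnZero equal to false, the following branch matches
/// and %x is returned:
/// \code
///   %cmp = icmp ne i32 %x, 0
///   br i1 %cmp, label %loop.entry, label %exit
/// \endcode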
1493static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
1494 bool JmpOnZero = false) {
1495 if (!BI || !BI->isConditional())
1496 return nullptr;
1497
1498 ICmpInst *Cond = dyn_cast<ICmpInst>(Val: BI->getCondition());
1499 if (!Cond)
1500 return nullptr;
1501
1502 auto *CmpZero = dyn_cast<ConstantInt>(Val: Cond->getOperand(i_nocapture: 1));
1503 if (!CmpZero || !CmpZero->isZero())
1504 return nullptr;
1505
1506 BasicBlock *TrueSucc = BI->getSuccessor(i: 0);
1507 BasicBlock *FalseSucc = BI->getSuccessor(i: 1);
1508 if (JmpOnZero)
1509 std::swap(a&: TrueSucc, b&: FalseSucc);
1510
1511 ICmpInst::Predicate Pred = Cond->getPredicate();
1512 if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1513 (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1514 return Cond->getOperand(i_nocapture: 0);
1515
1516 return nullptr;
1517}
1518
1519namespace {
1520
1521class StrlenVerifier {
1522public:
1523 explicit StrlenVerifier(const Loop *CurLoop, ScalarEvolution *SE,
1524 const TargetLibraryInfo *TLI)
1525 : CurLoop(CurLoop), SE(SE), TLI(TLI) {}
1526
1527 bool isValidStrlenIdiom() {
1528 // Give up if the loop has multiple blocks, multiple backedges, or
1529 // multiple exit blocks
1530 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1 ||
1531 !CurLoop->getUniqueExitBlock())
1532 return false;
1533
1534 // It should have a preheader and a branch instruction.
1535 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1536 if (!Preheader)
1537 return false;
1538
1539 BranchInst *EntryBI = dyn_cast<BranchInst>(Val: Preheader->getTerminator());
1540 if (!EntryBI)
1541 return false;
1542
    // The loop exit must be conditioned on an icmp against 0, the null
    // terminator. The icmp operand has to be a load from an SSA pointer
    // that is incremented by one element each iteration of the loop.
1546 BasicBlock *LoopBody = *CurLoop->block_begin();
1547
1548 // Skip if the body is too big as it most likely is not a strlen idiom.
1549 if (!LoopBody || LoopBody->size() >= 15)
1550 return false;
1551
1552 BranchInst *LoopTerm = dyn_cast<BranchInst>(Val: LoopBody->getTerminator());
1553 Value *LoopCond = matchCondition(BI: LoopTerm, LoopEntry: LoopBody);
1554 if (!LoopCond)
1555 return false;
1556
1557 LoadInst *LoopLoad = dyn_cast<LoadInst>(Val: LoopCond);
1558 if (!LoopLoad || LoopLoad->getPointerAddressSpace() != 0)
1559 return false;
1560
1561 OperandType = LoopLoad->getType();
1562 if (!OperandType || !OperandType->isIntegerTy())
1563 return false;
1564
    // See if the pointer expression is an AddRec with a constant step of the
    // form {n,+,a}, where a is the width of the char type in bytes.
1567 Value *IncPtr = LoopLoad->getPointerOperand();
1568 const SCEV *LoadEv = SE->getSCEV(V: IncPtr);
1569 const APInt *Step;
1570 if (!match(S: LoadEv,
1571 P: m_scev_AffineAddRec(Op0: m_SCEV(V&: LoadBaseEv), Op1: m_scev_APInt(C&: Step))))
1572 return false;
1573
1574 LLVM_DEBUG(dbgs() << "pointer load scev: " << *LoadEv << "\n");
1575
1576 unsigned StepSize = Step->getZExtValue();
1577
1578 // Verify that StepSize is consistent with platform char width.
1579 OpWidth = OperandType->getIntegerBitWidth();
1580 unsigned WcharSize = TLI->getWCharSize(M: *LoopLoad->getModule());
1581 if (OpWidth != StepSize * 8)
1582 return false;
1583 if (OpWidth != 8 && OpWidth != 16 && OpWidth != 32)
1584 return false;
1585 if (OpWidth >= 16)
1586 if (OpWidth != WcharSize * 8)
1587 return false;
1588
1589 // Scan every instruction in the loop to ensure there are no side effects.
1590 for (Instruction &I : *LoopBody)
1591 if (I.mayHaveSideEffects())
1592 return false;
1593
1594 BasicBlock *LoopExitBB = CurLoop->getExitBlock();
1595 if (!LoopExitBB)
1596 return false;
1597
1598 for (PHINode &PN : LoopExitBB->phis()) {
1599 if (!SE->isSCEVable(Ty: PN.getType()))
1600 return false;
1601
1602 const SCEV *Ev = SE->getSCEV(V: &PN);
1603 if (!Ev)
1604 return false;
1605
1606 LLVM_DEBUG(dbgs() << "loop exit phi scev: " << *Ev << "\n");
1607
      // Since we verified that the loop forms a valid strlen idiom, we can
      // expand every lcssa phi with {n,+,1} as (n + strlen) and use
      // SCEVExpander to materialize the loop output.
1611 const SCEVAddRecExpr *AddRecEv = dyn_cast<SCEVAddRecExpr>(Val: Ev);
1612 if (!AddRecEv || !AddRecEv->isAffine())
1613 return false;
1614
      // We only want AddRecExprs whose recurrence step is constant. This is
      // good enough for all the idioms we want to recognize. Later we expand
      // and materialize the recurrence as {base,+,a} -> (base + a * strlen).
1618 if (!isa<SCEVConstant>(Val: AddRecEv->getStepRecurrence(SE&: *SE)))
1619 return false;
1620 }
1621
1622 return true;
1623 }
1624
1625public:
1626 const Loop *CurLoop;
1627 ScalarEvolution *SE;
1628 const TargetLibraryInfo *TLI;
1629
1630 unsigned OpWidth;
1631 ConstantInt *StepSizeCI;
1632 const SCEV *LoadBaseEv;
1633 Type *OperandType;
1634};
1635
1636} // namespace
1637
1638/// The Strlen Idiom we are trying to detect has the following structure
1639///
1640/// preheader:
1641/// ...
1642/// br label %body, ...
1643///
1644/// body:
1645/// ... ; %0 is incremented by a gep
1646/// %1 = load i8, ptr %0, align 1
1647/// %2 = icmp eq i8 %1, 0
1648/// br i1 %2, label %exit, label %body
1649///
1650/// exit:
1651/// %lcssa = phi [%0, %body], ...
1652///
/// We expect the strlen idiom to have a load of a character type that is
/// compared against '\0'. The load's pointer operand must have a SCEV
/// expression of the form {%str,+,c}, where c is a ConstantInt of the
/// appropriate character width for the idiom and %str is the base of the
/// string. In addition, all lcssa phis must have the form {...,+,n} where n is
/// a constant.
///
/// When transforming the output of the strlen idiom, the lcssa phis are
/// expanded using SCEVExpander as {base scev,+,a} -> (base scev + a * strlen)
1661/// and all subsequent uses are replaced. For example,
1662///
1663/// \code{.c}
1664/// const char* base = str;
1665/// while (*str != '\0')
1666/// ++str;
1667/// size_t result = str - base;
1668/// \endcode
1669///
1670/// will be transformed as follows: The idiom will be replaced by a strlen
1671/// computation to compute the address of the null terminator of the string.
1672///
1673/// \code{.c}
1674/// const char* base = str;
1675/// const char* end = base + strlen(str);
1676/// size_t result = end - base;
1677/// \endcode
1678///
/// In the case where we index by an induction variable, as long as the
/// induction variable has a constant integer increment, we can replace all
/// such indvars with the closed-form computation of strlen.
1682///
1683/// \code{.c}
1684/// size_t i = 0;
1685/// while (str[i] != '\0')
1686/// ++i;
1687/// size_t result = i;
1688/// \endcode
1689///
1690/// Will be replaced by
1691///
1692/// \code{.c}
1693/// size_t i = 0 + strlen(str);
1694/// size_t result = i;
1695/// \endcode
1696///
1697bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
1698 if (DisableLIRP::All)
1699 return false;
1700
1701 StrlenVerifier Verifier(CurLoop, SE, TLI);
1702
1703 if (!Verifier.isValidStrlenIdiom())
1704 return false;
1705
1706 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1707 BasicBlock *LoopBody = *CurLoop->block_begin();
1708 BasicBlock *LoopExitBB = CurLoop->getExitBlock();
1709 BranchInst *LoopTerm = dyn_cast<BranchInst>(Val: LoopBody->getTerminator());
1710 assert(Preheader && LoopBody && LoopExitBB && LoopTerm &&
1711 "Should be verified to be valid by StrlenVerifier");
1712
1713 if (Verifier.OpWidth == 8) {
1714 if (DisableLIRP::Strlen)
1715 return false;
1716 if (!isLibFuncEmittable(M: Preheader->getModule(), TLI, TheLibFunc: LibFunc_strlen))
1717 return false;
1718 } else {
1719 if (DisableLIRP::Wcslen)
1720 return false;
1721 if (!isLibFuncEmittable(M: Preheader->getModule(), TLI, TheLibFunc: LibFunc_wcslen))
1722 return false;
1723 }
1724
1725 IRBuilder<> Builder(Preheader->getTerminator());
1726 Builder.SetCurrentDebugLocation(CurLoop->getStartLoc());
1727 SCEVExpander Expander(*SE, Preheader->getModule()->getDataLayout(),
1728 "strlen_idiom");
1729 Value *MaterialzedBase = Expander.expandCodeFor(
1730 SH: Verifier.LoadBaseEv, Ty: Verifier.LoadBaseEv->getType(),
1731 I: Builder.GetInsertPoint());
1732
1733 Value *StrLenFunc = nullptr;
1734 if (Verifier.OpWidth == 8) {
1735 StrLenFunc = emitStrLen(Ptr: MaterialzedBase, B&: Builder, DL: *DL, TLI);
1736 } else {
1737 StrLenFunc = emitWcsLen(Ptr: MaterialzedBase, B&: Builder, DL: *DL, TLI);
1738 }
1739 assert(StrLenFunc && "Failed to emit strlen function.");
1740
1741 const SCEV *StrlenEv = SE->getSCEV(V: StrLenFunc);
1742 SmallVector<PHINode *, 4> Cleanup;
1743 for (PHINode &PN : LoopExitBB->phis()) {
    // We can now materialize the loop output as all phis have scev {base,+,a}.
1745 // We expand the phi as:
1746 // %strlen = call i64 @strlen(%str)
1747 // %phi.new = base expression + step * %strlen
1748 const SCEV *Ev = SE->getSCEV(V: &PN);
1749 const SCEVAddRecExpr *AddRecEv = dyn_cast<SCEVAddRecExpr>(Val: Ev);
1750 const SCEVConstant *Step =
1751 dyn_cast<SCEVConstant>(Val: AddRecEv->getStepRecurrence(SE&: *SE));
1752 const SCEV *Base = AddRecEv->getStart();
1753
    // It is safe to truncate to the type of the base since, if the base is
    // narrower than size_t, the equivalent user code would have to truncate
    // anyway.
1756 const SCEV *NewEv = SE->getAddExpr(
1757 LHS: Base, RHS: SE->getMulExpr(LHS: Step, RHS: SE->getTruncateOrSignExtend(
1758 V: StrlenEv, Ty: Base->getType())));
1759
1760 Value *MaterializedPHI = Expander.expandCodeFor(SH: NewEv, Ty: NewEv->getType(),
1761 I: Builder.GetInsertPoint());
1762 Expander.clear();
1763 PN.replaceAllUsesWith(V: MaterializedPHI);
1764 Cleanup.push_back(Elt: &PN);
1765 }
1766
  // All LCSSA loop phis are now dead; the leftover dead loop body can be
  // cleaned up by later passes.
1769 for (PHINode *PN : Cleanup)
1770 RecursivelyDeleteDeadPHINode(PN);
1771
  // LoopDeletion only deletes invariant loops with a known trip count. Update
  // the condition so that it will reliably delete the now-invariant loop.
1774 assert(LoopTerm->getNumSuccessors() == 2 &&
1775 (LoopTerm->getSuccessor(0) == LoopBody ||
1776 LoopTerm->getSuccessor(1) == LoopBody) &&
         "loop body must have a successor that is itself");
1778 ConstantInt *NewLoopCond = LoopTerm->getSuccessor(i: 0) == LoopBody
1779 ? Builder.getFalse()
1780 : Builder.getTrue();
1781 LoopTerm->setCondition(NewLoopCond);
1782 SE->forgetLoop(L: CurLoop);
1783
1784 ++NumStrLen;
1785 LLVM_DEBUG(dbgs() << " Formed strlen idiom: " << *StrLenFunc << "\n");
1786 ORE.emit(RemarkBuilder: [&]() {
1787 return OptimizationRemark(DEBUG_TYPE, "recognizeAndInsertStrLen",
1788 CurLoop->getStartLoc(), Preheader)
1789 << "Transformed " << StrLenFunc->getName() << " loop idiom";
1790 });
1791
1792 return true;
1793}
1794
/// Check if the given conditional branch is based on an unsigned less-than
/// comparison between a variable and a constant, where control yields to the
/// loop entry if the comparison is false. If the branch matches this behavior,
/// the variable involved in the comparison is returned.
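/// For example, the following branch matches and %x is returned with
/// \p Threshold set to 4:
/// \code
///   %cmp = icmp ult i32 %x, 4
///   br i1 %cmp, label %exit, label %loop.entry
/// \endcode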
1799static Value *matchShiftULTCondition(BranchInst *BI, BasicBlock *LoopEntry,
1800 APInt &Threshold) {
1801 if (!BI || !BI->isConditional())
1802 return nullptr;
1803
1804 ICmpInst *Cond = dyn_cast<ICmpInst>(Val: BI->getCondition());
1805 if (!Cond)
1806 return nullptr;
1807
1808 ConstantInt *CmpConst = dyn_cast<ConstantInt>(Val: Cond->getOperand(i_nocapture: 1));
1809 if (!CmpConst)
1810 return nullptr;
1811
1812 BasicBlock *FalseSucc = BI->getSuccessor(i: 1);
1813 ICmpInst::Predicate Pred = Cond->getPredicate();
1814
1815 if (Pred == ICmpInst::ICMP_ULT && FalseSucc == LoopEntry) {
1816 Threshold = CmpConst->getValue();
1817 return Cond->getOperand(i_nocapture: 0);
1818 }
1819
1820 return nullptr;
1821}
1822
1823// Check if the recurrence variable `VarX` is in the right form to create
1824// the idiom. Returns the value coerced to a PHINode if so.
1825static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
1826 BasicBlock *LoopEntry) {
1827 auto *PhiX = dyn_cast<PHINode>(Val: VarX);
1828 if (PhiX && PhiX->getParent() == LoopEntry &&
1829 (PhiX->getOperand(i_nocapture: 0) == DefX || PhiX->getOperand(i_nocapture: 1) == DefX))
1830 return PhiX;
1831 return nullptr;
1832}
1833
1834/// Return true if the idiom is detected in the loop.
1835///
1836/// Additionally:
1837/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1838/// or nullptr if there is no such.
1839/// 2) \p CntPhi is set to the corresponding phi node
1840/// or nullptr if there is no such.
1841/// 3) \p InitX is set to the value whose CTLZ could be used.
1842/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1843/// 5) \p Threshold is set to the constant involved in the unsigned less-than
1844/// comparison.
1845///
1846/// The core idiom we are trying to detect is:
1847/// \code
1848/// if (x0 < 2)
1849/// goto loop-exit // the precondition of the loop
1850/// cnt0 = init-val
1851/// do {
1852/// x = phi (x0, x.next); //PhiX
1853/// cnt = phi (cnt0, cnt.next)
1854///
1855/// cnt.next = cnt + 1;
1856/// ...
1857/// x.next = x >> 1; // DefX
1858/// } while (x >= 4)
1859/// loop-exit:
1860/// \endcode
1861static bool detectShiftUntilLessThanIdiom(Loop *CurLoop, const DataLayout &DL,
1862 Intrinsic::ID &IntrinID,
1863 Value *&InitX, Instruction *&CntInst,
1864 PHINode *&CntPhi, Instruction *&DefX,
1865 APInt &Threshold) {
1866 BasicBlock *LoopEntry;
1867
1868 DefX = nullptr;
1869 CntInst = nullptr;
1870 CntPhi = nullptr;
1871 LoopEntry = *(CurLoop->block_begin());
1872
1873 // step 1: Check if the loop-back branch is in desirable form.
1874 if (Value *T = matchShiftULTCondition(
1875 BI: dyn_cast<BranchInst>(Val: LoopEntry->getTerminator()), LoopEntry,
1876 Threshold))
1877 DefX = dyn_cast<Instruction>(Val: T);
1878 else
1879 return false;
1880
1881 // step 2: Check the recurrence of variable X
1882 if (!DefX || !isa<PHINode>(Val: DefX))
1883 return false;
1884
1885 PHINode *VarPhi = cast<PHINode>(Val: DefX);
1886 int Idx = VarPhi->getBasicBlockIndex(BB: LoopEntry);
1887 if (Idx == -1)
1888 return false;
1889
1890 DefX = dyn_cast<Instruction>(Val: VarPhi->getIncomingValue(i: Idx));
1891 if (!DefX || DefX->getNumOperands() == 0 || DefX->getOperand(i: 0) != VarPhi)
1892 return false;
1893
1894 // step 3: detect instructions corresponding to "x.next = x >> 1"
1895 if (DefX->getOpcode() != Instruction::LShr)
1896 return false;
1897
1898 IntrinID = Intrinsic::ctlz;
1899 ConstantInt *Shft = dyn_cast<ConstantInt>(Val: DefX->getOperand(i: 1));
1900 if (!Shft || !Shft->isOne())
1901 return false;
1902
1903 InitX = VarPhi->getIncomingValueForBlock(BB: CurLoop->getLoopPreheader());
1904
  // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
  //         or cnt.next = cnt + -1.
1907 // TODO: We can skip the step. If loop trip count is known (CTLZ),
1908 // then all uses of "cnt.next" could be optimized to the trip count
1909 // plus "cnt0". Currently it is not optimized.
1910 // This step could be used to detect POPCNT instruction:
1911 // cnt.next = cnt + (x.next & 1)
1912 for (Instruction &Inst :
1913 llvm::make_range(x: LoopEntry->getFirstNonPHIIt(), y: LoopEntry->end())) {
1914 if (Inst.getOpcode() != Instruction::Add)
1915 continue;
1916
1917 ConstantInt *Inc = dyn_cast<ConstantInt>(Val: Inst.getOperand(i: 1));
1918 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
1919 continue;
1920
1921 PHINode *Phi = getRecurrenceVar(VarX: Inst.getOperand(i: 0), DefX: &Inst, LoopEntry);
1922 if (!Phi)
1923 continue;
1924
1925 CntInst = &Inst;
1926 CntPhi = Phi;
1927 break;
1928 }
1929 if (!CntInst)
1930 return false;
1931
1932 return true;
1933}
1934
1935/// Return true iff the idiom is detected in the loop.
1936///
1937/// Additionally:
1938/// 1) \p CntInst is set to the instruction counting the population bit.
1939/// 2) \p CntPhi is set to the corresponding phi node.
1940/// 3) \p Var is set to the value whose population bits are being counted.
1941///
1942/// The core idiom we are trying to detect is:
1943/// \code
1944/// if (x0 != 0)
1945/// goto loop-exit // the precondition of the loop
1946/// cnt0 = init-val;
1947/// do {
1948/// x1 = phi (x0, x2);
1949/// cnt1 = phi(cnt0, cnt2);
1950///
1951/// cnt2 = cnt1 + 1;
1952/// ...
1953/// x2 = x1 & (x1 - 1);
1954/// ...
1955/// } while(x != 0);
1956///
1957/// loop-exit:
1958/// \endcode
1959static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1960 Instruction *&CntInst, PHINode *&CntPhi,
1961 Value *&Var) {
  // step 1: Check to see if the loop-back branch matches this pattern:
  //    "if (a != 0) goto loop-entry".
1964 BasicBlock *LoopEntry;
1965 Instruction *DefX2, *CountInst;
1966 Value *VarX1, *VarX0;
1967 PHINode *PhiX, *CountPhi;
1968
1969 DefX2 = CountInst = nullptr;
1970 VarX1 = VarX0 = nullptr;
1971 PhiX = CountPhi = nullptr;
1972 LoopEntry = *(CurLoop->block_begin());
1973
1974 // step 1: Check if the loop-back branch is in desirable form.
1975 {
1976 if (Value *T = matchCondition(
1977 BI: dyn_cast<BranchInst>(Val: LoopEntry->getTerminator()), LoopEntry))
1978 DefX2 = dyn_cast<Instruction>(Val: T);
1979 else
1980 return false;
1981 }
1982
1983 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1984 {
1985 if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1986 return false;
1987
1988 BinaryOperator *SubOneOp;
1989
1990 if ((SubOneOp = dyn_cast<BinaryOperator>(Val: DefX2->getOperand(i: 0))))
1991 VarX1 = DefX2->getOperand(i: 1);
1992 else {
1993 VarX1 = DefX2->getOperand(i: 0);
1994 SubOneOp = dyn_cast<BinaryOperator>(Val: DefX2->getOperand(i: 1));
1995 }
1996 if (!SubOneOp || SubOneOp->getOperand(i_nocapture: 0) != VarX1)
1997 return false;
1998
1999 ConstantInt *Dec = dyn_cast<ConstantInt>(Val: SubOneOp->getOperand(i_nocapture: 1));
2000 if (!Dec ||
2001 !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
2002 (SubOneOp->getOpcode() == Instruction::Add &&
2003 Dec->isMinusOne()))) {
2004 return false;
2005 }
2006 }
2007
2008 // step 3: Check the recurrence of variable X
2009 PhiX = getRecurrenceVar(VarX: VarX1, DefX: DefX2, LoopEntry);
2010 if (!PhiX)
2011 return false;
2012
  // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
2014 {
2015 CountInst = nullptr;
2016 for (Instruction &Inst :
2017 llvm::make_range(x: LoopEntry->getFirstNonPHIIt(), y: LoopEntry->end())) {
2018 if (Inst.getOpcode() != Instruction::Add)
2019 continue;
2020
2021 ConstantInt *Inc = dyn_cast<ConstantInt>(Val: Inst.getOperand(i: 1));
2022 if (!Inc || !Inc->isOne())
2023 continue;
2024
2025 PHINode *Phi = getRecurrenceVar(VarX: Inst.getOperand(i: 0), DefX: &Inst, LoopEntry);
2026 if (!Phi)
2027 continue;
2028
      // Check if the result of the instruction is live out of the loop.
2030 bool LiveOutLoop = false;
2031 for (User *U : Inst.users()) {
2032 if ((cast<Instruction>(Val: U))->getParent() != LoopEntry) {
2033 LiveOutLoop = true;
2034 break;
2035 }
2036 }
2037
2038 if (LiveOutLoop) {
2039 CountInst = &Inst;
2040 CountPhi = Phi;
2041 break;
2042 }
2043 }
2044
2045 if (!CountInst)
2046 return false;
2047 }
2048
2049 // step 5: check if the precondition is in this form:
2050 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
2051 {
2052 auto *PreCondBr = dyn_cast<BranchInst>(Val: PreCondBB->getTerminator());
2053 Value *T = matchCondition(BI: PreCondBr, LoopEntry: CurLoop->getLoopPreheader());
2054 if (T != PhiX->getOperand(i_nocapture: 0) && T != PhiX->getOperand(i_nocapture: 1))
2055 return false;
2056
2057 CntInst = CountInst;
2058 CntPhi = CountPhi;
2059 Var = T;
2060 }
2061
2062 return true;
2063}
2064
2065/// Return true if the idiom is detected in the loop.
2066///
2067/// Additionally:
2068/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
2069/// or nullptr if there is no such.
2070/// 2) \p CntPhi is set to the corresponding phi node
2071/// or nullptr if there is no such.
2072/// 3) \p Var is set to the value whose CTLZ could be used.
2073/// 4) \p DefX is set to the instruction calculating Loop exit condition.
2074///
2075/// The core idiom we are trying to detect is:
2076/// \code
2077/// if (x0 == 0)
2078/// goto loop-exit // the precondition of the loop
2079/// cnt0 = init-val;
2080/// do {
2081/// x = phi (x0, x.next); //PhiX
2082/// cnt = phi(cnt0, cnt.next);
2083///
2084/// cnt.next = cnt + 1;
2085/// ...
2086/// x.next = x >> 1; // DefX
2087/// ...
2088/// } while(x.next != 0);
2089///
2090/// loop-exit:
2091/// \endcode
2092static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
2093 Intrinsic::ID &IntrinID, Value *&InitX,
2094 Instruction *&CntInst, PHINode *&CntPhi,
2095 Instruction *&DefX) {
2096 BasicBlock *LoopEntry;
2097 Value *VarX = nullptr;
2098
2099 DefX = nullptr;
2100 CntInst = nullptr;
2101 CntPhi = nullptr;
2102 LoopEntry = *(CurLoop->block_begin());
2103
2104 // step 1: Check if the loop-back branch is in desirable form.
2105 if (Value *T = matchCondition(
2106 BI: dyn_cast<BranchInst>(Val: LoopEntry->getTerminator()), LoopEntry))
2107 DefX = dyn_cast<Instruction>(Val: T);
2108 else
2109 return false;
2110
2111 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
2112 if (!DefX || !DefX->isShift())
2113 return false;
2114 IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
2115 Intrinsic::ctlz;
2116 ConstantInt *Shft = dyn_cast<ConstantInt>(Val: DefX->getOperand(i: 1));
2117 if (!Shft || !Shft->isOne())
2118 return false;
2119 VarX = DefX->getOperand(i: 0);
2120
2121 // step 3: Check the recurrence of variable X
2122 PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
2123 if (!PhiX)
2124 return false;
2125
2126 InitX = PhiX->getIncomingValueForBlock(BB: CurLoop->getLoopPreheader());
2127
  // Make sure the initial value can't be negative; otherwise the ashr in the
  // loop might never reach zero, which would make the loop infinite.
2130 if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(V: InitX, SQ: DL))
2131 return false;
2132
  // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
  //         or cnt.next = cnt + -1.
2135 // TODO: We can skip the step. If loop trip count is known (CTLZ),
2136 // then all uses of "cnt.next" could be optimized to the trip count
2137 // plus "cnt0". Currently it is not optimized.
2138 // This step could be used to detect POPCNT instruction:
2139 // cnt.next = cnt + (x.next & 1)
2140 for (Instruction &Inst :
2141 llvm::make_range(x: LoopEntry->getFirstNonPHIIt(), y: LoopEntry->end())) {
2142 if (Inst.getOpcode() != Instruction::Add)
2143 continue;
2144
2145 ConstantInt *Inc = dyn_cast<ConstantInt>(Val: Inst.getOperand(i: 1));
2146 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
2147 continue;
2148
2149 PHINode *Phi = getRecurrenceVar(VarX: Inst.getOperand(i: 0), DefX: &Inst, LoopEntry);
2150 if (!Phi)
2151 continue;
2152
2153 CntInst = &Inst;
2154 CntPhi = Phi;
2155 break;
2156 }
2157 if (!CntInst)
2158 return false;
2159
2160 return true;
2161}
2162
// Check if the CTLZ / CTTZ intrinsic is profitable. Assume it is always
// profitable if we can delete the loop.
2165bool LoopIdiomRecognize::isProfitableToInsertFFS(Intrinsic::ID IntrinID,
2166 Value *InitX, bool ZeroCheck,
2167 size_t CanonicalSize) {
2168 const Value *Args[] = {InitX,
2169 ConstantInt::getBool(Context&: InitX->getContext(), V: ZeroCheck)};
2170
  // @llvm.dbg intrinsics don't count, as they have no semantic effect.
2172 auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
2173 uint32_t HeaderSize =
2174 std::distance(first: InstWithoutDebugIt.begin(), last: InstWithoutDebugIt.end());
2175
2176 IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
2177 InstructionCost Cost = TTI->getIntrinsicInstrCost(
2178 ICA: Attrs, CostKind: TargetTransformInfo::TCK_SizeAndLatency);
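  // If the loop contains anything beyond the canonical idiom it will not be
  // deleted, so only transform when the intrinsic itself is cheap.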
2179 if (HeaderSize != CanonicalSize && Cost > TargetTransformInfo::TCC_Basic)
2180 return false;
2181
2182 return true;
2183}
2184
/// Convert a CTLZ / CTTZ idiom loop into a countable loop.
/// Returns true if a CTLZ / CTTZ intrinsic is inserted as the new trip count;
/// otherwise, returns false.
2188bool LoopIdiomRecognize::insertFFSIfProfitable(Intrinsic::ID IntrinID,
2189 Value *InitX, Instruction *DefX,
2190 PHINode *CntPhi,
2191 Instruction *CntInst) {
2192 bool IsCntPhiUsedOutsideLoop = false;
2193 for (User *U : CntPhi->users())
2194 if (!CurLoop->contains(Inst: cast<Instruction>(Val: U))) {
2195 IsCntPhiUsedOutsideLoop = true;
2196 break;
2197 }
2198 bool IsCntInstUsedOutsideLoop = false;
2199 for (User *U : CntInst->users())
2200 if (!CurLoop->contains(Inst: cast<Instruction>(Val: U))) {
2201 IsCntInstUsedOutsideLoop = true;
2202 break;
2203 }
2204 // If both CntInst and CntPhi are used outside the loop the profitability
2205 // is questionable.
2206 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
2207 return false;
2208
  // On some CPUs the result of the CTLZ(X) intrinsic is undefined when X is 0.
  // If we cannot guarantee X != 0, we need to check for this when expanding.
2212 bool ZeroCheck = false;
  // It is safe to assume the Preheader exists, as it was checked in the
  // parent function RunOnLoop.
2215 BasicBlock *PH = CurLoop->getLoopPreheader();
2216
  // If we are using the count instruction outside the loop, make sure we
  // have a zero check as a precondition. Without the check the loop would run
  // one iteration before any check of the input value. This means 0 and 1
  // would have identical behavior in the original loop, but the replacement
  // count of (bitwidth - ctlz(x)) cannot reproduce that for an input of zero.
2221 if (!IsCntPhiUsedOutsideLoop) {
2222 auto *PreCondBB = PH->getSinglePredecessor();
2223 if (!PreCondBB)
2224 return false;
2225 auto *PreCondBI = dyn_cast<BranchInst>(Val: PreCondBB->getTerminator());
2226 if (!PreCondBI)
2227 return false;
2228 if (matchCondition(BI: PreCondBI, LoopEntry: PH) != InitX)
2229 return false;
2230 ZeroCheck = true;
2231 }
2232
2233 // FFS idiom loop has only 6 instructions:
2234 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
2235 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
2236 // %shr = ashr %n.addr.0, 1
2237 // %tobool = icmp eq %shr, 0
2238 // %inc = add nsw %i.0, 1
2239 // br i1 %tobool
2240 size_t IdiomCanonicalSize = 6;
2241 if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, CanonicalSize: IdiomCanonicalSize))
2242 return false;
2243
2244 transformLoopToCountable(IntrinID, PreCondBB: PH, CntInst, CntPhi, Var: InitX, DefX,
2245 DL: DefX->getDebugLoc(), ZeroCheck,
2246 IsCntPhiUsedOutsideLoop);
2247 return true;
2248}
2249
/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
/// to countable (with CTLZ / CTTZ trip count). Returns true if a CTLZ / CTTZ
/// intrinsic is inserted as the new trip count; otherwise, returns false.
2253bool LoopIdiomRecognize::recognizeAndInsertFFS() {
2254 // Give up if the loop has multiple blocks or multiple backedges.
2255 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
2256 return false;
2257
2258 Intrinsic::ID IntrinID;
2259 Value *InitX;
2260 Instruction *DefX = nullptr;
2261 PHINode *CntPhi = nullptr;
2262 Instruction *CntInst = nullptr;
2263
2264 if (!detectShiftUntilZeroIdiom(CurLoop, DL: *DL, IntrinID, InitX, CntInst, CntPhi,
2265 DefX))
2266 return false;
2267
2268 return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
2269}
2270
2271bool LoopIdiomRecognize::recognizeShiftUntilLessThan() {
2272 // Give up if the loop has multiple blocks or multiple backedges.
2273 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
2274 return false;
2275
2276 Intrinsic::ID IntrinID;
2277 Value *InitX;
2278 Instruction *DefX = nullptr;
2279 PHINode *CntPhi = nullptr;
2280 Instruction *CntInst = nullptr;
2281
2282 APInt LoopThreshold;
2283 if (!detectShiftUntilLessThanIdiom(CurLoop, DL: *DL, IntrinID, InitX, CntInst,
2284 CntPhi, DefX, Threshold&: LoopThreshold))
2285 return false;
2286
2287 if (LoopThreshold == 2) {
2288 // Treat as regular FFS.
2289 return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
2290 }
2291
2292 // Look for Floor Log2 Idiom.
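  // That is, a loop guarded by "x >= 2" that shifts x right by one each
  // iteration and exits once the pre-shift value drops below 4; such a loop
  // runs exactly floor(log2(x)) iterations.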
2293 if (LoopThreshold != 4)
2294 return false;
2295
2296 // Abort if CntPhi is used outside of the loop.
2297 for (User *U : CntPhi->users())
2298 if (!CurLoop->contains(Inst: cast<Instruction>(Val: U)))
2299 return false;
2300
  // It is safe to assume the Preheader exists, as it was checked in the
  // parent function RunOnLoop.
2303 BasicBlock *PH = CurLoop->getLoopPreheader();
2304 auto *PreCondBB = PH->getSinglePredecessor();
2305 if (!PreCondBB)
2306 return false;
2307 auto *PreCondBI = dyn_cast<BranchInst>(Val: PreCondBB->getTerminator());
2308 if (!PreCondBI)
2309 return false;
2310
2311 APInt PreLoopThreshold;
2312 if (matchShiftULTCondition(BI: PreCondBI, LoopEntry: PH, Threshold&: PreLoopThreshold) != InitX ||
2313 PreLoopThreshold != 2)
2314 return false;
2315
2316 bool ZeroCheck = true;
2317
2318 // the loop has only 6 instructions:
2319 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
2320 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
2321 // %shr = ashr %n.addr.0, 1
2322 // %tobool = icmp ult %n.addr.0, C
2323 // %inc = add nsw %i.0, 1
2324 // br i1 %tobool
2325 size_t IdiomCanonicalSize = 6;
2326 if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, CanonicalSize: IdiomCanonicalSize))
2327 return false;
2328
2329 // log2(x) = w − 1 − clz(x)
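  // e.g. for a 32-bit x = 13 (0b1101): clz(x) = 28, so log2(x) = 31 - 28 = 3.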
2330 transformLoopToCountable(IntrinID, PreCondBB: PH, CntInst, CntPhi, Var: InitX, DefX,
2331 DL: DefX->getDebugLoc(), ZeroCheck,
2332 /*IsCntPhiUsedOutsideLoop=*/false,
2333 /*InsertSub=*/true);
2334 return true;
2335}
2336
2337/// Recognizes a population count idiom in a non-countable loop.
2338///
2339/// If detected, transforms the relevant code to issue the popcount intrinsic
2340/// function call, and returns true; otherwise, returns false.
2341bool LoopIdiomRecognize::recognizePopcount() {
2342 if (TTI->getPopcntSupport(IntTyWidthInBit: 32) != TargetTransformInfo::PSK_FastHardware)
2343 return false;
2344
  // Counting the population is usually done with a few arithmetic
  // instructions. Such instructions can be easily "absorbed" by vacant slots
  // in a non-compact loop. Therefore, recognizing the popcount idiom only
  // makes sense in a compact loop.
2349
2350 // Give up if the loop has multiple blocks or multiple backedges.
2351 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
2352 return false;
2353
2354 BasicBlock *LoopBody = *(CurLoop->block_begin());
2355 if (LoopBody->size() >= 20) {
2356 // The loop is too big, bail out.
2357 return false;
2358 }
2359
2360 // It should have a preheader containing nothing but an unconditional branch.
2361 BasicBlock *PH = CurLoop->getLoopPreheader();
2362 if (!PH || &PH->front() != PH->getTerminator())
2363 return false;
2364 auto *EntryBI = dyn_cast<BranchInst>(Val: PH->getTerminator());
2365 if (!EntryBI || EntryBI->isConditional())
2366 return false;
2367
2368 // It should have a precondition block where the generated popcount intrinsic
2369 // function can be inserted.
2370 auto *PreCondBB = PH->getSinglePredecessor();
2371 if (!PreCondBB)
2372 return false;
2373 auto *PreCondBI = dyn_cast<BranchInst>(Val: PreCondBB->getTerminator());
2374 if (!PreCondBI || PreCondBI->isUnconditional())
2375 return false;
2376
2377 Instruction *CntInst;
2378 PHINode *CntPhi;
2379 Value *Val;
2380 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Var&: Val))
2381 return false;
2382
2383 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Var: Val);
2384 return true;
2385}
2386
2387static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
2388 const DebugLoc &DL) {
2389 Value *Ops[] = {Val};
2390 Type *Tys[] = {Val->getType()};
2391
2392 CallInst *CI = IRBuilder.CreateIntrinsic(ID: Intrinsic::ctpop, Types: Tys, Args: Ops);
2393 CI->setDebugLoc(DL);
2394
2395 return CI;
2396}
2397
2398static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
2399 const DebugLoc &DL, bool ZeroCheck,
2400 Intrinsic::ID IID) {
2401 Value *Ops[] = {Val, IRBuilder.getInt1(V: ZeroCheck)};
2402 Type *Tys[] = {Val->getType()};
2403
2404 CallInst *CI = IRBuilder.CreateIntrinsic(ID: IID, Types: Tys, Args: Ops);
2405 CI->setDebugLoc(DL);
2406
2407 return CI;
2408}
2409
2410/// Transform the following loop (Using CTLZ, CTTZ is similar):
2411/// loop:
2412/// CntPhi = PHI [Cnt0, CntInst]
2413/// PhiX = PHI [InitX, DefX]
2414/// CntInst = CntPhi + 1
2415/// DefX = PhiX >> 1
2416/// LOOP_BODY
2417/// Br: loop if (DefX != 0)
2418/// Use(CntPhi) or Use(CntInst)
2419///
2420/// Into:
2421/// If CntPhi used outside the loop:
2422/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
2423/// Count = CountPrev + 1
2424/// else
2425/// Count = BitWidth(InitX) - CTLZ(InitX)
2426/// loop:
2427/// CntPhi = PHI [Cnt0, CntInst]
2428/// PhiX = PHI [InitX, DefX]
2429/// PhiCount = PHI [Count, Dec]
2430/// CntInst = CntPhi + 1
2431/// DefX = PhiX >> 1
2432/// Dec = PhiCount - 1
2433/// LOOP_BODY
2434/// Br: loop if (Dec != 0)
2435/// Use(CountPrev + Cnt0) // Use(CntPhi)
2436/// or
2437/// Use(Count + Cnt0) // Use(CntInst)
2438///
2439/// If LOOP_BODY is empty the loop will be deleted.
2440/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
2441void LoopIdiomRecognize::transformLoopToCountable(
2442 Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
2443 PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
2444 bool ZeroCheck, bool IsCntPhiUsedOutsideLoop, bool InsertSub) {
2445 BranchInst *PreheaderBr = cast<BranchInst>(Val: Preheader->getTerminator());
2446
2447 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
2448 IRBuilder<> Builder(PreheaderBr);
2449 Builder.SetCurrentDebugLocation(DL);
2450
  // If there are no uses of CntPhi, create:
  //   Count = BitWidth - CTLZ(InitX);
  //   NewCount = Count;
  // If there are uses of CntPhi, create:
  //   NewCount = BitWidth - CTLZ(InitX >> 1);
  //   Count = NewCount + 1;
2457 Value *InitXNext;
2458 if (IsCntPhiUsedOutsideLoop) {
2459 if (DefX->getOpcode() == Instruction::AShr)
2460 InitXNext = Builder.CreateAShr(LHS: InitX, RHS: 1);
2461 else if (DefX->getOpcode() == Instruction::LShr)
2462 InitXNext = Builder.CreateLShr(LHS: InitX, RHS: 1);
2463 else if (DefX->getOpcode() == Instruction::Shl) // cttz
2464 InitXNext = Builder.CreateShl(LHS: InitX, RHS: 1);
2465 else
2466 llvm_unreachable("Unexpected opcode!");
2467 } else
2468 InitXNext = InitX;
2469 Value *Count =
2470 createFFSIntrinsic(IRBuilder&: Builder, Val: InitXNext, DL, ZeroCheck, IID: IntrinID);
2471 Type *CountTy = Count->getType();
2472 Count = Builder.CreateSub(
2473 LHS: ConstantInt::get(Ty: CountTy, V: CountTy->getIntegerBitWidth()), RHS: Count);
2474 if (InsertSub)
2475 Count = Builder.CreateSub(LHS: Count, RHS: ConstantInt::get(Ty: CountTy, V: 1));
2476 Value *NewCount = Count;
2477 if (IsCntPhiUsedOutsideLoop)
2478 Count = Builder.CreateAdd(LHS: Count, RHS: ConstantInt::get(Ty: CountTy, V: 1));
2479
2480 NewCount = Builder.CreateZExtOrTrunc(V: NewCount, DestTy: CntInst->getType());
2481
2482 Value *CntInitVal = CntPhi->getIncomingValueForBlock(BB: Preheader);
2483 if (cast<ConstantInt>(Val: CntInst->getOperand(i: 1))->isOne()) {
2484 // If the counter was being incremented in the loop, add NewCount to the
2485 // counter's initial value, but only if the initial value is not zero.
2486 ConstantInt *InitConst = dyn_cast<ConstantInt>(Val: CntInitVal);
2487 if (!InitConst || !InitConst->isZero())
2488 NewCount = Builder.CreateAdd(LHS: NewCount, RHS: CntInitVal);
2489 } else {
2490 // If the count was being decremented in the loop, subtract NewCount from
2491 // the counter's initial value.
2492 NewCount = Builder.CreateSub(LHS: CntInitVal, RHS: NewCount);
2493 }
2494
2495 // Step 2: Insert new IV and loop condition:
2496 // loop:
2497 // ...
2498 // PhiCount = PHI [Count, Dec]
2499 // ...
2500 // Dec = PhiCount - 1
2501 // ...
2502 // Br: loop if (Dec != 0)
2503 BasicBlock *Body = *(CurLoop->block_begin());
2504 auto *LbBr = cast<BranchInst>(Val: Body->getTerminator());
2505 ICmpInst *LbCond = cast<ICmpInst>(Val: LbBr->getCondition());
2506
2507 PHINode *TcPhi = PHINode::Create(Ty: CountTy, NumReservedValues: 2, NameStr: "tcphi");
2508 TcPhi->insertBefore(InsertPos: Body->begin());
2509
2510 Builder.SetInsertPoint(LbCond);
2511 Instruction *TcDec = cast<Instruction>(Val: Builder.CreateSub(
2512 LHS: TcPhi, RHS: ConstantInt::get(Ty: CountTy, V: 1), Name: "tcdec", HasNUW: false, HasNSW: true));
2513
2514 TcPhi->addIncoming(V: Count, BB: Preheader);
2515 TcPhi->addIncoming(V: TcDec, BB: Body);
2516
2517 CmpInst::Predicate Pred =
2518 (LbBr->getSuccessor(i: 0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
2519 LbCond->setPredicate(Pred);
2520 LbCond->setOperand(i_nocapture: 0, Val_nocapture: TcDec);
2521 LbCond->setOperand(i_nocapture: 1, Val_nocapture: ConstantInt::get(Ty: CountTy, V: 0));
2522
2523 // Step 3: All the references to the original counter outside
2524 // the loop are replaced with the NewCount
2525 if (IsCntPhiUsedOutsideLoop)
2526 CntPhi->replaceUsesOutsideBlock(V: NewCount, BB: Body);
2527 else
2528 CntInst->replaceUsesOutsideBlock(V: NewCount, BB: Body);
2529
2530 // step 4: Forget the "non-computable" trip-count SCEV associated with the
2531 // loop. The loop would otherwise not be deleted even if it becomes empty.
2532 SE->forgetLoop(L: CurLoop);
2533}
2534
2535void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
2536 Instruction *CntInst,
2537 PHINode *CntPhi, Value *Var) {
2538 BasicBlock *PreHead = CurLoop->getLoopPreheader();
2539 auto *PreCondBr = cast<BranchInst>(Val: PreCondBB->getTerminator());
2540 const DebugLoc &DL = CntInst->getDebugLoc();
2541
  // Assuming that before the transformation the loop is as follows:
  //  if (x) // the precondition
  //    do { cnt++; x &= x - 1; } while(x);
2545
2546 // Step 1: Insert the ctpop instruction at the end of the precondition block
2547 IRBuilder<> Builder(PreCondBr);
2548 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
2549 {
2550 PopCnt = createPopcntIntrinsic(IRBuilder&: Builder, Val: Var, DL);
2551 NewCount = PopCntZext =
2552 Builder.CreateZExtOrTrunc(V: PopCnt, DestTy: cast<IntegerType>(Val: CntPhi->getType()));
2553
2554 if (NewCount != PopCnt)
2555 (cast<Instruction>(Val: NewCount))->setDebugLoc(DL);
2556
2557 // TripCnt is exactly the number of iterations the loop has
2558 TripCnt = NewCount;
2559
2560 // If the population counter's initial value is not zero, insert Add Inst.
2561 Value *CntInitVal = CntPhi->getIncomingValueForBlock(BB: PreHead);
2562 ConstantInt *InitConst = dyn_cast<ConstantInt>(Val: CntInitVal);
2563 if (!InitConst || !InitConst->isZero()) {
2564 NewCount = Builder.CreateAdd(LHS: NewCount, RHS: CntInitVal);
2565 (cast<Instruction>(Val: NewCount))->setDebugLoc(DL);
2566 }
2567 }
2568
  // Step 2: Replace the precondition "if (x == 0) goto loop-exit" with
  //   "if (NewCount == 0) goto loop-exit". Without this change, the intrinsic
  //   call would be partially dead code, and downstream passes would drag
  //   it back from the precondition block to the preheader.
2573 {
2574 ICmpInst *PreCond = cast<ICmpInst>(Val: PreCondBr->getCondition());
2575
2576 Value *Opnd0 = PopCntZext;
2577 Value *Opnd1 = ConstantInt::get(Ty: PopCntZext->getType(), V: 0);
2578 if (PreCond->getOperand(i_nocapture: 0) != Var)
2579 std::swap(a&: Opnd0, b&: Opnd1);
2580
2581 ICmpInst *NewPreCond = cast<ICmpInst>(
2582 Val: Builder.CreateICmp(P: PreCond->getPredicate(), LHS: Opnd0, RHS: Opnd1));
2583 PreCondBr->setCondition(NewPreCond);
2584
2585 RecursivelyDeleteTriviallyDeadInstructions(V: PreCond, TLI);
2586 }
2587
  // Step 3: Note that the population count is exactly the trip count of the
  // loop in question, which enables us to convert the loop from a noncountable
  // loop into a countable one. The benefit is twofold:
  //
  //  - If the loop only counts population, the entire loop becomes dead after
  //    the transformation. It is a lot easier to prove a countable loop dead
  //    than to prove a noncountable one. (In some C dialects, an infinite loop
  //    isn't dead even if it computes nothing useful. In general, DCE needs
  //    to prove a noncountable loop finite before safely deleting it.)
2597 //
2598 // - If the loop also performs something else, it remains alive.
2599 // Since it is transformed to countable form, it can be aggressively
2600 // optimized by some optimizations which are in general not applicable
2601 // to a noncountable loop.
2602 //
  // After this step, this loop (conceptually) would look like the following:
  //   newcnt = __builtin_ctpop(x);
  //   t = newcnt;
  //   if (x)
  //     do { cnt++; x &= x-1; t--; } while (t > 0);
2608 BasicBlock *Body = *(CurLoop->block_begin());
2609 {
2610 auto *LbBr = cast<BranchInst>(Val: Body->getTerminator());
2611 ICmpInst *LbCond = cast<ICmpInst>(Val: LbBr->getCondition());
2612 Type *Ty = TripCnt->getType();
2613
2614 PHINode *TcPhi = PHINode::Create(Ty, NumReservedValues: 2, NameStr: "tcphi");
2615 TcPhi->insertBefore(InsertPos: Body->begin());
2616
2617 Builder.SetInsertPoint(LbCond);
2618 Instruction *TcDec = cast<Instruction>(
2619 Val: Builder.CreateSub(LHS: TcPhi, RHS: ConstantInt::get(Ty, V: 1),
2620 Name: "tcdec", HasNUW: false, HasNSW: true));
2621
2622 TcPhi->addIncoming(V: TripCnt, BB: PreHead);
2623 TcPhi->addIncoming(V: TcDec, BB: Body);
2624
2625 CmpInst::Predicate Pred =
2626 (LbBr->getSuccessor(i: 0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
2627 LbCond->setPredicate(Pred);
2628 LbCond->setOperand(i_nocapture: 0, Val_nocapture: TcDec);
2629 LbCond->setOperand(i_nocapture: 1, Val_nocapture: ConstantInt::get(Ty, V: 0));
2630 }
2631
2632 // Step 4: All the references to the original population counter outside
2633 // the loop are replaced with the NewCount -- the value returned from
2634 // __builtin_ctpop().
2635 CntInst->replaceUsesOutsideBlock(V: NewCount, BB: Body);
2636
2637 // step 5: Forget the "non-computable" trip-count SCEV associated with the
2638 // loop. The loop would otherwise not be deleted even if it becomes empty.
2639 SE->forgetLoop(L: CurLoop);
2640}
2641
2642/// Match loop-invariant value.
2643template <typename SubPattern_t> struct match_LoopInvariant {
2644 SubPattern_t SubPattern;
2645 const Loop *L;
2646
2647 match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
2648 : SubPattern(SP), L(L) {}
2649
2650 template <typename ITy> bool match(ITy *V) const {
2651 return L->isLoopInvariant(V) && SubPattern.match(V);
2652 }
2653};
2654
2655/// Matches if the value is loop-invariant.
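/// For example, m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)), CurLoop)
/// matches a `shl i32 1, %bitpos` whose result is invariant in CurLoop.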
2656template <typename Ty>
2657inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
2658 return match_LoopInvariant<Ty>(M, L);
2659}
2660
2661/// Return true if the idiom is detected in the loop.
2662///
2663/// The core idiom we are trying to detect is:
2664/// \code
2665/// entry:
2666/// <...>
2667/// %bitmask = shl i32 1, %bitpos
2668/// br label %loop
2669///
2670/// loop:
2671/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2672/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2673/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2674/// %x.next = shl i32 %x.curr, 1
2675/// <...>
2676/// br i1 %x.curr.isbitunset, label %loop, label %end
2677///
2678/// end:
2679/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2680/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2681/// <...>
2682/// \endcode
2683static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
2684 Value *&BitMask, Value *&BitPos,
2685 Value *&CurrX, Instruction *&NextX) {
2686 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2687 " Performing shift-until-bittest idiom detection.\n");
2688
2689 // Give up if the loop has multiple blocks or multiple backedges.
2690 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2691 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2692 return false;
2693 }
2694
2695 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2696 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2697 assert(LoopPreheaderBB && "There is always a loop preheader.");
2698
2699 using namespace PatternMatch;
2700
2701 // Step 1: Check if the loop backedge is in desirable form.
2702
2703 CmpPredicate Pred;
2704 Value *CmpLHS, *CmpRHS;
2705 BasicBlock *TrueBB, *FalseBB;
2706 if (!match(V: LoopHeaderBB->getTerminator(),
2707 P: m_Br(C: m_ICmp(Pred, L: m_Value(V&: CmpLHS), R: m_Value(V&: CmpRHS)),
2708 T: m_BasicBlock(V&: TrueBB), F: m_BasicBlock(V&: FalseBB)))) {
2709 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2710 return false;
2711 }
2712
2713 // Step 2: Check if the backedge's condition is in desirable form.
2714
2715 auto MatchVariableBitMask = [&]() {
2716 return ICmpInst::isEquality(P: Pred) && match(V: CmpRHS, P: m_Zero()) &&
2717 match(V: CmpLHS,
2718 P: m_c_And(L: m_Value(V&: CurrX),
2719 R: m_CombineAnd(
2720 L: m_Value(V&: BitMask),
2721 R: m_LoopInvariant(M: m_Shl(L: m_One(), R: m_Value(V&: BitPos)),
2722 L: CurLoop))));
2723 };
2724
2725 auto MatchDecomposableConstantBitMask = [&]() {
2726 auto Res = llvm::decomposeBitTestICmp(
2727 LHS: CmpLHS, RHS: CmpRHS, Pred, /*LookThroughTrunc=*/true,
2728 /*AllowNonZeroC=*/false, /*DecomposeAnd=*/true);
2729 if (Res && Res->Mask.isPowerOf2()) {
2730 assert(ICmpInst::isEquality(Res->Pred));
2731 Pred = Res->Pred;
2732 CurrX = Res->X;
2733 BitMask = ConstantInt::get(Ty: CurrX->getType(), V: Res->Mask);
2734 BitPos = ConstantInt::get(Ty: CurrX->getType(), V: Res->Mask.logBase2());
2735 return true;
2736 }
2737 return false;
2738 };
2739
2740 if (!MatchVariableBitMask() && !MatchDecomposableConstantBitMask()) {
2741 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2742 return false;
2743 }
2744
2745 // Step 3: Check if the recurrence is in desirable form.
2746 auto *CurrXPN = dyn_cast<PHINode>(Val: CurrX);
2747 if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
2748 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2749 return false;
2750 }
2751
2752 BaseX = CurrXPN->getIncomingValueForBlock(BB: LoopPreheaderBB);
2753 NextX =
2754 dyn_cast<Instruction>(Val: CurrXPN->getIncomingValueForBlock(BB: LoopHeaderBB));
2755
2756 assert(CurLoop->isLoopInvariant(BaseX) &&
2757 "Expected BaseX to be available in the preheader!");
2758
2759 if (!NextX || !match(V: NextX, P: m_Shl(L: m_Specific(V: CurrX), R: m_One()))) {
2760 // FIXME: support right-shift?
2761 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2762 return false;
2763 }
2764
2765 // Step 4: Check if the backedge's destinations are in desirable form.
2766
2767 assert(ICmpInst::isEquality(Pred) &&
2768 "Should only get equality predicates here.");
2769
2770 // cmp-br is commutative, so canonicalize to a single variant.
2771 if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2772 Pred = ICmpInst::getInversePredicate(pred: Pred);
2773 std::swap(a&: TrueBB, b&: FalseBB);
2774 }
2775
2776 // We expect to exit loop when comparison yields false,
2777 // so when it yields true we should branch back to loop header.
2778 if (TrueBB != LoopHeaderBB) {
2779 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2780 return false;
2781 }
2782
2783 // Okay, idiom checks out.
2784 return true;
2785}
2786
2787/// Look for the following loop:
2788/// \code
2789/// entry:
2790/// <...>
2791/// %bitmask = shl i32 1, %bitpos
2792/// br label %loop
2793///
2794/// loop:
2795/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2796/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2797/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2798/// %x.next = shl i32 %x.curr, 1
2799/// <...>
2800/// br i1 %x.curr.isbitunset, label %loop, label %end
2801///
2802/// end:
2803/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2804/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2805/// <...>
2806/// \endcode
2807///
2808/// And transform it into:
2809/// \code
2810/// entry:
2811/// %bitmask = shl i32 1, %bitpos
2812/// %lowbitmask = add i32 %bitmask, -1
2813/// %mask = or i32 %lowbitmask, %bitmask
2814/// %x.masked = and i32 %x, %mask
2815/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
2816/// i1 true)
2817/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
2818/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
2819/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
2820/// %tripcount = add i32 %backedgetakencount, 1
2821/// %x.curr = shl i32 %x, %backedgetakencount
2822/// %x.next = shl i32 %x, %tripcount
2823/// br label %loop
2824///
2825/// loop:
2826/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
2827/// %loop.iv.next = add nuw i32 %loop.iv, 1
2828/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
2829/// <...>
2830/// br i1 %loop.ivcheck, label %end, label %loop
2831///
2832/// end:
2833/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2834/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2835/// <...>
2836/// \endcode
2837bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
2838 bool MadeChange = false;
2839
2840 Value *X, *BitMask, *BitPos, *XCurr;
2841 Instruction *XNext;
2842 if (!detectShiftUntilBitTestIdiom(CurLoop, BaseX&: X, BitMask, BitPos, CurrX&: XCurr,
2843 NextX&: XNext)) {
2844 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2845 " shift-until-bittest idiom detection failed.\n");
2846 return MadeChange;
2847 }
2848 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
2849
2850 // Ok, it is the idiom we were looking for, we *could* transform this loop,
2851 // but is it profitable to transform?
2852
2853 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2854 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2855 assert(LoopPreheaderBB && "There is always a loop preheader.");
2856
2857 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2858 assert(SuccessorBB && "There is only a single successor.");
2859
2860 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2861 Builder.SetCurrentDebugLocation(cast<Instruction>(Val: XCurr)->getDebugLoc());
2862
2863 Intrinsic::ID IntrID = Intrinsic::ctlz;
2864 Type *Ty = X->getType();
2865 unsigned Bitwidth = Ty->getScalarSizeInBits();
2866
2867 TargetTransformInfo::TargetCostKind CostKind =
2868 TargetTransformInfo::TCK_SizeAndLatency;
2869
  // The rewrite is considered unprofitable if and only if the intrinsic/shift
  // we'll use are not cheap. Note that we are okay with *just* making the loop
  // countable, even if nothing else changes.
2873 IntrinsicCostAttributes Attrs(
2874 IntrID, Ty, {PoisonValue::get(T: Ty), /*is_zero_poison=*/Builder.getTrue()});
2875 InstructionCost Cost = TTI->getIntrinsicInstrCost(ICA: Attrs, CostKind);
2876 if (Cost > TargetTransformInfo::TCC_Basic) {
2877 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2878 " Intrinsic is too costly, not beneficial\n");
2879 return MadeChange;
2880 }
2881 if (TTI->getArithmeticInstrCost(Opcode: Instruction::Shl, Ty, CostKind) >
2882 TargetTransformInfo::TCC_Basic) {
2883 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
2884 return MadeChange;
2885 }
2886
2887 // Ok, transform appears worthwhile.
2888 MadeChange = true;
2889
2890 if (!isGuaranteedNotToBeUndefOrPoison(V: BitPos)) {
    // BitMask may be computed from BitPos; freeze BitPos so that we can
    // increase its use count.
2893 std::optional<BasicBlock::iterator> InsertPt = std::nullopt;
2894 if (auto *BitPosI = dyn_cast<Instruction>(Val: BitPos))
2895 InsertPt = BitPosI->getInsertionPointAfterDef();
2896 else
2897 InsertPt = DT->getRoot()->getFirstNonPHIOrDbgOrAlloca();
2898 if (!InsertPt)
2899 return false;
2900 FreezeInst *BitPosFrozen =
2901 new FreezeInst(BitPos, BitPos->getName() + ".fr", *InsertPt);
2902 BitPos->replaceUsesWithIf(New: BitPosFrozen, ShouldReplace: [BitPosFrozen](Use &U) {
2903 return U.getUser() != BitPosFrozen;
2904 });
2905 BitPos = BitPosFrozen;
2906 }
2907
2908 // Step 1: Compute the loop trip count.
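  // For example, with i8 %x = 0b00000010 and %bitpos = 5: %mask = 0b00111111,
  // %x.masked = 0b00000010 has 6 leading zeros, so numactivebits = 2 and
  // leadingonepos = 1; the backedge-taken count is 5 - 1 = 4 and the trip
  // count is 5, matching the five iterations the original loop needs to shift
  // bit 1 up to bit 5.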

  Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
                                        BitPos->getName() + ".lowbitmask");
  Value *Mask =
      Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
  Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
  CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
      IntrID, Ty, {XMasked, /*is_zero_poison=*/Builder.getTrue()},
      /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
  Value *XMaskedNumActiveBits = Builder.CreateSub(
      ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
      XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
      /*HasNSW=*/Bitwidth != 2);
  Value *XMaskedLeadingOnePos =
      Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
                        XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
                        /*HasNSW=*/Bitwidth > 2);

  Value *LoopBackedgeTakenCount = Builder.CreateSub(
      BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
      /*HasNUW=*/true, /*HasNSW=*/true);
  // We know the loop's backedge-taken count, but what's the loop's trip count?
  // Note that NUW is always safe, while NSW is only safe for bitwidths != 2.
  Value *LoopTripCount =
      Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
                        CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
                        /*HasNSW=*/Bitwidth != 2);
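  // For example (illustrative), with i8 %x = 0b00000011 and bit position 5:
  // the masked value's highest set bit is at position 1, so the backedge is
  // taken 5 - 1 = 4 times and the trip count is 5; the recurrence's final
  // value is then %x << 4 = 0b00110000, whose bit 5 is indeed set.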

  // Step 2: Compute the recurrence's final value without a loop.

  // NewX is always safe to compute, because `LoopBackedgeTakenCount`
  // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
  Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
  NewX->takeName(XCurr);
  if (auto *I = dyn_cast<Instruction>(NewX))
    I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);

  Value *NewXNext;
  // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
  // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
  // iff `BitPos` is `bitwidth(X) - 1` and `X` is `1`). So unless we know
  // that isn't the case, we'll need to emit an alternative, safe IR.
  if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
      PatternMatch::match(
          BitPos, PatternMatch::m_SpecificInt_ICMP(
                      ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
                                               Ty->getScalarSizeInBits() - 1))))
    NewXNext = Builder.CreateShl(X, LoopTripCount);
  else {
    // Otherwise, just additionally shift by one. It's the smallest solution;
    // alternatively, we could check that NewX is INT_MIN (or that BitPos is
    // bitwidth(X) - 1) and select 0 instead.
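    // For example (illustrative), for i8 %x = 1 and %bitpos = 7 the loop runs
    // 8 times; `%x << 8` would be poison, whereas `(%x << 7) << 1` wraps to 0,
    // matching what the final loop iteration would have computed.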
    NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
  }

  NewXNext->takeName(XNext);
  if (auto *I = dyn_cast<Instruction>(NewXNext))
    I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);

  // Step 3: Adjust the successor basic block to receive the computed
  // recurrence's final value instead of the recurrence itself.

  XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
  XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);

  // Step 4: Rewrite the loop into a countable form, with canonical IV.

  // The new canonical induction variable.
  Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin());
  auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");

  // The induction itself.
  // Note that NUW is always safe, while NSW is only safe for bitwidths != 2.
  Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
  auto *IVNext =
      Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
                        /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);

  // The loop trip count check.
  auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
                                       CurLoop->getName() + ".ivcheck");
  Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
  LoopHeaderBB->getTerminator()->eraseFromParent();

  // Populate the IV PHI.
  IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
  IV->addIncoming(IVNext, LoopHeaderBB);
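
  // After this, the loop header has the canonical shape (sketch, names
  // illustrative):
  //   %iv = phi i8 [ 0, %preheader ], [ %iv.next, %header ]
  //     ...
  //   %iv.next = add i8 %iv, 1
  //   %ivcheck = icmp eq i8 %iv.next, %tripcount
  //   br i1 %ivcheck, label %exit, label %header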

  // Step 5: Forget the "non-computable" trip-count SCEV associated with the
  // loop. The loop would otherwise not be deleted even if it becomes empty.

  SE->forgetLoop(CurLoop);

  // Other passes will take care of actually deleting the loop if possible.

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");

  ++NumShiftUntilBitTest;
  return MadeChange;
}

/// Return true if the idiom is detected in the loop.
///
/// The core idiom we are trying to detect is:
/// \code
///   entry:
///     <...>
///     %start = <...>
///     %extraoffset = <...>
///     <...>
///     br label %for.cond
///
///   loop:
///     %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
///     %nbits = add nsw i8 %iv, %extraoffset
///     %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
///     %val.shifted.iszero = icmp eq i8 %val.shifted, 0
///     %iv.next = add i8 %iv, 1
///     <...>
///     br i1 %val.shifted.iszero, label %end, label %loop
///
///   end:
///     %iv.res = phi i8 [ %iv, %loop ] <...>
///     %nbits.res = phi i8 [ %nbits, %loop ] <...>
///     %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
///     %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
///     %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
///     <...>
/// \endcode
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
                                      Instruction *&ValShiftedIsZero,
                                      Intrinsic::ID &IntrinID, Instruction *&IV,
                                      Value *&Start, Value *&Val,
                                      const SCEV *&ExtraOffsetExpr,
                                      bool &InvertedCond) {
  LLVM_DEBUG(dbgs() << DEBUG_TYPE
                    " Performing shift-until-zero idiom detection.\n");

  // Give up if the loop has multiple blocks or multiple backedges.
  if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
    return false;
  }

  Instruction *ValShifted, *NBits, *IVNext;
  Value *ExtraOffset;

  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
  assert(LoopPreheaderBB && "There is always a loop preheader.");

  using namespace PatternMatch;

  // Step 1: Check if the loop backedge and its condition are in desirable
  // form.

  CmpPredicate Pred;
  BasicBlock *TrueBB, *FalseBB;
  if (!match(LoopHeaderBB->getTerminator(),
             m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
                  m_BasicBlock(FalseBB))) ||
      !match(ValShiftedIsZero,
             m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
      !ICmpInst::isEquality(Pred)) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
    return false;
  }

  // Step 2: Check if the comparison's operand is in desirable form.
  // FIXME: Val could be a one-input PHI node, which we should look past.
  if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
                                 m_Instruction(NBits)))) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparisons value computation.\n");
    return false;
  }
  IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
                                                         : Intrinsic::ctlz;
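
  // A left shift zeroes the value once every bit at or above its lowest set
  // bit has been shifted out, so the iteration count is governed by
  // cttz(Val); for right shifts it is the highest set bit that matters,
  // hence ctlz(Val).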

  // Step 3: Check if the shift amount is in desirable form.

  if (match(NBits, m_c_Add(m_Instruction(IV),
                           m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
      (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
    ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
  else if (match(NBits,
                 m_Sub(m_Instruction(IV),
                       m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
           NBits->hasNoSignedWrap())
    ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
  else {
    IV = NBits;
    ExtraOffsetExpr = SE->getZero(NBits->getType());
  }
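
  // In all three cases ExtraOffsetExpr is the amount that must later be
  // *added* to the value's active-bit count to obtain the final IV value
  // (which is why the `add` form records the offset negated).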

  // Step 4: Check if the recurrence is in desirable form.
  auto *IVPN = dyn_cast<PHINode>(IV);
  if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
    return false;
  }

  Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
  IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));

  if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
    return false;
  }

  // Step 5: Check if the backedge's destinations are in desirable form.

  assert(ICmpInst::isEquality(Pred) &&
         "Should only get equality predicates here.");

  // cmp-br is commutative, so canonicalize to a single variant.
  InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
  if (InvertedCond) {
    Pred = ICmpInst::getInversePredicate(Pred);
    std::swap(TrueBB, FalseBB);
  }

  // We expect to exit the loop when the comparison yields true,
  // so when it yields false we should branch back to the loop header.
  if (FalseBB != LoopHeaderBB) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
    return false;
  }

  // The new, countable loop will certainly only run a known number of
  // iterations; it won't be infinite. But the old loop might be infinite
  // under certain conditions. For logical shifts, the value will become zero
  // after at most bitwidth(%Val) loop iterations. However, for an arithmetic
  // right-shift, if the sign bit was set, the value will never become zero,
  // and the loop may never finish.
  if (ValShifted->getOpcode() == Instruction::AShr &&
      !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
    return false;
  }

  // Okay, idiom checks out.
  return true;
}

/// Look for the following loop:
/// \code
///   entry:
///     <...>
///     %start = <...>
///     %extraoffset = <...>
///     <...>
///     br label %for.cond
///
///   loop:
///     %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
///     %nbits = add nsw i8 %iv, %extraoffset
///     %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
///     %val.shifted.iszero = icmp eq i8 %val.shifted, 0
///     %iv.next = add i8 %iv, 1
///     <...>
///     br i1 %val.shifted.iszero, label %end, label %loop
///
///   end:
///     %iv.res = phi i8 [ %iv, %loop ] <...>
///     %nbits.res = phi i8 [ %nbits, %loop ] <...>
///     %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
///     %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
///     %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
///     <...>
/// \endcode
///
/// And transform it into:
/// \code
///   entry:
///     <...>
///     %start = <...>
///     %extraoffset = <...>
///     <...>
///     %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
///     %val.numactivebits = sub i8 8, %val.numleadingzeros
///     %extraoffset.neg = sub i8 0, %extraoffset
///     %tmp = add i8 %val.numactivebits, %extraoffset.neg
///     %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
///     %loop.tripcount = sub i8 %iv.final, %start
///     br label %loop
///
///   loop:
///     %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
///     %loop.iv.next = add i8 %loop.iv, 1
///     %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
///     %iv = add i8 %loop.iv, %start
///     <...>
///     br i1 %loop.ivcheck, label %end, label %loop
///
///   end:
///     %iv.res = phi i8 [ %iv.final, %loop ] <...>
///     <...>
/// \endcode
bool LoopIdiomRecognize::recognizeShiftUntilZero() {
  bool MadeChange = false;

  Instruction *ValShiftedIsZero;
  Intrinsic::ID IntrID;
  Instruction *IV;
  Value *Start, *Val;
  const SCEV *ExtraOffsetExpr;
  bool InvertedCond;
  if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
                                 Start, Val, ExtraOffsetExpr, InvertedCond)) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE
                      " shift-until-zero idiom detection failed.\n");
    return MadeChange;
  }
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");

  // Ok, it is the idiom we were looking for; we *could* transform this loop,
  // but is it profitable to transform?

  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
  assert(LoopPreheaderBB && "There is always a loop preheader.");

  BasicBlock *SuccessorBB = CurLoop->getExitBlock();
  assert(SuccessorBB && "There is only a single successor.");

  IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
  Builder.SetCurrentDebugLocation(IV->getDebugLoc());

  Type *Ty = Val->getType();
  unsigned Bitwidth = Ty->getScalarSizeInBits();

  TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_SizeAndLatency;

  // The rewrite is considered to be unprofitable if and only if the
  // intrinsic we'll use is not cheap. Note that we are okay with *just*
  // making the loop countable, even if nothing else changes.
  IntrinsicCostAttributes Attrs(
      IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getFalse()});
  InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
  if (Cost > TargetTransformInfo::TCC_Basic) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE
                      " Intrinsic is too costly, not beneficial\n");
    return MadeChange;
  }

  // Ok, transform appears worthwhile.
  MadeChange = true;

  bool OffsetIsZero = ExtraOffsetExpr->isZero();

  // Step 1: Compute the loop's final IV value / trip count.

  CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
      IntrID, Ty, {Val, /*is_zero_poison=*/Builder.getFalse()},
      /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
  Value *ValNumActiveBits = Builder.CreateSub(
      ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
      Val->getName() + ".numactivebits", /*HasNUW=*/true,
      /*HasNSW=*/Bitwidth != 2);

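  // ExtraOffsetExpr is a SCEV (the original offset, possibly negated, or
  // zero), so materialize it as IR in the preheader before adding it to the
  // active-bit count.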
  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  Expander.setInsertPoint(&*Builder.GetInsertPoint());
  Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);

  Value *ValNumActiveBitsOffset = Builder.CreateAdd(
      ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
      /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true);
  Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
                                           {ValNumActiveBitsOffset, Start},
                                           /*FMFSource=*/nullptr, "iv.final");

  auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
      IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
      /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true));
  // FIXME: or when the offset was `add nuw`

  // We know the loop's backedge-taken count, but what's the loop's trip count?
  Value *LoopTripCount =
      Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
                        CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
                        /*HasNSW=*/Bitwidth != 2);
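
  // For example (illustrative), for `lshr i8 %val, %nbits` with
  // %val = 0b00011000, %start = 0, and %extraoffset = 0: ctlz(%val) = 3, so
  // %val.numactivebits = 5, %iv.final = smax(5, 0) = 5, the backedge is taken
  // 5 times, and the trip count is 6.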

  // Step 2: Adjust the successor basic block to receive the original
  // induction variable's final value instead of the original IV itself.

  IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);

  // Step 3: Rewrite the loop into a countable form, with canonical IV.

  // The new canonical induction variable.
  Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin());
  auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");

  // The induction itself.
  Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->getFirstNonPHIIt());
  auto *CIVNext =
      Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
                        /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);

  // The loop trip count check.
  auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
                                        CurLoop->getName() + ".ivcheck");
  auto *NewIVCheck = CIVCheck;
  if (InvertedCond) {
    NewIVCheck = Builder.CreateNot(CIVCheck);
    NewIVCheck->takeName(ValShiftedIsZero);
  }

  // The original IV, but rebased to be an offset to the CIV.
  auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", /*HasNUW=*/false,
                                     /*HasNSW=*/true); // FIXME: what about NUW?
  IVDePHId->takeName(IV);
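
  // The original IV's remaining in-loop uses will be redirected to
  // `%loop.iv + %start` in Step 5 below, which allows the old PHI node itself
  // to be erased.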

  // The loop terminator.
  Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
  Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
  LoopHeaderBB->getTerminator()->eraseFromParent();

  // Populate the IV PHI.
  CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
  CIV->addIncoming(CIVNext, LoopHeaderBB);

  // Step 4: Forget the "non-computable" trip-count SCEV associated with the
  // loop. The loop would otherwise not be deleted even if it becomes empty.

  SE->forgetLoop(CurLoop);

  // Step 5: Try to clean up the loop's body somewhat.
  IV->replaceAllUsesWith(IVDePHId);
  IV->eraseFromParent();

  ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
  ValShiftedIsZero->eraseFromParent();

  // Other passes will take care of actually deleting the loop if possible.

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");

  ++NumShiftUntilZero;
  return MadeChange;
}
