LoopIdiomRecognize.cpp source code [llvm_projects/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp]

1	//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This pass implements an idiom recognizer that transforms simple loops into a
10	// non-loop form. In cases that this kicks in, it can be a significant
11	// performance win.
12	//
13	// If compiling for code size we avoid idiom recognition if the resulting
14	// code could be larger than the code for the original loop. One way this could
15	// happen is if the loop is not removable after idiom recognition due to the
16	// presence of non-idiom instructions. The initial implementation of the
17	// heuristics applies to idioms in multi-block loops.
18	//
19	//===----------------------------------------------------------------------===//
20	//
21	// TODO List:
22	//
23	// Future loop memory idioms to recognize:
24	// memcmp, strlen, etc.
25	//
26	// This could recognize common matrix multiplies and dot product idioms and
27	// replace them with calls to BLAS (if linked in??).
28	//
29	//===----------------------------------------------------------------------===//
30
31	#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
32	#include "llvm/ADT/APInt.h"
33	#include "llvm/ADT/ArrayRef.h"
34	#include "llvm/ADT/DenseMap.h"
35	#include "llvm/ADT/MapVector.h"
36	#include "llvm/ADT/SetVector.h"
37	#include "llvm/ADT/SmallPtrSet.h"
38	#include "llvm/ADT/SmallVector.h"
39	#include "llvm/ADT/Statistic.h"
40	#include "llvm/ADT/StringRef.h"
41	#include "llvm/Analysis/AliasAnalysis.h"
42	#include "llvm/Analysis/CmpInstAnalysis.h"
43	#include "llvm/Analysis/LoopAccessAnalysis.h"
44	#include "llvm/Analysis/LoopInfo.h"
45	#include "llvm/Analysis/LoopPass.h"
46	#include "llvm/Analysis/MemoryLocation.h"
47	#include "llvm/Analysis/MemorySSA.h"
48	#include "llvm/Analysis/MemorySSAUpdater.h"
49	#include "llvm/Analysis/MustExecute.h"
50	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
51	#include "llvm/Analysis/ScalarEvolution.h"
52	#include "llvm/Analysis/ScalarEvolutionExpressions.h"
53	#include "llvm/Analysis/TargetLibraryInfo.h"
54	#include "llvm/Analysis/TargetTransformInfo.h"
55	#include "llvm/Analysis/ValueTracking.h"
56	#include "llvm/IR/BasicBlock.h"
57	#include "llvm/IR/Constant.h"
58	#include "llvm/IR/Constants.h"
59	#include "llvm/IR/DataLayout.h"
60	#include "llvm/IR/DebugLoc.h"
61	#include "llvm/IR/DerivedTypes.h"
62	#include "llvm/IR/Dominators.h"
63	#include "llvm/IR/GlobalValue.h"
64	#include "llvm/IR/GlobalVariable.h"
65	#include "llvm/IR/IRBuilder.h"
66	#include "llvm/IR/InstrTypes.h"
67	#include "llvm/IR/Instruction.h"
68	#include "llvm/IR/Instructions.h"
69	#include "llvm/IR/IntrinsicInst.h"
70	#include "llvm/IR/Intrinsics.h"
71	#include "llvm/IR/LLVMContext.h"
72	#include "llvm/IR/Module.h"
73	#include "llvm/IR/PassManager.h"
74	#include "llvm/IR/PatternMatch.h"
75	#include "llvm/IR/Type.h"
76	#include "llvm/IR/User.h"
77	#include "llvm/IR/Value.h"
78	#include "llvm/IR/ValueHandle.h"
79	#include "llvm/Support/Casting.h"
80	#include "llvm/Support/CommandLine.h"
81	#include "llvm/Support/Debug.h"
82	#include "llvm/Support/InstructionCost.h"
83	#include "llvm/Support/raw_ostream.h"
84	#include "llvm/Transforms/Utils/BuildLibCalls.h"
85	#include "llvm/Transforms/Utils/Local.h"
86	#include "llvm/Transforms/Utils/LoopUtils.h"
87	#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
88	#include <algorithm>
89	#include <cassert>
90	#include <cstdint>
91	#include <utility>
92	#include <vector>
93
94	using namespace llvm;
95
96	#define DEBUG_TYPE "loop-idiom"
97
98	STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
99	STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
100	STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
101	STATISTIC(
102	NumShiftUntilBitTest,
103	"Number of uncountable loops recognized as 'shift until bitttest' idiom");
104	STATISTIC(NumShiftUntilZero,
105	"Number of uncountable loops recognized as 'shift until zero' idiom");
106
107	bool DisableLIRP::All;
108	static cl::opt<bool, true>
109	DisableLIRPAll("disable-" DEBUG_TYPE "-all",
110	cl::desc ("Options to disable Loop Idiom Recognize Pass."),
111	cl::location(L&: DisableLIRP::All), cl::init(Val: false),
112	cl::ReallyHidden);
113
114	bool DisableLIRP::Memset;
115	static cl::opt<bool, true>
116	DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
117	cl::desc ("Proceed with loop idiom recognize pass, but do "
118	"not convert loop(s) to memset."),
119	cl::location(L&: DisableLIRP::Memset), cl::init(Val: false),
120	cl::ReallyHidden);
121
122	bool DisableLIRP::Memcpy;
123	static cl::opt<bool, true>
124	DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
125	cl::desc ("Proceed with loop idiom recognize pass, but do "
126	"not convert loop(s) to memcpy."),
127	cl::location(L&: DisableLIRP::Memcpy), cl::init(Val: false),
128	cl::ReallyHidden);
129
130	static cl::opt<bool> UseLIRCodeSizeHeurs(
131	"use-lir-code-size-heurs",
132	cl::desc ("Use loop idiom recognition code size heuristics when compiling"
133	"with -Os/-Oz"),
134	cl::init(Val: true), cl::Hidden);
135
136	namespace {
137
138	class LoopIdiomRecognize {
139	Loop CurLoop = nullptr*;
140	AliasAnalysis *AA;
141	DominatorTree *DT;
142	LoopInfo *LI;
143	ScalarEvolution *SE;
144	TargetLibraryInfo *TLI;
145	const TargetTransformInfo *TTI;
146	const DataLayout *DL;
147	OptimizationRemarkEmitter &ORE;
148	bool ApplyCodeSizeHeuristics;
149	std::unique_ptr<MemorySSAUpdater> MSSAU;
150
151	public:
152	explicit LoopIdiomRecognize(AliasAnalysis AA, DominatorTree DT,
153	LoopInfo LI, ScalarEvolution SE,
154	TargetLibraryInfo *TLI,
155	const TargetTransformInfo TTI, MemorySSA MSSA,
156	const DataLayout *DL,
157	OptimizationRemarkEmitter &ORE)
158	: AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
159	if (MSSA)
160	MSSAU = std::make_unique<MemorySSAUpdater>(args&: MSSA);
161	}
162
163	bool runOnLoop(Loop *L);
164
165	private:
166	using StoreList = SmallVector<StoreInst *, `8`>;
167	using StoreListMap = MapVector<Value *, StoreList>;
168
169	StoreListMap StoreRefsForMemset;
170	StoreListMap StoreRefsForMemsetPattern;
171	StoreList StoreRefsForMemcpy;
172	bool HasMemset;
173	bool HasMemsetPattern;
174	bool HasMemcpy;
175
176	/// Return code for isLegalStore()
177	enum LegalStoreKind {
178	None = `0`,
179	Memset,
180	MemsetPattern,
181	Memcpy,
182	UnorderedAtomicMemcpy,
183	DontUse // Dummy retval never to be used. Allows catching errors in retval
184	// handling.
185	};
186
187	/// \name Countable Loop Idiom Handling
188	/// @{
189
190	bool runOnCountableLoop();
191	bool runOnLoopBlock(BasicBlock BB, const* SCEV *BECount,
192	SmallVectorImpl<BasicBlock *> &ExitBlocks);
193
194	void collectStores(BasicBlock *BB);
195	LegalStoreKind isLegalStore(StoreInst *SI);
196	enum class ForMemset { No, Yes };
197	bool processLoopStores(SmallVectorImpl<StoreInst > &SL, const* SCEV *BECount,
198	ForMemset For);
199
200	template <typename MemInst>
201	bool processLoopMemIntrinsic(
202	BasicBlock *BB,
203	bool (LoopIdiomRecognize::Processor)(MemInst , const SCEV *),
204	const SCEV *BECount);
205	bool processLoopMemCpy(MemCpyInst MCI, const* SCEV *BECount);
206	bool processLoopMemSet(MemSetInst MSI, const* SCEV *BECount);
207
208	bool processLoopStridedStore(Value DestPtr, const* SCEV *StoreSizeSCEV,
209	MaybeAlign StoreAlignment, Value *StoredVal,
210	Instruction *TheStore,
211	SmallPtrSetImpl<Instruction *> &Stores,
212	const SCEVAddRecExpr Ev, const* SCEV *BECount,
213	bool IsNegStride, bool IsLoopMemset = false);
214	bool processLoopStoreOfLoopLoad(StoreInst SI, const* SCEV *BECount);
215	bool processLoopStoreOfLoopLoad(Value DestPtr, Value SourcePtr,
216	const SCEV *StoreSize, MaybeAlign StoreAlign,
217	MaybeAlign LoadAlign, Instruction *TheStore,
218	Instruction *TheLoad,
219	const SCEVAddRecExpr *StoreEv,
220	const SCEVAddRecExpr *LoadEv,
221	const SCEV *BECount);
222	bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
223	bool IsLoopMemset = false);
224
225	/// @}
226	/// \name Noncountable Loop Idiom Handling
227	/// @{
228
229	bool runOnNoncountableLoop();
230
231	bool recognizePopcount();
232	void transformLoopToPopcount(BasicBlock PreCondBB, Instruction CntInst,
233	PHINode CntPhi, Value Var);
234	bool isProfitableToInsertFFS(Intrinsic::ID IntrinID, Value *InitX,
235	bool ZeroCheck, size_t CanonicalSize);
236	bool insertFFSIfProfitable(Intrinsic::ID IntrinID, Value *InitX,
237	Instruction DefX, PHINode CntPhi,
238	Instruction *CntInst);
239	bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
240	bool recognizeShiftUntilLessThan();
241	void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
242	Instruction CntInst, PHINode CntPhi,
243	Value Var, Instruction DefX,
244	const DebugLoc &DL, bool ZeroCheck,
245	bool IsCntPhiUsedOutsideLoop,
246	bool InsertSub = false);
247
248	bool recognizeShiftUntilBitTest();
249	bool recognizeShiftUntilZero();
250
251	/// @}
252	};
253	} // end anonymous namespace
254
255	PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
256	LoopStandardAnalysisResults &AR,
257	LPMUpdater &) {
258	if (DisableLIRP::All)
259	return PreservedAnalyses::all();
260
261	const auto *DL = &L.getHeader()->getDataLayout();
262
263	// For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
264	// pass. Function analyses need to be preserved across loop transformations
265	// but ORE cannot be preserved (see comment before the pass definition).
266	OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
267
268	LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
269	AR.MSSA, DL, ORE);
270	if (!LIR.runOnLoop(L: &L))
271	return PreservedAnalyses::all();
272
273	auto PA = getLoopPassPreservedAnalyses();
274	if (AR.MSSA)
275	PA.preserve<MemorySSAAnalysis>();
276	return PA;
277	}
278
279	static void deleteDeadInstruction(Instruction *I) {
280	I->replaceAllUsesWith(V: PoisonValue::get(T: I->getType()));
281	I->eraseFromParent();
282	}
283
284	//===----------------------------------------------------------------------===//
285	//
286	// Implementation of LoopIdiomRecognize
287	//
288	//===----------------------------------------------------------------------===//
289
290	bool LoopIdiomRecognize::runOnLoop(Loop *L) {
291	CurLoop = L;
292	// If the loop could not be converted to canonical form, it must have an
293	// indirectbr in it, just give up.
294	if (!L->getLoopPreheader())
295	return false;
296
297	// Disable loop idiom recognition if the function's name is a common idiom.
298	StringRef Name = L->getHeader()->getParent()->getName();
299	if (Name == "memset" \|\| Name == "memcpy")
300	return false;
301
302	// Determine if code size heuristics need to be applied.
303	ApplyCodeSizeHeuristics =
304	L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
305
306	HasMemset = TLI->has(F: LibFunc_memset);
307	HasMemsetPattern = TLI->has(F: LibFunc_memset_pattern16);
308	HasMemcpy = TLI->has(F: LibFunc_memcpy);
309
310	if (HasMemset \|\| HasMemsetPattern \|\| HasMemcpy)
311	if (SE->hasLoopInvariantBackedgeTakenCount(L))
312	return runOnCountableLoop();
313
314	return runOnNoncountableLoop();
315	}
316
317	bool LoopIdiomRecognize::runOnCountableLoop() {
318	const SCEV *BECount = SE->getBackedgeTakenCount(L: CurLoop);
319	assert(!isa<SCEVCouldNotCompute>(BECount) &&
320	"runOnCountableLoop() called on a loop without a predictable"
321	"backedge-taken count");
322
323	// If this loop executes exactly one time, then it should be peeled, not
324	// optimized by this pass.
325	if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(Val: BECount))
326	if (BECst->getAPInt() == `0`)
327	return false;
328
329	SmallVector<BasicBlock *, `8`> ExitBlocks;
330	CurLoop->getUniqueExitBlocks(ExitBlocks);
331
332	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
333	<< CurLoop->getHeader()->getParent()->getName()
334	<< "] Countable Loop %" << CurLoop->getHeader()->getName()
335	<< "\n");
336
337	// The following transforms hoist stores/memsets into the loop pre-header.
338	// Give up if the loop has instructions that may throw.
339	SimpleLoopSafetyInfo SafetyInfo;
340	SafetyInfo.computeLoopSafetyInfo(CurLoop);
341	if (SafetyInfo.anyBlockMayThrow())
342	return false;
343
344	bool MadeChange = false;
345
346	// Scan all the blocks in the loop that are not in subloops.
347	for (auto *BB : CurLoop->getBlocks()) {
348	// Ignore blocks in subloops.
349	if (LI->getLoopFor(BB) != CurLoop)
350	continue;
351
352	MadeChange \|= runOnLoopBlock(BB, BECount, ExitBlocks);
353	}
354	return MadeChange;
355	}
356
357	static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
358	const SCEVConstant *ConstStride = cast<SCEVConstant>(Val: StoreEv->getOperand(i: `1`));
359	return ConstStride->getAPInt();
360	}
361
362	/// getMemSetPatternValue - If a strided store of the specified value is safe to
363	/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
364	/// be passed in. Otherwise, return null.
365	///
366	/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
367	/// just replicate their input array and then pass on to memset_pattern16.
368	static Constant getMemSetPatternValue(Value V, const DataLayout *DL) {
369	// FIXME: This could check for UndefValue because it can be merged into any
370	// other valid pattern.
371
372	// If the value isn't a constant, we can't promote it to being in a constant
373	// array. We could theoretically do a store to an alloca or something, but
374	// that doesn't seem worthwhile.
375	Constant *C = dyn_cast<Constant>(Val: V);
376	if (!C \|\| isa<ConstantExpr>(Val: C))
377	return nullptr;
378
379	// Only handle simple values that are a power of two bytes in size.
380	uint64_t Size = DL->getTypeSizeInBits(Ty: V->getType());
381	if (Size == `0` \|\| (Size & `7`) \|\| (Size & (Size - `1`)))
382	return nullptr;
383
384	// Don't care enough about darwin/ppc to implement this.
385	if (DL->isBigEndian())
386	return nullptr;
387
388	// Convert to size in bytes.
389	Size /= `8`;
390
391	// TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
392	// if the top and bottom are the same (e.g. for vectors and large integers).
393	if (Size > `16`)
394	return nullptr;
395
396	// If the constant is exactly 16 bytes, just use it.
397	if (Size == `16`)
398	return C;
399
400	// Otherwise, we'll use an array of the constants.
401	unsigned ArraySize = `16` / Size;
402	ArrayType *AT = ArrayType::get(ElementType: V->getType(), NumElements: ArraySize);
403	return ConstantArray::get(T: AT, V: std::vector<Constant *>(ArraySize, C));
404	}
405
406	LoopIdiomRecognize::LegalStoreKind
407	LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
408	// Don't touch volatile stores.
409	if (SI->isVolatile())
410	return LegalStoreKind::None;
411	// We only want simple or unordered-atomic stores.
412	if (!SI->isUnordered())
413	return LegalStoreKind::None;
414
415	// Avoid merging nontemporal stores.
416	if (SI->getMetadata(KindID: LLVMContext::MD_nontemporal))
417	return LegalStoreKind::None;
418
419	Value *StoredVal = SI->getValueOperand();
420	Value *StorePtr = SI->getPointerOperand();
421
422	// Don't convert stores of non-integral pointer types to memsets (which stores
423	// integers).
424	if (DL->isNonIntegralPointerType(Ty: StoredVal->getType()->getScalarType()))
425	return LegalStoreKind::None;
426
427	// Reject stores that are so large that they overflow an unsigned.
428	// When storing out scalable vectors we bail out for now, since the code
429	// below currently only works for constant strides.
430	TypeSize SizeInBits = DL->getTypeSizeInBits(Ty: StoredVal->getType());
431	if (SizeInBits.isScalable() \|\| (SizeInBits.getFixedValue() & `7`) \|\|
432	(SizeInBits.getFixedValue() >> `32`) != `0`)
433	return LegalStoreKind::None;
434
435	// See if the pointer expression is an AddRec like {base,+,1} on the current
436	// loop, which indicates a strided store. If we have something else, it's a
437	// random store we can't handle.
438	const SCEVAddRecExpr *StoreEv =
439	dyn_cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: StorePtr));
440	if (!StoreEv \|\| StoreEv->getLoop() != CurLoop \|\| !StoreEv->isAffine())
441	return LegalStoreKind::None;
442
443	// Check to see if we have a constant stride.
444	if (!isa<SCEVConstant>(Val: StoreEv->getOperand(i: `1`)))
445	return LegalStoreKind::None;
446
447	// See if the store can be turned into a memset.
448
449	// If the stored value is a byte-wise value (like i32 -1), then it may be
450	// turned into a memset of i8 -1, assuming that all the consecutive bytes
451	// are stored. A store of i32 0x01020304 can never be turned into a memset,
452	// but it can be turned into memset_pattern if the target supports it.
453	Value SplatValue = isBytewiseValue(V: StoredVal, DL: DL);
454
455	// Note: memset and memset_pattern on unordered-atomic is yet not supported
456	bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
457
458	// If we're allowed to form a memset, and the stored value would be
459	// acceptable for memset, use it.
460	if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
461	// Verify that the stored value is loop invariant. If not, we can't
462	// promote the memset.
463	CurLoop->isLoopInvariant(V: SplatValue)) {
464	// It looks like we can use SplatValue.
465	return LegalStoreKind::Memset;
466	}
467	if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
468	// Don't create memset_pattern16s with address spaces.
469	StorePtr->getType()->getPointerAddressSpace() == `0` &&
470	getMemSetPatternValue(V: StoredVal, DL)) {
471	// It looks like we can use PatternValue!
472	return LegalStoreKind::MemsetPattern;
473	}
474
475	// Otherwise, see if the store can be turned into a memcpy.
476	if (HasMemcpy && !DisableLIRP::Memcpy) {
477	// Check to see if the stride matches the size of the store. If so, then we
478	// know that every byte is touched in the loop.
479	APInt Stride = getStoreStride(StoreEv);
480	unsigned StoreSize = DL->getTypeStoreSize(Ty: SI->getValueOperand()->getType());
481	if (StoreSize != Stride && StoreSize != -Stride)
482	return LegalStoreKind::None;
483
484	// The store must be feeding a non-volatile load.
485	LoadInst *LI = dyn_cast<LoadInst>(Val: SI->getValueOperand());
486
487	// Only allow non-volatile loads
488	if (!LI \|\| LI->isVolatile())
489	return LegalStoreKind::None;
490	// Only allow simple or unordered-atomic loads
491	if (!LI->isUnordered())
492	return LegalStoreKind::None;
493
494	// See if the pointer expression is an AddRec like {base,+,1} on the current
495	// loop, which indicates a strided load. If we have something else, it's a
496	// random load we can't handle.
497	const SCEVAddRecExpr *LoadEv =
498	dyn_cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: LI->getPointerOperand()));
499	if (!LoadEv \|\| LoadEv->getLoop() != CurLoop \|\| !LoadEv->isAffine())
500	return LegalStoreKind::None;
501
502	// The store and load must share the same stride.
503	if (StoreEv->getOperand(i: `1`) != LoadEv->getOperand(i: `1`))
504	return LegalStoreKind::None;
505
506	// Success. This store can be converted into a memcpy.
507	UnorderedAtomic = UnorderedAtomic \|\| LI->isAtomic();
508	return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
509	: LegalStoreKind::Memcpy;
510	}
511	// This store can't be transformed into a memset/memcpy.
512	return LegalStoreKind::None;
513	}
514
515	void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
516	StoreRefsForMemset.clear();
517	StoreRefsForMemsetPattern.clear();
518	StoreRefsForMemcpy.clear();
519	for (Instruction &I : *BB) {
520	StoreInst *SI = dyn_cast<StoreInst>(Val: &I);
521	if (!SI)
522	continue;
523
524	// Make sure this is a strided store with a constant stride.
525	switch (isLegalStore(SI)) {
526	case LegalStoreKind::None:
527	// Nothing to do
528	break;
529	case LegalStoreKind::Memset: {
530	// Find the base pointer.
531	Value *Ptr = getUnderlyingObject(V: SI->getPointerOperand());
532	StoreRefsForMemset [Ptr].push_back(Elt: SI);
533	} break;
534	case LegalStoreKind::MemsetPattern: {
535	// Find the base pointer.
536	Value *Ptr = getUnderlyingObject(V: SI->getPointerOperand());
537	StoreRefsForMemsetPattern [Ptr].push_back(Elt: SI);
538	} break;
539	case LegalStoreKind::Memcpy:
540	case LegalStoreKind::UnorderedAtomicMemcpy:
541	StoreRefsForMemcpy.push_back(Elt: SI);
542	break;
543	default:
544	assert(false && "unhandled return value");
545	break;
546	}
547	}
548	}
549
550	/// runOnLoopBlock - Process the specified block, which lives in a counted loop
551	/// with the specified backedge count. This block is known to be in the current
552	/// loop and not in any subloops.
553	bool LoopIdiomRecognize::runOnLoopBlock(
554	BasicBlock BB, const* SCEV *BECount,
555	SmallVectorImpl<BasicBlock *> &ExitBlocks) {
556	// We can only promote stores in this block if they are unconditionally
557	// executed in the loop. For a block to be unconditionally executed, it has
558	// to dominate all the exit blocks of the loop. Verify this now.
559	for (BasicBlock *ExitBlock : ExitBlocks)
560	if (!DT->dominates(A: BB, B: ExitBlock))
561	return false;
562
563	bool MadeChange = false;
564	// Look for store instructions, which may be optimized to memset/memcpy.
565	collectStores(BB);
566
567	// Look for a single store or sets of stores with a common base, which can be
568	// optimized into a memset (memset_pattern). The latter most commonly happens
569	// with structs and handunrolled loops.
570	for (auto &SL : StoreRefsForMemset)
571	MadeChange \|= processLoopStores(SL&: SL.second, BECount, For: ForMemset::Yes);
572
573	for (auto &SL : StoreRefsForMemsetPattern)
574	MadeChange \|= processLoopStores(SL&: SL.second, BECount, For: ForMemset::No);
575
576	// Optimize the store into a memcpy, if it feeds an similarly strided load.
577	for (auto &SI : StoreRefsForMemcpy)
578	MadeChange \|= processLoopStoreOfLoopLoad(SI, BECount);
579
580	MadeChange \|= processLoopMemIntrinsic<MemCpyInst>(
581	BB, Processor: &LoopIdiomRecognize::processLoopMemCpy, BECount);
582	MadeChange \|= processLoopMemIntrinsic<MemSetInst>(
583	BB, Processor: &LoopIdiomRecognize::processLoopMemSet, BECount);
584
585	return MadeChange;
586	}
587
588	/// See if this store(s) can be promoted to a memset.
589	bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
590	const SCEV *BECount, ForMemset For) {
591	// Try to find consecutive stores that can be transformed into memsets.
592	SetVector<StoreInst *> Heads, Tails;
593	SmallDenseMap<StoreInst , StoreInst > ConsecutiveChain;
594
595	// Do a quadratic search on all of the given stores and find
596	// all of the pairs of stores that follow each other.
597	SmallVector<unsigned, `16`> IndexQueue;
598	for (unsigned i = `0`, e = SL.size(); i < e; ++i) {
599	assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
600
601	Value *FirstStoredVal = SL [i]->getValueOperand();
602	Value *FirstStorePtr = SL [i]->getPointerOperand();
603	const SCEVAddRecExpr *FirstStoreEv =
604	cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: FirstStorePtr));
605	APInt FirstStride = getStoreStride(StoreEv: FirstStoreEv);
606	unsigned FirstStoreSize = DL->getTypeStoreSize(Ty: SL [i]->getValueOperand()->getType());
607
608	// See if we can optimize just this store in isolation.
609	if (FirstStride == FirstStoreSize \|\| -FirstStride == FirstStoreSize) {
610	Heads.insert(X: SL [i]);
611	continue;
612	}
613
614	Value FirstSplatValue = nullptr*;
615	Constant FirstPatternValue = nullptr*;
616
617	if (For == ForMemset::Yes)
618	FirstSplatValue = isBytewiseValue(V: FirstStoredVal, DL: *DL);
619	else
620	FirstPatternValue = getMemSetPatternValue(V: FirstStoredVal, DL);
621
622	assert((FirstSplatValue \|\| FirstPatternValue) &&
623	"Expected either splat value or pattern value.");
624
625	IndexQueue.clear();
626	// If a store has multiple consecutive store candidates, search Stores
627	// array according to the sequence: from i+1 to e, then from i-1 to 0.
628	// This is because usually pairing with immediate succeeding or preceding
629	// candidate create the best chance to find memset opportunity.
630	unsigned j = `0`;
631	for (j = i + `1`; j < e; ++j)
632	IndexQueue.push_back(Elt: j);
633	for (j = i; j > `0`; --j)
634	IndexQueue.push_back(Elt: j - `1`);
635
636	for (auto &k : IndexQueue) {
637	assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
638	Value *SecondStorePtr = SL [k]->getPointerOperand();
639	const SCEVAddRecExpr *SecondStoreEv =
640	cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: SecondStorePtr));
641	APInt SecondStride = getStoreStride(StoreEv: SecondStoreEv);
642
643	if (FirstStride != SecondStride)
644	continue;
645
646	Value *SecondStoredVal = SL [k]->getValueOperand();
647	Value SecondSplatValue = nullptr*;
648	Constant SecondPatternValue = nullptr*;
649
650	if (For == ForMemset::Yes)
651	SecondSplatValue = isBytewiseValue(V: SecondStoredVal, DL: *DL);
652	else
653	SecondPatternValue = getMemSetPatternValue(V: SecondStoredVal, DL);
654
655	assert((SecondSplatValue \|\| SecondPatternValue) &&
656	"Expected either splat value or pattern value.");
657
658	if (isConsecutiveAccess(A: SL [i], B: SL [k], DL: DL, SE&: SE, CheckType: false)) {
659	if (For == ForMemset::Yes) {
660	if (isa<UndefValue>(Val: FirstSplatValue))
661	FirstSplatValue = SecondSplatValue;
662	if (FirstSplatValue != SecondSplatValue)
663	continue;
664	} else {
665	if (isa<UndefValue>(Val: FirstPatternValue))
666	FirstPatternValue = SecondPatternValue;
667	if (FirstPatternValue != SecondPatternValue)
668	continue;
669	}
670	Tails.insert(X: SL [k]);
671	Heads.insert(X: SL [i]);
672	ConsecutiveChain [SL [i]] = SL [k];
673	break;
674	}
675	}
676	}
677
678	// We may run into multiple chains that merge into a single chain. We mark the
679	// stores that we transformed so that we don't visit the same store twice.
680	SmallPtrSet<Value *, `16`> TransformedStores;
681	bool Changed = false;
682
683	// For stores that start but don't end a link in the chain:
684	for (StoreInst *I : Heads) {
685	if (Tails.count(key: I))
686	continue;
687
688	// We found a store instr that starts a chain. Now follow the chain and try
689	// to transform it.
690	SmallPtrSet<Instruction *, `8`> AdjacentStores;
691	StoreInst *HeadStore = I;
692	unsigned StoreSize = `0`;
693
694	// Collect the chain into a list.
695	while (Tails.count(key: I) \|\| Heads.count(key: I)) {
696	if (TransformedStores.count(Ptr: I))
697	break;
698	AdjacentStores.insert(Ptr: I);
699
700	StoreSize += DL->getTypeStoreSize(Ty: I->getValueOperand()->getType());
701	// Move to the next value in the chain.
702	I = ConsecutiveChain [I];
703	}
704
705	Value *StoredVal = HeadStore->getValueOperand();
706	Value *StorePtr = HeadStore->getPointerOperand();
707	const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: StorePtr));
708	APInt Stride = getStoreStride(StoreEv);
709
710	// Check to see if the stride matches the size of the stores. If so, then
711	// we know that every byte is touched in the loop.
712	if (StoreSize != Stride && StoreSize != -Stride)
713	continue;
714
715	bool IsNegStride = StoreSize == -Stride;
716
717	Type *IntIdxTy = DL->getIndexType(PtrTy: StorePtr->getType());
718	const SCEV *StoreSizeSCEV = SE->getConstant(Ty: IntIdxTy, V: StoreSize);
719	if (processLoopStridedStore(DestPtr: StorePtr, StoreSizeSCEV,
720	StoreAlignment: MaybeAlign (HeadStore->getAlign()), StoredVal,
721	TheStore: HeadStore, Stores&: AdjacentStores, Ev: StoreEv, BECount,
722	IsNegStride)) {
723	TransformedStores.insert(I: AdjacentStores.begin(), E: AdjacentStores.end());
724	Changed = true;
725	}
726	}
727
728	return Changed;
729	}
730
731	/// processLoopMemIntrinsic - Template function for calling different processor
732	/// functions based on mem intrinsic type.
733	template <typename MemInst>
734	bool LoopIdiomRecognize::processLoopMemIntrinsic(
735	BasicBlock *BB,
736	bool (LoopIdiomRecognize::Processor)(MemInst , const SCEV *),
737	const SCEV *BECount) {
738	bool MadeChange = false;
739	for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
740	Instruction Inst = &I ++;
741	// Look for memory instructions, which may be optimized to a larger one.
742	if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
743	WeakTrackingVH InstPtr(&*I);
744	if (!(this->*Processor)(MI, BECount))
745	continue;
746	MadeChange = true;
747
748	// If processing the instruction invalidated our iterator, start over from
749	// the top of the block.
750	if (!InstPtr)
751	I = BB->begin();
752	}
753	}
754	return MadeChange;
755	}
756
757	/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy
758	bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
759	const SCEV *BECount) {
760	// We can only handle non-volatile memcpys with a constant size.
761	if (MCI->isVolatile() \|\| !isa<ConstantInt>(Val: MCI->getLength()))
762	return false;
763
764	// If we're not allowed to hack on memcpy, we fail.
765	if ((!HasMemcpy && !isa<MemCpyInlineInst>(Val: MCI)) \|\| DisableLIRP::Memcpy)
766	return false;
767
768	Value *Dest = MCI->getDest();
769	Value *Source = MCI->getSource();
770	if (!Dest \|\| !Source)
771	return false;
772
773	// See if the load and store pointer expressions are AddRec like {base,+,1} on
774	// the current loop, which indicates a strided load and store. If we have
775	// something else, it's a random load or store we can't handle.
776	const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: Dest));
777	if (!StoreEv \|\| StoreEv->getLoop() != CurLoop \|\| !StoreEv->isAffine())
778	return false;
779	const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: Source));
780	if (!LoadEv \|\| LoadEv->getLoop() != CurLoop \|\| !LoadEv->isAffine())
781	return false;
782
783	// Reject memcpys that are so large that they overflow an unsigned.
784	uint64_t SizeInBytes = cast<ConstantInt>(Val: MCI->getLength())->getZExtValue();
785	if ((SizeInBytes >> `32`) != `0`)
786	return false;
787
788	// Check if the stride matches the size of the memcpy. If so, then we know
789	// that every byte is touched in the loop.
790	const SCEVConstant *ConstStoreStride =
791	dyn_cast<SCEVConstant>(Val: StoreEv->getOperand(i: `1`));
792	const SCEVConstant *ConstLoadStride =
793	dyn_cast<SCEVConstant>(Val: LoadEv->getOperand(i: `1`));
794	if (!ConstStoreStride \|\| !ConstLoadStride)
795	return false;
796
797	APInt StoreStrideValue = ConstStoreStride->getAPInt();
798	APInt LoadStrideValue = ConstLoadStride->getAPInt();
799	// Huge stride value - give up
800	if (StoreStrideValue.getBitWidth() > `64` \|\| LoadStrideValue.getBitWidth() > `64`)
801	return false;
802
803	if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
804	ORE.emit(RemarkBuilder: [&]() {
805	return OptimizationRemarkMissed (DEBUG_TYPE, "SizeStrideUnequal", MCI)
806	<< ore::NV ("Inst", "memcpy") << " in "
807	<< ore::NV ("Function", MCI->getFunction())
808	<< " function will not be hoisted: "
809	<< ore::NV ("Reason", "memcpy size is not equal to stride");
810	});
811	return false;
812	}
813
814	int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
815	int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
816	// Check if the load stride matches the store stride.
817	if (StoreStrideInt != LoadStrideInt)
818	return false;
819
820	return processLoopStoreOfLoopLoad(
821	DestPtr: Dest, SourcePtr: Source, StoreSize: SE->getConstant(Ty: Dest->getType(), V: SizeInBytes),
822	StoreAlign: MCI->getDestAlign(), LoadAlign: MCI->getSourceAlign(), TheStore: MCI, TheLoad: MCI, StoreEv, LoadEv,
823	BECount);
824	}
825
826	/// processLoopMemSet - See if this memset can be promoted to a large memset.
827	bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
828	const SCEV *BECount) {
829	// We can only handle non-volatile memsets.
830	if (MSI->isVolatile())
831	return false;
832
833	// If we're not allowed to hack on memset, we fail.
834	if (!HasMemset \|\| DisableLIRP::Memset)
835	return false;
836
837	Value *Pointer = MSI->getDest();
838
839	// See if the pointer expression is an AddRec like {base,+,1} on the current
840	// loop, which indicates a strided store. If we have something else, it's a
841	// random store we can't handle.
842	const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: Pointer));
843	if (!Ev \|\| Ev->getLoop() != CurLoop)
844	return false;
845	if (!Ev->isAffine()) {
846	LLVM_DEBUG(dbgs() << " Pointer is not affine, abort\n");
847	return false;
848	}
849
850	const SCEV *PointerStrideSCEV = Ev->getOperand(i: `1`);
851	const SCEV *MemsetSizeSCEV = SE->getSCEV(V: MSI->getLength());
852	if (!PointerStrideSCEV \|\| !MemsetSizeSCEV)
853	return false;
854
855	bool IsNegStride = false;
856	const bool IsConstantSize = isa<ConstantInt>(Val: MSI->getLength());
857
858	if (IsConstantSize) {
859	// Memset size is constant.
860	// Check if the pointer stride matches the memset size. If so, then
861	// we know that every byte is touched in the loop.
862	LLVM_DEBUG(dbgs() << " memset size is constant\n");
863	uint64_t SizeInBytes = cast<ConstantInt>(Val: MSI->getLength())->getZExtValue();
864	const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Val: Ev->getOperand(i: `1`));
865	if (!ConstStride)
866	return false;
867
868	APInt Stride = ConstStride->getAPInt();
869	if (SizeInBytes != Stride && SizeInBytes != -Stride)
870	return false;
871
872	IsNegStride = SizeInBytes == -Stride;
873	} else {
874	// Memset size is non-constant.
875	// Check if the pointer stride matches the memset size.
876	// To be conservative, the pass would not promote pointers that aren't in
877	// address space zero. Also, the pass only handles memset length and stride
878	// that are invariant for the top level loop.
879	LLVM_DEBUG(dbgs() << " memset size is non-constant\n");
880	if (Pointer->getType()->getPointerAddressSpace() != `0`) {
881	LLVM_DEBUG(dbgs() << " pointer is not in address space zero, "
882	<< "abort\n");
883	return false;
884	}
885	if (!SE->isLoopInvariant(S: MemsetSizeSCEV, L: CurLoop)) {
886	LLVM_DEBUG(dbgs() << " memset size is not a loop-invariant, "
887	<< "abort\n");
888	return false;
889	}
890
891	// Compare positive direction PointerStrideSCEV with MemsetSizeSCEV
892	IsNegStride = PointerStrideSCEV->isNonConstantNegative();
893	const SCEV *PositiveStrideSCEV =
894	IsNegStride ? SE->getNegativeSCEV(V: PointerStrideSCEV)
895	: PointerStrideSCEV;
896	LLVM_DEBUG(dbgs() << " MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
897	<< " PositiveStrideSCEV: " << *PositiveStrideSCEV
898	<< "\n");
899
900	if (PositiveStrideSCEV != MemsetSizeSCEV) {
901	// If an expression is covered by the loop guard, compare again and
902	// proceed with optimization if equal.
903	const SCEV *FoldedPositiveStride =
904	SE->applyLoopGuards(Expr: PositiveStrideSCEV, L: CurLoop);
905	const SCEV *FoldedMemsetSize =
906	SE->applyLoopGuards(Expr: MemsetSizeSCEV, L: CurLoop);
907
908	LLVM_DEBUG(dbgs() << " Try to fold SCEV based on loop guard\n"
909	<< " FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
910	<< " FoldedPositiveStride: " << *FoldedPositiveStride
911	<< "\n");
912
913	if (FoldedPositiveStride != FoldedMemsetSize) {
914	LLVM_DEBUG(dbgs() << " SCEV don't match, abort\n");
915	return false;
916	}
917	}
918	}
919
920	// Verify that the memset value is loop invariant. If not, we can't promote
921	// the memset.
922	Value *SplatValue = MSI->getValue();
923	if (!SplatValue \|\| !CurLoop->isLoopInvariant(V: SplatValue))
924	return false;
925
926	SmallPtrSet<Instruction *, `1`> MSIs;
927	MSIs.insert(Ptr: MSI);
928	return processLoopStridedStore(DestPtr: Pointer, StoreSizeSCEV: SE->getSCEV(V: MSI->getLength()),
929	StoreAlignment: MSI->getDestAlign(), StoredVal: SplatValue, TheStore: MSI, Stores&: MSIs, Ev,
930	BECount, IsNegStride, /IsLoopMemset=/true);
931	}
932
933	/// mayLoopAccessLocation - Return true if the specified loop might access the
934	/// specified pointer location, which is a loop-strided access. The 'Access'
935	/// argument specifies what the verboten forms of access are (read or write).
936	static bool
937	mayLoopAccessLocation(Value Ptr, ModRefInfo Access, Loop L,
938	const SCEV BECount, const* SCEV *StoreSizeSCEV,
939	AliasAnalysis &AA,
940	SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
941	// Get the location that may be stored across the loop. Since the access is
942	// strided positively through memory, we say that the modified location starts
943	// at the pointer and has infinite size.
944	LocationSize AccessSize = LocationSize::afterPointer();
945
946	// If the loop iterates a fixed number of times, we can refine the access size
947	// to be exactly the size of the memset, which is (BECount+1)StoreSize*
948	const SCEVConstant *BECst = dyn_cast<SCEVConstant>(Val: BECount);
949	const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(Val: StoreSizeSCEV);
950	if (BECst && ConstSize) {
951	std::optional<uint64_t> BEInt = BECst->getAPInt().tryZExtValue();
952	std::optional<uint64_t> SizeInt = ConstSize->getAPInt().tryZExtValue();
953	// FIXME: Should this check for overflow?
954	if (BEInt && SizeInt)
955	AccessSize = LocationSize::precise(Value: (BEInt + `1`) *SizeInt);
956	}
957
958	// TODO: For this to be really effective, we have to dive into the pointer
959	// operand in the store. Store to &A[i] of 100 will always return may alias
960	// with store of &A[100], we need to StoreLoc to be "A" with size of 100,
961	// which will then no-alias a store to &A[100].
962	MemoryLocation StoreLoc(Ptr, AccessSize);
963
964	for (BasicBlock *B : L->blocks())
965	for (Instruction &I : *B)
966	if (!IgnoredInsts.contains(Ptr: &I) &&
967	isModOrRefSet(MRI: AA.getModRefInfo(I: &I, OptLoc: StoreLoc) & Access))
968	return true;
969	return false;
970	}
971
972	// If we have a negative stride, Start refers to the end of the memory location
973	// we're trying to memset. Therefore, we need to recompute the base pointer,
974	// which is just Start - BECountSize.*
975	static const SCEV getStartForNegStride(const* SCEV Start, const* SCEV *BECount,
976	Type IntPtr, const* SCEV *StoreSizeSCEV,
977	ScalarEvolution *SE) {
978	const SCEV *Index = SE->getTruncateOrZeroExtend(V: BECount, Ty: IntPtr);
979	if (!StoreSizeSCEV->isOne()) {
980	// index = back edge count store size*
981	Index = SE->getMulExpr(LHS: Index,
982	RHS: SE->getTruncateOrZeroExtend(V: StoreSizeSCEV, Ty: IntPtr),
983	Flags: SCEV::FlagNUW);
984	}
985	// base pointer = start - index store size*
986	return SE->getMinusSCEV(LHS: Start, RHS: Index);
987	}
988
989	/// Compute the number of bytes as a SCEV from the backedge taken count.
990	///
991	/// This also maps the SCEV into the provided type and tries to handle the
992	/// computation in a way that will fold cleanly.
993	static const SCEV getNumBytes(const* SCEV BECount, Type IntPtr,
994	const SCEV StoreSizeSCEV, Loop CurLoop,
995	const DataLayout DL, ScalarEvolution SE) {
996	const SCEV *TripCountSCEV =
997	SE->getTripCountFromExitCount(ExitCount: BECount, EvalTy: IntPtr, L: CurLoop);
998	return SE->getMulExpr(LHS: TripCountSCEV,
999	RHS: SE->getTruncateOrZeroExtend(V: StoreSizeSCEV, Ty: IntPtr),
1000	Flags: SCEV::FlagNUW);
1001	}
1002
1003	/// processLoopStridedStore - We see a strided store of some value. If we can
1004	/// transform this into a memset or memset_pattern in the loop preheader, do so.
1005	bool LoopIdiomRecognize::processLoopStridedStore(
1006	Value DestPtr, const* SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
1007	Value StoredVal, Instruction TheStore,
1008	SmallPtrSetImpl<Instruction > &Stores, const* SCEVAddRecExpr *Ev,
1009	const SCEV BECount, bool* IsNegStride, bool IsLoopMemset) {
1010	Module *M = TheStore->getModule();
1011	Value SplatValue = isBytewiseValue(V: StoredVal, DL: DL);
1012	Constant PatternValue = nullptr*;
1013
1014	if (!SplatValue)
1015	PatternValue = getMemSetPatternValue(V: StoredVal, DL);
1016
1017	assert((SplatValue \|\| PatternValue) &&
1018	"Expected either splat value or pattern value.");
1019
1020	// The trip count of the loop and the base pointer of the addrec SCEV is
1021	// guaranteed to be loop invariant, which means that it should dominate the
1022	// header. This allows us to insert code for it in the preheader.
1023	unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
1024	BasicBlock *Preheader = CurLoop->getLoopPreheader();
1025	IRBuilder<> Builder(Preheader->getTerminator());
1026	SCEVExpander Expander(SE, DL, "loop-idiom");
1027	SCEVExpanderCleaner ExpCleaner(Expander);
1028
1029	Type *DestInt8PtrTy = Builder.getPtrTy(AddrSpace: DestAS);
1030	Type *IntIdxTy = DL->getIndexType(PtrTy: DestPtr->getType());
1031
1032	bool Changed = false;
1033	const SCEV *Start = Ev->getStart();
1034	// Handle negative strided loops.
1035	if (IsNegStride)
1036	Start = getStartForNegStride(Start, BECount, IntPtr: IntIdxTy, StoreSizeSCEV, SE);
1037
1038	// TODO: ideally we should still be able to generate memset if SCEV expander
1039	// is taught to generate the dependencies at the latest point.
1040	if (!Expander.isSafeToExpand(S: Start))
1041	return Changed;
1042
1043	// Okay, we have a strided store "p[i]" of a splattable value. We can turn
1044	// this into a memset in the loop preheader now if we want. However, this
1045	// would be unsafe to do if there is anything else in the loop that may read
1046	// or write to the aliased location. Check for any overlap by generating the
1047	// base pointer and checking the region.
1048	Value *BasePtr =
1049	Expander.expandCodeFor(SH: Start, Ty: DestInt8PtrTy, I: Preheader->getTerminator());
1050
1051	// From here on out, conservatively report to the pass manager that we've
1052	// changed the IR, even if we later clean up these added instructions. There
1053	// may be structural differences e.g. in the order of use lists not accounted
1054	// for in just a textual dump of the IR. This is written as a variable, even
1055	// though statically all the places this dominates could be replaced with
1056	// 'true', with the hope that anyone trying to be clever / "more precise" with
1057	// the return value will read this comment, and leave them alone.
1058	Changed = true;
1059
1060	if (mayLoopAccessLocation(Ptr: BasePtr, Access: ModRefInfo::ModRef, L: CurLoop, BECount,
1061	StoreSizeSCEV, AA&: *AA, IgnoredInsts&: Stores))
1062	return Changed;
1063
1064	if (avoidLIRForMultiBlockLoop(/IsMemset=/true, IsLoopMemset))
1065	return Changed;
1066
1067	// Okay, everything looks good, insert the memset.
1068
1069	const SCEV *NumBytesS =
1070	getNumBytes(BECount, IntPtr: IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
1071
1072	// TODO: ideally we should still be able to generate memset if SCEV expander
1073	// is taught to generate the dependencies at the latest point.
1074	if (!Expander.isSafeToExpand(S: NumBytesS))
1075	return Changed;
1076
1077	Value *NumBytes =
1078	Expander.expandCodeFor(SH: NumBytesS, Ty: IntIdxTy, I: Preheader->getTerminator());
1079
1080	if (!SplatValue && !isLibFuncEmittable(M, TLI, TheLibFunc: LibFunc_memset_pattern16))
1081	return Changed;
1082
1083	AAMDNodes AATags = TheStore->getAAMetadata();
1084	for (Instruction *Store : Stores)
1085	AATags = AATags.merge(Other: Store->getAAMetadata());
1086	if (auto CI = dyn_cast<ConstantInt>(Val: NumBytes))
1087	AATags = AATags.extendTo(Len: CI->getZExtValue());
1088	else
1089	AATags = AATags.extendTo(Len: -`1`);
1090
1091	CallInst *NewCall;
1092	if (SplatValue) {
1093	NewCall = Builder.CreateMemSet(
1094	Ptr: BasePtr, Val: SplatValue, Size: NumBytes, Align: MaybeAlign (StoreAlignment),
1095	/isVolatile=/false, TBAATag: AATags.TBAA, ScopeTag: AATags.Scope, NoAliasTag: AATags.NoAlias);
1096	} else {
1097	assert (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16));
1098	// Everything is emitted in default address space
1099	Type *Int8PtrTy = DestInt8PtrTy;
1100
1101	StringRef FuncName = "memset_pattern16";
1102	FunctionCallee MSP = getOrInsertLibFunc(M, TLI: *TLI, TheLibFunc: LibFunc_memset_pattern16,
1103	RetTy: Builder.getVoidTy(), Args: Int8PtrTy, Args: Int8PtrTy, Args: IntIdxTy);
1104	inferNonMandatoryLibFuncAttrs(M, Name: FuncName, TLI: *TLI);
1105
1106	// Otherwise we should form a memset_pattern16. PatternValue is known to be
1107	// an constant array of 16-bytes. Plop the value into a mergable global.
1108	GlobalVariable GV = new* GlobalVariable (M, PatternValue->getType(), true*,
1109	GlobalValue::PrivateLinkage,
1110	PatternValue, ".memset_pattern");
1111	GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
1112	GV->setAlignment(Align (`16`));
1113	Value *PatternPtr = GV;
1114	NewCall = Builder.CreateCall(Callee: MSP, Args: {BasePtr, PatternPtr, NumBytes});
1115
1116	// Set the TBAA info if present.
1117	if (AATags.TBAA)
1118	NewCall->setMetadata(KindID: LLVMContext::MD_tbaa, Node: AATags.TBAA);
1119
1120	if (AATags.Scope)
1121	NewCall->setMetadata(KindID: LLVMContext::MD_alias_scope, Node: AATags.Scope);
1122
1123	if (AATags.NoAlias)
1124	NewCall->setMetadata(KindID: LLVMContext::MD_noalias, Node: AATags.NoAlias);
1125	}
1126
1127	NewCall->setDebugLoc(TheStore->getDebugLoc());
1128
1129	if (MSSAU) {
1130	MemoryAccess *NewMemAcc = MSSAU ->createMemoryAccessInBB(
1131	I: NewCall, Definition: nullptr, BB: NewCall->getParent(), Point: MemorySSA::BeforeTerminator);
1132	MSSAU ->insertDef(Def: cast<MemoryDef>(Val: NewMemAcc), RenameUses: true);
1133	}
1134
1135	LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
1136	<< " from store to: " << Ev << " at: " << TheStore
1137	<< "\n");
1138
1139	ORE.emit(RemarkBuilder: [&]() {
1140	OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
1141	NewCall->getDebugLoc(), Preheader);
1142	R << "Transformed loop-strided store in "
1143	<< ore::NV ("Function", TheStore->getFunction())
1144	<< " function into a call to "
1145	<< ore::NV ("NewFunction", NewCall->getCalledFunction())
1146	<< "() intrinsic";
1147	if (!Stores.empty())
1148	R << ore::setExtraArgs ();
1149	for (auto *I : Stores) {
1150	R << ore::NV ("FromBlock", I->getParent()->getName())
1151	<< ore::NV ("ToBlock", Preheader->getName());
1152	}
1153	return R;
1154	});
1155
1156	// Okay, the memset has been formed. Zap the original store and anything that
1157	// feeds into it.
1158	for (auto *I : Stores) {
1159	if (MSSAU)
1160	MSSAU ->removeMemoryAccess(I, OptimizePhis: true);
1161	deleteDeadInstruction(I);
1162	}
1163	if (MSSAU && VerifyMemorySSA)
1164	MSSAU ->getMemorySSA()->verifyMemorySSA();
1165	++NumMemSet;
1166	ExpCleaner.markResultUsed();
1167	return true;
1168	}
1169
1170	/// If the stored value is a strided load in the same loop with the same stride
1171	/// this may be transformable into a memcpy. This kicks in for stuff like
1172	/// for (i) A[i] = B[i];
1173	bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
1174	const SCEV *BECount) {
1175	assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
1176
1177	Value *StorePtr = SI->getPointerOperand();
1178	const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: StorePtr));
1179	unsigned StoreSize = DL->getTypeStoreSize(Ty: SI->getValueOperand()->getType());
1180
1181	// The store must be feeding a non-volatile load.
1182	LoadInst *LI = cast<LoadInst>(Val: SI->getValueOperand());
1183	assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
1184
1185	// See if the pointer expression is an AddRec like {base,+,1} on the current
1186	// loop, which indicates a strided load. If we have something else, it's a
1187	// random load we can't handle.
1188	Value *LoadPtr = LI->getPointerOperand();
1189	const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: LoadPtr));
1190
1191	const SCEV *StoreSizeSCEV = SE->getConstant(Ty: StorePtr->getType(), V: StoreSize);
1192	return processLoopStoreOfLoopLoad(DestPtr: StorePtr, SourcePtr: LoadPtr, StoreSize: StoreSizeSCEV,
1193	StoreAlign: SI->getAlign(), LoadAlign: LI->getAlign(), TheStore: SI, TheLoad: LI,
1194	StoreEv, LoadEv, BECount);
1195	}
1196
1197	namespace {
1198	class MemmoveVerifier {
1199	public:
1200	explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
1201	const DataLayout &DL)
1202	: DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset(
1203	Ptr: LoadBasePtr.stripPointerCasts(), Offset&: LoadOff, DL)),
1204	BP2(llvm::GetPointerBaseWithConstantOffset(
1205	Ptr: StoreBasePtr.stripPointerCasts(), Offset&: StoreOff, DL)),
1206	IsSameObject(BP1 == BP2) {}
1207
1208	bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
1209	const Instruction &TheLoad,
1210	bool IsMemCpy) const {
1211	if (IsMemCpy) {
1212	// Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
1213	// for negative stride.
1214	if ((!IsNegStride && LoadOff <= StoreOff) \|\|
1215	(IsNegStride && LoadOff >= StoreOff))
1216	return false;
1217	} else {
1218	// Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
1219	// for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
1220	int64_t LoadSize =
1221	DL.getTypeSizeInBits(Ty: TheLoad.getType()).getFixedValue() / `8`;
1222	if (BP1 != BP2 \|\| LoadSize != int64_t(StoreSize))
1223	return false;
1224	if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) \|\|
1225	(IsNegStride && LoadOff + LoadSize > StoreOff))
1226	return false;
1227	}
1228	return true;
1229	}
1230
1231	private:
1232	const DataLayout &DL;
1233	int64_t LoadOff = `0`;
1234	int64_t StoreOff = `0`;
1235	const Value *BP1;
1236	const Value *BP2;
1237
1238	public:
1239	const bool IsSameObject;
1240	};
1241	} // namespace
1242
1243	bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
1244	Value DestPtr, Value SourcePtr, const SCEV *StoreSizeSCEV,
1245	MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
1246	Instruction TheLoad, const* SCEVAddRecExpr *StoreEv,
1247	const SCEVAddRecExpr LoadEv, const* SCEV *BECount) {
1248
1249	// FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
1250	// conservatively bail here, since otherwise we may have to transform
1251	// llvm.memcpy.inline into llvm.memcpy which is illegal.
1252	if (isa<MemCpyInlineInst>(Val: TheStore))
1253	return false;
1254
1255	// The trip count of the loop and the base pointer of the addrec SCEV is
1256	// guaranteed to be loop invariant, which means that it should dominate the
1257	// header. This allows us to insert code for it in the preheader.
1258	BasicBlock *Preheader = CurLoop->getLoopPreheader();
1259	IRBuilder<> Builder(Preheader->getTerminator());
1260	SCEVExpander Expander(SE, DL, "loop-idiom");
1261
1262	SCEVExpanderCleaner ExpCleaner(Expander);
1263
1264	bool Changed = false;
1265	const SCEV *StrStart = StoreEv->getStart();
1266	unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
1267	Type *IntIdxTy = Builder.getIntNTy(N: DL->getIndexSizeInBits(AS: StrAS));
1268
1269	APInt Stride = getStoreStride(StoreEv);
1270	const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(Val: StoreSizeSCEV);
1271
1272	// TODO: Deal with non-constant size; Currently expect constant store size
1273	assert(ConstStoreSize && "store size is expected to be a constant");
1274
1275	int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
1276	bool IsNegStride = StoreSize == -Stride;
1277
1278	// Handle negative strided loops.
1279	if (IsNegStride)
1280	StrStart =
1281	getStartForNegStride(Start: StrStart, BECount, IntPtr: IntIdxTy, StoreSizeSCEV, SE);
1282
1283	// Okay, we have a strided store "p[i]" of a loaded value. We can turn
1284	// this into a memcpy in the loop preheader now if we want. However, this
1285	// would be unsafe to do if there is anything else in the loop that may read
1286	// or write the memory region we're storing to. This includes the load that
1287	// feeds the stores. Check for an alias by generating the base address and
1288	// checking everything.
1289	Value *StoreBasePtr = Expander.expandCodeFor(
1290	SH: StrStart, Ty: Builder.getPtrTy(AddrSpace: StrAS), I: Preheader->getTerminator());
1291
1292	// From here on out, conservatively report to the pass manager that we've
1293	// changed the IR, even if we later clean up these added instructions. There
1294	// may be structural differences e.g. in the order of use lists not accounted
1295	// for in just a textual dump of the IR. This is written as a variable, even
1296	// though statically all the places this dominates could be replaced with
1297	// 'true', with the hope that anyone trying to be clever / "more precise" with
1298	// the return value will read this comment, and leave them alone.
1299	Changed = true;
1300
1301	SmallPtrSet<Instruction *, `2`> IgnoredInsts;
1302	IgnoredInsts.insert(Ptr: TheStore);
1303
1304	bool IsMemCpy = isa<MemCpyInst>(Val: TheStore);
1305	const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";
1306
1307	bool LoopAccessStore =
1308	mayLoopAccessLocation(Ptr: StoreBasePtr, Access: ModRefInfo::ModRef, L: CurLoop, BECount,
1309	StoreSizeSCEV, AA&: *AA, IgnoredInsts);
1310	if (LoopAccessStore) {
1311	// For memmove case it's not enough to guarantee that loop doesn't access
1312	// TheStore and TheLoad. Additionally we need to make sure that TheStore is
1313	// the only user of TheLoad.
1314	if (!TheLoad->hasOneUse())
1315	return Changed;
1316	IgnoredInsts.insert(Ptr: TheLoad);
1317	if (mayLoopAccessLocation(Ptr: StoreBasePtr, Access: ModRefInfo::ModRef, L: CurLoop,
1318	BECount, StoreSizeSCEV, AA&: *AA, IgnoredInsts)) {
1319	ORE.emit(RemarkBuilder: [&]() {
1320	return OptimizationRemarkMissed (DEBUG_TYPE, "LoopMayAccessStore",
1321	TheStore)
1322	<< ore::NV ("Inst", InstRemark) << " in "
1323	<< ore::NV ("Function", TheStore->getFunction())
1324	<< " function will not be hoisted: "
1325	<< ore::NV ("Reason", "The loop may access store location");
1326	});
1327	return Changed;
1328	}
1329	IgnoredInsts.erase(Ptr: TheLoad);
1330	}
1331
1332	const SCEV *LdStart = LoadEv->getStart();
1333	unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
1334
1335	// Handle negative strided loops.
1336	if (IsNegStride)
1337	LdStart =
1338	getStartForNegStride(Start: LdStart, BECount, IntPtr: IntIdxTy, StoreSizeSCEV, SE);
1339
1340	// For a memcpy, we have to make sure that the input array is not being
1341	// mutated by the loop.
1342	Value *LoadBasePtr = Expander.expandCodeFor(SH: LdStart, Ty: Builder.getPtrTy(AddrSpace: LdAS),
1343	I: Preheader->getTerminator());
1344
1345	// If the store is a memcpy instruction, we must check if it will write to
1346	// the load memory locations. So remove it from the ignored stores.
1347	MemmoveVerifier Verifier(LoadBasePtr, StoreBasePtr, *DL);
1348	if (IsMemCpy && !Verifier.IsSameObject)
1349	IgnoredInsts.erase(Ptr: TheStore);
1350	if (mayLoopAccessLocation(Ptr: LoadBasePtr, Access: ModRefInfo::Mod, L: CurLoop, BECount,
1351	StoreSizeSCEV, AA&: *AA, IgnoredInsts)) {
1352	ORE.emit(RemarkBuilder: [&]() {
1353	return OptimizationRemarkMissed (DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
1354	<< ore::NV ("Inst", InstRemark) << " in "
1355	<< ore::NV ("Function", TheStore->getFunction())
1356	<< " function will not be hoisted: "
1357	<< ore::NV ("Reason", "The loop may access load location");
1358	});
1359	return Changed;
1360	}
1361
1362	bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
1363	if (UseMemMove)
1364	if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, TheLoad: *TheLoad,
1365	IsMemCpy))
1366	return Changed;
1367
1368	if (avoidLIRForMultiBlockLoop())
1369	return Changed;
1370
1371	// Okay, everything is safe, we can transform this!
1372
1373	const SCEV *NumBytesS =
1374	getNumBytes(BECount, IntPtr: IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
1375
1376	Value *NumBytes =
1377	Expander.expandCodeFor(SH: NumBytesS, Ty: IntIdxTy, I: Preheader->getTerminator());
1378
1379	AAMDNodes AATags = TheLoad->getAAMetadata();
1380	AAMDNodes StoreAATags = TheStore->getAAMetadata();
1381	AATags = AATags.merge(Other: StoreAATags);
1382	if (auto CI = dyn_cast<ConstantInt>(Val: NumBytes))
1383	AATags = AATags.extendTo(Len: CI->getZExtValue());
1384	else
1385	AATags = AATags.extendTo(Len: -`1`);
1386
1387	CallInst NewCall = nullptr*;
1388	// Check whether to generate an unordered atomic memcpy:
1389	// If the load or store are atomic, then they must necessarily be unordered
1390	// by previous checks.
1391	if (!TheStore->isAtomic() && !TheLoad->isAtomic()) {
1392	if (UseMemMove)
1393	NewCall = Builder.CreateMemMove(
1394	Dst: StoreBasePtr, DstAlign: StoreAlign, Src: LoadBasePtr, SrcAlign: LoadAlign, Size: NumBytes,
1395	/isVolatile=/false, TBAATag: AATags.TBAA, ScopeTag: AATags.Scope, NoAliasTag: AATags.NoAlias);
1396	else
1397	NewCall =
1398	Builder.CreateMemCpy(Dst: StoreBasePtr, DstAlign: StoreAlign, Src: LoadBasePtr, SrcAlign: LoadAlign,
1399	Size: NumBytes, /isVolatile=/false, TBAATag: AATags.TBAA,
1400	TBAAStructTag: AATags.TBAAStruct, ScopeTag: AATags.Scope, NoAliasTag: AATags.NoAlias);
1401	} else {
1402	// For now don't support unordered atomic memmove.
1403	if (UseMemMove)
1404	return Changed;
1405	// We cannot allow unaligned ops for unordered load/store, so reject
1406	// anything where the alignment isn't at least the element size.
1407	assert((StoreAlign && LoadAlign) &&
1408	"Expect unordered load/store to have align.");
1409	if (StoreAlign < StoreSize \|\| LoadAlign < StoreSize)
1410	return Changed;
1411
1412	// If the element.atomic memcpy is not lowered into explicit
1413	// loads/stores later, then it will be lowered into an element-size
1414	// specific lib call. If the lib call doesn't exist for our store size, then
1415	// we shouldn't generate the memcpy.
1416	if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1417	return Changed;
1418
1419	// Create the call.
1420	// Note that unordered atomic loads/stores are required* by the spec to*
1421	// have an alignment but non-atomic loads/stores may not.
1422	NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1423	Dst: StoreBasePtr, DstAlign: StoreAlign, Src: LoadBasePtr, SrcAlign: LoadAlign, Size: NumBytes, ElementSize: StoreSize,
1424	TBAATag: AATags.TBAA, TBAAStructTag: AATags.TBAAStruct, ScopeTag: AATags.Scope, NoAliasTag: AATags.NoAlias);
1425	}
1426	NewCall->setDebugLoc(TheStore->getDebugLoc());
1427
1428	if (MSSAU) {
1429	MemoryAccess *NewMemAcc = MSSAU ->createMemoryAccessInBB(
1430	I: NewCall, Definition: nullptr, BB: NewCall->getParent(), Point: MemorySSA::BeforeTerminator);
1431	MSSAU ->insertDef(Def: cast<MemoryDef>(Val: NewMemAcc), RenameUses: true);
1432	}
1433
1434	LLVM_DEBUG(dbgs() << " Formed new call: " << *NewCall << "\n"
1435	<< " from load ptr=" << LoadEv << " at: " << TheLoad
1436	<< "\n"
1437	<< " from store ptr=" << StoreEv << " at: " << TheStore
1438	<< "\n");
1439
1440	ORE.emit(RemarkBuilder: [&]() {
1441	return OptimizationRemark (DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
1442	NewCall->getDebugLoc(), Preheader)
1443	<< "Formed a call to "
1444	<< ore::NV ("NewFunction", NewCall->getCalledFunction())
1445	<< "() intrinsic from " << ore::NV ("Inst", InstRemark)
1446	<< " instruction in " << ore::NV ("Function", TheStore->getFunction())
1447	<< " function"
1448	<< ore::setExtraArgs ()
1449	<< ore::NV ("FromBlock", TheStore->getParent()->getName())
1450	<< ore::NV ("ToBlock", Preheader->getName());
1451	});
1452
1453	// Okay, a new call to memcpy/memmove has been formed. Zap the original store
1454	// and anything that feeds into it.
1455	if (MSSAU)
1456	MSSAU ->removeMemoryAccess(I: TheStore, OptimizePhis: true);
1457	deleteDeadInstruction(I: TheStore);
1458	if (MSSAU && VerifyMemorySSA)
1459	MSSAU ->getMemorySSA()->verifyMemorySSA();
1460	if (UseMemMove)
1461	++NumMemMove;
1462	else
1463	++NumMemCpy;
1464	ExpCleaner.markResultUsed();
1465	return true;
1466	}
1467
1468	// When compiling for codesize we avoid idiom recognition for a multi-block loop
1469	// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1470	//
1471	bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1472	bool IsLoopMemset) {
1473	if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > `1`) {
1474	if (CurLoop->isOutermost() && (!IsMemset \|\| !IsLoopMemset)) {
1475	LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
1476	<< " : LIR " << (IsMemset ? "Memset" : "Memcpy")
1477	<< " avoided: multi-block top-level loop\n");
1478	return true;
1479	}
1480	}
1481
1482	return false;
1483	}
1484
1485	bool LoopIdiomRecognize::runOnNoncountableLoop() {
1486	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
1487	<< CurLoop->getHeader()->getParent()->getName()
1488	<< "] Noncountable Loop %"
1489	<< CurLoop->getHeader()->getName() << "\n");
1490
1491	return recognizePopcount() \|\| recognizeAndInsertFFS() \|\|
1492	recognizeShiftUntilBitTest() \|\| recognizeShiftUntilZero() \|\|
1493	recognizeShiftUntilLessThan();
1494	}
1495
1496	/// Check if the given conditional branch is based on the comparison between
1497	/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
1498	/// true), the control yields to the loop entry. If the branch matches the
1499	/// behavior, the variable involved in the comparison is returned. This function
1500	/// will be called to see if the precondition and postcondition of the loop are
1501	/// in desirable form.
1502	static Value matchCondition(BranchInst BI, BasicBlock *LoopEntry,
1503	bool JmpOnZero = false) {
1504	if (!BI \|\| !BI->isConditional())
1505	return nullptr;
1506
1507	ICmpInst *Cond = dyn_cast<ICmpInst>(Val: BI->getCondition());
1508	if (!Cond)
1509	return nullptr;
1510
1511	ConstantInt *CmpZero = dyn_cast<ConstantInt>(Val: Cond->getOperand(i_nocapture: `1`));
1512	if (!CmpZero \|\| !CmpZero->isZero())
1513	return nullptr;
1514
1515	BasicBlock *TrueSucc = BI->getSuccessor(i: `0`);
1516	BasicBlock *FalseSucc = BI->getSuccessor(i: `1`);
1517	if (JmpOnZero)
1518	std::swap(a&: TrueSucc, b&: FalseSucc);
1519
1520	ICmpInst::Predicate Pred = Cond->getPredicate();
1521	if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) \|\|
1522	(Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1523	return Cond->getOperand(i_nocapture: `0`);
1524
1525	return nullptr;
1526	}
1527
1528	/// Check if the given conditional branch is based on an unsigned less-than
1529	/// comparison between a variable and a constant, and if the comparison is false
1530	/// the control yields to the loop entry. If the branch matches the behaviour,
1531	/// the variable involved in the comparison is returned.
1532	static Value matchShiftULTCondition(BranchInst BI, BasicBlock *LoopEntry,
1533	APInt &Threshold) {
1534	if (!BI \|\| !BI->isConditional())
1535	return nullptr;
1536
1537	ICmpInst *Cond = dyn_cast<ICmpInst>(Val: BI->getCondition());
1538	if (!Cond)
1539	return nullptr;
1540
1541	ConstantInt *CmpConst = dyn_cast<ConstantInt>(Val: Cond->getOperand(i_nocapture: `1`));
1542	if (!CmpConst)
1543	return nullptr;
1544
1545	BasicBlock *FalseSucc = BI->getSuccessor(i: `1`);
1546	ICmpInst::Predicate Pred = Cond->getPredicate();
1547
1548	if (Pred == ICmpInst::ICMP_ULT && FalseSucc == LoopEntry) {
1549	Threshold = CmpConst->getValue();
1550	return Cond->getOperand(i_nocapture: `0`);
1551	}
1552
1553	return nullptr;
1554	}
1555
1556	// Check if the recurrence variable `VarX` is in the right form to create
1557	// the idiom. Returns the value coerced to a PHINode if so.
1558	static PHINode getRecurrenceVar(Value VarX, Instruction *DefX,
1559	BasicBlock *LoopEntry) {
1560	auto *PhiX = dyn_cast<PHINode>(Val: VarX);
1561	if (PhiX && PhiX->getParent() == LoopEntry &&
1562	(PhiX->getOperand(i_nocapture: `0`) == DefX \|\| PhiX->getOperand(i_nocapture: `1`) == DefX))
1563	return PhiX;
1564	return nullptr;
1565	}
1566
1567	/// Return true if the idiom is detected in the loop.
1568	///
1569	/// Additionally:
1570	/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1571	/// or nullptr if there is no such.
1572	/// 2) \p CntPhi is set to the corresponding phi node
1573	/// or nullptr if there is no such.
1574	/// 3) \p InitX is set to the value whose CTLZ could be used.
1575	/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1576	/// 5) \p Threshold is set to the constant involved in the unsigned less-than
1577	/// comparison.
1578	///
1579	/// The core idiom we are trying to detect is:
1580	/// \code
1581	/// if (x0 < 2)
1582	/// goto loop-exit // the precondition of the loop
1583	/// cnt0 = init-val
1584	/// do {
1585	/// x = phi (x0, x.next); //PhiX
1586	/// cnt = phi (cnt0, cnt.next)
1587	///
1588	/// cnt.next = cnt + 1;
1589	/// ...
1590	/// x.next = x >> 1; // DefX
1591	/// } while (x >= 4)
1592	/// loop-exit:
1593	/// \endcode
1594	static bool detectShiftUntilLessThanIdiom(Loop CurLoop, const* DataLayout &DL,
1595	Intrinsic::ID &IntrinID,
1596	Value &InitX, Instruction &CntInst,
1597	PHINode &CntPhi, Instruction &DefX,
1598	APInt &Threshold) {
1599	BasicBlock *LoopEntry;
1600
1601	DefX = nullptr;
1602	CntInst = nullptr;
1603	CntPhi = nullptr;
1604	LoopEntry = *(CurLoop->block_begin());
1605
1606	// step 1: Check if the loop-back branch is in desirable form.
1607	if (Value *T = matchShiftULTCondition(
1608	BI: dyn_cast<BranchInst>(Val: LoopEntry->getTerminator()), LoopEntry,
1609	Threshold))
1610	DefX = dyn_cast<Instruction>(Val: T);
1611	else
1612	return false;
1613
1614	// step 2: Check the recurrence of variable X
1615	if (!DefX \|\| !isa<PHINode>(Val: DefX))
1616	return false;
1617
1618	PHINode *VarPhi = cast<PHINode>(Val: DefX);
1619	int Idx = VarPhi->getBasicBlockIndex(BB: LoopEntry);
1620	if (Idx == -`1`)
1621	return false;
1622
1623	DefX = dyn_cast<Instruction>(Val: VarPhi->getIncomingValue(i: Idx));
1624	if (!DefX \|\| DefX->getNumOperands() == `0` \|\| DefX->getOperand(i: `0`) != VarPhi)
1625	return false;
1626
1627	// step 3: detect instructions corresponding to "x.next = x >> 1"
1628	if (DefX->getOpcode() != Instruction::LShr)
1629	return false;
1630
1631	IntrinID = Intrinsic::ctlz;
1632	ConstantInt *Shft = dyn_cast<ConstantInt>(Val: DefX->getOperand(i: `1`));
1633	if (!Shft \|\| !Shft->isOne())
1634	return false;
1635
1636	InitX = VarPhi->getIncomingValueForBlock(BB: CurLoop->getLoopPreheader());
1637
1638	// step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
1639	// or cnt.next = cnt + -1.
1640	// TODO: We can skip the step. If loop trip count is known (CTLZ),
1641	// then all uses of "cnt.next" could be optimized to the trip count
1642	// plus "cnt0". Currently it is not optimized.
1643	// This step could be used to detect POPCNT instruction:
1644	// cnt.next = cnt + (x.next & 1)
1645	for (Instruction &Inst : llvm::make_range(
1646	x: LoopEntry->getFirstNonPHI()->getIterator(), y: LoopEntry->end())) {
1647	if (Inst.getOpcode() != Instruction::Add)
1648	continue;
1649
1650	ConstantInt *Inc = dyn_cast<ConstantInt>(Val: Inst.getOperand(i: `1`));
1651	if (!Inc \|\| (!Inc->isOne() && !Inc->isMinusOne()))
1652	continue;
1653
1654	PHINode *Phi = getRecurrenceVar(VarX: Inst.getOperand(i: `0`), DefX: &Inst, LoopEntry);
1655	if (!Phi)
1656	continue;
1657
1658	CntInst = &Inst;
1659	CntPhi = Phi;
1660	break;
1661	}
1662	if (!CntInst)
1663	return false;
1664
1665	return true;
1666	}
1667
1668	/// Return true iff the idiom is detected in the loop.
1669	///
1670	/// Additionally:
1671	/// 1) \p CntInst is set to the instruction counting the population bit.
1672	/// 2) \p CntPhi is set to the corresponding phi node.
1673	/// 3) \p Var is set to the value whose population bits are being counted.
1674	///
1675	/// The core idiom we are trying to detect is:
1676	/// \code
1677	/// if (x0 != 0)
1678	/// goto loop-exit // the precondition of the loop
1679	/// cnt0 = init-val;
1680	/// do {
1681	/// x1 = phi (x0, x2);
1682	/// cnt1 = phi(cnt0, cnt2);
1683	///
1684	/// cnt2 = cnt1 + 1;
1685	/// ...
1686	/// x2 = x1 & (x1 - 1);
1687	/// ...
1688	/// } while(x != 0);
1689	///
1690	/// loop-exit:
1691	/// \endcode
1692	static bool detectPopcountIdiom(Loop CurLoop, BasicBlock PreCondBB,
1693	Instruction &CntInst, PHINode &CntPhi,
1694	Value *&Var) {
1695	// step 1: Check to see if the look-back branch match this pattern:
1696	// "if (a!=0) goto loop-entry".
1697	BasicBlock *LoopEntry;
1698	Instruction DefX2, CountInst;
1699	Value VarX1, VarX0;
1700	PHINode PhiX, CountPhi;
1701
1702	DefX2 = CountInst = nullptr;
1703	VarX1 = VarX0 = nullptr;
1704	PhiX = CountPhi = nullptr;
1705	LoopEntry = *(CurLoop->block_begin());
1706
1707	// step 1: Check if the loop-back branch is in desirable form.
1708	{
1709	if (Value *T = matchCondition(
1710	BI: dyn_cast<BranchInst>(Val: LoopEntry->getTerminator()), LoopEntry))
1711	DefX2 = dyn_cast<Instruction>(Val: T);
1712	else
1713	return false;
1714	}
1715
1716	// step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1717	{
1718	if (!DefX2 \|\| DefX2->getOpcode() != Instruction::And)
1719	return false;
1720
1721	BinaryOperator *SubOneOp;
1722
1723	if ((SubOneOp = dyn_cast<BinaryOperator>(Val: DefX2->getOperand(i: `0`))))
1724	VarX1 = DefX2->getOperand(i: `1`);
1725	else {
1726	VarX1 = DefX2->getOperand(i: `0`);
1727	SubOneOp = dyn_cast<BinaryOperator>(Val: DefX2->getOperand(i: `1`));
1728	}
1729	if (!SubOneOp \|\| SubOneOp->getOperand(i_nocapture: `0`) != VarX1)
1730	return false;
1731
1732	ConstantInt *Dec = dyn_cast<ConstantInt>(Val: SubOneOp->getOperand(i_nocapture: `1`));
1733	if (!Dec \|\|
1734	!((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) \|\|
1735	(SubOneOp->getOpcode() == Instruction::Add &&
1736	Dec->isMinusOne()))) {
1737	return false;
1738	}
1739	}
1740
1741	// step 3: Check the recurrence of variable X
1742	PhiX = getRecurrenceVar(VarX: VarX1, DefX: DefX2, LoopEntry);
1743	if (!PhiX)
1744	return false;
1745
1746	// step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
1747	{
1748	CountInst = nullptr;
1749	for (Instruction &Inst : llvm::make_range(
1750	x: LoopEntry->getFirstNonPHI()->getIterator(), y: LoopEntry->end())) {
1751	if (Inst.getOpcode() != Instruction::Add)
1752	continue;
1753
1754	ConstantInt *Inc = dyn_cast<ConstantInt>(Val: Inst.getOperand(i: `1`));
1755	if (!Inc \|\| !Inc->isOne())
1756	continue;
1757
1758	PHINode *Phi = getRecurrenceVar(VarX: Inst.getOperand(i: `0`), DefX: &Inst, LoopEntry);
1759	if (!Phi)
1760	continue;
1761
1762	// Check if the result of the instruction is live of the loop.
1763	bool LiveOutLoop = false;
1764	for (User *U : Inst.users()) {
1765	if ((cast<Instruction>(Val: U))->getParent() != LoopEntry) {
1766	LiveOutLoop = true;
1767	break;
1768	}
1769	}
1770
1771	if (LiveOutLoop) {
1772	CountInst = &Inst;
1773	CountPhi = Phi;
1774	break;
1775	}
1776	}
1777
1778	if (!CountInst)
1779	return false;
1780	}
1781
1782	// step 5: check if the precondition is in this form:
1783	// "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1784	{
1785	auto *PreCondBr = dyn_cast<BranchInst>(Val: PreCondBB->getTerminator());
1786	Value *T = matchCondition(BI: PreCondBr, LoopEntry: CurLoop->getLoopPreheader());
1787	if (T != PhiX->getOperand(i_nocapture: `0`) && T != PhiX->getOperand(i_nocapture: `1`))
1788	return false;
1789
1790	CntInst = CountInst;
1791	CntPhi = CountPhi;
1792	Var = T;
1793	}
1794
1795	return true;
1796	}
1797
1798	/// Return true if the idiom is detected in the loop.
1799	///
1800	/// Additionally:
1801	/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1802	/// or nullptr if there is no such.
1803	/// 2) \p CntPhi is set to the corresponding phi node
1804	/// or nullptr if there is no such.
1805	/// 3) \p Var is set to the value whose CTLZ could be used.
1806	/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1807	///
1808	/// The core idiom we are trying to detect is:
1809	/// \code
1810	/// if (x0 == 0)
1811	/// goto loop-exit // the precondition of the loop
1812	/// cnt0 = init-val;
1813	/// do {
1814	/// x = phi (x0, x.next); //PhiX
1815	/// cnt = phi(cnt0, cnt.next);
1816	///
1817	/// cnt.next = cnt + 1;
1818	/// ...
1819	/// x.next = x >> 1; // DefX
1820	/// ...
1821	/// } while(x.next != 0);
1822	///
1823	/// loop-exit:
1824	/// \endcode
1825	static bool detectShiftUntilZeroIdiom(Loop CurLoop, const* DataLayout &DL,
1826	Intrinsic::ID &IntrinID, Value *&InitX,
1827	Instruction &CntInst, PHINode &CntPhi,
1828	Instruction *&DefX) {
1829	BasicBlock *LoopEntry;
1830	Value VarX = nullptr*;
1831
1832	DefX = nullptr;
1833	CntInst = nullptr;
1834	CntPhi = nullptr;
1835	LoopEntry = *(CurLoop->block_begin());
1836
1837	// step 1: Check if the loop-back branch is in desirable form.
1838	if (Value *T = matchCondition(
1839	BI: dyn_cast<BranchInst>(Val: LoopEntry->getTerminator()), LoopEntry))
1840	DefX = dyn_cast<Instruction>(Val: T);
1841	else
1842	return false;
1843
1844	// step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1845	if (!DefX \|\| !DefX->isShift())
1846	return false;
1847	IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1848	Intrinsic::ctlz;
1849	ConstantInt *Shft = dyn_cast<ConstantInt>(Val: DefX->getOperand(i: `1`));
1850	if (!Shft \|\| !Shft->isOne())
1851	return false;
1852	VarX = DefX->getOperand(i: `0`);
1853
1854	// step 3: Check the recurrence of variable X
1855	PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1856	if (!PhiX)
1857	return false;
1858
1859	InitX = PhiX->getIncomingValueForBlock(BB: CurLoop->getLoopPreheader());
1860
1861	// Make sure the initial value can't be negative otherwise the ashr in the
1862	// loop might never reach zero which would make the loop infinite.
1863	if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(V: InitX, SQ: DL))
1864	return false;
1865
1866	// step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
1867	// or cnt.next = cnt + -1.
1868	// TODO: We can skip the step. If loop trip count is known (CTLZ),
1869	// then all uses of "cnt.next" could be optimized to the trip count
1870	// plus "cnt0". Currently it is not optimized.
1871	// This step could be used to detect POPCNT instruction:
1872	// cnt.next = cnt + (x.next & 1)
1873	for (Instruction &Inst : llvm::make_range(
1874	x: LoopEntry->getFirstNonPHI()->getIterator(), y: LoopEntry->end())) {
1875	if (Inst.getOpcode() != Instruction::Add)
1876	continue;
1877
1878	ConstantInt *Inc = dyn_cast<ConstantInt>(Val: Inst.getOperand(i: `1`));
1879	if (!Inc \|\| (!Inc->isOne() && !Inc->isMinusOne()))
1880	continue;
1881
1882	PHINode *Phi = getRecurrenceVar(VarX: Inst.getOperand(i: `0`), DefX: &Inst, LoopEntry);
1883	if (!Phi)
1884	continue;
1885
1886	CntInst = &Inst;
1887	CntPhi = Phi;
1888	break;
1889	}
1890	if (!CntInst)
1891	return false;
1892
1893	return true;
1894	}
1895
1896	// Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1897	// profitable if we delete the loop.
1898	bool LoopIdiomRecognize::isProfitableToInsertFFS(Intrinsic::ID IntrinID,
1899	Value InitX, bool* ZeroCheck,
1900	size_t CanonicalSize) {
1901	const Value *Args[] = {InitX,
1902	ConstantInt::getBool(Context&: InitX->getContext(), V: ZeroCheck)};
1903
1904	// @llvm.dbg doesn't count as they have no semantic effect.
1905	auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1906	uint32_t HeaderSize =
1907	std::distance(first: InstWithoutDebugIt.begin(), last: InstWithoutDebugIt.end());
1908
1909	IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1910	InstructionCost Cost = TTI->getIntrinsicInstrCost(
1911	ICA: Attrs, CostKind: TargetTransformInfo::TCK_SizeAndLatency);
1912	if (HeaderSize != CanonicalSize && Cost > TargetTransformInfo::TCC_Basic)
1913	return false;
1914
1915	return true;
1916	}
1917
1918	/// Convert CTLZ / CTTZ idiom loop into countable loop.
1919	/// If CTLZ / CTTZ inserted as a new trip count returns true; otherwise,
1920	/// returns false.
1921	bool LoopIdiomRecognize::insertFFSIfProfitable(Intrinsic::ID IntrinID,
1922	Value InitX, Instruction DefX,
1923	PHINode *CntPhi,
1924	Instruction *CntInst) {
1925	bool IsCntPhiUsedOutsideLoop = false;
1926	for (User *U : CntPhi->users())
1927	if (!CurLoop->contains(Inst: cast<Instruction>(Val: U))) {
1928	IsCntPhiUsedOutsideLoop = true;
1929	break;
1930	}
1931	bool IsCntInstUsedOutsideLoop = false;
1932	for (User *U : CntInst->users())
1933	if (!CurLoop->contains(Inst: cast<Instruction>(Val: U))) {
1934	IsCntInstUsedOutsideLoop = true;
1935	break;
1936	}
1937	// If both CntInst and CntPhi are used outside the loop the profitability
1938	// is questionable.
1939	if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1940	return false;
1941
1942	// For some CPUs result of CTLZ(X) intrinsic is undefined
1943	// when X is 0. If we can not guarantee X != 0, we need to check this
1944	// when expand.
1945	bool ZeroCheck = false;
1946	// It is safe to assume Preheader exist as it was checked in
1947	// parent function RunOnLoop.
1948	BasicBlock *PH = CurLoop->getLoopPreheader();
1949
1950	// If we are using the count instruction outside the loop, make sure we
1951	// have a zero check as a precondition. Without the check the loop would run
1952	// one iteration for before any check of the input value. This means 0 and 1
1953	// would have identical behavior in the original loop and thus
1954	if (!IsCntPhiUsedOutsideLoop) {
1955	auto *PreCondBB = PH->getSinglePredecessor();
1956	if (!PreCondBB)
1957	return false;
1958	auto *PreCondBI = dyn_cast<BranchInst>(Val: PreCondBB->getTerminator());
1959	if (!PreCondBI)
1960	return false;
1961	if (matchCondition(BI: PreCondBI, LoopEntry: PH) != InitX)
1962	return false;
1963	ZeroCheck = true;
1964	}
1965
1966	// FFS idiom loop has only 6 instructions:
1967	// %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1968	// %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1969	// %shr = ashr %n.addr.0, 1
1970	// %tobool = icmp eq %shr, 0
1971	// %inc = add nsw %i.0, 1
1972	// br i1 %tobool
1973	size_t IdiomCanonicalSize = `6`;
1974	if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, CanonicalSize: IdiomCanonicalSize))
1975	return false;
1976
1977	transformLoopToCountable(IntrinID, PreCondBB: PH, CntInst, CntPhi, Var: InitX, DefX,
1978	DL: DefX->getDebugLoc(), ZeroCheck,
1979	IsCntPhiUsedOutsideLoop);
1980	return true;
1981	}
1982
1983	/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1984	/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
1985	/// trip count returns true; otherwise, returns false.
1986	bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1987	// Give up if the loop has multiple blocks or multiple backedges.
1988	if (CurLoop->getNumBackEdges() != `1` \|\| CurLoop->getNumBlocks() != `1`)
1989	return false;
1990
1991	Intrinsic::ID IntrinID;
1992	Value *InitX;
1993	Instruction DefX = nullptr*;
1994	PHINode CntPhi = nullptr*;
1995	Instruction CntInst = nullptr*;
1996
1997	if (!detectShiftUntilZeroIdiom(CurLoop, DL: *DL, IntrinID, InitX, CntInst, CntPhi,
1998	DefX))
1999	return false;
2000
2001	return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
2002	}
2003
2004	bool LoopIdiomRecognize::recognizeShiftUntilLessThan() {
2005	// Give up if the loop has multiple blocks or multiple backedges.
2006	if (CurLoop->getNumBackEdges() != `1` \|\| CurLoop->getNumBlocks() != `1`)
2007	return false;
2008
2009	Intrinsic::ID IntrinID;
2010	Value *InitX;
2011	Instruction DefX = nullptr*;
2012	PHINode CntPhi = nullptr*;
2013	Instruction CntInst = nullptr*;
2014
2015	APInt LoopThreshold;
2016	if (!detectShiftUntilLessThanIdiom(CurLoop, DL: *DL, IntrinID, InitX, CntInst,
2017	CntPhi, DefX, Threshold&: LoopThreshold))
2018	return false;
2019
2020	if (LoopThreshold == `2`) {
2021	// Treat as regular FFS.
2022	return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
2023	}
2024
2025	// Look for Floor Log2 Idiom.
2026	if (LoopThreshold != `4`)
2027	return false;
2028
2029	// Abort if CntPhi is used outside of the loop.
2030	for (User *U : CntPhi->users())
2031	if (!CurLoop->contains(Inst: cast<Instruction>(Val: U)))
2032	return false;
2033
2034	// It is safe to assume Preheader exist as it was checked in
2035	// parent function RunOnLoop.
2036	BasicBlock *PH = CurLoop->getLoopPreheader();
2037	auto *PreCondBB = PH->getSinglePredecessor();
2038	if (!PreCondBB)
2039	return false;
2040	auto *PreCondBI = dyn_cast<BranchInst>(Val: PreCondBB->getTerminator());
2041	if (!PreCondBI)
2042	return false;
2043
2044	APInt PreLoopThreshold;
2045	if (matchShiftULTCondition(BI: PreCondBI, LoopEntry: PH, Threshold&: PreLoopThreshold) != InitX \|\|
2046	PreLoopThreshold != `2`)
2047	return false;
2048
2049	bool ZeroCheck = true;
2050
2051	// the loop has only 6 instructions:
2052	// %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
2053	// %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
2054	// %shr = ashr %n.addr.0, 1
2055	// %tobool = icmp ult %n.addr.0, C
2056	// %inc = add nsw %i.0, 1
2057	// br i1 %tobool
2058	size_t IdiomCanonicalSize = `6`;
2059	if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, CanonicalSize: IdiomCanonicalSize))
2060	return false;
2061
2062	// log2(x) = w − 1 − clz(x)
2063	transformLoopToCountable(IntrinID, PreCondBB: PH, CntInst, CntPhi, Var: InitX, DefX,
2064	DL: DefX->getDebugLoc(), ZeroCheck,
2065	/IsCntPhiUsedOutsideLoop=/false,
2066	/InsertSub=/true);
2067	return true;
2068	}
2069
2070	/// Recognizes a population count idiom in a non-countable loop.
2071	///
2072	/// If detected, transforms the relevant code to issue the popcount intrinsic
2073	/// function call, and returns true; otherwise, returns false.
2074	bool LoopIdiomRecognize::recognizePopcount() {
2075	if (TTI->getPopcntSupport(IntTyWidthInBit: `32`) != TargetTransformInfo::PSK_FastHardware)
2076	return false;
2077
2078	// Counting population are usually conducted by few arithmetic instructions.
2079	// Such instructions can be easily "absorbed" by vacant slots in a
2080	// non-compact loop. Therefore, recognizing popcount idiom only makes sense
2081	// in a compact loop.
2082
2083	// Give up if the loop has multiple blocks or multiple backedges.
2084	if (CurLoop->getNumBackEdges() != `1` \|\| CurLoop->getNumBlocks() != `1`)
2085	return false;
2086
2087	BasicBlock LoopBody = (CurLoop->block_begin());
2088	if (LoopBody->size() >= `20`) {
2089	// The loop is too big, bail out.
2090	return false;
2091	}
2092
2093	// It should have a preheader containing nothing but an unconditional branch.
2094	BasicBlock *PH = CurLoop->getLoopPreheader();
2095	if (!PH \|\| &PH->front() != PH->getTerminator())
2096	return false;
2097	auto *EntryBI = dyn_cast<BranchInst>(Val: PH->getTerminator());
2098	if (!EntryBI \|\| EntryBI->isConditional())
2099	return false;
2100
2101	// It should have a precondition block where the generated popcount intrinsic
2102	// function can be inserted.
2103	auto *PreCondBB = PH->getSinglePredecessor();
2104	if (!PreCondBB)
2105	return false;
2106	auto *PreCondBI = dyn_cast<BranchInst>(Val: PreCondBB->getTerminator());
2107	if (!PreCondBI \|\| PreCondBI->isUnconditional())
2108	return false;
2109
2110	Instruction *CntInst;
2111	PHINode *CntPhi;
2112	Value *Val;
2113	if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Var&: Val))
2114	return false;
2115
2116	transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Var: Val);
2117	return true;
2118	}
2119
2120	static CallInst createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value Val,
2121	const DebugLoc &DL) {
2122	Value *Ops[] = {Val};
2123	Type *Tys[] = {Val->getType()};
2124
2125	Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
2126	Function *Func = Intrinsic::getDeclaration(M, id: Intrinsic::ctpop, Tys);
2127	CallInst *CI = IRBuilder.CreateCall(Callee: Func, Args: Ops);
2128	CI->setDebugLoc(DL);
2129
2130	return CI;
2131	}
2132
2133	static CallInst createFFSIntrinsic(IRBuilder<> &IRBuilder, Value Val,
2134	const DebugLoc &DL, bool ZeroCheck,
2135	Intrinsic::ID IID) {
2136	Value *Ops[] = {Val, IRBuilder.getInt1(V: ZeroCheck)};
2137	Type *Tys[] = {Val->getType()};
2138
2139	Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
2140	Function *Func = Intrinsic::getDeclaration(M, id: IID, Tys);
2141	CallInst *CI = IRBuilder.CreateCall(Callee: Func, Args: Ops);
2142	CI->setDebugLoc(DL);
2143
2144	return CI;
2145	}
2146
2147	/// Transform the following loop (Using CTLZ, CTTZ is similar):
2148	/// loop:
2149	/// CntPhi = PHI [Cnt0, CntInst]
2150	/// PhiX = PHI [InitX, DefX]
2151	/// CntInst = CntPhi + 1
2152	/// DefX = PhiX >> 1
2153	/// LOOP_BODY
2154	/// Br: loop if (DefX != 0)
2155	/// Use(CntPhi) or Use(CntInst)
2156	///
2157	/// Into:
2158	/// If CntPhi used outside the loop:
2159	/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
2160	/// Count = CountPrev + 1
2161	/// else
2162	/// Count = BitWidth(InitX) - CTLZ(InitX)
2163	/// loop:
2164	/// CntPhi = PHI [Cnt0, CntInst]
2165	/// PhiX = PHI [InitX, DefX]
2166	/// PhiCount = PHI [Count, Dec]
2167	/// CntInst = CntPhi + 1
2168	/// DefX = PhiX >> 1
2169	/// Dec = PhiCount - 1
2170	/// LOOP_BODY
2171	/// Br: loop if (Dec != 0)
2172	/// Use(CountPrev + Cnt0) // Use(CntPhi)
2173	/// or
2174	/// Use(Count + Cnt0) // Use(CntInst)
2175	///
2176	/// If LOOP_BODY is empty the loop will be deleted.
2177	/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
2178	void LoopIdiomRecognize::transformLoopToCountable(
2179	Intrinsic::ID IntrinID, BasicBlock Preheader, Instruction CntInst,
2180	PHINode CntPhi, Value InitX, Instruction DefX, const* DebugLoc &DL,
2181	bool ZeroCheck, bool IsCntPhiUsedOutsideLoop, bool InsertSub) {
2182	BranchInst *PreheaderBr = cast<BranchInst>(Val: Preheader->getTerminator());
2183
2184	// Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
2185	IRBuilder<> Builder(PreheaderBr);
2186	Builder.SetCurrentDebugLocation(DL);
2187
2188	// If there are no uses of CntPhi crate:
2189	// Count = BitWidth - CTLZ(InitX);
2190	// NewCount = Count;
2191	// If there are uses of CntPhi create:
2192	// NewCount = BitWidth - CTLZ(InitX >> 1);
2193	// Count = NewCount + 1;
2194	Value *InitXNext;
2195	if (IsCntPhiUsedOutsideLoop) {
2196	if (DefX->getOpcode() == Instruction::AShr)
2197	InitXNext = Builder.CreateAShr(LHS: InitX, RHS: `1`);
2198	else if (DefX->getOpcode() == Instruction::LShr)
2199	InitXNext = Builder.CreateLShr(LHS: InitX, RHS: `1`);
2200	else if (DefX->getOpcode() == Instruction::Shl) // cttz
2201	InitXNext = Builder.CreateShl(LHS: InitX, RHS: `1`);
2202	else
2203	llvm_unreachable("Unexpected opcode!");
2204	} else
2205	InitXNext = InitX;
2206	Value *Count =
2207	createFFSIntrinsic(IRBuilder&: Builder, Val: InitXNext, DL, ZeroCheck, IID: IntrinID);
2208	Type *CountTy = Count->getType();
2209	Count = Builder.CreateSub(
2210	LHS: ConstantInt::get(Ty: CountTy, V: CountTy->getIntegerBitWidth()), RHS: Count);
2211	if (InsertSub)
2212	Count = Builder.CreateSub(LHS: Count, RHS: ConstantInt::get(Ty: CountTy, V: `1`));
2213	Value *NewCount = Count;
2214	if (IsCntPhiUsedOutsideLoop)
2215	Count = Builder.CreateAdd(LHS: Count, RHS: ConstantInt::get(Ty: CountTy, V: `1`));
2216
2217	NewCount = Builder.CreateZExtOrTrunc(V: NewCount, DestTy: CntInst->getType());
2218
2219	Value *CntInitVal = CntPhi->getIncomingValueForBlock(BB: Preheader);
2220	if (cast<ConstantInt>(Val: CntInst->getOperand(i: `1`))->isOne()) {
2221	// If the counter was being incremented in the loop, add NewCount to the
2222	// counter's initial value, but only if the initial value is not zero.
2223	ConstantInt *InitConst = dyn_cast<ConstantInt>(Val: CntInitVal);
2224	if (!InitConst \|\| !InitConst->isZero())
2225	NewCount = Builder.CreateAdd(LHS: NewCount, RHS: CntInitVal);
2226	} else {
2227	// If the count was being decremented in the loop, subtract NewCount from
2228	// the counter's initial value.
2229	NewCount = Builder.CreateSub(LHS: CntInitVal, RHS: NewCount);
2230	}
2231
2232	// Step 2: Insert new IV and loop condition:
2233	// loop:
2234	// ...
2235	// PhiCount = PHI [Count, Dec]
2236	// ...
2237	// Dec = PhiCount - 1
2238	// ...
2239	// Br: loop if (Dec != 0)
2240	BasicBlock Body = (CurLoop->block_begin());
2241	auto *LbBr = cast<BranchInst>(Val: Body->getTerminator());
2242	ICmpInst *LbCond = cast<ICmpInst>(Val: LbBr->getCondition());
2243
2244	PHINode *TcPhi = PHINode::Create(Ty: CountTy, NumReservedValues: `2`, NameStr: "tcphi");
2245	TcPhi->insertBefore(InsertPos: Body->begin());
2246
2247	Builder.SetInsertPoint(LbCond);
2248	Instruction *TcDec = cast<Instruction>(Val: Builder.CreateSub(
2249	LHS: TcPhi, RHS: ConstantInt::get(Ty: CountTy, V: `1`), Name: "tcdec", HasNUW: false, HasNSW: true));
2250
2251	TcPhi->addIncoming(V: Count, BB: Preheader);
2252	TcPhi->addIncoming(V: TcDec, BB: Body);
2253
2254	CmpInst::Predicate Pred =
2255	(LbBr->getSuccessor(i: `0`) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
2256	LbCond->setPredicate(Pred);
2257	LbCond->setOperand(i_nocapture: `0`, Val_nocapture: TcDec);
2258	LbCond->setOperand(i_nocapture: `1`, Val_nocapture: ConstantInt::get(Ty: CountTy, V: `0`));
2259
2260	// Step 3: All the references to the original counter outside
2261	// the loop are replaced with the NewCount
2262	if (IsCntPhiUsedOutsideLoop)
2263	CntPhi->replaceUsesOutsideBlock(V: NewCount, BB: Body);
2264	else
2265	CntInst->replaceUsesOutsideBlock(V: NewCount, BB: Body);
2266
2267	// step 4: Forget the "non-computable" trip-count SCEV associated with the
2268	// loop. The loop would otherwise not be deleted even if it becomes empty.
2269	SE->forgetLoop(L: CurLoop);
2270	}
2271
2272	void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
2273	Instruction *CntInst,
2274	PHINode CntPhi, Value Var) {
2275	BasicBlock *PreHead = CurLoop->getLoopPreheader();
2276	auto *PreCondBr = cast<BranchInst>(Val: PreCondBB->getTerminator());
2277	const DebugLoc &DL = CntInst->getDebugLoc();
2278
2279	// Assuming before transformation, the loop is following:
2280	// if (x) // the precondition
2281	// do { cnt++; x &= x - 1; } while(x);
2282
2283	// Step 1: Insert the ctpop instruction at the end of the precondition block
2284	IRBuilder<> Builder(PreCondBr);
2285	Value PopCnt, PopCntZext, NewCount, TripCnt;
2286	{
2287	PopCnt = createPopcntIntrinsic(IRBuilder&: Builder, Val: Var, DL);
2288	NewCount = PopCntZext =
2289	Builder.CreateZExtOrTrunc(V: PopCnt, DestTy: cast<IntegerType>(Val: CntPhi->getType()));
2290
2291	if (NewCount != PopCnt)
2292	(cast<Instruction>(Val: NewCount))->setDebugLoc(DL);
2293
2294	// TripCnt is exactly the number of iterations the loop has
2295	TripCnt = NewCount;
2296
2297	// If the population counter's initial value is not zero, insert Add Inst.
2298	Value *CntInitVal = CntPhi->getIncomingValueForBlock(BB: PreHead);
2299	ConstantInt *InitConst = dyn_cast<ConstantInt>(Val: CntInitVal);
2300	if (!InitConst \|\| !InitConst->isZero()) {
2301	NewCount = Builder.CreateAdd(LHS: NewCount, RHS: CntInitVal);
2302	(cast<Instruction>(Val: NewCount))->setDebugLoc(DL);
2303	}
2304	}
2305
2306	// Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
2307	// "if (NewCount == 0) loop-exit". Without this change, the intrinsic
2308	// function would be partial dead code, and downstream passes will drag
2309	// it back from the precondition block to the preheader.
2310	{
2311	ICmpInst *PreCond = cast<ICmpInst>(Val: PreCondBr->getCondition());
2312
2313	Value *Opnd0 = PopCntZext;
2314	Value *Opnd1 = ConstantInt::get(Ty: PopCntZext->getType(), V: `0`);
2315	if (PreCond->getOperand(i_nocapture: `0`) != Var)
2316	std::swap(a&: Opnd0, b&: Opnd1);
2317
2318	ICmpInst *NewPreCond = cast<ICmpInst>(
2319	Val: Builder.CreateICmp(P: PreCond->getPredicate(), LHS: Opnd0, RHS: Opnd1));
2320	PreCondBr->setCondition(NewPreCond);
2321
2322	RecursivelyDeleteTriviallyDeadInstructions(V: PreCond, TLI);
2323	}
2324
2325	// Step 3: Note that the population count is exactly the trip count of the
2326	// loop in question, which enable us to convert the loop from noncountable
2327	// loop into a countable one. The benefit is twofold:
2328	//
2329	// - If the loop only counts population, the entire loop becomes dead after
2330	// the transformation. It is a lot easier to prove a countable loop dead
2331	// than to prove a noncountable one. (In some C dialects, an infinite loop
2332	// isn't dead even if it computes nothing useful. In general, DCE needs
2333	// to prove a noncountable loop finite before safely delete it.)
2334	//
2335	// - If the loop also performs something else, it remains alive.
2336	// Since it is transformed to countable form, it can be aggressively
2337	// optimized by some optimizations which are in general not applicable
2338	// to a noncountable loop.
2339	//
2340	// After this step, this loop (conceptually) would look like following:
2341	// newcnt = __builtin_ctpop(x);
2342	// t = newcnt;
2343	// if (x)
2344	// do { cnt++; x &= x-1; t--) } while (t > 0);
2345	BasicBlock Body = (CurLoop->block_begin());
2346	{
2347	auto *LbBr = cast<BranchInst>(Val: Body->getTerminator());
2348	ICmpInst *LbCond = cast<ICmpInst>(Val: LbBr->getCondition());
2349	Type *Ty = TripCnt->getType();
2350
2351	PHINode *TcPhi = PHINode::Create(Ty, NumReservedValues: `2`, NameStr: "tcphi");
2352	TcPhi->insertBefore(InsertPos: Body->begin());
2353
2354	Builder.SetInsertPoint(LbCond);
2355	Instruction *TcDec = cast<Instruction>(
2356	Val: Builder.CreateSub(LHS: TcPhi, RHS: ConstantInt::get(Ty, V: `1`),
2357	Name: "tcdec", HasNUW: false, HasNSW: true));
2358
2359	TcPhi->addIncoming(V: TripCnt, BB: PreHead);
2360	TcPhi->addIncoming(V: TcDec, BB: Body);
2361
2362	CmpInst::Predicate Pred =
2363	(LbBr->getSuccessor(i: `0`) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
2364	LbCond->setPredicate(Pred);
2365	LbCond->setOperand(i_nocapture: `0`, Val_nocapture: TcDec);
2366	LbCond->setOperand(i_nocapture: `1`, Val_nocapture: ConstantInt::get(Ty, V: `0`));
2367	}
2368
2369	// Step 4: All the references to the original population counter outside
2370	// the loop are replaced with the NewCount -- the value returned from
2371	// __builtin_ctpop().
2372	CntInst->replaceUsesOutsideBlock(V: NewCount, BB: Body);
2373
2374	// step 5: Forget the "non-computable" trip-count SCEV associated with the
2375	// loop. The loop would otherwise not be deleted even if it becomes empty.
2376	SE->forgetLoop(L: CurLoop);
2377	}
2378
2379	/// Match loop-invariant value.
2380	template <typename SubPattern_t> struct match_LoopInvariant {
2381	SubPattern_t SubPattern;
2382	const Loop *L;
2383
2384	match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
2385	: SubPattern(SP), L(L) {}
2386
2387	template <typename ITy> bool match(ITy *V) {
2388	return L->isLoopInvariant(V) && SubPattern.match(V);
2389	}
2390	};
2391
2392	/// Matches if the value is loop-invariant.
2393	template <typename Ty>
2394	inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
2395	return match_LoopInvariant<Ty>(M, L);
2396	}
2397
2398	/// Return true if the idiom is detected in the loop.
2399	///
2400	/// The core idiom we are trying to detect is:
2401	/// \code
2402	/// entry:
2403	/// <...>
2404	/// %bitmask = shl i32 1, %bitpos
2405	/// br label %loop
2406	///
2407	/// loop:
2408	/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2409	/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2410	/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2411	/// %x.next = shl i32 %x.curr, 1
2412	/// <...>
2413	/// br i1 %x.curr.isbitunset, label %loop, label %end
2414	///
2415	/// end:
2416	/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2417	/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2418	/// <...>
2419	/// \endcode
2420	static bool detectShiftUntilBitTestIdiom(Loop CurLoop, Value &BaseX,
2421	Value &BitMask, Value &BitPos,
2422	Value &CurrX, Instruction &NextX) {
2423	LLVM_DEBUG(dbgs() << DEBUG_TYPE
2424	" Performing shift-until-bittest idiom detection.\n");
2425
2426	// Give up if the loop has multiple blocks or multiple backedges.
2427	if (CurLoop->getNumBlocks() != `1` \|\| CurLoop->getNumBackEdges() != `1`) {
2428	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2429	return false;
2430	}
2431
2432	BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2433	BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2434	assert(LoopPreheaderBB && "There is always a loop preheader.");
2435
2436	using namespace PatternMatch;
2437
2438	// Step 1: Check if the loop backedge is in desirable form.
2439
2440	ICmpInst::Predicate Pred;
2441	Value CmpLHS, CmpRHS;
2442	BasicBlock TrueBB, FalseBB;
2443	if (!match(V: LoopHeaderBB->getTerminator(),
2444	P: m_Br(C: m_ICmp(Pred, L: m_Value(V&: CmpLHS), R: m_Value(V&: CmpRHS)),
2445	T: m_BasicBlock(V&: TrueBB), F: m_BasicBlock(V&: FalseBB)))) {
2446	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2447	return false;
2448	}
2449
2450	// Step 2: Check if the backedge's condition is in desirable form.
2451
2452	auto MatchVariableBitMask = [&]() {
2453	return ICmpInst::isEquality(P: Pred) && match(V: CmpRHS, P: m_Zero()) &&
2454	match(V: CmpLHS,
2455	P: m_c_And(L: m_Value(V&: CurrX),
2456	R: m_CombineAnd(
2457	L: m_Value(V&: BitMask),
2458	R: m_LoopInvariant(M: m_Shl(L: m_One(), R: m_Value(V&: BitPos)),
2459	L: CurLoop))));
2460	};
2461	auto MatchConstantBitMask = [&]() {
2462	return ICmpInst::isEquality(P: Pred) && match(V: CmpRHS, P: m_Zero()) &&
2463	match(V: CmpLHS, P: m_And(L: m_Value(V&: CurrX),
2464	R: m_CombineAnd(L: m_Value(V&: BitMask), R: m_Power2()))) &&
2465	(BitPos = ConstantExpr::getExactLogBase2(C: cast<Constant>(Val: BitMask)));
2466	};
2467	auto MatchDecomposableConstantBitMask = [&]() {
2468	APInt Mask;
2469	return llvm::decomposeBitTestICmp(LHS: CmpLHS, RHS: CmpRHS, Pred, X&: CurrX, Mask) &&
2470	ICmpInst::isEquality(P: Pred) && Mask.isPowerOf2() &&
2471	(BitMask = ConstantInt::get(Ty: CurrX->getType(), V: Mask)) &&
2472	(BitPos = ConstantInt::get(Ty: CurrX->getType(), V: Mask.logBase2()));
2473	};
2474
2475	if (!MatchVariableBitMask () && !MatchConstantBitMask () &&
2476	!MatchDecomposableConstantBitMask ()) {
2477	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2478	return false;
2479	}
2480
2481	// Step 3: Check if the recurrence is in desirable form.
2482	auto *CurrXPN = dyn_cast<PHINode>(Val: CurrX);
2483	if (!CurrXPN \|\| CurrXPN->getParent() != LoopHeaderBB) {
2484	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2485	return false;
2486	}
2487
2488	BaseX = CurrXPN->getIncomingValueForBlock(BB: LoopPreheaderBB);
2489	NextX =
2490	dyn_cast<Instruction>(Val: CurrXPN->getIncomingValueForBlock(BB: LoopHeaderBB));
2491
2492	assert(CurLoop->isLoopInvariant(BaseX) &&
2493	"Expected BaseX to be avaliable in the preheader!");
2494
2495	if (!NextX \|\| !match(V: NextX, P: m_Shl(L: m_Specific(V: CurrX), R: m_One()))) {
2496	// FIXME: support right-shift?
2497	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2498	return false;
2499	}
2500
2501	// Step 4: Check if the backedge's destinations are in desirable form.
2502
2503	assert(ICmpInst::isEquality(Pred) &&
2504	"Should only get equality predicates here.");
2505
2506	// cmp-br is commutative, so canonicalize to a single variant.
2507	if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2508	Pred = ICmpInst::getInversePredicate(pred: Pred);
2509	std::swap(a&: TrueBB, b&: FalseBB);
2510	}
2511
2512	// We expect to exit loop when comparison yields false,
2513	// so when it yields true we should branch back to loop header.
2514	if (TrueBB != LoopHeaderBB) {
2515	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2516	return false;
2517	}
2518
2519	// Okay, idiom checks out.
2520	return true;
2521	}
2522
2523	/// Look for the following loop:
2524	/// \code
2525	/// entry:
2526	/// <...>
2527	/// %bitmask = shl i32 1, %bitpos
2528	/// br label %loop
2529	///
2530	/// loop:
2531	/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2532	/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2533	/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2534	/// %x.next = shl i32 %x.curr, 1
2535	/// <...>
2536	/// br i1 %x.curr.isbitunset, label %loop, label %end
2537	///
2538	/// end:
2539	/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2540	/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2541	/// <...>
2542	/// \endcode
2543	///
2544	/// And transform it into:
2545	/// \code
2546	/// entry:
2547	/// %bitmask = shl i32 1, %bitpos
2548	/// %lowbitmask = add i32 %bitmask, -1
2549	/// %mask = or i32 %lowbitmask, %bitmask
2550	/// %x.masked = and i32 %x, %mask
2551	/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
2552	/// i1 true)
2553	/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
2554	/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
2555	/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
2556	/// %tripcount = add i32 %backedgetakencount, 1
2557	/// %x.curr = shl i32 %x, %backedgetakencount
2558	/// %x.next = shl i32 %x, %tripcount
2559	/// br label %loop
2560	///
2561	/// loop:
2562	/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
2563	/// %loop.iv.next = add nuw i32 %loop.iv, 1
2564	/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
2565	/// <...>
2566	/// br i1 %loop.ivcheck, label %end, label %loop
2567	///
2568	/// end:
2569	/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2570	/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2571	/// <...>
2572	/// \endcode
2573	bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
2574	bool MadeChange = false;
2575
2576	Value X, BitMask, BitPos, XCurr;
2577	Instruction *XNext;
2578	if (!detectShiftUntilBitTestIdiom(CurLoop, BaseX&: X, BitMask, BitPos, CurrX&: XCurr,
2579	NextX&: XNext)) {
2580	LLVM_DEBUG(dbgs() << DEBUG_TYPE
2581	" shift-until-bittest idiom detection failed.\n");
2582	return MadeChange;
2583	}
2584	LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
2585
2586	// Ok, it is the idiom we were looking for, we could* transform this loop,*
2587	// but is it profitable to transform?
2588
2589	BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2590	BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2591	assert(LoopPreheaderBB && "There is always a loop preheader.");
2592
2593	BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2594	assert(SuccessorBB && "There is only a single successor.");
2595
2596	IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2597	Builder.SetCurrentDebugLocation(cast<Instruction>(Val: XCurr)->getDebugLoc());
2598
2599	Intrinsic::ID IntrID = Intrinsic::ctlz;
2600	Type *Ty = X->getType();
2601	unsigned Bitwidth = Ty->getScalarSizeInBits();
2602
2603	TargetTransformInfo::TargetCostKind CostKind =
2604	TargetTransformInfo::TCK_SizeAndLatency;
2605
2606	// The rewrite is considered to be unprofitable iff and only iff the
2607	// intrinsic/shift we'll use are not cheap. Note that we are okay with just
2608	// making the loop countable, even if nothing else changes.
2609	IntrinsicCostAttributes Attrs(
2610	IntrID, Ty, {PoisonValue::get(T: Ty), /is_zero_poison=/Builder.getTrue()});
2611	InstructionCost Cost = TTI->getIntrinsicInstrCost(ICA: Attrs, CostKind);
2612	if (Cost > TargetTransformInfo::TCC_Basic) {
2613	LLVM_DEBUG(dbgs() << DEBUG_TYPE
2614	" Intrinsic is too costly, not beneficial\n");
2615	return MadeChange;
2616	}
2617	if (TTI->getArithmeticInstrCost(Opcode: Instruction::Shl, Ty, CostKind) >
2618	TargetTransformInfo::TCC_Basic) {
2619	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
2620	return MadeChange;
2621	}
2622
2623	// Ok, transform appears worthwhile.
2624	MadeChange = true;
2625
2626	if (!isGuaranteedNotToBeUndefOrPoison(V: BitPos)) {
2627	// BitMask may be computed from BitPos, Freeze BitPos so we can increase
2628	// it's use count.
2629	std::optional<BasicBlock::iterator> InsertPt = std::nullopt;
2630	if (auto *BitPosI = dyn_cast<Instruction>(Val: BitPos))
2631	InsertPt = BitPosI->getInsertionPointAfterDef();
2632	else
2633	InsertPt = DT->getRoot()->getFirstNonPHIOrDbgOrAlloca();
2634	if (!InsertPt)
2635	return false;
2636	FreezeInst *BitPosFrozen =
2637	new FreezeInst (BitPos, BitPos->getName() + ".fr", *InsertPt);
2638	BitPos->replaceUsesWithIf(New: BitPosFrozen, ShouldReplace: [BitPosFrozen](Use &U) {
2639	return U.getUser() != BitPosFrozen;
2640	});
2641	BitPos = BitPosFrozen;
2642	}
2643
2644	// Step 1: Compute the loop trip count.
2645
2646	Value *LowBitMask = Builder.CreateAdd(LHS: BitMask, RHS: Constant::getAllOnesValue(Ty),
2647	Name: BitPos->getName() + ".lowbitmask");
2648	Value *Mask =
2649	Builder.CreateOr(LHS: LowBitMask, RHS: BitMask, Name: BitPos->getName() + ".mask");
2650	Value *XMasked = Builder.CreateAnd(LHS: X, RHS: Mask, Name: X->getName() + ".masked");
2651	CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
2652	ID: IntrID, Types: Ty, Args: {XMasked, /is_zero_poison=/Builder.getTrue()},
2653	/FMFSource=/nullptr, Name: XMasked->getName() + ".numleadingzeros");
2654	Value *XMaskedNumActiveBits = Builder.CreateSub(
2655	LHS: ConstantInt::get(Ty, V: Ty->getScalarSizeInBits()), RHS: XMaskedNumLeadingZeros,
2656	Name: XMasked->getName() + ".numactivebits", /HasNUW=/true,
2657	/HasNSW=/Bitwidth != `2`);
2658	Value *XMaskedLeadingOnePos =
2659	Builder.CreateAdd(LHS: XMaskedNumActiveBits, RHS: Constant::getAllOnesValue(Ty),
2660	Name: XMasked->getName() + ".leadingonepos", /HasNUW=/false,
2661	/HasNSW=/Bitwidth > `2`);
2662
2663	Value *LoopBackedgeTakenCount = Builder.CreateSub(
2664	LHS: BitPos, RHS: XMaskedLeadingOnePos, Name: CurLoop->getName() + ".backedgetakencount",
2665	/HasNUW=/true, /HasNSW=/true);
2666	// We know loop's backedge-taken count, but what's loop's trip count?
2667	// Note that while NUW is always safe, while NSW is only for bitwidths != 2.
2668	Value *LoopTripCount =
2669	Builder.CreateAdd(LHS: LoopBackedgeTakenCount, RHS: ConstantInt::get(Ty, V: `1`),
2670	Name: CurLoop->getName() + ".tripcount", /HasNUW=/true,
2671	/HasNSW=/Bitwidth != `2`);
2672
2673	// Step 2: Compute the recurrence's final value without a loop.
2674
2675	// NewX is always safe to compute, because `LoopBackedgeTakenCount`
2676	// will always be smaller than `bitwidth(X)`, i.e. we never get poison.
2677	Value *NewX = Builder.CreateShl(LHS: X, RHS: LoopBackedgeTakenCount);
2678	NewX->takeName(V: XCurr);
2679	if (auto *I = dyn_cast<Instruction>(Val: NewX))
2680	I->copyIRFlags(V: XNext, /IncludeWrapFlags=/true);
2681
2682	Value *NewXNext;
2683	// Rewriting XNext is more complicated, however, because `X << LoopTripCount`
2684	// will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
2685	// iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
2686	// that isn't the case, we'll need to emit an alternative, safe IR.
2687	if (XNext->hasNoSignedWrap() \|\| XNext->hasNoUnsignedWrap() \|\|
2688	PatternMatch::match(
2689	V: BitPos, P: PatternMatch::m_SpecificInt_ICMP(
2690	Predicate: ICmpInst::ICMP_NE, Threshold: APInt (Ty->getScalarSizeInBits(),
2691	Ty->getScalarSizeInBits() - `1`))))
2692	NewXNext = Builder.CreateShl(LHS: X, RHS: LoopTripCount);
2693	else {
2694	// Otherwise, just additionally shift by one. It's the smallest solution,
2695	// alternatively, we could check that NewX is INT_MIN (or BitPos is )
2696	// and select 0 instead.
2697	NewXNext = Builder.CreateShl(LHS: NewX, RHS: ConstantInt::get(Ty, V: `1`));
2698	}
2699
2700	NewXNext->takeName(V: XNext);
2701	if (auto *I = dyn_cast<Instruction>(Val: NewXNext))
2702	I->copyIRFlags(V: XNext, /IncludeWrapFlags=/true);
2703
2704	// Step 3: Adjust the successor basic block to recieve the computed
2705	// recurrence's final value instead of the recurrence itself.
2706
2707	XCurr->replaceUsesOutsideBlock(V: NewX, BB: LoopHeaderBB);
2708	XNext->replaceUsesOutsideBlock(V: NewXNext, BB: LoopHeaderBB);
2709
2710	// Step 4: Rewrite the loop into a countable form, with canonical IV.
2711
2712	// The new canonical induction variable.
2713	Builder.SetInsertPoint(TheBB: LoopHeaderBB, IP: LoopHeaderBB->begin());
2714	auto *IV = Builder.CreatePHI(Ty, NumReservedValues: `2`, Name: CurLoop->getName() + ".iv");
2715
2716	// The induction itself.
2717	// Note that while NUW is always safe, while NSW is only for bitwidths != 2.
2718	Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2719	auto *IVNext =
2720	Builder.CreateAdd(LHS: IV, RHS: ConstantInt::get(Ty, V: `1`), Name: IV->getName() + ".next",
2721	/HasNUW=/true, /HasNSW=/Bitwidth != `2`);
2722
2723	// The loop trip count check.
2724	auto *IVCheck = Builder.CreateICmpEQ(LHS: IVNext, RHS: LoopTripCount,
2725	Name: CurLoop->getName() + ".ivcheck");
2726	Builder.CreateCondBr(Cond: IVCheck, True: SuccessorBB, False: LoopHeaderBB);
2727	LoopHeaderBB->getTerminator()->eraseFromParent();
2728
2729	// Populate the IV PHI.
2730	IV->addIncoming(V: ConstantInt::get(Ty, V: `0`), BB: LoopPreheaderBB);
2731	IV->addIncoming(V: IVNext, BB: LoopHeaderBB);
2732
2733	// Step 5: Forget the "non-computable" trip-count SCEV associated with the
2734	// loop. The loop would otherwise not be deleted even if it becomes empty.
2735
2736	SE->forgetLoop(L: CurLoop);
2737
2738	// Other passes will take care of actually deleting the loop if possible.
2739
2740	LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
2741
2742	++NumShiftUntilBitTest;
2743	return MadeChange;
2744	}
2745
2746	/// Return true if the idiom is detected in the loop.
2747	///
2748	/// The core idiom we are trying to detect is:
2749	/// \code
2750	/// entry:
2751	/// <...>
2752	/// %start = <...>
2753	/// %extraoffset = <...>
2754	/// <...>
2755	/// br label %for.cond
2756	///
2757	/// loop:
2758	/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2759	/// %nbits = add nsw i8 %iv, %extraoffset
2760	/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2761	/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2762	/// %iv.next = add i8 %iv, 1
2763	/// <...>
2764	/// br i1 %val.shifted.iszero, label %end, label %loop
2765	///
2766	/// end:
2767	/// %iv.res = phi i8 [ %iv, %loop ] <...>
2768	/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2769	/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2770	/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2771	/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2772	/// <...>
2773	/// \endcode
2774	static bool detectShiftUntilZeroIdiom(Loop CurLoop, ScalarEvolution SE,
2775	Instruction *&ValShiftedIsZero,
2776	Intrinsic::ID &IntrinID, Instruction *&IV,
2777	Value &Start, Value &Val,
2778	const SCEV *&ExtraOffsetExpr,
2779	bool &InvertedCond) {
2780	LLVM_DEBUG(dbgs() << DEBUG_TYPE
2781	" Performing shift-until-zero idiom detection.\n");
2782
2783	// Give up if the loop has multiple blocks or multiple backedges.
2784	if (CurLoop->getNumBlocks() != `1` \|\| CurLoop->getNumBackEdges() != `1`) {
2785	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2786	return false;
2787	}
2788
2789	Instruction ValShifted, NBits, *IVNext;
2790	Value *ExtraOffset;
2791
2792	BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2793	BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2794	assert(LoopPreheaderBB && "There is always a loop preheader.");
2795
2796	using namespace PatternMatch;
2797
2798	// Step 1: Check if the loop backedge, condition is in desirable form.
2799
2800	ICmpInst::Predicate Pred;
2801	BasicBlock TrueBB, FalseBB;
2802	if (!match(V: LoopHeaderBB->getTerminator(),
2803	P: m_Br(C: m_Instruction(I&: ValShiftedIsZero), T: m_BasicBlock(V&: TrueBB),
2804	F: m_BasicBlock(V&: FalseBB))) \|\|
2805	!match(V: ValShiftedIsZero,
2806	P: m_ICmp(Pred, L: m_Instruction(I&: ValShifted), R: m_Zero())) \|\|
2807	!ICmpInst::isEquality(P: Pred)) {
2808	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2809	return false;
2810	}
2811
2812	// Step 2: Check if the comparison's operand is in desirable form.
2813	// FIXME: Val could be a one-input PHI node, which we should look past.
2814	if (!match(V: ValShifted, P: m_Shift(L: m_LoopInvariant(M: m_Value(V&: Val), L: CurLoop),
2815	R: m_Instruction(I&: NBits)))) {
2816	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparisons value computation.\n");
2817	return false;
2818	}
2819	IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
2820	: Intrinsic::ctlz;
2821
2822	// Step 3: Check if the shift amount is in desirable form.
2823
2824	if (match(V: NBits, P: m_c_Add(L: m_Instruction(I&: IV),
2825	R: m_LoopInvariant(M: m_Value(V&: ExtraOffset), L: CurLoop))) &&
2826	(NBits->hasNoSignedWrap() \|\| NBits->hasNoUnsignedWrap()))
2827	ExtraOffsetExpr = SE->getNegativeSCEV(V: SE->getSCEV(V: ExtraOffset));
2828	else if (match(V: NBits,
2829	P: m_Sub(L: m_Instruction(I&: IV),
2830	R: m_LoopInvariant(M: m_Value(V&: ExtraOffset), L: CurLoop))) &&
2831	NBits->hasNoSignedWrap())
2832	ExtraOffsetExpr = SE->getSCEV(V: ExtraOffset);
2833	else {
2834	IV = NBits;
2835	ExtraOffsetExpr = SE->getZero(Ty: NBits->getType());
2836	}
2837
2838	// Step 4: Check if the recurrence is in desirable form.
2839	auto *IVPN = dyn_cast<PHINode>(Val: IV);
2840	if (!IVPN \|\| IVPN->getParent() != LoopHeaderBB) {
2841	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2842	return false;
2843	}
2844
2845	Start = IVPN->getIncomingValueForBlock(BB: LoopPreheaderBB);
2846	IVNext = dyn_cast<Instruction>(Val: IVPN->getIncomingValueForBlock(BB: LoopHeaderBB));
2847
2848	if (!IVNext \|\| !match(V: IVNext, P: m_Add(L: m_Specific(V: IVPN), R: m_One()))) {
2849	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2850	return false;
2851	}
2852
2853	// Step 4: Check if the backedge's destinations are in desirable form.
2854
2855	assert(ICmpInst::isEquality(Pred) &&
2856	"Should only get equality predicates here.");
2857
2858	// cmp-br is commutative, so canonicalize to a single variant.
2859	InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
2860	if (InvertedCond) {
2861	Pred = ICmpInst::getInversePredicate(pred: Pred);
2862	std::swap(a&: TrueBB, b&: FalseBB);
2863	}
2864
2865	// We expect to exit loop when comparison yields true,
2866	// so when it yields false we should branch back to loop header.
2867	if (FalseBB != LoopHeaderBB) {
2868	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2869	return false;
2870	}
2871
2872	// The new, countable, loop will certainly only run a known number of
2873	// iterations, It won't be infinite. But the old loop might be infinite
2874	// under certain conditions. For logical shifts, the value will become zero
2875	// after at most bitwidth(%Val) loop iterations. However, for arithmetic
2876	// right-shift, iff the sign bit was set, the value will never become zero,
2877	// and the loop may never finish.
2878	if (ValShifted->getOpcode() == Instruction::AShr &&
2879	!isMustProgress(L: CurLoop) && !SE->isKnownNonNegative(S: SE->getSCEV(V: Val))) {
2880	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
2881	return false;
2882	}
2883
2884	// Okay, idiom checks out.
2885	return true;
2886	}
2887
2888	/// Look for the following loop:
2889	/// \code
2890	/// entry:
2891	/// <...>
2892	/// %start = <...>
2893	/// %extraoffset = <...>
2894	/// <...>
2895	/// br label %for.cond
2896	///
2897	/// loop:
2898	/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2899	/// %nbits = add nsw i8 %iv, %extraoffset
2900	/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2901	/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2902	/// %iv.next = add i8 %iv, 1
2903	/// <...>
2904	/// br i1 %val.shifted.iszero, label %end, label %loop
2905	///
2906	/// end:
2907	/// %iv.res = phi i8 [ %iv, %loop ] <...>
2908	/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2909	/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2910	/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2911	/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2912	/// <...>
2913	/// \endcode
2914	///
2915	/// And transform it into:
2916	/// \code
2917	/// entry:
2918	/// <...>
2919	/// %start = <...>
2920	/// %extraoffset = <...>
2921	/// <...>
2922	/// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
2923	/// %val.numactivebits = sub i8 8, %val.numleadingzeros
2924	/// %extraoffset.neg = sub i8 0, %extraoffset
2925	/// %tmp = add i8 %val.numactivebits, %extraoffset.neg
2926	/// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
2927	/// %loop.tripcount = sub i8 %iv.final, %start
2928	/// br label %loop
2929	///
2930	/// loop:
2931	/// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
2932	/// %loop.iv.next = add i8 %loop.iv, 1
2933	/// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
2934	/// %iv = add i8 %loop.iv, %start
2935	/// <...>
2936	/// br i1 %loop.ivcheck, label %end, label %loop
2937	///
2938	/// end:
2939	/// %iv.res = phi i8 [ %iv.final, %loop ] <...>
2940	/// <...>
2941	/// \endcode
2942	bool LoopIdiomRecognize::recognizeShiftUntilZero() {
2943	bool MadeChange = false;
2944
2945	Instruction *ValShiftedIsZero;
2946	Intrinsic::ID IntrID;
2947	Instruction *IV;
2948	Value Start, Val;
2949	const SCEV *ExtraOffsetExpr;
2950	bool InvertedCond;
2951	if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrinID&: IntrID, IV,
2952	Start, Val, ExtraOffsetExpr, InvertedCond)) {
2953	LLVM_DEBUG(dbgs() << DEBUG_TYPE
2954	" shift-until-zero idiom detection failed.\n");
2955	return MadeChange;
2956	}
2957	LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");
2958
2959	// Ok, it is the idiom we were looking for, we could* transform this loop,*
2960	// but is it profitable to transform?
2961
2962	BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2963	BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2964	assert(LoopPreheaderBB && "There is always a loop preheader.");
2965
2966	BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2967	assert(SuccessorBB && "There is only a single successor.");
2968
2969	IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2970	Builder.SetCurrentDebugLocation(IV->getDebugLoc());
2971
2972	Type *Ty = Val->getType();
2973	unsigned Bitwidth = Ty->getScalarSizeInBits();
2974
2975	TargetTransformInfo::TargetCostKind CostKind =
2976	TargetTransformInfo::TCK_SizeAndLatency;
2977
2978	// The rewrite is considered to be unprofitable iff and only iff the
2979	// intrinsic we'll use are not cheap. Note that we are okay with just
2980	// making the loop countable, even if nothing else changes.
2981	IntrinsicCostAttributes Attrs(
2982	IntrID, Ty, {PoisonValue::get(T: Ty), /is_zero_poison=/Builder.getFalse()});
2983	InstructionCost Cost = TTI->getIntrinsicInstrCost(ICA: Attrs, CostKind);
2984	if (Cost > TargetTransformInfo::TCC_Basic) {
2985	LLVM_DEBUG(dbgs() << DEBUG_TYPE
2986	" Intrinsic is too costly, not beneficial\n");
2987	return MadeChange;
2988	}
2989
2990	// Ok, transform appears worthwhile.
2991	MadeChange = true;
2992
2993	bool OffsetIsZero = false;
2994	if (auto *ExtraOffsetExprC = dyn_cast<SCEVConstant>(Val: ExtraOffsetExpr))
2995	OffsetIsZero = ExtraOffsetExprC->isZero();
2996
2997	// Step 1: Compute the loop's final IV value / trip count.
2998
2999	CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
3000	ID: IntrID, Types: Ty, Args: {Val, /is_zero_poison=/Builder.getFalse()},
3001	/FMFSource=/nullptr, Name: Val->getName() + ".numleadingzeros");
3002	Value *ValNumActiveBits = Builder.CreateSub(
3003	LHS: ConstantInt::get(Ty, V: Ty->getScalarSizeInBits()), RHS: ValNumLeadingZeros,
3004	Name: Val->getName() + ".numactivebits", /HasNUW=/true,
3005	/HasNSW=/Bitwidth != `2`);
3006
3007	SCEVExpander Expander(SE, DL, "loop-idiom");
3008	Expander.setInsertPoint(&*Builder.GetInsertPoint());
3009	Value *ExtraOffset = Expander.expandCodeFor(SH: ExtraOffsetExpr);
3010
3011	Value *ValNumActiveBitsOffset = Builder.CreateAdd(
3012	LHS: ValNumActiveBits, RHS: ExtraOffset, Name: ValNumActiveBits->getName() + ".offset",
3013	/HasNUW=/OffsetIsZero, /HasNSW=/true);
3014	Value *IVFinal = Builder.CreateIntrinsic(ID: Intrinsic::smax, Types: {Ty},
3015	Args: {ValNumActiveBitsOffset, Start},
3016	/FMFSource=/nullptr, Name: "iv.final");
3017
3018	auto *LoopBackedgeTakenCount = cast<Instruction>(Val: Builder.CreateSub(
3019	LHS: IVFinal, RHS: Start, Name: CurLoop->getName() + ".backedgetakencount",
3020	/HasNUW=/OffsetIsZero, /HasNSW=/true));
3021	// FIXME: or when the offset was `add nuw`
3022
3023	// We know loop's backedge-taken count, but what's loop's trip count?
3024	Value *LoopTripCount =
3025	Builder.CreateAdd(LHS: LoopBackedgeTakenCount, RHS: ConstantInt::get(Ty, V: `1`),
3026	Name: CurLoop->getName() + ".tripcount", /HasNUW=/true,
3027	/HasNSW=/Bitwidth != `2`);
3028
3029	// Step 2: Adjust the successor basic block to recieve the original
3030	// induction variable's final value instead of the orig. IV itself.
3031
3032	IV->replaceUsesOutsideBlock(V: IVFinal, BB: LoopHeaderBB);
3033
3034	// Step 3: Rewrite the loop into a countable form, with canonical IV.
3035
3036	// The new canonical induction variable.
3037	Builder.SetInsertPoint(TheBB: LoopHeaderBB, IP: LoopHeaderBB->begin());
3038	auto *CIV = Builder.CreatePHI(Ty, NumReservedValues: `2`, Name: CurLoop->getName() + ".iv");
3039
3040	// The induction itself.
3041	Builder.SetInsertPoint(TheBB: LoopHeaderBB, IP: LoopHeaderBB->getFirstNonPHIIt());
3042	auto *CIVNext =
3043	Builder.CreateAdd(LHS: CIV, RHS: ConstantInt::get(Ty, V: `1`), Name: CIV->getName() + ".next",
3044	/HasNUW=/true, /HasNSW=/Bitwidth != `2`);
3045
3046	// The loop trip count check.
3047	auto *CIVCheck = Builder.CreateICmpEQ(LHS: CIVNext, RHS: LoopTripCount,
3048	Name: CurLoop->getName() + ".ivcheck");
3049	auto *NewIVCheck = CIVCheck;
3050	if (InvertedCond) {
3051	NewIVCheck = Builder.CreateNot(V: CIVCheck);
3052	NewIVCheck->takeName(V: ValShiftedIsZero);
3053	}
3054
3055	// The original IV, but rebased to be an offset to the CIV.
3056	auto IVDePHId = Builder.CreateAdd(LHS: CIV, RHS: Start, Name: "", /HasNUW=/*false,
3057	/HasNSW=/true); // FIXME: what about NUW?
3058	IVDePHId->takeName(V: IV);
3059
3060	// The loop terminator.
3061	Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
3062	Builder.CreateCondBr(Cond: CIVCheck, True: SuccessorBB, False: LoopHeaderBB);
3063	LoopHeaderBB->getTerminator()->eraseFromParent();
3064
3065	// Populate the IV PHI.
3066	CIV->addIncoming(V: ConstantInt::get(Ty, V: `0`), BB: LoopPreheaderBB);
3067	CIV->addIncoming(V: CIVNext, BB: LoopHeaderBB);
3068
3069	// Step 4: Forget the "non-computable" trip-count SCEV associated with the
3070	// loop. The loop would otherwise not be deleted even if it becomes empty.
3071
3072	SE->forgetLoop(L: CurLoop);
3073
3074	// Step 5: Try to cleanup the loop's body somewhat.
3075	IV->replaceAllUsesWith(V: IVDePHId);
3076	IV->eraseFromParent();
3077
3078	ValShiftedIsZero->replaceAllUsesWith(V: NewIVCheck);
3079	ValShiftedIsZero->eraseFromParent();
3080
3081	// Other passes will take care of actually deleting the loop if possible.
3082
3083	LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");
3084
3085	++NumShiftUntilZero;
3086	return MadeChange;
3087	}
3088

Browse the source code of llvm_projects/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp