SROA.cpp source code [llvm_projects/llvm/lib/Transforms/Scalar/SROA.cpp]

1	//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	/// \file
9	/// This transformation implements the well known scalar replacement of
10	/// aggregates transformation. It tries to identify promotable elements of an
11	/// aggregate alloca, and promote them to registers. It will also try to
12	/// convert uses of an element (or set of elements) of an alloca into a vector
13	/// or bitfield-style integer scalar if appropriate.
14	///
15	/// It works to do this with minimal slicing of the alloca so that regions
16	/// which are merely transferred in and out of external memory remain unchanged
17	/// and are not decomposed to scalar code.
18	///
19	/// Because this also performs alloca promotion, it can be thought of as also
20	/// serving the purpose of SSA formation. The algorithm iterates on the
21	/// function until all opportunities for promotion have been realized.
22	///
23	//===----------------------------------------------------------------------===//
24
25	#include "llvm/Transforms/Scalar/SROA.h"
26	#include "llvm/ADT/APInt.h"
27	#include "llvm/ADT/ArrayRef.h"
28	#include "llvm/ADT/DenseMap.h"
29	#include "llvm/ADT/MapVector.h"
30	#include "llvm/ADT/PointerIntPair.h"
31	#include "llvm/ADT/STLExtras.h"
32	#include "llvm/ADT/SetVector.h"
33	#include "llvm/ADT/SmallBitVector.h"
34	#include "llvm/ADT/SmallPtrSet.h"
35	#include "llvm/ADT/SmallVector.h"
36	#include "llvm/ADT/Statistic.h"
37	#include "llvm/ADT/StringRef.h"
38	#include "llvm/ADT/Twine.h"
39	#include "llvm/ADT/iterator.h"
40	#include "llvm/ADT/iterator_range.h"
41	#include "llvm/Analysis/AssumptionCache.h"
42	#include "llvm/Analysis/DomTreeUpdater.h"
43	#include "llvm/Analysis/GlobalsModRef.h"
44	#include "llvm/Analysis/Loads.h"
45	#include "llvm/Analysis/PtrUseVisitor.h"
46	#include "llvm/Analysis/ValueTracking.h"
47	#include "llvm/Config/llvm-config.h"
48	#include "llvm/IR/BasicBlock.h"
49	#include "llvm/IR/Constant.h"
50	#include "llvm/IR/ConstantFolder.h"
51	#include "llvm/IR/Constants.h"
52	#include "llvm/IR/DIBuilder.h"
53	#include "llvm/IR/DataLayout.h"
54	#include "llvm/IR/DebugInfo.h"
55	#include "llvm/IR/DebugInfoMetadata.h"
56	#include "llvm/IR/DerivedTypes.h"
57	#include "llvm/IR/Dominators.h"
58	#include "llvm/IR/Function.h"
59	#include "llvm/IR/GlobalAlias.h"
60	#include "llvm/IR/IRBuilder.h"
61	#include "llvm/IR/InstVisitor.h"
62	#include "llvm/IR/Instruction.h"
63	#include "llvm/IR/Instructions.h"
64	#include "llvm/IR/IntrinsicInst.h"
65	#include "llvm/IR/Intrinsics.h"
66	#include "llvm/IR/LLVMContext.h"
67	#include "llvm/IR/Metadata.h"
68	#include "llvm/IR/Module.h"
69	#include "llvm/IR/Operator.h"
70	#include "llvm/IR/PassManager.h"
71	#include "llvm/IR/Type.h"
72	#include "llvm/IR/Use.h"
73	#include "llvm/IR/User.h"
74	#include "llvm/IR/Value.h"
75	#include "llvm/IR/ValueHandle.h"
76	#include "llvm/InitializePasses.h"
77	#include "llvm/Pass.h"
78	#include "llvm/Support/Casting.h"
79	#include "llvm/Support/CommandLine.h"
80	#include "llvm/Support/Compiler.h"
81	#include "llvm/Support/Debug.h"
82	#include "llvm/Support/ErrorHandling.h"
83	#include "llvm/Support/raw_ostream.h"
84	#include "llvm/Transforms/Scalar.h"
85	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
86	#include "llvm/Transforms/Utils/Local.h"
87	#include "llvm/Transforms/Utils/PromoteMemToReg.h"
88	#include "llvm/Transforms/Utils/SSAUpdater.h"
89	#include <algorithm>
90	#include <cassert>
91	#include <cstddef>
92	#include <cstdint>
93	#include <cstring>
94	#include <iterator>
95	#include <queue>
96	#include <string>
97	#include <tuple>
98	#include <utility>
99	#include <variant>
100	#include <vector>
101
102	using namespace llvm;
103
104	#define DEBUG_TYPE "sroa"
105
106	STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
107	STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
108	STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
109	STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
110	STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
111	STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
112	STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
113	STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
114	STATISTIC(NumLoadsPredicated,
115	"Number of loads rewritten into predicated loads to allow promotion");
116	STATISTIC(
117	NumStoresPredicated,
118	"Number of stores rewritten into predicated loads to allow promotion");
119	STATISTIC(NumDeleted, "Number of instructions deleted");
120	STATISTIC(NumVectorized, "Number of vectorized aggregates");
121
122	namespace llvm {
123	/// Disable running mem2reg during SROA in order to test or debug SROA.
124	static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(Val: false),
125	cl::Hidden);
126	extern cl::opt<bool> ProfcheckDisableMetadataFixes;
127	} // namespace llvm
128
129	namespace {
130
131	class AllocaSliceRewriter;
132	class AllocaSlices;
133	class Partition;
134
135	class SelectHandSpeculativity {
136	unsigned char Storage = `0`; // None are speculatable by default.
137	using TrueVal = Bitfield::Element<bool, `0`, `1`>; // Low 0'th bit.
138	using FalseVal = Bitfield::Element<bool, `1`, `1`>; // Low 1'th bit.
139	public:
140	SelectHandSpeculativity() = default;
141	SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal);
142	bool isSpeculatable(bool isTrueVal) const;
143	bool areAllSpeculatable() const;
144	bool areAnySpeculatable() const;
145	bool areNoneSpeculatable() const;
146	// For interop as int half of PointerIntPair.
147	explicit operator intptr_t() const { return static_cast<intptr_t>(Storage); }
148	explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {}
149	};
150	static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char));
151
152	using PossiblySpeculatableLoad =
153	PointerIntPair<LoadInst *, `2`, SelectHandSpeculativity>;
154	using UnspeculatableStore = StoreInst *;
155	using RewriteableMemOp =
156	std::variant<PossiblySpeculatableLoad, UnspeculatableStore>;
157	using RewriteableMemOps = SmallVector<RewriteableMemOp, `2`>;
158
159	/// An optimization pass providing Scalar Replacement of Aggregates.
160	///
161	/// This pass takes allocations which can be completely analyzed (that is, they
162	/// don't escape) and tries to turn them into scalar SSA values. There are
163	/// a few steps to this process.
164	///
165	/// 1) It takes allocations of aggregates and analyzes the ways in which they
166	/// are used to try to split them into smaller allocations, ideally of
167	/// a single scalar data type. It will split up memcpy and memset accesses
168	/// as necessary and try to isolate individual scalar accesses.
169	/// 2) It will transform accesses into forms which are suitable for SSA value
170	/// promotion. This can be replacing a memset with a scalar store of an
171	/// integer value, or it can involve speculating operations on a PHI or
172	/// select to be a PHI or select of the results.
173	/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
174	/// onto insert and extract operations on a vector value, and convert them to
175	/// this form. By doing so, it will enable promotion of vector aggregates to
176	/// SSA vector values.
177	class SROA {
178	LLVMContext *const C;
179	DomTreeUpdater *const DTU;
180	AssumptionCache *const AC;
181	const bool PreserveCFG;
182
183	/// Worklist of alloca instructions to simplify.
184	///
185	/// Each alloca in the function is added to this. Each new alloca formed gets
186	/// added to it as well to recursively simplify unless that alloca can be
187	/// directly promoted. Finally, each time we rewrite a use of an alloca other
188	/// the one being actively rewritten, we add it back onto the list if not
189	/// already present to ensure it is re-visited.
190	SmallSetVector<AllocaInst *, `16`> Worklist;
191
192	/// A collection of instructions to delete.
193	/// We try to batch deletions to simplify code and make things a bit more
194	/// efficient. We also make sure there is no dangling pointers.
195	SmallVector<WeakVH, `8`> DeadInsts;
196
197	/// Post-promotion worklist.
198	///
199	/// Sometimes we discover an alloca which has a high probability of becoming
200	/// viable for SROA after a round of promotion takes place. In those cases,
201	/// the alloca is enqueued here for re-processing.
202	///
203	/// Note that we have to be very careful to clear allocas out of this list in
204	/// the event they are deleted.
205	SmallSetVector<AllocaInst *, `16`> PostPromotionWorklist;
206
207	/// A collection of alloca instructions we can directly promote.
208	SetVector<AllocaInst , SmallVector<AllocaInst >,
209	SmallPtrSet<AllocaInst *, `16`>, `16`>
210	PromotableAllocas;
211
212	/// A worklist of PHIs to speculate prior to promoting allocas.
213	///
214	/// All of these PHIs have been checked for the safety of speculation and by
215	/// being speculated will allow promoting allocas currently in the promotable
216	/// queue.
217	SmallSetVector<PHINode *, `8`> SpeculatablePHIs;
218
219	/// A worklist of select instructions to rewrite prior to promoting
220	/// allocas.
221	SmallMapVector<SelectInst *, RewriteableMemOps, `8`> SelectsToRewrite;
222
223	/// Select instructions that use an alloca and are subsequently loaded can be
224	/// rewritten to load both input pointers and then select between the result,
225	/// allowing the load of the alloca to be promoted.
226	/// From this:
227	/// %P2 = select i1 %cond, ptr %Alloca, ptr %Other
228	/// %V = load <type>, ptr %P2
229	/// to:
230	/// %V1 = load <type>, ptr %Alloca -> will be mem2reg'd
231	/// %V2 = load <type>, ptr %Other
232	/// %V = select i1 %cond, <type> %V1, <type> %V2
233	///
234	/// We can do this to a select if its only uses are loads
235	/// and if either the operand to the select can be loaded unconditionally,
236	/// or if we are allowed to perform CFG modifications.
237	/// If found an intervening bitcast with a single use of the load,
238	/// allow the promotion.
239	static std::optional<RewriteableMemOps>
240	isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG);
241
242	public:
243	SROA(LLVMContext C, DomTreeUpdater DTU, AssumptionCache *AC,
244	SROAOptions PreserveCFG_)
245	: C(C), DTU(DTU), AC(AC),
246	PreserveCFG(PreserveCFG_ == SROAOptions::PreserveCFG) {}
247
248	/// Main run method used by both the SROAPass and by the legacy pass.
249	std::pair<bool /Changed/, bool /CFGChanged/> runSROA(Function &F);
250
251	private:
252	friend class AllocaSliceRewriter;
253
254	bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
255	std::pair<AllocaInst *, uint64_t>
256	rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
257	bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
258	bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
259	std::pair<bool /Changed/, bool /CFGChanged/> runOnAlloca(AllocaInst &AI);
260	void clobberUse(Use &U);
261	bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
262	bool promoteAllocas();
263	};
264
265	} // end anonymous namespace
266
267	/// Calculate the fragment of a variable to use when slicing a store
268	/// based on the slice dimensions, existing fragment, and base storage
269	/// fragment.
270	/// Results:
271	/// UseFrag - Use Target as the new fragment.
272	/// UseNoFrag - The new slice already covers the whole variable.
273	/// Skip - The new alloca slice doesn't include this variable.
274	/// FIXME: Can we use calculateFragmentIntersect instead?
275	namespace {
276	enum FragCalcResult { UseFrag, UseNoFrag, Skip };
277	}
278	static FragCalcResult
279	calculateFragment(DILocalVariable *Variable,
280	uint64_t NewStorageSliceOffsetInBits,
281	uint64_t NewStorageSliceSizeInBits,
282	std::optional<DIExpression::FragmentInfo> StorageFragment,
283	std::optional<DIExpression::FragmentInfo> CurrentFragment,
284	DIExpression::FragmentInfo &Target) {
285	// If the base storage describes part of the variable apply the offset and
286	// the size constraint.
287	if (StorageFragment) {
288	Target.SizeInBits =
289	std::min(a: NewStorageSliceSizeInBits, b: StorageFragment ->SizeInBits);
290	Target.OffsetInBits =
291	NewStorageSliceOffsetInBits + StorageFragment ->OffsetInBits;
292	} else {
293	Target.SizeInBits = NewStorageSliceSizeInBits;
294	Target.OffsetInBits = NewStorageSliceOffsetInBits;
295	}
296
297	// If this slice extracts the entirety of an independent variable from a
298	// larger alloca, do not produce a fragment expression, as the variable is
299	// not fragmented.
300	if (!CurrentFragment) {
301	if (auto Size = Variable->getSizeInBits()) {
302	// Treat the current fragment as covering the whole variable.
303	CurrentFragment = DIExpression::FragmentInfo (*Size, `0`);
304	if (Target == CurrentFragment)
305	return UseNoFrag;
306	}
307	}
308
309	// No additional work to do if there isn't a fragment already, or there is
310	// but it already exactly describes the new assignment.
311	if (!CurrentFragment \|\| *CurrentFragment == Target)
312	return UseFrag;
313
314	// Reject the target fragment if it doesn't fit wholly within the current
315	// fragment. TODO: We could instead chop up the target to fit in the case of
316	// a partial overlap.
317	if (Target.startInBits() < CurrentFragment ->startInBits() \|\|
318	Target.endInBits() > CurrentFragment ->endInBits())
319	return Skip;
320
321	// Target fits within the current fragment, return it.
322	return UseFrag;
323	}
324
325	static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) {
326	return DebugVariable (DVR->getVariable(), std::nullopt,
327	DVR->getDebugLoc().getInlinedAt());
328	}
329
330	/// Find linked dbg.assign and generate a new one with the correct
331	/// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the
332	/// value component is copied from the old dbg.assign to the new.
333	/// \param OldAlloca Alloca for the variable before splitting.
334	/// \param IsSplit True if the store (not necessarily alloca)
335	/// is being split.
336	/// \param OldAllocaOffsetInBits Offset of the slice taken from OldAlloca.
337	/// \param SliceSizeInBits New number of bits being written to.
338	/// \param OldInst Instruction that is being split.
339	/// \param Inst New instruction performing this part of the
340	/// split store.
341	/// \param Dest Store destination.
342	/// \param Value Stored value.
343	/// \param DL Datalayout.
344	static void migrateDebugInfo(AllocaInst OldAlloca, bool* IsSplit,
345	uint64_t OldAllocaOffsetInBits,
346	uint64_t SliceSizeInBits, Instruction *OldInst,
347	Instruction Inst, Value Dest, Value *Value,
348	const DataLayout &DL) {
349	// If we want allocas to be migrated using this helper then we need to ensure
350	// that the BaseFragments map code still works. A simple solution would be
351	// to choose to always clone alloca dbg_assigns (rather than sometimes
352	// "stealing" them).
353	assert(!isa<AllocaInst>(Inst) && "Unexpected alloca");
354
355	auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(Inst: OldInst);
356	// Nothing to do if OldInst has no linked dbg.assign intrinsics.
357	if (DVRAssignMarkerRange.empty())
358	return;
359
360	LLVM_DEBUG(dbgs() << " migrateDebugInfo\n");
361	LLVM_DEBUG(dbgs() << " OldAlloca: " << *OldAlloca << "\n");
362	LLVM_DEBUG(dbgs() << " IsSplit: " << IsSplit << "\n");
363	LLVM_DEBUG(dbgs() << " OldAllocaOffsetInBits: " << OldAllocaOffsetInBits
364	<< "\n");
365	LLVM_DEBUG(dbgs() << " SliceSizeInBits: " << SliceSizeInBits << "\n");
366	LLVM_DEBUG(dbgs() << " OldInst: " << *OldInst << "\n");
367	LLVM_DEBUG(dbgs() << " Inst: " << *Inst << "\n");
368	LLVM_DEBUG(dbgs() << " Dest: " << *Dest << "\n");
369	if (Value)
370	LLVM_DEBUG(dbgs() << " Value: " << *Value << "\n");
371
372	/// Map of aggregate variables to their fragment associated with OldAlloca.
373	DenseMap<DebugVariable, std::optional<DIExpression::FragmentInfo>>
374	BaseFragments;
375	for (auto *DVR : at::getDVRAssignmentMarkers(Inst: OldAlloca))
376	BaseFragments [getAggregateVariable(DVR)] =
377	DVR->getExpression()->getFragmentInfo();
378
379	// The new inst needs a DIAssignID unique metadata tag (if OldInst has
380	// one). It shouldn't already have one: assert this assumption.
381	assert(!Inst->getMetadata(LLVMContext::MD_DIAssignID));
382	DIAssignID NewID = nullptr*;
383	auto &Ctx = Inst->getContext();
384	DIBuilder DIB(OldInst->getModule(), /AllowUnresolved/* false);
385	assert(OldAlloca->isStaticAlloca());
386
387	auto MigrateDbgAssign = [&](DbgVariableRecord *DbgAssign) {
388	LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign
389	<< "\n");
390	auto *Expr = DbgAssign->getExpression();
391	bool SetKillLocation = false;
392
393	if (IsSplit) {
394	std::optional<DIExpression::FragmentInfo> BaseFragment;
395	{
396	auto R = BaseFragments.find(Val: getAggregateVariable(DVR: DbgAssign));
397	if (R == BaseFragments.end())
398	return;
399	BaseFragment = R ->second;
400	}
401	std::optional<DIExpression::FragmentInfo> CurrentFragment =
402	Expr->getFragmentInfo();
403	DIExpression::FragmentInfo NewFragment;
404	FragCalcResult Result = calculateFragment(
405	Variable: DbgAssign->getVariable(), NewStorageSliceOffsetInBits: OldAllocaOffsetInBits, NewStorageSliceSizeInBits: SliceSizeInBits,
406	StorageFragment: BaseFragment, CurrentFragment, Target&: NewFragment);
407
408	if (Result == Skip)
409	return;
410	if (Result == UseFrag && !(NewFragment == CurrentFragment)) {
411	if (CurrentFragment) {
412	// Rewrite NewFragment to be relative to the existing one (this is
413	// what createFragmentExpression wants). CalculateFragment has
414	// already resolved the size for us. FIXME: Should it return the
415	// relative fragment too?
416	NewFragment.OffsetInBits -= CurrentFragment ->OffsetInBits;
417	}
418	// Add the new fragment info to the existing expression if possible.
419	if (auto E = DIExpression::createFragmentExpression(
420	Expr, OffsetInBits: NewFragment.OffsetInBits, SizeInBits: NewFragment.SizeInBits)) {
421	Expr = *E;
422	} else {
423	// Otherwise, add the new fragment info to an empty expression and
424	// discard the value component of this dbg.assign as the value cannot
425	// be computed with the new fragment.
426	Expr = *DIExpression::createFragmentExpression(
427	Expr: DIExpression::get(Context&: Expr->getContext(), Elements: {}),
428	OffsetInBits: NewFragment.OffsetInBits, SizeInBits: NewFragment.SizeInBits);
429	SetKillLocation = true;
430	}
431	}
432	}
433
434	// If we haven't created a DIAssignID ID do that now and attach it to Inst.
435	if (!NewID) {
436	NewID = DIAssignID::getDistinct(Context&: Ctx);
437	Inst->setMetadata(KindID: LLVMContext::MD_DIAssignID, Node: NewID);
438	}
439
440	DbgVariableRecord *NewAssign;
441	if (IsSplit) {
442	::Value *NewValue = Value ? Value : DbgAssign->getValue();
443	NewAssign = cast<DbgVariableRecord>(Val: cast<DbgRecord *>(
444	Val: DIB.insertDbgAssign(LinkedInstr: Inst, Val: NewValue, SrcVar: DbgAssign->getVariable(), ValExpr: Expr,
445	Addr: Dest, AddrExpr: DIExpression::get(Context&: Expr->getContext(), Elements: {}),
446	DL: DbgAssign->getDebugLoc())));
447	} else {
448	// The store is not split, simply steal the existing dbg_assign.
449	NewAssign = DbgAssign;
450	NewAssign->setAssignId(NewID); // FIXME: Can we avoid generating new IDs?
451	NewAssign->setAddress(Dest);
452	if (Value)
453	NewAssign->replaceVariableLocationOp(OpIdx: `0u`, NewValue: Value);
454	assert(Expr == NewAssign->getExpression());
455	}
456
457	// If we've updated the value but the original dbg.assign has an arglist
458	// then kill it now - we can't use the requested new value.
459	// We can't replace the DIArgList with the new value as it'd leave
460	// the DIExpression in an invalid state (DW_OP_LLVM_arg operands without
461	// an arglist). And we can't keep the DIArgList in case the linked store
462	// is being split - in which case the DIArgList + expression may no longer
463	// be computing the correct value.
464	// This should be a very rare situation as it requires the value being
465	// stored to differ from the dbg.assign (i.e., the value has been
466	// represented differently in the debug intrinsic for some reason).
467	SetKillLocation \|=
468	Value && (DbgAssign->hasArgList() \|\|
469	!DbgAssign->getExpression()->isSingleLocationExpression());
470	if (SetKillLocation)
471	NewAssign->setKillLocation();
472
473	// We could use more precision here at the cost of some additional (code)
474	// complexity - if the original dbg.assign was adjacent to its store, we
475	// could position this new dbg.assign adjacent to its store rather than the
476	// old dbg.assgn. That would result in interleaved dbg.assigns rather than
477	// what we get now:
478	// split store !1
479	// split store !2
480	// dbg.assign !1
481	// dbg.assign !2
482	// This (current behaviour) results results in debug assignments being
483	// noted as slightly offset (in code) from the store. In practice this
484	// should have little effect on the debugging experience due to the fact
485	// that all the split stores should get the same line number.
486	if (NewAssign != DbgAssign) {
487	NewAssign->moveBefore(MoveBefore: DbgAssign->getIterator());
488	NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
489	}
490	LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
491	};
492
493	for_each(Range&: DVRAssignMarkerRange, F: MigrateDbgAssign);
494	}
495
496	namespace {
497
498	/// A custom IRBuilder inserter which prefixes all names, but only in
499	/// Assert builds.
500	class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
501	std::string Prefix;
502
503	Twine getNameWithPrefix(const Twine &Name) const {
504	return Name.isTriviallyEmpty() ? Name : Prefix + Name;
505	}
506
507	public:
508	void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
509
510	void InsertHelper(Instruction I, const* Twine &Name,
511	BasicBlock::iterator InsertPt) const override {
512	IRBuilderDefaultInserter::InsertHelper(I, Name: getNameWithPrefix(Name),
513	InsertPt);
514	}
515	};
516
517	/// Provide a type for IRBuilder that drops names in release builds.
518	using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
519
520	/// A used slice of an alloca.
521	///
522	/// This structure represents a slice of an alloca used by some instruction. It
523	/// stores both the begin and end offsets of this use, a pointer to the use
524	/// itself, and a flag indicating whether we can classify the use as splittable
525	/// or not when forming partitions of the alloca.
526	class Slice {
527	/// The beginning offset of the range.
528	uint64_t BeginOffset = `0`;
529
530	/// The ending offset, not included in the range.
531	uint64_t EndOffset = `0`;
532
533	/// Storage for both the use of this slice and whether it can be
534	/// split.
535	PointerIntPair<Use , `1`, bool*> UseAndIsSplittable;
536
537	public:
538	Slice() = default;
539
540	Slice(uint64_t BeginOffset, uint64_t EndOffset, Use U, bool* IsSplittable,
541	Value *ProtectedFieldDisc)
542	: BeginOffset(BeginOffset), EndOffset(EndOffset),
543	UseAndIsSplittable (U, IsSplittable),
544	ProtectedFieldDisc(ProtectedFieldDisc) {}
545
546	uint64_t beginOffset() const { return BeginOffset; }
547	uint64_t endOffset() const { return EndOffset; }
548
549	bool isSplittable() const { return UseAndIsSplittable.getInt(); }
550	void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
551
552	Use getUse() const* { return UseAndIsSplittable.getPointer(); }
553
554	bool isDead() const { return getUse() == nullptr; }
555	void kill() { UseAndIsSplittable.setPointer(nullptr); }
556
557	// When this access is via an llvm.protected.field.ptr intrinsic, contains
558	// the second argument to the intrinsic, the discriminator.
559	Value *ProtectedFieldDisc;
560
561	/// Support for ordering ranges.
562	///
563	/// This provides an ordering over ranges such that start offsets are
564	/// always increasing, and within equal start offsets, the end offsets are
565	/// decreasing. Thus the spanning range comes first in a cluster with the
566	/// same start position.
567	bool operator<(const Slice &RHS) const {
568	if (beginOffset() < RHS.beginOffset())
569	return true;
570	if (beginOffset() > RHS.beginOffset())
571	return false;
572	if (isSplittable() != RHS.isSplittable())
573	return !isSplittable();
574	if (endOffset() > RHS.endOffset())
575	return true;
576	return false;
577	}
578
579	/// Support comparison with a single offset to allow binary searches.
580	[[maybe_unused]] friend bool operator<(const Slice &LHS, uint64_t RHSOffset) {
581	return LHS.beginOffset() < RHSOffset;
582	}
583	[[maybe_unused]] friend bool operator<(uint64_t LHSOffset, const Slice &RHS) {
584	return LHSOffset < RHS.beginOffset();
585	}
586
587	bool operator==(const Slice &RHS) const {
588	return isSplittable() == RHS.isSplittable() &&
589	beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
590	}
591	bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
592	};
593
594	/// Representation of the alloca slices.
595	///
596	/// This class represents the slices of an alloca which are formed by its
597	/// various uses. If a pointer escapes, we can't fully build a representation
598	/// for the slices used and we reflect that in this structure. The uses are
599	/// stored, sorted by increasing beginning offset and with unsplittable slices
600	/// starting at a particular offset before splittable slices.
601	class AllocaSlices {
602	public:
603	/// Construct the slices of a particular alloca.
604	AllocaSlices(const DataLayout &DL, AllocaInst &AI);
605
606	/// Test whether a pointer to the allocation escapes our analysis.
607	///
608	/// If this is true, the slices are never fully built and should be
609	/// ignored.
610	bool isEscaped() const { return PointerEscapingInstr; }
611	bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; }
612
613	/// Support for iterating over the slices.
614	/// @{
615	using iterator = SmallVectorImpl<Slice>::iterator;
616	using range = iterator_range<iterator>;
617
618	iterator begin() { return Slices.begin(); }
619	iterator end() { return Slices.end(); }
620
621	using const_iterator = SmallVectorImpl<Slice>::const_iterator;
622	using const_range = iterator_range<const_iterator>;
623
624	const_iterator begin() const { return Slices.begin(); }
625	const_iterator end() const { return Slices.end(); }
626	/// @}
627
628	/// Erase a range of slices.
629	void erase(iterator Start, iterator Stop) { Slices.erase(CS: Start, CE: Stop); }
630
631	/// Insert new slices for this alloca.
632	///
633	/// This moves the slices into the alloca's slices collection, and re-sorts
634	/// everything so that the usual ordering properties of the alloca's slices
635	/// hold.
636	void insert(ArrayRef<Slice> NewSlices) {
637	int OldSize = Slices.size();
638	Slices.append(in_start: NewSlices.begin(), in_end: NewSlices.end());
639	auto SliceI = Slices.begin() + OldSize;
640	std::stable_sort(first: SliceI, last: Slices.end());
641	std::inplace_merge(first: Slices.begin(), middle: SliceI, last: Slices.end());
642	}
643
644	// Forward declare the iterator and range accessor for walking the
645	// partitions.
646	class partition_iterator;
647	iterator_range<partition_iterator> partitions();
648
649	/// Access the dead users for this alloca.
650	ArrayRef<Instruction > getDeadUsers() const* { return DeadUsers; }
651
652	/// Access the users for this alloca that are llvm.protected.field.ptr
653	/// intrinsics.
654	ArrayRef<IntrinsicInst > getPFPUsers() const* { return PFPUsers; }
655
656	/// Access Uses that should be dropped if the alloca is promotable.
657	ArrayRef<Use > getDeadUsesIfPromotable() const* {
658	return DeadUseIfPromotable;
659	}
660
661	/// Access the dead operands referring to this alloca.
662	///
663	/// These are operands which have cannot actually be used to refer to the
664	/// alloca as they are outside its range and the user doesn't correct for
665	/// that. These mostly consist of PHI node inputs and the like which we just
666	/// need to replace with undef.
667	ArrayRef<Use > getDeadOperands() const* { return DeadOperands; }
668
669	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
670	void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
671	void printSlice(raw_ostream &OS, const_iterator I,
672	StringRef Indent = " ") const;
673	void printUse(raw_ostream &OS, const_iterator I,
674	StringRef Indent = " ") const;
675	void print(raw_ostream &OS) const;
676	void dump(const_iterator I) const;
677	void dump() const;
678	#endif
679
680	private:
681	template <typename DerivedT, typename RetT = void> class BuilderBase;
682	class SliceBuilder;
683
684	friend class AllocaSlices::SliceBuilder;
685
686	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
687	/// Handle to alloca instruction to simplify method interfaces.
688	AllocaInst &AI;
689	#endif
690
691	/// The instruction responsible for this alloca not having a known set
692	/// of slices.
693	///
694	/// When an instruction (potentially) escapes the pointer to the alloca, we
695	/// store a pointer to that here and abort trying to form slices of the
696	/// alloca. This will be null if the alloca slices are analyzed successfully.
697	Instruction *PointerEscapingInstr;
698	Instruction *PointerEscapingInstrReadOnly;
699
700	/// The slices of the alloca.
701	///
702	/// We store a vector of the slices formed by uses of the alloca here. This
703	/// vector is sorted by increasing begin offset, and then the unsplittable
704	/// slices before the splittable ones. See the Slice inner class for more
705	/// details.
706	SmallVector<Slice, `8`> Slices;
707
708	/// Instructions which will become dead if we rewrite the alloca.
709	///
710	/// Note that these are not separated by slice. This is because we expect an
711	/// alloca to be completely rewritten or not rewritten at all. If rewritten,
712	/// all these instructions can simply be removed and replaced with poison as
713	/// they come from outside of the allocated space.
714	SmallVector<Instruction *, `8`> DeadUsers;
715
716	/// Users that are llvm.protected.field.ptr intrinsics. These will be RAUW'd
717	/// to their first argument if we rewrite the alloca.
718	SmallVector<IntrinsicInst *, `0`> PFPUsers;
719
720	/// Uses which will become dead if can promote the alloca.
721	SmallVector<Use *, `8`> DeadUseIfPromotable;
722
723	/// Operands which will become dead if we rewrite the alloca.
724	///
725	/// These are operands that in their particular use can be replaced with
726	/// poison when we rewrite the alloca. These show up in out-of-bounds inputs
727	/// to PHI nodes and the like. They aren't entirely dead (there might be
728	/// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
729	/// want to swap this particular input for poison to simplify the use lists of
730	/// the alloca.
731	SmallVector<Use *, `8`> DeadOperands;
732	};
733
734	/// A partition of the slices.
735	///
736	/// An ephemeral representation for a range of slices which can be viewed as
737	/// a partition of the alloca. This range represents a span of the alloca's
738	/// memory which cannot be split, and provides access to all of the slices
739	/// overlapping some part of the partition.
740	///
741	/// Objects of this type are produced by traversing the alloca's slices, but
742	/// are only ephemeral and not persistent.
743	class Partition {
744	private:
745	friend class AllocaSlices;
746	friend class AllocaSlices::partition_iterator;
747
748	using iterator = AllocaSlices::iterator;
749
750	/// The beginning and ending offsets of the alloca for this
751	/// partition.
752	uint64_t BeginOffset = `0`, EndOffset = `0`;
753
754	/// The start and end iterators of this partition.
755	iterator SI, SJ;
756
757	/// A collection of split slice tails overlapping the partition.
758	SmallVector<Slice *, `4`> SplitTails;
759
760	/// Raw constructor builds an empty partition starting and ending at
761	/// the given iterator.
762	Partition(iterator SI) : SI(SI), SJ(SI) {}
763
764	public:
765	/// The start offset of this partition.
766	///
767	/// All of the contained slices start at or after this offset.
768	uint64_t beginOffset() const { return BeginOffset; }
769
770	/// The end offset of this partition.
771	///
772	/// All of the contained slices end at or before this offset.
773	uint64_t endOffset() const { return EndOffset; }
774
775	/// The size of the partition.
776	///
777	/// Note that this can never be zero.
778	uint64_t size() const {
779	assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
780	return EndOffset - BeginOffset;
781	}
782
783	/// Test whether this partition contains no slices, and merely spans
784	/// a region occupied by split slices.
785	bool empty() const { return SI == SJ; }
786
787	/// \name Iterate slices that start within the partition.
788	/// These may be splittable or unsplittable. They have a begin offset >= the
789	/// partition begin offset.
790	/// @{
791	// FIXME: We should probably define a "concat_iterator" helper and use that
792	// to stitch together pointee_iterators over the split tails and the
793	// contiguous iterators of the partition. That would give a much nicer
794	// interface here. We could then additionally expose filtered iterators for
795	// split, unsplit, and unsplittable splices based on the usage patterns.
796	iterator begin() const { return SI; }
797	iterator end() const { return SJ; }
798	/// @}
799
800	/// Get the sequence of split slice tails.
801	///
802	/// These tails are of slices which start before this partition but are
803	/// split and overlap into the partition. We accumulate these while forming
804	/// partitions.
805	ArrayRef<Slice > splitSliceTails() const* { return SplitTails; }
806	};
807
808	} // end anonymous namespace
809
810	/// An iterator over partitions of the alloca's slices.
811	///
812	/// This iterator implements the core algorithm for partitioning the alloca's
813	/// slices. It is a forward iterator as we don't support backtracking for
814	/// efficiency reasons, and re-use a single storage area to maintain the
815	/// current set of split slices.
816	///
817	/// It is templated on the slice iterator type to use so that it can operate
818	/// with either const or non-const slice iterators.
819	class AllocaSlices::partition_iterator
820	: public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
821	Partition> {
822	friend class AllocaSlices;
823
824	/// Most of the state for walking the partitions is held in a class
825	/// with a nice interface for examining them.
826	Partition P;
827
828	/// We need to keep the end of the slices to know when to stop.
829	AllocaSlices::iterator SE;
830
831	/// We also need to keep track of the maximum split end offset seen.
832	/// FIXME: Do we really?
833	uint64_t MaxSplitSliceEndOffset = `0`;
834
835	/// Sets the partition to be empty at given iterator, and sets the
836	/// end iterator.
837	partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
838	: P (SI), SE(SE) {
839	// If not already at the end, advance our state to form the initial
840	// partition.
841	if (SI != SE)
842	advance();
843	}
844
845	/// Advance the iterator to the next partition.
846	///
847	/// Requires that the iterator not be at the end of the slices.
848	void advance() {
849	assert((P.SI != SE \|\| !P.SplitTails.empty()) &&
850	"Cannot advance past the end of the slices!");
851
852	// Clear out any split uses which have ended.
853	if (!P.SplitTails.empty()) {
854	if (P.EndOffset >= MaxSplitSliceEndOffset) {
855	// If we've finished all splits, this is easy.
856	P.SplitTails.clear();
857	MaxSplitSliceEndOffset = `0`;
858	} else {
859	// Remove the uses which have ended in the prior partition. This
860	// cannot change the max split slice end because we just checked that
861	// the prior partition ended prior to that max.
862	llvm::erase_if(C&: P.SplitTails,
863	P: [&](Slice S) { return* S->endOffset() <= P.EndOffset; });
864	assert(llvm::any_of(P.SplitTails,
865	[&](Slice *S) {
866	return S->endOffset() == MaxSplitSliceEndOffset;
867	}) &&
868	"Could not find the current max split slice offset!");
869	assert(llvm::all_of(P.SplitTails,
870	[&](Slice *S) {
871	return S->endOffset() <= MaxSplitSliceEndOffset;
872	}) &&
873	"Max split slice end offset is not actually the max!");
874	}
875	}
876
877	// If P.SI is already at the end, then we've cleared the split tail and
878	// now have an end iterator.
879	if (P.SI == SE) {
880	assert(P.SplitTails.empty() && "Failed to clear the split slices!");
881	return;
882	}
883
884	// If we had a non-empty partition previously, set up the state for
885	// subsequent partitions.
886	if (P.SI != P.SJ) {
887	// Accumulate all the splittable slices which started in the old
888	// partition into the split list.
889	for (Slice &S : P)
890	if (S.isSplittable() && S.endOffset() > P.EndOffset) {
891	P.SplitTails.push_back(Elt: &S);
892	MaxSplitSliceEndOffset =
893	std::max(a: S.endOffset(), b: MaxSplitSliceEndOffset);
894	}
895
896	// Start from the end of the previous partition.
897	P.SI = P.SJ;
898
899	// If P.SI is now at the end, we at most have a tail of split slices.
900	if (P.SI == SE) {
901	P.BeginOffset = P.EndOffset;
902	P.EndOffset = MaxSplitSliceEndOffset;
903	return;
904	}
905
906	// If the we have split slices and the next slice is after a gap and is
907	// not splittable immediately form an empty partition for the split
908	// slices up until the next slice begins.
909	if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
910	!P.SI->isSplittable()) {
911	P.BeginOffset = P.EndOffset;
912	P.EndOffset = P.SI->beginOffset();
913	return;
914	}
915	}
916
917	// OK, we need to consume new slices. Set the end offset based on the
918	// current slice, and step SJ past it. The beginning offset of the
919	// partition is the beginning offset of the next slice unless we have
920	// pre-existing split slices that are continuing, in which case we begin
921	// at the prior end offset.
922	P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
923	P.EndOffset = P.SI->endOffset();
924	++P.SJ;
925
926	// There are two strategies to form a partition based on whether the
927	// partition starts with an unsplittable slice or a splittable slice.
928	if (!P.SI->isSplittable()) {
929	// When we're forming an unsplittable region, it must always start at
930	// the first slice and will extend through its end.
931	assert(P.BeginOffset == P.SI->beginOffset());
932
933	// Form a partition including all of the overlapping slices with this
934	// unsplittable slice.
935	while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
936	if (!P.SJ->isSplittable())
937	P.EndOffset = std::max(a: P.EndOffset, b: P.SJ->endOffset());
938	++P.SJ;
939	}
940
941	// We have a partition across a set of overlapping unsplittable
942	// partitions.
943	return;
944	}
945
946	// If we're starting with a splittable slice, then we need to form
947	// a synthetic partition spanning it and any other overlapping splittable
948	// splices.
949	assert(P.SI->isSplittable() && "Forming a splittable partition!");
950
951	// Collect all of the overlapping splittable slices.
952	while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
953	P.SJ->isSplittable()) {
954	P.EndOffset = std::max(a: P.EndOffset, b: P.SJ->endOffset());
955	++P.SJ;
956	}
957
958	// Back upiP.EndOffset if we ended the span early when encountering an
959	// unsplittable slice. This synthesizes the early end offset of
960	// a partition spanning only splittable slices.
961	if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
962	assert(!P.SJ->isSplittable());
963	P.EndOffset = P.SJ->beginOffset();
964	}
965	}
966
967	public:
968	bool operator==(const partition_iterator &RHS) const {
969	assert(SE == RHS.SE &&
970	"End iterators don't match between compared partition iterators!");
971
972	// The observed positions of partitions is marked by the P.SI iterator and
973	// the emptiness of the split slices. The latter is only relevant when
974	// P.SI == SE, as the end iterator will additionally have an empty split
975	// slices list, but the prior may have the same P.SI and a tail of split
976	// slices.
977	if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
978	assert(P.SJ == RHS.P.SJ &&
979	"Same set of slices formed two different sized partitions!");
980	assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
981	"Same slice position with differently sized non-empty split "
982	"slice tails!");
983	return true;
984	}
985	return false;
986	}
987
988	partition_iterator &operator++() {
989	advance();
990	return *this;
991	}
992
993	Partition &operator() { return* P; }
994	};
995
996	/// A forward range over the partitions of the alloca's slices.
997	///
998	/// This accesses an iterator range over the partitions of the alloca's
999	/// slices. It computes these partitions on the fly based on the overlapping
1000	/// offsets of the slices and the ability to split them. It will visit "empty"
1001	/// partitions to cover regions of the alloca only accessed via split
1002	/// slices.
1003	iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
1004	return make_range(x: partition_iterator (begin(), end()),
1005	y: partition_iterator (end(), end()));
1006	}
1007
1008	static Value *foldSelectInst(SelectInst &SI) {
1009	// If the condition being selected on is a constant or the same value is
1010	// being selected between, fold the select. Yes this does (rarely) happen
1011	// early on.
1012	if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: SI.getCondition()))
1013	return SI.getOperand(i_nocapture: `1` + CI->isZero());
1014	if (SI.getOperand(i_nocapture: `1`) == SI.getOperand(i_nocapture: `2`))
1015	return SI.getOperand(i_nocapture: `1`);
1016
1017	return nullptr;
1018	}
1019
1020	/// A helper that folds a PHI node or a select.
1021	static Value *foldPHINodeOrSelectInst(Instruction &I) {
1022	if (PHINode *PN = dyn_cast<PHINode>(Val: &I)) {
1023	// If PN merges together the same value, return that value.
1024	return PN->hasConstantValue();
1025	}
1026	return foldSelectInst(SI&: cast<SelectInst>(Val&: I));
1027	}
1028
1029	/// Builder for the alloca slices.
1030	///
1031	/// This class builds a set of alloca slices by recursively visiting the uses
1032	/// of an alloca and making a slice for each load and store at each offset.
1033	class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
1034	friend class PtrUseVisitor<SliceBuilder>;
1035	friend class InstVisitor<SliceBuilder>;
1036
1037	using Base = PtrUseVisitor<SliceBuilder>;
1038
1039	const uint64_t AllocSize;
1040	AllocaSlices &AS;
1041
1042	SmallDenseMap<Instruction , unsigned*> MemTransferSliceMap;
1043	SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
1044
1045	/// Set to de-duplicate dead instructions found in the use walk.
1046	SmallPtrSet<Instruction *, `4`> VisitedDeadInsts;
1047
1048	// When this access is via an llvm.protected.field.ptr intrinsic, contains
1049	// the second argument to the intrinsic, the discriminator.
1050	Value ProtectedFieldDisc = nullptr*;
1051
1052	public:
1053	SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
1054	: PtrUseVisitor<SliceBuilder>(DL),
1055	AllocSize(AI.getAllocationSize(DL)->getFixedValue()), AS(AS) {}
1056
1057	private:
1058	void markAsDead(Instruction &I) {
1059	if (VisitedDeadInsts.insert(Ptr: &I).second)
1060	AS.DeadUsers.push_back(Elt: &I);
1061	}
1062
1063	void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
1064	bool IsSplittable = false) {
1065	// Completely skip uses which have a zero size or start either before or
1066	// past the end of the allocation.
1067	if (Size == `0` \|\| Offset.uge(RHS: AllocSize)) {
1068	LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
1069	<< Offset
1070	<< " which has zero size or starts outside of the "
1071	<< AllocSize << " byte alloca:\n"
1072	<< " alloca: " << AS.AI << "\n"
1073	<< " use: " << I << "\n");
1074	return markAsDead(I);
1075	}
1076
1077	uint64_t BeginOffset = Offset.getZExtValue();
1078	uint64_t EndOffset = BeginOffset + Size;
1079
1080	// Clamp the end offset to the end of the allocation. Note that this is
1081	// formulated to handle even the case where "BeginOffset + Size" overflows.
1082	// This may appear superficially to be something we could ignore entirely,
1083	// but that is not so! There may be widened loads or PHI-node uses where
1084	// some instructions are dead but not others. We can't completely ignore
1085	// them, and so have to record at least the information here.
1086	assert(AllocSize >= BeginOffset); // Established above.
1087	if (Size > AllocSize - BeginOffset) {
1088	LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
1089	<< Offset << " to remain within the " << AllocSize
1090	<< " byte alloca:\n"
1091	<< " alloca: " << AS.AI << "\n"
1092	<< " use: " << I << "\n");
1093	EndOffset = AllocSize;
1094	}
1095
1096	AS.Slices.push_back(
1097	Elt: Slice (BeginOffset, EndOffset, U, IsSplittable, ProtectedFieldDisc));
1098	}
1099
1100	void visitBitCastInst(BitCastInst &BC) {
1101	if (BC.use_empty())
1102	return markAsDead(I&: BC);
1103
1104	return Base::visitBitCastInst(BC);
1105	}
1106
1107	void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
1108	if (ASC.use_empty())
1109	return markAsDead(I&: ASC);
1110
1111	return Base::visitAddrSpaceCastInst(ASC);
1112	}
1113
1114	void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
1115	if (GEPI.use_empty())
1116	return markAsDead(I&: GEPI);
1117
1118	return Base::visitGetElementPtrInst(GEPI);
1119	}
1120
1121	void handleLoadOrStore(Type Ty, Instruction &I, const* APInt &Offset,
1122	uint64_t Size, bool IsVolatile) {
1123	// We allow splitting of non-volatile loads and stores where the type is an
1124	// integer type. These may be used to implement 'memcpy' or other "transfer
1125	// of bits" patterns.
1126	bool IsSplittable =
1127	Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
1128
1129	insertUse(I, Offset, Size, IsSplittable);
1130	}
1131
1132	void visitLoadInst(LoadInst &LI) {
1133	assert((!LI.isSimple() \|\| LI.getType()->isSingleValueType()) &&
1134	"All simple FCA loads should have been pre-split");
1135
1136	// If there is a load with an unknown offset, we can still perform store
1137	// to load forwarding for other known-offset loads.
1138	if (!IsOffsetKnown)
1139	return PI.setEscapedReadOnly(&LI);
1140
1141	TypeSize Size = DL.getTypeStoreSize(Ty: LI.getType());
1142	if (Size.isScalable()) {
1143	unsigned VScale = LI.getFunction()->getVScaleValue();
1144	if (!VScale)
1145	return PI.setAborted(&LI);
1146
1147	Size = TypeSize::getFixed(ExactSize: Size.getKnownMinValue() * VScale);
1148	}
1149
1150	return handleLoadOrStore(Ty: LI.getType(), I&: LI, Offset, Size: Size.getFixedValue(),
1151	IsVolatile: LI.isVolatile());
1152	}
1153
1154	void visitStoreInst(StoreInst &SI) {
1155	Value *ValOp = SI.getValueOperand();
1156	if (ValOp == *U)
1157	return PI.setEscapedAndAborted(&SI);
1158	if (!IsOffsetKnown)
1159	return PI.setAborted(&SI);
1160
1161	TypeSize StoreSize = DL.getTypeStoreSize(Ty: ValOp->getType());
1162	if (StoreSize.isScalable()) {
1163	unsigned VScale = SI.getFunction()->getVScaleValue();
1164	if (!VScale)
1165	return PI.setAborted(&SI);
1166
1167	StoreSize = TypeSize::getFixed(ExactSize: StoreSize.getKnownMinValue() * VScale);
1168	}
1169
1170	uint64_t Size = StoreSize.getFixedValue();
1171
1172	// If this memory access can be shown to statically* extend outside the*
1173	// bounds of the allocation, it's behavior is undefined, so simply
1174	// ignore it. Note that this is more strict than the generic clamping
1175	// behavior of insertUse. We also try to handle cases which might run the
1176	// risk of overflow.
1177	// FIXME: We should instead consider the pointer to have escaped if this
1178	// function is being instrumented for addressing bugs or race conditions.
1179	if (Size > AllocSize \|\| Offset.ugt(RHS: AllocSize - Size)) {
1180	LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
1181	<< Offset << " which extends past the end of the "
1182	<< AllocSize << " byte alloca:\n"
1183	<< " alloca: " << AS.AI << "\n"
1184	<< " use: " << SI << "\n");
1185	return markAsDead(I&: SI);
1186	}
1187
1188	assert((!SI.isSimple() \|\| ValOp->getType()->isSingleValueType()) &&
1189	"All simple FCA stores should have been pre-split");
1190	handleLoadOrStore(Ty: ValOp->getType(), I&: SI, Offset, Size, IsVolatile: SI.isVolatile());
1191	}
1192
1193	void visitMemSetInst(MemSetInst &II) {
1194	assert(II.getRawDest() == *U && "Pointer use is not the destination?");
1195	ConstantInt *Length = dyn_cast<ConstantInt>(Val: II.getLength());
1196	if ((Length && Length->getValue() == `0`) \|\|
1197	(IsOffsetKnown && Offset.uge(RHS: AllocSize)))
1198	// Zero-length mem transfer intrinsics can be ignored entirely.
1199	return markAsDead(I&: II);
1200
1201	if (!IsOffsetKnown)
1202	return PI.setAborted(&II);
1203
1204	insertUse(I&: II, Offset,
1205	Size: Length ? Length->getLimitedValue()
1206	: AllocSize - Offset.getLimitedValue(),
1207	IsSplittable: (bool)Length);
1208	}
1209
1210	void visitMemTransferInst(MemTransferInst &II) {
1211	ConstantInt *Length = dyn_cast<ConstantInt>(Val: II.getLength());
1212	if (Length && Length->getValue() == `0`)
1213	// Zero-length mem transfer intrinsics can be ignored entirely.
1214	return markAsDead(I&: II);
1215
1216	// Because we can visit these intrinsics twice, also check to see if the
1217	// first time marked this instruction as dead. If so, skip it.
1218	if (VisitedDeadInsts.count(Ptr: &II))
1219	return;
1220
1221	if (!IsOffsetKnown)
1222	return PI.setAborted(&II);
1223
1224	// This side of the transfer is completely out-of-bounds, and so we can
1225	// nuke the entire transfer. However, we also need to nuke the other side
1226	// if already added to our partitions.
1227	// FIXME: Yet another place we really should bypass this when
1228	// instrumenting for ASan.
1229	if (Offset.uge(RHS: AllocSize)) {
1230	SmallDenseMap<Instruction , unsigned*>::iterator MTPI =
1231	MemTransferSliceMap.find(Val: &II);
1232	if (MTPI != MemTransferSliceMap.end())
1233	AS.Slices [MTPI ->second].kill();
1234	return markAsDead(I&: II);
1235	}
1236
1237	uint64_t RawOffset = Offset.getLimitedValue();
1238	uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
1239
1240	// Check for the special case where the same exact value is used for both
1241	// source and dest.
1242	if (U == II.getRawDest() && U == II.getRawSource()) {
1243	// For non-volatile transfers this is a no-op.
1244	if (!II.isVolatile())
1245	return markAsDead(I&: II);
1246
1247	return insertUse(I&: II, Offset, Size, /IsSplittable=/false);
1248	}
1249
1250	// If we have seen both source and destination for a mem transfer, then
1251	// they both point to the same alloca.
1252	bool Inserted;
1253	SmallDenseMap<Instruction , unsigned*>::iterator MTPI;
1254	std::tie(args&: MTPI, args&: Inserted) =
1255	MemTransferSliceMap.insert(KV: std::make_pair(x: &II, y: AS.Slices.size()));
1256	unsigned PrevIdx = MTPI ->second;
1257	if (!Inserted) {
1258	Slice &PrevP = AS.Slices [PrevIdx];
1259
1260	// Check if the begin offsets match and this is a non-volatile transfer.
1261	// In that case, we can completely elide the transfer.
1262	if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
1263	PrevP.kill();
1264	return markAsDead(I&: II);
1265	}
1266
1267	// Otherwise we have an offset transfer within the same alloca. We can't
1268	// split those.
1269	PrevP.makeUnsplittable();
1270	}
1271
1272	// Insert the use now that we've fixed up the splittable nature.
1273	insertUse(I&: II, Offset, Size, /IsSplittable=/Inserted && Length);
1274
1275	// Check that we ended up with a valid index in the map.
1276	assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
1277	"Map index doesn't point back to a slice with this user.");
1278	}
1279
1280	// Disable SRoA for any intrinsics except for lifetime invariants.
1281	// FIXME: What about debug intrinsics? This matches old behavior, but
1282	// doesn't make sense.
1283	void visitIntrinsicInst(IntrinsicInst &II) {
1284	if (II.isDroppable()) {
1285	AS.DeadUseIfPromotable.push_back(Elt: U);
1286	return;
1287	}
1288
1289	if (!IsOffsetKnown)
1290	return PI.setAborted(&II);
1291
1292	if (II.isLifetimeStartOrEnd()) {
1293	insertUse(I&: II, Offset, Size: AllocSize, IsSplittable: true);
1294	return;
1295	}
1296
1297	if (II.getIntrinsicID() == Intrinsic::protected_field_ptr) {
1298	// We only handle loads and stores as users of llvm.protected.field.ptr.
1299	// Other uses may add items to the worklist, which will cause
1300	// ProtectedFieldDisc to be tracked incorrectly.
1301	AS.PFPUsers.push_back(Elt: &II);
1302	ProtectedFieldDisc = II.getArgOperand(i: `1`);
1303	for (Use &U : II.uses()) {
1304	this->U = &U;
1305	if (auto *LI = dyn_cast<LoadInst>(Val: U.getUser()))
1306	visitLoadInst(LI&: *LI);
1307	else if (auto *SI = dyn_cast<StoreInst>(Val: U.getUser()))
1308	visitStoreInst(SI&: *SI);
1309	else
1310	PI.setAborted(&II);
1311	if (PI.isAborted())
1312	break;
1313	}
1314	ProtectedFieldDisc = nullptr;
1315	return;
1316	}
1317
1318	Base::visitIntrinsicInst(II);
1319	}
1320
1321	Instruction hasUnsafePHIOrSelectUse(Instruction Root, uint64_t &Size) {
1322	// We consider any PHI or select that results in a direct load or store of
1323	// the same offset to be a viable use for slicing purposes. These uses
1324	// are considered unsplittable and the size is the maximum loaded or stored
1325	// size.
1326	SmallPtrSet<Instruction *, `4`> Visited;
1327	SmallVector<std::pair<Instruction , Instruction >, `4`> Uses;
1328	Visited.insert(Ptr: Root);
1329	Uses.push_back(Elt: std::make_pair(x: cast<Instruction>(Val&: *U), y&: Root));
1330	const DataLayout &DL = Root->getDataLayout();
1331	// If there are no loads or stores, the access is dead. We mark that as
1332	// a size zero access.
1333	Size = `0`;
1334	do {
1335	Instruction I, UsedI;
1336	std::tie(args&: UsedI, args&: I) = Uses.pop_back_val();
1337
1338	if (LoadInst *LI = dyn_cast<LoadInst>(Val: I)) {
1339	TypeSize LoadSize = DL.getTypeStoreSize(Ty: LI->getType());
1340	if (LoadSize.isScalable()) {
1341	PI.setAborted(LI);
1342	return nullptr;
1343	}
1344	Size = std::max(a: Size, b: LoadSize.getFixedValue());
1345	continue;
1346	}
1347	if (StoreInst *SI = dyn_cast<StoreInst>(Val: I)) {
1348	Value *Op = SI->getOperand(i_nocapture: `0`);
1349	if (Op == UsedI)
1350	return SI;
1351	TypeSize StoreSize = DL.getTypeStoreSize(Ty: Op->getType());
1352	if (StoreSize.isScalable()) {
1353	PI.setAborted(SI);
1354	return nullptr;
1355	}
1356	Size = std::max(a: Size, b: StoreSize.getFixedValue());
1357	continue;
1358	}
1359
1360	if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: I)) {
1361	if (!GEP->hasAllZeroIndices())
1362	return GEP;
1363	} else if (!isa<BitCastInst>(Val: I) && !isa<PHINode>(Val: I) &&
1364	!isa<SelectInst>(Val: I) && !isa<AddrSpaceCastInst>(Val: I)) {
1365	return I;
1366	}
1367
1368	for (User *U : I->users())
1369	if (Visited.insert(Ptr: cast<Instruction>(Val: U)).second)
1370	Uses.push_back(Elt: std::make_pair(x&: I, y: cast<Instruction>(Val: U)));
1371	} while (!Uses.empty());
1372
1373	return nullptr;
1374	}
1375
1376	void visitPHINodeOrSelectInst(Instruction &I) {
1377	assert(isa<PHINode>(I) \|\| isa<SelectInst>(I));
1378	if (I.use_empty())
1379	return markAsDead(I);
1380
1381	// If this is a PHI node before a catchswitch, we cannot insert any non-PHI
1382	// instructions in this BB, which may be required during rewriting. Bail out
1383	// on these cases.
1384	if (isa<PHINode>(Val: I) && !I.getParent()->hasInsertionPt())
1385	return PI.setAborted(&I);
1386
1387	// TODO: We could use simplifyInstruction here to fold PHINodes and
1388	// SelectInsts. However, doing so requires to change the current
1389	// dead-operand-tracking mechanism. For instance, suppose neither loading
1390	// from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1391	// trap either. However, if we simply replace %U with undef using the
1392	// current dead-operand-tracking mechanism, "load (select undef, undef,
1393	// %other)" may trap because the select may return the first operand
1394	// "undef".
1395	if (Value *Result = foldPHINodeOrSelectInst(I)) {
1396	if (Result == *U)
1397	// If the result of the constant fold will be the pointer, recurse
1398	// through the PHI/select as if we had RAUW'ed it.
1399	enqueueUsers(I);
1400	else
1401	// Otherwise the operand to the PHI/select is dead, and we can replace
1402	// it with poison.
1403	AS.DeadOperands.push_back(Elt: U);
1404
1405	return;
1406	}
1407
1408	if (!IsOffsetKnown)
1409	return PI.setAborted(&I);
1410
1411	// See if we already have computed info on this node.
1412	uint64_t &Size = PHIOrSelectSizes [&I];
1413	if (!Size) {
1414	// This is a new PHI/Select, check for an unsafe use of it.
1415	if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(Root: &I, Size))
1416	return PI.setAborted(UnsafeI);
1417	}
1418
1419	// For PHI and select operands outside the alloca, we can't nuke the entire
1420	// phi or select -- the other side might still be relevant, so we special
1421	// case them here and use a separate structure to track the operands
1422	// themselves which should be replaced with poison.
1423	// FIXME: This should instead be escaped in the event we're instrumenting
1424	// for address sanitization.
1425	if (Offset.uge(RHS: AllocSize)) {
1426	AS.DeadOperands.push_back(Elt: U);
1427	return;
1428	}
1429
1430	insertUse(I, Offset, Size);
1431	}
1432
1433	void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(I&: PN); }
1434
1435	void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(I&: SI); }
1436
1437	/// Disable SROA entirely if there are unhandled users of the alloca.
1438	void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1439
1440	void visitCallBase(CallBase &CB) {
1441	// If the call operand is read-only and only does a read-only or address
1442	// capture, then we mark it as EscapedReadOnly.
1443	if (CB.isDataOperand(U) &&
1444	!capturesFullProvenance(CC: CB.getCaptureInfo(OpNo: U->getOperandNo())) &&
1445	CB.onlyReadsMemory(OpNo: U->getOperandNo())) {
1446	PI.setEscapedReadOnly(&CB);
1447	return;
1448	}
1449
1450	Base::visitCallBase(CB);
1451	}
1452	};
1453
1454	AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1455	:
1456	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
1457	AI(AI),
1458	#endif
1459	PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) {
1460	SliceBuilder PB(DL, AI, *this);
1461	SliceBuilder::PtrInfo PtrI = PB.visitPtr(I&: AI);
1462	if (PtrI.isEscaped() \|\| PtrI.isAborted()) {
1463	// FIXME: We should sink the escape vs. abort info into the caller nicely,
1464	// possibly by just storing the PtrInfo in the AllocaSlices.
1465	PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1466	: PtrI.getAbortingInst();
1467	assert(PointerEscapingInstr && "Did not track a bad instruction");
1468	return;
1469	}
1470	PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst();
1471
1472	llvm::erase_if(C&: Slices, P: [](const Slice &S) { return S.isDead(); });
1473
1474	// Sort the uses. This arranges for the offsets to be in ascending order,
1475	// and the sizes to be in descending order.
1476	llvm::stable_sort(Range&: Slices);
1477	}
1478
1479	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
1480
1481	void AllocaSlices::print(raw_ostream &OS, const_iterator I,
1482	StringRef Indent) const {
1483	printSlice(OS, I, Indent);
1484	OS << "\n";
1485	printUse(OS, I, Indent);
1486	}
1487
1488	void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1489	StringRef Indent) const {
1490	OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1491	<< " slice #" << (I - begin())
1492	<< (I->isSplittable() ? " (splittable)" : "");
1493	}
1494
1495	void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
1496	StringRef Indent) const {
1497	OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
1498	}
1499
1500	void AllocaSlices::print(raw_ostream &OS) const {
1501	if (PointerEscapingInstr) {
1502	OS << "Can't analyze slices for alloca: " << AI << "\n"
1503	<< " A pointer to this alloca escaped by:\n"
1504	<< " " << *PointerEscapingInstr << "\n";
1505	return;
1506	}
1507
1508	if (PointerEscapingInstrReadOnly)
1509	OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n";
1510
1511	OS << "Slices of alloca: " << AI << "\n";
1512	for (const_iterator I = begin(), E = end(); I != E; ++I)
1513	print(OS, I);
1514	}
1515
1516	LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
1517	print(dbgs(), I);
1518	}
1519	LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1520
1521	#endif // !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
1522
1523	/// Walk the range of a partitioning looking for a common type to cover this
1524	/// sequence of slices.
1525	static std::pair<Type , IntegerType >
1526	findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1527	uint64_t EndOffset) {
1528	Type Ty = nullptr*;
1529	bool TyIsCommon = true;
1530	IntegerType ITy = nullptr*;
1531
1532	// Note that we need to look at every* alloca slice's Use to ensure we*
1533	// always get consistent results regardless of the order of slices.
1534	for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1535	Use *U = I->getUse();
1536	if (isa<IntrinsicInst>(Val: *U->getUser()))
1537	continue;
1538	if (I->beginOffset() != B->beginOffset() \|\| I->endOffset() != EndOffset)
1539	continue;
1540
1541	Type UserTy = nullptr*;
1542	if (LoadInst *LI = dyn_cast<LoadInst>(Val: U->getUser())) {
1543	UserTy = LI->getType();
1544	} else if (StoreInst *SI = dyn_cast<StoreInst>(Val: U->getUser())) {
1545	UserTy = SI->getValueOperand()->getType();
1546	}
1547
1548	if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(Val: UserTy)) {
1549	// If the type is larger than the partition, skip it. We only encounter
1550	// this for split integer operations where we want to use the type of the
1551	// entity causing the split. Also skip if the type is not a byte width
1552	// multiple.
1553	if (UserITy->getBitWidth() % `8` != `0` \|\|
1554	UserITy->getBitWidth() / `8` > (EndOffset - B->beginOffset()))
1555	continue;
1556
1557	// Track the largest bitwidth integer type used in this way in case there
1558	// is no common type.
1559	if (!ITy \|\| ITy->getBitWidth() < UserITy->getBitWidth())
1560	ITy = UserITy;
1561	}
1562
1563	// To avoid depending on the order of slices, Ty and TyIsCommon must not
1564	// depend on types skipped above.
1565	if (!UserTy \|\| (Ty && Ty != UserTy))
1566	TyIsCommon = false; // Give up on anything but an iN type.
1567	else
1568	Ty = UserTy;
1569	}
1570
1571	return {TyIsCommon ? Ty : nullptr, ITy};
1572	}
1573
1574	/// PHI instructions that use an alloca and are subsequently loaded can be
1575	/// rewritten to load both input pointers in the pred blocks and then PHI the
1576	/// results, allowing the load of the alloca to be promoted.
1577	/// From this:
1578	/// %P2 = phi [i32 %Alloca, i32* %Other]*
1579	/// %V = load i32 %P2*
1580	/// to:
1581	/// %V1 = load i32 %Alloca -> will be mem2reg'd*
1582	/// ...
1583	/// %V2 = load i32 %Other*
1584	/// ...
1585	/// %V = phi [i32 %V1, i32 %V2]
1586	///
1587	/// We can do this to a select if its only uses are loads and if the operands
1588	/// to the select can be loaded unconditionally.
1589	///
1590	/// FIXME: This should be hoisted into a generic utility, likely in
1591	/// Transforms/Util/Local.h
1592	static bool isSafePHIToSpeculate(PHINode &PN) {
1593	const DataLayout &DL = PN.getDataLayout();
1594
1595	// For now, we can only do this promotion if the load is in the same block
1596	// as the PHI, and if there are no stores between the phi and load.
1597	// TODO: Allow recursive phi users.
1598	// TODO: Allow stores.
1599	BasicBlock *BB = PN.getParent();
1600	Align MaxAlign;
1601	uint64_t APWidth = DL.getIndexTypeSizeInBits(Ty: PN.getType());
1602	Type LoadType = nullptr*;
1603	for (User *U : PN.users()) {
1604	LoadInst *LI = dyn_cast<LoadInst>(Val: U);
1605	if (!LI \|\| !LI->isSimple())
1606	return false;
1607
1608	// For now we only allow loads in the same block as the PHI. This is
1609	// a common case that happens when instcombine merges two loads through
1610	// a PHI.
1611	if (LI->getParent() != BB)
1612	return false;
1613
1614	if (LoadType) {
1615	if (LoadType != LI->getType())
1616	return false;
1617	} else {
1618	LoadType = LI->getType();
1619	}
1620
1621	// Ensure that there are no instructions between the PHI and the load that
1622	// could store.
1623	for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1624	if (BBI ->mayWriteToMemory())
1625	return false;
1626
1627	MaxAlign = std::max(a: MaxAlign, b: LI->getAlign());
1628	}
1629
1630	if (!LoadType)
1631	return false;
1632
1633	APInt LoadSize =
1634	APInt (APWidth, DL.getTypeStoreSize(Ty: LoadType).getFixedValue());
1635
1636	// We can only transform this if it is safe to push the loads into the
1637	// predecessor blocks. The only thing to watch out for is that we can't put
1638	// a possibly trapping load in the predecessor if it is a critical edge.
1639	for (unsigned Idx = `0`, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1640	Instruction *TI = PN.getIncomingBlock(i: Idx)->getTerminator();
1641	Value *InVal = PN.getIncomingValue(i: Idx);
1642
1643	// If the value is produced by the terminator of the predecessor (an
1644	// invoke) or it has side-effects, there is no valid place to put a load
1645	// in the predecessor.
1646	if (TI == InVal \|\| TI->mayHaveSideEffects())
1647	return false;
1648
1649	// If the predecessor has a single successor, then the edge isn't
1650	// critical.
1651	if (TI->getNumSuccessors() == `1`)
1652	continue;
1653
1654	// If this pointer is always safe to load, or if we can prove that there
1655	// is already a load in the block, then we can move the load to the pred
1656	// block.
1657	if (isSafeToLoadUnconditionally(V: InVal, Alignment: MaxAlign, Size: LoadSize, DL, ScanFrom: TI))
1658	continue;
1659
1660	return false;
1661	}
1662
1663	return true;
1664	}
1665
1666	static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
1667	LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
1668
1669	LoadInst *SomeLoad = cast<LoadInst>(Val: PN.user_back());
1670	Type *LoadTy = SomeLoad->getType();
1671	IRB.SetInsertPoint(&PN);
1672	PHINode *NewPN = IRB.CreatePHI(Ty: LoadTy, NumReservedValues: PN.getNumIncomingValues(),
1673	Name: PN.getName() + ".sroa.speculated");
1674
1675	// Get the AA tags and alignment to use from one of the loads. It does not
1676	// matter which one we get and if any differ.
1677	AAMDNodes AATags = SomeLoad->getAAMetadata();
1678	Align Alignment = SomeLoad->getAlign();
1679
1680	// Rewrite all loads of the PN to use the new PHI.
1681	while (!PN.use_empty()) {
1682	LoadInst *LI = cast<LoadInst>(Val: PN.user_back());
1683	LI->replaceAllUsesWith(V: NewPN);
1684	LI->eraseFromParent();
1685	}
1686
1687	// Inject loads into all of the pred blocks.
1688	DenseMap<BasicBlock , Value > InjectedLoads;
1689	for (unsigned Idx = `0`, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1690	BasicBlock *Pred = PN.getIncomingBlock(i: Idx);
1691	Value *InVal = PN.getIncomingValue(i: Idx);
1692
1693	// A PHI node is allowed to have multiple (duplicated) entries for the same
1694	// basic block, as long as the value is the same. So if we already injected
1695	// a load in the predecessor, then we should reuse the same load for all
1696	// duplicated entries.
1697	if (Value *V = InjectedLoads.lookup(Val: Pred)) {
1698	NewPN->addIncoming(V, BB: Pred);
1699	continue;
1700	}
1701
1702	Instruction *TI = Pred->getTerminator();
1703	IRB.SetInsertPoint(TI);
1704
1705	LoadInst *Load = IRB.CreateAlignedLoad(
1706	Ty: LoadTy, Ptr: InVal, Align: Alignment,
1707	Name: (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1708	++NumLoadsSpeculated;
1709	if (AATags)
1710	Load->setAAMetadata(AATags);
1711	NewPN->addIncoming(V: Load, BB: Pred);
1712	InjectedLoads [Pred] = Load;
1713	}
1714
1715	LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
1716	PN.eraseFromParent();
1717	}
1718
1719	SelectHandSpeculativity &
1720	SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) {
1721	if (isTrueVal)
1722	Bitfield::set<SelectHandSpeculativity::TrueVal>(Packed&: Storage, Value: true);
1723	else
1724	Bitfield::set<SelectHandSpeculativity::FalseVal>(Packed&: Storage, Value: true);
1725	return *this;
1726	}
1727
1728	bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const {
1729	return isTrueVal ? Bitfield::get<SelectHandSpeculativity::TrueVal>(Packed: Storage)
1730	: Bitfield::get<SelectHandSpeculativity::FalseVal>(Packed: Storage);
1731	}
1732
1733	bool SelectHandSpeculativity::areAllSpeculatable() const {
1734	return isSpeculatable(/isTrueVal=/true) &&
1735	isSpeculatable(/isTrueVal=/false);
1736	}
1737
1738	bool SelectHandSpeculativity::areAnySpeculatable() const {
1739	return isSpeculatable(/isTrueVal=/true) \|\|
1740	isSpeculatable(/isTrueVal=/false);
1741	}
1742	bool SelectHandSpeculativity::areNoneSpeculatable() const {
1743	return !areAnySpeculatable();
1744	}
1745
1746	static SelectHandSpeculativity
1747	isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG) {
1748	assert(LI.isSimple() && "Only for simple loads");
1749	SelectHandSpeculativity Spec;
1750
1751	const DataLayout &DL = SI.getDataLayout();
1752	for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()})
1753	if (isSafeToLoadUnconditionally(V: Value, Ty: LI.getType(), Alignment: LI.getAlign(), DL,
1754	ScanFrom: &LI))
1755	Spec.setAsSpeculatable(/isTrueVal=/Value == SI.getTrueValue());
1756	else if (PreserveCFG)
1757	return Spec;
1758
1759	return Spec;
1760	}
1761
1762	std::optional<RewriteableMemOps>
1763	SROA::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) {
1764	RewriteableMemOps Ops;
1765
1766	for (User *U : SI.users()) {
1767	if (auto *BC = dyn_cast<BitCastInst>(Val: U); BC && BC->hasOneUse())
1768	U = *BC->user_begin();
1769
1770	if (auto *Store = dyn_cast<StoreInst>(Val: U)) {
1771	// Note that atomic stores can be transformed; atomic semantics do not
1772	// have any meaning for a local alloca. Stores are not speculatable,
1773	// however, so if we can't turn it into a predicated store, we are done.
1774	if (Store->isVolatile() \|\| PreserveCFG)
1775	return {}; // Give up on this `select`.
1776	Ops.emplace_back(Args&: Store);
1777	continue;
1778	}
1779
1780	auto *LI = dyn_cast<LoadInst>(Val: U);
1781
1782	// Note that atomic loads can be transformed;
1783	// atomic semantics do not have any meaning for a local alloca.
1784	if (!LI \|\| LI->isVolatile())
1785	return {}; // Give up on this `select`.
1786
1787	PossiblySpeculatableLoad Load(LI);
1788	if (!LI->isSimple()) {
1789	// If the `load` is not simple, we can't speculatively execute it,
1790	// but we could handle this via a CFG modification. But can we?
1791	if (PreserveCFG)
1792	return {}; // Give up on this `select`.
1793	Ops.emplace_back(Args&: Load);
1794	continue;
1795	}
1796
1797	SelectHandSpeculativity Spec =
1798	isSafeLoadOfSelectToSpeculate(LI&: *LI, SI, PreserveCFG);
1799	if (PreserveCFG && !Spec.areAllSpeculatable())
1800	return {}; // Give up on this `select`.
1801
1802	Load.setInt(Spec);
1803	Ops.emplace_back(Args&: Load);
1804	}
1805
1806	return Ops;
1807	}
1808
1809	static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI,
1810	IRBuilderTy &IRB) {
1811	LLVM_DEBUG(dbgs() << " original load: " << SI << "\n");
1812
1813	Value *TV = SI.getTrueValue();
1814	Value *FV = SI.getFalseValue();
1815	// Replace the given load of the select with a select of two loads.
1816
1817	assert(LI.isSimple() && "We only speculate simple loads");
1818
1819	IRB.SetInsertPoint(&LI);
1820
1821	LoadInst *TL =
1822	IRB.CreateAlignedLoad(Ty: LI.getType(), Ptr: TV, Align: LI.getAlign(),
1823	Name: LI.getName() + ".sroa.speculate.load.true");
1824	LoadInst *FL =
1825	IRB.CreateAlignedLoad(Ty: LI.getType(), Ptr: FV, Align: LI.getAlign(),
1826	Name: LI.getName() + ".sroa.speculate.load.false");
1827	NumLoadsSpeculated += `2`;
1828
1829	// Transfer alignment and AA info if present.
1830	TL->setAlignment(LI.getAlign());
1831	FL->setAlignment(LI.getAlign());
1832
1833	AAMDNodes Tags = LI.getAAMetadata();
1834	if (Tags) {
1835	TL->setAAMetadata(Tags);
1836	FL->setAAMetadata(Tags);
1837	}
1838
1839	Value *V = IRB.CreateSelect(C: SI.getCondition(), True: TL, False: FL,
1840	Name: LI.getName() + ".sroa.speculated",
1841	MDFrom: ProfcheckDisableMetadataFixes ? nullptr : &SI);
1842
1843	LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
1844	LI.replaceAllUsesWith(V);
1845	}
1846
1847	template <typename T>
1848	static void rewriteMemOpOfSelect(SelectInst &SI, T &I,
1849	SelectHandSpeculativity Spec,
1850	DomTreeUpdater &DTU) {
1851	assert((isa<LoadInst>(I) \|\| isa<StoreInst>(I)) && "Only for load and store!");
1852	LLVM_DEBUG(dbgs() << " original mem op: " << I << "\n");
1853	BasicBlock *Head = I.getParent();
1854	Instruction ThenTerm = nullptr*;
1855	Instruction ElseTerm = nullptr*;
1856	if (Spec.areNoneSpeculatable())
1857	SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm,
1858	SI.getMetadata(KindID: LLVMContext::MD_prof), &DTU);
1859	else {
1860	SplitBlockAndInsertIfThen(SI.getCondition(), &I, /Unreachable=/false,
1861	SI.getMetadata(KindID: LLVMContext::MD_prof), &DTU,
1862	/LI=/nullptr, /ThenBlock=/nullptr);
1863	if (Spec.isSpeculatable(/isTrueVal=/true))
1864	cast<CondBrInst>(Val: Head->getTerminator())->swapSuccessors();
1865	}
1866	auto *HeadBI = cast<CondBrInst>(Val: Head->getTerminator());
1867	Spec = {}; // Do not use `Spec` beyond this point.
1868	BasicBlock *Tail = I.getParent();
1869	Tail->setName(Head->getName() + ".cont");
1870	PHINode *PN;
1871	if (isa<LoadInst>(I))
1872	PN = PHINode::Create(Ty: I.getType(), NumReservedValues: `2`, NameStr: "", InsertBefore: I.getIterator());
1873	for (BasicBlock *SuccBB : successors(BB: Head)) {
1874	bool IsThen = SuccBB == HeadBI->getSuccessor(i: `0`);
1875	int SuccIdx = IsThen ? `0` : `1`;
1876	auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB;
1877	auto &CondMemOp = cast<T>(*I.clone());
1878	if (NewMemOpBB != Head) {
1879	NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else"));
1880	if (isa<LoadInst>(I))
1881	++NumLoadsPredicated;
1882	else
1883	++NumStoresPredicated;
1884	} else {
1885	CondMemOp.dropUBImplyingAttrsAndMetadata();
1886	++NumLoadsSpeculated;
1887	}
1888	CondMemOp.insertBefore(NewMemOpBB->getTerminator()->getIterator());
1889	Value *Ptr = SI.getOperand(i_nocapture: `1` + SuccIdx);
1890	CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr);
1891	if (isa<LoadInst>(I)) {
1892	CondMemOp.setName(I.getName() + (IsThen ? ".then" : ".else") + ".val");
1893	PN->addIncoming(V: &CondMemOp, BB: NewMemOpBB);
1894	} else
1895	LLVM_DEBUG(dbgs() << " to: " << CondMemOp << "\n");
1896	}
1897	if (isa<LoadInst>(I)) {
1898	PN->takeName(V: &I);
1899	LLVM_DEBUG(dbgs() << " to: " << *PN << "\n");
1900	I.replaceAllUsesWith(PN);
1901	}
1902	}
1903
1904	static void rewriteMemOpOfSelect(SelectInst &SelInst, Instruction &I,
1905	SelectHandSpeculativity Spec,
1906	DomTreeUpdater &DTU) {
1907	if (auto *LI = dyn_cast<LoadInst>(Val: &I))
1908	rewriteMemOpOfSelect(SI&: SelInst, I&: *LI, Spec, DTU);
1909	else if (auto *SI = dyn_cast<StoreInst>(Val: &I))
1910	rewriteMemOpOfSelect(SI&: SelInst, I&: *SI, Spec, DTU);
1911	else
1912	llvm_unreachable_internal(msg: "Only for load and store.");
1913	}
1914
1915	static bool rewriteSelectInstMemOps(SelectInst &SI,
1916	const RewriteableMemOps &Ops,
1917	IRBuilderTy &IRB, DomTreeUpdater *DTU) {
1918	bool CFGChanged = false;
1919	LLVM_DEBUG(dbgs() << " original select: " << SI << "\n");
1920
1921	for (const RewriteableMemOp &Op : Ops) {
1922	SelectHandSpeculativity Spec;
1923	Instruction *I;
1924	if (auto *const *US = std::get_if<UnspeculatableStore>(ptr: &Op)) {
1925	I = *US;
1926	} else {
1927	auto PSL = std::get<PossiblySpeculatableLoad>(v: Op);
1928	I = PSL.getPointer();
1929	Spec = PSL.getInt();
1930	}
1931	if (Spec.areAllSpeculatable()) {
1932	speculateSelectInstLoads(SI, LI&: cast<LoadInst>(Val&: *I), IRB);
1933	} else {
1934	assert(DTU && "Should not get here when not allowed to modify the CFG!");
1935	rewriteMemOpOfSelect(SelInst&: SI, I&: I, Spec, DTU&: DTU);
1936	CFGChanged = true;
1937	}
1938	I->eraseFromParent();
1939	}
1940
1941	for (User *U : make_early_inc_range(Range: SI.users()))
1942	cast<BitCastInst>(Val: U)->eraseFromParent();
1943	SI.eraseFromParent();
1944	return CFGChanged;
1945	}
1946
1947	/// Compute an adjusted pointer from Ptr by Offset bytes where the
1948	/// resulting pointer has PointerTy.
1949	static Value getAdjustedPtr(IRBuilderTy &IRB, const* DataLayout &DL, Value *Ptr,
1950	APInt Offset, Type *PointerTy,
1951	const Twine &NamePrefix) {
1952	if (Offset != `0`)
1953	Ptr = IRB.CreateInBoundsPtrAdd(Ptr, Offset: IRB.getInt(AI: Offset),
1954	Name: NamePrefix + "sroa_idx");
1955	return IRB.CreatePointerBitCastOrAddrSpaceCast(V: Ptr, DestTy: PointerTy,
1956	Name: NamePrefix + "sroa_cast");
1957	}
1958
1959	/// Compute the adjusted alignment for a load or store from an offset.
1960	static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1961	return commonAlignment(A: getLoadStoreAlignment(I), Offset);
1962	}
1963
1964	/// Test whether we can convert a value from the old to the new type.
1965	///
1966	/// This predicate should be used to guard calls to convertValue in order to
1967	/// ensure that we only try to convert viable values. The strategy is that we
1968	/// will peel off single element struct and array wrappings to get to an
1969	/// underlying value, and convert that value.
1970	static bool canConvertValue(const DataLayout &DL, Type OldTy, Type NewTy,
1971	unsigned VScale = `0`) {
1972	if (OldTy == NewTy)
1973	return true;
1974
1975	// For integer types, we can't handle any bit-width differences. This would
1976	// break both vector conversions with extension and introduce endianness
1977	// issues when in conjunction with loads and stores.
1978	if (isa<IntegerType>(Val: OldTy) && isa<IntegerType>(Val: NewTy)) {
1979	assert(cast<IntegerType>(OldTy)->getBitWidth() !=
1980	cast<IntegerType>(NewTy)->getBitWidth() &&
1981	"We can't have the same bitwidth for different int types");
1982	return false;
1983	}
1984
1985	TypeSize NewSize = DL.getTypeSizeInBits(Ty: NewTy);
1986	TypeSize OldSize = DL.getTypeSizeInBits(Ty: OldTy);
1987
1988	if ((isa<ScalableVectorType>(Val: NewTy) && isa<FixedVectorType>(Val: OldTy)) \|\|
1989	(isa<ScalableVectorType>(Val: OldTy) && isa<FixedVectorType>(Val: NewTy))) {
1990	// Conversion is only possible when the size of scalable vectors is known.
1991	if (!VScale)
1992	return false;
1993
1994	// For ptr-to-int and int-to-ptr casts, the pointer side is resolved within
1995	// a single domain (either fixed or scalable). Any additional conversion
1996	// between fixed and scalable types is handled through integer types.
1997	auto OldVTy = OldTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(OldTy) : OldTy;
1998	auto NewVTy = NewTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(NewTy) : NewTy;
1999
2000	if (isa<ScalableVectorType>(Val: NewTy)) {
2001	if (!VectorType::getWithSizeAndScalar(SizeTy: cast<VectorType>(Val: NewVTy), EltTy: OldVTy))
2002	return false;
2003
2004	NewSize = TypeSize::getFixed(ExactSize: NewSize.getKnownMinValue() * VScale);
2005	} else {
2006	if (!VectorType::getWithSizeAndScalar(SizeTy: cast<VectorType>(Val: OldVTy), EltTy: NewVTy))
2007	return false;
2008
2009	OldSize = TypeSize::getFixed(ExactSize: OldSize.getKnownMinValue() * VScale);
2010	}
2011	}
2012
2013	if (NewSize != OldSize)
2014	return false;
2015	if (!NewTy->isSingleValueType() \|\| !OldTy->isSingleValueType())
2016	return false;
2017
2018	// We can convert pointers to integers and vice-versa. Same for vectors
2019	// of pointers and integers.
2020	OldTy = OldTy->getScalarType();
2021	NewTy = NewTy->getScalarType();
2022	if (NewTy->isPointerTy() \|\| OldTy->isPointerTy()) {
2023	if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
2024	unsigned OldAS = OldTy->getPointerAddressSpace();
2025	unsigned NewAS = NewTy->getPointerAddressSpace();
2026	// Convert pointers if they are pointers from the same address space or
2027	// different integral (not non-integral) address spaces with the same
2028	// pointer size.
2029	return OldAS == NewAS \|\|
2030	(!DL.isNonIntegralAddressSpace(AddrSpace: OldAS) &&
2031	!DL.isNonIntegralAddressSpace(AddrSpace: NewAS) &&
2032	DL.getPointerSize(AS: OldAS) == DL.getPointerSize(AS: NewAS));
2033	}
2034
2035	// We can convert integers to integral pointers, but not to non-integral
2036	// pointers.
2037	if (OldTy->isIntegerTy())
2038	return !DL.isNonIntegralPointerType(Ty: NewTy);
2039
2040	// We can convert integral pointers to integers, but non-integral pointers
2041	// need to remain pointers.
2042	if (!DL.isNonIntegralPointerType(Ty: OldTy))
2043	return NewTy->isIntegerTy();
2044
2045	return false;
2046	}
2047
2048	if (OldTy->isTargetExtTy() \|\| NewTy->isTargetExtTy())
2049	return false;
2050
2051	return true;
2052	}
2053
2054	/// Test whether the given slice use can be promoted to a vector.
2055	///
2056	/// This function is called to test each entry in a partition which is slated
2057	/// for a single slice.
2058	static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
2059	VectorType *Ty,
2060	uint64_t ElementSize,
2061	const DataLayout &DL,
2062	unsigned VScale) {
2063	// First validate the slice offsets.
2064	uint64_t BeginOffset =
2065	std::max(a: S.beginOffset(), b: P.beginOffset()) - P.beginOffset();
2066	uint64_t BeginIndex = BeginOffset / ElementSize;
2067	if (BeginIndex * ElementSize != BeginOffset \|\|
2068	BeginIndex >= cast<FixedVectorType>(Val: Ty)->getNumElements())
2069	return false;
2070	uint64_t EndOffset = std::min(a: S.endOffset(), b: P.endOffset()) - P.beginOffset();
2071	uint64_t EndIndex = EndOffset / ElementSize;
2072	if (EndIndex * ElementSize != EndOffset \|\|
2073	EndIndex > cast<FixedVectorType>(Val: Ty)->getNumElements())
2074	return false;
2075
2076	assert(EndIndex > BeginIndex && "Empty vector!");
2077	uint64_t NumElements = EndIndex - BeginIndex;
2078	Type *SliceTy = (NumElements == `1`)
2079	? Ty->getElementType()
2080	: FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumElements);
2081
2082	Type *SplitIntTy =
2083	Type::getIntNTy(C&: Ty->getContext(), N: NumElements * ElementSize * `8`);
2084
2085	Use *U = S.getUse();
2086
2087	if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: U->getUser())) {
2088	if (MI->isVolatile())
2089	return false;
2090	if (!S.isSplittable())
2091	return false; // Skip any unsplittable intrinsics.
2092	} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: U->getUser())) {
2093	if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
2094	return false;
2095	} else if (LoadInst *LI = dyn_cast<LoadInst>(Val: U->getUser())) {
2096	if (LI->isVolatile())
2097	return false;
2098	Type *LTy = LI->getType();
2099	// Disable vector promotion when there are loads or stores of an FCA.
2100	if (LTy->isStructTy())
2101	return false;
2102	if (P.beginOffset() > S.beginOffset() \|\| P.endOffset() < S.endOffset()) {
2103	assert(LTy->isIntegerTy());
2104	LTy = SplitIntTy;
2105	}
2106	if (!canConvertValue(DL, OldTy: SliceTy, NewTy: LTy, VScale))
2107	return false;
2108	} else if (StoreInst *SI = dyn_cast<StoreInst>(Val: U->getUser())) {
2109	if (SI->isVolatile())
2110	return false;
2111	Type *STy = SI->getValueOperand()->getType();
2112	// Disable vector promotion when there are loads or stores of an FCA.
2113	if (STy->isStructTy())
2114	return false;
2115	if (P.beginOffset() > S.beginOffset() \|\| P.endOffset() < S.endOffset()) {
2116	assert(STy->isIntegerTy());
2117	STy = SplitIntTy;
2118	}
2119	if (!canConvertValue(DL, OldTy: STy, NewTy: SliceTy, VScale))
2120	return false;
2121	} else {
2122	return false;
2123	}
2124
2125	return true;
2126	}
2127
2128	/// Test whether any vector type in \p CandidateTys is viable for promotion.
2129	///
2130	/// This implements the necessary checking for \c isVectorPromotionViable over
2131	/// all slices of the alloca for the given VectorType.
2132	static VectorType *
2133	checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
2134	SmallVectorImpl<VectorType *> &CandidateTys,
2135	bool HaveCommonEltTy, Type *CommonEltTy,
2136	bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
2137	VectorType CommonVecPtrTy, unsigned* VScale) {
2138	// If we didn't find a vector type, nothing to do here.
2139	if (CandidateTys.empty())
2140	return nullptr;
2141
2142	// Pointer-ness is sticky, if we had a vector-of-pointers candidate type,
2143	// then we should choose it, not some other alternative.
2144	// But, we can't perform a no-op pointer address space change via bitcast,
2145	// so if we didn't have a common pointer element type, bail.
2146	if (HaveVecPtrTy && !HaveCommonVecPtrTy)
2147	return nullptr;
2148
2149	// Try to pick the "best" element type out of the choices.
2150	if (!HaveCommonEltTy && HaveVecPtrTy) {
2151	// If there was a pointer element type, there's really only one choice.
2152	CandidateTys.clear();
2153	CandidateTys.push_back(Elt: CommonVecPtrTy);
2154	} else if (!HaveCommonEltTy && !HaveVecPtrTy) {
2155	// Integer-ify vector types.
2156	for (VectorType *&VTy : CandidateTys) {
2157	if (!VTy->getElementType()->isIntegerTy())
2158	VTy = cast<VectorType>(Val: VTy->getWithNewType(EltTy: IntegerType::getIntNTy(
2159	C&: VTy->getContext(), N: VTy->getScalarSizeInBits())));
2160	}
2161
2162	// Rank the remaining candidate vector types. This is easy because we know
2163	// they're all integer vectors. We sort by ascending number of elements.
2164	auto RankVectorTypesComp = [&DL](VectorType RHSTy, VectorType LHSTy) {
2165	(void)DL;
2166	assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2167	DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2168	"Cannot have vector types of different sizes!");
2169	assert(RHSTy->getElementType()->isIntegerTy() &&
2170	"All non-integer types eliminated!");
2171	assert(LHSTy->getElementType()->isIntegerTy() &&
2172	"All non-integer types eliminated!");
2173	return cast<FixedVectorType>(Val: RHSTy)->getNumElements() <
2174	cast<FixedVectorType>(Val: LHSTy)->getNumElements();
2175	};
2176	auto RankVectorTypesEq = [&DL](VectorType RHSTy, VectorType LHSTy) {
2177	(void)DL;
2178	assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2179	DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2180	"Cannot have vector types of different sizes!");
2181	assert(RHSTy->getElementType()->isIntegerTy() &&
2182	"All non-integer types eliminated!");
2183	assert(LHSTy->getElementType()->isIntegerTy() &&
2184	"All non-integer types eliminated!");
2185	return cast<FixedVectorType>(Val: RHSTy)->getNumElements() ==
2186	cast<FixedVectorType>(Val: LHSTy)->getNumElements();
2187	};
2188	llvm::sort(C&: CandidateTys, Comp: RankVectorTypesComp);
2189	CandidateTys.erase(CS: llvm::unique(R&: CandidateTys, P: RankVectorTypesEq),
2190	CE: CandidateTys.end());
2191	} else {
2192	// The only way to have the same element type in every vector type is to
2193	// have the same vector type. Check that and remove all but one.
2194	#ifndef NDEBUG
2195	for (VectorType *VTy : CandidateTys) {
2196	assert(VTy->getElementType() == CommonEltTy &&
2197	"Unaccounted for element type!");
2198	assert(VTy == CandidateTys[`0`] &&
2199	"Different vector types with the same element type!");
2200	}
2201	#endif
2202	CandidateTys.resize(N: `1`);
2203	}
2204
2205	// FIXME: hack. Do we have a named constant for this?
2206	// SDAG SDNode can't have more than 65535 operands.
2207	llvm::erase_if(C&: CandidateTys, P: [](VectorType *VTy) {
2208	return cast<FixedVectorType>(Val: VTy)->getNumElements() >
2209	std::numeric_limits<unsigned short>::max();
2210	});
2211
2212	// Find a vector type viable for promotion by iterating over all slices.
2213	auto VTy = llvm::find_if(Range&: CandidateTys, P: [&](VectorType VTy) -> bool {
2214	uint64_t ElementSize =
2215	DL.getTypeSizeInBits(Ty: VTy->getElementType()).getFixedValue();
2216
2217	// While the definition of LLVM vectors is bitpacked, we don't support sizes
2218	// that aren't byte sized.
2219	if (ElementSize % `8`)
2220	return false;
2221	assert((DL.getTypeSizeInBits(VTy).getFixedValue() % `8`) == `0` &&
2222	"vector size not a multiple of element size?");
2223	ElementSize /= `8`;
2224
2225	for (const Slice &S : P)
2226	if (!isVectorPromotionViableForSlice(P, S, Ty: VTy, ElementSize, DL, VScale))
2227	return false;
2228
2229	for (const Slice *S : P.splitSliceTails())
2230	if (!isVectorPromotionViableForSlice(P, S: *S, Ty: VTy, ElementSize, DL, VScale))
2231	return false;
2232
2233	return true;
2234	});
2235	return VTy != CandidateTys.end() ? VTy : nullptr*;
2236	}
2237
2238	static VectorType *createAndCheckVectorTypesForPromotion(
2239	SetVector<Type > &OtherTys, ArrayRef<VectorType > CandidateTysCopy,
2240	function_ref<void(Type *)> CheckCandidateType, Partition &P,
2241	const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
2242	bool &HaveCommonEltTy, Type &CommonEltTy, bool* &HaveVecPtrTy,
2243	bool &HaveCommonVecPtrTy, VectorType &CommonVecPtrTy, unsigned* VScale) {
2244	[[maybe_unused]] VectorType *OriginalElt =
2245	CandidateTysCopy.size() ? CandidateTysCopy [`0`] : nullptr;
2246	// Consider additional vector types where the element type size is a
2247	// multiple of load/store element size.
2248	for (Type *Ty : OtherTys) {
2249	if (!VectorType::isValidElementType(ElemTy: Ty))
2250	continue;
2251	unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
2252	// Make a copy of CandidateTys and iterate through it, because we
2253	// might append to CandidateTys in the loop.
2254	for (VectorType *const VTy : CandidateTysCopy) {
2255	// The elements in the copy should remain invariant throughout the loop
2256	assert(CandidateTysCopy[`0`] == OriginalElt && "Different Element");
2257	unsigned VectorSize = DL.getTypeSizeInBits(Ty: VTy).getFixedValue();
2258	unsigned ElementSize =
2259	DL.getTypeSizeInBits(Ty: VTy->getElementType()).getFixedValue();
2260	if (TypeSize != VectorSize && TypeSize != ElementSize &&
2261	VectorSize % TypeSize == `0`) {
2262	VectorType NewVTy = VectorType::get(ElementType: Ty, NumElements: VectorSize / TypeSize, Scalable: false*);
2263	CheckCandidateType (NewVTy);
2264	}
2265	}
2266	}
2267
2268	return checkVectorTypesForPromotion(
2269	P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2270	HaveCommonVecPtrTy, CommonVecPtrTy, VScale);
2271	}
2272
2273	/// Test whether the given alloca partitioning and range of slices can be
2274	/// promoted to a vector.
2275	///
2276	/// This is a quick test to check whether we can rewrite a particular alloca
2277	/// partition (and its newly formed alloca) into a vector alloca with only
2278	/// whole-vector loads and stores such that it could be promoted to a vector
2279	/// SSA value. We only can ensure this for a limited set of operations, and we
2280	/// don't want to do the rewrites unless we are confident that the result will
2281	/// be promotable, so we have an early test here.
2282	static VectorType isVectorPromotionViable(Partition &P, const* DataLayout &DL,
2283	unsigned VScale) {
2284	// Collect the candidate types for vector-based promotion. Also track whether
2285	// we have different element types.
2286	SmallVector<VectorType *, `4`> CandidateTys;
2287	SetVector<Type *> LoadStoreTys;
2288	SetVector<Type *> DeferredTys;
2289	Type CommonEltTy = nullptr*;
2290	VectorType CommonVecPtrTy = nullptr*;
2291	bool HaveVecPtrTy = false;
2292	bool HaveCommonEltTy = true;
2293	bool HaveCommonVecPtrTy = true;
2294	auto CheckCandidateType = [&](Type *Ty) {
2295	if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) {
2296	// Return if bitcast to vectors is different for total size in bits.
2297	if (!CandidateTys.empty()) {
2298	VectorType *V = CandidateTys [`0`];
2299	if (DL.getTypeSizeInBits(Ty: VTy).getFixedValue() !=
2300	DL.getTypeSizeInBits(Ty: V).getFixedValue()) {
2301	CandidateTys.clear();
2302	return;
2303	}
2304	}
2305	CandidateTys.push_back(Elt: VTy);
2306	Type *EltTy = VTy->getElementType();
2307
2308	if (!CommonEltTy)
2309	CommonEltTy = EltTy;
2310	else if (CommonEltTy != EltTy)
2311	HaveCommonEltTy = false;
2312
2313	if (EltTy->isPointerTy()) {
2314	HaveVecPtrTy = true;
2315	if (!CommonVecPtrTy)
2316	CommonVecPtrTy = VTy;
2317	else if (CommonVecPtrTy != VTy)
2318	HaveCommonVecPtrTy = false;
2319	}
2320	}
2321	};
2322
2323	// Put load and store types into a set for de-duplication.
2324	for (const Slice &S : P) {
2325	Type *Ty;
2326	if (auto *LI = dyn_cast<LoadInst>(Val: S.getUse()->getUser()))
2327	Ty = LI->getType();
2328	else if (auto *SI = dyn_cast<StoreInst>(Val: S.getUse()->getUser()))
2329	Ty = SI->getValueOperand()->getType();
2330	else
2331	continue;
2332
2333	auto CandTy = Ty->getScalarType();
2334	if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() \|\|
2335	S.endOffset() != P.endOffset())) {
2336	DeferredTys.insert(X: Ty);
2337	continue;
2338	}
2339
2340	LoadStoreTys.insert(X: Ty);
2341	// Consider any loads or stores that are the exact size of the slice.
2342	if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
2343	CheckCandidateType (Ty);
2344	}
2345
2346	SmallVector<VectorType *, `4`> CandidateTysCopy = CandidateTys;
2347	if (auto *VTy = createAndCheckVectorTypesForPromotion(
2348	OtherTys&: LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
2349	CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2350	HaveCommonVecPtrTy, CommonVecPtrTy, VScale))
2351	return VTy;
2352
2353	CandidateTys.clear();
2354	return createAndCheckVectorTypesForPromotion(
2355	OtherTys&: DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
2356	HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
2357	CommonVecPtrTy, VScale);
2358	}
2359
2360	/// Test whether a slice of an alloca is valid for integer widening.
2361	///
2362	/// This implements the necessary checking for the \c isIntegerWideningViable
2363	/// test below on a single slice of the alloca.
2364	static bool isIntegerWideningViableForSlice(const Slice &S,
2365	uint64_t AllocBeginOffset,
2366	Type *AllocaTy,
2367	const DataLayout &DL,
2368	bool &WholeAllocaOp) {
2369	uint64_t Size = DL.getTypeStoreSize(Ty: AllocaTy).getFixedValue();
2370
2371	uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
2372	uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
2373
2374	Use *U = S.getUse();
2375
2376	// Lifetime intrinsics operate over the whole alloca whose sizes are usually
2377	// larger than other load/store slices (RelEnd > Size). But lifetime are
2378	// always promotable and should not impact other slices' promotability of the
2379	// partition.
2380	if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: U->getUser())) {
2381	if (II->isLifetimeStartOrEnd() \|\| II->isDroppable())
2382	return true;
2383	}
2384
2385	// We can't reasonably handle cases where the load or store extends past
2386	// the end of the alloca's type and into its padding.
2387	if (RelEnd > Size)
2388	return false;
2389
2390	if (LoadInst *LI = dyn_cast<LoadInst>(Val: U->getUser())) {
2391	if (LI->isVolatile())
2392	return false;
2393	// We can't handle loads that extend past the allocated memory.
2394	TypeSize LoadSize = DL.getTypeStoreSize(Ty: LI->getType());
2395	if (!LoadSize.isFixed() \|\| LoadSize.getFixedValue() > Size)
2396	return false;
2397	// So far, AllocaSliceRewriter does not support widening split slice tails
2398	// in rewriteIntegerLoad.
2399	if (S.beginOffset() < AllocBeginOffset)
2400	return false;
2401	// Note that we don't count vector loads or stores as whole-alloca
2402	// operations which enable integer widening because we would prefer to use
2403	// vector widening instead.
2404	if (!isa<VectorType>(Val: LI->getType()) && RelBegin == `0` && RelEnd == Size)
2405	WholeAllocaOp = true;
2406	if (IntegerType *ITy = dyn_cast<IntegerType>(Val: LI->getType())) {
2407	if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(Ty: ITy).getFixedValue())
2408	return false;
2409	} else if (RelBegin != `0` \|\| RelEnd != Size \|\|
2410	!canConvertValue(DL, OldTy: AllocaTy, NewTy: LI->getType())) {
2411	// Non-integer loads need to be convertible from the alloca type so that
2412	// they are promotable.
2413	return false;
2414	}
2415	} else if (StoreInst *SI = dyn_cast<StoreInst>(Val: U->getUser())) {
2416	Type *ValueTy = SI->getValueOperand()->getType();
2417	if (SI->isVolatile())
2418	return false;
2419	// We can't handle stores that extend past the allocated memory.
2420	TypeSize StoreSize = DL.getTypeStoreSize(Ty: ValueTy);
2421	if (!StoreSize.isFixed() \|\| StoreSize.getFixedValue() > Size)
2422	return false;
2423	// So far, AllocaSliceRewriter does not support widening split slice tails
2424	// in rewriteIntegerStore.
2425	if (S.beginOffset() < AllocBeginOffset)
2426	return false;
2427	// Note that we don't count vector loads or stores as whole-alloca
2428	// operations which enable integer widening because we would prefer to use
2429	// vector widening instead.
2430	if (!isa<VectorType>(Val: ValueTy) && RelBegin == `0` && RelEnd == Size)
2431	WholeAllocaOp = true;
2432	if (IntegerType *ITy = dyn_cast<IntegerType>(Val: ValueTy)) {
2433	if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(Ty: ITy).getFixedValue())
2434	return false;
2435	} else if (RelBegin != `0` \|\| RelEnd != Size \|\|
2436	!canConvertValue(DL, OldTy: ValueTy, NewTy: AllocaTy)) {
2437	// Non-integer stores need to be convertible to the alloca type so that
2438	// they are promotable.
2439	return false;
2440	}
2441	} else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: U->getUser())) {
2442	if (MI->isVolatile() \|\| !isa<Constant>(Val: MI->getLength()))
2443	return false;
2444	if (!S.isSplittable())
2445	return false; // Skip any unsplittable intrinsics.
2446	} else {
2447	return false;
2448	}
2449
2450	return true;
2451	}
2452
2453	/// Test whether the given alloca partition's integer operations can be
2454	/// widened to promotable ones.
2455	///
2456	/// This is a quick test to check whether we can rewrite the integer loads and
2457	/// stores to a particular alloca into wider loads and stores and be able to
2458	/// promote the resulting alloca.
2459	static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2460	const DataLayout &DL) {
2461	uint64_t SizeInBits = DL.getTypeSizeInBits(Ty: AllocaTy).getFixedValue();
2462	// Don't create integer types larger than the maximum bitwidth.
2463	if (SizeInBits > IntegerType::MAX_INT_BITS)
2464	return false;
2465
2466	// Don't try to handle allocas with bit-padding.
2467	if (SizeInBits != DL.getTypeStoreSizeInBits(Ty: AllocaTy).getFixedValue())
2468	return false;
2469
2470	// We need to ensure that an integer type with the appropriate bitwidth can
2471	// be converted to the alloca type, whatever that is. We don't want to force
2472	// the alloca itself to have an integer type if there is a more suitable one.
2473	Type *IntTy = Type::getIntNTy(C&: AllocaTy->getContext(), N: SizeInBits);
2474	if (!canConvertValue(DL, OldTy: AllocaTy, NewTy: IntTy) \|\|
2475	!canConvertValue(DL, OldTy: IntTy, NewTy: AllocaTy))
2476	return false;
2477
2478	// While examining uses, we ensure that the alloca has a covering load or
2479	// store. We don't want to widen the integer operations only to fail to
2480	// promote due to some other unsplittable entry (which we may make splittable
2481	// later). However, if there are only splittable uses, go ahead and assume
2482	// that we cover the alloca.
2483	// FIXME: We shouldn't consider split slices that happen to start in the
2484	// partition here...
2485	bool WholeAllocaOp = P.empty() && DL.isLegalInteger(Width: SizeInBits);
2486
2487	for (const Slice &S : P)
2488	if (!isIntegerWideningViableForSlice(S, AllocBeginOffset: P.beginOffset(), AllocaTy, DL,
2489	WholeAllocaOp))
2490	return false;
2491
2492	for (const Slice *S : P.splitSliceTails())
2493	if (!isIntegerWideningViableForSlice(S: *S, AllocBeginOffset: P.beginOffset(), AllocaTy, DL,
2494	WholeAllocaOp))
2495	return false;
2496
2497	return WholeAllocaOp;
2498	}
2499
2500	static Value extractInteger(const* DataLayout &DL, IRBuilderTy &IRB, Value *V,
2501	IntegerType *Ty, uint64_t Offset,
2502	const Twine &Name) {
2503	LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2504	IntegerType *IntTy = cast<IntegerType>(Val: V->getType());
2505	assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2506	DL.getTypeStoreSize(IntTy).getFixedValue() &&
2507	"Element extends past full value");
2508	uint64_t ShAmt = `8` * Offset;
2509	if (DL.isBigEndian())
2510	ShAmt = `8` * (DL.getTypeStoreSize(Ty: IntTy).getFixedValue() -
2511	DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2512	if (ShAmt) {
2513	V = IRB.CreateLShr(LHS: V, RHS: ShAmt, Name: Name + ".shift");
2514	LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2515	}
2516	assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2517	"Cannot extract to a larger integer!");
2518	if (Ty != IntTy) {
2519	V = IRB.CreateTrunc(V, DestTy: Ty, Name: Name + ".trunc");
2520	LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
2521	}
2522	return V;
2523	}
2524
2525	static Value insertInteger(const* DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2526	Value V, uint64_t Offset, const* Twine &Name) {
2527	IntegerType *IntTy = cast<IntegerType>(Val: Old->getType());
2528	IntegerType *Ty = cast<IntegerType>(Val: V->getType());
2529	assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2530	"Cannot insert a larger integer!");
2531	LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2532	if (Ty != IntTy) {
2533	V = IRB.CreateZExt(V, DestTy: IntTy, Name: Name + ".ext");
2534	LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2535	}
2536	assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2537	DL.getTypeStoreSize(IntTy).getFixedValue() &&
2538	"Element store outside of alloca store");
2539	uint64_t ShAmt = `8` * Offset;
2540	if (DL.isBigEndian())
2541	ShAmt = `8` * (DL.getTypeStoreSize(Ty: IntTy).getFixedValue() -
2542	DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2543	if (ShAmt) {
2544	V = IRB.CreateShl(LHS: V, RHS: ShAmt, Name: Name + ".shift");
2545	LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2546	}
2547
2548	if (ShAmt \|\| Ty->getBitWidth() < IntTy->getBitWidth()) {
2549	APInt Mask = ~Ty->getMask().zext(width: IntTy->getBitWidth()).shl(shiftAmt: ShAmt);
2550	Old = IRB.CreateAnd(LHS: Old, RHS: Mask, Name: Name + ".mask");
2551	LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2552	V = IRB.CreateOr(LHS: Old, RHS: V, Name: Name + ".insert");
2553	LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2554	}
2555	return V;
2556	}
2557
2558	static Value extractVector(IRBuilderTy &IRB, Value V, unsigned BeginIndex,
2559	unsigned EndIndex, const Twine &Name) {
2560	auto *VecTy = cast<FixedVectorType>(Val: V->getType());
2561	unsigned NumElements = EndIndex - BeginIndex;
2562	assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2563
2564	if (NumElements == VecTy->getNumElements())
2565	return V;
2566
2567	if (NumElements == `1`) {
2568	V = IRB.CreateExtractElement(Vec: V, Idx: IRB.getInt32(C: BeginIndex),
2569	Name: Name + ".extract");
2570	LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2571	return V;
2572	}
2573
2574	auto Mask = llvm::to_vector<`8`>(Range: llvm::seq<int>(Begin: BeginIndex, End: EndIndex));
2575	V = IRB.CreateShuffleVector(V, Mask, Name: Name + ".extract");
2576	LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2577	return V;
2578	}
2579
2580	static Value insertVector(IRBuilderTy &IRB, Value Old, Value *V,
2581	unsigned BeginIndex, const Twine &Name) {
2582	VectorType *VecTy = cast<VectorType>(Val: Old->getType());
2583	assert(VecTy && "Can only insert a vector into a vector");
2584
2585	VectorType *Ty = dyn_cast<VectorType>(Val: V->getType());
2586	if (!Ty) {
2587	// Single element to insert.
2588	V = IRB.CreateInsertElement(Vec: Old, NewElt: V, Idx: IRB.getInt32(C: BeginIndex),
2589	Name: Name + ".insert");
2590	LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2591	return V;
2592	}
2593
2594	unsigned NumSubElements = cast<FixedVectorType>(Val: Ty)->getNumElements();
2595	unsigned NumElements = cast<FixedVectorType>(Val: VecTy)->getNumElements();
2596
2597	assert(NumSubElements <= NumElements && "Too many elements!");
2598	if (NumSubElements == NumElements) {
2599	assert(V->getType() == VecTy && "Vector type mismatch");
2600	return V;
2601	}
2602	unsigned EndIndex = BeginIndex + NumSubElements;
2603
2604	// When inserting a smaller vector into the larger to store, we first
2605	// use a shuffle vector to widen it with undef elements, and then
2606	// a second shuffle vector to select between the loaded vector and the
2607	// incoming vector.
2608	SmallVector<int, `8`> Mask;
2609	Mask.reserve(N: NumElements);
2610	for (unsigned Idx = `0`; Idx != NumElements; ++Idx)
2611	if (Idx >= BeginIndex && Idx < EndIndex)
2612	Mask.push_back(Elt: Idx - BeginIndex);
2613	else
2614	Mask.push_back(Elt: -`1`);
2615	V = IRB.CreateShuffleVector(V, Mask, Name: Name + ".expand");
2616	LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2617
2618	Mask.clear();
2619	for (unsigned Idx = `0`; Idx != NumElements; ++Idx)
2620	if (Idx >= BeginIndex && Idx < EndIndex)
2621	Mask.push_back(Elt: Idx);
2622	else
2623	Mask.push_back(Elt: Idx + NumElements);
2624	V = IRB.CreateShuffleVector(V1: V, V2: Old, Mask, Name: Name + "blend");
2625	LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2626	return V;
2627	}
2628
2629	/// This function takes two vector values and combines them into a single vector
2630	/// by concatenating their elements. The function handles:
2631	///
2632	/// 1. Element type mismatch: If either vector's element type differs from
2633	/// NewAIEltType, the function bitcasts the vector to use NewAIEltType while
2634	/// preserving the total bit width (adjusting the number of elements
2635	/// accordingly).
2636	///
2637	/// 2. Size mismatch: After transforming the vectors to have the desired element
2638	/// type, if the two vectors have different numbers of elements, the smaller
2639	/// vector is extended with poison values to match the size of the larger
2640	/// vector before concatenation.
2641	///
2642	/// 3. Concatenation: The vectors are merged using a shuffle operation that
2643	/// places all elements of V0 first, followed by all elements of V1.
2644	///
2645	/// \param V0 The first vector to merge (must be a vector type)
2646	/// \param V1 The second vector to merge (must be a vector type)
2647	/// \param DL The data layout for size calculations
2648	/// \param NewAIEltTy The desired element type for the result vector
2649	/// \param Builder IRBuilder for creating new instructions
2650	/// \return A new vector containing all elements from V0 followed by all
2651	/// elements from V1
2652	static Value mergeTwoVectors(Value V0, Value V1, const* DataLayout &DL,
2653	Type *NewAIEltTy, IRBuilder<> &Builder) {
2654	// V0 and V1 are vectors
2655	// Create a new vector type with combined elements
2656	// Use ShuffleVector to concatenate the vectors
2657	auto *VecType0 = cast<FixedVectorType>(Val: V0->getType());
2658	auto *VecType1 = cast<FixedVectorType>(Val: V1->getType());
2659
2660	// If V0/V1 element types are different from NewAllocaElementType,
2661	// we need to introduce bitcasts before merging them
2662	auto BitcastIfNeeded = [&](Value &V, FixedVectorType &VecType,
2663	const char *DebugName) {
2664	Type *EltType = VecType->getElementType();
2665	if (EltType != NewAIEltTy) {
2666	// Calculate new number of elements to maintain same bit width
2667	unsigned TotalBits =
2668	VecType->getNumElements() * DL.getTypeSizeInBits(Ty: EltType);
2669	unsigned NewNumElts = TotalBits / DL.getTypeSizeInBits(Ty: NewAIEltTy);
2670
2671	auto *NewVecType = FixedVectorType::get(ElementType: NewAIEltTy, NumElts: NewNumElts);
2672	V = Builder.CreateBitCast(V, DestTy: NewVecType);
2673	VecType = NewVecType;
2674	LLVM_DEBUG(dbgs() << " bitcast " << DebugName << ": " << *V << "\n");
2675	}
2676	};
2677
2678	BitcastIfNeeded (V0, VecType0, "V0");
2679	BitcastIfNeeded (V1, VecType1, "V1");
2680
2681	unsigned NumElts0 = VecType0->getNumElements();
2682	unsigned NumElts1 = VecType1->getNumElements();
2683
2684	SmallVector<int, `16`> ShuffleMask;
2685
2686	if (NumElts0 == NumElts1) {
2687	for (unsigned i = `0`; i < NumElts0 + NumElts1; ++i)
2688	ShuffleMask.push_back(Elt: i);
2689	} else {
2690	// If two vectors have different sizes, we need to extend
2691	// the smaller vector to the size of the larger vector.
2692	unsigned SmallSize = std::min(a: NumElts0, b: NumElts1);
2693	unsigned LargeSize = std::max(a: NumElts0, b: NumElts1);
2694	bool IsV0Smaller = NumElts0 < NumElts1;
2695	Value *&ExtendedVec = IsV0Smaller ? V0 : V1;
2696	SmallVector<int, `16`> ExtendMask;
2697	for (unsigned i = `0`; i < SmallSize; ++i)
2698	ExtendMask.push_back(Elt: i);
2699	for (unsigned i = SmallSize; i < LargeSize; ++i)
2700	ExtendMask.push_back(Elt: PoisonMaskElem);
2701	ExtendedVec = Builder.CreateShuffleVector(
2702	V1: ExtendedVec, V2: PoisonValue::get(T: ExtendedVec->getType()), Mask: ExtendMask);
2703	LLVM_DEBUG(dbgs() << " shufflevector: " << *ExtendedVec << "\n");
2704	for (unsigned i = `0`; i < NumElts0; ++i)
2705	ShuffleMask.push_back(Elt: i);
2706	for (unsigned i = `0`; i < NumElts1; ++i)
2707	ShuffleMask.push_back(Elt: LargeSize + i);
2708	}
2709
2710	return Builder.CreateShuffleVector(V1: V0, V2: V1, Mask: ShuffleMask);
2711	}
2712
2713	namespace {
2714
2715	/// Visitor to rewrite instructions using p particular slice of an alloca
2716	/// to use a new alloca.
2717	///
2718	/// Also implements the rewriting to vector-based accesses when the partition
2719	/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2720	/// lives here.
2721	class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
2722	// Befriend the base class so it can delegate to private visit methods.
2723	friend class InstVisitor<AllocaSliceRewriter, bool>;
2724
2725	using Base = InstVisitor<AllocaSliceRewriter, bool>;
2726
2727	const DataLayout &DL;
2728	AllocaSlices &AS;
2729	SROA &Pass;
2730	AllocaInst &OldAI, &NewAI;
2731	const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2732	Type *NewAllocaTy;
2733
2734	// This is a convenience and flag variable that will be null unless the new
2735	// alloca's integer operations should be widened to this integer type due to
2736	// passing isIntegerWideningViable above. If it is non-null, the desired
2737	// integer type will be stored here for easy access during rewriting.
2738	IntegerType *IntTy;
2739
2740	// If we are rewriting an alloca partition which can be written as pure
2741	// vector operations, we stash extra information here. When VecTy is
2742	// non-null, we have some strict guarantees about the rewritten alloca:
2743	// - The new alloca is exactly the size of the vector type here.
2744	// - The accesses all either map to the entire vector or to a single
2745	// element.
2746	// - The set of accessing instructions is only one of those handled above
2747	// in isVectorPromotionViable. Generally these are the same access kinds
2748	// which are promotable via mem2reg.
2749	VectorType *VecTy;
2750	Type *ElementTy;
2751	uint64_t ElementSize;
2752
2753	// The original offset of the slice currently being rewritten relative to
2754	// the original alloca.
2755	uint64_t BeginOffset = `0`;
2756	uint64_t EndOffset = `0`;
2757
2758	// The new offsets of the slice currently being rewritten relative to the
2759	// original alloca.
2760	uint64_t NewBeginOffset = `0`, NewEndOffset = `0`;
2761
2762	uint64_t SliceSize = `0`;
2763	bool IsSplittable = false;
2764	bool IsSplit = false;
2765	Use OldUse = nullptr*;
2766	Instruction OldPtr = nullptr*;
2767
2768	// Track post-rewrite users which are PHI nodes and Selects.
2769	SmallSetVector<PHINode *, `8`> &PHIUsers;
2770	SmallSetVector<SelectInst *, `8`> &SelectUsers;
2771
2772	// Utility IR builder, whose name prefix is setup for each visited use, and
2773	// the insertion point is set to point to the user.
2774	IRBuilderTy IRB;
2775
2776	// Return the new alloca, addrspacecasted if required to avoid changing the
2777	// addrspace of a volatile access.
2778	Value getPtrToNewAI(unsigned* AddrSpace, bool IsVolatile) {
2779	if (!IsVolatile \|\| AddrSpace == NewAI.getType()->getPointerAddressSpace())
2780	return &NewAI;
2781
2782	Type *AccessTy = IRB.getPtrTy(AddrSpace);
2783	return IRB.CreateAddrSpaceCast(V: &NewAI, DestTy: AccessTy);
2784	}
2785
2786	public:
2787	AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
2788	AllocaInst &OldAI, AllocaInst &NewAI, Type *NewAllocaTy,
2789	uint64_t NewAllocaBeginOffset,
2790	uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2791	VectorType *PromotableVecTy,
2792	SmallSetVector<PHINode *, `8`> &PHIUsers,
2793	SmallSetVector<SelectInst *, `8`> &SelectUsers)
2794	: DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2795	NewAllocaBeginOffset(NewAllocaBeginOffset),
2796	NewAllocaEndOffset(NewAllocaEndOffset), NewAllocaTy(NewAllocaTy),
2797	IntTy(IsIntegerPromotable
2798	? Type::getIntNTy(
2799	C&: NewAI.getContext(),
2800	N: DL.getTypeSizeInBits(Ty: NewAllocaTy).getFixedValue())
2801	: nullptr),
2802	VecTy(PromotableVecTy),
2803	ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2804	ElementSize(VecTy ? DL.getTypeSizeInBits(Ty: ElementTy).getFixedValue() / `8`
2805	: `0`),
2806	PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2807	IRB (NewAI.getContext(), ConstantFolder ()) {
2808	if (VecTy) {
2809	assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % `8`) == `0` &&
2810	"Only multiple-of-8 sized vector elements are viable");
2811	++NumVectorized;
2812	}
2813	assert((!IntTy && !VecTy) \|\| (IntTy && !VecTy) \|\| (!IntTy && VecTy));
2814	}
2815
2816	bool visit(AllocaSlices::const_iterator I) {
2817	bool CanSROA = true;
2818	BeginOffset = I->beginOffset();
2819	EndOffset = I->endOffset();
2820	IsSplittable = I->isSplittable();
2821	IsSplit =
2822	BeginOffset < NewAllocaBeginOffset \|\| EndOffset > NewAllocaEndOffset;
2823	LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
2824	LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2825	LLVM_DEBUG(dbgs() << "\n");
2826
2827	// Compute the intersecting offset range.
2828	assert(BeginOffset < NewAllocaEndOffset);
2829	assert(EndOffset > NewAllocaBeginOffset);
2830	NewBeginOffset = std::max(a: BeginOffset, b: NewAllocaBeginOffset);
2831	NewEndOffset = std::min(a: EndOffset, b: NewAllocaEndOffset);
2832
2833	SliceSize = NewEndOffset - NewBeginOffset;
2834	LLVM_DEBUG(dbgs() << " Begin:(" << BeginOffset << ", " << EndOffset
2835	<< ") NewBegin:(" << NewBeginOffset << ", "
2836	<< NewEndOffset << ") NewAllocaBegin:("
2837	<< NewAllocaBeginOffset << ", " << NewAllocaEndOffset
2838	<< ")\n");
2839	assert(IsSplit \|\| NewBeginOffset == BeginOffset);
2840	OldUse = I->getUse();
2841	OldPtr = cast<Instruction>(Val: OldUse->get());
2842
2843	Instruction *OldUserI = cast<Instruction>(Val: OldUse->getUser());
2844	IRB.SetInsertPoint(OldUserI);
2845	IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2846	IRB.getInserter().SetNamePrefix(Twine (NewAI.getName()) + "." +
2847	Twine (BeginOffset) + ".");
2848
2849	CanSROA &= visit(I: cast<Instruction>(Val: OldUse->getUser()));
2850	if (VecTy \|\| IntTy)
2851	assert(CanSROA);
2852	return CanSROA;
2853	}
2854
2855	/// Attempts to rewrite a partition using tree-structured merge optimization.
2856	///
2857	/// This function analyzes a partition to determine if it can be optimized
2858	/// using a tree-structured merge pattern, where multiple non-overlapping
2859	/// stores completely fill an alloca. And there is no load from the alloca in
2860	/// the middle of the stores. Such patterns can be optimized by eliminating
2861	/// the intermediate stores and directly constructing the final vector by
2862	/// using shufflevectors.
2863	///
2864	/// Example transformation:
2865	/// Before: (stores do not have to be in order)
2866	/// %alloca = alloca <8 x float>
2867	/// store <2 x float> %val0, ptr %alloca ; offset 0-1
2868	/// store <2 x float> %val2, ptr %alloca+16 ; offset 4-5
2869	/// store <2 x float> %val1, ptr %alloca+8 ; offset 2-3
2870	/// store <2 x float> %val3, ptr %alloca+24 ; offset 6-7
2871	///
2872	/// After:
2873	/// %alloca = alloca <8 x float>
2874	/// %shuffle0 = shufflevector %val0, %val1, <4 x i32> <i32 0, i32 1, i32 2,
2875	/// i32 3>
2876	/// %shuffle1 = shufflevector %val2, %val3, <4 x i32> <i32 0, i32 1, i32 2,
2877	/// i32 3>
2878	/// %shuffle2 = shufflevector %shuffle0, %shuffle1, <8 x i32> <i32 0, i32 1,
2879	/// i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2880	/// store %shuffle2, ptr %alloca
2881	///
2882	/// The optimization looks for partitions that:
2883	/// 1. Have no overlapping split slice tails
2884	/// 2. Contain non-overlapping stores that cover the entire alloca
2885	/// 3. Have exactly one load that reads the complete alloca structure and not
2886	/// in the middle of the stores (TODO: maybe we can relax the constraint
2887	/// about reading the entire alloca structure)
2888	///
2889	/// \param P The partition to analyze and potentially rewrite
2890	/// \return An optional vector of values that were deleted during the rewrite
2891	/// process, or std::nullopt if the partition cannot be optimized
2892	/// using tree-structured merge
2893	std::optional<SmallVector<Value *, `4`>>
2894	rewriteTreeStructuredMerge(Partition &P) {
2895	// No tail slices that overlap with the partition
2896	if (P.splitSliceTails().size() > `0`)
2897	return std::nullopt;
2898
2899	SmallVector<Value *, `4`> DeletedValues;
2900	LoadInst TheLoad = nullptr*;
2901
2902	// Structure to hold store information
2903	struct StoreInfo {
2904	StoreInst *Store;
2905	uint64_t BeginOffset;
2906	uint64_t EndOffset;
2907	Value *StoredValue;
2908	StoreInfo(StoreInst SI, uint64_t Begin, uint64_t End, Value Val)
2909	: Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val) {}
2910	};
2911
2912	SmallVector<StoreInfo, `4`> StoreInfos;
2913
2914	// If the new alloca is a fixed vector type, we use its element type as the
2915	// allocated element type, otherwise we use i8 as the allocated element
2916	Type *AllocatedEltTy =
2917	isa<FixedVectorType>(Val: NewAllocaTy)
2918	? cast<FixedVectorType>(Val: NewAllocaTy)->getElementType()
2919	: Type::getInt8Ty(C&: NewAI.getContext());
2920	unsigned AllocatedEltTySize = DL.getTypeSizeInBits(Ty: AllocatedEltTy);
2921
2922	// Helper to check if a type is
2923	// 1. A fixed vector type
2924	// 2. The element type is not a pointer
2925	// 3. The element type size is byte-aligned
2926	// We only handle the cases that the ld/st meet these conditions
2927	auto IsTypeValidForTreeStructuredMerge = [&](Type Ty) -> bool* {
2928	auto *FixedVecTy = dyn_cast<FixedVectorType>(Val: Ty);
2929	return FixedVecTy &&
2930	DL.getTypeSizeInBits(Ty: FixedVecTy->getElementType()) % `8` == `0` &&
2931	!FixedVecTy->getElementType()->isPointerTy();
2932	};
2933
2934	for (Slice &S : P) {
2935	auto *User = cast<Instruction>(Val: S.getUse()->getUser());
2936	if (auto *LI = dyn_cast<LoadInst>(Val: User)) {
2937	// Do not handle the case if
2938	// 1. There is more than one load
2939	// 2. The load is volatile
2940	// 3. The load does not read the entire alloca structure
2941	// 4. The load does not meet the conditions in the helper function
2942	if (TheLoad \|\| !IsTypeValidForTreeStructuredMerge(LI->getType()) \|\|
2943	S.beginOffset() != NewAllocaBeginOffset \|\|
2944	S.endOffset() != NewAllocaEndOffset \|\| LI->isVolatile())
2945	return std::nullopt;
2946	TheLoad = LI;
2947	} else if (auto *SI = dyn_cast<StoreInst>(Val: User)) {
2948	// Do not handle the case if
2949	// 1. The store does not meet the conditions in the helper function
2950	// 2. The store is volatile
2951	// 3. The total store size is not a multiple of the allocated element
2952	// type size
2953	if (!IsTypeValidForTreeStructuredMerge(
2954	SI->getValueOperand()->getType()) \|\|
2955	SI->isVolatile())
2956	return std::nullopt;
2957	auto *VecTy = cast<FixedVectorType>(Val: SI->getValueOperand()->getType());
2958	unsigned NumElts = VecTy->getNumElements();
2959	unsigned EltSize = DL.getTypeSizeInBits(Ty: VecTy->getElementType());
2960	if (NumElts * EltSize % AllocatedEltTySize != `0`)
2961	return std::nullopt;
2962	StoreInfos.emplace_back(Args&: SI, Args: S.beginOffset(), Args: S.endOffset(),
2963	Args: SI->getValueOperand());
2964	} else {
2965	// If we have instructions other than load and store, we cannot do the
2966	// tree structured merge
2967	return std::nullopt;
2968	}
2969	}
2970	// If we do not have any load, we cannot do the tree structured merge
2971	if (!TheLoad)
2972	return std::nullopt;
2973
2974	// If we do not have multiple stores, we cannot do the tree structured merge
2975	if (StoreInfos.size() < `2`)
2976	return std::nullopt;
2977
2978	// Stores should not overlap and should cover the whole alloca
2979	// Sort by begin offset
2980	llvm::sort(C&: StoreInfos, Comp: [](const StoreInfo &A, const StoreInfo &B) {
2981	return A.BeginOffset < B.BeginOffset;
2982	});
2983
2984	// Check for overlaps and coverage
2985	uint64_t ExpectedStart = NewAllocaBeginOffset;
2986	for (auto &StoreInfo : StoreInfos) {
2987	uint64_t BeginOff = StoreInfo.BeginOffset;
2988	uint64_t EndOff = StoreInfo.EndOffset;
2989
2990	// Check for gap or overlap
2991	if (BeginOff != ExpectedStart)
2992	return std::nullopt;
2993
2994	ExpectedStart = EndOff;
2995	}
2996	// Check that stores cover the entire alloca
2997	if (ExpectedStart != NewAllocaEndOffset)
2998	return std::nullopt;
2999
3000	// Stores should be in the same basic block
3001	// The load should not be in the middle of the stores
3002	// Note:
3003	// If the load is in a different basic block with the stores, we can still
3004	// do the tree structured merge. This is because we do not have the
3005	// store->load forwarding here. The merged vector will be stored back to
3006	// NewAI and the new load will load from NewAI. The forwarding will be
3007	// handled later when we try to promote NewAI.
3008	BasicBlock *LoadBB = TheLoad->getParent();
3009	BasicBlock *StoreBB = StoreInfos [`0`].Store->getParent();
3010
3011	for (auto &StoreInfo : StoreInfos) {
3012	if (StoreInfo.Store->getParent() != StoreBB)
3013	return std::nullopt;
3014	if (LoadBB == StoreBB && !StoreInfo.Store->comesBefore(Other: TheLoad))
3015	return std::nullopt;
3016	}
3017
3018	// If we reach here, the partition can be merged with a tree structured
3019	// merge
3020	LLVM_DEBUG({
3021	dbgs() << "Tree structured merge rewrite:\n Load: " << *TheLoad
3022	<< "\n Ordered stores:\n";
3023	for (auto [i, Info] : enumerate(StoreInfos))
3024	dbgs() << " [" << i << "] Range[" << Info.BeginOffset << ", "
3025	<< Info.EndOffset << ") \tStore: " << *Info.Store
3026	<< "\tValue: " << *Info.StoredValue << "\n";
3027	});
3028
3029	// Instead of having these stores, we merge all the stored values into a
3030	// vector and store the merged value into the alloca
3031	std::queue<Value *> VecElements;
3032	IRBuilder<> Builder(StoreInfos.back().Store);
3033	for (const auto &Info : StoreInfos) {
3034	DeletedValues.push_back(Elt: Info.Store);
3035	VecElements.push(x: Info.StoredValue);
3036	}
3037
3038	LLVM_DEBUG(dbgs() << " Rewrite stores into shufflevectors:\n");
3039	while (VecElements.size() > `1`) {
3040	const auto NumElts = VecElements.size();
3041	for ([[maybe_unused]] const auto _ : llvm::seq(Size: NumElts / `2`)) {
3042	Value *V0 = VecElements.front();
3043	VecElements.pop();
3044	Value *V1 = VecElements.front();
3045	VecElements.pop();
3046	Value *Merged = mergeTwoVectors(V0, V1, DL, NewAIEltTy: AllocatedEltTy, Builder);
3047	LLVM_DEBUG(dbgs() << " shufflevector: " << *Merged << "\n");
3048	VecElements.push(x: Merged);
3049	}
3050	if (NumElts % `2` == `1`) {
3051	Value *V = VecElements.front();
3052	VecElements.pop();
3053	VecElements.push(x: V);
3054	}
3055	}
3056
3057	// Store the merged value into the alloca
3058	Value *MergedValue = VecElements.front();
3059	Builder.CreateAlignedStore(Val: MergedValue, Ptr: &NewAI, Align: getSliceAlign());
3060
3061	IRBuilder<> LoadBuilder(TheLoad);
3062	TheLoad->replaceAllUsesWith(V: LoadBuilder.CreateAlignedLoad(
3063	Ty: TheLoad->getType(), Ptr: &NewAI, Align: getSliceAlign(), isVolatile: TheLoad->isVolatile(),
3064	Name: TheLoad->getName() + ".sroa.new.load"));
3065	DeletedValues.push_back(Elt: TheLoad);
3066
3067	return DeletedValues;
3068	}
3069
3070	private:
3071	// Make sure the other visit overloads are visible.
3072	using Base::visit;
3073
3074	// Every instruction which can end up as a user must have a rewrite rule.
3075	bool visitInstruction(Instruction &I) {
3076	LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
3077	llvm_unreachable("No rewrite rule for this instruction!");
3078	}
3079
3080	Value getNewAllocaSlicePtr(IRBuilderTy &IRB, Type PointerTy) {
3081	// Note that the offset computation can use BeginOffset or NewBeginOffset
3082	// interchangeably for unsplit slices.
3083	assert(IsSplit \|\| BeginOffset == NewBeginOffset);
3084	uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3085
3086	StringRef OldName = OldPtr->getName();
3087	// Skip through the last '.sroa.' component of the name.
3088	size_t LastSROAPrefix = OldName.rfind(Str: ".sroa.");
3089	if (LastSROAPrefix != StringRef::npos) {
3090	OldName = OldName.substr(Start: LastSROAPrefix + strlen(s: ".sroa."));
3091	// Look for an SROA slice index.
3092	size_t IndexEnd = OldName.find_first_not_of(Chars: "0123456789");
3093	if (IndexEnd != StringRef::npos && OldName [IndexEnd] == `'.'`) {
3094	// Strip the index and look for the offset.
3095	OldName = OldName.substr(Start: IndexEnd + `1`);
3096	size_t OffsetEnd = OldName.find_first_not_of(Chars: "0123456789");
3097	if (OffsetEnd != StringRef::npos && OldName [OffsetEnd] == `'.'`)
3098	// Strip the offset.
3099	OldName = OldName.substr(Start: OffsetEnd + `1`);
3100	}
3101	}
3102	// Strip any SROA suffixes as well.
3103	OldName = OldName.substr(Start: `0`, N: OldName.find(Str: ".sroa_"));
3104
3105	return getAdjustedPtr(IRB, DL, Ptr: &NewAI,
3106	Offset: APInt (DL.getIndexTypeSizeInBits(Ty: PointerTy), Offset),
3107	PointerTy, NamePrefix: Twine (OldName) + ".");
3108	}
3109
3110	/// Compute suitable alignment to access this slice of the new
3111	/// alloca.
3112	///
3113	/// You can optionally pass a type to this routine and if that type's ABI
3114	/// alignment is itself suitable, this will return zero.
3115	Align getSliceAlign() {
3116	return commonAlignment(A: NewAI.getAlign(),
3117	Offset: NewBeginOffset - NewAllocaBeginOffset);
3118	}
3119
3120	unsigned getIndex(uint64_t Offset) {
3121	assert(VecTy && "Can only call getIndex when rewriting a vector");
3122	uint64_t RelOffset = Offset - NewAllocaBeginOffset;
3123	assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
3124	uint32_t Index = RelOffset / ElementSize;
3125	assert(Index * ElementSize == RelOffset);
3126	return Index;
3127	}
3128
3129	void deleteIfTriviallyDead(Value *V) {
3130	Instruction *I = cast<Instruction>(Val: V);
3131	if (isInstructionTriviallyDead(I))
3132	Pass.DeadInsts.push_back(Elt: I);
3133	}
3134
3135	Value *rewriteVectorizedLoadInst(LoadInst &LI) {
3136	unsigned BeginIndex = getIndex(Offset: NewBeginOffset);
3137	unsigned EndIndex = getIndex(Offset: NewEndOffset);
3138	assert(EndIndex > BeginIndex && "Empty vector!");
3139
3140	LoadInst *Load =
3141	IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(), Name: "load");
3142
3143	Load->copyMetadata(SrcInst: LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3144	LLVMContext::MD_access_group});
3145	return extractVector(IRB, V: Load, BeginIndex, EndIndex, Name: "vec");
3146	}
3147
3148	Value *rewriteIntegerLoad(LoadInst &LI) {
3149	assert(IntTy && "We cannot insert an integer to the alloca");
3150	assert(!LI.isVolatile());
3151	Value *V =
3152	IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(), Name: "load");
3153	V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: IntTy);
3154	assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3155	uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3156	if (Offset > `0` \|\| NewEndOffset < NewAllocaEndOffset) {
3157	IntegerType ExtractTy = Type::getIntNTy(C&: LI.getContext(), N: SliceSize `8`);
3158	V = extractInteger(DL, IRB, V, Ty: ExtractTy, Offset, Name: "extract");
3159	}
3160	// It is possible that the extracted type is not the load type. This
3161	// happens if there is a load past the end of the alloca, and as
3162	// a consequence the slice is narrower but still a candidate for integer
3163	// lowering. To handle this case, we just zero extend the extracted
3164	// integer.
3165	assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * `8` &&
3166	"Can only handle an extract for an overly wide load");
3167	if (cast<IntegerType>(Val: LI.getType())->getBitWidth() > SliceSize * `8`)
3168	V = IRB.CreateZExt(V, DestTy: LI.getType());
3169	return V;
3170	}
3171
3172	bool visitLoadInst(LoadInst &LI) {
3173	LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
3174	Value *OldOp = LI.getOperand(i_nocapture: `0`);
3175	assert(OldOp == OldPtr);
3176
3177	AAMDNodes AATags = LI.getAAMetadata();
3178
3179	unsigned AS = LI.getPointerAddressSpace();
3180
3181	Type TargetTy = IsSplit ? Type::getIntNTy(C&: LI.getContext(), N: SliceSize `8`)
3182	: LI.getType();
3183	bool IsPtrAdjusted = false;
3184	Value *V;
3185	if (VecTy) {
3186	V = rewriteVectorizedLoadInst(LI);
3187	} else if (IntTy && LI.getType()->isIntegerTy()) {
3188	V = rewriteIntegerLoad(LI);
3189	} else if (NewBeginOffset == NewAllocaBeginOffset &&
3190	NewEndOffset == NewAllocaEndOffset &&
3191	(canConvertValue(DL, OldTy: NewAllocaTy, NewTy: TargetTy) \|\|
3192	(NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() &&
3193	DL.getTypeStoreSize(Ty: TargetTy).getFixedValue() > SliceSize &&
3194	!LI.isVolatile()))) {
3195	Value *NewPtr =
3196	getPtrToNewAI(AddrSpace: LI.getPointerAddressSpace(), IsVolatile: LI.isVolatile());
3197	LoadInst *NewLI = IRB.CreateAlignedLoad(
3198	Ty: NewAllocaTy, Ptr: NewPtr, Align: NewAI.getAlign(), isVolatile: LI.isVolatile(), Name: LI.getName());
3199	if (LI.isVolatile())
3200	NewLI->setAtomic(Ordering: LI.getOrdering(), SSID: LI.getSyncScopeID());
3201	if (NewLI->isAtomic())
3202	NewLI->setAlignment(LI.getAlign());
3203
3204	// Copy any metadata that is valid for the new load. This may require
3205	// conversion to a different kind of metadata, e.g. !nonnull might change
3206	// to !range or vice versa.
3207	copyMetadataForLoad(Dest&: *NewLI, Source: LI);
3208
3209	// Do this after copyMetadataForLoad() to preserve the TBAA shift.
3210	if (AATags)
3211	NewLI->setAAMetadata(AATags.adjustForAccess(
3212	Offset: NewBeginOffset - BeginOffset, AccessTy: NewLI->getType(), DL));
3213
3214	// Try to preserve nonnull metadata
3215	V = NewLI;
3216
3217	// If this is an integer load past the end of the slice (which means the
3218	// bytes outside the slice are undef or this load is dead) just forcibly
3219	// fix the integer size with correct handling of endianness.
3220	if (auto *AITy = dyn_cast<IntegerType>(Val: NewAllocaTy))
3221	if (auto *TITy = dyn_cast<IntegerType>(Val: TargetTy))
3222	if (AITy->getBitWidth() < TITy->getBitWidth()) {
3223	V = IRB.CreateZExt(V, DestTy: TITy, Name: "load.ext");
3224	if (DL.isBigEndian())
3225	V = IRB.CreateShl(LHS: V, RHS: TITy->getBitWidth() - AITy->getBitWidth(),
3226	Name: "endian_shift");
3227	}
3228	} else {
3229	Type *LTy = IRB.getPtrTy(AddrSpace: AS);
3230	LoadInst *NewLI =
3231	IRB.CreateAlignedLoad(Ty: TargetTy, Ptr: getNewAllocaSlicePtr(IRB, PointerTy: LTy),
3232	Align: getSliceAlign(), isVolatile: LI.isVolatile(), Name: LI.getName());
3233
3234	if (AATags)
3235	NewLI->setAAMetadata(AATags.adjustForAccess(
3236	Offset: NewBeginOffset - BeginOffset, AccessTy: NewLI->getType(), DL));
3237
3238	if (LI.isVolatile())
3239	NewLI->setAtomic(Ordering: LI.getOrdering(), SSID: LI.getSyncScopeID());
3240	NewLI->copyMetadata(SrcInst: LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3241	LLVMContext::MD_access_group});
3242
3243	V = NewLI;
3244	IsPtrAdjusted = true;
3245	}
3246	V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: TargetTy);
3247
3248	if (IsSplit) {
3249	assert(!LI.isVolatile());
3250	assert(LI.getType()->isIntegerTy() &&
3251	"Only integer type loads and stores are split");
3252	assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() &&
3253	"Split load isn't smaller than original load");
3254	assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
3255	"Non-byte-multiple bit width");
3256	// Move the insertion point just past the load so that we can refer to it.
3257	BasicBlock::iterator LIIt = std::next(x: LI.getIterator());
3258	// Ensure the insertion point comes before any debug-info immediately
3259	// after the load, so that variable values referring to the load are
3260	// dominated by it.
3261	LIIt.setHeadBit(true);
3262	IRB.SetInsertPoint(TheBB: LI.getParent(), IP: LIIt);
3263	// Create a placeholder value with the same type as LI to use as the
3264	// basis for the new value. This allows us to replace the uses of LI with
3265	// the computed value, and then replace the placeholder with LI, leaving
3266	// LI only used for this computation.
3267	Value *Placeholder =
3268	new LoadInst (LI.getType(), PoisonValue::get(T: IRB.getPtrTy(AddrSpace: AS)), "",
3269	false, Align (`1`));
3270	V = insertInteger(DL, IRB, Old: Placeholder, V, Offset: NewBeginOffset - BeginOffset,
3271	Name: "insert");
3272	LI.replaceAllUsesWith(V);
3273	Placeholder->replaceAllUsesWith(V: &LI);
3274	Placeholder->deleteValue();
3275	} else {
3276	LI.replaceAllUsesWith(V);
3277	}
3278
3279	Pass.DeadInsts.push_back(Elt: &LI);
3280	deleteIfTriviallyDead(V: OldOp);
3281	LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
3282	return !LI.isVolatile() && !IsPtrAdjusted;
3283	}
3284
3285	bool rewriteVectorizedStoreInst(Value V, StoreInst &SI, Value OldOp,
3286	AAMDNodes AATags) {
3287	// Capture V for the purpose of debug-info accounting once it's converted
3288	// to a vector store.
3289	Value *OrigV = V;
3290	if (V->getType() != VecTy) {
3291	unsigned BeginIndex = getIndex(Offset: NewBeginOffset);
3292	unsigned EndIndex = getIndex(Offset: NewEndOffset);
3293	assert(EndIndex > BeginIndex && "Empty vector!");
3294	unsigned NumElements = EndIndex - BeginIndex;
3295	assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3296	"Too many elements!");
3297	Type *SliceTy = (NumElements == `1`)
3298	? ElementTy
3299	: FixedVectorType::get(ElementType: ElementTy, NumElts: NumElements);
3300	if (V->getType() != SliceTy)
3301	V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: SliceTy);
3302
3303	// Mix in the existing elements.
3304	Value *Old =
3305	IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(), Name: "load");
3306	V = insertVector(IRB, Old, V, BeginIndex, Name: "vec");
3307	}
3308	StoreInst *Store = IRB.CreateAlignedStore(Val: V, Ptr: &NewAI, Align: NewAI.getAlign());
3309	Store->copyMetadata(SrcInst: SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3310	LLVMContext::MD_access_group});
3311	if (AATags)
3312	Store->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3313	AccessTy: V->getType(), DL));
3314	Pass.DeadInsts.push_back(Elt: &SI);
3315
3316	// NOTE: Careful to use OrigV rather than V.
3317	migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * `8`, SliceSizeInBits: SliceSize * `8`, OldInst: &SI,
3318	Inst: Store, Dest: Store->getPointerOperand(), Value: OrigV, DL);
3319	LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3320	return true;
3321	}
3322
3323	bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
3324	assert(IntTy && "We cannot extract an integer from the alloca");
3325	assert(!SI.isVolatile());
3326	if (DL.getTypeSizeInBits(Ty: V->getType()).getFixedValue() !=
3327	IntTy->getBitWidth()) {
3328	Value *Old = IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(),
3329	Name: "oldload");
3330	Old = IRB.CreateBitPreservingCastChain(DL, V: Old, NewTy: IntTy);
3331	assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3332	uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
3333	V = insertInteger(DL, IRB, Old, V: SI.getValueOperand(), Offset, Name: "insert");
3334	}
3335	V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: NewAllocaTy);
3336	StoreInst *Store = IRB.CreateAlignedStore(Val: V, Ptr: &NewAI, Align: NewAI.getAlign());
3337	Store->copyMetadata(SrcInst: SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3338	LLVMContext::MD_access_group});
3339	if (AATags)
3340	Store->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3341	AccessTy: V->getType(), DL));
3342
3343	migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * `8`, SliceSizeInBits: SliceSize * `8`, OldInst: &SI,
3344	Inst: Store, Dest: Store->getPointerOperand(),
3345	Value: Store->getValueOperand(), DL);
3346
3347	Pass.DeadInsts.push_back(Elt: &SI);
3348	LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3349	return true;
3350	}
3351
3352	bool visitStoreInst(StoreInst &SI) {
3353	LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3354	Value *OldOp = SI.getOperand(i_nocapture: `1`);
3355	assert(OldOp == OldPtr);
3356
3357	AAMDNodes AATags = SI.getAAMetadata();
3358	Value *V = SI.getValueOperand();
3359
3360	// Strip all inbounds GEPs and pointer casts to try to dig out any root
3361	// alloca that should be re-examined after promoting this alloca.
3362	if (V->getType()->isPointerTy())
3363	if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: V->stripInBoundsOffsets()))
3364	Pass.PostPromotionWorklist.insert(X: AI);
3365
3366	TypeSize StoreSize = DL.getTypeStoreSize(Ty: V->getType());
3367	if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) {
3368	assert(!SI.isVolatile());
3369	assert(V->getType()->isIntegerTy() &&
3370	"Only integer type loads and stores are split");
3371	assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
3372	"Non-byte-multiple bit width");
3373	IntegerType NarrowTy = Type::getIntNTy(C&: SI.getContext(), N: SliceSize `8`);
3374	V = extractInteger(DL, IRB, V, Ty: NarrowTy, Offset: NewBeginOffset - BeginOffset,
3375	Name: "extract");
3376	}
3377
3378	if (VecTy)
3379	return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
3380	if (IntTy && V->getType()->isIntegerTy())
3381	return rewriteIntegerStore(V, SI, AATags);
3382
3383	StoreInst *NewSI;
3384	if (NewBeginOffset == NewAllocaBeginOffset &&
3385	NewEndOffset == NewAllocaEndOffset &&
3386	canConvertValue(DL, OldTy: V->getType(), NewTy: NewAllocaTy)) {
3387	V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: NewAllocaTy);
3388	Value *NewPtr =
3389	getPtrToNewAI(AddrSpace: SI.getPointerAddressSpace(), IsVolatile: SI.isVolatile());
3390
3391	NewSI =
3392	IRB.CreateAlignedStore(Val: V, Ptr: NewPtr, Align: NewAI.getAlign(), isVolatile: SI.isVolatile());
3393	} else {
3394	unsigned AS = SI.getPointerAddressSpace();
3395	Value *NewPtr = getNewAllocaSlicePtr(IRB, PointerTy: IRB.getPtrTy(AddrSpace: AS));
3396	NewSI =
3397	IRB.CreateAlignedStore(Val: V, Ptr: NewPtr, Align: getSliceAlign(), isVolatile: SI.isVolatile());
3398	}
3399	NewSI->copyMetadata(SrcInst: SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3400	LLVMContext::MD_access_group});
3401	if (AATags)
3402	NewSI->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3403	AccessTy: V->getType(), DL));
3404	if (SI.isVolatile())
3405	NewSI->setAtomic(Ordering: SI.getOrdering(), SSID: SI.getSyncScopeID());
3406	if (NewSI->isAtomic())
3407	NewSI->setAlignment(SI.getAlign());
3408
3409	migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * `8`, SliceSizeInBits: SliceSize * `8`, OldInst: &SI,
3410	Inst: NewSI, Dest: NewSI->getPointerOperand(),
3411	Value: NewSI->getValueOperand(), DL);
3412
3413	Pass.DeadInsts.push_back(Elt: &SI);
3414	deleteIfTriviallyDead(V: OldOp);
3415
3416	LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
3417	return NewSI->getPointerOperand() == &NewAI &&
3418	NewSI->getValueOperand()->getType() == NewAllocaTy &&
3419	!SI.isVolatile();
3420	}
3421
3422	/// Compute an integer value from splatting an i8 across the given
3423	/// number of bytes.
3424	///
3425	/// Note that this routine assumes an i8 is a byte. If that isn't true, don't
3426	/// call this routine.
3427	/// FIXME: Heed the advice above.
3428	///
3429	/// \param V The i8 value to splat.
3430	/// \param Size The number of bytes in the output (assuming i8 is one byte)
3431	Value getIntegerSplat(Value V, unsigned Size) {
3432	assert(Size > `0` && "Expected a positive number of bytes.");
3433	IntegerType *VTy = cast<IntegerType>(Val: V->getType());
3434	assert(VTy->getBitWidth() == `8` && "Expected an i8 value for the byte");
3435	if (Size == `1`)
3436	return V;
3437
3438	Type SplatIntTy = Type::getIntNTy(C&: VTy->getContext(), N: Size `8`);
3439	V = IRB.CreateMul(
3440	LHS: IRB.CreateZExt(V, DestTy: SplatIntTy, Name: "zext"),
3441	RHS: IRB.CreateUDiv(LHS: Constant::getAllOnesValue(Ty: SplatIntTy),
3442	RHS: IRB.CreateZExt(V: Constant::getAllOnesValue(Ty: V->getType()),
3443	DestTy: SplatIntTy)),
3444	Name: "isplat");
3445	return V;
3446	}
3447
3448	/// Compute a vector splat for a given element value.
3449	Value getVectorSplat(Value V, unsigned NumElements) {
3450	V = IRB.CreateVectorSplat(NumElts: NumElements, V, Name: "vsplat");
3451	LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
3452	return V;
3453	}
3454
3455	bool visitMemSetInst(MemSetInst &II) {
3456	LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3457	assert(II.getRawDest() == OldPtr);
3458
3459	AAMDNodes AATags = II.getAAMetadata();
3460
3461	// If the memset has a variable size, it cannot be split, just adjust the
3462	// pointer to the new alloca.
3463	if (!isa<ConstantInt>(Val: II.getLength())) {
3464	assert(!IsSplit);
3465	assert(NewBeginOffset == BeginOffset);
3466	II.setDest(getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType()));
3467	II.setDestAlignment(getSliceAlign());
3468	// In theory we should call migrateDebugInfo here. However, we do not
3469	// emit dbg.assign intrinsics for mem intrinsics storing through non-
3470	// constant geps, or storing a variable number of bytes.
3471	assert(at::getDVRAssignmentMarkers(&II).empty() &&
3472	"AT: Unexpected link to non-const GEP");
3473	deleteIfTriviallyDead(V: OldPtr);
3474	return false;
3475	}
3476
3477	// Record this instruction for deletion.
3478	Pass.DeadInsts.push_back(Elt: &II);
3479
3480	Type *ScalarTy = NewAllocaTy->getScalarType();
3481
3482	const bool CanContinue = [&]() {
3483	if (VecTy \|\| IntTy)
3484	return true;
3485	if (BeginOffset > NewAllocaBeginOffset \|\| EndOffset < NewAllocaEndOffset)
3486	return false;
3487	// Length must be in range for FixedVectorType.
3488	auto *C = cast<ConstantInt>(Val: II.getLength());
3489	const uint64_t Len = C->getLimitedValue();
3490	if (Len > std::numeric_limits<unsigned>::max())
3491	return false;
3492	auto *Int8Ty = IntegerType::getInt8Ty(C&: NewAI.getContext());
3493	auto *SrcTy = FixedVectorType::get(ElementType: Int8Ty, NumElts: Len);
3494	return canConvertValue(DL, OldTy: SrcTy, NewTy: NewAllocaTy) &&
3495	DL.isLegalInteger(Width: DL.getTypeSizeInBits(Ty: ScalarTy).getFixedValue());
3496	}();
3497
3498	// If this doesn't map cleanly onto the alloca type, and that type isn't
3499	// a single value type, just emit a memset.
3500	if (!CanContinue) {
3501	Type *SizeTy = II.getLength()->getType();
3502	unsigned Sz = NewEndOffset - NewBeginOffset;
3503	Constant *Size = ConstantInt::get(Ty: SizeTy, V: Sz);
3504	MemIntrinsic *New = cast<MemIntrinsic>(Val: IRB.CreateMemSet(
3505	Ptr: getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType()), Val: II.getValue(), Size,
3506	Align: MaybeAlign (getSliceAlign()), isVolatile: II.isVolatile()));
3507	if (AATags)
3508	New->setAAMetadata(
3509	AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset, AccessSize: Sz));
3510
3511	migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * `8`, SliceSizeInBits: SliceSize * `8`, OldInst: &II,
3512	Inst: New, Dest: New->getRawDest(), Value: nullptr, DL);
3513
3514	LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3515	return false;
3516	}
3517
3518	// If we can represent this as a simple value, we have to build the actual
3519	// value to store, which requires expanding the byte present in memset to
3520	// a sensible representation for the alloca type. This is essentially
3521	// splatting the byte to a sufficiently wide integer, splatting it across
3522	// any desired vector width, and bitcasting to the final type.
3523	Value *V;
3524
3525	if (VecTy) {
3526	// If this is a memset of a vectorized alloca, insert it.
3527	assert(ElementTy == ScalarTy);
3528
3529	unsigned BeginIndex = getIndex(Offset: NewBeginOffset);
3530	unsigned EndIndex = getIndex(Offset: NewEndOffset);
3531	assert(EndIndex > BeginIndex && "Empty vector!");
3532	unsigned NumElements = EndIndex - BeginIndex;
3533	assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3534	"Too many elements!");
3535
3536	Value *Splat = getIntegerSplat(
3537	V: II.getValue(), Size: DL.getTypeSizeInBits(Ty: ElementTy).getFixedValue() / `8`);
3538	Splat = IRB.CreateBitPreservingCastChain(DL, V: Splat, NewTy: ElementTy);
3539	if (NumElements > `1`)
3540	Splat = getVectorSplat(V: Splat, NumElements);
3541
3542	Value *Old = IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(),
3543	Name: "oldload");
3544	V = insertVector(IRB, Old, V: Splat, BeginIndex, Name: "vec");
3545	} else if (IntTy) {
3546	// If this is a memset on an alloca where we can widen stores, insert the
3547	// set integer.
3548	assert(!II.isVolatile());
3549
3550	uint64_t Size = NewEndOffset - NewBeginOffset;
3551	V = getIntegerSplat(V: II.getValue(), Size);
3552
3553	if (IntTy && (NewBeginOffset != NewAllocaBeginOffset \|\|
3554	NewEndOffset != NewAllocaEndOffset)) {
3555	Value *Old = IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI,
3556	Align: NewAI.getAlign(), Name: "oldload");
3557	Old = IRB.CreateBitPreservingCastChain(DL, V: Old, NewTy: IntTy);
3558	uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3559	V = insertInteger(DL, IRB, Old, V, Offset, Name: "insert");
3560	} else {
3561	assert(V->getType() == IntTy &&
3562	"Wrong type for an alloca wide integer!");
3563	}
3564	V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: NewAllocaTy);
3565	} else {
3566	// Established these invariants above.
3567	assert(NewBeginOffset == NewAllocaBeginOffset);
3568	assert(NewEndOffset == NewAllocaEndOffset);
3569
3570	V = getIntegerSplat(V: II.getValue(),
3571	Size: DL.getTypeSizeInBits(Ty: ScalarTy).getFixedValue() / `8`);
3572	if (VectorType *AllocaVecTy = dyn_cast<VectorType>(Val: NewAllocaTy))
3573	V = getVectorSplat(
3574	V, NumElements: cast<FixedVectorType>(Val: AllocaVecTy)->getNumElements());
3575
3576	V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: NewAllocaTy);
3577	}
3578
3579	Value *NewPtr = getPtrToNewAI(AddrSpace: II.getDestAddressSpace(), IsVolatile: II.isVolatile());
3580	StoreInst *New =
3581	IRB.CreateAlignedStore(Val: V, Ptr: NewPtr, Align: NewAI.getAlign(), isVolatile: II.isVolatile());
3582	New->copyMetadata(SrcInst: II, WL: {LLVMContext::MD_mem_parallel_loop_access,
3583	LLVMContext::MD_access_group});
3584	if (AATags)
3585	New->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3586	AccessTy: V->getType(), DL));
3587
3588	migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * `8`, SliceSizeInBits: SliceSize * `8`, OldInst: &II,
3589	Inst: New, Dest: New->getPointerOperand(), Value: V, DL);
3590
3591	LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3592	return !II.isVolatile();
3593	}
3594
3595	bool visitMemTransferInst(MemTransferInst &II) {
3596	// Rewriting of memory transfer instructions can be a bit tricky. We break
3597	// them into two categories: split intrinsics and unsplit intrinsics.
3598
3599	LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3600
3601	AAMDNodes AATags = II.getAAMetadata();
3602
3603	bool IsDest = &II.getRawDestUse() == OldUse;
3604	assert((IsDest && II.getRawDest() == OldPtr) \|\|
3605	(!IsDest && II.getRawSource() == OldPtr));
3606
3607	Align SliceAlign = getSliceAlign();
3608	// For unsplit intrinsics, we simply modify the source and destination
3609	// pointers in place. This isn't just an optimization, it is a matter of
3610	// correctness. With unsplit intrinsics we may be dealing with transfers
3611	// within a single alloca before SROA ran, or with transfers that have
3612	// a variable length. We may also be dealing with memmove instead of
3613	// memcpy, and so simply updating the pointers is the necessary for us to
3614	// update both source and dest of a single call.
3615	if (!IsSplittable) {
3616	Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3617	if (IsDest) {
3618	// Update the address component of linked dbg.assigns.
3619	for (DbgVariableRecord *DbgAssign : at::getDVRAssignmentMarkers(Inst: &II)) {
3620	if (llvm::is_contained(Range: DbgAssign->location_ops(), Element: II.getDest()) \|\|
3621	DbgAssign->getAddress() == II.getDest())
3622	DbgAssign->replaceVariableLocationOp(OldValue: II.getDest(), NewValue: AdjustedPtr);
3623	}
3624	II.setDest(AdjustedPtr);
3625	II.setDestAlignment(SliceAlign);
3626	} else {
3627	II.setSource(AdjustedPtr);
3628	II.setSourceAlignment(SliceAlign);
3629	}
3630
3631	LLVM_DEBUG(dbgs() << " to: " << II << "\n");
3632	deleteIfTriviallyDead(V: OldPtr);
3633	return false;
3634	}
3635	// For split transfer intrinsics we have an incredibly useful assurance:
3636	// the source and destination do not reside within the same alloca, and at
3637	// least one of them does not escape. This means that we can replace
3638	// memmove with memcpy, and we don't need to worry about all manner of
3639	// downsides to splitting and transforming the operations.
3640
3641	// If this doesn't map cleanly onto the alloca type, and that type isn't
3642	// a single value type, just emit a memcpy.
3643	bool EmitMemCpy =
3644	!VecTy && !IntTy &&
3645	(BeginOffset > NewAllocaBeginOffset \|\| EndOffset < NewAllocaEndOffset \|\|
3646	SliceSize != DL.getTypeStoreSize(Ty: NewAllocaTy).getFixedValue() \|\|
3647	!DL.typeSizeEqualsStoreSize(Ty: NewAllocaTy) \|\|
3648	!NewAllocaTy->isSingleValueType());
3649
3650	// If we're just going to emit a memcpy, the alloca hasn't changed, and the
3651	// size hasn't been shrunk based on analysis of the viable range, this is
3652	// a no-op.
3653	if (EmitMemCpy && &OldAI == &NewAI) {
3654	// Ensure the start lines up.
3655	assert(NewBeginOffset == BeginOffset);
3656
3657	// Rewrite the size as needed.
3658	if (NewEndOffset != EndOffset)
3659	II.setLength(NewEndOffset - NewBeginOffset);
3660	return false;
3661	}
3662	// Record this instruction for deletion.
3663	Pass.DeadInsts.push_back(Elt: &II);
3664
3665	// Strip all inbounds GEPs and pointer casts to try to dig out any root
3666	// alloca that should be re-examined after rewriting this instruction.
3667	Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
3668	if (AllocaInst *AI =
3669	dyn_cast<AllocaInst>(Val: OtherPtr->stripInBoundsOffsets())) {
3670	assert(AI != &OldAI && AI != &NewAI &&
3671	"Splittable transfers cannot reach the same alloca on both ends.");
3672	Pass.Worklist.insert(X: AI);
3673	}
3674
3675	Type *OtherPtrTy = OtherPtr->getType();
3676	unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
3677
3678	// Compute the relative offset for the other pointer within the transfer.
3679	unsigned OffsetWidth = DL.getIndexSizeInBits(AS: OtherAS);
3680	APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
3681	Align OtherAlign =
3682	(IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
3683	OtherAlign =
3684	commonAlignment(A: OtherAlign, Offset: OtherOffset.zextOrTrunc(width: `64`).getZExtValue());
3685
3686	if (EmitMemCpy) {
3687	// Compute the other pointer, folding as much as possible to produce
3688	// a single, simple GEP in most cases.
3689	OtherPtr = getAdjustedPtr(IRB, DL, Ptr: OtherPtr, Offset: OtherOffset, PointerTy: OtherPtrTy,
3690	NamePrefix: OtherPtr->getName() + ".");
3691
3692	Value *OurPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3693	Type *SizeTy = II.getLength()->getType();
3694	Constant *Size = ConstantInt::get(Ty: SizeTy, V: NewEndOffset - NewBeginOffset);
3695
3696	Value DestPtr, SrcPtr;
3697	MaybeAlign DestAlign, SrcAlign;
3698	// Note: IsDest is true iff we're copying into the new alloca slice
3699	if (IsDest) {
3700	DestPtr = OurPtr;
3701	DestAlign = SliceAlign;
3702	SrcPtr = OtherPtr;
3703	SrcAlign = OtherAlign;
3704	} else {
3705	DestPtr = OtherPtr;
3706	DestAlign = OtherAlign;
3707	SrcPtr = OurPtr;
3708	SrcAlign = SliceAlign;
3709	}
3710	CallInst *New = IRB.CreateMemCpy(Dst: DestPtr, DstAlign: DestAlign, Src: SrcPtr, SrcAlign,
3711	Size, isVolatile: II.isVolatile());
3712	if (AATags)
3713	New->setAAMetadata(AATags.shift(Offset: NewBeginOffset - BeginOffset));
3714
3715	APInt Offset(DL.getIndexTypeSizeInBits(Ty: DestPtr->getType()), `0`);
3716	if (IsDest) {
3717	migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * `8`, SliceSizeInBits: SliceSize * `8`,
3718	OldInst: &II, Inst: New, Dest: DestPtr, Value: nullptr, DL);
3719	} else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3720	Val: DestPtr->stripAndAccumulateConstantOffsets(
3721	DL, Offset, /AllowNonInbounds/ true))) {
3722	migrateDebugInfo(OldAlloca: Base, IsSplit, OldAllocaOffsetInBits: Offset.getZExtValue() * `8`,
3723	SliceSizeInBits: SliceSize * `8`, OldInst: &II, Inst: New, Dest: DestPtr, Value: nullptr, DL);
3724	}
3725	LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3726	return false;
3727	}
3728
3729	bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
3730	NewEndOffset == NewAllocaEndOffset;
3731	uint64_t Size = NewEndOffset - NewBeginOffset;
3732	unsigned BeginIndex = VecTy ? getIndex(Offset: NewBeginOffset) : `0`;
3733	unsigned EndIndex = VecTy ? getIndex(Offset: NewEndOffset) : `0`;
3734	unsigned NumElements = EndIndex - BeginIndex;
3735	IntegerType *SubIntTy =
3736	IntTy ? Type::getIntNTy(C&: IntTy->getContext(), N: Size * `8`) : nullptr;
3737
3738	// Reset the other pointer type to match the register type we're going to
3739	// use, but using the address space of the original other pointer.
3740	Type *OtherTy;
3741	if (VecTy && !IsWholeAlloca) {
3742	if (NumElements == `1`)
3743	OtherTy = VecTy->getElementType();
3744	else
3745	OtherTy = FixedVectorType::get(ElementType: VecTy->getElementType(), NumElts: NumElements);
3746	} else if (IntTy && !IsWholeAlloca) {
3747	OtherTy = SubIntTy;
3748	} else {
3749	OtherTy = NewAllocaTy;
3750	}
3751
3752	Value *AdjPtr = getAdjustedPtr(IRB, DL, Ptr: OtherPtr, Offset: OtherOffset, PointerTy: OtherPtrTy,
3753	NamePrefix: OtherPtr->getName() + ".");
3754	MaybeAlign SrcAlign = OtherAlign;
3755	MaybeAlign DstAlign = SliceAlign;
3756	if (!IsDest)
3757	std::swap(a&: SrcAlign, b&: DstAlign);
3758
3759	Value *SrcPtr;
3760	Value *DstPtr;
3761
3762	if (IsDest) {
3763	DstPtr = getPtrToNewAI(AddrSpace: II.getDestAddressSpace(), IsVolatile: II.isVolatile());
3764	SrcPtr = AdjPtr;
3765	} else {
3766	DstPtr = AdjPtr;
3767	SrcPtr = getPtrToNewAI(AddrSpace: II.getSourceAddressSpace(), IsVolatile: II.isVolatile());
3768	}
3769
3770	Value *Src;
3771	if (VecTy && !IsWholeAlloca && !IsDest) {
3772	Src =
3773	IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(), Name: "load");
3774	Src = extractVector(IRB, V: Src, BeginIndex, EndIndex, Name: "vec");
3775	} else if (IntTy && !IsWholeAlloca && !IsDest) {
3776	Src =
3777	IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(), Name: "load");
3778	Src = IRB.CreateBitPreservingCastChain(DL, V: Src, NewTy: IntTy);
3779	uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3780	Src = extractInteger(DL, IRB, V: Src, Ty: SubIntTy, Offset, Name: "extract");
3781	} else {
3782	LoadInst *Load = IRB.CreateAlignedLoad(Ty: OtherTy, Ptr: SrcPtr, Align: SrcAlign,
3783	isVolatile: II.isVolatile(), Name: "copyload");
3784	Load->copyMetadata(SrcInst: II, WL: {LLVMContext::MD_mem_parallel_loop_access,
3785	LLVMContext::MD_access_group});
3786	if (AATags)
3787	Load->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3788	AccessTy: Load->getType(), DL));
3789	Src = Load;
3790	}
3791
3792	if (VecTy && !IsWholeAlloca && IsDest) {
3793	Value *Old = IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(),
3794	Name: "oldload");
3795	Src = insertVector(IRB, Old, V: Src, BeginIndex, Name: "vec");
3796	} else if (IntTy && !IsWholeAlloca && IsDest) {
3797	Value *Old = IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(),
3798	Name: "oldload");
3799	Old = IRB.CreateBitPreservingCastChain(DL, V: Old, NewTy: IntTy);
3800	uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3801	Src = insertInteger(DL, IRB, Old, V: Src, Offset, Name: "insert");
3802	Src = IRB.CreateBitPreservingCastChain(DL, V: Src, NewTy: NewAllocaTy);
3803	}
3804
3805	StoreInst *Store = cast<StoreInst>(
3806	Val: IRB.CreateAlignedStore(Val: Src, Ptr: DstPtr, Align: DstAlign, isVolatile: II.isVolatile()));
3807	Store->copyMetadata(SrcInst: II, WL: {LLVMContext::MD_mem_parallel_loop_access,
3808	LLVMContext::MD_access_group});
3809	if (AATags)
3810	Store->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3811	AccessTy: Src->getType(), DL));
3812
3813	APInt Offset(DL.getIndexTypeSizeInBits(Ty: DstPtr->getType()), `0`);
3814	if (IsDest) {
3815
3816	migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * `8`, SliceSizeInBits: SliceSize * `8`, OldInst: &II,
3817	Inst: Store, Dest: DstPtr, Value: Src, DL);
3818	} else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3819	Val: DstPtr->stripAndAccumulateConstantOffsets(
3820	DL, Offset, /AllowNonInbounds/ true))) {
3821	migrateDebugInfo(OldAlloca: Base, IsSplit, OldAllocaOffsetInBits: Offset.getZExtValue() * `8`, SliceSizeInBits: SliceSize * `8`,
3822	OldInst: &II, Inst: Store, Dest: DstPtr, Value: Src, DL);
3823	}
3824
3825	LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3826	return !II.isVolatile();
3827	}
3828
3829	bool visitIntrinsicInst(IntrinsicInst &II) {
3830	assert((II.isLifetimeStartOrEnd() \|\| II.isDroppable()) &&
3831	"Unexpected intrinsic!");
3832	LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3833
3834	// Record this instruction for deletion.
3835	Pass.DeadInsts.push_back(Elt: &II);
3836
3837	if (II.isDroppable()) {
3838	assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
3839	// TODO For now we forget assumed information, this can be improved.
3840	OldPtr->dropDroppableUsesIn(Usr&: II);
3841	return true;
3842	}
3843
3844	assert(II.getArgOperand(`0`) == OldPtr);
3845	Type *PointerTy = IRB.getPtrTy(AddrSpace: OldPtr->getType()->getPointerAddressSpace());
3846	Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
3847	Value *New;
3848	if (II.getIntrinsicID() == Intrinsic::lifetime_start)
3849	New = IRB.CreateLifetimeStart(Ptr);
3850	else
3851	New = IRB.CreateLifetimeEnd(Ptr);
3852
3853	(void)New;
3854	LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3855
3856	return true;
3857	}
3858
3859	void fixLoadStoreAlign(Instruction &Root) {
3860	// This algorithm implements the same visitor loop as
3861	// hasUnsafePHIOrSelectUse, and fixes the alignment of each load
3862	// or store found.
3863	SmallPtrSet<Instruction *, `4`> Visited;
3864	SmallVector<Instruction *, `4`> Uses;
3865	Visited.insert(Ptr: &Root);
3866	Uses.push_back(Elt: &Root);
3867	do {
3868	Instruction *I = Uses.pop_back_val();
3869
3870	if (LoadInst *LI = dyn_cast<LoadInst>(Val: I)) {
3871	LI->setAlignment(std::min(a: LI->getAlign(), b: getSliceAlign()));
3872	continue;
3873	}
3874	if (StoreInst *SI = dyn_cast<StoreInst>(Val: I)) {
3875	SI->setAlignment(std::min(a: SI->getAlign(), b: getSliceAlign()));
3876	continue;
3877	}
3878
3879	assert(isa<BitCastInst>(I) \|\| isa<AddrSpaceCastInst>(I) \|\|
3880	isa<PHINode>(I) \|\| isa<SelectInst>(I) \|\|
3881	isa<GetElementPtrInst>(I));
3882	for (User *U : I->users())
3883	if (Visited.insert(Ptr: cast<Instruction>(Val: U)).second)
3884	Uses.push_back(Elt: cast<Instruction>(Val: U));
3885	} while (!Uses.empty());
3886	}
3887
3888	bool visitPHINode(PHINode &PN) {
3889	LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
3890	assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
3891	assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
3892
3893	// We would like to compute a new pointer in only one place, but have it be
3894	// as local as possible to the PHI. To do that, we re-use the location of
3895	// the old pointer, which necessarily must be in the right position to
3896	// dominate the PHI.
3897	IRBuilderBase::InsertPointGuard Guard(IRB);
3898	if (isa<PHINode>(Val: OldPtr))
3899	IRB.SetInsertPoint(TheBB: OldPtr->getParent(),
3900	IP: OldPtr->getParent()->getFirstInsertionPt());
3901	else
3902	IRB.SetInsertPoint(OldPtr);
3903	IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
3904
3905	Value *NewPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3906	// Replace the operands which were using the old pointer.
3907	std::replace(first: PN.op_begin(), last: PN.op_end(), old_value: cast<Value>(Val: OldPtr), new_value: NewPtr);
3908
3909	LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
3910	deleteIfTriviallyDead(V: OldPtr);
3911
3912	// Fix the alignment of any loads or stores using this PHI node.
3913	fixLoadStoreAlign(Root&: PN);
3914
3915	// PHIs can't be promoted on their own, but often can be speculated. We
3916	// check the speculation outside of the rewriter so that we see the
3917	// fully-rewritten alloca.
3918	PHIUsers.insert(X: &PN);
3919	return true;
3920	}
3921
3922	bool visitSelectInst(SelectInst &SI) {
3923	LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3924	assert((SI.getTrueValue() == OldPtr \|\| SI.getFalseValue() == OldPtr) &&
3925	"Pointer isn't an operand!");
3926	assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
3927	assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
3928
3929	Value *NewPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3930	// Replace the operands which were using the old pointer.
3931	if (SI.getOperand(i_nocapture: `1`) == OldPtr)
3932	SI.setOperand(i_nocapture: `1`, Val_nocapture: NewPtr);
3933	if (SI.getOperand(i_nocapture: `2`) == OldPtr)
3934	SI.setOperand(i_nocapture: `2`, Val_nocapture: NewPtr);
3935
3936	LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
3937	deleteIfTriviallyDead(V: OldPtr);
3938
3939	// Fix the alignment of any loads or stores using this select.
3940	fixLoadStoreAlign(Root&: SI);
3941
3942	// Selects can't be promoted on their own, but often can be speculated. We
3943	// check the speculation outside of the rewriter so that we see the
3944	// fully-rewritten alloca.
3945	SelectUsers.insert(X: &SI);
3946	return true;
3947	}
3948	};
3949
3950	/// Visitor to rewrite aggregate loads and stores as scalar.
3951	///
3952	/// This pass aggressively rewrites all aggregate loads and stores on
3953	/// a particular pointer (or any pointer derived from it which we can identify)
3954	/// with scalar loads and stores.
3955	class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
3956	// Befriend the base class so it can delegate to private visit methods.
3957	friend class InstVisitor<AggLoadStoreRewriter, bool>;
3958
3959	/// Queue of pointer uses to analyze and potentially rewrite.
3960	SmallVector<Use *, `8`> Queue;
3961
3962	/// Set to prevent us from cycling with phi nodes and loops.
3963	SmallPtrSet<User *, `8`> Visited;
3964
3965	/// The current pointer use being rewritten. This is used to dig up the used
3966	/// value (as opposed to the user).
3967	Use U = nullptr*;
3968
3969	/// Used to calculate offsets, and hence alignment, of subobjects.
3970	const DataLayout &DL;
3971
3972	IRBuilderTy &IRB;
3973
3974	public:
3975	AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
3976	: DL(DL), IRB(IRB) {}
3977
3978	/// Rewrite loads and stores through a pointer and all pointers derived from
3979	/// it.
3980	bool rewrite(Instruction &I) {
3981	LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
3982	enqueueUsers(I);
3983	bool Changed = false;
3984	while (!Queue.empty()) {
3985	U = Queue.pop_back_val();
3986	Changed \|= visit(I: cast<Instruction>(Val: U->getUser()));
3987	}
3988	return Changed;
3989	}
3990
3991	private:
3992	/// Enqueue all the users of the given instruction for further processing.
3993	/// This uses a set to de-duplicate users.
3994	void enqueueUsers(Instruction &I) {
3995	for (Use &U : I.uses())
3996	if (Visited.insert(Ptr: U.getUser()).second)
3997	Queue.push_back(Elt: &U);
3998	}
3999
4000	// Conservative default is to not rewrite anything.
4001	bool visitInstruction(Instruction &I) { return false; }
4002
4003	/// Generic recursive split emission class.
4004	template <typename Derived> class OpSplitter {
4005	protected:
4006	/// The builder used to form new instructions.
4007	IRBuilderTy &IRB;
4008
4009	/// The indices which to be used with insert- or extractvalue to select the
4010	/// appropriate value within the aggregate.
4011	SmallVector<unsigned, `4`> Indices;
4012
4013	/// The indices to a GEP instruction which will move Ptr to the correct slot
4014	/// within the aggregate.
4015	SmallVector<Value *, `4`> GEPIndices;
4016
4017	/// The base pointer of the original op, used as a base for GEPing the
4018	/// split operations.
4019	Value *Ptr;
4020
4021	/// The base pointee type being GEPed into.
4022	Type *BaseTy;
4023
4024	/// Known alignment of the base pointer.
4025	Align BaseAlign;
4026
4027	/// To calculate offset of each component so we can correctly deduce
4028	/// alignments.
4029	const DataLayout &DL;
4030
4031	/// Initialize the splitter with an insertion point, Ptr and start with a
4032	/// single zero GEP index.
4033	OpSplitter(Instruction InsertionPoint, Value Ptr, Type *BaseTy,
4034	Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
4035	: IRB(IRB), GEPIndices (`1`, IRB.getInt32(C: `0`)), Ptr(Ptr), BaseTy(BaseTy),
4036	BaseAlign (BaseAlign), DL(DL) {
4037	IRB.SetInsertPoint(InsertionPoint);
4038	}
4039
4040	public:
4041	/// Generic recursive split emission routine.
4042	///
4043	/// This method recursively splits an aggregate op (load or store) into
4044	/// scalar or vector ops. It splits recursively until it hits a single value
4045	/// and emits that single value operation via the template argument.
4046	///
4047	/// The logic of this routine relies on GEPs and insertvalue and
4048	/// extractvalue all operating with the same fundamental index list, merely
4049	/// formatted differently (GEPs need actual values).
4050	///
4051	/// \param Ty The type being split recursively into smaller ops.
4052	/// \param Agg The aggregate value being built up or stored, depending on
4053	/// whether this is splitting a load or a store respectively.
4054	void emitSplitOps(Type Ty, Value &Agg, const Twine &Name) {
4055	if (Ty->isSingleValueType()) {
4056	unsigned Offset = DL.getIndexedOffsetInType(ElemTy: BaseTy, Indices: GEPIndices);
4057	return static_cast<Derived >(this*)->emitFunc(
4058	Ty, Agg, commonAlignment(A: BaseAlign, Offset), Name);
4059	}
4060
4061	if (ArrayType *ATy = dyn_cast<ArrayType>(Val: Ty)) {
4062	unsigned OldSize = Indices.size();
4063	(void)OldSize;
4064	for (unsigned Idx = `0`, Size = ATy->getNumElements(); Idx != Size;
4065	++Idx) {
4066	assert(Indices.size() == OldSize && "Did not return to the old size");
4067	Indices.push_back(Elt: Idx);
4068	GEPIndices.push_back(Elt: IRB.getInt32(C: Idx));
4069	emitSplitOps(Ty: ATy->getElementType(), Agg, Name: Name + "." + Twine (Idx));
4070	GEPIndices.pop_back();
4071	Indices.pop_back();
4072	}
4073	return;
4074	}
4075
4076	if (StructType *STy = dyn_cast<StructType>(Val: Ty)) {
4077	unsigned OldSize = Indices.size();
4078	(void)OldSize;
4079	for (unsigned Idx = `0`, Size = STy->getNumElements(); Idx != Size;
4080	++Idx) {
4081	assert(Indices.size() == OldSize && "Did not return to the old size");
4082	Indices.push_back(Elt: Idx);
4083	GEPIndices.push_back(Elt: IRB.getInt32(C: Idx));
4084	emitSplitOps(Ty: STy->getElementType(N: Idx), Agg, Name: Name + "." + Twine (Idx));
4085	GEPIndices.pop_back();
4086	Indices.pop_back();
4087	}
4088	return;
4089	}
4090
4091	llvm_unreachable("Only arrays and structs are aggregate loadable types");
4092	}
4093	};
4094
4095	struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
4096	AAMDNodes AATags;
4097	// A vector to hold the split components that we want to emit
4098	// separate fake uses for.
4099	SmallVector<Value *, `4`> Components;
4100	// A vector to hold all the fake uses of the struct that we are splitting.
4101	// Usually there should only be one, but we are handling the general case.
4102	SmallVector<Instruction *, `1`> FakeUses;
4103
4104	LoadOpSplitter(Instruction InsertionPoint, Value Ptr, Type *BaseTy,
4105	AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
4106	IRBuilderTy &IRB)
4107	: OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
4108	IRB),
4109	AATags (AATags) {}
4110
4111	/// Emit a leaf load of a single value. This is called at the leaves of the
4112	/// recursive emission to actually load values.
4113	void emitFunc(Type Ty, Value &Agg, Align Alignment, const Twine &Name) {
4114	assert(Ty->isSingleValueType());
4115	// Load the single value and insert it using the indices.
4116	Value *GEP =
4117	IRB.CreateInBoundsGEP(Ty: BaseTy, Ptr, IdxList: GEPIndices, Name: Name + ".gep");
4118	LoadInst *Load =
4119	IRB.CreateAlignedLoad(Ty, Ptr: GEP, Align: Alignment, Name: Name + ".load");
4120
4121	APInt Offset(
4122	DL.getIndexSizeInBits(AS: Ptr->getType()->getPointerAddressSpace()), `0`);
4123	if (AATags &&
4124	GEPOperator::accumulateConstantOffset(SourceType: BaseTy, Index: GEPIndices, DL, Offset))
4125	Load->setAAMetadata(
4126	AATags.adjustForAccess(Offset: Offset.getZExtValue(), AccessTy: Load->getType(), DL));
4127	// Record the load so we can generate a fake use for this aggregate
4128	// component.
4129	Components.push_back(Elt: Load);
4130
4131	Agg = IRB.CreateInsertValue(Agg, Val: Load, Idxs: Indices, Name: Name + ".insert");
4132	LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
4133	}
4134
4135	// Stash the fake uses that use the value generated by this instruction.
4136	void recordFakeUses(LoadInst &LI) {
4137	for (Use &U : LI.uses())
4138	if (auto *II = dyn_cast<IntrinsicInst>(Val: U.getUser()))
4139	if (II->getIntrinsicID() == Intrinsic::fake_use)
4140	FakeUses.push_back(Elt: II);
4141	}
4142
4143	// Replace all fake uses of the aggregate with a series of fake uses, one
4144	// for each split component.
4145	void emitFakeUses() {
4146	for (Instruction *I : FakeUses) {
4147	IRB.SetInsertPoint(I);
4148	for (auto *V : Components)
4149	IRB.CreateIntrinsic(ID: Intrinsic::fake_use, Args: {V});
4150	I->eraseFromParent();
4151	}
4152	}
4153	};
4154
4155	bool visitLoadInst(LoadInst &LI) {
4156	assert(LI.getPointerOperand() == *U);
4157	if (!LI.isSimple() \|\| LI.getType()->isSingleValueType())
4158	return false;
4159
4160	// We have an aggregate being loaded, split it apart.
4161	LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
4162	LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
4163	getAdjustedAlignment(I: &LI, Offset: `0`), DL, IRB);
4164	Splitter.recordFakeUses(LI);
4165	Value *V = PoisonValue::get(T: LI.getType());
4166	Splitter.emitSplitOps(Ty: LI.getType(), Agg&: V, Name: LI.getName() + ".fca");
4167	Splitter.emitFakeUses();
4168	Visited.erase(Ptr: &LI);
4169	LI.replaceAllUsesWith(V);
4170	LI.eraseFromParent();
4171	return true;
4172	}
4173
4174	struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
4175	StoreOpSplitter(Instruction InsertionPoint, Value Ptr, Type *BaseTy,
4176	AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign,
4177	const DataLayout &DL, IRBuilderTy &IRB)
4178	: OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
4179	DL, IRB),
4180	AATags (AATags), AggStore(AggStore) {}
4181	AAMDNodes AATags;
4182	StoreInst *AggStore;
4183	/// Emit a leaf store of a single value. This is called at the leaves of the
4184	/// recursive emission to actually produce stores.
4185	void emitFunc(Type Ty, Value &Agg, Align Alignment, const Twine &Name) {
4186	assert(Ty->isSingleValueType());
4187	// Extract the single value and store it using the indices.
4188	//
4189	// The gep and extractvalue values are factored out of the CreateStore
4190	// call to make the output independent of the argument evaluation order.
4191	Value *ExtractValue =
4192	IRB.CreateExtractValue(Agg, Idxs: Indices, Name: Name + ".extract");
4193	Value *InBoundsGEP =
4194	IRB.CreateInBoundsGEP(Ty: BaseTy, Ptr, IdxList: GEPIndices, Name: Name + ".gep");
4195	StoreInst *Store =
4196	IRB.CreateAlignedStore(Val: ExtractValue, Ptr: InBoundsGEP, Align: Alignment);
4197
4198	APInt Offset(
4199	DL.getIndexSizeInBits(AS: Ptr->getType()->getPointerAddressSpace()), `0`);
4200	GEPOperator::accumulateConstantOffset(SourceType: BaseTy, Index: GEPIndices, DL, Offset);
4201	if (AATags) {
4202	Store->setAAMetadata(AATags.adjustForAccess(
4203	Offset: Offset.getZExtValue(), AccessTy: ExtractValue->getType(), DL));
4204	}
4205
4206	// migrateDebugInfo requires the base Alloca. Walk to it from this gep.
4207	// If we cannot (because there's an intervening non-const or unbounded
4208	// gep) then we wouldn't expect to see dbg.assign intrinsics linked to
4209	// this instruction.
4210	Value *Base = AggStore->getPointerOperand()->stripInBoundsOffsets();
4211	if (auto *OldAI = dyn_cast<AllocaInst>(Val: Base)) {
4212	uint64_t SizeInBits =
4213	DL.getTypeSizeInBits(Ty: Store->getValueOperand()->getType());
4214	migrateDebugInfo(OldAlloca: OldAI, /IsSplit/ true, OldAllocaOffsetInBits: Offset.getZExtValue() * `8`,
4215	SliceSizeInBits: SizeInBits, OldInst: AggStore, Inst: Store,
4216	Dest: Store->getPointerOperand(), Value: Store->getValueOperand(),
4217	DL);
4218	} else {
4219	assert(at::getDVRAssignmentMarkers(Store).empty() &&
4220	"AT: unexpected debug.assign linked to store through "
4221	"unbounded GEP");
4222	}
4223	LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
4224	}
4225	};
4226
4227	bool visitStoreInst(StoreInst &SI) {
4228	if (!SI.isSimple() \|\| SI.getPointerOperand() != *U)
4229	return false;
4230	Value *V = SI.getValueOperand();
4231	if (V->getType()->isSingleValueType())
4232	return false;
4233
4234	// We have an aggregate being stored, split it apart.
4235	LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
4236	StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI,
4237	getAdjustedAlignment(I: &SI, Offset: `0`), DL, IRB);
4238	Splitter.emitSplitOps(Ty: V->getType(), Agg&: V, Name: V->getName() + ".fca");
4239	Visited.erase(Ptr: &SI);
4240	// The stores replacing SI each have markers describing fragments of the
4241	// assignment so delete the assignment markers linked to SI.
4242	at::deleteAssignmentMarkers(Inst: &SI);
4243	SI.eraseFromParent();
4244	return true;
4245	}
4246
4247	bool visitBitCastInst(BitCastInst &BC) {
4248	enqueueUsers(I&: BC);
4249	return false;
4250	}
4251
4252	bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
4253	enqueueUsers(I&: ASC);
4254	return false;
4255	}
4256
4257	// Unfold gep (select cond, ptr1, ptr2), idx
4258	// => select cond, gep(ptr1, idx), gep(ptr2, idx)
4259	// and gep ptr, (select cond, idx1, idx2)
4260	// => select cond, gep(ptr, idx1), gep(ptr, idx2)
4261	// We also allow for i1 zext indices, which are equivalent to selects.
4262	bool unfoldGEPSelect(GetElementPtrInst &GEPI) {
4263	// Check whether the GEP has exactly one select operand and all indices
4264	// will become constant after the transform.
4265	Instruction *Sel = dyn_cast<SelectInst>(Val: GEPI.getPointerOperand());
4266	for (Value *Op : GEPI.indices()) {
4267	if (auto *SI = dyn_cast<SelectInst>(Val: Op)) {
4268	if (Sel)
4269	return false;
4270
4271	Sel = SI;
4272	if (!isa<ConstantInt>(Val: SI->getTrueValue()) \|\|
4273	!isa<ConstantInt>(Val: SI->getFalseValue()))
4274	return false;
4275	continue;
4276	}
4277	if (auto *ZI = dyn_cast<ZExtInst>(Val: Op)) {
4278	if (Sel)
4279	return false;
4280	Sel = ZI;
4281	if (!ZI->getSrcTy()->isIntegerTy(Bitwidth: `1`))
4282	return false;
4283	continue;
4284	}
4285
4286	if (!isa<ConstantInt>(Val: Op))
4287	return false;
4288	}
4289
4290	if (!Sel)
4291	return false;
4292
4293	LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):\n";
4294	dbgs() << " original: " << *Sel << "\n";
4295	dbgs() << " " << GEPI << "\n";);
4296
4297	auto GetNewOps = [&](Value *SelOp) {
4298	SmallVector<Value *> NewOps;
4299	for (Value *Op : GEPI.operands())
4300	if (Op == Sel)
4301	NewOps.push_back(Elt: SelOp);
4302	else
4303	NewOps.push_back(Elt: Op);
4304	return NewOps;
4305	};
4306
4307	Value Cond, True, *False;
4308	Instruction MDFrom = nullptr*;
4309	if (auto *SI = dyn_cast<SelectInst>(Val: Sel)) {
4310	Cond = SI->getCondition();
4311	True = SI->getTrueValue();
4312	False = SI->getFalseValue();
4313	if (!ProfcheckDisableMetadataFixes)
4314	MDFrom = SI;
4315	} else {
4316	Cond = Sel->getOperand(i: `0`);
4317	True = ConstantInt::get(Ty: Sel->getType(), V: `1`);
4318	False = ConstantInt::get(Ty: Sel->getType(), V: `0`);
4319	}
4320	SmallVector<Value *> TrueOps = GetNewOps(True);
4321	SmallVector<Value *> FalseOps = GetNewOps(False);
4322
4323	IRB.SetInsertPoint(&GEPI);
4324	GEPNoWrapFlags NW = GEPI.getNoWrapFlags();
4325
4326	Type *Ty = GEPI.getSourceElementType();
4327	Value *NTrue = IRB.CreateGEP(Ty, Ptr: TrueOps [`0`], IdxList: ArrayRef(TrueOps).drop_front(),
4328	Name: True->getName() + ".sroa.gep", NW);
4329
4330	Value *NFalse =
4331	IRB.CreateGEP(Ty, Ptr: FalseOps [`0`], IdxList: ArrayRef(FalseOps).drop_front(),
4332	Name: False->getName() + ".sroa.gep", NW);
4333
4334	Value *NSel = MDFrom
4335	? IRB.CreateSelect(C: Cond, True: NTrue, False: NFalse,
4336	Name: Sel->getName() + ".sroa.sel", MDFrom)
4337	: IRB.CreateSelectWithUnknownProfile(
4338	C: Cond, True: NTrue, False: NFalse, DEBUG_TYPE,
4339	Name: Sel->getName() + ".sroa.sel");
4340	Visited.erase(Ptr: &GEPI);
4341	GEPI.replaceAllUsesWith(V: NSel);
4342	GEPI.eraseFromParent();
4343	Instruction *NSelI = cast<Instruction>(Val: NSel);
4344	Visited.insert(Ptr: NSelI);
4345	enqueueUsers(I&: *NSelI);
4346
4347	LLVM_DEBUG(dbgs() << " to: " << *NTrue << "\n";
4348	dbgs() << " " << *NFalse << "\n";
4349	dbgs() << " " << *NSel << "\n";);
4350
4351	return true;
4352	}
4353
4354	// Unfold gep (phi ptr1, ptr2), idx
4355	// => phi ((gep ptr1, idx), (gep ptr2, idx))
4356	// and gep ptr, (phi idx1, idx2)
4357	// => phi ((gep ptr, idx1), (gep ptr, idx2))
4358	bool unfoldGEPPhi(GetElementPtrInst &GEPI) {
4359	// To prevent infinitely expanding recursive phis, bail if the GEP pointer
4360	// operand (looking through the phi if it is the phi we want to unfold) is
4361	// an instruction besides a static alloca.
4362	PHINode *Phi = dyn_cast<PHINode>(Val: GEPI.getPointerOperand());
4363	auto IsInvalidPointerOperand = [](Value *V) {
4364	if (!isa<Instruction>(Val: V))
4365	return false;
4366	if (auto *AI = dyn_cast<AllocaInst>(Val: V))
4367	return !AI->isStaticAlloca();
4368	return true;
4369	};
4370	if (Phi) {
4371	if (any_of(Range: Phi->operands(), P: IsInvalidPointerOperand))
4372	return false;
4373	} else {
4374	if (IsInvalidPointerOperand(GEPI.getPointerOperand()))
4375	return false;
4376	}
4377	// Check whether the GEP has exactly one phi operand (including the pointer
4378	// operand) and all indices will become constant after the transform.
4379	for (Value *Op : GEPI.indices()) {
4380	if (auto *SI = dyn_cast<PHINode>(Val: Op)) {
4381	if (Phi)
4382	return false;
4383
4384	Phi = SI;
4385	if (!all_of(Range: Phi->incoming_values(),
4386	P: [](Value V) { return* isa<ConstantInt>(Val: V); }))
4387	return false;
4388	continue;
4389	}
4390
4391	if (!isa<ConstantInt>(Val: Op))
4392	return false;
4393	}
4394
4395	if (!Phi)
4396	return false;
4397
4398	LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):\n";
4399	dbgs() << " original: " << *Phi << "\n";
4400	dbgs() << " " << GEPI << "\n";);
4401
4402	auto GetNewOps = [&](Value *PhiOp) {
4403	SmallVector<Value *> NewOps;
4404	for (Value *Op : GEPI.operands())
4405	if (Op == Phi)
4406	NewOps.push_back(Elt: PhiOp);
4407	else
4408	NewOps.push_back(Elt: Op);
4409	return NewOps;
4410	};
4411
4412	IRB.SetInsertPoint(Phi);
4413	PHINode *NewPhi = IRB.CreatePHI(Ty: GEPI.getType(), NumReservedValues: Phi->getNumIncomingValues(),
4414	Name: Phi->getName() + ".sroa.phi");
4415
4416	Type *SourceTy = GEPI.getSourceElementType();
4417	// We only handle arguments, constants, and static allocas here, so we can
4418	// insert GEPs at the end of the entry block.
4419	IRB.SetInsertPoint(GEPI.getFunction()->getEntryBlock().getTerminator());
4420	for (unsigned I = `0`, E = Phi->getNumIncomingValues(); I != E; ++I) {
4421	Value *Op = Phi->getIncomingValue(i: I);
4422	BasicBlock *BB = Phi->getIncomingBlock(i: I);
4423	Value *NewGEP;
4424	if (int NI = NewPhi->getBasicBlockIndex(BB); NI >= `0`) {
4425	NewGEP = NewPhi->getIncomingValue(i: NI);
4426	} else {
4427	SmallVector<Value *> NewOps = GetNewOps(Op);
4428	NewGEP =
4429	IRB.CreateGEP(Ty: SourceTy, Ptr: NewOps [`0`], IdxList: ArrayRef(NewOps).drop_front(),
4430	Name: Phi->getName() + ".sroa.gep", NW: GEPI.getNoWrapFlags());
4431	}
4432	NewPhi->addIncoming(V: NewGEP, BB);
4433	}
4434
4435	Visited.erase(Ptr: &GEPI);
4436	GEPI.replaceAllUsesWith(V: NewPhi);
4437	GEPI.eraseFromParent();
4438	Visited.insert(Ptr: NewPhi);
4439	enqueueUsers(I&: *NewPhi);
4440
4441	LLVM_DEBUG(dbgs() << " to: ";
4442	for (Value *In
4443	: NewPhi->incoming_values()) dbgs()
4444	<< "\n " << *In;
4445	dbgs() << "\n " << *NewPhi << `'\n'`);
4446
4447	return true;
4448	}
4449
4450	bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
4451	if (unfoldGEPSelect(GEPI))
4452	return true;
4453
4454	if (unfoldGEPPhi(GEPI))
4455	return true;
4456
4457	enqueueUsers(I&: GEPI);
4458	return false;
4459	}
4460
4461	bool visitPHINode(PHINode &PN) {
4462	enqueueUsers(I&: PN);
4463	return false;
4464	}
4465
4466	bool visitSelectInst(SelectInst &SI) {
4467	enqueueUsers(I&: SI);
4468	return false;
4469	}
4470	};
4471
4472	} // end anonymous namespace
4473
4474	/// Strip aggregate type wrapping.
4475	///
4476	/// This removes no-op aggregate types wrapping an underlying type. It will
4477	/// strip as many layers of types as it can without changing either the type
4478	/// size or the allocated size.
4479	static Type stripAggregateTypeWrapping(const* DataLayout &DL, Type *Ty) {
4480	if (Ty->isSingleValueType())
4481	return Ty;
4482
4483	uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue();
4484	uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
4485
4486	Type *InnerTy;
4487	if (ArrayType *ArrTy = dyn_cast<ArrayType>(Val: Ty)) {
4488	InnerTy = ArrTy->getElementType();
4489	} else if (StructType *STy = dyn_cast<StructType>(Val: Ty)) {
4490	const StructLayout *SL = DL.getStructLayout(Ty: STy);
4491	unsigned Index = SL->getElementContainingOffset(FixedOffset: `0`);
4492	InnerTy = STy->getElementType(N: Index);
4493	} else {
4494	return Ty;
4495	}
4496
4497	if (AllocSize > DL.getTypeAllocSize(Ty: InnerTy).getFixedValue() \|\|
4498	TypeSize > DL.getTypeSizeInBits(Ty: InnerTy).getFixedValue())
4499	return Ty;
4500
4501	return stripAggregateTypeWrapping(DL, Ty: InnerTy);
4502	}
4503
4504	/// Try to find a partition of the aggregate type passed in for a given
4505	/// offset and size.
4506	///
4507	/// This recurses through the aggregate type and tries to compute a subtype
4508	/// based on the offset and size. When the offset and size span a sub-section
4509	/// of an array, it will even compute a new array type for that sub-section,
4510	/// and the same for structs.
4511	///
4512	/// Note that this routine is very strict and tries to find a partition of the
4513	/// type which produces the exact* right offset and size. It is not forgiving*
4514	/// when the size or offset cause either end of type-based partition to be off.
4515	/// Also, this is a best-effort routine. It is reasonable to give up and not
4516	/// return a type if necessary.
4517	static Type getTypePartition(const* DataLayout &DL, Type *Ty, uint64_t Offset,
4518	uint64_t Size) {
4519	if (Offset == `0` && DL.getTypeAllocSize(Ty).getFixedValue() == Size)
4520	return stripAggregateTypeWrapping(DL, Ty);
4521	if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() \|\|
4522	(DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size)
4523	return nullptr;
4524
4525	if (isa<ArrayType>(Val: Ty) \|\| isa<VectorType>(Val: Ty)) {
4526	Type *ElementTy;
4527	uint64_t TyNumElements;
4528	if (auto *AT = dyn_cast<ArrayType>(Val: Ty)) {
4529	ElementTy = AT->getElementType();
4530	TyNumElements = AT->getNumElements();
4531	} else {
4532	// FIXME: This isn't right for vectors with non-byte-sized or
4533	// non-power-of-two sized elements.
4534	auto *VT = cast<FixedVectorType>(Val: Ty);
4535	ElementTy = VT->getElementType();
4536	TyNumElements = VT->getNumElements();
4537	}
4538	uint64_t ElementSize = DL.getTypeAllocSize(Ty: ElementTy).getFixedValue();
4539	uint64_t NumSkippedElements = Offset / ElementSize;
4540	if (NumSkippedElements >= TyNumElements)
4541	return nullptr;
4542	Offset -= NumSkippedElements * ElementSize;
4543
4544	// First check if we need to recurse.
4545	if (Offset > `0` \|\| Size < ElementSize) {
4546	// Bail if the partition ends in a different array element.
4547	if ((Offset + Size) > ElementSize)
4548	return nullptr;
4549	// Recurse through the element type trying to peel off offset bytes.
4550	return getTypePartition(DL, Ty: ElementTy, Offset, Size);
4551	}
4552	assert(Offset == `0`);
4553
4554	if (Size == ElementSize)
4555	return stripAggregateTypeWrapping(DL, Ty: ElementTy);
4556	assert(Size > ElementSize);
4557	uint64_t NumElements = Size / ElementSize;
4558	if (NumElements * ElementSize != Size)
4559	return nullptr;
4560	return ArrayType::get(ElementType: ElementTy, NumElements);
4561	}
4562
4563	StructType *STy = dyn_cast<StructType>(Val: Ty);
4564	if (!STy)
4565	return nullptr;
4566
4567	const StructLayout *SL = DL.getStructLayout(Ty: STy);
4568
4569	if (SL->getSizeInBits().isScalable())
4570	return nullptr;
4571
4572	if (Offset >= SL->getSizeInBytes())
4573	return nullptr;
4574	uint64_t EndOffset = Offset + Size;
4575	if (EndOffset > SL->getSizeInBytes())
4576	return nullptr;
4577
4578	unsigned Index = SL->getElementContainingOffset(FixedOffset: Offset);
4579	Offset -= SL->getElementOffset(Idx: Index);
4580
4581	Type *ElementTy = STy->getElementType(N: Index);
4582	uint64_t ElementSize = DL.getTypeAllocSize(Ty: ElementTy).getFixedValue();
4583	if (Offset >= ElementSize)
4584	return nullptr; // The offset points into alignment padding.
4585
4586	// See if any partition must be contained by the element.
4587	if (Offset > `0` \|\| Size < ElementSize) {
4588	if ((Offset + Size) > ElementSize)
4589	return nullptr;
4590	return getTypePartition(DL, Ty: ElementTy, Offset, Size);
4591	}
4592	assert(Offset == `0`);
4593
4594	if (Size == ElementSize)
4595	return stripAggregateTypeWrapping(DL, Ty: ElementTy);
4596
4597	StructType::element_iterator EI = STy->element_begin() + Index,
4598	EE = STy->element_end();
4599	if (EndOffset < SL->getSizeInBytes()) {
4600	unsigned EndIndex = SL->getElementContainingOffset(FixedOffset: EndOffset);
4601	if (Index == EndIndex)
4602	return nullptr; // Within a single element and its padding.
4603
4604	// Don't try to form "natural" types if the elements don't line up with the
4605	// expected size.
4606	// FIXME: We could potentially recurse down through the last element in the
4607	// sub-struct to find a natural end point.
4608	if (SL->getElementOffset(Idx: EndIndex) != EndOffset)
4609	return nullptr;
4610
4611	assert(Index < EndIndex);
4612	EE = STy->element_begin() + EndIndex;
4613	}
4614
4615	// Try to build up a sub-structure.
4616	StructType *SubTy =
4617	StructType::get(Context&: STy->getContext(), Elements: ArrayRef(EI, EE), isPacked: STy->isPacked());
4618	const StructLayout *SubSL = DL.getStructLayout(Ty: SubTy);
4619	if (Size != SubSL->getSizeInBytes())
4620	return nullptr; // The sub-struct doesn't have quite the size needed.
4621
4622	return SubTy;
4623	}
4624
4625	/// Pre-split loads and stores to simplify rewriting.
4626	///
4627	/// We want to break up the splittable load+store pairs as much as
4628	/// possible. This is important to do as a preprocessing step, as once we
4629	/// start rewriting the accesses to partitions of the alloca we lose the
4630	/// necessary information to correctly split apart paired loads and stores
4631	/// which both point into this alloca. The case to consider is something like
4632	/// the following:
4633	///
4634	/// %a = alloca [12 x i8]
4635	/// %gep1 = getelementptr i8, ptr %a, i32 0
4636	/// %gep2 = getelementptr i8, ptr %a, i32 4
4637	/// %gep3 = getelementptr i8, ptr %a, i32 8
4638	/// store float 0.0, ptr %gep1
4639	/// store float 1.0, ptr %gep2
4640	/// %v = load i64, ptr %gep1
4641	/// store i64 %v, ptr %gep2
4642	/// %f1 = load float, ptr %gep2
4643	/// %f2 = load float, ptr %gep3
4644	///
4645	/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
4646	/// promote everything so we recover the 2 SSA values that should have been
4647	/// there all along.
4648	///
4649	/// \returns true if any changes are made.
4650	bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
4651	LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
4652
4653	// Track the loads and stores which are candidates for pre-splitting here, in
4654	// the order they first appear during the partition scan. These give stable
4655	// iteration order and a basis for tracking which loads and stores we
4656	// actually split.
4657	SmallVector<LoadInst *, `4`> Loads;
4658	SmallVector<StoreInst *, `4`> Stores;
4659
4660	// We need to accumulate the splits required of each load or store where we
4661	// can find them via a direct lookup. This is important to cross-check loads
4662	// and stores against each other. We also track the slice so that we can kill
4663	// all the slices that end up split.
4664	struct SplitOffsets {
4665	Slice *S;
4666	std::vector<uint64_t> Splits;
4667	};
4668	SmallDenseMap<Instruction *, SplitOffsets, `8`> SplitOffsetsMap;
4669
4670	// Track loads out of this alloca which cannot, for any reason, be pre-split.
4671	// This is important as we also cannot pre-split stores of those loads!
4672	// FIXME: This is all pretty gross. It means that we can be more aggressive
4673	// in pre-splitting when the load feeding the store happens to come from
4674	// a separate alloca. Put another way, the effectiveness of SROA would be
4675	// decreased by a frontend which just concatenated all of its local allocas
4676	// into one big flat alloca. But defeating such patterns is exactly the job
4677	// SROA is tasked with! Sadly, to not have this discrepancy we would have
4678	// change store pre-splitting to actually force pre-splitting of the load
4679	// that feeds it and all stores. That makes pre-splitting much harder, but
4680	// maybe it would make it more principled?
4681	SmallPtrSet<LoadInst *, `8`> UnsplittableLoads;
4682
4683	LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
4684	for (auto &P : AS.partitions()) {
4685	for (Slice &S : P) {
4686	Instruction *I = cast<Instruction>(Val: S.getUse()->getUser());
4687	if (!S.isSplittable() \|\| S.endOffset() <= P.endOffset()) {
4688	// If this is a load we have to track that it can't participate in any
4689	// pre-splitting. If this is a store of a load we have to track that
4690	// that load also can't participate in any pre-splitting.
4691	if (auto *LI = dyn_cast<LoadInst>(Val: I))
4692	UnsplittableLoads.insert(Ptr: LI);
4693	else if (auto *SI = dyn_cast<StoreInst>(Val: I))
4694	if (auto *LI = dyn_cast<LoadInst>(Val: SI->getValueOperand()))
4695	UnsplittableLoads.insert(Ptr: LI);
4696	continue;
4697	}
4698	assert(P.endOffset() > S.beginOffset() &&
4699	"Empty or backwards partition!");
4700
4701	// Determine if this is a pre-splittable slice.
4702	if (auto *LI = dyn_cast<LoadInst>(Val: I)) {
4703	assert(!LI->isVolatile() && "Cannot split volatile loads!");
4704
4705	// The load must be used exclusively to store into other pointers for
4706	// us to be able to arbitrarily pre-split it. The stores must also be
4707	// simple to avoid changing semantics.
4708	auto IsLoadSimplyStored = [](LoadInst *LI) {
4709	for (User *LU : LI->users()) {
4710	auto *SI = dyn_cast<StoreInst>(Val: LU);
4711	if (!SI \|\| !SI->isSimple())
4712	return false;
4713	}
4714	return true;
4715	};
4716	if (!IsLoadSimplyStored (LI)) {
4717	UnsplittableLoads.insert(Ptr: LI);
4718	continue;
4719	}
4720
4721	Loads.push_back(Elt: LI);
4722	} else if (auto *SI = dyn_cast<StoreInst>(Val: I)) {
4723	if (S.getUse() != &SI->getOperandUse(i: SI->getPointerOperandIndex()))
4724	// Skip stores of* pointers. FIXME: This shouldn't even be possible!*
4725	continue;
4726	auto *StoredLoad = dyn_cast<LoadInst>(Val: SI->getValueOperand());
4727	if (!StoredLoad \|\| !StoredLoad->isSimple())
4728	continue;
4729	assert(!SI->isVolatile() && "Cannot split volatile stores!");
4730
4731	Stores.push_back(Elt: SI);
4732	} else {
4733	// Other uses cannot be pre-split.
4734	continue;
4735	}
4736
4737	// Record the initial split.
4738	LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
4739	auto &Offsets = SplitOffsetsMap [I];
4740	assert(Offsets.Splits.empty() &&
4741	"Should not have splits the first time we see an instruction!");
4742	Offsets.S = &S;
4743	Offsets.Splits.push_back(x: P.endOffset() - S.beginOffset());
4744	}
4745
4746	// Now scan the already split slices, and add a split for any of them which
4747	// we're going to pre-split.
4748	for (Slice *S : P.splitSliceTails()) {
4749	auto SplitOffsetsMapI =
4750	SplitOffsetsMap.find(Val: cast<Instruction>(Val: S->getUse()->getUser()));
4751	if (SplitOffsetsMapI == SplitOffsetsMap.end())
4752	continue;
4753	auto &Offsets = SplitOffsetsMapI ->second;
4754
4755	assert(Offsets.S == S && "Found a mismatched slice!");
4756	assert(!Offsets.Splits.empty() &&
4757	"Cannot have an empty set of splits on the second partition!");
4758	assert(Offsets.Splits.back() ==
4759	P.beginOffset() - Offsets.S->beginOffset() &&
4760	"Previous split does not end where this one begins!");
4761
4762	// Record each split. The last partition's end isn't needed as the size
4763	// of the slice dictates that.
4764	if (S->endOffset() > P.endOffset())
4765	Offsets.Splits.push_back(x: P.endOffset() - Offsets.S->beginOffset());
4766	}
4767	}
4768
4769	// We may have split loads where some of their stores are split stores. For
4770	// such loads and stores, we can only pre-split them if their splits exactly
4771	// match relative to their starting offset. We have to verify this prior to
4772	// any rewriting.
4773	llvm::erase_if(C&: Stores, P: [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
4774	// Lookup the load we are storing in our map of split
4775	// offsets.
4776	auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
4777	// If it was completely unsplittable, then we're done,
4778	// and this store can't be pre-split.
4779	if (UnsplittableLoads.count(Ptr: LI))
4780	return true;
4781
4782	auto LoadOffsetsI = SplitOffsetsMap.find(Val: LI);
4783	if (LoadOffsetsI == SplitOffsetsMap.end())
4784	return false; // Unrelated loads are definitely safe.
4785	auto &LoadOffsets = LoadOffsetsI ->second;
4786
4787	// Now lookup the store's offsets.
4788	auto &StoreOffsets = SplitOffsetsMap [SI];
4789
4790	// If the relative offsets of each split in the load and
4791	// store match exactly, then we can split them and we
4792	// don't need to remove them here.
4793	if (LoadOffsets.Splits == StoreOffsets.Splits)
4794	return false;
4795
4796	LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
4797	<< " " << *LI << "\n"
4798	<< " " << *SI << "\n");
4799
4800	// We've found a store and load that we need to split
4801	// with mismatched relative splits. Just give up on them
4802	// and remove both instructions from our list of
4803	// candidates.
4804	UnsplittableLoads.insert(Ptr: LI);
4805	return true;
4806	});
4807	// Now we have to go back* through all the stores, because a later store may*
4808	// have caused an earlier store's load to become unsplittable and if it is
4809	// unsplittable for the later store, then we can't rely on it being split in
4810	// the earlier store either.
4811	llvm::erase_if(C&: Stores, P: [&UnsplittableLoads](StoreInst *SI) {
4812	auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
4813	return UnsplittableLoads.count(Ptr: LI);
4814	});
4815	// Once we've established all the loads that can't be split for some reason,
4816	// filter any that made it into our list out.
4817	llvm::erase_if(C&: Loads, P: [&UnsplittableLoads](LoadInst *LI) {
4818	return UnsplittableLoads.count(Ptr: LI);
4819	});
4820
4821	// If no loads or stores are left, there is no pre-splitting to be done for
4822	// this alloca.
4823	if (Loads.empty() && Stores.empty())
4824	return false;
4825
4826	// From here on, we can't fail and will be building new accesses, so rig up
4827	// an IR builder.
4828	IRBuilderTy IRB(&AI);
4829
4830	// Collect the new slices which we will merge into the alloca slices.
4831	SmallVector<Slice, `4`> NewSlices;
4832
4833	// Track any allocas we end up splitting loads and stores for so we iterate
4834	// on them.
4835	SmallPtrSet<AllocaInst *, `4`> ResplitPromotableAllocas;
4836
4837	// At this point, we have collected all of the loads and stores we can
4838	// pre-split, and the specific splits needed for them. We actually do the
4839	// splitting in a specific order in order to handle when one of the loads in
4840	// the value operand to one of the stores.
4841	//
4842	// First, we rewrite all of the split loads, and just accumulate each split
4843	// load in a parallel structure. We also build the slices for them and append
4844	// them to the alloca slices.
4845	SmallDenseMap<LoadInst , std::vector<LoadInst >, `1`> SplitLoadsMap;
4846	std::vector<LoadInst *> SplitLoads;
4847	const DataLayout &DL = AI.getDataLayout();
4848	for (LoadInst *LI : Loads) {
4849	SplitLoads.clear();
4850
4851	auto &Offsets = SplitOffsetsMap [LI];
4852	unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
4853	assert(LI->getType()->getIntegerBitWidth() % `8` == `0` &&
4854	"Load must have type size equal to store size");
4855	assert(LI->getType()->getIntegerBitWidth() / `8` >= SliceSize &&
4856	"Load must be >= slice size");
4857
4858	uint64_t BaseOffset = Offsets.S->beginOffset();
4859	assert(BaseOffset + SliceSize > BaseOffset &&
4860	"Cannot represent alloca access size using 64-bit integers!");
4861
4862	Instruction *BasePtr = cast<Instruction>(Val: LI->getPointerOperand());
4863	IRB.SetInsertPoint(LI);
4864
4865	LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
4866
4867	uint64_t PartOffset = `0`, PartSize = Offsets.Splits.front();
4868	int Idx = `0`, Size = Offsets.Splits.size();
4869	for (;;) {
4870	auto PartTy = Type::getIntNTy(C&: LI->getContext(), N: PartSize `8`);
4871	auto AS = LI->getPointerAddressSpace();
4872	auto *PartPtrTy = LI->getPointerOperandType();
4873	LoadInst *PLoad = IRB.CreateAlignedLoad(
4874	Ty: PartTy,
4875	Ptr: getAdjustedPtr(IRB, DL, Ptr: BasePtr,
4876	Offset: APInt (DL.getIndexSizeInBits(AS), PartOffset),
4877	PointerTy: PartPtrTy, NamePrefix: BasePtr->getName() + "."),
4878	Align: getAdjustedAlignment(I: LI, Offset: PartOffset),
4879	/IsVolatile/ isVolatile: false, Name: LI->getName());
4880	PLoad->copyMetadata(SrcInst: *LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
4881	LLVMContext::MD_access_group});
4882
4883	// Append this load onto the list of split loads so we can find it later
4884	// to rewrite the stores.
4885	SplitLoads.push_back(x: PLoad);
4886
4887	// Now build a new slice for the alloca.
4888	NewSlices.push_back(
4889	Elt: Slice (BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4890	&PLoad->getOperandUse(i: PLoad->getPointerOperandIndex()),
4891	/IsSplittable/ false, nullptr));
4892	LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4893	<< ", " << NewSlices.back().endOffset()
4894	<< "): " << *PLoad << "\n");
4895
4896	// See if we've handled all the splits.
4897	if (Idx >= Size)
4898	break;
4899
4900	// Setup the next partition.
4901	PartOffset = Offsets.Splits [Idx];
4902	++Idx;
4903	PartSize = (Idx < Size ? Offsets.Splits [Idx] : SliceSize) - PartOffset;
4904	}
4905
4906	// Now that we have the split loads, do the slow walk over all uses of the
4907	// load and rewrite them as split stores, or save the split loads to use
4908	// below if the store is going to be split there anyways.
4909	bool DeferredStores = false;
4910	for (User *LU : LI->users()) {
4911	StoreInst *SI = cast<StoreInst>(Val: LU);
4912	if (!Stores.empty() && SplitOffsetsMap.count(Val: SI)) {
4913	DeferredStores = true;
4914	LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
4915	<< "\n");
4916	continue;
4917	}
4918
4919	Value *StoreBasePtr = SI->getPointerOperand();
4920	IRB.SetInsertPoint(SI);
4921	AAMDNodes AATags = SI->getAAMetadata();
4922
4923	LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
4924
4925	for (int Idx = `0`, Size = SplitLoads.size(); Idx < Size; ++Idx) {
4926	LoadInst *PLoad = SplitLoads [Idx];
4927	uint64_t PartOffset = Idx == `0` ? `0` : Offsets.Splits [Idx - `1`];
4928	auto *PartPtrTy = SI->getPointerOperandType();
4929
4930	auto AS = SI->getPointerAddressSpace();
4931	StoreInst *PStore = IRB.CreateAlignedStore(
4932	Val: PLoad,
4933	Ptr: getAdjustedPtr(IRB, DL, Ptr: StoreBasePtr,
4934	Offset: APInt (DL.getIndexSizeInBits(AS), PartOffset),
4935	PointerTy: PartPtrTy, NamePrefix: StoreBasePtr->getName() + "."),
4936	Align: getAdjustedAlignment(I: SI, Offset: PartOffset),
4937	/IsVolatile/ isVolatile: false);
4938	PStore->copyMetadata(SrcInst: *SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
4939	LLVMContext::MD_access_group,
4940	LLVMContext::MD_DIAssignID});
4941
4942	if (AATags)
4943	PStore->setAAMetadata(
4944	AATags.adjustForAccess(Offset: PartOffset, AccessTy: PLoad->getType(), DL));
4945	LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
4946	}
4947
4948	// We want to immediately iterate on any allocas impacted by splitting
4949	// this store, and we have to track any promotable alloca (indicated by
4950	// a direct store) as needing to be resplit because it is no longer
4951	// promotable.
4952	if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(Val: StoreBasePtr)) {
4953	ResplitPromotableAllocas.insert(Ptr: OtherAI);
4954	Worklist.insert(X: OtherAI);
4955	} else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4956	Val: StoreBasePtr->stripInBoundsOffsets())) {
4957	Worklist.insert(X: OtherAI);
4958	}
4959
4960	// Mark the original store as dead.
4961	DeadInsts.push_back(Elt: SI);
4962	}
4963
4964	// Save the split loads if there are deferred stores among the users.
4965	if (DeferredStores)
4966	SplitLoadsMap.insert(KV: std::make_pair(x&: LI, y: std::move(SplitLoads)));
4967
4968	// Mark the original load as dead and kill the original slice.
4969	DeadInsts.push_back(Elt: LI);
4970	Offsets.S->kill();
4971	}
4972
4973	// Second, we rewrite all of the split stores. At this point, we know that
4974	// all loads from this alloca have been split already. For stores of such
4975	// loads, we can simply look up the pre-existing split loads. For stores of
4976	// other loads, we split those loads first and then write split stores of
4977	// them.
4978	for (StoreInst *SI : Stores) {
4979	auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
4980	IntegerType *Ty = cast<IntegerType>(Val: LI->getType());
4981	assert(Ty->getBitWidth() % `8` == `0`);
4982	uint64_t StoreSize = Ty->getBitWidth() / `8`;
4983	assert(StoreSize > `0` && "Cannot have a zero-sized integer store!");
4984
4985	auto &Offsets = SplitOffsetsMap [SI];
4986	assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
4987	"Slice size should always match load size exactly!");
4988	uint64_t BaseOffset = Offsets.S->beginOffset();
4989	assert(BaseOffset + StoreSize > BaseOffset &&
4990	"Cannot represent alloca access size using 64-bit integers!");
4991
4992	Value *LoadBasePtr = LI->getPointerOperand();
4993	Instruction *StoreBasePtr = cast<Instruction>(Val: SI->getPointerOperand());
4994
4995	LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
4996
4997	// Check whether we have an already split load.
4998	auto SplitLoadsMapI = SplitLoadsMap.find(Val: LI);
4999	std::vector<LoadInst > SplitLoads = nullptr;
5000	if (SplitLoadsMapI != SplitLoadsMap.end()) {
5001	SplitLoads = &SplitLoadsMapI ->second;
5002	assert(SplitLoads->size() == Offsets.Splits.size() + `1` &&
5003	"Too few split loads for the number of splits in the store!");
5004	} else {
5005	LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
5006	}
5007
5008	uint64_t PartOffset = `0`, PartSize = Offsets.Splits.front();
5009	int Idx = `0`, Size = Offsets.Splits.size();
5010	for (;;) {
5011	auto PartTy = Type::getIntNTy(C&: Ty->getContext(), N: PartSize `8`);
5012	auto *LoadPartPtrTy = LI->getPointerOperandType();
5013	auto *StorePartPtrTy = SI->getPointerOperandType();
5014
5015	// Either lookup a split load or create one.
5016	LoadInst *PLoad;
5017	if (SplitLoads) {
5018	PLoad = (*SplitLoads)[Idx];
5019	} else {
5020	IRB.SetInsertPoint(LI);
5021	auto AS = LI->getPointerAddressSpace();
5022	PLoad = IRB.CreateAlignedLoad(
5023	Ty: PartTy,
5024	Ptr: getAdjustedPtr(IRB, DL, Ptr: LoadBasePtr,
5025	Offset: APInt (DL.getIndexSizeInBits(AS), PartOffset),
5026	PointerTy: LoadPartPtrTy, NamePrefix: LoadBasePtr->getName() + "."),
5027	Align: getAdjustedAlignment(I: LI, Offset: PartOffset),
5028	/IsVolatile/ isVolatile: false, Name: LI->getName());
5029	PLoad->copyMetadata(SrcInst: *LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
5030	LLVMContext::MD_access_group});
5031	}
5032
5033	// And store this partition.
5034	IRB.SetInsertPoint(SI);
5035	auto AS = SI->getPointerAddressSpace();
5036	StoreInst *PStore = IRB.CreateAlignedStore(
5037	Val: PLoad,
5038	Ptr: getAdjustedPtr(IRB, DL, Ptr: StoreBasePtr,
5039	Offset: APInt (DL.getIndexSizeInBits(AS), PartOffset),
5040	PointerTy: StorePartPtrTy, NamePrefix: StoreBasePtr->getName() + "."),
5041	Align: getAdjustedAlignment(I: SI, Offset: PartOffset),
5042	/IsVolatile/ isVolatile: false);
5043	PStore->copyMetadata(SrcInst: *SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
5044	LLVMContext::MD_access_group});
5045
5046	// Now build a new slice for the alloca.
5047	// ProtectedFieldDisc==nullptr is a lie, but it doesn't matter because we
5048	// already determined that all accesses are consistent.
5049	NewSlices.push_back(
5050	Elt: Slice (BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
5051	&PStore->getOperandUse(i: PStore->getPointerOperandIndex()),
5052	/IsSplittable/ false, nullptr));
5053	LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
5054	<< ", " << NewSlices.back().endOffset()
5055	<< "): " << *PStore << "\n");
5056	if (!SplitLoads) {
5057	LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
5058	}
5059
5060	// See if we've finished all the splits.
5061	if (Idx >= Size)
5062	break;
5063
5064	// Setup the next partition.
5065	PartOffset = Offsets.Splits [Idx];
5066	++Idx;
5067	PartSize = (Idx < Size ? Offsets.Splits [Idx] : StoreSize) - PartOffset;
5068	}
5069
5070	// We want to immediately iterate on any allocas impacted by splitting
5071	// this load, which is only relevant if it isn't a load of this alloca and
5072	// thus we didn't already split the loads above. We also have to keep track
5073	// of any promotable allocas we split loads on as they can no longer be
5074	// promoted.
5075	if (!SplitLoads) {
5076	if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(Val: LoadBasePtr)) {
5077	assert(OtherAI != &AI && "We can't re-split our own alloca!");
5078	ResplitPromotableAllocas.insert(Ptr: OtherAI);
5079	Worklist.insert(X: OtherAI);
5080	} else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
5081	Val: LoadBasePtr->stripInBoundsOffsets())) {
5082	assert(OtherAI != &AI && "We can't re-split our own alloca!");
5083	Worklist.insert(X: OtherAI);
5084	}
5085	}
5086
5087	// Mark the original store as dead now that we've split it up and kill its
5088	// slice. Note that we leave the original load in place unless this store
5089	// was its only use. It may in turn be split up if it is an alloca load
5090	// for some other alloca, but it may be a normal load. This may introduce
5091	// redundant loads, but where those can be merged the rest of the optimizer
5092	// should handle the merging, and this uncovers SSA splits which is more
5093	// important. In practice, the original loads will almost always be fully
5094	// split and removed eventually, and the splits will be merged by any
5095	// trivial CSE, including instcombine.
5096	if (LI->hasOneUse()) {
5097	assert(*LI->user_begin() == SI && "Single use isn't this store!");
5098	DeadInsts.push_back(Elt: LI);
5099	}
5100	DeadInsts.push_back(Elt: SI);
5101	Offsets.S->kill();
5102	}
5103
5104	// Remove the killed slices that have ben pre-split.
5105	llvm::erase_if(C&: AS, P: [](const Slice &S) { return S.isDead(); });
5106
5107	// Insert our new slices. This will sort and merge them into the sorted
5108	// sequence.
5109	AS.insert(NewSlices);
5110
5111	LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
5112	#ifndef NDEBUG
5113	for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
5114	LLVM_DEBUG(AS.print(dbgs(), I, " "));
5115	#endif
5116
5117	// Finally, don't try to promote any allocas that new require re-splitting.
5118	// They have already been added to the worklist above.
5119	PromotableAllocas.set_subtract(ResplitPromotableAllocas);
5120
5121	return true;
5122	}
5123
5124	/// Select a partition type for an alloca partition.
5125	///
5126	/// Try to compute a friendly type for this partition of the alloca. This
5127	/// won't always succeed, in which case we fall back to a legal integer type
5128	/// or an i8 array of an appropriate size.
5129	///
5130	/// \returns A tuple with the following elements:
5131	/// - PartitionType: The computed type for this partition.
5132	/// - IsIntegerWideningViable: True if integer widening promotion is used.
5133	/// - VectorType: The vector type if vector promotion is used, otherwise
5134	/// nullptr.
5135	static std::tuple<Type , bool, VectorType >
5136	selectPartitionType(Partition &P, const DataLayout &DL, AllocaInst &AI,
5137	LLVMContext &C) {
5138	// First check if the partition is viable for vector promotion.
5139	//
5140	// We prefer vector promotion over integer widening promotion when:
5141	// - The vector element type is a floating-point type.
5142	// - All the loads/stores to the alloca are vector loads/stores to the
5143	// entire alloca or load/store a single element of the vector.
5144	//
5145	// Otherwise when there is an integer vector with mixed type loads/stores we
5146	// prefer integer widening promotion because it's more likely the user is
5147	// doing bitwise arithmetic and we generate better code.
5148	VectorType *VecTy =
5149	isVectorPromotionViable(P, DL, VScale: AI.getFunction()->getVScaleValue());
5150	// If the vector element type is a floating-point type, we prefer vector
5151	// promotion. If the vector has one element, let the below code select
5152	// whether we promote with the vector or scalar.
5153	if (VecTy && VecTy->getElementType()->isFloatingPointTy() &&
5154	VecTy->getElementCount().getFixedValue() > `1`)
5155	return {VecTy, false, VecTy};
5156
5157	// Check if there is a common type that all slices of the partition use that
5158	// spans the partition.
5159	auto [CommonUseTy, LargestIntTy] =
5160	findCommonType(B: P.begin(), E: P.end(), EndOffset: P.endOffset());
5161	if (CommonUseTy) {
5162	TypeSize CommonUseSize = DL.getTypeAllocSize(Ty: CommonUseTy);
5163	if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) {
5164	// We prefer vector promotion here because if vector promotion is viable
5165	// and there is a common type used, then it implies the second listed
5166	// condition for preferring vector promotion is true.
5167	if (VecTy)
5168	return {VecTy, false, VecTy};
5169	return {CommonUseTy, isIntegerWideningViable(P, AllocaTy: CommonUseTy, DL),
5170	nullptr};
5171	}
5172	}
5173
5174	// Can we find an appropriate subtype in the original allocated
5175	// type?
5176	if (Type *TypePartitionTy = getTypePartition(DL, Ty: AI.getAllocatedType(),
5177	Offset: P.beginOffset(), Size: P.size())) {
5178	// If the partition is an integer array that can be spanned by a legal
5179	// integer type, prefer to represent it as a legal integer type because
5180	// it's more likely to be promotable.
5181	if (TypePartitionTy->isArrayTy() &&
5182	TypePartitionTy->getArrayElementType()->isIntegerTy() &&
5183	DL.isLegalInteger(Width: P.size() * `8`))
5184	TypePartitionTy = Type::getIntNTy(C, N: P.size() * `8`);
5185	// There was no common type used, so we prefer integer widening promotion.
5186	if (isIntegerWideningViable(P, AllocaTy: TypePartitionTy, DL))
5187	return {TypePartitionTy, true, nullptr};
5188	if (VecTy)
5189	return {VecTy, false, VecTy};
5190	// If we couldn't promote with TypePartitionTy, try with the largest
5191	// integer type used.
5192	if (LargestIntTy &&
5193	DL.getTypeAllocSize(Ty: LargestIntTy).getFixedValue() >= P.size() &&
5194	isIntegerWideningViable(P, AllocaTy: LargestIntTy, DL))
5195	return {LargestIntTy, true, nullptr};
5196
5197	// Fallback to TypePartitionTy and we probably won't promote.
5198	return {TypePartitionTy, false, nullptr};
5199	}
5200
5201	// Select the largest integer type used if it spans the partition.
5202	if (LargestIntTy &&
5203	DL.getTypeAllocSize(Ty: LargestIntTy).getFixedValue() >= P.size())
5204	return {LargestIntTy, false, nullptr};
5205
5206	// Select a legal integer type if it spans the partition.
5207	if (DL.isLegalInteger(Width: P.size() * `8`))
5208	return {Type::getIntNTy(C, N: P.size() * `8`), false, nullptr};
5209
5210	// Fallback to an i8 array.
5211	return {ArrayType::get(ElementType: Type::getInt8Ty(C), NumElements: P.size()), false, nullptr};
5212	}
5213
5214	/// Rewrite an alloca partition's users.
5215	///
5216	/// This routine drives both of the rewriting goals of the SROA pass. It tries
5217	/// to rewrite uses of an alloca partition to be conducive for SSA value
5218	/// promotion. If the partition needs a new, more refined alloca, this will
5219	/// build that new alloca, preserving as much type information as possible, and
5220	/// rewrite the uses of the old alloca to point at the new one and have the
5221	/// appropriate new offsets. It also evaluates how successful the rewrite was
5222	/// at enabling promotion and if it was successful queues the alloca to be
5223	/// promoted.
5224	std::pair<AllocaInst *, uint64_t>
5225	SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P) {
5226	const DataLayout &DL = AI.getDataLayout();
5227	// Select the type for the new alloca that spans the partition.
5228	auto [PartitionTy, IsIntegerWideningViable, VecTy] =
5229	selectPartitionType(P, DL, AI, C&: *C);
5230
5231	// Check for the case where we're going to rewrite to a new alloca of the
5232	// exact same type as the original, and with the same access offsets. In that
5233	// case, re-use the existing alloca, but still run through the rewriter to
5234	// perform phi and select speculation.
5235	// P.beginOffset() can be non-zero even with the same type in a case with
5236	// out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
5237	AllocaInst *NewAI;
5238	if (PartitionTy == AI.getAllocatedType() && P.beginOffset() == `0`) {
5239	NewAI = &AI;
5240	// FIXME: We should be able to bail at this point with "nothing changed".
5241	// FIXME: We might want to defer PHI speculation until after here.
5242	// FIXME: return nullptr;
5243	} else {
5244	// Make sure the alignment is compatible with P.beginOffset().
5245	const Align Alignment = commonAlignment(A: AI.getAlign(), Offset: P.beginOffset());
5246	// If we will get at least this much alignment from the type alone, leave
5247	// the alloca's alignment unconstrained.
5248	const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(Ty: PartitionTy);
5249	NewAI = new AllocaInst (
5250	PartitionTy, AI.getAddressSpace(), nullptr,
5251	IsUnconstrained ? DL.getPrefTypeAlign(Ty: PartitionTy) : Alignment,
5252	AI.getName() + ".sroa." + Twine (P.begin() - AS.begin()),
5253	AI.getIterator());
5254	// Copy the old AI debug location over to the new one.
5255	NewAI->setDebugLoc(AI.getDebugLoc());
5256	++NumNewAllocas;
5257	}
5258
5259	LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
5260	<< "," << P.endOffset() << ") to: " << *NewAI << "\n");
5261
5262	// Track the high watermark on the worklist as it is only relevant for
5263	// promoted allocas. We will reset it to this point if the alloca is not in
5264	// fact scheduled for promotion.
5265	unsigned PPWOldSize = PostPromotionWorklist.size();
5266	unsigned NumUses = `0`;
5267	SmallSetVector<PHINode *, `8`> PHIUsers;
5268	SmallSetVector<SelectInst *, `8`> SelectUsers;
5269
5270	AllocaSliceRewriter Rewriter(
5271	DL, AS, *this, AI, *NewAI, PartitionTy, P.beginOffset(), P.endOffset(),
5272	IsIntegerWideningViable, VecTy, PHIUsers, SelectUsers);
5273	bool Promotable = true;
5274	// Check whether we can have tree-structured merge.
5275	if (auto DeletedValues = Rewriter.rewriteTreeStructuredMerge(P)) {
5276	NumUses += DeletedValues ->size() + `1`;
5277	for (Value V : DeletedValues)
5278	DeadInsts.push_back(Elt: V);
5279	} else {
5280	for (Slice *S : P.splitSliceTails()) {
5281	Promotable &= Rewriter.visit(I: S);
5282	++NumUses;
5283	}
5284	for (Slice &S : P) {
5285	Promotable &= Rewriter.visit(I: &S);
5286	++NumUses;
5287	}
5288	}
5289
5290	NumAllocaPartitionUses += NumUses;
5291	MaxUsesPerAllocaPartition.updateMax(V: NumUses);
5292
5293	// Now that we've processed all the slices in the new partition, check if any
5294	// PHIs or Selects would block promotion.
5295	for (PHINode *PHI : PHIUsers)
5296	if (!isSafePHIToSpeculate(PN&: *PHI)) {
5297	Promotable = false;
5298	PHIUsers.clear();
5299	SelectUsers.clear();
5300	break;
5301	}
5302
5303	SmallVector<std::pair<SelectInst *, RewriteableMemOps>, `2`>
5304	NewSelectsToRewrite;
5305	NewSelectsToRewrite.reserve(N: SelectUsers.size());
5306	for (SelectInst *Sel : SelectUsers) {
5307	std::optional<RewriteableMemOps> Ops =
5308	isSafeSelectToSpeculate(SI&: *Sel, PreserveCFG);
5309	if (!Ops) {
5310	Promotable = false;
5311	PHIUsers.clear();
5312	SelectUsers.clear();
5313	NewSelectsToRewrite.clear();
5314	break;
5315	}
5316	NewSelectsToRewrite.emplace_back(Args: std::make_pair(x&: Sel, y&: *Ops));
5317	}
5318
5319	if (Promotable) {
5320	for (Use *U : AS.getDeadUsesIfPromotable()) {
5321	auto *OldInst = dyn_cast<Instruction>(Val: U->get());
5322	Value::dropDroppableUse(U&: *U);
5323	if (OldInst)
5324	if (isInstructionTriviallyDead(I: OldInst))
5325	DeadInsts.push_back(Elt: OldInst);
5326	}
5327	if (PHIUsers.empty() && SelectUsers.empty()) {
5328	// Promote the alloca.
5329	PromotableAllocas.insert(X: NewAI);
5330	} else {
5331	// If we have either PHIs or Selects to speculate, add them to those
5332	// worklists and re-queue the new alloca so that we promote in on the
5333	// next iteration.
5334	SpeculatablePHIs.insert_range(R&: PHIUsers);
5335	SelectsToRewrite.reserve(NumEntries: SelectsToRewrite.size() +
5336	NewSelectsToRewrite.size());
5337	for (auto &&KV : llvm::make_range(
5338	x: std::make_move_iterator(i: NewSelectsToRewrite.begin()),
5339	y: std::make_move_iterator(i: NewSelectsToRewrite.end())))
5340	SelectsToRewrite.insert(KV: std::move(KV));
5341	Worklist.insert(X: NewAI);
5342	}
5343	} else {
5344	// Drop any post-promotion work items if promotion didn't happen.
5345	while (PostPromotionWorklist.size() > PPWOldSize)
5346	PostPromotionWorklist.pop_back();
5347
5348	// We couldn't promote and we didn't create a new partition, nothing
5349	// happened.
5350	if (NewAI == &AI)
5351	return {nullptr, `0`};
5352
5353	// If we can't promote the alloca, iterate on it to check for new
5354	// refinements exposed by splitting the current alloca. Don't iterate on an
5355	// alloca which didn't actually change and didn't get promoted.
5356	Worklist.insert(X: NewAI);
5357	}
5358
5359	return {NewAI, DL.getTypeSizeInBits(Ty: PartitionTy).getFixedValue()};
5360	}
5361
5362	// There isn't a shared interface to get the "address" parts out of a
5363	// dbg.declare and dbg.assign, so provide some wrappers.
5364	bool isKillAddress(const DbgVariableRecord *DVR) {
5365	if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5366	return DVR->isKillAddress();
5367	return DVR->isKillLocation();
5368	}
5369
5370	const DIExpression getAddressExpression(const* DbgVariableRecord *DVR) {
5371	if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5372	return DVR->getAddressExpression();
5373	return DVR->getExpression();
5374	}
5375
5376	/// Create or replace an existing fragment in a DIExpression with \p Frag.
5377	/// If the expression already contains a DW_OP_LLVM_extract_bits_[sz]ext
5378	/// operation, add \p BitExtractOffset to the offset part.
5379	///
5380	/// Returns the new expression, or nullptr if this fails (see details below).
5381	///
5382	/// This function is similar to DIExpression::createFragmentExpression except
5383	/// for 3 important distinctions:
5384	/// 1. The new fragment isn't relative to an existing fragment.
5385	/// 2. It assumes the computed location is a memory location. This means we
5386	/// don't need to perform checks that creating the fragment preserves the
5387	/// expression semantics.
5388	/// 3. Existing extract_bits are modified independently of fragment changes
5389	/// using \p BitExtractOffset. A change to the fragment offset or size
5390	/// may affect a bit extract. But a bit extract offset can change
5391	/// independently of the fragment dimensions.
5392	///
5393	/// Returns the new expression, or nullptr if one couldn't be created.
5394	/// Ideally this is only used to signal that a bit-extract has become
5395	/// zero-sized (and thus the new debug record has no size and can be
5396	/// dropped), however, it fails for other reasons too - see the FIXME below.
5397	///
5398	/// FIXME: To keep the change that introduces this function NFC it bails
5399	/// in some situations unecessarily, e.g. when fragment and bit extract
5400	/// sizes differ.
5401	static DIExpression createOrReplaceFragment(const* DIExpression *Expr,
5402	DIExpression::FragmentInfo Frag,
5403	int64_t BitExtractOffset) {
5404	SmallVector<uint64_t, `8`> Ops;
5405	bool HasFragment = false;
5406	bool HasBitExtract = false;
5407
5408	for (auto &Op : Expr->expr_ops()) {
5409	if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
5410	HasFragment = true;
5411	continue;
5412	}
5413	if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext \|\|
5414	Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5415	HasBitExtract = true;
5416	int64_t ExtractOffsetInBits = Op.getArg(I: `0`);
5417	int64_t ExtractSizeInBits = Op.getArg(I: `1`);
5418
5419	// DIExpression::createFragmentExpression doesn't know how to handle
5420	// a fragment that is smaller than the extract. Copy the behaviour
5421	// (bail) to avoid non-NFC changes.
5422	// FIXME: Don't do this.
5423	if (Frag.SizeInBits < uint64_t(ExtractSizeInBits))
5424	return nullptr;
5425
5426	assert(BitExtractOffset <= `0`);
5427	int64_t AdjustedOffset = ExtractOffsetInBits + BitExtractOffset;
5428
5429	// DIExpression::createFragmentExpression doesn't know what to do
5430	// if the new extract starts "outside" the existing one. Copy the
5431	// behaviour (bail) to avoid non-NFC changes.
5432	// FIXME: Don't do this.
5433	if (AdjustedOffset < `0`)
5434	return nullptr;
5435
5436	Ops.push_back(Elt: Op.getOp());
5437	Ops.push_back(Elt: std::max<int64_t>(a: `0`, b: AdjustedOffset));
5438	Ops.push_back(Elt: ExtractSizeInBits);
5439	continue;
5440	}
5441	Op.appendToVector(V&: Ops);
5442	}
5443
5444	// Unsupported by createFragmentExpression, so don't support it here yet to
5445	// preserve NFC-ness.
5446	if (HasFragment && HasBitExtract)
5447	return nullptr;
5448
5449	if (!HasBitExtract) {
5450	Ops.push_back(Elt: dwarf::DW_OP_LLVM_fragment);
5451	Ops.push_back(Elt: Frag.OffsetInBits);
5452	Ops.push_back(Elt: Frag.SizeInBits);
5453	}
5454	return DIExpression::get(Context&: Expr->getContext(), Elements: Ops);
5455	}
5456
5457	/// Insert a new DbgRecord.
5458	/// \p Orig Original to copy record type, debug loc and variable from, and
5459	/// additionally value and value expression for dbg_assign records.
5460	/// \p NewAddr Location's new base address.
5461	/// \p NewAddrExpr New expression to apply to address.
5462	/// \p BeforeInst Insert position.
5463	/// \p NewFragment New fragment (absolute, non-relative).
5464	/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5465	static void
5466	insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord Orig, AllocaInst NewAddr,
5467	DIExpression NewAddrExpr, Instruction BeforeInst,
5468	std::optional<DIExpression::FragmentInfo> NewFragment,
5469	int64_t BitExtractAdjustment) {
5470	(void)DIB;
5471
5472	// A dbg_assign puts fragment info in the value expression only. The address
5473	// expression has already been built: NewAddrExpr. A dbg_declare puts the
5474	// new fragment info into NewAddrExpr (as it only has one expression).
5475	DIExpression *NewFragmentExpr =
5476	Orig->isDbgAssign() ? Orig->getExpression() : NewAddrExpr;
5477	if (NewFragment)
5478	NewFragmentExpr = createOrReplaceFragment(Expr: NewFragmentExpr, Frag: *NewFragment,
5479	BitExtractOffset: BitExtractAdjustment);
5480	if (!NewFragmentExpr)
5481	return;
5482
5483	if (Orig->isDbgDeclare()) {
5484	DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare(
5485	Address: NewAddr, DV: Orig->getVariable(), Expr: NewFragmentExpr, DI: Orig->getDebugLoc());
5486	BeforeInst->getParent()->insertDbgRecordBefore(DR: DVR,
5487	Here: BeforeInst->getIterator());
5488	return;
5489	}
5490
5491	if (Orig->isDbgValue()) {
5492	DbgVariableRecord *DVR = DbgVariableRecord::createDbgVariableRecord(
5493	Location: NewAddr, DV: Orig->getVariable(), Expr: NewFragmentExpr, DI: Orig->getDebugLoc());
5494	// Drop debug information if the expression doesn't start with a
5495	// DW_OP_deref. This is because without a DW_OP_deref, the #dbg_value
5496	// describes the address of alloca rather than the value inside the alloca.
5497	if (!NewFragmentExpr->startsWithDeref())
5498	DVR->setKillAddress();
5499	BeforeInst->getParent()->insertDbgRecordBefore(DR: DVR,
5500	Here: BeforeInst->getIterator());
5501	return;
5502	}
5503
5504	// Apply a DIAssignID to the store if it doesn't already have it.
5505	if (!NewAddr->hasMetadata(KindID: LLVMContext::MD_DIAssignID)) {
5506	NewAddr->setMetadata(KindID: LLVMContext::MD_DIAssignID,
5507	Node: DIAssignID::getDistinct(Context&: NewAddr->getContext()));
5508	}
5509
5510	DbgVariableRecord *NewAssign = DbgVariableRecord::createLinkedDVRAssign(
5511	LinkedInstr: NewAddr, Val: Orig->getValue(), Variable: Orig->getVariable(), Expression: NewFragmentExpr, Address: NewAddr,
5512	AddressExpression: NewAddrExpr, DI: Orig->getDebugLoc());
5513	LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
5514	(void)NewAssign;
5515	}
5516
5517	/// Walks the slices of an alloca and form partitions based on them,
5518	/// rewriting each of their uses.
5519	bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
5520	if (AS.begin() == AS.end())
5521	return false;
5522
5523	unsigned NumPartitions = `0`;
5524	bool Changed = false;
5525	const DataLayout &DL = AI.getModule()->getDataLayout();
5526
5527	// First try to pre-split loads and stores.
5528	Changed \|= presplitLoadsAndStores(AI, AS);
5529
5530	// Now that we have identified any pre-splitting opportunities,
5531	// mark loads and stores unsplittable except for the following case.
5532	// We leave a slice splittable if all other slices are disjoint or fully
5533	// included in the slice, such as whole-alloca loads and stores.
5534	// If we fail to split these during pre-splitting, we want to force them
5535	// to be rewritten into a partition.
5536	bool IsSorted = true;
5537
5538	uint64_t AllocaSize = AI.getAllocationSize(DL)->getFixedValue();
5539	const uint64_t MaxBitVectorSize = `1024`;
5540	if (AllocaSize <= MaxBitVectorSize) {
5541	// If a byte boundary is included in any load or store, a slice starting or
5542	// ending at the boundary is not splittable.
5543	SmallBitVector SplittableOffset(AllocaSize + `1`, true);
5544	for (Slice &S : AS)
5545	for (unsigned O = S.beginOffset() + `1`;
5546	O < S.endOffset() && O < AllocaSize; O++)
5547	SplittableOffset.reset(Idx: O);
5548
5549	for (Slice &S : AS) {
5550	if (!S.isSplittable())
5551	continue;
5552
5553	if ((S.beginOffset() > AllocaSize \|\| SplittableOffset [S.beginOffset()]) &&
5554	(S.endOffset() > AllocaSize \|\| SplittableOffset [S.endOffset()]))
5555	continue;
5556
5557	if (isa<LoadInst>(Val: S.getUse()->getUser()) \|\|
5558	isa<StoreInst>(Val: S.getUse()->getUser())) {
5559	S.makeUnsplittable();
5560	IsSorted = false;
5561	}
5562	}
5563	} else {
5564	// We only allow whole-alloca splittable loads and stores
5565	// for a large alloca to avoid creating too large BitVector.
5566	for (Slice &S : AS) {
5567	if (!S.isSplittable())
5568	continue;
5569
5570	if (S.beginOffset() == `0` && S.endOffset() >= AllocaSize)
5571	continue;
5572
5573	if (isa<LoadInst>(Val: S.getUse()->getUser()) \|\|
5574	isa<StoreInst>(Val: S.getUse()->getUser())) {
5575	S.makeUnsplittable();
5576	IsSorted = false;
5577	}
5578	}
5579	}
5580
5581	if (!IsSorted)
5582	llvm::stable_sort(Range&: AS);
5583
5584	/// Describes the allocas introduced by rewritePartition in order to migrate
5585	/// the debug info.
5586	struct Fragment {
5587	AllocaInst *Alloca;
5588	uint64_t Offset;
5589	uint64_t Size;
5590	Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
5591	: Alloca(AI), Offset(O), Size(S) {}
5592	};
5593	SmallVector<Fragment, `4`> Fragments;
5594
5595	// Rewrite each partition.
5596	for (auto &P : AS.partitions()) {
5597	auto [NewAI, ActiveBits] = rewritePartition(AI, AS, P);
5598	if (NewAI) {
5599	Changed = true;
5600	if (NewAI != &AI) {
5601	uint64_t SizeOfByte = `8`;
5602	// Don't include any padding.
5603	uint64_t Size = std::min(a: ActiveBits, b: P.size() * SizeOfByte);
5604	Fragments.push_back(
5605	Elt: Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
5606	}
5607	}
5608	++NumPartitions;
5609	}
5610
5611	NumAllocaPartitions += NumPartitions;
5612	MaxPartitionsPerAlloca.updateMax(V: NumPartitions);
5613
5614	// Migrate debug information from the old alloca to the new alloca(s)
5615	// and the individual partitions.
5616	auto MigrateOne = [&](DbgVariableRecord *DbgVariable) {
5617	// Can't overlap with undef memory.
5618	if (isKillAddress(DVR: DbgVariable))
5619	return;
5620
5621	const Value *DbgPtr = DbgVariable->getAddress();
5622	DIExpression::FragmentInfo VarFrag =
5623	DbgVariable->getFragmentOrEntireVariable();
5624	// Get the address expression constant offset if one exists and the ops
5625	// that come after it.
5626	int64_t CurrentExprOffsetInBytes = `0`;
5627	SmallVector<uint64_t> PostOffsetOps;
5628	if (!getAddressExpression(DVR: DbgVariable)
5629	->extractLeadingOffset(OffsetInBytes&: CurrentExprOffsetInBytes, RemainingOps&: PostOffsetOps))
5630	return; // Couldn't interpret this DIExpression - drop the var.
5631
5632	// Offset defined by a DW_OP_LLVM_extract_bits_[sz]ext.
5633	int64_t ExtractOffsetInBits = `0`;
5634	for (auto Op : getAddressExpression(DVR: DbgVariable)->expr_ops()) {
5635	if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext \|\|
5636	Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5637	ExtractOffsetInBits = Op.getArg(I: `0`);
5638	break;
5639	}
5640	}
5641
5642	DIBuilder DIB(AI.getModule(), /AllowUnresolved/* false);
5643	for (auto Fragment : Fragments) {
5644	int64_t OffsetFromLocationInBits;
5645	std::optional<DIExpression::FragmentInfo> NewDbgFragment;
5646	// Find the variable fragment that the new alloca slice covers.
5647	// Drop debug info for this variable fragment if we can't compute an
5648	// intersect between it and the alloca slice.
5649	if (!DIExpression::calculateFragmentIntersect(
5650	DL, SliceStart: &AI, SliceOffsetInBits: Fragment.Offset, SliceSizeInBits: Fragment.Size, DbgPtr,
5651	DbgPtrOffsetInBits: CurrentExprOffsetInBytes * `8`, DbgExtractOffsetInBits: ExtractOffsetInBits, VarFrag,
5652	Result&: NewDbgFragment, OffsetFromLocationInBits))
5653	continue; // Do not migrate this fragment to this slice.
5654
5655	// Zero sized fragment indicates there's no intersect between the variable
5656	// fragment and the alloca slice. Skip this slice for this variable
5657	// fragment.
5658	if (NewDbgFragment && !NewDbgFragment ->SizeInBits)
5659	continue; // Do not migrate this fragment to this slice.
5660
5661	// No fragment indicates DbgVariable's variable or fragment exactly
5662	// overlaps the slice; copy its fragment (or nullopt if there isn't one).
5663	if (!NewDbgFragment)
5664	NewDbgFragment = DbgVariable->getFragment();
5665
5666	// Reduce the new expression offset by the bit-extract offset since
5667	// we'll be keeping that.
5668	int64_t OffestFromNewAllocaInBits =
5669	OffsetFromLocationInBits - ExtractOffsetInBits;
5670	// We need to adjust an existing bit extract if the offset expression
5671	// can't eat the slack (i.e., if the new offset would be negative).
5672	int64_t BitExtractOffset =
5673	std::min<int64_t>(a: `0`, b: OffestFromNewAllocaInBits);
5674	// The magnitude of a negative value indicates the number of bits into
5675	// the existing variable fragment that the memory region begins. The new
5676	// variable fragment already excludes those bits - the new DbgPtr offset
5677	// only needs to be applied if it's positive.
5678	OffestFromNewAllocaInBits =
5679	std::max(a: int64_t(`0`), b: OffestFromNewAllocaInBits);
5680
5681	// Rebuild the expression:
5682	// {Offset(OffestFromNewAllocaInBits), PostOffsetOps, NewDbgFragment}
5683	// Add NewDbgFragment later, because dbg.assigns don't want it in the
5684	// address expression but the value expression instead.
5685	DIExpression *NewExpr = DIExpression::get(Context&: AI.getContext(), Elements: PostOffsetOps);
5686	if (OffestFromNewAllocaInBits > `0`) {
5687	int64_t OffsetInBytes = (OffestFromNewAllocaInBits + `7`) / `8`;
5688	NewExpr = DIExpression::prepend(Expr: NewExpr, /flags=/Flags: `0`, Offset: OffsetInBytes);
5689	}
5690
5691	// Remove any existing intrinsics on the new alloca describing
5692	// the variable fragment.
5693	auto RemoveOne = [DbgVariable](auto *OldDII) {
5694	auto SameVariableFragment = [](const auto LHS, const* auto *RHS) {
5695	return LHS->getVariable() == RHS->getVariable() &&
5696	LHS->getDebugLoc()->getInlinedAt() ==
5697	RHS->getDebugLoc()->getInlinedAt();
5698	};
5699	if (SameVariableFragment(OldDII, DbgVariable))
5700	OldDII->eraseFromParent();
5701	};
5702	for_each(Range: findDVRDeclares(V: Fragment.Alloca), F: RemoveOne);
5703	for_each(Range: findDVRValues(V: Fragment.Alloca), F: RemoveOne);
5704	insertNewDbgInst(DIB, Orig: DbgVariable, NewAddr: Fragment.Alloca, NewAddrExpr: NewExpr, BeforeInst: &AI,
5705	NewFragment: NewDbgFragment, BitExtractAdjustment: BitExtractOffset);
5706	}
5707	};
5708
5709	// Migrate debug information from the old alloca to the new alloca(s)
5710	// and the individual partitions.
5711	for_each(Range: findDVRDeclares(V: &AI), F: MigrateOne);
5712	for_each(Range: findDVRValues(V: &AI), F: MigrateOne);
5713	for_each(Range: at::getDVRAssignmentMarkers(Inst: &AI), F: MigrateOne);
5714
5715	return Changed;
5716	}
5717
5718	/// Clobber a use with poison, deleting the used value if it becomes dead.
5719	void SROA::clobberUse(Use &U) {
5720	Value *OldV = U;
5721	// Replace the use with an poison value.
5722	U = PoisonValue::get(T: OldV->getType());
5723
5724	// Check for this making an instruction dead. We have to garbage collect
5725	// all the dead instructions to ensure the uses of any alloca end up being
5726	// minimal.
5727	if (Instruction *OldI = dyn_cast<Instruction>(Val: OldV))
5728	if (isInstructionTriviallyDead(I: OldI)) {
5729	DeadInsts.push_back(Elt: OldI);
5730	}
5731	}
5732
5733	/// A basic LoadAndStorePromoter that does not remove store nodes.
5734	class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
5735	public:
5736	BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S,
5737	Type *ZeroType)
5738	: LoadAndStorePromoter (Insts, S), ZeroType(ZeroType) {}
5739	bool shouldDelete(Instruction I) const* override {
5740	return !isa<StoreInst>(Val: I) && !isa<AllocaInst>(Val: I);
5741	}
5742
5743	Value getValueToUseForAlloca(Instruction I) const override {
5744	return UndefValue::get(T: ZeroType);
5745	}
5746
5747	private:
5748	Type *ZeroType;
5749	};
5750
5751	bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
5752	// Look through each "partition", looking for slices with the same start/end
5753	// that do not overlap with any before them. The slices are sorted by
5754	// increasing beginOffset. We don't use AS.partitions(), as it will use a more
5755	// sophisticated algorithm that takes splittable slices into account.
5756	LLVM_DEBUG(dbgs() << "Attempting to propagate values on " << AI << "\n");
5757	bool AllSameAndValid = true;
5758	Type PartitionType = nullptr*;
5759	SmallVector<Instruction *> Insts;
5760	uint64_t BeginOffset = `0`;
5761	uint64_t EndOffset = `0`;
5762
5763	auto Flush = [&]() {
5764	if (AllSameAndValid && !Insts.empty()) {
5765	LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", "
5766	<< EndOffset << ")\n");
5767	SmallVector<PHINode *, `4`> NewPHIs;
5768	SSAUpdater SSA(&NewPHIs);
5769	Insts.push_back(Elt: &AI);
5770	BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType);
5771	Promoter.run(Insts);
5772	}
5773	AllSameAndValid = true;
5774	PartitionType = nullptr;
5775	Insts.clear();
5776	};
5777
5778	for (Slice &S : AS) {
5779	auto *User = cast<Instruction>(Val: S.getUse()->getUser());
5780	if (isAssumeLikeIntrinsic(I: User)) {
5781	LLVM_DEBUG({
5782	dbgs() << "Ignoring slice: ";
5783	AS.print(dbgs(), &S);
5784	});
5785	continue;
5786	}
5787	if (S.beginOffset() >= EndOffset) {
5788	Flush ();
5789	BeginOffset = S.beginOffset();
5790	EndOffset = S.endOffset();
5791	} else if (S.beginOffset() != BeginOffset \|\| S.endOffset() != EndOffset) {
5792	if (AllSameAndValid) {
5793	LLVM_DEBUG({
5794	dbgs() << "Slice does not match range [" << BeginOffset << ", "
5795	<< EndOffset << ")";
5796	AS.print(dbgs(), &S);
5797	});
5798	AllSameAndValid = false;
5799	}
5800	EndOffset = std::max(a: EndOffset, b: S.endOffset());
5801	continue;
5802	}
5803
5804	if (auto *LI = dyn_cast<LoadInst>(Val: User)) {
5805	Type *UserTy = LI->getType();
5806	// LoadAndStorePromoter requires all the types to be the same.
5807	if (!LI->isSimple() \|\| (PartitionType && UserTy != PartitionType))
5808	AllSameAndValid = false;
5809	PartitionType = UserTy;
5810	Insts.push_back(Elt: User);
5811	} else if (auto *SI = dyn_cast<StoreInst>(Val: User)) {
5812	Type *UserTy = SI->getValueOperand()->getType();
5813	if (!SI->isSimple() \|\| (PartitionType && UserTy != PartitionType))
5814	AllSameAndValid = false;
5815	PartitionType = UserTy;
5816	Insts.push_back(Elt: User);
5817	} else {
5818	AllSameAndValid = false;
5819	}
5820	}
5821
5822	Flush ();
5823	return true;
5824	}
5825
5826	/// Analyze an alloca for SROA.
5827	///
5828	/// This analyzes the alloca to ensure we can reason about it, builds
5829	/// the slices of the alloca, and then hands it off to be split and
5830	/// rewritten as needed.
5831	std::pair<bool /Changed/, bool /CFGChanged/>
5832	SROA::runOnAlloca(AllocaInst &AI) {
5833	bool Changed = false;
5834	bool CFGChanged = false;
5835
5836	LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
5837	++NumAllocasAnalyzed;
5838
5839	// Special case dead allocas, as they're trivial.
5840	if (AI.use_empty()) {
5841	AI.eraseFromParent();
5842	Changed = true;
5843	return {Changed, CFGChanged};
5844	}
5845	const DataLayout &DL = AI.getDataLayout();
5846
5847	// Skip alloca forms that this analysis can't handle.
5848	std::optional<TypeSize> Size = AI.getAllocationSize(DL);
5849	if (AI.isArrayAllocation() \|\| !Size \|\| Size ->isScalable() \|\| Size ->isZero())
5850	return {Changed, CFGChanged};
5851
5852	// First, split any FCA loads and stores touching this alloca to promote
5853	// better splitting and promotion opportunities.
5854	IRBuilderTy IRB(&AI);
5855	AggLoadStoreRewriter AggRewriter(DL, IRB);
5856	Changed \|= AggRewriter.rewrite(I&: AI);
5857
5858	// Build the slices using a recursive instruction-visiting builder.
5859	AllocaSlices AS(DL, AI);
5860	LLVM_DEBUG(AS.print(dbgs()));
5861	if (AS.isEscaped())
5862	return {Changed, CFGChanged};
5863
5864	if (AS.isEscapedReadOnly()) {
5865	Changed \|= propagateStoredValuesToLoads(AI, AS);
5866	return {Changed, CFGChanged};
5867	}
5868
5869	for (auto &P : AS.partitions()) {
5870	// For now, we can't split if a field is accessed both via protected field
5871	// and not, because that would mean that we would need to introduce sign and
5872	// auth operations to convert between the protected and non-protected uses,
5873	// and this pass doesn't know how to do that. Also, this case is unlikely to
5874	// occur in normal code.
5875	std::optional<Value *> ProtectedFieldDisc;
5876	auto SliceHasMismatch = [&](Slice &S) {
5877	if (auto *II = dyn_cast<IntrinsicInst>(Val: S.getUse()->getUser()))
5878	if (II->getIntrinsicID() == Intrinsic::lifetime_start \|\|
5879	II->getIntrinsicID() == Intrinsic::lifetime_end)
5880	return false;
5881	if (!ProtectedFieldDisc)
5882	ProtectedFieldDisc = S.ProtectedFieldDisc;
5883	return *ProtectedFieldDisc != S.ProtectedFieldDisc;
5884	};
5885	for (Slice &S : P)
5886	if (SliceHasMismatch (S))
5887	return {Changed, CFGChanged};
5888	for (Slice *S : P.splitSliceTails())
5889	if (SliceHasMismatch (*S))
5890	return {Changed, CFGChanged};
5891	}
5892
5893	// Delete all the dead users of this alloca before splitting and rewriting it.
5894	for (Instruction *DeadUser : AS.getDeadUsers()) {
5895	// Free up everything used by this instruction.
5896	for (Use &DeadOp : DeadUser->operands())
5897	clobberUse(U&: DeadOp);
5898
5899	// Now replace the uses of this instruction.
5900	DeadUser->replaceAllUsesWith(V: PoisonValue::get(T: DeadUser->getType()));
5901
5902	// And mark it for deletion.
5903	DeadInsts.push_back(Elt: DeadUser);
5904	Changed = true;
5905	}
5906	for (Use *DeadOp : AS.getDeadOperands()) {
5907	clobberUse(U&: *DeadOp);
5908	Changed = true;
5909	}
5910	for (IntrinsicInst *PFPUser : AS.getPFPUsers()) {
5911	PFPUser->replaceAllUsesWith(V: PFPUser->getArgOperand(i: `0`));
5912
5913	DeadInsts.push_back(Elt: PFPUser);
5914	Changed = true;
5915	}
5916
5917	// No slices to split. Leave the dead alloca for a later pass to clean up.
5918	if (AS.begin() == AS.end())
5919	return {Changed, CFGChanged};
5920
5921	Changed \|= splitAlloca(AI, AS);
5922
5923	LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
5924	while (!SpeculatablePHIs.empty())
5925	speculatePHINodeLoads(IRB, PN&: *SpeculatablePHIs.pop_back_val());
5926
5927	LLVM_DEBUG(dbgs() << " Rewriting Selects\n");
5928	auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector();
5929	while (!RemainingSelectsToRewrite.empty()) {
5930	const auto [K, V] = RemainingSelectsToRewrite.pop_back_val();
5931	CFGChanged \|=
5932	rewriteSelectInstMemOps(SI&: K, Ops: V, IRB, DTU: PreserveCFG ? nullptr* : DTU);
5933	}
5934
5935	return {Changed, CFGChanged};
5936	}
5937
5938	/// Delete the dead instructions accumulated in this run.
5939	///
5940	/// Recursively deletes the dead instructions we've accumulated. This is done
5941	/// at the very end to maximize locality of the recursive delete and to
5942	/// minimize the problems of invalidated instruction pointers as such pointers
5943	/// are used heavily in the intermediate stages of the algorithm.
5944	///
5945	/// We also record the alloca instructions deleted here so that they aren't
5946	/// subsequently handed to mem2reg to promote.
5947	bool SROA::deleteDeadInstructions(
5948	SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
5949	bool Changed = false;
5950	while (!DeadInsts.empty()) {
5951	Instruction *I = dyn_cast_or_null<Instruction>(Val: DeadInsts.pop_back_val());
5952	if (!I)
5953	continue;
5954	LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
5955
5956	// If the instruction is an alloca, find the possible dbg.declare connected
5957	// to it, and remove it too. We must do this before calling RAUW or we will
5958	// not be able to find it.
5959	if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: I)) {
5960	DeletedAllocas.insert(Ptr: AI);
5961	for (DbgVariableRecord *OldDII : findDVRDeclares(V: AI))
5962	OldDII->eraseFromParent();
5963	}
5964
5965	at::deleteAssignmentMarkers(Inst: I);
5966	I->replaceAllUsesWith(V: UndefValue::get(T: I->getType()));
5967
5968	for (Use &Operand : I->operands())
5969	if (Instruction *U = dyn_cast<Instruction>(Val&: Operand)) {
5970	// Zero out the operand and see if it becomes trivially dead.
5971	Operand = nullptr;
5972	if (isInstructionTriviallyDead(I: U))
5973	DeadInsts.push_back(Elt: U);
5974	}
5975
5976	++NumDeleted;
5977	I->eraseFromParent();
5978	Changed = true;
5979	}
5980	return Changed;
5981	}
5982	/// Promote the allocas, using the best available technique.
5983	///
5984	/// This attempts to promote whatever allocas have been identified as viable in
5985	/// the PromotableAllocas list. If that list is empty, there is nothing to do.
5986	/// This function returns whether any promotion occurred.
5987	bool SROA::promoteAllocas() {
5988	if (PromotableAllocas.empty())
5989	return false;
5990
5991	if (SROASkipMem2Reg) {
5992	LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n");
5993	} else {
5994	LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
5995	NumPromoted += PromotableAllocas.size();
5996	PromoteMemToReg(Allocas: PromotableAllocas.getArrayRef(), DT&: DTU->getDomTree(), AC);
5997	}
5998
5999	PromotableAllocas.clear();
6000	return true;
6001	}
6002
6003	std::pair<bool /Changed/, bool /CFGChanged/> SROA::runSROA(Function &F) {
6004	LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
6005
6006	const DataLayout &DL = F.getDataLayout();
6007	BasicBlock &EntryBB = F.getEntryBlock();
6008	for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(x: EntryBB.end());
6009	I != E; ++I) {
6010	if (AllocaInst *AI = dyn_cast<AllocaInst>(Val&: I)) {
6011	std::optional<TypeSize> Size = AI->getAllocationSize(DL);
6012	if (Size && Size ->isScalable() && isAllocaPromotable(AI))
6013	PromotableAllocas.insert(X: AI);
6014	else
6015	Worklist.insert(X: AI);
6016	}
6017	}
6018
6019	bool Changed = false;
6020	bool CFGChanged = false;
6021	// A set of deleted alloca instruction pointers which should be removed from
6022	// the list of promotable allocas.
6023	SmallPtrSet<AllocaInst *, `4`> DeletedAllocas;
6024
6025	do {
6026	while (!Worklist.empty()) {
6027	auto [IterationChanged, IterationCFGChanged] =
6028	runOnAlloca(AI&: *Worklist.pop_back_val());
6029	Changed \|= IterationChanged;
6030	CFGChanged \|= IterationCFGChanged;
6031
6032	Changed \|= deleteDeadInstructions(DeletedAllocas);
6033
6034	// Remove the deleted allocas from various lists so that we don't try to
6035	// continue processing them.
6036	if (!DeletedAllocas.empty()) {
6037	Worklist.set_subtract(DeletedAllocas);
6038	PostPromotionWorklist.set_subtract(DeletedAllocas);
6039	PromotableAllocas.set_subtract(DeletedAllocas);
6040	DeletedAllocas.clear();
6041	}
6042	}
6043
6044	Changed \|= promoteAllocas();
6045
6046	Worklist = PostPromotionWorklist;
6047	PostPromotionWorklist.clear();
6048	} while (!Worklist.empty());
6049
6050	assert((!CFGChanged \|\| Changed) && "Can not only modify the CFG.");
6051	assert((!CFGChanged \|\| !PreserveCFG) &&
6052	"Should not have modified the CFG when told to preserve it.");
6053
6054	if (Changed && isAssignmentTrackingEnabled(M: *F.getParent())) {
6055	for (auto &BB : F) {
6056	RemoveRedundantDbgInstrs(BB: &BB);
6057	}
6058	}
6059
6060	return {Changed, CFGChanged};
6061	}
6062
6063	PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) {
6064	DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F);
6065	AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(IR&: F);
6066	DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6067	auto [Changed, CFGChanged] =
6068	SROA (&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
6069	if (!Changed)
6070	return PreservedAnalyses::all();
6071	PreservedAnalyses PA;
6072	if (!CFGChanged)
6073	PA.preserveSet<CFGAnalyses>();
6074	PA.preserve<DominatorTreeAnalysis>();
6075	return PA;
6076	}
6077
6078	void SROAPass::printPipeline(
6079	raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
6080	static_cast<PassInfoMixin<SROAPass> >(this*)->printPipeline(
6081	OS, MapClassName2PassName);
6082	OS << (PreserveCFG == SROAOptions::PreserveCFG ? "<preserve-cfg>"
6083	: "<modify-cfg>");
6084	}
6085
6086	SROAPass::SROAPass(SROAOptions PreserveCFG) : PreserveCFG(PreserveCFG) {}
6087
6088	namespace {
6089
6090	/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
6091	class SROALegacyPass : public FunctionPass {
6092	SROAOptions PreserveCFG;
6093
6094	public:
6095	static char ID;
6096
6097	SROALegacyPass(SROAOptions PreserveCFG = SROAOptions::PreserveCFG)
6098	: FunctionPass (ID), PreserveCFG(PreserveCFG) {
6099	initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
6100	}
6101
6102	bool runOnFunction(Function &F) override {
6103	if (skipFunction(F))
6104	return false;
6105
6106	DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
6107	AssumptionCache &AC =
6108	getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
6109	DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6110	auto [Changed, _] =
6111	SROA (&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
6112	return Changed;
6113	}
6114
6115	void getAnalysisUsage(AnalysisUsage &AU) const override {
6116	AU.addRequired<AssumptionCacheTracker>();
6117	AU.addRequired<DominatorTreeWrapperPass>();
6118	AU.addPreserved<GlobalsAAWrapperPass>();
6119	AU.addPreserved<DominatorTreeWrapperPass>();
6120	}
6121
6122	StringRef getPassName() const override { return "SROA"; }
6123	};
6124
6125	} // end anonymous namespace
6126
6127	char SROALegacyPass::ID = `0`;
6128
6129	FunctionPass llvm::createSROAPass(bool* PreserveCFG) {
6130	return new SROALegacyPass (PreserveCFG ? SROAOptions::PreserveCFG
6131	: SROAOptions::ModifyCFG);
6132	}
6133
6134	INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
6135	"Scalar Replacement Of Aggregates", false, false)
6136	INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6137	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6138	INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
6139	false, false)
6140

Browse the source code of llvm_projects/llvm/lib/Transforms/Scalar/SROA.cpp